mlrun 1.7.1rc4__py3-none-any.whl → 1.8.0rc8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of mlrun might be problematic. Click here for more details.
- mlrun/__init__.py +23 -21
- mlrun/__main__.py +3 -3
- mlrun/alerts/alert.py +148 -14
- mlrun/artifacts/__init__.py +1 -2
- mlrun/artifacts/base.py +46 -12
- mlrun/artifacts/dataset.py +16 -16
- mlrun/artifacts/document.py +334 -0
- mlrun/artifacts/manager.py +15 -13
- mlrun/artifacts/model.py +66 -53
- mlrun/common/constants.py +7 -0
- mlrun/common/formatters/__init__.py +1 -0
- mlrun/common/formatters/feature_set.py +1 -0
- mlrun/common/formatters/function.py +1 -0
- mlrun/{model_monitoring/db/stores/base/__init__.py → common/formatters/model_endpoint.py} +16 -1
- mlrun/common/formatters/pipeline.py +1 -2
- mlrun/common/formatters/project.py +9 -0
- mlrun/common/model_monitoring/__init__.py +0 -5
- mlrun/common/model_monitoring/helpers.py +1 -29
- mlrun/common/runtimes/constants.py +1 -2
- mlrun/common/schemas/__init__.py +6 -2
- mlrun/common/schemas/alert.py +111 -19
- mlrun/common/schemas/api_gateway.py +3 -3
- mlrun/common/schemas/artifact.py +11 -7
- mlrun/common/schemas/auth.py +6 -4
- mlrun/common/schemas/background_task.py +7 -7
- mlrun/common/schemas/client_spec.py +2 -3
- mlrun/common/schemas/clusterization_spec.py +2 -2
- mlrun/common/schemas/common.py +53 -3
- mlrun/common/schemas/constants.py +15 -0
- mlrun/common/schemas/datastore_profile.py +1 -1
- mlrun/common/schemas/feature_store.py +9 -9
- mlrun/common/schemas/frontend_spec.py +4 -4
- mlrun/common/schemas/function.py +10 -10
- mlrun/common/schemas/hub.py +1 -1
- mlrun/common/schemas/k8s.py +3 -3
- mlrun/common/schemas/memory_reports.py +3 -3
- mlrun/common/schemas/model_monitoring/__init__.py +2 -1
- mlrun/common/schemas/model_monitoring/constants.py +66 -14
- mlrun/common/schemas/model_monitoring/grafana.py +1 -1
- mlrun/common/schemas/model_monitoring/model_endpoints.py +91 -147
- mlrun/common/schemas/notification.py +24 -3
- mlrun/common/schemas/object.py +1 -1
- mlrun/common/schemas/pagination.py +4 -4
- mlrun/common/schemas/partition.py +137 -0
- mlrun/common/schemas/pipeline.py +2 -2
- mlrun/common/schemas/project.py +25 -17
- mlrun/common/schemas/runs.py +2 -2
- mlrun/common/schemas/runtime_resource.py +5 -5
- mlrun/common/schemas/schedule.py +1 -1
- mlrun/common/schemas/secret.py +1 -1
- mlrun/common/schemas/tag.py +3 -3
- mlrun/common/schemas/workflow.py +5 -5
- mlrun/config.py +67 -10
- mlrun/data_types/__init__.py +0 -2
- mlrun/data_types/infer.py +3 -1
- mlrun/data_types/spark.py +2 -1
- mlrun/datastore/__init__.py +0 -2
- mlrun/datastore/alibaba_oss.py +4 -1
- mlrun/datastore/azure_blob.py +4 -1
- mlrun/datastore/base.py +12 -4
- mlrun/datastore/datastore.py +9 -3
- mlrun/datastore/datastore_profile.py +79 -20
- mlrun/datastore/dbfs_store.py +4 -1
- mlrun/datastore/filestore.py +4 -1
- mlrun/datastore/google_cloud_storage.py +4 -1
- mlrun/datastore/hdfs.py +4 -1
- mlrun/datastore/inmem.py +4 -1
- mlrun/datastore/redis.py +4 -1
- mlrun/datastore/s3.py +4 -1
- mlrun/datastore/sources.py +52 -51
- mlrun/datastore/store_resources.py +0 -2
- mlrun/datastore/targets.py +21 -21
- mlrun/datastore/utils.py +2 -2
- mlrun/datastore/v3io.py +4 -1
- mlrun/datastore/vectorstore.py +194 -0
- mlrun/datastore/wasbfs/fs.py +13 -12
- mlrun/db/base.py +208 -82
- mlrun/db/factory.py +0 -3
- mlrun/db/httpdb.py +1237 -386
- mlrun/db/nopdb.py +201 -74
- mlrun/errors.py +2 -2
- mlrun/execution.py +136 -50
- mlrun/feature_store/__init__.py +0 -2
- mlrun/feature_store/api.py +41 -40
- mlrun/feature_store/common.py +9 -9
- mlrun/feature_store/feature_set.py +20 -18
- mlrun/feature_store/feature_vector.py +27 -24
- mlrun/feature_store/retrieval/base.py +14 -9
- mlrun/feature_store/retrieval/job.py +2 -1
- mlrun/feature_store/steps.py +2 -2
- mlrun/features.py +30 -13
- mlrun/frameworks/__init__.py +1 -2
- mlrun/frameworks/_common/__init__.py +1 -2
- mlrun/frameworks/_common/artifacts_library.py +2 -2
- mlrun/frameworks/_common/mlrun_interface.py +10 -6
- mlrun/frameworks/_common/model_handler.py +29 -27
- mlrun/frameworks/_common/producer.py +3 -1
- mlrun/frameworks/_dl_common/__init__.py +1 -2
- mlrun/frameworks/_dl_common/loggers/__init__.py +1 -2
- mlrun/frameworks/_dl_common/loggers/mlrun_logger.py +4 -4
- mlrun/frameworks/_dl_common/loggers/tensorboard_logger.py +3 -3
- mlrun/frameworks/_ml_common/__init__.py +1 -2
- mlrun/frameworks/_ml_common/loggers/__init__.py +1 -2
- mlrun/frameworks/_ml_common/model_handler.py +21 -21
- mlrun/frameworks/_ml_common/plans/__init__.py +1 -2
- mlrun/frameworks/_ml_common/plans/confusion_matrix_plan.py +3 -1
- mlrun/frameworks/_ml_common/plans/dataset_plan.py +3 -3
- mlrun/frameworks/_ml_common/plans/roc_curve_plan.py +4 -4
- mlrun/frameworks/auto_mlrun/__init__.py +1 -2
- mlrun/frameworks/auto_mlrun/auto_mlrun.py +22 -15
- mlrun/frameworks/huggingface/__init__.py +1 -2
- mlrun/frameworks/huggingface/model_server.py +9 -9
- mlrun/frameworks/lgbm/__init__.py +47 -44
- mlrun/frameworks/lgbm/callbacks/__init__.py +1 -2
- mlrun/frameworks/lgbm/callbacks/logging_callback.py +4 -2
- mlrun/frameworks/lgbm/callbacks/mlrun_logging_callback.py +4 -2
- mlrun/frameworks/lgbm/mlrun_interfaces/__init__.py +1 -2
- mlrun/frameworks/lgbm/mlrun_interfaces/mlrun_interface.py +5 -5
- mlrun/frameworks/lgbm/model_handler.py +15 -11
- mlrun/frameworks/lgbm/model_server.py +11 -7
- mlrun/frameworks/lgbm/utils.py +2 -2
- mlrun/frameworks/onnx/__init__.py +1 -2
- mlrun/frameworks/onnx/dataset.py +3 -3
- mlrun/frameworks/onnx/mlrun_interface.py +2 -2
- mlrun/frameworks/onnx/model_handler.py +7 -5
- mlrun/frameworks/onnx/model_server.py +8 -6
- mlrun/frameworks/parallel_coordinates.py +11 -11
- mlrun/frameworks/pytorch/__init__.py +22 -23
- mlrun/frameworks/pytorch/callbacks/__init__.py +1 -2
- mlrun/frameworks/pytorch/callbacks/callback.py +2 -1
- mlrun/frameworks/pytorch/callbacks/logging_callback.py +15 -8
- mlrun/frameworks/pytorch/callbacks/mlrun_logging_callback.py +19 -12
- mlrun/frameworks/pytorch/callbacks/tensorboard_logging_callback.py +22 -15
- mlrun/frameworks/pytorch/callbacks_handler.py +36 -30
- mlrun/frameworks/pytorch/mlrun_interface.py +17 -17
- mlrun/frameworks/pytorch/model_handler.py +21 -17
- mlrun/frameworks/pytorch/model_server.py +13 -9
- mlrun/frameworks/sklearn/__init__.py +19 -18
- mlrun/frameworks/sklearn/estimator.py +2 -2
- mlrun/frameworks/sklearn/metric.py +3 -3
- mlrun/frameworks/sklearn/metrics_library.py +8 -6
- mlrun/frameworks/sklearn/mlrun_interface.py +3 -2
- mlrun/frameworks/sklearn/model_handler.py +4 -3
- mlrun/frameworks/tf_keras/__init__.py +11 -12
- mlrun/frameworks/tf_keras/callbacks/__init__.py +1 -2
- mlrun/frameworks/tf_keras/callbacks/logging_callback.py +17 -14
- mlrun/frameworks/tf_keras/callbacks/mlrun_logging_callback.py +15 -12
- mlrun/frameworks/tf_keras/callbacks/tensorboard_logging_callback.py +21 -18
- mlrun/frameworks/tf_keras/model_handler.py +17 -13
- mlrun/frameworks/tf_keras/model_server.py +12 -8
- mlrun/frameworks/xgboost/__init__.py +19 -18
- mlrun/frameworks/xgboost/model_handler.py +13 -9
- mlrun/launcher/base.py +3 -4
- mlrun/launcher/local.py +1 -1
- mlrun/launcher/remote.py +1 -1
- mlrun/lists.py +4 -3
- mlrun/model.py +117 -46
- mlrun/model_monitoring/__init__.py +4 -4
- mlrun/model_monitoring/api.py +61 -59
- mlrun/model_monitoring/applications/_application_steps.py +17 -17
- mlrun/model_monitoring/applications/base.py +165 -6
- mlrun/model_monitoring/applications/context.py +88 -37
- mlrun/model_monitoring/applications/evidently_base.py +1 -2
- mlrun/model_monitoring/applications/histogram_data_drift.py +43 -21
- mlrun/model_monitoring/applications/results.py +55 -3
- mlrun/model_monitoring/controller.py +207 -239
- mlrun/model_monitoring/db/__init__.py +0 -2
- mlrun/model_monitoring/db/_schedules.py +156 -0
- mlrun/model_monitoring/db/_stats.py +189 -0
- mlrun/model_monitoring/db/tsdb/base.py +78 -25
- mlrun/model_monitoring/db/tsdb/tdengine/schemas.py +90 -16
- mlrun/model_monitoring/db/tsdb/tdengine/stream_graph_steps.py +33 -0
- mlrun/model_monitoring/db/tsdb/tdengine/tdengine_connector.py +279 -59
- mlrun/model_monitoring/db/tsdb/v3io/stream_graph_steps.py +1 -0
- mlrun/model_monitoring/db/tsdb/v3io/v3io_connector.py +78 -17
- mlrun/model_monitoring/helpers.py +152 -49
- mlrun/model_monitoring/stream_processing.py +99 -283
- mlrun/model_monitoring/tracking_policy.py +10 -3
- mlrun/model_monitoring/writer.py +48 -36
- mlrun/package/__init__.py +3 -6
- mlrun/package/context_handler.py +1 -1
- mlrun/package/packager.py +12 -9
- mlrun/package/packagers/__init__.py +0 -2
- mlrun/package/packagers/default_packager.py +14 -11
- mlrun/package/packagers/numpy_packagers.py +16 -7
- mlrun/package/packagers/pandas_packagers.py +18 -18
- mlrun/package/packagers/python_standard_library_packagers.py +25 -11
- mlrun/package/packagers_manager.py +31 -14
- mlrun/package/utils/__init__.py +0 -3
- mlrun/package/utils/_pickler.py +6 -6
- mlrun/platforms/__init__.py +47 -16
- mlrun/platforms/iguazio.py +4 -1
- mlrun/projects/operations.py +27 -27
- mlrun/projects/pipelines.py +75 -38
- mlrun/projects/project.py +865 -206
- mlrun/run.py +53 -10
- mlrun/runtimes/__init__.py +1 -3
- mlrun/runtimes/base.py +15 -11
- mlrun/runtimes/daskjob.py +9 -9
- mlrun/runtimes/generators.py +2 -1
- mlrun/runtimes/kubejob.py +4 -5
- mlrun/runtimes/mounts.py +572 -0
- mlrun/runtimes/mpijob/__init__.py +0 -2
- mlrun/runtimes/mpijob/abstract.py +7 -6
- mlrun/runtimes/nuclio/api_gateway.py +7 -7
- mlrun/runtimes/nuclio/application/application.py +11 -11
- mlrun/runtimes/nuclio/function.py +19 -17
- mlrun/runtimes/nuclio/serving.py +18 -11
- mlrun/runtimes/pod.py +154 -45
- mlrun/runtimes/remotesparkjob.py +3 -2
- mlrun/runtimes/sparkjob/__init__.py +0 -2
- mlrun/runtimes/sparkjob/spark3job.py +21 -11
- mlrun/runtimes/utils.py +6 -5
- mlrun/serving/merger.py +6 -4
- mlrun/serving/remote.py +18 -17
- mlrun/serving/routers.py +185 -172
- mlrun/serving/server.py +7 -1
- mlrun/serving/states.py +97 -78
- mlrun/serving/utils.py +13 -2
- mlrun/serving/v1_serving.py +3 -2
- mlrun/serving/v2_serving.py +74 -65
- mlrun/track/__init__.py +1 -1
- mlrun/track/tracker.py +2 -2
- mlrun/track/trackers/mlflow_tracker.py +6 -5
- mlrun/utils/async_http.py +1 -1
- mlrun/utils/clones.py +1 -1
- mlrun/utils/helpers.py +66 -18
- mlrun/utils/logger.py +106 -4
- mlrun/utils/notifications/notification/__init__.py +22 -19
- mlrun/utils/notifications/notification/base.py +33 -14
- mlrun/utils/notifications/notification/console.py +6 -6
- mlrun/utils/notifications/notification/git.py +11 -11
- mlrun/utils/notifications/notification/ipython.py +10 -9
- mlrun/utils/notifications/notification/mail.py +176 -0
- mlrun/utils/notifications/notification/slack.py +6 -6
- mlrun/utils/notifications/notification/webhook.py +6 -6
- mlrun/utils/notifications/notification_pusher.py +86 -44
- mlrun/utils/regex.py +3 -1
- mlrun/utils/version/version.json +2 -2
- {mlrun-1.7.1rc4.dist-info → mlrun-1.8.0rc8.dist-info}/METADATA +191 -186
- mlrun-1.8.0rc8.dist-info/RECORD +347 -0
- {mlrun-1.7.1rc4.dist-info → mlrun-1.8.0rc8.dist-info}/WHEEL +1 -1
- mlrun/model_monitoring/db/stores/__init__.py +0 -136
- mlrun/model_monitoring/db/stores/base/store.py +0 -213
- mlrun/model_monitoring/db/stores/sqldb/__init__.py +0 -13
- mlrun/model_monitoring/db/stores/sqldb/models/__init__.py +0 -71
- mlrun/model_monitoring/db/stores/sqldb/models/base.py +0 -190
- mlrun/model_monitoring/db/stores/sqldb/models/mysql.py +0 -103
- mlrun/model_monitoring/db/stores/sqldb/models/sqlite.py +0 -40
- mlrun/model_monitoring/db/stores/sqldb/sql_store.py +0 -659
- mlrun/model_monitoring/db/stores/v3io_kv/__init__.py +0 -13
- mlrun/model_monitoring/db/stores/v3io_kv/kv_store.py +0 -726
- mlrun/model_monitoring/model_endpoint.py +0 -118
- mlrun-1.7.1rc4.dist-info/RECORD +0 -351
- {mlrun-1.7.1rc4.dist-info → mlrun-1.8.0rc8.dist-info}/LICENSE +0 -0
- {mlrun-1.7.1rc4.dist-info → mlrun-1.8.0rc8.dist-info}/entry_points.txt +0 -0
- {mlrun-1.7.1rc4.dist-info → mlrun-1.8.0rc8.dist-info}/top_level.txt +0 -0
|
@@ -14,7 +14,6 @@
|
|
|
14
14
|
|
|
15
15
|
import collections
|
|
16
16
|
import datetime
|
|
17
|
-
import json
|
|
18
17
|
import os
|
|
19
18
|
import typing
|
|
20
19
|
|
|
@@ -30,14 +29,12 @@ import mlrun.model_monitoring.db
|
|
|
30
29
|
import mlrun.serving.states
|
|
31
30
|
import mlrun.utils
|
|
32
31
|
from mlrun.common.schemas.model_monitoring.constants import (
|
|
32
|
+
EndpointType,
|
|
33
33
|
EventFieldType,
|
|
34
|
-
EventKeyMetrics,
|
|
35
|
-
EventLiveStats,
|
|
36
34
|
FileTargetKind,
|
|
37
|
-
ModelEndpointTarget,
|
|
38
35
|
ProjectSecretKeys,
|
|
39
36
|
)
|
|
40
|
-
from mlrun.model_monitoring.db import StoreBase, TSDBConnector
|
|
37
|
+
from mlrun.model_monitoring.db import TSDBConnector
|
|
41
38
|
from mlrun.utils import logger
|
|
42
39
|
|
|
43
40
|
|
|
@@ -51,7 +48,7 @@ class EventStreamProcessor:
|
|
|
51
48
|
parquet_target: str,
|
|
52
49
|
aggregate_windows: typing.Optional[list[str]] = None,
|
|
53
50
|
aggregate_period: str = "5m",
|
|
54
|
-
model_monitoring_access_key: str = None,
|
|
51
|
+
model_monitoring_access_key: typing.Optional[str] = None,
|
|
55
52
|
):
|
|
56
53
|
# General configurations, mainly used for the storey steps in the future serving graph
|
|
57
54
|
self.project = project
|
|
@@ -85,7 +82,7 @@ class EventStreamProcessor:
|
|
|
85
82
|
v3io_access_key: typing.Optional[str] = None,
|
|
86
83
|
v3io_framesd: typing.Optional[str] = None,
|
|
87
84
|
v3io_api: typing.Optional[str] = None,
|
|
88
|
-
model_monitoring_access_key: str = None,
|
|
85
|
+
model_monitoring_access_key: typing.Optional[str] = None,
|
|
89
86
|
):
|
|
90
87
|
# Get the V3IO configurations
|
|
91
88
|
self.v3io_framesd = v3io_framesd or mlrun.mlconf.v3io_framesd
|
|
@@ -101,18 +98,6 @@ class EventStreamProcessor:
|
|
|
101
98
|
v3io_access_key=self.model_monitoring_access_key, v3io_api=self.v3io_api
|
|
102
99
|
)
|
|
103
100
|
|
|
104
|
-
# KV path
|
|
105
|
-
kv_path = mlrun.mlconf.get_model_monitoring_file_target_path(
|
|
106
|
-
project=self.project, kind=FileTargetKind.ENDPOINTS
|
|
107
|
-
)
|
|
108
|
-
(
|
|
109
|
-
_,
|
|
110
|
-
self.kv_container,
|
|
111
|
-
self.kv_path,
|
|
112
|
-
) = mlrun.common.model_monitoring.helpers.parse_model_endpoint_store_prefix(
|
|
113
|
-
kv_path
|
|
114
|
-
)
|
|
115
|
-
|
|
116
101
|
# TSDB path and configurations
|
|
117
102
|
tsdb_path = mlrun.mlconf.get_model_monitoring_file_target_path(
|
|
118
103
|
project=self.project, kind=FileTargetKind.EVENTS
|
|
@@ -133,7 +118,6 @@ class EventStreamProcessor:
|
|
|
133
118
|
self,
|
|
134
119
|
fn: mlrun.runtimes.ServingRuntime,
|
|
135
120
|
tsdb_connector: TSDBConnector,
|
|
136
|
-
endpoint_store: StoreBase,
|
|
137
121
|
) -> None:
|
|
138
122
|
"""
|
|
139
123
|
Apply monitoring serving graph to a given serving function. The following serving graph includes about 4 main
|
|
@@ -162,31 +146,23 @@ class EventStreamProcessor:
|
|
|
162
146
|
|
|
163
147
|
:param fn: A serving function.
|
|
164
148
|
:param tsdb_connector: Time series database connector.
|
|
165
|
-
:param endpoint_store: KV/SQL store used for endpoint data.
|
|
166
149
|
"""
|
|
167
150
|
|
|
168
151
|
graph = typing.cast(
|
|
169
152
|
mlrun.serving.states.RootFlowStep,
|
|
170
153
|
fn.set_topology(mlrun.serving.states.StepKinds.flow),
|
|
171
154
|
)
|
|
172
|
-
graph.add_step(
|
|
173
|
-
"ExtractEndpointID",
|
|
174
|
-
"extract_endpoint",
|
|
175
|
-
full_event=True,
|
|
176
|
-
)
|
|
177
155
|
|
|
178
156
|
# split the graph between event with error vs valid event
|
|
179
157
|
graph.add_step(
|
|
180
158
|
"storey.Filter",
|
|
181
159
|
"FilterError",
|
|
182
|
-
after="extract_endpoint",
|
|
183
160
|
_fn="(event.get('error') is None)",
|
|
184
161
|
)
|
|
185
162
|
|
|
186
163
|
graph.add_step(
|
|
187
164
|
"storey.Filter",
|
|
188
165
|
"ForwardError",
|
|
189
|
-
after="extract_endpoint",
|
|
190
166
|
_fn="(event.get('error') is not None)",
|
|
191
167
|
)
|
|
192
168
|
|
|
@@ -198,7 +174,7 @@ class EventStreamProcessor:
|
|
|
198
174
|
def apply_process_endpoint_event():
|
|
199
175
|
graph.add_step(
|
|
200
176
|
"ProcessEndpointEvent",
|
|
201
|
-
after="extract_endpoint",
|
|
177
|
+
after="FilterError",
|
|
202
178
|
full_event=True,
|
|
203
179
|
project=self.project,
|
|
204
180
|
)
|
|
@@ -233,79 +209,11 @@ class EventStreamProcessor:
|
|
|
233
209
|
)
|
|
234
210
|
|
|
235
211
|
apply_map_feature_names()
|
|
236
|
-
|
|
237
|
-
|
|
238
|
-
|
|
239
|
-
|
|
240
|
-
|
|
241
|
-
class_name="storey.AggregateByKey",
|
|
242
|
-
aggregates=[
|
|
243
|
-
{
|
|
244
|
-
"name": EventFieldType.LATENCY,
|
|
245
|
-
"column": EventFieldType.LATENCY,
|
|
246
|
-
"operations": ["count", "avg"],
|
|
247
|
-
"windows": self.aggregate_windows,
|
|
248
|
-
"period": self.aggregate_period,
|
|
249
|
-
}
|
|
250
|
-
],
|
|
251
|
-
name=EventFieldType.LATENCY,
|
|
252
|
-
after="MapFeatureNames",
|
|
253
|
-
step_name="Aggregates",
|
|
254
|
-
table=".",
|
|
255
|
-
key_field=EventFieldType.ENDPOINT_ID,
|
|
256
|
-
)
|
|
257
|
-
# Calculate average latency time for each window (5 min and 1 hour by default)
|
|
258
|
-
graph.add_step(
|
|
259
|
-
class_name="storey.Rename",
|
|
260
|
-
mapping={
|
|
261
|
-
"latency_count_5m": EventLiveStats.PREDICTIONS_COUNT_5M,
|
|
262
|
-
"latency_count_1h": EventLiveStats.PREDICTIONS_COUNT_1H,
|
|
263
|
-
},
|
|
264
|
-
name="Rename",
|
|
265
|
-
after=EventFieldType.LATENCY,
|
|
266
|
-
)
|
|
267
|
-
|
|
268
|
-
apply_storey_aggregations()
|
|
269
|
-
|
|
270
|
-
# KV/SQL branch
|
|
271
|
-
# Filter relevant keys from the event before writing the data into the database table
|
|
272
|
-
def apply_process_before_endpoint_update():
|
|
273
|
-
graph.add_step(
|
|
274
|
-
"ProcessBeforeEndpointUpdate",
|
|
275
|
-
name="ProcessBeforeEndpointUpdate",
|
|
276
|
-
after="Rename",
|
|
277
|
-
)
|
|
278
|
-
|
|
279
|
-
apply_process_before_endpoint_update()
|
|
280
|
-
|
|
281
|
-
# Write the filtered event to KV/SQL table. At this point, the serving graph updates the stats
|
|
282
|
-
# about average latency and the amount of predictions over time
|
|
283
|
-
def apply_update_endpoint():
|
|
284
|
-
graph.add_step(
|
|
285
|
-
"UpdateEndpoint",
|
|
286
|
-
name="UpdateEndpoint",
|
|
287
|
-
after="ProcessBeforeEndpointUpdate",
|
|
288
|
-
project=self.project,
|
|
289
|
-
)
|
|
290
|
-
|
|
291
|
-
apply_update_endpoint()
|
|
292
|
-
|
|
293
|
-
# (only for V3IO KV target) - Apply infer_schema on the model endpoints table for generating schema file
|
|
294
|
-
# which will be used by Grafana monitoring dashboards
|
|
295
|
-
def apply_infer_schema():
|
|
296
|
-
graph.add_step(
|
|
297
|
-
"InferSchema",
|
|
298
|
-
name="InferSchema",
|
|
299
|
-
after="UpdateEndpoint",
|
|
300
|
-
v3io_framesd=self.v3io_framesd,
|
|
301
|
-
container=self.kv_container,
|
|
302
|
-
table=self.kv_path,
|
|
303
|
-
)
|
|
304
|
-
|
|
305
|
-
if endpoint_store.type == ModelEndpointTarget.V3IO_NOSQL:
|
|
306
|
-
apply_infer_schema()
|
|
307
|
-
|
|
308
|
-
tsdb_connector.apply_monitoring_stream_steps(graph=graph)
|
|
212
|
+
tsdb_connector.apply_monitoring_stream_steps(
|
|
213
|
+
graph=graph,
|
|
214
|
+
aggregate_windows=self.aggregate_windows,
|
|
215
|
+
aggregate_period=self.aggregate_period,
|
|
216
|
+
)
|
|
309
217
|
|
|
310
218
|
# Parquet branch
|
|
311
219
|
# Filter and validate different keys before writing the data to Parquet target
|
|
@@ -341,91 +249,6 @@ class EventStreamProcessor:
|
|
|
341
249
|
apply_parquet_target()
|
|
342
250
|
|
|
343
251
|
|
|
344
|
-
class ProcessBeforeEndpointUpdate(mlrun.feature_store.steps.MapClass):
|
|
345
|
-
def __init__(self, **kwargs):
|
|
346
|
-
"""
|
|
347
|
-
Filter relevant keys from the event before writing the data to database table (in EndpointUpdate step).
|
|
348
|
-
Note that in the endpoint table we only keep metadata (function_uri, model_class, etc.) and stats about the
|
|
349
|
-
average latency and the number of predictions (per 5min and 1hour).
|
|
350
|
-
|
|
351
|
-
:returns: A filtered event as a dictionary which will be written to the endpoint table in the next step.
|
|
352
|
-
"""
|
|
353
|
-
super().__init__(**kwargs)
|
|
354
|
-
|
|
355
|
-
def do(self, event):
|
|
356
|
-
# Compute prediction per second
|
|
357
|
-
event[EventLiveStats.PREDICTIONS_PER_SECOND] = (
|
|
358
|
-
float(event[EventLiveStats.PREDICTIONS_COUNT_5M]) / 300
|
|
359
|
-
)
|
|
360
|
-
# Filter relevant keys
|
|
361
|
-
e = {
|
|
362
|
-
k: event[k]
|
|
363
|
-
for k in [
|
|
364
|
-
EventFieldType.FUNCTION_URI,
|
|
365
|
-
EventFieldType.MODEL,
|
|
366
|
-
EventFieldType.MODEL_CLASS,
|
|
367
|
-
EventFieldType.ENDPOINT_ID,
|
|
368
|
-
EventFieldType.LABELS,
|
|
369
|
-
EventFieldType.FIRST_REQUEST,
|
|
370
|
-
EventFieldType.LAST_REQUEST,
|
|
371
|
-
EventFieldType.ERROR_COUNT,
|
|
372
|
-
]
|
|
373
|
-
}
|
|
374
|
-
|
|
375
|
-
# Add generic metrics statistics
|
|
376
|
-
generic_metrics = {
|
|
377
|
-
k: event[k]
|
|
378
|
-
for k in [
|
|
379
|
-
EventLiveStats.LATENCY_AVG_5M,
|
|
380
|
-
EventLiveStats.LATENCY_AVG_1H,
|
|
381
|
-
EventLiveStats.PREDICTIONS_PER_SECOND,
|
|
382
|
-
EventLiveStats.PREDICTIONS_COUNT_5M,
|
|
383
|
-
EventLiveStats.PREDICTIONS_COUNT_1H,
|
|
384
|
-
]
|
|
385
|
-
}
|
|
386
|
-
|
|
387
|
-
e[EventFieldType.METRICS] = json.dumps(
|
|
388
|
-
{EventKeyMetrics.GENERIC: generic_metrics}
|
|
389
|
-
)
|
|
390
|
-
|
|
391
|
-
# Write labels as json string as required by the DB format
|
|
392
|
-
e[EventFieldType.LABELS] = json.dumps(e[EventFieldType.LABELS])
|
|
393
|
-
|
|
394
|
-
return e
|
|
395
|
-
|
|
396
|
-
|
|
397
|
-
class ExtractEndpointID(mlrun.feature_store.steps.MapClass):
|
|
398
|
-
def __init__(self, **kwargs) -> None:
|
|
399
|
-
"""
|
|
400
|
-
Generate the model endpoint ID based on the event parameters and attach it to the event.
|
|
401
|
-
"""
|
|
402
|
-
super().__init__(**kwargs)
|
|
403
|
-
|
|
404
|
-
def do(self, full_event) -> typing.Union[storey.Event, None]:
|
|
405
|
-
# Getting model version and function uri from event
|
|
406
|
-
# and use them for retrieving the endpoint_id
|
|
407
|
-
function_uri = full_event.body.get(EventFieldType.FUNCTION_URI)
|
|
408
|
-
if not is_not_none(function_uri, [EventFieldType.FUNCTION_URI]):
|
|
409
|
-
return None
|
|
410
|
-
|
|
411
|
-
model = full_event.body.get(EventFieldType.MODEL)
|
|
412
|
-
if not is_not_none(model, [EventFieldType.MODEL]):
|
|
413
|
-
return None
|
|
414
|
-
|
|
415
|
-
version = full_event.body.get(EventFieldType.VERSION)
|
|
416
|
-
versioned_model = f"{model}:{version}" if version else f"{model}:latest"
|
|
417
|
-
|
|
418
|
-
endpoint_id = mlrun.common.model_monitoring.create_model_endpoint_uid(
|
|
419
|
-
function_uri=function_uri,
|
|
420
|
-
versioned_model=versioned_model,
|
|
421
|
-
)
|
|
422
|
-
|
|
423
|
-
endpoint_id = str(endpoint_id)
|
|
424
|
-
full_event.body[EventFieldType.ENDPOINT_ID] = endpoint_id
|
|
425
|
-
full_event.body[EventFieldType.VERSIONED_MODEL] = versioned_model
|
|
426
|
-
return full_event
|
|
427
|
-
|
|
428
|
-
|
|
429
252
|
class ProcessBeforeParquet(mlrun.feature_store.steps.MapClass):
|
|
430
253
|
def __init__(self, **kwargs):
|
|
431
254
|
"""
|
|
@@ -498,20 +321,27 @@ class ProcessEndpointEvent(mlrun.feature_store.steps.MapClass):
|
|
|
498
321
|
|
|
499
322
|
def do(self, full_event):
|
|
500
323
|
event = full_event.body
|
|
324
|
+
# Getting model version and function uri from event
|
|
325
|
+
# and use them for retrieving the endpoint_id
|
|
326
|
+
function_uri = full_event.body.get(EventFieldType.FUNCTION_URI)
|
|
327
|
+
if not is_not_none(function_uri, [EventFieldType.FUNCTION_URI]):
|
|
328
|
+
return None
|
|
329
|
+
|
|
330
|
+
model = full_event.body.get(EventFieldType.MODEL)
|
|
331
|
+
if not is_not_none(model, [EventFieldType.MODEL]):
|
|
332
|
+
return None
|
|
501
333
|
|
|
502
|
-
|
|
334
|
+
version = full_event.body.get(EventFieldType.VERSION)
|
|
335
|
+
versioned_model = f"{model}:{version}" if version else f"{model}:latest"
|
|
336
|
+
|
|
337
|
+
full_event.body[EventFieldType.VERSIONED_MODEL] = versioned_model
|
|
503
338
|
endpoint_id = event[EventFieldType.ENDPOINT_ID]
|
|
504
|
-
function_uri = event[EventFieldType.FUNCTION_URI]
|
|
505
339
|
|
|
506
340
|
# In case this process fails, resume state from existing record
|
|
507
|
-
self.resume_state(
|
|
508
|
-
|
|
509
|
-
|
|
510
|
-
|
|
511
|
-
error = event.get("error")
|
|
512
|
-
if error: # TODO: delete this in ML-7456
|
|
513
|
-
self.error_count[endpoint_id] += 1
|
|
514
|
-
raise mlrun.errors.MLRunInvalidArgumentError(str(error))
|
|
341
|
+
self.resume_state(
|
|
342
|
+
endpoint_id,
|
|
343
|
+
full_event.body.get(EventFieldType.MODEL),
|
|
344
|
+
)
|
|
515
345
|
|
|
516
346
|
# Validate event fields
|
|
517
347
|
model_class = event.get("model_class") or event.get("class")
|
|
@@ -535,11 +365,6 @@ class ProcessEndpointEvent(mlrun.feature_store.steps.MapClass):
|
|
|
535
365
|
# Set time for the first request of the current endpoint
|
|
536
366
|
self.first_request[endpoint_id] = timestamp
|
|
537
367
|
|
|
538
|
-
# Validate that the request time of the current event is later than the previous request time
|
|
539
|
-
self._validate_last_request_timestamp(
|
|
540
|
-
endpoint_id=endpoint_id, timestamp=timestamp
|
|
541
|
-
)
|
|
542
|
-
|
|
543
368
|
# Set time for the last reqeust of the current endpoint
|
|
544
369
|
self.last_request[endpoint_id] = timestamp
|
|
545
370
|
|
|
@@ -609,6 +434,7 @@ class ProcessEndpointEvent(mlrun.feature_store.steps.MapClass):
|
|
|
609
434
|
{
|
|
610
435
|
EventFieldType.FUNCTION_URI: function_uri,
|
|
611
436
|
EventFieldType.MODEL: versioned_model,
|
|
437
|
+
EventFieldType.ENDPOINT_NAME: event.get(EventFieldType.MODEL),
|
|
612
438
|
EventFieldType.MODEL_CLASS: model_class,
|
|
613
439
|
EventFieldType.TIMESTAMP: timestamp,
|
|
614
440
|
EventFieldType.ENDPOINT_ID: endpoint_id,
|
|
@@ -635,33 +461,19 @@ class ProcessEndpointEvent(mlrun.feature_store.steps.MapClass):
|
|
|
635
461
|
storey_event = storey.Event(body=events, key=endpoint_id)
|
|
636
462
|
return storey_event
|
|
637
463
|
|
|
638
|
-
def _validate_last_request_timestamp(self, endpoint_id: str, timestamp: str):
|
|
639
|
-
"""Validate that the request time of the current event is later than the previous request time that has
|
|
640
|
-
already been processed.
|
|
641
|
-
|
|
642
|
-
:param endpoint_id: The unique id of the model endpoint.
|
|
643
|
-
:param timestamp: Event request time as a string.
|
|
644
|
-
|
|
645
|
-
:raise MLRunPreconditionFailedError: If the request time of the current is later than the previous request time.
|
|
646
|
-
"""
|
|
647
|
-
|
|
648
|
-
if (
|
|
649
|
-
endpoint_id in self.last_request
|
|
650
|
-
and self.last_request[endpoint_id] > timestamp
|
|
651
|
-
):
|
|
652
|
-
logger.error(
|
|
653
|
-
f"current event request time {timestamp} is earlier than the last request time "
|
|
654
|
-
f"{self.last_request[endpoint_id]} - write to TSDB will be rejected"
|
|
655
|
-
)
|
|
656
|
-
|
|
657
|
-
def resume_state(self, endpoint_id):
|
|
464
|
+
def resume_state(self, endpoint_id, endpoint_name):
|
|
658
465
|
# Make sure process is resumable, if process fails for any reason, be able to pick things up close to where we
|
|
659
466
|
# left them
|
|
660
467
|
if endpoint_id not in self.endpoints:
|
|
661
468
|
logger.info("Trying to resume state", endpoint_id=endpoint_id)
|
|
662
|
-
endpoint_record =
|
|
663
|
-
|
|
664
|
-
|
|
469
|
+
endpoint_record = (
|
|
470
|
+
mlrun.db.get_run_db()
|
|
471
|
+
.get_model_endpoint(
|
|
472
|
+
project=self.project,
|
|
473
|
+
endpoint_id=endpoint_id,
|
|
474
|
+
name=endpoint_name,
|
|
475
|
+
)
|
|
476
|
+
.flat_dict()
|
|
665
477
|
)
|
|
666
478
|
|
|
667
479
|
# If model endpoint found, get first_request, last_request and error_count values
|
|
@@ -735,6 +547,7 @@ class MapFeatureNames(mlrun.feature_store.steps.MapClass):
|
|
|
735
547
|
# and labels columns were not found in the current event
|
|
736
548
|
self.feature_names = {}
|
|
737
549
|
self.label_columns = {}
|
|
550
|
+
self.first_request = {}
|
|
738
551
|
|
|
739
552
|
# Dictionary to manage the model endpoint types - important for the V3IO TSDB
|
|
740
553
|
self.endpoint_type = {}
|
|
@@ -766,23 +579,29 @@ class MapFeatureNames(mlrun.feature_store.steps.MapClass):
|
|
|
766
579
|
if isinstance(feature_value, int):
|
|
767
580
|
feature_values[index] = float(feature_value)
|
|
768
581
|
|
|
582
|
+
attributes_to_update = {}
|
|
583
|
+
endpoint_record = None
|
|
769
584
|
# Get feature names and label columns
|
|
770
585
|
if endpoint_id not in self.feature_names:
|
|
771
|
-
endpoint_record =
|
|
772
|
-
|
|
773
|
-
|
|
586
|
+
endpoint_record = (
|
|
587
|
+
mlrun.db.get_run_db()
|
|
588
|
+
.get_model_endpoint(
|
|
589
|
+
project=self.project,
|
|
590
|
+
endpoint_id=endpoint_id,
|
|
591
|
+
name=event[EventFieldType.ENDPOINT_NAME],
|
|
592
|
+
)
|
|
593
|
+
.flat_dict()
|
|
774
594
|
)
|
|
775
595
|
feature_names = endpoint_record.get(EventFieldType.FEATURE_NAMES)
|
|
776
|
-
feature_names = json.loads(feature_names) if feature_names else None
|
|
777
596
|
|
|
778
597
|
label_columns = endpoint_record.get(EventFieldType.LABEL_NAMES)
|
|
779
|
-
label_columns = json.loads(label_columns) if label_columns else None
|
|
780
598
|
|
|
781
599
|
# If feature names were not found,
|
|
782
600
|
# try to retrieve them from the previous events of the current process
|
|
783
601
|
if not feature_names and self._infer_columns_from_data:
|
|
784
602
|
feature_names = self._infer_feature_names_from_data(event)
|
|
785
603
|
|
|
604
|
+
endpoint_type = int(endpoint_record.get(EventFieldType.ENDPOINT_TYPE))
|
|
786
605
|
if not feature_names:
|
|
787
606
|
logger.warn(
|
|
788
607
|
"Feature names are not initialized, they will be automatically generated",
|
|
@@ -793,19 +612,14 @@ class MapFeatureNames(mlrun.feature_store.steps.MapClass):
|
|
|
793
612
|
]
|
|
794
613
|
|
|
795
614
|
# Update the endpoint record with the generated features
|
|
796
|
-
|
|
797
|
-
project=self.project,
|
|
798
|
-
endpoint_id=endpoint_id,
|
|
799
|
-
attributes={
|
|
800
|
-
EventFieldType.FEATURE_NAMES: json.dumps(feature_names)
|
|
801
|
-
},
|
|
802
|
-
)
|
|
615
|
+
attributes_to_update[EventFieldType.FEATURE_NAMES] = feature_names
|
|
803
616
|
|
|
804
|
-
|
|
805
|
-
|
|
806
|
-
|
|
807
|
-
|
|
808
|
-
|
|
617
|
+
if endpoint_type != EndpointType.ROUTER.value:
|
|
618
|
+
update_monitoring_feature_set(
|
|
619
|
+
endpoint_record=endpoint_record,
|
|
620
|
+
feature_names=feature_names,
|
|
621
|
+
feature_values=feature_values,
|
|
622
|
+
)
|
|
809
623
|
|
|
810
624
|
# Similar process with label columns
|
|
811
625
|
if not label_columns and self._infer_columns_from_data:
|
|
@@ -819,17 +633,13 @@ class MapFeatureNames(mlrun.feature_store.steps.MapClass):
|
|
|
819
633
|
label_columns = [
|
|
820
634
|
f"p{i}" for i, _ in enumerate(event[EventFieldType.PREDICTION])
|
|
821
635
|
]
|
|
822
|
-
|
|
823
|
-
|
|
824
|
-
|
|
825
|
-
|
|
826
|
-
|
|
827
|
-
|
|
828
|
-
|
|
829
|
-
endpoint_record=endpoint_record,
|
|
830
|
-
feature_names=label_columns,
|
|
831
|
-
feature_values=label_values,
|
|
832
|
-
)
|
|
636
|
+
attributes_to_update[EventFieldType.LABEL_NAMES] = label_columns
|
|
637
|
+
if endpoint_type != EndpointType.ROUTER.value:
|
|
638
|
+
update_monitoring_feature_set(
|
|
639
|
+
endpoint_record=endpoint_record,
|
|
640
|
+
feature_names=label_columns,
|
|
641
|
+
feature_values=label_values,
|
|
642
|
+
)
|
|
833
643
|
|
|
834
644
|
self.label_columns[endpoint_id] = label_columns
|
|
835
645
|
self.feature_names[endpoint_id] = feature_names
|
|
@@ -842,9 +652,39 @@ class MapFeatureNames(mlrun.feature_store.steps.MapClass):
|
|
|
842
652
|
)
|
|
843
653
|
|
|
844
654
|
# Update the endpoint type within the endpoint types dictionary
|
|
845
|
-
endpoint_type = int(endpoint_record.get(EventFieldType.ENDPOINT_TYPE))
|
|
846
655
|
self.endpoint_type[endpoint_id] = endpoint_type
|
|
847
656
|
|
|
657
|
+
# Update the first request time in the endpoint record
|
|
658
|
+
if endpoint_id not in self.first_request:
|
|
659
|
+
endpoint_record = endpoint_record or (
|
|
660
|
+
mlrun.db.get_run_db()
|
|
661
|
+
.get_model_endpoint(
|
|
662
|
+
project=self.project,
|
|
663
|
+
endpoint_id=endpoint_id,
|
|
664
|
+
name=event[EventFieldType.ENDPOINT_NAME],
|
|
665
|
+
)
|
|
666
|
+
.flat_dict()
|
|
667
|
+
)
|
|
668
|
+
if not endpoint_record.get(EventFieldType.FIRST_REQUEST):
|
|
669
|
+
attributes_to_update[EventFieldType.FIRST_REQUEST] = (
|
|
670
|
+
mlrun.utils.enrich_datetime_with_tz_info(
|
|
671
|
+
event[EventFieldType.FIRST_REQUEST]
|
|
672
|
+
)
|
|
673
|
+
)
|
|
674
|
+
self.first_request[endpoint_id] = True
|
|
675
|
+
if attributes_to_update:
|
|
676
|
+
logger.info(
|
|
677
|
+
"Updating endpoint record",
|
|
678
|
+
endpoint_id=endpoint_id,
|
|
679
|
+
attributes=attributes_to_update,
|
|
680
|
+
)
|
|
681
|
+
update_endpoint_record(
|
|
682
|
+
project=self.project,
|
|
683
|
+
endpoint_id=endpoint_id,
|
|
684
|
+
attributes=attributes_to_update,
|
|
685
|
+
endpoint_name=event[EventFieldType.ENDPOINT_NAME],
|
|
686
|
+
)
|
|
687
|
+
|
|
848
688
|
# Add feature_name:value pairs along with a mapping dictionary of all of these pairs
|
|
849
689
|
feature_names = self.feature_names[endpoint_id]
|
|
850
690
|
self._map_dictionary_values(
|
|
@@ -895,30 +735,6 @@ class MapFeatureNames(mlrun.feature_store.steps.MapClass):
|
|
|
895
735
|
event[mapping_dictionary][name] = value
|
|
896
736
|
|
|
897
737
|
|
|
898
|
-
class UpdateEndpoint(mlrun.feature_store.steps.MapClass):
|
|
899
|
-
def __init__(self, project: str, **kwargs):
|
|
900
|
-
"""
|
|
901
|
-
Update the model endpoint record in the DB. Note that the event at this point includes metadata and stats about
|
|
902
|
-
the average latency and the amount of predictions over time. This data will be used in the monitoring dashboards
|
|
903
|
-
such as "Model Monitoring - Performance" which can be found in Grafana.
|
|
904
|
-
|
|
905
|
-
:returns: Event as a dictionary (without any changes) for the next step (InferSchema).
|
|
906
|
-
"""
|
|
907
|
-
super().__init__(**kwargs)
|
|
908
|
-
self.project = project
|
|
909
|
-
|
|
910
|
-
def do(self, event: dict):
|
|
911
|
-
# Remove labels from the event
|
|
912
|
-
event.pop(EventFieldType.LABELS)
|
|
913
|
-
|
|
914
|
-
update_endpoint_record(
|
|
915
|
-
project=self.project,
|
|
916
|
-
endpoint_id=event.pop(EventFieldType.ENDPOINT_ID),
|
|
917
|
-
attributes=event,
|
|
918
|
-
)
|
|
919
|
-
return event
|
|
920
|
-
|
|
921
|
-
|
|
922
738
|
class InferSchema(mlrun.feature_store.steps.MapClass):
|
|
923
739
|
def __init__(
|
|
924
740
|
self,
|
|
@@ -963,14 +779,14 @@ class InferSchema(mlrun.feature_store.steps.MapClass):
|
|
|
963
779
|
def update_endpoint_record(
|
|
964
780
|
project: str,
|
|
965
781
|
endpoint_id: str,
|
|
782
|
+
endpoint_name: str,
|
|
966
783
|
attributes: dict,
|
|
967
784
|
):
|
|
968
|
-
|
|
785
|
+
mlrun.db.get_run_db().patch_model_endpoint(
|
|
969
786
|
project=project,
|
|
970
|
-
|
|
971
|
-
|
|
972
|
-
|
|
973
|
-
endpoint_id=endpoint_id, attributes=attributes
|
|
787
|
+
endpoint_id=endpoint_id,
|
|
788
|
+
attributes=attributes,
|
|
789
|
+
name=endpoint_name,
|
|
974
790
|
)
|
|
975
791
|
|
|
976
792
|
|
|
@@ -13,7 +13,7 @@
|
|
|
13
13
|
# limitations under the License.
|
|
14
14
|
|
|
15
15
|
import warnings
|
|
16
|
-
from typing import Union
|
|
16
|
+
from typing import Optional, Union
|
|
17
17
|
|
|
18
18
|
import mlrun.common.schemas.schedule
|
|
19
19
|
import mlrun.model
|
|
@@ -74,7 +74,9 @@ class TrackingPolicy(mlrun.model.ModelObj):
|
|
|
74
74
|
self.default_controller_image = default_controller_image
|
|
75
75
|
|
|
76
76
|
@classmethod
|
|
77
|
-
def from_dict(
|
|
77
|
+
def from_dict(
|
|
78
|
+
cls, struct=None, fields=None, deprecated_fields: Optional[dict] = None
|
|
79
|
+
):
|
|
78
80
|
new_obj = super().from_dict(
|
|
79
81
|
struct, fields=cls._dict_fields, deprecated_fields=deprecated_fields
|
|
80
82
|
)
|
|
@@ -102,7 +104,12 @@ class TrackingPolicy(mlrun.model.ModelObj):
|
|
|
102
104
|
)
|
|
103
105
|
return new_obj
|
|
104
106
|
|
|
105
|
-
def to_dict(
|
|
107
|
+
def to_dict(
|
|
108
|
+
self,
|
|
109
|
+
fields: Optional[list] = None,
|
|
110
|
+
exclude: Optional[list] = None,
|
|
111
|
+
strip: bool = False,
|
|
112
|
+
):
|
|
106
113
|
struct = super().to_dict(
|
|
107
114
|
fields,
|
|
108
115
|
exclude=[
|