mlrun 1.7.2rc3__py3-none-any.whl → 1.8.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mlrun/__init__.py +26 -22
- mlrun/__main__.py +15 -16
- mlrun/alerts/alert.py +150 -15
- mlrun/api/schemas/__init__.py +1 -9
- mlrun/artifacts/__init__.py +2 -3
- mlrun/artifacts/base.py +62 -19
- mlrun/artifacts/dataset.py +17 -17
- mlrun/artifacts/document.py +454 -0
- mlrun/artifacts/manager.py +28 -18
- mlrun/artifacts/model.py +91 -59
- mlrun/artifacts/plots.py +2 -2
- mlrun/common/constants.py +8 -0
- mlrun/common/formatters/__init__.py +1 -0
- mlrun/common/formatters/artifact.py +1 -1
- mlrun/common/formatters/feature_set.py +2 -0
- mlrun/common/formatters/function.py +1 -0
- mlrun/{model_monitoring/db/stores/v3io_kv/__init__.py → common/formatters/model_endpoint.py} +17 -0
- mlrun/common/formatters/pipeline.py +1 -2
- mlrun/common/formatters/project.py +9 -0
- mlrun/common/model_monitoring/__init__.py +0 -5
- mlrun/common/model_monitoring/helpers.py +12 -62
- mlrun/common/runtimes/constants.py +25 -4
- mlrun/common/schemas/__init__.py +9 -5
- mlrun/common/schemas/alert.py +114 -19
- mlrun/common/schemas/api_gateway.py +3 -3
- mlrun/common/schemas/artifact.py +22 -9
- mlrun/common/schemas/auth.py +8 -4
- mlrun/common/schemas/background_task.py +7 -7
- mlrun/common/schemas/client_spec.py +4 -4
- mlrun/common/schemas/clusterization_spec.py +2 -2
- mlrun/common/schemas/common.py +53 -3
- mlrun/common/schemas/constants.py +15 -0
- mlrun/common/schemas/datastore_profile.py +1 -1
- mlrun/common/schemas/feature_store.py +9 -9
- mlrun/common/schemas/frontend_spec.py +4 -4
- mlrun/common/schemas/function.py +10 -10
- mlrun/common/schemas/hub.py +1 -1
- mlrun/common/schemas/k8s.py +3 -3
- mlrun/common/schemas/memory_reports.py +3 -3
- mlrun/common/schemas/model_monitoring/__init__.py +4 -8
- mlrun/common/schemas/model_monitoring/constants.py +127 -46
- mlrun/common/schemas/model_monitoring/grafana.py +18 -12
- mlrun/common/schemas/model_monitoring/model_endpoints.py +154 -160
- mlrun/common/schemas/notification.py +24 -3
- mlrun/common/schemas/object.py +1 -1
- mlrun/common/schemas/pagination.py +4 -4
- mlrun/common/schemas/partition.py +142 -0
- mlrun/common/schemas/pipeline.py +3 -3
- mlrun/common/schemas/project.py +26 -18
- mlrun/common/schemas/runs.py +3 -3
- mlrun/common/schemas/runtime_resource.py +5 -5
- mlrun/common/schemas/schedule.py +1 -1
- mlrun/common/schemas/secret.py +1 -1
- mlrun/{model_monitoring/db/stores/sqldb/__init__.py → common/schemas/serving.py} +10 -1
- mlrun/common/schemas/tag.py +3 -3
- mlrun/common/schemas/workflow.py +6 -5
- mlrun/common/types.py +1 -0
- mlrun/config.py +157 -89
- mlrun/data_types/__init__.py +5 -3
- mlrun/data_types/infer.py +13 -3
- mlrun/data_types/spark.py +2 -1
- mlrun/datastore/__init__.py +59 -18
- mlrun/datastore/alibaba_oss.py +4 -1
- mlrun/datastore/azure_blob.py +4 -1
- mlrun/datastore/base.py +19 -24
- mlrun/datastore/datastore.py +10 -4
- mlrun/datastore/datastore_profile.py +178 -45
- mlrun/datastore/dbfs_store.py +4 -1
- mlrun/datastore/filestore.py +4 -1
- mlrun/datastore/google_cloud_storage.py +4 -1
- mlrun/datastore/hdfs.py +4 -1
- mlrun/datastore/inmem.py +4 -1
- mlrun/datastore/redis.py +4 -1
- mlrun/datastore/s3.py +14 -3
- mlrun/datastore/sources.py +89 -92
- mlrun/datastore/store_resources.py +7 -4
- mlrun/datastore/storeytargets.py +51 -16
- mlrun/datastore/targets.py +38 -31
- mlrun/datastore/utils.py +87 -4
- mlrun/datastore/v3io.py +4 -1
- mlrun/datastore/vectorstore.py +291 -0
- mlrun/datastore/wasbfs/fs.py +13 -12
- mlrun/db/base.py +286 -100
- mlrun/db/httpdb.py +1562 -490
- mlrun/db/nopdb.py +250 -83
- mlrun/errors.py +6 -2
- mlrun/execution.py +194 -50
- mlrun/feature_store/__init__.py +2 -10
- mlrun/feature_store/api.py +20 -458
- mlrun/feature_store/common.py +9 -9
- mlrun/feature_store/feature_set.py +20 -18
- mlrun/feature_store/feature_vector.py +105 -479
- mlrun/feature_store/feature_vector_utils.py +466 -0
- mlrun/feature_store/retrieval/base.py +15 -11
- mlrun/feature_store/retrieval/job.py +2 -1
- mlrun/feature_store/retrieval/storey_merger.py +1 -1
- mlrun/feature_store/steps.py +3 -3
- mlrun/features.py +30 -13
- mlrun/frameworks/__init__.py +1 -2
- mlrun/frameworks/_common/__init__.py +1 -2
- mlrun/frameworks/_common/artifacts_library.py +2 -2
- mlrun/frameworks/_common/mlrun_interface.py +10 -6
- mlrun/frameworks/_common/model_handler.py +31 -31
- mlrun/frameworks/_common/producer.py +3 -1
- mlrun/frameworks/_dl_common/__init__.py +1 -2
- mlrun/frameworks/_dl_common/loggers/__init__.py +1 -2
- mlrun/frameworks/_dl_common/loggers/mlrun_logger.py +4 -4
- mlrun/frameworks/_dl_common/loggers/tensorboard_logger.py +3 -3
- mlrun/frameworks/_ml_common/__init__.py +1 -2
- mlrun/frameworks/_ml_common/loggers/__init__.py +1 -2
- mlrun/frameworks/_ml_common/model_handler.py +21 -21
- mlrun/frameworks/_ml_common/plans/__init__.py +1 -2
- mlrun/frameworks/_ml_common/plans/confusion_matrix_plan.py +3 -1
- mlrun/frameworks/_ml_common/plans/dataset_plan.py +3 -3
- mlrun/frameworks/_ml_common/plans/roc_curve_plan.py +4 -4
- mlrun/frameworks/auto_mlrun/__init__.py +1 -2
- mlrun/frameworks/auto_mlrun/auto_mlrun.py +22 -15
- mlrun/frameworks/huggingface/__init__.py +1 -2
- mlrun/frameworks/huggingface/model_server.py +9 -9
- mlrun/frameworks/lgbm/__init__.py +47 -44
- mlrun/frameworks/lgbm/callbacks/__init__.py +1 -2
- mlrun/frameworks/lgbm/callbacks/logging_callback.py +4 -2
- mlrun/frameworks/lgbm/callbacks/mlrun_logging_callback.py +4 -2
- mlrun/frameworks/lgbm/mlrun_interfaces/__init__.py +1 -2
- mlrun/frameworks/lgbm/mlrun_interfaces/mlrun_interface.py +5 -5
- mlrun/frameworks/lgbm/model_handler.py +15 -11
- mlrun/frameworks/lgbm/model_server.py +11 -7
- mlrun/frameworks/lgbm/utils.py +2 -2
- mlrun/frameworks/onnx/__init__.py +1 -2
- mlrun/frameworks/onnx/dataset.py +3 -3
- mlrun/frameworks/onnx/mlrun_interface.py +2 -2
- mlrun/frameworks/onnx/model_handler.py +7 -5
- mlrun/frameworks/onnx/model_server.py +8 -6
- mlrun/frameworks/parallel_coordinates.py +11 -11
- mlrun/frameworks/pytorch/__init__.py +22 -23
- mlrun/frameworks/pytorch/callbacks/__init__.py +1 -2
- mlrun/frameworks/pytorch/callbacks/callback.py +2 -1
- mlrun/frameworks/pytorch/callbacks/logging_callback.py +15 -8
- mlrun/frameworks/pytorch/callbacks/mlrun_logging_callback.py +19 -12
- mlrun/frameworks/pytorch/callbacks/tensorboard_logging_callback.py +22 -15
- mlrun/frameworks/pytorch/callbacks_handler.py +36 -30
- mlrun/frameworks/pytorch/mlrun_interface.py +17 -17
- mlrun/frameworks/pytorch/model_handler.py +21 -17
- mlrun/frameworks/pytorch/model_server.py +13 -9
- mlrun/frameworks/sklearn/__init__.py +19 -18
- mlrun/frameworks/sklearn/estimator.py +2 -2
- mlrun/frameworks/sklearn/metric.py +3 -3
- mlrun/frameworks/sklearn/metrics_library.py +8 -6
- mlrun/frameworks/sklearn/mlrun_interface.py +3 -2
- mlrun/frameworks/sklearn/model_handler.py +4 -3
- mlrun/frameworks/tf_keras/__init__.py +11 -12
- mlrun/frameworks/tf_keras/callbacks/__init__.py +1 -2
- mlrun/frameworks/tf_keras/callbacks/logging_callback.py +17 -14
- mlrun/frameworks/tf_keras/callbacks/mlrun_logging_callback.py +15 -12
- mlrun/frameworks/tf_keras/callbacks/tensorboard_logging_callback.py +21 -18
- mlrun/frameworks/tf_keras/model_handler.py +17 -13
- mlrun/frameworks/tf_keras/model_server.py +12 -8
- mlrun/frameworks/xgboost/__init__.py +19 -18
- mlrun/frameworks/xgboost/model_handler.py +13 -9
- mlrun/k8s_utils.py +2 -5
- mlrun/launcher/base.py +3 -4
- mlrun/launcher/client.py +2 -2
- mlrun/launcher/local.py +6 -2
- mlrun/launcher/remote.py +1 -1
- mlrun/lists.py +8 -4
- mlrun/model.py +132 -46
- mlrun/model_monitoring/__init__.py +3 -5
- mlrun/model_monitoring/api.py +113 -98
- mlrun/model_monitoring/applications/__init__.py +0 -5
- mlrun/model_monitoring/applications/_application_steps.py +81 -50
- mlrun/model_monitoring/applications/base.py +467 -14
- mlrun/model_monitoring/applications/context.py +212 -134
- mlrun/model_monitoring/{db/stores/base → applications/evidently}/__init__.py +6 -2
- mlrun/model_monitoring/applications/evidently/base.py +146 -0
- mlrun/model_monitoring/applications/histogram_data_drift.py +89 -56
- mlrun/model_monitoring/applications/results.py +67 -15
- mlrun/model_monitoring/controller.py +701 -315
- mlrun/model_monitoring/db/__init__.py +0 -2
- mlrun/model_monitoring/db/_schedules.py +242 -0
- mlrun/model_monitoring/db/_stats.py +189 -0
- mlrun/model_monitoring/db/tsdb/__init__.py +33 -22
- mlrun/model_monitoring/db/tsdb/base.py +243 -49
- mlrun/model_monitoring/db/tsdb/tdengine/schemas.py +76 -36
- mlrun/model_monitoring/db/tsdb/tdengine/stream_graph_steps.py +33 -0
- mlrun/model_monitoring/db/tsdb/tdengine/tdengine_connection.py +213 -0
- mlrun/model_monitoring/db/tsdb/tdengine/tdengine_connector.py +534 -88
- mlrun/model_monitoring/db/tsdb/v3io/stream_graph_steps.py +1 -0
- mlrun/model_monitoring/db/tsdb/v3io/v3io_connector.py +436 -106
- mlrun/model_monitoring/helpers.py +356 -114
- mlrun/model_monitoring/stream_processing.py +190 -345
- mlrun/model_monitoring/tracking_policy.py +11 -4
- mlrun/model_monitoring/writer.py +49 -90
- mlrun/package/__init__.py +3 -6
- mlrun/package/context_handler.py +2 -2
- mlrun/package/packager.py +12 -9
- mlrun/package/packagers/__init__.py +0 -2
- mlrun/package/packagers/default_packager.py +14 -11
- mlrun/package/packagers/numpy_packagers.py +16 -7
- mlrun/package/packagers/pandas_packagers.py +18 -18
- mlrun/package/packagers/python_standard_library_packagers.py +25 -11
- mlrun/package/packagers_manager.py +35 -32
- mlrun/package/utils/__init__.py +0 -3
- mlrun/package/utils/_pickler.py +6 -6
- mlrun/platforms/__init__.py +47 -16
- mlrun/platforms/iguazio.py +4 -1
- mlrun/projects/operations.py +30 -30
- mlrun/projects/pipelines.py +116 -47
- mlrun/projects/project.py +1292 -329
- mlrun/render.py +5 -9
- mlrun/run.py +57 -14
- mlrun/runtimes/__init__.py +1 -3
- mlrun/runtimes/base.py +30 -22
- mlrun/runtimes/daskjob.py +9 -9
- mlrun/runtimes/databricks_job/databricks_runtime.py +6 -5
- mlrun/runtimes/function_reference.py +5 -2
- mlrun/runtimes/generators.py +3 -2
- mlrun/runtimes/kubejob.py +6 -7
- mlrun/runtimes/mounts.py +574 -0
- mlrun/runtimes/mpijob/__init__.py +0 -2
- mlrun/runtimes/mpijob/abstract.py +7 -6
- mlrun/runtimes/nuclio/api_gateway.py +7 -7
- mlrun/runtimes/nuclio/application/application.py +11 -13
- mlrun/runtimes/nuclio/application/reverse_proxy.go +66 -64
- mlrun/runtimes/nuclio/function.py +127 -70
- mlrun/runtimes/nuclio/serving.py +105 -37
- mlrun/runtimes/pod.py +159 -54
- mlrun/runtimes/remotesparkjob.py +3 -2
- mlrun/runtimes/sparkjob/__init__.py +0 -2
- mlrun/runtimes/sparkjob/spark3job.py +22 -12
- mlrun/runtimes/utils.py +7 -6
- mlrun/secrets.py +2 -2
- mlrun/serving/__init__.py +8 -0
- mlrun/serving/merger.py +7 -5
- mlrun/serving/remote.py +35 -22
- mlrun/serving/routers.py +186 -240
- mlrun/serving/server.py +41 -10
- mlrun/serving/states.py +432 -118
- mlrun/serving/utils.py +13 -2
- mlrun/serving/v1_serving.py +3 -2
- mlrun/serving/v2_serving.py +161 -203
- mlrun/track/__init__.py +1 -1
- mlrun/track/tracker.py +2 -2
- mlrun/track/trackers/mlflow_tracker.py +6 -5
- mlrun/utils/async_http.py +35 -22
- mlrun/utils/clones.py +7 -4
- mlrun/utils/helpers.py +511 -58
- mlrun/utils/logger.py +119 -13
- mlrun/utils/notifications/notification/__init__.py +22 -19
- mlrun/utils/notifications/notification/base.py +39 -15
- mlrun/utils/notifications/notification/console.py +6 -6
- mlrun/utils/notifications/notification/git.py +11 -11
- mlrun/utils/notifications/notification/ipython.py +10 -9
- mlrun/utils/notifications/notification/mail.py +176 -0
- mlrun/utils/notifications/notification/slack.py +16 -8
- mlrun/utils/notifications/notification/webhook.py +24 -8
- mlrun/utils/notifications/notification_pusher.py +191 -200
- mlrun/utils/regex.py +12 -2
- mlrun/utils/version/version.json +2 -2
- {mlrun-1.7.2rc3.dist-info → mlrun-1.8.0.dist-info}/METADATA +81 -54
- mlrun-1.8.0.dist-info/RECORD +351 -0
- {mlrun-1.7.2rc3.dist-info → mlrun-1.8.0.dist-info}/WHEEL +1 -1
- mlrun/model_monitoring/applications/evidently_base.py +0 -137
- mlrun/model_monitoring/db/stores/__init__.py +0 -136
- mlrun/model_monitoring/db/stores/base/store.py +0 -213
- mlrun/model_monitoring/db/stores/sqldb/models/__init__.py +0 -71
- mlrun/model_monitoring/db/stores/sqldb/models/base.py +0 -190
- mlrun/model_monitoring/db/stores/sqldb/models/mysql.py +0 -103
- mlrun/model_monitoring/db/stores/sqldb/models/sqlite.py +0 -40
- mlrun/model_monitoring/db/stores/sqldb/sql_store.py +0 -659
- mlrun/model_monitoring/db/stores/v3io_kv/kv_store.py +0 -726
- mlrun/model_monitoring/model_endpoint.py +0 -118
- mlrun-1.7.2rc3.dist-info/RECORD +0 -351
- {mlrun-1.7.2rc3.dist-info → mlrun-1.8.0.dist-info}/entry_points.txt +0 -0
- {mlrun-1.7.2rc3.dist-info → mlrun-1.8.0.dist-info/licenses}/LICENSE +0 -0
- {mlrun-1.7.2rc3.dist-info → mlrun-1.8.0.dist-info}/top_level.txt +0 -0
mlrun/model_monitoring/stream_processing.py

@@ -12,32 +12,24 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import collections
 import datetime
-import json
-import os
 import typing
 
-import storey
-
 import mlrun
 import mlrun.common.model_monitoring.helpers
-import mlrun.config
-import mlrun.datastore.targets
 import mlrun.feature_store as fstore
 import mlrun.feature_store.steps
-import mlrun.model_monitoring.db
 import mlrun.serving.states
 import mlrun.utils
 from mlrun.common.schemas.model_monitoring.constants import (
+    ControllerEvent,
+    ControllerEventKind,
+    EndpointType,
     EventFieldType,
-    EventKeyMetrics,
-    EventLiveStats,
     FileTargetKind,
-    ModelEndpointTarget,
     ProjectSecretKeys,
 )
-from mlrun.model_monitoring.db import
+from mlrun.model_monitoring.db import TSDBConnector
 from mlrun.utils import logger
 
 
@@ -51,7 +43,7 @@ class EventStreamProcessor:
         parquet_target: str,
         aggregate_windows: typing.Optional[list[str]] = None,
         aggregate_period: str = "5m",
-        model_monitoring_access_key: str = None,
+        model_monitoring_access_key: typing.Optional[str] = None,
     ):
         # General configurations, mainly used for the storey steps in the future serving graph
         self.project = project
@@ -69,14 +61,11 @@ class EventStreamProcessor:
             parquet_batching_max_events=self.parquet_batching_max_events,
         )
 
-        self.storage_options = None
         self.tsdb_configurations = {}
         if not mlrun.mlconf.is_ce_mode():
             self._initialize_v3io_configurations(
                 model_monitoring_access_key=model_monitoring_access_key
             )
-        elif self.parquet_path.startswith("s3://"):
-            self.storage_options = mlrun.mlconf.get_s3_storage_options()
 
     def _initialize_v3io_configurations(
         self,
@@ -85,33 +74,18 @@ class EventStreamProcessor:
         v3io_access_key: typing.Optional[str] = None,
         v3io_framesd: typing.Optional[str] = None,
         v3io_api: typing.Optional[str] = None,
-        model_monitoring_access_key: str = None,
+        model_monitoring_access_key: typing.Optional[str] = None,
     ):
         # Get the V3IO configurations
         self.v3io_framesd = v3io_framesd or mlrun.mlconf.v3io_framesd
         self.v3io_api = v3io_api or mlrun.mlconf.v3io_api
 
-        self.v3io_access_key = v3io_access_key or
+        self.v3io_access_key = v3io_access_key or mlrun.mlconf.get_v3io_access_key()
         self.model_monitoring_access_key = (
             model_monitoring_access_key
-            or
+            or mlrun.get_secret_or_env(ProjectSecretKeys.ACCESS_KEY)
            or self.v3io_access_key
         )
-        self.storage_options = dict(
-            v3io_access_key=self.model_monitoring_access_key, v3io_api=self.v3io_api
-        )
-
-        # KV path
-        kv_path = mlrun.mlconf.get_model_monitoring_file_target_path(
-            project=self.project, kind=FileTargetKind.ENDPOINTS
-        )
-        (
-            _,
-            self.kv_container,
-            self.kv_path,
-        ) = mlrun.common.model_monitoring.helpers.parse_model_endpoint_store_prefix(
-            kv_path
-        )
 
         # TSDB path and configurations
         tsdb_path = mlrun.mlconf.get_model_monitoring_file_target_path(
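The access-key resolution above settles into a three-level fallback: an explicitly passed key, then a project secret or environment value, then the platform-level V3IO key. A minimal sketch of the same pattern outside mlrun (the environment-variable names here are illustrative, not the package's):

    import os

    def resolve_access_key(explicit_key=None):
        # 1. an explicitly passed key wins,
        # 2. otherwise a project secret / environment value,
        # 3. otherwise the platform-level V3IO key.
        return (
            explicit_key
            or os.environ.get("MODEL_MONITORING_ACCESS_KEY")  # illustrative name
            or os.environ.get("V3IO_ACCESS_KEY")
        )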
@@ -133,7 +107,7 @@ class EventStreamProcessor:
         self,
         fn: mlrun.runtimes.ServingRuntime,
         tsdb_connector: TSDBConnector,
-
+        controller_stream_uri: str,
     ) -> None:
         """
         Apply monitoring serving graph to a given serving function. The following serving graph includes about 4 main
@@ -162,31 +136,25 @@ class EventStreamProcessor:
 
         :param fn: A serving function.
         :param tsdb_connector: Time series database connector.
-        :param
+        :param controller_stream_uri: The controller stream URI. Runs on server api pod so needed to be provided as
+                                      input
         """
 
         graph = typing.cast(
             mlrun.serving.states.RootFlowStep,
-            fn.set_topology(mlrun.serving.states.StepKinds.flow),
-        )
-        graph.add_step(
-            "ExtractEndpointID",
-            "extract_endpoint",
-            full_event=True,
+            fn.set_topology(mlrun.serving.states.StepKinds.flow, engine="async"),
         )
 
         # split the graph between event with error vs valid event
         graph.add_step(
             "storey.Filter",
             "FilterError",
-            after="extract_endpoint",
             _fn="(event.get('error') is None)",
         )
 
         graph.add_step(
             "storey.Filter",
             "ForwardError",
-            after="extract_endpoint",
             _fn="(event.get('error') is not None)",
         )
 
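For orientation, this is how the same topology-and-split pattern looks when built by hand with the public mlrun API; a sketch, not code from the package:

    import mlrun

    # Sketch: an async-engine flow topology whose first step passes only
    # error-free events, mirroring the FilterError step above.
    fn = mlrun.new_function("stream-demo", kind="serving")
    graph = fn.set_topology("flow", engine="async")
    graph.add_step("storey.Filter", "FilterError", _fn="(event.get('error') is None)")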
@@ -198,7 +166,7 @@ class EventStreamProcessor:
         def apply_process_endpoint_event():
             graph.add_step(
                 "ProcessEndpointEvent",
-                after="
+                after="FilterError",
                 full_event=True,
                 project=self.project,
             )
@@ -233,79 +201,25 @@ class EventStreamProcessor:
             )
 
         apply_map_feature_names()
+        # split the graph between event with error vs valid event
+        graph.add_step(
+            "storey.Filter",
+            "FilterNOP",
+            after="MapFeatureNames",
+            _fn="(event.get('kind', '') != 'nop_event')",
+        )
+        graph.add_step(
+            "storey.Filter",
+            "ForwardNOP",
+            after="MapFeatureNames",
+            _fn="(event.get('kind', '') == 'nop_event')",
+        )
 
-
-
-
-
-
-                aggregates=[
-                    {
-                        "name": EventFieldType.LATENCY,
-                        "column": EventFieldType.LATENCY,
-                        "operations": ["count", "avg"],
-                        "windows": self.aggregate_windows,
-                        "period": self.aggregate_period,
-                    }
-                ],
-                name=EventFieldType.LATENCY,
-                after="MapFeatureNames",
-                step_name="Aggregates",
-                table=".",
-                key_field=EventFieldType.ENDPOINT_ID,
-            )
-            # Calculate average latency time for each window (5 min and 1 hour by default)
-            graph.add_step(
-                class_name="storey.Rename",
-                mapping={
-                    "latency_count_5m": EventLiveStats.PREDICTIONS_COUNT_5M,
-                    "latency_count_1h": EventLiveStats.PREDICTIONS_COUNT_1H,
-                },
-                name="Rename",
-                after=EventFieldType.LATENCY,
-            )
-
-        apply_storey_aggregations()
-
-        # KV/SQL branch
-        # Filter relevant keys from the event before writing the data into the database table
-        def apply_process_before_endpoint_update():
-            graph.add_step(
-                "ProcessBeforeEndpointUpdate",
-                name="ProcessBeforeEndpointUpdate",
-                after="Rename",
-            )
-
-        apply_process_before_endpoint_update()
-
-        # Write the filtered event to KV/SQL table. At this point, the serving graph updates the stats
-        # about average latency and the amount of predictions over time
-        def apply_update_endpoint():
-            graph.add_step(
-                "UpdateEndpoint",
-                name="UpdateEndpoint",
-                after="ProcessBeforeEndpointUpdate",
-                project=self.project,
-            )
-
-        apply_update_endpoint()
-
-        # (only for V3IO KV target) - Apply infer_schema on the model endpoints table for generating schema file
-        # which will be used by Grafana monitoring dashboards
-        def apply_infer_schema():
-            graph.add_step(
-                "InferSchema",
-                name="InferSchema",
-                after="UpdateEndpoint",
-                v3io_framesd=self.v3io_framesd,
-                container=self.kv_container,
-                table=self.kv_path,
-            )
-
-        if endpoint_store.type == ModelEndpointTarget.V3IO_NOSQL:
-            apply_infer_schema()
-
-        tsdb_connector.apply_monitoring_stream_steps(graph=graph)
+        tsdb_connector.apply_monitoring_stream_steps(
+            graph=graph,
+            aggregate_windows=self.aggregate_windows,
+            aggregate_period=self.aggregate_period,
+        )
 
         # Parquet branch
         # Filter and validate different keys before writing the data to Parquet target
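The new FilterNOP/ForwardNOP pair routes the controller's periodic no-op events away from the data branches. A tiny sketch of how those `_fn` string predicates partition events (plain Python, with an assumed minimal event shape):

    # Assumed minimal event shapes; each predicate string is evaluated per event.
    nop_event = {"kind": "nop_event"}
    model_event = {"endpoint_id": "ep-1", "request": {"inputs": [[1.0, 2.0]]}}

    keep_in_data_branch = lambda event: event.get("kind", "") != "nop_event"
    assert not keep_in_data_branch(nop_event)  # ForwardNOP sends it to the controller stream
    assert keep_in_data_branch(model_event)    # FilterNOP lets it continue to Parquet/TSDB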
@@ -313,7 +227,7 @@ class EventStreamProcessor:
             graph.add_step(
                 "ProcessBeforeParquet",
                 name="ProcessBeforeParquet",
-                after="
+                after="FilterNOP",
                 _fn="(event)",
             )
 
@@ -322,12 +236,12 @@ class EventStreamProcessor:
         # Write the Parquet target file, partitioned by key (endpoint_id) and time.
         def apply_parquet_target():
             graph.add_step(
-                "
+                "mlrun.datastore.storeytargets.ParquetStoreyTarget",
+                alternative_v3io_access_key=mlrun.common.schemas.model_monitoring.ProjectSecretKeys.ACCESS_KEY,
                 name="ParquetTarget",
                 after="ProcessBeforeParquet",
                 graph_shape="cylinder",
                 path=self.parquet_path,
-                storage_options=self.storage_options,
                 max_events=self.parquet_batching_max_events,
                 flush_after_seconds=self.parquet_batching_timeout_secs,
                 attributes={"infer_columns_from_data": True},
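The two batching parameters above define a flush-on-either-bound contract: a Parquet part is written once max_events accumulate or flush_after_seconds elapse, whichever comes first. A self-contained sketch of that contract (a toy model, not the package's implementation):

    import time

    class BatchFlusher:
        """Toy model of the flush-on-either-bound batching configured above."""

        def __init__(self, max_events, flush_after_seconds):
            self.max_events = max_events
            self.flush_after_seconds = flush_after_seconds
            self.batch = []
            self.started = time.monotonic()

        def add(self, event):
            self.batch.append(event)
            full = len(self.batch) >= self.max_events
            stale = time.monotonic() - self.started >= self.flush_after_seconds
            if full or stale:
                flushed, self.batch = self.batch, []
                self.started = time.monotonic()
                return flushed  # in the real target this becomes one Parquet part
            return None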
@@ -340,90 +254,20 @@ class EventStreamProcessor:
 
         apply_parquet_target()
 
+        # controller branch
+        def apply_push_controller_stream(stream_uri: str):
+            graph.add_step(
+                ">>",
+                "controller_stream",
+                path=stream_uri,
+                sharding_func=ControllerEvent.ENDPOINT_ID,
+                after="ForwardNOP",
+                # Force using the pipeline key instead of the one in the profile in case of v3io profile.
+                # In case of Kafka, this parameter will be ignored.
+                alternative_v3io_access_key="V3IO_ACCESS_KEY",
+            )
 
-
-    def __init__(self, **kwargs):
-        """
-        Filter relevant keys from the event before writing the data to database table (in EndpointUpdate step).
-        Note that in the endpoint table we only keep metadata (function_uri, model_class, etc.) and stats about the
-        average latency and the number of predictions (per 5min and 1hour).
-
-        :returns: A filtered event as a dictionary which will be written to the endpoint table in the next step.
-        """
-        super().__init__(**kwargs)
-
-    def do(self, event):
-        # Compute prediction per second
-        event[EventLiveStats.PREDICTIONS_PER_SECOND] = (
-            float(event[EventLiveStats.PREDICTIONS_COUNT_5M]) / 300
-        )
-        # Filter relevant keys
-        e = {
-            k: event[k]
-            for k in [
-                EventFieldType.FUNCTION_URI,
-                EventFieldType.MODEL,
-                EventFieldType.MODEL_CLASS,
-                EventFieldType.ENDPOINT_ID,
-                EventFieldType.LABELS,
-                EventFieldType.FIRST_REQUEST,
-                EventFieldType.LAST_REQUEST,
-                EventFieldType.ERROR_COUNT,
-            ]
-        }
-
-        # Add generic metrics statistics
-        generic_metrics = {
-            k: event[k]
-            for k in [
-                EventLiveStats.LATENCY_AVG_5M,
-                EventLiveStats.LATENCY_AVG_1H,
-                EventLiveStats.PREDICTIONS_PER_SECOND,
-                EventLiveStats.PREDICTIONS_COUNT_5M,
-                EventLiveStats.PREDICTIONS_COUNT_1H,
-            ]
-        }
-
-        e[EventFieldType.METRICS] = json.dumps(
-            {EventKeyMetrics.GENERIC: generic_metrics}
-        )
-
-        # Write labels as json string as required by the DB format
-        e[EventFieldType.LABELS] = json.dumps(e[EventFieldType.LABELS])
-
-        return e
-
-
-class ExtractEndpointID(mlrun.feature_store.steps.MapClass):
-    def __init__(self, **kwargs) -> None:
-        """
-        Generate the model endpoint ID based on the event parameters and attach it to the event.
-        """
-        super().__init__(**kwargs)
-
-    def do(self, full_event) -> typing.Union[storey.Event, None]:
-        # Getting model version and function uri from event
-        # and use them for retrieving the endpoint_id
-        function_uri = full_event.body.get(EventFieldType.FUNCTION_URI)
-        if not is_not_none(function_uri, [EventFieldType.FUNCTION_URI]):
-            return None
-
-        model = full_event.body.get(EventFieldType.MODEL)
-        if not is_not_none(model, [EventFieldType.MODEL]):
-            return None
-
-        version = full_event.body.get(EventFieldType.VERSION)
-        versioned_model = f"{model}:{version}" if version else f"{model}:latest"
-
-        endpoint_id = mlrun.common.model_monitoring.create_model_endpoint_uid(
-            function_uri=function_uri,
-            versioned_model=versioned_model,
-        )
-
-        endpoint_id = str(endpoint_id)
-        full_event.body[EventFieldType.ENDPOINT_ID] = endpoint_id
-        full_event.body[EventFieldType.VERSIONED_MODEL] = versioned_model
-        return full_event
+        apply_push_controller_stream(controller_stream_uri)
 
 
 class ProcessBeforeParquet(mlrun.feature_store.steps.MapClass):
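In mlrun serving graphs, ">>" adds a stream (queue) step; here it pushes everything that survives ForwardNOP onto the controller stream, sharded by endpoint ID so each endpoint's events stay ordered on a single shard. A usage sketch, assuming a graph built as in the earlier sketch and a placeholder stream URI; sharding_func is assumed to resolve to the "endpoint_id" field, as ControllerEvent.ENDPOINT_ID does in the diff:

    graph.add_step(
        ">>",
        "controller_stream",
        path="v3io:///projects/my-project/model-endpoints/controller_stream",  # placeholder
        sharding_func="endpoint_id",  # one endpoint's events always land on one shard
        after="ForwardNOP",
    )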
@@ -490,28 +334,34 @@ class ProcessEndpointEvent(mlrun.feature_store.steps.MapClass):
         self.first_request: dict[str, str] = dict()
         self.last_request: dict[str, str] = dict()
 
-        # Number of errors (value) per endpoint (key)
-        self.error_count: dict[str, int] = collections.defaultdict(int)
-
         # Set of endpoints in the current events
         self.endpoints: set[str] = set()
 
     def do(self, full_event):
         event = full_event.body
+        if event.get(ControllerEvent.KIND, "") == ControllerEventKind.NOP_EVENT:
+            logger.debug(
+                "Skipped nop event inside of ProcessEndpointEvent", event=event
+            )
+            full_event.body = [event]
+            return full_event
+        # Getting model version and function uri from event
+        # and use them for retrieving the endpoint_id
+        function_uri = full_event.body.get(EventFieldType.FUNCTION_URI)
+        if not is_not_none(function_uri, [EventFieldType.FUNCTION_URI]):
+            return None
+
+        model = full_event.body.get(EventFieldType.MODEL)
+        if not is_not_none(model, [EventFieldType.MODEL]):
+            return None
 
-        versioned_model = event[EventFieldType.VERSIONED_MODEL]
         endpoint_id = event[EventFieldType.ENDPOINT_ID]
-        function_uri = event[EventFieldType.FUNCTION_URI]
 
         # In case this process fails, resume state from existing record
-        self.resume_state(
-
-
-
-        error = event.get("error")
-        if error:  # TODO: delete this in ML-7456
-            self.error_count[endpoint_id] += 1
-            raise mlrun.errors.MLRunInvalidArgumentError(str(error))
+        self.resume_state(
+            endpoint_id=endpoint_id,
+            endpoint_name=full_event.body.get(EventFieldType.MODEL),
+        )
 
         # Validate event fields
         model_class = event.get("model_class") or event.get("class")
@@ -524,10 +374,9 @@ class ProcessEndpointEvent(mlrun.feature_store.steps.MapClass):
         predictions = event.get("resp", {}).get("outputs")
 
         if not self.is_valid(
-
-
-
-            ["when"],
+            validation_function=is_not_none,
+            field=timestamp,
+            dict_path=["when"],
         ):
             return None
 
@@ -535,45 +384,33 @@ class ProcessEndpointEvent(mlrun.feature_store.steps.MapClass):
             # Set time for the first request of the current endpoint
             self.first_request[endpoint_id] = timestamp
 
-        # Validate that the request time of the current event is later than the previous request time
-        self._validate_last_request_timestamp(
-            endpoint_id=endpoint_id, timestamp=timestamp
-        )
-
-        # Set time for the last reqeust of the current endpoint
-        self.last_request[endpoint_id] = timestamp
-
         if not self.is_valid(
-
-
-
-            ["request", "id"],
+            validation_function=is_not_none,
+            field=request_id,
+            dict_path=["request", "id"],
         ):
             return None
         if not self.is_valid(
-
-
-
-            ["microsec"],
+            validation_function=is_not_none,
+            field=latency,
+            dict_path=["microsec"],
         ):
             return None
         if not self.is_valid(
-
-
-
-            ["request", "inputs"],
+            validation_function=is_not_none,
+            field=features,
+            dict_path=["request", "inputs"],
         ):
             return None
         if not self.is_valid(
-
-
-
-            ["resp", "outputs"],
+            validation_function=is_not_none,
+            field=predictions,
+            dict_path=["resp", "outputs"],
         ):
             return None
 
         # Convert timestamp to a datetime object
-
+        timestamp_obj = datetime.datetime.fromisoformat(timestamp)
 
         # Separate each model invocation into sub events that will be stored as dictionary
         # in list of events. This list will be used as the body for the storey event.
@@ -605,96 +442,93 @@ class ProcessEndpointEvent(mlrun.feature_store.steps.MapClass):
             if not isinstance(feature, list):
                 feature = [feature]
 
+            effective_sample_count, estimated_prediction_count = (
+                self._get_effective_and_estimated_counts(event=event)
+            )
+
             events.append(
                 {
                     EventFieldType.FUNCTION_URI: function_uri,
-                    EventFieldType.
+                    EventFieldType.ENDPOINT_NAME: event.get(EventFieldType.MODEL),
                     EventFieldType.MODEL_CLASS: model_class,
-                    EventFieldType.TIMESTAMP:
+                    EventFieldType.TIMESTAMP: timestamp_obj,
                     EventFieldType.ENDPOINT_ID: endpoint_id,
                     EventFieldType.REQUEST_ID: request_id,
                     EventFieldType.LATENCY: latency,
                     EventFieldType.FEATURES: feature,
                     EventFieldType.PREDICTION: prediction,
                     EventFieldType.FIRST_REQUEST: self.first_request[endpoint_id],
-                    EventFieldType.LAST_REQUEST:
+                    EventFieldType.LAST_REQUEST: timestamp,
                     EventFieldType.LAST_REQUEST_TIMESTAMP: mlrun.utils.enrich_datetime_with_tz_info(
-
+                        timestamp
                     ).timestamp(),
-                    EventFieldType.ERROR_COUNT: self.error_count[endpoint_id],
                     EventFieldType.LABELS: event.get(EventFieldType.LABELS, {}),
                     EventFieldType.METRICS: event.get(EventFieldType.METRICS, {}),
                     EventFieldType.ENTITIES: event.get("request", {}).get(
                         EventFieldType.ENTITIES, {}
                     ),
+                    EventFieldType.EFFECTIVE_SAMPLE_COUNT: effective_sample_count,
+                    EventFieldType.ESTIMATED_PREDICTION_COUNT: estimated_prediction_count,
                 }
             )
 
         # Create a storey event object with list of events, based on endpoint_id which will be used
         # in the upcoming steps
-
-
-
-    def _validate_last_request_timestamp(self, endpoint_id: str, timestamp: str):
-        """Validate that the request time of the current event is later than the previous request time that has
-        already been processed.
-
-        :param endpoint_id: The unique id of the model endpoint.
-        :param timestamp: Event request time as a string.
-
-        :raise MLRunPreconditionFailedError: If the request time of the current is later than the previous request time.
-        """
-
-        if (
-            endpoint_id in self.last_request
-            and self.last_request[endpoint_id] > timestamp
-        ):
-            logger.error(
-                f"current event request time {timestamp} is earlier than the last request time "
-                f"{self.last_request[endpoint_id]} - write to TSDB will be rejected"
-            )
+        full_event.key = endpoint_id
+        full_event.body = events
+        return full_event
 
-    def resume_state(self, endpoint_id):
+    def resume_state(self, endpoint_id, endpoint_name):
         # Make sure process is resumable, if process fails for any reason, be able to pick things up close to where we
         # left them
         if endpoint_id not in self.endpoints:
             logger.info("Trying to resume state", endpoint_id=endpoint_id)
-            endpoint_record =
-
-
+            endpoint_record = (
+                mlrun.db.get_run_db()
+                .get_model_endpoint(
+                    project=self.project,
+                    endpoint_id=endpoint_id,
+                    name=endpoint_name,
+                    tsdb_metrics=False,
+                )
+                .flat_dict()
             )
 
-            # If model endpoint found, get first_request
+            # If model endpoint found, get first_request & last_request values
             if endpoint_record:
                 first_request = endpoint_record.get(EventFieldType.FIRST_REQUEST)
 
                 if first_request:
                     self.first_request[endpoint_id] = first_request
 
-                last_request = endpoint_record.get(EventFieldType.LAST_REQUEST)
-                if last_request:
-                    self.last_request[endpoint_id] = last_request
-
-                error_count = endpoint_record.get(EventFieldType.ERROR_COUNT)
-
-                if error_count:
-                    self.error_count[endpoint_id] = int(error_count)
-
             # add endpoint to endpoints set
             self.endpoints.add(endpoint_id)
 
     def is_valid(
         self,
-        endpoint_id: str,
         validation_function,
         field: typing.Any,
         dict_path: list[str],
     ):
         if validation_function(field, dict_path):
             return True
-
+
         return False
 
+    @staticmethod
+    def _get_effective_and_estimated_counts(event):
+        """
+        Calculate the `effective_sample_count` and the `estimated_prediction_count` based on the event's
+        sampling percentage. These values will be stored in the TSDB target.
+        Note that In non-batch serving, the `effective_sample_count` is always set to 1. In addition, when the sampling
+        percentage is 100%, the `estimated_prediction_count` is equal to the `effective_sample_count`.
+        """
+        effective_sample_count = event.get(EventFieldType.EFFECTIVE_SAMPLE_COUNT, 1)
+        estimated_prediction_count = effective_sample_count * (
+            100 / event.get(EventFieldType.SAMPLING_PERCENTAGE, 100)
+        )
+        return effective_sample_count, estimated_prediction_count
+
 
 def is_not_none(field: typing.Any, dict_path: list[str]):
     if field is not None:
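The arithmetic in _get_effective_and_estimated_counts scales each sampled event back up by the inverse of the sampling rate. A worked example, using literal key names in place of the EventFieldType enum values (an assumption made here for readability):

    # With 10% sampling, one recorded event stands in for ~10 real predictions.
    event = {"effective_sample_count": 1, "sampling_percentage": 10}
    effective = event.get("effective_sample_count", 1)                     # -> 1
    estimated = effective * (100 / event.get("sampling_percentage", 100))  # -> 10.0
    assert (effective, estimated) == (1, 10.0)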
@@ -735,6 +569,7 @@ class MapFeatureNames(mlrun.feature_store.steps.MapClass):
         # and labels columns were not found in the current event
         self.feature_names = {}
         self.label_columns = {}
+        self.first_request = {}
 
         # Dictionary to manage the model endpoint types - important for the V3IO TSDB
         self.endpoint_type = {}
@@ -756,6 +591,8 @@ class MapFeatureNames(mlrun.feature_store.steps.MapClass):
         return None
 
     def do(self, event: dict):
+        if event.get(ControllerEvent.KIND, "") == ControllerEventKind.NOP_EVENT:
+            return event
         endpoint_id = event[EventFieldType.ENDPOINT_ID]
 
         feature_values = event[EventFieldType.FEATURES]
@@ -766,23 +603,30 @@ class MapFeatureNames(mlrun.feature_store.steps.MapClass):
             if isinstance(feature_value, int):
                 feature_values[index] = float(feature_value)
 
+        attributes_to_update = {}
+        endpoint_record = None
         # Get feature names and label columns
         if endpoint_id not in self.feature_names:
-            endpoint_record =
-
-
+            endpoint_record = (
+                mlrun.db.get_run_db()
+                .get_model_endpoint(
+                    project=self.project,
+                    endpoint_id=endpoint_id,
+                    name=event[EventFieldType.ENDPOINT_NAME],
+                    tsdb_metrics=False,
+                )
+                .flat_dict()
             )
             feature_names = endpoint_record.get(EventFieldType.FEATURE_NAMES)
-            feature_names = json.loads(feature_names) if feature_names else None
 
             label_columns = endpoint_record.get(EventFieldType.LABEL_NAMES)
-            label_columns = json.loads(label_columns) if label_columns else None
 
             # If feature names were not found,
             # try to retrieve them from the previous events of the current process
             if not feature_names and self._infer_columns_from_data:
                 feature_names = self._infer_feature_names_from_data(event)
 
+            endpoint_type = int(endpoint_record.get(EventFieldType.ENDPOINT_TYPE))
             if not feature_names:
                 logger.warn(
                     "Feature names are not initialized, they will be automatically generated",
@@ -793,19 +637,14 @@ class MapFeatureNames(mlrun.feature_store.steps.MapClass):
                 ]
 
                 # Update the endpoint record with the generated features
-
-                    project=self.project,
-                    endpoint_id=endpoint_id,
-                    attributes={
-                        EventFieldType.FEATURE_NAMES: json.dumps(feature_names)
-                    },
-                )
+                attributes_to_update[EventFieldType.FEATURE_NAMES] = feature_names
 
-
-
-
-
-
+            if endpoint_type != EndpointType.ROUTER.value:
+                update_monitoring_feature_set(
+                    endpoint_record=endpoint_record,
+                    feature_names=feature_names,
+                    feature_values=feature_values,
+                )
 
             # Similar process with label columns
             if not label_columns and self._infer_columns_from_data:
@@ -819,17 +658,13 @@ class MapFeatureNames(mlrun.feature_store.steps.MapClass):
                 label_columns = [
                     f"p{i}" for i, _ in enumerate(event[EventFieldType.PREDICTION])
                 ]
-
-
-
-
-
-
-
-                    endpoint_record=endpoint_record,
-                    feature_names=label_columns,
-                    feature_values=label_values,
-                )
+                attributes_to_update[EventFieldType.LABEL_NAMES] = label_columns
+            if endpoint_type != EndpointType.ROUTER.value:
+                update_monitoring_feature_set(
+                    endpoint_record=endpoint_record,
+                    feature_names=label_columns,
+                    feature_values=label_values,
+                )
 
             self.label_columns[endpoint_id] = label_columns
             self.feature_names[endpoint_id] = feature_names
@@ -842,9 +677,41 @@ class MapFeatureNames(mlrun.feature_store.steps.MapClass):
             )
 
             # Update the endpoint type within the endpoint types dictionary
-            endpoint_type = int(endpoint_record.get(EventFieldType.ENDPOINT_TYPE))
             self.endpoint_type[endpoint_id] = endpoint_type
 
+        # Update the first request time in the endpoint record
+        if endpoint_id not in self.first_request:
+            endpoint_record = endpoint_record or (
+                mlrun.db.get_run_db()
+                .get_model_endpoint(
+                    project=self.project,
+                    endpoint_id=endpoint_id,
+                    name=event[EventFieldType.ENDPOINT_NAME],
+                    tsdb_metrics=False,
+                )
+                .flat_dict()
+            )
+            if not endpoint_record.get(EventFieldType.FIRST_REQUEST):
+                attributes_to_update[EventFieldType.FIRST_REQUEST] = (
+                    mlrun.utils.enrich_datetime_with_tz_info(
+                        event[EventFieldType.FIRST_REQUEST]
+                    )
+                )
+            self.first_request[endpoint_id] = True
+
+        if attributes_to_update:
+            logger.info(
+                "Updating endpoint record",
+                endpoint_id=endpoint_id,
+                attributes=attributes_to_update,
+            )
+            update_endpoint_record(
+                project=self.project,
+                endpoint_id=endpoint_id,
+                attributes=attributes_to_update,
+                endpoint_name=event[EventFieldType.ENDPOINT_NAME],
+            )
+
         # Add feature_name:value pairs along with a mapping dictionary of all of these pairs
         feature_names = self.feature_names[endpoint_id]
         self._map_dictionary_values(
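Both resume_state and MapFeatureNames now read the endpoint record through the run DB client rather than a direct store (the old stores package is removed in this release). A usage sketch of that fluent lookup, with placeholder identifiers:

    import mlrun.db

    # Placeholders throughout; tsdb_metrics=False skips time-series metrics because
    # only the static record (feature names, first request, ...) is needed here.
    endpoint = (
        mlrun.db.get_run_db()
        .get_model_endpoint(
            project="my-project",
            endpoint_id="ep-1234",
            name="my-model",
            tsdb_metrics=False,
        )
        .flat_dict()
    )
    feature_names = endpoint.get("feature_names")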
@@ -890,35 +757,13 @@ class MapFeatureNames(mlrun.feature_store.steps.MapClass):
 
         """
         event[mapping_dictionary] = {}
+        diff = len(named_iters) - len(values_iters)
+        values_iters += [None] * diff
         for name, value in zip(named_iters, values_iters):
             event[name] = value
             event[mapping_dictionary][name] = value
 
 
-class UpdateEndpoint(mlrun.feature_store.steps.MapClass):
-    def __init__(self, project: str, **kwargs):
-        """
-        Update the model endpoint record in the DB. Note that the event at this point includes metadata and stats about
-        the average latency and the amount of predictions over time. This data will be used in the monitoring dashboards
-        such as "Model Monitoring - Performance" which can be found in Grafana.
-
-        :returns: Event as a dictionary (without any changes) for the next step (InferSchema).
-        """
-        super().__init__(**kwargs)
-        self.project = project
-
-    def do(self, event: dict):
-        # Remove labels from the event
-        event.pop(EventFieldType.LABELS)
-
-        update_endpoint_record(
-            project=self.project,
-            endpoint_id=event.pop(EventFieldType.ENDPOINT_ID),
-            attributes=event,
-        )
-        return event
-
-
 class InferSchema(mlrun.feature_store.steps.MapClass):
     def __init__(
         self,
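The padding added above matters because zip() stops at the shorter iterable: without the [None] padding, any feature names beyond the available values would silently be dropped from the event. A minimal demonstration:

    named_iters = ["f0", "f1", "f2"]
    values_iters = [0.5, 1.2]
    values_iters += [None] * (len(named_iters) - len(values_iters))
    assert list(zip(named_iters, values_iters)) == [("f0", 0.5), ("f1", 1.2), ("f2", None)]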
@@ -963,14 +808,14 @@ class InferSchema(mlrun.feature_store.steps.MapClass):
 def update_endpoint_record(
     project: str,
     endpoint_id: str,
+    endpoint_name: str,
     attributes: dict,
 ):
-
+    mlrun.db.get_run_db().patch_model_endpoint(
         project=project,
-
-
-
-        endpoint_id=endpoint_id, attributes=attributes
+        endpoint_id=endpoint_id,
+        attributes=attributes,
+        name=endpoint_name,
     )
 
 
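The reworked helper now requires the endpoint name alongside the ID, matching the patch_model_endpoint signature on the run DB client. A call sketch with placeholder values; the attribute payload is illustrative:

    update_endpoint_record(
        project="my-project",
        endpoint_id="ep-1234",
        endpoint_name="my-model",
        attributes={"feature_names": ["f0", "f1"]},  # illustrative payload
    )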