mlrun 1.7.0rc14__py3-none-any.whl → 1.7.0rc16__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of mlrun might be problematic. Click here for more details.
- mlrun/__init__.py +10 -1
- mlrun/__main__.py +18 -109
- mlrun/{runtimes/mpijob/v1alpha1.py → alerts/__init__.py} +2 -16
- mlrun/alerts/alert.py +141 -0
- mlrun/artifacts/__init__.py +8 -3
- mlrun/artifacts/base.py +36 -253
- mlrun/artifacts/dataset.py +9 -190
- mlrun/artifacts/manager.py +20 -41
- mlrun/artifacts/model.py +8 -140
- mlrun/artifacts/plots.py +14 -375
- mlrun/common/schemas/__init__.py +4 -2
- mlrun/common/schemas/alert.py +46 -4
- mlrun/common/schemas/api_gateway.py +4 -0
- mlrun/common/schemas/artifact.py +15 -0
- mlrun/common/schemas/auth.py +2 -0
- mlrun/common/schemas/model_monitoring/__init__.py +8 -1
- mlrun/common/schemas/model_monitoring/constants.py +40 -4
- mlrun/common/schemas/model_monitoring/model_endpoints.py +73 -2
- mlrun/common/schemas/project.py +2 -0
- mlrun/config.py +7 -4
- mlrun/data_types/to_pandas.py +4 -4
- mlrun/datastore/base.py +41 -9
- mlrun/datastore/datastore_profile.py +54 -4
- mlrun/datastore/inmem.py +2 -2
- mlrun/datastore/sources.py +43 -2
- mlrun/datastore/store_resources.py +2 -6
- mlrun/datastore/targets.py +106 -39
- mlrun/db/base.py +23 -3
- mlrun/db/httpdb.py +101 -47
- mlrun/db/nopdb.py +20 -2
- mlrun/errors.py +5 -0
- mlrun/feature_store/__init__.py +0 -2
- mlrun/feature_store/api.py +12 -47
- mlrun/feature_store/feature_set.py +9 -0
- mlrun/feature_store/retrieval/base.py +9 -4
- mlrun/feature_store/retrieval/conversion.py +4 -4
- mlrun/feature_store/retrieval/dask_merger.py +2 -0
- mlrun/feature_store/retrieval/job.py +2 -0
- mlrun/feature_store/retrieval/local_merger.py +2 -0
- mlrun/feature_store/retrieval/spark_merger.py +5 -0
- mlrun/frameworks/_dl_common/loggers/tensorboard_logger.py +5 -10
- mlrun/launcher/base.py +4 -3
- mlrun/launcher/client.py +1 -1
- mlrun/lists.py +4 -2
- mlrun/model.py +25 -11
- mlrun/model_monitoring/__init__.py +1 -1
- mlrun/model_monitoring/api.py +41 -18
- mlrun/model_monitoring/application.py +5 -305
- mlrun/model_monitoring/applications/__init__.py +11 -0
- mlrun/model_monitoring/applications/_application_steps.py +157 -0
- mlrun/model_monitoring/applications/base.py +282 -0
- mlrun/model_monitoring/applications/context.py +214 -0
- mlrun/model_monitoring/applications/evidently_base.py +211 -0
- mlrun/model_monitoring/applications/histogram_data_drift.py +132 -91
- mlrun/model_monitoring/applications/results.py +99 -0
- mlrun/model_monitoring/controller.py +3 -1
- mlrun/model_monitoring/db/__init__.py +2 -0
- mlrun/model_monitoring/db/stores/base/store.py +9 -36
- mlrun/model_monitoring/db/stores/sqldb/models/base.py +7 -6
- mlrun/model_monitoring/db/stores/sqldb/sql_store.py +63 -110
- mlrun/model_monitoring/db/stores/v3io_kv/kv_store.py +104 -187
- mlrun/model_monitoring/db/tsdb/__init__.py +71 -0
- mlrun/model_monitoring/db/tsdb/base.py +135 -0
- mlrun/model_monitoring/db/tsdb/v3io/__init__.py +15 -0
- mlrun/model_monitoring/db/tsdb/v3io/stream_graph_steps.py +117 -0
- mlrun/model_monitoring/db/tsdb/v3io/v3io_connector.py +404 -0
- mlrun/model_monitoring/db/v3io_tsdb_reader.py +134 -0
- mlrun/model_monitoring/evidently_application.py +6 -118
- mlrun/model_monitoring/helpers.py +1 -1
- mlrun/model_monitoring/model_endpoint.py +3 -2
- mlrun/model_monitoring/stream_processing.py +48 -213
- mlrun/model_monitoring/writer.py +101 -121
- mlrun/platforms/__init__.py +10 -9
- mlrun/platforms/iguazio.py +21 -202
- mlrun/projects/operations.py +11 -7
- mlrun/projects/pipelines.py +13 -76
- mlrun/projects/project.py +73 -45
- mlrun/render.py +11 -13
- mlrun/run.py +6 -41
- mlrun/runtimes/__init__.py +3 -3
- mlrun/runtimes/base.py +6 -6
- mlrun/runtimes/funcdoc.py +0 -28
- mlrun/runtimes/kubejob.py +2 -1
- mlrun/runtimes/local.py +1 -1
- mlrun/runtimes/mpijob/__init__.py +0 -20
- mlrun/runtimes/mpijob/v1.py +1 -1
- mlrun/runtimes/nuclio/api_gateway.py +75 -9
- mlrun/runtimes/nuclio/function.py +9 -35
- mlrun/runtimes/pod.py +16 -36
- mlrun/runtimes/remotesparkjob.py +1 -1
- mlrun/runtimes/sparkjob/spark3job.py +1 -1
- mlrun/runtimes/utils.py +1 -39
- mlrun/utils/helpers.py +72 -71
- mlrun/utils/notifications/notification/base.py +1 -1
- mlrun/utils/notifications/notification/slack.py +12 -5
- mlrun/utils/notifications/notification/webhook.py +1 -1
- mlrun/utils/notifications/notification_pusher.py +134 -14
- mlrun/utils/version/version.json +2 -2
- {mlrun-1.7.0rc14.dist-info → mlrun-1.7.0rc16.dist-info}/METADATA +4 -3
- {mlrun-1.7.0rc14.dist-info → mlrun-1.7.0rc16.dist-info}/RECORD +105 -95
- mlrun/kfpops.py +0 -865
- mlrun/platforms/other.py +0 -305
- /mlrun/{runtimes → common/runtimes}/constants.py +0 -0
- {mlrun-1.7.0rc14.dist-info → mlrun-1.7.0rc16.dist-info}/LICENSE +0 -0
- {mlrun-1.7.0rc14.dist-info → mlrun-1.7.0rc16.dist-info}/WHEEL +0 -0
- {mlrun-1.7.0rc14.dist-info → mlrun-1.7.0rc16.dist-info}/entry_points.txt +0 -0
- {mlrun-1.7.0rc14.dist-info → mlrun-1.7.0rc16.dist-info}/top_level.txt +0 -0
|
@@ -12,121 +12,9 @@
|
|
|
12
12
|
# See the License for the specific language governing permissions and
|
|
13
13
|
# limitations under the License.
|
|
14
14
|
|
|
15
|
-
|
|
16
|
-
import
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
from mlrun.errors import MLRunIncompatibleVersionError
|
|
23
|
-
from mlrun.model_monitoring.application import ModelMonitoringApplicationBase
|
|
24
|
-
|
|
25
|
-
SUPPORTED_EVIDENTLY_VERSION = semver.Version.parse("0.4.11")
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
def _check_evidently_version(*, cur: semver.Version, ref: semver.Version) -> None:
|
|
29
|
-
if ref.is_compatible(cur) or (
|
|
30
|
-
cur.major == ref.major == 0 and cur.minor == ref.minor and cur.patch > ref.patch
|
|
31
|
-
):
|
|
32
|
-
return
|
|
33
|
-
if cur.major == ref.major == 0 and cur.minor > ref.minor:
|
|
34
|
-
warnings.warn(
|
|
35
|
-
f"Evidently version {cur} is not compatible with the tested "
|
|
36
|
-
f"version {ref}, use at your own risk."
|
|
37
|
-
)
|
|
38
|
-
else:
|
|
39
|
-
raise MLRunIncompatibleVersionError(
|
|
40
|
-
f"Evidently version {cur} is not supported, please change to "
|
|
41
|
-
f"{ref} (or another compatible version)."
|
|
42
|
-
)
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
_HAS_EVIDENTLY = False
|
|
46
|
-
try:
|
|
47
|
-
import evidently # noqa: F401
|
|
48
|
-
|
|
49
|
-
_check_evidently_version(
|
|
50
|
-
cur=semver.Version.parse(evidently.__version__),
|
|
51
|
-
ref=SUPPORTED_EVIDENTLY_VERSION,
|
|
52
|
-
)
|
|
53
|
-
_HAS_EVIDENTLY = True
|
|
54
|
-
except ModuleNotFoundError:
|
|
55
|
-
pass
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
if _HAS_EVIDENTLY:
|
|
59
|
-
from evidently.renderers.notebook_utils import determine_template
|
|
60
|
-
from evidently.report.report import Report
|
|
61
|
-
from evidently.suite.base_suite import Suite
|
|
62
|
-
from evidently.ui.type_aliases import STR_UUID
|
|
63
|
-
from evidently.ui.workspace import Workspace
|
|
64
|
-
from evidently.utils.dashboard import TemplateParams
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
class EvidentlyModelMonitoringApplicationBase(ModelMonitoringApplicationBase):
|
|
68
|
-
def __init__(
|
|
69
|
-
self, evidently_workspace_path: str, evidently_project_id: "STR_UUID"
|
|
70
|
-
) -> None:
|
|
71
|
-
"""
|
|
72
|
-
A class for integrating Evidently for mlrun model monitoring within a monitoring application.
|
|
73
|
-
Note: evidently is not installed by default in the mlrun/mlrun image.
|
|
74
|
-
It must be installed separately to use this class.
|
|
75
|
-
|
|
76
|
-
:param evidently_workspace_path: (str) The path to the Evidently workspace.
|
|
77
|
-
:param evidently_project_id: (str) The ID of the Evidently project.
|
|
78
|
-
|
|
79
|
-
"""
|
|
80
|
-
if not _HAS_EVIDENTLY:
|
|
81
|
-
raise ModuleNotFoundError("Evidently is not installed - the app cannot run")
|
|
82
|
-
self.evidently_workspace = Workspace.create(evidently_workspace_path)
|
|
83
|
-
self.evidently_project_id = evidently_project_id
|
|
84
|
-
self.evidently_project = self.evidently_workspace.get_project(
|
|
85
|
-
evidently_project_id
|
|
86
|
-
)
|
|
87
|
-
|
|
88
|
-
def log_evidently_object(
|
|
89
|
-
self, evidently_object: Union["Report", "Suite"], artifact_name: str
|
|
90
|
-
):
|
|
91
|
-
"""
|
|
92
|
-
Logs an Evidently report or suite as an artifact.
|
|
93
|
-
|
|
94
|
-
:param evidently_object: (Union[Report, Suite]) The Evidently report or suite object.
|
|
95
|
-
:param artifact_name: (str) The name for the logged artifact.
|
|
96
|
-
"""
|
|
97
|
-
evidently_object_html = evidently_object.get_html()
|
|
98
|
-
self.context.log_artifact(
|
|
99
|
-
artifact_name, body=evidently_object_html.encode("utf-8"), format="html"
|
|
100
|
-
)
|
|
101
|
-
|
|
102
|
-
def log_project_dashboard(
|
|
103
|
-
self,
|
|
104
|
-
timestamp_start: pd.Timestamp,
|
|
105
|
-
timestamp_end: pd.Timestamp,
|
|
106
|
-
artifact_name: str = "dashboard",
|
|
107
|
-
):
|
|
108
|
-
"""
|
|
109
|
-
Logs an Evidently project dashboard.
|
|
110
|
-
|
|
111
|
-
:param timestamp_start: (pd.Timestamp) The start timestamp for the dashboard data.
|
|
112
|
-
:param timestamp_end: (pd.Timestamp) The end timestamp for the dashboard data.
|
|
113
|
-
:param artifact_name: (str) The name for the logged artifact.
|
|
114
|
-
"""
|
|
115
|
-
|
|
116
|
-
dashboard_info = self.evidently_project.build_dashboard_info(
|
|
117
|
-
timestamp_start, timestamp_end
|
|
118
|
-
)
|
|
119
|
-
template_params = TemplateParams(
|
|
120
|
-
dashboard_id="pd_" + str(uuid.uuid4()).replace("-", ""),
|
|
121
|
-
dashboard_info=dashboard_info,
|
|
122
|
-
additional_graphs={},
|
|
123
|
-
)
|
|
124
|
-
|
|
125
|
-
dashboard_html = self._render(determine_template("inline"), template_params)
|
|
126
|
-
self.context.log_artifact(
|
|
127
|
-
artifact_name, body=dashboard_html.encode("utf-8"), format="html"
|
|
128
|
-
)
|
|
129
|
-
|
|
130
|
-
@staticmethod
|
|
131
|
-
def _render(temple_func, template_params: "TemplateParams"):
|
|
132
|
-
return temple_func(params=template_params)
|
|
15
|
+
# TODO : delete this file in 1.9.0
|
|
16
|
+
from mlrun.model_monitoring.applications import ( # noqa: F401
|
|
17
|
+
_HAS_EVIDENTLY,
|
|
18
|
+
SUPPORTED_EVIDENTLY_VERSION,
|
|
19
|
+
EvidentlyModelMonitoringApplicationBase,
|
|
20
|
+
)
|
|
@@ -215,7 +215,7 @@ def update_model_endpoint_last_request(
|
|
|
215
215
|
|
|
216
216
|
def calculate_inputs_statistics(
|
|
217
217
|
sample_set_statistics: dict, inputs: pd.DataFrame
|
|
218
|
-
) ->
|
|
218
|
+
) -> mlrun.common.model_monitoring.helpers.FeatureStats:
|
|
219
219
|
"""
|
|
220
220
|
Calculate the inputs data statistics for drift monitoring purpose.
|
|
221
221
|
|
|
@@ -17,6 +17,7 @@ from dataclasses import dataclass, field
|
|
|
17
17
|
from typing import Any
|
|
18
18
|
|
|
19
19
|
import mlrun.model
|
|
20
|
+
from mlrun.common.model_monitoring.helpers import FeatureStats
|
|
20
21
|
from mlrun.common.schemas.model_monitoring.constants import (
|
|
21
22
|
EndpointType,
|
|
22
23
|
EventKeyMetrics,
|
|
@@ -42,8 +43,8 @@ class ModelEndpointSpec(mlrun.model.ModelObj):
|
|
|
42
43
|
|
|
43
44
|
@dataclass
|
|
44
45
|
class ModelEndpointStatus(mlrun.model.ModelObj):
|
|
45
|
-
feature_stats:
|
|
46
|
-
current_stats:
|
|
46
|
+
feature_stats: FeatureStats = field(default_factory=dict)
|
|
47
|
+
current_stats: FeatureStats = field(default_factory=dict)
|
|
47
48
|
first_request: str = ""
|
|
48
49
|
last_request: str = ""
|
|
49
50
|
error_count: int = 0
|
|
@@ -30,7 +30,6 @@ import mlrun.model_monitoring.db
|
|
|
30
30
|
import mlrun.model_monitoring.prometheus
|
|
31
31
|
import mlrun.serving.states
|
|
32
32
|
import mlrun.utils
|
|
33
|
-
import mlrun.utils.v3io_clients
|
|
34
33
|
from mlrun.common.schemas.model_monitoring.constants import (
|
|
35
34
|
EventFieldType,
|
|
36
35
|
EventKeyMetrics,
|
|
@@ -40,7 +39,6 @@ from mlrun.common.schemas.model_monitoring.constants import (
|
|
|
40
39
|
ProjectSecretKeys,
|
|
41
40
|
PrometheusEndpoints,
|
|
42
41
|
)
|
|
43
|
-
from mlrun.model_monitoring.helpers import get_endpoint_record
|
|
44
42
|
from mlrun.utils import logger
|
|
45
43
|
|
|
46
44
|
|
|
@@ -79,6 +77,7 @@ class EventStreamProcessor:
|
|
|
79
77
|
)
|
|
80
78
|
|
|
81
79
|
self.storage_options = None
|
|
80
|
+
self.tsdb_configurations = {}
|
|
82
81
|
if not mlrun.mlconf.is_ce_mode():
|
|
83
82
|
self._initialize_v3io_configurations(
|
|
84
83
|
model_monitoring_access_key=model_monitoring_access_key
|
|
@@ -139,29 +138,29 @@ class EventStreamProcessor:
|
|
|
139
138
|
|
|
140
139
|
def apply_monitoring_serving_graph(self, fn: mlrun.runtimes.ServingRuntime) -> None:
|
|
141
140
|
"""
|
|
142
|
-
Apply monitoring serving graph to a given serving function. The following serving graph includes about
|
|
143
|
-
of different operations that are executed on the events from
|
|
144
|
-
|
|
145
|
-
|
|
146
|
-
|
|
147
|
-
|
|
148
|
-
|
|
149
|
-
|
|
150
|
-
|
|
151
|
-
|
|
152
|
-
|
|
153
|
-
|
|
154
|
-
|
|
155
|
-
|
|
141
|
+
Apply monitoring serving graph to a given serving function. The following serving graph includes about 4 main
|
|
142
|
+
parts, each of which includes several steps of different operations that are executed on the events from
|
|
143
|
+
the model server.
|
|
144
|
+
Each event has metadata (function_uri, timestamp, class, etc.) but also inputs, predictions and optional
|
|
145
|
+
metrics from the model server.
|
|
146
|
+
In the first part, the serving graph processes the event and splits it into sub-events. This part also includes
|
|
147
|
+
validation of the event data and adding important details to the event such as endpoint_id.
|
|
148
|
+
In the next parts, the serving graph stores data to 3 different targets:
|
|
149
|
+
1. KV/SQL: Metadata and basic stats about the average latency and the amount of predictions over
|
|
150
|
+
time per endpoint. For example, the amount of predictions of endpoint x in the last 5 min. The model
|
|
151
|
+
endpoints table also contains data on the model endpoint from other processes, such as feature_stats that
|
|
152
|
+
represents sample statistics from the training data. If the target is of type KV, then the model endpoints
|
|
153
|
+
table can be found under v3io:///users/pipelines/project-name/model-endpoints/endpoints/. If the target is
|
|
154
|
+
SQL, then the table is stored within the database that was defined in the provided connection string.
|
|
155
|
+
2. TSDB: live data of different key metric dictionaries in tsdb target.
|
|
156
|
+
This data is being used by the monitoring dashboards in grafana. If using V3IO TSDB, results
|
|
156
157
|
can be found under v3io:///users/pipelines/project-name/model-endpoints/events/. In that case, we generate
|
|
157
158
|
3 different key metric dictionaries: base_metrics (average latency and predictions over time),
|
|
158
159
|
endpoint_features (Prediction and feature names and values), and custom_metrics (user-defined metrics).
|
|
159
|
-
|
|
160
|
-
|
|
161
|
-
|
|
162
|
-
|
|
163
|
-
mlrun.mlconf.model_endpoint_monitoring.offline. Otherwise, the default parquet path is under
|
|
164
|
-
mlrun.mlconf.model_endpoint_monitoring.user_space.
|
|
160
|
+
3. Parquet: This Parquet file includes the required data for the model monitoring applications. If defined,
|
|
161
|
+
the parquet target path can be found under mlrun.mlconf.model_endpoint_monitoring.offline. Otherwise,
|
|
162
|
+
the default parquet path is under mlrun.mlconf.model_endpoint_monitoring.user_space. Note that if you are
|
|
163
|
+
using CE, the parquet target path is based on the defined MLRun artifact path.
|
|
165
164
|
|
|
166
165
|
:param fn: A serving function.
|
|
167
166
|
"""
|
|
@@ -171,7 +170,7 @@ class EventStreamProcessor:
|
|
|
171
170
|
fn.set_topology(mlrun.serving.states.StepKinds.flow),
|
|
172
171
|
)
|
|
173
172
|
|
|
174
|
-
#
|
|
173
|
+
# Event routing based on the provided path
|
|
175
174
|
def apply_event_routing():
|
|
176
175
|
typing.cast(
|
|
177
176
|
mlrun.serving.TaskStep,
|
|
@@ -184,7 +183,7 @@ class EventStreamProcessor:
|
|
|
184
183
|
|
|
185
184
|
apply_event_routing()
|
|
186
185
|
|
|
187
|
-
#
|
|
186
|
+
# Filter out events with '-' in the path basename from going forward
|
|
188
187
|
# through the next steps of the stream graph
|
|
189
188
|
def apply_storey_filter_stream_events():
|
|
190
189
|
# Filter events with Prometheus endpoints path
|
|
@@ -197,7 +196,7 @@ class EventStreamProcessor:
|
|
|
197
196
|
|
|
198
197
|
apply_storey_filter_stream_events()
|
|
199
198
|
|
|
200
|
-
#
|
|
199
|
+
# Process endpoint event: splitting into sub-events and validate event data
|
|
201
200
|
def apply_process_endpoint_event():
|
|
202
201
|
graph.add_step(
|
|
203
202
|
"ProcessEndpointEvent",
|
|
@@ -208,7 +207,7 @@ class EventStreamProcessor:
|
|
|
208
207
|
|
|
209
208
|
apply_process_endpoint_event()
|
|
210
209
|
|
|
211
|
-
#
|
|
210
|
+
# Applying Storey operations of filtering and flatten
|
|
212
211
|
def apply_storey_filter_and_flatmap():
|
|
213
212
|
# Remove none values from each event
|
|
214
213
|
graph.add_step(
|
|
@@ -225,7 +224,7 @@ class EventStreamProcessor:
|
|
|
225
224
|
|
|
226
225
|
apply_storey_filter_and_flatmap()
|
|
227
226
|
|
|
228
|
-
#
|
|
227
|
+
# Validating feature names and map each feature to its value
|
|
229
228
|
def apply_map_feature_names():
|
|
230
229
|
graph.add_step(
|
|
231
230
|
"MapFeatureNames",
|
|
@@ -237,9 +236,9 @@ class EventStreamProcessor:
|
|
|
237
236
|
|
|
238
237
|
apply_map_feature_names()
|
|
239
238
|
|
|
240
|
-
#
|
|
239
|
+
# Calculate number of predictions and average latency
|
|
241
240
|
def apply_storey_aggregations():
|
|
242
|
-
#
|
|
241
|
+
# Calculate number of predictions for each window (5 min and 1 hour by default)
|
|
243
242
|
graph.add_step(
|
|
244
243
|
class_name="storey.AggregateByKey",
|
|
245
244
|
aggregates=[
|
|
@@ -257,7 +256,7 @@ class EventStreamProcessor:
|
|
|
257
256
|
table=".",
|
|
258
257
|
key_field=EventFieldType.ENDPOINT_ID,
|
|
259
258
|
)
|
|
260
|
-
#
|
|
259
|
+
# Calculate average latency time for each window (5 min and 1 hour by default)
|
|
261
260
|
graph.add_step(
|
|
262
261
|
class_name="storey.Rename",
|
|
263
262
|
mapping={
|
|
@@ -270,8 +269,8 @@ class EventStreamProcessor:
|
|
|
270
269
|
|
|
271
270
|
apply_storey_aggregations()
|
|
272
271
|
|
|
273
|
-
#
|
|
274
|
-
#
|
|
272
|
+
# KV/SQL branch
|
|
273
|
+
# Filter relevant keys from the event before writing the data into the database table
|
|
275
274
|
def apply_process_before_endpoint_update():
|
|
276
275
|
graph.add_step(
|
|
277
276
|
"ProcessBeforeEndpointUpdate",
|
|
@@ -281,7 +280,7 @@ class EventStreamProcessor:
|
|
|
281
280
|
|
|
282
281
|
apply_process_before_endpoint_update()
|
|
283
282
|
|
|
284
|
-
#
|
|
283
|
+
# Write the filtered event to KV/SQL table. At this point, the serving graph updates the stats
|
|
285
284
|
# about average latency and the amount of predictions over time
|
|
286
285
|
def apply_update_endpoint():
|
|
287
286
|
graph.add_step(
|
|
@@ -294,7 +293,7 @@ class EventStreamProcessor:
|
|
|
294
293
|
|
|
295
294
|
apply_update_endpoint()
|
|
296
295
|
|
|
297
|
-
#
|
|
296
|
+
# (only for V3IO KV target) - Apply infer_schema on the model endpoints table for generating schema file
|
|
298
297
|
# which will be used by Grafana monitoring dashboards
|
|
299
298
|
def apply_infer_schema():
|
|
300
299
|
graph.add_step(
|
|
@@ -309,7 +308,7 @@ class EventStreamProcessor:
|
|
|
309
308
|
if self.model_endpoint_store_target == ModelEndpointTarget.V3IO_NOSQL:
|
|
310
309
|
apply_infer_schema()
|
|
311
310
|
|
|
312
|
-
#
|
|
311
|
+
# Emits the event in window size of events based on sample_window size (10 by default)
|
|
313
312
|
def apply_storey_sample_window():
|
|
314
313
|
graph.add_step(
|
|
315
314
|
"storey.steps.SampleWindow",
|
|
@@ -321,84 +320,18 @@ class EventStreamProcessor:
|
|
|
321
320
|
|
|
322
321
|
apply_storey_sample_window()
|
|
323
322
|
|
|
324
|
-
#
|
|
325
|
-
# Steps 20-21 - Prometheus branch
|
|
323
|
+
# TSDB branch (skip to Prometheus if in CE env)
|
|
326
324
|
if not mlrun.mlconf.is_ce_mode():
|
|
327
325
|
# TSDB branch
|
|
328
|
-
|
|
329
|
-
|
|
330
|
-
# stats and details about the events
|
|
331
|
-
def apply_process_before_tsdb():
|
|
332
|
-
graph.add_step(
|
|
333
|
-
"ProcessBeforeTSDB", name="ProcessBeforeTSDB", after="sample"
|
|
334
|
-
)
|
|
335
|
-
|
|
336
|
-
apply_process_before_tsdb()
|
|
337
|
-
|
|
338
|
-
# Steps 13-19: - Unpacked keys from each dictionary and write to TSDB target
|
|
339
|
-
def apply_filter_and_unpacked_keys(name, keys):
|
|
340
|
-
graph.add_step(
|
|
341
|
-
"FilterAndUnpackKeys",
|
|
342
|
-
name=name,
|
|
343
|
-
after="ProcessBeforeTSDB",
|
|
344
|
-
keys=[keys],
|
|
345
|
-
)
|
|
346
|
-
|
|
347
|
-
def apply_tsdb_target(name, after):
|
|
348
|
-
graph.add_step(
|
|
349
|
-
"storey.TSDBTarget",
|
|
350
|
-
name=name,
|
|
351
|
-
after=after,
|
|
352
|
-
path=self.tsdb_path,
|
|
353
|
-
rate="10/m",
|
|
354
|
-
time_col=EventFieldType.TIMESTAMP,
|
|
355
|
-
container=self.tsdb_container,
|
|
356
|
-
v3io_frames=self.v3io_framesd,
|
|
357
|
-
infer_columns_from_data=True,
|
|
358
|
-
index_cols=[
|
|
359
|
-
EventFieldType.ENDPOINT_ID,
|
|
360
|
-
EventFieldType.RECORD_TYPE,
|
|
361
|
-
EventFieldType.ENDPOINT_TYPE,
|
|
362
|
-
],
|
|
363
|
-
max_events=self.tsdb_batching_max_events,
|
|
364
|
-
flush_after_seconds=self.tsdb_batching_timeout_secs,
|
|
365
|
-
key=EventFieldType.ENDPOINT_ID,
|
|
366
|
-
)
|
|
367
|
-
|
|
368
|
-
# Steps 13-14 - unpacked base_metrics dictionary
|
|
369
|
-
apply_filter_and_unpacked_keys(
|
|
370
|
-
name="FilterAndUnpackKeys1",
|
|
371
|
-
keys=EventKeyMetrics.BASE_METRICS,
|
|
372
|
-
)
|
|
373
|
-
apply_tsdb_target(name="tsdb1", after="FilterAndUnpackKeys1")
|
|
374
|
-
|
|
375
|
-
# Steps 15-16 - unpacked endpoint_features dictionary
|
|
376
|
-
apply_filter_and_unpacked_keys(
|
|
377
|
-
name="FilterAndUnpackKeys2",
|
|
378
|
-
keys=EventKeyMetrics.ENDPOINT_FEATURES,
|
|
379
|
-
)
|
|
380
|
-
apply_tsdb_target(name="tsdb2", after="FilterAndUnpackKeys2")
|
|
381
|
-
|
|
382
|
-
# Steps 17-19 - unpacked custom_metrics dictionary. In addition, use storey.Filter remove none values
|
|
383
|
-
apply_filter_and_unpacked_keys(
|
|
384
|
-
name="FilterAndUnpackKeys3",
|
|
385
|
-
keys=EventKeyMetrics.CUSTOM_METRICS,
|
|
326
|
+
tsdb_connector = mlrun.model_monitoring.get_tsdb_connector(
|
|
327
|
+
project=self.project,
|
|
386
328
|
)
|
|
329
|
+
tsdb_connector.apply_monitoring_stream_steps(graph=graph)
|
|
387
330
|
|
|
388
|
-
def apply_storey_filter():
|
|
389
|
-
graph.add_step(
|
|
390
|
-
"storey.Filter",
|
|
391
|
-
"FilterNotNone",
|
|
392
|
-
after="FilterAndUnpackKeys3",
|
|
393
|
-
_fn="(event is not None)",
|
|
394
|
-
)
|
|
395
|
-
|
|
396
|
-
apply_storey_filter()
|
|
397
|
-
apply_tsdb_target(name="tsdb3", after="FilterNotNone")
|
|
398
331
|
else:
|
|
399
|
-
# Prometheus
|
|
332
|
+
# Prometheus
|
|
400
333
|
|
|
401
|
-
#
|
|
334
|
+
# Increase the prediction counter by 1 and update the latency value
|
|
402
335
|
graph.add_step(
|
|
403
336
|
"IncCounter",
|
|
404
337
|
name="IncCounter",
|
|
@@ -406,7 +339,7 @@ class EventStreamProcessor:
|
|
|
406
339
|
project=self.project,
|
|
407
340
|
)
|
|
408
341
|
|
|
409
|
-
#
|
|
342
|
+
# Record a sample of features and labels
|
|
410
343
|
def apply_record_features_to_prometheus():
|
|
411
344
|
graph.add_step(
|
|
412
345
|
"RecordFeatures",
|
|
@@ -417,8 +350,8 @@ class EventStreamProcessor:
|
|
|
417
350
|
|
|
418
351
|
apply_record_features_to_prometheus()
|
|
419
352
|
|
|
420
|
-
#
|
|
421
|
-
#
|
|
353
|
+
# Parquet branch
|
|
354
|
+
# Filter and validate different keys before writing the data to Parquet target
|
|
422
355
|
def apply_process_before_parquet():
|
|
423
356
|
graph.add_step(
|
|
424
357
|
"ProcessBeforeParquet",
|
|
@@ -429,7 +362,7 @@ class EventStreamProcessor:
|
|
|
429
362
|
|
|
430
363
|
apply_process_before_parquet()
|
|
431
364
|
|
|
432
|
-
#
|
|
365
|
+
# Write the Parquet target file, partitioned by key (endpoint_id) and time.
|
|
433
366
|
def apply_parquet_target():
|
|
434
367
|
graph.add_step(
|
|
435
368
|
"storey.ParquetTarget",
|
|
@@ -503,76 +436,6 @@ class ProcessBeforeEndpointUpdate(mlrun.feature_store.steps.MapClass):
|
|
|
503
436
|
return e
|
|
504
437
|
|
|
505
438
|
|
|
506
|
-
class ProcessBeforeTSDB(mlrun.feature_store.steps.MapClass):
|
|
507
|
-
def __init__(self, **kwargs):
|
|
508
|
-
"""
|
|
509
|
-
Process the data before writing to TSDB. This step creates a dictionary that includes 3 different dictionaries
|
|
510
|
-
that each one of them contains important details and stats about the events:
|
|
511
|
-
1. base_metrics: stats about the average latency and the amount of predictions over time. It is based on
|
|
512
|
-
storey.AggregateByKey which was executed in step 5.
|
|
513
|
-
2. endpoint_features: feature names and values along with the prediction names and value.
|
|
514
|
-
3. custom_metric (opt): optional metrics provided by the user.
|
|
515
|
-
|
|
516
|
-
:returns: Dictionary of 2-3 dictionaries that contains stats and details about the events.
|
|
517
|
-
|
|
518
|
-
"""
|
|
519
|
-
super().__init__(**kwargs)
|
|
520
|
-
|
|
521
|
-
def do(self, event):
|
|
522
|
-
# Compute prediction per second
|
|
523
|
-
event[EventLiveStats.PREDICTIONS_PER_SECOND] = (
|
|
524
|
-
float(event[EventLiveStats.PREDICTIONS_COUNT_5M]) / 300
|
|
525
|
-
)
|
|
526
|
-
base_fields = [
|
|
527
|
-
EventFieldType.TIMESTAMP,
|
|
528
|
-
EventFieldType.ENDPOINT_ID,
|
|
529
|
-
EventFieldType.ENDPOINT_TYPE,
|
|
530
|
-
]
|
|
531
|
-
|
|
532
|
-
# Getting event timestamp and endpoint_id
|
|
533
|
-
base_event = {k: event[k] for k in base_fields}
|
|
534
|
-
|
|
535
|
-
# base_metrics includes the stats about the average latency and the amount of predictions over time
|
|
536
|
-
base_metrics = {
|
|
537
|
-
EventFieldType.RECORD_TYPE: EventKeyMetrics.BASE_METRICS,
|
|
538
|
-
EventLiveStats.PREDICTIONS_PER_SECOND: event[
|
|
539
|
-
EventLiveStats.PREDICTIONS_PER_SECOND
|
|
540
|
-
],
|
|
541
|
-
EventLiveStats.PREDICTIONS_COUNT_5M: event[
|
|
542
|
-
EventLiveStats.PREDICTIONS_COUNT_5M
|
|
543
|
-
],
|
|
544
|
-
EventLiveStats.PREDICTIONS_COUNT_1H: event[
|
|
545
|
-
EventLiveStats.PREDICTIONS_COUNT_1H
|
|
546
|
-
],
|
|
547
|
-
EventLiveStats.LATENCY_AVG_5M: event[EventLiveStats.LATENCY_AVG_5M],
|
|
548
|
-
EventLiveStats.LATENCY_AVG_1H: event[EventLiveStats.LATENCY_AVG_1H],
|
|
549
|
-
**base_event,
|
|
550
|
-
}
|
|
551
|
-
|
|
552
|
-
# endpoint_features includes the event values of each feature and prediction
|
|
553
|
-
endpoint_features = {
|
|
554
|
-
EventFieldType.RECORD_TYPE: EventKeyMetrics.ENDPOINT_FEATURES,
|
|
555
|
-
**event[EventFieldType.NAMED_PREDICTIONS],
|
|
556
|
-
**event[EventFieldType.NAMED_FEATURES],
|
|
557
|
-
**base_event,
|
|
558
|
-
}
|
|
559
|
-
# Create a dictionary that includes both base_metrics and endpoint_features
|
|
560
|
-
processed = {
|
|
561
|
-
EventKeyMetrics.BASE_METRICS: base_metrics,
|
|
562
|
-
EventKeyMetrics.ENDPOINT_FEATURES: endpoint_features,
|
|
563
|
-
}
|
|
564
|
-
|
|
565
|
-
# If metrics provided, add another dictionary if custom_metrics values
|
|
566
|
-
if event[EventFieldType.METRICS]:
|
|
567
|
-
processed[EventKeyMetrics.CUSTOM_METRICS] = {
|
|
568
|
-
EventFieldType.RECORD_TYPE: EventKeyMetrics.CUSTOM_METRICS,
|
|
569
|
-
**event[EventFieldType.METRICS],
|
|
570
|
-
**base_event,
|
|
571
|
-
}
|
|
572
|
-
|
|
573
|
-
return processed
|
|
574
|
-
|
|
575
|
-
|
|
576
439
|
class ProcessBeforeParquet(mlrun.feature_store.steps.MapClass):
|
|
577
440
|
def __init__(self, **kwargs):
|
|
578
441
|
"""
|
|
@@ -807,7 +670,7 @@ class ProcessEndpointEvent(mlrun.feature_store.steps.MapClass):
|
|
|
807
670
|
# left them
|
|
808
671
|
if endpoint_id not in self.endpoints:
|
|
809
672
|
logger.info("Trying to resume state", endpoint_id=endpoint_id)
|
|
810
|
-
endpoint_record = get_endpoint_record(
|
|
673
|
+
endpoint_record = mlrun.model_monitoring.helpers.get_endpoint_record(
|
|
811
674
|
project=self.project,
|
|
812
675
|
endpoint_id=endpoint_id,
|
|
813
676
|
)
|
|
@@ -853,36 +716,6 @@ def is_not_none(field: typing.Any, dict_path: list[str]):
|
|
|
853
716
|
return False
|
|
854
717
|
|
|
855
718
|
|
|
856
|
-
class FilterAndUnpackKeys(mlrun.feature_store.steps.MapClass):
|
|
857
|
-
def __init__(self, keys, **kwargs):
|
|
858
|
-
"""
|
|
859
|
-
Create unpacked event dictionary based on provided key metrics (base_metrics, endpoint_features,
|
|
860
|
-
or custom_metric). Please note that the next step of the TSDB target requires an unpacked dictionary.
|
|
861
|
-
|
|
862
|
-
:param keys: list of key metrics.
|
|
863
|
-
|
|
864
|
-
:returns: An unpacked dictionary of event filtered by the provided key metrics.
|
|
865
|
-
"""
|
|
866
|
-
super().__init__(**kwargs)
|
|
867
|
-
self.keys = keys
|
|
868
|
-
|
|
869
|
-
def do(self, event):
|
|
870
|
-
# Keep only the relevant dictionary based on the provided keys
|
|
871
|
-
new_event = {}
|
|
872
|
-
for key in self.keys:
|
|
873
|
-
if key in event:
|
|
874
|
-
new_event[key] = event[key]
|
|
875
|
-
|
|
876
|
-
# Create unpacked dictionary
|
|
877
|
-
unpacked = {}
|
|
878
|
-
for key in new_event.keys():
|
|
879
|
-
if key in self.keys:
|
|
880
|
-
unpacked = {**unpacked, **new_event[key]}
|
|
881
|
-
else:
|
|
882
|
-
unpacked[key] = new_event[key]
|
|
883
|
-
return unpacked if unpacked else None
|
|
884
|
-
|
|
885
|
-
|
|
886
719
|
class MapFeatureNames(mlrun.feature_store.steps.MapClass):
|
|
887
720
|
def __init__(
|
|
888
721
|
self,
|
|
@@ -940,7 +773,7 @@ class MapFeatureNames(mlrun.feature_store.steps.MapClass):
|
|
|
940
773
|
label_values = event[EventFieldType.PREDICTION]
|
|
941
774
|
# Get feature names and label columns
|
|
942
775
|
if endpoint_id not in self.feature_names:
|
|
943
|
-
endpoint_record = get_endpoint_record(
|
|
776
|
+
endpoint_record = mlrun.model_monitoring.helpers.get_endpoint_record(
|
|
944
777
|
project=self.project,
|
|
945
778
|
endpoint_id=endpoint_id,
|
|
946
779
|
)
|
|
@@ -1118,6 +951,8 @@ class InferSchema(mlrun.feature_store.steps.MapClass):
|
|
|
1118
951
|
def do(self, event: dict):
|
|
1119
952
|
key_set = set(event.keys())
|
|
1120
953
|
if not key_set.issubset(self.keys):
|
|
954
|
+
import mlrun.utils.v3io_clients
|
|
955
|
+
|
|
1121
956
|
self.keys.update(key_set)
|
|
1122
957
|
# Apply infer_schema on the kv table for generating the schema file
|
|
1123
958
|
mlrun.utils.v3io_clients.get_frames_client(
|