mlrun 1.7.0rc34__py3-none-any.whl → 1.7.0rc35__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of mlrun might be problematic. Click here for more details.
- mlrun/artifacts/base.py +1 -0
- mlrun/common/schemas/__init__.py +0 -1
- mlrun/common/schemas/model_monitoring/__init__.py +1 -2
- mlrun/common/schemas/model_monitoring/constants.py +3 -16
- mlrun/common/schemas/notification.py +1 -1
- mlrun/common/types.py +1 -0
- mlrun/config.py +6 -7
- mlrun/datastore/sources.py +8 -4
- mlrun/db/base.py +2 -3
- mlrun/db/httpdb.py +3 -3
- mlrun/model.py +1 -1
- mlrun/model_monitoring/applications/evidently_base.py +4 -5
- mlrun/model_monitoring/db/stores/sqldb/sql_store.py +5 -0
- mlrun/model_monitoring/db/stores/v3io_kv/kv_store.py +2 -2
- mlrun/model_monitoring/db/tsdb/base.py +6 -3
- mlrun/model_monitoring/db/tsdb/tdengine/stream_graph_steps.py +0 -3
- mlrun/model_monitoring/db/tsdb/v3io/stream_graph_steps.py +22 -3
- mlrun/model_monitoring/stream_processing.py +5 -153
- mlrun/projects/pipelines.py +76 -73
- mlrun/run.py +4 -0
- mlrun/runtimes/nuclio/application/application.py +25 -2
- mlrun/runtimes/nuclio/function.py +5 -0
- mlrun/runtimes/nuclio/serving.py +1 -1
- mlrun/runtimes/pod.py +2 -4
- mlrun/serving/states.py +3 -1
- mlrun/utils/helpers.py +27 -14
- mlrun/utils/version/version.json +2 -2
- {mlrun-1.7.0rc34.dist-info → mlrun-1.7.0rc35.dist-info}/METADATA +3 -1
- {mlrun-1.7.0rc34.dist-info → mlrun-1.7.0rc35.dist-info}/RECORD +33 -34
- {mlrun-1.7.0rc34.dist-info → mlrun-1.7.0rc35.dist-info}/WHEEL +1 -1
- mlrun/model_monitoring/prometheus.py +0 -216
- {mlrun-1.7.0rc34.dist-info → mlrun-1.7.0rc35.dist-info}/LICENSE +0 -0
- {mlrun-1.7.0rc34.dist-info → mlrun-1.7.0rc35.dist-info}/entry_points.txt +0 -0
- {mlrun-1.7.0rc34.dist-info → mlrun-1.7.0rc35.dist-info}/top_level.txt +0 -0
mlrun/artifacts/base.py
CHANGED
mlrun/common/schemas/__init__.py
CHANGED
|
@@ -13,6 +13,7 @@
|
|
|
13
13
|
# limitations under the License.
|
|
14
14
|
|
|
15
15
|
from .constants import (
|
|
16
|
+
V3IO_MODEL_MONITORING_DB,
|
|
16
17
|
ControllerPolicy,
|
|
17
18
|
DriftStatus,
|
|
18
19
|
EndpointType,
|
|
@@ -31,8 +32,6 @@ from .constants import (
|
|
|
31
32
|
MonitoringFunctionNames,
|
|
32
33
|
PredictionsQueryConstants,
|
|
33
34
|
ProjectSecretKeys,
|
|
34
|
-
PrometheusEndpoints,
|
|
35
|
-
PrometheusMetric,
|
|
36
35
|
ResultData,
|
|
37
36
|
ResultKindApp,
|
|
38
37
|
SchedulingKeys,
|
|
@@ -170,7 +170,6 @@ class StreamKind(MonitoringStrEnum):
|
|
|
170
170
|
class TSDBTarget(MonitoringStrEnum):
|
|
171
171
|
V3IO_TSDB = "v3io-tsdb"
|
|
172
172
|
TDEngine = "tdengine"
|
|
173
|
-
PROMETHEUS = "prometheus"
|
|
174
173
|
|
|
175
174
|
|
|
176
175
|
class ProjectSecretKeys:
|
|
@@ -231,21 +230,6 @@ class EndpointType(IntEnum):
|
|
|
231
230
|
LEAF_EP = 3 # end point that is a child of a router
|
|
232
231
|
|
|
233
232
|
|
|
234
|
-
class PrometheusMetric:
|
|
235
|
-
PREDICTIONS_TOTAL = "predictions_total"
|
|
236
|
-
MODEL_LATENCY_SECONDS = "model_latency_seconds"
|
|
237
|
-
INCOME_FEATURES = "income_features"
|
|
238
|
-
ERRORS_TOTAL = "errors_total"
|
|
239
|
-
DRIFT_METRICS = "drift_metrics"
|
|
240
|
-
DRIFT_STATUS = "drift_status"
|
|
241
|
-
|
|
242
|
-
|
|
243
|
-
class PrometheusEndpoints(MonitoringStrEnum):
|
|
244
|
-
MODEL_MONITORING_METRICS = "/model-monitoring-metrics"
|
|
245
|
-
MONITORING_BATCH_METRICS = "/monitoring-batch-metrics"
|
|
246
|
-
MONITORING_DRIFT_STATUS = "/monitoring-drift-status"
|
|
247
|
-
|
|
248
|
-
|
|
249
233
|
class MonitoringFunctionNames(MonitoringStrEnum):
|
|
250
234
|
STREAM = "model-monitoring-stream"
|
|
251
235
|
APPLICATION_CONTROLLER = "model-monitoring-controller"
|
|
@@ -381,3 +365,6 @@ class SpecialApps:
|
|
|
381
365
|
|
|
382
366
|
|
|
383
367
|
_RESERVED_FUNCTION_NAMES = MonitoringFunctionNames.list() + [SpecialApps.MLRUN_INFRA]
|
|
368
|
+
|
|
369
|
+
|
|
370
|
+
V3IO_MODEL_MONITORING_DB = "v3io"
|
|
@@ -55,7 +55,7 @@ class Notification(pydantic.BaseModel):
|
|
|
55
55
|
message: str
|
|
56
56
|
severity: NotificationSeverity
|
|
57
57
|
when: list[str]
|
|
58
|
-
condition: str
|
|
58
|
+
condition: str = None
|
|
59
59
|
params: dict[str, typing.Any] = None
|
|
60
60
|
status: NotificationStatus = None
|
|
61
61
|
sent_time: typing.Union[str, datetime.datetime] = None
|
mlrun/common/types.py
CHANGED
mlrun/config.py
CHANGED
|
@@ -485,10 +485,10 @@ default_config = {
|
|
|
485
485
|
# pip install <requirement_specifier>, e.g. mlrun==0.5.4, mlrun~=0.5,
|
|
486
486
|
# git+https://github.com/mlrun/mlrun@development. by default uses the version
|
|
487
487
|
"mlrun_version_specifier": "",
|
|
488
|
-
"kaniko_image": "gcr.io/kaniko-project/executor:v1.
|
|
488
|
+
"kaniko_image": "gcr.io/kaniko-project/executor:v1.23.2", # kaniko builder image
|
|
489
489
|
"kaniko_init_container_image": "alpine:3.18",
|
|
490
490
|
# image for kaniko init container when docker registry is ECR
|
|
491
|
-
"kaniko_aws_cli_image": "amazon/aws-cli:2.
|
|
491
|
+
"kaniko_aws_cli_image": "amazon/aws-cli:2.17.16",
|
|
492
492
|
# kaniko sometimes fails to get filesystem from image, this is a workaround to retry the process
|
|
493
493
|
# a known issue in Kaniko - https://github.com/GoogleContainerTools/kaniko/issues/1717
|
|
494
494
|
"kaniko_image_fs_extraction_retries": "3",
|
|
@@ -1237,12 +1237,11 @@ class Config:
|
|
|
1237
1237
|
|
|
1238
1238
|
return storage_options
|
|
1239
1239
|
|
|
1240
|
-
def
|
|
1241
|
-
if not version:
|
|
1242
|
-
version = self.nuclio_version
|
|
1240
|
+
def is_explicit_ack_enabled(self) -> bool:
|
|
1243
1241
|
return self.httpdb.nuclio.explicit_ack == "enabled" and (
|
|
1244
|
-
not
|
|
1245
|
-
or semver.VersionInfo.parse(
|
|
1242
|
+
not self.nuclio_version
|
|
1243
|
+
or semver.VersionInfo.parse(self.nuclio_version)
|
|
1244
|
+
>= semver.VersionInfo.parse("1.12.10")
|
|
1246
1245
|
)
|
|
1247
1246
|
|
|
1248
1247
|
|
mlrun/datastore/sources.py
CHANGED
|
@@ -85,7 +85,8 @@ class BaseSourceDriver(DataSource):
|
|
|
85
85
|
)
|
|
86
86
|
|
|
87
87
|
explicit_ack = (
|
|
88
|
-
is_explicit_ack_supported(context)
|
|
88
|
+
is_explicit_ack_supported(context)
|
|
89
|
+
and mlrun.mlconf.is_explicit_ack_enabled()
|
|
89
90
|
)
|
|
90
91
|
return storey.SyncEmitSource(
|
|
91
92
|
context=context,
|
|
@@ -944,7 +945,8 @@ class OnlineSource(BaseSourceDriver):
|
|
|
944
945
|
|
|
945
946
|
source_args = self.attributes.get("source_args", {})
|
|
946
947
|
explicit_ack = (
|
|
947
|
-
is_explicit_ack_supported(context)
|
|
948
|
+
is_explicit_ack_supported(context)
|
|
949
|
+
and mlrun.mlconf.is_explicit_ack_enabled()
|
|
948
950
|
)
|
|
949
951
|
# TODO: Change to AsyncEmitSource once we can drop support for nuclio<1.12.10
|
|
950
952
|
src_class = storey.SyncEmitSource(
|
|
@@ -1029,7 +1031,8 @@ class StreamSource(OnlineSource):
|
|
|
1029
1031
|
engine = "async"
|
|
1030
1032
|
if hasattr(function.spec, "graph") and function.spec.graph.engine:
|
|
1031
1033
|
engine = function.spec.graph.engine
|
|
1032
|
-
|
|
1034
|
+
|
|
1035
|
+
if mlrun.mlconf.is_explicit_ack_enabled() and engine == "async":
|
|
1033
1036
|
kwargs["explicit_ack_mode"] = "explicitOnly"
|
|
1034
1037
|
kwargs["worker_allocation_mode"] = "static"
|
|
1035
1038
|
|
|
@@ -1116,7 +1119,8 @@ class KafkaSource(OnlineSource):
|
|
|
1116
1119
|
engine = "async"
|
|
1117
1120
|
if hasattr(function.spec, "graph") and function.spec.graph.engine:
|
|
1118
1121
|
engine = function.spec.graph.engine
|
|
1119
|
-
|
|
1122
|
+
|
|
1123
|
+
if mlrun.mlconf.is_explicit_ack_enabled() and engine == "async":
|
|
1120
1124
|
explicit_ack_mode = "explicitOnly"
|
|
1121
1125
|
extra_attributes["workerAllocationMode"] = extra_attributes.get(
|
|
1122
1126
|
"worker_allocation_mode", "static"
|
mlrun/db/base.py
CHANGED
|
@@ -242,9 +242,8 @@ class RunDBInterface(ABC):
|
|
|
242
242
|
)
|
|
243
243
|
artifact_identifiers.append(
|
|
244
244
|
mlrun.common.schemas.ArtifactIdentifier(
|
|
245
|
-
key
|
|
246
|
-
|
|
247
|
-
# pass the tree as uid
|
|
245
|
+
# we pass the db_key and not the key so the API will be able to find the artifact in the db
|
|
246
|
+
key=mlrun.utils.get_in_artifact(artifact_obj, "db_key"),
|
|
248
247
|
uid=mlrun.utils.get_in_artifact(artifact_obj, "uid"),
|
|
249
248
|
producer_id=mlrun.utils.get_in_artifact(artifact_obj, "tree"),
|
|
250
249
|
kind=mlrun.utils.get_in_artifact(artifact_obj, "kind"),
|
mlrun/db/httpdb.py
CHANGED
|
@@ -1015,7 +1015,7 @@ class HTTPRunDB(RunDBInterface):
|
|
|
1015
1015
|
"format": format_,
|
|
1016
1016
|
"tag": tag,
|
|
1017
1017
|
"tree": tree,
|
|
1018
|
-
"
|
|
1018
|
+
"object-uid": uid,
|
|
1019
1019
|
}
|
|
1020
1020
|
if iter is not None:
|
|
1021
1021
|
params["iter"] = str(iter)
|
|
@@ -1051,7 +1051,7 @@ class HTTPRunDB(RunDBInterface):
|
|
|
1051
1051
|
"key": key,
|
|
1052
1052
|
"tag": tag,
|
|
1053
1053
|
"tree": tree,
|
|
1054
|
-
"
|
|
1054
|
+
"object-uid": uid,
|
|
1055
1055
|
"iter": iter,
|
|
1056
1056
|
"deletion_strategy": deletion_strategy,
|
|
1057
1057
|
}
|
|
@@ -3380,7 +3380,7 @@ class HTTPRunDB(RunDBInterface):
|
|
|
3380
3380
|
By default, the image is mlrun/mlrun.
|
|
3381
3381
|
"""
|
|
3382
3382
|
self.api_call(
|
|
3383
|
-
method=mlrun.common.types.HTTPMethod.
|
|
3383
|
+
method=mlrun.common.types.HTTPMethod.PATCH,
|
|
3384
3384
|
path=f"projects/{project}/model-monitoring/model-monitoring-controller",
|
|
3385
3385
|
params={
|
|
3386
3386
|
"base_period": base_period,
|
mlrun/model.py
CHANGED
|
@@ -754,7 +754,7 @@ class Notification(ModelObj):
|
|
|
754
754
|
"Both 'secret_params' and 'params' are empty, at least one must be defined."
|
|
755
755
|
)
|
|
756
756
|
|
|
757
|
-
notification_class.validate_params(secret_params
|
|
757
|
+
notification_class.validate_params(secret_params | params)
|
|
758
758
|
|
|
759
759
|
@staticmethod
|
|
760
760
|
def validate_notification_uniqueness(notifications: list["Notification"]):
|
|
@@ -23,7 +23,7 @@ import mlrun.model_monitoring.applications.base as mm_base
|
|
|
23
23
|
import mlrun.model_monitoring.applications.context as mm_context
|
|
24
24
|
from mlrun.errors import MLRunIncompatibleVersionError
|
|
25
25
|
|
|
26
|
-
SUPPORTED_EVIDENTLY_VERSION = semver.Version.parse("0.4.
|
|
26
|
+
SUPPORTED_EVIDENTLY_VERSION = semver.Version.parse("0.4.32")
|
|
27
27
|
|
|
28
28
|
|
|
29
29
|
def _check_evidently_version(*, cur: semver.Version, ref: semver.Version) -> None:
|
|
@@ -57,12 +57,11 @@ except ModuleNotFoundError:
|
|
|
57
57
|
|
|
58
58
|
|
|
59
59
|
if _HAS_EVIDENTLY:
|
|
60
|
-
from evidently.renderers.notebook_utils import determine_template
|
|
61
60
|
from evidently.report.report import Report
|
|
62
61
|
from evidently.suite.base_suite import Suite
|
|
63
62
|
from evidently.ui.type_aliases import STR_UUID
|
|
64
63
|
from evidently.ui.workspace import Workspace
|
|
65
|
-
from evidently.utils.dashboard import TemplateParams
|
|
64
|
+
from evidently.utils.dashboard import TemplateParams, file_html_template
|
|
66
65
|
|
|
67
66
|
|
|
68
67
|
class EvidentlyModelMonitoringApplicationBase(mm_base.ModelMonitoringApplicationBase):
|
|
@@ -123,7 +122,7 @@ class EvidentlyModelMonitoringApplicationBase(mm_base.ModelMonitoringApplication
|
|
|
123
122
|
additional_graphs={},
|
|
124
123
|
)
|
|
125
124
|
|
|
126
|
-
dashboard_html = self._render(
|
|
125
|
+
dashboard_html = self._render(file_html_template, template_params)
|
|
127
126
|
self.context.log_artifact(
|
|
128
127
|
artifact_name, body=dashboard_html.encode("utf-8"), format="html"
|
|
129
128
|
)
|
|
@@ -201,7 +200,7 @@ class EvidentlyModelMonitoringApplicationBaseV2(
|
|
|
201
200
|
additional_graphs={},
|
|
202
201
|
)
|
|
203
202
|
|
|
204
|
-
dashboard_html = self._render(
|
|
203
|
+
dashboard_html = self._render(file_html_template, template_params)
|
|
205
204
|
monitoring_context.log_artifact(
|
|
206
205
|
artifact_name, body=dashboard_html.encode("utf-8"), format="html"
|
|
207
206
|
)
|
|
@@ -177,6 +177,11 @@ class SQLStoreBase(StoreBase):
|
|
|
177
177
|
:param table: SQLAlchemy declarative table.
|
|
178
178
|
:param criteria: A list of binary expressions that filter the query.
|
|
179
179
|
"""
|
|
180
|
+
if not self._engine.has_table(table.__tablename__):
|
|
181
|
+
logger.debug(
|
|
182
|
+
f"Table {table.__tablename__} does not exist in the database. Skipping deletion."
|
|
183
|
+
)
|
|
184
|
+
return
|
|
180
185
|
with create_session(dsn=self._sql_connection_string) as session:
|
|
181
186
|
# Generate and commit the delete query
|
|
182
187
|
session.query(
|
|
@@ -408,14 +408,14 @@ class KVStoreBase(StoreBase):
|
|
|
408
408
|
|
|
409
409
|
"""
|
|
410
410
|
try:
|
|
411
|
-
|
|
411
|
+
response = self.client.kv.get(
|
|
412
412
|
container=self._get_monitoring_schedules_container(
|
|
413
413
|
project_name=self.project
|
|
414
414
|
),
|
|
415
415
|
table_path=endpoint_id,
|
|
416
416
|
key=application_name,
|
|
417
417
|
)
|
|
418
|
-
return
|
|
418
|
+
return response.output.item[mm_schemas.SchedulingKeys.LAST_ANALYZED]
|
|
419
419
|
except v3io.dataplane.response.HttpResponseError as err:
|
|
420
420
|
logger.debug("Error while getting last analyzed time", err=err)
|
|
421
421
|
raise mlrun.errors.MLRunNotFoundError(
|
|
@@ -27,7 +27,7 @@ from mlrun.utils import logger
|
|
|
27
27
|
class TSDBConnector(ABC):
|
|
28
28
|
type: typing.ClassVar[str]
|
|
29
29
|
|
|
30
|
-
def __init__(self, project: str):
|
|
30
|
+
def __init__(self, project: str) -> None:
|
|
31
31
|
"""
|
|
32
32
|
Initialize a new TSDB connector. The connector is used to interact with the TSDB and store monitoring data.
|
|
33
33
|
At the moment we have 3 different types of monitoring data:
|
|
@@ -42,10 +42,10 @@ class TSDBConnector(ABC):
|
|
|
42
42
|
writer.
|
|
43
43
|
|
|
44
44
|
:param project: the name of the project.
|
|
45
|
-
|
|
46
45
|
"""
|
|
47
46
|
self.project = project
|
|
48
47
|
|
|
48
|
+
@abstractmethod
|
|
49
49
|
def apply_monitoring_stream_steps(self, graph):
|
|
50
50
|
"""
|
|
51
51
|
Apply TSDB steps on the provided monitoring graph. Throughout these steps, the graph stores live data of
|
|
@@ -58,6 +58,7 @@ class TSDBConnector(ABC):
|
|
|
58
58
|
"""
|
|
59
59
|
pass
|
|
60
60
|
|
|
61
|
+
@abstractmethod
|
|
61
62
|
def write_application_event(
|
|
62
63
|
self,
|
|
63
64
|
event: dict,
|
|
@@ -69,13 +70,14 @@ class TSDBConnector(ABC):
|
|
|
69
70
|
:raise mlrun.errors.MLRunRuntimeError: If an error occurred while writing the event.
|
|
70
71
|
"""
|
|
71
72
|
|
|
73
|
+
@abstractmethod
|
|
72
74
|
def delete_tsdb_resources(self):
|
|
73
75
|
"""
|
|
74
76
|
Delete all project resources in the TSDB connector, such as model endpoints data and drift results.
|
|
75
77
|
"""
|
|
76
|
-
|
|
77
78
|
pass
|
|
78
79
|
|
|
80
|
+
@abstractmethod
|
|
79
81
|
def get_model_endpoint_real_time_metrics(
|
|
80
82
|
self,
|
|
81
83
|
endpoint_id: str,
|
|
@@ -102,6 +104,7 @@ class TSDBConnector(ABC):
|
|
|
102
104
|
"""
|
|
103
105
|
pass
|
|
104
106
|
|
|
107
|
+
@abstractmethod
|
|
105
108
|
def create_tables(self) -> None:
|
|
106
109
|
"""
|
|
107
110
|
Create the TSDB tables using the TSDB connector. At the moment we support 3 types of tables:
|
|
@@ -11,7 +11,6 @@
|
|
|
11
11
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
12
12
|
# See the License for the specific language governing permissions and
|
|
13
13
|
# limitations under the License.
|
|
14
|
-
#
|
|
15
14
|
|
|
16
15
|
import json
|
|
17
16
|
|
|
@@ -21,8 +20,6 @@ from mlrun.common.schemas.model_monitoring import (
|
|
|
21
20
|
EventKeyMetrics,
|
|
22
21
|
)
|
|
23
22
|
|
|
24
|
-
_TABLE_COLUMN = "table_column"
|
|
25
|
-
|
|
26
23
|
|
|
27
24
|
class ProcessBeforeTDEngine(mlrun.feature_store.steps.MapClass):
|
|
28
25
|
def __init__(self, **kwargs):
|
|
@@ -11,7 +11,8 @@
|
|
|
11
11
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
12
12
|
# See the License for the specific language governing permissions and
|
|
13
13
|
# limitations under the License.
|
|
14
|
-
|
|
14
|
+
|
|
15
|
+
from typing import Any
|
|
15
16
|
|
|
16
17
|
import mlrun.feature_store.steps
|
|
17
18
|
from mlrun.common.schemas.model_monitoring import (
|
|
@@ -21,6 +22,24 @@ from mlrun.common.schemas.model_monitoring import (
|
|
|
21
22
|
)
|
|
22
23
|
|
|
23
24
|
|
|
25
|
+
def _normalize_dict_for_v3io_frames(event: dict[str, Any]) -> dict[str, Any]:
|
|
26
|
+
"""
|
|
27
|
+
Normalize user defined keys - input data to a model and its predictions,
|
|
28
|
+
to a form V3IO frames tolerates.
|
|
29
|
+
|
|
30
|
+
The dictionary keys should conform to '^[a-zA-Z_:]([a-zA-Z0-9_:])*$'.
|
|
31
|
+
"""
|
|
32
|
+
prefix = "_"
|
|
33
|
+
|
|
34
|
+
def norm_key(key: str) -> str:
|
|
35
|
+
key = key.replace("-", "_") # hyphens `-` are not allowed
|
|
36
|
+
if key and key[0].isdigit(): # starting with a digit is not allowed
|
|
37
|
+
return prefix + key
|
|
38
|
+
return key
|
|
39
|
+
|
|
40
|
+
return {norm_key(k): v for k, v in event.items()}
|
|
41
|
+
|
|
42
|
+
|
|
24
43
|
class ProcessBeforeTSDB(mlrun.feature_store.steps.MapClass):
|
|
25
44
|
def __init__(self, **kwargs):
|
|
26
45
|
"""
|
|
@@ -68,8 +87,8 @@ class ProcessBeforeTSDB(mlrun.feature_store.steps.MapClass):
|
|
|
68
87
|
# endpoint_features includes the event values of each feature and prediction
|
|
69
88
|
endpoint_features = {
|
|
70
89
|
EventFieldType.RECORD_TYPE: EventKeyMetrics.ENDPOINT_FEATURES,
|
|
71
|
-
**event[EventFieldType.NAMED_PREDICTIONS],
|
|
72
|
-
**event[EventFieldType.NAMED_FEATURES],
|
|
90
|
+
**_normalize_dict_for_v3io_frames(event[EventFieldType.NAMED_PREDICTIONS]),
|
|
91
|
+
**_normalize_dict_for_v3io_frames(event[EventFieldType.NAMED_FEATURES]),
|
|
73
92
|
**base_event,
|
|
74
93
|
}
|
|
75
94
|
# Create a dictionary that includes both base_metrics and endpoint_features
|
|
@@ -27,7 +27,6 @@ import mlrun.datastore.targets
|
|
|
27
27
|
import mlrun.feature_store as fstore
|
|
28
28
|
import mlrun.feature_store.steps
|
|
29
29
|
import mlrun.model_monitoring.db
|
|
30
|
-
import mlrun.model_monitoring.prometheus
|
|
31
30
|
import mlrun.serving.states
|
|
32
31
|
import mlrun.utils
|
|
33
32
|
from mlrun.common.schemas.model_monitoring.constants import (
|
|
@@ -37,7 +36,6 @@ from mlrun.common.schemas.model_monitoring.constants import (
|
|
|
37
36
|
FileTargetKind,
|
|
38
37
|
ModelEndpointTarget,
|
|
39
38
|
ProjectSecretKeys,
|
|
40
|
-
PrometheusEndpoints,
|
|
41
39
|
)
|
|
42
40
|
from mlrun.utils import logger
|
|
43
41
|
|
|
@@ -172,39 +170,12 @@ class EventStreamProcessor:
|
|
|
172
170
|
fn.set_topology(mlrun.serving.states.StepKinds.flow),
|
|
173
171
|
)
|
|
174
172
|
|
|
175
|
-
# Event routing based on the provided path
|
|
176
|
-
def apply_event_routing():
|
|
177
|
-
typing.cast(
|
|
178
|
-
mlrun.serving.TaskStep,
|
|
179
|
-
graph.add_step(
|
|
180
|
-
"EventRouting",
|
|
181
|
-
full_event=True,
|
|
182
|
-
project=self.project,
|
|
183
|
-
),
|
|
184
|
-
).respond()
|
|
185
|
-
|
|
186
|
-
apply_event_routing()
|
|
187
|
-
|
|
188
|
-
# Filter out events with '-' in the path basename from going forward
|
|
189
|
-
# through the next steps of the stream graph
|
|
190
|
-
def apply_storey_filter_stream_events():
|
|
191
|
-
# Filter events with Prometheus endpoints path
|
|
192
|
-
graph.add_step(
|
|
193
|
-
"storey.Filter",
|
|
194
|
-
"filter_stream_event",
|
|
195
|
-
_fn=f"(event.path not in {PrometheusEndpoints.list()})",
|
|
196
|
-
full_event=True,
|
|
197
|
-
)
|
|
198
|
-
|
|
199
|
-
apply_storey_filter_stream_events()
|
|
200
|
-
|
|
201
173
|
# Process endpoint event: splitting into sub-events and validate event data
|
|
202
174
|
def apply_process_endpoint_event():
|
|
203
175
|
graph.add_step(
|
|
204
176
|
"ProcessEndpointEvent",
|
|
205
177
|
full_event=True,
|
|
206
178
|
project=self.project,
|
|
207
|
-
after="filter_stream_event",
|
|
208
179
|
)
|
|
209
180
|
|
|
210
181
|
apply_process_endpoint_event()
|
|
@@ -324,33 +295,10 @@ class EventStreamProcessor:
|
|
|
324
295
|
|
|
325
296
|
apply_storey_sample_window()
|
|
326
297
|
|
|
327
|
-
|
|
328
|
-
|
|
329
|
-
|
|
330
|
-
|
|
331
|
-
)
|
|
332
|
-
tsdb_connector.apply_monitoring_stream_steps(graph=graph)
|
|
333
|
-
|
|
334
|
-
else:
|
|
335
|
-
# Prometheus
|
|
336
|
-
# Increase the prediction counter by 1 and update the latency value
|
|
337
|
-
graph.add_step(
|
|
338
|
-
"IncCounter",
|
|
339
|
-
name="IncCounter",
|
|
340
|
-
after="MapFeatureNames",
|
|
341
|
-
project=self.project,
|
|
342
|
-
)
|
|
343
|
-
|
|
344
|
-
# Record a sample of features and labels
|
|
345
|
-
def apply_record_features_to_prometheus():
|
|
346
|
-
graph.add_step(
|
|
347
|
-
"RecordFeatures",
|
|
348
|
-
name="RecordFeaturesToPrometheus",
|
|
349
|
-
after="sample",
|
|
350
|
-
project=self.project,
|
|
351
|
-
)
|
|
352
|
-
|
|
353
|
-
apply_record_features_to_prometheus()
|
|
298
|
+
tsdb_connector = mlrun.model_monitoring.get_tsdb_connector(
|
|
299
|
+
project=self.project, secret_provider=secret_provider
|
|
300
|
+
)
|
|
301
|
+
tsdb_connector.apply_monitoring_stream_steps(graph=graph)
|
|
354
302
|
|
|
355
303
|
# Parquet branch
|
|
356
304
|
# Filter and validate different keys before writing the data to Parquet target
|
|
@@ -542,11 +490,7 @@ class ProcessEndpointEvent(mlrun.feature_store.steps.MapClass):
|
|
|
542
490
|
error = event.get("error")
|
|
543
491
|
if error:
|
|
544
492
|
self.error_count[endpoint_id] += 1
|
|
545
|
-
|
|
546
|
-
project=self.project,
|
|
547
|
-
endpoint_id=event["endpoint_id"],
|
|
548
|
-
model_name=event["model"],
|
|
549
|
-
)
|
|
493
|
+
# TODO: write to tsdb / kv once in a while
|
|
550
494
|
raise mlrun.errors.MLRunInvalidArgumentError(str(error))
|
|
551
495
|
|
|
552
496
|
# Validate event fields
|
|
@@ -973,98 +917,6 @@ class InferSchema(mlrun.feature_store.steps.MapClass):
|
|
|
973
917
|
return event
|
|
974
918
|
|
|
975
919
|
|
|
976
|
-
class EventRouting(mlrun.feature_store.steps.MapClass):
|
|
977
|
-
"""
|
|
978
|
-
Router the event according to the configured path under event.path. Please note that this step returns the result
|
|
979
|
-
to the caller. At the moment there are several paths:
|
|
980
|
-
|
|
981
|
-
- /model-monitoring-metrics (GET): return Prometheus registry results as a text. Will be used by Prometheus client
|
|
982
|
-
to scrape the results from the monitoring stream memory.
|
|
983
|
-
|
|
984
|
-
- /monitoring-batch-metrics (POST): update the Prometheus registry with the provided statistical metrics such as the
|
|
985
|
-
statistical metrics from the monitoring batch job. Note that the event body is a list of dictionaries of different
|
|
986
|
-
metrics.
|
|
987
|
-
|
|
988
|
-
- /monitoring-drift-status (POST): update the Prometheus registry with the provided model drift status.
|
|
989
|
-
|
|
990
|
-
"""
|
|
991
|
-
|
|
992
|
-
def __init__(
|
|
993
|
-
self,
|
|
994
|
-
project: str,
|
|
995
|
-
**kwargs,
|
|
996
|
-
):
|
|
997
|
-
super().__init__(**kwargs)
|
|
998
|
-
self.project: str = project
|
|
999
|
-
|
|
1000
|
-
def do(self, event):
|
|
1001
|
-
if event.path == PrometheusEndpoints.MODEL_MONITORING_METRICS:
|
|
1002
|
-
# Return a parsed Prometheus registry file
|
|
1003
|
-
event.body = mlrun.model_monitoring.prometheus.get_registry()
|
|
1004
|
-
elif event.path == PrometheusEndpoints.MONITORING_BATCH_METRICS:
|
|
1005
|
-
# Update statistical metrics
|
|
1006
|
-
for event_metric in event.body:
|
|
1007
|
-
mlrun.model_monitoring.prometheus.write_drift_metrics(
|
|
1008
|
-
project=self.project,
|
|
1009
|
-
endpoint_id=event_metric[EventFieldType.ENDPOINT_ID],
|
|
1010
|
-
metric=event_metric[EventFieldType.METRIC],
|
|
1011
|
-
value=event_metric[EventFieldType.VALUE],
|
|
1012
|
-
)
|
|
1013
|
-
elif event.path == PrometheusEndpoints.MONITORING_DRIFT_STATUS:
|
|
1014
|
-
# Update drift status
|
|
1015
|
-
mlrun.model_monitoring.prometheus.write_drift_status(
|
|
1016
|
-
project=self.project,
|
|
1017
|
-
endpoint_id=event.body[EventFieldType.ENDPOINT_ID],
|
|
1018
|
-
drift_status=event.body[EventFieldType.DRIFT_STATUS],
|
|
1019
|
-
)
|
|
1020
|
-
|
|
1021
|
-
return event
|
|
1022
|
-
|
|
1023
|
-
|
|
1024
|
-
class IncCounter(mlrun.feature_store.steps.MapClass):
|
|
1025
|
-
"""Increase prediction counter by 1 and update the total latency value"""
|
|
1026
|
-
|
|
1027
|
-
def __init__(self, project: str, **kwargs):
|
|
1028
|
-
super().__init__(**kwargs)
|
|
1029
|
-
self.project: str = project
|
|
1030
|
-
|
|
1031
|
-
def do(self, event):
|
|
1032
|
-
# Compute prediction per second
|
|
1033
|
-
|
|
1034
|
-
mlrun.model_monitoring.prometheus.write_predictions_and_latency_metrics(
|
|
1035
|
-
project=self.project,
|
|
1036
|
-
endpoint_id=event[EventFieldType.ENDPOINT_ID],
|
|
1037
|
-
latency=event[EventFieldType.LATENCY],
|
|
1038
|
-
model_name=event[EventFieldType.MODEL],
|
|
1039
|
-
endpoint_type=event[EventFieldType.ENDPOINT_TYPE],
|
|
1040
|
-
)
|
|
1041
|
-
|
|
1042
|
-
return event
|
|
1043
|
-
|
|
1044
|
-
|
|
1045
|
-
class RecordFeatures(mlrun.feature_store.steps.MapClass):
|
|
1046
|
-
"""Record a sample of features and labels in Prometheus registry"""
|
|
1047
|
-
|
|
1048
|
-
def __init__(self, project: str, **kwargs):
|
|
1049
|
-
super().__init__(**kwargs)
|
|
1050
|
-
self.project: str = project
|
|
1051
|
-
|
|
1052
|
-
def do(self, event):
|
|
1053
|
-
# Generate a dictionary of features and predictions
|
|
1054
|
-
features = {
|
|
1055
|
-
**event[EventFieldType.NAMED_PREDICTIONS],
|
|
1056
|
-
**event[EventFieldType.NAMED_FEATURES],
|
|
1057
|
-
}
|
|
1058
|
-
|
|
1059
|
-
mlrun.model_monitoring.prometheus.write_income_features(
|
|
1060
|
-
project=self.project,
|
|
1061
|
-
endpoint_id=event[EventFieldType.ENDPOINT_ID],
|
|
1062
|
-
features=features,
|
|
1063
|
-
)
|
|
1064
|
-
|
|
1065
|
-
return event
|
|
1066
|
-
|
|
1067
|
-
|
|
1068
920
|
def update_endpoint_record(
|
|
1069
921
|
project: str,
|
|
1070
922
|
endpoint_id: str,
|