mlrun 1.6.0rc21__py3-none-any.whl → 1.6.0rc22__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of mlrun might be problematic.
- mlrun/artifacts/base.py +6 -6
- mlrun/artifacts/dataset.py +15 -8
- mlrun/artifacts/manager.py +1 -1
- mlrun/artifacts/model.py +2 -2
- mlrun/artifacts/plots.py +8 -8
- mlrun/datastore/azure_blob.py +9 -14
- mlrun/datastore/base.py +21 -7
- mlrun/datastore/dbfs_store.py +10 -10
- mlrun/datastore/filestore.py +2 -1
- mlrun/datastore/google_cloud_storage.py +9 -8
- mlrun/datastore/redis.py +2 -1
- mlrun/datastore/s3.py +3 -6
- mlrun/datastore/sources.py +2 -12
- mlrun/datastore/targets.py +2 -13
- mlrun/datastore/v3io.py +16 -19
- mlrun/db/httpdb.py +8 -1
- mlrun/execution.py +14 -5
- mlrun/feature_store/api.py +3 -4
- mlrun/launcher/base.py +4 -4
- mlrun/lists.py +0 -6
- mlrun/model.py +8 -1
- mlrun/model_monitoring/api.py +9 -31
- mlrun/model_monitoring/batch.py +14 -13
- mlrun/model_monitoring/controller.py +91 -69
- mlrun/model_monitoring/controller_handler.py +1 -3
- mlrun/model_monitoring/helpers.py +19 -8
- mlrun/model_monitoring/stream_processing.py +0 -3
- mlrun/projects/operations.py +1 -1
- mlrun/projects/project.py +5 -4
- mlrun/runtimes/base.py +6 -1
- mlrun/runtimes/constants.py +11 -0
- mlrun/runtimes/kubejob.py +1 -1
- mlrun/runtimes/local.py +64 -53
- mlrun/serving/routers.py +7 -20
- mlrun/serving/server.py +4 -14
- mlrun/serving/utils.py +0 -3
- mlrun/utils/helpers.py +5 -2
- mlrun/utils/logger.py +5 -5
- mlrun/utils/version/version.json +2 -2
- {mlrun-1.6.0rc21.dist-info → mlrun-1.6.0rc22.dist-info}/METADATA +3 -1
- {mlrun-1.6.0rc21.dist-info → mlrun-1.6.0rc22.dist-info}/RECORD +45 -45
- {mlrun-1.6.0rc21.dist-info → mlrun-1.6.0rc22.dist-info}/LICENSE +0 -0
- {mlrun-1.6.0rc21.dist-info → mlrun-1.6.0rc22.dist-info}/WHEEL +0 -0
- {mlrun-1.6.0rc21.dist-info → mlrun-1.6.0rc22.dist-info}/entry_points.txt +0 -0
- {mlrun-1.6.0rc21.dist-info → mlrun-1.6.0rc22.dist-info}/top_level.txt +0 -0
mlrun/db/httpdb.py
CHANGED

@@ -707,7 +707,7 @@ class HTTPRunDB(RunDBInterface):
         :param state: List only runs whose state is specified.
         :param sort: Whether to sort the result according to their start time. Otherwise, results will be
             returned by their internal order in the DB (order will not be guaranteed).
-        :param last: Deprecated - currently not used.
+        :param last: Deprecated - currently not used (will be removed in 1.8.0).
         :param iter: If ``True`` return runs from all iterations. Otherwise, return only runs whose ``iter`` is 0.
         :param start_time_from: Filter by run start time in ``[start_time_from, start_time_to]``.
         :param start_time_to: Filter by run start time in ``[start_time_from, start_time_to]``.

@@ -733,6 +733,13 @@ class HTTPRunDB(RunDBInterface):
                 "using the `with_notifications` flag."
             )
 
+        if last:
+            # TODO: Remove this in 1.8.0
+            warnings.warn(
+                "'last' is deprecated and will be removed in 1.8.0.",
+                FutureWarning,
+            )
+
         if (
             not name
             and not uid
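The new guard is the standard Python deprecation idiom: keep accepting the parameter, warn when it is actually passed, and remove it in the announced release. A minimal self-contained sketch of the same pattern (the function and parameter here are illustrative stand-ins, not mlrun's API):

    import warnings

    def list_runs(name: str = "", last: int = 0) -> list:
        # stand-in for an API function that is phasing out a parameter
        if last:
            warnings.warn(
                "'last' is deprecated and will be removed in a future release.",
                FutureWarning,
                stacklevel=2,  # attribute the warning to the caller's line
            )
        return []

    list_runs(last=5)  # still works, but emits a FutureWarning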
mlrun/execution.py
CHANGED

@@ -393,7 +393,7 @@ class MLClientCtx(object):
             if v:
                 self._set_input(k, v)
 
-        if host and not is_api
+        if host and not is_api:
             self.set_label("host", host)
 
         start = get_in(attrs, "status.start_time")

@@ -411,7 +411,7 @@ class MLClientCtx(object):
             self._artifacts_manager.artifacts[key] = artifact_obj
         self._state = status.get("state", self._state)
 
-        #
+        # No need to store the run for every worker
         if store_run and self.is_logging_worker():
             self.store_run()
         return self

@@ -434,6 +434,12 @@ class MLClientCtx(object):
             context.set_label("framework", "sklearn")
 
         """
+        if not self.is_logging_worker():
+            logger.warning(
+                "Setting labels is only supported in the logging worker, ignoring"
+            )
+            return
+
         if replace or not self._labels.get(key):
             self._labels[key] = str(value)

@@ -974,10 +980,11 @@ class MLClientCtx(object):
         """
         # If it's a OpenMPI job, get the global rank and compare to the logging rank (worker) set in MLRun's
         # configuration:
-
+        labels = self.labels
+        if "host" in labels and labels.get("kind", "job") == "mpijob":
             # The host (pod name) of each worker is created by k8s, and by default it uses the rank number as the id in
             # the following template: ...-worker-<rank>
-            rank = int(
+            rank = int(labels["host"].rsplit("-", 1)[1])
             return rank == mlrun.mlconf.packagers.logging_worker
 
         # Single worker is always the logging worker:

@@ -1004,7 +1011,6 @@ class MLClientCtx(object):
                 _struct[key] = val
 
         struct = {
-            "metadata.labels": self._labels,
             "metadata.annotations": self._annotations,
             "spec.parameters": self._parameters,
             "spec.outputs": self._outputs,

@@ -1019,6 +1025,9 @@ class MLClientCtx(object):
         if self._state != "completed":
             struct["status.state"] = self._state
 
+        if self.is_logging_worker():
+            struct["metadata.labels"] = self._labels
+
         set_if_not_none(struct, "status.error", self._error)
         set_if_not_none(struct, "status.commit", self._commit)
         set_if_not_none(struct, "status.iterations", self._iteration_results)
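The rank check in `is_logging_worker` relies on the OpenMPI pod-naming template, where Kubernetes names each worker pod with its rank as the suffix (`...-worker-<rank>`). A small sketch of the same parsing, assuming that naming convention holds:

    def rank_from_pod_name(host: str) -> int:
        # rsplit("-", 1) splits once from the right, isolating the rank suffix
        return int(host.rsplit("-", 1)[1])

    assert rank_from_pod_name("train-abc-worker-3") == 3
    assert rank_from_pod_name("mpijob-x-worker-0") == 0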
mlrun/feature_store/api.py
CHANGED

@@ -933,7 +933,7 @@ def _deploy_ingestion_service_v2(
             source = HTTPSource()
             func = mlrun.code_to_function("ingest", kind="serving").apply(mount_v3io())
             config = RunConfig(function=func)
-
+            my_set.deploy_ingestion_service(source, run_config=config)
 
         :param featureset: feature set object or uri
         :param source: data source object describing the online or offline source

@@ -1025,7 +1025,7 @@ def deploy_ingestion_service(
             source = HTTPSource()
             func = mlrun.code_to_function("ingest", kind="serving").apply(mount_v3io())
             config = RunConfig(function=func)
-
+            my_set.deploy_ingestion_service(source, run_config=config)
 
         :param featureset: feature set object or uri
         :param source: data source object describing the online or offline source

@@ -1036,8 +1036,7 @@ def deploy_ingestion_service(
 
     :return: URL to access the deployed ingestion service
     """
-    endpoint, _ =
-        featureset=featureset,
+    endpoint, _ = featureset.deploy_ingestion_service(
         source=source,
         targets=targets,
         name=name,
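The rewritten body reflects that the feature set's `deploy_ingestion_service` method returns a pair, of which this module-level wrapper keeps only the endpoint URL. A toy sketch of that wrapper pattern, with stand-in types rather than mlrun's real classes:

    class FeatureSetStub:
        # stand-in for a feature set whose deploy returns (endpoint, function)
        def deploy_ingestion_service(self, source=None, **kwargs):
            deployed_function = object()
            return "http://ingest.example/v1", deployed_function

    def deploy_ingestion_service(featureset, source=None, **kwargs) -> str:
        # unpack the pair and discard the function handle, keeping the URL
        endpoint, _ = featureset.deploy_ingestion_service(source=source, **kwargs)
        return endpoint

    print(deploy_ingestion_service(FeatureSetStub()))  # http://ingest.example/v1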
mlrun/launcher/base.py
CHANGED

@@ -396,10 +396,10 @@ class BaseLauncher(abc.ABC):
             status=run.status.state,
             name=run.metadata.name,
         )
-        if
-
-        mlrun.runtimes.constants.RunStates.
-
+        if (
+            run.status.state
+            in mlrun.runtimes.constants.RunStates.error_and_abortion_states()
+        ):
             if runtime._is_remote and not runtime.is_child:
                 logger.error(
                     "Run did not finish successfully",
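This check, and the `RunObject.error` change in mlrun/model.py below, lean on grouping helpers added to `RunStates` (the `mlrun/runtimes/constants.py` hunk, +11 lines, is not expanded on this page). A plausible sketch of such helpers; the exact state names and memberships are an assumption, not the released code:

    class RunStates:
        error = "error"
        aborted = "aborted"
        aborting = "aborting"

        @staticmethod
        def error_states() -> list:
            return [RunStates.error]

        @staticmethod
        def abortion_states() -> list:
            # assumed membership; the real list lives in mlrun/runtimes/constants.py
            return [RunStates.aborted, RunStates.aborting]

        @staticmethod
        def error_and_abortion_states() -> list:
            return RunStates.error_states() + RunStates.abortion_states()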
mlrun/lists.py
CHANGED
mlrun/model.py
CHANGED

@@ -1259,8 +1259,15 @@ class RunObject(RunTemplate):
         """error string if failed"""
         if self.status:
             unknown_error = ""
-            if
+            if (
+                self.status.state
+                in mlrun.runtimes.constants.RunStates.abortion_states()
+            ):
+                unknown_error = "Run was aborted"
+
+            elif self.status.state in mlrun.runtimes.constants.RunStates.error_states():
                 unknown_error = "Unknown error"
+
             return (
                 self.status.error
                 or self.status.reason
mlrun/model_monitoring/api.py
CHANGED

@@ -132,7 +132,6 @@ def record_results(
     drift_threshold: typing.Optional[float] = None,
     possible_drift_threshold: typing.Optional[float] = None,
     trigger_monitoring_job: bool = False,
-    last_in_batch_set: typing.Optional[bool] = True,
     artifacts_tag: str = "",
     default_batch_image="mlrun/mlrun",
 ) -> ModelEndpoint:

@@ -165,14 +164,6 @@ def record_results(
     :param possible_drift_threshold: The threshold of which to mark possible drifts.
     :param trigger_monitoring_job: If true, run the batch drift job. If not exists, the monitoring batch function
                                    will be registered through MLRun API with the provided image.
-    :param last_in_batch_set: This flag can (and should only) be used when the model endpoint does not have
-                              model-monitoring set.
-                              If set to `True` (the default), this flag marks the current monitoring window
-                              (on this monitoring endpoint) is completed - the data inferred so far is assumed
-                              to be the total data for this monitoring window.
-                              You may want to set this flag to `False` if you want to record multiple results in
-                              close time proximity ("batch set"). In this case, set this flag to `False` on all
-                              but the last batch in the set.
     :param artifacts_tag: Tag to use for all the artifacts resulted from the function. Will be relevant
                           only if the monitoring batch job has been triggered.

@@ -206,25 +197,14 @@ def record_results(
     )
 
     if model_endpoint.spec.stream_path == "":
-
-
-
-
-
-
-
-
-        )
-    else:
-        if last_in_batch_set is not None:
-            logger.warning(
-                "`last_in_batch_set` is not `None`, but the model endpoint has a stream path. "
-                "Ignoring `last_in_batch_set`, as it is relevant only when the model "
-                "endpoint does not have a model monitoring infrastructure in place (i.e. stream path is "
-                " empty). Set `last_in_batch_set` to `None` to resolve this warning.",
-                project=project,
-                endpoint_id=model_endpoint.metadata.uid,
-            )
+        logger.info(
+            "Updating the last request time to mark the current monitoring window as completed",
+            project=project,
+            endpoint_id=model_endpoint.metadata.uid,
+        )
+        bump_model_endpoint_last_request(
+            project=project, model_endpoint=model_endpoint, db=db
+        )
 
     if trigger_monitoring_job:
         # Run the monitoring batch drift job

@@ -612,9 +592,7 @@ def read_dataset_as_dataframe(
         if label_columns is None:
             label_columns = dataset.status.label_column
         # Get the features and parse to DataFrame:
-        dataset =
-            dataset.uri, drop_columns=drop_columns
-        ).to_dataframe()
+        dataset = dataset.get_offline_features(drop_columns=drop_columns).to_dataframe()
 
     elif isinstance(dataset, (list, np.ndarray)):
         if not feature_columns:
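With `last_in_batch_set` removed, closing the current monitoring window on a stream-less endpoint is now done unconditionally by bumping the endpoint's last-request timestamp via `bump_model_endpoint_last_request` (defined in mlrun/model_monitoring/helpers.py, whose hunk is not expanded here); the controller then treats everything up to that timestamp as a completed window. A toy illustration of the bumping idea, not mlrun's implementation:

    import datetime

    def bump_last_request(endpoint: dict, delta_seconds: int = 60) -> None:
        # push last_request forward so the open window is considered closed;
        # the real helper computes its own delta and persists via the DB
        endpoint["last_request"] = (
            datetime.datetime.now(tz=datetime.timezone.utc)
            + datetime.timedelta(seconds=delta_seconds)
        ).isoformat()

    ep = {"last_request": None}
    bump_last_request(ep)
    print(ep["last_request"])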
mlrun/model_monitoring/batch.py
CHANGED

@@ -117,20 +117,21 @@ class KullbackLeiblerDivergence(HistogramDistanceMetric, metric_name="kld"):
     def _calc_kl_div(
         actual_dist: np.array, expected_dist: np.array, kld_scaling: float
     ) -> float:
-        """Return the
+        """Return the asymmetric KL divergence"""
+        # We take 0*log(0) == 0 for this calculation
+        mask = actual_dist != 0
+        actual_dist = actual_dist[mask]
+        expected_dist = expected_dist[mask]
         return np.sum(
-
-
-            (
-
-                actual_dist
-                / np.where(expected_dist != 0, expected_dist, kld_scaling)
-            ),
-            0,
-        )
+            actual_dist
+            * np.log(
+                actual_dist / np.where(expected_dist != 0, expected_dist, kld_scaling)
+            ),
         )
 
-    def compute(
+    def compute(
+        self, capping: Optional[float] = None, kld_scaling: float = 1e-4
+    ) -> float:
         """
         :param capping: A bounded value for the KL Divergence. For infinite distance, the result is replaced with
                         the capping value which indicates a huge differences between the distributions.

@@ -141,8 +142,8 @@ class KullbackLeiblerDivergence(HistogramDistanceMetric, metric_name="kld"):
         t_u = self._calc_kl_div(self.distrib_t, self.distrib_u, kld_scaling)
         u_t = self._calc_kl_div(self.distrib_u, self.distrib_t, kld_scaling)
         result = t_u + u_t
-        if capping:
-            return capping
+        if capping and result == float("inf"):
+            return capping
         return result
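Two behavioral fixes are visible here: `_calc_kl_div` now masks out zero bins in the actual distribution (taking 0*log(0) == 0) before applying the scaled logarithm, and `compute` caps the result only when it is actually infinite, where the old code returned `capping` whenever it was set. A standalone NumPy sketch of the corrected computation:

    from typing import Optional

    import numpy as np

    def kl_div(actual: np.ndarray, expected: np.ndarray, scaling: float = 1e-4) -> float:
        # asymmetric KL divergence with 0*log(0) == 0 and zero-bin scaling
        mask = actual != 0
        actual, expected = actual[mask], expected[mask]
        return float(
            np.sum(actual * np.log(actual / np.where(expected != 0, expected, scaling)))
        )

    def symmetric_kld(t: np.ndarray, u: np.ndarray, capping: Optional[float] = None) -> float:
        result = kl_div(t, u) + kl_div(u, t)
        if capping and result == float("inf"):  # cap only infinite distances
            return capping
        return result

    t = np.array([0.5, 0.5, 0.0])
    u = np.array([0.9, 0.1, 0.0])
    print(symmetric_kld(t, u, capping=10.0))  # finite, so not capped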
mlrun/model_monitoring/controller.py
CHANGED

@@ -17,7 +17,7 @@ import datetime
 import json
 import os
 import re
-from typing import Any, Iterator,
+from typing import Any, Iterator, NamedTuple, Optional, Union, cast
 
 from v3io.dataplane.response import HttpResponseError

@@ -35,10 +35,15 @@ from mlrun.model_monitoring.helpers import (
     get_monitoring_parquet_path,
     get_stream_path,
 )
-from mlrun.utils import logger
+from mlrun.utils import create_logger, datetime_now, logger
 from mlrun.utils.v3io_clients import get_v3io_client
 
 
+class _Interval(NamedTuple):
+    start: datetime.datetime
+    end: datetime.datetime
+
+
 class _BatchWindow:
     V3IO_CONTAINER_FORMAT = "users/pipelines/{project}/monitoring-schedules/functions"

@@ -60,7 +65,11 @@ class _BatchWindow:
         self._endpoint = endpoint
         self._application = application
         self._first_request = first_request
-        self._kv_storage = get_v3io_client(
+        self._kv_storage = get_v3io_client(
+            endpoint=mlrun.mlconf.v3io_api,
+            # Avoid noisy warning logs before the KV table is created
+            logger=create_logger(name="v3io_client", level="error"),
+        ).kv
         self._v3io_container = self.V3IO_CONTAINER_FORMAT.format(project=project)
         self._stop = last_updated
         self._step = timedelta_seconds

@@ -75,24 +84,26 @@ class _BatchWindow:
             )
         except HttpResponseError as err:
             logger.info(
-                "
-                "as this is probably the first time this
-                "Using the latest between first
+                "No last analyzed time was found for this endpoint and "
+                "application, as this is probably the first time this "
+                "application is running. Using the latest between first "
+                "request time or last update time minus one day instead",
                 endpoint=self._endpoint,
                 application=self._application,
                 first_request=self._first_request,
-
-                error=err,
-            )
-
-            # TODO : Change the timedelta according to the policy.
-            first_period_in_seconds = max(
-                int(datetime.timedelta(days=1).total_seconds()), self._step
-            )  # max between one day and the base period
-            return max(
-                self._first_request,
-                self._stop - first_period_in_seconds,
+                last_updated=self._stop,
             )
+            logger.debug("Error while getting last analyzed time", err=err)
+            if self._first_request and self._stop:
+                # TODO : Change the timedelta according to the policy.
+                first_period_in_seconds = max(
+                    int(datetime.timedelta(days=1).total_seconds()), self._step
+                )  # max between one day and the base period
+                return max(
+                    self._first_request,
+                    self._stop - first_period_in_seconds,
+                )
+            return self._first_request
 
         last_analyzed = data.output.item[mm_constants.SchedulingKeys.LAST_ANALYZED]
         logger.info(

@@ -119,20 +130,29 @@ class _BatchWindow:
     def get_intervals(
         self,
-    ) -> Iterator[
+    ) -> Iterator[_Interval]:
         """Generate the batch interval time ranges."""
         if self._start is not None and self._stop is not None:
             entered = False
-
+            # Iterate timestamp from start until timestamp <= stop - step
+            # so that the last interval will end at (timestamp + step) <= stop.
+            # Add 1 to stop - step to get <= and not <.
+            for timestamp in range(
+                self._start, self._stop - self._step + 1, self._step
+            ):
                 entered = True
-                start_time = datetime.datetime.
-
-
+                start_time = datetime.datetime.fromtimestamp(
+                    timestamp, tz=datetime.timezone.utc
+                )
+                end_time = datetime.datetime.fromtimestamp(
+                    timestamp + self._step, tz=datetime.timezone.utc
+                )
+                yield _Interval(start_time, end_time)
                 self._update_last_analyzed(timestamp + self._step)
             if not entered:
                 logger.info(
                     "All the data is set, but no complete intervals were found. "
-                    "Wait for last_updated to be updated
+                    "Wait for last_updated to be updated",
                     endpoint=self._endpoint,
                     application=self._application,
                     start=self._start,

@@ -141,8 +161,8 @@ class _BatchWindow:
                 )
         else:
             logger.warn(
-                "The first request time is not
-                "No intervals will be generated
+                "The first request time is not found for this endpoint. "
+                "No intervals will be generated",
                 endpoint=self._endpoint,
                 application=self._application,
                 start=self._start,

@@ -185,26 +205,38 @@ class _BatchWindowGenerator:
         )
 
     @classmethod
-    def _get_last_updated_time(
+    def _get_last_updated_time(
+        cls, last_request: Optional[str], has_stream: bool
+    ) -> Optional[int]:
         """
         Get the last updated time of a model endpoint.
         """
         if not last_request:
             return None
-
+        last_updated = int(
             cls._date_string2timestamp(last_request)
             - cast(
                 float,
                 mlrun.mlconf.model_endpoint_monitoring.parquet_batching_timeout_secs,
             )
         )
+        if not has_stream:
+            # If the endpoint does not have a stream, `last_updated` should be
+            # the minimum between the current time and the last updated time.
+            # This compensates for the bumping mechanism - see
+            # `bump_model_endpoint_last_request`.
+            last_updated = min(int(datetime_now().timestamp()), last_updated)
+            logger.debug(
+                "The endpoint does not have a stream", last_updated=last_updated
+            )
+        return last_updated
 
     @classmethod
     def _normalize_first_request(
         cls, first_request: Optional[str], endpoint: str
     ) -> Optional[int]:
         if not first_request:
-            logger.
+            logger.debug(
                 "There is no first request time for this endpoint.",
                 endpoint=endpoint,
                 first_request=first_request,

@@ -223,6 +255,7 @@ class _BatchWindowGenerator:
         application: str,
         first_request: Optional[str],
         last_request: Optional[str],
+        has_stream: bool,
     ) -> _BatchWindow:
         """
         Get the batch window for a specific endpoint and application.

@@ -234,7 +267,7 @@ class _BatchWindowGenerator:
             endpoint=endpoint,
             application=application,
             timedelta_seconds=self._timedelta,
-            last_updated=self._get_last_updated_time(last_request),
+            last_updated=self._get_last_updated_time(last_request, has_stream),
             first_request=self._normalize_first_request(first_request, endpoint),
         )

@@ -259,20 +292,12 @@ class MonitoringApplicationController:
         """
         self.context = context
         self.project = project
+        self.project_obj = mlrun.get_or_create_project(project)
 
-        logger.
-            "Initializing MonitoringApplicationController",
-            project=project,
-        )
-
-        # Get a runtime database
+        context.logger.debug(f"Initializing {self.__class__.__name__}", project=project)
 
         self.db = mlrun.model_monitoring.get_model_endpoint_store(project=project)
 
-        # If an error occurs, it will be raised using the following argument
-        self.endpoints_exceptions = {}
-
-        # The batch window
         self._batch_window_generator = _BatchWindowGenerator(
             batch_dict=context.parameters[
                 mm_constants.EventFieldType.BATCH_INTERVALS_DICT

@@ -285,7 +310,7 @@ class MonitoringApplicationController:
         )
         self.model_monitoring_access_key = self._get_model_monitoring_access_key()
         self.parquet_directory = get_monitoring_parquet_path(
-
+            self.project_obj,
             kind=mm_constants.FileTargetKind.APPS_PARQUET,
         )
         self.storage_options = None

@@ -311,21 +336,23 @@ class MonitoringApplicationController:
 
     def run(self):
         """
-        Main method for run all the relevant monitoring
+        Main method for run all the relevant monitoring applications on each endpoint
         """
         try:
             endpoints = self.db.list_model_endpoints(uids=self.model_endpoints)
-
-
-
-
-
+            monitoring_functions = self.project_obj.list_model_monitoring_functions()
+            if monitoring_functions:
+                applications_names = list(
+                    {app.metadata.name for app in monitoring_functions}
+                )
             else:
-                logger.info(
+                self.context.logger.info(
+                    "No monitoring functions found", project=self.project
+                )
                 applications_names = []
 
         except Exception as e:
-            logger.error("Failed to list endpoints", exc=e)
+            self.context.logger.error("Failed to list endpoints", exc=e)
             return
         if endpoints and applications_names:
             # Initialize a process pool that will be used to run each endpoint applications on a dedicated process

@@ -362,9 +389,7 @@ class MonitoringApplicationController:
                 futures.append(future)
 
             for future in concurrent.futures.as_completed(futures):
-
-                if res:
-                    self.endpoints_exceptions[res[0]] = res[1]
+                future.result()
 
         self._delete_old_parquet(endpoints=endpoints)

@@ -378,7 +403,7 @@ class MonitoringApplicationController:
         parquet_directory: str,
         storage_options: dict,
         model_monitoring_access_key: str,
-    ) ->
+    ) -> None:
         """
         Process a model endpoint and trigger the monitoring applications. This function running on different process
         for each endpoint. In addition, this function will generate a parquet file that includes the relevant data

@@ -413,6 +438,7 @@ class MonitoringApplicationController:
             application=application,
             first_request=endpoint[mm_constants.EventFieldType.FIRST_REQUEST],
             last_request=endpoint[mm_constants.EventFieldType.LAST_REQUEST],
+            has_stream=endpoint[mm_constants.EventFieldType.STREAM_PATH] != "",
         )
 
         for start_infer_time, end_infer_time in batch_window.get_intervals():

@@ -432,22 +458,18 @@ class MonitoringApplicationController:
                     parquet_target_path = offline_response.vector.get_target_path()
 
                     if len(df) == 0:
-                        logger.
-                            "
-                            featureset_name=m_fs.metadata.name,
+                        logger.info(
+                            "During this time window, the endpoint has not received any data",
                             endpoint=endpoint[mm_constants.EventFieldType.UID],
-                            min_required_events=mlrun.mlconf.model_endpoint_monitoring.parquet_batching_max_events,
                             start_time=start_infer_time,
                             end_time=end_infer_time,
                         )
                         continue
 
-                # Continue if not enough events provided since the deployment of the model endpoint
                 except FileNotFoundError:
                     logger.warn(
-                        "
+                        "No parquets were written yet",
                         endpoint=endpoint[mm_constants.EventFieldType.UID],
-                        min_required_events=mlrun.mlconf.model_endpoint_monitoring.parquet_batching_max_events,
                     )
                     continue

@@ -481,12 +503,11 @@ class MonitoringApplicationController:
                 model_monitoring_access_key=model_monitoring_access_key,
                 parquet_target_path=parquet_target_path,
             )
-        except Exception
-            logger.
+        except Exception:
+            logger.exception(
                 "Encountered an exception",
                 endpoint_id=endpoint[mm_constants.EventFieldType.UID],
             )
-            return endpoint_id, e
 
     def _delete_old_parquet(self, endpoints: list[dict[str, Any]], days: int = 1):
         """

@@ -500,12 +521,14 @@ class MonitoringApplicationController:
             self.parquet_directory,
             {"V3IO_ACCESS_KEY": self.model_monitoring_access_key},
         )
-        fs = store.
+        fs = store.filesystem
 
         # calculate time threshold (keep only files from the last 24 hours)
-        time_to_keep =
-
-
+        time_to_keep = (
+            datetime.datetime.now(tz=datetime.timezone.utc)
+            - datetime.timedelta(days=days)
+        ).timestamp()
+
         for endpoint in endpoints:
             try:
                 apps_parquet_directories = fs.listdir(

@@ -619,14 +642,13 @@ class MonitoringApplicationController:
 
         # get offline features based on application start and end time.
         # store the result parquet by partitioning by controller end processing time
-        offline_response =
-            feature_vector=vector,
+        offline_response = vector.get_offline_features(
             start_time=start_infer_time,
             end_time=end_infer_time,
             timestamp_for_filtering=mm_constants.EventFieldType.TIMESTAMP,
             target=ParquetTarget(
                 path=parquet_directory
-                + f"/key={endpoint_id}/{start_infer_time.
+                + f"/key={endpoint_id}/{int(start_infer_time.timestamp())}/{application_name}.parquet",
                 storage_options=storage_options,
             ),
         )
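The reworked `get_intervals` walks integer timestamps with `range(start, stop - step + 1, step)`: the `+ 1` turns the exclusive `range` bound into `<=` on `stop - step`, so every yielded interval ends at or before `stop` and partial trailing windows are skipped. A self-contained sketch of the same windowing:

    import datetime
    from typing import Iterator, NamedTuple

    class Interval(NamedTuple):
        start: datetime.datetime
        end: datetime.datetime

    def get_intervals(start: int, stop: int, step: int) -> Iterator[Interval]:
        # iterate while timestamp <= stop - step, so (timestamp + step) <= stop
        for ts in range(start, stop - step + 1, step):
            yield Interval(
                datetime.datetime.fromtimestamp(ts, tz=datetime.timezone.utc),
                datetime.datetime.fromtimestamp(ts + step, tz=datetime.timezone.utc),
            )

    # 2.5 steps of data -> only the 2 complete intervals are generated
    for interval in get_intervals(start=0, stop=250, step=100):
        print(interval.start.time(), "->", interval.end.time())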
mlrun/model_monitoring/controller_handler.py
CHANGED

@@ -16,7 +16,7 @@ import mlrun
 from mlrun.model_monitoring.controller import MonitoringApplicationController
 
 
-def handler(context: mlrun.run.MLClientCtx):
+def handler(context: mlrun.run.MLClientCtx) -> None:
     """
     Run model monitoring application processor

@@ -27,5 +27,3 @@ def handler(context: mlrun.run.MLClientCtx):
         project=context.project,
     )
     monitor_app_controller.run()
-    if monitor_app_controller.endpoints_exceptions:
-        context.logger.error(monitor_app_controller.endpoints_exceptions)