mlrun 1.10.0rc18__py3-none-any.whl → 1.11.0rc16__py3-none-any.whl
This diff shows the content of publicly available package versions as released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in the public registry.
Potentially problematic release: this version of mlrun has been flagged as possibly problematic.
- mlrun/__init__.py +24 -3
- mlrun/__main__.py +0 -4
- mlrun/artifacts/dataset.py +2 -2
- mlrun/artifacts/document.py +6 -1
- mlrun/artifacts/llm_prompt.py +21 -15
- mlrun/artifacts/model.py +3 -3
- mlrun/artifacts/plots.py +1 -1
- mlrun/{model_monitoring/db/tsdb/tdengine → auth}/__init__.py +2 -3
- mlrun/auth/nuclio.py +89 -0
- mlrun/auth/providers.py +429 -0
- mlrun/auth/utils.py +415 -0
- mlrun/common/constants.py +14 -0
- mlrun/common/model_monitoring/helpers.py +123 -0
- mlrun/common/runtimes/constants.py +28 -0
- mlrun/common/schemas/__init__.py +14 -3
- mlrun/common/schemas/alert.py +2 -2
- mlrun/common/schemas/api_gateway.py +3 -0
- mlrun/common/schemas/auth.py +12 -10
- mlrun/common/schemas/client_spec.py +4 -0
- mlrun/common/schemas/constants.py +25 -0
- mlrun/common/schemas/frontend_spec.py +1 -8
- mlrun/common/schemas/function.py +34 -0
- mlrun/common/schemas/hub.py +33 -20
- mlrun/common/schemas/model_monitoring/__init__.py +2 -1
- mlrun/common/schemas/model_monitoring/constants.py +12 -15
- mlrun/common/schemas/model_monitoring/functions.py +13 -4
- mlrun/common/schemas/model_monitoring/model_endpoints.py +11 -0
- mlrun/common/schemas/pipeline.py +1 -1
- mlrun/common/schemas/secret.py +17 -2
- mlrun/common/secrets.py +95 -1
- mlrun/common/types.py +10 -10
- mlrun/config.py +69 -19
- mlrun/data_types/infer.py +2 -2
- mlrun/datastore/__init__.py +12 -5
- mlrun/datastore/azure_blob.py +162 -47
- mlrun/datastore/base.py +274 -10
- mlrun/datastore/datastore.py +7 -2
- mlrun/datastore/datastore_profile.py +84 -22
- mlrun/datastore/model_provider/huggingface_provider.py +225 -41
- mlrun/datastore/model_provider/mock_model_provider.py +87 -0
- mlrun/datastore/model_provider/model_provider.py +206 -74
- mlrun/datastore/model_provider/openai_provider.py +226 -66
- mlrun/datastore/s3.py +39 -18
- mlrun/datastore/sources.py +1 -1
- mlrun/datastore/store_resources.py +4 -4
- mlrun/datastore/storeytargets.py +17 -12
- mlrun/datastore/targets.py +1 -1
- mlrun/datastore/utils.py +25 -6
- mlrun/datastore/v3io.py +1 -1
- mlrun/db/base.py +63 -32
- mlrun/db/httpdb.py +373 -153
- mlrun/db/nopdb.py +54 -21
- mlrun/errors.py +4 -2
- mlrun/execution.py +66 -25
- mlrun/feature_store/api.py +1 -1
- mlrun/feature_store/common.py +1 -1
- mlrun/feature_store/feature_vector_utils.py +1 -1
- mlrun/feature_store/steps.py +8 -6
- mlrun/frameworks/_common/utils.py +3 -3
- mlrun/frameworks/_dl_common/loggers/logger.py +1 -1
- mlrun/frameworks/_dl_common/loggers/tensorboard_logger.py +2 -1
- mlrun/frameworks/_ml_common/loggers/mlrun_logger.py +1 -1
- mlrun/frameworks/_ml_common/utils.py +2 -1
- mlrun/frameworks/auto_mlrun/auto_mlrun.py +4 -3
- mlrun/frameworks/lgbm/mlrun_interfaces/mlrun_interface.py +2 -1
- mlrun/frameworks/onnx/dataset.py +2 -1
- mlrun/frameworks/onnx/mlrun_interface.py +2 -1
- mlrun/frameworks/pytorch/callbacks/logging_callback.py +5 -4
- mlrun/frameworks/pytorch/callbacks/mlrun_logging_callback.py +2 -1
- mlrun/frameworks/pytorch/callbacks/tensorboard_logging_callback.py +2 -1
- mlrun/frameworks/pytorch/utils.py +2 -1
- mlrun/frameworks/sklearn/metric.py +2 -1
- mlrun/frameworks/tf_keras/callbacks/logging_callback.py +5 -4
- mlrun/frameworks/tf_keras/callbacks/mlrun_logging_callback.py +2 -1
- mlrun/frameworks/tf_keras/callbacks/tensorboard_logging_callback.py +2 -1
- mlrun/hub/__init__.py +52 -0
- mlrun/hub/base.py +142 -0
- mlrun/hub/module.py +172 -0
- mlrun/hub/step.py +113 -0
- mlrun/k8s_utils.py +105 -16
- mlrun/launcher/base.py +15 -7
- mlrun/launcher/local.py +4 -1
- mlrun/model.py +14 -4
- mlrun/model_monitoring/__init__.py +0 -1
- mlrun/model_monitoring/api.py +65 -28
- mlrun/model_monitoring/applications/__init__.py +1 -1
- mlrun/model_monitoring/applications/base.py +299 -128
- mlrun/model_monitoring/applications/context.py +2 -4
- mlrun/model_monitoring/controller.py +132 -58
- mlrun/model_monitoring/db/_schedules.py +38 -29
- mlrun/model_monitoring/db/_stats.py +6 -16
- mlrun/model_monitoring/db/tsdb/__init__.py +9 -7
- mlrun/model_monitoring/db/tsdb/base.py +29 -9
- mlrun/model_monitoring/db/tsdb/preaggregate.py +234 -0
- mlrun/model_monitoring/db/tsdb/stream_graph_steps.py +63 -0
- mlrun/model_monitoring/db/tsdb/timescaledb/queries/timescaledb_metrics_queries.py +414 -0
- mlrun/model_monitoring/db/tsdb/timescaledb/queries/timescaledb_predictions_queries.py +376 -0
- mlrun/model_monitoring/db/tsdb/timescaledb/queries/timescaledb_results_queries.py +590 -0
- mlrun/model_monitoring/db/tsdb/timescaledb/timescaledb_connection.py +434 -0
- mlrun/model_monitoring/db/tsdb/timescaledb/timescaledb_connector.py +541 -0
- mlrun/model_monitoring/db/tsdb/timescaledb/timescaledb_operations.py +808 -0
- mlrun/model_monitoring/db/tsdb/timescaledb/timescaledb_schema.py +502 -0
- mlrun/model_monitoring/db/tsdb/timescaledb/timescaledb_stream.py +163 -0
- mlrun/model_monitoring/db/tsdb/timescaledb/timescaledb_stream_graph_steps.py +60 -0
- mlrun/model_monitoring/db/tsdb/timescaledb/utils/timescaledb_dataframe_processor.py +141 -0
- mlrun/model_monitoring/db/tsdb/timescaledb/utils/timescaledb_query_builder.py +585 -0
- mlrun/model_monitoring/db/tsdb/timescaledb/writer_graph_steps.py +73 -0
- mlrun/model_monitoring/db/tsdb/v3io/stream_graph_steps.py +20 -9
- mlrun/model_monitoring/db/tsdb/v3io/v3io_connector.py +235 -51
- mlrun/model_monitoring/features_drift_table.py +2 -1
- mlrun/model_monitoring/helpers.py +30 -6
- mlrun/model_monitoring/stream_processing.py +34 -28
- mlrun/model_monitoring/writer.py +224 -4
- mlrun/package/__init__.py +2 -1
- mlrun/platforms/__init__.py +0 -43
- mlrun/platforms/iguazio.py +8 -4
- mlrun/projects/operations.py +17 -11
- mlrun/projects/pipelines.py +2 -2
- mlrun/projects/project.py +187 -123
- mlrun/run.py +95 -21
- mlrun/runtimes/__init__.py +2 -186
- mlrun/runtimes/base.py +103 -25
- mlrun/runtimes/constants.py +225 -0
- mlrun/runtimes/daskjob.py +5 -2
- mlrun/runtimes/databricks_job/databricks_runtime.py +2 -1
- mlrun/runtimes/local.py +5 -2
- mlrun/runtimes/mounts.py +20 -2
- mlrun/runtimes/nuclio/__init__.py +12 -7
- mlrun/runtimes/nuclio/api_gateway.py +36 -6
- mlrun/runtimes/nuclio/application/application.py +339 -40
- mlrun/runtimes/nuclio/function.py +222 -72
- mlrun/runtimes/nuclio/serving.py +132 -42
- mlrun/runtimes/pod.py +213 -21
- mlrun/runtimes/utils.py +49 -9
- mlrun/secrets.py +99 -14
- mlrun/serving/__init__.py +2 -0
- mlrun/serving/remote.py +84 -11
- mlrun/serving/routers.py +26 -44
- mlrun/serving/server.py +138 -51
- mlrun/serving/serving_wrapper.py +6 -2
- mlrun/serving/states.py +997 -283
- mlrun/serving/steps.py +62 -0
- mlrun/serving/system_steps.py +149 -95
- mlrun/serving/v2_serving.py +9 -10
- mlrun/track/trackers/mlflow_tracker.py +29 -31
- mlrun/utils/helpers.py +292 -94
- mlrun/utils/http.py +9 -2
- mlrun/utils/notifications/notification/base.py +18 -0
- mlrun/utils/notifications/notification/git.py +3 -5
- mlrun/utils/notifications/notification/mail.py +39 -16
- mlrun/utils/notifications/notification/slack.py +2 -4
- mlrun/utils/notifications/notification/webhook.py +2 -5
- mlrun/utils/notifications/notification_pusher.py +3 -3
- mlrun/utils/version/version.json +2 -2
- mlrun/utils/version/version.py +3 -4
- {mlrun-1.10.0rc18.dist-info → mlrun-1.11.0rc16.dist-info}/METADATA +63 -74
- {mlrun-1.10.0rc18.dist-info → mlrun-1.11.0rc16.dist-info}/RECORD +161 -143
- mlrun/api/schemas/__init__.py +0 -259
- mlrun/db/auth_utils.py +0 -152
- mlrun/model_monitoring/db/tsdb/tdengine/schemas.py +0 -344
- mlrun/model_monitoring/db/tsdb/tdengine/stream_graph_steps.py +0 -75
- mlrun/model_monitoring/db/tsdb/tdengine/tdengine_connection.py +0 -281
- mlrun/model_monitoring/db/tsdb/tdengine/tdengine_connector.py +0 -1266
- {mlrun-1.10.0rc18.dist-info → mlrun-1.11.0rc16.dist-info}/WHEEL +0 -0
- {mlrun-1.10.0rc18.dist-info → mlrun-1.11.0rc16.dist-info}/entry_points.txt +0 -0
- {mlrun-1.10.0rc18.dist-info → mlrun-1.11.0rc16.dist-info}/licenses/LICENSE +0 -0
- {mlrun-1.10.0rc18.dist-info → mlrun-1.11.0rc16.dist-info}/top_level.txt +0 -0
The rendered diff excerpts below cover four of the changed files.

mlrun/model_monitoring/applications/context.py

```diff
@@ -24,15 +24,12 @@ import mlrun.common.schemas.model_monitoring.constants as mm_constants
 import mlrun.errors
 import mlrun.feature_store as fstore
 import mlrun.feature_store.feature_set as fs
-import mlrun.features
 import mlrun.serving
 import mlrun.utils
 from mlrun.artifacts import Artifact, DatasetArtifact, ModelArtifact, get_model
 from mlrun.common.model_monitoring.helpers import FeatureStats
 from mlrun.common.schemas import ModelEndpoint
-from mlrun.model_monitoring.helpers import (
-    calculate_inputs_statistics,
-)
+from mlrun.model_monitoring.helpers import calculate_inputs_statistics


 class _ArtifactsLogger(Protocol):
@@ -252,6 +249,7 @@ class MonitoringApplicationContext:
             project=self.project_name,
             endpoint_id=self.endpoint_id,
             feature_analysis=True,
+            tsdb_metrics=False,
         )
         return self._model_endpoint
```
mlrun/model_monitoring/controller.py

```diff
@@ -11,33 +11,37 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-
+import collections
 import concurrent.futures
 import datetime
 import json
 import os
 import traceback
+import warnings
 from collections.abc import Iterator
 from contextlib import AbstractContextManager
 from types import TracebackType
-from typing import Any, NamedTuple, Optional, Union, cast
+from typing import Any, Final, NamedTuple, Optional, Union, cast

 import nuclio_sdk
+import numpy as np
 import pandas as pd

 import mlrun
 import mlrun.common.schemas.model_monitoring.constants as mm_constants
+import mlrun.feature_store as fstore
 import mlrun.model_monitoring
 import mlrun.model_monitoring.db._schedules as schedules
 import mlrun.model_monitoring.helpers
 import mlrun.platforms.iguazio
+from mlrun.common.schemas import EndpointType
 from mlrun.common.schemas.model_monitoring.constants import (
     ControllerEvent,
     ControllerEventEndpointPolicy,
 )
 from mlrun.errors import err_to_str
 from mlrun.model_monitoring.helpers import batch_dict2timedelta
-from mlrun.utils import logger
+from mlrun.utils import datetime_now, logger

 _SECONDS_IN_DAY = int(datetime.timedelta(days=1).total_seconds())
 _SECONDS_IN_MINUTE = 60
```
```diff
@@ -49,14 +53,16 @@ class _Interval(NamedTuple):


 class _BatchWindow:
+    TIMESTAMP_RESOLUTION_MICRO: Final = 1e-6  # 0.000001 seconds or 1 microsecond
+
     def __init__(
         self,
         *,
         schedules_file: schedules.ModelMonitoringSchedulesFileEndpoint,
         application: str,
         timedelta_seconds: int,
-        last_updated: int,
-        first_request: int,
+        last_updated: float,
+        first_request: float,
         endpoint_mode: mm_constants.EndpointMode = mm_constants.EndpointMode.REAL_TIME,
     ) -> None:
         """
```
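The new `TIMESTAMP_RESOLUTION_MICRO` constant and the epsilon arithmetic in the hunks below implement what are effectively half-open windows: each window ends one microsecond before the next one starts, so a record stamped exactly on a boundary is claimed by only one window. A standalone sketch of the convention (not mlrun code; `datetime.UTC` is the Python 3.11+ alias for `datetime.timezone.utc`):

```python
# Standalone sketch of the boundary convention behind TIMESTAMP_RESOLUTION_MICRO:
# each window ends one microsecond before the next window begins, so a boundary
# timestamp belongs to exactly one window.
import datetime

EPSILON = 1e-6  # one microsecond, the resolution of datetime objects
start, step = 1_700_000_000.0, 60.0  # hypothetical epoch start and a 60s window

window_start = datetime.datetime.fromtimestamp(start, tz=datetime.UTC)
window_end = datetime.datetime.fromtimestamp(start + step - EPSILON, tz=datetime.UTC)
next_start = datetime.datetime.fromtimestamp(start + step, tz=datetime.UTC)

print(window_end)  # ...:00:59.999999+00:00
assert window_end < next_start
```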
```diff
@@ -73,15 +79,17 @@ class _BatchWindow:
         self._endpoint_mode = endpoint_mode
         self._start = self._get_last_analyzed()

-    def _get_saved_last_analyzed(self) -> Optional[int]:
-        return self._db.get_application_time(self._application)
+    def _get_saved_last_analyzed(
+        self,
+    ) -> Optional[float]:
+        return self._db.get_application_time(self._application)

-    def _update_last_analyzed(self, last_analyzed: int) -> None:
+    def _update_last_analyzed(self, last_analyzed: float) -> None:
         self._db.update_application_time(
             application=self._application, timestamp=last_analyzed
         )

-    def _get_initial_last_analyzed(self) -> int:
+    def _get_initial_last_analyzed(self) -> float:
         if self._endpoint_mode == mm_constants.EndpointMode.BATCH:
             logger.info(
                 "No last analyzed time was found for this endpoint and application, as this is "
```
|
|
|
107
115
|
self._stop - first_period_in_seconds,
|
|
108
116
|
)
|
|
109
117
|
|
|
110
|
-
def _get_last_analyzed(self) ->
|
|
118
|
+
def _get_last_analyzed(self) -> float:
|
|
111
119
|
saved_last_analyzed = self._get_saved_last_analyzed()
|
|
112
120
|
if saved_last_analyzed is not None:
|
|
113
121
|
if self._endpoint_mode == mm_constants.EndpointMode.BATCH:
|
|
@@ -127,13 +135,14 @@ class _BatchWindow:
|
|
|
127
135
|
# Iterate timestamp from start until timestamp <= stop - step
|
|
128
136
|
# so that the last interval will end at (timestamp + step) <= stop.
|
|
129
137
|
# Add 1 to stop - step to get <= and not <.
|
|
130
|
-
for timestamp in
|
|
138
|
+
for timestamp in np.arange(
|
|
139
|
+
self._start, self._stop - self._step + 1, self._step
|
|
140
|
+
):
|
|
131
141
|
entered = True
|
|
132
|
-
start_time = datetime.datetime.fromtimestamp(
|
|
133
|
-
timestamp, tz=datetime.timezone.utc
|
|
134
|
-
)
|
|
142
|
+
start_time = datetime.datetime.fromtimestamp(timestamp, tz=datetime.UTC)
|
|
135
143
|
end_time = datetime.datetime.fromtimestamp(
|
|
136
|
-
timestamp + self._step,
|
|
144
|
+
timestamp - self.TIMESTAMP_RESOLUTION_MICRO + self._step,
|
|
145
|
+
tz=datetime.UTC,
|
|
137
146
|
)
|
|
138
147
|
yield _Interval(start_time, end_time)
|
|
139
148
|
|
|
@@ -149,27 +158,19 @@ class _BatchWindow:
|
|
|
149
158
|
# If the endpoint is a batch endpoint, we need to update the last analyzed time
|
|
150
159
|
# to the end of the batch time.
|
|
151
160
|
if last_analyzed:
|
|
152
|
-
if last_analyzed < self._stop:
|
|
161
|
+
if last_analyzed - self.TIMESTAMP_RESOLUTION_MICRO < self._stop:
|
|
153
162
|
# If the last analyzed time is earlier than the stop time,
|
|
154
163
|
# yield the final partial interval from last_analyzed to stop
|
|
155
164
|
yield _Interval(
|
|
156
|
-
datetime.datetime.fromtimestamp(
|
|
157
|
-
|
|
158
|
-
),
|
|
159
|
-
datetime.datetime.fromtimestamp(
|
|
160
|
-
self._stop, tz=datetime.timezone.utc
|
|
161
|
-
),
|
|
165
|
+
datetime.datetime.fromtimestamp(last_analyzed, tz=datetime.UTC),
|
|
166
|
+
datetime.datetime.fromtimestamp(self._stop, tz=datetime.UTC),
|
|
162
167
|
)
|
|
163
168
|
else:
|
|
164
169
|
# The time span between the start and end of the batch is shorter than the step,
|
|
165
170
|
# so we need to yield a partial interval covering that range.
|
|
166
171
|
yield _Interval(
|
|
167
|
-
datetime.datetime.fromtimestamp(
|
|
168
|
-
|
|
169
|
-
),
|
|
170
|
-
datetime.datetime.fromtimestamp(
|
|
171
|
-
self._stop, tz=datetime.timezone.utc
|
|
172
|
-
),
|
|
172
|
+
datetime.datetime.fromtimestamp(self._start, tz=datetime.UTC),
|
|
173
|
+
datetime.datetime.fromtimestamp(self._stop, tz=datetime.UTC),
|
|
173
174
|
)
|
|
174
175
|
|
|
175
176
|
self._update_last_analyzed(last_analyzed=self._stop)
|
|
@@ -223,7 +224,7 @@ class _BatchWindowGenerator(AbstractContextManager):
|
|
|
223
224
|
def get_application_list(self) -> set[str]:
|
|
224
225
|
return self._schedules_file.get_application_list()
|
|
225
226
|
|
|
226
|
-
def get_min_last_analyzed(self) -> Optional[
|
|
227
|
+
def get_min_last_analyzed(self) -> Optional[float]:
|
|
227
228
|
return self._schedules_file.get_min_timestamp()
|
|
228
229
|
|
|
229
230
|
@classmethod
|
|
```diff
@@ -231,22 +232,29 @@ class _BatchWindowGenerator(AbstractContextManager):
         cls,
         last_request: datetime.datetime,
         endpoint_mode: mm_constants.EndpointMode,
-    ) -> int:
+        not_old_batch_endpoint: bool,
+    ) -> float:
         """
         Get the last updated time of a model endpoint.
         """

         if endpoint_mode == mm_constants.EndpointMode.REAL_TIME:
-            last_updated = int(
-                last_request.timestamp()
-                - cast(
-                    float,
-                    mlrun.mlconf.model_endpoint_monitoring.parquet_batching_timeout_secs,
-                )
+            last_updated = last_request.timestamp() - cast(
+                float,
+                mlrun.mlconf.model_endpoint_monitoring.parquet_batching_timeout_secs,
             )
+            if not not_old_batch_endpoint:
+                # If the endpoint does not have a stream, `last_updated` should be
+                # the minimum between the current time and the last updated time.
+                # This compensates for the bumping mechanism - see
+                # `update_model_endpoint_last_request`.
+                last_updated = min(datetime_now().timestamp(), last_updated)
+                logger.debug(
+                    "The endpoint does not have a stream", last_updated=last_updated
+                )

             return last_updated
-        return int(last_request.timestamp())
+        return last_request.timestamp()
```
|
|
|
255
263
|
first_request: datetime.datetime,
|
|
256
264
|
last_request: datetime.datetime,
|
|
257
265
|
endpoint_mode: mm_constants.EndpointMode,
|
|
266
|
+
not_old_batch_endpoint: bool,
|
|
258
267
|
) -> Iterator[_Interval]:
|
|
259
268
|
"""
|
|
260
269
|
Get the batch window for a specific endpoint and application.
|
|
@@ -266,8 +275,10 @@ class _BatchWindowGenerator(AbstractContextManager):
|
|
|
266
275
|
schedules_file=self._schedules_file,
|
|
267
276
|
application=application,
|
|
268
277
|
timedelta_seconds=self._timedelta,
|
|
269
|
-
last_updated=self._get_last_updated_time(
|
|
270
|
-
|
|
278
|
+
last_updated=self._get_last_updated_time(
|
|
279
|
+
last_request, endpoint_mode, not_old_batch_endpoint
|
|
280
|
+
),
|
|
281
|
+
first_request=first_request.timestamp(),
|
|
271
282
|
endpoint_mode=endpoint_mode,
|
|
272
283
|
)
|
|
273
284
|
yield from self.batch_window.get_intervals()
|
|
@@ -291,6 +302,8 @@ class MonitoringApplicationController:
|
|
|
291
302
|
Note that the MonitoringApplicationController object requires access keys along with valid project configurations.
|
|
292
303
|
"""
|
|
293
304
|
|
|
305
|
+
_MAX_FEATURE_SET_PER_WORKER = 1000
|
|
306
|
+
|
|
294
307
|
def __init__(self) -> None:
|
|
295
308
|
"""Initialize Monitoring Application Controller"""
|
|
296
309
|
self.project = cast(str, mlrun.mlconf.active_project)
|
|
@@ -324,6 +337,9 @@ class MonitoringApplicationController:
|
|
|
324
337
|
mlrun.platforms.iguazio.KafkaOutputStream,
|
|
325
338
|
],
|
|
326
339
|
] = {}
|
|
340
|
+
self.feature_sets: collections.OrderedDict[
|
|
341
|
+
str, mlrun.feature_store.FeatureSet
|
|
342
|
+
] = collections.OrderedDict()
|
|
327
343
|
self.tsdb_connector = mlrun.model_monitoring.get_tsdb_connector(
|
|
328
344
|
project=self.project
|
|
329
345
|
)
|
|
@@ -433,15 +449,14 @@ class MonitoringApplicationController:
|
|
|
433
449
|
base_period_minutes, current_min_last_analyzed, current_time
|
|
434
450
|
)
|
|
435
451
|
and (
|
|
436
|
-
|
|
437
|
-
!= last_timestamp_sent
|
|
452
|
+
endpoint.status.last_request.timestamp() != last_timestamp_sent
|
|
438
453
|
or current_min_last_analyzed != last_analyzed_sent
|
|
439
454
|
)
|
|
440
455
|
):
|
|
441
456
|
# Write to schedule chief file the last_request, min_last_analyzed we pushed event to stream
|
|
442
457
|
schedules_file.update_endpoint_timestamps(
|
|
443
458
|
endpoint_uid=endpoint.metadata.uid,
|
|
444
|
-
last_request=
|
|
459
|
+
last_request=endpoint.status.last_request.timestamp(),
|
|
445
460
|
last_analyzed=current_min_last_analyzed,
|
|
446
461
|
)
|
|
447
462
|
return True
|
|
@@ -460,13 +475,14 @@ class MonitoringApplicationController:
|
|
|
460
475
|
last_request=endpoint.status.last_request,
|
|
461
476
|
first_request=endpoint.status.first_request,
|
|
462
477
|
endpoint_type=endpoint.metadata.endpoint_type,
|
|
478
|
+
feature_set_uri=endpoint.spec.monitoring_feature_set_uri,
|
|
463
479
|
)
|
|
464
480
|
return False
|
|
465
481
|
|
|
466
482
|
@staticmethod
|
|
467
483
|
def _should_send_nop_event(
|
|
468
484
|
base_period_minutes: int,
|
|
469
|
-
min_last_analyzed:
|
|
485
|
+
min_last_analyzed: float,
|
|
470
486
|
current_time: datetime.datetime,
|
|
471
487
|
):
|
|
472
488
|
if min_last_analyzed:
|
|
@@ -515,7 +531,7 @@ class MonitoringApplicationController:
|
|
|
515
531
|
try:
|
|
516
532
|
project_name = event[ControllerEvent.PROJECT]
|
|
517
533
|
endpoint_id = event[ControllerEvent.ENDPOINT_ID]
|
|
518
|
-
|
|
534
|
+
not_old_batch_endpoint = True
|
|
519
535
|
if (
|
|
520
536
|
event[ControllerEvent.KIND]
|
|
521
537
|
== mm_constants.ControllerEventKind.BATCH_COMPLETE
|
|
@@ -572,6 +588,10 @@ class MonitoringApplicationController:
|
|
|
572
588
|
|
|
573
589
|
endpoint_mode = mm_constants.EndpointMode.REAL_TIME
|
|
574
590
|
|
|
591
|
+
not_old_batch_endpoint = (
|
|
592
|
+
event[ControllerEvent.ENDPOINT_TYPE] != EndpointType.BATCH_EP
|
|
593
|
+
)
|
|
594
|
+
|
|
575
595
|
logger.info(
|
|
576
596
|
"Starting to analyze", timestamp=last_stream_timestamp.isoformat()
|
|
577
597
|
)
|
|
@@ -590,16 +610,49 @@ class MonitoringApplicationController:
|
|
|
590
610
|
first_request=first_request,
|
|
591
611
|
last_request=last_stream_timestamp,
|
|
592
612
|
endpoint_mode=endpoint_mode,
|
|
613
|
+
not_old_batch_endpoint=not_old_batch_endpoint,
|
|
593
614
|
):
|
|
594
615
|
data_in_window = False
|
|
595
|
-
|
|
596
|
-
|
|
597
|
-
|
|
598
|
-
|
|
599
|
-
|
|
600
|
-
|
|
601
|
-
|
|
602
|
-
|
|
616
|
+
if not_old_batch_endpoint:
|
|
617
|
+
# Serving endpoint - get the relevant window data from the TSDB
|
|
618
|
+
prediction_metric = self.tsdb_connector.read_predictions(
|
|
619
|
+
start=start_infer_time,
|
|
620
|
+
end=end_infer_time,
|
|
621
|
+
endpoint_id=endpoint_id,
|
|
622
|
+
)
|
|
623
|
+
if prediction_metric.data:
|
|
624
|
+
data_in_window = True
|
|
625
|
+
else:
|
|
626
|
+
# Old batch endpoint - get the relevant window data from the parquet target
|
|
627
|
+
warnings.warn(
|
|
628
|
+
"Analyzing batch model endpoints with real time processing events is "
|
|
629
|
+
"deprecated in 1.10.0 and will be removed in 1.12.0. "
|
|
630
|
+
"Instead, use job-based serving to invoke and analyze offline batch model"
|
|
631
|
+
"endpoints.",
|
|
632
|
+
# TODO: Remove this in 1.12.0
|
|
633
|
+
FutureWarning,
|
|
634
|
+
)
|
|
635
|
+
|
|
636
|
+
if endpoint_id not in self.feature_sets:
|
|
637
|
+
self.feature_sets[endpoint_id] = fstore.get_feature_set(
|
|
638
|
+
event[ControllerEvent.FEATURE_SET_URI]
|
|
639
|
+
)
|
|
640
|
+
self.feature_sets.move_to_end(endpoint_id, last=False)
|
|
641
|
+
if (
|
|
642
|
+
len(self.feature_sets)
|
|
643
|
+
> self._MAX_FEATURE_SET_PER_WORKER
|
|
644
|
+
):
|
|
645
|
+
self.feature_sets.popitem(last=True)
|
|
646
|
+
m_fs = self.feature_sets.get(endpoint_id)
|
|
647
|
+
|
|
648
|
+
df = m_fs.to_dataframe(
|
|
649
|
+
start_time=start_infer_time,
|
|
650
|
+
end_time=end_infer_time,
|
|
651
|
+
time_column=mm_constants.EventFieldType.TIMESTAMP,
|
|
652
|
+
storage_options=self.storage_options,
|
|
653
|
+
)
|
|
654
|
+
if len(df) > 0:
|
|
655
|
+
data_in_window = True
|
|
603
656
|
|
|
604
657
|
if not data_in_window:
|
|
605
658
|
logger.info(
|
|
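The `feature_sets` OrderedDict above acts as a bounded cache: a newly fetched feature set is moved to the front, and the tail entry is evicted once `_MAX_FEATURE_SET_PER_WORKER` is exceeded. The same pattern in isolation:

```python
# Standalone sketch of the bounded-cache pattern used for feature sets above:
# insert new entries at the front, evict from the tail once past the cap.
import collections
from typing import Callable

MAX_ENTRIES = 1000
_cache: collections.OrderedDict[str, object] = collections.OrderedDict()

def get_cached(key: str, fetch: Callable[[str], object]) -> object:
    if key not in _cache:
        _cache[key] = fetch(key)
        _cache.move_to_end(key, last=False)  # newest entry goes to the front
        if len(_cache) > MAX_ENTRIES:
            _cache.popitem(last=True)  # drop the oldest (tail) entry
    return _cache[key]

print(get_cached("store://feature-sets/demo", lambda key: f"feature set for {key}"))
```

Note that entries are only promoted on insertion, not on every hit, so the eviction order follows insertion order rather than strict least-recent use.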
```diff
@@ -616,7 +669,10 @@ class MonitoringApplicationController:
                         endpoint_id=endpoint_id,
                     )
                     self._push_to_applications(
-                        start_infer_time=start_infer_time,
+                        start_infer_time=start_infer_time
+                        - datetime.timedelta(
+                            batch_window_generator.batch_window.TIMESTAMP_RESOLUTION_MICRO
+                        ),  # We subtract a microsecond to ensure that the apps will retrieve start time data.
                         end_infer_time=end_infer_time,
                         endpoint_id=endpoint_id,
                         endpoint_name=endpoint_name,
```
```diff
@@ -653,6 +709,9 @@ class MonitoringApplicationController:
                     ControllerEvent.ENDPOINT_TYPE: event[
                         ControllerEvent.ENDPOINT_TYPE
                     ],
+                    ControllerEvent.FEATURE_SET_URI: event[
+                        ControllerEvent.FEATURE_SET_URI
+                    ],
                     ControllerEvent.FIRST_REQUEST: event[
                         ControllerEvent.FIRST_REQUEST
                     ],
```
```diff
@@ -732,8 +791,17 @@ class MonitoringApplicationController:
         logger.info("Starting monitoring controller chief")
         applications_names = []
         endpoints = self.project_obj.list_model_endpoints(
-            tsdb_metrics=False,
+            tsdb_metrics=False,
+            modes=[
+                mm_constants.EndpointMode.REAL_TIME,
+                mm_constants.EndpointMode.BATCH_LEGACY,
+            ],
         ).endpoints
+
+        if not endpoints:
+            logger.info("No model endpoints found", project=self.project)
+            return
+
         last_request_dict = self.tsdb_connector.get_last_request(
             endpoint_ids=[mep.metadata.uid for mep in endpoints]
         )
```
```diff
@@ -742,9 +810,6 @@ class MonitoringApplicationController:
             mm_constants.EventFieldType.ENDPOINT_ID
         )[mm_constants.ModelEndpointSchema.LAST_REQUEST].to_dict()

-        if not endpoints:
-            logger.info("No model endpoints found", project=self.project)
-            return
         monitoring_functions = self.project_obj.list_model_monitoring_functions()
         if monitoring_functions:
             # if monitoring_functions: - TODO : ML-7700
```
```diff
@@ -790,7 +855,11 @@ class MonitoringApplicationController:
         for endpoint in endpoints:
             last_request = last_request_dict.get(endpoint.metadata.uid, None)
             if isinstance(last_request, float):
-                last_request =
+                last_request = datetime.datetime.fromtimestamp(
+                    last_request, tz=datetime.UTC
+                )
+            elif isinstance(last_request, pd.Timestamp):
+                last_request = last_request.to_pydatetime()
             endpoint.status.last_request = (
                 last_request or endpoint.status.last_request
             )
```

(The removed line's right-hand side was lost in the rendered diff and is left blank above.)
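The chief loop above now tolerates two shapes of last-request values coming back from the TSDB, a float epoch or a `pandas.Timestamp`, and normalizes both to a timezone-aware datetime. The same idea as a standalone helper (a sketch, not the mlrun API):

```python
# Sketch: normalize a TSDB last-request value (float epoch or pd.Timestamp)
# to a timezone-aware datetime, mirroring the branch above.
import datetime
import pandas as pd

def normalize_last_request(value) -> datetime.datetime:
    if isinstance(value, float):
        return datetime.datetime.fromtimestamp(value, tz=datetime.UTC)
    if isinstance(value, pd.Timestamp):
        return value.to_pydatetime()
    return value  # already a datetime (or None, handled by the caller)

print(normalize_last_request(1_700_000_000.0))
print(normalize_last_request(pd.Timestamp("2023-11-14T22:13:20Z")))
```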
```diff
@@ -842,6 +911,7 @@ class MonitoringApplicationController:
                     sep=" ", timespec="microseconds"
                 ),
                 endpoint_type=endpoint.metadata.endpoint_type,
+                feature_set_uri=endpoint.spec.monitoring_feature_set_uri,
                 endpoint_policy=json.dumps(policy),
             )
             policy[ControllerEventEndpointPolicy.ENDPOINT_UPDATED] = (
```
```diff
@@ -859,6 +929,7 @@ class MonitoringApplicationController:
                     sep=" ", timespec="microseconds"
                 ),
                 endpoint_type=endpoint.metadata.endpoint_type.value,
+                feature_set_uri=endpoint.spec.monitoring_feature_set_uri,
                 endpoint_policy=policy,
             )

```
```diff
@@ -871,6 +942,7 @@ class MonitoringApplicationController:
         timestamp: str,
         first_request: str,
         endpoint_type: int,
+        feature_set_uri: str,
         endpoint_policy: dict[str, Any],
     ) -> None:
         """
```
```diff
@@ -883,6 +955,7 @@ class MonitoringApplicationController:
         :param endpoint_id: endpoint id string
         :param endpoint_name: the endpoint name string
        :param endpoint_type: Enum of the endpoint type
+       :param feature_set_uri: the feature set uri string
         """
         event = {
             ControllerEvent.KIND.value: kind,
```
```diff
@@ -892,6 +965,7 @@ class MonitoringApplicationController:
             ControllerEvent.TIMESTAMP.value: timestamp,
             ControllerEvent.FIRST_REQUEST.value: first_request,
             ControllerEvent.ENDPOINT_TYPE.value: endpoint_type,
+            ControllerEvent.FEATURE_SET_URI.value: feature_set_uri,
             ControllerEvent.ENDPOINT_POLICY.value: endpoint_policy,
         }
         logger.info(
```
mlrun/model_monitoring/db/_schedules.py

```diff
@@ -13,15 +13,12 @@
 # limitations under the License.

 import json
-import sys
 from abc import ABC, abstractmethod
 from contextlib import AbstractContextManager
-from datetime import datetime, timezone
+from datetime import datetime
 from types import TracebackType
 from typing import TYPE_CHECKING, Final, Optional

-import botocore.exceptions
-
 import mlrun
 import mlrun.common.schemas as schemas
 import mlrun.errors
```
```diff
@@ -30,10 +27,7 @@ import mlrun.utils.helpers
 from mlrun.utils import logger

 if TYPE_CHECKING:
-    if sys.version_info >= (3, 11):
-        from typing import Self
-    else:
-        from typing_extensions import Self
+    from typing import Self


 class ModelMonitoringSchedulesFileBase(AbstractContextManager, ABC):
```
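Dropping the `typing_extensions` fallback means this module now assumes Python 3.11+, where `typing.Self` (PEP 673) is available directly:

```python
# typing.Self is stdlib from Python 3.11 (PEP 673); the removed fallback was
# only needed on older interpreters. Minimal illustration:
from typing import Self

class SchedulesFile:
    def __enter__(self) -> Self:  # subclasses keep their own return type
        return self
```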
```diff
@@ -88,16 +82,8 @@ class ModelMonitoringSchedulesFileBase(AbstractContextManager, ABC):
             except (
                 mlrun.errors.MLRunNotFoundError,
                 # Different errors are raised for S3 or local storage, see ML-8042
-                botocore.exceptions.ClientError,
                 FileNotFoundError,
-            ) as err:
-                if (
-                    isinstance(err, botocore.exceptions.ClientError)
-                    # Add a log only to "NoSuchKey" errors codes - equivalent to `FileNotFoundError`
-                    and err.response["Error"]["Code"] != "NoSuchKey"
-                ):
-                    raise
-
+            ):
                 logger.exception(
                     "The schedules file was not found. It should have been created "
                     "as a part of the model endpoint's creation",
```
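With botocore removed, the except clause narrows to the two errors this code now expects for a missing file. A sketch of the simplified pattern, assuming (as the removed comment implied) that missing S3 objects now surface here as `FileNotFoundError` or `MLRunNotFoundError` rather than raw botocore ClientErrors:

```python
# Sketch of the narrowed error handling, assuming missing objects surface as
# MLRunNotFoundError or FileNotFoundError regardless of the storage backend.
import mlrun.errors

def load_schedules(read_fn) -> dict:
    try:
        return read_fn()
    except (mlrun.errors.MLRunNotFoundError, FileNotFoundError):
        return {}  # fall back to an empty schedules mapping
```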
```diff
@@ -162,19 +148,29 @@ class ModelMonitoringSchedulesFileEndpoint(ModelMonitoringSchedulesFileBase):
             endpoint_id=model_endpoint.metadata.uid,
         )

-    def get_application_time(self, application: str) -> Optional[int]:
+    def get_application_time(self, application: str) -> Optional[float]:
         self._check_open_schedules()
         return self._schedules.get(application)

-    def update_application_time(self, application: str, timestamp: int) -> None:
+    def update_application_time(self, application: str, timestamp: float) -> None:
+        self._check_open_schedules()
+        self._schedules[application] = float(timestamp)
+
+    def delete_application_time(self, application: str) -> None:
         self._check_open_schedules()
-        self._schedules[application] = timestamp
+        if application in self._schedules:
+            logger.debug(
+                "Deleting application time from schedules",
+                application=application,
+                endpoint_id=self._endpoint_id,
+            )
+            del self._schedules[application]

     def get_application_list(self) -> set[str]:
         self._check_open_schedules()
         return set(self._schedules.keys())

-    def get_min_timestamp(self) -> Optional[int]:
+    def get_min_timestamp(self) -> Optional[float]:
         self._check_open_schedules()
         return min(self._schedules.values(), default=None)

```
```diff
@@ -198,7 +194,7 @@ class ModelMonitoringSchedulesFileChief(ModelMonitoringSchedulesFileBase):
             project=self._project
         )

-    def get_endpoint_last_request(self, endpoint_uid: str) -> Optional[int]:
+    def get_endpoint_last_request(self, endpoint_uid: str) -> Optional[float]:
         self._check_open_schedules()
         if endpoint_uid in self._schedules:
             return self._schedules[endpoint_uid].get(
```
```diff
@@ -208,15 +204,19 @@ class ModelMonitoringSchedulesFileChief(ModelMonitoringSchedulesFileBase):
         return None

     def update_endpoint_timestamps(
-        self, endpoint_uid: str, last_request: int, last_analyzed: int
+        self, endpoint_uid: str, last_request: float, last_analyzed: float
     ) -> None:
         self._check_open_schedules()
         self._schedules[endpoint_uid] = {
-            schemas.model_monitoring.constants.ScheduleChiefFields.LAST_REQUEST: last_request,
-            schemas.model_monitoring.constants.ScheduleChiefFields.LAST_ANALYZED: last_analyzed,
+            schemas.model_monitoring.constants.ScheduleChiefFields.LAST_REQUEST: float(
+                last_request
+            ),
+            schemas.model_monitoring.constants.ScheduleChiefFields.LAST_ANALYZED: float(
+                last_analyzed
+            ),
         }

-    def get_endpoint_last_analyzed(self, endpoint_uid: str) -> Optional[int]:
+    def get_endpoint_last_analyzed(self, endpoint_uid: str) -> Optional[float]:
         self._check_open_schedules()
         if endpoint_uid in self._schedules:
             return self._schedules[endpoint_uid].get(
```
```diff
@@ -267,9 +267,18 @@ class ModelMonitoringSchedulesFileApplication(ModelMonitoringSchedulesFileBase):
         self, endpoint_uid: str, last_analyzed: datetime
     ) -> None:
         self._check_open_schedules()
-        self._schedules[endpoint_uid] = last_analyzed.astimezone(
-            tz=timezone.utc
-        ).isoformat()
+        self._schedules[endpoint_uid] = last_analyzed.isoformat()
+
+    def delete_endpoints_last_analyzed(self, endpoint_uids: list[str]) -> None:
+        self._check_open_schedules()
+        for endpoint_uid in endpoint_uids:
+            if endpoint_uid in self._schedules:
+                logger.debug(
+                    "Deleting endpoint last analyzed from schedules",
+                    endpoint_uid=endpoint_uid,
+                    application=self._application,
+                )
+                del self._schedules[endpoint_uid]


 def _delete_folder(folder: str) -> None:
```
mlrun/model_monitoring/db/_stats.py

```diff
@@ -13,11 +13,11 @@
 # limitations under the License.
 import abc
 import json
+import typing
 from abc import abstractmethod
-from datetime import datetime, timezone
+from datetime import UTC, datetime
 from typing import cast

-import botocore.exceptions
 import fsspec

 import mlrun.datastore.base
```
```diff
@@ -73,7 +73,7 @@ class ModelMonitoringStatsFile(abc.ABC):
             path=self._item.url,
         )

-    def read(self) -> tuple[dict, datetime]:
+    def read(self) -> tuple[dict, typing.Optional[datetime]]:
         """
         Read the stats data and timestamp saved in file
         :return: tuple[dict, str] dictionary with stats data and timestamp saved in file
```
```diff
@@ -82,30 +82,20 @@ class ModelMonitoringStatsFile(abc.ABC):
             content = json.loads(self._item.get().decode())
             timestamp = content.get("timestamp")
             if timestamp is not None:
-                timestamp = datetime.fromisoformat(timestamp).astimezone(
-                    tz=timezone.utc
-                )
+                timestamp = datetime.fromisoformat(timestamp).astimezone(tz=UTC)
             return content.get("data"), timestamp
         except (
             mlrun.errors.MLRunNotFoundError,
             # Different errors are raised for S3 or local storage, see ML-8042
-            botocore.exceptions.ClientError,
             FileNotFoundError,
         ) as err:
-            if (
-                isinstance(err, botocore.exceptions.ClientError)
-                # Add a log only to "NoSuchKey" errors codes - equivalent to `FileNotFoundError`
-                and err.response["Error"]["Code"] != "NoSuchKey"
-            ):
-                raise
-
-            logger.exception(
+            logger.warning(
                 "The Stats file was not found. It should have been created "
                 "as a part of the model endpoint's creation",
                 path=self._path,
                 error=err,
             )
-
+            return {}, None

     def write(self, stats: dict, timestamp: datetime) -> None:
         """
```