mlrun 1.7.1rc10__py3-none-any.whl → 1.8.0rc11__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of mlrun might be problematic. Click here for more details.
- mlrun/__init__.py +23 -21
- mlrun/__main__.py +3 -3
- mlrun/alerts/alert.py +148 -14
- mlrun/artifacts/__init__.py +2 -3
- mlrun/artifacts/base.py +55 -12
- mlrun/artifacts/dataset.py +16 -16
- mlrun/artifacts/document.py +378 -0
- mlrun/artifacts/manager.py +26 -17
- mlrun/artifacts/model.py +66 -53
- mlrun/common/constants.py +8 -0
- mlrun/common/formatters/__init__.py +1 -0
- mlrun/common/formatters/feature_set.py +1 -0
- mlrun/common/formatters/function.py +1 -0
- mlrun/{model_monitoring/db/stores/base/__init__.py → common/formatters/model_endpoint.py} +16 -1
- mlrun/common/formatters/pipeline.py +1 -2
- mlrun/common/formatters/project.py +9 -0
- mlrun/common/model_monitoring/__init__.py +0 -5
- mlrun/common/model_monitoring/helpers.py +1 -29
- mlrun/common/runtimes/constants.py +1 -2
- mlrun/common/schemas/__init__.py +6 -2
- mlrun/common/schemas/alert.py +111 -19
- mlrun/common/schemas/api_gateway.py +3 -3
- mlrun/common/schemas/artifact.py +11 -7
- mlrun/common/schemas/auth.py +6 -4
- mlrun/common/schemas/background_task.py +7 -7
- mlrun/common/schemas/client_spec.py +2 -3
- mlrun/common/schemas/clusterization_spec.py +2 -2
- mlrun/common/schemas/common.py +53 -3
- mlrun/common/schemas/constants.py +15 -0
- mlrun/common/schemas/datastore_profile.py +1 -1
- mlrun/common/schemas/feature_store.py +9 -9
- mlrun/common/schemas/frontend_spec.py +4 -4
- mlrun/common/schemas/function.py +10 -10
- mlrun/common/schemas/hub.py +1 -1
- mlrun/common/schemas/k8s.py +3 -3
- mlrun/common/schemas/memory_reports.py +3 -3
- mlrun/common/schemas/model_monitoring/__init__.py +2 -1
- mlrun/common/schemas/model_monitoring/constants.py +67 -14
- mlrun/common/schemas/model_monitoring/grafana.py +1 -1
- mlrun/common/schemas/model_monitoring/model_endpoints.py +92 -147
- mlrun/common/schemas/notification.py +24 -3
- mlrun/common/schemas/object.py +1 -1
- mlrun/common/schemas/pagination.py +4 -4
- mlrun/common/schemas/partition.py +137 -0
- mlrun/common/schemas/pipeline.py +2 -2
- mlrun/common/schemas/project.py +25 -17
- mlrun/common/schemas/runs.py +2 -2
- mlrun/common/schemas/runtime_resource.py +5 -5
- mlrun/common/schemas/schedule.py +1 -1
- mlrun/common/schemas/secret.py +1 -1
- mlrun/common/schemas/tag.py +3 -3
- mlrun/common/schemas/workflow.py +5 -5
- mlrun/config.py +68 -10
- mlrun/data_types/__init__.py +0 -2
- mlrun/data_types/data_types.py +1 -0
- mlrun/data_types/infer.py +3 -1
- mlrun/data_types/spark.py +5 -3
- mlrun/data_types/to_pandas.py +11 -2
- mlrun/datastore/__init__.py +2 -2
- mlrun/datastore/alibaba_oss.py +4 -1
- mlrun/datastore/azure_blob.py +4 -1
- mlrun/datastore/base.py +12 -4
- mlrun/datastore/datastore.py +9 -3
- mlrun/datastore/datastore_profile.py +79 -20
- mlrun/datastore/dbfs_store.py +4 -1
- mlrun/datastore/filestore.py +4 -1
- mlrun/datastore/google_cloud_storage.py +4 -1
- mlrun/datastore/hdfs.py +4 -1
- mlrun/datastore/inmem.py +4 -1
- mlrun/datastore/redis.py +4 -1
- mlrun/datastore/s3.py +4 -1
- mlrun/datastore/sources.py +52 -51
- mlrun/datastore/store_resources.py +7 -4
- mlrun/datastore/targets.py +23 -22
- mlrun/datastore/utils.py +2 -2
- mlrun/datastore/v3io.py +4 -1
- mlrun/datastore/vectorstore.py +229 -0
- mlrun/datastore/wasbfs/fs.py +13 -12
- mlrun/db/base.py +213 -83
- mlrun/db/factory.py +0 -3
- mlrun/db/httpdb.py +1265 -387
- mlrun/db/nopdb.py +205 -74
- mlrun/errors.py +2 -2
- mlrun/execution.py +136 -50
- mlrun/feature_store/__init__.py +0 -2
- mlrun/feature_store/api.py +41 -40
- mlrun/feature_store/common.py +9 -9
- mlrun/feature_store/feature_set.py +20 -18
- mlrun/feature_store/feature_vector.py +27 -24
- mlrun/feature_store/retrieval/base.py +14 -9
- mlrun/feature_store/retrieval/job.py +2 -1
- mlrun/feature_store/steps.py +2 -2
- mlrun/features.py +30 -13
- mlrun/frameworks/__init__.py +1 -2
- mlrun/frameworks/_common/__init__.py +1 -2
- mlrun/frameworks/_common/artifacts_library.py +2 -2
- mlrun/frameworks/_common/mlrun_interface.py +10 -6
- mlrun/frameworks/_common/model_handler.py +29 -27
- mlrun/frameworks/_common/producer.py +3 -1
- mlrun/frameworks/_dl_common/__init__.py +1 -2
- mlrun/frameworks/_dl_common/loggers/__init__.py +1 -2
- mlrun/frameworks/_dl_common/loggers/mlrun_logger.py +4 -4
- mlrun/frameworks/_dl_common/loggers/tensorboard_logger.py +3 -3
- mlrun/frameworks/_ml_common/__init__.py +1 -2
- mlrun/frameworks/_ml_common/loggers/__init__.py +1 -2
- mlrun/frameworks/_ml_common/model_handler.py +21 -21
- mlrun/frameworks/_ml_common/plans/__init__.py +1 -2
- mlrun/frameworks/_ml_common/plans/confusion_matrix_plan.py +3 -1
- mlrun/frameworks/_ml_common/plans/dataset_plan.py +3 -3
- mlrun/frameworks/_ml_common/plans/roc_curve_plan.py +4 -4
- mlrun/frameworks/auto_mlrun/__init__.py +1 -2
- mlrun/frameworks/auto_mlrun/auto_mlrun.py +22 -15
- mlrun/frameworks/huggingface/__init__.py +1 -2
- mlrun/frameworks/huggingface/model_server.py +9 -9
- mlrun/frameworks/lgbm/__init__.py +47 -44
- mlrun/frameworks/lgbm/callbacks/__init__.py +1 -2
- mlrun/frameworks/lgbm/callbacks/logging_callback.py +4 -2
- mlrun/frameworks/lgbm/callbacks/mlrun_logging_callback.py +4 -2
- mlrun/frameworks/lgbm/mlrun_interfaces/__init__.py +1 -2
- mlrun/frameworks/lgbm/mlrun_interfaces/mlrun_interface.py +5 -5
- mlrun/frameworks/lgbm/model_handler.py +15 -11
- mlrun/frameworks/lgbm/model_server.py +11 -7
- mlrun/frameworks/lgbm/utils.py +2 -2
- mlrun/frameworks/onnx/__init__.py +1 -2
- mlrun/frameworks/onnx/dataset.py +3 -3
- mlrun/frameworks/onnx/mlrun_interface.py +2 -2
- mlrun/frameworks/onnx/model_handler.py +7 -5
- mlrun/frameworks/onnx/model_server.py +8 -6
- mlrun/frameworks/parallel_coordinates.py +11 -11
- mlrun/frameworks/pytorch/__init__.py +22 -23
- mlrun/frameworks/pytorch/callbacks/__init__.py +1 -2
- mlrun/frameworks/pytorch/callbacks/callback.py +2 -1
- mlrun/frameworks/pytorch/callbacks/logging_callback.py +15 -8
- mlrun/frameworks/pytorch/callbacks/mlrun_logging_callback.py +19 -12
- mlrun/frameworks/pytorch/callbacks/tensorboard_logging_callback.py +22 -15
- mlrun/frameworks/pytorch/callbacks_handler.py +36 -30
- mlrun/frameworks/pytorch/mlrun_interface.py +17 -17
- mlrun/frameworks/pytorch/model_handler.py +21 -17
- mlrun/frameworks/pytorch/model_server.py +13 -9
- mlrun/frameworks/sklearn/__init__.py +19 -18
- mlrun/frameworks/sklearn/estimator.py +2 -2
- mlrun/frameworks/sklearn/metric.py +3 -3
- mlrun/frameworks/sklearn/metrics_library.py +8 -6
- mlrun/frameworks/sklearn/mlrun_interface.py +3 -2
- mlrun/frameworks/sklearn/model_handler.py +4 -3
- mlrun/frameworks/tf_keras/__init__.py +11 -12
- mlrun/frameworks/tf_keras/callbacks/__init__.py +1 -2
- mlrun/frameworks/tf_keras/callbacks/logging_callback.py +17 -14
- mlrun/frameworks/tf_keras/callbacks/mlrun_logging_callback.py +15 -12
- mlrun/frameworks/tf_keras/callbacks/tensorboard_logging_callback.py +21 -18
- mlrun/frameworks/tf_keras/model_handler.py +17 -13
- mlrun/frameworks/tf_keras/model_server.py +12 -8
- mlrun/frameworks/xgboost/__init__.py +19 -18
- mlrun/frameworks/xgboost/model_handler.py +13 -9
- mlrun/launcher/base.py +3 -4
- mlrun/launcher/local.py +1 -1
- mlrun/launcher/remote.py +1 -1
- mlrun/lists.py +4 -3
- mlrun/model.py +117 -46
- mlrun/model_monitoring/__init__.py +4 -4
- mlrun/model_monitoring/api.py +72 -59
- mlrun/model_monitoring/applications/_application_steps.py +17 -17
- mlrun/model_monitoring/applications/base.py +165 -6
- mlrun/model_monitoring/applications/context.py +88 -37
- mlrun/model_monitoring/applications/evidently_base.py +0 -1
- mlrun/model_monitoring/applications/histogram_data_drift.py +43 -21
- mlrun/model_monitoring/applications/results.py +55 -3
- mlrun/model_monitoring/controller.py +207 -239
- mlrun/model_monitoring/db/__init__.py +0 -2
- mlrun/model_monitoring/db/_schedules.py +156 -0
- mlrun/model_monitoring/db/_stats.py +189 -0
- mlrun/model_monitoring/db/tsdb/base.py +78 -25
- mlrun/model_monitoring/db/tsdb/tdengine/schemas.py +61 -6
- mlrun/model_monitoring/db/tsdb/tdengine/stream_graph_steps.py +33 -0
- mlrun/model_monitoring/db/tsdb/tdengine/tdengine_connector.py +255 -29
- mlrun/model_monitoring/db/tsdb/v3io/stream_graph_steps.py +1 -0
- mlrun/model_monitoring/db/tsdb/v3io/v3io_connector.py +78 -17
- mlrun/model_monitoring/helpers.py +151 -49
- mlrun/model_monitoring/stream_processing.py +99 -283
- mlrun/model_monitoring/tracking_policy.py +10 -3
- mlrun/model_monitoring/writer.py +48 -36
- mlrun/package/__init__.py +3 -6
- mlrun/package/context_handler.py +1 -1
- mlrun/package/packager.py +12 -9
- mlrun/package/packagers/__init__.py +0 -2
- mlrun/package/packagers/default_packager.py +14 -11
- mlrun/package/packagers/numpy_packagers.py +16 -7
- mlrun/package/packagers/pandas_packagers.py +18 -18
- mlrun/package/packagers/python_standard_library_packagers.py +25 -11
- mlrun/package/packagers_manager.py +31 -14
- mlrun/package/utils/__init__.py +0 -3
- mlrun/package/utils/_pickler.py +6 -6
- mlrun/platforms/__init__.py +47 -16
- mlrun/platforms/iguazio.py +4 -1
- mlrun/projects/operations.py +27 -27
- mlrun/projects/pipelines.py +71 -36
- mlrun/projects/project.py +890 -220
- mlrun/run.py +53 -10
- mlrun/runtimes/__init__.py +1 -3
- mlrun/runtimes/base.py +15 -11
- mlrun/runtimes/daskjob.py +9 -9
- mlrun/runtimes/generators.py +2 -1
- mlrun/runtimes/kubejob.py +4 -5
- mlrun/runtimes/mounts.py +572 -0
- mlrun/runtimes/mpijob/__init__.py +0 -2
- mlrun/runtimes/mpijob/abstract.py +7 -6
- mlrun/runtimes/nuclio/api_gateway.py +7 -7
- mlrun/runtimes/nuclio/application/application.py +11 -11
- mlrun/runtimes/nuclio/function.py +19 -17
- mlrun/runtimes/nuclio/serving.py +18 -13
- mlrun/runtimes/pod.py +154 -45
- mlrun/runtimes/remotesparkjob.py +3 -2
- mlrun/runtimes/sparkjob/__init__.py +0 -2
- mlrun/runtimes/sparkjob/spark3job.py +21 -11
- mlrun/runtimes/utils.py +6 -5
- mlrun/serving/merger.py +6 -4
- mlrun/serving/remote.py +18 -17
- mlrun/serving/routers.py +185 -172
- mlrun/serving/server.py +7 -1
- mlrun/serving/states.py +97 -78
- mlrun/serving/utils.py +13 -2
- mlrun/serving/v1_serving.py +3 -2
- mlrun/serving/v2_serving.py +105 -72
- mlrun/track/__init__.py +1 -1
- mlrun/track/tracker.py +2 -2
- mlrun/track/trackers/mlflow_tracker.py +6 -5
- mlrun/utils/async_http.py +1 -1
- mlrun/utils/clones.py +1 -1
- mlrun/utils/helpers.py +63 -19
- mlrun/utils/logger.py +106 -4
- mlrun/utils/notifications/notification/__init__.py +22 -19
- mlrun/utils/notifications/notification/base.py +33 -14
- mlrun/utils/notifications/notification/console.py +6 -6
- mlrun/utils/notifications/notification/git.py +11 -11
- mlrun/utils/notifications/notification/ipython.py +10 -9
- mlrun/utils/notifications/notification/mail.py +176 -0
- mlrun/utils/notifications/notification/slack.py +6 -6
- mlrun/utils/notifications/notification/webhook.py +6 -6
- mlrun/utils/notifications/notification_pusher.py +86 -44
- mlrun/utils/regex.py +11 -2
- mlrun/utils/version/version.json +2 -2
- {mlrun-1.7.1rc10.dist-info → mlrun-1.8.0rc11.dist-info}/METADATA +29 -24
- mlrun-1.8.0rc11.dist-info/RECORD +347 -0
- mlrun/model_monitoring/db/stores/__init__.py +0 -136
- mlrun/model_monitoring/db/stores/base/store.py +0 -213
- mlrun/model_monitoring/db/stores/sqldb/__init__.py +0 -13
- mlrun/model_monitoring/db/stores/sqldb/models/__init__.py +0 -71
- mlrun/model_monitoring/db/stores/sqldb/models/base.py +0 -190
- mlrun/model_monitoring/db/stores/sqldb/models/mysql.py +0 -103
- mlrun/model_monitoring/db/stores/sqldb/models/sqlite.py +0 -40
- mlrun/model_monitoring/db/stores/sqldb/sql_store.py +0 -659
- mlrun/model_monitoring/db/stores/v3io_kv/__init__.py +0 -13
- mlrun/model_monitoring/db/stores/v3io_kv/kv_store.py +0 -726
- mlrun/model_monitoring/model_endpoint.py +0 -118
- mlrun-1.7.1rc10.dist-info/RECORD +0 -351
- {mlrun-1.7.1rc10.dist-info → mlrun-1.8.0rc11.dist-info}/LICENSE +0 -0
- {mlrun-1.7.1rc10.dist-info → mlrun-1.8.0rc11.dist-info}/WHEEL +0 -0
- {mlrun-1.7.1rc10.dist-info → mlrun-1.8.0rc11.dist-info}/entry_points.txt +0 -0
- {mlrun-1.7.1rc10.dist-info → mlrun-1.8.0rc11.dist-info}/top_level.txt +0 -0
|
@@ -11,31 +11,31 @@
|
|
|
11
11
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
12
12
|
# See the License for the specific language governing permissions and
|
|
13
13
|
# limitations under the License.
|
|
14
|
+
|
|
14
15
|
import concurrent.futures
|
|
15
16
|
import datetime
|
|
16
17
|
import json
|
|
17
18
|
import os
|
|
18
|
-
import re
|
|
19
19
|
from collections.abc import Iterator
|
|
20
|
-
from
|
|
20
|
+
from contextlib import AbstractContextManager
|
|
21
|
+
from types import TracebackType
|
|
22
|
+
from typing import NamedTuple, Optional, cast
|
|
21
23
|
|
|
22
|
-
import
|
|
24
|
+
import nuclio_sdk
|
|
23
25
|
|
|
24
26
|
import mlrun
|
|
25
27
|
import mlrun.common.schemas.model_monitoring.constants as mm_constants
|
|
26
|
-
import mlrun.data_types.infer
|
|
27
28
|
import mlrun.feature_store as fstore
|
|
28
|
-
import mlrun.model_monitoring
|
|
29
|
-
from mlrun.
|
|
29
|
+
import mlrun.model_monitoring
|
|
30
|
+
from mlrun.common.schemas import EndpointType
|
|
30
31
|
from mlrun.datastore import get_stream_pusher
|
|
31
32
|
from mlrun.errors import err_to_str
|
|
32
|
-
from mlrun.model_monitoring.
|
|
33
|
-
|
|
34
|
-
batch_dict2timedelta,
|
|
35
|
-
get_stream_path,
|
|
36
|
-
)
|
|
33
|
+
from mlrun.model_monitoring.db._schedules import ModelMonitoringSchedulesFile
|
|
34
|
+
from mlrun.model_monitoring.helpers import batch_dict2timedelta, get_stream_path
|
|
37
35
|
from mlrun.utils import datetime_now, logger
|
|
38
36
|
|
|
37
|
+
_SECONDS_IN_DAY = int(datetime.timedelta(days=1).total_seconds())
|
|
38
|
+
|
|
39
39
|
|
|
40
40
|
class _Interval(NamedTuple):
|
|
41
41
|
start: datetime.datetime
|
|
@@ -45,12 +45,12 @@ class _Interval(NamedTuple):
|
|
|
45
45
|
class _BatchWindow:
|
|
46
46
|
def __init__(
|
|
47
47
|
self,
|
|
48
|
-
|
|
49
|
-
|
|
48
|
+
*,
|
|
49
|
+
schedules_file: ModelMonitoringSchedulesFile,
|
|
50
50
|
application: str,
|
|
51
51
|
timedelta_seconds: int,
|
|
52
|
-
last_updated:
|
|
53
|
-
first_request:
|
|
52
|
+
last_updated: int,
|
|
53
|
+
first_request: int,
|
|
54
54
|
) -> None:
|
|
55
55
|
"""
|
|
56
56
|
Initialize a batch window object that handles the batch interval time range
|
|
@@ -58,159 +58,124 @@ class _BatchWindow:
|
|
|
58
58
|
All the time values are in seconds.
|
|
59
59
|
The start and stop time are in seconds since the epoch.
|
|
60
60
|
"""
|
|
61
|
-
self.project = project
|
|
62
|
-
self._endpoint = endpoint
|
|
63
61
|
self._application = application
|
|
64
62
|
self._first_request = first_request
|
|
65
63
|
self._stop = last_updated
|
|
66
64
|
self._step = timedelta_seconds
|
|
67
|
-
self._db =
|
|
65
|
+
self._db = schedules_file
|
|
68
66
|
self._start = self._get_last_analyzed()
|
|
69
67
|
|
|
70
|
-
def
|
|
71
|
-
|
|
72
|
-
last_analyzed = self._db.get_last_analyzed(
|
|
73
|
-
endpoint_id=self._endpoint,
|
|
74
|
-
application_name=self._application,
|
|
75
|
-
)
|
|
76
|
-
except mlrun.errors.MLRunNotFoundError:
|
|
77
|
-
logger.info(
|
|
78
|
-
"No last analyzed time was found for this endpoint and "
|
|
79
|
-
"application, as this is probably the first time this "
|
|
80
|
-
"application is running. Using the latest between first "
|
|
81
|
-
"request time or last update time minus one day instead",
|
|
82
|
-
endpoint=self._endpoint,
|
|
83
|
-
application=self._application,
|
|
84
|
-
first_request=self._first_request,
|
|
85
|
-
last_updated=self._stop,
|
|
86
|
-
)
|
|
87
|
-
|
|
88
|
-
if self._first_request and self._stop:
|
|
89
|
-
# TODO : Change the timedelta according to the policy.
|
|
90
|
-
first_period_in_seconds = max(
|
|
91
|
-
int(datetime.timedelta(days=1).total_seconds()), self._step
|
|
92
|
-
) # max between one day and the base period
|
|
93
|
-
return max(
|
|
94
|
-
self._first_request,
|
|
95
|
-
self._stop - first_period_in_seconds,
|
|
96
|
-
)
|
|
97
|
-
return self._first_request
|
|
68
|
+
def _get_saved_last_analyzed(self) -> Optional[int]:
|
|
69
|
+
return cast(int, self._db.get_application_time(self._application))
|
|
98
70
|
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
application=self._application,
|
|
103
|
-
last_analyzed=last_analyzed,
|
|
71
|
+
def _update_last_analyzed(self, last_analyzed: int) -> None:
|
|
72
|
+
self._db.update_application_time(
|
|
73
|
+
application=self._application, timestamp=last_analyzed
|
|
104
74
|
)
|
|
105
|
-
return last_analyzed
|
|
106
75
|
|
|
107
|
-
def
|
|
76
|
+
def _get_initial_last_analyzed(self) -> int:
|
|
108
77
|
logger.info(
|
|
109
|
-
"
|
|
110
|
-
|
|
78
|
+
"No last analyzed time was found for this endpoint and application, as this is "
|
|
79
|
+
"probably the first time this application is running. Initializing last analyzed "
|
|
80
|
+
"to the latest between first request time or last update time minus one day",
|
|
111
81
|
application=self._application,
|
|
112
|
-
|
|
82
|
+
first_request=self._first_request,
|
|
83
|
+
last_updated=self._stop,
|
|
113
84
|
)
|
|
114
|
-
|
|
115
|
-
self.
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
|
|
85
|
+
# max between one day and the base period
|
|
86
|
+
first_period_in_seconds = max(_SECONDS_IN_DAY, self._step)
|
|
87
|
+
return max(
|
|
88
|
+
self._first_request,
|
|
89
|
+
self._stop - first_period_in_seconds,
|
|
119
90
|
)
|
|
120
91
|
|
|
121
|
-
def
|
|
122
|
-
self
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
if self._start is not None and self._stop is not None:
|
|
126
|
-
entered = False
|
|
127
|
-
# Iterate timestamp from start until timestamp <= stop - step
|
|
128
|
-
# so that the last interval will end at (timestamp + step) <= stop.
|
|
129
|
-
# Add 1 to stop - step to get <= and not <.
|
|
130
|
-
for timestamp in range(
|
|
131
|
-
self._start, self._stop - self._step + 1, self._step
|
|
132
|
-
):
|
|
133
|
-
entered = True
|
|
134
|
-
start_time = datetime.datetime.fromtimestamp(
|
|
135
|
-
timestamp, tz=datetime.timezone.utc
|
|
136
|
-
)
|
|
137
|
-
end_time = datetime.datetime.fromtimestamp(
|
|
138
|
-
timestamp + self._step, tz=datetime.timezone.utc
|
|
139
|
-
)
|
|
140
|
-
yield _Interval(start_time, end_time)
|
|
141
|
-
self._update_last_analyzed(timestamp + self._step)
|
|
142
|
-
if not entered:
|
|
143
|
-
logger.info(
|
|
144
|
-
"All the data is set, but no complete intervals were found. "
|
|
145
|
-
"Wait for last_updated to be updated",
|
|
146
|
-
endpoint=self._endpoint,
|
|
147
|
-
application=self._application,
|
|
148
|
-
start=self._start,
|
|
149
|
-
stop=self._stop,
|
|
150
|
-
step=self._step,
|
|
151
|
-
)
|
|
92
|
+
def _get_last_analyzed(self) -> int:
|
|
93
|
+
saved_last_analyzed = self._get_saved_last_analyzed()
|
|
94
|
+
if saved_last_analyzed is not None:
|
|
95
|
+
return saved_last_analyzed
|
|
152
96
|
else:
|
|
153
|
-
|
|
154
|
-
|
|
155
|
-
|
|
156
|
-
|
|
97
|
+
last_analyzed = self._get_initial_last_analyzed()
|
|
98
|
+
# Update the in-memory DB to avoid duplicate initializations
|
|
99
|
+
self._update_last_analyzed(last_analyzed)
|
|
100
|
+
return last_analyzed
|
|
101
|
+
|
|
102
|
+
def get_intervals(self) -> Iterator[_Interval]:
|
|
103
|
+
"""Generate the batch interval time ranges."""
|
|
104
|
+
entered = False
|
|
105
|
+
# Iterate timestamp from start until timestamp <= stop - step
|
|
106
|
+
# so that the last interval will end at (timestamp + step) <= stop.
|
|
107
|
+
# Add 1 to stop - step to get <= and not <.
|
|
108
|
+
for timestamp in range(self._start, self._stop - self._step + 1, self._step):
|
|
109
|
+
entered = True
|
|
110
|
+
start_time = datetime.datetime.fromtimestamp(
|
|
111
|
+
timestamp, tz=datetime.timezone.utc
|
|
112
|
+
)
|
|
113
|
+
end_time = datetime.datetime.fromtimestamp(
|
|
114
|
+
timestamp + self._step, tz=datetime.timezone.utc
|
|
115
|
+
)
|
|
116
|
+
yield _Interval(start_time, end_time)
|
|
117
|
+
|
|
118
|
+
last_analyzed = timestamp + self._step
|
|
119
|
+
self._update_last_analyzed(last_analyzed)
|
|
120
|
+
logger.debug(
|
|
121
|
+
"Updated the last analyzed time for this endpoint and application",
|
|
122
|
+
application=self._application,
|
|
123
|
+
last_analyzed=last_analyzed,
|
|
124
|
+
)
|
|
125
|
+
|
|
126
|
+
if not entered:
|
|
127
|
+
logger.debug(
|
|
128
|
+
"All the data is set, but no complete intervals were found. "
|
|
129
|
+
"Wait for last_updated to be updated",
|
|
157
130
|
application=self._application,
|
|
158
131
|
start=self._start,
|
|
159
132
|
stop=self._stop,
|
|
133
|
+
step=self._step,
|
|
160
134
|
)
|
|
161
135
|
|
|
162
136
|
|
|
163
|
-
class _BatchWindowGenerator:
|
|
164
|
-
def __init__(self,
|
|
137
|
+
class _BatchWindowGenerator(AbstractContextManager):
|
|
138
|
+
def __init__(self, project: str, endpoint_id: str, window_length: int) -> None:
|
|
165
139
|
"""
|
|
166
140
|
Initialize a batch window generator object that generates batch window objects
|
|
167
141
|
for the monitoring functions.
|
|
168
142
|
"""
|
|
169
|
-
self.
|
|
170
|
-
self.
|
|
171
|
-
self._timedelta =
|
|
172
|
-
|
|
173
|
-
|
|
174
|
-
|
|
175
|
-
|
|
176
|
-
|
|
177
|
-
|
|
178
|
-
|
|
179
|
-
|
|
180
|
-
|
|
181
|
-
|
|
182
|
-
|
|
183
|
-
|
|
184
|
-
|
|
185
|
-
|
|
186
|
-
self.
|
|
187
|
-
|
|
188
|
-
pair_list = pair.split(":")
|
|
189
|
-
self._batch_dict[pair_list[0]] = float(pair_list[1])
|
|
190
|
-
|
|
191
|
-
def _get_timedelta(self) -> int:
|
|
192
|
-
"""Get the timedelta in seconds from the batch dictionary"""
|
|
193
|
-
return int(
|
|
194
|
-
batch_dict2timedelta(cast(_BatchDict, self._batch_dict)).total_seconds()
|
|
143
|
+
self._project = project
|
|
144
|
+
self._endpoint_id = endpoint_id
|
|
145
|
+
self._timedelta = window_length
|
|
146
|
+
self._schedules_file = ModelMonitoringSchedulesFile(
|
|
147
|
+
project=project, endpoint_id=endpoint_id
|
|
148
|
+
)
|
|
149
|
+
|
|
150
|
+
def __enter__(self) -> "_BatchWindowGenerator":
|
|
151
|
+
self._schedules_file.__enter__()
|
|
152
|
+
return super().__enter__()
|
|
153
|
+
|
|
154
|
+
def __exit__(
|
|
155
|
+
self,
|
|
156
|
+
exc_type: Optional[type[BaseException]],
|
|
157
|
+
exc_value: Optional[BaseException],
|
|
158
|
+
traceback: Optional[TracebackType],
|
|
159
|
+
) -> Optional[bool]:
|
|
160
|
+
self._schedules_file.__exit__(
|
|
161
|
+
exc_type=exc_type, exc_value=exc_value, traceback=traceback
|
|
195
162
|
)
|
|
196
163
|
|
|
197
164
|
@classmethod
|
|
198
165
|
def _get_last_updated_time(
|
|
199
|
-
cls, last_request:
|
|
200
|
-
) ->
|
|
166
|
+
cls, last_request: datetime.datetime, not_batch_endpoint: bool
|
|
167
|
+
) -> int:
|
|
201
168
|
"""
|
|
202
169
|
Get the last updated time of a model endpoint.
|
|
203
170
|
"""
|
|
204
|
-
if not last_request:
|
|
205
|
-
return None
|
|
206
171
|
last_updated = int(
|
|
207
|
-
|
|
172
|
+
last_request.timestamp()
|
|
208
173
|
- cast(
|
|
209
174
|
float,
|
|
210
175
|
mlrun.mlconf.model_endpoint_monitoring.parquet_batching_timeout_secs,
|
|
211
176
|
)
|
|
212
177
|
)
|
|
213
|
-
if not
|
|
178
|
+
if not not_batch_endpoint:
|
|
214
179
|
# If the endpoint does not have a stream, `last_updated` should be
|
|
215
180
|
# the minimum between the current time and the last updated time.
|
|
216
181
|
# This compensates for the bumping mechanism - see
|
|
@@ -221,45 +186,38 @@ class _BatchWindowGenerator:
|
|
|
221
186
|
)
|
|
222
187
|
return last_updated
|
|
223
188
|
|
|
224
|
-
|
|
225
|
-
def _normalize_first_request(
|
|
226
|
-
cls, first_request: Optional[str], endpoint: str
|
|
227
|
-
) -> Optional[int]:
|
|
228
|
-
if not first_request:
|
|
229
|
-
logger.debug(
|
|
230
|
-
"There is no first request time for this endpoint.",
|
|
231
|
-
endpoint=endpoint,
|
|
232
|
-
first_request=first_request,
|
|
233
|
-
)
|
|
234
|
-
return None
|
|
235
|
-
return cls._date_string2timestamp(first_request)
|
|
236
|
-
|
|
237
|
-
@staticmethod
|
|
238
|
-
def _date_string2timestamp(date_string: str) -> int:
|
|
239
|
-
return int(datetime.datetime.fromisoformat(date_string).timestamp())
|
|
240
|
-
|
|
241
|
-
def get_batch_window(
|
|
189
|
+
def get_intervals(
|
|
242
190
|
self,
|
|
243
|
-
|
|
244
|
-
endpoint: str,
|
|
191
|
+
*,
|
|
245
192
|
application: str,
|
|
246
|
-
first_request:
|
|
247
|
-
last_request:
|
|
248
|
-
|
|
249
|
-
) ->
|
|
193
|
+
first_request: datetime.datetime,
|
|
194
|
+
last_request: datetime.datetime,
|
|
195
|
+
not_batch_endpoint: bool,
|
|
196
|
+
) -> Iterator[_Interval]:
|
|
250
197
|
"""
|
|
251
198
|
Get the batch window for a specific endpoint and application.
|
|
252
|
-
first_request
|
|
199
|
+
`first_request` and `last_request` are the timestamps of the first request and last
|
|
200
|
+
request to the endpoint, respectively. They are guaranteed to be nonempty at this point.
|
|
253
201
|
"""
|
|
254
|
-
|
|
255
|
-
|
|
256
|
-
project=project,
|
|
257
|
-
endpoint=endpoint,
|
|
202
|
+
batch_window = _BatchWindow(
|
|
203
|
+
schedules_file=self._schedules_file,
|
|
258
204
|
application=application,
|
|
259
205
|
timedelta_seconds=self._timedelta,
|
|
260
|
-
last_updated=self._get_last_updated_time(last_request,
|
|
261
|
-
first_request=
|
|
206
|
+
last_updated=self._get_last_updated_time(last_request, not_batch_endpoint),
|
|
207
|
+
first_request=int(first_request.timestamp()),
|
|
262
208
|
)
|
|
209
|
+
yield from batch_window.get_intervals()
|
|
210
|
+
|
|
211
|
+
|
|
212
|
+
def _get_window_length() -> int:
|
|
213
|
+
"""Get the timedelta in seconds from the batch dictionary"""
|
|
214
|
+
return int(
|
|
215
|
+
batch_dict2timedelta(
|
|
216
|
+
json.loads(
|
|
217
|
+
cast(str, os.getenv(mm_constants.EventFieldType.BATCH_INTERVALS_DICT))
|
|
218
|
+
)
|
|
219
|
+
).total_seconds()
|
|
220
|
+
)
|
|
263
221
|
|
|
264
222
|
|
|
265
223
|
class MonitoringApplicationController:
|
|
@@ -276,19 +234,11 @@ class MonitoringApplicationController:
|
|
|
276
234
|
|
|
277
235
|
logger.debug(f"Initializing {self.__class__.__name__}", project=self.project)
|
|
278
236
|
|
|
279
|
-
self.
|
|
280
|
-
|
|
281
|
-
self._batch_window_generator = _BatchWindowGenerator(
|
|
282
|
-
batch_dict=json.loads(
|
|
283
|
-
mlrun.get_secret_or_env(
|
|
284
|
-
mm_constants.EventFieldType.BATCH_INTERVALS_DICT
|
|
285
|
-
)
|
|
286
|
-
)
|
|
287
|
-
)
|
|
237
|
+
self._window_length = _get_window_length()
|
|
288
238
|
|
|
289
239
|
self.model_monitoring_access_key = self._get_model_monitoring_access_key()
|
|
290
240
|
self.storage_options = None
|
|
291
|
-
if mlconf.artifact_path.startswith("s3://"):
|
|
241
|
+
if mlrun.mlconf.artifact_path.startswith("s3://"):
|
|
292
242
|
self.storage_options = mlrun.mlconf.get_s3_storage_options()
|
|
293
243
|
|
|
294
244
|
@staticmethod
|
|
@@ -299,6 +249,19 @@ class MonitoringApplicationController:
|
|
|
299
249
|
access_key = mlrun.mlconf.get_v3io_access_key()
|
|
300
250
|
return access_key
|
|
301
251
|
|
|
252
|
+
@staticmethod
|
|
253
|
+
def _should_monitor_endpoint(endpoint: mlrun.common.schemas.ModelEndpoint) -> bool:
|
|
254
|
+
return (
|
|
255
|
+
# Is the model endpoint monitored?
|
|
256
|
+
endpoint.status.monitoring_mode == mm_constants.ModelMonitoringMode.enabled
|
|
257
|
+
# Was the model endpoint called? I.e., are the first and last requests nonempty?
|
|
258
|
+
and endpoint.status.first_request
|
|
259
|
+
and endpoint.status.last_request
|
|
260
|
+
# Is the model endpoint not a router endpoint? Router endpoint has no feature stats
|
|
261
|
+
and endpoint.metadata.endpoint_type.value
|
|
262
|
+
!= mm_constants.EndpointType.ROUTER.value
|
|
263
|
+
)
|
|
264
|
+
|
|
302
265
|
def run(self) -> None:
|
|
303
266
|
"""
|
|
304
267
|
Main method for run all the relevant monitoring applications on each endpoint.
|
|
@@ -312,7 +275,10 @@ class MonitoringApplicationController:
|
|
|
312
275
|
logger.info("Start running monitoring controller")
|
|
313
276
|
try:
|
|
314
277
|
applications_names = []
|
|
315
|
-
|
|
278
|
+
endpoints_list = mlrun.db.get_run_db().list_model_endpoints(
|
|
279
|
+
project=self.project, tsdb_metrics=True
|
|
280
|
+
)
|
|
281
|
+
endpoints = endpoints_list.endpoints
|
|
316
282
|
if not endpoints:
|
|
317
283
|
logger.info("No model endpoints found", project=self.project)
|
|
318
284
|
return
|
|
@@ -349,43 +315,36 @@ class MonitoringApplicationController:
|
|
|
349
315
|
exc=err_to_str(e),
|
|
350
316
|
)
|
|
351
317
|
return
|
|
352
|
-
# Initialize a
|
|
318
|
+
# Initialize a thread pool that will be used to monitor each endpoint on a dedicated thread
|
|
353
319
|
with concurrent.futures.ThreadPoolExecutor(
|
|
354
|
-
max_workers=min(len(endpoints), 10)
|
|
320
|
+
max_workers=min(len(endpoints), 10)
|
|
355
321
|
) as pool:
|
|
356
322
|
for endpoint in endpoints:
|
|
357
|
-
if (
|
|
358
|
-
endpoint[mm_constants.EventFieldType.ACTIVE]
|
|
359
|
-
and endpoint[mm_constants.EventFieldType.MONITORING_MODE]
|
|
360
|
-
== mm_constants.ModelMonitoringMode.enabled.value
|
|
361
|
-
):
|
|
362
|
-
# Skip router endpoint:
|
|
363
|
-
if (
|
|
364
|
-
int(endpoint[mm_constants.EventFieldType.ENDPOINT_TYPE])
|
|
365
|
-
== mm_constants.EndpointType.ROUTER
|
|
366
|
-
):
|
|
367
|
-
# Router endpoint has no feature stats
|
|
368
|
-
logger.info(
|
|
369
|
-
f"{endpoint[mm_constants.EventFieldType.UID]} is router, skipping"
|
|
370
|
-
)
|
|
371
|
-
continue
|
|
323
|
+
if self._should_monitor_endpoint(endpoint):
|
|
372
324
|
pool.submit(
|
|
373
325
|
MonitoringApplicationController.model_endpoint_process,
|
|
326
|
+
project=self.project,
|
|
374
327
|
endpoint=endpoint,
|
|
375
328
|
applications_names=applications_names,
|
|
376
|
-
|
|
377
|
-
project=self.project,
|
|
329
|
+
window_length=self._window_length,
|
|
378
330
|
model_monitoring_access_key=self.model_monitoring_access_key,
|
|
379
331
|
storage_options=self.storage_options,
|
|
380
332
|
)
|
|
333
|
+
else:
|
|
334
|
+
logger.debug(
|
|
335
|
+
"Skipping endpoint, not ready or not suitable for monitoring",
|
|
336
|
+
endpoint_id=endpoint.metadata.uid,
|
|
337
|
+
endpoint_name=endpoint.metadata.name,
|
|
338
|
+
)
|
|
339
|
+
logger.info("Finished running monitoring controller")
|
|
381
340
|
|
|
382
341
|
@classmethod
|
|
383
342
|
def model_endpoint_process(
|
|
384
343
|
cls,
|
|
385
|
-
endpoint: dict,
|
|
386
|
-
applications_names: list[str],
|
|
387
|
-
batch_window_generator: _BatchWindowGenerator,
|
|
388
344
|
project: str,
|
|
345
|
+
endpoint: mlrun.common.schemas.ModelEndpoint,
|
|
346
|
+
applications_names: list[str],
|
|
347
|
+
window_length: int,
|
|
389
348
|
model_monitoring_access_key: str,
|
|
390
349
|
storage_options: Optional[dict] = None,
|
|
391
350
|
) -> None:
|
|
@@ -401,56 +360,60 @@ class MonitoringApplicationController:
|
|
|
401
360
|
:param model_monitoring_access_key: (str) Access key to apply the model monitoring process.
|
|
402
361
|
:param storage_options: (dict) Storage options for reading the infer parquet files.
|
|
403
362
|
"""
|
|
404
|
-
endpoint_id = endpoint
|
|
405
|
-
|
|
406
|
-
|
|
407
|
-
endpoint[mm_constants.EventFieldType.FEATURE_SET_URI]
|
|
363
|
+
endpoint_id = endpoint.metadata.uid
|
|
364
|
+
not_batch_endpoint = not (
|
|
365
|
+
endpoint.metadata.endpoint_type == EndpointType.BATCH_EP
|
|
408
366
|
)
|
|
367
|
+
m_fs = fstore.get_feature_set(endpoint.spec.monitoring_feature_set_uri)
|
|
409
368
|
try:
|
|
410
|
-
|
|
411
|
-
|
|
412
|
-
|
|
413
|
-
|
|
414
|
-
|
|
415
|
-
|
|
416
|
-
|
|
417
|
-
|
|
418
|
-
|
|
419
|
-
|
|
420
|
-
|
|
421
|
-
|
|
422
|
-
|
|
423
|
-
|
|
424
|
-
|
|
425
|
-
|
|
426
|
-
|
|
427
|
-
|
|
428
|
-
logger.info(
|
|
429
|
-
"No data found for the given interval",
|
|
430
|
-
start=start_infer_time,
|
|
431
|
-
end=end_infer_time,
|
|
432
|
-
endpoint_id=endpoint_id,
|
|
433
|
-
)
|
|
434
|
-
else:
|
|
435
|
-
logger.info(
|
|
436
|
-
"Data found for the given interval",
|
|
437
|
-
start=start_infer_time,
|
|
438
|
-
end=end_infer_time,
|
|
439
|
-
endpoint_id=endpoint_id,
|
|
440
|
-
)
|
|
441
|
-
cls._push_to_applications(
|
|
442
|
-
start_infer_time=start_infer_time,
|
|
443
|
-
end_infer_time=end_infer_time,
|
|
444
|
-
endpoint_id=endpoint_id,
|
|
445
|
-
project=project,
|
|
446
|
-
applications_names=[application],
|
|
447
|
-
model_monitoring_access_key=model_monitoring_access_key,
|
|
369
|
+
with _BatchWindowGenerator(
|
|
370
|
+
project=project, endpoint_id=endpoint_id, window_length=window_length
|
|
371
|
+
) as batch_window_generator:
|
|
372
|
+
for application in applications_names:
|
|
373
|
+
for (
|
|
374
|
+
start_infer_time,
|
|
375
|
+
end_infer_time,
|
|
376
|
+
) in batch_window_generator.get_intervals(
|
|
377
|
+
application=application,
|
|
378
|
+
first_request=endpoint.status.first_request,
|
|
379
|
+
last_request=endpoint.status.last_request,
|
|
380
|
+
not_batch_endpoint=not_batch_endpoint,
|
|
381
|
+
):
|
|
382
|
+
df = m_fs.to_dataframe(
|
|
383
|
+
start_time=start_infer_time,
|
|
384
|
+
end_time=end_infer_time,
|
|
385
|
+
time_column=mm_constants.EventFieldType.TIMESTAMP,
|
|
386
|
+
storage_options=storage_options,
|
|
448
387
|
)
|
|
388
|
+
if len(df) == 0:
|
|
389
|
+
logger.info(
|
|
390
|
+
"No data found for the given interval",
|
|
391
|
+
start=start_infer_time,
|
|
392
|
+
end=end_infer_time,
|
|
393
|
+
endpoint_id=endpoint_id,
|
|
394
|
+
)
|
|
395
|
+
else:
|
|
396
|
+
logger.info(
|
|
397
|
+
"Data found for the given interval",
|
|
398
|
+
start=start_infer_time,
|
|
399
|
+
end=end_infer_time,
|
|
400
|
+
endpoint_id=endpoint_id,
|
|
401
|
+
)
|
|
402
|
+
cls._push_to_applications(
|
|
403
|
+
start_infer_time=start_infer_time,
|
|
404
|
+
end_infer_time=end_infer_time,
|
|
405
|
+
endpoint_id=endpoint_id,
|
|
406
|
+
endpoint_name=endpoint.metadata.name,
|
|
407
|
+
project=project,
|
|
408
|
+
applications_names=[application],
|
|
409
|
+
model_monitoring_access_key=model_monitoring_access_key,
|
|
410
|
+
)
|
|
411
|
+
logger.info("Finished processing endpoint", endpoint_id=endpoint_id)
|
|
449
412
|
|
|
450
413
|
except Exception:
|
|
451
414
|
logger.exception(
|
|
452
415
|
"Encountered an exception",
|
|
453
|
-
endpoint_id=endpoint
|
|
416
|
+
endpoint_id=endpoint.metadata.uid,
|
|
454
417
|
)
|
|
455
418
|
|
|
456
419
|
@staticmethod
|
|
@@ -458,6 +421,7 @@ class MonitoringApplicationController:
|
|
|
458
421
|
start_infer_time: datetime.datetime,
|
|
459
422
|
end_infer_time: datetime.datetime,
|
|
460
423
|
endpoint_id: str,
|
|
424
|
+
endpoint_name: str,
|
|
461
425
|
project: str,
|
|
462
426
|
applications_names: list[str],
|
|
463
427
|
model_monitoring_access_key: str,
|
|
@@ -481,6 +445,7 @@ class MonitoringApplicationController:
|
|
|
481
445
|
sep=" ", timespec="microseconds"
|
|
482
446
|
),
|
|
483
447
|
mm_constants.ApplicationEvent.ENDPOINT_ID: endpoint_id,
|
|
448
|
+
mm_constants.ApplicationEvent.ENDPOINT_NAME: endpoint_name,
|
|
484
449
|
mm_constants.ApplicationEvent.OUTPUT_STREAM_URI: get_stream_path(
|
|
485
450
|
project=project,
|
|
486
451
|
function_name=mm_constants.MonitoringFunctionNames.WRITER,
|
|
@@ -491,14 +456,17 @@ class MonitoringApplicationController:
|
|
|
491
456
|
stream_uri = get_stream_path(project=project, function_name=app_name)
|
|
492
457
|
|
|
493
458
|
logger.info(
|
|
494
|
-
|
|
459
|
+
"Pushing data to application stream",
|
|
460
|
+
endpoint_id=endpoint_id,
|
|
461
|
+
app_name=app_name,
|
|
462
|
+
stream_uri=stream_uri,
|
|
495
463
|
)
|
|
496
464
|
get_stream_pusher(stream_uri, access_key=model_monitoring_access_key).push(
|
|
497
465
|
[data]
|
|
498
466
|
)
|
|
499
467
|
|
|
500
468
|
|
|
501
|
-
def handler(context:
|
|
469
|
+
def handler(context: nuclio_sdk.Context, event: nuclio_sdk.Event) -> None:
|
|
502
470
|
"""
|
|
503
471
|
Run model monitoring application processor
|
|
504
472
|
|
|
@@ -12,7 +12,5 @@
|
|
|
12
12
|
# See the License for the specific language governing permissions and
|
|
13
13
|
# limitations under the License.
|
|
14
14
|
|
|
15
|
-
from .stores import ObjectStoreFactory, get_store_object
|
|
16
|
-
from .stores.base import StoreBase
|
|
17
15
|
from .tsdb import get_tsdb_connector
|
|
18
16
|
from .tsdb.base import TSDBConnector
|