mlrun 1.7.2rc3__py3-none-any.whl → 1.8.0rc1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of mlrun might be problematic. Click here for more details.
- mlrun/__init__.py +14 -12
- mlrun/__main__.py +3 -3
- mlrun/alerts/alert.py +19 -12
- mlrun/artifacts/__init__.py +0 -2
- mlrun/artifacts/base.py +34 -11
- mlrun/artifacts/dataset.py +16 -16
- mlrun/artifacts/manager.py +13 -13
- mlrun/artifacts/model.py +66 -53
- mlrun/common/constants.py +6 -0
- mlrun/common/formatters/__init__.py +1 -0
- mlrun/common/formatters/feature_set.py +1 -0
- mlrun/common/formatters/function.py +1 -0
- mlrun/common/formatters/model_endpoint.py +30 -0
- mlrun/common/formatters/pipeline.py +1 -2
- mlrun/common/model_monitoring/__init__.py +0 -3
- mlrun/common/model_monitoring/helpers.py +1 -1
- mlrun/common/runtimes/constants.py +1 -2
- mlrun/common/schemas/__init__.py +4 -2
- mlrun/common/schemas/artifact.py +0 -6
- mlrun/common/schemas/common.py +50 -0
- mlrun/common/schemas/model_monitoring/__init__.py +8 -1
- mlrun/common/schemas/model_monitoring/constants.py +62 -12
- mlrun/common/schemas/model_monitoring/model_endpoint_v2.py +149 -0
- mlrun/common/schemas/model_monitoring/model_endpoints.py +21 -5
- mlrun/common/schemas/partition.py +122 -0
- mlrun/config.py +43 -15
- mlrun/data_types/__init__.py +0 -2
- mlrun/data_types/data_types.py +0 -1
- mlrun/data_types/infer.py +3 -1
- mlrun/data_types/spark.py +4 -4
- mlrun/data_types/to_pandas.py +2 -11
- mlrun/datastore/__init__.py +0 -2
- mlrun/datastore/alibaba_oss.py +4 -1
- mlrun/datastore/azure_blob.py +4 -1
- mlrun/datastore/base.py +12 -4
- mlrun/datastore/datastore.py +9 -3
- mlrun/datastore/datastore_profile.py +1 -1
- mlrun/datastore/dbfs_store.py +4 -1
- mlrun/datastore/filestore.py +4 -1
- mlrun/datastore/google_cloud_storage.py +4 -1
- mlrun/datastore/hdfs.py +4 -1
- mlrun/datastore/inmem.py +4 -1
- mlrun/datastore/redis.py +4 -1
- mlrun/datastore/s3.py +4 -1
- mlrun/datastore/sources.py +51 -49
- mlrun/datastore/store_resources.py +0 -2
- mlrun/datastore/targets.py +22 -23
- mlrun/datastore/utils.py +2 -2
- mlrun/datastore/v3io.py +4 -1
- mlrun/datastore/wasbfs/fs.py +13 -12
- mlrun/db/base.py +126 -62
- mlrun/db/factory.py +3 -0
- mlrun/db/httpdb.py +767 -231
- mlrun/db/nopdb.py +126 -57
- mlrun/errors.py +2 -2
- mlrun/execution.py +55 -29
- mlrun/feature_store/__init__.py +0 -2
- mlrun/feature_store/api.py +40 -40
- mlrun/feature_store/common.py +9 -9
- mlrun/feature_store/feature_set.py +20 -18
- mlrun/feature_store/feature_vector.py +27 -24
- mlrun/feature_store/retrieval/base.py +14 -9
- mlrun/feature_store/retrieval/job.py +2 -1
- mlrun/feature_store/steps.py +2 -2
- mlrun/features.py +30 -13
- mlrun/frameworks/__init__.py +1 -2
- mlrun/frameworks/_common/__init__.py +1 -2
- mlrun/frameworks/_common/artifacts_library.py +2 -2
- mlrun/frameworks/_common/mlrun_interface.py +10 -6
- mlrun/frameworks/_common/model_handler.py +29 -27
- mlrun/frameworks/_common/producer.py +3 -1
- mlrun/frameworks/_dl_common/__init__.py +1 -2
- mlrun/frameworks/_dl_common/loggers/__init__.py +1 -2
- mlrun/frameworks/_dl_common/loggers/mlrun_logger.py +4 -4
- mlrun/frameworks/_dl_common/loggers/tensorboard_logger.py +3 -3
- mlrun/frameworks/_ml_common/__init__.py +1 -2
- mlrun/frameworks/_ml_common/loggers/__init__.py +1 -2
- mlrun/frameworks/_ml_common/model_handler.py +21 -21
- mlrun/frameworks/_ml_common/plans/__init__.py +1 -2
- mlrun/frameworks/_ml_common/plans/confusion_matrix_plan.py +3 -1
- mlrun/frameworks/_ml_common/plans/dataset_plan.py +3 -3
- mlrun/frameworks/_ml_common/plans/roc_curve_plan.py +4 -4
- mlrun/frameworks/auto_mlrun/__init__.py +1 -2
- mlrun/frameworks/auto_mlrun/auto_mlrun.py +22 -15
- mlrun/frameworks/huggingface/__init__.py +1 -2
- mlrun/frameworks/huggingface/model_server.py +9 -9
- mlrun/frameworks/lgbm/__init__.py +47 -44
- mlrun/frameworks/lgbm/callbacks/__init__.py +1 -2
- mlrun/frameworks/lgbm/callbacks/logging_callback.py +4 -2
- mlrun/frameworks/lgbm/callbacks/mlrun_logging_callback.py +4 -2
- mlrun/frameworks/lgbm/mlrun_interfaces/__init__.py +1 -2
- mlrun/frameworks/lgbm/mlrun_interfaces/mlrun_interface.py +5 -5
- mlrun/frameworks/lgbm/model_handler.py +15 -11
- mlrun/frameworks/lgbm/model_server.py +11 -7
- mlrun/frameworks/lgbm/utils.py +2 -2
- mlrun/frameworks/onnx/__init__.py +1 -2
- mlrun/frameworks/onnx/dataset.py +3 -3
- mlrun/frameworks/onnx/mlrun_interface.py +2 -2
- mlrun/frameworks/onnx/model_handler.py +7 -5
- mlrun/frameworks/onnx/model_server.py +8 -6
- mlrun/frameworks/parallel_coordinates.py +11 -11
- mlrun/frameworks/pytorch/__init__.py +22 -23
- mlrun/frameworks/pytorch/callbacks/__init__.py +1 -2
- mlrun/frameworks/pytorch/callbacks/callback.py +2 -1
- mlrun/frameworks/pytorch/callbacks/logging_callback.py +15 -8
- mlrun/frameworks/pytorch/callbacks/mlrun_logging_callback.py +19 -12
- mlrun/frameworks/pytorch/callbacks/tensorboard_logging_callback.py +22 -15
- mlrun/frameworks/pytorch/callbacks_handler.py +36 -30
- mlrun/frameworks/pytorch/mlrun_interface.py +17 -17
- mlrun/frameworks/pytorch/model_handler.py +21 -17
- mlrun/frameworks/pytorch/model_server.py +13 -9
- mlrun/frameworks/sklearn/__init__.py +19 -18
- mlrun/frameworks/sklearn/estimator.py +2 -2
- mlrun/frameworks/sklearn/metric.py +3 -3
- mlrun/frameworks/sklearn/metrics_library.py +8 -6
- mlrun/frameworks/sklearn/mlrun_interface.py +3 -2
- mlrun/frameworks/sklearn/model_handler.py +4 -3
- mlrun/frameworks/tf_keras/__init__.py +11 -12
- mlrun/frameworks/tf_keras/callbacks/__init__.py +1 -2
- mlrun/frameworks/tf_keras/callbacks/logging_callback.py +17 -14
- mlrun/frameworks/tf_keras/callbacks/mlrun_logging_callback.py +15 -12
- mlrun/frameworks/tf_keras/callbacks/tensorboard_logging_callback.py +21 -18
- mlrun/frameworks/tf_keras/model_handler.py +17 -13
- mlrun/frameworks/tf_keras/model_server.py +12 -8
- mlrun/frameworks/xgboost/__init__.py +19 -18
- mlrun/frameworks/xgboost/model_handler.py +13 -9
- mlrun/launcher/base.py +3 -4
- mlrun/launcher/local.py +1 -1
- mlrun/launcher/remote.py +1 -1
- mlrun/lists.py +4 -3
- mlrun/model.py +108 -44
- mlrun/model_monitoring/__init__.py +1 -2
- mlrun/model_monitoring/api.py +6 -6
- mlrun/model_monitoring/applications/_application_steps.py +13 -15
- mlrun/model_monitoring/applications/histogram_data_drift.py +41 -15
- mlrun/model_monitoring/applications/results.py +55 -3
- mlrun/model_monitoring/controller.py +185 -223
- mlrun/model_monitoring/db/_schedules.py +156 -0
- mlrun/model_monitoring/db/_stats.py +189 -0
- mlrun/model_monitoring/db/stores/__init__.py +1 -1
- mlrun/model_monitoring/db/stores/base/store.py +6 -65
- mlrun/model_monitoring/db/stores/sqldb/models/__init__.py +0 -25
- mlrun/model_monitoring/db/stores/sqldb/models/base.py +0 -97
- mlrun/model_monitoring/db/stores/sqldb/models/mysql.py +2 -58
- mlrun/model_monitoring/db/stores/sqldb/models/sqlite.py +0 -15
- mlrun/model_monitoring/db/stores/sqldb/sql_store.py +6 -257
- mlrun/model_monitoring/db/stores/v3io_kv/kv_store.py +9 -271
- mlrun/model_monitoring/db/tsdb/base.py +74 -22
- mlrun/model_monitoring/db/tsdb/tdengine/schemas.py +66 -35
- mlrun/model_monitoring/db/tsdb/tdengine/stream_graph_steps.py +33 -0
- mlrun/model_monitoring/db/tsdb/tdengine/tdengine_connector.py +284 -51
- mlrun/model_monitoring/db/tsdb/v3io/stream_graph_steps.py +1 -0
- mlrun/model_monitoring/db/tsdb/v3io/v3io_connector.py +35 -17
- mlrun/model_monitoring/helpers.py +97 -1
- mlrun/model_monitoring/model_endpoint.py +4 -2
- mlrun/model_monitoring/stream_processing.py +2 -2
- mlrun/model_monitoring/tracking_policy.py +10 -3
- mlrun/model_monitoring/writer.py +47 -26
- mlrun/package/__init__.py +3 -6
- mlrun/package/context_handler.py +1 -1
- mlrun/package/packager.py +12 -9
- mlrun/package/packagers/__init__.py +0 -2
- mlrun/package/packagers/default_packager.py +14 -11
- mlrun/package/packagers/numpy_packagers.py +16 -7
- mlrun/package/packagers/pandas_packagers.py +18 -18
- mlrun/package/packagers/python_standard_library_packagers.py +25 -11
- mlrun/package/packagers_manager.py +31 -14
- mlrun/package/utils/__init__.py +0 -3
- mlrun/package/utils/_pickler.py +6 -6
- mlrun/platforms/__init__.py +3 -3
- mlrun/platforms/iguazio.py +4 -1
- mlrun/projects/__init__.py +1 -6
- mlrun/projects/operations.py +27 -27
- mlrun/projects/pipelines.py +85 -215
- mlrun/projects/project.py +444 -158
- mlrun/run.py +9 -9
- mlrun/runtimes/__init__.py +1 -3
- mlrun/runtimes/base.py +13 -10
- mlrun/runtimes/daskjob.py +9 -9
- mlrun/runtimes/generators.py +2 -1
- mlrun/runtimes/kubejob.py +4 -5
- mlrun/runtimes/mpijob/__init__.py +0 -2
- mlrun/runtimes/mpijob/abstract.py +7 -6
- mlrun/runtimes/nuclio/api_gateway.py +7 -7
- mlrun/runtimes/nuclio/application/application.py +11 -11
- mlrun/runtimes/nuclio/function.py +14 -13
- mlrun/runtimes/nuclio/serving.py +9 -9
- mlrun/runtimes/pod.py +74 -29
- mlrun/runtimes/remotesparkjob.py +3 -2
- mlrun/runtimes/sparkjob/__init__.py +0 -2
- mlrun/runtimes/sparkjob/spark3job.py +21 -11
- mlrun/runtimes/utils.py +6 -5
- mlrun/serving/merger.py +6 -4
- mlrun/serving/remote.py +18 -17
- mlrun/serving/routers.py +27 -27
- mlrun/serving/server.py +1 -1
- mlrun/serving/states.py +76 -71
- mlrun/serving/utils.py +13 -2
- mlrun/serving/v1_serving.py +3 -2
- mlrun/serving/v2_serving.py +4 -4
- mlrun/track/__init__.py +1 -1
- mlrun/track/tracker.py +2 -2
- mlrun/track/trackers/mlflow_tracker.py +6 -5
- mlrun/utils/async_http.py +1 -1
- mlrun/utils/helpers.py +72 -28
- mlrun/utils/logger.py +104 -2
- mlrun/utils/notifications/notification/base.py +23 -4
- mlrun/utils/notifications/notification/console.py +1 -1
- mlrun/utils/notifications/notification/git.py +6 -6
- mlrun/utils/notifications/notification/ipython.py +5 -4
- mlrun/utils/notifications/notification/slack.py +1 -1
- mlrun/utils/notifications/notification/webhook.py +13 -17
- mlrun/utils/notifications/notification_pusher.py +23 -19
- mlrun/utils/regex.py +1 -1
- mlrun/utils/version/version.json +2 -2
- {mlrun-1.7.2rc3.dist-info → mlrun-1.8.0rc1.dist-info}/METADATA +186 -186
- mlrun-1.8.0rc1.dist-info/RECORD +356 -0
- {mlrun-1.7.2rc3.dist-info → mlrun-1.8.0rc1.dist-info}/WHEEL +1 -1
- mlrun-1.7.2rc3.dist-info/RECORD +0 -351
- {mlrun-1.7.2rc3.dist-info → mlrun-1.8.0rc1.dist-info}/LICENSE +0 -0
- {mlrun-1.7.2rc3.dist-info → mlrun-1.8.0rc1.dist-info}/entry_points.txt +0 -0
- {mlrun-1.7.2rc3.dist-info → mlrun-1.8.0rc1.dist-info}/top_level.txt +0 -0
|
@@ -11,31 +11,30 @@
|
|
|
11
11
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
12
12
|
# See the License for the specific language governing permissions and
|
|
13
13
|
# limitations under the License.
|
|
14
|
+
|
|
14
15
|
import concurrent.futures
|
|
15
16
|
import datetime
|
|
16
17
|
import json
|
|
17
18
|
import os
|
|
18
|
-
import re
|
|
19
19
|
from collections.abc import Iterator
|
|
20
|
-
from
|
|
20
|
+
from contextlib import AbstractContextManager
|
|
21
|
+
from types import TracebackType
|
|
22
|
+
from typing import Any, NamedTuple, Optional, cast
|
|
21
23
|
|
|
22
|
-
import
|
|
24
|
+
import nuclio_sdk
|
|
23
25
|
|
|
24
26
|
import mlrun
|
|
25
27
|
import mlrun.common.schemas.model_monitoring.constants as mm_constants
|
|
26
|
-
import mlrun.data_types.infer
|
|
27
28
|
import mlrun.feature_store as fstore
|
|
28
|
-
import mlrun.model_monitoring
|
|
29
|
-
from mlrun.config import config as mlconf
|
|
29
|
+
import mlrun.model_monitoring
|
|
30
30
|
from mlrun.datastore import get_stream_pusher
|
|
31
31
|
from mlrun.errors import err_to_str
|
|
32
|
-
from mlrun.model_monitoring.
|
|
33
|
-
|
|
34
|
-
batch_dict2timedelta,
|
|
35
|
-
get_stream_path,
|
|
36
|
-
)
|
|
32
|
+
from mlrun.model_monitoring.db._schedules import ModelMonitoringSchedulesFile
|
|
33
|
+
from mlrun.model_monitoring.helpers import batch_dict2timedelta, get_stream_path
|
|
37
34
|
from mlrun.utils import datetime_now, logger
|
|
38
35
|
|
|
36
|
+
_SECONDS_IN_DAY = int(datetime.timedelta(days=1).total_seconds())
|
|
37
|
+
|
|
39
38
|
|
|
40
39
|
class _Interval(NamedTuple):
|
|
41
40
|
start: datetime.datetime
|
|
@@ -45,12 +44,12 @@ class _Interval(NamedTuple):
|
|
|
45
44
|
class _BatchWindow:
|
|
46
45
|
def __init__(
|
|
47
46
|
self,
|
|
48
|
-
|
|
49
|
-
|
|
47
|
+
*,
|
|
48
|
+
schedules_file: ModelMonitoringSchedulesFile,
|
|
50
49
|
application: str,
|
|
51
50
|
timedelta_seconds: int,
|
|
52
|
-
last_updated:
|
|
53
|
-
first_request:
|
|
51
|
+
last_updated: int,
|
|
52
|
+
first_request: int,
|
|
54
53
|
) -> None:
|
|
55
54
|
"""
|
|
56
55
|
Initialize a batch window object that handles the batch interval time range
|
|
@@ -58,151 +57,114 @@ class _BatchWindow:
|
|
|
58
57
|
All the time values are in seconds.
|
|
59
58
|
The start and stop time are in seconds since the epoch.
|
|
60
59
|
"""
|
|
61
|
-
self.project = project
|
|
62
|
-
self._endpoint = endpoint
|
|
63
60
|
self._application = application
|
|
64
61
|
self._first_request = first_request
|
|
65
62
|
self._stop = last_updated
|
|
66
63
|
self._step = timedelta_seconds
|
|
67
|
-
self._db =
|
|
64
|
+
self._db = schedules_file
|
|
68
65
|
self._start = self._get_last_analyzed()
|
|
69
66
|
|
|
70
|
-
def
|
|
71
|
-
|
|
72
|
-
last_analyzed = self._db.get_last_analyzed(
|
|
73
|
-
endpoint_id=self._endpoint,
|
|
74
|
-
application_name=self._application,
|
|
75
|
-
)
|
|
76
|
-
except mlrun.errors.MLRunNotFoundError:
|
|
77
|
-
logger.info(
|
|
78
|
-
"No last analyzed time was found for this endpoint and "
|
|
79
|
-
"application, as this is probably the first time this "
|
|
80
|
-
"application is running. Using the latest between first "
|
|
81
|
-
"request time or last update time minus one day instead",
|
|
82
|
-
endpoint=self._endpoint,
|
|
83
|
-
application=self._application,
|
|
84
|
-
first_request=self._first_request,
|
|
85
|
-
last_updated=self._stop,
|
|
86
|
-
)
|
|
67
|
+
def _get_saved_last_analyzed(self) -> Optional[int]:
|
|
68
|
+
return self._db.get_application_time(self._application)
|
|
87
69
|
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
int(datetime.timedelta(days=1).total_seconds()), self._step
|
|
92
|
-
) # max between one day and the base period
|
|
93
|
-
return max(
|
|
94
|
-
self._first_request,
|
|
95
|
-
self._stop - first_period_in_seconds,
|
|
96
|
-
)
|
|
97
|
-
return self._first_request
|
|
98
|
-
|
|
99
|
-
logger.info(
|
|
100
|
-
"Got the last analyzed time for this endpoint and application",
|
|
101
|
-
endpoint=self._endpoint,
|
|
102
|
-
application=self._application,
|
|
103
|
-
last_analyzed=last_analyzed,
|
|
70
|
+
def _update_last_analyzed(self, last_analyzed: int) -> None:
|
|
71
|
+
self._db.update_application_time(
|
|
72
|
+
application=self._application, timestamp=last_analyzed
|
|
104
73
|
)
|
|
105
|
-
return last_analyzed
|
|
106
74
|
|
|
107
|
-
def
|
|
75
|
+
def _get_initial_last_analyzed(self) -> int:
|
|
108
76
|
logger.info(
|
|
109
|
-
"
|
|
110
|
-
|
|
77
|
+
"No last analyzed time was found for this endpoint and application, as this is "
|
|
78
|
+
"probably the first time this application is running. Initializing last analyzed "
|
|
79
|
+
"to the latest between first request time or last update time minus one day",
|
|
111
80
|
application=self._application,
|
|
112
|
-
|
|
81
|
+
first_request=self._first_request,
|
|
82
|
+
last_updated=self._stop,
|
|
113
83
|
)
|
|
114
|
-
|
|
115
|
-
self.
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
|
|
84
|
+
# max between one day and the base period
|
|
85
|
+
first_period_in_seconds = max(_SECONDS_IN_DAY, self._step)
|
|
86
|
+
return max(
|
|
87
|
+
self._first_request,
|
|
88
|
+
self._stop - first_period_in_seconds,
|
|
119
89
|
)
|
|
120
90
|
|
|
121
|
-
def
|
|
122
|
-
self
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
if self._start is not None and self._stop is not None:
|
|
126
|
-
entered = False
|
|
127
|
-
# Iterate timestamp from start until timestamp <= stop - step
|
|
128
|
-
# so that the last interval will end at (timestamp + step) <= stop.
|
|
129
|
-
# Add 1 to stop - step to get <= and not <.
|
|
130
|
-
for timestamp in range(
|
|
131
|
-
self._start, self._stop - self._step + 1, self._step
|
|
132
|
-
):
|
|
133
|
-
entered = True
|
|
134
|
-
start_time = datetime.datetime.fromtimestamp(
|
|
135
|
-
timestamp, tz=datetime.timezone.utc
|
|
136
|
-
)
|
|
137
|
-
end_time = datetime.datetime.fromtimestamp(
|
|
138
|
-
timestamp + self._step, tz=datetime.timezone.utc
|
|
139
|
-
)
|
|
140
|
-
yield _Interval(start_time, end_time)
|
|
141
|
-
self._update_last_analyzed(timestamp + self._step)
|
|
142
|
-
if not entered:
|
|
143
|
-
logger.info(
|
|
144
|
-
"All the data is set, but no complete intervals were found. "
|
|
145
|
-
"Wait for last_updated to be updated",
|
|
146
|
-
endpoint=self._endpoint,
|
|
147
|
-
application=self._application,
|
|
148
|
-
start=self._start,
|
|
149
|
-
stop=self._stop,
|
|
150
|
-
step=self._step,
|
|
151
|
-
)
|
|
91
|
+
def _get_last_analyzed(self) -> int:
|
|
92
|
+
saved_last_analyzed = self._get_saved_last_analyzed()
|
|
93
|
+
if saved_last_analyzed is not None:
|
|
94
|
+
return saved_last_analyzed
|
|
152
95
|
else:
|
|
153
|
-
|
|
154
|
-
|
|
155
|
-
|
|
156
|
-
|
|
96
|
+
last_analyzed = self._get_initial_last_analyzed()
|
|
97
|
+
# Update the in-memory DB to avoid duplicate initializations
|
|
98
|
+
self._update_last_analyzed(last_analyzed)
|
|
99
|
+
return last_analyzed
|
|
100
|
+
|
|
101
|
+
def get_intervals(self) -> Iterator[_Interval]:
|
|
102
|
+
"""Generate the batch interval time ranges."""
|
|
103
|
+
entered = False
|
|
104
|
+
# Iterate timestamp from start until timestamp <= stop - step
|
|
105
|
+
# so that the last interval will end at (timestamp + step) <= stop.
|
|
106
|
+
# Add 1 to stop - step to get <= and not <.
|
|
107
|
+
for timestamp in range(self._start, self._stop - self._step + 1, self._step):
|
|
108
|
+
entered = True
|
|
109
|
+
start_time = datetime.datetime.fromtimestamp(
|
|
110
|
+
timestamp, tz=datetime.timezone.utc
|
|
111
|
+
)
|
|
112
|
+
end_time = datetime.datetime.fromtimestamp(
|
|
113
|
+
timestamp + self._step, tz=datetime.timezone.utc
|
|
114
|
+
)
|
|
115
|
+
yield _Interval(start_time, end_time)
|
|
116
|
+
|
|
117
|
+
last_analyzed = timestamp + self._step
|
|
118
|
+
self._update_last_analyzed(last_analyzed)
|
|
119
|
+
logger.debug(
|
|
120
|
+
"Updated the last analyzed time for this endpoint and application",
|
|
121
|
+
application=self._application,
|
|
122
|
+
last_analyzed=last_analyzed,
|
|
123
|
+
)
|
|
124
|
+
|
|
125
|
+
if not entered:
|
|
126
|
+
logger.debug(
|
|
127
|
+
"All the data is set, but no complete intervals were found. "
|
|
128
|
+
"Wait for last_updated to be updated",
|
|
157
129
|
application=self._application,
|
|
158
130
|
start=self._start,
|
|
159
131
|
stop=self._stop,
|
|
132
|
+
step=self._step,
|
|
160
133
|
)
|
|
161
134
|
|
|
162
135
|
|
|
163
|
-
class _BatchWindowGenerator:
|
|
164
|
-
def __init__(self,
|
|
136
|
+
class _BatchWindowGenerator(AbstractContextManager):
|
|
137
|
+
def __init__(self, project: str, endpoint_id: str, window_length: int) -> None:
|
|
165
138
|
"""
|
|
166
139
|
Initialize a batch window generator object that generates batch window objects
|
|
167
140
|
for the monitoring functions.
|
|
168
141
|
"""
|
|
169
|
-
self.
|
|
170
|
-
self.
|
|
171
|
-
self._timedelta =
|
|
172
|
-
|
|
173
|
-
|
|
174
|
-
|
|
175
|
-
|
|
176
|
-
|
|
177
|
-
|
|
178
|
-
|
|
179
|
-
|
|
180
|
-
|
|
181
|
-
|
|
182
|
-
|
|
183
|
-
|
|
184
|
-
|
|
185
|
-
|
|
186
|
-
self.
|
|
187
|
-
|
|
188
|
-
pair_list = pair.split(":")
|
|
189
|
-
self._batch_dict[pair_list[0]] = float(pair_list[1])
|
|
190
|
-
|
|
191
|
-
def _get_timedelta(self) -> int:
|
|
192
|
-
"""Get the timedelta in seconds from the batch dictionary"""
|
|
193
|
-
return int(
|
|
194
|
-
batch_dict2timedelta(cast(_BatchDict, self._batch_dict)).total_seconds()
|
|
142
|
+
self._project = project
|
|
143
|
+
self._endpoint_id = endpoint_id
|
|
144
|
+
self._timedelta = window_length
|
|
145
|
+
self._schedules_file = ModelMonitoringSchedulesFile(
|
|
146
|
+
project=project, endpoint_id=endpoint_id
|
|
147
|
+
)
|
|
148
|
+
|
|
149
|
+
def __enter__(self) -> "_BatchWindowGenerator":
|
|
150
|
+
self._schedules_file.__enter__()
|
|
151
|
+
return super().__enter__()
|
|
152
|
+
|
|
153
|
+
def __exit__(
|
|
154
|
+
self,
|
|
155
|
+
exc_type: Optional[type[BaseException]],
|
|
156
|
+
exc_value: Optional[BaseException],
|
|
157
|
+
traceback: Optional[TracebackType],
|
|
158
|
+
) -> Optional[bool]:
|
|
159
|
+
self._schedules_file.__exit__(
|
|
160
|
+
exc_type=exc_type, exc_value=exc_value, traceback=traceback
|
|
195
161
|
)
|
|
196
162
|
|
|
197
163
|
@classmethod
|
|
198
|
-
def _get_last_updated_time(
|
|
199
|
-
cls, last_request: Optional[str], has_stream: bool
|
|
200
|
-
) -> Optional[int]:
|
|
164
|
+
def _get_last_updated_time(cls, last_request: str, has_stream: bool) -> int:
|
|
201
165
|
"""
|
|
202
166
|
Get the last updated time of a model endpoint.
|
|
203
167
|
"""
|
|
204
|
-
if not last_request:
|
|
205
|
-
return None
|
|
206
168
|
last_updated = int(
|
|
207
169
|
cls._date_string2timestamp(last_request)
|
|
208
170
|
- cast(
|
|
@@ -221,45 +183,42 @@ class _BatchWindowGenerator:
|
|
|
221
183
|
)
|
|
222
184
|
return last_updated
|
|
223
185
|
|
|
224
|
-
@classmethod
|
|
225
|
-
def _normalize_first_request(
|
|
226
|
-
cls, first_request: Optional[str], endpoint: str
|
|
227
|
-
) -> Optional[int]:
|
|
228
|
-
if not first_request:
|
|
229
|
-
logger.debug(
|
|
230
|
-
"There is no first request time for this endpoint.",
|
|
231
|
-
endpoint=endpoint,
|
|
232
|
-
first_request=first_request,
|
|
233
|
-
)
|
|
234
|
-
return None
|
|
235
|
-
return cls._date_string2timestamp(first_request)
|
|
236
|
-
|
|
237
186
|
@staticmethod
|
|
238
187
|
def _date_string2timestamp(date_string: str) -> int:
|
|
239
188
|
return int(datetime.datetime.fromisoformat(date_string).timestamp())
|
|
240
189
|
|
|
241
|
-
def
|
|
190
|
+
def get_intervals(
|
|
242
191
|
self,
|
|
243
|
-
|
|
244
|
-
endpoint: str,
|
|
192
|
+
*,
|
|
245
193
|
application: str,
|
|
246
|
-
first_request:
|
|
247
|
-
last_request:
|
|
194
|
+
first_request: str,
|
|
195
|
+
last_request: str,
|
|
248
196
|
has_stream: bool,
|
|
249
|
-
) ->
|
|
197
|
+
) -> Iterator[_Interval]:
|
|
250
198
|
"""
|
|
251
199
|
Get the batch window for a specific endpoint and application.
|
|
252
|
-
first_request
|
|
200
|
+
`first_request` and `last_request` are the timestamps of the first request and last
|
|
201
|
+
request to the endpoint, respectively. They are guaranteed to be nonempty at this point.
|
|
253
202
|
"""
|
|
254
|
-
|
|
255
|
-
|
|
256
|
-
project=project,
|
|
257
|
-
endpoint=endpoint,
|
|
203
|
+
batch_window = _BatchWindow(
|
|
204
|
+
schedules_file=self._schedules_file,
|
|
258
205
|
application=application,
|
|
259
206
|
timedelta_seconds=self._timedelta,
|
|
260
207
|
last_updated=self._get_last_updated_time(last_request, has_stream),
|
|
261
|
-
first_request=self.
|
|
208
|
+
first_request=self._date_string2timestamp(first_request),
|
|
262
209
|
)
|
|
210
|
+
yield from batch_window.get_intervals()
|
|
211
|
+
|
|
212
|
+
|
|
213
|
+
def _get_window_length() -> int:
|
|
214
|
+
"""Get the timedelta in seconds from the batch dictionary"""
|
|
215
|
+
return int(
|
|
216
|
+
batch_dict2timedelta(
|
|
217
|
+
json.loads(
|
|
218
|
+
cast(str, os.getenv(mm_constants.EventFieldType.BATCH_INTERVALS_DICT))
|
|
219
|
+
)
|
|
220
|
+
).total_seconds()
|
|
221
|
+
)
|
|
263
222
|
|
|
264
223
|
|
|
265
224
|
class MonitoringApplicationController:
|
|
@@ -278,17 +237,11 @@ class MonitoringApplicationController:
|
|
|
278
237
|
|
|
279
238
|
self.db = mlrun.model_monitoring.get_store_object(project=self.project)
|
|
280
239
|
|
|
281
|
-
self.
|
|
282
|
-
batch_dict=json.loads(
|
|
283
|
-
mlrun.get_secret_or_env(
|
|
284
|
-
mm_constants.EventFieldType.BATCH_INTERVALS_DICT
|
|
285
|
-
)
|
|
286
|
-
)
|
|
287
|
-
)
|
|
240
|
+
self._window_length = _get_window_length()
|
|
288
241
|
|
|
289
242
|
self.model_monitoring_access_key = self._get_model_monitoring_access_key()
|
|
290
243
|
self.storage_options = None
|
|
291
|
-
if mlconf.artifact_path.startswith("s3://"):
|
|
244
|
+
if mlrun.mlconf.artifact_path.startswith("s3://"):
|
|
292
245
|
self.storage_options = mlrun.mlconf.get_s3_storage_options()
|
|
293
246
|
|
|
294
247
|
@staticmethod
|
|
@@ -299,6 +252,22 @@ class MonitoringApplicationController:
|
|
|
299
252
|
access_key = mlrun.mlconf.get_v3io_access_key()
|
|
300
253
|
return access_key
|
|
301
254
|
|
|
255
|
+
@staticmethod
|
|
256
|
+
def _should_monitor_endpoint(endpoint: dict[str, Any]) -> bool:
|
|
257
|
+
return (
|
|
258
|
+
# Is the model endpoint active?
|
|
259
|
+
endpoint[mm_constants.EventFieldType.ACTIVE]
|
|
260
|
+
# Is the model endpoint monitored?
|
|
261
|
+
and endpoint[mm_constants.EventFieldType.MONITORING_MODE]
|
|
262
|
+
== mm_constants.ModelMonitoringMode.enabled
|
|
263
|
+
# Was the model endpoint called? I.e., are the first and last requests nonempty?
|
|
264
|
+
and endpoint[mm_constants.EventFieldType.FIRST_REQUEST]
|
|
265
|
+
and endpoint[mm_constants.EventFieldType.LAST_REQUEST]
|
|
266
|
+
# Is the model endpoint not a router endpoint? Router endpoint has no feature stats
|
|
267
|
+
and int(endpoint[mm_constants.EventFieldType.ENDPOINT_TYPE])
|
|
268
|
+
!= mm_constants.EndpointType.ROUTER
|
|
269
|
+
)
|
|
270
|
+
|
|
302
271
|
def run(self) -> None:
|
|
303
272
|
"""
|
|
304
273
|
Main method for run all the relevant monitoring applications on each endpoint.
|
|
@@ -349,32 +318,18 @@ class MonitoringApplicationController:
|
|
|
349
318
|
exc=err_to_str(e),
|
|
350
319
|
)
|
|
351
320
|
return
|
|
352
|
-
# Initialize a
|
|
321
|
+
# Initialize a thread pool that will be used to monitor each endpoint on a dedicated thread
|
|
353
322
|
with concurrent.futures.ThreadPoolExecutor(
|
|
354
|
-
max_workers=min(len(endpoints), 10)
|
|
323
|
+
max_workers=min(len(endpoints), 10)
|
|
355
324
|
) as pool:
|
|
356
325
|
for endpoint in endpoints:
|
|
357
|
-
if (
|
|
358
|
-
endpoint[mm_constants.EventFieldType.ACTIVE]
|
|
359
|
-
and endpoint[mm_constants.EventFieldType.MONITORING_MODE]
|
|
360
|
-
== mm_constants.ModelMonitoringMode.enabled.value
|
|
361
|
-
):
|
|
362
|
-
# Skip router endpoint:
|
|
363
|
-
if (
|
|
364
|
-
int(endpoint[mm_constants.EventFieldType.ENDPOINT_TYPE])
|
|
365
|
-
== mm_constants.EndpointType.ROUTER
|
|
366
|
-
):
|
|
367
|
-
# Router endpoint has no feature stats
|
|
368
|
-
logger.info(
|
|
369
|
-
f"{endpoint[mm_constants.EventFieldType.UID]} is router, skipping"
|
|
370
|
-
)
|
|
371
|
-
continue
|
|
326
|
+
if self._should_monitor_endpoint(endpoint):
|
|
372
327
|
pool.submit(
|
|
373
328
|
MonitoringApplicationController.model_endpoint_process,
|
|
329
|
+
project=self.project,
|
|
374
330
|
endpoint=endpoint,
|
|
375
331
|
applications_names=applications_names,
|
|
376
|
-
|
|
377
|
-
project=self.project,
|
|
332
|
+
window_length=self._window_length,
|
|
378
333
|
model_monitoring_access_key=self.model_monitoring_access_key,
|
|
379
334
|
storage_options=self.storage_options,
|
|
380
335
|
)
|
|
@@ -382,10 +337,10 @@ class MonitoringApplicationController:
|
|
|
382
337
|
@classmethod
|
|
383
338
|
def model_endpoint_process(
|
|
384
339
|
cls,
|
|
340
|
+
project: str,
|
|
385
341
|
endpoint: dict,
|
|
386
342
|
applications_names: list[str],
|
|
387
|
-
|
|
388
|
-
project: str,
|
|
343
|
+
window_length: int,
|
|
389
344
|
model_monitoring_access_key: str,
|
|
390
345
|
storage_options: Optional[dict] = None,
|
|
391
346
|
) -> None:
|
|
@@ -407,45 +362,49 @@ class MonitoringApplicationController:
|
|
|
407
362
|
endpoint[mm_constants.EventFieldType.FEATURE_SET_URI]
|
|
408
363
|
)
|
|
409
364
|
try:
|
|
410
|
-
|
|
411
|
-
|
|
412
|
-
|
|
413
|
-
|
|
414
|
-
|
|
415
|
-
|
|
416
|
-
|
|
417
|
-
|
|
418
|
-
|
|
419
|
-
|
|
420
|
-
|
|
421
|
-
|
|
422
|
-
|
|
423
|
-
|
|
424
|
-
|
|
425
|
-
|
|
426
|
-
|
|
427
|
-
|
|
428
|
-
|
|
429
|
-
|
|
430
|
-
start=start_infer_time,
|
|
431
|
-
end=end_infer_time,
|
|
432
|
-
endpoint_id=endpoint_id,
|
|
433
|
-
)
|
|
434
|
-
else:
|
|
435
|
-
logger.info(
|
|
436
|
-
"Data found for the given interval",
|
|
437
|
-
start=start_infer_time,
|
|
438
|
-
end=end_infer_time,
|
|
439
|
-
endpoint_id=endpoint_id,
|
|
440
|
-
)
|
|
441
|
-
cls._push_to_applications(
|
|
442
|
-
start_infer_time=start_infer_time,
|
|
443
|
-
end_infer_time=end_infer_time,
|
|
444
|
-
endpoint_id=endpoint_id,
|
|
445
|
-
project=project,
|
|
446
|
-
applications_names=[application],
|
|
447
|
-
model_monitoring_access_key=model_monitoring_access_key,
|
|
365
|
+
with _BatchWindowGenerator(
|
|
366
|
+
project=project, endpoint_id=endpoint_id, window_length=window_length
|
|
367
|
+
) as batch_window_generator:
|
|
368
|
+
for application in applications_names:
|
|
369
|
+
for (
|
|
370
|
+
start_infer_time,
|
|
371
|
+
end_infer_time,
|
|
372
|
+
) in batch_window_generator.get_intervals(
|
|
373
|
+
application=application,
|
|
374
|
+
first_request=endpoint[
|
|
375
|
+
mm_constants.EventFieldType.FIRST_REQUEST
|
|
376
|
+
],
|
|
377
|
+
last_request=endpoint[mm_constants.EventFieldType.LAST_REQUEST],
|
|
378
|
+
has_stream=has_stream,
|
|
379
|
+
):
|
|
380
|
+
df = m_fs.to_dataframe(
|
|
381
|
+
start_time=start_infer_time,
|
|
382
|
+
end_time=end_infer_time,
|
|
383
|
+
time_column=mm_constants.EventFieldType.TIMESTAMP,
|
|
384
|
+
storage_options=storage_options,
|
|
448
385
|
)
|
|
386
|
+
if len(df) == 0:
|
|
387
|
+
logger.info(
|
|
388
|
+
"No data found for the given interval",
|
|
389
|
+
start=start_infer_time,
|
|
390
|
+
end=end_infer_time,
|
|
391
|
+
endpoint_id=endpoint_id,
|
|
392
|
+
)
|
|
393
|
+
else:
|
|
394
|
+
logger.info(
|
|
395
|
+
"Data found for the given interval",
|
|
396
|
+
start=start_infer_time,
|
|
397
|
+
end=end_infer_time,
|
|
398
|
+
endpoint_id=endpoint_id,
|
|
399
|
+
)
|
|
400
|
+
cls._push_to_applications(
|
|
401
|
+
start_infer_time=start_infer_time,
|
|
402
|
+
end_infer_time=end_infer_time,
|
|
403
|
+
endpoint_id=endpoint_id,
|
|
404
|
+
project=project,
|
|
405
|
+
applications_names=[application],
|
|
406
|
+
model_monitoring_access_key=model_monitoring_access_key,
|
|
407
|
+
)
|
|
449
408
|
|
|
450
409
|
except Exception:
|
|
451
410
|
logger.exception(
|
|
@@ -491,14 +450,17 @@ class MonitoringApplicationController:
|
|
|
491
450
|
stream_uri = get_stream_path(project=project, function_name=app_name)
|
|
492
451
|
|
|
493
452
|
logger.info(
|
|
494
|
-
|
|
453
|
+
"Pushing data to application stream",
|
|
454
|
+
endpoint_id=endpoint_id,
|
|
455
|
+
app_name=app_name,
|
|
456
|
+
stream_uri=stream_uri,
|
|
495
457
|
)
|
|
496
458
|
get_stream_pusher(stream_uri, access_key=model_monitoring_access_key).push(
|
|
497
459
|
[data]
|
|
498
460
|
)
|
|
499
461
|
|
|
500
462
|
|
|
501
|
-
def handler(context:
|
|
463
|
+
def handler(context: nuclio_sdk.Context, event: nuclio_sdk.Event) -> None:
|
|
502
464
|
"""
|
|
503
465
|
Run model monitoring application processor
|
|
504
466
|
|