mlrun 1.7.0rc5__py3-none-any.whl → 1.7.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of mlrun might be problematic. Click here for more details.
- mlrun/__init__.py +11 -1
- mlrun/__main__.py +39 -121
- mlrun/{datastore/helpers.py → alerts/__init__.py} +2 -5
- mlrun/alerts/alert.py +248 -0
- mlrun/api/schemas/__init__.py +4 -3
- mlrun/artifacts/__init__.py +8 -3
- mlrun/artifacts/base.py +39 -254
- mlrun/artifacts/dataset.py +9 -190
- mlrun/artifacts/manager.py +73 -46
- mlrun/artifacts/model.py +30 -158
- mlrun/artifacts/plots.py +23 -380
- mlrun/common/constants.py +73 -2
- mlrun/common/db/sql_session.py +3 -2
- mlrun/common/formatters/__init__.py +21 -0
- mlrun/common/formatters/artifact.py +46 -0
- mlrun/common/formatters/base.py +113 -0
- mlrun/common/formatters/feature_set.py +44 -0
- mlrun/common/formatters/function.py +46 -0
- mlrun/common/formatters/pipeline.py +53 -0
- mlrun/common/formatters/project.py +51 -0
- mlrun/common/formatters/run.py +29 -0
- mlrun/common/helpers.py +11 -1
- mlrun/{runtimes → common/runtimes}/constants.py +32 -4
- mlrun/common/schemas/__init__.py +21 -4
- mlrun/common/schemas/alert.py +202 -0
- mlrun/common/schemas/api_gateway.py +113 -2
- mlrun/common/schemas/artifact.py +28 -1
- mlrun/common/schemas/auth.py +11 -0
- mlrun/common/schemas/client_spec.py +2 -1
- mlrun/common/schemas/common.py +7 -4
- mlrun/common/schemas/constants.py +3 -0
- mlrun/common/schemas/feature_store.py +58 -28
- mlrun/common/schemas/frontend_spec.py +8 -0
- mlrun/common/schemas/function.py +11 -0
- mlrun/common/schemas/hub.py +7 -9
- mlrun/common/schemas/model_monitoring/__init__.py +21 -4
- mlrun/common/schemas/model_monitoring/constants.py +136 -42
- mlrun/common/schemas/model_monitoring/grafana.py +9 -5
- mlrun/common/schemas/model_monitoring/model_endpoints.py +89 -41
- mlrun/common/schemas/notification.py +69 -12
- mlrun/{runtimes/mpijob/v1alpha1.py → common/schemas/pagination.py} +10 -13
- mlrun/common/schemas/pipeline.py +7 -0
- mlrun/common/schemas/project.py +67 -16
- mlrun/common/schemas/runs.py +17 -0
- mlrun/common/schemas/schedule.py +1 -1
- mlrun/common/schemas/workflow.py +10 -2
- mlrun/common/types.py +14 -1
- mlrun/config.py +224 -58
- mlrun/data_types/data_types.py +11 -1
- mlrun/data_types/spark.py +5 -4
- mlrun/data_types/to_pandas.py +75 -34
- mlrun/datastore/__init__.py +8 -10
- mlrun/datastore/alibaba_oss.py +131 -0
- mlrun/datastore/azure_blob.py +131 -43
- mlrun/datastore/base.py +107 -47
- mlrun/datastore/datastore.py +17 -7
- mlrun/datastore/datastore_profile.py +91 -7
- mlrun/datastore/dbfs_store.py +3 -7
- mlrun/datastore/filestore.py +1 -3
- mlrun/datastore/google_cloud_storage.py +92 -32
- mlrun/datastore/hdfs.py +5 -0
- mlrun/datastore/inmem.py +6 -3
- mlrun/datastore/redis.py +3 -2
- mlrun/datastore/s3.py +30 -12
- mlrun/datastore/snowflake_utils.py +45 -0
- mlrun/datastore/sources.py +274 -59
- mlrun/datastore/spark_utils.py +30 -0
- mlrun/datastore/store_resources.py +9 -7
- mlrun/datastore/storeytargets.py +151 -0
- mlrun/datastore/targets.py +374 -102
- mlrun/datastore/utils.py +68 -5
- mlrun/datastore/v3io.py +28 -50
- mlrun/db/auth_utils.py +152 -0
- mlrun/db/base.py +231 -22
- mlrun/db/factory.py +1 -4
- mlrun/db/httpdb.py +864 -228
- mlrun/db/nopdb.py +268 -16
- mlrun/errors.py +35 -5
- mlrun/execution.py +111 -38
- mlrun/feature_store/__init__.py +0 -2
- mlrun/feature_store/api.py +46 -53
- mlrun/feature_store/common.py +6 -11
- mlrun/feature_store/feature_set.py +48 -23
- mlrun/feature_store/feature_vector.py +13 -2
- mlrun/feature_store/ingestion.py +7 -6
- mlrun/feature_store/retrieval/base.py +9 -4
- mlrun/feature_store/retrieval/dask_merger.py +2 -0
- mlrun/feature_store/retrieval/job.py +13 -4
- mlrun/feature_store/retrieval/local_merger.py +2 -0
- mlrun/feature_store/retrieval/spark_merger.py +24 -32
- mlrun/feature_store/steps.py +38 -19
- mlrun/features.py +6 -14
- mlrun/frameworks/_common/plan.py +3 -3
- mlrun/frameworks/_dl_common/loggers/tensorboard_logger.py +7 -12
- mlrun/frameworks/_ml_common/plan.py +1 -1
- mlrun/frameworks/auto_mlrun/auto_mlrun.py +2 -2
- mlrun/frameworks/lgbm/__init__.py +1 -1
- mlrun/frameworks/lgbm/callbacks/callback.py +2 -4
- mlrun/frameworks/lgbm/model_handler.py +1 -1
- mlrun/frameworks/parallel_coordinates.py +4 -4
- mlrun/frameworks/pytorch/__init__.py +2 -2
- mlrun/frameworks/sklearn/__init__.py +1 -1
- mlrun/frameworks/sklearn/mlrun_interface.py +13 -3
- mlrun/frameworks/tf_keras/__init__.py +5 -2
- mlrun/frameworks/tf_keras/callbacks/logging_callback.py +1 -1
- mlrun/frameworks/tf_keras/mlrun_interface.py +2 -2
- mlrun/frameworks/xgboost/__init__.py +1 -1
- mlrun/k8s_utils.py +57 -12
- mlrun/launcher/__init__.py +1 -1
- mlrun/launcher/base.py +6 -5
- mlrun/launcher/client.py +13 -11
- mlrun/launcher/factory.py +1 -1
- mlrun/launcher/local.py +15 -5
- mlrun/launcher/remote.py +10 -3
- mlrun/lists.py +6 -2
- mlrun/model.py +297 -48
- mlrun/model_monitoring/__init__.py +1 -1
- mlrun/model_monitoring/api.py +152 -357
- mlrun/model_monitoring/applications/__init__.py +10 -0
- mlrun/model_monitoring/applications/_application_steps.py +190 -0
- mlrun/model_monitoring/applications/base.py +108 -0
- mlrun/model_monitoring/applications/context.py +341 -0
- mlrun/model_monitoring/{evidently_application.py → applications/evidently_base.py} +27 -22
- mlrun/model_monitoring/applications/histogram_data_drift.py +227 -91
- mlrun/model_monitoring/applications/results.py +99 -0
- mlrun/model_monitoring/controller.py +130 -303
- mlrun/model_monitoring/{stores/models/sqlite.py → db/__init__.py} +5 -10
- mlrun/model_monitoring/db/stores/__init__.py +136 -0
- mlrun/model_monitoring/db/stores/base/__init__.py +15 -0
- mlrun/model_monitoring/db/stores/base/store.py +213 -0
- mlrun/model_monitoring/db/stores/sqldb/__init__.py +13 -0
- mlrun/model_monitoring/db/stores/sqldb/models/__init__.py +71 -0
- mlrun/model_monitoring/db/stores/sqldb/models/base.py +190 -0
- mlrun/model_monitoring/db/stores/sqldb/models/mysql.py +103 -0
- mlrun/model_monitoring/{stores/models/mysql.py → db/stores/sqldb/models/sqlite.py} +19 -13
- mlrun/model_monitoring/db/stores/sqldb/sql_store.py +659 -0
- mlrun/model_monitoring/db/stores/v3io_kv/__init__.py +13 -0
- mlrun/model_monitoring/db/stores/v3io_kv/kv_store.py +726 -0
- mlrun/model_monitoring/db/tsdb/__init__.py +105 -0
- mlrun/model_monitoring/db/tsdb/base.py +448 -0
- mlrun/model_monitoring/db/tsdb/helpers.py +30 -0
- mlrun/model_monitoring/db/tsdb/tdengine/__init__.py +15 -0
- mlrun/model_monitoring/db/tsdb/tdengine/schemas.py +298 -0
- mlrun/model_monitoring/db/tsdb/tdengine/stream_graph_steps.py +42 -0
- mlrun/model_monitoring/db/tsdb/tdengine/tdengine_connector.py +522 -0
- mlrun/model_monitoring/db/tsdb/v3io/__init__.py +15 -0
- mlrun/model_monitoring/db/tsdb/v3io/stream_graph_steps.py +158 -0
- mlrun/model_monitoring/db/tsdb/v3io/v3io_connector.py +849 -0
- mlrun/model_monitoring/features_drift_table.py +34 -22
- mlrun/model_monitoring/helpers.py +177 -39
- mlrun/model_monitoring/model_endpoint.py +3 -2
- mlrun/model_monitoring/stream_processing.py +165 -398
- mlrun/model_monitoring/tracking_policy.py +7 -1
- mlrun/model_monitoring/writer.py +161 -125
- mlrun/package/packagers/default_packager.py +2 -2
- mlrun/package/packagers_manager.py +1 -0
- mlrun/package/utils/_formatter.py +2 -2
- mlrun/platforms/__init__.py +11 -10
- mlrun/platforms/iguazio.py +67 -228
- mlrun/projects/__init__.py +6 -1
- mlrun/projects/operations.py +47 -20
- mlrun/projects/pipelines.py +396 -249
- mlrun/projects/project.py +1125 -414
- mlrun/render.py +28 -22
- mlrun/run.py +207 -180
- mlrun/runtimes/__init__.py +76 -11
- mlrun/runtimes/base.py +40 -14
- mlrun/runtimes/daskjob.py +9 -2
- mlrun/runtimes/databricks_job/databricks_runtime.py +1 -0
- mlrun/runtimes/databricks_job/databricks_wrapper.py +1 -1
- mlrun/runtimes/funcdoc.py +1 -29
- mlrun/runtimes/kubejob.py +34 -128
- mlrun/runtimes/local.py +39 -10
- mlrun/runtimes/mpijob/__init__.py +0 -20
- mlrun/runtimes/mpijob/abstract.py +8 -8
- mlrun/runtimes/mpijob/v1.py +1 -1
- mlrun/runtimes/nuclio/api_gateway.py +646 -177
- mlrun/runtimes/nuclio/application/__init__.py +15 -0
- mlrun/runtimes/nuclio/application/application.py +758 -0
- mlrun/runtimes/nuclio/application/reverse_proxy.go +95 -0
- mlrun/runtimes/nuclio/function.py +188 -68
- mlrun/runtimes/nuclio/serving.py +57 -60
- mlrun/runtimes/pod.py +191 -58
- mlrun/runtimes/remotesparkjob.py +11 -8
- mlrun/runtimes/sparkjob/spark3job.py +17 -18
- mlrun/runtimes/utils.py +40 -73
- mlrun/secrets.py +6 -2
- mlrun/serving/__init__.py +8 -1
- mlrun/serving/remote.py +2 -3
- mlrun/serving/routers.py +89 -64
- mlrun/serving/server.py +54 -26
- mlrun/serving/states.py +187 -56
- mlrun/serving/utils.py +19 -11
- mlrun/serving/v2_serving.py +136 -63
- mlrun/track/tracker.py +2 -1
- mlrun/track/trackers/mlflow_tracker.py +5 -0
- mlrun/utils/async_http.py +26 -6
- mlrun/utils/db.py +18 -0
- mlrun/utils/helpers.py +375 -105
- mlrun/utils/http.py +2 -2
- mlrun/utils/logger.py +75 -9
- mlrun/utils/notifications/notification/__init__.py +14 -10
- mlrun/utils/notifications/notification/base.py +48 -0
- mlrun/utils/notifications/notification/console.py +2 -0
- mlrun/utils/notifications/notification/git.py +24 -1
- mlrun/utils/notifications/notification/ipython.py +2 -0
- mlrun/utils/notifications/notification/slack.py +96 -21
- mlrun/utils/notifications/notification/webhook.py +63 -2
- mlrun/utils/notifications/notification_pusher.py +146 -16
- mlrun/utils/regex.py +9 -0
- mlrun/utils/retryer.py +3 -2
- mlrun/utils/v3io_clients.py +2 -3
- mlrun/utils/version/version.json +2 -2
- mlrun-1.7.2.dist-info/METADATA +390 -0
- mlrun-1.7.2.dist-info/RECORD +351 -0
- {mlrun-1.7.0rc5.dist-info → mlrun-1.7.2.dist-info}/WHEEL +1 -1
- mlrun/feature_store/retrieval/conversion.py +0 -271
- mlrun/kfpops.py +0 -868
- mlrun/model_monitoring/application.py +0 -310
- mlrun/model_monitoring/batch.py +0 -974
- mlrun/model_monitoring/controller_handler.py +0 -37
- mlrun/model_monitoring/prometheus.py +0 -216
- mlrun/model_monitoring/stores/__init__.py +0 -111
- mlrun/model_monitoring/stores/kv_model_endpoint_store.py +0 -574
- mlrun/model_monitoring/stores/model_endpoint_store.py +0 -145
- mlrun/model_monitoring/stores/models/__init__.py +0 -27
- mlrun/model_monitoring/stores/models/base.py +0 -84
- mlrun/model_monitoring/stores/sql_model_endpoint_store.py +0 -382
- mlrun/platforms/other.py +0 -305
- mlrun-1.7.0rc5.dist-info/METADATA +0 -269
- mlrun-1.7.0rc5.dist-info/RECORD +0 -323
- {mlrun-1.7.0rc5.dist-info → mlrun-1.7.2.dist-info}/LICENSE +0 -0
- {mlrun-1.7.0rc5.dist-info → mlrun-1.7.2.dist-info}/entry_points.txt +0 -0
- {mlrun-1.7.0rc5.dist-info → mlrun-1.7.2.dist-info}/top_level.txt +0 -0
|
@@ -11,35 +11,30 @@
|
|
|
11
11
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
12
12
|
# See the License for the specific language governing permissions and
|
|
13
13
|
# limitations under the License.
|
|
14
|
-
|
|
15
14
|
import concurrent.futures
|
|
16
15
|
import datetime
|
|
17
16
|
import json
|
|
18
17
|
import os
|
|
19
18
|
import re
|
|
20
19
|
from collections.abc import Iterator
|
|
21
|
-
from typing import
|
|
20
|
+
from typing import NamedTuple, Optional, Union, cast
|
|
22
21
|
|
|
23
22
|
import nuclio
|
|
24
|
-
from v3io.dataplane.response import HttpResponseError
|
|
25
23
|
|
|
26
24
|
import mlrun
|
|
27
25
|
import mlrun.common.schemas.model_monitoring.constants as mm_constants
|
|
28
26
|
import mlrun.data_types.infer
|
|
29
27
|
import mlrun.feature_store as fstore
|
|
30
|
-
|
|
28
|
+
import mlrun.model_monitoring.db.stores
|
|
29
|
+
from mlrun.config import config as mlconf
|
|
31
30
|
from mlrun.datastore import get_stream_pusher
|
|
32
|
-
from mlrun.datastore.targets import ParquetTarget
|
|
33
31
|
from mlrun.errors import err_to_str
|
|
34
|
-
from mlrun.model_monitoring.batch import calculate_inputs_statistics
|
|
35
32
|
from mlrun.model_monitoring.helpers import (
|
|
36
33
|
_BatchDict,
|
|
37
34
|
batch_dict2timedelta,
|
|
38
|
-
get_monitoring_parquet_path,
|
|
39
35
|
get_stream_path,
|
|
40
36
|
)
|
|
41
|
-
from mlrun.utils import
|
|
42
|
-
from mlrun.utils.v3io_clients import get_v3io_client
|
|
37
|
+
from mlrun.utils import datetime_now, logger
|
|
43
38
|
|
|
44
39
|
|
|
45
40
|
class _Interval(NamedTuple):
|
|
@@ -48,8 +43,6 @@ class _Interval(NamedTuple):
|
|
|
48
43
|
|
|
49
44
|
|
|
50
45
|
class _BatchWindow:
|
|
51
|
-
V3IO_CONTAINER_FORMAT = "users/pipelines/{project}/monitoring-schedules/functions"
|
|
52
|
-
|
|
53
46
|
def __init__(
|
|
54
47
|
self,
|
|
55
48
|
project: str,
|
|
@@ -65,27 +58,22 @@ class _BatchWindow:
|
|
|
65
58
|
All the time values are in seconds.
|
|
66
59
|
The start and stop time are in seconds since the epoch.
|
|
67
60
|
"""
|
|
61
|
+
self.project = project
|
|
68
62
|
self._endpoint = endpoint
|
|
69
63
|
self._application = application
|
|
70
64
|
self._first_request = first_request
|
|
71
|
-
self._kv_storage = get_v3io_client(
|
|
72
|
-
endpoint=mlrun.mlconf.v3io_api,
|
|
73
|
-
# Avoid noisy warning logs before the KV table is created
|
|
74
|
-
logger=create_logger(name="v3io_client", level="error"),
|
|
75
|
-
).kv
|
|
76
|
-
self._v3io_container = self.V3IO_CONTAINER_FORMAT.format(project=project)
|
|
77
65
|
self._stop = last_updated
|
|
78
66
|
self._step = timedelta_seconds
|
|
67
|
+
self._db = mlrun.model_monitoring.get_store_object(project=self.project)
|
|
79
68
|
self._start = self._get_last_analyzed()
|
|
80
69
|
|
|
81
70
|
def _get_last_analyzed(self) -> Optional[int]:
|
|
82
71
|
try:
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
key=self._application,
|
|
72
|
+
last_analyzed = self._db.get_last_analyzed(
|
|
73
|
+
endpoint_id=self._endpoint,
|
|
74
|
+
application_name=self._application,
|
|
87
75
|
)
|
|
88
|
-
except
|
|
76
|
+
except mlrun.errors.MLRunNotFoundError:
|
|
89
77
|
logger.info(
|
|
90
78
|
"No last analyzed time was found for this endpoint and "
|
|
91
79
|
"application, as this is probably the first time this "
|
|
@@ -96,7 +84,7 @@ class _BatchWindow:
|
|
|
96
84
|
first_request=self._first_request,
|
|
97
85
|
last_updated=self._stop,
|
|
98
86
|
)
|
|
99
|
-
|
|
87
|
+
|
|
100
88
|
if self._first_request and self._stop:
|
|
101
89
|
# TODO : Change the timedelta according to the policy.
|
|
102
90
|
first_period_in_seconds = max(
|
|
@@ -108,7 +96,6 @@ class _BatchWindow:
|
|
|
108
96
|
)
|
|
109
97
|
return self._first_request
|
|
110
98
|
|
|
111
|
-
last_analyzed = data.output.item[mm_constants.SchedulingKeys.LAST_ANALYZED]
|
|
112
99
|
logger.info(
|
|
113
100
|
"Got the last analyzed time for this endpoint and application",
|
|
114
101
|
endpoint=self._endpoint,
|
|
@@ -124,11 +111,11 @@ class _BatchWindow:
|
|
|
124
111
|
application=self._application,
|
|
125
112
|
last_analyzed=last_analyzed,
|
|
126
113
|
)
|
|
127
|
-
|
|
128
|
-
|
|
129
|
-
|
|
130
|
-
|
|
131
|
-
|
|
114
|
+
|
|
115
|
+
self._db.update_last_analyzed(
|
|
116
|
+
endpoint_id=self._endpoint,
|
|
117
|
+
application_name=self._application,
|
|
118
|
+
last_analyzed=last_analyzed,
|
|
132
119
|
)
|
|
133
120
|
|
|
134
121
|
def get_intervals(
|
|
@@ -227,7 +214,7 @@ class _BatchWindowGenerator:
|
|
|
227
214
|
# If the endpoint does not have a stream, `last_updated` should be
|
|
228
215
|
# the minimum between the current time and the last updated time.
|
|
229
216
|
# This compensates for the bumping mechanism - see
|
|
230
|
-
# `
|
|
217
|
+
# `update_model_endpoint_last_request`.
|
|
231
218
|
last_updated = min(int(datetime_now().timestamp()), last_updated)
|
|
232
219
|
logger.debug(
|
|
233
220
|
"The endpoint does not have a stream", last_updated=last_updated
|
|
@@ -282,26 +269,14 @@ class MonitoringApplicationController:
|
|
|
282
269
|
Note that the MonitoringApplicationController object requires access keys along with valid project configurations.
|
|
283
270
|
"""
|
|
284
271
|
|
|
285
|
-
def __init__(
|
|
286
|
-
|
|
287
|
-
|
|
288
|
-
project
|
|
289
|
-
):
|
|
290
|
-
"""
|
|
291
|
-
Initialize Monitoring Application Processor object.
|
|
272
|
+
def __init__(self) -> None:
|
|
273
|
+
"""Initialize Monitoring Application Controller"""
|
|
274
|
+
self.project = cast(str, mlrun.mlconf.default_project)
|
|
275
|
+
self.project_obj = mlrun.load_project(name=self.project, url=self.project)
|
|
292
276
|
|
|
293
|
-
|
|
294
|
-
:param project: Project name.
|
|
295
|
-
"""
|
|
296
|
-
self.context = mlrun_context
|
|
297
|
-
self.project = project
|
|
298
|
-
self.project_obj = mlrun.get_or_create_project(project)
|
|
277
|
+
logger.debug(f"Initializing {self.__class__.__name__}", project=self.project)
|
|
299
278
|
|
|
300
|
-
|
|
301
|
-
f"Initializing {self.__class__.__name__}", project=project
|
|
302
|
-
)
|
|
303
|
-
|
|
304
|
-
self.db = mlrun.model_monitoring.get_model_endpoint_store(project=project)
|
|
279
|
+
self.db = mlrun.model_monitoring.get_store_object(project=self.project)
|
|
305
280
|
|
|
306
281
|
self._batch_window_generator = _BatchWindowGenerator(
|
|
307
282
|
batch_dict=json.loads(
|
|
@@ -312,14 +287,8 @@ class MonitoringApplicationController:
|
|
|
312
287
|
)
|
|
313
288
|
|
|
314
289
|
self.model_monitoring_access_key = self._get_model_monitoring_access_key()
|
|
315
|
-
self.parquet_directory = get_monitoring_parquet_path(
|
|
316
|
-
self.project_obj,
|
|
317
|
-
kind=mm_constants.FileTargetKind.APPS_PARQUET,
|
|
318
|
-
)
|
|
319
290
|
self.storage_options = None
|
|
320
|
-
if
|
|
321
|
-
self._initialize_v3io_configurations()
|
|
322
|
-
elif self.parquet_directory.startswith("s3://"):
|
|
291
|
+
if mlconf.artifact_path.startswith("s3://"):
|
|
323
292
|
self.storage_options = mlrun.mlconf.get_s3_storage_options()
|
|
324
293
|
|
|
325
294
|
@staticmethod
|
|
@@ -330,89 +299,85 @@ class MonitoringApplicationController:
|
|
|
330
299
|
access_key = mlrun.mlconf.get_v3io_access_key()
|
|
331
300
|
return access_key
|
|
332
301
|
|
|
333
|
-
def
|
|
334
|
-
self.v3io_framesd = mlrun.mlconf.v3io_framesd
|
|
335
|
-
self.v3io_api = mlrun.mlconf.v3io_api
|
|
336
|
-
self.storage_options = dict(
|
|
337
|
-
v3io_access_key=self.model_monitoring_access_key, v3io_api=self.v3io_api
|
|
338
|
-
)
|
|
339
|
-
|
|
340
|
-
def run(self, event: nuclio.Event):
|
|
302
|
+
def run(self) -> None:
|
|
341
303
|
"""
|
|
342
|
-
Main method for run all the relevant monitoring applications on each endpoint
|
|
343
|
-
|
|
344
|
-
|
|
304
|
+
Main method for run all the relevant monitoring applications on each endpoint.
|
|
305
|
+
This method handles the following:
|
|
306
|
+
1. List model endpoints
|
|
307
|
+
2. List applications
|
|
308
|
+
3. Check model monitoring windows
|
|
309
|
+
4. Send data to applications
|
|
310
|
+
5. Delete old parquets
|
|
345
311
|
"""
|
|
346
312
|
logger.info("Start running monitoring controller")
|
|
347
313
|
try:
|
|
348
314
|
applications_names = []
|
|
349
|
-
endpoints = self.db.list_model_endpoints()
|
|
315
|
+
endpoints = self.db.list_model_endpoints(include_stats=True)
|
|
350
316
|
if not endpoints:
|
|
351
|
-
|
|
352
|
-
"No model endpoints found", project=self.project
|
|
353
|
-
)
|
|
317
|
+
logger.info("No model endpoints found", project=self.project)
|
|
354
318
|
return
|
|
355
319
|
monitoring_functions = self.project_obj.list_model_monitoring_functions()
|
|
356
320
|
if monitoring_functions:
|
|
357
|
-
# Gets only application in ready state
|
|
358
321
|
applications_names = list(
|
|
359
|
-
{
|
|
360
|
-
app.metadata.name
|
|
361
|
-
for app in monitoring_functions
|
|
362
|
-
if app.status.state == "ready"
|
|
363
|
-
}
|
|
322
|
+
{app.metadata.name for app in monitoring_functions}
|
|
364
323
|
)
|
|
324
|
+
# if monitoring_functions: - TODO : ML-7700
|
|
325
|
+
# Gets only application in ready state
|
|
326
|
+
# applications_names = list(
|
|
327
|
+
# {
|
|
328
|
+
# app.metadata.name
|
|
329
|
+
# for app in monitoring_functions
|
|
330
|
+
# if (
|
|
331
|
+
# app.status.state == "ready"
|
|
332
|
+
# # workaround for the default app, as its `status.state` is `None`
|
|
333
|
+
# or app.metadata.name
|
|
334
|
+
# == mm_constants.HistogramDataDriftApplicationConstants.NAME
|
|
335
|
+
# )
|
|
336
|
+
# }
|
|
337
|
+
# )
|
|
365
338
|
if not applications_names:
|
|
366
|
-
|
|
367
|
-
"No monitoring functions found", project=self.project
|
|
368
|
-
)
|
|
339
|
+
logger.info("No monitoring functions found", project=self.project)
|
|
369
340
|
return
|
|
341
|
+
logger.info(
|
|
342
|
+
"Starting to iterate over the applications",
|
|
343
|
+
applications=applications_names,
|
|
344
|
+
)
|
|
370
345
|
|
|
371
346
|
except Exception as e:
|
|
372
|
-
|
|
347
|
+
logger.error(
|
|
373
348
|
"Failed to list endpoints and monitoring applications",
|
|
374
349
|
exc=err_to_str(e),
|
|
375
350
|
)
|
|
376
351
|
return
|
|
377
352
|
# Initialize a process pool that will be used to run each endpoint applications on a dedicated process
|
|
378
|
-
|
|
353
|
+
with concurrent.futures.ThreadPoolExecutor(
|
|
379
354
|
max_workers=min(len(endpoints), 10),
|
|
380
|
-
)
|
|
381
|
-
|
|
382
|
-
for endpoint in endpoints:
|
|
383
|
-
if (
|
|
384
|
-
endpoint[mm_constants.EventFieldType.ACTIVE]
|
|
385
|
-
and endpoint[mm_constants.EventFieldType.MONITORING_MODE]
|
|
386
|
-
== mm_constants.ModelMonitoringMode.enabled.value
|
|
387
|
-
):
|
|
388
|
-
# Skip router endpoint:
|
|
355
|
+
) as pool:
|
|
356
|
+
for endpoint in endpoints:
|
|
389
357
|
if (
|
|
390
|
-
|
|
391
|
-
|
|
358
|
+
endpoint[mm_constants.EventFieldType.ACTIVE]
|
|
359
|
+
and endpoint[mm_constants.EventFieldType.MONITORING_MODE]
|
|
360
|
+
== mm_constants.ModelMonitoringMode.enabled.value
|
|
392
361
|
):
|
|
393
|
-
#
|
|
394
|
-
|
|
395
|
-
|
|
362
|
+
# Skip router endpoint:
|
|
363
|
+
if (
|
|
364
|
+
int(endpoint[mm_constants.EventFieldType.ENDPOINT_TYPE])
|
|
365
|
+
== mm_constants.EndpointType.ROUTER
|
|
366
|
+
):
|
|
367
|
+
# Router endpoint has no feature stats
|
|
368
|
+
logger.info(
|
|
369
|
+
f"{endpoint[mm_constants.EventFieldType.UID]} is router, skipping"
|
|
370
|
+
)
|
|
371
|
+
continue
|
|
372
|
+
pool.submit(
|
|
373
|
+
MonitoringApplicationController.model_endpoint_process,
|
|
374
|
+
endpoint=endpoint,
|
|
375
|
+
applications_names=applications_names,
|
|
376
|
+
batch_window_generator=self._batch_window_generator,
|
|
377
|
+
project=self.project,
|
|
378
|
+
model_monitoring_access_key=self.model_monitoring_access_key,
|
|
379
|
+
storage_options=self.storage_options,
|
|
396
380
|
)
|
|
397
|
-
continue
|
|
398
|
-
future = pool.submit(
|
|
399
|
-
MonitoringApplicationController.model_endpoint_process,
|
|
400
|
-
endpoint=endpoint,
|
|
401
|
-
applications_names=applications_names,
|
|
402
|
-
batch_window_generator=self._batch_window_generator,
|
|
403
|
-
project=self.project,
|
|
404
|
-
parquet_directory=self.parquet_directory,
|
|
405
|
-
storage_options=self.storage_options,
|
|
406
|
-
model_monitoring_access_key=self.model_monitoring_access_key,
|
|
407
|
-
)
|
|
408
|
-
futures.append(future)
|
|
409
|
-
|
|
410
|
-
for future in concurrent.futures.as_completed(futures):
|
|
411
|
-
result = future.result()
|
|
412
|
-
if result:
|
|
413
|
-
self.context.log_results(result)
|
|
414
|
-
|
|
415
|
-
self._delete_old_parquet(endpoints=endpoints)
|
|
416
381
|
|
|
417
382
|
@classmethod
|
|
418
383
|
def model_endpoint_process(
|
|
@@ -421,10 +386,9 @@ class MonitoringApplicationController:
|
|
|
421
386
|
applications_names: list[str],
|
|
422
387
|
batch_window_generator: _BatchWindowGenerator,
|
|
423
388
|
project: str,
|
|
424
|
-
parquet_directory: str,
|
|
425
|
-
storage_options: dict,
|
|
426
389
|
model_monitoring_access_key: str,
|
|
427
|
-
|
|
390
|
+
storage_options: Optional[dict] = None,
|
|
391
|
+
) -> None:
|
|
428
392
|
"""
|
|
429
393
|
Process a model endpoint and trigger the monitoring applications. This function running on different process
|
|
430
394
|
for each endpoint. In addition, this function will generate a parquet file that includes the relevant data
|
|
@@ -434,25 +398,15 @@ class MonitoringApplicationController:
|
|
|
434
398
|
:param applications_names: (list[str]) List of application names to push results to.
|
|
435
399
|
:param batch_window_generator: (_BatchWindowGenerator) An object that generates _BatchWindow objects.
|
|
436
400
|
:param project: (str) Project name.
|
|
437
|
-
:param parquet_directory: (str) Directory to store application parquet files
|
|
438
|
-
:param storage_options: (dict) Storage options for writing ParquetTarget.
|
|
439
401
|
:param model_monitoring_access_key: (str) Access key to apply the model monitoring process.
|
|
440
|
-
|
|
402
|
+
:param storage_options: (dict) Storage options for reading the infer parquet files.
|
|
441
403
|
"""
|
|
442
404
|
endpoint_id = endpoint[mm_constants.EventFieldType.UID]
|
|
443
|
-
|
|
405
|
+
has_stream = endpoint[mm_constants.EventFieldType.STREAM_PATH] != ""
|
|
406
|
+
m_fs = fstore.get_feature_set(
|
|
407
|
+
endpoint[mm_constants.EventFieldType.FEATURE_SET_URI]
|
|
408
|
+
)
|
|
444
409
|
try:
|
|
445
|
-
m_fs = fstore.get_feature_set(
|
|
446
|
-
endpoint[mm_constants.EventFieldType.FEATURE_SET_URI]
|
|
447
|
-
)
|
|
448
|
-
labels = endpoint[mm_constants.EventFieldType.LABEL_NAMES]
|
|
449
|
-
if labels:
|
|
450
|
-
if isinstance(labels, str):
|
|
451
|
-
labels = json.loads(labels)
|
|
452
|
-
for label in labels:
|
|
453
|
-
if label not in list(m_fs.spec.features.keys()):
|
|
454
|
-
m_fs.add_feature(fstore.Feature(name=label, value_type="float"))
|
|
455
|
-
|
|
456
410
|
for application in applications_names:
|
|
457
411
|
batch_window = batch_window_generator.get_batch_window(
|
|
458
412
|
project=project,
|
|
@@ -460,162 +414,72 @@ class MonitoringApplicationController:
|
|
|
460
414
|
application=application,
|
|
461
415
|
first_request=endpoint[mm_constants.EventFieldType.FIRST_REQUEST],
|
|
462
416
|
last_request=endpoint[mm_constants.EventFieldType.LAST_REQUEST],
|
|
463
|
-
has_stream=
|
|
417
|
+
has_stream=has_stream,
|
|
464
418
|
)
|
|
465
419
|
|
|
466
420
|
for start_infer_time, end_infer_time in batch_window.get_intervals():
|
|
467
|
-
|
|
468
|
-
|
|
469
|
-
|
|
470
|
-
|
|
421
|
+
df = m_fs.to_dataframe(
|
|
422
|
+
start_time=start_infer_time,
|
|
423
|
+
end_time=end_infer_time,
|
|
424
|
+
time_column=mm_constants.EventFieldType.TIMESTAMP,
|
|
425
|
+
storage_options=storage_options,
|
|
426
|
+
)
|
|
427
|
+
if len(df) == 0:
|
|
428
|
+
logger.info(
|
|
429
|
+
"No data found for the given interval",
|
|
430
|
+
start=start_infer_time,
|
|
431
|
+
end=end_infer_time,
|
|
471
432
|
endpoint_id=endpoint_id,
|
|
433
|
+
)
|
|
434
|
+
else:
|
|
435
|
+
logger.info(
|
|
436
|
+
"Data found for the given interval",
|
|
437
|
+
start=start_infer_time,
|
|
438
|
+
end=end_infer_time,
|
|
439
|
+
endpoint_id=endpoint_id,
|
|
440
|
+
)
|
|
441
|
+
cls._push_to_applications(
|
|
472
442
|
start_infer_time=start_infer_time,
|
|
473
443
|
end_infer_time=end_infer_time,
|
|
474
|
-
|
|
475
|
-
|
|
476
|
-
|
|
477
|
-
|
|
478
|
-
|
|
479
|
-
df = offline_response.to_dataframe()
|
|
480
|
-
parquet_target_path = offline_response.vector.get_target_path()
|
|
481
|
-
|
|
482
|
-
if len(df) == 0:
|
|
483
|
-
logger.info(
|
|
484
|
-
"During this time window, the endpoint has not received any data",
|
|
485
|
-
endpoint=endpoint[mm_constants.EventFieldType.UID],
|
|
486
|
-
start_time=start_infer_time,
|
|
487
|
-
end_time=end_infer_time,
|
|
488
|
-
)
|
|
489
|
-
continue
|
|
490
|
-
|
|
491
|
-
except FileNotFoundError:
|
|
492
|
-
logger.warn(
|
|
493
|
-
"No parquets were written yet",
|
|
494
|
-
endpoint=endpoint[mm_constants.EventFieldType.UID],
|
|
444
|
+
endpoint_id=endpoint_id,
|
|
445
|
+
project=project,
|
|
446
|
+
applications_names=[application],
|
|
447
|
+
model_monitoring_access_key=model_monitoring_access_key,
|
|
495
448
|
)
|
|
496
|
-
continue
|
|
497
|
-
|
|
498
|
-
# Get the timestamp of the latest request:
|
|
499
|
-
latest_request = df[mm_constants.EventFieldType.TIMESTAMP].iloc[-1]
|
|
500
|
-
|
|
501
|
-
# Get the feature stats from the model endpoint for reference data
|
|
502
|
-
feature_stats = json.loads(
|
|
503
|
-
endpoint[mm_constants.EventFieldType.FEATURE_STATS]
|
|
504
|
-
)
|
|
505
|
-
|
|
506
|
-
# Pad the original feature stats to accommodate current
|
|
507
|
-
# data out of the original range (unless already padded)
|
|
508
|
-
pad_features_hist(FeatureStats(feature_stats))
|
|
509
449
|
|
|
510
|
-
# Get the current stats:
|
|
511
|
-
current_stats = calculate_inputs_statistics(
|
|
512
|
-
sample_set_statistics=feature_stats,
|
|
513
|
-
inputs=df,
|
|
514
|
-
)
|
|
515
|
-
|
|
516
|
-
cls._push_to_applications(
|
|
517
|
-
current_stats=current_stats,
|
|
518
|
-
feature_stats=feature_stats,
|
|
519
|
-
start_infer_time=start_infer_time,
|
|
520
|
-
end_infer_time=end_infer_time,
|
|
521
|
-
endpoint_id=endpoint_id,
|
|
522
|
-
latest_request=latest_request,
|
|
523
|
-
project=project,
|
|
524
|
-
applications_names=[application],
|
|
525
|
-
model_monitoring_access_key=model_monitoring_access_key,
|
|
526
|
-
parquet_target_path=parquet_target_path,
|
|
527
|
-
)
|
|
528
|
-
start_times.add(start_infer_time)
|
|
529
450
|
except Exception:
|
|
530
451
|
logger.exception(
|
|
531
452
|
"Encountered an exception",
|
|
532
453
|
endpoint_id=endpoint[mm_constants.EventFieldType.UID],
|
|
533
454
|
)
|
|
534
455
|
|
|
535
|
-
if start_times:
|
|
536
|
-
return {endpoint_id: [str(t) for t in sorted(list(start_times))]}
|
|
537
|
-
|
|
538
|
-
def _delete_old_parquet(self, endpoints: list[dict[str, Any]], days: int = 1):
|
|
539
|
-
"""
|
|
540
|
-
Delete application parquets older than the argument days.
|
|
541
|
-
|
|
542
|
-
:param endpoints: A list of dictionaries of model endpoints records.
|
|
543
|
-
"""
|
|
544
|
-
if self.parquet_directory.startswith("v3io:///"):
|
|
545
|
-
# create fs with access to the user side (under projects)
|
|
546
|
-
store, _, _ = mlrun.store_manager.get_or_create_store(
|
|
547
|
-
self.parquet_directory,
|
|
548
|
-
{"V3IO_ACCESS_KEY": self.model_monitoring_access_key},
|
|
549
|
-
)
|
|
550
|
-
fs = store.filesystem
|
|
551
|
-
|
|
552
|
-
# calculate time threshold (keep only files from the last 24 hours)
|
|
553
|
-
time_to_keep = (
|
|
554
|
-
datetime.datetime.now(tz=datetime.timezone.utc)
|
|
555
|
-
- datetime.timedelta(days=days)
|
|
556
|
-
).timestamp()
|
|
557
|
-
|
|
558
|
-
for endpoint in endpoints:
|
|
559
|
-
try:
|
|
560
|
-
apps_parquet_directories = fs.listdir(
|
|
561
|
-
path=f"{self.parquet_directory}"
|
|
562
|
-
f"/key={endpoint[mm_constants.EventFieldType.UID]}"
|
|
563
|
-
)
|
|
564
|
-
for directory in apps_parquet_directories:
|
|
565
|
-
if directory["mtime"] < time_to_keep:
|
|
566
|
-
# Delete files
|
|
567
|
-
fs.rm(path=directory["name"], recursive=True)
|
|
568
|
-
# Delete directory
|
|
569
|
-
fs.rmdir(path=directory["name"])
|
|
570
|
-
except FileNotFoundError:
|
|
571
|
-
logger.info(
|
|
572
|
-
"Application parquet directory is empty, "
|
|
573
|
-
"probably parquets have not yet been created for this app",
|
|
574
|
-
endpoint=endpoint[mm_constants.EventFieldType.UID],
|
|
575
|
-
path=f"{self.parquet_directory}"
|
|
576
|
-
f"/key={endpoint[mm_constants.EventFieldType.UID]}",
|
|
577
|
-
)
|
|
578
|
-
|
|
579
456
|
@staticmethod
|
|
580
457
|
def _push_to_applications(
|
|
581
|
-
|
|
582
|
-
|
|
583
|
-
|
|
584
|
-
|
|
585
|
-
|
|
586
|
-
|
|
587
|
-
project,
|
|
588
|
-
applications_names,
|
|
589
|
-
model_monitoring_access_key,
|
|
590
|
-
parquet_target_path,
|
|
458
|
+
start_infer_time: datetime.datetime,
|
|
459
|
+
end_infer_time: datetime.datetime,
|
|
460
|
+
endpoint_id: str,
|
|
461
|
+
project: str,
|
|
462
|
+
applications_names: list[str],
|
|
463
|
+
model_monitoring_access_key: str,
|
|
591
464
|
):
|
|
592
465
|
"""
|
|
593
466
|
Pushes data to multiple stream applications.
|
|
594
467
|
|
|
595
|
-
:param
|
|
596
|
-
:param
|
|
597
|
-
:param
|
|
598
|
-
:param
|
|
599
|
-
:param
|
|
600
|
-
:param
|
|
601
|
-
:param project: mlrun Project name.
|
|
602
|
-
:param applications_names: List of application names to which data will be pushed.
|
|
468
|
+
:param start_infer_time: The beginning of the infer interval window.
|
|
469
|
+
:param end_infer_time: The end of the infer interval window.
|
|
470
|
+
:param endpoint_id: Identifier for the model endpoint.
|
|
471
|
+
:param project: mlrun Project name.
|
|
472
|
+
:param applications_names: List of application names to which data will be pushed.
|
|
473
|
+
:param model_monitoring_access_key: Access key to apply the model monitoring process.
|
|
603
474
|
|
|
604
475
|
"""
|
|
605
|
-
|
|
606
476
|
data = {
|
|
607
|
-
mm_constants.ApplicationEvent.CURRENT_STATS: json.dumps(current_stats),
|
|
608
|
-
mm_constants.ApplicationEvent.FEATURE_STATS: json.dumps(feature_stats),
|
|
609
|
-
mm_constants.ApplicationEvent.SAMPLE_PARQUET_PATH: parquet_target_path,
|
|
610
477
|
mm_constants.ApplicationEvent.START_INFER_TIME: start_infer_time.isoformat(
|
|
611
478
|
sep=" ", timespec="microseconds"
|
|
612
479
|
),
|
|
613
480
|
mm_constants.ApplicationEvent.END_INFER_TIME: end_infer_time.isoformat(
|
|
614
481
|
sep=" ", timespec="microseconds"
|
|
615
482
|
),
|
|
616
|
-
mm_constants.ApplicationEvent.LAST_REQUEST: latest_request.isoformat(
|
|
617
|
-
sep=" ", timespec="microseconds"
|
|
618
|
-
),
|
|
619
483
|
mm_constants.ApplicationEvent.ENDPOINT_ID: endpoint_id,
|
|
620
484
|
mm_constants.ApplicationEvent.OUTPUT_STREAM_URI: get_stream_path(
|
|
621
485
|
project=project,
|
|
@@ -633,49 +497,12 @@ class MonitoringApplicationController:
|
|
|
633
497
|
[data]
|
|
634
498
|
)
|
|
635
499
|
|
|
636
|
-
@staticmethod
|
|
637
|
-
def _get_sample_df(
|
|
638
|
-
feature_set: mlrun.common.schemas.FeatureSet,
|
|
639
|
-
endpoint_id: str,
|
|
640
|
-
start_infer_time: datetime.datetime,
|
|
641
|
-
end_infer_time: datetime.datetime,
|
|
642
|
-
parquet_directory: str,
|
|
643
|
-
storage_options: dict,
|
|
644
|
-
application_name: str,
|
|
645
|
-
) -> mlrun.feature_store.OfflineVectorResponse:
|
|
646
|
-
"""
|
|
647
|
-
Retrieves a sample DataFrame of the current input according to the provided infer interval window.
|
|
648
|
-
|
|
649
|
-
:param feature_set: The main feature set.
|
|
650
|
-
:param endpoint_id: Identifier for the model endpoint.
|
|
651
|
-
:param start_infer_time: The beginning of the infer interval window.
|
|
652
|
-
:param end_infer_time: The end of the infer interval window.
|
|
653
|
-
:param parquet_directory: Directory where Parquet files are stored.
|
|
654
|
-
:param storage_options: Storage options for accessing the data.
|
|
655
|
-
:param application_name: Current application name.
|
|
656
500
|
|
|
657
|
-
|
|
501
|
+
def handler(context: nuclio.Context, event: nuclio.Event) -> None:
|
|
502
|
+
"""
|
|
503
|
+
Run model monitoring application processor
|
|
658
504
|
|
|
659
|
-
|
|
660
|
-
|
|
661
|
-
|
|
662
|
-
|
|
663
|
-
features=features,
|
|
664
|
-
with_indexes=True,
|
|
665
|
-
)
|
|
666
|
-
vector.metadata.tag = application_name
|
|
667
|
-
vector.feature_set_objects = {feature_set.metadata.name: feature_set}
|
|
668
|
-
|
|
669
|
-
# get offline features based on application start and end time.
|
|
670
|
-
# store the result parquet by partitioning by controller end processing time
|
|
671
|
-
offline_response = vector.get_offline_features(
|
|
672
|
-
start_time=start_infer_time,
|
|
673
|
-
end_time=end_infer_time,
|
|
674
|
-
timestamp_for_filtering=mm_constants.EventFieldType.TIMESTAMP,
|
|
675
|
-
target=ParquetTarget(
|
|
676
|
-
path=parquet_directory
|
|
677
|
-
+ f"/key={endpoint_id}/{int(start_infer_time.timestamp())}/{application_name}.parquet",
|
|
678
|
-
storage_options=storage_options,
|
|
679
|
-
),
|
|
680
|
-
)
|
|
681
|
-
return offline_response
|
|
505
|
+
:param context: the Nuclio context
|
|
506
|
+
:param event: trigger event
|
|
507
|
+
"""
|
|
508
|
+
MonitoringApplicationController().run()
|
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
# Copyright
|
|
1
|
+
# Copyright 2024 Iguazio
|
|
2
2
|
#
|
|
3
3
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
4
4
|
# you may not use this file except in compliance with the License.
|
|
@@ -12,12 +12,7 @@
|
|
|
12
12
|
# See the License for the specific language governing permissions and
|
|
13
13
|
# limitations under the License.
|
|
14
14
|
|
|
15
|
-
from
|
|
16
|
-
|
|
17
|
-
from .
|
|
18
|
-
|
|
19
|
-
Base = declarative_base()
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
class ModelEndpointsTable(Base, ModelEndpointsBaseTable):
|
|
23
|
-
pass
|
|
15
|
+
from .stores import ObjectStoreFactory, get_store_object
|
|
16
|
+
from .stores.base import StoreBase
|
|
17
|
+
from .tsdb import get_tsdb_connector
|
|
18
|
+
from .tsdb.base import TSDBConnector
|