mlrun 1.7.2rc4__py3-none-any.whl → 1.8.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of mlrun might be problematic. Click here for more details.
- mlrun/__init__.py +26 -22
- mlrun/__main__.py +15 -16
- mlrun/alerts/alert.py +150 -15
- mlrun/api/schemas/__init__.py +1 -9
- mlrun/artifacts/__init__.py +2 -3
- mlrun/artifacts/base.py +62 -19
- mlrun/artifacts/dataset.py +17 -17
- mlrun/artifacts/document.py +454 -0
- mlrun/artifacts/manager.py +28 -18
- mlrun/artifacts/model.py +91 -59
- mlrun/artifacts/plots.py +2 -2
- mlrun/common/constants.py +8 -0
- mlrun/common/formatters/__init__.py +1 -0
- mlrun/common/formatters/artifact.py +1 -1
- mlrun/common/formatters/feature_set.py +2 -0
- mlrun/common/formatters/function.py +1 -0
- mlrun/{model_monitoring/db/stores/v3io_kv/__init__.py → common/formatters/model_endpoint.py} +17 -0
- mlrun/common/formatters/pipeline.py +1 -2
- mlrun/common/formatters/project.py +9 -0
- mlrun/common/model_monitoring/__init__.py +0 -5
- mlrun/common/model_monitoring/helpers.py +12 -62
- mlrun/common/runtimes/constants.py +25 -4
- mlrun/common/schemas/__init__.py +9 -5
- mlrun/common/schemas/alert.py +114 -19
- mlrun/common/schemas/api_gateway.py +3 -3
- mlrun/common/schemas/artifact.py +22 -9
- mlrun/common/schemas/auth.py +8 -4
- mlrun/common/schemas/background_task.py +7 -7
- mlrun/common/schemas/client_spec.py +4 -4
- mlrun/common/schemas/clusterization_spec.py +2 -2
- mlrun/common/schemas/common.py +53 -3
- mlrun/common/schemas/constants.py +15 -0
- mlrun/common/schemas/datastore_profile.py +1 -1
- mlrun/common/schemas/feature_store.py +9 -9
- mlrun/common/schemas/frontend_spec.py +4 -4
- mlrun/common/schemas/function.py +10 -10
- mlrun/common/schemas/hub.py +1 -1
- mlrun/common/schemas/k8s.py +3 -3
- mlrun/common/schemas/memory_reports.py +3 -3
- mlrun/common/schemas/model_monitoring/__init__.py +4 -8
- mlrun/common/schemas/model_monitoring/constants.py +127 -46
- mlrun/common/schemas/model_monitoring/grafana.py +18 -12
- mlrun/common/schemas/model_monitoring/model_endpoints.py +154 -160
- mlrun/common/schemas/notification.py +24 -3
- mlrun/common/schemas/object.py +1 -1
- mlrun/common/schemas/pagination.py +4 -4
- mlrun/common/schemas/partition.py +142 -0
- mlrun/common/schemas/pipeline.py +3 -3
- mlrun/common/schemas/project.py +26 -18
- mlrun/common/schemas/runs.py +3 -3
- mlrun/common/schemas/runtime_resource.py +5 -5
- mlrun/common/schemas/schedule.py +1 -1
- mlrun/common/schemas/secret.py +1 -1
- mlrun/{model_monitoring/db/stores/sqldb/__init__.py → common/schemas/serving.py} +10 -1
- mlrun/common/schemas/tag.py +3 -3
- mlrun/common/schemas/workflow.py +6 -5
- mlrun/common/types.py +1 -0
- mlrun/config.py +157 -89
- mlrun/data_types/__init__.py +5 -3
- mlrun/data_types/infer.py +13 -3
- mlrun/data_types/spark.py +2 -1
- mlrun/datastore/__init__.py +59 -18
- mlrun/datastore/alibaba_oss.py +4 -1
- mlrun/datastore/azure_blob.py +4 -1
- mlrun/datastore/base.py +19 -24
- mlrun/datastore/datastore.py +10 -4
- mlrun/datastore/datastore_profile.py +178 -45
- mlrun/datastore/dbfs_store.py +4 -1
- mlrun/datastore/filestore.py +4 -1
- mlrun/datastore/google_cloud_storage.py +4 -1
- mlrun/datastore/hdfs.py +4 -1
- mlrun/datastore/inmem.py +4 -1
- mlrun/datastore/redis.py +4 -1
- mlrun/datastore/s3.py +14 -3
- mlrun/datastore/sources.py +89 -92
- mlrun/datastore/store_resources.py +7 -4
- mlrun/datastore/storeytargets.py +51 -16
- mlrun/datastore/targets.py +38 -31
- mlrun/datastore/utils.py +87 -4
- mlrun/datastore/v3io.py +4 -1
- mlrun/datastore/vectorstore.py +291 -0
- mlrun/datastore/wasbfs/fs.py +13 -12
- mlrun/db/base.py +286 -100
- mlrun/db/httpdb.py +1562 -490
- mlrun/db/nopdb.py +250 -83
- mlrun/errors.py +6 -2
- mlrun/execution.py +194 -50
- mlrun/feature_store/__init__.py +2 -10
- mlrun/feature_store/api.py +20 -458
- mlrun/feature_store/common.py +9 -9
- mlrun/feature_store/feature_set.py +20 -18
- mlrun/feature_store/feature_vector.py +105 -479
- mlrun/feature_store/feature_vector_utils.py +466 -0
- mlrun/feature_store/retrieval/base.py +15 -11
- mlrun/feature_store/retrieval/job.py +2 -1
- mlrun/feature_store/retrieval/storey_merger.py +1 -1
- mlrun/feature_store/steps.py +3 -3
- mlrun/features.py +30 -13
- mlrun/frameworks/__init__.py +1 -2
- mlrun/frameworks/_common/__init__.py +1 -2
- mlrun/frameworks/_common/artifacts_library.py +2 -2
- mlrun/frameworks/_common/mlrun_interface.py +10 -6
- mlrun/frameworks/_common/model_handler.py +31 -31
- mlrun/frameworks/_common/producer.py +3 -1
- mlrun/frameworks/_dl_common/__init__.py +1 -2
- mlrun/frameworks/_dl_common/loggers/__init__.py +1 -2
- mlrun/frameworks/_dl_common/loggers/mlrun_logger.py +4 -4
- mlrun/frameworks/_dl_common/loggers/tensorboard_logger.py +3 -3
- mlrun/frameworks/_ml_common/__init__.py +1 -2
- mlrun/frameworks/_ml_common/loggers/__init__.py +1 -2
- mlrun/frameworks/_ml_common/model_handler.py +21 -21
- mlrun/frameworks/_ml_common/plans/__init__.py +1 -2
- mlrun/frameworks/_ml_common/plans/confusion_matrix_plan.py +3 -1
- mlrun/frameworks/_ml_common/plans/dataset_plan.py +3 -3
- mlrun/frameworks/_ml_common/plans/roc_curve_plan.py +4 -4
- mlrun/frameworks/auto_mlrun/__init__.py +1 -2
- mlrun/frameworks/auto_mlrun/auto_mlrun.py +22 -15
- mlrun/frameworks/huggingface/__init__.py +1 -2
- mlrun/frameworks/huggingface/model_server.py +9 -9
- mlrun/frameworks/lgbm/__init__.py +47 -44
- mlrun/frameworks/lgbm/callbacks/__init__.py +1 -2
- mlrun/frameworks/lgbm/callbacks/logging_callback.py +4 -2
- mlrun/frameworks/lgbm/callbacks/mlrun_logging_callback.py +4 -2
- mlrun/frameworks/lgbm/mlrun_interfaces/__init__.py +1 -2
- mlrun/frameworks/lgbm/mlrun_interfaces/mlrun_interface.py +5 -5
- mlrun/frameworks/lgbm/model_handler.py +15 -11
- mlrun/frameworks/lgbm/model_server.py +11 -7
- mlrun/frameworks/lgbm/utils.py +2 -2
- mlrun/frameworks/onnx/__init__.py +1 -2
- mlrun/frameworks/onnx/dataset.py +3 -3
- mlrun/frameworks/onnx/mlrun_interface.py +2 -2
- mlrun/frameworks/onnx/model_handler.py +7 -5
- mlrun/frameworks/onnx/model_server.py +8 -6
- mlrun/frameworks/parallel_coordinates.py +11 -11
- mlrun/frameworks/pytorch/__init__.py +22 -23
- mlrun/frameworks/pytorch/callbacks/__init__.py +1 -2
- mlrun/frameworks/pytorch/callbacks/callback.py +2 -1
- mlrun/frameworks/pytorch/callbacks/logging_callback.py +15 -8
- mlrun/frameworks/pytorch/callbacks/mlrun_logging_callback.py +19 -12
- mlrun/frameworks/pytorch/callbacks/tensorboard_logging_callback.py +22 -15
- mlrun/frameworks/pytorch/callbacks_handler.py +36 -30
- mlrun/frameworks/pytorch/mlrun_interface.py +17 -17
- mlrun/frameworks/pytorch/model_handler.py +21 -17
- mlrun/frameworks/pytorch/model_server.py +13 -9
- mlrun/frameworks/sklearn/__init__.py +19 -18
- mlrun/frameworks/sklearn/estimator.py +2 -2
- mlrun/frameworks/sklearn/metric.py +3 -3
- mlrun/frameworks/sklearn/metrics_library.py +8 -6
- mlrun/frameworks/sklearn/mlrun_interface.py +3 -2
- mlrun/frameworks/sklearn/model_handler.py +4 -3
- mlrun/frameworks/tf_keras/__init__.py +11 -12
- mlrun/frameworks/tf_keras/callbacks/__init__.py +1 -2
- mlrun/frameworks/tf_keras/callbacks/logging_callback.py +17 -14
- mlrun/frameworks/tf_keras/callbacks/mlrun_logging_callback.py +15 -12
- mlrun/frameworks/tf_keras/callbacks/tensorboard_logging_callback.py +21 -18
- mlrun/frameworks/tf_keras/model_handler.py +17 -13
- mlrun/frameworks/tf_keras/model_server.py +12 -8
- mlrun/frameworks/xgboost/__init__.py +19 -18
- mlrun/frameworks/xgboost/model_handler.py +13 -9
- mlrun/k8s_utils.py +2 -5
- mlrun/launcher/base.py +3 -4
- mlrun/launcher/client.py +2 -2
- mlrun/launcher/local.py +6 -2
- mlrun/launcher/remote.py +1 -1
- mlrun/lists.py +8 -4
- mlrun/model.py +132 -46
- mlrun/model_monitoring/__init__.py +3 -5
- mlrun/model_monitoring/api.py +113 -98
- mlrun/model_monitoring/applications/__init__.py +0 -5
- mlrun/model_monitoring/applications/_application_steps.py +81 -50
- mlrun/model_monitoring/applications/base.py +467 -14
- mlrun/model_monitoring/applications/context.py +212 -134
- mlrun/model_monitoring/{db/stores/base → applications/evidently}/__init__.py +6 -2
- mlrun/model_monitoring/applications/evidently/base.py +146 -0
- mlrun/model_monitoring/applications/histogram_data_drift.py +89 -56
- mlrun/model_monitoring/applications/results.py +67 -15
- mlrun/model_monitoring/controller.py +701 -315
- mlrun/model_monitoring/db/__init__.py +0 -2
- mlrun/model_monitoring/db/_schedules.py +242 -0
- mlrun/model_monitoring/db/_stats.py +189 -0
- mlrun/model_monitoring/db/tsdb/__init__.py +33 -22
- mlrun/model_monitoring/db/tsdb/base.py +243 -49
- mlrun/model_monitoring/db/tsdb/tdengine/schemas.py +76 -36
- mlrun/model_monitoring/db/tsdb/tdengine/stream_graph_steps.py +33 -0
- mlrun/model_monitoring/db/tsdb/tdengine/tdengine_connection.py +213 -0
- mlrun/model_monitoring/db/tsdb/tdengine/tdengine_connector.py +534 -88
- mlrun/model_monitoring/db/tsdb/v3io/stream_graph_steps.py +1 -0
- mlrun/model_monitoring/db/tsdb/v3io/v3io_connector.py +436 -106
- mlrun/model_monitoring/helpers.py +356 -114
- mlrun/model_monitoring/stream_processing.py +190 -345
- mlrun/model_monitoring/tracking_policy.py +11 -4
- mlrun/model_monitoring/writer.py +49 -90
- mlrun/package/__init__.py +3 -6
- mlrun/package/context_handler.py +2 -2
- mlrun/package/packager.py +12 -9
- mlrun/package/packagers/__init__.py +0 -2
- mlrun/package/packagers/default_packager.py +14 -11
- mlrun/package/packagers/numpy_packagers.py +16 -7
- mlrun/package/packagers/pandas_packagers.py +18 -18
- mlrun/package/packagers/python_standard_library_packagers.py +25 -11
- mlrun/package/packagers_manager.py +35 -32
- mlrun/package/utils/__init__.py +0 -3
- mlrun/package/utils/_pickler.py +6 -6
- mlrun/platforms/__init__.py +47 -16
- mlrun/platforms/iguazio.py +4 -1
- mlrun/projects/operations.py +30 -30
- mlrun/projects/pipelines.py +116 -47
- mlrun/projects/project.py +1292 -329
- mlrun/render.py +5 -9
- mlrun/run.py +57 -14
- mlrun/runtimes/__init__.py +1 -3
- mlrun/runtimes/base.py +30 -22
- mlrun/runtimes/daskjob.py +9 -9
- mlrun/runtimes/databricks_job/databricks_runtime.py +6 -5
- mlrun/runtimes/function_reference.py +5 -2
- mlrun/runtimes/generators.py +3 -2
- mlrun/runtimes/kubejob.py +6 -7
- mlrun/runtimes/mounts.py +574 -0
- mlrun/runtimes/mpijob/__init__.py +0 -2
- mlrun/runtimes/mpijob/abstract.py +7 -6
- mlrun/runtimes/nuclio/api_gateway.py +7 -7
- mlrun/runtimes/nuclio/application/application.py +11 -13
- mlrun/runtimes/nuclio/application/reverse_proxy.go +66 -64
- mlrun/runtimes/nuclio/function.py +127 -70
- mlrun/runtimes/nuclio/serving.py +105 -37
- mlrun/runtimes/pod.py +159 -54
- mlrun/runtimes/remotesparkjob.py +3 -2
- mlrun/runtimes/sparkjob/__init__.py +0 -2
- mlrun/runtimes/sparkjob/spark3job.py +22 -12
- mlrun/runtimes/utils.py +7 -6
- mlrun/secrets.py +2 -2
- mlrun/serving/__init__.py +8 -0
- mlrun/serving/merger.py +7 -5
- mlrun/serving/remote.py +35 -22
- mlrun/serving/routers.py +186 -240
- mlrun/serving/server.py +41 -10
- mlrun/serving/states.py +432 -118
- mlrun/serving/utils.py +13 -2
- mlrun/serving/v1_serving.py +3 -2
- mlrun/serving/v2_serving.py +161 -203
- mlrun/track/__init__.py +1 -1
- mlrun/track/tracker.py +2 -2
- mlrun/track/trackers/mlflow_tracker.py +6 -5
- mlrun/utils/async_http.py +35 -22
- mlrun/utils/clones.py +7 -4
- mlrun/utils/helpers.py +511 -58
- mlrun/utils/logger.py +119 -13
- mlrun/utils/notifications/notification/__init__.py +22 -19
- mlrun/utils/notifications/notification/base.py +39 -15
- mlrun/utils/notifications/notification/console.py +6 -6
- mlrun/utils/notifications/notification/git.py +11 -11
- mlrun/utils/notifications/notification/ipython.py +10 -9
- mlrun/utils/notifications/notification/mail.py +176 -0
- mlrun/utils/notifications/notification/slack.py +16 -8
- mlrun/utils/notifications/notification/webhook.py +24 -8
- mlrun/utils/notifications/notification_pusher.py +191 -200
- mlrun/utils/regex.py +12 -2
- mlrun/utils/version/version.json +2 -2
- {mlrun-1.7.2rc4.dist-info → mlrun-1.8.0.dist-info}/METADATA +69 -54
- mlrun-1.8.0.dist-info/RECORD +351 -0
- {mlrun-1.7.2rc4.dist-info → mlrun-1.8.0.dist-info}/WHEEL +1 -1
- mlrun/model_monitoring/applications/evidently_base.py +0 -137
- mlrun/model_monitoring/db/stores/__init__.py +0 -136
- mlrun/model_monitoring/db/stores/base/store.py +0 -213
- mlrun/model_monitoring/db/stores/sqldb/models/__init__.py +0 -71
- mlrun/model_monitoring/db/stores/sqldb/models/base.py +0 -190
- mlrun/model_monitoring/db/stores/sqldb/models/mysql.py +0 -103
- mlrun/model_monitoring/db/stores/sqldb/models/sqlite.py +0 -40
- mlrun/model_monitoring/db/stores/sqldb/sql_store.py +0 -659
- mlrun/model_monitoring/db/stores/v3io_kv/kv_store.py +0 -726
- mlrun/model_monitoring/model_endpoint.py +0 -118
- mlrun-1.7.2rc4.dist-info/RECORD +0 -351
- {mlrun-1.7.2rc4.dist-info → mlrun-1.8.0.dist-info}/entry_points.txt +0 -0
- {mlrun-1.7.2rc4.dist-info → mlrun-1.8.0.dist-info/licenses}/LICENSE +0 -0
- {mlrun-1.7.2rc4.dist-info → mlrun-1.8.0.dist-info}/top_level.txt +0 -0
|
@@ -11,31 +11,42 @@
|
|
|
11
11
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
12
12
|
# See the License for the specific language governing permissions and
|
|
13
13
|
# limitations under the License.
|
|
14
|
+
|
|
15
|
+
import collections
|
|
14
16
|
import concurrent.futures
|
|
15
17
|
import datetime
|
|
16
18
|
import json
|
|
17
19
|
import os
|
|
18
|
-
import
|
|
20
|
+
import traceback
|
|
21
|
+
from collections import OrderedDict
|
|
19
22
|
from collections.abc import Iterator
|
|
20
|
-
from
|
|
23
|
+
from contextlib import AbstractContextManager
|
|
24
|
+
from types import TracebackType
|
|
25
|
+
from typing import Any, NamedTuple, Optional, Union, cast
|
|
21
26
|
|
|
22
|
-
import
|
|
27
|
+
import nuclio_sdk
|
|
28
|
+
import pandas as pd
|
|
23
29
|
|
|
24
30
|
import mlrun
|
|
25
31
|
import mlrun.common.schemas.model_monitoring.constants as mm_constants
|
|
26
|
-
import mlrun.data_types.infer
|
|
27
32
|
import mlrun.feature_store as fstore
|
|
28
|
-
import mlrun.model_monitoring
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
from mlrun.
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
33
|
+
import mlrun.model_monitoring
|
|
34
|
+
import mlrun.model_monitoring.db._schedules as schedules
|
|
35
|
+
import mlrun.model_monitoring.helpers
|
|
36
|
+
import mlrun.platforms.iguazio
|
|
37
|
+
from mlrun.common.schemas import EndpointType
|
|
38
|
+
from mlrun.common.schemas.model_monitoring.constants import (
|
|
39
|
+
ControllerEvent,
|
|
40
|
+
ControllerEventEndpointPolicy,
|
|
41
|
+
ControllerEventKind,
|
|
36
42
|
)
|
|
43
|
+
from mlrun.errors import err_to_str
|
|
44
|
+
from mlrun.model_monitoring.helpers import batch_dict2timedelta
|
|
37
45
|
from mlrun.utils import datetime_now, logger
|
|
38
46
|
|
|
47
|
+
_SECONDS_IN_DAY = int(datetime.timedelta(days=1).total_seconds())
|
|
48
|
+
_SECONDS_IN_MINUTE = 60
|
|
49
|
+
|
|
39
50
|
|
|
40
51
|
class _Interval(NamedTuple):
|
|
41
52
|
start: datetime.datetime
|
|
@@ -45,12 +56,12 @@ class _Interval(NamedTuple):
|
|
|
45
56
|
class _BatchWindow:
|
|
46
57
|
def __init__(
|
|
47
58
|
self,
|
|
48
|
-
|
|
49
|
-
|
|
59
|
+
*,
|
|
60
|
+
schedules_file: schedules.ModelMonitoringSchedulesFileEndpoint,
|
|
50
61
|
application: str,
|
|
51
62
|
timedelta_seconds: int,
|
|
52
|
-
last_updated:
|
|
53
|
-
first_request:
|
|
63
|
+
last_updated: int,
|
|
64
|
+
first_request: int,
|
|
54
65
|
) -> None:
|
|
55
66
|
"""
|
|
56
67
|
Initialize a batch window object that handles the batch interval time range
|
|
@@ -58,159 +69,133 @@ class _BatchWindow:
|
|
|
58
69
|
All the time values are in seconds.
|
|
59
70
|
The start and stop time are in seconds since the epoch.
|
|
60
71
|
"""
|
|
61
|
-
self.project = project
|
|
62
|
-
self._endpoint = endpoint
|
|
63
72
|
self._application = application
|
|
64
73
|
self._first_request = first_request
|
|
65
74
|
self._stop = last_updated
|
|
66
75
|
self._step = timedelta_seconds
|
|
67
|
-
self._db =
|
|
76
|
+
self._db = schedules_file
|
|
68
77
|
self._start = self._get_last_analyzed()
|
|
69
78
|
|
|
70
|
-
def
|
|
71
|
-
|
|
72
|
-
last_analyzed = self._db.get_last_analyzed(
|
|
73
|
-
endpoint_id=self._endpoint,
|
|
74
|
-
application_name=self._application,
|
|
75
|
-
)
|
|
76
|
-
except mlrun.errors.MLRunNotFoundError:
|
|
77
|
-
logger.info(
|
|
78
|
-
"No last analyzed time was found for this endpoint and "
|
|
79
|
-
"application, as this is probably the first time this "
|
|
80
|
-
"application is running. Using the latest between first "
|
|
81
|
-
"request time or last update time minus one day instead",
|
|
82
|
-
endpoint=self._endpoint,
|
|
83
|
-
application=self._application,
|
|
84
|
-
first_request=self._first_request,
|
|
85
|
-
last_updated=self._stop,
|
|
86
|
-
)
|
|
79
|
+
def _get_saved_last_analyzed(self) -> Optional[int]:
|
|
80
|
+
return cast(int, self._db.get_application_time(self._application))
|
|
87
81
|
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
int(datetime.timedelta(days=1).total_seconds()), self._step
|
|
92
|
-
) # max between one day and the base period
|
|
93
|
-
return max(
|
|
94
|
-
self._first_request,
|
|
95
|
-
self._stop - first_period_in_seconds,
|
|
96
|
-
)
|
|
97
|
-
return self._first_request
|
|
98
|
-
|
|
99
|
-
logger.info(
|
|
100
|
-
"Got the last analyzed time for this endpoint and application",
|
|
101
|
-
endpoint=self._endpoint,
|
|
102
|
-
application=self._application,
|
|
103
|
-
last_analyzed=last_analyzed,
|
|
82
|
+
def _update_last_analyzed(self, last_analyzed: int) -> None:
|
|
83
|
+
self._db.update_application_time(
|
|
84
|
+
application=self._application, timestamp=last_analyzed
|
|
104
85
|
)
|
|
105
|
-
return last_analyzed
|
|
106
86
|
|
|
107
|
-
def
|
|
87
|
+
def _get_initial_last_analyzed(self) -> int:
|
|
108
88
|
logger.info(
|
|
109
|
-
"
|
|
110
|
-
|
|
89
|
+
"No last analyzed time was found for this endpoint and application, as this is "
|
|
90
|
+
"probably the first time this application is running. Initializing last analyzed "
|
|
91
|
+
"to the latest between first request time or last update time minus one day",
|
|
111
92
|
application=self._application,
|
|
112
|
-
|
|
93
|
+
first_request=self._first_request,
|
|
94
|
+
last_updated=self._stop,
|
|
113
95
|
)
|
|
114
|
-
|
|
115
|
-
self.
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
|
|
96
|
+
# max between one day and the base period
|
|
97
|
+
first_period_in_seconds = max(_SECONDS_IN_DAY, self._step)
|
|
98
|
+
return max(
|
|
99
|
+
self._first_request,
|
|
100
|
+
self._stop - first_period_in_seconds,
|
|
119
101
|
)
|
|
120
102
|
|
|
121
|
-
def
|
|
122
|
-
self
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
if self._start is not None and self._stop is not None:
|
|
126
|
-
entered = False
|
|
127
|
-
# Iterate timestamp from start until timestamp <= stop - step
|
|
128
|
-
# so that the last interval will end at (timestamp + step) <= stop.
|
|
129
|
-
# Add 1 to stop - step to get <= and not <.
|
|
130
|
-
for timestamp in range(
|
|
131
|
-
self._start, self._stop - self._step + 1, self._step
|
|
132
|
-
):
|
|
133
|
-
entered = True
|
|
134
|
-
start_time = datetime.datetime.fromtimestamp(
|
|
135
|
-
timestamp, tz=datetime.timezone.utc
|
|
136
|
-
)
|
|
137
|
-
end_time = datetime.datetime.fromtimestamp(
|
|
138
|
-
timestamp + self._step, tz=datetime.timezone.utc
|
|
139
|
-
)
|
|
140
|
-
yield _Interval(start_time, end_time)
|
|
141
|
-
self._update_last_analyzed(timestamp + self._step)
|
|
142
|
-
if not entered:
|
|
143
|
-
logger.info(
|
|
144
|
-
"All the data is set, but no complete intervals were found. "
|
|
145
|
-
"Wait for last_updated to be updated",
|
|
146
|
-
endpoint=self._endpoint,
|
|
147
|
-
application=self._application,
|
|
148
|
-
start=self._start,
|
|
149
|
-
stop=self._stop,
|
|
150
|
-
step=self._step,
|
|
151
|
-
)
|
|
103
|
+
def _get_last_analyzed(self) -> int:
|
|
104
|
+
saved_last_analyzed = self._get_saved_last_analyzed()
|
|
105
|
+
if saved_last_analyzed is not None:
|
|
106
|
+
return saved_last_analyzed
|
|
152
107
|
else:
|
|
153
|
-
|
|
154
|
-
|
|
155
|
-
|
|
156
|
-
|
|
108
|
+
last_analyzed = self._get_initial_last_analyzed()
|
|
109
|
+
# Update the in-memory DB to avoid duplicate initializations
|
|
110
|
+
self._update_last_analyzed(last_analyzed)
|
|
111
|
+
return last_analyzed
|
|
112
|
+
|
|
113
|
+
def get_intervals(self) -> Iterator[_Interval]:
|
|
114
|
+
"""Generate the batch interval time ranges."""
|
|
115
|
+
entered = False
|
|
116
|
+
# Iterate timestamp from start until timestamp <= stop - step
|
|
117
|
+
# so that the last interval will end at (timestamp + step) <= stop.
|
|
118
|
+
# Add 1 to stop - step to get <= and not <.
|
|
119
|
+
for timestamp in range(self._start, self._stop - self._step + 1, self._step):
|
|
120
|
+
entered = True
|
|
121
|
+
start_time = datetime.datetime.fromtimestamp(
|
|
122
|
+
timestamp, tz=datetime.timezone.utc
|
|
123
|
+
)
|
|
124
|
+
end_time = datetime.datetime.fromtimestamp(
|
|
125
|
+
timestamp + self._step, tz=datetime.timezone.utc
|
|
126
|
+
)
|
|
127
|
+
yield _Interval(start_time, end_time)
|
|
128
|
+
|
|
129
|
+
last_analyzed = timestamp + self._step
|
|
130
|
+
self._update_last_analyzed(last_analyzed)
|
|
131
|
+
logger.debug(
|
|
132
|
+
"Updated the last analyzed time for this endpoint and application",
|
|
133
|
+
application=self._application,
|
|
134
|
+
last_analyzed=last_analyzed,
|
|
135
|
+
)
|
|
136
|
+
|
|
137
|
+
if not entered:
|
|
138
|
+
logger.debug(
|
|
139
|
+
"All the data is set, but no complete intervals were found. "
|
|
140
|
+
"Wait for last_updated to be updated",
|
|
157
141
|
application=self._application,
|
|
158
142
|
start=self._start,
|
|
159
143
|
stop=self._stop,
|
|
144
|
+
step=self._step,
|
|
160
145
|
)
|
|
161
146
|
|
|
162
147
|
|
|
163
|
-
class _BatchWindowGenerator:
|
|
164
|
-
def __init__(
|
|
148
|
+
class _BatchWindowGenerator(AbstractContextManager):
|
|
149
|
+
def __init__(
|
|
150
|
+
self, project: str, endpoint_id: str, window_length: Optional[int] = None
|
|
151
|
+
) -> None:
|
|
165
152
|
"""
|
|
166
153
|
Initialize a batch window generator object that generates batch window objects
|
|
167
154
|
for the monitoring functions.
|
|
168
155
|
"""
|
|
169
|
-
self.
|
|
170
|
-
self.
|
|
171
|
-
self.
|
|
172
|
-
|
|
173
|
-
|
|
174
|
-
|
|
175
|
-
# Convert batch dict string into a dictionary
|
|
176
|
-
if isinstance(self._batch_dict, str):
|
|
177
|
-
self._parse_batch_dict_str()
|
|
178
|
-
|
|
179
|
-
def _parse_batch_dict_str(self) -> None:
|
|
180
|
-
"""Convert batch dictionary string into a valid dictionary"""
|
|
181
|
-
characters_to_remove = "{} "
|
|
182
|
-
pattern = "[" + characters_to_remove + "]"
|
|
183
|
-
# Remove unnecessary characters from the provided string
|
|
184
|
-
batch_list = re.sub(pattern, "", self._batch_dict).split(",")
|
|
185
|
-
# Initialize the dictionary of batch interval ranges
|
|
186
|
-
self._batch_dict = {}
|
|
187
|
-
for pair in batch_list:
|
|
188
|
-
pair_list = pair.split(":")
|
|
189
|
-
self._batch_dict[pair_list[0]] = float(pair_list[1])
|
|
190
|
-
|
|
191
|
-
def _get_timedelta(self) -> int:
|
|
192
|
-
"""Get the timedelta in seconds from the batch dictionary"""
|
|
193
|
-
return int(
|
|
194
|
-
batch_dict2timedelta(cast(_BatchDict, self._batch_dict)).total_seconds()
|
|
156
|
+
self.batch_window: _BatchWindow = None
|
|
157
|
+
self._project = project
|
|
158
|
+
self._endpoint_id = endpoint_id
|
|
159
|
+
self._timedelta = window_length
|
|
160
|
+
self._schedules_file = schedules.ModelMonitoringSchedulesFileEndpoint(
|
|
161
|
+
project=project, endpoint_id=endpoint_id
|
|
195
162
|
)
|
|
196
163
|
|
|
164
|
+
def __enter__(self) -> "_BatchWindowGenerator":
|
|
165
|
+
self._schedules_file.__enter__()
|
|
166
|
+
return super().__enter__()
|
|
167
|
+
|
|
168
|
+
def __exit__(
|
|
169
|
+
self,
|
|
170
|
+
exc_type: Optional[type[BaseException]],
|
|
171
|
+
exc_value: Optional[BaseException],
|
|
172
|
+
traceback: Optional[TracebackType],
|
|
173
|
+
) -> Optional[bool]:
|
|
174
|
+
self._schedules_file.__exit__(
|
|
175
|
+
exc_type=exc_type, exc_value=exc_value, traceback=traceback
|
|
176
|
+
)
|
|
177
|
+
|
|
178
|
+
def get_application_list(self) -> set[str]:
|
|
179
|
+
return self._schedules_file.get_application_list()
|
|
180
|
+
|
|
181
|
+
def get_min_last_analyzed(self) -> Optional[int]:
|
|
182
|
+
return self._schedules_file.get_min_timestamp()
|
|
183
|
+
|
|
197
184
|
@classmethod
|
|
198
185
|
def _get_last_updated_time(
|
|
199
|
-
cls, last_request:
|
|
200
|
-
) ->
|
|
186
|
+
cls, last_request: datetime.datetime, not_batch_endpoint: bool
|
|
187
|
+
) -> int:
|
|
201
188
|
"""
|
|
202
189
|
Get the last updated time of a model endpoint.
|
|
203
190
|
"""
|
|
204
|
-
if not last_request:
|
|
205
|
-
return None
|
|
206
191
|
last_updated = int(
|
|
207
|
-
|
|
192
|
+
last_request.timestamp()
|
|
208
193
|
- cast(
|
|
209
194
|
float,
|
|
210
195
|
mlrun.mlconf.model_endpoint_monitoring.parquet_batching_timeout_secs,
|
|
211
196
|
)
|
|
212
197
|
)
|
|
213
|
-
if not
|
|
198
|
+
if not not_batch_endpoint:
|
|
214
199
|
# If the endpoint does not have a stream, `last_updated` should be
|
|
215
200
|
# the minimum between the current time and the last updated time.
|
|
216
201
|
# This compensates for the bumping mechanism - see
|
|
@@ -221,45 +206,38 @@ class _BatchWindowGenerator:
|
|
|
221
206
|
)
|
|
222
207
|
return last_updated
|
|
223
208
|
|
|
224
|
-
|
|
225
|
-
def _normalize_first_request(
|
|
226
|
-
cls, first_request: Optional[str], endpoint: str
|
|
227
|
-
) -> Optional[int]:
|
|
228
|
-
if not first_request:
|
|
229
|
-
logger.debug(
|
|
230
|
-
"There is no first request time for this endpoint.",
|
|
231
|
-
endpoint=endpoint,
|
|
232
|
-
first_request=first_request,
|
|
233
|
-
)
|
|
234
|
-
return None
|
|
235
|
-
return cls._date_string2timestamp(first_request)
|
|
236
|
-
|
|
237
|
-
@staticmethod
|
|
238
|
-
def _date_string2timestamp(date_string: str) -> int:
|
|
239
|
-
return int(datetime.datetime.fromisoformat(date_string).timestamp())
|
|
240
|
-
|
|
241
|
-
def get_batch_window(
|
|
209
|
+
def get_intervals(
|
|
242
210
|
self,
|
|
243
|
-
|
|
244
|
-
endpoint: str,
|
|
211
|
+
*,
|
|
245
212
|
application: str,
|
|
246
|
-
first_request:
|
|
247
|
-
last_request:
|
|
248
|
-
|
|
249
|
-
) ->
|
|
213
|
+
first_request: datetime.datetime,
|
|
214
|
+
last_request: datetime.datetime,
|
|
215
|
+
not_batch_endpoint: bool,
|
|
216
|
+
) -> Iterator[_Interval]:
|
|
250
217
|
"""
|
|
251
218
|
Get the batch window for a specific endpoint and application.
|
|
252
|
-
first_request
|
|
219
|
+
`first_request` and `last_request` are the timestamps of the first request and last
|
|
220
|
+
request to the endpoint, respectively. They are guaranteed to be nonempty at this point.
|
|
253
221
|
"""
|
|
254
|
-
|
|
255
|
-
|
|
256
|
-
project=project,
|
|
257
|
-
endpoint=endpoint,
|
|
222
|
+
self.batch_window = _BatchWindow(
|
|
223
|
+
schedules_file=self._schedules_file,
|
|
258
224
|
application=application,
|
|
259
225
|
timedelta_seconds=self._timedelta,
|
|
260
|
-
last_updated=self._get_last_updated_time(last_request,
|
|
261
|
-
first_request=
|
|
226
|
+
last_updated=self._get_last_updated_time(last_request, not_batch_endpoint),
|
|
227
|
+
first_request=int(first_request.timestamp()),
|
|
262
228
|
)
|
|
229
|
+
yield from self.batch_window.get_intervals()
|
|
230
|
+
|
|
231
|
+
|
|
232
|
+
def _get_window_length() -> int:
|
|
233
|
+
"""Get the timedelta in seconds from the batch dictionary"""
|
|
234
|
+
return int(
|
|
235
|
+
batch_dict2timedelta(
|
|
236
|
+
json.loads(
|
|
237
|
+
cast(str, os.getenv(mm_constants.EventFieldType.BATCH_INTERVALS_DICT))
|
|
238
|
+
)
|
|
239
|
+
).total_seconds()
|
|
240
|
+
)
|
|
263
241
|
|
|
264
242
|
|
|
265
243
|
class MonitoringApplicationController:
|
|
@@ -269,27 +247,79 @@ class MonitoringApplicationController:
|
|
|
269
247
|
Note that the MonitoringApplicationController object requires access keys along with valid project configurations.
|
|
270
248
|
"""
|
|
271
249
|
|
|
250
|
+
_MAX_FEATURE_SET_PER_WORKER = 1000
|
|
251
|
+
|
|
272
252
|
def __init__(self) -> None:
|
|
273
253
|
"""Initialize Monitoring Application Controller"""
|
|
274
254
|
self.project = cast(str, mlrun.mlconf.default_project)
|
|
275
|
-
self.project_obj = mlrun.
|
|
276
|
-
|
|
255
|
+
self.project_obj = mlrun.get_run_db().get_project(name=self.project)
|
|
277
256
|
logger.debug(f"Initializing {self.__class__.__name__}", project=self.project)
|
|
278
257
|
|
|
279
|
-
self.
|
|
258
|
+
self._window_length = _get_window_length()
|
|
280
259
|
|
|
281
|
-
self.
|
|
282
|
-
|
|
283
|
-
|
|
284
|
-
|
|
285
|
-
|
|
286
|
-
|
|
260
|
+
self.model_monitoring_access_key = self._get_model_monitoring_access_key()
|
|
261
|
+
self.v3io_access_key = mlrun.mlconf.get_v3io_access_key()
|
|
262
|
+
store, _, _ = mlrun.store_manager.get_or_create_store(
|
|
263
|
+
mlrun.mlconf.artifact_path
|
|
264
|
+
)
|
|
265
|
+
self.storage_options = store.get_storage_options()
|
|
266
|
+
self._controller_stream: Optional[
|
|
267
|
+
Union[
|
|
268
|
+
mlrun.platforms.iguazio.OutputStream,
|
|
269
|
+
mlrun.platforms.iguazio.KafkaOutputStream,
|
|
270
|
+
]
|
|
271
|
+
] = None
|
|
272
|
+
self._model_monitoring_stream: Optional[
|
|
273
|
+
Union[
|
|
274
|
+
mlrun.platforms.iguazio.OutputStream,
|
|
275
|
+
mlrun.platforms.iguazio.KafkaOutputStream,
|
|
276
|
+
]
|
|
277
|
+
] = None
|
|
278
|
+
self.applications_streams: dict[
|
|
279
|
+
str,
|
|
280
|
+
Union[
|
|
281
|
+
mlrun.platforms.iguazio.OutputStream,
|
|
282
|
+
mlrun.platforms.iguazio.KafkaOutputStream,
|
|
283
|
+
],
|
|
284
|
+
] = {}
|
|
285
|
+
self.feature_sets: OrderedDict[str, mlrun.feature_store.FeatureSet] = (
|
|
286
|
+
collections.OrderedDict()
|
|
287
|
+
)
|
|
288
|
+
self.tsdb_connector = mlrun.model_monitoring.get_tsdb_connector(
|
|
289
|
+
project=self.project
|
|
287
290
|
)
|
|
288
291
|
|
|
289
|
-
|
|
290
|
-
|
|
291
|
-
|
|
292
|
-
|
|
292
|
+
@property
|
|
293
|
+
def controller_stream(
|
|
294
|
+
self,
|
|
295
|
+
) -> Union[
|
|
296
|
+
mlrun.platforms.iguazio.OutputStream,
|
|
297
|
+
mlrun.platforms.iguazio.KafkaOutputStream,
|
|
298
|
+
]:
|
|
299
|
+
if self._controller_stream is None:
|
|
300
|
+
self._controller_stream = mlrun.model_monitoring.helpers.get_output_stream(
|
|
301
|
+
project=self.project,
|
|
302
|
+
function_name=mm_constants.MonitoringFunctionNames.APPLICATION_CONTROLLER,
|
|
303
|
+
v3io_access_key=self.v3io_access_key,
|
|
304
|
+
)
|
|
305
|
+
return self._controller_stream
|
|
306
|
+
|
|
307
|
+
@property
|
|
308
|
+
def model_monitoring_stream(
|
|
309
|
+
self,
|
|
310
|
+
) -> Union[
|
|
311
|
+
mlrun.platforms.iguazio.OutputStream,
|
|
312
|
+
mlrun.platforms.iguazio.KafkaOutputStream,
|
|
313
|
+
]:
|
|
314
|
+
if self._model_monitoring_stream is None:
|
|
315
|
+
self._model_monitoring_stream = (
|
|
316
|
+
mlrun.model_monitoring.helpers.get_output_stream(
|
|
317
|
+
project=self.project,
|
|
318
|
+
function_name=mm_constants.MonitoringFunctionNames.STREAM,
|
|
319
|
+
v3io_access_key=self.model_monitoring_access_key,
|
|
320
|
+
)
|
|
321
|
+
)
|
|
322
|
+
return self._model_monitoring_stream
|
|
293
323
|
|
|
294
324
|
@staticmethod
|
|
295
325
|
def _get_model_monitoring_access_key() -> Optional[str]:
|
|
@@ -299,168 +329,302 @@ class MonitoringApplicationController:
|
|
|
299
329
|
access_key = mlrun.mlconf.get_v3io_access_key()
|
|
300
330
|
return access_key
|
|
301
331
|
|
|
302
|
-
def
|
|
332
|
+
def _should_monitor_endpoint(
    self,
    endpoint: mlrun.common.schemas.ModelEndpoint,
    application_names: set,
    base_period_minutes: int,
    schedules_file: schedules.ModelMonitoringSchedulesFileChief,
) -> bool:
    """
    Decide whether a regular event should be pushed for the given model endpoint.

    The endpoint is a candidate only if all of the following hold:
    1. its monitoring mode is enabled
    2. it has a first request
    3. it has a last request
    4. its endpoint type is not ROUTER

    A candidate endpoint is monitored when either:
    1. there are applications to run and the application set differs from the one recorded by the batch
       window generator, or no application has analyzed the endpoint yet (min last analyzed is empty), or
    2. the NOP-event condition holds and the (last request, min last analyzed) pair differs from the
       pair recorded in the chief schedules file. In this case the pair is written to the schedules file.

    :param endpoint:            The model endpoint to check.
    :param application_names:   Names of the monitoring applications to run.
    :param base_period_minutes: The base period, in minutes.
    :param schedules_file:      The chief schedules file holding the values sent in previous events.

    :return: True if a regular event should be pushed for the endpoint, False otherwise.
    """
    last_timestamp_sent = schedules_file.get_endpoint_last_request(
        endpoint.metadata.uid
    )
    last_analyzed_sent = schedules_file.get_endpoint_last_analyzed(
        endpoint.metadata.uid
    )
    logger.debug(
        "Chief should monitor endpoint check",
        last_timestamp_sent=last_timestamp_sent,
        last_analyzed_sent=last_analyzed_sent,
        uid=endpoint.metadata.uid,
    )

    is_candidate = (
        # The model endpoint is monitored
        endpoint.status.monitoring_mode == mm_constants.ModelMonitoringMode.enabled
        # The model endpoint was called, i.e. the first and last requests are nonempty
        and endpoint.status.first_request
        and endpoint.status.last_request
        # Router endpoints have no feature stats, so they are excluded
        and endpoint.metadata.endpoint_type.value
        != mm_constants.EndpointType.ROUTER.value
    )
    if not is_candidate:
        logger.info(
            "Should not monitor model endpoint, didn't push regular event",
            endpoint_id=endpoint.metadata.uid,
            endpoint_name=endpoint.metadata.name,
            last_request=endpoint.status.last_request,
            first_request=endpoint.status.first_request,
            endpoint_type=endpoint.metadata.endpoint_type,
            feature_set_uri=endpoint.spec.monitoring_feature_set_uri,
        )
        return False

    with _BatchWindowGenerator(
        project=endpoint.metadata.project,
        endpoint_id=endpoint.metadata.uid,
    ) as batch_window_generator:
        now = mlrun.utils.datetime_now()
        min_last_analyzed = batch_window_generator.get_min_last_analyzed()

        # Different application names, or last analyzed never updated while there are
        # applications to monitor
        if application_names and (
            application_names != batch_window_generator.get_application_list()
            or not min_last_analyzed
        ):
            return True

        # A NOP event would be sent to close the relevant window, and this
        # (last_request, last_analyzed) combination was not sent before
        if self._should_send_nop_event(
            base_period_minutes, min_last_analyzed, now
        ) and (
            int(endpoint.status.last_request.timestamp()) != last_timestamp_sent
            or min_last_analyzed != last_analyzed_sent
        ):
            # Record in the chief schedules file the values this event is pushed with
            schedules_file.update_endpoint_timestamps(
                endpoint_uid=endpoint.metadata.uid,
                last_request=int(endpoint.status.last_request.timestamp()),
                last_analyzed=min_last_analyzed,
            )
            return True

        logger.info(
            "All the possible intervals were already analyzed, didn't push regular event",
            endpoint_id=endpoint.metadata.uid,
            last_analyzed=min_last_analyzed,
            last_request=endpoint.status.last_request,
        )
    return False
|
|
345
427
|
|
|
428
|
+
@staticmethod
|
|
429
|
+
def _should_send_nop_event(
|
|
430
|
+
base_period_minutes: int,
|
|
431
|
+
min_last_analyzed: int,
|
|
432
|
+
current_time: datetime.datetime,
|
|
433
|
+
):
|
|
434
|
+
if min_last_analyzed:
|
|
435
|
+
return (
|
|
436
|
+
current_time.timestamp() - min_last_analyzed
|
|
437
|
+
>= datetime.timedelta(minutes=base_period_minutes).total_seconds()
|
|
438
|
+
+ mlrun.mlconf.model_endpoint_monitoring.parquet_batching_timeout_secs
|
|
439
|
+
)
|
|
440
|
+
else:
|
|
441
|
+
return True
|
|
442
|
+
|
|
443
|
+
def run(self, event: nuclio_sdk.Event) -> None:
    """
    Main method of the controller worker: handle one controller stream event.

    The event body is decoded as UTF-8 JSON and passed to ``model_endpoint_process``, which:
    1. Reads the applications from the event (endpoint_policy)
    2. Checks the model monitoring windows
    3. Sends data to the applications
    4. Pushes a NOP event to the main stream if needed

    If the body cannot be decoded, the failure is logged and the event is dropped.

    :param event: The Nuclio event that triggered the worker.
    """
    logger.info("Start running monitoring controller worker")
    try:
        payload = json.loads(event.body.decode("utf-8"))
    except Exception as exc:
        logger.error(
            "Failed to decode event",
            exc=err_to_str(exc),
        )
    else:
        # Run single endpoint process
        self.model_endpoint_process(event=payload)
|
|
381
464
|
|
|
382
|
-
@classmethod
|
|
383
465
|
def model_endpoint_process(
    self,
    event: Optional[dict] = None,
) -> None:
    """
    Process a single model endpoint event and trigger its monitoring applications.

    For every application listed in the event's endpoint policy, iterate over the intervals produced by
    the batch window generator, check whether the endpoint has data in each interval, and push the
    interval to the application stream when it does. Afterwards, if the NOP-event condition holds and
    the incoming event is not itself a NOP event, push a NOP event to the model monitoring stream.

    Any exception raised while processing is logged and swallowed.

    :param event: (dict) Event that triggered the monitoring process.
    """
    logger.info("Model endpoint process started", event=event)

    try:
        project_name = event[ControllerEvent.PROJECT]
        endpoint_id = event[ControllerEvent.ENDPOINT_ID]
        endpoint_name = event[ControllerEvent.ENDPOINT_NAME]
        endpoint_policy = event[ControllerEvent.ENDPOINT_POLICY]
        applications_names = endpoint_policy[
            ControllerEventEndpointPolicy.MONITORING_APPLICATIONS
        ]

        not_batch_endpoint = (
            event[ControllerEvent.ENDPOINT_TYPE] != EndpointType.BATCH_EP
        )

        logger.info(
            "Starting analyzing for", timestamp=event[ControllerEvent.TIMESTAMP]
        )
        last_stream_timestamp = datetime.datetime.fromisoformat(
            event[ControllerEvent.TIMESTAMP]
        )
        first_request = datetime.datetime.fromisoformat(
            event[ControllerEvent.FIRST_REQUEST]
        )
        with _BatchWindowGenerator(
            project=project_name,
            endpoint_id=endpoint_id,
            window_length=self._window_length,
        ) as batch_window_generator:
            for application in applications_names:
                for (
                    start_infer_time,
                    end_infer_time,
                ) in batch_window_generator.get_intervals(
                    application=application,
                    not_batch_endpoint=not_batch_endpoint,
                    first_request=first_request,
                    last_request=last_stream_timestamp,
                ):
                    if not self._window_has_data(
                        event=event,
                        endpoint_id=endpoint_id,
                        not_batch_endpoint=not_batch_endpoint,
                        start_infer_time=start_infer_time,
                        end_infer_time=end_infer_time,
                    ):
                        logger.info(
                            "No data found for the given interval",
                            start=start_infer_time,
                            end=end_infer_time,
                            endpoint_id=endpoint_id,
                        )
                        continue

                    logger.info(
                        "Data found for the given interval",
                        start=start_infer_time,
                        end=end_infer_time,
                        endpoint_id=endpoint_id,
                    )
                    self._push_to_applications(
                        start_infer_time=start_infer_time,
                        end_infer_time=end_infer_time,
                        endpoint_id=endpoint_id,
                        endpoint_name=endpoint_name,
                        project=project_name,
                        applications_names=[application],
                        model_monitoring_access_key=self.model_monitoring_access_key,
                        endpoint_updated=endpoint_policy[
                            ControllerEventEndpointPolicy.ENDPOINT_UPDATED
                        ],
                    )

            base_period = endpoint_policy[ControllerEventEndpointPolicy.BASE_PERIOD]
            current_time = mlrun.utils.datetime_now()
            if (
                self._should_send_nop_event(
                    base_period,
                    batch_window_generator.get_min_last_analyzed(),
                    current_time,
                )
                and event[ControllerEvent.KIND] != ControllerEventKind.NOP_EVENT
            ):
                # Built under its own name so the incoming event is not shadowed: the
                # closing log and the error handler below keep referring to the event
                # that is actually being processed.
                nop_event = {
                    ControllerEvent.KIND: mm_constants.ControllerEventKind.NOP_EVENT,
                    ControllerEvent.PROJECT: project_name,
                    ControllerEvent.ENDPOINT_ID: endpoint_id,
                    ControllerEvent.ENDPOINT_NAME: endpoint_name,
                    ControllerEvent.TIMESTAMP: current_time.isoformat(
                        timespec="microseconds"
                    ),
                    ControllerEvent.ENDPOINT_POLICY: endpoint_policy,
                    ControllerEvent.ENDPOINT_TYPE: event[
                        ControllerEvent.ENDPOINT_TYPE
                    ],
                    ControllerEvent.FEATURE_SET_URI: event[
                        ControllerEvent.FEATURE_SET_URI
                    ],
                    ControllerEvent.FIRST_REQUEST: event[
                        ControllerEvent.FIRST_REQUEST
                    ],
                }
                self._push_to_main_stream(
                    event=nop_event,
                    endpoint_id=endpoint_id,
                )
        logger.info(
            "Finish analyze for", timestamp=event[ControllerEvent.TIMESTAMP]
        )

    except Exception:
        # `event` may be None (the declared default), not a dict, or missing the key;
        # indexing it here would raise a second exception from inside the handler and
        # hide the original failure.
        logger.exception(
            "Encountered an exception",
            endpoint_id=(
                event.get(ControllerEvent.ENDPOINT_ID)
                if isinstance(event, dict)
                else None
            ),
        )

def _window_has_data(
    self,
    event: dict,
    endpoint_id: str,
    not_batch_endpoint: bool,
    start_infer_time: datetime.datetime,
    end_infer_time: datetime.datetime,
) -> bool:
    """
    Check whether the endpoint has data between ``start_infer_time`` and ``end_infer_time``.

    :param event:              The controller event; its feature set URI is used for batch endpoints.
    :param endpoint_id:        The model endpoint ID.
    :param not_batch_endpoint: True for a serving endpoint, False for a batch endpoint.
    :param start_infer_time:   The start of the interval.
    :param end_infer_time:     The end of the interval.

    :return: True if data was found in the interval, False otherwise.
    """
    if not_batch_endpoint:
        # Serving endpoint - get the relevant window data from the TSDB
        prediction_metric = self.tsdb_connector.read_predictions(
            start=start_infer_time,
            end=end_infer_time,
            endpoint_id=endpoint_id,
        )
        return bool(prediction_metric.data)

    m_fs = self.feature_sets.get(endpoint_id)
    if m_fs is None:
        m_fs = fstore.get_feature_set(event[ControllerEvent.FEATURE_SET_URI])
        # New entries are placed at the front; when the per-worker limit is exceeded,
        # the entry at the back is dropped.
        self.feature_sets[endpoint_id] = m_fs
        self.feature_sets.move_to_end(endpoint_id, last=False)
        if len(self.feature_sets) > self._MAX_FEATURE_SET_PER_WORKER:
            self.feature_sets.popitem(last=True)

    # Batch endpoint - get the relevant window data from the parquet target
    df = m_fs.to_dataframe(
        start_time=start_infer_time,
        end_time=end_infer_time,
        time_column=mm_constants.EventFieldType.TIMESTAMP,
        storage_options=self.storage_options,
    )
    return len(df) > 0
|
|
455
617
|
|
|
456
|
-
@staticmethod
|
|
457
618
|
def _push_to_applications(
|
|
619
|
+
self,
|
|
458
620
|
start_infer_time: datetime.datetime,
|
|
459
621
|
end_infer_time: datetime.datetime,
|
|
460
622
|
endpoint_id: str,
|
|
623
|
+
endpoint_name: str,
|
|
461
624
|
project: str,
|
|
462
625
|
applications_names: list[str],
|
|
463
626
|
model_monitoring_access_key: str,
|
|
627
|
+
endpoint_updated: str,
|
|
464
628
|
):
|
|
465
629
|
"""
|
|
466
630
|
Pushes data to multiple stream applications.
|
|
@@ -471,7 +635,7 @@ class MonitoringApplicationController:
|
|
|
471
635
|
:param project: mlrun Project name.
|
|
472
636
|
:param applications_names: List of application names to which data will be pushed.
|
|
473
637
|
:param model_monitoring_access_key: Access key to apply the model monitoring process.
|
|
474
|
-
|
|
638
|
+
:param endpoint_updated: ISO-format string of the timestamp at which the model endpoint was updated
|
|
475
639
|
"""
|
|
476
640
|
data = {
|
|
477
641
|
mm_constants.ApplicationEvent.START_INFER_TIME: start_infer_time.isoformat(
|
|
@@ -481,28 +645,250 @@ class MonitoringApplicationController:
|
|
|
481
645
|
sep=" ", timespec="microseconds"
|
|
482
646
|
),
|
|
483
647
|
mm_constants.ApplicationEvent.ENDPOINT_ID: endpoint_id,
|
|
484
|
-
mm_constants.ApplicationEvent.
|
|
485
|
-
|
|
486
|
-
function_name=mm_constants.MonitoringFunctionNames.WRITER,
|
|
487
|
-
),
|
|
648
|
+
mm_constants.ApplicationEvent.ENDPOINT_NAME: endpoint_name,
|
|
649
|
+
mm_constants.ApplicationEvent.ENDPOINT_UPDATED: endpoint_updated,
|
|
488
650
|
}
|
|
489
651
|
for app_name in applications_names:
|
|
490
652
|
data.update({mm_constants.ApplicationEvent.APPLICATION_NAME: app_name})
|
|
491
|
-
|
|
653
|
+
if app_name not in self.applications_streams:
|
|
654
|
+
self.applications_streams[app_name] = (
|
|
655
|
+
mlrun.model_monitoring.helpers.get_output_stream(
|
|
656
|
+
project=project,
|
|
657
|
+
function_name=app_name,
|
|
658
|
+
v3io_access_key=model_monitoring_access_key,
|
|
659
|
+
)
|
|
660
|
+
)
|
|
661
|
+
app_stream = self.applications_streams.get(app_name)
|
|
492
662
|
|
|
493
663
|
logger.info(
|
|
494
|
-
|
|
664
|
+
"Pushing data to application stream",
|
|
665
|
+
endpoint_id=endpoint_id,
|
|
666
|
+
app_name=app_name,
|
|
667
|
+
app_stream_type=str(type(app_stream)),
|
|
668
|
+
)
|
|
669
|
+
app_stream.push([data], partition_key=endpoint_id)
|
|
670
|
+
|
|
671
|
+
def push_regular_event_to_controller_stream(self) -> None:
    """
    Controller chief: push a regular event to the controller stream for every endpoint that needs it.

    Lists the project's model endpoints and monitoring functions, builds the shared endpoint policy
    (application names and base period in minutes, taken from the batch intervals environment
    variable), and submits ``endpoint_to_regular_event`` for each endpoint to a thread pool.
    A failure for one endpoint is logged with its traceback and does not stop the others.
    """
    logger.info("Starting monitoring controller chief")
    applications_names = []
    endpoints = self.project_obj.list_model_endpoints(tsdb_metrics=False).endpoints
    if not endpoints:
        # Checked before querying the TSDB - there is nothing to look up for an empty list
        logger.info("No model endpoints found", project=self.project)
        return

    last_request_dict = self.tsdb_connector.get_last_request(
        endpoint_ids=[mep.metadata.uid for mep in endpoints]
    )
    if isinstance(last_request_dict, pd.DataFrame):
        last_request_dict = last_request_dict.set_index(
            mm_constants.EventFieldType.ENDPOINT_ID
        )[mm_constants.ModelEndpointSchema.LAST_REQUEST].to_dict()

    monitoring_functions = self.project_obj.list_model_monitoring_functions()
    if monitoring_functions:
        # TODO (ML-7700): take only applications in "ready" state. The default histogram
        #  data drift application needs a workaround, as its `status.state` is `None`.
        applications_names = list(
            {app.metadata.name for app in monitoring_functions}
        )
    if not applications_names:
        logger.info("No monitoring functions found", project=self.project)
        return

    policy = {
        ControllerEventEndpointPolicy.MONITORING_APPLICATIONS: applications_names,
        ControllerEventEndpointPolicy.BASE_PERIOD: int(
            batch_dict2timedelta(
                json.loads(
                    cast(
                        str,
                        os.getenv(mm_constants.EventFieldType.BATCH_INTERVALS_DICT),
                    )
                )
            ).total_seconds()
            // _SECONDS_IN_MINUTE
        ),
    }
    with concurrent.futures.ThreadPoolExecutor(
        max_workers=min(len(endpoints), 10)
    ) as pool:
        with schedules.ModelMonitoringSchedulesFileChief(
            self.project
        ) as schedule_file:
            # Collect every submitted future. Rebuilding the mapping on each iteration
            # would leave only the last endpoint's future to be checked for errors.
            futures = {}
            for endpoint in endpoints:
                last_request = last_request_dict.get(endpoint.metadata.uid, None)
                if isinstance(last_request, float):
                    last_request = pd.to_datetime(last_request, unit="s", utc=True)
                endpoint.status.last_request = (
                    last_request or endpoint.status.last_request
                )
                future = pool.submit(
                    self.endpoint_to_regular_event,
                    endpoint,
                    # Each task gets its own policy dict, so per-endpoint fields written
                    # by one thread cannot leak into another endpoint's event
                    dict(policy),
                    set(applications_names),
                    schedule_file,
                )
                futures[future] = endpoint

            # Wait inside the schedules-file context, as the tasks use the file
            for future in concurrent.futures.as_completed(futures):
                exception = future.exception()
                if exception:
                    failed_endpoint = futures[future]
                    error = (
                        f"Failed to push event. Endpoint name: {failed_endpoint.metadata.name}, "
                        f"endpoint uid: {failed_endpoint.metadata.uid}, traceback:\n"
                    )
                    error += "".join(
                        traceback.format_exception(
                            None, exception, exception.__traceback__
                        )
                    )
                    logger.error(error)
    logger.info("Finishing monitoring controller chief")
|
|
761
|
+
|
|
762
|
+
def endpoint_to_regular_event(
    self,
    endpoint: mlrun.common.schemas.ModelEndpoint,
    policy: dict,
    applications_names: set,
    schedule_file: schedules.ModelMonitoringSchedulesFileChief,
) -> None:
    """
    Push a regular event to the controller stream for the endpoint, if it should be monitored.

    :param endpoint:           The model endpoint to create the event for.
    :param policy:             The monitoring policy shared by the endpoints. It is not modified;
                               the endpoint's update time is added to a per-endpoint copy.
    :param applications_names: Names of the monitoring applications to run.
    :param schedule_file:      The chief schedules file, passed to ``_should_monitor_endpoint``.
    """
    if not self._should_monitor_endpoint(
        endpoint,
        set(applications_names),
        policy.get(ControllerEventEndpointPolicy.BASE_PERIOD, 10),
        schedule_file,
    ):
        return

    # This method runs concurrently for many endpoints that may receive the same
    # `policy` object. Writing the endpoint-specific field into a copy keeps one
    # endpoint's update time from ending up in another endpoint's event.
    endpoint_policy = {
        **policy,
        ControllerEventEndpointPolicy.ENDPOINT_UPDATED: endpoint.metadata.updated.isoformat(),
    }
    timestamp = endpoint.status.last_request.isoformat(
        sep=" ", timespec="microseconds"
    )
    first_request = endpoint.status.first_request.isoformat(
        sep=" ", timespec="microseconds"
    )

    logger.debug(
        "Endpoint data is being prepared for regular event",
        endpoint_id=endpoint.metadata.uid,
        endpoint_name=endpoint.metadata.name,
        timestamp=timestamp,
        first_request=first_request,
        endpoint_type=endpoint.metadata.endpoint_type,
        feature_set_uri=endpoint.spec.monitoring_feature_set_uri,
        endpoint_policy=json.dumps(endpoint_policy),
    )
    self.push_to_controller_stream(
        kind=mm_constants.ControllerEventKind.REGULAR_EVENT,
        project=endpoint.metadata.project,
        endpoint_id=endpoint.metadata.uid,
        endpoint_name=endpoint.metadata.name,
        timestamp=timestamp,
        first_request=first_request,
        endpoint_type=endpoint.metadata.endpoint_type.value,
        feature_set_uri=endpoint.spec.monitoring_feature_set_uri,
        endpoint_policy=endpoint_policy,
    )
|
|
499
807
|
|
|
808
|
+
def push_to_controller_stream(
    self,
    kind: str,
    project: str,
    endpoint_id: str,
    endpoint_name: str,
    timestamp: str,
    first_request: str,
    endpoint_type: int,
    feature_set_uri: str,
    endpoint_policy: dict[str, Any],
) -> None:
    """
    Build a controller event from the given fields and push it to the controller stream.

    The event is pushed with the endpoint ID as the partition key.

    :param kind:            The event kind.
    :param project:         The project name.
    :param endpoint_id:     The endpoint ID.
    :param endpoint_name:   The endpoint name.
    :param timestamp:       The event timestamp, as an ISO-format string (UTC).
    :param first_request:   The first request time, as an ISO-format string (UTC).
    :param endpoint_type:   The endpoint type value.
    :param feature_set_uri: The feature set URI.
    :param endpoint_policy: Dictionary holding the monitoring policy.
    """
    field_values = (
        (ControllerEvent.KIND, kind),
        (ControllerEvent.PROJECT, project),
        (ControllerEvent.ENDPOINT_ID, endpoint_id),
        (ControllerEvent.ENDPOINT_NAME, endpoint_name),
        (ControllerEvent.TIMESTAMP, timestamp),
        (ControllerEvent.FIRST_REQUEST, first_request),
        (ControllerEvent.ENDPOINT_TYPE, endpoint_type),
        (ControllerEvent.FEATURE_SET_URI, feature_set_uri),
        (ControllerEvent.ENDPOINT_POLICY, endpoint_policy),
    )
    event = {field.value: value for field, value in field_values}

    stream = self.controller_stream
    logger.info(
        "Pushing data to controller stream",
        event=event,
        endpoint_id=endpoint_id,
        controller_stream_type=str(type(stream)),
    )
    stream.push([event], partition_key=endpoint_id)
|
|
850
|
+
|
|
851
|
+
def _push_to_main_stream(self, event: dict, endpoint_id: str) -> None:
    """
    Push the given event to the model monitoring stream, partitioned by the endpoint ID.

    :param event:       The event dictionary to push to the stream.
    :param endpoint_id: The endpoint ID, used as the partition key.
    """
    serialized_event = json.dumps(event)
    main_stream = self.model_monitoring_stream
    logger.info(
        "Pushing data to main stream, NOP event is been generated",
        event=serialized_event,
        endpoint_id=endpoint_id,
        mm_stream_type=str(type(main_stream)),
    )
    main_stream.push([event], partition_key=endpoint_id)
|
|
864
|
+
|
|
500
865
|
|
|
501
|
-
def handler(context:
|
|
866
|
+
def handler(context: nuclio_sdk.Context, event: nuclio_sdk.Event) -> None:
    """
    Nuclio entry point of the model monitoring controller.

    Dispatches by the kind of the trigger that produced the event: cron triggers run the controller
    chief, and stream triggers run the controller worker on the event.

    :param context: the Nuclio context
    :param event:   trigger event

    :raise MLRunInvalidArgumentError: if the trigger kind is neither a cron nor a stream kind.
    """
    logger.info(
        "Controller got event",
        trigger=event.trigger,
        trigger_kind=event.trigger.kind,
    )

    trigger_kind = event.trigger.kind
    if trigger_kind in mm_constants.CRON_TRIGGER_KINDS:
        # Runs controller chief:
        context.user_data.monitor_app_controller.push_regular_event_to_controller_stream()
        return
    if trigger_kind in mm_constants.STREAM_TRIGGER_KINDS:
        # Runs controller worker:
        context.user_data.monitor_app_controller.run(event)
        return
    raise mlrun.errors.MLRunInvalidArgumentError(
        "Wrong trigger kind for model monitoring controller"
    )
|
|
889
|
+
|
|
890
|
+
|
|
891
|
+
def init_context(context):
    """
    Create the monitoring application controller and store it on the context's user data.

    The controller is stored as ``context.user_data.monitor_app_controller``.

    :param context: the Nuclio context
    """
    context.user_data.monitor_app_controller = MonitoringApplicationController()
    context.logger.info("Monitoring application controller initialized")
|