mlrun 1.8.0rc44__py3-none-any.whl → 1.8.0rc46__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of mlrun might be problematic. Click here for more details.
- mlrun/common/schemas/model_monitoring/constants.py +5 -0
- mlrun/config.py +6 -0
- mlrun/data_types/__init__.py +5 -1
- mlrun/datastore/targets.py +7 -5
- mlrun/db/base.py +3 -7
- mlrun/db/httpdb.py +16 -18
- mlrun/db/nopdb.py +0 -5
- mlrun/model_monitoring/api.py +5 -2
- mlrun/model_monitoring/applications/context.py +14 -1
- mlrun/model_monitoring/applications/histogram_data_drift.py +10 -18
- mlrun/model_monitoring/controller.py +98 -45
- mlrun/model_monitoring/db/_schedules.py +110 -32
- mlrun/model_monitoring/db/tsdb/v3io/v3io_connector.py +46 -20
- mlrun/model_monitoring/helpers.py +31 -3
- mlrun/model_monitoring/writer.py +1 -1
- mlrun/projects/project.py +25 -28
- mlrun/runtimes/function_reference.py +3 -0
- mlrun/runtimes/nuclio/application/reverse_proxy.go +66 -64
- mlrun/runtimes/nuclio/serving.py +16 -1
- mlrun/serving/v2_serving.py +51 -1
- mlrun/utils/version/version.json +2 -2
- {mlrun-1.8.0rc44.dist-info → mlrun-1.8.0rc46.dist-info}/METADATA +3 -2
- {mlrun-1.8.0rc44.dist-info → mlrun-1.8.0rc46.dist-info}/RECORD +27 -27
- {mlrun-1.8.0rc44.dist-info → mlrun-1.8.0rc46.dist-info}/WHEEL +1 -1
- {mlrun-1.8.0rc44.dist-info → mlrun-1.8.0rc46.dist-info}/entry_points.txt +0 -0
- {mlrun-1.8.0rc44.dist-info → mlrun-1.8.0rc46.dist-info/licenses}/LICENSE +0 -0
- {mlrun-1.8.0rc44.dist-info → mlrun-1.8.0rc46.dist-info}/top_level.txt +0 -0
|
@@ -289,6 +289,11 @@ class ModelMonitoringMode(StrEnum):
|
|
|
289
289
|
disabled = "disabled"
|
|
290
290
|
|
|
291
291
|
|
|
292
|
+
class ScheduleChiefFields(StrEnum):
|
|
293
|
+
LAST_REQUEST = "last_request"
|
|
294
|
+
LAST_ANALYZED = "last_analyzed"
|
|
295
|
+
|
|
296
|
+
|
|
292
297
|
class EndpointType(IntEnum):
|
|
293
298
|
NODE_EP = 1 # end point that is not a child of a router
|
|
294
299
|
ROUTER = 2 # endpoint that is router
|
mlrun/config.py
CHANGED
|
@@ -549,6 +549,10 @@ default_config = {
|
|
|
549
549
|
},
|
|
550
550
|
},
|
|
551
551
|
"model_endpoint_monitoring": {
|
|
552
|
+
# Scaling Rule
|
|
553
|
+
# The fundamental scaling rule to maintain is: Shards/Partitions = Replicas * Workers
|
|
554
|
+
# In other words, the number of shards (V3IO) or partitions (Kafka) must be equal to the
|
|
555
|
+
# total number of worker processes across all pods.
|
|
552
556
|
"serving_stream": {
|
|
553
557
|
"v3io": {
|
|
554
558
|
"shard_count": 2,
|
|
@@ -822,6 +826,8 @@ default_config = {
|
|
|
822
826
|
# maximum allowed alert config cache size in alert's CRUD
|
|
823
827
|
# for the best performance, it is recommended to set this value to the maximum number of alerts
|
|
824
828
|
"max_allowed_cache_size": 20000,
|
|
829
|
+
# default limit for listing alert configs
|
|
830
|
+
"default_list_alert_configs_limit": 2000,
|
|
825
831
|
},
|
|
826
832
|
"auth_with_client_id": {
|
|
827
833
|
"enabled": False,
|
mlrun/data_types/__init__.py
CHANGED
|
@@ -27,8 +27,12 @@ class BaseDataInfer:
|
|
|
27
27
|
get_stats = None
|
|
28
28
|
|
|
29
29
|
|
|
30
|
+
def is_spark_dataframe(df) -> bool:
|
|
31
|
+
return "rdd" in dir(df)
|
|
32
|
+
|
|
33
|
+
|
|
30
34
|
def get_infer_interface(df) -> BaseDataInfer:
|
|
31
|
-
if hasattr(df, "rdd"):
|
|
35
|
+
if is_spark_dataframe(df):
|
|
32
36
|
from .spark import SparkDataInfer
|
|
33
37
|
|
|
34
38
|
return SparkDataInfer
|
mlrun/datastore/targets.py
CHANGED
|
@@ -40,7 +40,7 @@ from mlrun.utils.helpers import to_parquet
|
|
|
40
40
|
from mlrun.utils.v3io_clients import get_frames_client
|
|
41
41
|
|
|
42
42
|
from .. import errors
|
|
43
|
-
from ..data_types import ValueType
|
|
43
|
+
from ..data_types import ValueType, is_spark_dataframe
|
|
44
44
|
from ..platforms.iguazio import parse_path, split_path
|
|
45
45
|
from .datastore_profile import datastore_profile_read
|
|
46
46
|
from .spark_utils import spark_session_update_hadoop_options
|
|
@@ -86,8 +86,10 @@ def generate_target_run_id():
|
|
|
86
86
|
|
|
87
87
|
|
|
88
88
|
def write_spark_dataframe_with_options(spark_options, df, mode, write_format=None):
|
|
89
|
+
# TODO: Replace with just df.sparkSession when Spark 3.2 support is dropped
|
|
90
|
+
spark_session = getattr(df, "sparkSession") or df.sql_ctx.sparkSession
|
|
89
91
|
non_hadoop_spark_options = spark_session_update_hadoop_options(
|
|
90
|
-
|
|
92
|
+
spark_session, spark_options
|
|
91
93
|
)
|
|
92
94
|
if write_format:
|
|
93
95
|
df.write.format(write_format).mode(mode).save(**non_hadoop_spark_options)
|
|
@@ -510,7 +512,7 @@ class BaseStoreTarget(DataTargetBase):
|
|
|
510
512
|
chunk_id=0,
|
|
511
513
|
**kwargs,
|
|
512
514
|
) -> Optional[int]:
|
|
513
|
-
if hasattr(df, "rdd"):
|
|
515
|
+
if is_spark_dataframe(df):
|
|
514
516
|
options = self.get_spark_options(key_column, timestamp_key)
|
|
515
517
|
options.update(kwargs)
|
|
516
518
|
df = self.prepare_spark_df(df, key_column, timestamp_key, options)
|
|
@@ -1376,7 +1378,7 @@ class NoSqlBaseTarget(BaseStoreTarget):
|
|
|
1376
1378
|
def write_dataframe(
|
|
1377
1379
|
self, df, key_column=None, timestamp_key=None, chunk_id=0, **kwargs
|
|
1378
1380
|
):
|
|
1379
|
-
if hasattr(df, "rdd"):
|
|
1381
|
+
if is_spark_dataframe(df):
|
|
1380
1382
|
options = self.get_spark_options(key_column, timestamp_key)
|
|
1381
1383
|
options.update(kwargs)
|
|
1382
1384
|
df = self.prepare_spark_df(df)
|
|
@@ -2108,7 +2110,7 @@ class SQLTarget(BaseStoreTarget):
|
|
|
2108
2110
|
|
|
2109
2111
|
self._create_sql_table()
|
|
2110
2112
|
|
|
2111
|
-
if hasattr(df, "rdd"):
|
|
2113
|
+
if is_spark_dataframe(df):
|
|
2112
2114
|
raise ValueError("Spark is not supported")
|
|
2113
2115
|
else:
|
|
2114
2116
|
(
|
mlrun/db/base.py
CHANGED
|
@@ -889,7 +889,9 @@ class RunDBInterface(ABC):
|
|
|
889
889
|
pass
|
|
890
890
|
|
|
891
891
|
@abstractmethod
|
|
892
|
-
def list_alerts_configs(self, project=""):
|
|
892
|
+
def list_alerts_configs(
|
|
893
|
+
self, project="", limit: Optional[int] = None, offset: Optional[int] = None
|
|
894
|
+
):
|
|
893
895
|
pass
|
|
894
896
|
|
|
895
897
|
@abstractmethod
|
|
@@ -1105,12 +1107,6 @@ class RunDBInterface(ABC):
|
|
|
1105
1107
|
) -> bool:
|
|
1106
1108
|
pass
|
|
1107
1109
|
|
|
1108
|
-
@abstractmethod
|
|
1109
|
-
def deploy_histogram_data_drift_app(
|
|
1110
|
-
self, project: str, image: str = "mlrun/mlrun"
|
|
1111
|
-
) -> None:
|
|
1112
|
-
pass
|
|
1113
|
-
|
|
1114
1110
|
@abstractmethod
|
|
1115
1111
|
def set_model_monitoring_credentials(
|
|
1116
1112
|
self,
|
mlrun/db/httpdb.py
CHANGED
|
@@ -4080,21 +4080,6 @@ class HTTPRunDB(RunDBInterface):
|
|
|
4080
4080
|
deletion_failed = True
|
|
4081
4081
|
return not deletion_failed
|
|
4082
4082
|
|
|
4083
|
-
def deploy_histogram_data_drift_app(
|
|
4084
|
-
self, project: str, image: str = "mlrun/mlrun"
|
|
4085
|
-
) -> None:
|
|
4086
|
-
"""
|
|
4087
|
-
Deploy the histogram data drift application.
|
|
4088
|
-
|
|
4089
|
-
:param project: Project name.
|
|
4090
|
-
:param image: The image on which the application will run.
|
|
4091
|
-
"""
|
|
4092
|
-
self.api_call(
|
|
4093
|
-
method=mlrun.common.types.HTTPMethod.PUT,
|
|
4094
|
-
path=f"projects/{project}/model-monitoring/histogram-data-drift-app",
|
|
4095
|
-
params={"image": image},
|
|
4096
|
-
)
|
|
4097
|
-
|
|
4098
4083
|
def set_model_monitoring_credentials(
|
|
4099
4084
|
self,
|
|
4100
4085
|
project: str,
|
|
@@ -4818,20 +4803,33 @@ class HTTPRunDB(RunDBInterface):
|
|
|
4818
4803
|
response = self.api_call("GET", endpoint_path, error_message)
|
|
4819
4804
|
return AlertConfig.from_dict(response.json())
|
|
4820
4805
|
|
|
4821
|
-
def list_alerts_configs(self, project="") -> list[AlertConfig]:
|
|
4806
|
+
def list_alerts_configs(
|
|
4807
|
+
self, project="", limit: Optional[int] = None, offset: Optional[int] = None
|
|
4808
|
+
) -> list[AlertConfig]:
|
|
4822
4809
|
"""
|
|
4823
4810
|
Retrieve list of alerts of a project.
|
|
4824
4811
|
|
|
4825
4812
|
:param project: The project name.
|
|
4813
|
+
:param limit: The maximum number of alerts to return.
|
|
4814
|
+
Defaults to `mlconf.alerts.default_list_alert_configs_limit` if not provided.
|
|
4815
|
+
:param offset: The number of alerts to skip.
|
|
4826
4816
|
|
|
4827
4817
|
:returns: All the alerts objects of the project.
|
|
4828
4818
|
"""
|
|
4829
4819
|
project = project or config.default_project
|
|
4830
4820
|
endpoint_path = f"projects/{project}/alerts"
|
|
4831
4821
|
error_message = f"get alerts {project}/alerts"
|
|
4832
|
-
|
|
4822
|
+
params = {}
|
|
4823
|
+
# TODO: Deprecate limit and offset when pagination is implemented
|
|
4824
|
+
if limit:
|
|
4825
|
+
params["page-size"] = limit
|
|
4826
|
+
if offset:
|
|
4827
|
+
params["offset"] = offset
|
|
4828
|
+
response = self.api_call(
|
|
4829
|
+
"GET", endpoint_path, error_message, params=params
|
|
4830
|
+
).json()
|
|
4833
4831
|
results = []
|
|
4834
|
-
for item in response:
|
|
4832
|
+
for item in response.get("alerts", []):
|
|
4835
4833
|
results.append(AlertConfig(**item))
|
|
4836
4834
|
return results
|
|
4837
4835
|
|
mlrun/db/nopdb.py
CHANGED
|
@@ -883,11 +883,6 @@ class NopDB(RunDBInterface):
|
|
|
883
883
|
) -> bool:
|
|
884
884
|
pass
|
|
885
885
|
|
|
886
|
-
def deploy_histogram_data_drift_app(
|
|
887
|
-
self, project: str, image: str = "mlrun/mlrun"
|
|
888
|
-
) -> None:
|
|
889
|
-
pass
|
|
890
|
-
|
|
891
886
|
def set_model_monitoring_credentials(
|
|
892
887
|
self,
|
|
893
888
|
project: str,
|
mlrun/model_monitoring/api.py
CHANGED
|
@@ -50,8 +50,8 @@ DatasetType = typing.Union[
|
|
|
50
50
|
|
|
51
51
|
def get_or_create_model_endpoint(
|
|
52
52
|
project: str,
|
|
53
|
+
model_endpoint_name: str,
|
|
53
54
|
model_path: str = "",
|
|
54
|
-
model_endpoint_name: str = "",
|
|
55
55
|
endpoint_id: str = "",
|
|
56
56
|
function_name: str = "",
|
|
57
57
|
function_tag: str = "latest",
|
|
@@ -59,6 +59,7 @@ def get_or_create_model_endpoint(
|
|
|
59
59
|
sample_set_statistics: typing.Optional[dict[str, typing.Any]] = None,
|
|
60
60
|
monitoring_mode: mm_constants.ModelMonitoringMode = mm_constants.ModelMonitoringMode.enabled,
|
|
61
61
|
db_session=None,
|
|
62
|
+
feature_analysis: bool = False,
|
|
62
63
|
) -> ModelEndpoint:
|
|
63
64
|
"""
|
|
64
65
|
Get a single model endpoint object. If not exist, generate a new model endpoint with the provided parameters. Note
|
|
@@ -66,9 +67,9 @@ def get_or_create_model_endpoint(
|
|
|
66
67
|
features, set `monitoring_mode=enabled`.
|
|
67
68
|
|
|
68
69
|
:param project: Project name.
|
|
69
|
-
:param model_path: The model store path (applicable only to new endpoint_id).
|
|
70
70
|
:param model_endpoint_name: If a new model endpoint is created, the model endpoint name will be presented
|
|
71
71
|
under this endpoint (applicable only to new endpoint_id).
|
|
72
|
+
:param model_path: The model store path (applicable only to new endpoint_id).
|
|
72
73
|
:param endpoint_id: Model endpoint unique ID. If not exist in DB, will generate a new record based
|
|
73
74
|
on the provided `endpoint_id`.
|
|
74
75
|
:param function_name: If a new model endpoint is created, use this function name.
|
|
@@ -80,6 +81,7 @@ def get_or_create_model_endpoint(
|
|
|
80
81
|
:param monitoring_mode: If enabled, apply model monitoring features on the provided endpoint id
|
|
81
82
|
(applicable only to new endpoint_id).
|
|
82
83
|
:param db_session: A runtime session that manages the current dialog with the database.
|
|
84
|
+
:param feature_analysis: If True, the model endpoint will be retrieved with the feature analysis mode.
|
|
83
85
|
|
|
84
86
|
:return: A ModelEndpoint object
|
|
85
87
|
"""
|
|
@@ -99,6 +101,7 @@ def get_or_create_model_endpoint(
|
|
|
99
101
|
endpoint_id=endpoint_id,
|
|
100
102
|
function_name=function_name,
|
|
101
103
|
function_tag=function_tag or "latest",
|
|
104
|
+
feature_analysis=feature_analysis,
|
|
102
105
|
)
|
|
103
106
|
# If other fields provided, validate that they are correspond to the existing model endpoint data
|
|
104
107
|
_model_endpoint_validations(
|
|
@@ -76,7 +76,6 @@ class MonitoringApplicationContext:
|
|
|
76
76
|
:param sample_df: (pd.DataFrame) The new sample DataFrame.
|
|
77
77
|
:param start_infer_time: (pd.Timestamp) Start time of the monitoring schedule.
|
|
78
78
|
:param end_infer_time: (pd.Timestamp) End time of the monitoring schedule.
|
|
79
|
-
:param latest_request: (pd.Timestamp) Timestamp of the latest request on this endpoint_id.
|
|
80
79
|
:param endpoint_id: (str) ID of the monitored model endpoint
|
|
81
80
|
:param feature_set: (FeatureSet) the model endpoint feature set
|
|
82
81
|
:param endpoint_name: (str) Name of the monitored model endpoint
|
|
@@ -208,6 +207,20 @@ class MonitoringApplicationContext:
|
|
|
208
207
|
@property
|
|
209
208
|
def sample_df(self) -> pd.DataFrame:
|
|
210
209
|
if self._sample_df is None:
|
|
210
|
+
if (
|
|
211
|
+
self.endpoint_name is None
|
|
212
|
+
or self.endpoint_id is None
|
|
213
|
+
or pd.isnull(self.start_infer_time)
|
|
214
|
+
or pd.isnull(self.end_infer_time)
|
|
215
|
+
):
|
|
216
|
+
raise mlrun.errors.MLRunValueError(
|
|
217
|
+
"You have tried to access `monitoring_context.sample_df`, but have not provided it directly "
|
|
218
|
+
"through `sample_data`, nor have you provided the model endpoint's name, ID, and the start and "
|
|
219
|
+
f"end times: `endpoint_name`={self.endpoint_name}, `endpoint_uid`={self.endpoint_id}, "
|
|
220
|
+
f"`start`={self.start_infer_time}, and `end`={self.end_infer_time}. "
|
|
221
|
+
"You can either provide the sample dataframe directly, the model endpoint's details and times, "
|
|
222
|
+
"or adapt the application's logic to not access the sample dataframe."
|
|
223
|
+
)
|
|
211
224
|
feature_set = self.feature_set
|
|
212
225
|
features = [f"{feature_set.metadata.name}.*"]
|
|
213
226
|
vector = fstore.FeatureVector(
|
|
@@ -107,16 +107,14 @@ class HistogramDataDriftApplication(ModelMonitoringApplicationBase):
|
|
|
107
107
|
* JSON with the general drift value per feature, produced by default.
|
|
108
108
|
* Plotly table with the various metrics and histograms per feature (disabled by default due to performance issues).
|
|
109
109
|
|
|
110
|
-
This application is deployed by default when calling
|
|
111
|
-
|
|
112
|
-
.. code-block:: python
|
|
113
|
-
|
|
114
|
-
project.enable_model_monitoring()
|
|
115
|
-
|
|
110
|
+
This application is deployed by default when calling
|
|
111
|
+
:py:func:`~mlrun.projects.MlrunProject.enable_model_monitoring`.
|
|
116
112
|
To avoid it, pass :code:`deploy_histogram_data_drift_app=False`.
|
|
117
113
|
|
|
118
114
|
If you want to change the application defaults, such as the classifier or which artifacts to produce, you
|
|
119
115
|
need to inherit from this class and deploy it as any other model monitoring application.
|
|
116
|
+
Please make sure to keep the default application name. This ensures that the full functionality of the application,
|
|
117
|
+
including the statistics view in the UI, is available.
|
|
120
118
|
"""
|
|
121
119
|
|
|
122
120
|
NAME: Final[str] = HistogramDataDriftApplicationConstants.NAME
|
|
@@ -140,8 +138,8 @@ class HistogramDataDriftApplication(ModelMonitoringApplicationBase):
|
|
|
140
138
|
produce_plotly_artifact: bool = False,
|
|
141
139
|
) -> None:
|
|
142
140
|
"""
|
|
143
|
-
:param value_classifier: Classifier object that adheres to the
|
|
144
|
-
If not provided, the default
|
|
141
|
+
:param value_classifier: Classifier object that adheres to the :py:class:`~ValueClassifier` protocol.
|
|
142
|
+
If not provided, the default :py:class:`~DataDriftClassifier` is used.
|
|
145
143
|
"""
|
|
146
144
|
self._value_classifier = value_classifier or DataDriftClassifier()
|
|
147
145
|
assert self._REQUIRED_METRICS <= set(
|
|
@@ -181,10 +179,7 @@ class HistogramDataDriftApplication(ModelMonitoringApplicationBase):
|
|
|
181
179
|
return metrics_per_feature
|
|
182
180
|
|
|
183
181
|
def _get_general_drift_result(
|
|
184
|
-
self,
|
|
185
|
-
metrics: list[mm_results.ModelMonitoringApplicationMetric],
|
|
186
|
-
monitoring_context: mm_context.MonitoringApplicationContext,
|
|
187
|
-
metrics_per_feature: DataFrame,
|
|
182
|
+
self, metrics: list[mm_results.ModelMonitoringApplicationMetric]
|
|
188
183
|
) -> mm_results.ModelMonitoringApplicationResult:
|
|
189
184
|
"""Get the general drift result from the metrics list"""
|
|
190
185
|
value = cast(
|
|
@@ -237,7 +232,8 @@ class HistogramDataDriftApplication(ModelMonitoringApplicationBase):
|
|
|
237
232
|
monitoring_context: mm_context.MonitoringApplicationContext,
|
|
238
233
|
) -> list[mm_results._ModelMonitoringApplicationStats]:
|
|
239
234
|
"""
|
|
240
|
-
list the
|
|
235
|
+
Return a list of the statistics.
|
|
236
|
+
|
|
241
237
|
:param metrics: the calculated metrics
|
|
242
238
|
:param metrics_per_feature: metric calculated per feature
|
|
243
239
|
:param monitoring_context: context object for current monitoring application
|
|
@@ -376,11 +372,7 @@ class HistogramDataDriftApplication(ModelMonitoringApplicationBase):
|
|
|
376
372
|
)
|
|
377
373
|
monitoring_context.logger.debug("Computing average per metric")
|
|
378
374
|
metrics = self._get_metrics(metrics_per_feature)
|
|
379
|
-
result = self._get_general_drift_result(
|
|
380
|
-
metrics=metrics,
|
|
381
|
-
monitoring_context=monitoring_context,
|
|
382
|
-
metrics_per_feature=metrics_per_feature,
|
|
383
|
-
)
|
|
375
|
+
result = self._get_general_drift_result(metrics=metrics)
|
|
384
376
|
stats = self._get_stats(
|
|
385
377
|
metrics=metrics,
|
|
386
378
|
monitoring_context=monitoring_context,
|
|
@@ -28,6 +28,7 @@ import mlrun
|
|
|
28
28
|
import mlrun.common.schemas.model_monitoring.constants as mm_constants
|
|
29
29
|
import mlrun.feature_store as fstore
|
|
30
30
|
import mlrun.model_monitoring
|
|
31
|
+
import mlrun.model_monitoring.db._schedules as schedules
|
|
31
32
|
import mlrun.model_monitoring.helpers
|
|
32
33
|
from mlrun.common.schemas import EndpointType
|
|
33
34
|
from mlrun.common.schemas.model_monitoring.constants import (
|
|
@@ -36,7 +37,6 @@ from mlrun.common.schemas.model_monitoring.constants import (
|
|
|
36
37
|
ControllerEventKind,
|
|
37
38
|
)
|
|
38
39
|
from mlrun.errors import err_to_str
|
|
39
|
-
from mlrun.model_monitoring.db._schedules import ModelMonitoringSchedulesFile
|
|
40
40
|
from mlrun.model_monitoring.helpers import batch_dict2timedelta
|
|
41
41
|
from mlrun.utils import datetime_now, logger
|
|
42
42
|
|
|
@@ -53,7 +53,7 @@ class _BatchWindow:
|
|
|
53
53
|
def __init__(
|
|
54
54
|
self,
|
|
55
55
|
*,
|
|
56
|
-
schedules_file: ModelMonitoringSchedulesFile,
|
|
56
|
+
schedules_file: schedules.ModelMonitoringSchedulesFileEndpoint,
|
|
57
57
|
application: str,
|
|
58
58
|
timedelta_seconds: int,
|
|
59
59
|
last_updated: int,
|
|
@@ -153,7 +153,7 @@ class _BatchWindowGenerator(AbstractContextManager):
|
|
|
153
153
|
self._project = project
|
|
154
154
|
self._endpoint_id = endpoint_id
|
|
155
155
|
self._timedelta = window_length
|
|
156
|
-
self._schedules_file = ModelMonitoringSchedulesFile(
|
|
156
|
+
self._schedules_file = schedules.ModelMonitoringSchedulesFileEndpoint(
|
|
157
157
|
project=project, endpoint_id=endpoint_id
|
|
158
158
|
)
|
|
159
159
|
|
|
@@ -273,6 +273,7 @@ class MonitoringApplicationController:
|
|
|
273
273
|
endpoint: mlrun.common.schemas.ModelEndpoint,
|
|
274
274
|
application_names: set,
|
|
275
275
|
base_period_minutes: int,
|
|
276
|
+
schedules_file: schedules.ModelMonitoringSchedulesFileChief,
|
|
276
277
|
) -> bool:
|
|
277
278
|
"""
|
|
278
279
|
checks if there is a need to monitor the given endpoint, we should monitor endpoint if it stands in the
|
|
@@ -281,11 +282,23 @@ class MonitoringApplicationController:
|
|
|
281
282
|
2. first request exists
|
|
282
283
|
3. last request exists
|
|
283
284
|
4. endpoint_type is not ROUTER
|
|
284
|
-
if the four above conditions apply we require one of the
|
|
285
|
+
if the four above conditions apply we require one of the two condition monitor:
|
|
285
286
|
1. never monitored the one of the endpoint applications meaning min_last_analyzed is None
|
|
286
|
-
2.
|
|
287
|
-
|
|
287
|
+
2. min_last_analyzed stands in the condition for sending NOP event and this the first time regular event
|
|
288
|
+
is sent with the combination of current last_request & current last_analyzed per endpoint.
|
|
288
289
|
"""
|
|
290
|
+
last_timestamp_sent = schedules_file.get_endpoint_last_request(
|
|
291
|
+
endpoint.metadata.uid
|
|
292
|
+
)
|
|
293
|
+
last_analyzed_sent = schedules_file.get_endpoint_last_analyzed(
|
|
294
|
+
endpoint.metadata.uid
|
|
295
|
+
)
|
|
296
|
+
logger.debug(
|
|
297
|
+
"Chief should monitor endpoint check",
|
|
298
|
+
last_timestamp_sent=last_timestamp_sent,
|
|
299
|
+
last_analyzed_sent=last_analyzed_sent,
|
|
300
|
+
uid=endpoint.metadata.uid,
|
|
301
|
+
)
|
|
289
302
|
if (
|
|
290
303
|
# Is the model endpoint monitored?
|
|
291
304
|
endpoint.status.monitoring_mode == mm_constants.ModelMonitoringMode.enabled
|
|
@@ -300,26 +313,43 @@ class MonitoringApplicationController:
|
|
|
300
313
|
project=endpoint.metadata.project,
|
|
301
314
|
endpoint_id=endpoint.metadata.uid,
|
|
302
315
|
) as batch_window_generator:
|
|
303
|
-
|
|
304
|
-
|
|
316
|
+
current_time = mlrun.utils.datetime_now()
|
|
317
|
+
current_min_last_analyzed = (
|
|
318
|
+
batch_window_generator.get_min_last_analyzed()
|
|
319
|
+
)
|
|
320
|
+
if (
|
|
321
|
+
# Different application names, or last analyzed never updated while there are application to monitor
|
|
322
|
+
application_names
|
|
323
|
+
and (
|
|
324
|
+
application_names
|
|
325
|
+
!= batch_window_generator.get_application_list()
|
|
326
|
+
or not current_min_last_analyzed
|
|
327
|
+
)
|
|
328
|
+
):
|
|
305
329
|
return True
|
|
306
330
|
elif (
|
|
307
|
-
|
|
308
|
-
|
|
309
|
-
|
|
310
|
-
|
|
311
|
-
|
|
312
|
-
|
|
331
|
+
# Does nop event will be sent to close the relevant window
|
|
332
|
+
self._should_send_nop_event(
|
|
333
|
+
base_period_minutes, current_min_last_analyzed, current_time
|
|
334
|
+
)
|
|
335
|
+
and (
|
|
336
|
+
int(endpoint.status.last_request.timestamp())
|
|
337
|
+
!= last_timestamp_sent
|
|
338
|
+
or current_min_last_analyzed != last_analyzed_sent
|
|
339
|
+
)
|
|
313
340
|
):
|
|
341
|
+
# Write to schedule chief file the last_request, min_last_analyzed we pushed event to stream
|
|
342
|
+
schedules_file.update_endpoint_timestamps(
|
|
343
|
+
endpoint_uid=endpoint.metadata.uid,
|
|
344
|
+
last_request=int(endpoint.status.last_request.timestamp()),
|
|
345
|
+
last_analyzed=current_min_last_analyzed,
|
|
346
|
+
)
|
|
314
347
|
return True
|
|
315
348
|
else:
|
|
316
349
|
logger.info(
|
|
317
350
|
"All the possible intervals were already analyzed, didn't push regular event",
|
|
318
351
|
endpoint_id=endpoint.metadata.uid,
|
|
319
|
-
last_analyzed=datetime.datetime.fromtimestamp(
|
|
320
|
-
batch_window_generator.get_min_last_analyzed(),
|
|
321
|
-
tz=datetime.timezone.utc,
|
|
322
|
-
),
|
|
352
|
+
last_analyzed=current_min_last_analyzed,
|
|
323
353
|
last_request=endpoint.status.last_request,
|
|
324
354
|
)
|
|
325
355
|
else:
|
|
@@ -334,6 +364,21 @@ class MonitoringApplicationController:
|
|
|
334
364
|
)
|
|
335
365
|
return False
|
|
336
366
|
|
|
367
|
+
@staticmethod
|
|
368
|
+
def _should_send_nop_event(
|
|
369
|
+
base_period_minutes: int,
|
|
370
|
+
min_last_analyzed: int,
|
|
371
|
+
current_time: datetime.datetime,
|
|
372
|
+
):
|
|
373
|
+
if min_last_analyzed:
|
|
374
|
+
return (
|
|
375
|
+
current_time.timestamp() - min_last_analyzed
|
|
376
|
+
>= datetime.timedelta(minutes=base_period_minutes).total_seconds()
|
|
377
|
+
+ mlrun.mlconf.model_endpoint_monitoring.parquet_batching_timeout_secs
|
|
378
|
+
)
|
|
379
|
+
else:
|
|
380
|
+
return True
|
|
381
|
+
|
|
337
382
|
def run(self, event: nuclio_sdk.Event) -> None:
|
|
338
383
|
"""
|
|
339
384
|
Main method for controller chief, runs all the relevant monitoring applications for a single endpoint.
|
|
@@ -441,9 +486,11 @@ class MonitoringApplicationController:
|
|
|
441
486
|
]
|
|
442
487
|
current_time = mlrun.utils.datetime_now()
|
|
443
488
|
if (
|
|
444
|
-
|
|
445
|
-
|
|
446
|
-
|
|
489
|
+
self._should_send_nop_event(
|
|
490
|
+
base_period,
|
|
491
|
+
batch_window_generator.get_min_last_analyzed(),
|
|
492
|
+
current_time,
|
|
493
|
+
)
|
|
447
494
|
and event[ControllerEvent.KIND] != ControllerEventKind.NOP_EVENT
|
|
448
495
|
):
|
|
449
496
|
event = {
|
|
@@ -581,29 +628,33 @@ class MonitoringApplicationController:
|
|
|
581
628
|
with concurrent.futures.ThreadPoolExecutor(
|
|
582
629
|
max_workers=min(len(endpoints), 10)
|
|
583
630
|
) as pool:
|
|
584
|
-
|
|
585
|
-
|
|
586
|
-
|
|
587
|
-
|
|
588
|
-
|
|
589
|
-
|
|
590
|
-
|
|
591
|
-
|
|
592
|
-
|
|
593
|
-
|
|
594
|
-
|
|
595
|
-
|
|
596
|
-
|
|
597
|
-
|
|
598
|
-
|
|
599
|
-
|
|
600
|
-
|
|
601
|
-
|
|
602
|
-
|
|
603
|
-
|
|
631
|
+
with schedules.ModelMonitoringSchedulesFileChief(
|
|
632
|
+
self.project
|
|
633
|
+
) as schedule_file:
|
|
634
|
+
futures = {
|
|
635
|
+
pool.submit(
|
|
636
|
+
self.endpoint_to_regular_event,
|
|
637
|
+
endpoint,
|
|
638
|
+
policy,
|
|
639
|
+
set(applications_names),
|
|
640
|
+
self.v3io_access_key,
|
|
641
|
+
schedule_file,
|
|
642
|
+
): endpoint
|
|
643
|
+
for endpoint in endpoints
|
|
644
|
+
}
|
|
645
|
+
for future in concurrent.futures.as_completed(futures):
|
|
646
|
+
if future.exception():
|
|
647
|
+
exception = future.exception()
|
|
648
|
+
error = (
|
|
649
|
+
f"Failed to push event. Endpoint name: {futures[future].metadata.name}, "
|
|
650
|
+
f"endpoint uid: {futures[future].metadata.uid}, traceback:\n"
|
|
604
651
|
)
|
|
605
|
-
|
|
606
|
-
|
|
652
|
+
error += "".join(
|
|
653
|
+
traceback.format_exception(
|
|
654
|
+
None, exception, exception.__traceback__
|
|
655
|
+
)
|
|
656
|
+
)
|
|
657
|
+
logger.error(error)
|
|
607
658
|
logger.info("Finishing monitoring controller chief")
|
|
608
659
|
|
|
609
660
|
def endpoint_to_regular_event(
|
|
@@ -612,14 +663,16 @@ class MonitoringApplicationController:
|
|
|
612
663
|
policy: dict,
|
|
613
664
|
applications_names: set,
|
|
614
665
|
v3io_access_key: str,
|
|
666
|
+
schedule_file: schedules.ModelMonitoringSchedulesFileChief,
|
|
615
667
|
) -> None:
|
|
616
668
|
if self._should_monitor_endpoint(
|
|
617
669
|
endpoint,
|
|
618
670
|
set(applications_names),
|
|
619
671
|
policy.get(ControllerEventEndpointPolicy.BASE_PERIOD, 10),
|
|
672
|
+
schedule_file,
|
|
620
673
|
):
|
|
621
|
-
logger.
|
|
622
|
-
"
|
|
674
|
+
logger.debug(
|
|
675
|
+
"Endpoint data is being prepared for regular event",
|
|
623
676
|
endpoint_id=endpoint.metadata.uid,
|
|
624
677
|
endpoint_name=endpoint.metadata.name,
|
|
625
678
|
timestamp=endpoint.status.last_request.isoformat(
|