mlrun 1.10.0rc13__py3-none-any.whl → 1.10.0rc15__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mlrun/artifacts/base.py +0 -31
- mlrun/artifacts/llm_prompt.py +106 -20
- mlrun/artifacts/manager.py +0 -5
- mlrun/common/constants.py +0 -1
- mlrun/common/schemas/__init__.py +1 -0
- mlrun/common/schemas/model_monitoring/__init__.py +1 -0
- mlrun/common/schemas/model_monitoring/functions.py +1 -1
- mlrun/common/schemas/model_monitoring/model_endpoints.py +10 -0
- mlrun/common/schemas/workflow.py +0 -1
- mlrun/config.py +1 -1
- mlrun/datastore/model_provider/model_provider.py +42 -14
- mlrun/datastore/model_provider/openai_provider.py +96 -15
- mlrun/db/base.py +14 -0
- mlrun/db/httpdb.py +42 -9
- mlrun/db/nopdb.py +8 -0
- mlrun/execution.py +16 -7
- mlrun/model.py +15 -0
- mlrun/model_monitoring/__init__.py +1 -0
- mlrun/model_monitoring/applications/base.py +176 -20
- mlrun/model_monitoring/db/_schedules.py +84 -24
- mlrun/model_monitoring/db/tsdb/base.py +72 -1
- mlrun/model_monitoring/db/tsdb/tdengine/schemas.py +7 -1
- mlrun/model_monitoring/db/tsdb/tdengine/tdengine_connector.py +37 -0
- mlrun/model_monitoring/db/tsdb/v3io/v3io_connector.py +25 -0
- mlrun/model_monitoring/helpers.py +26 -4
- mlrun/projects/project.py +38 -12
- mlrun/runtimes/daskjob.py +6 -0
- mlrun/runtimes/mpijob/abstract.py +6 -0
- mlrun/runtimes/mpijob/v1.py +6 -0
- mlrun/runtimes/nuclio/application/application.py +2 -0
- mlrun/runtimes/nuclio/function.py +6 -0
- mlrun/runtimes/nuclio/serving.py +12 -11
- mlrun/runtimes/pod.py +21 -0
- mlrun/runtimes/remotesparkjob.py +6 -0
- mlrun/runtimes/sparkjob/spark3job.py +6 -0
- mlrun/serving/__init__.py +2 -0
- mlrun/serving/server.py +95 -26
- mlrun/serving/states.py +130 -10
- mlrun/utils/helpers.py +36 -12
- mlrun/utils/retryer.py +15 -2
- mlrun/utils/version/version.json +2 -2
- {mlrun-1.10.0rc13.dist-info → mlrun-1.10.0rc15.dist-info}/METADATA +3 -8
- {mlrun-1.10.0rc13.dist-info → mlrun-1.10.0rc15.dist-info}/RECORD +47 -47
- {mlrun-1.10.0rc13.dist-info → mlrun-1.10.0rc15.dist-info}/WHEEL +0 -0
- {mlrun-1.10.0rc13.dist-info → mlrun-1.10.0rc15.dist-info}/entry_points.txt +0 -0
- {mlrun-1.10.0rc13.dist-info → mlrun-1.10.0rc15.dist-info}/licenses/LICENSE +0 -0
- {mlrun-1.10.0rc13.dist-info → mlrun-1.10.0rc15.dist-info}/top_level.txt +0 -0
mlrun/db/httpdb.py
CHANGED
@@ -757,7 +757,7 @@ class HTTPRunDB(RunDBInterface):
         )
         if response.status_code == http.HTTPStatus.ACCEPTED:
             background_task = mlrun.common.schemas.BackgroundTask(**response.json())
-            return self._wait_for_background_task_to_reach_terminal_state(
+            return self.wait_for_background_task_to_reach_terminal_state(
                 background_task.metadata.name, project=project
             )
         return None

@@ -784,7 +784,7 @@ class HTTPRunDB(RunDBInterface):
         )
         if response.status_code == http.HTTPStatus.ACCEPTED:
             background_task = mlrun.common.schemas.BackgroundTask(**response.json())
-            background_task = self._wait_for_background_task_to_reach_terminal_state(
+            background_task = self.wait_for_background_task_to_reach_terminal_state(
                 background_task.metadata.name, project=project
             )
             if (

@@ -839,7 +839,7 @@ class HTTPRunDB(RunDBInterface):
         )
         if response.status_code == http.HTTPStatus.ACCEPTED:
             background_task = mlrun.common.schemas.BackgroundTask(**response.json())
-            background_task = self._wait_for_background_task_to_reach_terminal_state(
+            background_task = self.wait_for_background_task_to_reach_terminal_state(
                 background_task.metadata.name, project=project
             )
             if (

@@ -1485,7 +1485,7 @@ class HTTPRunDB(RunDBInterface):
             "Function is being deleted", project_name=project, function_name=name
         )
         background_task = mlrun.common.schemas.BackgroundTask(**response.json())
-        background_task = self._wait_for_background_task_to_reach_terminal_state(
+        background_task = self.wait_for_background_task_to_reach_terminal_state(
            background_task.metadata.name, project=project
        )
        if (

@@ -3274,7 +3274,7 @@ class HTTPRunDB(RunDBInterface):
         if response.status_code == http.HTTPStatus.ACCEPTED:
             logger.info("Waiting for project to be deleted", project_name=name)
             background_task = mlrun.common.schemas.BackgroundTask(**response.json())
-            background_task = self._wait_for_background_task_to_reach_terminal_state(
+            background_task = self.wait_for_background_task_to_reach_terminal_state(
                 background_task.metadata.name
             )
             if (

@@ -3387,7 +3387,7 @@ class HTTPRunDB(RunDBInterface):
             _verify_project_in_terminal_state,
         )

-    def _wait_for_background_task_to_reach_terminal_state(
+    def wait_for_background_task_to_reach_terminal_state(
         self, name: str, project: str = ""
     ) -> mlrun.common.schemas.BackgroundTask:
         def _verify_background_task_in_terminal_state():

@@ -3408,6 +3408,7 @@ class HTTPRunDB(RunDBInterface):
             logger,
             False,
             _verify_background_task_in_terminal_state,
+            fatal_exceptions=(mlrun.errors.MLRunAccessDeniedError,),
         )

     def create_project_secrets(

@@ -4082,7 +4083,7 @@ class HTTPRunDB(RunDBInterface):
                 **response.json()
             ).background_tasks
             for task in background_tasks:
-                task = self._wait_for_background_task_to_reach_terminal_state(
+                task = self.wait_for_background_task_to_reach_terminal_state(
                     task.metadata.name, project=project
                 )
                 if (

@@ -4119,7 +4120,7 @@ class HTTPRunDB(RunDBInterface):
                 **response.json()
             ).background_tasks
             for task in background_tasks:
-                task = self._wait_for_background_task_to_reach_terminal_state(
+                task = self.wait_for_background_task_to_reach_terminal_state(
                     task.metadata.name, project=project
                 )
                 if (

@@ -5158,6 +5159,38 @@ class HTTPRunDB(RunDBInterface):
         response = self.api_call("GET", endpoint_path, error_message)
         return mlrun.common.schemas.ProjectSummary(**response.json())

+    def get_drift_over_time(
+        self,
+        project: str,
+        start: Optional[datetime] = None,
+        end: Optional[datetime] = None,
+    ) -> mlrun.common.schemas.model_monitoring.ModelEndpointDriftValues:
+        """
+        Get drift counts over time for the project.
+
+        This method returns a list of tuples, each representing a time-interval (in a granularity set by the
+        duration of the given time range) and the number of suspected drifts and detected drifts in that interval.
+        For a range of 6 hours or less, the granularity is 10 minutes; for a range of 6 hours to 72 hours, the
+        granularity is 1 hour; and for a range of more than 72 hours, the granularity is 24 hours.
+
+        :param project: The name of the project for which to retrieve drift counts.
+        :param start:   Start time of the range to retrieve drift counts from.
+        :param end:     End time of the range to retrieve drift counts from.
+
+        :return: A ModelEndpointDriftValues object containing the drift counts over time.
+        """
+        endpoint_path = f"projects/{project}/model-endpoints/drift-over-time"
+        error_message = f"Failed retrieving drift data for {project}"
+        response = self.api_call(
+            method="GET",
+            path=endpoint_path,
+            error=error_message,
+            params={"start": start, "end": end},
+        )
+        return mlrun.common.schemas.model_monitoring.ModelEndpointDriftValues(
+            **response.json()
+        )
+
     @staticmethod
     def _parse_labels(
         labels: Optional[Union[str, dict[str, Optional[str]], list[str]]],

@@ -5478,7 +5511,7 @@ class HTTPRunDB(RunDBInterface):
     def _wait_for_background_task_from_response(self, response):
         if response.status_code == http.HTTPStatus.ACCEPTED:
             background_task = mlrun.common.schemas.BackgroundTask(**response.json())
-            return self._wait_for_background_task_to_reach_terminal_state(
+            return self.wait_for_background_task_to_reach_terminal_state(
                 background_task.metadata.name
            )
        return None
mlrun/db/nopdb.py
CHANGED
@@ -980,3 +980,11 @@ class NopDB(RunDBInterface):
 
     def get_project_summary(self, project: str):
         pass
+
+    def get_drift_over_time(
+        self,
+        project: str,
+        start: Optional[datetime.datetime] = None,
+        end: Optional[datetime.datetime] = None,
+    ) -> mlrun.common.schemas.model_monitoring.ModelEndpointDriftValues:
+        pass
mlrun/execution.py
CHANGED
@@ -94,6 +94,7 @@ class MLClientCtx:
         self._state_thresholds = {}
         self._retry_spec = {}
         self._retry_count = None
+        self._retries = []

         self._labels = {}
         self._annotations = {}

@@ -468,6 +469,7 @@ class MLClientCtx:
         for key, uri in status.get("artifact_uris", {}).items():
             self._artifacts_manager.artifact_uris[key] = uri
         self._retry_count = status.get("retry_count", self._retry_count)
+        self._retries = status.get("retries", self._retries)
         # if run is a retry, the state needs to move to running
         if include_status:
             self._state = status.get("state", self._state)

@@ -911,7 +913,7 @@ class MLClientCtx:
     def log_llm_prompt(
         self,
         key,
-        prompt_string: Optional[str] = None,
+        prompt_template: Optional[list[dict]] = None,
         prompt_path: Optional[str] = None,
         prompt_legend: Optional[dict] = None,
         model_artifact: Union[ModelArtifact, str] = None,

@@ -935,7 +937,7 @@ class MLClientCtx:
             # Log an inline prompt
             context.log_llm_prompt(
                 key="qa-prompt",
-                prompt_string="question with {place_holder}",
+                prompt_template=[{"role": "user", "content": "question with {place_holder}"}],
                 model_artifact=model,
                 prompt_legend={"question": "user_input"},
                 model_configuration={"temperature": 0.7, "max_tokens": 128},

@@ -943,10 +945,16 @@ class MLClientCtx:
         )

         :param key: Unique name of the artifact.
-        :param prompt_string: The raw prompt string. Cannot be used with `prompt_path`.
+        :param prompt_template: Raw prompt as a list of dicts -
+            [{"role": "system", "content": "You are a {profession} advisor"},
+            {"role": "user", "content": "I need your help with {profession}"}]. Only "role" and "content" keys
+            are allowed, in any casing (upper/lower); keys are converted to lower case.
+            Cannot be used with `prompt_path`.
         :param prompt_path: Path to a file containing the prompt content. Cannot be used with `prompt_string`.
         :param prompt_legend: A dictionary where each key is a placeholder in the prompt (e.g., ``{user_name}``)
-            and the value is a description of what that placeholder represents.
+            and the value is a dictionary holding two keys, "field" and "description". "field" points to the
+            field in the event that holds the placeholder's value; if it is None or missing, the placeholder
+            name is used instead. "description" is an explanation of what the placeholder represents.
             Useful for documenting and clarifying dynamic parts of the prompt.
         :param model_artifact: Reference to the parent model (either `ModelArtifact` or model URI string).
         :param model_configuration: Dictionary of generation parameters (e.g., temperature, max_tokens).

@@ -961,15 +969,15 @@ class MLClientCtx:
         :returns: The logged `LLMPromptArtifact` object.
         """

-        if not prompt_string and not prompt_path:
+        if not prompt_template and not prompt_path:
             raise mlrun.errors.MLRunInvalidArgumentError(
-                "Either 'prompt_string' or 'prompt_path' must be provided"
+                "Either 'prompt_template' or 'prompt_path' must be provided"
             )

         llm_prompt = LLMPromptArtifact(
             key=key,
             project=self.project or "",
-            prompt_string=prompt_string,
+            prompt_template=prompt_template,
             prompt_path=prompt_path,
             prompt_legend=prompt_legend,
             model_artifact=model_artifact,

@@ -1267,6 +1275,7 @@ class MLClientCtx:
                 "start_time": to_date_str(self._start_time),
                 "last_update": to_date_str(self._last_update),
                 "retry_count": self._retry_count,
+                "retries": self._retries,
             },
         }
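
A short sketch of the new template form inside a handler; the legend entry follows the two-key ("field"/"description") format documented above, and all names are illustrative:

def handler(context):
    # Inline chat-style template: only "role" and "content" keys are allowed,
    # and {question} is a placeholder documented via prompt_legend.
    context.log_llm_prompt(
        key="qa-prompt",
        prompt_template=[
            {"role": "system", "content": "You are a helpful advisor"},
            {"role": "user", "content": "{question}"},
        ],
        prompt_legend={
            "question": {"field": "user_input", "description": "the user's question"}
        },
        model_configuration={"temperature": 0.7, "max_tokens": 128},
    )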
mlrun/model.py
CHANGED
@@ -1375,6 +1375,7 @@ class RunStatus(ModelObj):
         notifications: Optional[dict[str, Notification]] = None,
         artifact_uris: Optional[dict[str, str]] = None,
         retry_count: Optional[int] = None,
+        retries: Optional[list[dict]] = None,
     ):
         self.state = state or "created"
         self.status_text = status_text

@@ -1393,6 +1394,7 @@ class RunStatus(ModelObj):
         # Artifact key -> URI mapping, since the full artifacts are not stored in the runs DB table
         self._artifact_uris = artifact_uris or {}
         self._retry_count = retry_count or None
+        self._retries = retries or []

     @classmethod
     def from_dict(

@@ -1461,6 +1463,19 @@ class RunStatus(ModelObj):
         """
         self._retry_count = retry_count

+    @property
+    def retries(self) -> list[dict]:
+        """List of metadata for each retry attempt."""
+        return self._retries
+
+    @retries.setter
+    def retries(self, retries: list[dict]):
+        """
+        Set the list of retry attempt metadata.
+
+        :param retries: A list of dictionaries, each representing a retry attempt.
+        """
+        self._retries = retries
+
     def is_failed(self) -> Optional[bool]:
         """
         This method returns whether a run has failed.
mlrun/model_monitoring/applications/base.py
CHANGED

@@ -17,7 +17,7 @@ import socket
 from abc import ABC, abstractmethod
 from collections import defaultdict
 from collections.abc import Iterator
-from contextlib import contextmanager
+from contextlib import contextmanager, nullcontext
 from datetime import datetime, timedelta
 from typing import Any, Literal, Optional, Union, cast

@@ -31,6 +31,7 @@ import mlrun.errors
 import mlrun.model_monitoring.api as mm_api
 import mlrun.model_monitoring.applications.context as mm_context
 import mlrun.model_monitoring.applications.results as mm_results
+import mlrun.model_monitoring.db._schedules as mm_schedules
 import mlrun.model_monitoring.helpers as mm_helpers
 from mlrun.serving.utils import MonitoringApplicationToDict
 from mlrun.utils import logger

@@ -183,14 +184,27 @@ class ModelMonitoringApplicationBase(MonitoringApplicationToDict, ABC):
         cls,
         *,
         write_output: bool,
+        application_name: str,
+        artifact_path: str,
         stream_profile: Optional[ds_profile.DatastoreProfile],
         project: "mlrun.MlrunProject",
-    ) -> Iterator[dict[str, list[tuple]]]:
+    ) -> Iterator[
+        tuple[
+            dict[str, list[tuple]],
+            Optional[mm_schedules.ModelMonitoringSchedulesFileApplication],
+        ]
+    ]:
         endpoints_output: dict[str, list[tuple]] = defaultdict(list)
+        application_schedules = nullcontext()
         if write_output:
             cls._check_writer_is_up(project)
+            application_schedules = (
+                mm_schedules.ModelMonitoringSchedulesFileApplication(
+                    artifact_path, application=application_name
+                )
+            )
         try:
-            yield endpoints_output
+            yield endpoints_output, application_schedules.__enter__()
         finally:
             if write_output:
                 logger.debug(

@@ -218,6 +232,12 @@ class ModelMonitoringApplicationBase(MonitoringApplicationToDict, ABC):
                     endpoints_output=endpoints_output,
                 )
+            logger.debug(
+                "Saving the application schedules",
+                application_name=application_name,
+            )
+            application_schedules.__exit__(None, None, None)
+

     def _handler(
         self,
         context: "mlrun.MLClientCtx",

@@ -230,6 +250,7 @@ class ModelMonitoringApplicationBase(MonitoringApplicationToDict, ABC):
         end: Optional[str] = None,
         base_period: Optional[int] = None,
         write_output: bool = False,
+        allow_unordered_data: bool = False,
         stream_profile: Optional[ds_profile.DatastoreProfile] = None,
     ):
         """

@@ -250,6 +271,8 @@ class ModelMonitoringApplicationBase(MonitoringApplicationToDict, ABC):
             "working with endpoints, without any custom data-frame input"
         )

+        application_name = self.__class__.__name__
+
         feature_stats = (
             mm_api.get_sample_set_statistics(reference_data)
             if reference_data is not None

@@ -257,8 +280,12 @@ class ModelMonitoringApplicationBase(MonitoringApplicationToDict, ABC):
         )

         with self._push_to_writer(
-            write_output=write_output, stream_profile=stream_profile, project=project
-        ) as endpoints_output:
+            write_output=write_output,
+            stream_profile=stream_profile,
+            application_name=application_name,
+            artifact_path=context.artifact_path,
+            project=project,
+        ) as (endpoints_output, application_schedules):

             def call_do_tracking(event: Optional[dict] = None):
                 nonlocal endpoints_output

@@ -268,7 +295,7 @@ class ModelMonitoringApplicationBase(MonitoringApplicationToDict, ABC):
                 monitoring_context = (
                     mm_context.MonitoringApplicationContext._from_ml_ctx(
                         event=event,
-                        application_name=self.__class__.__name__,
+                        application_name=application_name,
                         context=context,
                         project=project,
                         sample_df=sample_data,

@@ -285,10 +312,16 @@ class ModelMonitoringApplicationBase(MonitoringApplicationToDict, ABC):
             resolved_endpoints = self._handle_endpoints_type_evaluate(
                 project=project, endpoints=endpoints
             )
-            for endpoint_name, endpoint_id in resolved_endpoints:
-                for window_start, window_end in self._window_generator(
-                    start=start, end=end, base_period=base_period
-                ):
+            for endpoint_name, endpoint_id in resolved_endpoints:
+                for window_start, window_end in self._window_generator(
+                    start=start,
+                    end=end,
+                    base_period=base_period,
+                    application_schedules=application_schedules,
+                    endpoint_id=endpoint_id,
+                    application_name=application_name,
+                    allow_unordered_data=allow_unordered_data,
+                ):
                     result = call_do_tracking(
                         event={
                             mm_constants.ApplicationEvent.ENDPOINT_NAME: endpoint_name,

@@ -370,8 +403,103 @@ class ModelMonitoringApplicationBase(MonitoringApplicationToDict, ABC):
             )

     @staticmethod
+    def _validate_and_get_window_length(
+        *, base_period: int, start_dt: datetime, end_dt: datetime
+    ) -> timedelta:
+        if not isinstance(base_period, int) or base_period <= 0:
+            raise mlrun.errors.MLRunValueError(
+                "`base_period` must be a positive integer - the number of minutes in a monitoring window"
+            )
+
+        window_length = timedelta(minutes=base_period)
+
+        full_interval_length = end_dt - start_dt
+        remainder = full_interval_length % window_length
+        if remainder:
+            if full_interval_length < window_length:
+                extra_msg = (
+                    "The `base_period` is longer than the difference between `end` and `start`: "
+                    f"{full_interval_length}. Consider not specifying `base_period`."
+                )
+            else:
+                extra_msg = (
+                    f"Consider changing the `end` time to `end`={end_dt - remainder}"
+                )
+            raise mlrun.errors.MLRunValueError(
+                "The difference between `end` and `start` must be a multiple of `base_period`: "
+                f"`base_period`={window_length}, `start`={start_dt}, `end`={end_dt}. "
+                f"{extra_msg}"
+            )
+        return window_length
+
+    @staticmethod
+    def _validate_monotonically_increasing_data(
+        *,
+        application_schedules: Optional[
+            mm_schedules.ModelMonitoringSchedulesFileApplication
+        ],
+        endpoint_id: str,
+        start_dt: datetime,
+        end_dt: datetime,
+        base_period: Optional[int],
+        application_name: str,
+        allow_unordered_data: bool,
+    ) -> datetime:
+        """Make sure that the (app, endpoint) pair doesn't write output before the last analyzed window"""
+        if application_schedules:
+            last_analyzed = application_schedules.get_endpoint_last_analyzed(
+                endpoint_id
+            )
+            if last_analyzed:
+                if start_dt < last_analyzed:
+                    if allow_unordered_data:
+                        if last_analyzed < end_dt and base_period is None:
+                            logger.warn(
+                                "Setting the start time to last_analyzed since the original start time precedes "
+                                "last_analyzed",
+                                original_start=start_dt,
+                                new_start=last_analyzed,
+                                application_name=application_name,
+                                endpoint_id=endpoint_id,
+                            )
+                            start_dt = last_analyzed
+                        else:
+                            raise mlrun.errors.MLRunValueError(
+                                "The start time for the application and endpoint precedes the last analyzed time: "
+                                f"{start_dt=}, {last_analyzed=}, {application_name=}, {endpoint_id=}. "
+                                "Writing data out of order is not supported, and the start time could not be "
+                                "dynamically reset, as last_analyzed is later than the given end time or "
+                                f"base_period was specified ({end_dt=}, {base_period=})."
+                            )
+                    else:
+                        raise mlrun.errors.MLRunValueError(
+                            "The start time for the application and endpoint precedes the last analyzed time: "
+                            f"{start_dt=}, {last_analyzed=}, {application_name=}, {endpoint_id=}. "
+                            "Writing data out of order is not supported. You should change the start time to "
+                            f"'{last_analyzed}' or later."
+                        )
+            else:
+                logger.debug(
+                    "The application is running on the endpoint for the first time",
+                    endpoint_id=endpoint_id,
+                    start_dt=start_dt,
+                    application_name=application_name,
+                )
+        return start_dt
+
+    @classmethod
     def _window_generator(
-        start: Optional[str], end: Optional[str], base_period: Optional[int]
+        cls,
+        *,
+        start: Optional[str],
+        end: Optional[str],
+        base_period: Optional[int],
+        application_schedules: Optional[
+            mm_schedules.ModelMonitoringSchedulesFileApplication
+        ],
+        endpoint_id: str,
+        application_name: str,
+        allow_unordered_data: bool,
     ) -> Iterator[tuple[Optional[datetime], Optional[datetime]]]:
         if start is None or end is None:
             # A single window based on the `sample_data` input - see `_handler`.

@@ -381,20 +509,36 @@ class ModelMonitoringApplicationBase(MonitoringApplicationToDict, ABC):
             start_dt = datetime.fromisoformat(start)
             end_dt = datetime.fromisoformat(end)

+            start_dt = cls._validate_monotonically_increasing_data(
+                application_schedules=application_schedules,
+                endpoint_id=endpoint_id,
+                start_dt=start_dt,
+                end_dt=end_dt,
+                base_period=base_period,
+                application_name=application_name,
+                allow_unordered_data=allow_unordered_data,
+            )
+
             if base_period is None:
                 yield start_dt, end_dt
+                if application_schedules:
+                    application_schedules.update_endpoint_last_analyzed(
+                        endpoint_uid=endpoint_id, last_analyzed=end_dt
+                    )
                 return

-            if not isinstance(base_period, int) or base_period <= 0:
-                raise mlrun.errors.MLRunValueError(
-                    "`base_period` must be a positive integer - the number of minutes in a monitoring window"
-                )
+            window_length = cls._validate_and_get_window_length(
+                base_period=base_period, start_dt=start_dt, end_dt=end_dt
+            )

-            window_length = timedelta(minutes=base_period)
             current_start_time = start_dt
             while current_start_time < end_dt:
                 current_end_time = min(current_start_time + window_length, end_dt)
                 yield current_start_time, current_end_time
+                if application_schedules:
+                    application_schedules.update_endpoint_last_analyzed(
+                        endpoint_uid=endpoint_id, last_analyzed=current_end_time
+                    )
                 current_start_time = current_end_time

     @classmethod
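
The generator above splits the left-open interval (start, end] into base_period-minute windows and records last_analyzed after each yield. A standalone sketch of the same windowing arithmetic, outside mlrun, under the multiple-of-base_period constraint the validator enforces:

from datetime import datetime, timedelta, timezone

def windows(start, end, base_period_minutes):
    """Yield (window_start, window_end] pairs covering (start, end]."""
    window = timedelta(minutes=base_period_minutes)
    if (end - start) % window:
        raise ValueError("end - start must be a multiple of base_period")
    current = start
    while current < end:
        yield current, current + window
        current += window

start = datetime(2025, 1, 1, tzinfo=timezone.utc)
print(list(windows(start, start + timedelta(hours=2), 30)))  # four 30-minute windows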
@@ -484,6 +628,7 @@ class ModelMonitoringApplicationBase(MonitoringApplicationToDict, ABC):
         * ``end``, ``datetime``
         * ``base_period``, ``int``
         * ``write_output``, ``bool``
+        * ``allow_unordered_data``, ``bool``

         For Git sources, add the source archive to the returned job and change the handler:

@@ -567,6 +712,7 @@ class ModelMonitoringApplicationBase(MonitoringApplicationToDict, ABC):
         end: Optional[datetime] = None,
         base_period: Optional[int] = None,
         write_output: bool = False,
+        allow_unordered_data: bool = False,
         stream_profile: Optional[ds_profile.DatastoreProfile] = None,
     ) -> "mlrun.RunObject":
         """

@@ -608,6 +754,8 @@ class ModelMonitoringApplicationBase(MonitoringApplicationToDict, ABC):
        :param start: The start time of the endpoint's data, not included.
                      If you want the model endpoint's data at ``start`` included, you need to subtract a
                      small ``datetime.timedelta`` from it.
+                     Make sure to include the time zone when constructing `datetime.datetime` objects
+                     manually.
        :param end:   The end time of the endpoint's data, included.
                      Please note: when ``start`` and ``end`` are set, they create a left-open time interval
                      ("window") :math:`(\\operatorname{start}, \\operatorname{end}]` that excludes the

@@ -616,17 +764,24 @@ class ModelMonitoringApplicationBase(MonitoringApplicationToDict, ABC):
                      taken in the window's data.
        :param base_period: The window length in minutes. If ``None``, the whole window from ``start`` to ``end``
                      is taken. If an integer is specified, the application is run from ``start`` to ``end``
-                     in ``base_period`` length windows, except the last window that ends at ``end`` and
-                     therefore may be shorter:
+                     in ``base_period`` length windows:
                      :math:`(\\operatorname{start}, \\operatorname{start} + \\operatorname{base\\_period}],
                      (\\operatorname{start} + \\operatorname{base\\_period},
                      \\operatorname{start} + 2\\cdot\\operatorname{base\\_period}],
                      ..., (\\operatorname{start} +
-                     m\\cdot\\operatorname{base\\_period}, \\operatorname{end}]`,
-                     where :math:`m` is a nonnegative integer.
+                     (m - 1)\\cdot\\operatorname{base\\_period}, \\operatorname{end}]`,
+                     where :math:`m` is a positive integer and :math:`\\operatorname{end} =
+                     \\operatorname{start} + m\\cdot\\operatorname{base\\_period}`.
+                     Please note that the difference between ``end`` and ``start`` must be a multiple of
+                     ``base_period``.
        :param write_output: Whether to write the results and metrics to the time-series DB. Can be ``True`` only
                      if ``endpoints`` are passed.
                      Note: the model monitoring infrastructure must be up for the writing to work.
+       :param allow_unordered_data: Relevant only when writing outputs to the database. When ``False``, and the
+                     requested ``start`` time precedes the ``end`` time of a previous run that also
+                     wrote to the database, an error is raised.
+                     If ``True``, when the previously described situation occurs, the relevant time
+                     window is cut so that it starts at the earliest possible time after ``start``.
        :param stream_profile: The stream datastore profile. It should be provided only when running locally and
                      writing the outputs to the database (i.e., when both ``run_local`` and
                      ``write_output`` are set to ``True``).

@@ -666,6 +821,7 @@ class ModelMonitoringApplicationBase(MonitoringApplicationToDict, ABC):
             params["end"] = end.isoformat() if isinstance(end, datetime) else end
             params["base_period"] = base_period
             params["write_output"] = write_output
+            params["allow_unordered_data"] = allow_unordered_data
             if stream_profile:
                 if not run_local:
                     raise mlrun.errors.MLRunValueError(
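                     )

A usage sketch of the new flag; `MyApp` stands in for a hypothetical `ModelMonitoringApplicationBase` subclass, and the endpoint name and times are placeholders. The 2-hour span is an exact multiple of the 30-minute ``base_period``, as the validation above requires:

from datetime import datetime, timedelta, timezone

# Time-zone-aware datetimes, per the note in the docstring above.
end = datetime(2025, 1, 2, 12, 0, tzinfo=timezone.utc)
run = MyApp.evaluate(
    endpoints=["my-endpoint"],      # placeholder endpoint name
    start=end - timedelta(hours=2),
    end=end,
    base_period=30,                 # four 30-minute windows
    write_output=True,
    allow_unordered_data=True,      # clip the first window instead of raising
)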
|