mlrun 1.7.0rc37__py3-none-any.whl → 1.7.0rc38__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of mlrun might be problematic.
- mlrun/alerts/alert.py +4 -3
- mlrun/common/schemas/model_monitoring/constants.py +4 -0
- mlrun/common/schemas/notification.py +3 -3
- mlrun/datastore/azure_blob.py +120 -30
- mlrun/feature_store/common.py +6 -11
- mlrun/model.py +5 -0
- mlrun/model_monitoring/db/stores/v3io_kv/kv_store.py +9 -6
- mlrun/model_monitoring/db/tsdb/base.py +121 -1
- mlrun/model_monitoring/db/tsdb/tdengine/tdengine_connector.py +65 -5
- mlrun/model_monitoring/db/tsdb/v3io/stream_graph_steps.py +23 -1
- mlrun/model_monitoring/db/tsdb/v3io/v3io_connector.py +211 -35
- mlrun/model_monitoring/stream_processing.py +67 -25
- mlrun/projects/operations.py +1 -1
- mlrun/projects/project.py +7 -1
- mlrun/runtimes/__init__.py +15 -8
- mlrun/runtimes/nuclio/application/application.py +45 -5
- mlrun/runtimes/pod.py +2 -2
- mlrun/runtimes/remotesparkjob.py +2 -5
- mlrun/runtimes/sparkjob/spark3job.py +7 -9
- mlrun/serving/v2_serving.py +1 -0
- mlrun/track/trackers/mlflow_tracker.py +5 -0
- mlrun/utils/version/version.json +2 -2
- {mlrun-1.7.0rc37.dist-info → mlrun-1.7.0rc38.dist-info}/METADATA +7 -1
- {mlrun-1.7.0rc37.dist-info → mlrun-1.7.0rc38.dist-info}/RECORD +28 -28
- {mlrun-1.7.0rc37.dist-info → mlrun-1.7.0rc38.dist-info}/WHEEL +1 -1
- {mlrun-1.7.0rc37.dist-info → mlrun-1.7.0rc38.dist-info}/LICENSE +0 -0
- {mlrun-1.7.0rc37.dist-info → mlrun-1.7.0rc38.dist-info}/entry_points.txt +0 -0
- {mlrun-1.7.0rc37.dist-info → mlrun-1.7.0rc38.dist-info}/top_level.txt +0 -0
mlrun/alerts/alert.py
CHANGED
@@ -29,6 +29,7 @@ class AlertConfig(ModelObj):
         "reset_policy",
         "state",
         "count",
+        "created",
     ]
     _fields_to_serialize = ModelObj._fields_to_serialize + [
         "entities",
@@ -55,12 +56,12 @@ class AlertConfig(ModelObj):
         created: str = None,
         count: int = None,
     ):
-        """
-        Alert config object
+        """Alert config object

         Example::
+
             # create an alert on endpoint_id, which will be triggered to slack if there is a "data_drift_detected" event
-            3 times in the next hour.
+            # 3 times in the next hour.
             from mlrun.alerts import AlertConfig
             import mlrun.common.schemas.alert as alert_objects

mlrun/common/schemas/model_monitoring/constants.py
CHANGED
@@ -53,9 +53,11 @@ class EventFieldType:
     PREDICTIONS = "predictions"
     NAMED_PREDICTIONS = "named_predictions"
     ERROR_COUNT = "error_count"
+    MODEL_ERROR = "model_error"
     ENTITIES = "entities"
     FIRST_REQUEST = "first_request"
     LAST_REQUEST = "last_request"
+    LAST_REQUEST_TIMESTAMP = "last_request_timestamp"
     METRIC = "metric"
     METRICS = "metrics"
     BATCH_INTERVALS_DICT = "batch_intervals_dict"
@@ -217,6 +219,7 @@ class FileTargetKind:
     APP_METRICS = "app_metrics"
     MONITORING_SCHEDULES = "monitoring_schedules"
     MONITORING_APPLICATION = "monitoring_application"
+    ERRORS = "errors"


 class ModelMonitoringMode(str, Enum):
@@ -240,6 +243,7 @@ class V3IOTSDBTables(MonitoringStrEnum):
     APP_RESULTS = "app-results"
     METRICS = "metrics"
     EVENTS = "events"
+    ERRORS = "errors"


 class TDEngineSuperTables(MonitoringStrEnum):
mlrun/common/schemas/notification.py
CHANGED
@@ -71,9 +71,9 @@ class Notification(pydantic.BaseModel):

     kind: NotificationKind
     name: str
-    message: str
-    severity: NotificationSeverity
-    when: list[str]
+    message: typing.Optional[str] = None
+    severity: typing.Optional[NotificationSeverity] = None
+    when: typing.Optional[list[str]] = None
     condition: typing.Optional[str] = None
     params: typing.Optional[dict[str, typing.Any]] = None
     status: typing.Optional[NotificationStatus] = None
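With message, severity, and when now optional, a Notification schema object validates with only kind and name set; the remaining fields default to None. A minimal sketch of the relaxed schema; the "slack" kind value is an assumption about the NotificationKind enum and is not shown in this diff:

    from mlrun.common.schemas.notification import Notification

    # message, severity and when may now be omitted; they default to None
    notification = Notification(kind="slack", name="drift-alerts")  # "slack" is an assumed enum value
    assert notification.message is None
    assert notification.severity is None
    assert notification.when is None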
mlrun/datastore/azure_blob.py
CHANGED
@@ -16,6 +16,7 @@ import time
 from pathlib import Path
 from urllib.parse import urlparse

+from azure.storage.blob import BlobServiceClient
 from azure.storage.blob._shared.base_client import parse_connection_str
 from fsspec.registry import get_filesystem_class

@@ -29,47 +30,128 @@ from .base import DataStore, FileStats, makeDatastoreSchemaSanitizer

 class AzureBlobStore(DataStore):
     using_bucket = True
+    max_concurrency = 100
+    max_blocksize = 1024 * 1024 * 4
+    max_single_put_size = (
+        1024 * 1024 * 8
+    ) # for service_client property only, does not affect filesystem

     def __init__(self, parent, schema, name, endpoint="", secrets: dict = None):
         super().__init__(parent, name, schema, endpoint, secrets=secrets)
+        self._service_client = None
+        self._storage_options = None
+
+    @property
+    def storage_options(self):
+        if not self._storage_options:
+            res = dict(
+                account_name=self._get_secret_or_env("account_name")
+                or self._get_secret_or_env("AZURE_STORAGE_ACCOUNT_NAME"),
+                account_key=self._get_secret_or_env("account_key")
+                or self._get_secret_or_env("AZURE_STORAGE_ACCOUNT_KEY"),
+                connection_string=self._get_secret_or_env("connection_string")
+                or self._get_secret_or_env("AZURE_STORAGE_CONNECTION_STRING"),
+                tenant_id=self._get_secret_or_env("tenant_id")
+                or self._get_secret_or_env("AZURE_STORAGE_TENANT_ID"),
+                client_id=self._get_secret_or_env("client_id")
+                or self._get_secret_or_env("AZURE_STORAGE_CLIENT_ID"),
+                client_secret=self._get_secret_or_env("client_secret")
+                or self._get_secret_or_env("AZURE_STORAGE_CLIENT_SECRET"),
+                sas_token=self._get_secret_or_env("sas_token")
+                or self._get_secret_or_env("AZURE_STORAGE_SAS_TOKEN"),
+                credential=self._get_secret_or_env("credential"),
+            )
+            self._storage_options = self._sanitize_storage_options(res)
+        return self._storage_options

     @property
     def filesystem(self):
         """return fsspec file system object, if supported"""
-        if self._filesystem:
-            return self._filesystem
         try:
             import adlfs # noqa
         except ImportError as exc:
             raise ImportError("Azure adlfs not installed") from exc
-        (7 removed lines; only the fragment "filesystem_class" is visible in this view)
+
+        if not self._filesystem:
+            # in order to support az and wasbs kinds
+            filesystem_class = get_filesystem_class(protocol=self.kind)
+            self._filesystem = makeDatastoreSchemaSanitizer(
+                filesystem_class,
+                using_bucket=self.using_bucket,
+                blocksize=self.max_blocksize,
+                **self.storage_options,
+            )
         return self._filesystem

-    (19 removed lines not shown in this view)
+    @property
+    def service_client(self):
+        try:
+            import azure # noqa
+        except ImportError as exc:
+            raise ImportError("Azure not installed") from exc
+
+        if not self._service_client:
+            self._do_connect()
+        return self._service_client
+
+    def _do_connect(self):
+        """
+        Creates a client for azure.
+        Raises MLRunInvalidArgumentError if none of the connection details are available
+        based on do_connect in AzureBlobFileSystem:
+        https://github.com/fsspec/adlfs/blob/2023.9.0/adlfs/spec.py#L422
+        """
+        from azure.identity import ClientSecretCredential
+
+        storage_options = self.storage_options
+        connection_string = storage_options.get("connection_string")
+        client_name = storage_options.get("account_name")
+        account_key = storage_options.get("account_key")
+        sas_token = storage_options.get("sas_token")
+        client_id = storage_options.get("client_id")
+        credential = storage_options.get("credential")
+
+        credential_from_client_id = None
+        if (
+            credential is None
+            and account_key is None
+            and sas_token is None
+            and client_id is not None
+        ):
+            credential_from_client_id = ClientSecretCredential(
+                tenant_id=storage_options.get("tenant_id"),
+                client_id=client_id,
+                client_secret=storage_options.get("client_secret"),
+            )
+        try:
+            if connection_string is not None:
+                self._service_client = BlobServiceClient.from_connection_string(
+                    conn_str=connection_string,
+                    max_block_size=self.max_blocksize,
+                    max_single_put_size=self.max_single_put_size,
+                )
+            elif client_name is not None:
+                account_url = f"https://{client_name}.blob.core.windows.net"
+                cred = credential_from_client_id or credential or account_key
+                if not cred and sas_token is not None:
+                    if not sas_token.startswith("?"):
+                        sas_token = f"?{sas_token}"
+                    account_url = account_url + sas_token
+                self._service_client = BlobServiceClient(
+                    account_url=account_url,
+                    credential=cred,
+                    max_block_size=self.max_blocksize,
+                    max_single_put_size=self.max_single_put_size,
+                )
+            else:
+                raise mlrun.errors.MLRunInvalidArgumentError(
+                    "Must provide either a connection_string or account_name with credentials"
+                )
+        except Exception as e:
+            raise mlrun.errors.MLRunInvalidArgumentError(
+                f"unable to connect to account for {e}"
+            )

     def _convert_key_to_remote_path(self, key):
         key = key.strip("/")
@@ -82,7 +164,15 @@ class AzureBlobStore(DataStore):

     def upload(self, key, src_path):
         remote_path = self._convert_key_to_remote_path(key)
-        (1 removed line not shown in this view)
+        container, remote_path = remote_path.split("/", 1)
+        container_client = self.service_client.get_container_client(container=container)
+        with open(file=src_path, mode="rb") as data:
+            container_client.upload_blob(
+                name=remote_path,
+                data=data,
+                overwrite=True,
+                max_concurrency=self.max_concurrency,
+            )

     def get(self, key, size=None, offset=0):
         remote_path = self._convert_key_to_remote_path(key)
@@ -135,7 +225,7 @@ class AzureBlobStore(DataStore):

     def get_spark_options(self):
         res = {}
-        st = self.
+        st = self.storage_options()
         service = "blob"
         primary_url = None
         if st.get("connection_string"):
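The upload path above now goes through the Azure SDK's BlobServiceClient instead of the fsspec filesystem. For orientation, a standalone sketch of the same client pattern outside mlrun; the connection string, container, and blob names are placeholders:

    from azure.storage.blob import BlobServiceClient

    service_client = BlobServiceClient.from_connection_string(
        conn_str="<AZURE_STORAGE_CONNECTION_STRING>",  # placeholder
        max_block_size=4 * 1024 * 1024,
        max_single_put_size=8 * 1024 * 1024,
    )
    container_client = service_client.get_container_client(container="my-container")
    with open("model.pkl", "rb") as data:
        # overwrite and max_concurrency mirror the defaults used by AzureBlobStore.upload above
        container_client.upload_blob(
            name="models/model.pkl", data=data, overwrite=True, max_concurrency=100
        )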
mlrun/feature_store/common.py
CHANGED
@@ -37,17 +37,12 @@ def parse_feature_string(feature):
         raise mlrun.errors.MLRunInvalidArgumentError(
             f"feature {feature} must be {expected_message}"
         )
-    (5 removed lines not shown in this view)
-    feature_set
-    feature_name = splitted[1]
-    splitted = feature_name.split(" as ")
-    if len(splitted) > 1:
-        return feature_set.strip(), splitted[0].strip(), splitted[1].strip()
-    return feature_set.strip(), feature_name.strip(), None
+    feature_set, feature_name = feature.rsplit(feature_separator, 1)
+    feature_set = feature_set.strip()
+    split_result = feature_name.split(" as ", 1)
+    feature_name = split_result[0].strip()
+    alias = split_result[1].strip() if len(split_result) > 1 else None
+    return feature_set, feature_name, alias


 def parse_project_name_from_feature_string(feature):
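Following the rewritten parser above, a quick illustration of its behavior, assuming the module-level feature_separator is the usual "." used in mlrun feature references:

    from mlrun.feature_store.common import parse_feature_string

    # plain feature reference: no alias
    parse_feature_string("transactions.amount")         # -> ("transactions", "amount", None)

    # "as" introduces an alias for the returned column
    parse_feature_string("transactions.amount as amt")  # -> ("transactions", "amount", "amt")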
mlrun/model.py
CHANGED
@@ -1789,6 +1789,11 @@ class RunObject(RunTemplate):

         return state

+    def abort(self):
+        """abort the run"""
+        db = mlrun.get_run_db()
+        db.abort_run(self.metadata.uid, self.metadata.project)
+
     @staticmethod
     def create_uri(project: str, uid: str, iteration: Union[int, str], tag: str = ""):
         if tag:
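The new RunObject.abort helper lets a run handle abort itself instead of calling the run DB client by hand. A minimal usage sketch; the function file, handler, and image are placeholders:

    import mlrun

    fn = mlrun.code_to_function(
        "trainer", filename="trainer.py", kind="job", image="mlrun/mlrun"  # placeholders
    )
    run = fn.run(handler="train", watch=False)

    # equivalent to mlrun.get_run_db().abort_run(run.metadata.uid, run.metadata.project)
    run.abort()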
mlrun/model_monitoring/db/stores/v3io_kv/kv_store.py
CHANGED
@@ -11,7 +11,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-(1 removed line not shown in this view)
+import http
 import json
 import typing
 from dataclasses import dataclass
@@ -417,11 +417,14 @@ class KVStoreBase(StoreBase):
             )
             return response.output.item[mm_schemas.SchedulingKeys.LAST_ANALYZED]
         except v3io.dataplane.response.HttpResponseError as err:
-            (5 removed lines not shown in this view)
+            if err.status_code == http.HTTPStatus.NOT_FOUND:
+                logger.debug("Last analyzed time not found", err=err)
+                raise mlrun.errors.MLRunNotFoundError(
+                    f"No last analyzed value has been found for {application_name} "
+                    f"that processes model endpoint {endpoint_id}",
+                )
+            logger.error("Error while getting last analyzed time", err=err)
+            raise err

     def update_last_analyzed(
         self, endpoint_id: str, application_name: str, last_analyzed: int
mlrun/model_monitoring/db/tsdb/base.py
CHANGED
@@ -15,6 +15,7 @@
 import typing
 from abc import ABC, abstractmethod
 from datetime import datetime
+from typing import Union

 import pandas as pd
 import pydantic
@@ -47,7 +48,7 @@ class TSDBConnector(ABC):
         self.project = project

     @abstractmethod
-    def apply_monitoring_stream_steps(self, graph):
+    def apply_monitoring_stream_steps(self, graph) -> None:
         """
         Apply TSDB steps on the provided monitoring graph. Throughout these steps, the graph stores live data of
         different key metric dictionaries. This data is being used by the monitoring dashboards in
@@ -59,6 +60,14 @@ class TSDBConnector(ABC):
         """
         pass

+    @abstractmethod
+    def handle_model_error(self, graph, **kwargs) -> None:
+        """
+        Adds a branch to the stream pod graph to handle events that
+        arrive with errors from the model server and saves them to the error TSDB table.
+        The first step that generates by this method should come after `ForwardError` step.
+        """
+
     @abstractmethod
     def write_application_event(
         self,
@@ -181,6 +190,117 @@ class TSDBConnector(ABC):
         :return: Metric values object or no data object.
         """

+    @abstractmethod
+    def get_last_request(
+        self,
+        endpoint_ids: Union[str, list[str]],
+        start: Union[datetime, str] = "0",
+        end: Union[datetime, str] = "now",
+    ) -> pd.DataFrame:
+        """
+        Fetches data from the predictions TSDB table and returns the most recent request
+        timestamp for each specified endpoint.
+
+        :param endpoint_ids: A list of model endpoint identifiers.
+        :param start: The start time for the query.
+        :param end: The end time for the query.
+
+        :return: A pd.DataFrame containing the columns [endpoint_id, last_request, last_latency].
+            If an endpoint has not been invoked within the specified time range, it will not appear in the result.
+        """
+
+    @abstractmethod
+    def get_drift_status(
+        self,
+        endpoint_ids: Union[str, list[str]],
+        start: Union[datetime, str] = "now-24h",
+        end: Union[datetime, str] = "now",
+    ) -> pd.DataFrame:
+        """
+        Fetches data from the app-results TSDB table and returns the highest status among all
+        the result in the provided time range, which by default is the last 24 hours, for each specified endpoint.
+
+        :param endpoint_ids: A list of model endpoint identifiers.
+        :param start: The start time for the query.
+        :param end: The end time for the query.
+
+        :return: A pd.DataFrame containing the columns [result_status, endpoint_id].
+            If an endpoint has not been monitored within the specified time range (last 24 hours),
+            it will not appear in the result.
+        """
+
+    @abstractmethod
+    def get_metrics_metadata(
+        self,
+        endpoint_id: str,
+        start: Union[datetime, str] = "0",
+        end: Union[datetime, str] = "now",
+    ) -> pd.DataFrame:
+        """
+        Fetches distinct metrics metadata from the metrics TSDB table for a specified model endpoint.
+
+        :param endpoint_id: The model endpoint identifier.
+        :param start: The start time of the query.
+        :param end: The end time of the query.
+
+        :return: A pd.DataFrame containing all distinct metrics for the specified endpoint within the given time range.
+            Containing the columns [application_name, metric_name, endpoint_id]
+        """
+
+    @abstractmethod
+    def get_results_metadata(
+        self,
+        endpoint_id: str,
+        start: Union[datetime, str] = "0",
+        end: Union[datetime, str] = "now",
+    ) -> pd.DataFrame:
+        """
+        Fetches distinct results metadata from the app-results TSDB table for a specified model endpoint.
+
+        :param endpoint_id: The model endpoint identifier.
+        :param start: The start time of the query.
+        :param end: The end time of the query.
+
+        :return: A pd.DataFrame containing all distinct results for the specified endpoint within the given time range.
+            Containing the columns [application_name, result_name, result_kind, endpoint_id]
+        """
+
+    @abstractmethod
+    def get_error_count(
+        self,
+        endpoint_ids: Union[str, list[str]],
+        start: Union[datetime, str] = "0",
+        end: Union[datetime, str] = "now",
+    ) -> pd.DataFrame:
+        """
+        Fetches data from the error TSDB table and returns the error count for each specified endpoint.
+
+        :param endpoint_ids: A list of model endpoint identifiers.
+        :param start: The start time for the query.
+        :param end: The end time for the query.
+
+        :return: A pd.DataFrame containing the columns [error_count, endpoint_id].
+            If an endpoint have not raised error within the specified time range, it will not appear in the result.
+        """
+
+    @abstractmethod
+    def get_avg_latency(
+        self,
+        endpoint_ids: Union[str, list[str]],
+        start: Union[datetime, str] = "0",
+        end: Union[datetime, str] = "now",
+    ) -> pd.DataFrame:
+        """
+        Fetches data from the predictions TSDB table and returns the average latency for each specified endpoint
+
+        :param endpoint_ids: A list of model endpoint identifiers.
+        :param start: The start time for the query.
+        :param end: The end time for the query.
+
+        :return: A pd.DataFrame containing the columns [avg_latency, endpoint_id].
+            If an endpoint has not been invoked within the specified time range, it will not appear in the result.
+        """
+
     @staticmethod
     def df_to_metrics_values(
         *,
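For orientation, the per-endpoint frames these new query helpers document can be pictured with a small pandas sketch; the values are purely illustrative and the real frames come from the concrete TSDB connectors:

    import pandas as pd

    # shape documented for get_last_request(): one row per endpoint seen in the window
    last_request_df = pd.DataFrame(
        {
            "endpoint_id": ["ep-1", "ep-2"],
            "last_request": pd.to_datetime(["2024-06-01 12:00:00", "2024-06-01 12:05:00"]),
            "last_latency": [12.5, 8.0],
        }
    )

    # shape documented for get_error_count(): endpoints with no errors are simply absent
    error_count_df = pd.DataFrame({"error_count": [3], "endpoint_id": ["ep-1"]})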
mlrun/model_monitoring/db/tsdb/tdengine/tdengine_connector.py
CHANGED
@@ -14,6 +14,7 @@

 import typing
 from datetime import datetime
+from typing import Union

 import pandas as pd
 import taosws
@@ -156,6 +157,9 @@ class TDEngineConnector(TSDBConnector):
             after="ProcessBeforeTDEngine",
         )

+    def handle_model_error(self, graph, **kwargs) -> None:
+        pass
+
     def delete_tsdb_resources(self):
         """
         Delete all project resources in the TSDB connector, such as model endpoints data and drift results.
@@ -246,11 +250,9 @@
             raise mlrun.errors.MLRunInvalidArgumentError(
                 f"Failed to query table {table} in database {self.database}, {str(e)}"
             )
-        columns = []
-        for column in query_result.fields:
-            columns.append(column.name())

-        (1 removed line not shown in this view)
+        df_columns = [field.name() for field in query_result.fields]
+        return pd.DataFrame(query_result, columns=df_columns)

     def read_metrics_data(
         self,
@@ -274,13 +276,22 @@
             ],
         ],
     ]:
+        timestamp_column = mm_schemas.WriterEvent.END_INFER_TIME
+        columns = [timestamp_column, mm_schemas.WriterEvent.APPLICATION_NAME]
         if type == "metrics":
             table = mm_schemas.TDEngineSuperTables.METRICS
             name = mm_schemas.MetricData.METRIC_NAME
+            columns += [name, mm_schemas.MetricData.METRIC_VALUE]
             df_handler = self.df_to_metrics_values
         elif type == "results":
             table = mm_schemas.TDEngineSuperTables.APP_RESULTS
             name = mm_schemas.ResultData.RESULT_NAME
+            columns += [
+                name,
+                mm_schemas.ResultData.RESULT_VALUE,
+                mm_schemas.ResultData.RESULT_STATUS,
+                mm_schemas.ResultData.RESULT_KIND,
+            ]
             df_handler = self.df_to_results_values
         else:
             raise mlrun.errors.MLRunInvalidArgumentError(
@@ -300,7 +311,8 @@
             start=start,
             end=end,
             filter_query=filter_query,
-            timestamp_column=
+            timestamp_column=timestamp_column,
+            columns=columns,
         )

         df[mm_schemas.WriterEvent.END_INFER_TIME] = pd.to_datetime(
@@ -377,6 +389,54 @@
             ), # pyright: ignore[reportArgumentType]
         )

+    def get_last_request(
+        self,
+        endpoint_ids: Union[str, list[str]],
+        start: Union[datetime, str] = "0",
+        end: Union[datetime, str] = "now",
+    ) -> pd.DataFrame:
+        pass
+
+    def get_drift_status(
+        self,
+        endpoint_ids: Union[str, list[str]],
+        start: Union[datetime, str] = "now-24h",
+        end: Union[datetime, str] = "now",
+    ) -> pd.DataFrame:
+        pass
+
+    def get_metrics_metadata(
+        self,
+        endpoint_id: str,
+        start: Union[datetime, str] = "0",
+        end: Union[datetime, str] = "now",
+    ) -> pd.DataFrame:
+        pass
+
+    def get_results_metadata(
+        self,
+        endpoint_id: str,
+        start: Union[datetime, str] = "0",
+        end: Union[datetime, str] = "now",
+    ) -> pd.DataFrame:
+        pass
+
+    def get_error_count(
+        self,
+        endpoint_ids: Union[str, list[str]],
+        start: Union[datetime, str] = "0",
+        end: Union[datetime, str] = "now",
+    ) -> pd.DataFrame:
+        pass
+
+    def get_avg_latency(
+        self,
+        endpoint_ids: Union[str, list[str]],
+        start: Union[datetime, str] = "0",
+        end: Union[datetime, str] = "now",
+    ) -> pd.DataFrame:
+        pass
+
     # Note: this function serves as a reference for checking the TSDB for the existence of a metric.
     #
     # def read_prediction_metric_for_endpoint_if_exists(
mlrun/model_monitoring/db/tsdb/v3io/stream_graph_steps.py
CHANGED
@@ -11,7 +11,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-(1 removed line not shown in this view)
+from datetime import datetime
 from typing import Any

 import mlrun.feature_store.steps
@@ -20,6 +20,7 @@ from mlrun.common.schemas.model_monitoring import (
     EventKeyMetrics,
     EventLiveStats,
 )
+from mlrun.utils import logger


 def _normalize_dict_for_v3io_frames(event: dict[str, Any]) -> dict[str, Any]:
@@ -134,3 +135,24 @@ class FilterAndUnpackKeys(mlrun.feature_store.steps.MapClass):
             else:
                 unpacked[key] = new_event[key]
         return unpacked if unpacked else None
+
+
+class ErrorExtractor(mlrun.feature_store.steps.MapClass):
+    def __init__(self, **kwargs):
+        """
+        Prepare the event for insertion into the errors TSDB table.
+        """
+        super().__init__(**kwargs)
+
+    def do(self, event):
+        error = event.get("error")
+        timestamp = datetime.fromisoformat(event.get("when"))
+        endpoint_id = event[EventFieldType.ENDPOINT_ID]
+        event = {
+            EventFieldType.MODEL_ERROR: str(error),
+            EventFieldType.ENDPOINT_ID: endpoint_id,
+            EventFieldType.TIMESTAMP: timestamp,
+            EventFieldType.ERROR_COUNT: 1.0,
+        }
+        logger.info("Write error to errors TSDB table", event=event)
+        return event