mlrun 1.10.0rc16__py3-none-any.whl → 1.10.1rc4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mlrun/__init__.py +22 -2
- mlrun/artifacts/document.py +6 -1
- mlrun/artifacts/llm_prompt.py +21 -15
- mlrun/artifacts/model.py +3 -3
- mlrun/common/constants.py +9 -0
- mlrun/common/formatters/artifact.py +1 -0
- mlrun/common/model_monitoring/helpers.py +86 -0
- mlrun/common/schemas/__init__.py +2 -0
- mlrun/common/schemas/auth.py +2 -0
- mlrun/common/schemas/function.py +10 -0
- mlrun/common/schemas/hub.py +30 -18
- mlrun/common/schemas/model_monitoring/__init__.py +2 -0
- mlrun/common/schemas/model_monitoring/constants.py +30 -6
- mlrun/common/schemas/model_monitoring/functions.py +13 -4
- mlrun/common/schemas/model_monitoring/model_endpoints.py +11 -0
- mlrun/common/schemas/pipeline.py +1 -1
- mlrun/common/schemas/serving.py +3 -0
- mlrun/common/schemas/workflow.py +1 -0
- mlrun/common/secrets.py +22 -1
- mlrun/config.py +34 -21
- mlrun/datastore/__init__.py +11 -3
- mlrun/datastore/azure_blob.py +162 -47
- mlrun/datastore/base.py +265 -7
- mlrun/datastore/datastore.py +10 -5
- mlrun/datastore/datastore_profile.py +61 -5
- mlrun/datastore/model_provider/huggingface_provider.py +367 -0
- mlrun/datastore/model_provider/mock_model_provider.py +87 -0
- mlrun/datastore/model_provider/model_provider.py +211 -74
- mlrun/datastore/model_provider/openai_provider.py +243 -71
- mlrun/datastore/s3.py +24 -2
- mlrun/datastore/store_resources.py +4 -4
- mlrun/datastore/storeytargets.py +2 -3
- mlrun/datastore/utils.py +15 -3
- mlrun/db/base.py +27 -19
- mlrun/db/httpdb.py +57 -48
- mlrun/db/nopdb.py +25 -10
- mlrun/execution.py +55 -13
- mlrun/hub/__init__.py +15 -0
- mlrun/hub/module.py +181 -0
- mlrun/k8s_utils.py +105 -16
- mlrun/launcher/base.py +13 -6
- mlrun/launcher/local.py +2 -0
- mlrun/model.py +9 -3
- mlrun/model_monitoring/api.py +66 -27
- mlrun/model_monitoring/applications/__init__.py +1 -1
- mlrun/model_monitoring/applications/base.py +388 -138
- mlrun/model_monitoring/applications/context.py +2 -4
- mlrun/model_monitoring/applications/results.py +4 -7
- mlrun/model_monitoring/controller.py +239 -101
- mlrun/model_monitoring/db/_schedules.py +36 -13
- mlrun/model_monitoring/db/_stats.py +4 -3
- mlrun/model_monitoring/db/tsdb/base.py +29 -9
- mlrun/model_monitoring/db/tsdb/tdengine/schemas.py +4 -5
- mlrun/model_monitoring/db/tsdb/tdengine/tdengine_connector.py +154 -50
- mlrun/model_monitoring/db/tsdb/tdengine/writer_graph_steps.py +51 -0
- mlrun/model_monitoring/db/tsdb/v3io/stream_graph_steps.py +17 -4
- mlrun/model_monitoring/db/tsdb/v3io/v3io_connector.py +245 -51
- mlrun/model_monitoring/helpers.py +28 -5
- mlrun/model_monitoring/stream_processing.py +45 -14
- mlrun/model_monitoring/writer.py +220 -1
- mlrun/platforms/__init__.py +3 -2
- mlrun/platforms/iguazio.py +7 -3
- mlrun/projects/operations.py +16 -11
- mlrun/projects/pipelines.py +2 -2
- mlrun/projects/project.py +157 -69
- mlrun/run.py +97 -20
- mlrun/runtimes/__init__.py +18 -0
- mlrun/runtimes/base.py +14 -6
- mlrun/runtimes/daskjob.py +1 -0
- mlrun/runtimes/local.py +5 -2
- mlrun/runtimes/mounts.py +20 -2
- mlrun/runtimes/nuclio/__init__.py +1 -0
- mlrun/runtimes/nuclio/application/application.py +147 -17
- mlrun/runtimes/nuclio/function.py +72 -27
- mlrun/runtimes/nuclio/serving.py +102 -20
- mlrun/runtimes/pod.py +213 -21
- mlrun/runtimes/utils.py +49 -9
- mlrun/secrets.py +54 -13
- mlrun/serving/remote.py +79 -6
- mlrun/serving/routers.py +23 -41
- mlrun/serving/server.py +230 -40
- mlrun/serving/states.py +605 -232
- mlrun/serving/steps.py +62 -0
- mlrun/serving/system_steps.py +136 -81
- mlrun/serving/v2_serving.py +9 -10
- mlrun/utils/helpers.py +215 -83
- mlrun/utils/logger.py +3 -1
- mlrun/utils/notifications/notification/base.py +18 -0
- mlrun/utils/notifications/notification/git.py +2 -4
- mlrun/utils/notifications/notification/mail.py +38 -15
- mlrun/utils/notifications/notification/slack.py +2 -4
- mlrun/utils/notifications/notification/webhook.py +2 -5
- mlrun/utils/notifications/notification_pusher.py +1 -1
- mlrun/utils/version/version.json +2 -2
- {mlrun-1.10.0rc16.dist-info → mlrun-1.10.1rc4.dist-info}/METADATA +51 -50
- {mlrun-1.10.0rc16.dist-info → mlrun-1.10.1rc4.dist-info}/RECORD +100 -95
- mlrun/api/schemas/__init__.py +0 -259
- {mlrun-1.10.0rc16.dist-info → mlrun-1.10.1rc4.dist-info}/WHEEL +0 -0
- {mlrun-1.10.0rc16.dist-info → mlrun-1.10.1rc4.dist-info}/entry_points.txt +0 -0
- {mlrun-1.10.0rc16.dist-info → mlrun-1.10.1rc4.dist-info}/licenses/LICENSE +0 -0
- {mlrun-1.10.0rc16.dist-info → mlrun-1.10.1rc4.dist-info}/top_level.txt +0 -0
@@ -25,10 +25,12 @@ from mlrun.utils import logger

 def _normalize_dict_for_v3io_frames(event: dict[str, Any]) -> dict[str, Any]:
     """
-    Normalize user
-    to a form V3IO frames tolerates.
+    Normalize user-defined keys (e.g., model input data and predictions) to a format V3IO Frames tolerates.

-
+    - Keys must match regex: '^[a-zA-Z_:]([a-zA-Z0-9_:])*$'
+    - Replace invalid characters (e.g., '-') with '_'.
+    - Prefix keys starting with digits with '_'.
+    - Flatten nested dictionaries using dot notation, while normalizing keys recursively.
     """
     prefix = "_"

@@ -38,7 +40,18 @@ def _normalize_dict_for_v3io_frames(event: dict[str, Any]) -> dict[str, Any]:
             return prefix + key
         return key

-
+    def flatten_dict(d: dict[str, Any], parent_key: str = "") -> dict[str, Any]:
+        items = {}
+        for k, v in d.items():
+            new_key = norm_key(k)
+            full_key = f"{parent_key}.{new_key}" if parent_key else new_key
+            if isinstance(v, dict):
+                items.update(flatten_dict(v, full_key))
+            else:
+                items[full_key] = v
+        return items
+
+    return flatten_dict(event)


 class ProcessBeforeTSDB(mlrun.feature_store.steps.MapClass):
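Note: the normalization rules listed in the new docstring can be reproduced with a small standalone sketch. The regex and the flattening behaviour are taken from the hunk above; the helper name and the sample event are illustrative.

import re
from typing import Any


def normalize_for_v3io_frames(event: dict[str, Any]) -> dict[str, Any]:
    # Replace characters outside '^[a-zA-Z_:]([a-zA-Z0-9_:])*$' with '_',
    # prefix keys that start with a digit with '_', and flatten nested
    # dictionaries using dot notation.
    def norm_key(key: str) -> str:
        key = re.sub(r"[^a-zA-Z0-9_:]", "_", key)
        return "_" + key if key and key[0].isdigit() else key

    def flatten(d: dict[str, Any], parent_key: str = "") -> dict[str, Any]:
        items: dict[str, Any] = {}
        for k, v in d.items():
            full_key = f"{parent_key}.{norm_key(k)}" if parent_key else norm_key(k)
            if isinstance(v, dict):
                items.update(flatten(v, full_key))
            else:
                items[full_key] = v
        return items

    return flatten(event)


print(normalize_for_v3io_frames({"my-feature": {"95th percentile": 0.2}}))
# {'my_feature._95th_percentile': 0.2}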
@@ -12,9 +12,9 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import math
-from datetime import datetime, timedelta
+from datetime import datetime, timedelta, timezone
 from io import StringIO
-from typing import
+from typing import Literal, Optional, Union

 import pandas as pd
 import v3io_frames
@@ -25,6 +25,7 @@ import mlrun.common.schemas.model_monitoring as mm_schemas
 import mlrun.feature_store.steps
 import mlrun.utils.v3io_clients
 from mlrun.common.schemas import EventFieldType
+from mlrun.config import config
 from mlrun.model_monitoring.db import TSDBConnector
 from mlrun.model_monitoring.helpers import get_invocations_fqn, get_start_end
 from mlrun.utils import logger
@@ -369,6 +370,49 @@ class V3IOTSDBConnector(TSDBConnector):
         apply_storey_filter()
         apply_tsdb_target(name="tsdb3", after="FilterNotNone")

+    def apply_writer_steps(self, graph, after, **kwargs) -> None:
+        graph.add_step(
+            "storey.TSDBTarget",
+            name="tsdb_metrics",
+            after=after,
+            path=f"{self.container}/{self.tables[mm_schemas.V3IOTSDBTables.METRICS]}",
+            time_col=mm_schemas.WriterEvent.END_INFER_TIME,
+            container=self.container,
+            v3io_frames=self.v3io_framesd,
+            infer_columns_from_data=True,
+            graph_shape="cylinder",
+            index_cols=[
+                mm_schemas.WriterEvent.APPLICATION_NAME,
+                mm_schemas.WriterEvent.ENDPOINT_NAME,
+                mm_schemas.WriterEvent.ENDPOINT_ID,
+                mm_schemas.MetricData.METRIC_NAME,
+            ],
+            max_events=config.model_endpoint_monitoring.writer_graph.max_events,
+            flush_after_seconds=config.model_endpoint_monitoring.writer_graph.flush_after_seconds,
+            key=mm_schemas.EventFieldType.ENDPOINT_ID,
+        )
+
+        graph.add_step(
+            "storey.TSDBTarget",
+            name="tsdb_app_results",
+            after=after,
+            path=f"{self.container}/{self.tables[mm_schemas.V3IOTSDBTables.APP_RESULTS]}",
+            time_col=mm_schemas.WriterEvent.END_INFER_TIME,
+            container=self.container,
+            v3io_frames=self.v3io_framesd,
+            infer_columns_from_data=True,
+            graph_shape="cylinder",
+            index_cols=[
+                mm_schemas.WriterEvent.APPLICATION_NAME,
+                mm_schemas.WriterEvent.ENDPOINT_NAME,
+                mm_schemas.WriterEvent.ENDPOINT_ID,
+                mm_schemas.ResultData.RESULT_NAME,
+            ],
+            max_events=config.model_endpoint_monitoring.writer_graph.max_events,
+            flush_after_seconds=config.model_endpoint_monitoring.writer_graph.flush_after_seconds,
+            key=mm_schemas.EventFieldType.ENDPOINT_ID,
+        )
+
     def handle_model_error(
         self,
         graph,
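Note: both new storey.TSDBTarget steps batch their writes using two new configuration keys. A minimal sketch of overriding them through mlrun.mlconf, assuming the config paths mirror the config.model_endpoint_monitoring.writer_graph.* references in the hunk above (the values are examples only):

import mlrun

# Assumed config paths, mirroring the references in apply_writer_steps(); values are illustrative.
mlrun.mlconf.model_endpoint_monitoring.writer_graph.max_events = 1000
mlrun.mlconf.model_endpoint_monitoring.writer_graph.flush_after_seconds = 120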
@@ -492,7 +536,8 @@ class V3IOTSDBConnector(TSDBConnector):
         # Split the endpoint ids into chunks to avoid exceeding the v3io-engine filter-expression limit
         for i in range(0, len(endpoint_ids), V3IO_FRAMESD_MEPS_LIMIT):
             endpoint_id_chunk = endpoint_ids[i : i + V3IO_FRAMESD_MEPS_LIMIT]
-
+            endpoints_list = "', '".join(endpoint_id_chunk)
+            filter_query = f"endpoint_id IN('{endpoints_list}')"
             for table in tables:
                 try:
                     self.frames_client.delete(
@@ -532,6 +577,43 @@ class V3IOTSDBConnector(TSDBConnector):
                         project=self.project,
                     )

+    def delete_application_records(
+        self, application_name: str, endpoint_ids: Optional[list[str]] = None
+    ) -> None:
+        """
+        Delete application records from the TSDB for the given model endpoints or all if ``endpoint_ids`` is ``None``.
+        """
+        base_filter_query = f"application_name=='{application_name}'"
+
+        filter_queries: list[str] = []
+        if endpoint_ids:
+            for i in range(0, len(endpoint_ids), V3IO_FRAMESD_MEPS_LIMIT):
+                endpoint_id_chunk = endpoint_ids[i : i + V3IO_FRAMESD_MEPS_LIMIT]
+                endpoints_list = "', '".join(endpoint_id_chunk)
+                filter_queries.append(
+                    f"{base_filter_query} AND endpoint_id IN ('{endpoints_list}')"
+                )
+        else:
+            filter_queries = [base_filter_query]
+
+        for table in [
+            self.tables[mm_schemas.V3IOTSDBTables.APP_RESULTS],
+            self.tables[mm_schemas.V3IOTSDBTables.METRICS],
+        ]:
+            logger.debug(
+                "Deleting application records from TSDB",
+                table=table,
+                filter_queries=filter_queries,
+                project=self.project,
+            )
+            for filter_query in filter_queries:
+                self.frames_client.delete(
+                    backend=_TSDB_BE,
+                    table=table,
+                    filter=filter_query,
+                    start="0",
+                )
+
     def get_model_endpoint_real_time_metrics(
         self, endpoint_id: str, metrics: list[str], start: str, end: str
     ) -> dict[str, list[tuple[str, float]]]:
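Note: delete_application_records builds one filter expression per chunk of endpoint ids so a single query never exceeds the v3io-frames filter-expression limit. A standalone sketch of the query construction (the limit value is illustrative; in the connector it comes from V3IO_FRAMESD_MEPS_LIMIT):

from typing import Optional

V3IO_FRAMESD_MEPS_LIMIT = 100  # illustrative value


def build_filter_queries(application_name: str, endpoint_ids: Optional[list[str]] = None) -> list[str]:
    base = f"application_name=='{application_name}'"
    if not endpoint_ids:
        return [base]
    queries = []
    for i in range(0, len(endpoint_ids), V3IO_FRAMESD_MEPS_LIMIT):
        chunk = endpoint_ids[i : i + V3IO_FRAMESD_MEPS_LIMIT]
        ids = "', '".join(chunk)
        queries.append(f"{base} AND endpoint_id IN ('{ids}')")
    return queries


print(build_filter_queries("my-app", ["ep-1", "ep-2"]))
# ["application_name=='my-app' AND endpoint_id IN ('ep-1', 'ep-2')"]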
@@ -935,6 +1017,9 @@ class V3IOTSDBConnector(TSDBConnector):
         start: Optional[datetime] = None,
         end: Optional[datetime] = None,
     ) -> dict[str, float]:
+        if not endpoint_ids:
+            return {}
+
         # Get the last request timestamp for each endpoint from the KV table.
         # The result of the query is a list of dictionaries,
         # each dictionary contains the endpoint id and the last request timestamp.
@@ -1145,11 +1230,9 @@ class V3IOTSDBConnector(TSDBConnector):
         )
         return df.reset_index(drop=True)

-
+    def add_basic_metrics(
         self,
         model_endpoint_objects: list[mlrun.common.schemas.ModelEndpoint],
-        project: str,
-        run_in_threadpool: Callable,
         metric_list: Optional[list[str]] = None,
     ) -> list[mlrun.common.schemas.ModelEndpoint]:
         """
@@ -1157,8 +1240,6 @@ class V3IOTSDBConnector(TSDBConnector):

         :param model_endpoint_objects: A list of `ModelEndpoint` objects that will
                                        be filled with the relevant basic metrics.
-        :param project: The name of the project.
-        :param run_in_threadpool: A function that runs another function in a thread pool.
         :param metric_list: List of metrics to include from the time series DB. Defaults to all metrics.

         :return: A list of `ModelEndpointMonitoringMetric` objects.
@@ -1187,8 +1268,7 @@ class V3IOTSDBConnector(TSDBConnector):
             function,
             _,
         ) in metric_name_to_function_and_column_name.items():
-            metric_name_to_result[metric_name] =
-                function,
+            metric_name_to_result[metric_name] = function(
                 endpoint_ids=uids,
                 get_raw=True,
             )
@@ -1259,7 +1339,7 @@ class V3IOTSDBConnector(TSDBConnector):
         else:
             filter_query = app_filter_query

-
+        raw_frames: list[v3io_frames.client.RawFrame] = self._get_records(
             table=mm_schemas.V3IOTSDBTables.APP_RESULTS,
             start=start,
             end=end,
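Note: with the project and run_in_threadpool parameters removed, add_basic_metrics is called synchronously and the metric functions are invoked directly. A hypothetical call site (the connector instance and argument values are illustrative):

# `connector` is assumed to be a V3IOTSDBConnector instance.
endpoints = connector.add_basic_metrics(
    model_endpoint_objects=endpoints,
    metric_list=None,  # None includes all basic metrics, per the docstring above
)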
@@ -1268,39 +1348,33 @@ class V3IOTSDBConnector(TSDBConnector):
                 mm_schemas.ResultData.RESULT_STATUS,
             ],
             filter_query=filter_query,
+            get_raw=True,
         )

-
-        if result_status_list and not df.empty:
-            df = df[df[mm_schemas.ResultData.RESULT_STATUS].isin(result_status_list)]
-
-        if df.empty:
+        if not raw_frames:
             return {}
-        else:
-            # convert application name to lower case
-            df[mm_schemas.ApplicationEvent.APPLICATION_NAME] = df[
-                mm_schemas.ApplicationEvent.APPLICATION_NAME
-            ].str.lower()
-
-            df = (
-                df[
-                    [
-                        mm_schemas.ApplicationEvent.APPLICATION_NAME,
-                        mm_schemas.ResultData.RESULT_STATUS,
-                        mm_schemas.ResultData.RESULT_VALUE,
-                    ]
-                ]
-                .groupby(
-                    [
-                        mm_schemas.ApplicationEvent.APPLICATION_NAME,
-                        mm_schemas.ResultData.RESULT_STATUS,
-                    ],
-                    observed=True,
-                )
-                .count()
-            )

-
+        # Count occurrences by (application_name, result_status) from RawFrame objects
+        count_dict = {}
+
+        for frame in raw_frames:
+            # Extract column data from each RawFrame
+            app_name = frame.column_data(mm_schemas.ApplicationEvent.APPLICATION_NAME)[
+                0
+            ]
+            statuses = frame.column_data(mm_schemas.ResultData.RESULT_STATUS)
+
+            for status in statuses:
+                # Filter by result status if specified
+                if result_status_list and status not in result_status_list:
+                    continue
+
+                # Convert application name to lower case
+                key = (app_name.lower(), status)
+
+                # Update the count in the dictionary
+                count_dict[key] = count_dict.get(key, 0) + 1
+        return count_dict

     def count_processed_model_endpoints(
         self,
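Note: the pandas groupby/count in the removed code is replaced by a plain dictionary tally over RawFrame columns. A standalone sketch of the same counting logic, where the input rows are illustrative (application_name, result_status) pairs:

from collections import Counter
from typing import Optional


def count_results(
    rows: list[tuple[str, int]], result_status_list: Optional[list[int]] = None
) -> dict:
    counts: Counter = Counter()
    for app_name, status in rows:
        if result_status_list and status not in result_status_list:
            continue
        # Application names are compared case-insensitively (lower-cased)
        counts[(app_name.lower(), status)] += 1
    return dict(counts)


print(count_results([("MyApp", 2), ("myapp", 2), ("myapp", 0)], result_status_list=[2]))
# {('myapp', 2): 2}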
@@ -1458,20 +1532,140 @@ class V3IOTSDBConnector(TSDBConnector):
     ) -> mm_schemas.ModelEndpointDriftValues:
         table = mm_schemas.V3IOTSDBTables.APP_RESULTS
         start, end, interval = self._prepare_aligned_start_end(start, end)
-
-        # get per time-interval x endpoint_id combination the max result status
-        df = self._get_records(
+        raw_frames: list[v3io_frames.client.RawFrame] = self._get_records(
             table=table,
             start=start,
             end=end,
-            interval=interval,
-            sliding_window_step=interval,
             columns=[mm_schemas.ResultData.RESULT_STATUS],
-
-            group_by=mm_schemas.WriterEvent.ENDPOINT_ID,
+            get_raw=True,
         )
-
+
+        if not raw_frames:
+            return mm_schemas.ModelEndpointDriftValues(values=[])
+
+        aggregated_data = self._aggregate_raw_drift_data(
+            raw_frames=raw_frames, start=start, end=end, interval=interval
+        )
+        if not aggregated_data:
+            return mm_schemas.ModelEndpointDriftValues(values=[])
+
+        # Filter to only include entries with max result_status >= 1
+        filtered_data = [
+            (endpoint_id, timestamp, max_status)
+            for endpoint_id, timestamp, max_status in aggregated_data
+            if max_status >= 1
+        ]
+
+        if not filtered_data:
             return mm_schemas.ModelEndpointDriftValues(values=[])
-
-
-
+
+        return self._convert_drift_data_to_values(aggregated_data=filtered_data)
+
+    @staticmethod
+    def _aggregate_raw_drift_data(
+        raw_frames: list[v3io_frames.client.RawFrame],
+        start: datetime,
+        end: datetime,
+        interval: str,
+    ) -> list[tuple[str, datetime, float]]:
+        """
+        Aggregate raw drift data from RawFrame objects.
+
+        :param raw_frames: List of RawFrame objects containing drift data.
+        :param start:      Start datetime for filtering data.
+        :param end:        End datetime for filtering data.
+        :param interval:   Time interval string (e.g., '5min') for aggregation
+
+        :returns: list of tuples: (endpoint_id, timestamp, max_result_status)
+        """
+        if not raw_frames:
+            return []
+
+        # Parse interval to get timedelta
+        interval_td = pd.Timedelta(interval)
+
+        # Collect all data points from RawFrame objects
+        data_points = []
+        for frame in raw_frames:
+            endpoint_id = frame.column_data(EventFieldType.ENDPOINT_ID)[0]
+            result_statuses = frame.column_data(mm_schemas.ResultData.RESULT_STATUS)
+            timestamps = frame.indices()[0].times
+
+            # Combine data from this frame
+            for i, (status, timestamp) in enumerate(zip(result_statuses, timestamps)):
+                # V3IO TSDB returns timestamps in nanoseconds
+                timestamp_dt = pd.Timestamp(
+                    timestamp, unit="ns", tzinfo=timezone.utc
+                ).to_pydatetime()
+
+                # Filter by time window
+                if start <= timestamp_dt < end:
+                    data_points.append((endpoint_id, timestamp_dt, status))
+
+        if not data_points:
+            return []
+
+        # Group by endpoint_id and time intervals, then find max status
+        # Create time buckets aligned to start
+        grouped_data = {}
+        for endpoint_id, timestamp, status in data_points:
+            # Calculate which interval bucket this timestamp falls into
+            time_diff = timestamp - start
+            bucket_index = int(time_diff / interval_td)
+            bucket_start = start + (bucket_index * interval_td)
+
+            key = (endpoint_id, bucket_start)
+            if key not in grouped_data:
+                grouped_data[key] = status
+            else:
+                # Keep the maximum status value
+                grouped_data[key] = max(grouped_data[key], status)
+
+        # Convert to list of tuples
+        result = [
+            (endpoint_id, timestamp, max_status)
+            for (endpoint_id, timestamp), max_status in grouped_data.items()
+        ]
+
+        return result
+
+    @staticmethod
+    def _convert_drift_data_to_values(
+        aggregated_data: list[tuple[str, datetime, float]],
+    ) -> mm_schemas.ModelEndpointDriftValues:
+        """
+        Convert aggregated drift data to ModelEndpointDriftValues format.
+
+        :param aggregated_data: List of tuples (endpoint_id, timestamp, max_result_status)
+        :return: ModelEndpointDriftValues with counts of suspected and detected per timestamp
+        """
+        suspected_val = mm_schemas.constants.ResultStatusApp.potential_detection.value
+        detected_val = mm_schemas.constants.ResultStatusApp.detected.value
+
+        # Group by timestamp and result status, then count occurrences
+        timestamp_status_counts = {}
+        for _, timestamp, max_status in aggregated_data:
+            key = (timestamp, max_status)
+            timestamp_status_counts[key] = timestamp_status_counts.get(key, 0) + 1
+
+        # Organize by timestamp with counts for suspected and detected
+        timestamp_counts = {}
+        for (timestamp, status), count in timestamp_status_counts.items():
+            if timestamp not in timestamp_counts:
+                timestamp_counts[timestamp] = {
+                    "count_suspected": 0,
+                    "count_detected": 0,
+                }
+
+            if status == suspected_val:
+                timestamp_counts[timestamp]["count_suspected"] = count
+            elif status == detected_val:
+                timestamp_counts[timestamp]["count_detected"] = count
+
+        # Convert to the expected format: list of (timestamp, count_suspected, count_detected)
+        values = [
+            (timestamp, counts["count_suspected"], counts["count_detected"])
+            for timestamp, counts in sorted(timestamp_counts.items())
+        ]
+
+        return mm_schemas.ModelEndpointDriftValues(values=values)
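Note: _aggregate_raw_drift_data assigns every (endpoint_id, timestamp, status) point to an interval bucket aligned to start and keeps the maximum status per (endpoint_id, bucket). A standalone sketch of that bucketing, with illustrative inputs:

from datetime import datetime, timedelta, timezone

import pandas as pd


def bucket_max_status(points, start, interval="10min"):
    # Assign each point to a bucket aligned to `start`; the max status per
    # (endpoint_id, bucket) wins.
    interval_td = pd.Timedelta(interval)
    grouped = {}
    for endpoint_id, ts, status in points:
        bucket_start = start + int((ts - start) / interval_td) * interval_td
        key = (endpoint_id, bucket_start)
        grouped[key] = max(grouped.get(key, status), status)
    return [(eid, bucket, s) for (eid, bucket), s in grouped.items()]


start = datetime(2024, 1, 1, tzinfo=timezone.utc)
points = [
    ("ep1", start + timedelta(minutes=3), 0),
    ("ep1", start + timedelta(minutes=7), 2),   # same bucket as above, max wins
    ("ep1", start + timedelta(minutes=12), 1),  # next bucket
]
print(bucket_max_status(points, start))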
@@ -143,7 +143,7 @@ def get_stream_path(
         return stream_uri.replace("v3io://", f"ds://{profile.name}")

     elif isinstance(
-        profile, mlrun.datastore.datastore_profile.
+        profile, mlrun.datastore.datastore_profile.DatastoreProfileKafkaStream
     ):
         topic = mlrun.common.model_monitoring.helpers.get_kafka_topic(
             project=project, function_name=function_name
@@ -152,7 +152,7 @@ def get_stream_path(
     else:
         raise mlrun.errors.MLRunValueError(
             f"Received an unexpected stream profile type: {type(profile)}\n"
-            "Expects `DatastoreProfileV3io` or `
+            "Expects `DatastoreProfileV3io` or `DatastoreProfileKafkaStream`."
         )


@@ -300,7 +300,7 @@ def _get_v3io_output_stream(

 def _get_kafka_output_stream(
     *,
-    kafka_profile: mlrun.datastore.datastore_profile.
+    kafka_profile: mlrun.datastore.datastore_profile.DatastoreProfileKafkaStream,
     project: str,
     function_name: str,
     mock: bool = False,
@@ -356,7 +356,7 @@ def get_output_stream(
         )

     elif isinstance(
-        profile, mlrun.datastore.datastore_profile.
+        profile, mlrun.datastore.datastore_profile.DatastoreProfileKafkaStream
     ):
         return _get_kafka_output_stream(
             kafka_profile=profile,
@@ -368,7 +368,7 @@ def get_output_stream(
     else:
         raise mlrun.errors.MLRunValueError(
             f"Received an unexpected stream profile type: {type(profile)}\n"
-            "Expects `DatastoreProfileV3io` or `
+            "Expects `DatastoreProfileV3io` or `DatastoreProfileKafkaStream`."
         )

@@ -659,3 +659,26 @@ def get_start_end(
     )

     return start, end
+
+
+def validate_time_range(
+    start: Optional[datetime.datetime] = None, end: Optional[datetime.datetime] = None
+) -> tuple[datetime.datetime, datetime.datetime]:
+    """
+    validate start and end parameters and set default values if needed.
+    :param start: Either None or datetime, None is handled as datetime.now(tz=timezone.utc) - timedelta(days=1)
+    :param end: Either None or datetime, None is handled as datetime.now(tz=timezone.utc)
+    :return: start datetime, end datetime
+    """
+    end = end or mlrun.utils.helpers.datetime_now()
+    start = start or (end - datetime.timedelta(days=1))
+    if start.tzinfo is None or end.tzinfo is None:
+        raise mlrun.errors.MLRunInvalidArgumentTypeError(
+            "Custom start and end times must contain the timezone."
+        )
+    if start > end:
+        raise mlrun.errors.MLRunInvalidArgumentError(
+            "The start time must be before the end time. Note that if end time is not provided, "
+            "the current time is used by default."
+        )
+    return start, end
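Note: a usage sketch of the new validate_time_range helper; the import path is inferred from the changed file (mlrun/model_monitoring/helpers.py):

from datetime import datetime, timezone

from mlrun.model_monitoring.helpers import validate_time_range

# Defaults: end = now (UTC), start = end - 1 day
start, end = validate_time_range()

# Custom, timezone-aware range
start, end = validate_time_range(
    start=datetime(2024, 1, 1, tzinfo=timezone.utc),
    end=datetime(2024, 1, 2, tzinfo=timezone.utc),
)

# Naive datetimes raise MLRunInvalidArgumentTypeError; start > end raises MLRunInvalidArgumentError.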
@@ -11,7 +11,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-
+import asyncio
 import datetime
 import typing

@@ -134,6 +134,9 @@ class EventStreamProcessor:
     the default parquet path is under mlrun.mlconf.model_endpoint_monitoring.user_space. Note that if you are
     using CE, the parquet target path is based on the defined MLRun artifact path.

+    In a separate branch, "batch complete" events are forwarded to the controller stream with an intentional delay,
+    to allow for data to first be written to parquet.
+
     :param fn: A serving function.
     :param tsdb_connector: Time series database connector.
     :param controller_stream_uri: The controller stream URI. Runs on server api pod so needed to be provided as
@@ -145,6 +148,20 @@ class EventStreamProcessor:
             fn.set_topology(mlrun.serving.states.StepKinds.flow, engine="async"),
         )

+        # forward back complete events to controller
+        graph.add_step(
+            "storey.Filter",
+            "FilterBatchComplete",
+            _fn="(event.get('kind') == 'batch_complete')",
+        )
+
+        graph.add_step(
+            "Delay",
+            name="BatchDelay",
+            after="FilterBatchComplete",
+            delay=self.parquet_batching_timeout_secs + 5,  # add margin
+        )
+
         # split the graph between event with error vs valid event
         graph.add_step(
             "storey.Filter",
@@ -261,7 +278,7 @@ class EventStreamProcessor:
             "controller_stream",
             path=stream_uri,
             sharding_func=ControllerEvent.ENDPOINT_ID,
-            after="ForwardNOP",
+            after=["ForwardNOP", "BatchDelay"],
             # Force using the pipeline key instead of the one in the profile in case of v3io profile.
             # In case of Kafka, this parameter will be ignored.
             alternative_v3io_access_key="V3IO_ACCESS_KEY",
@@ -309,6 +326,16 @@ class ProcessBeforeParquet(mlrun.feature_store.steps.MapClass):
         return event


+class Delay(mlrun.feature_store.steps.MapClass):
+    def __init__(self, delay: int, **kwargs):
+        super().__init__(**kwargs)
+        self._delay = delay
+
+    async def do(self, event):
+        await asyncio.sleep(self._delay)
+        return event
+
+
 class ProcessEndpointEvent(mlrun.feature_store.steps.MapClass):
     def __init__(
         self,
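Note: the Delay step simply holds each event for a fixed number of seconds before passing it on, so parquet writes can land before the "batch complete" event reaches the controller. A standalone asyncio sketch of the same behaviour (without the MapClass base):

import asyncio


class Delay:
    def __init__(self, delay: int):
        self._delay = delay

    async def do(self, event):
        # Hold the event, then forward it unchanged
        await asyncio.sleep(self._delay)
        return event


async def main():
    step = Delay(delay=1)
    print(await step.do({"kind": "batch_complete"}))


asyncio.run(main())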
@@ -369,6 +396,8 @@ class ProcessEndpointEvent(mlrun.feature_store.steps.MapClass):
         request_id = event.get("request", {}).get("id") or event.get("resp", {}).get(
             "id"
         )
+        feature_names = event.get("request", {}).get("input_schema")
+        labels_names = event.get("resp", {}).get("output_schema")
         latency = event.get("microsec")
         features = event.get("request", {}).get("inputs")
         predictions = event.get("resp", {}).get("outputs")
@@ -469,6 +498,8 @@ class ProcessEndpointEvent(mlrun.feature_store.steps.MapClass):
                 ),
                 EventFieldType.EFFECTIVE_SAMPLE_COUNT: effective_sample_count,
                 EventFieldType.ESTIMATED_PREDICTION_COUNT: estimated_prediction_count,
+                EventFieldType.FEATURE_NAMES: feature_names,
+                EventFieldType.LABEL_NAMES: labels_names,
             }
         )

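Note: the stream processor now also reads optional feature and label names from the event body. A sketch of the event shape it expects (field names other than request.input_schema and resp.output_schema are illustrative):

event = {
    "request": {"inputs": [[1.0, 2.0]], "input_schema": ["age", "income"]},
    "resp": {"outputs": [[0.7]], "output_schema": ["churn_probability"]},
}
feature_names = event.get("request", {}).get("input_schema")  # ["age", "income"]
label_names = event.get("resp", {}).get("output_schema")      # ["churn_probability"]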
@@ -575,19 +606,19 @@ class MapFeatureNames(mlrun.feature_store.steps.MapClass):
         self.endpoint_type = {}

     def _infer_feature_names_from_data(self, event):
-
-
-
-
-
+        endpoint_id = event[EventFieldType.ENDPOINT_ID]
+        if endpoint_id in self.feature_names and len(
+            self.feature_names[endpoint_id]
+        ) >= len(event[EventFieldType.FEATURES]):
+            return self.feature_names[endpoint_id]
         return None

     def _infer_label_columns_from_data(self, event):
-
-
-
-
-
+        endpoint_id = event[EventFieldType.ENDPOINT_ID]
+        if endpoint_id in self.label_columns and len(
+            self.label_columns[endpoint_id]
+        ) >= len(event[EventFieldType.PREDICTION]):
+            return self.label_columns[endpoint_id]
         return None

     def do(self, event: dict):
@@ -632,7 +663,7 @@ class MapFeatureNames(mlrun.feature_store.steps.MapClass):
                 "Feature names are not initialized, they will be automatically generated",
                 endpoint_id=endpoint_id,
             )
-            feature_names = [
+            feature_names = event.get(EventFieldType.FEATURE_NAMES) or [
                 f"f{i}" for i, _ in enumerate(event[EventFieldType.FEATURES])
             ]

@@ -655,7 +686,7 @@ class MapFeatureNames(mlrun.feature_store.steps.MapClass):
                 "label column names are not initialized, they will be automatically generated",
                 endpoint_id=endpoint_id,
             )
-            label_columns = [
+            label_columns = event.get(EventFieldType.LABEL_NAMES) or [
                 f"p{i}" for i, _ in enumerate(event[EventFieldType.PREDICTION])
             ]
             attributes_to_update[EventFieldType.LABEL_NAMES] = label_columns