mlrun 1.7.0rc37__py3-none-any.whl → 1.7.0rc39__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mlrun/alerts/alert.py +34 -30
- mlrun/common/schemas/alert.py +3 -0
- mlrun/common/schemas/model_monitoring/constants.py +4 -0
- mlrun/common/schemas/notification.py +4 -3
- mlrun/datastore/alibaba_oss.py +2 -2
- mlrun/datastore/azure_blob.py +124 -31
- mlrun/datastore/base.py +1 -1
- mlrun/datastore/dbfs_store.py +2 -2
- mlrun/datastore/google_cloud_storage.py +83 -20
- mlrun/datastore/s3.py +2 -2
- mlrun/datastore/sources.py +54 -0
- mlrun/datastore/targets.py +9 -53
- mlrun/db/httpdb.py +6 -1
- mlrun/errors.py +8 -0
- mlrun/execution.py +7 -0
- mlrun/feature_store/api.py +5 -0
- mlrun/feature_store/common.py +6 -11
- mlrun/feature_store/retrieval/job.py +1 -0
- mlrun/model.py +29 -3
- mlrun/model_monitoring/api.py +9 -0
- mlrun/model_monitoring/applications/_application_steps.py +36 -0
- mlrun/model_monitoring/applications/histogram_data_drift.py +15 -13
- mlrun/model_monitoring/controller.py +15 -11
- mlrun/model_monitoring/db/stores/v3io_kv/kv_store.py +14 -11
- mlrun/model_monitoring/db/tsdb/base.py +121 -1
- mlrun/model_monitoring/db/tsdb/tdengine/schemas.py +85 -47
- mlrun/model_monitoring/db/tsdb/tdengine/tdengine_connector.py +100 -12
- mlrun/model_monitoring/db/tsdb/v3io/stream_graph_steps.py +23 -1
- mlrun/model_monitoring/db/tsdb/v3io/v3io_connector.py +214 -36
- mlrun/model_monitoring/helpers.py +16 -17
- mlrun/model_monitoring/stream_processing.py +68 -27
- mlrun/projects/operations.py +1 -1
- mlrun/projects/pipelines.py +19 -30
- mlrun/projects/project.py +76 -52
- mlrun/run.py +8 -6
- mlrun/runtimes/__init__.py +19 -8
- mlrun/runtimes/nuclio/api_gateway.py +9 -0
- mlrun/runtimes/nuclio/application/application.py +64 -9
- mlrun/runtimes/nuclio/function.py +1 -1
- mlrun/runtimes/pod.py +2 -2
- mlrun/runtimes/remotesparkjob.py +2 -5
- mlrun/runtimes/sparkjob/spark3job.py +7 -9
- mlrun/serving/v2_serving.py +1 -0
- mlrun/track/trackers/mlflow_tracker.py +5 -0
- mlrun/utils/helpers.py +21 -0
- mlrun/utils/version/version.json +2 -2
- {mlrun-1.7.0rc37.dist-info → mlrun-1.7.0rc39.dist-info}/METADATA +14 -11
- {mlrun-1.7.0rc37.dist-info → mlrun-1.7.0rc39.dist-info}/RECORD +52 -52
- {mlrun-1.7.0rc37.dist-info → mlrun-1.7.0rc39.dist-info}/WHEEL +1 -1
- {mlrun-1.7.0rc37.dist-info → mlrun-1.7.0rc39.dist-info}/LICENSE +0 -0
- {mlrun-1.7.0rc37.dist-info → mlrun-1.7.0rc39.dist-info}/entry_points.txt +0 -0
- {mlrun-1.7.0rc37.dist-info → mlrun-1.7.0rc39.dist-info}/top_level.txt +0 -0
mlrun/datastore/targets.py
CHANGED

@@ -390,6 +390,7 @@ class BaseStoreTarget(DataTargetBase):
     is_offline = False
     support_spark = False
     support_storey = False
+    support_pandas = False
     support_append = False
 
     def __init__(

@@ -758,6 +759,8 @@ class BaseStoreTarget(DataTargetBase):
         **kwargs,
     ):
         """return the target data as dataframe"""
+        if not self.support_pandas:
+            raise NotImplementedError()
         mlrun.utils.helpers.additional_filters_warning(
             additional_filters, self.__class__
         )

@@ -819,6 +822,7 @@ class ParquetTarget(BaseStoreTarget):
     support_spark = True
     support_storey = True
     support_dask = True
+    support_pandas = True
     support_append = True
 
     def __init__(

@@ -1084,6 +1088,7 @@ class CSVTarget(BaseStoreTarget):
     is_offline = True
     support_spark = True
     support_storey = True
+    support_pandas = True
 
     @staticmethod
     def _write_dataframe(df, storage_options, target_path, partition_cols, **kwargs):

@@ -1292,7 +1297,7 @@ class SnowflakeTarget(BaseStoreTarget):
         **kwargs,
     ):
         raise mlrun.errors.MLRunRuntimeError(
-            f"{type(self).__name__} does not support
+            f"{type(self).__name__} does not support pandas engine"
         )
 
     @property

@@ -1366,19 +1371,6 @@ class NoSqlBaseTarget(BaseStoreTarget):
     def get_dask_options(self):
         return {"format": "csv"}
 
-    def as_df(
-        self,
-        columns=None,
-        df_module=None,
-        entities=None,
-        start_time=None,
-        end_time=None,
-        time_column=None,
-        additional_filters=None,
-        **kwargs,
-    ):
-        raise NotImplementedError()
-
     def write_dataframe(
         self, df, key_column=None, timestamp_key=None, chunk_id=0, **kwargs
     ):

@@ -1612,19 +1604,6 @@ class StreamTarget(BaseStoreTarget):
             **self.attributes,
         )
 
-    def as_df(
-        self,
-        columns=None,
-        df_module=None,
-        entities=None,
-        start_time=None,
-        end_time=None,
-        time_column=None,
-        additional_filters=None,
-        **kwargs,
-    ):
-        raise NotImplementedError()
-
 
 class KafkaTarget(BaseStoreTarget):
     """

@@ -1727,19 +1706,6 @@ class KafkaTarget(BaseStoreTarget):
             **attributes,
         )
 
-    def as_df(
-        self,
-        columns=None,
-        df_module=None,
-        entities=None,
-        start_time=None,
-        end_time=None,
-        time_column=None,
-        additional_filters=None,
-        **kwargs,
-    ):
-        raise NotImplementedError()
-
     def purge(self):
         pass

@@ -1784,19 +1750,6 @@ class TSDBTarget(BaseStoreTarget):
             **self.attributes,
         )
 
-    def as_df(
-        self,
-        columns=None,
-        df_module=None,
-        entities=None,
-        start_time=None,
-        end_time=None,
-        time_column=None,
-        additional_filters=None,
-        **kwargs,
-    ):
-        raise NotImplementedError()
-
     def write_dataframe(
         self, df, key_column=None, timestamp_key=None, chunk_id=0, **kwargs
     ):

@@ -1834,6 +1787,7 @@ class CustomTarget(BaseStoreTarget):
     is_online = False
     support_spark = False
     support_storey = True
+    support_pandas = True
 
     def __init__(
         self,

@@ -1869,6 +1823,7 @@ class CustomTarget(BaseStoreTarget):
 class DFTarget(BaseStoreTarget):
     kind = TargetTypes.dataframe
     support_storey = True
+    support_pandas = True
 
     def __init__(self, *args, name="dataframe", **kwargs):
         self._df = None

@@ -1931,6 +1886,7 @@ class SQLTarget(BaseStoreTarget):
     is_online = True
     support_spark = False
     support_storey = True
+    support_pandas = True
 
     def __init__(
         self,
mlrun/db/httpdb.py
CHANGED

@@ -3475,7 +3475,7 @@ class HTTPRunDB(RunDBInterface):
         if response.status_code == http.HTTPStatus.ACCEPTED:
             if delete_resources:
                 logger.info(
-                    "Model Monitoring is being
+                    "Model Monitoring is being disabled",
                     project_name=project,
                 )
             if delete_user_applications:

@@ -4216,6 +4216,9 @@ class HTTPRunDB(RunDBInterface):
         :param project: The project that the alert belongs to.
        :returns: The created/modified alert.
         """
+        if not alert_data:
+            raise mlrun.errors.MLRunInvalidArgumentError("Alert data must be provided")
+
         project = project or config.default_project
         endpoint_path = f"projects/{project}/alerts/{alert_name}"
         error_message = f"put alert {project}/alerts/{alert_name}"

@@ -4224,6 +4227,8 @@ class HTTPRunDB(RunDBInterface):
             if isinstance(alert_data, AlertConfig)
             else AlertConfig.from_dict(alert_data)
         )
+        # Validation is necessary here because users can directly invoke this function
+        # through `mlrun.get_run_db().store_alert_config()`.
         alert_instance.validate_required_fields()
 
         alert_data = alert_instance.to_dict()
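With the new guard, a missing alert body fails fast on the client instead of producing a confusing server-side error. A hedged usage sketch (the exact signature is assumed from the diff context):

import mlrun

db = mlrun.get_run_db()
try:
    db.store_alert_config("my-alert", alert_data=None, project="my-project")
except mlrun.errors.MLRunInvalidArgumentError as err:
    print(err)  # "Alert data must be provided"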
mlrun/errors.py
CHANGED

@@ -209,6 +209,14 @@ class MLRunInvalidMMStoreType(MLRunHTTPStatusError, ValueError):
     error_status_code = HTTPStatus.BAD_REQUEST.value
 
 
+class MLRunStreamConnectionFailure(MLRunHTTPStatusError, ValueError):
+    error_status_code = HTTPStatus.BAD_REQUEST.value
+
+
+class MLRunTSDBConnectionFailure(MLRunHTTPStatusError, ValueError):
+    error_status_code = HTTPStatus.BAD_REQUEST.value
+
+
 class MLRunRetryExhaustedError(Exception):
     pass
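Both new exceptions follow the established pattern in `mlrun/errors.py`: they map to HTTP 400 and also subclass `ValueError`, so callers can catch them either way. A small illustrative sketch (`connect_tsdb` is a hypothetical helper, not an mlrun API):

import mlrun.errors

def connect_tsdb(connection_string):
    # hypothetical helper, shown only to illustrate raising the new error type
    if not connection_string:
        raise mlrun.errors.MLRunTSDBConnectionFailure("TSDB connection string is missing")

try:
    connect_tsdb(None)
except ValueError as err:  # also catchable as mlrun.errors.MLRunHTTPStatusError
    print(type(err).__name__, err)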
mlrun/execution.py
CHANGED

@@ -921,6 +921,13 @@ class MLClientCtx:
             updates, self._uid, self.project, iter=self._iteration
         )
 
+    def get_notifications(self):
+        """Get the list of notifications"""
+        return [
+            mlrun.model.Notification.from_dict(notification)
+            for notification in self._notifications
+        ]
+
     def to_dict(self):
         """Convert the run context to a dictionary"""
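`get_notifications` gives handler code typed access to the notifications configured on the current run. A hedged sketch of a job handler using it (attribute names follow the `Notification` docstring added in `mlrun/model.py` below):

def handler(context):
    # context is the MLClientCtx that mlrun injects into the handler
    for notification in context.get_notifications():
        print(notification.kind, notification.when)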
mlrun/feature_store/api.py
CHANGED

@@ -230,6 +230,11 @@ def _get_offline_features(
             "entity_timestamp_column param "
             "can not be specified without entity_rows param"
         )
+    if isinstance(target, BaseStoreTarget) and not target.support_pandas:
+        raise mlrun.errors.MLRunInvalidArgumentError(
+            f"get_offline_features does not support targets that do not support pandas engine."
+            f" Target kind: {target.kind}"
+        )
 
     if isinstance(feature_vector, FeatureVector):
         update_stats = True
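Since streaming and online-only targets now report `support_pandas = False`, requesting offline features into one of them is rejected up front. A hedged sketch through the public API (the feature-vector URI and target details are placeholders):

import mlrun.errors
import mlrun.feature_store as fstore
from mlrun.datastore.targets import KafkaTarget

try:
    fstore.get_offline_features("my-project/my-vector", target=KafkaTarget())
except mlrun.errors.MLRunInvalidArgumentError as err:
    print(err)  # "... does not support pandas engine. Target kind: kafka"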
mlrun/feature_store/common.py
CHANGED

@@ -37,17 +37,12 @@ def parse_feature_string(feature):
         raise mlrun.errors.MLRunInvalidArgumentError(
             f"feature {feature} must be {expected_message}"
         )
-    feature_set
-    feature_name = splitted[1]
-    splitted = feature_name.split(" as ")
-    if len(splitted) > 1:
-        return feature_set.strip(), splitted[0].strip(), splitted[1].strip()
-    return feature_set.strip(), feature_name.strip(), None
+    feature_set, feature_name = feature.rsplit(feature_separator, 1)
+    feature_set = feature_set.strip()
+    split_result = feature_name.split(" as ", 1)
+    feature_name = split_result[0].strip()
+    alias = split_result[1].strip() if len(split_result) > 1 else None
+    return feature_set, feature_name, alias
 
 
 def parse_project_name_from_feature_string(feature):

@@ -181,6 +181,7 @@ class RemoteVectorResponse:
         file_format = kwargs.get("format")
         if not file_format:
             file_format = self.run.status.results["target"]["kind"]
+
         df = mlrun.get_dataitem(self.target_uri).as_df(
             columns=columns, df_module=df_module, format=file_format, **kwargs
         )
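The rewritten parser uses `rsplit` on the separator (so the split happens at the last separator) and `split(" as ", 1)` for the optional alias, replacing the multi-step `splitted` bookkeeping. Expected behavior, assuming the usual `.` separator in feature strings:

from mlrun.feature_store.common import parse_feature_string

parse_feature_string("stocks.price")          # ("stocks", "price", None)
parse_feature_string("stocks.price as cost")  # ("stocks", "price", "cost")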
mlrun/model.py
CHANGED

@@ -679,7 +679,24 @@ class ImageBuilder(ModelObj):
 
 
 class Notification(ModelObj):
-    """Notification
+    """Notification object
+
+    :param kind: notification implementation kind - slack, webhook, etc.
+    :param name: for logging and identification
+    :param message: message content in the notification
+    :param severity: severity to display in the notification
+    :param when: list of statuses to trigger the notification: 'running', 'completed', 'error'
+    :param condition: optional condition to trigger the notification, a jinja2 expression that can use run data
+                      to evaluate if the notification should be sent in addition to the 'when' statuses.
+                      e.g.: '{{ run["status"]["results"]["accuracy"] < 0.9}}'
+    :param params: Implementation specific parameters for the notification implementation (e.g. slack webhook url,
+                   git repository details, etc.)
+    :param secret_params: secret parameters for the notification implementation, same as params but will be stored
+                          in a k8s secret and passed as a secret reference to the implementation.
+    :param status: notification status - pending, sent, error
+    :param sent_time: time the notification was sent
+    :param reason: failure reason if the notification failed to send
+    """
 
     def __init__(
         self,

@@ -1468,7 +1485,11 @@ class RunObject(RunTemplate):
     @property
     def error(self) -> str:
         """error string if failed"""
-        if
+        if (
+            self.status
+            and self.status.state
+            in mlrun.common.runtimes.constants.RunStates.error_and_abortion_states()
+        ):
             unknown_error = ""
             if (
                 self.status.state

@@ -1484,8 +1505,8 @@ class RunObject(RunTemplate):
 
             return (
                 self.status.error
-                or self.status.reason
                 or self.status.status_text
+                or self.status.reason
                 or unknown_error
             )
         return ""

@@ -1789,6 +1810,11 @@ class RunObject(RunTemplate):
 
         return state
 
+    def abort(self):
+        """abort the run"""
+        db = mlrun.get_run_db()
+        db.abort_run(self.metadata.uid, self.metadata.project)
+
     @staticmethod
     def create_uri(project: str, uid: str, iteration: Union[int, str], tag: str = ""):
         if tag:
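`RunObject.abort` is a small convenience wrapper over the run DB, and the reordered `error` property now prefers `status_text` over `reason`. A hedged usage sketch (file, function, and handler names are placeholders):

import mlrun

function = mlrun.code_to_function("trainer", kind="job", filename="train.py")
run = function.run(handler="train", watch=False)

# new in this release: abort directly from the run object instead of
# calling mlrun.get_run_db().abort_run(run.metadata.uid, run.metadata.project)
run.abort()
print(run.error)  # error/abort reason once the run reaches an aborted state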
mlrun/model_monitoring/api.py
CHANGED

@@ -616,7 +616,16 @@ def _create_model_monitoring_function_base(
         app_step = prepare_step.to(class_name=application_class, **application_kwargs)
     else:
         app_step = prepare_step.to(class_name=application_class)
+
     app_step.__class__ = mlrun.serving.MonitoringApplicationStep
+
+    app_step.error_handler(
+        name="ApplicationErrorHandler",
+        class_name="mlrun.model_monitoring.applications._application_steps._ApplicationErrorHandler",
+        full_event=True,
+        project=project,
+    )
+
     app_step.to(
         class_name="mlrun.model_monitoring.applications._application_steps._PushToMonitoringWriter",
         name="PushToMonitoringWriter",
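The wiring above hangs a dedicated error-handler step off the application step, so an exception inside a monitoring application is routed to `_ApplicationErrorHandler` (shown in the next file) instead of silently dropping the event. The same `error_handler` hook is available on ordinary serving-graph steps; a hedged sketch with placeholder class names:

import mlrun

project = mlrun.get_or_create_project("my-project")
fn = project.set_function("app.py", name="my-app", kind="serving", image="mlrun/mlrun")
graph = fn.set_topology("flow")

step = graph.to(class_name="MyStep")
step.error_handler(
    name="ErrorCatcher",
    class_name="MyErrorCatcher",  # a step class that receives the failed event
    full_event=True,
)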
mlrun/model_monitoring/applications/_application_steps.py
CHANGED

@@ -17,6 +17,7 @@ from typing import Optional
 
 import mlrun.common.helpers
 import mlrun.common.model_monitoring.helpers
+import mlrun.common.schemas.alert as alert_objects
 import mlrun.common.schemas.model_monitoring.constants as mm_constant
 import mlrun.datastore
 import mlrun.serving

@@ -164,3 +165,38 @@ class _PrepareMonitoringEvent(StepToDict):
         )
         context.__class__ = MonitoringApplicationContext
         return context
+
+
+class _ApplicationErrorHandler(StepToDict):
+    def __init__(self, project: str, name: Optional[str] = None):
+        self.project = project
+        self.name = name or "ApplicationErrorHandler"
+
+    def do(self, event):
+        """
+        Handle model monitoring application error. This step will generate an event, describing the error.
+
+        :param event: Application event.
+        """
+
+        logger.error(f"Error in application step: {event}")
+
+        event_data = mlrun.common.schemas.Event(
+            kind=alert_objects.EventKind.MM_APP_FAILED,
+            entity={
+                "kind": alert_objects.EventEntityKind.MODEL_MONITORING_APPLICATION,
+                "project": self.project,
+                "ids": [f"{self.project}_{event.body.application_name}"],
+            },
+            value_dict={
+                "Error": event.error,
+                "Timestamp": event.timestamp,
+                "Application Class": event.body.application_name,
+                "Endpoint ID": event.body.endpoint_id,
+            },
+        )
+
+        mlrun.get_run_db().generate_event(
+            name=alert_objects.EventKind.MM_APP_FAILED, event_data=event_data
+        )
+        logger.info("Event generated successfully")
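Because the handler emits an `MM_APP_FAILED` event through the run DB, a project alert can be configured to fire when a monitoring application fails. A hedged sketch, assuming the alert-config API from `mlrun/common/schemas/alert.py` (project, application names, and IDs are placeholders):

import mlrun
import mlrun.alerts.alert
import mlrun.common.schemas.alert as alert_objects

project = mlrun.get_or_create_project("my-project")
alert_config = mlrun.alerts.alert.AlertConfig(
    project="my-project",
    name="mm-app-failed",
    summary="A model monitoring application failed",
    severity=alert_objects.AlertSeverity.HIGH,
    entities=alert_objects.EventEntities(
        kind=alert_objects.EventEntityKind.MODEL_MONITORING_APPLICATION,
        project="my-project",
        ids=["my-project_my-app"],
    ),
    trigger=alert_objects.AlertTrigger(events=[alert_objects.EventKind.MM_APP_FAILED]),
    notifications=[],  # add AlertNotification entries as needed
)
project.store_alert_config(alert_config)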
mlrun/model_monitoring/applications/histogram_data_drift.py
CHANGED

@@ -91,7 +91,9 @@ class HistogramDataDriftApplication(ModelMonitoringApplicationBaseV2):
     """
     MLRun's default data drift application for model monitoring.
 
-    The application expects tabular numerical data, and calculates three metrics over the features' histograms.
+    The application expects tabular numerical data, and calculates three metrics over the shared features' histograms.
+    The metrics are calculated on features that have reference data from the training dataset. When there is no
+    reference data (`feature_stats`), this application sends a warning log and does nothing.
     The three metrics are:
 
     * Hellinger distance.

@@ -112,6 +114,7 @@ class HistogramDataDriftApplication(ModelMonitoringApplicationBaseV2):
 
     project.enable_model_monitoring()
 
+    To avoid it, pass `deploy_histogram_data_drift_app=False`.
     """
 
     NAME: Final[str] = HistogramDataDriftApplicationConstants.NAME

@@ -223,19 +226,18 @@ class HistogramDataDriftApplication(ModelMonitoringApplicationBaseV2):
         return metrics
 
     @staticmethod
-    def
+    def _get_shared_features_sample_stats(
+        monitoring_context: mm_context.MonitoringApplicationContext,
     ) -> mlrun.common.model_monitoring.helpers.FeatureStats:
         """
-        in the plotly artifact
+        Filter out features without reference data in `feature_stats`, e.g. `timestamp`.
         """
+        return mlrun.common.model_monitoring.helpers.FeatureStats(
+            {
+                key: monitoring_context.sample_df_stats[key]
+                for key in monitoring_context.feature_stats
+            }
         )
-        if EventFieldType.TIMESTAMP in sample_set_statistics:
-            del sample_set_statistics[EventFieldType.TIMESTAMP]
-        return sample_set_statistics

@@ -299,8 +301,8 @@ class HistogramDataDriftApplication(ModelMonitoringApplicationBaseV2):
         self._log_json_artifact(drift_per_feature_values, monitoring_context)
 
         self._log_plotly_table_artifact(
-            sample_set_statistics=self.
-            monitoring_context
+            sample_set_statistics=self._get_shared_features_sample_stats(
+                monitoring_context
             ),
             inputs_statistics=monitoring_context.feature_stats,
             metrics_per_feature=metrics_per_feature,

@@ -325,7 +327,7 @@ class HistogramDataDriftApplication(ModelMonitoringApplicationBaseV2):
         """
         monitoring_context.logger.debug("Starting to run the application")
         if not monitoring_context.feature_stats:
-            monitoring_context.logger.
+            monitoring_context.logger.warning(
                 "No feature statistics found, skipping the application. \n"
                 "In order to run the application, training set must be provided when logging the model."
             )
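The reworked helper keeps only the current-window statistics for features that also have reference statistics from training, instead of special-casing the timestamp column. The core is a dict comprehension; a standalone sketch with toy values:

# reference stats exist only for trained features
feature_stats = {"f0": {"hist": [1, 2]}, "f1": {"hist": [3, 4]}}
# the current sample window may carry extra columns such as "timestamp"
sample_df_stats = {
    "f0": {"hist": [1, 1]},
    "f1": {"hist": [2, 2]},
    "timestamp": {"hist": [0]},
}

shared = {key: sample_df_stats[key] for key in feature_stats}
assert set(shared) == {"f0", "f1"}  # "timestamp" is filtered out implicitly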
mlrun/model_monitoring/controller.py
CHANGED

@@ -335,19 +335,23 @@ class MonitoringApplicationController:
             return
         monitoring_functions = self.project_obj.list_model_monitoring_functions()
         if monitoring_functions:
-            # Gets only application in ready state
             applications_names = list(
-                {
-                    app.metadata.name
-                    for app in monitoring_functions
-                    if (
-                        app.status.state == "ready"
-                        # workaround for the default app, as its `status.state` is `None`
-                        or app.metadata.name
-                        == mm_constants.HistogramDataDriftApplicationConstants.NAME
-                    )
-                }
+                {app.metadata.name for app in monitoring_functions}
             )
+        # if monitoring_functions: - TODO : ML-7700
+        #     # Gets only application in ready state
+        #     applications_names = list(
+        #         {
+        #             app.metadata.name
+        #             for app in monitoring_functions
+        #             if (
+        #                 app.status.state == "ready"
+        #                 # workaround for the default app, as its `status.state` is `None`
+        #                 or app.metadata.name
+        #                 == mm_constants.HistogramDataDriftApplicationConstants.NAME
+        #             )
+        #         }
+        #     )
         if not applications_names:
             logger.info("No monitoring functions found", project=self.project)
             return
mlrun/model_monitoring/db/stores/v3io_kv/kv_store.py
CHANGED

@@ -11,7 +11,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-
+import http
 import json
 import typing
 from dataclasses import dataclass

@@ -34,11 +34,11 @@ fields_to_encode_decode = [
 ]
 
 _METRIC_FIELDS: list[str] = [
-    mm_schemas.WriterEvent.APPLICATION_NAME,
-    mm_schemas.MetricData.METRIC_NAME,
-    mm_schemas.MetricData.METRIC_VALUE,
-    mm_schemas.WriterEvent.START_INFER_TIME,
-    mm_schemas.WriterEvent.END_INFER_TIME,
+    mm_schemas.WriterEvent.APPLICATION_NAME.value,
+    mm_schemas.MetricData.METRIC_NAME.value,
+    mm_schemas.MetricData.METRIC_VALUE.value,
+    mm_schemas.WriterEvent.START_INFER_TIME.value,
+    mm_schemas.WriterEvent.END_INFER_TIME.value,
 ]

@@ -417,11 +417,14 @@ class KVStoreBase(StoreBase):
             )
             return response.output.item[mm_schemas.SchedulingKeys.LAST_ANALYZED]
         except v3io.dataplane.response.HttpResponseError as err:
+            if err.status_code == http.HTTPStatus.NOT_FOUND:
+                logger.debug("Last analyzed time not found", err=err)
+                raise mlrun.errors.MLRunNotFoundError(
+                    f"No last analyzed value has been found for {application_name} "
+                    f"that processes model endpoint {endpoint_id}",
+                )
+            logger.error("Error while getting last analyzed time", err=err)
+            raise err
 
     def update_last_analyzed(
         self, endpoint_id: str, application_name: str, last_analyzed: int
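The `_METRIC_FIELDS` change swaps the enum members themselves for their `.value` strings. With a `str`-mixin enum the member compares equal to its value but is not a plain `str`, which can surprise serialization or strict type checks downstream. A standalone illustration (toy enum, not the mlrun schema class):

import enum

class WriterEvent(str, enum.Enum):
    APPLICATION_NAME = "application_name"

member = WriterEvent.APPLICATION_NAME
print(member == "application_name")  # True: str-mixin enums compare equal
print(type(member) is str)           # False: it is a WriterEvent member
print(type(member.value) is str)     # True: .value is the plain string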
mlrun/model_monitoring/db/tsdb/base.py
CHANGED

@@ -15,6 +15,7 @@
 import typing
 from abc import ABC, abstractmethod
 from datetime import datetime
+from typing import Union
 
 import pandas as pd
 import pydantic

@@ -47,7 +48,7 @@ class TSDBConnector(ABC):
         self.project = project
 
     @abstractmethod
-    def apply_monitoring_stream_steps(self, graph):
+    def apply_monitoring_stream_steps(self, graph) -> None:
         """
         Apply TSDB steps on the provided monitoring graph. Throughout these steps, the graph stores live data of
         different key metric dictionaries. This data is being used by the monitoring dashboards in

@@ -59,6 +60,14 @@ class TSDBConnector(ABC):
         """
         pass
 
+    @abstractmethod
+    def handle_model_error(self, graph, **kwargs) -> None:
+        """
+        Adds a branch to the stream pod graph to handle events that
+        arrive with errors from the model server and saves them to the error TSDB table.
+        The first step generated by this method should come after the `ForwardError` step.
+        """
+
     @abstractmethod
     def write_application_event(
         self,

@@ -181,6 +190,117 @@ class TSDBConnector(ABC):
         :return: Metric values object or no data object.
         """
 
+    @abstractmethod
+    def get_last_request(
+        self,
+        endpoint_ids: Union[str, list[str]],
+        start: Union[datetime, str] = "0",
+        end: Union[datetime, str] = "now",
+    ) -> pd.DataFrame:
+        """
+        Fetches data from the predictions TSDB table and returns the most recent request
+        timestamp for each specified endpoint.
+
+        :param endpoint_ids: A list of model endpoint identifiers.
+        :param start:        The start time for the query.
+        :param end:          The end time for the query.
+
+        :return: A pd.DataFrame containing the columns [endpoint_id, last_request, last_latency].
+                 If an endpoint has not been invoked within the specified time range, it will not appear in the result.
+        """
+
+    @abstractmethod
+    def get_drift_status(
+        self,
+        endpoint_ids: Union[str, list[str]],
+        start: Union[datetime, str] = "now-24h",
+        end: Union[datetime, str] = "now",
+    ) -> pd.DataFrame:
+        """
+        Fetches data from the app-results TSDB table and returns the highest status among all
+        the results in the provided time range, which by default is the last 24 hours, for each specified endpoint.
+
+        :param endpoint_ids: A list of model endpoint identifiers.
+        :param start:        The start time for the query.
+        :param end:          The end time for the query.
+
+        :return: A pd.DataFrame containing the columns [result_status, endpoint_id].
+                 If an endpoint has not been monitored within the specified time range (last 24 hours),
+                 it will not appear in the result.
+        """
+
+    @abstractmethod
+    def get_metrics_metadata(
+        self,
+        endpoint_id: str,
+        start: Union[datetime, str] = "0",
+        end: Union[datetime, str] = "now",
+    ) -> pd.DataFrame:
+        """
+        Fetches distinct metrics metadata from the metrics TSDB table for a specified model endpoint.
+
+        :param endpoint_id: The model endpoint identifier.
+        :param start:       The start time of the query.
+        :param end:         The end time of the query.
+
+        :return: A pd.DataFrame containing all distinct metrics for the specified endpoint within the given time range,
+                 with the columns [application_name, metric_name, endpoint_id].
+        """
+
+    @abstractmethod
+    def get_results_metadata(
+        self,
+        endpoint_id: str,
+        start: Union[datetime, str] = "0",
+        end: Union[datetime, str] = "now",
+    ) -> pd.DataFrame:
+        """
+        Fetches distinct results metadata from the app-results TSDB table for a specified model endpoint.
+
+        :param endpoint_id: The model endpoint identifier.
+        :param start:       The start time of the query.
+        :param end:         The end time of the query.
+
+        :return: A pd.DataFrame containing all distinct results for the specified endpoint within the given time range,
+                 with the columns [application_name, result_name, result_kind, endpoint_id].
+        """
+
+    @abstractmethod
+    def get_error_count(
+        self,
+        endpoint_ids: Union[str, list[str]],
+        start: Union[datetime, str] = "0",
+        end: Union[datetime, str] = "now",
+    ) -> pd.DataFrame:
+        """
+        Fetches data from the error TSDB table and returns the error count for each specified endpoint.
+
+        :param endpoint_ids: A list of model endpoint identifiers.
+        :param start:        The start time for the query.
+        :param end:          The end time for the query.
+
+        :return: A pd.DataFrame containing the columns [error_count, endpoint_id].
+                 If an endpoint has not raised an error within the specified time range, it will not appear in the result.
+        """
+
+    @abstractmethod
+    def get_avg_latency(
+        self,
+        endpoint_ids: Union[str, list[str]],
+        start: Union[datetime, str] = "0",
+        end: Union[datetime, str] = "now",
+    ) -> pd.DataFrame:
+        """
+        Fetches data from the predictions TSDB table and returns the average latency for each specified endpoint.
+
+        :param endpoint_ids: A list of model endpoint identifiers.
+        :param start:        The start time for the query.
+        :param end:          The end time for the query.
+
+        :return: A pd.DataFrame containing the columns [avg_latency, endpoint_id].
+                 If an endpoint has not been invoked within the specified time range, it will not appear in the result.
+        """
+
     @staticmethod
     def df_to_metrics_values(
         *,
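Concrete connectors (the V3IO and TDEngine connectors updated in this release) must now implement these query methods. A minimal hedged sketch of one of them, over an in-memory DataFrame rather than a real TSDB; a real subclass must implement every abstract method before it can be instantiated:

from datetime import datetime
from typing import Union

import pandas as pd

from mlrun.model_monitoring.db.tsdb.base import TSDBConnector


class InMemoryTSDBConnector(TSDBConnector):
    # toy connector: only get_last_request is sketched here
    def __init__(self, project: str, predictions: pd.DataFrame):
        super().__init__(project=project)
        # expected columns: endpoint_id, timestamp, latency
        self._predictions = predictions

    def get_last_request(
        self,
        endpoint_ids: Union[str, list[str]],
        start: Union[datetime, str] = "0",
        end: Union[datetime, str] = "now",
    ) -> pd.DataFrame:
        if isinstance(endpoint_ids, str):
            endpoint_ids = [endpoint_ids]
        df = self._predictions[self._predictions.endpoint_id.isin(endpoint_ids)]
        # latest request (and its latency) per endpoint; start/end filtering omitted
        idx = df.groupby("endpoint_id")["timestamp"].idxmax()
        return df.loc[idx, ["endpoint_id", "timestamp", "latency"]].rename(
            columns={"timestamp": "last_request", "latency": "last_latency"}
        )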