mlrun 1.7.0rc28__py3-none-any.whl → 1.7.0rc55__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of mlrun might be problematic. Click here for more details.
- mlrun/__main__.py +4 -2
- mlrun/alerts/alert.py +75 -8
- mlrun/artifacts/base.py +1 -0
- mlrun/artifacts/manager.py +9 -2
- mlrun/common/constants.py +4 -1
- mlrun/common/db/sql_session.py +3 -2
- mlrun/common/formatters/__init__.py +1 -0
- mlrun/common/formatters/artifact.py +1 -0
- mlrun/{model_monitoring/application.py → common/formatters/feature_set.py} +20 -6
- mlrun/common/formatters/run.py +3 -0
- mlrun/common/helpers.py +0 -1
- mlrun/common/schemas/__init__.py +3 -1
- mlrun/common/schemas/alert.py +15 -12
- mlrun/common/schemas/api_gateway.py +6 -6
- mlrun/common/schemas/auth.py +5 -0
- mlrun/common/schemas/client_spec.py +0 -1
- mlrun/common/schemas/common.py +7 -4
- mlrun/common/schemas/frontend_spec.py +7 -0
- mlrun/common/schemas/function.py +7 -0
- mlrun/common/schemas/model_monitoring/__init__.py +4 -3
- mlrun/common/schemas/model_monitoring/constants.py +41 -26
- mlrun/common/schemas/model_monitoring/model_endpoints.py +23 -47
- mlrun/common/schemas/notification.py +69 -12
- mlrun/common/schemas/project.py +45 -12
- mlrun/common/schemas/workflow.py +10 -2
- mlrun/common/types.py +1 -0
- mlrun/config.py +91 -35
- mlrun/data_types/data_types.py +6 -1
- mlrun/data_types/spark.py +2 -2
- mlrun/data_types/to_pandas.py +57 -25
- mlrun/datastore/__init__.py +1 -0
- mlrun/datastore/alibaba_oss.py +3 -2
- mlrun/datastore/azure_blob.py +125 -37
- mlrun/datastore/base.py +42 -21
- mlrun/datastore/datastore.py +4 -2
- mlrun/datastore/datastore_profile.py +1 -1
- mlrun/datastore/dbfs_store.py +3 -7
- mlrun/datastore/filestore.py +1 -3
- mlrun/datastore/google_cloud_storage.py +85 -29
- mlrun/datastore/inmem.py +4 -1
- mlrun/datastore/redis.py +1 -0
- mlrun/datastore/s3.py +25 -12
- mlrun/datastore/sources.py +76 -4
- mlrun/datastore/spark_utils.py +30 -0
- mlrun/datastore/storeytargets.py +151 -0
- mlrun/datastore/targets.py +102 -131
- mlrun/datastore/v3io.py +1 -0
- mlrun/db/base.py +15 -6
- mlrun/db/httpdb.py +57 -28
- mlrun/db/nopdb.py +29 -5
- mlrun/errors.py +20 -3
- mlrun/execution.py +46 -5
- mlrun/feature_store/api.py +25 -1
- mlrun/feature_store/common.py +6 -11
- mlrun/feature_store/feature_vector.py +3 -1
- mlrun/feature_store/retrieval/job.py +4 -1
- mlrun/feature_store/retrieval/spark_merger.py +10 -39
- mlrun/feature_store/steps.py +8 -0
- mlrun/frameworks/_common/plan.py +3 -3
- mlrun/frameworks/_ml_common/plan.py +1 -1
- mlrun/frameworks/parallel_coordinates.py +2 -3
- mlrun/frameworks/sklearn/mlrun_interface.py +13 -3
- mlrun/k8s_utils.py +48 -2
- mlrun/launcher/client.py +6 -6
- mlrun/launcher/local.py +2 -2
- mlrun/model.py +215 -34
- mlrun/model_monitoring/api.py +38 -24
- mlrun/model_monitoring/applications/__init__.py +1 -2
- mlrun/model_monitoring/applications/_application_steps.py +60 -29
- mlrun/model_monitoring/applications/base.py +2 -174
- mlrun/model_monitoring/applications/context.py +197 -70
- mlrun/model_monitoring/applications/evidently_base.py +11 -85
- mlrun/model_monitoring/applications/histogram_data_drift.py +21 -16
- mlrun/model_monitoring/applications/results.py +4 -4
- mlrun/model_monitoring/controller.py +110 -282
- mlrun/model_monitoring/db/stores/__init__.py +8 -3
- mlrun/model_monitoring/db/stores/base/store.py +3 -0
- mlrun/model_monitoring/db/stores/sqldb/models/base.py +9 -7
- mlrun/model_monitoring/db/stores/sqldb/models/mysql.py +18 -3
- mlrun/model_monitoring/db/stores/sqldb/sql_store.py +43 -23
- mlrun/model_monitoring/db/stores/v3io_kv/kv_store.py +48 -35
- mlrun/model_monitoring/db/tsdb/__init__.py +7 -2
- mlrun/model_monitoring/db/tsdb/base.py +147 -15
- mlrun/model_monitoring/db/tsdb/tdengine/schemas.py +94 -55
- mlrun/model_monitoring/db/tsdb/tdengine/stream_graph_steps.py +0 -3
- mlrun/model_monitoring/db/tsdb/tdengine/tdengine_connector.py +144 -38
- mlrun/model_monitoring/db/tsdb/v3io/stream_graph_steps.py +44 -3
- mlrun/model_monitoring/db/tsdb/v3io/v3io_connector.py +246 -57
- mlrun/model_monitoring/helpers.py +70 -50
- mlrun/model_monitoring/stream_processing.py +96 -195
- mlrun/model_monitoring/writer.py +13 -5
- mlrun/package/packagers/default_packager.py +2 -2
- mlrun/projects/operations.py +16 -8
- mlrun/projects/pipelines.py +126 -115
- mlrun/projects/project.py +286 -129
- mlrun/render.py +3 -3
- mlrun/run.py +38 -19
- mlrun/runtimes/__init__.py +19 -8
- mlrun/runtimes/base.py +4 -1
- mlrun/runtimes/daskjob.py +1 -1
- mlrun/runtimes/funcdoc.py +1 -1
- mlrun/runtimes/kubejob.py +6 -6
- mlrun/runtimes/local.py +12 -5
- mlrun/runtimes/nuclio/api_gateway.py +68 -8
- mlrun/runtimes/nuclio/application/application.py +307 -70
- mlrun/runtimes/nuclio/function.py +63 -14
- mlrun/runtimes/nuclio/serving.py +10 -10
- mlrun/runtimes/pod.py +25 -19
- mlrun/runtimes/remotesparkjob.py +2 -5
- mlrun/runtimes/sparkjob/spark3job.py +16 -17
- mlrun/runtimes/utils.py +34 -0
- mlrun/serving/routers.py +2 -5
- mlrun/serving/server.py +37 -19
- mlrun/serving/states.py +30 -3
- mlrun/serving/v2_serving.py +44 -35
- mlrun/track/trackers/mlflow_tracker.py +5 -0
- mlrun/utils/async_http.py +1 -1
- mlrun/utils/db.py +18 -0
- mlrun/utils/helpers.py +150 -36
- mlrun/utils/http.py +1 -1
- mlrun/utils/notifications/notification/__init__.py +0 -1
- mlrun/utils/notifications/notification/webhook.py +8 -1
- mlrun/utils/notifications/notification_pusher.py +1 -1
- mlrun/utils/v3io_clients.py +2 -2
- mlrun/utils/version/version.json +2 -2
- {mlrun-1.7.0rc28.dist-info → mlrun-1.7.0rc55.dist-info}/METADATA +153 -66
- {mlrun-1.7.0rc28.dist-info → mlrun-1.7.0rc55.dist-info}/RECORD +131 -134
- {mlrun-1.7.0rc28.dist-info → mlrun-1.7.0rc55.dist-info}/WHEEL +1 -1
- mlrun/feature_store/retrieval/conversion.py +0 -271
- mlrun/model_monitoring/controller_handler.py +0 -37
- mlrun/model_monitoring/evidently_application.py +0 -20
- mlrun/model_monitoring/prometheus.py +0 -216
- {mlrun-1.7.0rc28.dist-info → mlrun-1.7.0rc55.dist-info}/LICENSE +0 -0
- {mlrun-1.7.0rc28.dist-info → mlrun-1.7.0rc55.dist-info}/entry_points.txt +0 -0
- {mlrun-1.7.0rc28.dist-info → mlrun-1.7.0rc55.dist-info}/top_level.txt +0 -0
|
@@ -18,6 +18,7 @@ from sqlalchemy.ext.declarative import declarative_base, declared_attr
|
|
|
18
18
|
|
|
19
19
|
from mlrun.common.schemas.model_monitoring import (
|
|
20
20
|
EventFieldType,
|
|
21
|
+
ResultData,
|
|
21
22
|
WriterEvent,
|
|
22
23
|
)
|
|
23
24
|
|
|
@@ -32,12 +33,21 @@ Base = declarative_base()
|
|
|
32
33
|
|
|
33
34
|
|
|
34
35
|
class ModelEndpointsTable(Base, ModelEndpointsBaseTable):
|
|
36
|
+
feature_stats = Column(
|
|
37
|
+
EventFieldType.FEATURE_STATS, sqlalchemy.dialects.mysql.MEDIUMTEXT
|
|
38
|
+
)
|
|
39
|
+
current_stats = Column(
|
|
40
|
+
EventFieldType.CURRENT_STATS, sqlalchemy.dialects.mysql.MEDIUMTEXT
|
|
41
|
+
)
|
|
42
|
+
metrics = Column(EventFieldType.METRICS, sqlalchemy.dialects.mysql.MEDIUMTEXT)
|
|
35
43
|
first_request = Column(
|
|
36
44
|
EventFieldType.FIRST_REQUEST,
|
|
45
|
+
# TODO: migrate to DATETIME, see ML-6921
|
|
37
46
|
sqlalchemy.dialects.mysql.TIMESTAMP(fsp=3, timezone=True),
|
|
38
47
|
)
|
|
39
48
|
last_request = Column(
|
|
40
49
|
EventFieldType.LAST_REQUEST,
|
|
50
|
+
# TODO: migrate to DATETIME, see ML-6921
|
|
41
51
|
sqlalchemy.dialects.mysql.TIMESTAMP(fsp=3, timezone=True),
|
|
42
52
|
)
|
|
43
53
|
|
|
@@ -52,11 +62,11 @@ class _ApplicationResultOrMetric:
|
|
|
52
62
|
|
|
53
63
|
start_infer_time = Column(
|
|
54
64
|
WriterEvent.START_INFER_TIME,
|
|
55
|
-
sqlalchemy.dialects.mysql.
|
|
65
|
+
sqlalchemy.dialects.mysql.DATETIME(fsp=3, timezone=True),
|
|
56
66
|
)
|
|
57
67
|
end_infer_time = Column(
|
|
58
68
|
WriterEvent.END_INFER_TIME,
|
|
59
|
-
sqlalchemy.dialects.mysql.
|
|
69
|
+
sqlalchemy.dialects.mysql.DATETIME(fsp=3, timezone=True),
|
|
60
70
|
)
|
|
61
71
|
|
|
62
72
|
@declared_attr
|
|
@@ -70,7 +80,12 @@ class _ApplicationResultOrMetric:
|
|
|
70
80
|
class ApplicationResultTable(
|
|
71
81
|
Base, _ApplicationResultOrMetric, ApplicationResultBaseTable
|
|
72
82
|
):
|
|
73
|
-
|
|
83
|
+
result_extra_data = Column(
|
|
84
|
+
ResultData.RESULT_EXTRA_DATA, sqlalchemy.dialects.mysql.MEDIUMTEXT
|
|
85
|
+
)
|
|
86
|
+
current_stats = Column(
|
|
87
|
+
ResultData.CURRENT_STATS, sqlalchemy.dialects.mysql.MEDIUMTEXT
|
|
88
|
+
)
|
|
74
89
|
|
|
75
90
|
|
|
76
91
|
class ApplicationMetricsTable(
|
|
@@ -20,7 +20,7 @@ import pandas as pd
|
|
|
20
20
|
import sqlalchemy
|
|
21
21
|
import sqlalchemy.exc
|
|
22
22
|
import sqlalchemy.orm
|
|
23
|
-
from sqlalchemy.engine import make_url
|
|
23
|
+
from sqlalchemy.engine import Engine, make_url
|
|
24
24
|
from sqlalchemy.sql.elements import BinaryExpression
|
|
25
25
|
|
|
26
26
|
import mlrun.common.model_monitoring.helpers
|
|
@@ -61,9 +61,15 @@ class SQLStoreBase(StoreBase):
|
|
|
61
61
|
)
|
|
62
62
|
|
|
63
63
|
self._sql_connection_string = kwargs.get("store_connection_string")
|
|
64
|
-
self._engine =
|
|
64
|
+
self._engine = None
|
|
65
65
|
self._init_tables()
|
|
66
66
|
|
|
67
|
+
@property
|
|
68
|
+
def engine(self) -> Engine:
|
|
69
|
+
if not self._engine:
|
|
70
|
+
self._engine = get_engine(dsn=self._sql_connection_string)
|
|
71
|
+
return self._engine
|
|
72
|
+
|
|
67
73
|
def create_tables(self):
|
|
68
74
|
self._create_tables_if_not_exist()
|
|
69
75
|
|
|
@@ -116,7 +122,7 @@ class SQLStoreBase(StoreBase):
|
|
|
116
122
|
:param table_name: Target table name.
|
|
117
123
|
:param event: Event dictionary that will be written into the DB.
|
|
118
124
|
"""
|
|
119
|
-
with self.
|
|
125
|
+
with self.engine.connect() as connection:
|
|
120
126
|
# Convert the result into a pandas Dataframe and write it into the database
|
|
121
127
|
event_df = pd.DataFrame([event])
|
|
122
128
|
event_df.to_sql(table_name, con=connection, index=False, if_exists="append")
|
|
@@ -177,6 +183,11 @@ class SQLStoreBase(StoreBase):
|
|
|
177
183
|
param table: SQLAlchemy declarative table.
|
|
178
184
|
:param criteria: A list of binary expressions that filter the query.
|
|
179
185
|
"""
|
|
186
|
+
if not self.engine.has_table(table.__tablename__):
|
|
187
|
+
logger.debug(
|
|
188
|
+
f"Table {table.__tablename__} does not exist in the database. Skipping deletion."
|
|
189
|
+
)
|
|
190
|
+
return
|
|
180
191
|
with create_session(dsn=self._sql_connection_string) as session:
|
|
181
192
|
# Generate and commit the delete query
|
|
182
193
|
session.query(
|
|
@@ -266,22 +277,8 @@ class SQLStoreBase(StoreBase):
|
|
|
266
277
|
labels: list[str] = None,
|
|
267
278
|
top_level: bool = None,
|
|
268
279
|
uids: list = None,
|
|
280
|
+
include_stats: bool = None,
|
|
269
281
|
) -> list[dict[str, typing.Any]]:
|
|
270
|
-
"""
|
|
271
|
-
Returns a list of model endpoint dictionaries, supports filtering by model, function, labels or top level.
|
|
272
|
-
By default, when no filters are applied, all available model endpoints for the given project will
|
|
273
|
-
be listed.
|
|
274
|
-
|
|
275
|
-
:param model: The name of the model to filter by.
|
|
276
|
-
:param function: The name of the function to filter by.
|
|
277
|
-
:param labels: A list of labels to filter by. Label filters work by either filtering a specific value
|
|
278
|
-
of a label (i.e. list("key=value")) or by looking for the existence of a given
|
|
279
|
-
key (i.e. "key").
|
|
280
|
-
:param top_level: If True will return only routers and endpoint that are NOT children of any router.
|
|
281
|
-
:param uids: List of model endpoint unique ids to include in the result.
|
|
282
|
-
|
|
283
|
-
:return: A list of model endpoint dictionaries.
|
|
284
|
-
"""
|
|
285
282
|
# Generate an empty model endpoints that will be filled afterwards with model endpoint dictionaries
|
|
286
283
|
endpoint_list = []
|
|
287
284
|
|
|
@@ -341,6 +338,12 @@ class SQLStoreBase(StoreBase):
|
|
|
341
338
|
):
|
|
342
339
|
continue
|
|
343
340
|
|
|
341
|
+
if not include_stats:
|
|
342
|
+
# Exclude these fields when listing model endpoints to avoid returning too much data (ML-6594)
|
|
343
|
+
# TODO: Remove stats from table schema (ML-7196)
|
|
344
|
+
endpoint_dict.pop(mm_schemas.EventFieldType.FEATURE_STATS)
|
|
345
|
+
endpoint_dict.pop(mm_schemas.EventFieldType.CURRENT_STATS)
|
|
346
|
+
|
|
344
347
|
endpoint_list.append(endpoint_dict)
|
|
345
348
|
|
|
346
349
|
return endpoint_list
|
|
@@ -527,9 +530,9 @@ class SQLStoreBase(StoreBase):
|
|
|
527
530
|
for table in self._tables:
|
|
528
531
|
# Create table if not exist. The `metadata` contains the `ModelEndpointsTable`
|
|
529
532
|
db_name = make_url(self._sql_connection_string).database
|
|
530
|
-
if not self.
|
|
533
|
+
if not self.engine.has_table(table):
|
|
531
534
|
logger.info(f"Creating table {table} on {db_name} db.")
|
|
532
|
-
self._tables[table].metadata.create_all(bind=self.
|
|
535
|
+
self._tables[table].metadata.create_all(bind=self.engine)
|
|
533
536
|
else:
|
|
534
537
|
logger.info(f"Table {table} already exists on {db_name} db.")
|
|
535
538
|
|
|
@@ -577,12 +580,19 @@ class SQLStoreBase(StoreBase):
|
|
|
577
580
|
"""
|
|
578
581
|
Delete all the model monitoring resources of the project in the SQL tables.
|
|
579
582
|
"""
|
|
583
|
+
logger.debug(
|
|
584
|
+
"Deleting model monitoring endpoints resources from the SQL tables",
|
|
585
|
+
project=self.project,
|
|
586
|
+
)
|
|
580
587
|
endpoints = self.list_model_endpoints()
|
|
581
|
-
logger.debug("Deleting model monitoring resources", project=self.project)
|
|
582
588
|
|
|
583
589
|
for endpoint_dict in endpoints:
|
|
584
590
|
endpoint_id = endpoint_dict[mm_schemas.EventFieldType.UID]
|
|
585
|
-
|
|
591
|
+
logger.debug(
|
|
592
|
+
"Deleting model endpoint resources from the SQL tables",
|
|
593
|
+
endpoint_id=endpoint_id,
|
|
594
|
+
project=self.project,
|
|
595
|
+
)
|
|
586
596
|
# Delete last analyzed records
|
|
587
597
|
self._delete_last_analyzed(endpoint_id=endpoint_id)
|
|
588
598
|
|
|
@@ -592,6 +602,16 @@ class SQLStoreBase(StoreBase):
|
|
|
592
602
|
|
|
593
603
|
# Delete model endpoint record
|
|
594
604
|
self.delete_model_endpoint(endpoint_id=endpoint_id)
|
|
605
|
+
logger.debug(
|
|
606
|
+
"Successfully deleted model endpoint resources",
|
|
607
|
+
endpoint_id=endpoint_id,
|
|
608
|
+
project=self.project,
|
|
609
|
+
)
|
|
610
|
+
|
|
611
|
+
logger.debug(
|
|
612
|
+
"Successfully deleted model monitoring endpoints resources from the SQL tables",
|
|
613
|
+
project=self.project,
|
|
614
|
+
)
|
|
595
615
|
|
|
596
616
|
def get_model_endpoint_metrics(
|
|
597
617
|
self, endpoint_id: str, type: mm_schemas.ModelEndpointMonitoringMetricType
|
|
@@ -615,7 +635,7 @@ class SQLStoreBase(StoreBase):
|
|
|
615
635
|
|
|
616
636
|
# Note: the block below does not use self._get, as we need here all the
|
|
617
637
|
# results, not only `one_or_none`.
|
|
618
|
-
with sqlalchemy.orm.Session(self.
|
|
638
|
+
with sqlalchemy.orm.Session(self.engine) as session:
|
|
619
639
|
metric_rows = (
|
|
620
640
|
session.query(table) # pyright: ignore[reportOptionalCall]
|
|
621
641
|
.filter(table.endpoint_id == endpoint_id)
|
|
@@ -11,7 +11,7 @@
|
|
|
11
11
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
12
12
|
# See the License for the specific language governing permissions and
|
|
13
13
|
# limitations under the License.
|
|
14
|
-
|
|
14
|
+
import http
|
|
15
15
|
import json
|
|
16
16
|
import typing
|
|
17
17
|
from dataclasses import dataclass
|
|
@@ -20,6 +20,7 @@ from http import HTTPStatus
|
|
|
20
20
|
import v3io.dataplane
|
|
21
21
|
import v3io.dataplane.output
|
|
22
22
|
import v3io.dataplane.response
|
|
23
|
+
from v3io.dataplane import Client as V3IOClient
|
|
23
24
|
|
|
24
25
|
import mlrun.common.model_monitoring.helpers
|
|
25
26
|
import mlrun.common.schemas.model_monitoring as mm_schemas
|
|
@@ -34,11 +35,11 @@ fields_to_encode_decode = [
|
|
|
34
35
|
]
|
|
35
36
|
|
|
36
37
|
_METRIC_FIELDS: list[str] = [
|
|
37
|
-
mm_schemas.WriterEvent.APPLICATION_NAME,
|
|
38
|
-
mm_schemas.MetricData.METRIC_NAME,
|
|
39
|
-
mm_schemas.MetricData.METRIC_VALUE,
|
|
40
|
-
mm_schemas.WriterEvent.START_INFER_TIME,
|
|
41
|
-
mm_schemas.WriterEvent.END_INFER_TIME,
|
|
38
|
+
mm_schemas.WriterEvent.APPLICATION_NAME.value,
|
|
39
|
+
mm_schemas.MetricData.METRIC_NAME.value,
|
|
40
|
+
mm_schemas.MetricData.METRIC_VALUE.value,
|
|
41
|
+
mm_schemas.WriterEvent.START_INFER_TIME.value,
|
|
42
|
+
mm_schemas.WriterEvent.END_INFER_TIME.value,
|
|
42
43
|
]
|
|
43
44
|
|
|
44
45
|
|
|
@@ -100,13 +101,18 @@ class KVStoreBase(StoreBase):
|
|
|
100
101
|
project: str,
|
|
101
102
|
) -> None:
|
|
102
103
|
super().__init__(project=project)
|
|
103
|
-
|
|
104
|
-
self.client = mlrun.utils.v3io_clients.get_v3io_client(
|
|
105
|
-
endpoint=mlrun.mlconf.v3io_api,
|
|
106
|
-
)
|
|
104
|
+
self._client = None
|
|
107
105
|
# Get the KV table path and container
|
|
108
106
|
self.path, self.container = self._get_path_and_container()
|
|
109
107
|
|
|
108
|
+
@property
|
|
109
|
+
def client(self) -> V3IOClient:
|
|
110
|
+
if not self._client:
|
|
111
|
+
self._client = mlrun.utils.v3io_clients.get_v3io_client(
|
|
112
|
+
endpoint=mlrun.mlconf.v3io_api,
|
|
113
|
+
)
|
|
114
|
+
return self._client
|
|
115
|
+
|
|
110
116
|
def write_model_endpoint(self, endpoint: dict[str, typing.Any]):
|
|
111
117
|
"""
|
|
112
118
|
Create a new endpoint record in the KV table.
|
|
@@ -226,24 +232,8 @@ class KVStoreBase(StoreBase):
|
|
|
226
232
|
labels: list[str] = None,
|
|
227
233
|
top_level: bool = None,
|
|
228
234
|
uids: list = None,
|
|
235
|
+
include_stats: bool = None,
|
|
229
236
|
) -> list[dict[str, typing.Any]]:
|
|
230
|
-
"""
|
|
231
|
-
Returns a list of model endpoint dictionaries, supports filtering by model, function, labels or top level.
|
|
232
|
-
By default, when no filters are applied, all available model endpoints for the given project will
|
|
233
|
-
be listed.
|
|
234
|
-
|
|
235
|
-
:param model: The name of the model to filter by.
|
|
236
|
-
:param function: The name of the function to filter by.
|
|
237
|
-
:param labels: A list of labels to filter by. Label filters work by either filtering a specific value
|
|
238
|
-
of a label (i.e. list("key=value")) or by looking for the existence of a given
|
|
239
|
-
key (i.e. "key").
|
|
240
|
-
:param top_level: If True will return only routers and endpoint that are NOT children of any router.
|
|
241
|
-
:param uids: List of model endpoint unique ids to include in the result.
|
|
242
|
-
|
|
243
|
-
|
|
244
|
-
:return: A list of model endpoint dictionaries.
|
|
245
|
-
"""
|
|
246
|
-
|
|
247
237
|
# # Initialize an empty model endpoints list
|
|
248
238
|
endpoint_list = []
|
|
249
239
|
|
|
@@ -283,6 +273,10 @@ class KVStoreBase(StoreBase):
|
|
|
283
273
|
endpoint_dict = self.get_model_endpoint(
|
|
284
274
|
endpoint_id=endpoint_id,
|
|
285
275
|
)
|
|
276
|
+
if not include_stats:
|
|
277
|
+
# Exclude these fields when listing model endpoints to avoid returning too much data (ML-6594)
|
|
278
|
+
endpoint_dict.pop(mm_schemas.EventFieldType.FEATURE_STATS)
|
|
279
|
+
endpoint_dict.pop(mm_schemas.EventFieldType.CURRENT_STATS)
|
|
286
280
|
|
|
287
281
|
if labels and not self._validate_labels(
|
|
288
282
|
endpoint_dict=endpoint_dict, labels=labels
|
|
@@ -297,6 +291,10 @@ class KVStoreBase(StoreBase):
|
|
|
297
291
|
"""
|
|
298
292
|
Delete all model endpoints resources in V3IO KV.
|
|
299
293
|
"""
|
|
294
|
+
logger.debug(
|
|
295
|
+
"Deleting model monitoring endpoints resources in V3IO KV",
|
|
296
|
+
project=self.project,
|
|
297
|
+
)
|
|
300
298
|
|
|
301
299
|
endpoints = self.list_model_endpoints()
|
|
302
300
|
|
|
@@ -307,10 +305,22 @@ class KVStoreBase(StoreBase):
|
|
|
307
305
|
endpoint_id = endpoint_dict[mm_schemas.EventFieldType.ENDPOINT_ID]
|
|
308
306
|
else:
|
|
309
307
|
endpoint_id = endpoint_dict[mm_schemas.EventFieldType.UID]
|
|
308
|
+
|
|
309
|
+
logger.debug(
|
|
310
|
+
"Deleting model endpoint resources from the V3IO KV table",
|
|
311
|
+
endpoint_id=endpoint_id,
|
|
312
|
+
project=self.project,
|
|
313
|
+
)
|
|
314
|
+
|
|
310
315
|
self.delete_model_endpoint(
|
|
311
316
|
endpoint_id,
|
|
312
317
|
)
|
|
313
318
|
|
|
319
|
+
logger.debug(
|
|
320
|
+
"Successfully deleted model monitoring endpoints from the V3IO KV table",
|
|
321
|
+
project=self.project,
|
|
322
|
+
)
|
|
323
|
+
|
|
314
324
|
# Delete remain records in the KV
|
|
315
325
|
all_records = self.client.kv.new_cursor(
|
|
316
326
|
container=self.container,
|
|
@@ -362,7 +372,7 @@ class KVStoreBase(StoreBase):
|
|
|
362
372
|
table_path = self._get_results_table_path(endpoint_id)
|
|
363
373
|
key = event.pop(mm_schemas.WriterEvent.APPLICATION_NAME)
|
|
364
374
|
metric_name = event.pop(mm_schemas.ResultData.RESULT_NAME)
|
|
365
|
-
attributes = {metric_name: json.dumps(event)}
|
|
375
|
+
attributes = {metric_name: self._encode_field(json.dumps(event))}
|
|
366
376
|
else:
|
|
367
377
|
raise ValueError(f"Invalid {kind = }")
|
|
368
378
|
|
|
@@ -420,20 +430,23 @@ class KVStoreBase(StoreBase):
|
|
|
420
430
|
|
|
421
431
|
"""
|
|
422
432
|
try:
|
|
423
|
-
|
|
433
|
+
response = self.client.kv.get(
|
|
424
434
|
container=self._get_monitoring_schedules_container(
|
|
425
435
|
project_name=self.project
|
|
426
436
|
),
|
|
427
437
|
table_path=endpoint_id,
|
|
428
438
|
key=application_name,
|
|
429
439
|
)
|
|
430
|
-
return
|
|
440
|
+
return response.output.item[mm_schemas.SchedulingKeys.LAST_ANALYZED]
|
|
431
441
|
except v3io.dataplane.response.HttpResponseError as err:
|
|
432
|
-
|
|
433
|
-
|
|
434
|
-
|
|
435
|
-
|
|
436
|
-
|
|
442
|
+
if err.status_code == http.HTTPStatus.NOT_FOUND:
|
|
443
|
+
logger.debug("Last analyzed time not found", err=err)
|
|
444
|
+
raise mlrun.errors.MLRunNotFoundError(
|
|
445
|
+
f"No last analyzed value has been found for {application_name} "
|
|
446
|
+
f"that processes model endpoint {endpoint_id}",
|
|
447
|
+
)
|
|
448
|
+
logger.error("Error while getting last analyzed time", err=err)
|
|
449
|
+
raise err
|
|
437
450
|
|
|
438
451
|
def update_last_analyzed(
|
|
439
452
|
self, endpoint_id: str, application_name: str, last_analyzed: int
|
|
@@ -57,7 +57,7 @@ class ObjectTSDBFactory(enum.Enum):
|
|
|
57
57
|
:param value: Provided enum (invalid) value.
|
|
58
58
|
"""
|
|
59
59
|
valid_values = list(cls.__members__.keys())
|
|
60
|
-
raise mlrun.errors.
|
|
60
|
+
raise mlrun.errors.MLRunInvalidMMStoreTypeError(
|
|
61
61
|
f"{value} is not a valid tsdb, please choose a valid value: %{valid_values}."
|
|
62
62
|
)
|
|
63
63
|
|
|
@@ -76,6 +76,8 @@ def get_tsdb_connector(
|
|
|
76
76
|
|
|
77
77
|
:return: `TSDBConnector` object. The main goal of this object is to handle different operations on the
|
|
78
78
|
TSDB connector such as updating drift metrics or write application record result.
|
|
79
|
+
:raise: `MLRunInvalidMMStoreTypeError` if the user didn't provide TSDB connection
|
|
80
|
+
or the provided TSDB connection is invalid.
|
|
79
81
|
"""
|
|
80
82
|
|
|
81
83
|
tsdb_connection_string = (
|
|
@@ -91,7 +93,10 @@ def get_tsdb_connector(
|
|
|
91
93
|
elif tsdb_connection_string and tsdb_connection_string == "v3io":
|
|
92
94
|
tsdb_connector_type = mlrun.common.schemas.model_monitoring.TSDBTarget.V3IO_TSDB
|
|
93
95
|
else:
|
|
94
|
-
|
|
96
|
+
raise mlrun.errors.MLRunInvalidMMStoreTypeError(
|
|
97
|
+
"You must provide a valid tsdb store connection by using "
|
|
98
|
+
"set_model_monitoring_credentials API."
|
|
99
|
+
)
|
|
95
100
|
|
|
96
101
|
# Get connector type value from ObjectTSDBFactory enum class
|
|
97
102
|
tsdb_connector_factory = ObjectTSDBFactory(tsdb_connector_type)
|
|
@@ -15,8 +15,10 @@
|
|
|
15
15
|
import typing
|
|
16
16
|
from abc import ABC, abstractmethod
|
|
17
17
|
from datetime import datetime
|
|
18
|
+
from typing import Union
|
|
18
19
|
|
|
19
20
|
import pandas as pd
|
|
21
|
+
import pydantic
|
|
20
22
|
|
|
21
23
|
import mlrun.common.schemas.model_monitoring as mm_schemas
|
|
22
24
|
import mlrun.model_monitoring.db.tsdb.helpers
|
|
@@ -27,7 +29,7 @@ from mlrun.utils import logger
|
|
|
27
29
|
class TSDBConnector(ABC):
|
|
28
30
|
type: typing.ClassVar[str]
|
|
29
31
|
|
|
30
|
-
def __init__(self, project: str):
|
|
32
|
+
def __init__(self, project: str) -> None:
|
|
31
33
|
"""
|
|
32
34
|
Initialize a new TSDB connector. The connector is used to interact with the TSDB and store monitoring data.
|
|
33
35
|
At the moment we have 3 different types of monitoring data:
|
|
@@ -42,11 +44,11 @@ class TSDBConnector(ABC):
|
|
|
42
44
|
writer.
|
|
43
45
|
|
|
44
46
|
:param project: the name of the project.
|
|
45
|
-
|
|
46
47
|
"""
|
|
47
48
|
self.project = project
|
|
48
49
|
|
|
49
|
-
|
|
50
|
+
@abstractmethod
|
|
51
|
+
def apply_monitoring_stream_steps(self, graph) -> None:
|
|
50
52
|
"""
|
|
51
53
|
Apply TSDB steps on the provided monitoring graph. Throughout these steps, the graph stores live data of
|
|
52
54
|
different key metric dictionaries. This data is being used by the monitoring dashboards in
|
|
@@ -58,6 +60,15 @@ class TSDBConnector(ABC):
|
|
|
58
60
|
"""
|
|
59
61
|
pass
|
|
60
62
|
|
|
63
|
+
@abstractmethod
|
|
64
|
+
def handle_model_error(self, graph, **kwargs) -> None:
|
|
65
|
+
"""
|
|
66
|
+
Adds a branch to the stream pod graph to handle events that
|
|
67
|
+
arrive with errors from the model server and saves them to the error TSDB table.
|
|
68
|
+
The first step that generates by this method should come after `ForwardError` step.
|
|
69
|
+
"""
|
|
70
|
+
|
|
71
|
+
@abstractmethod
|
|
61
72
|
def write_application_event(
|
|
62
73
|
self,
|
|
63
74
|
event: dict,
|
|
@@ -69,13 +80,14 @@ class TSDBConnector(ABC):
|
|
|
69
80
|
:raise mlrun.errors.MLRunRuntimeError: If an error occurred while writing the event.
|
|
70
81
|
"""
|
|
71
82
|
|
|
83
|
+
@abstractmethod
|
|
72
84
|
def delete_tsdb_resources(self):
|
|
73
85
|
"""
|
|
74
86
|
Delete all project resources in the TSDB connector, such as model endpoints data and drift results.
|
|
75
87
|
"""
|
|
76
|
-
|
|
77
88
|
pass
|
|
78
89
|
|
|
90
|
+
@abstractmethod
|
|
79
91
|
def get_model_endpoint_real_time_metrics(
|
|
80
92
|
self,
|
|
81
93
|
endpoint_id: str,
|
|
@@ -102,6 +114,7 @@ class TSDBConnector(ABC):
|
|
|
102
114
|
"""
|
|
103
115
|
pass
|
|
104
116
|
|
|
117
|
+
@abstractmethod
|
|
105
118
|
def create_tables(self) -> None:
|
|
106
119
|
"""
|
|
107
120
|
Create the TSDB tables using the TSDB connector. At the moment we support 3 types of tables:
|
|
@@ -177,6 +190,117 @@ class TSDBConnector(ABC):
|
|
|
177
190
|
:return: Metric values object or no data object.
|
|
178
191
|
"""
|
|
179
192
|
|
|
193
|
+
@abstractmethod
|
|
194
|
+
def get_last_request(
|
|
195
|
+
self,
|
|
196
|
+
endpoint_ids: Union[str, list[str]],
|
|
197
|
+
start: Union[datetime, str] = "0",
|
|
198
|
+
end: Union[datetime, str] = "now",
|
|
199
|
+
) -> pd.DataFrame:
|
|
200
|
+
"""
|
|
201
|
+
Fetches data from the predictions TSDB table and returns the most recent request
|
|
202
|
+
timestamp for each specified endpoint.
|
|
203
|
+
|
|
204
|
+
:param endpoint_ids: A list of model endpoint identifiers.
|
|
205
|
+
:param start: The start time for the query.
|
|
206
|
+
:param end: The end time for the query.
|
|
207
|
+
|
|
208
|
+
:return: A pd.DataFrame containing the columns [endpoint_id, last_request, last_latency].
|
|
209
|
+
If an endpoint has not been invoked within the specified time range, it will not appear in the result.
|
|
210
|
+
"""
|
|
211
|
+
|
|
212
|
+
@abstractmethod
|
|
213
|
+
def get_drift_status(
|
|
214
|
+
self,
|
|
215
|
+
endpoint_ids: Union[str, list[str]],
|
|
216
|
+
start: Union[datetime, str] = "now-24h",
|
|
217
|
+
end: Union[datetime, str] = "now",
|
|
218
|
+
) -> pd.DataFrame:
|
|
219
|
+
"""
|
|
220
|
+
Fetches data from the app-results TSDB table and returns the highest status among all
|
|
221
|
+
the result in the provided time range, which by default is the last 24 hours, for each specified endpoint.
|
|
222
|
+
|
|
223
|
+
:param endpoint_ids: A list of model endpoint identifiers.
|
|
224
|
+
:param start: The start time for the query.
|
|
225
|
+
:param end: The end time for the query.
|
|
226
|
+
|
|
227
|
+
:return: A pd.DataFrame containing the columns [result_status, endpoint_id].
|
|
228
|
+
If an endpoint has not been monitored within the specified time range (last 24 hours),
|
|
229
|
+
it will not appear in the result.
|
|
230
|
+
"""
|
|
231
|
+
|
|
232
|
+
@abstractmethod
|
|
233
|
+
def get_metrics_metadata(
|
|
234
|
+
self,
|
|
235
|
+
endpoint_id: str,
|
|
236
|
+
start: Union[datetime, str] = "0",
|
|
237
|
+
end: Union[datetime, str] = "now",
|
|
238
|
+
) -> pd.DataFrame:
|
|
239
|
+
"""
|
|
240
|
+
Fetches distinct metrics metadata from the metrics TSDB table for a specified model endpoint.
|
|
241
|
+
|
|
242
|
+
:param endpoint_id: The model endpoint identifier.
|
|
243
|
+
:param start: The start time of the query.
|
|
244
|
+
:param end: The end time of the query.
|
|
245
|
+
|
|
246
|
+
:return: A pd.DataFrame containing all distinct metrics for the specified endpoint within the given time range.
|
|
247
|
+
Containing the columns [application_name, metric_name, endpoint_id]
|
|
248
|
+
"""
|
|
249
|
+
|
|
250
|
+
@abstractmethod
|
|
251
|
+
def get_results_metadata(
|
|
252
|
+
self,
|
|
253
|
+
endpoint_id: str,
|
|
254
|
+
start: Union[datetime, str] = "0",
|
|
255
|
+
end: Union[datetime, str] = "now",
|
|
256
|
+
) -> pd.DataFrame:
|
|
257
|
+
"""
|
|
258
|
+
Fetches distinct results metadata from the app-results TSDB table for a specified model endpoint.
|
|
259
|
+
|
|
260
|
+
:param endpoint_id: The model endpoint identifier.
|
|
261
|
+
:param start: The start time of the query.
|
|
262
|
+
:param end: The end time of the query.
|
|
263
|
+
|
|
264
|
+
:return: A pd.DataFrame containing all distinct results for the specified endpoint within the given time range.
|
|
265
|
+
Containing the columns [application_name, result_name, result_kind, endpoint_id]
|
|
266
|
+
"""
|
|
267
|
+
|
|
268
|
+
@abstractmethod
|
|
269
|
+
def get_error_count(
|
|
270
|
+
self,
|
|
271
|
+
endpoint_ids: Union[str, list[str]],
|
|
272
|
+
start: Union[datetime, str] = "0",
|
|
273
|
+
end: Union[datetime, str] = "now",
|
|
274
|
+
) -> pd.DataFrame:
|
|
275
|
+
"""
|
|
276
|
+
Fetches data from the error TSDB table and returns the error count for each specified endpoint.
|
|
277
|
+
|
|
278
|
+
:param endpoint_ids: A list of model endpoint identifiers.
|
|
279
|
+
:param start: The start time for the query.
|
|
280
|
+
:param end: The end time for the query.
|
|
281
|
+
|
|
282
|
+
:return: A pd.DataFrame containing the columns [error_count, endpoint_id].
|
|
283
|
+
If an endpoint have not raised error within the specified time range, it will not appear in the result.
|
|
284
|
+
"""
|
|
285
|
+
|
|
286
|
+
@abstractmethod
|
|
287
|
+
def get_avg_latency(
|
|
288
|
+
self,
|
|
289
|
+
endpoint_ids: Union[str, list[str]],
|
|
290
|
+
start: Union[datetime, str] = "0",
|
|
291
|
+
end: Union[datetime, str] = "now",
|
|
292
|
+
) -> pd.DataFrame:
|
|
293
|
+
"""
|
|
294
|
+
Fetches data from the predictions TSDB table and returns the average latency for each specified endpoint
|
|
295
|
+
|
|
296
|
+
:param endpoint_ids: A list of model endpoint identifiers.
|
|
297
|
+
:param start: The start time for the query.
|
|
298
|
+
:param end: The end time for the query.
|
|
299
|
+
|
|
300
|
+
:return: A pd.DataFrame containing the columns [avg_latency, endpoint_id].
|
|
301
|
+
If an endpoint has not been invoked within the specified time range, it will not appear in the result.
|
|
302
|
+
"""
|
|
303
|
+
|
|
180
304
|
@staticmethod
|
|
181
305
|
def df_to_metrics_values(
|
|
182
306
|
*,
|
|
@@ -286,19 +410,27 @@ class TSDBConnector(ABC):
|
|
|
286
410
|
full_name = mlrun.model_monitoring.helpers._compose_full_name(
|
|
287
411
|
project=project, app=app_name, name=name
|
|
288
412
|
)
|
|
289
|
-
|
|
290
|
-
|
|
413
|
+
try:
|
|
414
|
+
metrics_values.append(
|
|
415
|
+
mm_schemas.ModelEndpointMonitoringResultValues(
|
|
416
|
+
full_name=full_name,
|
|
417
|
+
result_kind=result_kind,
|
|
418
|
+
values=list(
|
|
419
|
+
zip(
|
|
420
|
+
sub_df.index,
|
|
421
|
+
sub_df[mm_schemas.ResultData.RESULT_VALUE],
|
|
422
|
+
sub_df[mm_schemas.ResultData.RESULT_STATUS],
|
|
423
|
+
)
|
|
424
|
+
), # pyright: ignore[reportArgumentType]
|
|
425
|
+
)
|
|
426
|
+
)
|
|
427
|
+
except pydantic.ValidationError:
|
|
428
|
+
logger.exception(
|
|
429
|
+
"Failed to convert data-frame into `ModelEndpointMonitoringResultValues`",
|
|
291
430
|
full_name=full_name,
|
|
292
|
-
|
|
293
|
-
values=list(
|
|
294
|
-
zip(
|
|
295
|
-
sub_df.index,
|
|
296
|
-
sub_df[mm_schemas.ResultData.RESULT_VALUE],
|
|
297
|
-
sub_df[mm_schemas.ResultData.RESULT_STATUS],
|
|
298
|
-
)
|
|
299
|
-
), # pyright: ignore[reportArgumentType]
|
|
431
|
+
sub_df_json=sub_df.to_json(),
|
|
300
432
|
)
|
|
301
|
-
|
|
433
|
+
raise
|
|
302
434
|
del metrics_without_data[full_name]
|
|
303
435
|
|
|
304
436
|
for metric in metrics_without_data.values():
|