mlrun 1.7.2rc4__py3-none-any.whl → 1.8.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mlrun/__init__.py +26 -22
- mlrun/__main__.py +15 -16
- mlrun/alerts/alert.py +150 -15
- mlrun/api/schemas/__init__.py +1 -9
- mlrun/artifacts/__init__.py +2 -3
- mlrun/artifacts/base.py +62 -19
- mlrun/artifacts/dataset.py +17 -17
- mlrun/artifacts/document.py +454 -0
- mlrun/artifacts/manager.py +28 -18
- mlrun/artifacts/model.py +91 -59
- mlrun/artifacts/plots.py +2 -2
- mlrun/common/constants.py +8 -0
- mlrun/common/formatters/__init__.py +1 -0
- mlrun/common/formatters/artifact.py +1 -1
- mlrun/common/formatters/feature_set.py +2 -0
- mlrun/common/formatters/function.py +1 -0
- mlrun/{model_monitoring/db/stores/v3io_kv/__init__.py → common/formatters/model_endpoint.py} +17 -0
- mlrun/common/formatters/pipeline.py +1 -2
- mlrun/common/formatters/project.py +9 -0
- mlrun/common/model_monitoring/__init__.py +0 -5
- mlrun/common/model_monitoring/helpers.py +12 -62
- mlrun/common/runtimes/constants.py +25 -4
- mlrun/common/schemas/__init__.py +9 -5
- mlrun/common/schemas/alert.py +114 -19
- mlrun/common/schemas/api_gateway.py +3 -3
- mlrun/common/schemas/artifact.py +22 -9
- mlrun/common/schemas/auth.py +8 -4
- mlrun/common/schemas/background_task.py +7 -7
- mlrun/common/schemas/client_spec.py +4 -4
- mlrun/common/schemas/clusterization_spec.py +2 -2
- mlrun/common/schemas/common.py +53 -3
- mlrun/common/schemas/constants.py +15 -0
- mlrun/common/schemas/datastore_profile.py +1 -1
- mlrun/common/schemas/feature_store.py +9 -9
- mlrun/common/schemas/frontend_spec.py +4 -4
- mlrun/common/schemas/function.py +10 -10
- mlrun/common/schemas/hub.py +1 -1
- mlrun/common/schemas/k8s.py +3 -3
- mlrun/common/schemas/memory_reports.py +3 -3
- mlrun/common/schemas/model_monitoring/__init__.py +4 -8
- mlrun/common/schemas/model_monitoring/constants.py +127 -46
- mlrun/common/schemas/model_monitoring/grafana.py +18 -12
- mlrun/common/schemas/model_monitoring/model_endpoints.py +154 -160
- mlrun/common/schemas/notification.py +24 -3
- mlrun/common/schemas/object.py +1 -1
- mlrun/common/schemas/pagination.py +4 -4
- mlrun/common/schemas/partition.py +142 -0
- mlrun/common/schemas/pipeline.py +3 -3
- mlrun/common/schemas/project.py +26 -18
- mlrun/common/schemas/runs.py +3 -3
- mlrun/common/schemas/runtime_resource.py +5 -5
- mlrun/common/schemas/schedule.py +1 -1
- mlrun/common/schemas/secret.py +1 -1
- mlrun/{model_monitoring/db/stores/sqldb/__init__.py → common/schemas/serving.py} +10 -1
- mlrun/common/schemas/tag.py +3 -3
- mlrun/common/schemas/workflow.py +6 -5
- mlrun/common/types.py +1 -0
- mlrun/config.py +157 -89
- mlrun/data_types/__init__.py +5 -3
- mlrun/data_types/infer.py +13 -3
- mlrun/data_types/spark.py +2 -1
- mlrun/datastore/__init__.py +59 -18
- mlrun/datastore/alibaba_oss.py +4 -1
- mlrun/datastore/azure_blob.py +4 -1
- mlrun/datastore/base.py +19 -24
- mlrun/datastore/datastore.py +10 -4
- mlrun/datastore/datastore_profile.py +178 -45
- mlrun/datastore/dbfs_store.py +4 -1
- mlrun/datastore/filestore.py +4 -1
- mlrun/datastore/google_cloud_storage.py +4 -1
- mlrun/datastore/hdfs.py +4 -1
- mlrun/datastore/inmem.py +4 -1
- mlrun/datastore/redis.py +4 -1
- mlrun/datastore/s3.py +14 -3
- mlrun/datastore/sources.py +89 -92
- mlrun/datastore/store_resources.py +7 -4
- mlrun/datastore/storeytargets.py +51 -16
- mlrun/datastore/targets.py +38 -31
- mlrun/datastore/utils.py +87 -4
- mlrun/datastore/v3io.py +4 -1
- mlrun/datastore/vectorstore.py +291 -0
- mlrun/datastore/wasbfs/fs.py +13 -12
- mlrun/db/base.py +286 -100
- mlrun/db/httpdb.py +1562 -490
- mlrun/db/nopdb.py +250 -83
- mlrun/errors.py +6 -2
- mlrun/execution.py +194 -50
- mlrun/feature_store/__init__.py +2 -10
- mlrun/feature_store/api.py +20 -458
- mlrun/feature_store/common.py +9 -9
- mlrun/feature_store/feature_set.py +20 -18
- mlrun/feature_store/feature_vector.py +105 -479
- mlrun/feature_store/feature_vector_utils.py +466 -0
- mlrun/feature_store/retrieval/base.py +15 -11
- mlrun/feature_store/retrieval/job.py +2 -1
- mlrun/feature_store/retrieval/storey_merger.py +1 -1
- mlrun/feature_store/steps.py +3 -3
- mlrun/features.py +30 -13
- mlrun/frameworks/__init__.py +1 -2
- mlrun/frameworks/_common/__init__.py +1 -2
- mlrun/frameworks/_common/artifacts_library.py +2 -2
- mlrun/frameworks/_common/mlrun_interface.py +10 -6
- mlrun/frameworks/_common/model_handler.py +31 -31
- mlrun/frameworks/_common/producer.py +3 -1
- mlrun/frameworks/_dl_common/__init__.py +1 -2
- mlrun/frameworks/_dl_common/loggers/__init__.py +1 -2
- mlrun/frameworks/_dl_common/loggers/mlrun_logger.py +4 -4
- mlrun/frameworks/_dl_common/loggers/tensorboard_logger.py +3 -3
- mlrun/frameworks/_ml_common/__init__.py +1 -2
- mlrun/frameworks/_ml_common/loggers/__init__.py +1 -2
- mlrun/frameworks/_ml_common/model_handler.py +21 -21
- mlrun/frameworks/_ml_common/plans/__init__.py +1 -2
- mlrun/frameworks/_ml_common/plans/confusion_matrix_plan.py +3 -1
- mlrun/frameworks/_ml_common/plans/dataset_plan.py +3 -3
- mlrun/frameworks/_ml_common/plans/roc_curve_plan.py +4 -4
- mlrun/frameworks/auto_mlrun/__init__.py +1 -2
- mlrun/frameworks/auto_mlrun/auto_mlrun.py +22 -15
- mlrun/frameworks/huggingface/__init__.py +1 -2
- mlrun/frameworks/huggingface/model_server.py +9 -9
- mlrun/frameworks/lgbm/__init__.py +47 -44
- mlrun/frameworks/lgbm/callbacks/__init__.py +1 -2
- mlrun/frameworks/lgbm/callbacks/logging_callback.py +4 -2
- mlrun/frameworks/lgbm/callbacks/mlrun_logging_callback.py +4 -2
- mlrun/frameworks/lgbm/mlrun_interfaces/__init__.py +1 -2
- mlrun/frameworks/lgbm/mlrun_interfaces/mlrun_interface.py +5 -5
- mlrun/frameworks/lgbm/model_handler.py +15 -11
- mlrun/frameworks/lgbm/model_server.py +11 -7
- mlrun/frameworks/lgbm/utils.py +2 -2
- mlrun/frameworks/onnx/__init__.py +1 -2
- mlrun/frameworks/onnx/dataset.py +3 -3
- mlrun/frameworks/onnx/mlrun_interface.py +2 -2
- mlrun/frameworks/onnx/model_handler.py +7 -5
- mlrun/frameworks/onnx/model_server.py +8 -6
- mlrun/frameworks/parallel_coordinates.py +11 -11
- mlrun/frameworks/pytorch/__init__.py +22 -23
- mlrun/frameworks/pytorch/callbacks/__init__.py +1 -2
- mlrun/frameworks/pytorch/callbacks/callback.py +2 -1
- mlrun/frameworks/pytorch/callbacks/logging_callback.py +15 -8
- mlrun/frameworks/pytorch/callbacks/mlrun_logging_callback.py +19 -12
- mlrun/frameworks/pytorch/callbacks/tensorboard_logging_callback.py +22 -15
- mlrun/frameworks/pytorch/callbacks_handler.py +36 -30
- mlrun/frameworks/pytorch/mlrun_interface.py +17 -17
- mlrun/frameworks/pytorch/model_handler.py +21 -17
- mlrun/frameworks/pytorch/model_server.py +13 -9
- mlrun/frameworks/sklearn/__init__.py +19 -18
- mlrun/frameworks/sklearn/estimator.py +2 -2
- mlrun/frameworks/sklearn/metric.py +3 -3
- mlrun/frameworks/sklearn/metrics_library.py +8 -6
- mlrun/frameworks/sklearn/mlrun_interface.py +3 -2
- mlrun/frameworks/sklearn/model_handler.py +4 -3
- mlrun/frameworks/tf_keras/__init__.py +11 -12
- mlrun/frameworks/tf_keras/callbacks/__init__.py +1 -2
- mlrun/frameworks/tf_keras/callbacks/logging_callback.py +17 -14
- mlrun/frameworks/tf_keras/callbacks/mlrun_logging_callback.py +15 -12
- mlrun/frameworks/tf_keras/callbacks/tensorboard_logging_callback.py +21 -18
- mlrun/frameworks/tf_keras/model_handler.py +17 -13
- mlrun/frameworks/tf_keras/model_server.py +12 -8
- mlrun/frameworks/xgboost/__init__.py +19 -18
- mlrun/frameworks/xgboost/model_handler.py +13 -9
- mlrun/k8s_utils.py +2 -5
- mlrun/launcher/base.py +3 -4
- mlrun/launcher/client.py +2 -2
- mlrun/launcher/local.py +6 -2
- mlrun/launcher/remote.py +1 -1
- mlrun/lists.py +8 -4
- mlrun/model.py +132 -46
- mlrun/model_monitoring/__init__.py +3 -5
- mlrun/model_monitoring/api.py +113 -98
- mlrun/model_monitoring/applications/__init__.py +0 -5
- mlrun/model_monitoring/applications/_application_steps.py +81 -50
- mlrun/model_monitoring/applications/base.py +467 -14
- mlrun/model_monitoring/applications/context.py +212 -134
- mlrun/model_monitoring/{db/stores/base → applications/evidently}/__init__.py +6 -2
- mlrun/model_monitoring/applications/evidently/base.py +146 -0
- mlrun/model_monitoring/applications/histogram_data_drift.py +89 -56
- mlrun/model_monitoring/applications/results.py +67 -15
- mlrun/model_monitoring/controller.py +701 -315
- mlrun/model_monitoring/db/__init__.py +0 -2
- mlrun/model_monitoring/db/_schedules.py +242 -0
- mlrun/model_monitoring/db/_stats.py +189 -0
- mlrun/model_monitoring/db/tsdb/__init__.py +33 -22
- mlrun/model_monitoring/db/tsdb/base.py +243 -49
- mlrun/model_monitoring/db/tsdb/tdengine/schemas.py +76 -36
- mlrun/model_monitoring/db/tsdb/tdengine/stream_graph_steps.py +33 -0
- mlrun/model_monitoring/db/tsdb/tdengine/tdengine_connection.py +213 -0
- mlrun/model_monitoring/db/tsdb/tdengine/tdengine_connector.py +534 -88
- mlrun/model_monitoring/db/tsdb/v3io/stream_graph_steps.py +1 -0
- mlrun/model_monitoring/db/tsdb/v3io/v3io_connector.py +436 -106
- mlrun/model_monitoring/helpers.py +356 -114
- mlrun/model_monitoring/stream_processing.py +190 -345
- mlrun/model_monitoring/tracking_policy.py +11 -4
- mlrun/model_monitoring/writer.py +49 -90
- mlrun/package/__init__.py +3 -6
- mlrun/package/context_handler.py +2 -2
- mlrun/package/packager.py +12 -9
- mlrun/package/packagers/__init__.py +0 -2
- mlrun/package/packagers/default_packager.py +14 -11
- mlrun/package/packagers/numpy_packagers.py +16 -7
- mlrun/package/packagers/pandas_packagers.py +18 -18
- mlrun/package/packagers/python_standard_library_packagers.py +25 -11
- mlrun/package/packagers_manager.py +35 -32
- mlrun/package/utils/__init__.py +0 -3
- mlrun/package/utils/_pickler.py +6 -6
- mlrun/platforms/__init__.py +47 -16
- mlrun/platforms/iguazio.py +4 -1
- mlrun/projects/operations.py +30 -30
- mlrun/projects/pipelines.py +116 -47
- mlrun/projects/project.py +1292 -329
- mlrun/render.py +5 -9
- mlrun/run.py +57 -14
- mlrun/runtimes/__init__.py +1 -3
- mlrun/runtimes/base.py +30 -22
- mlrun/runtimes/daskjob.py +9 -9
- mlrun/runtimes/databricks_job/databricks_runtime.py +6 -5
- mlrun/runtimes/function_reference.py +5 -2
- mlrun/runtimes/generators.py +3 -2
- mlrun/runtimes/kubejob.py +6 -7
- mlrun/runtimes/mounts.py +574 -0
- mlrun/runtimes/mpijob/__init__.py +0 -2
- mlrun/runtimes/mpijob/abstract.py +7 -6
- mlrun/runtimes/nuclio/api_gateway.py +7 -7
- mlrun/runtimes/nuclio/application/application.py +11 -13
- mlrun/runtimes/nuclio/application/reverse_proxy.go +66 -64
- mlrun/runtimes/nuclio/function.py +127 -70
- mlrun/runtimes/nuclio/serving.py +105 -37
- mlrun/runtimes/pod.py +159 -54
- mlrun/runtimes/remotesparkjob.py +3 -2
- mlrun/runtimes/sparkjob/__init__.py +0 -2
- mlrun/runtimes/sparkjob/spark3job.py +22 -12
- mlrun/runtimes/utils.py +7 -6
- mlrun/secrets.py +2 -2
- mlrun/serving/__init__.py +8 -0
- mlrun/serving/merger.py +7 -5
- mlrun/serving/remote.py +35 -22
- mlrun/serving/routers.py +186 -240
- mlrun/serving/server.py +41 -10
- mlrun/serving/states.py +432 -118
- mlrun/serving/utils.py +13 -2
- mlrun/serving/v1_serving.py +3 -2
- mlrun/serving/v2_serving.py +161 -203
- mlrun/track/__init__.py +1 -1
- mlrun/track/tracker.py +2 -2
- mlrun/track/trackers/mlflow_tracker.py +6 -5
- mlrun/utils/async_http.py +35 -22
- mlrun/utils/clones.py +7 -4
- mlrun/utils/helpers.py +511 -58
- mlrun/utils/logger.py +119 -13
- mlrun/utils/notifications/notification/__init__.py +22 -19
- mlrun/utils/notifications/notification/base.py +39 -15
- mlrun/utils/notifications/notification/console.py +6 -6
- mlrun/utils/notifications/notification/git.py +11 -11
- mlrun/utils/notifications/notification/ipython.py +10 -9
- mlrun/utils/notifications/notification/mail.py +176 -0
- mlrun/utils/notifications/notification/slack.py +16 -8
- mlrun/utils/notifications/notification/webhook.py +24 -8
- mlrun/utils/notifications/notification_pusher.py +191 -200
- mlrun/utils/regex.py +12 -2
- mlrun/utils/version/version.json +2 -2
- {mlrun-1.7.2rc4.dist-info → mlrun-1.8.0.dist-info}/METADATA +69 -54
- mlrun-1.8.0.dist-info/RECORD +351 -0
- {mlrun-1.7.2rc4.dist-info → mlrun-1.8.0.dist-info}/WHEEL +1 -1
- mlrun/model_monitoring/applications/evidently_base.py +0 -137
- mlrun/model_monitoring/db/stores/__init__.py +0 -136
- mlrun/model_monitoring/db/stores/base/store.py +0 -213
- mlrun/model_monitoring/db/stores/sqldb/models/__init__.py +0 -71
- mlrun/model_monitoring/db/stores/sqldb/models/base.py +0 -190
- mlrun/model_monitoring/db/stores/sqldb/models/mysql.py +0 -103
- mlrun/model_monitoring/db/stores/sqldb/models/sqlite.py +0 -40
- mlrun/model_monitoring/db/stores/sqldb/sql_store.py +0 -659
- mlrun/model_monitoring/db/stores/v3io_kv/kv_store.py +0 -726
- mlrun/model_monitoring/model_endpoint.py +0 -118
- mlrun-1.7.2rc4.dist-info/RECORD +0 -351
- {mlrun-1.7.2rc4.dist-info → mlrun-1.8.0.dist-info}/entry_points.txt +0 -0
- {mlrun-1.7.2rc4.dist-info → mlrun-1.8.0.dist-info/licenses}/LICENSE +0 -0
- {mlrun-1.7.2rc4.dist-info → mlrun-1.8.0.dist-info}/top_level.txt +0 -0
mlrun/model_monitoring/db/tsdb/v3io/v3io_connector.py

```diff
@@ -11,10 +11,10 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-
-from datetime import datetime,
+import math
+from datetime import datetime, timedelta
 from io import StringIO
-from typing import Literal, Optional, Union
+from typing import Callable, Literal, Optional, Union
 
 import pandas as pd
 import v3io_frames
@@ -33,6 +33,13 @@ _TSDB_BE = "tsdb"
 _TSDB_RATE = "1/s"
 _CONTAINER = "users"
 
+V3IO_FRAMESD_MEPS_LIMIT = (
+    200  # Maximum number of model endpoints per single request when using V3IO Frames
+)
+V3IO_CLIENT_MEPS_LIMIT = (
+    150  # Maximum number of model endpoints per single request when using V3IO Client
+)
+
 
 def _is_no_schema_error(exc: v3io_frames.Error) -> bool:
     """
@@ -58,6 +65,7 @@ class V3IOTSDBConnector(TSDBConnector):
         project: str,
         container: str = _CONTAINER,
         v3io_framesd: Optional[str] = None,
+        v3io_access_key: str = "",
         create_table: bool = False,
     ) -> None:
         super().__init__(project=project)
@@ -65,14 +73,26 @@ class V3IOTSDBConnector(TSDBConnector):
         self.container = container
 
         self.v3io_framesd = v3io_framesd or mlrun.mlconf.v3io_framesd
+        self._v3io_access_key = v3io_access_key
         self._frames_client: Optional[v3io_frames.client.ClientBase] = None
         self._init_tables_path()
         self._create_table = create_table
+        self._v3io_client = None
+
+    @property
+    def v3io_client(self):
+        if not self._v3io_client:
+            self._v3io_client = mlrun.utils.v3io_clients.get_v3io_client(
+                endpoint=mlrun.mlconf.v3io_api, access_key=self._v3io_access_key
+            )
+        return self._v3io_client
 
     @property
     def frames_client(self) -> v3io_frames.client.ClientBase:
         if not self._frames_client:
-            self._frames_client = self._get_v3io_frames_client(
+            self._frames_client = self._get_v3io_frames_client(
+                self.container, v3io_access_key=self._v3io_access_key
+            )
             if self._create_table:
                 self.create_tables()
         return self._frames_client
@@ -129,7 +149,7 @@ class V3IOTSDBConnector(TSDBConnector):
         monitoring_predictions_full_path = (
             mlrun.mlconf.get_model_monitoring_file_target_path(
                 project=self.project,
-                kind=mm_schemas.
+                kind=mm_schemas.V3IOTSDBTables.PREDICTIONS,
             )
         )
         (
@@ -139,28 +159,51 @@ class V3IOTSDBConnector(TSDBConnector):
         ) = mlrun.common.model_monitoring.helpers.parse_model_endpoint_store_prefix(
             monitoring_predictions_full_path
         )
-        self.tables[mm_schemas.
+        self.tables[mm_schemas.V3IOTSDBTables.PREDICTIONS] = monitoring_predictions_path
+
+        # initialize kv table
+        last_request_full_table_path = (
+            mlrun.mlconf.get_model_monitoring_file_target_path(
+                project=self.project,
+                kind=mm_schemas.FileTargetKind.LAST_REQUEST,
+            )
+        )
+        (
+            _,
+            _,
+            self.last_request_table,
+        ) = mlrun.common.model_monitoring.helpers.parse_model_endpoint_store_prefix(
+            last_request_full_table_path
+        )
 
     def create_tables(self) -> None:
         """
-        Create the tables using the TSDB connector.
+        Create the tables using the TSDB connector. These are the tables that are stored in the V3IO TSDB:
        - app_results: a detailed result that includes status, kind, extra data, etc.
        - metrics: a basic key value that represents a single numeric metric.
-
+        - events: A statistics table that includes pre-aggregated metrics (such as average latency over the
+          last 5 minutes) and data samples
+        - predictions: a detailed prediction that includes latency, request timestamp, etc. This table also
+          includes pre-aggregated operations such as count and average on 1 minute granularity.
+        - errors: a detailed error that includes error desc, error type, etc.
+
         """
-
-
-
-
-
+
+        default_configurations = {
+            "backend": _TSDB_BE,
+            "if_exists": v3io_frames.IGNORE,
+            "rate": _TSDB_RATE,
+        }
+
+        for table_name in self.tables:
+            default_configurations["table"] = self.tables[table_name]
+            if table_name == mm_schemas.V3IOTSDBTables.PREDICTIONS:
+                default_configurations["aggregates"] = "count,avg"
+                default_configurations["aggregation_granularity"] = "1m"
+            elif table_name == mm_schemas.V3IOTSDBTables.EVENTS:
+                default_configurations["rate"] = "10/m"
             logger.info("Creating table in V3IO TSDB", table_name=table_name)
-
-            self.frames_client.create(
-                backend=_TSDB_BE,
-                table=table,
-                if_exists=v3io_frames.IGNORE,
-                rate=_TSDB_RATE,
-            )
+            self.frames_client.create(**default_configurations)
 
     def apply_monitoring_stream_steps(
         self,
@@ -168,6 +211,9 @@ class V3IOTSDBConnector(TSDBConnector):
         tsdb_batching_max_events: int = 1000,
         tsdb_batching_timeout_secs: int = 30,
         sample_window: int = 10,
+        aggregate_windows: Optional[list[str]] = None,
+        aggregate_period: str = "1m",
+        **kwarg,
     ):
         """
         Apply TSDB steps on the provided monitoring graph. Throughout these steps, the graph stores live data of
@@ -178,31 +224,73 @@ class V3IOTSDBConnector(TSDBConnector):
        - endpoint_features (Prediction and feature names and values)
        - custom_metrics (user-defined metrics)
         """
+        aggregate_windows = aggregate_windows or ["5m", "1h"]
+
+        # Calculate number of predictions and average latency
+        def apply_storey_aggregations():
+            # Calculate number of predictions for each window (5 min and 1 hour by default)
+            graph.add_step(
+                class_name="storey.AggregateByKey",
+                aggregates=[
+                    {
+                        "name": EventFieldType.LATENCY,
+                        "column": EventFieldType.LATENCY,
+                        "operations": ["count", "avg"],
+                        "windows": aggregate_windows,
+                        "period": aggregate_period,
+                    }
+                ],
+                name=EventFieldType.LATENCY,
+                after="FilterNOP",
+                step_name="Aggregates",
+                table=".",
+                key_field=EventFieldType.ENDPOINT_ID,
+            )
+            # Calculate average latency time for each window (5 min and 1 hour by default)
+            graph.add_step(
+                class_name="storey.Rename",
+                mapping={
+                    "latency_count_5m": mm_schemas.EventLiveStats.PREDICTIONS_COUNT_5M,
+                    "latency_count_1h": mm_schemas.EventLiveStats.PREDICTIONS_COUNT_1H,
+                },
+                name="Rename",
+                after=EventFieldType.LATENCY,
+            )
 
+        apply_storey_aggregations()
         # Write latency per prediction, labeled by endpoint ID only
         graph.add_step(
             "storey.TSDBTarget",
             name="tsdb_predictions",
-            after="
-            path=f"{self.container}/{self.tables[mm_schemas.
-            rate="1/s",
+            after="FilterNOP",
+            path=f"{self.container}/{self.tables[mm_schemas.V3IOTSDBTables.PREDICTIONS]}",
             time_col=mm_schemas.EventFieldType.TIMESTAMP,
             container=self.container,
             v3io_frames=self.v3io_framesd,
             columns=[
                 mm_schemas.EventFieldType.LATENCY,
                 mm_schemas.EventFieldType.LAST_REQUEST_TIMESTAMP,
+                mm_schemas.EventFieldType.ESTIMATED_PREDICTION_COUNT,
+                mm_schemas.EventFieldType.EFFECTIVE_SAMPLE_COUNT,
             ],
             index_cols=[
                 mm_schemas.EventFieldType.ENDPOINT_ID,
             ],
-            aggr="count,avg",
-            aggr_granularity="1m",
             max_events=tsdb_batching_max_events,
             flush_after_seconds=tsdb_batching_timeout_secs,
             key=mm_schemas.EventFieldType.ENDPOINT_ID,
         )
 
+        # Write last request timestamp to KV table
+        graph.add_step(
+            "storey.NoSqlTarget",
+            name="KVLastRequest",
+            after="tsdb_predictions",
+            table=f"v3io:///users/{self.last_request_table}",
+            columns=[EventFieldType.LAST_REQUEST_TIMESTAMP],
+            index_cols=[EventFieldType.ENDPOINT_ID],
+        )
+
         # Emits the event in window size of events based on sample_window size (10 by default)
         graph.add_step(
             "storey.steps.SampleWindow",
@@ -236,7 +324,6 @@ class V3IOTSDBConnector(TSDBConnector):
                 name=name,
                 after=after,
                 path=f"{self.container}/{self.tables[mm_schemas.V3IOTSDBTables.EVENTS]}",
-                rate="10/m",
                 time_col=mm_schemas.EventFieldType.TIMESTAMP,
                 container=self.container,
                 v3io_frames=self.v3io_framesd,
@@ -300,7 +387,6 @@ class V3IOTSDBConnector(TSDBConnector):
             name="tsdb_error",
             after="error_extractor",
             path=f"{self.container}/{self.tables[mm_schemas.FileTargetKind.ERRORS]}",
-            rate="1/s",
             time_col=mm_schemas.EventFieldType.TIMESTAMP,
             container=self.container,
             v3io_frames=self.v3io_framesd,
@@ -310,6 +396,7 @@ class V3IOTSDBConnector(TSDBConnector):
             ],
             index_cols=[
                 mm_schemas.EventFieldType.ENDPOINT_ID,
+                mm_schemas.EventFieldType.ERROR_TYPE,
             ],
             max_events=tsdb_batching_max_events,
             flush_after_seconds=tsdb_batching_timeout_secs,
@@ -338,9 +425,6 @@ class V3IOTSDBConnector(TSDBConnector):
         elif kind == mm_schemas.WriterEventKind.RESULT:
             table = self.tables[mm_schemas.V3IOTSDBTables.APP_RESULTS]
             index_cols = index_cols_base + [mm_schemas.ResultData.RESULT_NAME]
-            event.pop(mm_schemas.ResultData.CURRENT_STATS, None)
-            # TODO: remove this when extra data is supported (ML-7460)
-            event.pop(mm_schemas.ResultData.RESULT_EXTRA_DATA, None)
         else:
             raise ValueError(f"Invalid {kind = }")
 
@@ -371,12 +455,20 @@ class V3IOTSDBConnector(TSDBConnector):
         # Delete all tables
         tables = mm_schemas.V3IOTSDBTables.list()
         for table_to_delete in tables:
-
-
-
+            if table_to_delete in self.tables:
+                try:
+                    self.frames_client.delete(
+                        backend=_TSDB_BE, table=self.tables[table_to_delete]
+                    )
+                except v3io_frames.DeleteError as e:
+                    logger.warning(
+                        f"Failed to delete TSDB table '{table_to_delete}'",
+                        err=mlrun.errors.err_to_str(e),
+                    )
+            else:
                 logger.warning(
-                    f"
-
+                    f"Skipping deletion: table '{table_to_delete}' is not among the initialized tables.",
+                    initialized_tables=list(self.tables.keys()),
                 )
 
         # Final cleanup of tsdb path
@@ -385,6 +477,60 @@ class V3IOTSDBConnector(TSDBConnector):
         store, _, _ = mlrun.store_manager.get_or_create_store(tsdb_path)
         store.rm(tsdb_path, recursive=True)
 
+    def delete_tsdb_records(
+        self,
+        endpoint_ids: list[str],
+    ):
+        logger.debug(
+            "Deleting model endpoints resources using the V3IO TSDB connector",
+            project=self.project,
+            number_of_endpoints_to_delete=len(endpoint_ids),
+        )
+        tables = mm_schemas.V3IOTSDBTables.list()
+
+        # Split the endpoint ids into chunks to avoid exceeding the v3io-engine filter-expression limit
+        for i in range(0, len(endpoint_ids), V3IO_FRAMESD_MEPS_LIMIT):
+            endpoint_id_chunk = endpoint_ids[i : i + V3IO_FRAMESD_MEPS_LIMIT]
+            filter_query = f"endpoint_id IN({str(endpoint_id_chunk)[1:-1]}) "
+            for table in tables:
+                try:
+                    self.frames_client.delete(
+                        backend=_TSDB_BE,
+                        table=self.tables[table],
+                        filter=filter_query,
+                        start="0",
+                    )
+                except Exception as e:
+                    logger.warning(
+                        f"Failed to delete TSDB records for the provided endpoints from table '{table}'",
+                        error=mlrun.errors.err_to_str(e),
+                        project=self.project,
+                    )
+
+        # Clean the last request records from the KV table
+        self._delete_last_request_records(endpoint_ids=endpoint_ids)
+
+        logger.debug(
+            "Deleted all model endpoint resources using the V3IO connector",
+            project=self.project,
+            number_of_endpoints_to_delete=len(endpoint_ids),
+        )
+
+    def _delete_last_request_records(self, endpoint_ids: list[str]):
+        for endpoint_id in endpoint_ids:
+            try:
+                self.v3io_client.kv.delete(
+                    container=self.container,
+                    table=self.last_request_table,
+                    key=endpoint_id,
+                )
+            except Exception as e:
+                logger.warning(
+                    f"Failed to delete last request record for endpoint '{endpoint_id}'",
+                    error=mlrun.errors.err_to_str(e),
+                    project=self.project,
+                )
+
     def get_model_endpoint_real_time_metrics(
         self, endpoint_id: str, metrics: list[str], start: str, end: str
     ) -> dict[str, list[tuple[str, float]]]:
@@ -449,8 +595,9 @@ class V3IOTSDBConnector(TSDBConnector):
         interval: Optional[str] = None,
         agg_funcs: Optional[list[str]] = None,
         sliding_window_step: Optional[str] = None,
+        get_raw: bool = False,
         **kwargs,
-    ) -> pd.DataFrame:
+    ) -> Union[pd.DataFrame, list[v3io_frames.client.RawFrame]]:
         """
         Getting records from V3IO TSDB data collection.
         :param table: Path to the collection to query.
@@ -475,6 +622,10 @@ class V3IOTSDBConnector(TSDBConnector):
             `sliding_window_step` is provided, interval must be provided as well. Provided
             as a string in the format of '1m', '1h', etc.
         :param kwargs: Additional keyword arguments passed to the read method of frames client.
+        :param get_raw: Whether to return the request as raw frames rather than a pandas dataframe.
+            Defaults to False. This can greatly improve performance when a dataframe isn't
+            needed.
+
         :return: DataFrame with the provided attributes from the data collection.
         :raise: MLRunNotFoundError if the provided table wasn't found.
         """
@@ -488,7 +639,7 @@ class V3IOTSDBConnector(TSDBConnector):
         aggregators = ",".join(agg_funcs) if agg_funcs else None
         table_path = self.tables[table]
         try:
-
+            res = self.frames_client.read(
                 backend=_TSDB_BE,
                 table=table_path,
                 start=start,
@@ -498,15 +649,18 @@ class V3IOTSDBConnector(TSDBConnector):
                 aggregation_window=interval,
                 aggregators=aggregators,
                 step=sliding_window_step,
+                get_raw=get_raw,
                 **kwargs,
             )
+            if get_raw:
+                res = list(res)
         except v3io_frames.Error as err:
             if _is_no_schema_error(err):
-                return pd.DataFrame()
+                return [] if get_raw else pd.DataFrame()
             else:
                 raise err
 
-        return
+        return res
 
     def _get_v3io_source_directory(self) -> str:
         """
@@ -530,12 +684,34 @@ class V3IOTSDBConnector(TSDBConnector):
         return source_directory
 
     @staticmethod
-    def _get_v3io_frames_client(
+    def _get_v3io_frames_client(
+        v3io_container: str, v3io_access_key: str = ""
+    ) -> v3io_frames.client.ClientBase:
         return mlrun.utils.v3io_clients.get_frames_client(
             address=mlrun.mlconf.v3io_framesd,
             container=v3io_container,
+            token=v3io_access_key,
         )
 
+    @staticmethod
+    def _get_endpoint_filter(endpoint_id: Union[str, list[str]]) -> Optional[str]:
+        if isinstance(endpoint_id, str):
+            return f"endpoint_id=='{endpoint_id}'"
+        elif isinstance(endpoint_id, list):
+            if len(endpoint_id) > V3IO_FRAMESD_MEPS_LIMIT:
+                logger.info(
+                    "The number of endpoint ids exceeds the v3io-engine filter-expression limit, "
+                    "retrieving all the model endpoints from the db.",
+                    limit=V3IO_FRAMESD_MEPS_LIMIT,
+                    amount=len(endpoint_id),
+                )
+                return None
+            return f"endpoint_id IN({str(endpoint_id)[1:-1]}) "
+        else:
+            raise mlrun.errors.MLRunInvalidArgumentError(
+                f"Invalid 'endpoint_id' filter: must be a string or a list, endpoint_id: {endpoint_id}"
+            )
+
     def read_metrics_data(
         self,
         *,
@@ -544,6 +720,7 @@ class V3IOTSDBConnector(TSDBConnector):
         end: datetime,
         metrics: list[mm_schemas.ModelEndpointMonitoringMetric],
         type: Literal["metrics", "results"] = "results",
+        with_result_extra_data: bool = False,
     ) -> Union[
         list[
             Union[
@@ -565,6 +742,12 @@ class V3IOTSDBConnector(TSDBConnector):
         """
 
         if type == "metrics":
+            if with_result_extra_data:
+                logger.warning(
+                    "The 'with_result_extra_data' parameter is not supported for metrics, just for results",
+                    project=self.project,
+                    endpoint_id=endpoint_id,
+                )
             table_path = self.tables[mm_schemas.V3IOTSDBTables.METRICS]
             name = mm_schemas.MetricData.METRIC_NAME
             columns = [mm_schemas.MetricData.METRIC_VALUE]
@@ -577,6 +760,8 @@ class V3IOTSDBConnector(TSDBConnector):
                 mm_schemas.ResultData.RESULT_STATUS,
                 mm_schemas.ResultData.RESULT_KIND,
             ]
+            if with_result_extra_data:
+                columns.append(mm_schemas.ResultData.RESULT_EXTRA_DATA)
             df_handler = self.df_to_results_values
         else:
             raise ValueError(f"Invalid {type = }")
@@ -605,6 +790,9 @@ class V3IOTSDBConnector(TSDBConnector):
             endpoint_id=endpoint_id,
             is_empty=df.empty,
         )
+        if not with_result_extra_data and type == "results":
+            # Set the extra data to an empty string if it's not requested
+            df[mm_schemas.ResultData.RESULT_EXTRA_DATA] = ""
 
         return df_handler(df=df, metrics=metrics, project=self.project)
 
@@ -653,6 +841,9 @@ class V3IOTSDBConnector(TSDBConnector):
         end: Union[datetime, str],
         aggregation_window: Optional[str] = None,
         agg_funcs: Optional[list[str]] = None,
+        limit: Optional[
+            int
+        ] = None,  # no effect, just for compatibility with the abstract method
     ) -> Union[
         mm_schemas.ModelEndpointMonitoringMetricNoData,
         mm_schemas.ModelEndpointMonitoringMetricValues,
@@ -664,10 +855,10 @@ class V3IOTSDBConnector(TSDBConnector):
                 "both or neither of `aggregation_window` and `agg_funcs` must be provided"
             )
         df = self._get_records(
-            table=mm_schemas.
+            table=mm_schemas.V3IOTSDBTables.PREDICTIONS,
             start=start,
             end=end,
-            columns=[mm_schemas.EventFieldType.
+            columns=[mm_schemas.EventFieldType.ESTIMATED_PREDICTION_COUNT],
             filter_query=f"endpoint_id=='{endpoint_id}'",
             agg_funcs=agg_funcs,
             sliding_window_step=aggregation_window,
@@ -681,10 +872,10 @@ class V3IOTSDBConnector(TSDBConnector):
                 type=mm_schemas.ModelEndpointMonitoringMetricType.METRIC,
             )
 
-
-            f"{agg_funcs[0]}({mm_schemas.EventFieldType.
+        estimated_prediction_count = (
+            f"{agg_funcs[0]}({mm_schemas.EventFieldType.ESTIMATED_PREDICTION_COUNT})"
             if agg_funcs
-            else mm_schemas.EventFieldType.
+            else mm_schemas.EventFieldType.ESTIMATED_PREDICTION_COUNT
         )
 
         return mm_schemas.ModelEndpointMonitoringMetricValues(
@@ -692,7 +883,7 @@ class V3IOTSDBConnector(TSDBConnector):
             values=list(
                 zip(
                     df.index,
-                    df[
+                    df[estimated_prediction_count],
                 )
             ),  # pyright: ignore[reportArgumentType]
         )
@@ -700,55 +891,78 @@ class V3IOTSDBConnector(TSDBConnector):
     def get_last_request(
         self,
         endpoint_ids: Union[str, list[str]],
-        start:
-        end:
-    ) ->
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-                last_request, tz=timezone.utc
+        start: Optional[datetime] = None,
+        end: Optional[datetime] = None,
+    ) -> dict[str, float]:
+        # Get the last request timestamp for each endpoint from the KV table.
+        # The result of the query is a list of dictionaries,
+        # each dictionary contains the endpoint id and the last request timestamp.
+        last_request_timestamps = {}
+        if isinstance(endpoint_ids, str):
+            endpoint_ids = [endpoint_ids]
+
+        try:
+            if len(endpoint_ids) > V3IO_CLIENT_MEPS_LIMIT:
+                logger.warning(
+                    "The number of endpoint ids exceeds the v3io-engine filter-expression limit, "
+                    "retrieving last request for all the model endpoints from the KV table.",
+                    limit=V3IO_CLIENT_MEPS_LIMIT,
+                    amount=len(endpoint_ids),
+                )
+
+                res = self.v3io_client.kv.new_cursor(
+                    container=self.container,
+                    table_path=self.last_request_table,
+                ).all()
+                last_request_timestamps.update(
+                    {d["__name"]: d["last_request_timestamp"] for d in res}
                 )
+            else:
+                filter_expression = " OR ".join(
+                    [f"__name=='{endpoint_id}'" for endpoint_id in endpoint_ids]
+                )
+                res = self.v3io_client.kv.new_cursor(
+                    container=self.container,
+                    table_path=self.last_request_table,
+                    filter_expression=filter_expression,
+                ).all()
+                last_request_timestamps.update(
+                    {d["__name"]: d["last_request_timestamp"] for d in res}
+                )
+        except Exception as e:
+            logger.warning(
+                "Failed to get last request timestamp from V3IO KV table.",
+                err=mlrun.errors.err_to_str(e),
+                project=self.project,
+                table=self.last_request_table,
             )
 
-        return
+        return last_request_timestamps
 
     def get_drift_status(
         self,
         endpoint_ids: Union[str, list[str]],
-        start:
-        end:
-
-
-
-        )
-
+        start: Optional[datetime] = None,
+        end: Optional[datetime] = None,
+        get_raw: bool = False,
+    ) -> Union[pd.DataFrame, list[v3io_frames.client.RawFrame]]:
+        filter_query = self._get_endpoint_filter(endpoint_id=endpoint_ids)
+        start = start or (mlrun.utils.datetime_now() - timedelta(hours=24))
+        start, end = self._get_start_end(start, end)
+        res = self._get_records(
             table=mm_schemas.V3IOTSDBTables.APP_RESULTS,
             start=start,
             end=end,
             columns=[mm_schemas.ResultData.RESULT_STATUS],
-            filter_query=
+            filter_query=filter_query,
             agg_funcs=["max"],
             group_by="endpoint_id",
+            get_raw=get_raw,
         )
+        if get_raw:
+            return res
+
+        df = res
         if not df.empty:
             df.columns = [
                 col[len("max(") : -1] if "max(" in col else col for col in df.columns
@@ -757,16 +971,18 @@ class V3IOTSDBConnector(TSDBConnector):
 
     def get_metrics_metadata(
         self,
-        endpoint_id: str,
-        start:
-        end:
+        endpoint_id: Union[str, list[str]],
+        start: Optional[datetime] = None,
+        end: Optional[datetime] = None,
     ) -> pd.DataFrame:
+        start, end = self._get_start_end(start, end)
+        filter_query = self._get_endpoint_filter(endpoint_id=endpoint_id)
         df = self._get_records(
             table=mm_schemas.V3IOTSDBTables.METRICS,
            start=start,
            end=end,
            columns=[mm_schemas.MetricData.METRIC_VALUE],
-            filter_query=
+            filter_query=filter_query,
            agg_funcs=["last"],
        )
        if not df.empty:
@@ -777,10 +993,12 @@ class V3IOTSDBConnector(TSDBConnector):
 
     def get_results_metadata(
         self,
-        endpoint_id: str,
-        start:
-        end:
+        endpoint_id: Union[str, list[str]],
+        start: Optional[datetime] = None,
+        end: Optional[datetime] = None,
     ) -> pd.DataFrame:
+        start, end = self._get_start_end(start, end)
+        filter_query = self._get_endpoint_filter(endpoint_id=endpoint_id)
         df = self._get_records(
             table=mm_schemas.V3IOTSDBTables.APP_RESULTS,
             start=start,
@@ -788,7 +1006,7 @@ class V3IOTSDBConnector(TSDBConnector):
             columns=[
                 mm_schemas.ResultData.RESULT_KIND,
             ],
-            filter_query=
+            filter_query=filter_query,
             agg_funcs=["last"],
         )
         if not df.empty:
@@ -803,20 +1021,30 @@ class V3IOTSDBConnector(TSDBConnector):
     def get_error_count(
         self,
         endpoint_ids: Union[str, list[str]],
-        start:
-        end:
-
-
-
-
-
+        start: Optional[datetime] = None,
+        end: Optional[datetime] = None,
+        get_raw: bool = False,
+    ) -> Union[pd.DataFrame, list[v3io_frames.client.RawFrame]]:
+        filter_query = self._get_endpoint_filter(endpoint_id=endpoint_ids)
+        if filter_query:
+            filter_query += f"AND {mm_schemas.EventFieldType.ERROR_TYPE} == '{mm_schemas.EventFieldType.INFER_ERROR}'"
+        else:
+            filter_query = f"{mm_schemas.EventFieldType.ERROR_TYPE} == '{mm_schemas.EventFieldType.INFER_ERROR}' z"
+        start, end = self._get_start_end(start, end)
+        res = self._get_records(
             table=mm_schemas.FileTargetKind.ERRORS,
             start=start,
             end=end,
             columns=[mm_schemas.EventFieldType.ERROR_COUNT],
-            filter_query=
+            filter_query=filter_query,
             agg_funcs=["count"],
+            get_raw=get_raw,
         )
+
+        if get_raw:
+            return res
+
+        df = res
         if not df.empty:
             df.rename(
                 columns={
@@ -830,20 +1058,122 @@ class V3IOTSDBConnector(TSDBConnector):
     def get_avg_latency(
         self,
         endpoint_ids: Union[str, list[str]],
-        start:
-        end:
-
-
-
-        )
-
-
+        start: Optional[datetime] = None,
+        end: Optional[datetime] = None,
+        get_raw: bool = False,
+    ) -> Union[pd.DataFrame, list[v3io_frames.client.RawFrame]]:
+        filter_query = self._get_endpoint_filter(endpoint_id=endpoint_ids)
+        start = start or (mlrun.utils.datetime_now() - timedelta(hours=24))
+        start, end = self._get_start_end(start, end)
+        res = self._get_records(
+            table=mm_schemas.V3IOTSDBTables.PREDICTIONS,
             start=start,
             end=end,
             columns=[mm_schemas.EventFieldType.LATENCY],
-            filter_query=
+            filter_query=filter_query,
             agg_funcs=["avg"],
+            get_raw=get_raw,
         )
+
+        if get_raw:
+            return res
+
+        df = res
         if not df.empty:
             df.dropna(inplace=True)
+            df.rename(
+                columns={
+                    f"avg({mm_schemas.EventFieldType.LATENCY})": f"avg_{mm_schemas.EventFieldType.LATENCY}"
+                },
+                inplace=True,
+            )
         return df.reset_index(drop=True)
+
+    async def add_basic_metrics(
+        self,
+        model_endpoint_objects: list[mlrun.common.schemas.ModelEndpoint],
+        project: str,
+        run_in_threadpool: Callable,
+        metric_list: Optional[list[str]] = None,
+    ) -> list[mlrun.common.schemas.ModelEndpoint]:
+        """
+        Fetch basic metrics from V3IO TSDB and add them to MEP objects.
+
+        :param model_endpoint_objects: A list of `ModelEndpoint` objects that will
+            be filled with the relevant basic metrics.
+        :param project: The name of the project.
+        :param run_in_threadpool: A function that runs another function in a thread pool.
+        :param metric_list: List of metrics to include from the time series DB. Defaults to all metrics.
+
+        :return: A list of `ModelEndpointMonitoringMetric` objects.
+        """
+
+        uids = []
+        model_endpoint_objects_by_uid = {}
+        for model_endpoint_object in model_endpoint_objects:
+            uid = model_endpoint_object.metadata.uid
+            uids.append(uid)
+            model_endpoint_objects_by_uid[uid] = model_endpoint_object
+
+        metric_name_to_function_and_column_name = {
+            "error_count": (self.get_error_count, "count(error_count)"),
+            "avg_latency": (self.get_avg_latency, "avg(latency)"),
+            "result_status": (self.get_drift_status, "max(result_status)"),
+        }
+        if metric_list is not None:
+            for metric_name in list(metric_name_to_function_and_column_name):
+                if metric_name not in metric_list:
+                    del metric_name_to_function_and_column_name[metric_name]
+
+        metric_name_to_result = {}
+
+        for metric_name, (
+            function,
+            _,
+        ) in metric_name_to_function_and_column_name.items():
+            metric_name_to_result[metric_name] = await run_in_threadpool(
+                function,
+                endpoint_ids=uids,
+                get_raw=True,
+            )
+
+        def add_metric(
+            metric: str,
+            column_name: str,
+            frames: list,
+        ):
+            for frame in frames:
+                endpoint_ids = frame.column_data("endpoint_id")
+                metric_data = frame.column_data(column_name)
+                for index, endpoint_id in enumerate(endpoint_ids):
+                    mep = model_endpoint_objects_by_uid.get(endpoint_id)
+                    value = metric_data[index]
+                    if mep and value is not None and not math.isnan(value):
+                        setattr(mep.status, metric, value)
+
+        for metric_name, result in metric_name_to_result.items():
+            add_metric(
+                metric_name,
+                metric_name_to_function_and_column_name[metric_name][1],
+                result,
+            )
+        if metric_list is None or "last_request" in metric_list:
+            self._enrich_mep_with_last_request(
+                model_endpoint_objects_by_uid=model_endpoint_objects_by_uid
+            )
+
+        return list(model_endpoint_objects_by_uid.values())
+
+    def _enrich_mep_with_last_request(
+        self,
+        model_endpoint_objects_by_uid: dict[str, mlrun.common.schemas.ModelEndpoint],
+    ):
+        last_request_dictionary = self.get_last_request(
+            endpoint_ids=list(model_endpoint_objects_by_uid.keys())
+        )
+        for uid, mep in model_endpoint_objects_by_uid.items():
+            # Set the last request timestamp to the MEP object. If not found, keep the existing value from the
+            # DB (relevant for batch EP).
+            mep.status.last_request = last_request_dictionary.get(
+                uid, mep.status.last_request
+            )
```
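The hunks above show that endpoint-scoped TSDB operations are now chunked so that a single V3IO Frames request never filters on more than `V3IO_FRAMESD_MEPS_LIMIT` (200) model endpoints. Below is a minimal, standalone sketch of that chunked `endpoint_id IN(...)` filter construction; the 200-item limit and the filter format are taken from the diff, while `build_filter_queries` is a hypothetical helper name, not part of mlrun.

```python
# Minimal standalone sketch of the chunked filter construction used by
# delete_tsdb_records in the diff above. `build_filter_queries` is a
# hypothetical helper; only the limit and the filter format come from the diff.
V3IO_FRAMESD_MEPS_LIMIT = 200  # max model endpoints per single V3IO Frames request


def build_filter_queries(endpoint_ids: list[str]) -> list[str]:
    """Split endpoint ids into chunks and build one filter expression per chunk."""
    queries = []
    for i in range(0, len(endpoint_ids), V3IO_FRAMESD_MEPS_LIMIT):
        chunk = endpoint_ids[i : i + V3IO_FRAMESD_MEPS_LIMIT]
        # str(chunk)[1:-1] turns ["a", "b"] into "'a', 'b'", matching the diff's format
        queries.append(f"endpoint_id IN({str(chunk)[1:-1]}) ")
    return queries


if __name__ == "__main__":
    ids = [f"ep-{n}" for n in range(450)]
    queries = build_filter_queries(ids)
    print(len(queries))  # 3 chunks: 200 + 200 + 50 endpoint ids
    print(queries[0][:40], "...")
```

In the connector itself, each such query string is passed as the `filter` argument of `frames_client.delete` (one call per table), which keeps every request within the v3io-engine filter-expression limit noted in the diff's comments.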