mlrun 1.7.0rc14__py3-none-any.whl → 1.7.0rc21__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of mlrun might be problematic. Click here for more details.
- mlrun/__init__.py +10 -1
- mlrun/__main__.py +23 -111
- mlrun/alerts/__init__.py +15 -0
- mlrun/alerts/alert.py +144 -0
- mlrun/api/schemas/__init__.py +4 -3
- mlrun/artifacts/__init__.py +8 -3
- mlrun/artifacts/base.py +36 -253
- mlrun/artifacts/dataset.py +9 -190
- mlrun/artifacts/manager.py +46 -42
- mlrun/artifacts/model.py +9 -141
- mlrun/artifacts/plots.py +14 -375
- mlrun/common/constants.py +65 -3
- mlrun/common/formatters/__init__.py +19 -0
- mlrun/{runtimes/mpijob/v1alpha1.py → common/formatters/artifact.py} +6 -14
- mlrun/common/formatters/base.py +113 -0
- mlrun/common/formatters/function.py +46 -0
- mlrun/common/formatters/pipeline.py +53 -0
- mlrun/common/formatters/project.py +51 -0
- mlrun/{runtimes → common/runtimes}/constants.py +32 -4
- mlrun/common/schemas/__init__.py +10 -5
- mlrun/common/schemas/alert.py +92 -11
- mlrun/common/schemas/api_gateway.py +56 -0
- mlrun/common/schemas/artifact.py +15 -5
- mlrun/common/schemas/auth.py +2 -0
- mlrun/common/schemas/client_spec.py +1 -0
- mlrun/common/schemas/frontend_spec.py +1 -0
- mlrun/common/schemas/function.py +4 -0
- mlrun/common/schemas/model_monitoring/__init__.py +15 -3
- mlrun/common/schemas/model_monitoring/constants.py +58 -7
- mlrun/common/schemas/model_monitoring/grafana.py +9 -5
- mlrun/common/schemas/model_monitoring/model_endpoints.py +86 -2
- mlrun/common/schemas/pipeline.py +0 -9
- mlrun/common/schemas/project.py +5 -11
- mlrun/common/types.py +1 -0
- mlrun/config.py +27 -9
- mlrun/data_types/to_pandas.py +9 -9
- mlrun/datastore/base.py +41 -9
- mlrun/datastore/datastore.py +6 -2
- mlrun/datastore/datastore_profile.py +56 -4
- mlrun/datastore/inmem.py +2 -2
- mlrun/datastore/redis.py +2 -2
- mlrun/datastore/s3.py +5 -0
- mlrun/datastore/sources.py +147 -7
- mlrun/datastore/store_resources.py +7 -7
- mlrun/datastore/targets.py +110 -42
- mlrun/datastore/utils.py +42 -0
- mlrun/db/base.py +54 -10
- mlrun/db/httpdb.py +282 -79
- mlrun/db/nopdb.py +52 -10
- mlrun/errors.py +11 -0
- mlrun/execution.py +24 -9
- mlrun/feature_store/__init__.py +0 -2
- mlrun/feature_store/api.py +12 -47
- mlrun/feature_store/feature_set.py +9 -0
- mlrun/feature_store/feature_vector.py +8 -0
- mlrun/feature_store/ingestion.py +7 -6
- mlrun/feature_store/retrieval/base.py +9 -4
- mlrun/feature_store/retrieval/conversion.py +9 -9
- mlrun/feature_store/retrieval/dask_merger.py +2 -0
- mlrun/feature_store/retrieval/job.py +9 -3
- mlrun/feature_store/retrieval/local_merger.py +2 -0
- mlrun/feature_store/retrieval/spark_merger.py +16 -0
- mlrun/frameworks/_dl_common/loggers/tensorboard_logger.py +7 -12
- mlrun/frameworks/parallel_coordinates.py +2 -1
- mlrun/frameworks/tf_keras/__init__.py +4 -1
- mlrun/k8s_utils.py +10 -11
- mlrun/launcher/base.py +4 -3
- mlrun/launcher/client.py +5 -3
- mlrun/launcher/local.py +8 -2
- mlrun/launcher/remote.py +8 -2
- mlrun/lists.py +6 -2
- mlrun/model.py +45 -21
- mlrun/model_monitoring/__init__.py +1 -1
- mlrun/model_monitoring/api.py +41 -18
- mlrun/model_monitoring/application.py +5 -305
- mlrun/model_monitoring/applications/__init__.py +11 -0
- mlrun/model_monitoring/applications/_application_steps.py +157 -0
- mlrun/model_monitoring/applications/base.py +280 -0
- mlrun/model_monitoring/applications/context.py +214 -0
- mlrun/model_monitoring/applications/evidently_base.py +211 -0
- mlrun/model_monitoring/applications/histogram_data_drift.py +132 -91
- mlrun/model_monitoring/applications/results.py +99 -0
- mlrun/model_monitoring/controller.py +3 -1
- mlrun/model_monitoring/db/__init__.py +2 -0
- mlrun/model_monitoring/db/stores/__init__.py +0 -2
- mlrun/model_monitoring/db/stores/base/store.py +22 -37
- mlrun/model_monitoring/db/stores/sqldb/models/__init__.py +43 -21
- mlrun/model_monitoring/db/stores/sqldb/models/base.py +39 -8
- mlrun/model_monitoring/db/stores/sqldb/models/mysql.py +27 -7
- mlrun/model_monitoring/db/stores/sqldb/models/sqlite.py +5 -0
- mlrun/model_monitoring/db/stores/sqldb/sql_store.py +246 -224
- mlrun/model_monitoring/db/stores/v3io_kv/kv_store.py +232 -216
- mlrun/model_monitoring/db/tsdb/__init__.py +100 -0
- mlrun/model_monitoring/db/tsdb/base.py +329 -0
- mlrun/model_monitoring/db/tsdb/helpers.py +30 -0
- mlrun/model_monitoring/db/tsdb/tdengine/__init__.py +15 -0
- mlrun/model_monitoring/db/tsdb/tdengine/schemas.py +240 -0
- mlrun/model_monitoring/db/tsdb/tdengine/stream_graph_steps.py +45 -0
- mlrun/model_monitoring/db/tsdb/tdengine/tdengine_connector.py +397 -0
- mlrun/model_monitoring/db/tsdb/v3io/__init__.py +15 -0
- mlrun/model_monitoring/db/tsdb/v3io/stream_graph_steps.py +117 -0
- mlrun/model_monitoring/db/tsdb/v3io/v3io_connector.py +636 -0
- mlrun/model_monitoring/evidently_application.py +6 -118
- mlrun/model_monitoring/helpers.py +46 -1
- mlrun/model_monitoring/model_endpoint.py +3 -2
- mlrun/model_monitoring/stream_processing.py +57 -216
- mlrun/model_monitoring/writer.py +134 -124
- mlrun/package/utils/_formatter.py +2 -2
- mlrun/platforms/__init__.py +10 -9
- mlrun/platforms/iguazio.py +21 -202
- mlrun/projects/operations.py +19 -12
- mlrun/projects/pipelines.py +79 -102
- mlrun/projects/project.py +265 -103
- mlrun/render.py +15 -14
- mlrun/run.py +16 -46
- mlrun/runtimes/__init__.py +6 -3
- mlrun/runtimes/base.py +8 -7
- mlrun/runtimes/databricks_job/databricks_wrapper.py +1 -1
- mlrun/runtimes/funcdoc.py +0 -28
- mlrun/runtimes/kubejob.py +2 -1
- mlrun/runtimes/local.py +5 -2
- mlrun/runtimes/mpijob/__init__.py +0 -20
- mlrun/runtimes/mpijob/v1.py +1 -1
- mlrun/runtimes/nuclio/api_gateway.py +194 -84
- mlrun/runtimes/nuclio/application/application.py +170 -8
- mlrun/runtimes/nuclio/function.py +39 -49
- mlrun/runtimes/pod.py +16 -36
- mlrun/runtimes/remotesparkjob.py +9 -3
- mlrun/runtimes/sparkjob/spark3job.py +1 -1
- mlrun/runtimes/utils.py +6 -45
- mlrun/serving/server.py +2 -1
- mlrun/serving/v2_serving.py +5 -1
- mlrun/track/tracker.py +2 -1
- mlrun/utils/async_http.py +25 -5
- mlrun/utils/helpers.py +107 -75
- mlrun/utils/logger.py +39 -7
- mlrun/utils/notifications/notification/__init__.py +14 -9
- mlrun/utils/notifications/notification/base.py +1 -1
- mlrun/utils/notifications/notification/slack.py +34 -7
- mlrun/utils/notifications/notification/webhook.py +1 -1
- mlrun/utils/notifications/notification_pusher.py +147 -16
- mlrun/utils/regex.py +9 -0
- mlrun/utils/v3io_clients.py +0 -1
- mlrun/utils/version/version.json +2 -2
- {mlrun-1.7.0rc14.dist-info → mlrun-1.7.0rc21.dist-info}/METADATA +14 -6
- {mlrun-1.7.0rc14.dist-info → mlrun-1.7.0rc21.dist-info}/RECORD +150 -130
- mlrun/kfpops.py +0 -865
- mlrun/platforms/other.py +0 -305
- {mlrun-1.7.0rc14.dist-info → mlrun-1.7.0rc21.dist-info}/LICENSE +0 -0
- {mlrun-1.7.0rc14.dist-info → mlrun-1.7.0rc21.dist-info}/WHEEL +0 -0
- {mlrun-1.7.0rc14.dist-info → mlrun-1.7.0rc21.dist-info}/entry_points.txt +0 -0
- {mlrun-1.7.0rc14.dist-info → mlrun-1.7.0rc21.dist-info}/top_level.txt +0 -0
|
@@ -11,37 +11,91 @@
|
|
|
11
11
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
12
12
|
# See the License for the specific language governing permissions and
|
|
13
13
|
# limitations under the License.
|
|
14
|
-
#
|
|
15
14
|
|
|
16
15
|
import json
|
|
17
16
|
import os
|
|
18
17
|
import typing
|
|
18
|
+
from dataclasses import dataclass
|
|
19
19
|
from http import HTTPStatus
|
|
20
20
|
|
|
21
21
|
import v3io.dataplane
|
|
22
|
+
import v3io.dataplane.output
|
|
22
23
|
import v3io.dataplane.response
|
|
23
|
-
import v3io_frames
|
|
24
24
|
|
|
25
25
|
import mlrun.common.model_monitoring.helpers
|
|
26
|
-
import mlrun.common.schemas.model_monitoring
|
|
26
|
+
import mlrun.common.schemas.model_monitoring as mm_schemas
|
|
27
27
|
import mlrun.model_monitoring.db
|
|
28
28
|
import mlrun.utils.v3io_clients
|
|
29
29
|
from mlrun.utils import logger
|
|
30
30
|
|
|
31
31
|
# Fields to encode before storing in the KV table or to decode after retrieving
|
|
32
32
|
fields_to_encode_decode = [
|
|
33
|
-
|
|
34
|
-
|
|
33
|
+
mm_schemas.EventFieldType.FEATURE_STATS,
|
|
34
|
+
mm_schemas.EventFieldType.CURRENT_STATS,
|
|
35
|
+
]
|
|
36
|
+
|
|
37
|
+
_METRIC_FIELDS: list[str] = [
|
|
38
|
+
mm_schemas.WriterEvent.APPLICATION_NAME,
|
|
39
|
+
mm_schemas.MetricData.METRIC_NAME,
|
|
40
|
+
mm_schemas.MetricData.METRIC_VALUE,
|
|
41
|
+
mm_schemas.WriterEvent.START_INFER_TIME,
|
|
42
|
+
mm_schemas.WriterEvent.END_INFER_TIME,
|
|
43
|
+
]
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
class SchemaField(typing.TypedDict):
|
|
47
|
+
name: str
|
|
48
|
+
type: str
|
|
49
|
+
nullable: bool
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
@dataclass
|
|
53
|
+
class SchemaParams:
|
|
54
|
+
key: str
|
|
55
|
+
fields: list[SchemaField]
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
_RESULT_SCHEMA: list[SchemaField] = [
|
|
59
|
+
SchemaField(
|
|
60
|
+
name=mm_schemas.ResultData.RESULT_NAME,
|
|
61
|
+
type=mm_schemas.GrafanaColumnType.STRING,
|
|
62
|
+
nullable=False,
|
|
63
|
+
)
|
|
64
|
+
]
|
|
65
|
+
|
|
66
|
+
_METRIC_SCHEMA: list[SchemaField] = [
|
|
67
|
+
SchemaField(
|
|
68
|
+
name=mm_schemas.WriterEvent.APPLICATION_NAME,
|
|
69
|
+
type=mm_schemas.GrafanaColumnType.STRING,
|
|
70
|
+
nullable=False,
|
|
71
|
+
),
|
|
72
|
+
SchemaField(
|
|
73
|
+
name=mm_schemas.MetricData.METRIC_NAME,
|
|
74
|
+
type=mm_schemas.GrafanaColumnType.STRING,
|
|
75
|
+
nullable=False,
|
|
76
|
+
),
|
|
35
77
|
]
|
|
36
78
|
|
|
37
79
|
|
|
80
|
+
_KIND_TO_SCHEMA_PARAMS: dict[mm_schemas.WriterEventKind, SchemaParams] = {
|
|
81
|
+
mm_schemas.WriterEventKind.RESULT: SchemaParams(
|
|
82
|
+
key=mm_schemas.WriterEvent.APPLICATION_NAME, fields=_RESULT_SCHEMA
|
|
83
|
+
),
|
|
84
|
+
mm_schemas.WriterEventKind.METRIC: SchemaParams(
|
|
85
|
+
key="metric_id", fields=_METRIC_SCHEMA
|
|
86
|
+
),
|
|
87
|
+
}
|
|
88
|
+
|
|
89
|
+
_EXCLUDE_SCHEMA_FILTER_EXPRESSION = '__name!=".#schema"'
|
|
90
|
+
|
|
91
|
+
|
|
38
92
|
class KVStoreBase(mlrun.model_monitoring.db.StoreBase):
|
|
39
93
|
"""
|
|
40
94
|
Handles the DB operations when the DB target is from type KV. For the KV operations, we use an instance of V3IO
|
|
41
95
|
client and usually the KV table can be found under v3io:///users/pipelines/project-name/model-endpoints/endpoints/.
|
|
42
96
|
"""
|
|
43
97
|
|
|
44
|
-
def __init__(self, project: str, access_key: str):
|
|
98
|
+
def __init__(self, project: str, access_key: typing.Optional[str] = None) -> None:
|
|
45
99
|
super().__init__(project=project)
|
|
46
100
|
# Initialize a V3IO client instance
|
|
47
101
|
self.access_key = access_key or os.environ.get("V3IO_ACCESS_KEY")
|
|
@@ -66,7 +120,7 @@ class KVStoreBase(mlrun.model_monitoring.db.StoreBase):
|
|
|
66
120
|
self.client.kv.put(
|
|
67
121
|
container=self.container,
|
|
68
122
|
table_path=self.path,
|
|
69
|
-
key=endpoint[
|
|
123
|
+
key=endpoint[mm_schemas.EventFieldType.UID],
|
|
70
124
|
attributes=endpoint,
|
|
71
125
|
)
|
|
72
126
|
|
|
@@ -153,7 +207,7 @@ class KVStoreBase(mlrun.model_monitoring.db.StoreBase):
|
|
|
153
207
|
"""Getting path and container based on the model monitoring configurations"""
|
|
154
208
|
path = mlrun.mlconf.model_endpoint_monitoring.store_prefixes.default.format(
|
|
155
209
|
project=self.project,
|
|
156
|
-
kind=
|
|
210
|
+
kind=mm_schemas.ModelMonitoringStoreKinds.ENDPOINTS,
|
|
157
211
|
)
|
|
158
212
|
(
|
|
159
213
|
_,
|
|
@@ -219,17 +273,11 @@ class KVStoreBase(mlrun.model_monitoring.db.StoreBase):
|
|
|
219
273
|
if uids is None:
|
|
220
274
|
uids = []
|
|
221
275
|
for item in items:
|
|
222
|
-
if
|
|
276
|
+
if mm_schemas.EventFieldType.UID not in item:
|
|
223
277
|
# This is kept for backwards compatibility - in old versions the key column named endpoint_id
|
|
224
|
-
uids.append(
|
|
225
|
-
item[
|
|
226
|
-
mlrun.common.schemas.model_monitoring.EventFieldType.ENDPOINT_ID
|
|
227
|
-
]
|
|
228
|
-
)
|
|
278
|
+
uids.append(item[mm_schemas.EventFieldType.ENDPOINT_ID])
|
|
229
279
|
else:
|
|
230
|
-
uids.append(
|
|
231
|
-
item[mlrun.common.schemas.model_monitoring.EventFieldType.UID]
|
|
232
|
-
)
|
|
280
|
+
uids.append(item[mm_schemas.EventFieldType.UID])
|
|
233
281
|
|
|
234
282
|
# Add each relevant model endpoint to the model endpoints list
|
|
235
283
|
for endpoint_id in uids:
|
|
@@ -240,27 +288,20 @@ class KVStoreBase(mlrun.model_monitoring.db.StoreBase):
|
|
|
240
288
|
|
|
241
289
|
return endpoint_list
|
|
242
290
|
|
|
243
|
-
def delete_model_endpoints_resources(self
|
|
291
|
+
def delete_model_endpoints_resources(self):
|
|
244
292
|
"""
|
|
245
|
-
Delete all model endpoints resources in
|
|
246
|
-
|
|
247
|
-
:param endpoints: A list of model endpoints flattened dictionaries.
|
|
293
|
+
Delete all model endpoints resources in V3IO KV.
|
|
248
294
|
"""
|
|
249
295
|
|
|
296
|
+
endpoints = self.list_model_endpoints()
|
|
297
|
+
|
|
250
298
|
# Delete model endpoint record from KV table
|
|
251
299
|
for endpoint_dict in endpoints:
|
|
252
|
-
if
|
|
253
|
-
mlrun.common.schemas.model_monitoring.EventFieldType.UID
|
|
254
|
-
not in endpoint_dict
|
|
255
|
-
):
|
|
300
|
+
if mm_schemas.EventFieldType.UID not in endpoint_dict:
|
|
256
301
|
# This is kept for backwards compatibility - in old versions the key column named endpoint_id
|
|
257
|
-
endpoint_id = endpoint_dict[
|
|
258
|
-
mlrun.common.schemas.model_monitoring.EventFieldType.ENDPOINT_ID
|
|
259
|
-
]
|
|
302
|
+
endpoint_id = endpoint_dict[mm_schemas.EventFieldType.ENDPOINT_ID]
|
|
260
303
|
else:
|
|
261
|
-
endpoint_id = endpoint_dict[
|
|
262
|
-
mlrun.common.schemas.model_monitoring.EventFieldType.UID
|
|
263
|
-
]
|
|
304
|
+
endpoint_id = endpoint_dict[mm_schemas.EventFieldType.UID]
|
|
264
305
|
self.delete_model_endpoint(
|
|
265
306
|
endpoint_id,
|
|
266
307
|
)
|
|
@@ -283,188 +324,84 @@ class KVStoreBase(mlrun.model_monitoring.db.StoreBase):
|
|
|
283
324
|
raise_for_status=v3io.dataplane.RaiseForStatus.never,
|
|
284
325
|
)
|
|
285
326
|
|
|
286
|
-
|
|
287
|
-
|
|
288
|
-
|
|
289
|
-
# Generate the required tsdb paths
|
|
290
|
-
tsdb_path, filtered_path = self._generate_tsdb_paths()
|
|
327
|
+
@staticmethod
|
|
328
|
+
def _get_results_table_path(endpoint_id: str) -> str:
|
|
329
|
+
return endpoint_id
|
|
291
330
|
|
|
292
|
-
|
|
293
|
-
|
|
294
|
-
|
|
295
|
-
backend=mlrun.common.schemas.model_monitoring.TimeSeriesTarget.TSDB,
|
|
296
|
-
table=filtered_path,
|
|
297
|
-
)
|
|
298
|
-
except v3io_frames.errors.DeleteError as e:
|
|
299
|
-
if "No TSDB schema file found" not in str(e):
|
|
300
|
-
logger.warning(
|
|
301
|
-
f"Failed to delete TSDB table '{filtered_path}'",
|
|
302
|
-
err=mlrun.errors.err_to_str(e),
|
|
303
|
-
)
|
|
304
|
-
# Final cleanup of tsdb path
|
|
305
|
-
tsdb_path.replace("://u", ":///u")
|
|
306
|
-
store, _, _ = mlrun.store_manager.get_or_create_store(tsdb_path)
|
|
307
|
-
store.rm(tsdb_path, recursive=True)
|
|
331
|
+
@staticmethod
|
|
332
|
+
def _get_metrics_table_path(endpoint_id: str) -> str:
|
|
333
|
+
return f"{endpoint_id}_metrics"
|
|
308
334
|
|
|
309
|
-
def
|
|
335
|
+
def write_application_event(
|
|
310
336
|
self,
|
|
311
|
-
|
|
312
|
-
|
|
313
|
-
|
|
314
|
-
end: str = "now",
|
|
315
|
-
access_key: str = None,
|
|
316
|
-
) -> dict[str, list[tuple[str, float]]]:
|
|
317
|
-
"""
|
|
318
|
-
Getting metrics from the time series DB. There are pre-defined metrics for model endpoints such as
|
|
319
|
-
`predictions_per_second` and `latency_avg_5m` but also custom metrics defined by the user.
|
|
320
|
-
|
|
321
|
-
:param endpoint_id: The unique id of the model endpoint.
|
|
322
|
-
:param metrics: A list of real-time metrics to return for the model endpoint.
|
|
323
|
-
:param start: The start time of the metrics. Can be represented by a string containing an RFC 3339
|
|
324
|
-
time, a Unix timestamp in milliseconds, a relative time (`'now'` or
|
|
325
|
-
`'now-[0-9]+[mhd]'`, where `m` = minutes, `h` = hours, and `'d'` = days), or 0 for the
|
|
326
|
-
earliest time.
|
|
327
|
-
:param end: The end time of the metrics. Can be represented by a string containing an RFC 3339
|
|
328
|
-
time, a Unix timestamp in milliseconds, a relative time (`'now'` or
|
|
329
|
-
`'now-[0-9]+[mhd]'`, where `m` = minutes, `h` = hours, and `'d'` = days), or 0 for the
|
|
330
|
-
earliest time.
|
|
331
|
-
:param access_key: V3IO access key that will be used for generating Frames client object. If not
|
|
332
|
-
provided, the access key will be retrieved from the environment variables.
|
|
333
|
-
|
|
334
|
-
:return: A dictionary of metrics in which the key is a metric name and the value is a list of tuples that
|
|
335
|
-
includes timestamps and the values.
|
|
336
|
-
"""
|
|
337
|
-
|
|
338
|
-
# Initialize access key
|
|
339
|
-
access_key = access_key or mlrun.mlconf.get_v3io_access_key()
|
|
340
|
-
|
|
341
|
-
if not metrics:
|
|
342
|
-
raise mlrun.errors.MLRunInvalidArgumentError(
|
|
343
|
-
"Metric names must be provided"
|
|
344
|
-
)
|
|
345
|
-
|
|
346
|
-
# Initialize metrics mapping dictionary
|
|
347
|
-
metrics_mapping = {}
|
|
348
|
-
|
|
349
|
-
# Getting the path for the time series DB
|
|
350
|
-
events_path = (
|
|
351
|
-
mlrun.mlconf.model_endpoint_monitoring.store_prefixes.default.format(
|
|
352
|
-
project=self.project,
|
|
353
|
-
kind=mlrun.common.schemas.ModelMonitoringStoreKinds.EVENTS,
|
|
354
|
-
)
|
|
355
|
-
)
|
|
356
|
-
(
|
|
357
|
-
_,
|
|
358
|
-
container,
|
|
359
|
-
events_path,
|
|
360
|
-
) = mlrun.common.model_monitoring.helpers.parse_model_endpoint_store_prefix(
|
|
361
|
-
events_path
|
|
362
|
-
)
|
|
363
|
-
|
|
364
|
-
# Retrieve the raw data from the time series DB based on the provided metrics and time ranges
|
|
365
|
-
frames_client = mlrun.utils.v3io_clients.get_frames_client(
|
|
366
|
-
token=access_key,
|
|
367
|
-
address=mlrun.mlconf.v3io_framesd,
|
|
368
|
-
container=container,
|
|
369
|
-
)
|
|
370
|
-
|
|
371
|
-
try:
|
|
372
|
-
data = frames_client.read(
|
|
373
|
-
backend=mlrun.common.schemas.model_monitoring.TimeSeriesTarget.TSDB,
|
|
374
|
-
table=events_path,
|
|
375
|
-
columns=["endpoint_id", *metrics],
|
|
376
|
-
filter=f"endpoint_id=='{endpoint_id}'",
|
|
377
|
-
start=start,
|
|
378
|
-
end=end,
|
|
379
|
-
)
|
|
380
|
-
|
|
381
|
-
# Fill the metrics mapping dictionary with the metric name and values
|
|
382
|
-
data_dict = data.to_dict()
|
|
383
|
-
for metric in metrics:
|
|
384
|
-
metric_data = data_dict.get(metric)
|
|
385
|
-
if metric_data is None:
|
|
386
|
-
continue
|
|
387
|
-
|
|
388
|
-
values = [
|
|
389
|
-
(str(timestamp), value) for timestamp, value in metric_data.items()
|
|
390
|
-
]
|
|
391
|
-
metrics_mapping[metric] = values
|
|
392
|
-
|
|
393
|
-
except v3io_frames.errors.ReadError:
|
|
394
|
-
logger.warn("Failed to read tsdb", endpoint=endpoint_id)
|
|
395
|
-
|
|
396
|
-
return metrics_mapping
|
|
397
|
-
|
|
398
|
-
def write_application_result(self, event: dict[str, typing.Any]):
|
|
337
|
+
event: dict[str, typing.Any],
|
|
338
|
+
kind: mm_schemas.WriterEventKind = mm_schemas.WriterEventKind.RESULT,
|
|
339
|
+
) -> None:
|
|
399
340
|
"""
|
|
400
|
-
Write a new application
|
|
341
|
+
Write a new application event in the target table.
|
|
401
342
|
|
|
402
343
|
:param event: An event dictionary that represents the application result, should be corresponded to the
|
|
403
344
|
schema defined in the :py:class:`~mlrun.common.schemas.model_monitoring.constants.WriterEvent`
|
|
404
345
|
object.
|
|
346
|
+
:param kind: The type of the event, can be either "result" or "metric".
|
|
405
347
|
"""
|
|
406
|
-
endpoint_id = event.pop(
|
|
407
|
-
mlrun.common.schemas.model_monitoring.WriterEvent.ENDPOINT_ID
|
|
408
|
-
)
|
|
409
|
-
app_name = event.pop(
|
|
410
|
-
mlrun.common.schemas.model_monitoring.WriterEvent.APPLICATION_NAME
|
|
411
|
-
)
|
|
412
|
-
metric_name = event.pop(
|
|
413
|
-
mlrun.common.schemas.model_monitoring.WriterEvent.RESULT_NAME
|
|
414
|
-
)
|
|
415
|
-
attributes = {metric_name: json.dumps(event)}
|
|
416
348
|
|
|
417
|
-
|
|
418
|
-
|
|
419
|
-
|
|
349
|
+
container = self.get_v3io_monitoring_apps_container(project_name=self.project)
|
|
350
|
+
endpoint_id = event.pop(mm_schemas.WriterEvent.ENDPOINT_ID)
|
|
351
|
+
|
|
352
|
+
if kind == mm_schemas.WriterEventKind.METRIC:
|
|
353
|
+
table_path = self._get_metrics_table_path(endpoint_id)
|
|
354
|
+
key = f"{event[mm_schemas.WriterEvent.APPLICATION_NAME]}.{event[mm_schemas.MetricData.METRIC_NAME]}"
|
|
355
|
+
attributes = {event_key: event[event_key] for event_key in _METRIC_FIELDS}
|
|
356
|
+
elif kind == mm_schemas.WriterEventKind.RESULT:
|
|
357
|
+
table_path = self._get_results_table_path(endpoint_id)
|
|
358
|
+
key = event.pop(mm_schemas.WriterEvent.APPLICATION_NAME)
|
|
359
|
+
metric_name = event.pop(mm_schemas.ResultData.RESULT_NAME)
|
|
360
|
+
attributes = {metric_name: json.dumps(event)}
|
|
361
|
+
else:
|
|
362
|
+
raise ValueError(f"Invalid {kind = }")
|
|
420
363
|
|
|
421
364
|
self.client.kv.update(
|
|
422
|
-
container=
|
|
423
|
-
table_path=
|
|
424
|
-
key=
|
|
365
|
+
container=container,
|
|
366
|
+
table_path=table_path,
|
|
367
|
+
key=key,
|
|
425
368
|
attributes=attributes,
|
|
426
369
|
)
|
|
427
370
|
|
|
428
371
|
schema_file = self.client.kv.new_cursor(
|
|
429
|
-
container=
|
|
430
|
-
table_path=
|
|
372
|
+
container=container,
|
|
373
|
+
table_path=table_path,
|
|
431
374
|
filter_expression='__name==".#schema"',
|
|
432
375
|
)
|
|
433
376
|
|
|
434
377
|
if not schema_file.all():
|
|
435
378
|
logger.info(
|
|
436
|
-
"
|
|
437
|
-
container=
|
|
438
|
-
|
|
379
|
+
"Generating a new V3IO KV schema file",
|
|
380
|
+
container=container,
|
|
381
|
+
table_path=table_path,
|
|
439
382
|
)
|
|
440
|
-
self._generate_kv_schema(
|
|
441
|
-
|
|
383
|
+
self._generate_kv_schema(
|
|
384
|
+
container=container, table_path=table_path, kind=kind
|
|
385
|
+
)
|
|
386
|
+
logger.info("Updated V3IO KV successfully", key=key)
|
|
442
387
|
|
|
443
388
|
def _generate_kv_schema(
|
|
444
|
-
self,
|
|
445
|
-
):
|
|
389
|
+
self, *, container: str, table_path: str, kind: mm_schemas.WriterEventKind
|
|
390
|
+
) -> None:
|
|
446
391
|
"""Generate V3IO KV schema file which will be used by the model monitoring applications dashboard in Grafana."""
|
|
447
|
-
|
|
448
|
-
{
|
|
449
|
-
"name": mlrun.common.schemas.model_monitoring.WriterEvent.RESULT_NAME,
|
|
450
|
-
"type": "string",
|
|
451
|
-
"nullable": False,
|
|
452
|
-
}
|
|
453
|
-
]
|
|
392
|
+
schema_params = _KIND_TO_SCHEMA_PARAMS[kind]
|
|
454
393
|
res = self.client.kv.create_schema(
|
|
455
|
-
container=
|
|
456
|
-
table_path=
|
|
457
|
-
key=
|
|
458
|
-
fields=fields,
|
|
394
|
+
container=container,
|
|
395
|
+
table_path=table_path,
|
|
396
|
+
key=schema_params.key,
|
|
397
|
+
fields=schema_params.fields,
|
|
459
398
|
)
|
|
460
399
|
if res.status_code != HTTPStatus.OK:
|
|
461
400
|
raise mlrun.errors.MLRunBadRequestError(
|
|
462
|
-
f"Couldn't infer schema for endpoint {
|
|
401
|
+
f"Couldn't infer schema for endpoint {table_path} which is required for Grafana dashboards"
|
|
463
402
|
)
|
|
464
403
|
else:
|
|
465
|
-
logger.info(
|
|
466
|
-
"Generated V3IO KV schema successfully", endpoint_id=endpoint_id
|
|
467
|
-
)
|
|
404
|
+
logger.info("Generated V3IO KV schema successfully", table_path=table_path)
|
|
468
405
|
|
|
469
406
|
def get_last_analyzed(self, endpoint_id: str, application_name: str) -> int:
|
|
470
407
|
"""
|
|
@@ -485,9 +422,7 @@ class KVStoreBase(mlrun.model_monitoring.db.StoreBase):
|
|
|
485
422
|
table_path=endpoint_id,
|
|
486
423
|
key=application_name,
|
|
487
424
|
)
|
|
488
|
-
return data.output.item[
|
|
489
|
-
mlrun.common.schemas.model_monitoring.SchedulingKeys.LAST_ANALYZED
|
|
490
|
-
]
|
|
425
|
+
return data.output.item[mm_schemas.SchedulingKeys.LAST_ANALYZED]
|
|
491
426
|
except v3io.dataplane.response.HttpResponseError as err:
|
|
492
427
|
logger.debug("Error while getting last analyzed time", err=err)
|
|
493
428
|
raise mlrun.errors.MLRunNotFoundError(
|
|
@@ -512,9 +447,7 @@ class KVStoreBase(mlrun.model_monitoring.db.StoreBase):
|
|
|
512
447
|
),
|
|
513
448
|
table_path=endpoint_id,
|
|
514
449
|
key=application_name,
|
|
515
|
-
attributes={
|
|
516
|
-
mlrun.common.schemas.model_monitoring.SchedulingKeys.LAST_ANALYZED: last_analyzed
|
|
517
|
-
},
|
|
450
|
+
attributes={mm_schemas.SchedulingKeys.LAST_ANALYZED: last_analyzed},
|
|
518
451
|
)
|
|
519
452
|
|
|
520
453
|
def _generate_tsdb_paths(self) -> tuple[str, str]:
|
|
@@ -527,7 +460,7 @@ class KVStoreBase(mlrun.model_monitoring.db.StoreBase):
|
|
|
527
460
|
full_path = (
|
|
528
461
|
mlrun.mlconf.model_endpoint_monitoring.store_prefixes.default.format(
|
|
529
462
|
project=self.project,
|
|
530
|
-
kind=
|
|
463
|
+
kind=mm_schemas.ModelMonitoringStoreKinds.EVENTS,
|
|
531
464
|
)
|
|
532
465
|
)
|
|
533
466
|
|
|
@@ -623,8 +556,8 @@ class KVStoreBase(mlrun.model_monitoring.db.StoreBase):
|
|
|
623
556
|
# Apply top_level filter (remove endpoints that considered a child of a router)
|
|
624
557
|
if top_level:
|
|
625
558
|
filter_expression.append(
|
|
626
|
-
f"(endpoint_type=='{str(
|
|
627
|
-
f"OR endpoint_type=='{str(
|
|
559
|
+
f"(endpoint_type=='{str(mm_schemas.EndpointType.NODE_EP.value)}' "
|
|
560
|
+
f"OR endpoint_type=='{str(mm_schemas.EndpointType.ROUTER.value)}')"
|
|
628
561
|
)
|
|
629
562
|
|
|
630
563
|
return " AND ".join(filter_expression)
|
|
@@ -644,41 +577,31 @@ class KVStoreBase(mlrun.model_monitoring.db.StoreBase):
|
|
|
644
577
|
# Validate default value for `error_count`
|
|
645
578
|
# For backwards compatibility reasons, we validate that the model endpoint includes the `error_count` key
|
|
646
579
|
if (
|
|
647
|
-
|
|
648
|
-
and endpoint[
|
|
649
|
-
mlrun.common.schemas.model_monitoring.EventFieldType.ERROR_COUNT
|
|
650
|
-
]
|
|
651
|
-
== "null"
|
|
580
|
+
mm_schemas.EventFieldType.ERROR_COUNT in endpoint
|
|
581
|
+
and endpoint[mm_schemas.EventFieldType.ERROR_COUNT] == "null"
|
|
652
582
|
):
|
|
653
|
-
endpoint[
|
|
654
|
-
mlrun.common.schemas.model_monitoring.EventFieldType.ERROR_COUNT
|
|
655
|
-
] = "0"
|
|
583
|
+
endpoint[mm_schemas.EventFieldType.ERROR_COUNT] = "0"
|
|
656
584
|
|
|
657
585
|
# Validate default value for `metrics`
|
|
658
586
|
# For backwards compatibility reasons, we validate that the model endpoint includes the `metrics` key
|
|
659
587
|
if (
|
|
660
|
-
|
|
661
|
-
and endpoint[
|
|
662
|
-
== "null"
|
|
588
|
+
mm_schemas.EventFieldType.METRICS in endpoint
|
|
589
|
+
and endpoint[mm_schemas.EventFieldType.METRICS] == "null"
|
|
663
590
|
):
|
|
664
|
-
endpoint[
|
|
665
|
-
|
|
666
|
-
{
|
|
667
|
-
|
|
668
|
-
|
|
669
|
-
mlrun.common.schemas.model_monitoring.EventLiveStats.PREDICTIONS_PER_SECOND: 0,
|
|
670
|
-
}
|
|
591
|
+
endpoint[mm_schemas.EventFieldType.METRICS] = json.dumps(
|
|
592
|
+
{
|
|
593
|
+
mm_schemas.EventKeyMetrics.GENERIC: {
|
|
594
|
+
mm_schemas.EventLiveStats.LATENCY_AVG_1H: 0,
|
|
595
|
+
mm_schemas.EventLiveStats.PREDICTIONS_PER_SECOND: 0,
|
|
671
596
|
}
|
|
672
|
-
|
|
597
|
+
}
|
|
673
598
|
)
|
|
674
599
|
# Validate key `uid` instead of `endpoint_id`
|
|
675
600
|
# For backwards compatibility reasons, we replace the `endpoint_id` with `uid` which is the updated key name
|
|
676
|
-
if
|
|
677
|
-
endpoint[
|
|
678
|
-
|
|
679
|
-
|
|
680
|
-
]
|
|
681
|
-
)
|
|
601
|
+
if mm_schemas.EventFieldType.ENDPOINT_ID in endpoint:
|
|
602
|
+
endpoint[mm_schemas.EventFieldType.UID] = endpoint[
|
|
603
|
+
mm_schemas.EventFieldType.ENDPOINT_ID
|
|
604
|
+
]
|
|
682
605
|
|
|
683
606
|
@staticmethod
|
|
684
607
|
def _encode_field(field: typing.Union[str, bytes]) -> bytes:
|
|
@@ -703,3 +626,96 @@ class KVStoreBase(mlrun.model_monitoring.db.StoreBase):
|
|
|
703
626
|
@staticmethod
|
|
704
627
|
def _get_monitoring_schedules_container(project_name: str) -> str:
|
|
705
628
|
return f"users/pipelines/{project_name}/monitoring-schedules/functions"
|
|
629
|
+
|
|
630
|
+
def _extract_results_from_items(
|
|
631
|
+
self, app_items: list[dict[str, str]]
|
|
632
|
+
) -> list[mm_schemas.ModelEndpointMonitoringMetric]:
|
|
633
|
+
"""Assuming .#schema items are filtered out"""
|
|
634
|
+
metrics: list[mm_schemas.ModelEndpointMonitoringMetric] = []
|
|
635
|
+
for app_item in app_items:
|
|
636
|
+
app_name = app_item.pop("__name")
|
|
637
|
+
for result_name in app_item:
|
|
638
|
+
metrics.append(
|
|
639
|
+
mm_schemas.ModelEndpointMonitoringMetric(
|
|
640
|
+
project=self.project,
|
|
641
|
+
app=app_name,
|
|
642
|
+
type=mm_schemas.ModelEndpointMonitoringMetricType.RESULT,
|
|
643
|
+
name=result_name,
|
|
644
|
+
full_name=mm_schemas.model_endpoints._compose_full_name(
|
|
645
|
+
project=self.project, app=app_name, name=result_name
|
|
646
|
+
),
|
|
647
|
+
)
|
|
648
|
+
)
|
|
649
|
+
return metrics
|
|
650
|
+
|
|
651
|
+
def _extract_metrics_from_items(
|
|
652
|
+
self, result_items: list[dict[str, str]]
|
|
653
|
+
) -> list[mm_schemas.ModelEndpointMonitoringMetric]:
|
|
654
|
+
metrics: list[mm_schemas.ModelEndpointMonitoringMetric] = []
|
|
655
|
+
logger.debug("Result items", result_items=result_items)
|
|
656
|
+
for result_item in result_items:
|
|
657
|
+
app = result_item[mm_schemas.WriterEvent.APPLICATION_NAME]
|
|
658
|
+
name = result_item[mm_schemas.MetricData.METRIC_NAME]
|
|
659
|
+
metrics.append(
|
|
660
|
+
mm_schemas.ModelEndpointMonitoringMetric(
|
|
661
|
+
project=self.project,
|
|
662
|
+
app=app,
|
|
663
|
+
type=mm_schemas.ModelEndpointMonitoringMetricType.METRIC,
|
|
664
|
+
name=name,
|
|
665
|
+
full_name=mm_schemas.model_endpoints._compose_full_name(
|
|
666
|
+
project=self.project,
|
|
667
|
+
app=app,
|
|
668
|
+
name=name,
|
|
669
|
+
type=mm_schemas.ModelEndpointMonitoringMetricType.METRIC,
|
|
670
|
+
),
|
|
671
|
+
)
|
|
672
|
+
)
|
|
673
|
+
return metrics
|
|
674
|
+
|
|
675
|
+
def get_model_endpoint_metrics(
|
|
676
|
+
self, endpoint_id: str, type: mm_schemas.ModelEndpointMonitoringMetricType
|
|
677
|
+
) -> list[mm_schemas.ModelEndpointMonitoringMetric]:
|
|
678
|
+
"""Get model monitoring results and metrics on the endpoint"""
|
|
679
|
+
metrics: list[mm_schemas.ModelEndpointMonitoringMetric] = []
|
|
680
|
+
container = self.get_v3io_monitoring_apps_container(self.project)
|
|
681
|
+
if type == mm_schemas.ModelEndpointMonitoringMetricType.METRIC:
|
|
682
|
+
table_path = self._get_metrics_table_path(endpoint_id)
|
|
683
|
+
items_extractor = self._extract_metrics_from_items
|
|
684
|
+
elif type == mm_schemas.ModelEndpointMonitoringMetricType.RESULT:
|
|
685
|
+
table_path = self._get_results_table_path(endpoint_id)
|
|
686
|
+
items_extractor = self._extract_results_from_items
|
|
687
|
+
else:
|
|
688
|
+
raise ValueError(f"Invalid metric {type = }")
|
|
689
|
+
|
|
690
|
+
def scan(
|
|
691
|
+
marker: typing.Optional[str] = None,
|
|
692
|
+
) -> v3io.dataplane.response.Response:
|
|
693
|
+
# TODO: Use AIO client: `v3io.aio.dataplane.client.Client`
|
|
694
|
+
return self.client.kv.scan(
|
|
695
|
+
container=container,
|
|
696
|
+
table_path=table_path,
|
|
697
|
+
marker=marker,
|
|
698
|
+
filter_expression=_EXCLUDE_SCHEMA_FILTER_EXPRESSION,
|
|
699
|
+
)
|
|
700
|
+
|
|
701
|
+
try:
|
|
702
|
+
response = scan()
|
|
703
|
+
except v3io.dataplane.response.HttpResponseError as err:
|
|
704
|
+
if err.status_code == HTTPStatus.NOT_FOUND:
|
|
705
|
+
logger.warning(
|
|
706
|
+
f"Attempt getting {type}s - no data. Check the "
|
|
707
|
+
"project name, endpoint, or wait for the applications to start.",
|
|
708
|
+
container=container,
|
|
709
|
+
table_path=table_path,
|
|
710
|
+
)
|
|
711
|
+
return []
|
|
712
|
+
raise
|
|
713
|
+
|
|
714
|
+
while True:
|
|
715
|
+
output = typing.cast(v3io.dataplane.output.GetItemsOutput, response.output)
|
|
716
|
+
metrics.extend(items_extractor(output.items))
|
|
717
|
+
if output.last:
|
|
718
|
+
break
|
|
719
|
+
response = scan(marker=output.next_marker)
|
|
720
|
+
|
|
721
|
+
return metrics
|