mlrun 1.7.0rc16__py3-none-any.whl → 1.7.0rc18__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of mlrun might be problematic.
- mlrun/alerts/alert.py +27 -24
- mlrun/artifacts/manager.py +5 -1
- mlrun/artifacts/model.py +1 -1
- mlrun/common/runtimes/constants.py +3 -0
- mlrun/common/schemas/__init__.py +8 -2
- mlrun/common/schemas/alert.py +49 -10
- mlrun/common/schemas/client_spec.py +1 -0
- mlrun/common/schemas/function.py +4 -0
- mlrun/common/schemas/model_monitoring/__init__.py +3 -1
- mlrun/common/schemas/model_monitoring/constants.py +21 -1
- mlrun/common/schemas/model_monitoring/grafana.py +9 -5
- mlrun/common/schemas/model_monitoring/model_endpoints.py +17 -6
- mlrun/common/schemas/project.py +3 -1
- mlrun/config.py +9 -3
- mlrun/data_types/to_pandas.py +5 -5
- mlrun/datastore/datastore.py +6 -2
- mlrun/datastore/redis.py +2 -2
- mlrun/datastore/s3.py +5 -0
- mlrun/datastore/sources.py +111 -6
- mlrun/datastore/targets.py +2 -2
- mlrun/db/base.py +6 -2
- mlrun/db/httpdb.py +22 -3
- mlrun/db/nopdb.py +10 -3
- mlrun/errors.py +6 -0
- mlrun/feature_store/retrieval/conversion.py +5 -5
- mlrun/feature_store/retrieval/job.py +3 -2
- mlrun/feature_store/retrieval/spark_merger.py +2 -1
- mlrun/frameworks/_dl_common/loggers/tensorboard_logger.py +2 -2
- mlrun/lists.py +2 -0
- mlrun/model.py +8 -6
- mlrun/model_monitoring/db/stores/base/store.py +16 -3
- mlrun/model_monitoring/db/stores/sqldb/sql_store.py +44 -43
- mlrun/model_monitoring/db/stores/v3io_kv/kv_store.py +190 -91
- mlrun/model_monitoring/db/tsdb/__init__.py +35 -6
- mlrun/model_monitoring/db/tsdb/base.py +25 -18
- mlrun/model_monitoring/db/tsdb/tdengine/__init__.py +15 -0
- mlrun/model_monitoring/db/tsdb/tdengine/schemas.py +207 -0
- mlrun/model_monitoring/db/tsdb/tdengine/stream_graph_steps.py +45 -0
- mlrun/model_monitoring/db/tsdb/tdengine/tdengine_connector.py +231 -0
- mlrun/model_monitoring/db/tsdb/v3io/v3io_connector.py +103 -64
- mlrun/model_monitoring/db/v3io_tsdb_reader.py +217 -16
- mlrun/model_monitoring/helpers.py +32 -0
- mlrun/model_monitoring/stream_processing.py +7 -4
- mlrun/model_monitoring/writer.py +19 -14
- mlrun/package/utils/_formatter.py +2 -2
- mlrun/projects/project.py +40 -11
- mlrun/render.py +8 -5
- mlrun/runtimes/__init__.py +1 -0
- mlrun/runtimes/databricks_job/databricks_wrapper.py +1 -1
- mlrun/runtimes/nuclio/api_gateway.py +97 -77
- mlrun/runtimes/nuclio/application/application.py +160 -7
- mlrun/runtimes/nuclio/function.py +18 -12
- mlrun/track/tracker.py +2 -1
- mlrun/utils/async_http.py +25 -5
- mlrun/utils/helpers.py +28 -3
- mlrun/utils/logger.py +11 -6
- mlrun/utils/notifications/notification/slack.py +27 -7
- mlrun/utils/notifications/notification_pusher.py +45 -41
- mlrun/utils/version/version.json +2 -2
- {mlrun-1.7.0rc16.dist-info → mlrun-1.7.0rc18.dist-info}/METADATA +8 -3
- {mlrun-1.7.0rc16.dist-info → mlrun-1.7.0rc18.dist-info}/RECORD +65 -61
- {mlrun-1.7.0rc16.dist-info → mlrun-1.7.0rc18.dist-info}/LICENSE +0 -0
- {mlrun-1.7.0rc16.dist-info → mlrun-1.7.0rc18.dist-info}/WHEEL +0 -0
- {mlrun-1.7.0rc16.dist-info → mlrun-1.7.0rc18.dist-info}/entry_points.txt +0 -0
- {mlrun-1.7.0rc16.dist-info → mlrun-1.7.0rc18.dist-info}/top_level.txt +0 -0
--- mlrun/model_monitoring/db/tsdb/v3io/v3io_connector.py (1.7.0rc16)
+++ mlrun/model_monitoring/db/tsdb/v3io/v3io_connector.py (1.7.0rc18)
@@ -11,8 +11,8 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-
 import datetime
+import typing
 
 import pandas as pd
 import v3io_frames.client
@@ -21,7 +21,7 @@ from v3io.dataplane import Client as V3IOClient
 from v3io_frames.frames_pb2 import IGNORE
 
 import mlrun.common.model_monitoring
-import mlrun.common.schemas.model_monitoring as
+import mlrun.common.schemas.model_monitoring as mm_schemas
 import mlrun.feature_store.steps
 import mlrun.utils.v3io_clients
 from mlrun.model_monitoring.db import TSDBConnector
@@ -37,12 +37,14 @@ class V3IOTSDBConnector(TSDBConnector):
     Client that provides API for executing commands on the V3IO TSDB table.
     """
 
+    type: str = mm_schemas.TSDBTarget.V3IO_TSDB
+
     def __init__(
         self,
         project: str,
-        access_key: str = None,
+        access_key: typing.Optional[str] = None,
         container: str = "users",
-        v3io_framesd: str = None,
+        v3io_framesd: typing.Optional[str] = None,
         create_table: bool = False,
     ):
         super().__init__(project=project)
@@ -61,14 +63,14 @@ class V3IOTSDBConnector(TSDBConnector):
         self._init_tables_path()
 
         if create_table:
-            self.
+            self.create_tables()
 
     def _init_tables_path(self):
         self.tables = {}
 
         events_table_full_path = mlrun.mlconf.get_model_monitoring_file_target_path(
             project=self.project,
-            kind=
+            kind=mm_schemas.FileTargetKind.EVENTS,
         )
         (
             _,
@@ -77,12 +79,12 @@ class V3IOTSDBConnector(TSDBConnector):
         ) = mlrun.common.model_monitoring.helpers.parse_model_endpoint_store_prefix(
             events_table_full_path
         )
-        self.tables[
+        self.tables[mm_schemas.V3IOTSDBTables.EVENTS] = events_path
 
         monitoring_application_full_path = (
             mlrun.mlconf.get_model_monitoring_file_target_path(
                 project=self.project,
-                kind=
+                kind=mm_schemas.FileTargetKind.MONITORING_APPLICATION,
             )
         )
         (
@@ -92,28 +94,45 @@ class V3IOTSDBConnector(TSDBConnector):
         ) = mlrun.common.model_monitoring.helpers.parse_model_endpoint_store_prefix(
             monitoring_application_full_path
         )
-        self.tables[
-            monitoring_application_path +
+        self.tables[mm_schemas.V3IOTSDBTables.APP_RESULTS] = (
+            monitoring_application_path + mm_schemas.V3IOTSDBTables.APP_RESULTS
+        )
+        self.tables[mm_schemas.V3IOTSDBTables.METRICS] = (
+            monitoring_application_path + mm_schemas.V3IOTSDBTables.METRICS
+        )
+
+        monitoring_predictions_full_path = (
+            mlrun.mlconf.get_model_monitoring_file_target_path(
+                project=self.project,
+                kind=mm_schemas.FileTargetKind.PREDICTIONS,
+            )
         )
-
-
+        (
+            _,
+            _,
+            monitoring_predictions_path,
+        ) = mlrun.common.model_monitoring.helpers.parse_model_endpoint_store_prefix(
+            monitoring_predictions_full_path
         )
+        self.tables[mm_schemas.FileTargetKind.PREDICTIONS] = monitoring_predictions_path
 
-    def
+    def create_tables(self) -> None:
         """
-        Create the
+        Create the tables using the TSDB connector. The tables are being created in the V3IO TSDB and include:
         - app_results: a detailed result that includes status, kind, extra data, etc.
         - metrics: a basic key value that represents a single numeric metric.
+        Note that the predictions table is automatically created by the model monitoring stream pod.
         """
         application_tables = [
-
-
+            mm_schemas.V3IOTSDBTables.APP_RESULTS,
+            mm_schemas.V3IOTSDBTables.METRICS,
         ]
-        for
-            logger.info("Creating table in V3IO TSDB",
+        for table_name in application_tables:
+            logger.info("Creating table in V3IO TSDB", table_name=table_name)
+            table = self.tables[table_name]
             self._frames_client.create(
                 backend=_TSDB_BE,
-                table=
+                table=table,
                 if_exists=IGNORE,
                 rate=_TSDB_RATE,
             )
@@ -134,6 +153,27 @@ class V3IOTSDBConnector(TSDBConnector):
         - custom_metrics (user-defined metrics)
         """
 
+        # Write latency per prediction, labeled by endpoint ID only
+        graph.add_step(
+            "storey.TSDBTarget",
+            name="tsdb_predictions",
+            after="MapFeatureNames",
+            path=f"{self.container}/{self.tables[mm_schemas.FileTargetKind.PREDICTIONS]}",
+            rate="1/s",
+            time_col=mm_schemas.EventFieldType.TIMESTAMP,
+            container=self.container,
+            v3io_frames=self.v3io_framesd,
+            columns=["latency"],
+            index_cols=[
+                mm_schemas.EventFieldType.ENDPOINT_ID,
+            ],
+            aggr="count,avg",
+            aggr_granularity="1m",
+            max_events=tsdb_batching_max_events,
+            flush_after_seconds=tsdb_batching_timeout_secs,
+            key=mm_schemas.EventFieldType.ENDPOINT_ID,
+        )
+
         # Before writing data to TSDB, create dictionary of 2-3 dictionaries that contains
         # stats and details about the events
 
@@ -160,40 +200,40 @@ class V3IOTSDBConnector(TSDBConnector):
                 "storey.TSDBTarget",
                 name=name,
                 after=after,
-                path=f"{self.container}/{self.tables[
+                path=f"{self.container}/{self.tables[mm_schemas.V3IOTSDBTables.EVENTS]}",
                 rate="10/m",
-                time_col=
+                time_col=mm_schemas.EventFieldType.TIMESTAMP,
                 container=self.container,
                 v3io_frames=self.v3io_framesd,
                 infer_columns_from_data=True,
                 index_cols=[
-
-
-
+                    mm_schemas.EventFieldType.ENDPOINT_ID,
+                    mm_schemas.EventFieldType.RECORD_TYPE,
+                    mm_schemas.EventFieldType.ENDPOINT_TYPE,
                 ],
                 max_events=tsdb_batching_max_events,
                 flush_after_seconds=tsdb_batching_timeout_secs,
-                key=
+                key=mm_schemas.EventFieldType.ENDPOINT_ID,
            )
 
         # unpacked base_metrics dictionary
         apply_filter_and_unpacked_keys(
             name="FilterAndUnpackKeys1",
-            keys=
+            keys=mm_schemas.EventKeyMetrics.BASE_METRICS,
         )
         apply_tsdb_target(name="tsdb1", after="FilterAndUnpackKeys1")
 
         # unpacked endpoint_features dictionary
         apply_filter_and_unpacked_keys(
             name="FilterAndUnpackKeys2",
-            keys=
+            keys=mm_schemas.EventKeyMetrics.ENDPOINT_FEATURES,
         )
         apply_tsdb_target(name="tsdb2", after="FilterAndUnpackKeys2")
 
         # unpacked custom_metrics dictionary. In addition, use storey.Filter remove none values
         apply_filter_and_unpacked_keys(
             name="FilterAndUnpackKeys3",
-            keys=
+            keys=mm_schemas.EventKeyMetrics.CUSTOM_METRICS,
         )
 
         def apply_storey_filter():
@@ -210,56 +250,55 @@ class V3IOTSDBConnector(TSDBConnector):
     def write_application_event(
         self,
         event: dict,
-        kind:
-    ):
+        kind: mm_schemas.WriterEventKind = mm_schemas.WriterEventKind.RESULT,
+    ) -> None:
         """Write a single result or metric to TSDB"""
 
-        event[
-
-            event[mm_constants.WriterEvent.END_INFER_TIME]
-        )
+        event[mm_schemas.WriterEvent.END_INFER_TIME] = datetime.datetime.fromisoformat(
+            event[mm_schemas.WriterEvent.END_INFER_TIME]
        )
+        index_cols_base = [
+            mm_schemas.WriterEvent.END_INFER_TIME,
+            mm_schemas.WriterEvent.ENDPOINT_ID,
+            mm_schemas.WriterEvent.APPLICATION_NAME,
+        ]
 
-        if kind ==
-
-
+        if kind == mm_schemas.WriterEventKind.METRIC:
+            table = self.tables[mm_schemas.V3IOTSDBTables.METRICS]
+            index_cols = index_cols_base + [mm_schemas.MetricData.METRIC_NAME]
+        elif kind == mm_schemas.WriterEventKind.RESULT:
+            table = self.tables[mm_schemas.V3IOTSDBTables.APP_RESULTS]
+            index_cols = index_cols_base + [mm_schemas.ResultData.RESULT_NAME]
+            del event[mm_schemas.ResultData.RESULT_EXTRA_DATA]
+        else:
+            raise ValueError(f"Invalid {kind = }")
 
-        del event[mm_constants.ResultData.RESULT_EXTRA_DATA]
         try:
             self._frames_client.write(
                 backend=_TSDB_BE,
-                table=
+                table=table,
                 dfs=pd.DataFrame.from_records([event]),
-                index_cols=
-                    mm_constants.WriterEvent.END_INFER_TIME,
-                    mm_constants.WriterEvent.ENDPOINT_ID,
-                    mm_constants.WriterEvent.APPLICATION_NAME,
-                    mm_constants.ResultData.RESULT_NAME,
-                ],
-            )
-            logger.info(
-                "Updated V3IO TSDB successfully",
-                table=self.tables[mm_constants.MonitoringTSDBTables.APP_RESULTS],
+                index_cols=index_cols,
             )
+            logger.info("Updated V3IO TSDB successfully", table=table)
         except v3io_frames.errors.Error as err:
-            logger.
+            logger.exception(
                 "Could not write drift measures to TSDB",
                 err=err,
-                table=
+                table=table,
                 event=event,
             )
-
             raise mlrun.errors.MLRunRuntimeError(
                 f"Failed to write application result to TSDB: {err}"
             )
 
-    def delete_tsdb_resources(self, table: str = None):
+    def delete_tsdb_resources(self, table: typing.Optional[str] = None):
         if table:
             # Delete a specific table
             tables = [table]
         else:
             # Delete all tables
-            tables =
+            tables = mm_schemas.V3IOTSDBTables.list()
         for table in tables:
             try:
                 self._frames_client.delete(
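The reworked write_application_event dispatches on the new `kind` argument: metric events are routed to the metrics table and indexed with MetricData.METRIC_NAME, while result events keep the app-results table, add ResultData.RESULT_NAME to the index, and drop the extra-data field before the write. A minimal usage sketch based only on the signatures visible in this hunk; the endpoint, application, and result values are made up, and a real writer event carries more fields than the ones this method touches:

import mlrun.common.schemas.model_monitoring as mm_schemas
from mlrun.model_monitoring.db.tsdb.v3io.v3io_connector import V3IOTSDBConnector

# Illustrative event payload: only the keys referenced in the diff are filled in.
event = {
    mm_schemas.WriterEvent.END_INFER_TIME: "2024-01-01T00:00:00+00:00",  # parsed via fromisoformat
    mm_schemas.WriterEvent.ENDPOINT_ID: "endpoint-1",
    mm_schemas.WriterEvent.APPLICATION_NAME: "my-monitoring-app",
    mm_schemas.ResultData.RESULT_NAME: "data-drift",
    mm_schemas.ResultData.RESULT_EXTRA_DATA: "{}",  # removed before the TSDB write for results
}

connector = V3IOTSDBConnector(project="my-project", create_table=True)

# kind defaults to RESULT; passing METRIC routes the event to the metrics table instead.
connector.write_application_event(event, kind=mm_schemas.WriterEventKind.RESULT)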
@@ -282,8 +321,8 @@ class V3IOTSDBConnector(TSDBConnector):
         self,
         endpoint_id: str,
         metrics: list[str],
-        start: str
-        end: str
+        start: str,
+        end: str,
     ) -> dict[str, list[tuple[str, float]]]:
         """
         Getting real time metrics from the TSDB. There are pre-defined metrics for model endpoints such as
@@ -312,7 +351,7 @@ class V3IOTSDBConnector(TSDBConnector):
 
         try:
             data = self.get_records(
-                table=
+                table=mm_schemas.V3IOTSDBTables.EVENTS,
                 columns=["endpoint_id", *metrics],
                 filter_query=f"endpoint_id=='{endpoint_id}'",
                 start=start,
@@ -339,17 +378,14 @@ class V3IOTSDBConnector(TSDBConnector):
     def get_records(
         self,
         table: str,
-
+        start: str,
+        end: str,
+        columns: typing.Optional[list[str]] = None,
         filter_query: str = "",
-        start: str = "now-1h",
-        end: str = "now",
     ) -> pd.DataFrame:
         """
         Getting records from V3IO TSDB data collection.
         :param table: Path to the collection to query.
-        :param columns: Columns to include in the result.
-        :param filter_query: V3IO filter expression. The expected filter expression includes different conditions,
-            divided by ' AND '.
         :param start: The start time of the metrics. Can be represented by a string containing an RFC 3339
             time, a Unix timestamp in milliseconds, a relative time (`'now'` or
             `'now-[0-9]+[mhd]'`, where `m` = minutes, `h` = hours, `'d'` = days, and
@@ -358,6 +394,9 @@ class V3IOTSDBConnector(TSDBConnector):
             time, a Unix timestamp in milliseconds, a relative time (`'now'` or
             `'now-[0-9]+[mhd]'`, where `m` = minutes, `h` = hours, `'d'` = days, and
             `'s'` = seconds), or 0 for the earliest time.
+        :param columns: Columns to include in the result.
+        :param filter_query: V3IO filter expression. The expected filter expression includes different conditions,
+            divided by ' AND '.
         :return: DataFrame with the provided attributes from the data collection.
         :raise: MLRunNotFoundError if the provided table wasn't found.
         """
@@ -384,7 +423,7 @@ class V3IOTSDBConnector(TSDBConnector):
         """
         events_table_full_path = mlrun.mlconf.get_model_monitoring_file_target_path(
             project=self.project,
-            kind=
+            kind=mm_schemas.FileTargetKind.EVENTS,
        )
 
         # Generate the main directory with the V3IO resources
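Beyond the enum renames, the connector's query surface also changed: get_records now requires explicit start and end values (the "now-1h"/"now" defaults were removed), and delete_tsdb_resources with no argument removes every known V3IO TSDB table. A hedged sketch of the updated calls; the project name, metric column, and endpoint ID are illustrative:

import mlrun.common.schemas.model_monitoring as mm_schemas
from mlrun.model_monitoring.db.tsdb.v3io.v3io_connector import V3IOTSDBConnector

connector = V3IOTSDBConnector(project="my-project")

# start/end are now mandatory keyword arguments of get_records.
df = connector.get_records(
    table=mm_schemas.V3IOTSDBTables.EVENTS,
    start="now-1h",
    end="now",
    columns=["endpoint_id", "latency_avg_5m"],  # "latency_avg_5m" is a hypothetical metric column
    filter_query="endpoint_id=='endpoint-1'",
)

# With no table name, every model monitoring TSDB table of the project is deleted.
connector.delete_tsdb_resources()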
--- mlrun/model_monitoring/db/v3io_tsdb_reader.py (1.7.0rc16)
+++ mlrun/model_monitoring/db/v3io_tsdb_reader.py (1.7.0rc18)
@@ -12,10 +12,12 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-# TODO: Move this module into the TSDB abstraction
+# TODO: Move this module into the TSDB abstraction:
+# mlrun/model_monitoring/db/tsdb/v3io/v3io_connector.py
 
 from datetime import datetime
 from io import StringIO
+from typing import Literal, Optional, Union
 
 import pandas as pd
 
@@ -25,21 +27,27 @@ import mlrun.model_monitoring.writer as mm_writer
 import mlrun.utils.v3io_clients
 from mlrun.common.schemas.model_monitoring.model_endpoints import (
     ModelEndpointMonitoringMetric,
+    ModelEndpointMonitoringMetricNoData,
     ModelEndpointMonitoringMetricType,
-
+    ModelEndpointMonitoringMetricValues,
     ModelEndpointMonitoringResultValues,
     _compose_full_name,
-
+    _ModelEndpointMonitoringMetricValuesBase,
 )
 from mlrun.model_monitoring.db.stores.v3io_kv.kv_store import KVStoreBase
 from mlrun.model_monitoring.db.tsdb.v3io.v3io_connector import _TSDB_BE
 from mlrun.utils import logger
 
 
-def _get_sql_query(
+def _get_sql_query(
+    endpoint_id: str,
+    names: list[tuple[str, str]],
+    table_name: str = mm_constants.V3IOTSDBTables.APP_RESULTS,
+    name: str = mm_writer.ResultData.RESULT_NAME,
+) -> str:
     with StringIO() as query:
         query.write(
-            f"SELECT * FROM '{
+            f"SELECT * FROM '{table_name}' "
             f"WHERE {mm_writer.WriterEvent.ENDPOINT_ID}='{endpoint_id}'"
         )
         if names:
@@ -48,7 +56,7 @@ def _get_sql_query(endpoint_id: str, names: list[tuple[str, str]]) -> str:
             for i, (app_name, result_name) in enumerate(names):
                 sub_cond = (
                     f"({mm_writer.WriterEvent.APPLICATION_NAME}='{app_name}' "
-                    f"AND {
+                    f"AND {name}='{result_name}')"
                 )
                 if i != 0:  # not first sub condition
                     query.write(" OR ")
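The private query builder is now parameterized by table and name column, so the same helper can target either the app-results or the metrics table. A rough, hypothetical illustration with made-up endpoint, application, and result names; with the default table_name/name arguments it targets the app-results table, and the exact table and column strings depend on the enum values:

from mlrun.model_monitoring.db.v3io_tsdb_reader import _get_sql_query

query = _get_sql_query(
    "endpoint-1",
    [("my-app", "data-drift"), ("my-app", "concept-drift")],
)
# Roughly: SELECT * FROM '<app-results table>' WHERE endpoint_id='endpoint-1'
#          AND ((application_name='my-app' AND result_name='data-drift')
#           OR (application_name='my-app' AND result_name='concept-drift'))
print(query)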
@@ -73,30 +81,87 @@ def _get_result_kind(result_df: pd.DataFrame) -> mm_constants.ResultKindApp:
     return unique_kinds[0]
 
 
-def
+def read_metrics_data(
     *,
     project: str,
     endpoint_id: str,
     start: datetime,
     end: datetime,
     metrics: list[ModelEndpointMonitoringMetric],
-
+    type: Literal["metrics", "results"] = "results",
+) -> Union[
+    list[
+        Union[
+            ModelEndpointMonitoringResultValues,
+            ModelEndpointMonitoringMetricNoData,
+        ],
+    ],
+    list[
+        Union[
+            ModelEndpointMonitoringMetricValues,
+            ModelEndpointMonitoringMetricNoData,
+        ],
+    ],
+]:
+    """
+    Read metrics OR results from the TSDB and return as a list.
+    Note: the type must match the actual metrics in the `metrics` parameter.
+    If the type is "results", pass only results in the `metrics` parameter.
+    """
     client = mlrun.utils.v3io_clients.get_frames_client(
         address=mlrun.mlconf.v3io_framesd,
         container=KVStoreBase.get_v3io_monitoring_apps_container(project),
     )
+
+    if type == "metrics":
+        table_name = mm_constants.V3IOTSDBTables.METRICS
+        name = mm_constants.MetricData.METRIC_NAME
+        df_handler = df_to_metrics_values
+    elif type == "results":
+        table_name = mm_constants.V3IOTSDBTables.APP_RESULTS
+        name = mm_constants.ResultData.RESULT_NAME
+        df_handler = df_to_results_values
+    else:
+        raise ValueError(f"Invalid {type = }")
+
+    query = _get_sql_query(
+        endpoint_id,
+        [(metric.app, metric.name) for metric in metrics],
+        table_name=table_name,
+        name=name,
+    )
+
+    logger.debug("Querying V3IO TSDB", query=query)
+
     df: pd.DataFrame = client.read(
         backend=_TSDB_BE,
-        query=
-            endpoint_id, [(metric.app, metric.name) for metric in metrics]
-        ),
+        query=query,
         start=start,
         end=end,
     )
 
+    logger.debug(
+        "Read a data-frame", project=project, endpoint_id=endpoint_id, is_empty=df.empty
+    )
+
+    return df_handler(df=df, metrics=metrics, project=project)
+
+
+def df_to_results_values(
+    *, df: pd.DataFrame, metrics: list[ModelEndpointMonitoringMetric], project: str
+) -> list[
+    Union[ModelEndpointMonitoringResultValues, ModelEndpointMonitoringMetricNoData]
+]:
+    """
+    Parse a time-indexed data-frame of results from the TSDB into a list of
+    results values per distinct results.
+    When a result is not found in the data-frame, it is represented in no-data object.
+    """
     metrics_without_data = {metric.full_name: metric for metric in metrics}
 
-    metrics_values: list[
+    metrics_values: list[
+        Union[ModelEndpointMonitoringResultValues, ModelEndpointMonitoringMetricNoData]
+    ] = []
     if not df.empty:
         grouped = df.groupby(
             [mm_writer.WriterEvent.APPLICATION_NAME, mm_writer.ResultData.RESULT_NAME],
@@ -104,13 +169,13 @@ def read_data(
         )
     else:
         grouped = []
-
+        logger.debug("No results", missing_results=metrics_without_data.keys())
+    for (app_name, name), sub_df in grouped:
         result_kind = _get_result_kind(sub_df)
-        full_name = _compose_full_name(project=project, app=app_name, name=
+        full_name = _compose_full_name(project=project, app=app_name, name=name)
         metrics_values.append(
             ModelEndpointMonitoringResultValues(
                 full_name=full_name,
-                type=ModelEndpointMonitoringMetricType.RESULT,
                 result_kind=result_kind,
                 values=list(
                     zip(
@@ -124,11 +189,147 @@ def read_data(
             del metrics_without_data[full_name]
 
     for metric in metrics_without_data.values():
+        if metric.full_name == get_invocations_fqn(project):
+            continue
         metrics_values.append(
-
+            ModelEndpointMonitoringMetricNoData(
                 full_name=metric.full_name,
                 type=ModelEndpointMonitoringMetricType.RESULT,
             )
         )
 
     return metrics_values
+
+
+def df_to_metrics_values(
+    *, df: pd.DataFrame, metrics: list[ModelEndpointMonitoringMetric], project: str
+) -> list[
+    Union[ModelEndpointMonitoringMetricValues, ModelEndpointMonitoringMetricNoData]
+]:
+    """
+    Parse a time-indexed data-frame of metrics from the TSDB into a list of
+    metrics values per distinct results.
+    When a metric is not found in the data-frame, it is represented in no-data object.
+    """
+    metrics_without_data = {metric.full_name: metric for metric in metrics}
+
+    metrics_values: list[
+        Union[ModelEndpointMonitoringMetricValues, ModelEndpointMonitoringMetricNoData]
+    ] = []
+    if not df.empty:
+        grouped = df.groupby(
+            [mm_writer.WriterEvent.APPLICATION_NAME, mm_writer.MetricData.METRIC_NAME],
+            observed=False,
+        )
+    else:
+        logger.debug("No metrics", missing_metrics=metrics_without_data.keys())
+        grouped = []
+    for (app_name, name), sub_df in grouped:
+        full_name = _compose_full_name(
+            project=project,
+            app=app_name,
+            name=name,
+            type=ModelEndpointMonitoringMetricType.METRIC,
+        )
+        metrics_values.append(
+            ModelEndpointMonitoringMetricValues(
+                full_name=full_name,
+                values=list(
+                    zip(
+                        sub_df.index,
+                        sub_df[mm_writer.MetricData.METRIC_VALUE],
+                    )
+                ),  # pyright: ignore[reportArgumentType]
+            )
+        )
+        del metrics_without_data[full_name]
+
+    for metric in metrics_without_data.values():
+        metrics_values.append(
+            ModelEndpointMonitoringMetricNoData(
+                full_name=metric.full_name,
+                type=ModelEndpointMonitoringMetricType.METRIC,
+            )
+        )
+
+    return metrics_values
+
+
+def get_invocations_fqn(project: str):
+    return mlrun.common.schemas.model_monitoring.model_endpoints._compose_full_name(
+        project=project,
+        app=mm_constants.SpecialApps.MLRUN_INFRA,
+        name=mlrun.common.schemas.model_monitoring.constants.PredictionsQueryConstants.INVOCATIONS,
+        type=mlrun.common.schemas.model_monitoring.ModelEndpointMonitoringMetricType.METRIC,
+    )
+
+
+def read_predictions(
+    *,
+    project: str,
+    endpoint_id: str,
+    start: Optional[Union[datetime, str]] = None,
+    end: Optional[Union[datetime, str]] = None,
+    aggregation_window: Optional[str] = None,
+    limit: Optional[int] = None,
+) -> _ModelEndpointMonitoringMetricValuesBase:
+    client = mlrun.utils.v3io_clients.get_frames_client(
+        address=mlrun.mlconf.v3io_framesd,
+        container="users",
+    )
+    frames_client_kwargs = {}
+    if aggregation_window:
+        frames_client_kwargs["step"] = aggregation_window
+        frames_client_kwargs["aggregation_window"] = aggregation_window
+    if limit:
+        frames_client_kwargs["limit"] = limit
+    df: pd.DataFrame = client.read(
+        backend=_TSDB_BE,
+        table=f"pipelines/{project}/model-endpoints/predictions",
+        columns=["latency"],
+        filter=f"endpoint_id=='{endpoint_id}'",
+        start=start,
+        end=end,
+        aggregators="count",
+        **frames_client_kwargs,
+    )
+
+    full_name = get_invocations_fqn(project)
+
+    if df.empty:
+        return ModelEndpointMonitoringMetricNoData(
+            full_name=full_name,
+            type=ModelEndpointMonitoringMetricType.METRIC,
+        )
+
+    return ModelEndpointMonitoringMetricValues(
+        full_name=full_name,
+        values=list(
+            zip(
+                df.index,
+                df["count(latency)"],
+            )
+        ),
+    )
+
+
+def read_prediction_metric_for_endpoint_if_exists(
+    *,
+    project: str,
+    endpoint_id: str,
+) -> Optional[ModelEndpointMonitoringMetric]:
+    predictions = read_predictions(
+        project=project,
+        endpoint_id=endpoint_id,
+        start="0",
+        end="now",
+        limit=1,  # Read just one record, because we just want to check if there is any data for this endpoint_id
+    )
+    if predictions:
+        return ModelEndpointMonitoringMetric(
+            project=project,
+            app=mm_constants.SpecialApps.MLRUN_INFRA,
+            type=ModelEndpointMonitoringMetricType.METRIC,
+            name=mlrun.common.schemas.model_monitoring.constants.PredictionsQueryConstants.INVOCATIONS,
+            full_name=get_invocations_fqn(project),
+        )