mlrun 1.7.0rc15__py3-none-any.whl → 1.7.0rc17__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of mlrun might be problematic. Click here for more details.

Files changed (77)
  1. mlrun/__init__.py +10 -1
  2. mlrun/__main__.py +18 -4
  3. mlrun/alerts/__init__.py +15 -0
  4. mlrun/alerts/alert.py +144 -0
  5. mlrun/artifacts/__init__.py +7 -1
  6. mlrun/artifacts/base.py +28 -3
  7. mlrun/artifacts/dataset.py +8 -0
  8. mlrun/artifacts/manager.py +18 -0
  9. mlrun/artifacts/model.py +8 -1
  10. mlrun/artifacts/plots.py +13 -0
  11. mlrun/common/schemas/__init__.py +10 -2
  12. mlrun/common/schemas/alert.py +64 -5
  13. mlrun/common/schemas/api_gateway.py +4 -0
  14. mlrun/common/schemas/artifact.py +15 -0
  15. mlrun/common/schemas/auth.py +2 -0
  16. mlrun/common/schemas/model_monitoring/__init__.py +4 -1
  17. mlrun/common/schemas/model_monitoring/constants.py +17 -1
  18. mlrun/common/schemas/model_monitoring/model_endpoints.py +60 -1
  19. mlrun/common/schemas/project.py +5 -1
  20. mlrun/config.py +11 -4
  21. mlrun/datastore/datastore_profile.py +10 -7
  22. mlrun/db/base.py +24 -4
  23. mlrun/db/httpdb.py +97 -43
  24. mlrun/db/nopdb.py +25 -4
  25. mlrun/errors.py +5 -0
  26. mlrun/launcher/base.py +3 -2
  27. mlrun/lists.py +4 -0
  28. mlrun/model.py +15 -8
  29. mlrun/model_monitoring/__init__.py +1 -1
  30. mlrun/model_monitoring/applications/_application_steps.py +1 -2
  31. mlrun/model_monitoring/applications/context.py +1 -1
  32. mlrun/model_monitoring/applications/histogram_data_drift.py +64 -38
  33. mlrun/model_monitoring/db/__init__.py +2 -0
  34. mlrun/model_monitoring/db/stores/base/store.py +9 -36
  35. mlrun/model_monitoring/db/stores/sqldb/sql_store.py +63 -110
  36. mlrun/model_monitoring/db/stores/v3io_kv/kv_store.py +56 -202
  37. mlrun/model_monitoring/db/tsdb/__init__.py +71 -0
  38. mlrun/model_monitoring/db/tsdb/base.py +135 -0
  39. mlrun/model_monitoring/db/tsdb/v3io/__init__.py +15 -0
  40. mlrun/model_monitoring/db/tsdb/v3io/stream_graph_steps.py +117 -0
  41. mlrun/model_monitoring/db/tsdb/v3io/v3io_connector.py +442 -0
  42. mlrun/model_monitoring/db/v3io_tsdb_reader.py +134 -0
  43. mlrun/model_monitoring/stream_processing.py +46 -210
  44. mlrun/model_monitoring/writer.py +50 -100
  45. mlrun/platforms/__init__.py +10 -9
  46. mlrun/platforms/iguazio.py +19 -200
  47. mlrun/projects/operations.py +11 -7
  48. mlrun/projects/pipelines.py +13 -76
  49. mlrun/projects/project.py +62 -17
  50. mlrun/render.py +9 -3
  51. mlrun/run.py +5 -38
  52. mlrun/runtimes/__init__.py +1 -0
  53. mlrun/runtimes/base.py +3 -3
  54. mlrun/runtimes/kubejob.py +2 -1
  55. mlrun/runtimes/nuclio/api_gateway.py +163 -77
  56. mlrun/runtimes/nuclio/application/application.py +160 -7
  57. mlrun/runtimes/nuclio/function.py +25 -45
  58. mlrun/runtimes/pod.py +16 -36
  59. mlrun/runtimes/remotesparkjob.py +1 -1
  60. mlrun/runtimes/sparkjob/spark3job.py +1 -1
  61. mlrun/runtimes/utils.py +0 -38
  62. mlrun/track/tracker.py +2 -1
  63. mlrun/utils/helpers.py +51 -31
  64. mlrun/utils/logger.py +11 -6
  65. mlrun/utils/notifications/notification/base.py +1 -1
  66. mlrun/utils/notifications/notification/slack.py +9 -4
  67. mlrun/utils/notifications/notification/webhook.py +1 -1
  68. mlrun/utils/notifications/notification_pusher.py +21 -14
  69. mlrun/utils/version/version.json +2 -2
  70. {mlrun-1.7.0rc15.dist-info → mlrun-1.7.0rc17.dist-info}/METADATA +4 -3
  71. {mlrun-1.7.0rc15.dist-info → mlrun-1.7.0rc17.dist-info}/RECORD +75 -69
  72. mlrun/kfpops.py +0 -860
  73. mlrun/platforms/other.py +0 -305
  74. {mlrun-1.7.0rc15.dist-info → mlrun-1.7.0rc17.dist-info}/LICENSE +0 -0
  75. {mlrun-1.7.0rc15.dist-info → mlrun-1.7.0rc17.dist-info}/WHEEL +0 -0
  76. {mlrun-1.7.0rc15.dist-info → mlrun-1.7.0rc17.dist-info}/entry_points.txt +0 -0
  77. {mlrun-1.7.0rc15.dist-info → mlrun-1.7.0rc17.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,117 @@
1
+ # Copyright 2024 Iguazio
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ #
15
+
16
+ import mlrun.feature_store.steps
17
+ from mlrun.common.schemas.model_monitoring import (
18
+ EventFieldType,
19
+ EventKeyMetrics,
20
+ EventLiveStats,
21
+ )
22
+
23
+
24
class ProcessBeforeTSDB(mlrun.feature_store.steps.MapClass):
    """
    Assemble the per-event dictionaries that are written to the TSDB.

    The step emits a dictionary holding 2-3 inner dictionaries:
    1. base_metrics: average latency and prediction counters over time
       (computed earlier in the graph by storey.AggregateByKey).
    2. endpoint_features: feature names and values along with the prediction
       names and values.
    3. custom_metrics (optional): metrics provided by the user.
    """

    def __init__(self, **kwargs):
        """
        :returns: Dictionary of 2-3 dictionaries that contains stats and details about the events.
        """
        super().__init__(**kwargs)

    def do(self, event):
        # Derive predictions-per-second from the 5-minute counter (300 seconds)
        event[EventLiveStats.PREDICTIONS_PER_SECOND] = (
            float(event[EventLiveStats.PREDICTIONS_COUNT_5M]) / 300
        )

        # Identifiers shared by every produced dictionary:
        # event timestamp, endpoint id and endpoint type
        shared_fields = {
            field: event[field]
            for field in (
                EventFieldType.TIMESTAMP,
                EventFieldType.ENDPOINT_ID,
                EventFieldType.ENDPOINT_TYPE,
            )
        }

        # Live latency/prediction statistics computed by the stream graph
        live_stat_keys = (
            EventLiveStats.PREDICTIONS_PER_SECOND,
            EventLiveStats.PREDICTIONS_COUNT_5M,
            EventLiveStats.PREDICTIONS_COUNT_1H,
            EventLiveStats.LATENCY_AVG_5M,
            EventLiveStats.LATENCY_AVG_1H,
        )
        base_metrics = {
            EventFieldType.RECORD_TYPE: EventKeyMetrics.BASE_METRICS,
            **{stat: event[stat] for stat in live_stat_keys},
            **shared_fields,
        }

        # Per-event feature values and prediction values
        endpoint_features = {
            EventFieldType.RECORD_TYPE: EventKeyMetrics.ENDPOINT_FEATURES,
            **event[EventFieldType.NAMED_PREDICTIONS],
            **event[EventFieldType.NAMED_FEATURES],
            **shared_fields,
        }

        # Dictionary that includes both base_metrics and endpoint_features
        processed = {
            EventKeyMetrics.BASE_METRICS: base_metrics,
            EventKeyMetrics.ENDPOINT_FEATURES: endpoint_features,
        }

        # Custom metrics are attached only when the user supplied any
        if event[EventFieldType.METRICS]:
            processed[EventKeyMetrics.CUSTOM_METRICS] = {
                EventFieldType.RECORD_TYPE: EventKeyMetrics.CUSTOM_METRICS,
                **event[EventFieldType.METRICS],
                **shared_fields,
            }

        return processed
90
+
91
+
92
class FilterAndUnpackKeys(mlrun.feature_store.steps.MapClass):
    def __init__(self, keys, **kwargs):
        """
        Create unpacked event dictionary based on provided key metrics (base_metrics, endpoint_features,
        or custom_metric). Please note that the next step of the TSDB target requires an unpacked dictionary.

        :param keys: list of key metrics.

        :returns: An unpacked dictionary of event filtered by the provided key metrics.
        """
        super().__init__(**kwargs)
        self.keys = keys

    def do(self, event):
        # Merge the inner dictionaries of the requested key metrics into one
        # flat dictionary. The original implementation first copied the
        # selected entries into an intermediate dict and then re-checked
        # membership in self.keys; that second check was always true by
        # construction, so the unreachable else-branch and the extra loop
        # are removed here.
        unpacked = {}
        for key in self.keys:
            if key in event:
                # Each selected entry is itself a dictionary - unpack it
                unpacked.update(event[key])

        # Returning None lets a downstream storey.Filter drop empty events
        return unpacked if unpacked else None
@@ -0,0 +1,442 @@
1
+ # Copyright 2024 Iguazio
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import datetime
16
+
17
+ import pandas as pd
18
+ import v3io_frames.client
19
+ import v3io_frames.errors
20
+ from v3io.dataplane import Client as V3IOClient
21
+ from v3io_frames.frames_pb2 import IGNORE
22
+
23
+ import mlrun.common.model_monitoring
24
+ import mlrun.common.schemas.model_monitoring as mm_constants
25
+ import mlrun.feature_store.steps
26
+ import mlrun.utils.v3io_clients
27
+ from mlrun.model_monitoring.db import TSDBConnector
28
+ from mlrun.utils import logger
29
+
30
+ _TSDB_BE = "tsdb"
31
+ _TSDB_RATE = "1/s"
32
+
33
+
34
class V3IOTSDBConnector(TSDBConnector):
    """
    Handles the TSDB operations when the TSDB connector is of type V3IO. To manage these operations we use V3IO Frames
    Client that provides API for executing commands on the V3IO TSDB table.
    """

    def __init__(
        self,
        project: str,
        access_key: str = None,
        container: str = "users",
        v3io_framesd: str = None,
        create_table: bool = False,
    ):
        """
        :param project:      Project name.
        :param access_key:   V3IO access key. Defaults to the key resolved by
                             `mlrun.mlconf.get_v3io_access_key()`.
        :param container:    V3IO container in which the TSDB tables reside.
        :param v3io_framesd: V3IO Frames service address. Defaults to
                             `mlrun.mlconf.v3io_framesd`.
        :param create_table: If True, create the application result/metric
                             tables immediately on construction.
        """
        super().__init__(project=project)
        self.access_key = access_key or mlrun.mlconf.get_v3io_access_key()

        self.container = container

        self.v3io_framesd = v3io_framesd or mlrun.mlconf.v3io_framesd
        self._frames_client: v3io_frames.client.ClientBase = (
            self._get_v3io_frames_client(self.container)
        )
        self._v3io_client: V3IOClient = mlrun.utils.v3io_clients.get_v3io_client(
            endpoint=mlrun.mlconf.v3io_api,
        )

        # Resolve the per-project table paths (events, app results, metrics,
        # predictions) used by the read/write methods below
        self._init_tables_path()

        if create_table:
            self.create_tsdb_application_tables()
65
+
66
    def _init_tables_path(self):
        """
        Populate `self.tables` with the resolved V3IO paths of the TSDB tables
        managed by this connector: events, application results, application
        metrics, and per-endpoint predictions.
        """
        self.tables = {}

        # Events table: live stats written by the monitoring stream graph
        events_table_full_path = mlrun.mlconf.get_model_monitoring_file_target_path(
            project=self.project,
            kind=mm_constants.FileTargetKind.EVENTS,
        )
        (
            _,
            _,
            events_path,
        ) = mlrun.common.model_monitoring.helpers.parse_model_endpoint_store_prefix(
            events_table_full_path
        )
        self.tables[mm_constants.MonitoringTSDBTables.EVENTS] = events_path

        # Application tables: detailed results and single-value metrics share
        # the same base path and differ only by the appended table name
        monitoring_application_full_path = (
            mlrun.mlconf.get_model_monitoring_file_target_path(
                project=self.project,
                kind=mm_constants.FileTargetKind.MONITORING_APPLICATION,
            )
        )
        (
            _,
            _,
            monitoring_application_path,
        ) = mlrun.common.model_monitoring.helpers.parse_model_endpoint_store_prefix(
            monitoring_application_full_path
        )
        self.tables[mm_constants.MonitoringTSDBTables.APP_RESULTS] = (
            monitoring_application_path + mm_constants.MonitoringTSDBTables.APP_RESULTS
        )
        self.tables[mm_constants.MonitoringTSDBTables.METRICS] = (
            monitoring_application_path + mm_constants.MonitoringTSDBTables.METRICS
        )

        # Predictions table: per-prediction latency keyed by endpoint id
        monitoring_predictions_full_path = (
            mlrun.mlconf.get_model_monitoring_file_target_path(
                project=self.project,
                kind=mm_constants.FileTargetKind.PREDICTIONS,
            )
        )
        (
            _,
            _,
            monitoring_predictions_path,
        ) = mlrun.common.model_monitoring.helpers.parse_model_endpoint_store_prefix(
            monitoring_predictions_full_path
        )
        self.tables[mm_constants.FileTargetKind.PREDICTIONS] = (
            monitoring_predictions_path
        )
118
+
119
+ def create_tsdb_application_tables(self):
120
+ """
121
+ Create the application tables using the TSDB connector. At the moment we support 2 types of application tables:
122
+ - app_results: a detailed result that includes status, kind, extra data, etc.
123
+ - metrics: a basic key value that represents a single numeric metric.
124
+ """
125
+ application_tables = [
126
+ mm_constants.MonitoringTSDBTables.APP_RESULTS,
127
+ mm_constants.MonitoringTSDBTables.METRICS,
128
+ ]
129
+ for table in application_tables:
130
+ logger.info("Creating table in V3IO TSDB", table=table)
131
+ self._frames_client.create(
132
+ backend=_TSDB_BE,
133
+ table=self.tables[table],
134
+ if_exists=IGNORE,
135
+ rate=_TSDB_RATE,
136
+ )
137
+
138
    def apply_monitoring_stream_steps(
        self,
        graph,
        tsdb_batching_max_events: int = 10,
        tsdb_batching_timeout_secs: int = 300,
    ):
        """
        Apply TSDB steps on the provided monitoring graph. Throughout these steps, the graph stores live data of
        different key metric dictionaries.This data is being used by the monitoring dashboards in
        grafana. Results can be found under v3io:///users/pipelines/project-name/model-endpoints/events/.
        In that case, we generate 3 different key metric dictionaries:
        - base_metrics (average latency and predictions over time)
        - endpoint_features (Prediction and feature names and values)
        - custom_metrics (user-defined metrics)

        :param graph:                      The monitoring stream graph to extend.
        :param tsdb_batching_max_events:   Maximum number of events to batch before flushing to the TSDB.
        :param tsdb_batching_timeout_secs: Maximum time (in seconds) to wait before flushing a partial batch.
        """

        # Write latency per prediction, labeled by endpoint ID only
        graph.add_step(
            "storey.TSDBTarget",
            name="tsdb_predictions",
            after="MapFeatureNames",
            path=f"{self.container}/{self.tables[mm_constants.FileTargetKind.PREDICTIONS]}",
            rate="1/s",
            time_col=mm_constants.EventFieldType.TIMESTAMP,
            container=self.container,
            v3io_frames=self.v3io_framesd,
            columns=["latency"],
            index_cols=[
                mm_constants.EventFieldType.ENDPOINT_ID,
            ],
            aggr="count,avg",
            aggr_granularity="1m",
            max_events=tsdb_batching_max_events,
            flush_after_seconds=tsdb_batching_timeout_secs,
            key=mm_constants.EventFieldType.ENDPOINT_ID,
        )

        # Before writing data to TSDB, create dictionary of 2-3 dictionaries that contains
        # stats and details about the events

        def apply_process_before_tsdb():
            # NOTE: "sample" is the name of an upstream step in the graph;
            # renaming steps elsewhere would break this wiring
            graph.add_step(
                "mlrun.model_monitoring.db.tsdb.v3io.stream_graph_steps.ProcessBeforeTSDB",
                name="ProcessBeforeTSDB",
                after="sample",
            )

        apply_process_before_tsdb()

        # Unpack keys from each dictionary and write to TSDB target
        def apply_filter_and_unpacked_keys(name, keys):
            graph.add_step(
                "mlrun.model_monitoring.db.tsdb.v3io.stream_graph_steps.FilterAndUnpackKeys",
                name=name,
                after="ProcessBeforeTSDB",
                keys=[keys],
            )

        def apply_tsdb_target(name, after):
            graph.add_step(
                "storey.TSDBTarget",
                name=name,
                after=after,
                path=f"{self.container}/{self.tables[mm_constants.MonitoringTSDBTables.EVENTS]}",
                rate="10/m",
                time_col=mm_constants.EventFieldType.TIMESTAMP,
                container=self.container,
                v3io_frames=self.v3io_framesd,
                infer_columns_from_data=True,
                index_cols=[
                    mm_constants.EventFieldType.ENDPOINT_ID,
                    mm_constants.EventFieldType.RECORD_TYPE,
                    mm_constants.EventFieldType.ENDPOINT_TYPE,
                ],
                max_events=tsdb_batching_max_events,
                flush_after_seconds=tsdb_batching_timeout_secs,
                key=mm_constants.EventFieldType.ENDPOINT_ID,
            )

        # Unpack the base_metrics dictionary
        apply_filter_and_unpacked_keys(
            name="FilterAndUnpackKeys1",
            keys=mm_constants.EventKeyMetrics.BASE_METRICS,
        )
        apply_tsdb_target(name="tsdb1", after="FilterAndUnpackKeys1")

        # Unpack the endpoint_features dictionary
        apply_filter_and_unpacked_keys(
            name="FilterAndUnpackKeys2",
            keys=mm_constants.EventKeyMetrics.ENDPOINT_FEATURES,
        )
        apply_tsdb_target(name="tsdb2", after="FilterAndUnpackKeys2")

        # Unpack the custom_metrics dictionary. In addition, use storey.Filter
        # to remove None values (events without custom metrics)
        apply_filter_and_unpacked_keys(
            name="FilterAndUnpackKeys3",
            keys=mm_constants.EventKeyMetrics.CUSTOM_METRICS,
        )

        def apply_storey_filter():
            graph.add_step(
                "storey.Filter",
                "FilterNotNone",
                after="FilterAndUnpackKeys3",
                _fn="(event is not None)",
            )

        apply_storey_filter()
        apply_tsdb_target(name="tsdb3", after="FilterNotNone")
247
+
248
+ def write_application_event(
249
+ self,
250
+ event: dict,
251
+ kind: mm_constants.WriterEventKind = mm_constants.WriterEventKind.RESULT,
252
+ ):
253
+ """Write a single result or metric to TSDB"""
254
+
255
+ event[mm_constants.WriterEvent.END_INFER_TIME] = (
256
+ datetime.datetime.fromisoformat(
257
+ event[mm_constants.WriterEvent.END_INFER_TIME]
258
+ )
259
+ )
260
+
261
+ if kind == mm_constants.WriterEventKind.METRIC:
262
+ # TODO : Implement the logic for writing metrics to V3IO TSDB
263
+ return
264
+
265
+ del event[mm_constants.ResultData.RESULT_EXTRA_DATA]
266
+ try:
267
+ self._frames_client.write(
268
+ backend=_TSDB_BE,
269
+ table=self.tables[mm_constants.MonitoringTSDBTables.APP_RESULTS],
270
+ dfs=pd.DataFrame.from_records([event]),
271
+ index_cols=[
272
+ mm_constants.WriterEvent.END_INFER_TIME,
273
+ mm_constants.WriterEvent.ENDPOINT_ID,
274
+ mm_constants.WriterEvent.APPLICATION_NAME,
275
+ mm_constants.ResultData.RESULT_NAME,
276
+ ],
277
+ )
278
+ logger.info(
279
+ "Updated V3IO TSDB successfully",
280
+ table=self.tables[mm_constants.MonitoringTSDBTables.APP_RESULTS],
281
+ )
282
+ except v3io_frames.errors.Error as err:
283
+ logger.warn(
284
+ "Could not write drift measures to TSDB",
285
+ err=err,
286
+ table=self.tables[mm_constants.MonitoringTSDBTables.APP_RESULTS],
287
+ event=event,
288
+ )
289
+
290
+ raise mlrun.errors.MLRunRuntimeError(
291
+ f"Failed to write application result to TSDB: {err}"
292
+ )
293
+
294
+ def delete_tsdb_resources(self, table: str = None):
295
+ if table:
296
+ # Delete a specific table
297
+ tables = [table]
298
+ else:
299
+ # Delete all tables
300
+ tables = mm_constants.MonitoringTSDBTables.list()
301
+ for table in tables:
302
+ try:
303
+ self._frames_client.delete(
304
+ backend=mlrun.common.schemas.model_monitoring.TimeSeriesConnector.TSDB,
305
+ table=table,
306
+ )
307
+ except v3io_frames.errors.DeleteError as e:
308
+ logger.warning(
309
+ f"Failed to delete TSDB table '{table}'",
310
+ err=mlrun.errors.err_to_str(e),
311
+ )
312
+
313
+ # Final cleanup of tsdb path
314
+ tsdb_path = self._get_v3io_source_directory()
315
+ tsdb_path.replace("://u", ":///u")
316
+ store, _, _ = mlrun.store_manager.get_or_create_store(tsdb_path)
317
+ store.rm(tsdb_path, recursive=True)
318
+
319
+ def get_model_endpoint_real_time_metrics(
320
+ self,
321
+ endpoint_id: str,
322
+ metrics: list[str],
323
+ start: str = "now-1h",
324
+ end: str = "now",
325
+ ) -> dict[str, list[tuple[str, float]]]:
326
+ """
327
+ Getting real time metrics from the TSDB. There are pre-defined metrics for model endpoints such as
328
+ `predictions_per_second` and `latency_avg_5m` but also custom metrics defined by the user. Note that these
329
+ metrics are being calculated by the model monitoring stream pod.
330
+ :param endpoint_id: The unique id of the model endpoint.
331
+ :param metrics: A list of real-time metrics to return for the model endpoint.
332
+ :param start: The start time of the metrics. Can be represented by a string containing an RFC 3339
333
+ time, a Unix timestamp in milliseconds, a relative time (`'now'` or
334
+ `'now-[0-9]+[mhd]'`, where `m` = minutes, `h` = hours, `'d'` = days, and
335
+ `'s'` = seconds), or 0 for the earliest time.
336
+ :param end: The end time of the metrics. Can be represented by a string containing an RFC 3339
337
+ time, a Unix timestamp in milliseconds, a relative time (`'now'` or
338
+ `'now-[0-9]+[mhd]'`, where `m` = minutes, `h` = hours, and `'d'` = days, and
339
+ `'s'` = seconds), or 0 for the earliest time.
340
+ :return: A dictionary of metrics in which the key is a metric name and the value is a list of tuples that
341
+ includes timestamps and the values.
342
+ """
343
+
344
+ if not metrics:
345
+ raise mlrun.errors.MLRunInvalidArgumentError(
346
+ "Metric names must be provided"
347
+ )
348
+
349
+ metrics_mapping = {}
350
+
351
+ try:
352
+ data = self.get_records(
353
+ table=mm_constants.MonitoringTSDBTables.EVENTS,
354
+ columns=["endpoint_id", *metrics],
355
+ filter_query=f"endpoint_id=='{endpoint_id}'",
356
+ start=start,
357
+ end=end,
358
+ )
359
+
360
+ # Fill the metrics mapping dictionary with the metric name and values
361
+ data_dict = data.to_dict()
362
+ for metric in metrics:
363
+ metric_data = data_dict.get(metric)
364
+ if metric_data is None:
365
+ continue
366
+
367
+ values = [
368
+ (str(timestamp), value) for timestamp, value in metric_data.items()
369
+ ]
370
+ metrics_mapping[metric] = values
371
+
372
+ except v3io_frames.errors.Error as err:
373
+ logger.warn("Failed to read tsdb", err=err, endpoint=endpoint_id)
374
+
375
+ return metrics_mapping
376
+
377
    def get_records(
        self,
        table: str,
        columns: list[str] = None,
        filter_query: str = "",
        start: str = "now-1h",
        end: str = "now",
    ) -> pd.DataFrame:
        """
        Getting records from V3IO TSDB data collection.

        :param table:        Logical table name; must be a key of `self.tables` (the resolved
                             path is passed to the frames client).
        :param columns:      Columns to include in the result.
        :param filter_query: V3IO filter expression. The expected filter expression includes different conditions,
                             divided by ' AND '.
        :param start:        The start time of the metrics. Can be represented by a string containing an RFC 3339
                             time, a Unix timestamp in milliseconds, a relative time (`'now'` or
                             `'now-[0-9]+[mhd]'`, where `m` = minutes, `h` = hours, `'d'` = days, and
                             `'s'` = seconds), or 0 for the earliest time.
        :param end:          The end time of the metrics, in the same formats as `start`.

        :return: DataFrame with the provided attributes from the data collection.

        :raise: MLRunNotFoundError if the provided table wasn't found.
        """
        if table not in self.tables:
            raise mlrun.errors.MLRunNotFoundError(
                f"Table '{table}' does not exist in the tables list of the TSDB connector."
                f"Available tables: {list(self.tables.keys())}"
            )
        return self._frames_client.read(
            backend=mlrun.common.schemas.model_monitoring.TimeSeriesConnector.TSDB,
            table=self.tables[table],
            columns=columns,
            filter=filter_query,
            start=start,
            end=end,
        )
415
+
416
+ def _get_v3io_source_directory(self) -> str:
417
+ """
418
+ Get the V3IO source directory for the current project. Usually the source directory will
419
+ be under 'v3io:///users/pipelines/<project>'
420
+
421
+ :return: The V3IO source directory for the current project.
422
+ """
423
+ events_table_full_path = mlrun.mlconf.get_model_monitoring_file_target_path(
424
+ project=self.project,
425
+ kind=mm_constants.FileTargetKind.EVENTS,
426
+ )
427
+
428
+ # Generate the main directory with the V3IO resources
429
+ source_directory = (
430
+ mlrun.common.model_monitoring.helpers.parse_model_endpoint_project_prefix(
431
+ events_table_full_path, self.project
432
+ )
433
+ )
434
+
435
+ return source_directory
436
+
437
    @staticmethod
    def _get_v3io_frames_client(v3io_container: str) -> v3io_frames.client.ClientBase:
        # Thin wrapper around the shared frames-client factory; the service
        # address is always taken from the global mlrun configuration
        return mlrun.utils.v3io_clients.get_frames_client(
            address=mlrun.mlconf.v3io_framesd,
            container=v3io_container,
        )
@@ -0,0 +1,134 @@
1
+ # Copyright 2024 Iguazio
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ # TODO: Move this module into the TSDB abstraction once it is in.
16
+
17
+ from datetime import datetime
18
+ from io import StringIO
19
+
20
+ import pandas as pd
21
+
22
+ import mlrun
23
+ import mlrun.common.schemas.model_monitoring.constants as mm_constants
24
+ import mlrun.model_monitoring.writer as mm_writer
25
+ import mlrun.utils.v3io_clients
26
+ from mlrun.common.schemas.model_monitoring.model_endpoints import (
27
+ ModelEndpointMonitoringMetric,
28
+ ModelEndpointMonitoringMetricType,
29
+ ModelEndpointMonitoringResultNoData,
30
+ ModelEndpointMonitoringResultValues,
31
+ _compose_full_name,
32
+ _ModelEndpointMonitoringResultValuesBase,
33
+ )
34
+ from mlrun.model_monitoring.db.stores.v3io_kv.kv_store import KVStoreBase
35
+ from mlrun.model_monitoring.db.tsdb.v3io.v3io_connector import _TSDB_BE
36
+ from mlrun.utils import logger
37
+
38
+
39
def _get_sql_query(endpoint_id: str, names: list[tuple[str, str]]) -> str:
    """
    Build the frames SQL query selecting application results for the given
    endpoint, optionally restricted to specific (application, result) pairs.

    NOTE(review): the query is assembled via string interpolation; values come
    from internal identifiers, but any externally-controlled input here would
    allow query injection - verify callers.
    """
    query = (
        f"SELECT * FROM '{mm_constants.MonitoringTSDBTables.APP_RESULTS}' "
        f"WHERE {mm_writer.WriterEvent.ENDPOINT_ID}='{endpoint_id}'"
    )
    if names:
        # OR together one (application, result) condition per requested pair
        pair_conditions = " OR ".join(
            f"({mm_writer.WriterEvent.APPLICATION_NAME}='{app_name}' "
            f"AND {mm_writer.ResultData.RESULT_NAME}='{result_name}')"
            for app_name, result_name in names
        )
        query += f" AND ({pair_conditions})"
    return query + ";"
61
+
62
+
63
def _get_result_kind(result_df: pd.DataFrame) -> mm_constants.ResultKindApp:
    """
    Return the result kind of the given application-result dataframe.

    All rows of one (application, result) group are expected to share a single
    kind; when several kinds are present, a warning is logged and the first
    unique kind is returned.
    """
    unique_kinds = result_df[mm_writer.ResultData.RESULT_KIND].unique()
    if len(unique_kinds) > 1:
        logger.warning(
            "The result has more than one kind",
            kinds=list(unique_kinds),
            application_name=result_df[mm_writer.WriterEvent.APPLICATION_NAME],
            result_name=result_df[mm_writer.ResultData.RESULT_NAME],
        )
    return unique_kinds[0]
74
+
75
+
76
def read_data(
    *,
    project: str,
    endpoint_id: str,
    start: datetime,
    end: datetime,
    metrics: list[ModelEndpointMonitoringMetric],
) -> list[_ModelEndpointMonitoringResultValuesBase]:
    """
    Read application result values of a model endpoint from the V3IO TSDB.

    :param project:     Project name.
    :param endpoint_id: The unique id of the model endpoint.
    :param start:       Start of the time window to read.
    :param end:         End of the time window to read.
    :param metrics:     The result metrics to read.

    :return: One entry per requested metric - either its values within the
             window, or a "no data" marker when nothing was recorded for it.
    """
    client = mlrun.utils.v3io_clients.get_frames_client(
        address=mlrun.mlconf.v3io_framesd,
        container=KVStoreBase.get_v3io_monitoring_apps_container(project),
    )
    df: pd.DataFrame = client.read(
        backend=_TSDB_BE,
        query=_get_sql_query(
            endpoint_id, [(metric.app, metric.name) for metric in metrics]
        ),
        start=start,
        end=end,
    )

    # Metrics are removed from this dict as data is found for them; whatever
    # remains at the end is reported as "no data"
    metrics_without_data = {metric.full_name: metric for metric in metrics}

    metrics_values: list[_ModelEndpointMonitoringResultValuesBase] = []
    if not df.empty:
        grouped = df.groupby(
            [mm_writer.WriterEvent.APPLICATION_NAME, mm_writer.ResultData.RESULT_NAME],
            observed=False,
        )
    else:
        # groupby on an empty frame would yield nothing anyway; an empty list
        # keeps the loop below trivially empty
        grouped = []
    for (app_name, result_name), sub_df in grouped:
        result_kind = _get_result_kind(sub_df)
        full_name = _compose_full_name(project=project, app=app_name, name=result_name)
        metrics_values.append(
            ModelEndpointMonitoringResultValues(
                full_name=full_name,
                type=ModelEndpointMonitoringMetricType.RESULT,
                result_kind=result_kind,
                # values are (timestamp, value, status) triples
                values=list(
                    zip(
                        sub_df.index,
                        sub_df[mm_writer.ResultData.RESULT_VALUE],
                        sub_df[mm_writer.ResultData.RESULT_STATUS],
                    )
                ),  # pyright: ignore[reportArgumentType]
            )
        )
        del metrics_without_data[full_name]

    for metric in metrics_without_data.values():
        metrics_values.append(
            ModelEndpointMonitoringResultNoData(
                full_name=metric.full_name,
                type=ModelEndpointMonitoringMetricType.RESULT,
            )
        )

    return metrics_values