mlrun 1.7.0rc16__py3-none-any.whl → 1.7.0rc18__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of mlrun might be problematic.

Files changed (65)
  1. mlrun/alerts/alert.py +27 -24
  2. mlrun/artifacts/manager.py +5 -1
  3. mlrun/artifacts/model.py +1 -1
  4. mlrun/common/runtimes/constants.py +3 -0
  5. mlrun/common/schemas/__init__.py +8 -2
  6. mlrun/common/schemas/alert.py +49 -10
  7. mlrun/common/schemas/client_spec.py +1 -0
  8. mlrun/common/schemas/function.py +4 -0
  9. mlrun/common/schemas/model_monitoring/__init__.py +3 -1
  10. mlrun/common/schemas/model_monitoring/constants.py +21 -1
  11. mlrun/common/schemas/model_monitoring/grafana.py +9 -5
  12. mlrun/common/schemas/model_monitoring/model_endpoints.py +17 -6
  13. mlrun/common/schemas/project.py +3 -1
  14. mlrun/config.py +9 -3
  15. mlrun/data_types/to_pandas.py +5 -5
  16. mlrun/datastore/datastore.py +6 -2
  17. mlrun/datastore/redis.py +2 -2
  18. mlrun/datastore/s3.py +5 -0
  19. mlrun/datastore/sources.py +111 -6
  20. mlrun/datastore/targets.py +2 -2
  21. mlrun/db/base.py +6 -2
  22. mlrun/db/httpdb.py +22 -3
  23. mlrun/db/nopdb.py +10 -3
  24. mlrun/errors.py +6 -0
  25. mlrun/feature_store/retrieval/conversion.py +5 -5
  26. mlrun/feature_store/retrieval/job.py +3 -2
  27. mlrun/feature_store/retrieval/spark_merger.py +2 -1
  28. mlrun/frameworks/_dl_common/loggers/tensorboard_logger.py +2 -2
  29. mlrun/lists.py +2 -0
  30. mlrun/model.py +8 -6
  31. mlrun/model_monitoring/db/stores/base/store.py +16 -3
  32. mlrun/model_monitoring/db/stores/sqldb/sql_store.py +44 -43
  33. mlrun/model_monitoring/db/stores/v3io_kv/kv_store.py +190 -91
  34. mlrun/model_monitoring/db/tsdb/__init__.py +35 -6
  35. mlrun/model_monitoring/db/tsdb/base.py +25 -18
  36. mlrun/model_monitoring/db/tsdb/tdengine/__init__.py +15 -0
  37. mlrun/model_monitoring/db/tsdb/tdengine/schemas.py +207 -0
  38. mlrun/model_monitoring/db/tsdb/tdengine/stream_graph_steps.py +45 -0
  39. mlrun/model_monitoring/db/tsdb/tdengine/tdengine_connector.py +231 -0
  40. mlrun/model_monitoring/db/tsdb/v3io/v3io_connector.py +103 -64
  41. mlrun/model_monitoring/db/v3io_tsdb_reader.py +217 -16
  42. mlrun/model_monitoring/helpers.py +32 -0
  43. mlrun/model_monitoring/stream_processing.py +7 -4
  44. mlrun/model_monitoring/writer.py +19 -14
  45. mlrun/package/utils/_formatter.py +2 -2
  46. mlrun/projects/project.py +40 -11
  47. mlrun/render.py +8 -5
  48. mlrun/runtimes/__init__.py +1 -0
  49. mlrun/runtimes/databricks_job/databricks_wrapper.py +1 -1
  50. mlrun/runtimes/nuclio/api_gateway.py +97 -77
  51. mlrun/runtimes/nuclio/application/application.py +160 -7
  52. mlrun/runtimes/nuclio/function.py +18 -12
  53. mlrun/track/tracker.py +2 -1
  54. mlrun/utils/async_http.py +25 -5
  55. mlrun/utils/helpers.py +28 -3
  56. mlrun/utils/logger.py +11 -6
  57. mlrun/utils/notifications/notification/slack.py +27 -7
  58. mlrun/utils/notifications/notification_pusher.py +45 -41
  59. mlrun/utils/version/version.json +2 -2
  60. {mlrun-1.7.0rc16.dist-info → mlrun-1.7.0rc18.dist-info}/METADATA +8 -3
  61. {mlrun-1.7.0rc16.dist-info → mlrun-1.7.0rc18.dist-info}/RECORD +65 -61
  62. {mlrun-1.7.0rc16.dist-info → mlrun-1.7.0rc18.dist-info}/LICENSE +0 -0
  63. {mlrun-1.7.0rc16.dist-info → mlrun-1.7.0rc18.dist-info}/WHEEL +0 -0
  64. {mlrun-1.7.0rc16.dist-info → mlrun-1.7.0rc18.dist-info}/entry_points.txt +0 -0
  65. {mlrun-1.7.0rc16.dist-info → mlrun-1.7.0rc18.dist-info}/top_level.txt +0 -0
mlrun/model_monitoring/db/tsdb/v3io/v3io_connector.py
@@ -11,8 +11,8 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-
 import datetime
+import typing
 
 import pandas as pd
 import v3io_frames.client
@@ -21,7 +21,7 @@ from v3io.dataplane import Client as V3IOClient
 from v3io_frames.frames_pb2 import IGNORE
 
 import mlrun.common.model_monitoring
-import mlrun.common.schemas.model_monitoring as mm_constants
+import mlrun.common.schemas.model_monitoring as mm_schemas
 import mlrun.feature_store.steps
 import mlrun.utils.v3io_clients
 from mlrun.model_monitoring.db import TSDBConnector
@@ -37,12 +37,14 @@ class V3IOTSDBConnector(TSDBConnector):
     Client that provides API for executing commands on the V3IO TSDB table.
     """
 
+    type: str = mm_schemas.TSDBTarget.V3IO_TSDB
+
     def __init__(
         self,
         project: str,
-        access_key: str = None,
+        access_key: typing.Optional[str] = None,
         container: str = "users",
-        v3io_framesd: str = None,
+        v3io_framesd: typing.Optional[str] = None,
         create_table: bool = False,
     ):
         super().__init__(project=project)
@@ -61,14 +63,14 @@ class V3IOTSDBConnector(TSDBConnector):
         self._init_tables_path()
 
         if create_table:
-            self.create_tsdb_application_tables()
+            self.create_tables()
 
     def _init_tables_path(self):
         self.tables = {}
 
         events_table_full_path = mlrun.mlconf.get_model_monitoring_file_target_path(
             project=self.project,
-            kind=mm_constants.FileTargetKind.EVENTS,
+            kind=mm_schemas.FileTargetKind.EVENTS,
         )
         (
             _,
@@ -77,12 +79,12 @@ class V3IOTSDBConnector(TSDBConnector):
         ) = mlrun.common.model_monitoring.helpers.parse_model_endpoint_store_prefix(
             events_table_full_path
         )
-        self.tables[mm_constants.MonitoringTSDBTables.EVENTS] = events_path
+        self.tables[mm_schemas.V3IOTSDBTables.EVENTS] = events_path
 
         monitoring_application_full_path = (
             mlrun.mlconf.get_model_monitoring_file_target_path(
                 project=self.project,
-                kind=mm_constants.FileTargetKind.MONITORING_APPLICATION,
+                kind=mm_schemas.FileTargetKind.MONITORING_APPLICATION,
             )
         )
         (
@@ -92,28 +94,45 @@ class V3IOTSDBConnector(TSDBConnector):
         ) = mlrun.common.model_monitoring.helpers.parse_model_endpoint_store_prefix(
             monitoring_application_full_path
         )
-        self.tables[mm_constants.MonitoringTSDBTables.APP_RESULTS] = (
-            monitoring_application_path + mm_constants.MonitoringTSDBTables.APP_RESULTS
+        self.tables[mm_schemas.V3IOTSDBTables.APP_RESULTS] = (
+            monitoring_application_path + mm_schemas.V3IOTSDBTables.APP_RESULTS
+        )
+        self.tables[mm_schemas.V3IOTSDBTables.METRICS] = (
+            monitoring_application_path + mm_schemas.V3IOTSDBTables.METRICS
+        )
+
+        monitoring_predictions_full_path = (
+            mlrun.mlconf.get_model_monitoring_file_target_path(
+                project=self.project,
+                kind=mm_schemas.FileTargetKind.PREDICTIONS,
+            )
         )
-        self.tables[mm_constants.MonitoringTSDBTables.METRICS] = (
-            monitoring_application_path + mm_constants.MonitoringTSDBTables.METRICS
+        (
+            _,
+            _,
+            monitoring_predictions_path,
+        ) = mlrun.common.model_monitoring.helpers.parse_model_endpoint_store_prefix(
+            monitoring_predictions_full_path
         )
+        self.tables[mm_schemas.FileTargetKind.PREDICTIONS] = monitoring_predictions_path
 
-    def create_tsdb_application_tables(self):
+    def create_tables(self) -> None:
         """
-        Create the application tables using the TSDB connector. At the moment we support 2 types of application tables:
+        Create the tables using the TSDB connector. The tables are being created in the V3IO TSDB and include:
         - app_results: a detailed result that includes status, kind, extra data, etc.
         - metrics: a basic key value that represents a single numeric metric.
+        Note that the predictions table is automatically created by the model monitoring stream pod.
         """
         application_tables = [
-            mm_constants.MonitoringTSDBTables.APP_RESULTS,
-            mm_constants.MonitoringTSDBTables.METRICS,
+            mm_schemas.V3IOTSDBTables.APP_RESULTS,
+            mm_schemas.V3IOTSDBTables.METRICS,
         ]
-        for table in application_tables:
-            logger.info("Creating table in V3IO TSDB", table=table)
+        for table_name in application_tables:
+            logger.info("Creating table in V3IO TSDB", table_name=table_name)
+            table = self.tables[table_name]
             self._frames_client.create(
                 backend=_TSDB_BE,
-                table=self.tables[table],
+                table=table,
                 if_exists=IGNORE,
                 rate=_TSDB_RATE,
             )
@@ -134,6 +153,27 @@ class V3IOTSDBConnector(TSDBConnector):
         - custom_metrics (user-defined metrics)
         """
 
+        # Write latency per prediction, labeled by endpoint ID only
+        graph.add_step(
+            "storey.TSDBTarget",
+            name="tsdb_predictions",
+            after="MapFeatureNames",
+            path=f"{self.container}/{self.tables[mm_schemas.FileTargetKind.PREDICTIONS]}",
+            rate="1/s",
+            time_col=mm_schemas.EventFieldType.TIMESTAMP,
+            container=self.container,
+            v3io_frames=self.v3io_framesd,
+            columns=["latency"],
+            index_cols=[
+                mm_schemas.EventFieldType.ENDPOINT_ID,
+            ],
+            aggr="count,avg",
+            aggr_granularity="1m",
+            max_events=tsdb_batching_max_events,
+            flush_after_seconds=tsdb_batching_timeout_secs,
+            key=mm_schemas.EventFieldType.ENDPOINT_ID,
+        )
+
         # Before writing data to TSDB, create dictionary of 2-3 dictionaries that contains
         # stats and details about the events
 
@@ -160,40 +200,40 @@ class V3IOTSDBConnector(TSDBConnector):
                 "storey.TSDBTarget",
                 name=name,
                 after=after,
-                path=f"{self.container}/{self.tables[mm_constants.MonitoringTSDBTables.EVENTS]}",
+                path=f"{self.container}/{self.tables[mm_schemas.V3IOTSDBTables.EVENTS]}",
                 rate="10/m",
-                time_col=mm_constants.EventFieldType.TIMESTAMP,
+                time_col=mm_schemas.EventFieldType.TIMESTAMP,
                 container=self.container,
                 v3io_frames=self.v3io_framesd,
                 infer_columns_from_data=True,
                 index_cols=[
-                    mm_constants.EventFieldType.ENDPOINT_ID,
-                    mm_constants.EventFieldType.RECORD_TYPE,
-                    mm_constants.EventFieldType.ENDPOINT_TYPE,
+                    mm_schemas.EventFieldType.ENDPOINT_ID,
+                    mm_schemas.EventFieldType.RECORD_TYPE,
+                    mm_schemas.EventFieldType.ENDPOINT_TYPE,
                 ],
                 max_events=tsdb_batching_max_events,
                 flush_after_seconds=tsdb_batching_timeout_secs,
-                key=mm_constants.EventFieldType.ENDPOINT_ID,
+                key=mm_schemas.EventFieldType.ENDPOINT_ID,
             )
 
         # unpacked base_metrics dictionary
         apply_filter_and_unpacked_keys(
             name="FilterAndUnpackKeys1",
-            keys=mm_constants.EventKeyMetrics.BASE_METRICS,
+            keys=mm_schemas.EventKeyMetrics.BASE_METRICS,
         )
         apply_tsdb_target(name="tsdb1", after="FilterAndUnpackKeys1")
 
         # unpacked endpoint_features dictionary
         apply_filter_and_unpacked_keys(
             name="FilterAndUnpackKeys2",
-            keys=mm_constants.EventKeyMetrics.ENDPOINT_FEATURES,
+            keys=mm_schemas.EventKeyMetrics.ENDPOINT_FEATURES,
         )
         apply_tsdb_target(name="tsdb2", after="FilterAndUnpackKeys2")
 
         # unpacked custom_metrics dictionary. In addition, use storey.Filter remove none values
         apply_filter_and_unpacked_keys(
             name="FilterAndUnpackKeys3",
-            keys=mm_constants.EventKeyMetrics.CUSTOM_METRICS,
+            keys=mm_schemas.EventKeyMetrics.CUSTOM_METRICS,
         )
 
         def apply_storey_filter():
@@ -210,56 +250,55 @@ class V3IOTSDBConnector(TSDBConnector):
     def write_application_event(
         self,
         event: dict,
-        kind: mm_constants.WriterEventKind = mm_constants.WriterEventKind.RESULT,
-    ):
+        kind: mm_schemas.WriterEventKind = mm_schemas.WriterEventKind.RESULT,
+    ) -> None:
         """Write a single result or metric to TSDB"""
 
-        event[mm_constants.WriterEvent.END_INFER_TIME] = (
-            datetime.datetime.fromisoformat(
-                event[mm_constants.WriterEvent.END_INFER_TIME]
-            )
+        event[mm_schemas.WriterEvent.END_INFER_TIME] = datetime.datetime.fromisoformat(
+            event[mm_schemas.WriterEvent.END_INFER_TIME]
        )
+        index_cols_base = [
+            mm_schemas.WriterEvent.END_INFER_TIME,
+            mm_schemas.WriterEvent.ENDPOINT_ID,
+            mm_schemas.WriterEvent.APPLICATION_NAME,
+        ]
 
-        if kind == mm_constants.WriterEventKind.METRIC:
-            # TODO : Implement the logic for writing metrics to V3IO TSDB
-            return
+        if kind == mm_schemas.WriterEventKind.METRIC:
+            table = self.tables[mm_schemas.V3IOTSDBTables.METRICS]
+            index_cols = index_cols_base + [mm_schemas.MetricData.METRIC_NAME]
+        elif kind == mm_schemas.WriterEventKind.RESULT:
+            table = self.tables[mm_schemas.V3IOTSDBTables.APP_RESULTS]
+            index_cols = index_cols_base + [mm_schemas.ResultData.RESULT_NAME]
+            del event[mm_schemas.ResultData.RESULT_EXTRA_DATA]
+        else:
+            raise ValueError(f"Invalid {kind = }")
 
-        del event[mm_constants.ResultData.RESULT_EXTRA_DATA]
         try:
             self._frames_client.write(
                 backend=_TSDB_BE,
-                table=self.tables[mm_constants.MonitoringTSDBTables.APP_RESULTS],
+                table=table,
                 dfs=pd.DataFrame.from_records([event]),
-                index_cols=[
-                    mm_constants.WriterEvent.END_INFER_TIME,
-                    mm_constants.WriterEvent.ENDPOINT_ID,
-                    mm_constants.WriterEvent.APPLICATION_NAME,
-                    mm_constants.ResultData.RESULT_NAME,
-                ],
-            )
-            logger.info(
-                "Updated V3IO TSDB successfully",
-                table=self.tables[mm_constants.MonitoringTSDBTables.APP_RESULTS],
+                index_cols=index_cols,
             )
+            logger.info("Updated V3IO TSDB successfully", table=table)
         except v3io_frames.errors.Error as err:
-            logger.warn(
+            logger.exception(
                 "Could not write drift measures to TSDB",
                 err=err,
-                table=self.tables[mm_constants.MonitoringTSDBTables.APP_RESULTS],
+                table=table,
                 event=event,
             )
-
             raise mlrun.errors.MLRunRuntimeError(
                 f"Failed to write application result to TSDB: {err}"
             )
 
-    def delete_tsdb_resources(self, table: str = None):
+    def delete_tsdb_resources(self, table: typing.Optional[str] = None):
         if table:
             # Delete a specific table
             tables = [table]
         else:
             # Delete all tables
-            tables = mm_constants.MonitoringTSDBTables.list()
+            tables = mm_schemas.V3IOTSDBTables.list()
         for table in tables:
             try:
                 self._frames_client.delete(
@@ -282,8 +321,8 @@ class V3IOTSDBConnector(TSDBConnector):
         self,
         endpoint_id: str,
         metrics: list[str],
-        start: str = "now-1h",
-        end: str = "now",
+        start: str,
+        end: str,
     ) -> dict[str, list[tuple[str, float]]]:
         """
         Getting real time metrics from the TSDB. There are pre-defined metrics for model endpoints such as
@@ -312,7 +351,7 @@ class V3IOTSDBConnector(TSDBConnector):
 
         try:
             data = self.get_records(
-                table=mm_constants.MonitoringTSDBTables.EVENTS,
+                table=mm_schemas.V3IOTSDBTables.EVENTS,
                 columns=["endpoint_id", *metrics],
                 filter_query=f"endpoint_id=='{endpoint_id}'",
                 start=start,
@@ -339,17 +378,14 @@ class V3IOTSDBConnector(TSDBConnector):
     def get_records(
         self,
         table: str,
-        columns: list[str] = None,
+        start: str,
+        end: str,
+        columns: typing.Optional[list[str]] = None,
         filter_query: str = "",
-        start: str = "now-1h",
-        end: str = "now",
     ) -> pd.DataFrame:
         """
         Getting records from V3IO TSDB data collection.
         :param table: Path to the collection to query.
-        :param columns: Columns to include in the result.
-        :param filter_query: V3IO filter expression. The expected filter expression includes different conditions,
-                             divided by ' AND '.
         :param start: The start time of the metrics. Can be represented by a string containing an RFC 3339
                       time, a Unix timestamp in milliseconds, a relative time (`'now'` or
                       `'now-[0-9]+[mhd]'`, where `m` = minutes, `h` = hours, `'d'` = days, and
@@ -358,6 +394,9 @@ class V3IOTSDBConnector(TSDBConnector):
                     time, a Unix timestamp in milliseconds, a relative time (`'now'` or
                     `'now-[0-9]+[mhd]'`, where `m` = minutes, `h` = hours, `'d'` = days, and
                     `'s'` = seconds), or 0 for the earliest time.
+        :param columns: Columns to include in the result.
+        :param filter_query: V3IO filter expression. The expected filter expression includes different conditions,
+                             divided by ' AND '.
         :return: DataFrame with the provided attributes from the data collection.
         :raise: MLRunNotFoundError if the provided table wasn't found.
         """
@@ -384,7 +423,7 @@ class V3IOTSDBConnector(TSDBConnector):
         """
         events_table_full_path = mlrun.mlconf.get_model_monitoring_file_target_path(
            project=self.project,
-            kind=mm_constants.FileTargetKind.EVENTS,
+            kind=mm_schemas.FileTargetKind.EVENTS,
        )
 
         # Generate the main directory with the V3IO resources
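For orientation, here is a minimal usage sketch of the reworked connector API shown above. The project name, endpoint ID, and the metric event payload are illustrative assumptions, not values taken from this diff:

    import mlrun.common.schemas.model_monitoring as mm_schemas
    from mlrun.model_monitoring.db.tsdb.v3io.v3io_connector import V3IOTSDBConnector

    # create_table=True now calls the renamed create_tables(), and _init_tables_path()
    # also registers the new predictions table path alongside app_results and metrics.
    connector = V3IOTSDBConnector(project="my-project", create_table=True)  # "my-project" is hypothetical

    # Hypothetical metric event; as of rc18, metric events are written to the METRICS
    # table instead of being skipped with a TODO.
    metric_event = {
        mm_schemas.WriterEvent.END_INFER_TIME: "2024-01-01T00:00:00+00:00",
        mm_schemas.WriterEvent.ENDPOINT_ID: "endpoint-1",
        mm_schemas.WriterEvent.APPLICATION_NAME: "my-app",
        mm_schemas.MetricData.METRIC_NAME: "accuracy",
        mm_schemas.MetricData.METRIC_VALUE: 0.93,
    }
    connector.write_application_event(metric_event, kind=mm_schemas.WriterEventKind.METRIC)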
mlrun/model_monitoring/db/v3io_tsdb_reader.py
@@ -12,10 +12,12 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-# TODO: Move this module into the TSDB abstraction once it is in.
+# TODO: Move this module into the TSDB abstraction:
+# mlrun/model_monitoring/db/tsdb/v3io/v3io_connector.py
 
 from datetime import datetime
 from io import StringIO
+from typing import Literal, Optional, Union
 
 import pandas as pd
 
@@ -25,21 +27,27 @@ import mlrun.model_monitoring.writer as mm_writer
 import mlrun.utils.v3io_clients
 from mlrun.common.schemas.model_monitoring.model_endpoints import (
     ModelEndpointMonitoringMetric,
+    ModelEndpointMonitoringMetricNoData,
     ModelEndpointMonitoringMetricType,
-    ModelEndpointMonitoringResultNoData,
+    ModelEndpointMonitoringMetricValues,
     ModelEndpointMonitoringResultValues,
     _compose_full_name,
-    _ModelEndpointMonitoringResultValuesBase,
+    _ModelEndpointMonitoringMetricValuesBase,
 )
 from mlrun.model_monitoring.db.stores.v3io_kv.kv_store import KVStoreBase
 from mlrun.model_monitoring.db.tsdb.v3io.v3io_connector import _TSDB_BE
 from mlrun.utils import logger
 
 
-def _get_sql_query(endpoint_id: str, names: list[tuple[str, str]]) -> str:
+def _get_sql_query(
+    endpoint_id: str,
+    names: list[tuple[str, str]],
+    table_name: str = mm_constants.V3IOTSDBTables.APP_RESULTS,
+    name: str = mm_writer.ResultData.RESULT_NAME,
+) -> str:
     with StringIO() as query:
         query.write(
-            f"SELECT * FROM '{mm_constants.MonitoringTSDBTables.APP_RESULTS}' "
+            f"SELECT * FROM '{table_name}' "
             f"WHERE {mm_writer.WriterEvent.ENDPOINT_ID}='{endpoint_id}'"
         )
         if names:
@@ -48,7 +56,7 @@ def _get_sql_query(endpoint_id: str, names: list[tuple[str, str]]) -> str:
             for i, (app_name, result_name) in enumerate(names):
                 sub_cond = (
                     f"({mm_writer.WriterEvent.APPLICATION_NAME}='{app_name}' "
-                    f"AND {mm_writer.ResultData.RESULT_NAME}='{result_name}')"
+                    f"AND {name}='{result_name}')"
                 )
                 if i != 0:  # not first sub condition
                     query.write(" OR ")
@@ -73,30 +81,87 @@ def _get_result_kind(result_df: pd.DataFrame) -> mm_constants.ResultKindApp:
     return unique_kinds[0]
 
 
-def read_data(
+def read_metrics_data(
     *,
     project: str,
     endpoint_id: str,
     start: datetime,
     end: datetime,
     metrics: list[ModelEndpointMonitoringMetric],
-) -> list[_ModelEndpointMonitoringResultValuesBase]:
+    type: Literal["metrics", "results"] = "results",
+) -> Union[
+    list[
+        Union[
+            ModelEndpointMonitoringResultValues,
+            ModelEndpointMonitoringMetricNoData,
+        ],
+    ],
+    list[
+        Union[
+            ModelEndpointMonitoringMetricValues,
+            ModelEndpointMonitoringMetricNoData,
+        ],
+    ],
+]:
+    """
+    Read metrics OR results from the TSDB and return as a list.
+    Note: the type must match the actual metrics in the `metrics` parameter.
+    If the type is "results", pass only results in the `metrics` parameter.
+    """
     client = mlrun.utils.v3io_clients.get_frames_client(
         address=mlrun.mlconf.v3io_framesd,
         container=KVStoreBase.get_v3io_monitoring_apps_container(project),
     )
+
+    if type == "metrics":
+        table_name = mm_constants.V3IOTSDBTables.METRICS
+        name = mm_constants.MetricData.METRIC_NAME
+        df_handler = df_to_metrics_values
+    elif type == "results":
+        table_name = mm_constants.V3IOTSDBTables.APP_RESULTS
+        name = mm_constants.ResultData.RESULT_NAME
+        df_handler = df_to_results_values
+    else:
+        raise ValueError(f"Invalid {type = }")
+
+    query = _get_sql_query(
+        endpoint_id,
+        [(metric.app, metric.name) for metric in metrics],
+        table_name=table_name,
+        name=name,
+    )
+
+    logger.debug("Querying V3IO TSDB", query=query)
+
     df: pd.DataFrame = client.read(
         backend=_TSDB_BE,
-        query=_get_sql_query(
-            endpoint_id, [(metric.app, metric.name) for metric in metrics]
-        ),
+        query=query,
         start=start,
         end=end,
     )
 
+    logger.debug(
+        "Read a data-frame", project=project, endpoint_id=endpoint_id, is_empty=df.empty
+    )
+
+    return df_handler(df=df, metrics=metrics, project=project)
+
+
+def df_to_results_values(
+    *, df: pd.DataFrame, metrics: list[ModelEndpointMonitoringMetric], project: str
+) -> list[
+    Union[ModelEndpointMonitoringResultValues, ModelEndpointMonitoringMetricNoData]
+]:
+    """
+    Parse a time-indexed data-frame of results from the TSDB into a list of
+    results values per distinct results.
+    When a result is not found in the data-frame, it is represented in no-data object.
+    """
     metrics_without_data = {metric.full_name: metric for metric in metrics}
 
-    metrics_values: list[_ModelEndpointMonitoringResultValuesBase] = []
+    metrics_values: list[
+        Union[ModelEndpointMonitoringResultValues, ModelEndpointMonitoringMetricNoData]
+    ] = []
     if not df.empty:
         grouped = df.groupby(
             [mm_writer.WriterEvent.APPLICATION_NAME, mm_writer.ResultData.RESULT_NAME],
@@ -104,13 +169,13 @@ def read_data(
         )
     else:
         grouped = []
-    for (app_name, result_name), sub_df in grouped:
+        logger.debug("No results", missing_results=metrics_without_data.keys())
+    for (app_name, name), sub_df in grouped:
         result_kind = _get_result_kind(sub_df)
-        full_name = _compose_full_name(project=project, app=app_name, name=result_name)
+        full_name = _compose_full_name(project=project, app=app_name, name=name)
         metrics_values.append(
             ModelEndpointMonitoringResultValues(
                 full_name=full_name,
-                type=ModelEndpointMonitoringMetricType.RESULT,
                 result_kind=result_kind,
                 values=list(
                     zip(
@@ -124,11 +189,147 @@ def read_data(
         del metrics_without_data[full_name]
 
     for metric in metrics_without_data.values():
+        if metric.full_name == get_invocations_fqn(project):
+            continue
         metrics_values.append(
-            ModelEndpointMonitoringResultNoData(
+            ModelEndpointMonitoringMetricNoData(
                 full_name=metric.full_name,
                 type=ModelEndpointMonitoringMetricType.RESULT,
             )
         )
 
     return metrics_values
+
+
+def df_to_metrics_values(
+    *, df: pd.DataFrame, metrics: list[ModelEndpointMonitoringMetric], project: str
+) -> list[
+    Union[ModelEndpointMonitoringMetricValues, ModelEndpointMonitoringMetricNoData]
+]:
+    """
+    Parse a time-indexed data-frame of metrics from the TSDB into a list of
+    metrics values per distinct results.
+    When a metric is not found in the data-frame, it is represented in no-data object.
+    """
+    metrics_without_data = {metric.full_name: metric for metric in metrics}
+
+    metrics_values: list[
+        Union[ModelEndpointMonitoringMetricValues, ModelEndpointMonitoringMetricNoData]
+    ] = []
+    if not df.empty:
+        grouped = df.groupby(
+            [mm_writer.WriterEvent.APPLICATION_NAME, mm_writer.MetricData.METRIC_NAME],
+            observed=False,
+        )
+    else:
+        logger.debug("No metrics", missing_metrics=metrics_without_data.keys())
+        grouped = []
+    for (app_name, name), sub_df in grouped:
+        full_name = _compose_full_name(
+            project=project,
+            app=app_name,
+            name=name,
+            type=ModelEndpointMonitoringMetricType.METRIC,
+        )
+        metrics_values.append(
+            ModelEndpointMonitoringMetricValues(
+                full_name=full_name,
+                values=list(
+                    zip(
+                        sub_df.index,
+                        sub_df[mm_writer.MetricData.METRIC_VALUE],
+                    )
+                ),  # pyright: ignore[reportArgumentType]
+            )
+        )
+        del metrics_without_data[full_name]
+
+    for metric in metrics_without_data.values():
+        metrics_values.append(
+            ModelEndpointMonitoringMetricNoData(
+                full_name=metric.full_name,
+                type=ModelEndpointMonitoringMetricType.METRIC,
+            )
+        )
+
+    return metrics_values
+
+
+def get_invocations_fqn(project: str):
+    return mlrun.common.schemas.model_monitoring.model_endpoints._compose_full_name(
+        project=project,
+        app=mm_constants.SpecialApps.MLRUN_INFRA,
+        name=mlrun.common.schemas.model_monitoring.constants.PredictionsQueryConstants.INVOCATIONS,
+        type=mlrun.common.schemas.model_monitoring.ModelEndpointMonitoringMetricType.METRIC,
+    )
+
+
+def read_predictions(
+    *,
+    project: str,
+    endpoint_id: str,
+    start: Optional[Union[datetime, str]] = None,
+    end: Optional[Union[datetime, str]] = None,
+    aggregation_window: Optional[str] = None,
+    limit: Optional[int] = None,
+) -> _ModelEndpointMonitoringMetricValuesBase:
+    client = mlrun.utils.v3io_clients.get_frames_client(
+        address=mlrun.mlconf.v3io_framesd,
+        container="users",
+    )
+    frames_client_kwargs = {}
+    if aggregation_window:
+        frames_client_kwargs["step"] = aggregation_window
+        frames_client_kwargs["aggregation_window"] = aggregation_window
+    if limit:
+        frames_client_kwargs["limit"] = limit
+    df: pd.DataFrame = client.read(
+        backend=_TSDB_BE,
+        table=f"pipelines/{project}/model-endpoints/predictions",
+        columns=["latency"],
+        filter=f"endpoint_id=='{endpoint_id}'",
+        start=start,
+        end=end,
+        aggregators="count",
+        **frames_client_kwargs,
+    )
+
+    full_name = get_invocations_fqn(project)
+
+    if df.empty:
+        return ModelEndpointMonitoringMetricNoData(
+            full_name=full_name,
+            type=ModelEndpointMonitoringMetricType.METRIC,
+        )
+
+    return ModelEndpointMonitoringMetricValues(
+        full_name=full_name,
+        values=list(
+            zip(
+                df.index,
+                df["count(latency)"],
+            )
+        ),
+    )
+
+
+def read_prediction_metric_for_endpoint_if_exists(
+    *,
+    project: str,
+    endpoint_id: str,
+) -> Optional[ModelEndpointMonitoringMetric]:
+    predictions = read_predictions(
+        project=project,
+        endpoint_id=endpoint_id,
+        start="0",
+        end="now",
+        limit=1,  # Read just one record, because we just want to check if there is any data for this endpoint_id
+    )
+    if predictions:
+        return ModelEndpointMonitoringMetric(
+            project=project,
+            app=mm_constants.SpecialApps.MLRUN_INFRA,
+            type=ModelEndpointMonitoringMetricType.METRIC,
+            name=mlrun.common.schemas.model_monitoring.constants.PredictionsQueryConstants.INVOCATIONS,
+            full_name=get_invocations_fqn(project),
+        )
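
Similarly, a rough sketch of how the new reader entry points above might be called. The project and endpoint identifiers, the result object, and its full-name format are assumptions for illustration only, not taken from this diff:

    from datetime import datetime, timedelta

    import mlrun.model_monitoring.db.v3io_tsdb_reader as tsdb_reader
    from mlrun.common.schemas.model_monitoring.model_endpoints import (
        ModelEndpointMonitoringMetric,
        ModelEndpointMonitoringMetricType,
    )

    project, endpoint_id = "my-project", "endpoint-1"  # hypothetical identifiers

    drift_result = ModelEndpointMonitoringMetric(
        project=project,
        app="my-app",
        type=ModelEndpointMonitoringMetricType.RESULT,
        name="drift",
        full_name=f"{project}.my-app.result.drift",  # full-name format assumed here
    )

    # read_metrics_data() replaces read_data(); `type` selects the app_results or metrics table.
    values = tsdb_reader.read_metrics_data(
        project=project,
        endpoint_id=endpoint_id,
        start=datetime.utcnow() - timedelta(hours=1),
        end=datetime.utcnow(),
        metrics=[drift_result],
        type="results",
    )

    # Invocation counts are read from the new per-endpoint predictions table.
    invocations = tsdb_reader.read_predictions(
        project=project,
        endpoint_id=endpoint_id,
        start="now-1h",
        end="now",
        aggregation_window="10m",
    )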