mlrun 1.10.0rc40__py3-none-any.whl → 1.11.0rc16__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of mlrun might be problematic.
Files changed (150)
  1. mlrun/__init__.py +3 -2
  2. mlrun/__main__.py +0 -4
  3. mlrun/artifacts/dataset.py +2 -2
  4. mlrun/artifacts/plots.py +1 -1
  5. mlrun/{model_monitoring/db/tsdb/tdengine → auth}/__init__.py +2 -3
  6. mlrun/auth/nuclio.py +89 -0
  7. mlrun/auth/providers.py +429 -0
  8. mlrun/auth/utils.py +415 -0
  9. mlrun/common/constants.py +7 -0
  10. mlrun/common/model_monitoring/helpers.py +41 -4
  11. mlrun/common/runtimes/constants.py +28 -0
  12. mlrun/common/schemas/__init__.py +13 -3
  13. mlrun/common/schemas/alert.py +2 -2
  14. mlrun/common/schemas/api_gateway.py +3 -0
  15. mlrun/common/schemas/auth.py +10 -10
  16. mlrun/common/schemas/client_spec.py +4 -0
  17. mlrun/common/schemas/constants.py +25 -0
  18. mlrun/common/schemas/frontend_spec.py +1 -8
  19. mlrun/common/schemas/function.py +24 -0
  20. mlrun/common/schemas/hub.py +3 -2
  21. mlrun/common/schemas/model_monitoring/__init__.py +1 -1
  22. mlrun/common/schemas/model_monitoring/constants.py +2 -2
  23. mlrun/common/schemas/secret.py +17 -2
  24. mlrun/common/secrets.py +95 -1
  25. mlrun/common/types.py +10 -10
  26. mlrun/config.py +53 -15
  27. mlrun/data_types/infer.py +2 -2
  28. mlrun/datastore/__init__.py +2 -3
  29. mlrun/datastore/base.py +274 -10
  30. mlrun/datastore/datastore.py +1 -1
  31. mlrun/datastore/datastore_profile.py +49 -17
  32. mlrun/datastore/model_provider/huggingface_provider.py +6 -2
  33. mlrun/datastore/model_provider/model_provider.py +2 -2
  34. mlrun/datastore/model_provider/openai_provider.py +2 -2
  35. mlrun/datastore/s3.py +15 -16
  36. mlrun/datastore/sources.py +1 -1
  37. mlrun/datastore/store_resources.py +4 -4
  38. mlrun/datastore/storeytargets.py +16 -10
  39. mlrun/datastore/targets.py +1 -1
  40. mlrun/datastore/utils.py +16 -3
  41. mlrun/datastore/v3io.py +1 -1
  42. mlrun/db/base.py +36 -12
  43. mlrun/db/httpdb.py +316 -101
  44. mlrun/db/nopdb.py +29 -11
  45. mlrun/errors.py +4 -2
  46. mlrun/execution.py +11 -12
  47. mlrun/feature_store/api.py +1 -1
  48. mlrun/feature_store/common.py +1 -1
  49. mlrun/feature_store/feature_vector_utils.py +1 -1
  50. mlrun/feature_store/steps.py +8 -6
  51. mlrun/frameworks/_common/utils.py +3 -3
  52. mlrun/frameworks/_dl_common/loggers/logger.py +1 -1
  53. mlrun/frameworks/_dl_common/loggers/tensorboard_logger.py +2 -1
  54. mlrun/frameworks/_ml_common/loggers/mlrun_logger.py +1 -1
  55. mlrun/frameworks/_ml_common/utils.py +2 -1
  56. mlrun/frameworks/auto_mlrun/auto_mlrun.py +4 -3
  57. mlrun/frameworks/lgbm/mlrun_interfaces/mlrun_interface.py +2 -1
  58. mlrun/frameworks/onnx/dataset.py +2 -1
  59. mlrun/frameworks/onnx/mlrun_interface.py +2 -1
  60. mlrun/frameworks/pytorch/callbacks/logging_callback.py +5 -4
  61. mlrun/frameworks/pytorch/callbacks/mlrun_logging_callback.py +2 -1
  62. mlrun/frameworks/pytorch/callbacks/tensorboard_logging_callback.py +2 -1
  63. mlrun/frameworks/pytorch/utils.py +2 -1
  64. mlrun/frameworks/sklearn/metric.py +2 -1
  65. mlrun/frameworks/tf_keras/callbacks/logging_callback.py +5 -4
  66. mlrun/frameworks/tf_keras/callbacks/mlrun_logging_callback.py +2 -1
  67. mlrun/frameworks/tf_keras/callbacks/tensorboard_logging_callback.py +2 -1
  68. mlrun/hub/__init__.py +37 -0
  69. mlrun/hub/base.py +142 -0
  70. mlrun/hub/module.py +67 -76
  71. mlrun/hub/step.py +113 -0
  72. mlrun/launcher/base.py +2 -1
  73. mlrun/launcher/local.py +2 -1
  74. mlrun/model.py +12 -2
  75. mlrun/model_monitoring/__init__.py +0 -1
  76. mlrun/model_monitoring/api.py +2 -2
  77. mlrun/model_monitoring/applications/base.py +20 -6
  78. mlrun/model_monitoring/applications/context.py +1 -0
  79. mlrun/model_monitoring/controller.py +7 -17
  80. mlrun/model_monitoring/db/_schedules.py +2 -16
  81. mlrun/model_monitoring/db/_stats.py +2 -13
  82. mlrun/model_monitoring/db/tsdb/__init__.py +9 -7
  83. mlrun/model_monitoring/db/tsdb/base.py +2 -4
  84. mlrun/model_monitoring/db/tsdb/preaggregate.py +234 -0
  85. mlrun/model_monitoring/db/tsdb/stream_graph_steps.py +63 -0
  86. mlrun/model_monitoring/db/tsdb/timescaledb/queries/timescaledb_metrics_queries.py +414 -0
  87. mlrun/model_monitoring/db/tsdb/timescaledb/queries/timescaledb_predictions_queries.py +376 -0
  88. mlrun/model_monitoring/db/tsdb/timescaledb/queries/timescaledb_results_queries.py +590 -0
  89. mlrun/model_monitoring/db/tsdb/timescaledb/timescaledb_connection.py +434 -0
  90. mlrun/model_monitoring/db/tsdb/timescaledb/timescaledb_connector.py +541 -0
  91. mlrun/model_monitoring/db/tsdb/timescaledb/timescaledb_operations.py +808 -0
  92. mlrun/model_monitoring/db/tsdb/timescaledb/timescaledb_schema.py +502 -0
  93. mlrun/model_monitoring/db/tsdb/timescaledb/timescaledb_stream.py +163 -0
  94. mlrun/model_monitoring/db/tsdb/timescaledb/timescaledb_stream_graph_steps.py +60 -0
  95. mlrun/model_monitoring/db/tsdb/timescaledb/utils/timescaledb_dataframe_processor.py +141 -0
  96. mlrun/model_monitoring/db/tsdb/timescaledb/utils/timescaledb_query_builder.py +585 -0
  97. mlrun/model_monitoring/db/tsdb/timescaledb/writer_graph_steps.py +73 -0
  98. mlrun/model_monitoring/db/tsdb/v3io/stream_graph_steps.py +4 -6
  99. mlrun/model_monitoring/db/tsdb/v3io/v3io_connector.py +147 -79
  100. mlrun/model_monitoring/features_drift_table.py +2 -1
  101. mlrun/model_monitoring/helpers.py +2 -1
  102. mlrun/model_monitoring/stream_processing.py +18 -16
  103. mlrun/model_monitoring/writer.py +4 -3
  104. mlrun/package/__init__.py +2 -1
  105. mlrun/platforms/__init__.py +0 -44
  106. mlrun/platforms/iguazio.py +1 -1
  107. mlrun/projects/operations.py +11 -10
  108. mlrun/projects/project.py +81 -82
  109. mlrun/run.py +4 -7
  110. mlrun/runtimes/__init__.py +2 -204
  111. mlrun/runtimes/base.py +89 -21
  112. mlrun/runtimes/constants.py +225 -0
  113. mlrun/runtimes/daskjob.py +4 -2
  114. mlrun/runtimes/databricks_job/databricks_runtime.py +2 -1
  115. mlrun/runtimes/mounts.py +5 -0
  116. mlrun/runtimes/nuclio/__init__.py +12 -8
  117. mlrun/runtimes/nuclio/api_gateway.py +36 -6
  118. mlrun/runtimes/nuclio/application/application.py +200 -32
  119. mlrun/runtimes/nuclio/function.py +154 -49
  120. mlrun/runtimes/nuclio/serving.py +55 -42
  121. mlrun/runtimes/pod.py +59 -10
  122. mlrun/secrets.py +46 -2
  123. mlrun/serving/__init__.py +2 -0
  124. mlrun/serving/remote.py +5 -5
  125. mlrun/serving/routers.py +3 -3
  126. mlrun/serving/server.py +46 -43
  127. mlrun/serving/serving_wrapper.py +6 -2
  128. mlrun/serving/states.py +554 -207
  129. mlrun/serving/steps.py +1 -1
  130. mlrun/serving/system_steps.py +42 -33
  131. mlrun/track/trackers/mlflow_tracker.py +29 -31
  132. mlrun/utils/helpers.py +89 -16
  133. mlrun/utils/http.py +9 -2
  134. mlrun/utils/notifications/notification/git.py +1 -1
  135. mlrun/utils/notifications/notification/mail.py +39 -16
  136. mlrun/utils/notifications/notification_pusher.py +2 -2
  137. mlrun/utils/version/version.json +2 -2
  138. mlrun/utils/version/version.py +3 -4
  139. {mlrun-1.10.0rc40.dist-info → mlrun-1.11.0rc16.dist-info}/METADATA +39 -49
  140. {mlrun-1.10.0rc40.dist-info → mlrun-1.11.0rc16.dist-info}/RECORD +144 -130
  141. mlrun/db/auth_utils.py +0 -152
  142. mlrun/model_monitoring/db/tsdb/tdengine/schemas.py +0 -343
  143. mlrun/model_monitoring/db/tsdb/tdengine/stream_graph_steps.py +0 -75
  144. mlrun/model_monitoring/db/tsdb/tdengine/tdengine_connection.py +0 -281
  145. mlrun/model_monitoring/db/tsdb/tdengine/tdengine_connector.py +0 -1368
  146. mlrun/model_monitoring/db/tsdb/tdengine/writer_graph_steps.py +0 -51
  147. {mlrun-1.10.0rc40.dist-info → mlrun-1.11.0rc16.dist-info}/WHEEL +0 -0
  148. {mlrun-1.10.0rc40.dist-info → mlrun-1.11.0rc16.dist-info}/entry_points.txt +0 -0
  149. {mlrun-1.10.0rc40.dist-info → mlrun-1.11.0rc16.dist-info}/licenses/LICENSE +0 -0
  150. {mlrun-1.10.0rc40.dist-info → mlrun-1.11.0rc16.dist-info}/top_level.txt +0 -0
mlrun/model_monitoring/db/tsdb/tdengine/tdengine_connector.py (removed)
@@ -1,1368 +0,0 @@
1
- # Copyright 2024 Iguazio
2
- #
3
- # Licensed under the Apache License, Version 2.0 (the "License");
4
- # you may not use this file except in compliance with the License.
5
- # You may obtain a copy of the License at
6
- #
7
- # http://www.apache.org/licenses/LICENSE-2.0
8
- #
9
- # Unless required by applicable law or agreed to in writing, software
10
- # distributed under the License is distributed on an "AS IS" BASIS,
11
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
- # See the License for the specific language governing permissions and
13
- # limitations under the License.
14
-
15
- import threading
16
- from datetime import datetime, timedelta
17
- from typing import Callable, Final, Literal, Optional, Union
18
-
19
- import pandas as pd
20
- import taosws
21
-
22
- import mlrun.common.schemas.model_monitoring as mm_schemas
23
- import mlrun.common.types
24
- import mlrun.model_monitoring.db.tsdb.tdengine.schemas as tdengine_schemas
25
- from mlrun.config import config
26
- from mlrun.datastore.datastore_profile import DatastoreProfile
27
- from mlrun.model_monitoring.db import TSDBConnector
28
- from mlrun.model_monitoring.db.tsdb.tdengine.tdengine_connection import (
29
- Statement,
30
- TDEngineConnection,
31
- )
32
- from mlrun.model_monitoring.helpers import get_invocations_fqn, get_start_end
33
- from mlrun.utils import logger
34
-
35
- # Thread-local storage for connections
36
- _thread_local = threading.local()
37
-
38
-
39
- class TDEngineTimestampPrecision(mlrun.common.types.StrEnum):
40
- """
41
- The timestamp precision for the TDEngine database.
42
- For more information, see:
43
- https://docs.tdengine.com/tdengine-reference/sql-manual/data-types/#timestamp
44
- https://docs.tdengine.com/tdengine-reference/sql-manual/manage-databases/#create-database
45
- """
46
-
47
- MILLISECOND = "ms" # TDEngine's default
48
- MICROSECOND = "us" # MLRun's default
49
- NANOSECOND = "ns"
50
-
51
-
52
- class TDEngineConnector(TSDBConnector):
53
- """
54
- Handles the TSDB operations when the TSDB connector is of type TDEngine.
55
- """
56
-
57
- type: str = mm_schemas.TSDBTarget.TDEngine
58
-
59
- def __init__(
60
- self,
61
- project: str,
62
- profile: DatastoreProfile,
63
- timestamp_precision: TDEngineTimestampPrecision = TDEngineTimestampPrecision.MICROSECOND,
64
- ):
65
- super().__init__(project=project)
66
-
67
- self._tdengine_connection_profile = profile
68
-
69
- self._timestamp_precision: Final = ( # cannot be changed after initialization
70
- timestamp_precision
71
- )
72
-
73
- if not mlrun.mlconf.system_id:
74
- raise mlrun.errors.MLRunInvalidArgumentError(
75
- "system_id is not set in mlrun.mlconf. "
76
- "TDEngineConnector requires system_id to be configured for database name construction. "
77
- "Please ensure MLRun configuration is properly loaded before creating TDEngineConnector."
78
- )
79
- self.database = (
80
- f"{tdengine_schemas._MODEL_MONITORING_DATABASE}_{mlrun.mlconf.system_id}"
81
- )
82
- self._init_super_tables()
83
-
84
- @property
85
- def connection(self) -> TDEngineConnection:
86
- if not hasattr(_thread_local, "connection"):
87
- _thread_local.connection = self._create_connection()
88
- logger.debug(
89
- "Created new TDEngine connection for thread",
90
- project=self.project,
91
- thread_name=threading.current_thread().name,
92
- thread_id=threading.get_ident(),
93
- )
94
- return _thread_local.connection
95
-
96
- def _create_connection(self) -> TDEngineConnection:
97
- """Establish a connection to the TSDB server."""
98
- logger.debug("Creating a new connection to TDEngine", project=self.project)
99
- conn = TDEngineConnection(
100
- self._tdengine_connection_profile.dsn(),
101
- )
102
- conn.prefix_statements = [f"USE {self.database}"]
103
-
104
- return conn
105
-
106
- def _init_super_tables(self):
107
- """Initialize the super tables for the TSDB."""
108
- self.tables = {
109
- mm_schemas.TDEngineSuperTables.APP_RESULTS: tdengine_schemas.AppResultTable(
110
- project=self.project, database=self.database
111
- ),
112
- mm_schemas.TDEngineSuperTables.METRICS: tdengine_schemas.Metrics(
113
- project=self.project, database=self.database
114
- ),
115
- mm_schemas.TDEngineSuperTables.PREDICTIONS: tdengine_schemas.Predictions(
116
- project=self.project, database=self.database
117
- ),
118
- mm_schemas.TDEngineSuperTables.ERRORS: tdengine_schemas.Errors(
119
- project=self.project, database=self.database
120
- ),
121
- }
122
-
123
- def _create_db_if_not_exists(self):
124
- """Create the database if it does not exist."""
125
- self.connection.prefix_statements = []
126
- self.connection.run(
127
- statements=f"CREATE DATABASE IF NOT EXISTS {self.database} PRECISION '{self._timestamp_precision}'",
128
- )
129
- self.connection.prefix_statements = [f"USE {self.database}"]
130
- logger.debug(
131
- "The TDEngine database is currently in use",
132
- project=self.project,
133
- database=self.database,
134
- )
135
-
136
- def create_tables(self):
137
- """Create TDEngine supertables."""
138
-
139
- # Create the database if it does not exist
140
- self._create_db_if_not_exists()
141
-
142
- for table in self.tables:
143
- create_table_query = self.tables[table]._create_super_table_query()
144
- conn = self.connection
145
- conn.run(
146
- statements=create_table_query,
147
- )
148
-
149
- def write_application_event(
150
- self,
151
- event: dict,
152
- kind: mm_schemas.WriterEventKind = mm_schemas.WriterEventKind.RESULT,
153
- ) -> None:
154
- """
155
- Write a single result or metric to TSDB.
156
- """
157
-
158
- table_name = (
159
- f"{event[mm_schemas.WriterEvent.ENDPOINT_ID]}_"
160
- f"{event[mm_schemas.WriterEvent.APPLICATION_NAME]}"
161
- )
162
-
163
- if kind == mm_schemas.WriterEventKind.RESULT:
164
- # Write a new result
165
- table = self.tables[mm_schemas.TDEngineSuperTables.APP_RESULTS]
166
- table_name = (
167
- f"{table_name}_{event[mm_schemas.ResultData.RESULT_NAME]}"
168
- ).replace("-", "_")
169
-
170
- else:
171
- # Write a new metric
172
- table = self.tables[mm_schemas.TDEngineSuperTables.METRICS]
173
- table_name = (
174
- f"{table_name}_{event[mm_schemas.MetricData.METRIC_NAME]}"
175
- ).replace("-", "_")
176
-
177
- # Escape the table name for case-sensitivity (ML-7908)
178
- # https://github.com/taosdata/taos-connector-python/issues/260
179
- table_name = f"`{table_name}`"
180
-
181
- # Convert the datetime strings to datetime objects
182
- event[mm_schemas.WriterEvent.END_INFER_TIME] = self._convert_to_datetime(
183
- val=event[mm_schemas.WriterEvent.END_INFER_TIME]
184
- )
185
- event[mm_schemas.WriterEvent.START_INFER_TIME] = self._convert_to_datetime(
186
- val=event[mm_schemas.WriterEvent.START_INFER_TIME]
187
- )
188
-
189
- create_table_sql = table._create_subtable_sql(subtable=table_name, values=event)
190
-
191
- # we need the string values to be sent to the connection, not the enum
192
- columns = {str(key): str(val) for key, val in table.columns.items()}
193
-
194
- insert_statement = Statement(
195
- columns=columns,
196
- subtable=table_name,
197
- values=event,
198
- timestamp_precision=self._timestamp_precision,
199
- )
200
-
201
- self.connection.run(
202
- statements=[
203
- create_table_sql,
204
- insert_statement,
205
- ],
206
- )
207
-
208
- @staticmethod
209
- def _convert_to_datetime(val: Union[str, datetime]) -> datetime:
210
- return datetime.fromisoformat(val) if isinstance(val, str) else val
211
-
212
- @staticmethod
213
- def _generate_filter_query(
214
- filter_column: str, filter_values: Union[str, list[Union[str, int]]]
215
- ) -> str:
216
- """
217
- Generate a filter query for TDEngine based on the provided column and values.
218
-
219
- :param filter_column: The column to filter by.
220
- :param filter_values: A single value or a list of values to filter by.
221
-
222
- :return: A string representing the filter query.
223
- :raise: ``MLRunValueError`` if the filter values are not of type string or list.
224
- """
225
- if isinstance(filter_values, str):
226
- return f"{filter_column}='{filter_values}'"
227
- elif isinstance(filter_values, list):
228
- return f"{filter_column} IN ({', '.join(repr(v) for v in filter_values)}) "
229
- else:
230
- raise mlrun.errors.MLRunValueError(
231
- f"Invalid filter values {filter_values}: must be a string or a list, "
232
- f"got {type(filter_values).__name__}; filter values: {filter_values}"
233
- )
234
-
235
- def _drop_database_query(self) -> str:
236
- return f"DROP DATABASE IF EXISTS {self.database};"
237
-
238
- def _get_table_name_query(self) -> str:
239
- return f"SELECT table_name FROM information_schema.ins_tables where db_name='{self.database}' LIMIT 1;"
240
-
241
- def apply_monitoring_stream_steps(self, graph, **kwarg):
242
- """
243
- Apply TSDB steps on the provided monitoring graph. Throughout these steps, the graph stores live data of
244
- different key metric dictionaries. This data is being used by the monitoring dashboards in
245
- grafana. At the moment, we store two types of data:
246
- - prediction latency.
247
- - custom metrics.
248
- """
249
-
250
- def apply_process_before_tsdb():
251
- graph.add_step(
252
- "mlrun.model_monitoring.db.tsdb.tdengine.stream_graph_steps.ProcessBeforeTDEngine",
253
- name="ProcessBeforeTDEngine",
254
- after="FilterNOP",
255
- )
256
-
257
- def apply_tdengine_target(name, after):
258
- graph.add_step(
259
- "mlrun.datastore.storeytargets.TDEngineStoreyTarget",
260
- name=name,
261
- after=after,
262
- url=f"ds://{self._tdengine_connection_profile.name}",
263
- supertable=self.tables[
264
- mm_schemas.TDEngineSuperTables.PREDICTIONS
265
- ].super_table,
266
- table_col=mm_schemas.EventFieldType.TABLE_COLUMN,
267
- time_col=mm_schemas.EventFieldType.TIME,
268
- database=self.database,
269
- columns=[
270
- mm_schemas.EventFieldType.LATENCY,
271
- mm_schemas.EventKeyMetrics.CUSTOM_METRICS,
272
- mm_schemas.EventFieldType.ESTIMATED_PREDICTION_COUNT,
273
- mm_schemas.EventFieldType.EFFECTIVE_SAMPLE_COUNT,
274
- ],
275
- tag_cols=[
276
- mm_schemas.EventFieldType.ENDPOINT_ID,
277
- ],
278
- max_events=1000,
279
- flush_after_seconds=30,
280
- )
281
-
282
- apply_process_before_tsdb()
283
- apply_tdengine_target(
284
- name="TDEngineTarget",
285
- after="ProcessBeforeTDEngine",
286
- )
287
-
288
- def add_pre_writer_steps(self, graph, after):
289
- return graph.add_step(
290
- "mlrun.model_monitoring.db.tsdb.tdengine.writer_graph_steps.ProcessBeforeTDEngine",
291
- name="ProcessBeforeTDEngine",
292
- after=after,
293
- )
294
-
295
- def apply_writer_steps(self, graph, after, **kwargs) -> None:
296
- graph.add_step(
297
- "mlrun.datastore.storeytargets.TDEngineStoreyTarget",
298
- name="tsdb_metrics",
299
- after=after,
300
- url=f"ds://{self._tdengine_connection_profile.name}",
301
- supertable=self.tables[mm_schemas.TDEngineSuperTables.METRICS].super_table,
302
- table_col=mm_schemas.EventFieldType.TABLE_COLUMN,
303
- time_col=mm_schemas.WriterEvent.END_INFER_TIME,
304
- database=self.database,
305
- graph_shape="cylinder",
306
- columns=[
307
- mm_schemas.WriterEvent.START_INFER_TIME,
308
- mm_schemas.MetricData.METRIC_VALUE,
309
- ],
310
- tag_cols=[
311
- mm_schemas.WriterEvent.ENDPOINT_ID,
312
- mm_schemas.WriterEvent.APPLICATION_NAME,
313
- mm_schemas.MetricData.METRIC_NAME,
314
- ],
315
- max_events=config.model_endpoint_monitoring.writer_graph.max_events,
316
- flush_after_seconds=config.model_endpoint_monitoring.writer_graph.flush_after_seconds,
317
- )
318
-
319
- graph.add_step(
320
- "mlrun.datastore.storeytargets.TDEngineStoreyTarget",
321
- name="tsdb_app_results",
322
- after=after,
323
- url=f"ds://{self._tdengine_connection_profile.name}",
324
- supertable=self.tables[
325
- mm_schemas.TDEngineSuperTables.APP_RESULTS
326
- ].super_table,
327
- table_col=mm_schemas.EventFieldType.TABLE_COLUMN,
328
- time_col=mm_schemas.WriterEvent.END_INFER_TIME,
329
- database=self.database,
330
- graph_shape="cylinder",
331
- columns=[
332
- mm_schemas.WriterEvent.START_INFER_TIME,
333
- mm_schemas.ResultData.RESULT_VALUE,
334
- mm_schemas.ResultData.RESULT_STATUS,
335
- mm_schemas.ResultData.RESULT_EXTRA_DATA,
336
- ],
337
- tag_cols=[
338
- mm_schemas.WriterEvent.ENDPOINT_ID,
339
- mm_schemas.WriterEvent.APPLICATION_NAME,
340
- mm_schemas.ResultData.RESULT_NAME,
341
- mm_schemas.ResultData.RESULT_KIND,
342
- ],
343
- max_events=config.model_endpoint_monitoring.writer_graph.max_events,
344
- flush_after_seconds=config.model_endpoint_monitoring.writer_graph.flush_after_seconds,
345
- )
346
-
347
- def handle_model_error(
348
- self,
349
- graph,
350
- tsdb_batching_max_events: int = 1000,
351
- tsdb_batching_timeout_secs: int = 30,
352
- **kwargs,
353
- ) -> None:
354
- graph.add_step(
355
- "mlrun.model_monitoring.db.tsdb.tdengine.stream_graph_steps.ErrorExtractor",
356
- name="error_extractor",
357
- after="ForwardError",
358
- )
359
- graph.add_step(
360
- "mlrun.datastore.storeytargets.TDEngineStoreyTarget",
361
- name="tsdb_error",
362
- after="error_extractor",
363
- url=f"ds://{self._tdengine_connection_profile.name}",
364
- supertable=self.tables[mm_schemas.TDEngineSuperTables.ERRORS].super_table,
365
- table_col=mm_schemas.EventFieldType.TABLE_COLUMN,
366
- time_col=mm_schemas.EventFieldType.TIME,
367
- database=self.database,
368
- columns=[
369
- mm_schemas.EventFieldType.MODEL_ERROR,
370
- ],
371
- tag_cols=[
372
- mm_schemas.EventFieldType.ENDPOINT_ID,
373
- mm_schemas.EventFieldType.ERROR_TYPE,
374
- ],
375
- max_events=tsdb_batching_max_events,
376
- flush_after_seconds=tsdb_batching_timeout_secs,
377
- )
378
-
379
- def delete_tsdb_records(self, endpoint_ids: list[str]) -> None:
380
- """
381
- To delete subtables within TDEngine, we first query the subtables names with the provided endpoint_ids.
382
- Then, we drop each subtable.
383
- """
384
- logger.debug(
385
- "Deleting model endpoint resources using the TDEngine connector",
386
- project=self.project,
387
- number_of_endpoints_to_delete=len(endpoint_ids),
388
- )
389
-
390
- # Get all subtables with the provided endpoint_ids
391
- subtables = []
392
- try:
393
- for table in self.tables:
394
- get_subtable_query = self.tables[table]._get_subtables_query_by_tag(
395
- filter_tag="endpoint_id", filter_values=endpoint_ids
396
- )
397
- subtables_result = self.connection.run(query=get_subtable_query)
398
- subtables.extend([subtable[0] for subtable in subtables_result.data])
399
- except Exception as e:
400
- logger.warning(
401
- "Failed to get subtables for deletion. You may need to delete them manually."
402
- "These can be found under the following supertables: app_results, "
403
- "metrics, errors, and predictions.",
404
- project=self.project,
405
- error=mlrun.errors.err_to_str(e),
406
- )
407
-
408
- # Prepare the drop statements
409
- drop_statements = [
410
- self.tables[table].drop_subtable_query(subtable=subtable)
411
- for subtable in subtables
412
- ]
413
- try:
414
- logger.debug("Dropping subtables", drop_statements=drop_statements)
415
- self.connection.run(statements=drop_statements)
416
- except Exception as e:
417
- logger.warning(
418
- "Failed to delete model endpoint resources. You may need to delete them manually. "
419
- "These can be found under the following supertables: app_results, "
420
- "metrics, errors, and predictions.",
421
- project=self.project,
422
- error=mlrun.errors.err_to_str(e),
423
- )
424
- logger.debug(
425
- "Deleted all model endpoint resources using the TDEngine connector",
426
- project=self.project,
427
- number_of_endpoints_to_delete=len(endpoint_ids),
428
- )
429
-
430
- def delete_application_records(
431
- self, application_name: str, endpoint_ids: Optional[list[str]] = None
432
- ) -> None:
433
- """
434
- Delete application records from the TSDB for the given model endpoints or all if ``endpoint_ids`` is ``None``.
435
- """
436
- logger.debug(
437
- "Deleting application records",
438
- project=self.project,
439
- application_name=application_name,
440
- endpoint_ids=endpoint_ids,
441
- )
442
- tables = [
443
- self.tables[mm_schemas.TDEngineSuperTables.APP_RESULTS],
444
- self.tables[mm_schemas.TDEngineSuperTables.METRICS],
445
- ]
446
-
447
- filter_query = self._generate_filter_query(
448
- filter_column=mm_schemas.ApplicationEvent.APPLICATION_NAME,
449
- filter_values=application_name,
450
- )
451
- if endpoint_ids:
452
- endpoint_ids_filter = self._generate_filter_query(
453
- filter_column=mm_schemas.EventFieldType.ENDPOINT_ID,
454
- filter_values=endpoint_ids,
455
- )
456
- filter_query += f" AND {endpoint_ids_filter}"
457
-
458
- drop_statements: list[str] = []
459
- for table in tables:
460
- get_subtable_query = table._get_tables_query_by_condition(filter_query)
461
- subtables_result = self.connection.run(query=get_subtable_query)
462
- drop_statements.extend(
463
- [
464
- table.drop_subtable_query(subtable=subtable[0])
465
- for subtable in subtables_result.data
466
- ]
467
- )
468
-
469
- logger.debug("Dropping application records", drop_statements=drop_statements)
470
- self.connection.run(statements=drop_statements)
471
-
472
- def delete_tsdb_resources(self):
473
- """
474
- Delete all project resources in the TSDB connector, such as model endpoints data and drift results.
475
- """
476
- logger.debug(
477
- "Deleting all project resources using the TDEngine connector",
478
- project=self.project,
479
- )
480
- drop_statements = []
481
- for table in self.tables:
482
- drop_statements.append(self.tables[table].drop_supertable_query())
483
-
484
- try:
485
- self.connection.run(
486
- statements=drop_statements,
487
- )
488
- except Exception as e:
489
- logger.warning(
490
- "Failed to drop TDEngine tables. You may need to drop them manually. "
491
- "These can be found under the following supertables: app_results, "
492
- "metrics, errors, and predictions.",
493
- project=self.project,
494
- error=mlrun.errors.err_to_str(e),
495
- )
496
- logger.debug(
497
- "Deleted all project resources using the TDEngine connector",
498
- project=self.project,
499
- )
500
-
501
- # Check if database is empty and if so, drop it
502
- self._drop_database_if_empty()
503
-
504
- def _drop_database_if_empty(self):
505
- query_random_table_name = self._get_table_name_query()
506
- drop_database = False
507
- try:
508
- table_name = self.connection.run(
509
- query=query_random_table_name,
510
- )
511
- if len(table_name.data) == 0:
512
- # no tables were found under the database
513
- drop_database = True
514
-
515
- except Exception as e:
516
- logger.warning(
517
- "Failed to query tables in the database. You may need to drop the database manually if it is empty.",
518
- project=self.project,
519
- error=mlrun.errors.err_to_str(e),
520
- )
521
-
522
- if drop_database:
523
- logger.debug(
524
- "Going to drop the TDEngine database",
525
- project=self.project,
526
- database=self.database,
527
- )
528
- drop_database_query = self._drop_database_query()
529
- try:
530
- self.connection.run(
531
- statements=drop_database_query,
532
- )
533
- logger.debug(
534
- "The TDEngine database has been successfully dropped",
535
- project=self.project,
536
- database=self.database,
537
- )
538
-
539
- except Exception as e:
540
- logger.warning(
541
- "Failed to drop the database. You may need to drop it manually if it is empty.",
542
- project=self.project,
543
- error=mlrun.errors.err_to_str(e),
544
- )
545
-
546
- def get_model_endpoint_real_time_metrics(
547
- self,
548
- endpoint_id: str,
549
- metrics: list[str],
550
- start: str,
551
- end: str,
552
- ) -> dict[str, list[tuple[str, float]]]:
553
- # Not implemented, use get_records() instead
554
- pass
555
-
556
- def _get_records(
557
- self,
558
- table: str,
559
- start: datetime,
560
- end: datetime,
561
- columns: Optional[list[str]] = None,
562
- filter_query: Optional[str] = None,
563
- interval: Optional[str] = None,
564
- agg_funcs: Optional[list] = None,
565
- limit: Optional[int] = None,
566
- sliding_window_step: Optional[str] = None,
567
- timestamp_column: str = mm_schemas.EventFieldType.TIME,
568
- group_by: Optional[Union[list[str], str]] = None,
569
- preform_agg_columns: Optional[list] = None,
570
- order_by: Optional[str] = None,
571
- desc: Optional[bool] = None,
572
- partition_by: Optional[str] = None,
573
- ) -> pd.DataFrame:
574
- """
575
- Getting records from TSDB data collection.
576
- :param table: Either a supertable or a subtable name.
577
- :param start: The start time of the metrics.
578
- :param end: The end time of the metrics.
579
- :param columns: Columns to include in the result.
580
- :param filter_query: Optional filter expression as a string. TDengine supports SQL-like syntax.
581
- :param interval: The interval to aggregate the data by. Note that if interval is provided,
582
- `agg_funcs` must bg provided as well. Provided as a string in the format of '1m',
583
- '1h', etc.
584
- :param agg_funcs: The aggregation functions to apply on the columns. Note that if `agg_funcs` is
585
- provided, `interval` must bg provided as well. Provided as a list of strings in
586
- the format of ['sum', 'avg', 'count', ...].
587
- :param limit: The maximum number of records to return.
588
- :param sliding_window_step: The time step for which the time window moves forward. Note that if
589
- `sliding_window_step` is provided, interval must be provided as well. Provided
590
- as a string in the format of '1m', '1h', etc.
591
- :param timestamp_column: The column name that holds the timestamp index.
592
- :param group_by: The column name to group by. Note that if `group_by` is provided, aggregation
593
- functions must bg provided
594
- :param preform_agg_columns: The columns to preform aggregation on.
595
- notice that all aggregation functions provided will preform on those columns.
596
- If not provided The default behavior is to preform on all columns in columns,
597
- if an empty list was provided The aggregation won't be performed.
598
- :param order_by: The column or alias to preform ordering on the query.
599
- :param desc: Whether or not to sort the results in descending order.
600
- :param partition_by: The column to partition the results by. Note that if interval is provided,
601
- `agg_funcs` must bg provided as well.
602
-
603
- :return: DataFrame with the provided attributes from the data collection.
604
- :raise: MLRunInvalidArgumentError if query the provided table failed.
605
- """
606
-
607
- full_query = tdengine_schemas.TDEngineSchema._get_records_query(
608
- table=table,
609
- start=start,
610
- end=end,
611
- columns_to_filter=columns,
612
- filter_query=filter_query,
613
- interval=interval,
614
- limit=limit,
615
- agg_funcs=agg_funcs,
616
- sliding_window_step=sliding_window_step,
617
- timestamp_column=timestamp_column,
618
- database=self.database,
619
- group_by=group_by,
620
- preform_agg_funcs_columns=preform_agg_columns,
621
- order_by=order_by,
622
- desc=desc,
623
- partition_by=partition_by,
624
- )
625
- logger.debug("Querying TDEngine", query=full_query)
626
- try:
627
- query_result = self.connection.run(
628
- query=full_query,
629
- )
630
- except taosws.QueryError as e:
631
- raise mlrun.errors.MLRunInvalidArgumentError(
632
- f"Failed to query table {table} in database {self.database}, {str(e)}"
633
- )
634
-
635
- df_columns = [field.name for field in query_result.fields]
636
- return pd.DataFrame(query_result.data, columns=df_columns)
637
-
638
- def read_metrics_data(
639
- self,
640
- *,
641
- endpoint_id: str,
642
- start: datetime,
643
- end: datetime,
644
- metrics: list[mm_schemas.ModelEndpointMonitoringMetric],
645
- type: Literal["metrics", "results"],
646
- with_result_extra_data: bool = False,
647
- ) -> Union[
648
- list[
649
- Union[
650
- mm_schemas.ModelEndpointMonitoringResultValues,
651
- mm_schemas.ModelEndpointMonitoringMetricNoData,
652
- ],
653
- ],
654
- list[
655
- Union[
656
- mm_schemas.ModelEndpointMonitoringMetricValues,
657
- mm_schemas.ModelEndpointMonitoringMetricNoData,
658
- ],
659
- ],
660
- ]:
661
- timestamp_column = mm_schemas.WriterEvent.END_INFER_TIME
662
- columns = [timestamp_column, mm_schemas.WriterEvent.APPLICATION_NAME]
663
- if type == "metrics":
664
- if with_result_extra_data:
665
- logger.warning(
666
- "The 'with_result_extra_data' parameter is not supported for metrics, just for results",
667
- project=self.project,
668
- endpoint_id=endpoint_id,
669
- )
670
- table = self.tables[mm_schemas.TDEngineSuperTables.METRICS].super_table
671
- name = mm_schemas.MetricData.METRIC_NAME
672
- columns += [name, mm_schemas.MetricData.METRIC_VALUE]
673
- df_handler = self.df_to_metrics_values
674
- elif type == "results":
675
- table = self.tables[mm_schemas.TDEngineSuperTables.APP_RESULTS].super_table
676
- name = mm_schemas.ResultData.RESULT_NAME
677
- columns += [
678
- name,
679
- mm_schemas.ResultData.RESULT_VALUE,
680
- mm_schemas.ResultData.RESULT_STATUS,
681
- mm_schemas.ResultData.RESULT_KIND,
682
- ]
683
- if with_result_extra_data:
684
- columns.append(mm_schemas.ResultData.RESULT_EXTRA_DATA)
685
- df_handler = self.df_to_results_values
686
- else:
687
- raise mlrun.errors.MLRunInvalidArgumentError(
688
- f"Invalid type {type}, must be either 'metrics' or 'results'."
689
- )
690
-
691
- metrics_condition = " OR ".join(
692
- [
693
- f"({mm_schemas.WriterEvent.APPLICATION_NAME}='{metric.app}' AND {name}='{metric.name}')"
694
- for metric in metrics
695
- ]
696
- )
697
- filter_query = f"(endpoint_id='{endpoint_id}') AND ({metrics_condition})"
698
-
699
- df = self._get_records(
700
- table=table,
701
- start=start,
702
- end=end,
703
- filter_query=filter_query,
704
- timestamp_column=timestamp_column,
705
- columns=columns,
706
- )
707
-
708
- df[mm_schemas.WriterEvent.END_INFER_TIME] = pd.to_datetime(
709
- df[mm_schemas.WriterEvent.END_INFER_TIME]
710
- )
711
- df.set_index(mm_schemas.WriterEvent.END_INFER_TIME, inplace=True)
712
-
713
- logger.debug(
714
- "Converting a DataFrame to a list of metrics or results values",
715
- table=table,
716
- project=self.project,
717
- endpoint_id=endpoint_id,
718
- is_empty=df.empty,
719
- )
720
-
721
- if not with_result_extra_data and type == "results":
722
- # Set the extra data to an empty string if it's not requested
723
- df[mm_schemas.ResultData.RESULT_EXTRA_DATA] = ""
724
-
725
- return df_handler(df=df, metrics=metrics, project=self.project)
726
-
727
- def read_predictions(
728
- self,
729
- *,
730
- endpoint_id: str,
731
- start: datetime,
732
- end: datetime,
733
- aggregation_window: Optional[str] = None,
734
- agg_funcs: Optional[list] = None,
735
- limit: Optional[int] = None,
736
- ) -> Union[
737
- mm_schemas.ModelEndpointMonitoringMetricValues,
738
- mm_schemas.ModelEndpointMonitoringMetricNoData,
739
- ]:
740
- if (agg_funcs and not aggregation_window) or (
741
- aggregation_window and not agg_funcs
742
- ):
743
- raise mlrun.errors.MLRunInvalidArgumentError(
744
- "both or neither of `aggregation_window` and `agg_funcs` must be provided"
745
- )
746
- df = self._get_records(
747
- table=self.tables[mm_schemas.TDEngineSuperTables.PREDICTIONS].super_table,
748
- start=start,
749
- end=end,
750
- columns=[mm_schemas.EventFieldType.ESTIMATED_PREDICTION_COUNT],
751
- filter_query=f"endpoint_id='{endpoint_id}'",
752
- agg_funcs=agg_funcs,
753
- interval=aggregation_window,
754
- limit=limit,
755
- )
756
-
757
- full_name = get_invocations_fqn(self.project)
758
-
759
- if df.empty:
760
- return mm_schemas.ModelEndpointMonitoringMetricNoData(
761
- full_name=full_name,
762
- type=mm_schemas.ModelEndpointMonitoringMetricType.METRIC,
763
- )
764
-
765
- if aggregation_window:
766
- # _wend column, which represents the end time of each window, will be used as the time index
767
- df["_wend"] = pd.to_datetime(df["_wend"])
768
- df.set_index("_wend", inplace=True)
769
-
770
- estimated_prediction_count = (
771
- f"{agg_funcs[0]}({mm_schemas.EventFieldType.ESTIMATED_PREDICTION_COUNT})"
772
- if agg_funcs
773
- else mm_schemas.EventFieldType.ESTIMATED_PREDICTION_COUNT
774
- )
775
-
776
- return mm_schemas.ModelEndpointMonitoringMetricValues(
777
- full_name=full_name,
778
- values=list(
779
- zip(
780
- df.index,
781
- df[estimated_prediction_count],
782
- )
783
- ), # pyright: ignore[reportArgumentType]
784
- )
785
-
786
- def get_last_request(
787
- self,
788
- endpoint_ids: Union[str, list[str]],
789
- start: Optional[datetime] = None,
790
- end: Optional[datetime] = None,
791
- ) -> Union[pd.DataFrame, dict[str, float]]:
792
- if not endpoint_ids:
793
- return {}
794
- filter_query = self._generate_filter_query(
795
- filter_column=mm_schemas.EventFieldType.ENDPOINT_ID,
796
- filter_values=endpoint_ids,
797
- )
798
- start, end = get_start_end(start, end)
799
- df = self._get_records(
800
- table=self.tables[mm_schemas.TDEngineSuperTables.PREDICTIONS].super_table,
801
- start=start,
802
- end=end,
803
- columns=[
804
- mm_schemas.EventFieldType.ENDPOINT_ID,
805
- mm_schemas.EventFieldType.TIME,
806
- mm_schemas.EventFieldType.LATENCY,
807
- ],
808
- filter_query=filter_query,
809
- timestamp_column=mm_schemas.EventFieldType.TIME,
810
- agg_funcs=["last"],
811
- group_by=mm_schemas.EventFieldType.ENDPOINT_ID,
812
- preform_agg_columns=[mm_schemas.EventFieldType.TIME],
813
- )
814
- if not df.empty:
815
- df.dropna(inplace=True)
816
- df.rename(
817
- columns={
818
- f"last({mm_schemas.EventFieldType.TIME})": mm_schemas.EventFieldType.LAST_REQUEST,
819
- f"{mm_schemas.EventFieldType.LATENCY}": "last_latency",
820
- },
821
- inplace=True,
822
- )
823
- df[mm_schemas.EventFieldType.LAST_REQUEST] = pd.to_datetime(
824
- df[mm_schemas.EventFieldType.LAST_REQUEST],
825
- errors="coerce",
826
- format="ISO8601",
827
- utc=True,
828
- )
829
- return df
830
-
831
- def get_drift_status(
832
- self,
833
- endpoint_ids: Union[str, list[str]],
834
- start: Optional[datetime] = None,
835
- end: Optional[datetime] = None,
836
- get_raw: bool = False,
837
- ) -> pd.DataFrame:
838
- filter_query = self._generate_filter_query(
839
- filter_column=mm_schemas.EventFieldType.ENDPOINT_ID,
840
- filter_values=endpoint_ids,
841
- )
842
- start = start or (mlrun.utils.datetime_now() - timedelta(hours=24))
843
- start, end = get_start_end(start, end)
844
- df = self._get_records(
845
- table=self.tables[mm_schemas.TDEngineSuperTables.APP_RESULTS].super_table,
846
- start=start,
847
- end=end,
848
- columns=[
849
- mm_schemas.ResultData.RESULT_STATUS,
850
- mm_schemas.EventFieldType.ENDPOINT_ID,
851
- ],
852
- filter_query=filter_query,
853
- timestamp_column=mm_schemas.WriterEvent.END_INFER_TIME,
854
- agg_funcs=["max"],
855
- group_by=mm_schemas.EventFieldType.ENDPOINT_ID,
856
- preform_agg_columns=[mm_schemas.ResultData.RESULT_STATUS],
857
- )
858
- df.rename(
859
- columns={
860
- f"max({mm_schemas.ResultData.RESULT_STATUS})": mm_schemas.ResultData.RESULT_STATUS
861
- },
862
- inplace=True,
863
- )
864
- if not df.empty:
865
- df.dropna(inplace=True)
866
- return df
867
-
868
- def count_results_by_status(
869
- self,
870
- start: Optional[Union[datetime, str]] = None,
871
- end: Optional[Union[datetime, str]] = None,
872
- endpoint_ids: Optional[Union[str, list[str]]] = None,
873
- application_names: Optional[Union[str, list[str]]] = None,
874
- result_status_list: Optional[list[int]] = None,
875
- ) -> dict[tuple[str, int], int]:
876
- filter_query = ""
877
-
878
- start, end = get_start_end(start=start, end=end, delta=timedelta(hours=24))
879
-
880
- if endpoint_ids:
881
- filter_query = self._generate_filter_query(
882
- filter_column=mm_schemas.EventFieldType.ENDPOINT_ID,
883
- filter_values=endpoint_ids,
884
- )
885
- if application_names:
886
- app_filter_query = self._generate_filter_query(
887
- filter_column=mm_schemas.ApplicationEvent.APPLICATION_NAME,
888
- filter_values=application_names,
889
- )
890
- if filter_query:
891
- filter_query += f" AND {app_filter_query}"
892
- else:
893
- filter_query = app_filter_query
894
- if result_status_list:
895
- status_filter_query = self._generate_filter_query(
896
- filter_column=mm_schemas.ResultData.RESULT_STATUS,
897
- filter_values=result_status_list,
898
- )
899
- if filter_query:
900
- filter_query += f" AND {status_filter_query}"
901
- else:
902
- filter_query = status_filter_query
903
-
904
- df = self._get_records(
905
- table=self.tables[mm_schemas.TDEngineSuperTables.APP_RESULTS].super_table,
906
- start=start,
907
- end=end,
908
- columns=[
909
- mm_schemas.WriterEvent.APPLICATION_NAME,
910
- mm_schemas.ResultData.RESULT_STATUS,
911
- mm_schemas.ResultData.RESULT_VALUE,
912
- ],
913
- filter_query=filter_query,
914
- timestamp_column=mm_schemas.WriterEvent.END_INFER_TIME,
915
- group_by=[
916
- mm_schemas.WriterEvent.APPLICATION_NAME,
917
- mm_schemas.ResultData.RESULT_STATUS,
918
- ],
919
- agg_funcs=["count"],
920
- preform_agg_columns=[mm_schemas.ResultData.RESULT_VALUE],
921
- )
922
- if df.empty:
923
- return {}
924
-
925
- # Convert DataFrame to a dictionary
926
- return {
927
- (
928
- row[mm_schemas.WriterEvent.APPLICATION_NAME],
929
- row[mm_schemas.ResultData.RESULT_STATUS],
930
- ): row["count(result_value)"]
931
- for _, row in df.iterrows()
932
- }
933
-
934
- def count_processed_model_endpoints(
935
- self,
936
- start: Optional[Union[datetime, str]] = None,
937
- end: Optional[Union[datetime, str]] = None,
938
- application_names: Optional[Union[str, list[str]]] = None,
939
- ) -> dict:
940
- filter_query = ""
941
- start, end = get_start_end(start=start, end=end, delta=timedelta(hours=24))
942
-
943
- if application_names:
944
- filter_query = self._generate_filter_query(
945
- filter_column=mm_schemas.WriterEvent.APPLICATION_NAME,
946
- filter_values=application_names,
947
- )
948
-
949
- def get_application_endpoints_records(super_table: str) -> pd.DataFrame:
950
- return self._get_records(
951
- table=super_table,
952
- start=start,
953
- end=end,
954
- timestamp_column=mm_schemas.WriterEvent.END_INFER_TIME,
955
- columns=[
956
- mm_schemas.WriterEvent.APPLICATION_NAME,
957
- mm_schemas.EventFieldType.ENDPOINT_ID,
958
- ],
959
- filter_query=filter_query,
960
- group_by=[
961
- mm_schemas.WriterEvent.APPLICATION_NAME,
962
- mm_schemas.EventFieldType.ENDPOINT_ID,
963
- ],
964
- preform_agg_columns=[mm_schemas.ResultData.RESULT_VALUE],
965
- agg_funcs=["last"],
966
- )
967
-
968
- df_results = get_application_endpoints_records(
969
- super_table=self.tables[
970
- mm_schemas.TDEngineSuperTables.APP_RESULTS
971
- ].super_table
972
- )
973
- df_metrics = get_application_endpoints_records(
974
- super_table=self.tables[mm_schemas.TDEngineSuperTables.METRICS].super_table
975
- )
976
-
977
- combined_df = pd.concat([df_results, df_metrics]).drop_duplicates()
978
-
979
- if combined_df.empty:
980
- return {}
981
- grouped_df = combined_df.groupby(
982
- mm_schemas.WriterEvent.APPLICATION_NAME
983
- ).count()
984
-
985
- # Convert DataFrame to a dictionary
986
- return grouped_df[mm_schemas.WriterEvent.ENDPOINT_ID].to_dict()
987
-
988
- def calculate_latest_metrics(
989
- self,
990
- start: Optional[Union[datetime, str]] = None,
991
- end: Optional[Union[datetime, str]] = None,
992
- application_names: Optional[Union[str, list[str]]] = None,
993
- ) -> list[
994
- Union[mm_schemas.ApplicationResultRecord, mm_schemas.ApplicationMetricRecord]
995
- ]:
996
- metric_list = []
997
- filter_query = ""
998
- start, end = get_start_end(start=start, end=end, delta=timedelta(hours=24))
999
-
1000
- if application_names:
1001
- filter_query = self._generate_filter_query(
1002
- filter_column=mm_schemas.WriterEvent.APPLICATION_NAME,
1003
- filter_values=application_names,
1004
- )
1005
-
1006
- def get_latest_metrics_records(
1007
- record_type: Literal["metrics", "results"],
1008
- ) -> pd.DataFrame:
1009
- columns = [
1010
- mm_schemas.WriterEvent.END_INFER_TIME,
1011
- mm_schemas.WriterEvent.APPLICATION_NAME,
1012
- ]
1013
- if record_type == "results":
1014
- table = self.tables[
1015
- mm_schemas.TDEngineSuperTables.APP_RESULTS
1016
- ].super_table
1017
- columns += [
1018
- mm_schemas.ResultData.RESULT_NAME,
1019
- mm_schemas.ResultData.RESULT_VALUE,
1020
- mm_schemas.ResultData.RESULT_STATUS,
1021
- mm_schemas.ResultData.RESULT_KIND,
1022
- ]
1023
- agg_column = mm_schemas.ResultData.RESULT_VALUE
1024
- else:
1025
- table = self.tables[mm_schemas.TDEngineSuperTables.METRICS].super_table
1026
- columns += [
1027
- mm_schemas.MetricData.METRIC_NAME,
1028
- mm_schemas.MetricData.METRIC_VALUE,
1029
- ]
1030
- agg_column = mm_schemas.MetricData.METRIC_VALUE
1031
-
1032
- return self._get_records(
1033
- table=table,
1034
- start=start,
1035
- end=end,
1036
- columns=columns,
1037
- filter_query=filter_query,
1038
- timestamp_column=mm_schemas.WriterEvent.END_INFER_TIME,
1039
- # Aggregate per application/metric pair regardless of timestamp
1040
- group_by=columns[1:],
1041
- preform_agg_columns=[agg_column],
1042
- agg_funcs=["last"],
1043
- )
1044
-
1045
- df_results = get_latest_metrics_records(record_type="results")
1046
- df_metrics = get_latest_metrics_records(record_type="metrics")
1047
-
1048
- if df_results.empty and df_metrics.empty:
1049
- return metric_list
1050
-
1051
- def build_metric_objects() -> (
1052
- list[
1053
- Union[
1054
- mm_schemas.ApplicationResultRecord,
1055
- mm_schemas.ApplicationMetricRecord,
1056
- ]
1057
- ]
1058
- ):
1059
- metric_objects = []
1060
-
1061
- if not df_results.empty:
1062
- df_results.rename(
1063
- columns={
1064
- f"last({mm_schemas.ResultData.RESULT_VALUE})": mm_schemas.ResultData.RESULT_VALUE,
1065
- },
1066
- inplace=True,
1067
- )
1068
- for _, row in df_results.iterrows():
1069
- metric_objects.append(
1070
- mm_schemas.ApplicationResultRecord(
1071
- time=datetime.fromisoformat(
1072
- row[mm_schemas.WriterEvent.END_INFER_TIME]
1073
- ),
1074
- result_name=row[mm_schemas.ResultData.RESULT_NAME],
1075
- kind=row[mm_schemas.ResultData.RESULT_KIND],
1076
- status=row[mm_schemas.ResultData.RESULT_STATUS],
1077
- value=row[mm_schemas.ResultData.RESULT_VALUE],
1078
- )
1079
- )
1080
-
1081
- if not df_metrics.empty:
1082
- df_metrics.rename(
1083
- columns={
1084
- f"last({mm_schemas.MetricData.METRIC_VALUE})": mm_schemas.MetricData.METRIC_VALUE,
1085
- },
1086
- inplace=True,
1087
- )
1088
- for _, row in df_metrics.iterrows():
1089
- metric_objects.append(
1090
- mm_schemas.ApplicationMetricRecord(
1091
- time=datetime.fromisoformat(
1092
- row[mm_schemas.WriterEvent.END_INFER_TIME]
1093
- ),
1094
- metric_name=row[mm_schemas.MetricData.METRIC_NAME],
1095
- value=row[mm_schemas.MetricData.METRIC_VALUE],
1096
- )
1097
- )
1098
-
1099
- return metric_objects
1100
-
1101
- return build_metric_objects()
1102
-
1103
- def get_metrics_metadata(
1104
- self,
1105
- endpoint_id: Union[str, list[str]],
1106
- start: Optional[datetime] = None,
1107
- end: Optional[datetime] = None,
1108
- ) -> pd.DataFrame:
1109
- start, end = get_start_end(start, end)
1110
- df = self._get_records(
1111
- table=self.tables[mm_schemas.TDEngineSuperTables.METRICS].super_table,
1112
- start=start,
1113
- end=end,
1114
- columns=[
1115
- mm_schemas.ApplicationEvent.APPLICATION_NAME,
1116
- mm_schemas.MetricData.METRIC_NAME,
1117
- mm_schemas.EventFieldType.ENDPOINT_ID,
1118
- ],
1119
- filter_query=self._generate_filter_query(
1120
- filter_column=mm_schemas.EventFieldType.ENDPOINT_ID,
1121
- filter_values=endpoint_id,
1122
- ),
1123
- timestamp_column=mm_schemas.WriterEvent.END_INFER_TIME,
1124
- group_by=[
1125
- mm_schemas.WriterEvent.APPLICATION_NAME,
1126
- mm_schemas.MetricData.METRIC_NAME,
1127
- mm_schemas.EventFieldType.ENDPOINT_ID,
1128
- ],
1129
- agg_funcs=["last"],
1130
- )
1131
- df.rename(
1132
- columns={
1133
- f"last({mm_schemas.ApplicationEvent.APPLICATION_NAME})": mm_schemas.ApplicationEvent.APPLICATION_NAME,
1134
- f"last({mm_schemas.MetricData.METRIC_NAME})": mm_schemas.MetricData.METRIC_NAME,
1135
- f"last({mm_schemas.EventFieldType.ENDPOINT_ID})": mm_schemas.EventFieldType.ENDPOINT_ID,
1136
- },
1137
- inplace=True,
1138
- )
1139
- if not df.empty:
1140
- df.dropna(inplace=True)
1141
- return df
1142
-
1143
- def get_results_metadata(
1144
- self,
1145
- endpoint_id: Union[str, list[str]],
1146
- start: Optional[datetime] = None,
1147
- end: Optional[datetime] = None,
1148
- ) -> pd.DataFrame:
1149
- start, end = get_start_end(start, end)
1150
- df = self._get_records(
1151
- table=self.tables[mm_schemas.TDEngineSuperTables.APP_RESULTS].super_table,
1152
- start=start,
1153
- end=end,
1154
- columns=[
1155
- mm_schemas.ApplicationEvent.APPLICATION_NAME,
1156
- mm_schemas.ResultData.RESULT_NAME,
1157
- mm_schemas.ResultData.RESULT_KIND,
1158
- mm_schemas.EventFieldType.ENDPOINT_ID,
1159
- ],
1160
- filter_query=self._generate_filter_query(
1161
- filter_column=mm_schemas.EventFieldType.ENDPOINT_ID,
1162
- filter_values=endpoint_id,
1163
- ),
1164
- timestamp_column=mm_schemas.WriterEvent.END_INFER_TIME,
1165
- group_by=[
1166
- mm_schemas.WriterEvent.APPLICATION_NAME,
1167
- mm_schemas.ResultData.RESULT_NAME,
1168
- mm_schemas.EventFieldType.ENDPOINT_ID,
1169
- ],
1170
- agg_funcs=["last"],
1171
- )
1172
- df.rename(
1173
- columns={
1174
- f"last({mm_schemas.ApplicationEvent.APPLICATION_NAME})": mm_schemas.ApplicationEvent.APPLICATION_NAME,
1175
- f"last({mm_schemas.ResultData.RESULT_NAME})": mm_schemas.ResultData.RESULT_NAME,
1176
- f"last({mm_schemas.ResultData.RESULT_KIND})": mm_schemas.ResultData.RESULT_KIND,
1177
- f"last({mm_schemas.EventFieldType.ENDPOINT_ID})": mm_schemas.EventFieldType.ENDPOINT_ID,
1178
- },
1179
- inplace=True,
1180
- )
1181
- if not df.empty:
1182
- df.dropna(inplace=True)
1183
- return df
1184
-
1185
- def get_error_count(
1186
- self,
1187
- endpoint_ids: Union[str, list[str]],
1188
- start: Optional[datetime] = None,
1189
- end: Optional[datetime] = None,
1190
- get_raw: bool = False,
1191
- ) -> pd.DataFrame:
1192
- filter_query = self._generate_filter_query(
1193
- filter_column=mm_schemas.EventFieldType.ENDPOINT_ID,
1194
- filter_values=endpoint_ids,
1195
- )
1196
- filter_query += f"AND {mm_schemas.EventFieldType.ERROR_TYPE} = '{mm_schemas.EventFieldType.INFER_ERROR}'"
1197
- start, end = get_start_end(start, end)
1198
- df = self._get_records(
1199
- table=self.tables[mm_schemas.TDEngineSuperTables.ERRORS].super_table,
1200
- start=start,
1201
- end=end,
1202
- columns=[
1203
- mm_schemas.EventFieldType.MODEL_ERROR,
1204
- mm_schemas.EventFieldType.ENDPOINT_ID,
1205
- ],
1206
- agg_funcs=["count"],
1207
- filter_query=filter_query,
1208
- group_by=mm_schemas.EventFieldType.ENDPOINT_ID,
1209
- preform_agg_columns=[mm_schemas.EventFieldType.MODEL_ERROR],
1210
- )
1211
- df.rename(
1212
- columns={f"count({mm_schemas.EventFieldType.MODEL_ERROR})": "error_count"},
1213
- inplace=True,
1214
- )
1215
- if not df.empty:
1216
- df.dropna(inplace=True)
1217
- return df
1218
-
1219
- def get_avg_latency(
1220
- self,
1221
- endpoint_ids: Union[str, list[str]],
1222
- start: Optional[datetime] = None,
1223
- end: Optional[datetime] = None,
1224
- get_raw: bool = False,
1225
- ) -> pd.DataFrame:
1226
- endpoint_ids = (
1227
- endpoint_ids if isinstance(endpoint_ids, list) else [endpoint_ids]
1228
- )
1229
- start, end = get_start_end(start, end, delta=timedelta(hours=24))
1230
- df = self._get_records(
1231
- table=self.tables[mm_schemas.TDEngineSuperTables.PREDICTIONS].super_table,
1232
- start=start,
1233
- end=end,
1234
- columns=[
1235
- mm_schemas.EventFieldType.LATENCY,
1236
- mm_schemas.EventFieldType.ENDPOINT_ID,
1237
- ],
1238
- agg_funcs=["avg"],
1239
- filter_query=f"endpoint_id IN({str(endpoint_ids)[1:-1]})",
1240
- group_by=mm_schemas.EventFieldType.ENDPOINT_ID,
1241
- preform_agg_columns=[mm_schemas.EventFieldType.LATENCY],
1242
- )
1243
- df.rename(
1244
- columns={f"avg({mm_schemas.EventFieldType.LATENCY})": "avg_latency"},
1245
- inplace=True,
1246
- )
1247
- if not df.empty:
1248
- df.dropna(inplace=True)
1249
- return df
1250
-
1251
- async def add_basic_metrics(
1252
- self,
1253
- model_endpoint_objects: list[mlrun.common.schemas.ModelEndpoint],
1254
- project: str,
1255
- run_in_threadpool: Callable,
1256
- metric_list: Optional[list[str]] = None,
1257
- ) -> list[mlrun.common.schemas.ModelEndpoint]:
1258
- """
1259
- Add basic metrics to the model endpoint object.
1260
-
1261
- :param model_endpoint_objects: A list of `ModelEndpoint` objects that will
1262
- be filled with the relevant basic metrics.
1263
- :param project: The name of the project.
1264
- :param run_in_threadpool: A function that runs another function in a thread pool.
1265
- :param metric_list: List of metrics to include from the time series DB. Defaults to all metrics.
1266
-
1267
- :return: A list of `ModelEndpointMonitoringMetric` objects.
1268
- """
1269
-
1270
- uids = [mep.metadata.uid for mep in model_endpoint_objects]
1271
-
1272
- metric_name_to_function = {
1273
- "error_count": self.get_error_count,
1274
- "last_request": self.get_last_request,
1275
- "avg_latency": self.get_avg_latency,
1276
- "result_status": self.get_drift_status,
1277
- }
1278
- if metric_list is not None:
1279
- for metric_name in list(metric_name_to_function):
1280
- if metric_name not in metric_list:
1281
- del metric_name_to_function[metric_name]
1282
-
1283
- metric_name_to_df = {
1284
- metric_name: function(endpoint_ids=uids)
1285
- for metric_name, function in metric_name_to_function.items()
1286
- }
1287
-
1288
- def add_metrics(
1289
- mep: mlrun.common.schemas.ModelEndpoint,
1290
- df_dictionary: dict[str, pd.DataFrame],
1291
- ):
1292
- for metric in df_dictionary.keys():
1293
- df = df_dictionary.get(metric, pd.DataFrame())
1294
- if not df.empty:
1295
- line = df[df["endpoint_id"] == mep.metadata.uid]
1296
- if not line.empty and metric in line:
1297
- value = line[metric].item()
1298
- if isinstance(value, pd.Timestamp):
1299
- value = value.to_pydatetime()
1300
- setattr(mep.status, metric, value)
1301
-
1302
- return mep
1303
-
1304
- return list(
1305
- map(
1306
- lambda mep: add_metrics(
1307
- mep=mep,
1308
- df_dictionary=metric_name_to_df,
1309
- ),
1310
- model_endpoint_objects,
1311
- )
1312
- )
1313
-
1314
- def get_drift_data(
1315
- self,
1316
- start: datetime,
1317
- end: datetime,
1318
- ) -> mm_schemas.ModelEndpointDriftValues:
1319
- filter_query = self._generate_filter_query(
1320
- filter_column=mm_schemas.ResultData.RESULT_STATUS,
1321
- filter_values=[
1322
- mm_schemas.ResultStatusApp.potential_detection.value,
1323
- mm_schemas.ResultStatusApp.detected.value,
1324
- ],
1325
- )
1326
- table = self.tables[mm_schemas.TDEngineSuperTables.APP_RESULTS].super_table
1327
- start, end, interval = self._prepare_aligned_start_end(start, end)
1328
-
1329
- # get per time-interval x endpoint_id combination the max result status
1330
- df = self._get_records(
1331
- table=table,
1332
- start=start,
1333
- end=end,
1334
- interval=interval,
1335
- columns=[mm_schemas.ResultData.RESULT_STATUS],
1336
- filter_query=filter_query,
1337
- timestamp_column=mm_schemas.WriterEvent.END_INFER_TIME,
1338
- agg_funcs=["max"],
1339
- partition_by=mm_schemas.WriterEvent.ENDPOINT_ID,
1340
- )
1341
- if df.empty:
1342
- return mm_schemas.ModelEndpointDriftValues(values=[])
1343
-
1344
- df["_wstart"] = pd.to_datetime(df["_wstart"])
1345
- return self._df_to_drift_data(df)
1346
-
1347
- # Note: this function serves as a reference for checking the TSDB for the existence of a metric.
1348
- #
1349
- # def read_prediction_metric_for_endpoint_if_exists(
1350
- # self, endpoint_id: str
1351
- # ) -> Optional[mm_schemas.ModelEndpointMonitoringMetric]:
1352
- # """
1353
- # Read the "invocations" metric for the provided model endpoint, and return the metric object
1354
- # if it exists.
1355
- #
1356
- # :param endpoint_id: The model endpoint identifier.
1357
- # :return: `None` if the invocations metric does not exist, otherwise return the
1358
- # corresponding metric object.
1359
- # """
1360
- # # Read just one record, because we just want to check if there is any data for this endpoint_id
1361
- # predictions = self.read_predictions(
1362
- # endpoint_id=endpoint_id,
1363
- # start=datetime.min,
1364
- # end=mlrun.utils.now_date(),
1365
- # limit=1,
1366
- # )
1367
- # if predictions:
1368
- # return get_invocations_metric(self.project)