mlrun 1.7.2rc3__py3-none-any.whl → 1.8.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of mlrun might be problematic.
Files changed (275)
  1. mlrun/__init__.py +26 -22
  2. mlrun/__main__.py +15 -16
  3. mlrun/alerts/alert.py +150 -15
  4. mlrun/api/schemas/__init__.py +1 -9
  5. mlrun/artifacts/__init__.py +2 -3
  6. mlrun/artifacts/base.py +62 -19
  7. mlrun/artifacts/dataset.py +17 -17
  8. mlrun/artifacts/document.py +454 -0
  9. mlrun/artifacts/manager.py +28 -18
  10. mlrun/artifacts/model.py +91 -59
  11. mlrun/artifacts/plots.py +2 -2
  12. mlrun/common/constants.py +8 -0
  13. mlrun/common/formatters/__init__.py +1 -0
  14. mlrun/common/formatters/artifact.py +1 -1
  15. mlrun/common/formatters/feature_set.py +2 -0
  16. mlrun/common/formatters/function.py +1 -0
  17. mlrun/{model_monitoring/db/stores/v3io_kv/__init__.py → common/formatters/model_endpoint.py} +17 -0
  18. mlrun/common/formatters/pipeline.py +1 -2
  19. mlrun/common/formatters/project.py +9 -0
  20. mlrun/common/model_monitoring/__init__.py +0 -5
  21. mlrun/common/model_monitoring/helpers.py +12 -62
  22. mlrun/common/runtimes/constants.py +25 -4
  23. mlrun/common/schemas/__init__.py +9 -5
  24. mlrun/common/schemas/alert.py +114 -19
  25. mlrun/common/schemas/api_gateway.py +3 -3
  26. mlrun/common/schemas/artifact.py +22 -9
  27. mlrun/common/schemas/auth.py +8 -4
  28. mlrun/common/schemas/background_task.py +7 -7
  29. mlrun/common/schemas/client_spec.py +4 -4
  30. mlrun/common/schemas/clusterization_spec.py +2 -2
  31. mlrun/common/schemas/common.py +53 -3
  32. mlrun/common/schemas/constants.py +15 -0
  33. mlrun/common/schemas/datastore_profile.py +1 -1
  34. mlrun/common/schemas/feature_store.py +9 -9
  35. mlrun/common/schemas/frontend_spec.py +4 -4
  36. mlrun/common/schemas/function.py +10 -10
  37. mlrun/common/schemas/hub.py +1 -1
  38. mlrun/common/schemas/k8s.py +3 -3
  39. mlrun/common/schemas/memory_reports.py +3 -3
  40. mlrun/common/schemas/model_monitoring/__init__.py +4 -8
  41. mlrun/common/schemas/model_monitoring/constants.py +127 -46
  42. mlrun/common/schemas/model_monitoring/grafana.py +18 -12
  43. mlrun/common/schemas/model_monitoring/model_endpoints.py +154 -160
  44. mlrun/common/schemas/notification.py +24 -3
  45. mlrun/common/schemas/object.py +1 -1
  46. mlrun/common/schemas/pagination.py +4 -4
  47. mlrun/common/schemas/partition.py +142 -0
  48. mlrun/common/schemas/pipeline.py +3 -3
  49. mlrun/common/schemas/project.py +26 -18
  50. mlrun/common/schemas/runs.py +3 -3
  51. mlrun/common/schemas/runtime_resource.py +5 -5
  52. mlrun/common/schemas/schedule.py +1 -1
  53. mlrun/common/schemas/secret.py +1 -1
  54. mlrun/{model_monitoring/db/stores/sqldb/__init__.py → common/schemas/serving.py} +10 -1
  55. mlrun/common/schemas/tag.py +3 -3
  56. mlrun/common/schemas/workflow.py +6 -5
  57. mlrun/common/types.py +1 -0
  58. mlrun/config.py +157 -89
  59. mlrun/data_types/__init__.py +5 -3
  60. mlrun/data_types/infer.py +13 -3
  61. mlrun/data_types/spark.py +2 -1
  62. mlrun/datastore/__init__.py +59 -18
  63. mlrun/datastore/alibaba_oss.py +4 -1
  64. mlrun/datastore/azure_blob.py +4 -1
  65. mlrun/datastore/base.py +19 -24
  66. mlrun/datastore/datastore.py +10 -4
  67. mlrun/datastore/datastore_profile.py +178 -45
  68. mlrun/datastore/dbfs_store.py +4 -1
  69. mlrun/datastore/filestore.py +4 -1
  70. mlrun/datastore/google_cloud_storage.py +4 -1
  71. mlrun/datastore/hdfs.py +4 -1
  72. mlrun/datastore/inmem.py +4 -1
  73. mlrun/datastore/redis.py +4 -1
  74. mlrun/datastore/s3.py +14 -3
  75. mlrun/datastore/sources.py +89 -92
  76. mlrun/datastore/store_resources.py +7 -4
  77. mlrun/datastore/storeytargets.py +51 -16
  78. mlrun/datastore/targets.py +38 -31
  79. mlrun/datastore/utils.py +87 -4
  80. mlrun/datastore/v3io.py +4 -1
  81. mlrun/datastore/vectorstore.py +291 -0
  82. mlrun/datastore/wasbfs/fs.py +13 -12
  83. mlrun/db/base.py +286 -100
  84. mlrun/db/httpdb.py +1562 -490
  85. mlrun/db/nopdb.py +250 -83
  86. mlrun/errors.py +6 -2
  87. mlrun/execution.py +194 -50
  88. mlrun/feature_store/__init__.py +2 -10
  89. mlrun/feature_store/api.py +20 -458
  90. mlrun/feature_store/common.py +9 -9
  91. mlrun/feature_store/feature_set.py +20 -18
  92. mlrun/feature_store/feature_vector.py +105 -479
  93. mlrun/feature_store/feature_vector_utils.py +466 -0
  94. mlrun/feature_store/retrieval/base.py +15 -11
  95. mlrun/feature_store/retrieval/job.py +2 -1
  96. mlrun/feature_store/retrieval/storey_merger.py +1 -1
  97. mlrun/feature_store/steps.py +3 -3
  98. mlrun/features.py +30 -13
  99. mlrun/frameworks/__init__.py +1 -2
  100. mlrun/frameworks/_common/__init__.py +1 -2
  101. mlrun/frameworks/_common/artifacts_library.py +2 -2
  102. mlrun/frameworks/_common/mlrun_interface.py +10 -6
  103. mlrun/frameworks/_common/model_handler.py +31 -31
  104. mlrun/frameworks/_common/producer.py +3 -1
  105. mlrun/frameworks/_dl_common/__init__.py +1 -2
  106. mlrun/frameworks/_dl_common/loggers/__init__.py +1 -2
  107. mlrun/frameworks/_dl_common/loggers/mlrun_logger.py +4 -4
  108. mlrun/frameworks/_dl_common/loggers/tensorboard_logger.py +3 -3
  109. mlrun/frameworks/_ml_common/__init__.py +1 -2
  110. mlrun/frameworks/_ml_common/loggers/__init__.py +1 -2
  111. mlrun/frameworks/_ml_common/model_handler.py +21 -21
  112. mlrun/frameworks/_ml_common/plans/__init__.py +1 -2
  113. mlrun/frameworks/_ml_common/plans/confusion_matrix_plan.py +3 -1
  114. mlrun/frameworks/_ml_common/plans/dataset_plan.py +3 -3
  115. mlrun/frameworks/_ml_common/plans/roc_curve_plan.py +4 -4
  116. mlrun/frameworks/auto_mlrun/__init__.py +1 -2
  117. mlrun/frameworks/auto_mlrun/auto_mlrun.py +22 -15
  118. mlrun/frameworks/huggingface/__init__.py +1 -2
  119. mlrun/frameworks/huggingface/model_server.py +9 -9
  120. mlrun/frameworks/lgbm/__init__.py +47 -44
  121. mlrun/frameworks/lgbm/callbacks/__init__.py +1 -2
  122. mlrun/frameworks/lgbm/callbacks/logging_callback.py +4 -2
  123. mlrun/frameworks/lgbm/callbacks/mlrun_logging_callback.py +4 -2
  124. mlrun/frameworks/lgbm/mlrun_interfaces/__init__.py +1 -2
  125. mlrun/frameworks/lgbm/mlrun_interfaces/mlrun_interface.py +5 -5
  126. mlrun/frameworks/lgbm/model_handler.py +15 -11
  127. mlrun/frameworks/lgbm/model_server.py +11 -7
  128. mlrun/frameworks/lgbm/utils.py +2 -2
  129. mlrun/frameworks/onnx/__init__.py +1 -2
  130. mlrun/frameworks/onnx/dataset.py +3 -3
  131. mlrun/frameworks/onnx/mlrun_interface.py +2 -2
  132. mlrun/frameworks/onnx/model_handler.py +7 -5
  133. mlrun/frameworks/onnx/model_server.py +8 -6
  134. mlrun/frameworks/parallel_coordinates.py +11 -11
  135. mlrun/frameworks/pytorch/__init__.py +22 -23
  136. mlrun/frameworks/pytorch/callbacks/__init__.py +1 -2
  137. mlrun/frameworks/pytorch/callbacks/callback.py +2 -1
  138. mlrun/frameworks/pytorch/callbacks/logging_callback.py +15 -8
  139. mlrun/frameworks/pytorch/callbacks/mlrun_logging_callback.py +19 -12
  140. mlrun/frameworks/pytorch/callbacks/tensorboard_logging_callback.py +22 -15
  141. mlrun/frameworks/pytorch/callbacks_handler.py +36 -30
  142. mlrun/frameworks/pytorch/mlrun_interface.py +17 -17
  143. mlrun/frameworks/pytorch/model_handler.py +21 -17
  144. mlrun/frameworks/pytorch/model_server.py +13 -9
  145. mlrun/frameworks/sklearn/__init__.py +19 -18
  146. mlrun/frameworks/sklearn/estimator.py +2 -2
  147. mlrun/frameworks/sklearn/metric.py +3 -3
  148. mlrun/frameworks/sklearn/metrics_library.py +8 -6
  149. mlrun/frameworks/sklearn/mlrun_interface.py +3 -2
  150. mlrun/frameworks/sklearn/model_handler.py +4 -3
  151. mlrun/frameworks/tf_keras/__init__.py +11 -12
  152. mlrun/frameworks/tf_keras/callbacks/__init__.py +1 -2
  153. mlrun/frameworks/tf_keras/callbacks/logging_callback.py +17 -14
  154. mlrun/frameworks/tf_keras/callbacks/mlrun_logging_callback.py +15 -12
  155. mlrun/frameworks/tf_keras/callbacks/tensorboard_logging_callback.py +21 -18
  156. mlrun/frameworks/tf_keras/model_handler.py +17 -13
  157. mlrun/frameworks/tf_keras/model_server.py +12 -8
  158. mlrun/frameworks/xgboost/__init__.py +19 -18
  159. mlrun/frameworks/xgboost/model_handler.py +13 -9
  160. mlrun/k8s_utils.py +2 -5
  161. mlrun/launcher/base.py +3 -4
  162. mlrun/launcher/client.py +2 -2
  163. mlrun/launcher/local.py +6 -2
  164. mlrun/launcher/remote.py +1 -1
  165. mlrun/lists.py +8 -4
  166. mlrun/model.py +132 -46
  167. mlrun/model_monitoring/__init__.py +3 -5
  168. mlrun/model_monitoring/api.py +113 -98
  169. mlrun/model_monitoring/applications/__init__.py +0 -5
  170. mlrun/model_monitoring/applications/_application_steps.py +81 -50
  171. mlrun/model_monitoring/applications/base.py +467 -14
  172. mlrun/model_monitoring/applications/context.py +212 -134
  173. mlrun/model_monitoring/{db/stores/base → applications/evidently}/__init__.py +6 -2
  174. mlrun/model_monitoring/applications/evidently/base.py +146 -0
  175. mlrun/model_monitoring/applications/histogram_data_drift.py +89 -56
  176. mlrun/model_monitoring/applications/results.py +67 -15
  177. mlrun/model_monitoring/controller.py +701 -315
  178. mlrun/model_monitoring/db/__init__.py +0 -2
  179. mlrun/model_monitoring/db/_schedules.py +242 -0
  180. mlrun/model_monitoring/db/_stats.py +189 -0
  181. mlrun/model_monitoring/db/tsdb/__init__.py +33 -22
  182. mlrun/model_monitoring/db/tsdb/base.py +243 -49
  183. mlrun/model_monitoring/db/tsdb/tdengine/schemas.py +76 -36
  184. mlrun/model_monitoring/db/tsdb/tdengine/stream_graph_steps.py +33 -0
  185. mlrun/model_monitoring/db/tsdb/tdengine/tdengine_connection.py +213 -0
  186. mlrun/model_monitoring/db/tsdb/tdengine/tdengine_connector.py +534 -88
  187. mlrun/model_monitoring/db/tsdb/v3io/stream_graph_steps.py +1 -0
  188. mlrun/model_monitoring/db/tsdb/v3io/v3io_connector.py +436 -106
  189. mlrun/model_monitoring/helpers.py +356 -114
  190. mlrun/model_monitoring/stream_processing.py +190 -345
  191. mlrun/model_monitoring/tracking_policy.py +11 -4
  192. mlrun/model_monitoring/writer.py +49 -90
  193. mlrun/package/__init__.py +3 -6
  194. mlrun/package/context_handler.py +2 -2
  195. mlrun/package/packager.py +12 -9
  196. mlrun/package/packagers/__init__.py +0 -2
  197. mlrun/package/packagers/default_packager.py +14 -11
  198. mlrun/package/packagers/numpy_packagers.py +16 -7
  199. mlrun/package/packagers/pandas_packagers.py +18 -18
  200. mlrun/package/packagers/python_standard_library_packagers.py +25 -11
  201. mlrun/package/packagers_manager.py +35 -32
  202. mlrun/package/utils/__init__.py +0 -3
  203. mlrun/package/utils/_pickler.py +6 -6
  204. mlrun/platforms/__init__.py +47 -16
  205. mlrun/platforms/iguazio.py +4 -1
  206. mlrun/projects/operations.py +30 -30
  207. mlrun/projects/pipelines.py +116 -47
  208. mlrun/projects/project.py +1292 -329
  209. mlrun/render.py +5 -9
  210. mlrun/run.py +57 -14
  211. mlrun/runtimes/__init__.py +1 -3
  212. mlrun/runtimes/base.py +30 -22
  213. mlrun/runtimes/daskjob.py +9 -9
  214. mlrun/runtimes/databricks_job/databricks_runtime.py +6 -5
  215. mlrun/runtimes/function_reference.py +5 -2
  216. mlrun/runtimes/generators.py +3 -2
  217. mlrun/runtimes/kubejob.py +6 -7
  218. mlrun/runtimes/mounts.py +574 -0
  219. mlrun/runtimes/mpijob/__init__.py +0 -2
  220. mlrun/runtimes/mpijob/abstract.py +7 -6
  221. mlrun/runtimes/nuclio/api_gateway.py +7 -7
  222. mlrun/runtimes/nuclio/application/application.py +11 -13
  223. mlrun/runtimes/nuclio/application/reverse_proxy.go +66 -64
  224. mlrun/runtimes/nuclio/function.py +127 -70
  225. mlrun/runtimes/nuclio/serving.py +105 -37
  226. mlrun/runtimes/pod.py +159 -54
  227. mlrun/runtimes/remotesparkjob.py +3 -2
  228. mlrun/runtimes/sparkjob/__init__.py +0 -2
  229. mlrun/runtimes/sparkjob/spark3job.py +22 -12
  230. mlrun/runtimes/utils.py +7 -6
  231. mlrun/secrets.py +2 -2
  232. mlrun/serving/__init__.py +8 -0
  233. mlrun/serving/merger.py +7 -5
  234. mlrun/serving/remote.py +35 -22
  235. mlrun/serving/routers.py +186 -240
  236. mlrun/serving/server.py +41 -10
  237. mlrun/serving/states.py +432 -118
  238. mlrun/serving/utils.py +13 -2
  239. mlrun/serving/v1_serving.py +3 -2
  240. mlrun/serving/v2_serving.py +161 -203
  241. mlrun/track/__init__.py +1 -1
  242. mlrun/track/tracker.py +2 -2
  243. mlrun/track/trackers/mlflow_tracker.py +6 -5
  244. mlrun/utils/async_http.py +35 -22
  245. mlrun/utils/clones.py +7 -4
  246. mlrun/utils/helpers.py +511 -58
  247. mlrun/utils/logger.py +119 -13
  248. mlrun/utils/notifications/notification/__init__.py +22 -19
  249. mlrun/utils/notifications/notification/base.py +39 -15
  250. mlrun/utils/notifications/notification/console.py +6 -6
  251. mlrun/utils/notifications/notification/git.py +11 -11
  252. mlrun/utils/notifications/notification/ipython.py +10 -9
  253. mlrun/utils/notifications/notification/mail.py +176 -0
  254. mlrun/utils/notifications/notification/slack.py +16 -8
  255. mlrun/utils/notifications/notification/webhook.py +24 -8
  256. mlrun/utils/notifications/notification_pusher.py +191 -200
  257. mlrun/utils/regex.py +12 -2
  258. mlrun/utils/version/version.json +2 -2
  259. {mlrun-1.7.2rc3.dist-info → mlrun-1.8.0.dist-info}/METADATA +81 -54
  260. mlrun-1.8.0.dist-info/RECORD +351 -0
  261. {mlrun-1.7.2rc3.dist-info → mlrun-1.8.0.dist-info}/WHEEL +1 -1
  262. mlrun/model_monitoring/applications/evidently_base.py +0 -137
  263. mlrun/model_monitoring/db/stores/__init__.py +0 -136
  264. mlrun/model_monitoring/db/stores/base/store.py +0 -213
  265. mlrun/model_monitoring/db/stores/sqldb/models/__init__.py +0 -71
  266. mlrun/model_monitoring/db/stores/sqldb/models/base.py +0 -190
  267. mlrun/model_monitoring/db/stores/sqldb/models/mysql.py +0 -103
  268. mlrun/model_monitoring/db/stores/sqldb/models/sqlite.py +0 -40
  269. mlrun/model_monitoring/db/stores/sqldb/sql_store.py +0 -659
  270. mlrun/model_monitoring/db/stores/v3io_kv/kv_store.py +0 -726
  271. mlrun/model_monitoring/model_endpoint.py +0 -118
  272. mlrun-1.7.2rc3.dist-info/RECORD +0 -351
  273. {mlrun-1.7.2rc3.dist-info → mlrun-1.8.0.dist-info}/entry_points.txt +0 -0
  274. {mlrun-1.7.2rc3.dist-info → mlrun-1.8.0.dist-info/licenses}/LICENSE +0 -0
  275. {mlrun-1.7.2rc3.dist-info → mlrun-1.8.0.dist-info}/top_level.txt +0 -0
@@ -11,10 +11,10 @@
  # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  # See the License for the specific language governing permissions and
  # limitations under the License.
-
- from datetime import datetime, timezone
+ import math
+ from datetime import datetime, timedelta
  from io import StringIO
- from typing import Literal, Optional, Union
+ from typing import Callable, Literal, Optional, Union

  import pandas as pd
  import v3io_frames
@@ -33,6 +33,13 @@ _TSDB_BE = "tsdb"
  _TSDB_RATE = "1/s"
  _CONTAINER = "users"

+ V3IO_FRAMESD_MEPS_LIMIT = (
+     200  # Maximum number of model endpoints per single request when using V3IO Frames
+ )
+ V3IO_CLIENT_MEPS_LIMIT = (
+     150  # Maximum number of model endpoints per single request when using V3IO Client
+ )
+

  def _is_no_schema_error(exc: v3io_frames.Error) -> bool:
      """
@@ -58,6 +65,7 @@ class V3IOTSDBConnector(TSDBConnector):
          project: str,
          container: str = _CONTAINER,
          v3io_framesd: Optional[str] = None,
+         v3io_access_key: str = "",
          create_table: bool = False,
      ) -> None:
          super().__init__(project=project)
@@ -65,14 +73,26 @@ class V3IOTSDBConnector(TSDBConnector):
          self.container = container

          self.v3io_framesd = v3io_framesd or mlrun.mlconf.v3io_framesd
+         self._v3io_access_key = v3io_access_key
          self._frames_client: Optional[v3io_frames.client.ClientBase] = None
          self._init_tables_path()
          self._create_table = create_table
+         self._v3io_client = None
+
+     @property
+     def v3io_client(self):
+         if not self._v3io_client:
+             self._v3io_client = mlrun.utils.v3io_clients.get_v3io_client(
+                 endpoint=mlrun.mlconf.v3io_api, access_key=self._v3io_access_key
+             )
+         return self._v3io_client

      @property
      def frames_client(self) -> v3io_frames.client.ClientBase:
          if not self._frames_client:
-             self._frames_client = self._get_v3io_frames_client(self.container)
+             self._frames_client = self._get_v3io_frames_client(
+                 self.container, v3io_access_key=self._v3io_access_key
+             )
          if self._create_table:
              self.create_tables()
          return self._frames_client
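A hedged usage sketch of the new constructor argument: the access key is now threaded into both the lazily created Frames client and the new lazily created V3IO KV client shown above. The project name and key below are placeholder values, and the sketch assumes an environment with access to the V3IO API:

from mlrun.model_monitoring.db.tsdb.v3io.v3io_connector import V3IOTSDBConnector

connector = V3IOTSDBConnector(
    project="my-project",               # hypothetical project name
    v3io_access_key="<v3io-access-key>",  # placeholder credential
    create_table=True,                  # create the monitoring tables on first use of frames_client
)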
@@ -129,7 +149,7 @@ class V3IOTSDBConnector(TSDBConnector):
          monitoring_predictions_full_path = (
              mlrun.mlconf.get_model_monitoring_file_target_path(
                  project=self.project,
-                 kind=mm_schemas.FileTargetKind.PREDICTIONS,
+                 kind=mm_schemas.V3IOTSDBTables.PREDICTIONS,
              )
          )
          (
@@ -139,28 +159,51 @@ class V3IOTSDBConnector(TSDBConnector):
          ) = mlrun.common.model_monitoring.helpers.parse_model_endpoint_store_prefix(
              monitoring_predictions_full_path
          )
-         self.tables[mm_schemas.FileTargetKind.PREDICTIONS] = monitoring_predictions_path
+         self.tables[mm_schemas.V3IOTSDBTables.PREDICTIONS] = monitoring_predictions_path
+
+         # initialize kv table
+         last_request_full_table_path = (
+             mlrun.mlconf.get_model_monitoring_file_target_path(
+                 project=self.project,
+                 kind=mm_schemas.FileTargetKind.LAST_REQUEST,
+             )
+         )
+         (
+             _,
+             _,
+             self.last_request_table,
+         ) = mlrun.common.model_monitoring.helpers.parse_model_endpoint_store_prefix(
+             last_request_full_table_path
+         )

      def create_tables(self) -> None:
          """
-         Create the tables using the TSDB connector. The tables are being created in the V3IO TSDB and include:
+         Create the tables using the TSDB connector. These are the tables that are stored in the V3IO TSDB:
          - app_results: a detailed result that includes status, kind, extra data, etc.
          - metrics: a basic key value that represents a single numeric metric.
-         Note that the predictions table is automatically created by the model monitoring stream pod.
+         - events: A statistics table that includes pre-aggregated metrics (such as average latency over the
+           last 5 minutes) and data samples
+         - predictions: a detailed prediction that includes latency, request timestamp, etc. This table also
+           includes pre-aggregated operations such as count and average on 1 minute granularity.
+         - errors: a detailed error that includes error desc, error type, etc.
+
          """
-         application_tables = [
-             mm_schemas.V3IOTSDBTables.APP_RESULTS,
-             mm_schemas.V3IOTSDBTables.METRICS,
-         ]
-         for table_name in application_tables:
+
+         default_configurations = {
+             "backend": _TSDB_BE,
+             "if_exists": v3io_frames.IGNORE,
+             "rate": _TSDB_RATE,
+         }
+
+         for table_name in self.tables:
+             default_configurations["table"] = self.tables[table_name]
+             if table_name == mm_schemas.V3IOTSDBTables.PREDICTIONS:
+                 default_configurations["aggregates"] = "count,avg"
+                 default_configurations["aggregation_granularity"] = "1m"
+             elif table_name == mm_schemas.V3IOTSDBTables.EVENTS:
+                 default_configurations["rate"] = "10/m"
              logger.info("Creating table in V3IO TSDB", table_name=table_name)
-             table = self.tables[table_name]
-             self.frames_client.create(
-                 backend=_TSDB_BE,
-                 table=table,
-                 if_exists=v3io_frames.IGNORE,
-                 rate=_TSDB_RATE,
-             )
+             self.frames_client.create(**default_configurations)

      def apply_monitoring_stream_steps(
          self,
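To make the reworked create_tables() flow concrete, here is a small, dependency-free sketch of the keyword sets it ends up passing to frames_client.create() per table kind. The string table names stand in for the V3IOTSDBTables enum values, "IGNORE" stands in for v3io_frames.IGNORE, and the table path is illustrative:

def build_create_kwargs(table_name: str, table_path: str) -> dict:
    # Mirror of the defaults built in create_tables() above (sketch, not the real call).
    kwargs = {"backend": "tsdb", "if_exists": "IGNORE", "rate": "1/s", "table": table_path}
    if table_name == "predictions":
        # the predictions table gets 1-minute count/avg pre-aggregations
        kwargs.update(aggregates="count,avg", aggregation_granularity="1m")
    elif table_name == "events":
        kwargs["rate"] = "10/m"
    return kwargs

print(build_create_kwargs("predictions", "pipelines/<project>/model-endpoints/predictions"))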
@@ -168,6 +211,9 @@ class V3IOTSDBConnector(TSDBConnector):
          tsdb_batching_max_events: int = 1000,
          tsdb_batching_timeout_secs: int = 30,
          sample_window: int = 10,
+         aggregate_windows: Optional[list[str]] = None,
+         aggregate_period: str = "1m",
+         **kwarg,
      ):
          """
          Apply TSDB steps on the provided monitoring graph. Throughout these steps, the graph stores live data of
@@ -178,31 +224,73 @@ class V3IOTSDBConnector(TSDBConnector):
          - endpoint_features (Prediction and feature names and values)
          - custom_metrics (user-defined metrics)
          """
+         aggregate_windows = aggregate_windows or ["5m", "1h"]
+
+         # Calculate number of predictions and average latency
+         def apply_storey_aggregations():
+             # Calculate number of predictions for each window (5 min and 1 hour by default)
+             graph.add_step(
+                 class_name="storey.AggregateByKey",
+                 aggregates=[
+                     {
+                         "name": EventFieldType.LATENCY,
+                         "column": EventFieldType.LATENCY,
+                         "operations": ["count", "avg"],
+                         "windows": aggregate_windows,
+                         "period": aggregate_period,
+                     }
+                 ],
+                 name=EventFieldType.LATENCY,
+                 after="FilterNOP",
+                 step_name="Aggregates",
+                 table=".",
+                 key_field=EventFieldType.ENDPOINT_ID,
+             )
+             # Calculate average latency time for each window (5 min and 1 hour by default)
+             graph.add_step(
+                 class_name="storey.Rename",
+                 mapping={
+                     "latency_count_5m": mm_schemas.EventLiveStats.PREDICTIONS_COUNT_5M,
+                     "latency_count_1h": mm_schemas.EventLiveStats.PREDICTIONS_COUNT_1H,
+                 },
+                 name="Rename",
+                 after=EventFieldType.LATENCY,
+             )

+         apply_storey_aggregations()
          # Write latency per prediction, labeled by endpoint ID only
          graph.add_step(
              "storey.TSDBTarget",
              name="tsdb_predictions",
-             after="MapFeatureNames",
-             path=f"{self.container}/{self.tables[mm_schemas.FileTargetKind.PREDICTIONS]}",
-             rate="1/s",
+             after="FilterNOP",
+             path=f"{self.container}/{self.tables[mm_schemas.V3IOTSDBTables.PREDICTIONS]}",
              time_col=mm_schemas.EventFieldType.TIMESTAMP,
              container=self.container,
              v3io_frames=self.v3io_framesd,
              columns=[
                  mm_schemas.EventFieldType.LATENCY,
                  mm_schemas.EventFieldType.LAST_REQUEST_TIMESTAMP,
+                 mm_schemas.EventFieldType.ESTIMATED_PREDICTION_COUNT,
+                 mm_schemas.EventFieldType.EFFECTIVE_SAMPLE_COUNT,
              ],
              index_cols=[
                  mm_schemas.EventFieldType.ENDPOINT_ID,
              ],
-             aggr="count,avg",
-             aggr_granularity="1m",
              max_events=tsdb_batching_max_events,
              flush_after_seconds=tsdb_batching_timeout_secs,
              key=mm_schemas.EventFieldType.ENDPOINT_ID,
          )

+         # Write last request timestamp to KV table
+         graph.add_step(
+             "storey.NoSqlTarget",
+             name="KVLastRequest",
+             after="tsdb_predictions",
+             table=f"v3io:///users/{self.last_request_table}",
+             columns=[EventFieldType.LAST_REQUEST_TIMESTAMP],
+             index_cols=[EventFieldType.ENDPOINT_ID],
+         )
+
          # Emits the event in window size of events based on sample_window size (10 by default)
          graph.add_step(
              "storey.steps.SampleWindow",
@@ -236,7 +324,6 @@ class V3IOTSDBConnector(TSDBConnector):
              name=name,
              after=after,
              path=f"{self.container}/{self.tables[mm_schemas.V3IOTSDBTables.EVENTS]}",
-             rate="10/m",
              time_col=mm_schemas.EventFieldType.TIMESTAMP,
              container=self.container,
              v3io_frames=self.v3io_framesd,
@@ -300,7 +387,6 @@ class V3IOTSDBConnector(TSDBConnector):
              name="tsdb_error",
              after="error_extractor",
              path=f"{self.container}/{self.tables[mm_schemas.FileTargetKind.ERRORS]}",
-             rate="1/s",
              time_col=mm_schemas.EventFieldType.TIMESTAMP,
              container=self.container,
              v3io_frames=self.v3io_framesd,
@@ -310,6 +396,7 @@ class V3IOTSDBConnector(TSDBConnector):
              ],
              index_cols=[
                  mm_schemas.EventFieldType.ENDPOINT_ID,
+                 mm_schemas.EventFieldType.ERROR_TYPE,
              ],
              max_events=tsdb_batching_max_events,
              flush_after_seconds=tsdb_batching_timeout_secs,
@@ -338,9 +425,6 @@ class V3IOTSDBConnector(TSDBConnector):
          elif kind == mm_schemas.WriterEventKind.RESULT:
              table = self.tables[mm_schemas.V3IOTSDBTables.APP_RESULTS]
              index_cols = index_cols_base + [mm_schemas.ResultData.RESULT_NAME]
-             event.pop(mm_schemas.ResultData.CURRENT_STATS, None)
-             # TODO: remove this when extra data is supported (ML-7460)
-             event.pop(mm_schemas.ResultData.RESULT_EXTRA_DATA, None)
          else:
              raise ValueError(f"Invalid {kind = }")

@@ -371,12 +455,20 @@ class V3IOTSDBConnector(TSDBConnector):
          # Delete all tables
          tables = mm_schemas.V3IOTSDBTables.list()
          for table_to_delete in tables:
-             try:
-                 self.frames_client.delete(backend=_TSDB_BE, table=table_to_delete)
-             except v3io_frames.DeleteError as e:
+             if table_to_delete in self.tables:
+                 try:
+                     self.frames_client.delete(
+                         backend=_TSDB_BE, table=self.tables[table_to_delete]
+                     )
+                 except v3io_frames.DeleteError as e:
+                     logger.warning(
+                         f"Failed to delete TSDB table '{table_to_delete}'",
+                         err=mlrun.errors.err_to_str(e),
+                     )
+             else:
                  logger.warning(
-                     f"Failed to delete TSDB table '{table}'",
-                     err=mlrun.errors.err_to_str(e),
+                     f"Skipping deletion: table '{table_to_delete}' is not among the initialized tables.",
+                     initialized_tables=list(self.tables.keys()),
                  )

          # Final cleanup of tsdb path
@@ -385,6 +477,60 @@ class V3IOTSDBConnector(TSDBConnector):
          store, _, _ = mlrun.store_manager.get_or_create_store(tsdb_path)
          store.rm(tsdb_path, recursive=True)

+     def delete_tsdb_records(
+         self,
+         endpoint_ids: list[str],
+     ):
+         logger.debug(
+             "Deleting model endpoints resources using the V3IO TSDB connector",
+             project=self.project,
+             number_of_endpoints_to_delete=len(endpoint_ids),
+         )
+         tables = mm_schemas.V3IOTSDBTables.list()
+
+         # Split the endpoint ids into chunks to avoid exceeding the v3io-engine filter-expression limit
+         for i in range(0, len(endpoint_ids), V3IO_FRAMESD_MEPS_LIMIT):
+             endpoint_id_chunk = endpoint_ids[i : i + V3IO_FRAMESD_MEPS_LIMIT]
+             filter_query = f"endpoint_id IN({str(endpoint_id_chunk)[1:-1]}) "
+             for table in tables:
+                 try:
+                     self.frames_client.delete(
+                         backend=_TSDB_BE,
+                         table=self.tables[table],
+                         filter=filter_query,
+                         start="0",
+                     )
+                 except Exception as e:
+                     logger.warning(
+                         f"Failed to delete TSDB records for the provided endpoints from table '{table}'",
+                         error=mlrun.errors.err_to_str(e),
+                         project=self.project,
+                     )
+
+         # Clean the last request records from the KV table
+         self._delete_last_request_records(endpoint_ids=endpoint_ids)
+
+         logger.debug(
+             "Deleted all model endpoint resources using the V3IO connector",
+             project=self.project,
+             number_of_endpoints_to_delete=len(endpoint_ids),
+         )
+
+     def _delete_last_request_records(self, endpoint_ids: list[str]):
+         for endpoint_id in endpoint_ids:
+             try:
+                 self.v3io_client.kv.delete(
+                     container=self.container,
+                     table=self.last_request_table,
+                     key=endpoint_id,
+                 )
+             except Exception as e:
+                 logger.warning(
+                     f"Failed to delete last request record for endpoint '{endpoint_id}'",
+                     error=mlrun.errors.err_to_str(e),
+                     project=self.project,
+                 )
+
      def get_model_endpoint_real_time_metrics(
          self, endpoint_id: str, metrics: list[str], start: str, end: str
      ) -> dict[str, list[tuple[str, float]]]:
@@ -449,8 +595,9 @@ class V3IOTSDBConnector(TSDBConnector):
          interval: Optional[str] = None,
          agg_funcs: Optional[list[str]] = None,
          sliding_window_step: Optional[str] = None,
+         get_raw: bool = False,
          **kwargs,
-     ) -> pd.DataFrame:
+     ) -> Union[pd.DataFrame, list[v3io_frames.client.RawFrame]]:
          """
          Getting records from V3IO TSDB data collection.
          :param table: Path to the collection to query.
@@ -475,6 +622,10 @@ class V3IOTSDBConnector(TSDBConnector):
              `sliding_window_step` is provided, interval must be provided as well. Provided
              as a string in the format of '1m', '1h', etc.
          :param kwargs: Additional keyword arguments passed to the read method of frames client.
+         :param get_raw: Whether to return the request as raw frames rather than a pandas dataframe.
+             Defaults to False. This can greatly improve performance when a dataframe isn't
+             needed.
+
          :return: DataFrame with the provided attributes from the data collection.
          :raise: MLRunNotFoundError if the provided table wasn't found.
          """
@@ -488,7 +639,7 @@ class V3IOTSDBConnector(TSDBConnector):
          aggregators = ",".join(agg_funcs) if agg_funcs else None
          table_path = self.tables[table]
          try:
-             df = self.frames_client.read(
+             res = self.frames_client.read(
                  backend=_TSDB_BE,
                  table=table_path,
                  start=start,
@@ -498,15 +649,18 @@ class V3IOTSDBConnector(TSDBConnector):
                  aggregation_window=interval,
                  aggregators=aggregators,
                  step=sliding_window_step,
+                 get_raw=get_raw,
                  **kwargs,
              )
+             if get_raw:
+                 res = list(res)
          except v3io_frames.Error as err:
              if _is_no_schema_error(err):
-                 return pd.DataFrame()
+                 return [] if get_raw else pd.DataFrame()
              else:
                  raise err

-         return df
+         return res

      def _get_v3io_source_directory(self) -> str:
          """
@@ -530,12 +684,34 @@ class V3IOTSDBConnector(TSDBConnector):
          return source_directory

      @staticmethod
-     def _get_v3io_frames_client(v3io_container: str) -> v3io_frames.client.ClientBase:
+     def _get_v3io_frames_client(
+         v3io_container: str, v3io_access_key: str = ""
+     ) -> v3io_frames.client.ClientBase:
          return mlrun.utils.v3io_clients.get_frames_client(
              address=mlrun.mlconf.v3io_framesd,
              container=v3io_container,
+             token=v3io_access_key,
          )

+     @staticmethod
+     def _get_endpoint_filter(endpoint_id: Union[str, list[str]]) -> Optional[str]:
+         if isinstance(endpoint_id, str):
+             return f"endpoint_id=='{endpoint_id}'"
+         elif isinstance(endpoint_id, list):
+             if len(endpoint_id) > V3IO_FRAMESD_MEPS_LIMIT:
+                 logger.info(
+                     "The number of endpoint ids exceeds the v3io-engine filter-expression limit, "
+                     "retrieving all the model endpoints from the db.",
+                     limit=V3IO_FRAMESD_MEPS_LIMIT,
+                     amount=len(endpoint_id),
+                 )
+                 return None
+             return f"endpoint_id IN({str(endpoint_id)[1:-1]}) "
+         else:
+             raise mlrun.errors.MLRunInvalidArgumentError(
+                 f"Invalid 'endpoint_id' filter: must be a string or a list, endpoint_id: {endpoint_id}"
+             )
+
      def read_metrics_data(
          self,
          *,
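The filter strings produced by _get_endpoint_filter() are plain v3io-engine expressions. A dependency-free sketch of the two shapes it returns; Python's repr of a list conveniently yields the quoted, comma-separated form once the brackets are stripped:

def endpoint_filter(endpoint_id):
    # Sketch of the single-id and list forms (limit handling omitted).
    if isinstance(endpoint_id, str):
        return f"endpoint_id=='{endpoint_id}'"
    # str(list)[1:-1] turns ['a', 'b'] into "'a', 'b'"
    return f"endpoint_id IN({str(endpoint_id)[1:-1]}) "

print(endpoint_filter("abc123"))               # endpoint_id=='abc123'
print(endpoint_filter(["abc123", "def456"]))   # endpoint_id IN('abc123', 'def456') (with a trailing space)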
@@ -544,6 +720,7 @@ class V3IOTSDBConnector(TSDBConnector):
          end: datetime,
          metrics: list[mm_schemas.ModelEndpointMonitoringMetric],
          type: Literal["metrics", "results"] = "results",
+         with_result_extra_data: bool = False,
      ) -> Union[
          list[
              Union[
@@ -565,6 +742,12 @@ class V3IOTSDBConnector(TSDBConnector):
          """

          if type == "metrics":
+             if with_result_extra_data:
+                 logger.warning(
+                     "The 'with_result_extra_data' parameter is not supported for metrics, just for results",
+                     project=self.project,
+                     endpoint_id=endpoint_id,
+                 )
              table_path = self.tables[mm_schemas.V3IOTSDBTables.METRICS]
              name = mm_schemas.MetricData.METRIC_NAME
              columns = [mm_schemas.MetricData.METRIC_VALUE]
@@ -577,6 +760,8 @@ class V3IOTSDBConnector(TSDBConnector):
                  mm_schemas.ResultData.RESULT_STATUS,
                  mm_schemas.ResultData.RESULT_KIND,
              ]
+             if with_result_extra_data:
+                 columns.append(mm_schemas.ResultData.RESULT_EXTRA_DATA)
              df_handler = self.df_to_results_values
          else:
              raise ValueError(f"Invalid {type = }")
@@ -605,6 +790,9 @@ class V3IOTSDBConnector(TSDBConnector):
              endpoint_id=endpoint_id,
              is_empty=df.empty,
          )
+         if not with_result_extra_data and type == "results":
+             # Set the extra data to an empty string if it's not requested
+             df[mm_schemas.ResultData.RESULT_EXTRA_DATA] = ""

          return df_handler(df=df, metrics=metrics, project=self.project)

@@ -653,6 +841,9 @@ class V3IOTSDBConnector(TSDBConnector):
          end: Union[datetime, str],
          aggregation_window: Optional[str] = None,
          agg_funcs: Optional[list[str]] = None,
+         limit: Optional[
+             int
+         ] = None,  # no effect, just for compatibility with the abstract method
      ) -> Union[
          mm_schemas.ModelEndpointMonitoringMetricNoData,
          mm_schemas.ModelEndpointMonitoringMetricValues,
@@ -664,10 +855,10 @@ class V3IOTSDBConnector(TSDBConnector):
                  "both or neither of `aggregation_window` and `agg_funcs` must be provided"
              )
          df = self._get_records(
-             table=mm_schemas.FileTargetKind.PREDICTIONS,
+             table=mm_schemas.V3IOTSDBTables.PREDICTIONS,
              start=start,
              end=end,
-             columns=[mm_schemas.EventFieldType.LATENCY],
+             columns=[mm_schemas.EventFieldType.ESTIMATED_PREDICTION_COUNT],
              filter_query=f"endpoint_id=='{endpoint_id}'",
              agg_funcs=agg_funcs,
              sliding_window_step=aggregation_window,
@@ -681,10 +872,10 @@ class V3IOTSDBConnector(TSDBConnector):
                  type=mm_schemas.ModelEndpointMonitoringMetricType.METRIC,
              )

-         latency_column = (
-             f"{agg_funcs[0]}({mm_schemas.EventFieldType.LATENCY})"
+         estimated_prediction_count = (
+             f"{agg_funcs[0]}({mm_schemas.EventFieldType.ESTIMATED_PREDICTION_COUNT})"
              if agg_funcs
-             else mm_schemas.EventFieldType.LATENCY
+             else mm_schemas.EventFieldType.ESTIMATED_PREDICTION_COUNT
          )

          return mm_schemas.ModelEndpointMonitoringMetricValues(
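The column the values are read from follows the Frames naming convention for aggregated reads, agg(column). A small sketch of how the name is derived for the new estimated-prediction-count field; the field's string value is assumed to be "estimated_prediction_count":

def aggregated_column(column: str, agg_funcs=None) -> str:
    # Frames prefixes aggregated result columns with the aggregation function.
    return f"{agg_funcs[0]}({column})" if agg_funcs else column

print(aggregated_column("estimated_prediction_count", ["count"]))  # count(estimated_prediction_count)
print(aggregated_column("estimated_prediction_count"))             # estimated_prediction_count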
@@ -692,7 +883,7 @@ class V3IOTSDBConnector(TSDBConnector):
              values=list(
                  zip(
                      df.index,
-                     df[latency_column],
+                     df[estimated_prediction_count],
                  )
              ),  # pyright: ignore[reportArgumentType]
          )
@@ -700,55 +891,78 @@ class V3IOTSDBConnector(TSDBConnector):
      def get_last_request(
          self,
          endpoint_ids: Union[str, list[str]],
-         start: Union[datetime, str] = "0",
-         end: Union[datetime, str] = "now",
-     ) -> pd.DataFrame:
-         endpoint_ids = (
-             endpoint_ids if isinstance(endpoint_ids, list) else [endpoint_ids]
-         )
-         df = self._get_records(
-             table=mm_schemas.FileTargetKind.PREDICTIONS,
-             start=start,
-             end=end,
-             filter_query=f"endpoint_id IN({str(endpoint_ids)[1:-1]})",
-             agg_funcs=["last"],
-         )
-         if not df.empty:
-             df.rename(
-                 columns={
-                     f"last({mm_schemas.EventFieldType.LAST_REQUEST_TIMESTAMP})": mm_schemas.EventFieldType.LAST_REQUEST,
-                     f"last({mm_schemas.EventFieldType.LATENCY})": f"last_{mm_schemas.EventFieldType.LATENCY}",
-                 },
-                 inplace=True,
-             )
-             df[mm_schemas.EventFieldType.LAST_REQUEST] = df[
-                 mm_schemas.EventFieldType.LAST_REQUEST
-             ].map(
-                 lambda last_request: datetime.fromtimestamp(
-                     last_request, tz=timezone.utc
+         start: Optional[datetime] = None,
+         end: Optional[datetime] = None,
+     ) -> dict[str, float]:
+         # Get the last request timestamp for each endpoint from the KV table.
+         # The result of the query is a list of dictionaries,
+         # each dictionary contains the endpoint id and the last request timestamp.
+         last_request_timestamps = {}
+         if isinstance(endpoint_ids, str):
+             endpoint_ids = [endpoint_ids]
+
+         try:
+             if len(endpoint_ids) > V3IO_CLIENT_MEPS_LIMIT:
+                 logger.warning(
+                     "The number of endpoint ids exceeds the v3io-engine filter-expression limit, "
+                     "retrieving last request for all the model endpoints from the KV table.",
+                     limit=V3IO_CLIENT_MEPS_LIMIT,
+                     amount=len(endpoint_ids),
+                 )
+
+                 res = self.v3io_client.kv.new_cursor(
+                     container=self.container,
+                     table_path=self.last_request_table,
+                 ).all()
+                 last_request_timestamps.update(
+                     {d["__name"]: d["last_request_timestamp"] for d in res}
                  )
+             else:
+                 filter_expression = " OR ".join(
+                     [f"__name=='{endpoint_id}'" for endpoint_id in endpoint_ids]
+                 )
+                 res = self.v3io_client.kv.new_cursor(
+                     container=self.container,
+                     table_path=self.last_request_table,
+                     filter_expression=filter_expression,
+                 ).all()
+                 last_request_timestamps.update(
+                     {d["__name"]: d["last_request_timestamp"] for d in res}
+                 )
+         except Exception as e:
+             logger.warning(
+                 "Failed to get last request timestamp from V3IO KV table.",
+                 err=mlrun.errors.err_to_str(e),
+                 project=self.project,
+                 table=self.last_request_table,
              )

-         return df.reset_index(drop=True)
+         return last_request_timestamps

      def get_drift_status(
          self,
          endpoint_ids: Union[str, list[str]],
-         start: Union[datetime, str] = "now-24h",
-         end: Union[datetime, str] = "now",
-     ) -> pd.DataFrame:
-         endpoint_ids = (
-             endpoint_ids if isinstance(endpoint_ids, list) else [endpoint_ids]
-         )
-         df = self._get_records(
+         start: Optional[datetime] = None,
+         end: Optional[datetime] = None,
+         get_raw: bool = False,
+     ) -> Union[pd.DataFrame, list[v3io_frames.client.RawFrame]]:
+         filter_query = self._get_endpoint_filter(endpoint_id=endpoint_ids)
+         start = start or (mlrun.utils.datetime_now() - timedelta(hours=24))
+         start, end = self._get_start_end(start, end)
+         res = self._get_records(
              table=mm_schemas.V3IOTSDBTables.APP_RESULTS,
              start=start,
              end=end,
              columns=[mm_schemas.ResultData.RESULT_STATUS],
-             filter_query=f"endpoint_id IN({str(endpoint_ids)[1:-1]})",
+             filter_query=filter_query,
              agg_funcs=["max"],
              group_by="endpoint_id",
+             get_raw=get_raw,
          )
+         if get_raw:
+             return res
+
+         df = res
          if not df.empty:
              df.columns = [
                  col[len("max(") : -1] if "max(" in col else col for col in df.columns
@@ -757,16 +971,18 @@ class V3IOTSDBConnector(TSDBConnector):

      def get_metrics_metadata(
          self,
-         endpoint_id: str,
-         start: Union[datetime, str] = "0",
-         end: Union[datetime, str] = "now",
+         endpoint_id: Union[str, list[str]],
+         start: Optional[datetime] = None,
+         end: Optional[datetime] = None,
      ) -> pd.DataFrame:
+         start, end = self._get_start_end(start, end)
+         filter_query = self._get_endpoint_filter(endpoint_id=endpoint_id)
          df = self._get_records(
              table=mm_schemas.V3IOTSDBTables.METRICS,
              start=start,
              end=end,
              columns=[mm_schemas.MetricData.METRIC_VALUE],
-             filter_query=f"endpoint_id=='{endpoint_id}'",
+             filter_query=filter_query,
              agg_funcs=["last"],
          )
          if not df.empty:
@@ -777,10 +993,12 @@ class V3IOTSDBConnector(TSDBConnector):

      def get_results_metadata(
          self,
-         endpoint_id: str,
-         start: Union[datetime, str] = "0",
-         end: Union[datetime, str] = "now",
+         endpoint_id: Union[str, list[str]],
+         start: Optional[datetime] = None,
+         end: Optional[datetime] = None,
      ) -> pd.DataFrame:
+         start, end = self._get_start_end(start, end)
+         filter_query = self._get_endpoint_filter(endpoint_id=endpoint_id)
          df = self._get_records(
              table=mm_schemas.V3IOTSDBTables.APP_RESULTS,
              start=start,
@@ -788,7 +1006,7 @@ class V3IOTSDBConnector(TSDBConnector):
              columns=[
                  mm_schemas.ResultData.RESULT_KIND,
              ],
-             filter_query=f"endpoint_id=='{endpoint_id}'",
+             filter_query=filter_query,
              agg_funcs=["last"],
          )
          if not df.empty:
@@ -803,20 +1021,30 @@ class V3IOTSDBConnector(TSDBConnector):
      def get_error_count(
          self,
          endpoint_ids: Union[str, list[str]],
-         start: Union[datetime, str] = "0",
-         end: Union[datetime, str] = "now",
-     ) -> pd.DataFrame:
-         endpoint_ids = (
-             endpoint_ids if isinstance(endpoint_ids, list) else [endpoint_ids]
-         )
-         df = self._get_records(
+         start: Optional[datetime] = None,
+         end: Optional[datetime] = None,
+         get_raw: bool = False,
+     ) -> Union[pd.DataFrame, list[v3io_frames.client.RawFrame]]:
+         filter_query = self._get_endpoint_filter(endpoint_id=endpoint_ids)
+         if filter_query:
+             filter_query += f"AND {mm_schemas.EventFieldType.ERROR_TYPE} == '{mm_schemas.EventFieldType.INFER_ERROR}'"
+         else:
+             filter_query = f"{mm_schemas.EventFieldType.ERROR_TYPE} == '{mm_schemas.EventFieldType.INFER_ERROR}' z"
+         start, end = self._get_start_end(start, end)
+         res = self._get_records(
              table=mm_schemas.FileTargetKind.ERRORS,
              start=start,
              end=end,
              columns=[mm_schemas.EventFieldType.ERROR_COUNT],
-             filter_query=f"endpoint_id IN({str(endpoint_ids)[1:-1]})",
+             filter_query=filter_query,
              agg_funcs=["count"],
+             get_raw=get_raw,
          )
+
+         if get_raw:
+             return res
+
+         df = res
          if not df.empty:
              df.rename(
                  columns={
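get_error_count() now composes the endpoint filter with an error-type clause so that only inference errors are counted. A dependency-free sketch of the composed expression; "error_type" and "infer_error" are assumed string values of the schema constants, and the trailing space on the endpoint filter is why "AND" is appended without a leading space:

from typing import Optional

def error_count_filter(endpoint_filter: Optional[str]) -> str:
    # Sketch of how the final v3io filter expression is assembled.
    clause = "error_type == 'infer_error'"
    return f"{endpoint_filter}AND {clause}" if endpoint_filter else clause

print(error_count_filter("endpoint_id IN('ep-1', 'ep-2') "))
# endpoint_id IN('ep-1', 'ep-2') AND error_type == 'infer_error'
print(error_count_filter(None))  # error_type == 'infer_error'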
@@ -830,20 +1058,122 @@ class V3IOTSDBConnector(TSDBConnector):
      def get_avg_latency(
          self,
          endpoint_ids: Union[str, list[str]],
-         start: Union[datetime, str] = "0",
-         end: Union[datetime, str] = "now",
-     ) -> pd.DataFrame:
-         endpoint_ids = (
-             endpoint_ids if isinstance(endpoint_ids, list) else [endpoint_ids]
-         )
-         df = self._get_records(
-             table=mm_schemas.FileTargetKind.PREDICTIONS,
+         start: Optional[datetime] = None,
+         end: Optional[datetime] = None,
+         get_raw: bool = False,
+     ) -> Union[pd.DataFrame, list[v3io_frames.client.RawFrame]]:
+         filter_query = self._get_endpoint_filter(endpoint_id=endpoint_ids)
+         start = start or (mlrun.utils.datetime_now() - timedelta(hours=24))
+         start, end = self._get_start_end(start, end)
+         res = self._get_records(
+             table=mm_schemas.V3IOTSDBTables.PREDICTIONS,
              start=start,
              end=end,
              columns=[mm_schemas.EventFieldType.LATENCY],
-             filter_query=f"endpoint_id IN({str(endpoint_ids)[1:-1]})",
+             filter_query=filter_query,
              agg_funcs=["avg"],
+             get_raw=get_raw,
          )
+
+         if get_raw:
+             return res
+
+         df = res
          if not df.empty:
              df.dropna(inplace=True)
+             df.rename(
+                 columns={
+                     f"avg({mm_schemas.EventFieldType.LATENCY})": f"avg_{mm_schemas.EventFieldType.LATENCY}"
+                 },
+                 inplace=True,
+             )
          return df.reset_index(drop=True)
+
+     async def add_basic_metrics(
+         self,
+         model_endpoint_objects: list[mlrun.common.schemas.ModelEndpoint],
+         project: str,
+         run_in_threadpool: Callable,
+         metric_list: Optional[list[str]] = None,
+     ) -> list[mlrun.common.schemas.ModelEndpoint]:
+         """
+         Fetch basic metrics from V3IO TSDB and add them to MEP objects.
+
+         :param model_endpoint_objects: A list of `ModelEndpoint` objects that will
+                                        be filled with the relevant basic metrics.
+         :param project:                The name of the project.
+         :param run_in_threadpool:      A function that runs another function in a thread pool.
+         :param metric_list:            List of metrics to include from the time series DB. Defaults to all metrics.
+
+         :return: A list of `ModelEndpointMonitoringMetric` objects.
+         """
+
+         uids = []
+         model_endpoint_objects_by_uid = {}
+         for model_endpoint_object in model_endpoint_objects:
+             uid = model_endpoint_object.metadata.uid
+             uids.append(uid)
+             model_endpoint_objects_by_uid[uid] = model_endpoint_object
+
+         metric_name_to_function_and_column_name = {
+             "error_count": (self.get_error_count, "count(error_count)"),
+             "avg_latency": (self.get_avg_latency, "avg(latency)"),
+             "result_status": (self.get_drift_status, "max(result_status)"),
+         }
+         if metric_list is not None:
+             for metric_name in list(metric_name_to_function_and_column_name):
+                 if metric_name not in metric_list:
+                     del metric_name_to_function_and_column_name[metric_name]
+
+         metric_name_to_result = {}
+
+         for metric_name, (
+             function,
+             _,
+         ) in metric_name_to_function_and_column_name.items():
+             metric_name_to_result[metric_name] = await run_in_threadpool(
+                 function,
+                 endpoint_ids=uids,
+                 get_raw=True,
+             )
+
+         def add_metric(
+             metric: str,
+             column_name: str,
+             frames: list,
+         ):
+             for frame in frames:
+                 endpoint_ids = frame.column_data("endpoint_id")
+                 metric_data = frame.column_data(column_name)
+                 for index, endpoint_id in enumerate(endpoint_ids):
+                     mep = model_endpoint_objects_by_uid.get(endpoint_id)
+                     value = metric_data[index]
+                     if mep and value is not None and not math.isnan(value):
+                         setattr(mep.status, metric, value)
+
+         for metric_name, result in metric_name_to_result.items():
+             add_metric(
+                 metric_name,
+                 metric_name_to_function_and_column_name[metric_name][1],
+                 result,
+             )
+         if metric_list is None or "last_request" in metric_list:
+             self._enrich_mep_with_last_request(
+                 model_endpoint_objects_by_uid=model_endpoint_objects_by_uid
+             )
+
+         return list(model_endpoint_objects_by_uid.values())
+
+     def _enrich_mep_with_last_request(
+         self,
+         model_endpoint_objects_by_uid: dict[str, mlrun.common.schemas.ModelEndpoint],
+     ):
+         last_request_dictionary = self.get_last_request(
+             endpoint_ids=list(model_endpoint_objects_by_uid.keys())
+         )
+         for uid, mep in model_endpoint_objects_by_uid.items():
+             # Set the last request timestamp to the MEP object. If not found, keep the existing value from the
+             # DB (relevant for batch EP).
+             mep.status.last_request = last_request_dictionary.get(
+                 uid, mep.status.last_request
+             )
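add_basic_metrics() is async and pushes each blocking TSDB read to a thread pool through the injected run_in_threadpool callable. A hedged usage sketch: asyncio.to_thread has a compatible calling convention (positional function, then keyword arguments), so it is one way to satisfy that argument; the connector, endpoint list, and project name are placeholders:

import asyncio

async def enrich_endpoints(connector, endpoints):
    # `connector` is an initialized V3IOTSDBConnector, `endpoints` a list of ModelEndpoint objects.
    return await connector.add_basic_metrics(
        model_endpoint_objects=endpoints,
        project="my-project",
        run_in_threadpool=asyncio.to_thread,  # runs get_error_count/get_avg_latency/get_drift_status off the event loop
        metric_list=["avg_latency", "last_request"],  # omit to fetch all basic metrics
    )

# asyncio.run(enrich_endpoints(connector, endpoints))  # given an initialized connector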