mlrun 1.7.2rc4__py3-none-any.whl → 1.8.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of mlrun might be problematic.

Files changed (275)
  1. mlrun/__init__.py +26 -22
  2. mlrun/__main__.py +15 -16
  3. mlrun/alerts/alert.py +150 -15
  4. mlrun/api/schemas/__init__.py +1 -9
  5. mlrun/artifacts/__init__.py +2 -3
  6. mlrun/artifacts/base.py +62 -19
  7. mlrun/artifacts/dataset.py +17 -17
  8. mlrun/artifacts/document.py +454 -0
  9. mlrun/artifacts/manager.py +28 -18
  10. mlrun/artifacts/model.py +91 -59
  11. mlrun/artifacts/plots.py +2 -2
  12. mlrun/common/constants.py +8 -0
  13. mlrun/common/formatters/__init__.py +1 -0
  14. mlrun/common/formatters/artifact.py +1 -1
  15. mlrun/common/formatters/feature_set.py +2 -0
  16. mlrun/common/formatters/function.py +1 -0
  17. mlrun/{model_monitoring/db/stores/v3io_kv/__init__.py → common/formatters/model_endpoint.py} +17 -0
  18. mlrun/common/formatters/pipeline.py +1 -2
  19. mlrun/common/formatters/project.py +9 -0
  20. mlrun/common/model_monitoring/__init__.py +0 -5
  21. mlrun/common/model_monitoring/helpers.py +12 -62
  22. mlrun/common/runtimes/constants.py +25 -4
  23. mlrun/common/schemas/__init__.py +9 -5
  24. mlrun/common/schemas/alert.py +114 -19
  25. mlrun/common/schemas/api_gateway.py +3 -3
  26. mlrun/common/schemas/artifact.py +22 -9
  27. mlrun/common/schemas/auth.py +8 -4
  28. mlrun/common/schemas/background_task.py +7 -7
  29. mlrun/common/schemas/client_spec.py +4 -4
  30. mlrun/common/schemas/clusterization_spec.py +2 -2
  31. mlrun/common/schemas/common.py +53 -3
  32. mlrun/common/schemas/constants.py +15 -0
  33. mlrun/common/schemas/datastore_profile.py +1 -1
  34. mlrun/common/schemas/feature_store.py +9 -9
  35. mlrun/common/schemas/frontend_spec.py +4 -4
  36. mlrun/common/schemas/function.py +10 -10
  37. mlrun/common/schemas/hub.py +1 -1
  38. mlrun/common/schemas/k8s.py +3 -3
  39. mlrun/common/schemas/memory_reports.py +3 -3
  40. mlrun/common/schemas/model_monitoring/__init__.py +4 -8
  41. mlrun/common/schemas/model_monitoring/constants.py +127 -46
  42. mlrun/common/schemas/model_monitoring/grafana.py +18 -12
  43. mlrun/common/schemas/model_monitoring/model_endpoints.py +154 -160
  44. mlrun/common/schemas/notification.py +24 -3
  45. mlrun/common/schemas/object.py +1 -1
  46. mlrun/common/schemas/pagination.py +4 -4
  47. mlrun/common/schemas/partition.py +142 -0
  48. mlrun/common/schemas/pipeline.py +3 -3
  49. mlrun/common/schemas/project.py +26 -18
  50. mlrun/common/schemas/runs.py +3 -3
  51. mlrun/common/schemas/runtime_resource.py +5 -5
  52. mlrun/common/schemas/schedule.py +1 -1
  53. mlrun/common/schemas/secret.py +1 -1
  54. mlrun/{model_monitoring/db/stores/sqldb/__init__.py → common/schemas/serving.py} +10 -1
  55. mlrun/common/schemas/tag.py +3 -3
  56. mlrun/common/schemas/workflow.py +6 -5
  57. mlrun/common/types.py +1 -0
  58. mlrun/config.py +157 -89
  59. mlrun/data_types/__init__.py +5 -3
  60. mlrun/data_types/infer.py +13 -3
  61. mlrun/data_types/spark.py +2 -1
  62. mlrun/datastore/__init__.py +59 -18
  63. mlrun/datastore/alibaba_oss.py +4 -1
  64. mlrun/datastore/azure_blob.py +4 -1
  65. mlrun/datastore/base.py +19 -24
  66. mlrun/datastore/datastore.py +10 -4
  67. mlrun/datastore/datastore_profile.py +178 -45
  68. mlrun/datastore/dbfs_store.py +4 -1
  69. mlrun/datastore/filestore.py +4 -1
  70. mlrun/datastore/google_cloud_storage.py +4 -1
  71. mlrun/datastore/hdfs.py +4 -1
  72. mlrun/datastore/inmem.py +4 -1
  73. mlrun/datastore/redis.py +4 -1
  74. mlrun/datastore/s3.py +14 -3
  75. mlrun/datastore/sources.py +89 -92
  76. mlrun/datastore/store_resources.py +7 -4
  77. mlrun/datastore/storeytargets.py +51 -16
  78. mlrun/datastore/targets.py +38 -31
  79. mlrun/datastore/utils.py +87 -4
  80. mlrun/datastore/v3io.py +4 -1
  81. mlrun/datastore/vectorstore.py +291 -0
  82. mlrun/datastore/wasbfs/fs.py +13 -12
  83. mlrun/db/base.py +286 -100
  84. mlrun/db/httpdb.py +1562 -490
  85. mlrun/db/nopdb.py +250 -83
  86. mlrun/errors.py +6 -2
  87. mlrun/execution.py +194 -50
  88. mlrun/feature_store/__init__.py +2 -10
  89. mlrun/feature_store/api.py +20 -458
  90. mlrun/feature_store/common.py +9 -9
  91. mlrun/feature_store/feature_set.py +20 -18
  92. mlrun/feature_store/feature_vector.py +105 -479
  93. mlrun/feature_store/feature_vector_utils.py +466 -0
  94. mlrun/feature_store/retrieval/base.py +15 -11
  95. mlrun/feature_store/retrieval/job.py +2 -1
  96. mlrun/feature_store/retrieval/storey_merger.py +1 -1
  97. mlrun/feature_store/steps.py +3 -3
  98. mlrun/features.py +30 -13
  99. mlrun/frameworks/__init__.py +1 -2
  100. mlrun/frameworks/_common/__init__.py +1 -2
  101. mlrun/frameworks/_common/artifacts_library.py +2 -2
  102. mlrun/frameworks/_common/mlrun_interface.py +10 -6
  103. mlrun/frameworks/_common/model_handler.py +31 -31
  104. mlrun/frameworks/_common/producer.py +3 -1
  105. mlrun/frameworks/_dl_common/__init__.py +1 -2
  106. mlrun/frameworks/_dl_common/loggers/__init__.py +1 -2
  107. mlrun/frameworks/_dl_common/loggers/mlrun_logger.py +4 -4
  108. mlrun/frameworks/_dl_common/loggers/tensorboard_logger.py +3 -3
  109. mlrun/frameworks/_ml_common/__init__.py +1 -2
  110. mlrun/frameworks/_ml_common/loggers/__init__.py +1 -2
  111. mlrun/frameworks/_ml_common/model_handler.py +21 -21
  112. mlrun/frameworks/_ml_common/plans/__init__.py +1 -2
  113. mlrun/frameworks/_ml_common/plans/confusion_matrix_plan.py +3 -1
  114. mlrun/frameworks/_ml_common/plans/dataset_plan.py +3 -3
  115. mlrun/frameworks/_ml_common/plans/roc_curve_plan.py +4 -4
  116. mlrun/frameworks/auto_mlrun/__init__.py +1 -2
  117. mlrun/frameworks/auto_mlrun/auto_mlrun.py +22 -15
  118. mlrun/frameworks/huggingface/__init__.py +1 -2
  119. mlrun/frameworks/huggingface/model_server.py +9 -9
  120. mlrun/frameworks/lgbm/__init__.py +47 -44
  121. mlrun/frameworks/lgbm/callbacks/__init__.py +1 -2
  122. mlrun/frameworks/lgbm/callbacks/logging_callback.py +4 -2
  123. mlrun/frameworks/lgbm/callbacks/mlrun_logging_callback.py +4 -2
  124. mlrun/frameworks/lgbm/mlrun_interfaces/__init__.py +1 -2
  125. mlrun/frameworks/lgbm/mlrun_interfaces/mlrun_interface.py +5 -5
  126. mlrun/frameworks/lgbm/model_handler.py +15 -11
  127. mlrun/frameworks/lgbm/model_server.py +11 -7
  128. mlrun/frameworks/lgbm/utils.py +2 -2
  129. mlrun/frameworks/onnx/__init__.py +1 -2
  130. mlrun/frameworks/onnx/dataset.py +3 -3
  131. mlrun/frameworks/onnx/mlrun_interface.py +2 -2
  132. mlrun/frameworks/onnx/model_handler.py +7 -5
  133. mlrun/frameworks/onnx/model_server.py +8 -6
  134. mlrun/frameworks/parallel_coordinates.py +11 -11
  135. mlrun/frameworks/pytorch/__init__.py +22 -23
  136. mlrun/frameworks/pytorch/callbacks/__init__.py +1 -2
  137. mlrun/frameworks/pytorch/callbacks/callback.py +2 -1
  138. mlrun/frameworks/pytorch/callbacks/logging_callback.py +15 -8
  139. mlrun/frameworks/pytorch/callbacks/mlrun_logging_callback.py +19 -12
  140. mlrun/frameworks/pytorch/callbacks/tensorboard_logging_callback.py +22 -15
  141. mlrun/frameworks/pytorch/callbacks_handler.py +36 -30
  142. mlrun/frameworks/pytorch/mlrun_interface.py +17 -17
  143. mlrun/frameworks/pytorch/model_handler.py +21 -17
  144. mlrun/frameworks/pytorch/model_server.py +13 -9
  145. mlrun/frameworks/sklearn/__init__.py +19 -18
  146. mlrun/frameworks/sklearn/estimator.py +2 -2
  147. mlrun/frameworks/sklearn/metric.py +3 -3
  148. mlrun/frameworks/sklearn/metrics_library.py +8 -6
  149. mlrun/frameworks/sklearn/mlrun_interface.py +3 -2
  150. mlrun/frameworks/sklearn/model_handler.py +4 -3
  151. mlrun/frameworks/tf_keras/__init__.py +11 -12
  152. mlrun/frameworks/tf_keras/callbacks/__init__.py +1 -2
  153. mlrun/frameworks/tf_keras/callbacks/logging_callback.py +17 -14
  154. mlrun/frameworks/tf_keras/callbacks/mlrun_logging_callback.py +15 -12
  155. mlrun/frameworks/tf_keras/callbacks/tensorboard_logging_callback.py +21 -18
  156. mlrun/frameworks/tf_keras/model_handler.py +17 -13
  157. mlrun/frameworks/tf_keras/model_server.py +12 -8
  158. mlrun/frameworks/xgboost/__init__.py +19 -18
  159. mlrun/frameworks/xgboost/model_handler.py +13 -9
  160. mlrun/k8s_utils.py +2 -5
  161. mlrun/launcher/base.py +3 -4
  162. mlrun/launcher/client.py +2 -2
  163. mlrun/launcher/local.py +6 -2
  164. mlrun/launcher/remote.py +1 -1
  165. mlrun/lists.py +8 -4
  166. mlrun/model.py +132 -46
  167. mlrun/model_monitoring/__init__.py +3 -5
  168. mlrun/model_monitoring/api.py +113 -98
  169. mlrun/model_monitoring/applications/__init__.py +0 -5
  170. mlrun/model_monitoring/applications/_application_steps.py +81 -50
  171. mlrun/model_monitoring/applications/base.py +467 -14
  172. mlrun/model_monitoring/applications/context.py +212 -134
  173. mlrun/model_monitoring/{db/stores/base → applications/evidently}/__init__.py +6 -2
  174. mlrun/model_monitoring/applications/evidently/base.py +146 -0
  175. mlrun/model_monitoring/applications/histogram_data_drift.py +89 -56
  176. mlrun/model_monitoring/applications/results.py +67 -15
  177. mlrun/model_monitoring/controller.py +701 -315
  178. mlrun/model_monitoring/db/__init__.py +0 -2
  179. mlrun/model_monitoring/db/_schedules.py +242 -0
  180. mlrun/model_monitoring/db/_stats.py +189 -0
  181. mlrun/model_monitoring/db/tsdb/__init__.py +33 -22
  182. mlrun/model_monitoring/db/tsdb/base.py +243 -49
  183. mlrun/model_monitoring/db/tsdb/tdengine/schemas.py +76 -36
  184. mlrun/model_monitoring/db/tsdb/tdengine/stream_graph_steps.py +33 -0
  185. mlrun/model_monitoring/db/tsdb/tdengine/tdengine_connection.py +213 -0
  186. mlrun/model_monitoring/db/tsdb/tdengine/tdengine_connector.py +534 -88
  187. mlrun/model_monitoring/db/tsdb/v3io/stream_graph_steps.py +1 -0
  188. mlrun/model_monitoring/db/tsdb/v3io/v3io_connector.py +436 -106
  189. mlrun/model_monitoring/helpers.py +356 -114
  190. mlrun/model_monitoring/stream_processing.py +190 -345
  191. mlrun/model_monitoring/tracking_policy.py +11 -4
  192. mlrun/model_monitoring/writer.py +49 -90
  193. mlrun/package/__init__.py +3 -6
  194. mlrun/package/context_handler.py +2 -2
  195. mlrun/package/packager.py +12 -9
  196. mlrun/package/packagers/__init__.py +0 -2
  197. mlrun/package/packagers/default_packager.py +14 -11
  198. mlrun/package/packagers/numpy_packagers.py +16 -7
  199. mlrun/package/packagers/pandas_packagers.py +18 -18
  200. mlrun/package/packagers/python_standard_library_packagers.py +25 -11
  201. mlrun/package/packagers_manager.py +35 -32
  202. mlrun/package/utils/__init__.py +0 -3
  203. mlrun/package/utils/_pickler.py +6 -6
  204. mlrun/platforms/__init__.py +47 -16
  205. mlrun/platforms/iguazio.py +4 -1
  206. mlrun/projects/operations.py +30 -30
  207. mlrun/projects/pipelines.py +116 -47
  208. mlrun/projects/project.py +1292 -329
  209. mlrun/render.py +5 -9
  210. mlrun/run.py +57 -14
  211. mlrun/runtimes/__init__.py +1 -3
  212. mlrun/runtimes/base.py +30 -22
  213. mlrun/runtimes/daskjob.py +9 -9
  214. mlrun/runtimes/databricks_job/databricks_runtime.py +6 -5
  215. mlrun/runtimes/function_reference.py +5 -2
  216. mlrun/runtimes/generators.py +3 -2
  217. mlrun/runtimes/kubejob.py +6 -7
  218. mlrun/runtimes/mounts.py +574 -0
  219. mlrun/runtimes/mpijob/__init__.py +0 -2
  220. mlrun/runtimes/mpijob/abstract.py +7 -6
  221. mlrun/runtimes/nuclio/api_gateway.py +7 -7
  222. mlrun/runtimes/nuclio/application/application.py +11 -13
  223. mlrun/runtimes/nuclio/application/reverse_proxy.go +66 -64
  224. mlrun/runtimes/nuclio/function.py +127 -70
  225. mlrun/runtimes/nuclio/serving.py +105 -37
  226. mlrun/runtimes/pod.py +159 -54
  227. mlrun/runtimes/remotesparkjob.py +3 -2
  228. mlrun/runtimes/sparkjob/__init__.py +0 -2
  229. mlrun/runtimes/sparkjob/spark3job.py +22 -12
  230. mlrun/runtimes/utils.py +7 -6
  231. mlrun/secrets.py +2 -2
  232. mlrun/serving/__init__.py +8 -0
  233. mlrun/serving/merger.py +7 -5
  234. mlrun/serving/remote.py +35 -22
  235. mlrun/serving/routers.py +186 -240
  236. mlrun/serving/server.py +41 -10
  237. mlrun/serving/states.py +432 -118
  238. mlrun/serving/utils.py +13 -2
  239. mlrun/serving/v1_serving.py +3 -2
  240. mlrun/serving/v2_serving.py +161 -203
  241. mlrun/track/__init__.py +1 -1
  242. mlrun/track/tracker.py +2 -2
  243. mlrun/track/trackers/mlflow_tracker.py +6 -5
  244. mlrun/utils/async_http.py +35 -22
  245. mlrun/utils/clones.py +7 -4
  246. mlrun/utils/helpers.py +511 -58
  247. mlrun/utils/logger.py +119 -13
  248. mlrun/utils/notifications/notification/__init__.py +22 -19
  249. mlrun/utils/notifications/notification/base.py +39 -15
  250. mlrun/utils/notifications/notification/console.py +6 -6
  251. mlrun/utils/notifications/notification/git.py +11 -11
  252. mlrun/utils/notifications/notification/ipython.py +10 -9
  253. mlrun/utils/notifications/notification/mail.py +176 -0
  254. mlrun/utils/notifications/notification/slack.py +16 -8
  255. mlrun/utils/notifications/notification/webhook.py +24 -8
  256. mlrun/utils/notifications/notification_pusher.py +191 -200
  257. mlrun/utils/regex.py +12 -2
  258. mlrun/utils/version/version.json +2 -2
  259. {mlrun-1.7.2rc4.dist-info → mlrun-1.8.0.dist-info}/METADATA +69 -54
  260. mlrun-1.8.0.dist-info/RECORD +351 -0
  261. {mlrun-1.7.2rc4.dist-info → mlrun-1.8.0.dist-info}/WHEEL +1 -1
  262. mlrun/model_monitoring/applications/evidently_base.py +0 -137
  263. mlrun/model_monitoring/db/stores/__init__.py +0 -136
  264. mlrun/model_monitoring/db/stores/base/store.py +0 -213
  265. mlrun/model_monitoring/db/stores/sqldb/models/__init__.py +0 -71
  266. mlrun/model_monitoring/db/stores/sqldb/models/base.py +0 -190
  267. mlrun/model_monitoring/db/stores/sqldb/models/mysql.py +0 -103
  268. mlrun/model_monitoring/db/stores/sqldb/models/sqlite.py +0 -40
  269. mlrun/model_monitoring/db/stores/sqldb/sql_store.py +0 -659
  270. mlrun/model_monitoring/db/stores/v3io_kv/kv_store.py +0 -726
  271. mlrun/model_monitoring/model_endpoint.py +0 -118
  272. mlrun-1.7.2rc4.dist-info/RECORD +0 -351
  273. {mlrun-1.7.2rc4.dist-info → mlrun-1.8.0.dist-info}/entry_points.txt +0 -0
  274. {mlrun-1.7.2rc4.dist-info → mlrun-1.8.0.dist-info/licenses}/LICENSE +0 -0
  275. {mlrun-1.7.2rc4.dist-info → mlrun-1.8.0.dist-info}/top_level.txt +0 -0
mlrun/model_monitoring/db/tsdb/tdengine/tdengine_connector.py

@@ -12,25 +12,42 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import typing
-from datetime import datetime
-from typing import Union
+from datetime import datetime, timedelta
+from threading import Lock
+from typing import Callable, Final, Literal, Optional, Union
 
 import pandas as pd
 import taosws
-from taoswswrap.tdengine_connection import (
-    Statement,
-    TDEngineConnection,
-)
 
 import mlrun.common.schemas.model_monitoring as mm_schemas
+import mlrun.common.types
 import mlrun.model_monitoring.db.tsdb.tdengine.schemas as tdengine_schemas
 import mlrun.model_monitoring.db.tsdb.tdengine.stream_graph_steps
+from mlrun.datastore.datastore_profile import DatastoreProfile
 from mlrun.model_monitoring.db import TSDBConnector
-from mlrun.model_monitoring.db.tsdb.tdengine.schemas import TDEngineSchema
+from mlrun.model_monitoring.db.tsdb.tdengine.tdengine_connection import (
+    Statement,
+    TDEngineConnection,
+)
 from mlrun.model_monitoring.helpers import get_invocations_fqn
 from mlrun.utils import logger
 
+_connection = None
+_connection_lock = Lock()
+
+
+class TDEngineTimestampPrecision(mlrun.common.types.StrEnum):
+    """
+    The timestamp precision for the TDEngine database.
+    For more information, see:
+    https://docs.tdengine.com/tdengine-reference/sql-manual/data-types/#timestamp
+    https://docs.tdengine.com/tdengine-reference/sql-manual/manage-databases/#create-database
+    """
+
+    MILLISECOND = "ms"  # TDEngine's default
+    MICROSECOND = "us"  # MLRun's default
+    NANOSECOND = "ns"
+
 
 class TDEngineConnector(TSDBConnector):
     """
@@ -38,44 +55,46 @@ class TDEngineConnector(TSDBConnector):
     """
 
     type: str = mm_schemas.TSDBTarget.TDEngine
+    database = f"{tdengine_schemas._MODEL_MONITORING_DATABASE}_{mlrun.mlconf.system_id}"
 
     def __init__(
         self,
         project: str,
-        database: str = tdengine_schemas._MODEL_MONITORING_DATABASE,
+        profile: DatastoreProfile,
+        timestamp_precision: TDEngineTimestampPrecision = TDEngineTimestampPrecision.MICROSECOND,
         **kwargs,
     ):
         super().__init__(project=project)
-        if "connection_string" not in kwargs:
-            raise mlrun.errors.MLRunInvalidArgumentError(
-                "connection_string is a required parameter for TDEngineConnector."
-            )
-        self._tdengine_connection_string = kwargs.get("connection_string")
-        self.database = database
 
-        self._connection = None
-        self._init_super_tables()
+        self._tdengine_connection_profile = profile
 
-        self._timeout = mlrun.mlconf.model_endpoint_monitoring.tdengine.timeout
-        self._retries = mlrun.mlconf.model_endpoint_monitoring.tdengine.retries
+        self._timestamp_precision: Final = (  # cannot be changed after initialization
+            timestamp_precision
+        )
+
+        self._init_super_tables()
 
     @property
     def connection(self) -> TDEngineConnection:
-        if not self._connection:
-            self._connection = self._create_connection()
-        return self._connection
+        global _connection
+
+        if _connection:
+            return _connection
+
+        with _connection_lock:
+            if not _connection:
+                _connection = self._create_connection()
+
+        return _connection
 
     def _create_connection(self) -> TDEngineConnection:
         """Establish a connection to the TSDB server."""
         logger.debug("Creating a new connection to TDEngine", project=self.project)
-        conn = TDEngineConnection(self._tdengine_connection_string)
-        conn.run(
-            statements=f"CREATE DATABASE IF NOT EXISTS {self.database}",
-            timeout=self._timeout,
-            retries=self._retries,
+        conn = TDEngineConnection(
+            self._tdengine_connection_profile.dsn(),
         )
         conn.prefix_statements = [f"USE {self.database}"]
-        logger.debug("Connected to TDEngine", project=self.project)
+
         return conn
 
     def _init_super_tables(self):
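The rewritten `connection` property swaps the old per-instance cache for a process-wide singleton guarded by double-checked locking: the common case returns the cached connection without ever taking `_connection_lock`, and only the first caller (or racing first callers) pays for the lock. A standalone sketch of the same pattern, with illustrative names:

    from threading import Lock

    _resource = None
    _resource_lock = Lock()

    def get_resource():
        global _resource
        if _resource:              # fast path: no lock once the singleton exists
            return _resource
        with _resource_lock:       # slow path: serialize first-time creation
            if not _resource:      # re-check; another thread may have created it
                _resource = object()  # stand-in for TDEngineConnection(...)
        return _resource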
@@ -90,16 +109,35 @@
             mm_schemas.TDEngineSuperTables.PREDICTIONS: tdengine_schemas.Predictions(
                 project=self.project, database=self.database
             ),
+            mm_schemas.TDEngineSuperTables.ERRORS: tdengine_schemas.Errors(
+                project=self.project, database=self.database
+            ),
         }
 
+    def _create_db_if_not_exists(self):
+        """Create the database if it does not exist."""
+        self.connection.prefix_statements = []
+        self.connection.run(
+            statements=f"CREATE DATABASE IF NOT EXISTS {self.database} PRECISION '{self._timestamp_precision}'",
+        )
+        self.connection.prefix_statements = [f"USE {self.database}"]
+        logger.debug(
+            "The TDEngine database is currently in use",
+            project=self.project,
+            database=self.database,
+        )
+
     def create_tables(self):
         """Create TDEngine supertables."""
+
+        # Create the database if it does not exist
+        self._create_db_if_not_exists()
+
         for table in self.tables:
             create_table_query = self.tables[table]._create_super_table_query()
-            self.connection.run(
+            conn = self.connection
+            conn.run(
                 statements=create_table_query,
-                timeout=self._timeout,
-                retries=self._retries,
             )
 
     def write_application_event(
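Combined with `_create_connection`, the first `create_tables` call now issues roughly the following statement sequence (a sketch assembled from the code above; the database name is illustrative, the real one is derived from `_MODEL_MONITORING_DATABASE` and `mlrun.mlconf.system_id`):

    database = "mm_example_system_id"  # illustrative
    precision = "us"                   # TDEngineTimestampPrecision.MICROSECOND

    statements = [
        # executed with prefix_statements cleared, i.e. outside any database:
        f"CREATE DATABASE IF NOT EXISTS {database} PRECISION '{precision}'",
        # then restored as a prefix that precedes every subsequent run():
        f"USE {database}",
        # followed by one creation statement per supertable (app_results,
        # metrics, predictions, errors), generated by _create_super_table_query().
    ]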
@@ -122,7 +160,6 @@
             table_name = (
                 f"{table_name}_{event[mm_schemas.ResultData.RESULT_NAME]}"
             ).replace("-", "_")
-            event.pop(mm_schemas.ResultData.CURRENT_STATS, None)
 
         else:
             # Write a new metric
@@ -145,9 +182,14 @@
 
         create_table_sql = table._create_subtable_sql(subtable=table_name, values=event)
 
+        # we need the string values to be sent to the connection, not the enum
+        columns = {str(key): str(val) for key, val in table.columns.items()}
+
         insert_statement = Statement(
-            TDEngineSchema._insert_subtable_stmt,
-            dict(columns=table.columns, subtable=table_name, values=event),
+            columns=columns,
+            subtable=table_name,
+            values=event,
+            timestamp_precision=self._timestamp_precision,
         )
 
         self.connection.run(
@@ -155,15 +197,30 @@
                 create_table_sql,
                 insert_statement,
             ],
-            timeout=self._timeout,
-            retries=self._retries,
         )
 
     @staticmethod
-    def _convert_to_datetime(val: typing.Union[str, datetime]) -> datetime:
+    def _convert_to_datetime(val: Union[str, datetime]) -> datetime:
         return datetime.fromisoformat(val) if isinstance(val, str) else val
 
-    def apply_monitoring_stream_steps(self, graph):
+    @staticmethod
+    def _get_endpoint_filter(endpoint_id: Union[str, list[str]]) -> str:
+        if isinstance(endpoint_id, str):
+            return f"endpoint_id='{endpoint_id}'"
+        elif isinstance(endpoint_id, list):
+            return f"endpoint_id IN({str(endpoint_id)[1:-1]}) "
+        else:
+            raise mlrun.errors.MLRunInvalidArgumentError(
+                "Invalid 'endpoint_id' filter: must be a string or a list."
+            )
+
+    def _drop_database_query(self) -> str:
+        return f"DROP DATABASE IF EXISTS {self.database};"
+
+    def _get_table_name_query(self) -> str:
+        return f"SELECT table_name FROM information_schema.ins_tables where db_name='{self.database}' LIMIT 1;"
+
+    def apply_monitoring_stream_steps(self, graph, **kwarg):
         """
         Apply TSDB steps on the provided monitoring graph. Throughout these steps, the graph stores live data of
         different key metric dictionaries. This data is being used by the monitoring dashboards in
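For reference, the new `_get_endpoint_filter` helper renders the two accepted input shapes like this (the outputs follow directly from the f-strings above; note the trailing space in the list form):

    def _get_endpoint_filter(endpoint_id):
        # mirror of the helper above, minus the error branch
        if isinstance(endpoint_id, str):
            return f"endpoint_id='{endpoint_id}'"
        return f"endpoint_id IN({str(endpoint_id)[1:-1]}) "

    print(_get_endpoint_filter("ep1"))           # endpoint_id='ep1'
    print(_get_endpoint_filter(["ep1", "ep2"]))  # endpoint_id IN('ep1', 'ep2')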
@@ -176,15 +233,15 @@
         graph.add_step(
             "mlrun.model_monitoring.db.tsdb.tdengine.stream_graph_steps.ProcessBeforeTDEngine",
             name="ProcessBeforeTDEngine",
-            after="MapFeatureNames",
+            after="FilterNOP",
         )
 
         def apply_tdengine_target(name, after):
             graph.add_step(
-                "storey.TDEngineTarget",
+                "mlrun.datastore.storeytargets.TDEngineStoreyTarget",
                 name=name,
                 after=after,
-                url=self._tdengine_connection_string,
+                url=f"ds://{self._tdengine_connection_profile.name}",
                 supertable=self.tables[
                     mm_schemas.TDEngineSuperTables.PREDICTIONS
                 ].super_table,
@@ -194,9 +251,10 @@
                 columns=[
                     mm_schemas.EventFieldType.LATENCY,
                     mm_schemas.EventKeyMetrics.CUSTOM_METRICS,
+                    mm_schemas.EventFieldType.ESTIMATED_PREDICTION_COUNT,
+                    mm_schemas.EventFieldType.EFFECTIVE_SAMPLE_COUNT,
                 ],
                 tag_cols=[
-                    mm_schemas.EventFieldType.PROJECT,
                     mm_schemas.EventFieldType.ENDPOINT_ID,
                 ],
                 max_events=1000,
@@ -209,8 +267,95 @@
             after="ProcessBeforeTDEngine",
         )
 
-    def handle_model_error(self, graph, **kwargs) -> None:
-        pass
+    def handle_model_error(
+        self,
+        graph,
+        tsdb_batching_max_events: int = 1000,
+        tsdb_batching_timeout_secs: int = 30,
+        **kwargs,
+    ) -> None:
+        graph.add_step(
+            "mlrun.model_monitoring.db.tsdb.tdengine.stream_graph_steps.ErrorExtractor",
+            name="error_extractor",
+            after="ForwardError",
+        )
+        graph.add_step(
+            "mlrun.datastore.storeytargets.TDEngineStoreyTarget",
+            name="tsdb_error",
+            after="error_extractor",
+            url=f"ds://{self._tdengine_connection_profile.name}",
+            supertable=self.tables[mm_schemas.TDEngineSuperTables.ERRORS].super_table,
+            table_col=mm_schemas.EventFieldType.TABLE_COLUMN,
+            time_col=mm_schemas.EventFieldType.TIME,
+            database=self.database,
+            columns=[
+                mm_schemas.EventFieldType.MODEL_ERROR,
+            ],
+            tag_cols=[
+                mm_schemas.EventFieldType.ENDPOINT_ID,
+                mm_schemas.EventFieldType.ERROR_TYPE,
+            ],
+            max_events=tsdb_batching_max_events,
+            flush_after_seconds=tsdb_batching_timeout_secs,
+        )
+
+    def delete_tsdb_records(
+        self,
+        endpoint_ids: list[str],
+    ):
+        """
+        To delete subtables within TDEngine, we first query the subtable names with the provided endpoint_ids.
+        Then, we drop each subtable.
+        """
+        logger.debug(
+            "Deleting model endpoint resources using the TDEngine connector",
+            project=self.project,
+            number_of_endpoints_to_delete=len(endpoint_ids),
+        )
+
+        # Get all subtables with the provided endpoint_ids
+        subtables = []
+        try:
+            for table in self.tables:
+                get_subtable_query = self.tables[table]._get_subtables_query_by_tag(
+                    filter_tag="endpoint_id", filter_values=endpoint_ids
+                )
+                subtables_result = self.connection.run(
+                    query=get_subtable_query,
+                )
+                subtables.extend([subtable[0] for subtable in subtables_result.data])
+        except Exception as e:
+            logger.warning(
+                "Failed to get subtables for deletion. You may need to delete them manually. "
+                "These can be found under the following supertables: app_results, "
+                "metrics, errors, and predictions.",
+                project=self.project,
+                error=mlrun.errors.err_to_str(e),
+            )
+
+        # Prepare the drop statements
+        drop_statements = []
+        for subtable in subtables:
+            drop_statements.append(
+                self.tables[table].drop_subtable_query(subtable=subtable)
+            )
+        try:
+            self.connection.run(
+                statements=drop_statements,
+            )
+        except Exception as e:
+            logger.warning(
+                "Failed to delete model endpoint resources. You may need to delete them manually. "
+                "These can be found under the following supertables: app_results, "
+                "metrics, errors, and predictions.",
+                project=self.project,
+                error=mlrun.errors.err_to_str(e),
+            )
+        logger.debug(
+            "Deleted all model endpoint resources using the TDEngine connector",
+            project=self.project,
+            number_of_endpoints_to_delete=len(endpoint_ids),
+        )
 
     def delete_tsdb_resources(self):
         """
@@ -227,14 +372,12 @@
         try:
             self.connection.run(
                 statements=drop_statements,
-                timeout=self._timeout,
-                retries=self._retries,
             )
         except Exception as e:
             logger.warning(
                 "Failed to drop TDEngine tables. You may need to drop them manually. "
                 "These can be found under the following supertables: app_results, "
-                "metrics, and predictions.",
+                "metrics, errors, and predictions.",
                 project=self.project,
                 error=mlrun.errors.err_to_str(e),
             )
@@ -243,6 +386,51 @@
             project=self.project,
         )
 
+        # Check if database is empty and if so, drop it
+        self._drop_database_if_empty()
+
+    def _drop_database_if_empty(self):
+        query_random_table_name = self._get_table_name_query()
+        drop_database = False
+        try:
+            table_name = self.connection.run(
+                query=query_random_table_name,
+            )
+            if len(table_name.data) == 0:
+                # no tables were found under the database
+                drop_database = True
+
+        except Exception as e:
+            logger.warning(
+                "Failed to query tables in the database. You may need to drop the database manually if it is empty.",
+                project=self.project,
+                error=mlrun.errors.err_to_str(e),
+            )
+
+        if drop_database:
+            logger.debug(
+                "Going to drop the TDEngine database",
+                project=self.project,
+                database=self.database,
+            )
+            drop_database_query = self._drop_database_query()
+            try:
+                self.connection.run(
+                    statements=drop_database_query,
+                )
+                logger.debug(
+                    "The TDEngine database has been successfully dropped",
+                    project=self.project,
+                    database=self.database,
+                )
+
+            except Exception as e:
+                logger.warning(
+                    "Failed to drop the database. You may need to drop it manually if it is empty.",
+                    project=self.project,
+                    error=mlrun.errors.err_to_str(e),
+                )
+
     def get_model_endpoint_real_time_metrics(
         self,
         endpoint_id: str,
@@ -258,13 +446,17 @@
         table: str,
         start: datetime,
         end: datetime,
-        columns: typing.Optional[list[str]] = None,
-        filter_query: typing.Optional[str] = None,
-        interval: typing.Optional[str] = None,
-        agg_funcs: typing.Optional[list] = None,
-        limit: typing.Optional[int] = None,
-        sliding_window_step: typing.Optional[str] = None,
+        columns: Optional[list[str]] = None,
+        filter_query: Optional[str] = None,
+        interval: Optional[str] = None,
+        agg_funcs: Optional[list] = None,
+        limit: Optional[int] = None,
+        sliding_window_step: Optional[str] = None,
         timestamp_column: str = mm_schemas.EventFieldType.TIME,
+        group_by: Optional[Union[list[str], str]] = None,
+        preform_agg_columns: Optional[list] = None,
+        order_by: Optional[str] = None,
+        desc: Optional[bool] = None,
     ) -> pd.DataFrame:
         """
         Getting records from TSDB data collection.
@@ -284,6 +476,14 @@
                                      `sliding_window_step` is provided, interval must be provided as well. Provided
                                      as a string in the format of '1m', '1h', etc.
         :param timestamp_column:     The column name that holds the timestamp index.
+        :param group_by:             The column name to group by. Note that if `group_by` is provided,
+                                     aggregation functions must be provided as well.
+        :param preform_agg_columns:  The columns to perform the aggregation on. Note that all provided
+                                     aggregation functions are applied to these columns. If not provided,
+                                     the default is to aggregate over all columns in `columns`; if an
+                                     empty list is provided, no aggregation is performed.
+        :param order_by:             The column or alias to order the query results by.
+        :param desc:                 Whether to sort the results in descending order.
 
         :return: DataFrame with the provided attributes from the data collection.
         :raise:  MLRunInvalidArgumentError if querying the provided table failed.
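To make the new parameters concrete, the `get_last_request` implementation later in this diff passes essentially these arguments (a sketch; the table name is illustrative, and the rendered SQL shape is an assumption based on the docstring):

    # Illustrative kwargs for a grouped "last value per endpoint" query:
    query_kwargs = dict(
        table="predictions",                   # illustrative supertable name
        columns=["endpoint_id", "time", "latency"],
        filter_query="endpoint_id IN('ep1', 'ep2') ",
        agg_funcs=["last"],                    # required once group_by is set
        group_by="endpoint_id",
        preform_agg_columns=["time"],          # only last(time) is aggregated
    )
    # roughly renders to:
    #   SELECT last(time), latency, endpoint_id FROM predictions
    #   WHERE endpoint_id IN('ep1', 'ep2') AND ... GROUP BY endpoint_id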
@@ -301,11 +501,15 @@
             sliding_window_step=sliding_window_step,
             timestamp_column=timestamp_column,
             database=self.database,
+            group_by=group_by,
+            preform_agg_funcs_columns=preform_agg_columns,
+            order_by=order_by,
+            desc=desc,
         )
         logger.debug("Querying TDEngine", query=full_query)
         try:
             query_result = self.connection.run(
-                query=full_query, timeout=self._timeout, retries=self._retries
+                query=full_query,
             )
         except taosws.QueryError as e:
             raise mlrun.errors.MLRunInvalidArgumentError(
@@ -322,16 +526,17 @@
         start: datetime,
         end: datetime,
         metrics: list[mm_schemas.ModelEndpointMonitoringMetric],
-        type: typing.Literal["metrics", "results"],
-    ) -> typing.Union[
+        type: Literal["metrics", "results"],
+        with_result_extra_data: bool = False,
+    ) -> Union[
         list[
-            typing.Union[
+            Union[
                 mm_schemas.ModelEndpointMonitoringResultValues,
                 mm_schemas.ModelEndpointMonitoringMetricNoData,
             ],
         ],
         list[
-            typing.Union[
+            Union[
                 mm_schemas.ModelEndpointMonitoringMetricValues,
                 mm_schemas.ModelEndpointMonitoringMetricNoData,
             ],
@@ -340,6 +545,12 @@
         timestamp_column = mm_schemas.WriterEvent.END_INFER_TIME
         columns = [timestamp_column, mm_schemas.WriterEvent.APPLICATION_NAME]
         if type == "metrics":
+            if with_result_extra_data:
+                logger.warning(
+                    "The 'with_result_extra_data' parameter is not supported for metrics, just for results",
+                    project=self.project,
+                    endpoint_id=endpoint_id,
+                )
             table = self.tables[mm_schemas.TDEngineSuperTables.METRICS].super_table
             name = mm_schemas.MetricData.METRIC_NAME
             columns += [name, mm_schemas.MetricData.METRIC_VALUE]
@@ -353,6 +564,8 @@
                 mm_schemas.ResultData.RESULT_STATUS,
                 mm_schemas.ResultData.RESULT_KIND,
             ]
+            if with_result_extra_data:
+                columns.append(mm_schemas.ResultData.RESULT_EXTRA_DATA)
             df_handler = self.df_to_results_values
         else:
             raise mlrun.errors.MLRunInvalidArgumentError(
@@ -389,6 +602,10 @@
             is_empty=df.empty,
         )
 
+        if not with_result_extra_data and type == "results":
+            # Set the extra data to an empty string if it's not requested
+            df[mm_schemas.ResultData.RESULT_EXTRA_DATA] = ""
+
         return df_handler(df=df, metrics=metrics, project=self.project)
 
     def read_predictions(
  def read_predictions(
@@ -397,10 +614,10 @@ class TDEngineConnector(TSDBConnector):
397
614
  endpoint_id: str,
398
615
  start: datetime,
399
616
  end: datetime,
400
- aggregation_window: typing.Optional[str] = None,
401
- agg_funcs: typing.Optional[list] = None,
402
- limit: typing.Optional[int] = None,
403
- ) -> typing.Union[
617
+ aggregation_window: Optional[str] = None,
618
+ agg_funcs: Optional[list] = None,
619
+ limit: Optional[int] = None,
620
+ ) -> Union[
404
621
  mm_schemas.ModelEndpointMonitoringMetricValues,
405
622
  mm_schemas.ModelEndpointMonitoringMetricNoData,
406
623
  ]:
@@ -414,7 +631,7 @@
             table=self.tables[mm_schemas.TDEngineSuperTables.PREDICTIONS].super_table,
             start=start,
             end=end,
-            columns=[mm_schemas.EventFieldType.LATENCY],
+            columns=[mm_schemas.EventFieldType.ESTIMATED_PREDICTION_COUNT],
             filter_query=f"endpoint_id='{endpoint_id}'",
             agg_funcs=agg_funcs,
             interval=aggregation_window,
@@ -434,10 +651,10 @@
         df["_wend"] = pd.to_datetime(df["_wend"])
         df.set_index("_wend", inplace=True)
 
-        latency_column = (
-            f"{agg_funcs[0]}({mm_schemas.EventFieldType.LATENCY})"
+        estimated_prediction_count = (
+            f"{agg_funcs[0]}({mm_schemas.EventFieldType.ESTIMATED_PREDICTION_COUNT})"
             if agg_funcs
-            else mm_schemas.EventFieldType.LATENCY
+            else mm_schemas.EventFieldType.ESTIMATED_PREDICTION_COUNT
         )
 
         return mm_schemas.ModelEndpointMonitoringMetricValues(
@@ -445,7 +662,7 @@
             values=list(
                 zip(
                     df.index,
-                    df[latency_column],
+                    df[estimated_prediction_count],
                 )
             ),  # pyright: ignore[reportArgumentType]
         )
@@ -453,56 +670,285 @@
     def get_last_request(
         self,
         endpoint_ids: Union[str, list[str]],
-        start: Union[datetime, str] = "0",
-        end: Union[datetime, str] = "now",
+        start: Optional[datetime] = None,
+        end: Optional[datetime] = None,
     ) -> pd.DataFrame:
-        pass
+        filter_query = self._get_endpoint_filter(endpoint_id=endpoint_ids)
+        start, end = self._get_start_end(start, end)
+        df = self._get_records(
+            table=self.tables[mm_schemas.TDEngineSuperTables.PREDICTIONS].super_table,
+            start=start,
+            end=end,
+            columns=[
+                mm_schemas.EventFieldType.ENDPOINT_ID,
+                mm_schemas.EventFieldType.TIME,
+                mm_schemas.EventFieldType.LATENCY,
+            ],
+            filter_query=filter_query,
+            timestamp_column=mm_schemas.EventFieldType.TIME,
+            agg_funcs=["last"],
+            group_by=mm_schemas.EventFieldType.ENDPOINT_ID,
+            preform_agg_columns=[mm_schemas.EventFieldType.TIME],
+        )
+        if not df.empty:
+            df.dropna(inplace=True)
+            df.rename(
+                columns={
+                    f"last({mm_schemas.EventFieldType.TIME})": mm_schemas.EventFieldType.LAST_REQUEST,
+                    f"{mm_schemas.EventFieldType.LATENCY}": "last_latency",
+                },
+                inplace=True,
+            )
+            df[mm_schemas.EventFieldType.LAST_REQUEST] = pd.to_datetime(
+                df[mm_schemas.EventFieldType.LAST_REQUEST],
+                errors="coerce",
+                format="ISO8601",
+                utc=True,
+            )
+        return df
 
     def get_drift_status(
         self,
         endpoint_ids: Union[str, list[str]],
-        start: Union[datetime, str] = "now-24h",
-        end: Union[datetime, str] = "now",
+        start: Optional[datetime] = None,
+        end: Optional[datetime] = None,
+        get_raw: bool = False,
     ) -> pd.DataFrame:
-        pass
+        filter_query = self._get_endpoint_filter(endpoint_id=endpoint_ids)
+        start = start or (mlrun.utils.datetime_now() - timedelta(hours=24))
+        start, end = self._get_start_end(start, end)
+        df = self._get_records(
+            table=self.tables[mm_schemas.TDEngineSuperTables.APP_RESULTS].super_table,
+            start=start,
+            end=end,
+            columns=[
+                mm_schemas.ResultData.RESULT_STATUS,
+                mm_schemas.EventFieldType.ENDPOINT_ID,
+            ],
+            filter_query=filter_query,
+            timestamp_column=mm_schemas.WriterEvent.END_INFER_TIME,
+            agg_funcs=["max"],
+            group_by=mm_schemas.EventFieldType.ENDPOINT_ID,
+            preform_agg_columns=[mm_schemas.ResultData.RESULT_STATUS],
+        )
+        df.rename(
+            columns={
+                f"max({mm_schemas.ResultData.RESULT_STATUS})": mm_schemas.ResultData.RESULT_STATUS
+            },
+            inplace=True,
+        )
+        if not df.empty:
+            df.dropna(inplace=True)
+        return df
 
     def get_metrics_metadata(
         self,
-        endpoint_id: str,
-        start: Union[datetime, str] = "0",
-        end: Union[datetime, str] = "now",
+        endpoint_id: Union[str, list[str]],
+        start: Optional[datetime] = None,
+        end: Optional[datetime] = None,
     ) -> pd.DataFrame:
-        pass
+        start, end = self._get_start_end(start, end)
+        df = self._get_records(
+            table=self.tables[mm_schemas.TDEngineSuperTables.METRICS].super_table,
+            start=start,
+            end=end,
+            columns=[
+                mm_schemas.ApplicationEvent.APPLICATION_NAME,
+                mm_schemas.MetricData.METRIC_NAME,
+                mm_schemas.EventFieldType.ENDPOINT_ID,
+            ],
+            filter_query=self._get_endpoint_filter(endpoint_id=endpoint_id),
+            timestamp_column=mm_schemas.WriterEvent.END_INFER_TIME,
+            group_by=[
+                mm_schemas.WriterEvent.APPLICATION_NAME,
+                mm_schemas.MetricData.METRIC_NAME,
+                mm_schemas.EventFieldType.ENDPOINT_ID,
+            ],
+            agg_funcs=["last"],
+        )
+        df.rename(
+            columns={
+                f"last({mm_schemas.ApplicationEvent.APPLICATION_NAME})": mm_schemas.ApplicationEvent.APPLICATION_NAME,
+                f"last({mm_schemas.MetricData.METRIC_NAME})": mm_schemas.MetricData.METRIC_NAME,
+                f"last({mm_schemas.EventFieldType.ENDPOINT_ID})": mm_schemas.EventFieldType.ENDPOINT_ID,
+            },
+            inplace=True,
+        )
+        if not df.empty:
+            df.dropna(inplace=True)
+        return df
 
     def get_results_metadata(
         self,
-        endpoint_id: str,
-        start: Union[datetime, str] = "0",
-        end: Union[datetime, str] = "now",
+        endpoint_id: Union[str, list[str]],
+        start: Optional[datetime] = None,
+        end: Optional[datetime] = None,
     ) -> pd.DataFrame:
-        pass
+        start, end = self._get_start_end(start, end)
+        df = self._get_records(
+            table=self.tables[mm_schemas.TDEngineSuperTables.APP_RESULTS].super_table,
+            start=start,
+            end=end,
+            columns=[
+                mm_schemas.ApplicationEvent.APPLICATION_NAME,
+                mm_schemas.ResultData.RESULT_NAME,
+                mm_schemas.ResultData.RESULT_KIND,
+                mm_schemas.EventFieldType.ENDPOINT_ID,
+            ],
+            filter_query=self._get_endpoint_filter(endpoint_id=endpoint_id),
+            timestamp_column=mm_schemas.WriterEvent.END_INFER_TIME,
+            group_by=[
+                mm_schemas.WriterEvent.APPLICATION_NAME,
+                mm_schemas.ResultData.RESULT_NAME,
+                mm_schemas.EventFieldType.ENDPOINT_ID,
+            ],
+            agg_funcs=["last"],
+        )
+        df.rename(
+            columns={
+                f"last({mm_schemas.ApplicationEvent.APPLICATION_NAME})": mm_schemas.ApplicationEvent.APPLICATION_NAME,
+                f"last({mm_schemas.ResultData.RESULT_NAME})": mm_schemas.ResultData.RESULT_NAME,
+                f"last({mm_schemas.ResultData.RESULT_KIND})": mm_schemas.ResultData.RESULT_KIND,
+                f"last({mm_schemas.EventFieldType.ENDPOINT_ID})": mm_schemas.EventFieldType.ENDPOINT_ID,
+            },
+            inplace=True,
+        )
+        if not df.empty:
+            df.dropna(inplace=True)
+        return df
 
     def get_error_count(
         self,
         endpoint_ids: Union[str, list[str]],
-        start: Union[datetime, str] = "0",
-        end: Union[datetime, str] = "now",
+        start: Optional[datetime] = None,
+        end: Optional[datetime] = None,
+        get_raw: bool = False,
     ) -> pd.DataFrame:
-        pass
+        filter_query = self._get_endpoint_filter(endpoint_id=endpoint_ids)
+        filter_query += f" AND {mm_schemas.EventFieldType.ERROR_TYPE} = '{mm_schemas.EventFieldType.INFER_ERROR}'"
+        start, end = self._get_start_end(start, end)
+        df = self._get_records(
+            table=self.tables[mm_schemas.TDEngineSuperTables.ERRORS].super_table,
+            start=start,
+            end=end,
+            columns=[
+                mm_schemas.EventFieldType.MODEL_ERROR,
+                mm_schemas.EventFieldType.ENDPOINT_ID,
+            ],
+            agg_funcs=["count"],
+            filter_query=filter_query,
+            group_by=mm_schemas.EventFieldType.ENDPOINT_ID,
+            preform_agg_columns=[mm_schemas.EventFieldType.MODEL_ERROR],
+        )
+        df.rename(
+            columns={f"count({mm_schemas.EventFieldType.MODEL_ERROR})": "error_count"},
+            inplace=True,
+        )
+        if not df.empty:
+            df.dropna(inplace=True)
+        return df
 
     def get_avg_latency(
         self,
         endpoint_ids: Union[str, list[str]],
-        start: Union[datetime, str] = "0",
-        end: Union[datetime, str] = "now",
+        start: Optional[datetime] = None,
+        end: Optional[datetime] = None,
+        get_raw: bool = False,
     ) -> pd.DataFrame:
-        pass
+        endpoint_ids = (
+            endpoint_ids if isinstance(endpoint_ids, list) else [endpoint_ids]
+        )
+        start = start or (mlrun.utils.datetime_now() - timedelta(hours=24))
+        start, end = self._get_start_end(start, end)
+        df = self._get_records(
+            table=self.tables[mm_schemas.TDEngineSuperTables.PREDICTIONS].super_table,
+            start=start,
+            end=end,
+            columns=[
+                mm_schemas.EventFieldType.LATENCY,
+                mm_schemas.EventFieldType.ENDPOINT_ID,
+            ],
+            agg_funcs=["avg"],
+            filter_query=f"endpoint_id IN({str(endpoint_ids)[1:-1]})",
+            group_by=mm_schemas.EventFieldType.ENDPOINT_ID,
+            preform_agg_columns=[mm_schemas.EventFieldType.LATENCY],
+        )
+        df.rename(
+            columns={f"avg({mm_schemas.EventFieldType.LATENCY})": "avg_latency"},
+            inplace=True,
+        )
+        if not df.empty:
+            df.dropna(inplace=True)
+        return df
+
+    async def add_basic_metrics(
+        self,
+        model_endpoint_objects: list[mlrun.common.schemas.ModelEndpoint],
+        project: str,
+        run_in_threadpool: Callable,
+        metric_list: Optional[list[str]] = None,
+    ) -> list[mlrun.common.schemas.ModelEndpoint]:
+        """
+        Add basic metrics to the model endpoint objects.
+
+        :param model_endpoint_objects: A list of `ModelEndpoint` objects that will
+                                       be filled with the relevant basic metrics.
+        :param project:                The name of the project.
+        :param run_in_threadpool:      A function that runs another function in a thread pool.
+        :param metric_list:            List of metrics to include from the time series DB. Defaults to all metrics.
+
+        :return: A list of `ModelEndpoint` objects.
+        """
+
+        uids = [mep.metadata.uid for mep in model_endpoint_objects]
+
+        metric_name_to_function = {
+            "error_count": self.get_error_count,
+            "last_request": self.get_last_request,
+            "avg_latency": self.get_avg_latency,
+            "result_status": self.get_drift_status,
+        }
+        if metric_list is not None:
+            for metric_name in list(metric_name_to_function):
+                if metric_name not in metric_list:
+                    del metric_name_to_function[metric_name]
+
+        metric_name_to_df = {
+            metric_name: function(endpoint_ids=uids)
+            for metric_name, function in metric_name_to_function.items()
+        }
+
+        def add_metrics(
+            mep: mlrun.common.schemas.ModelEndpoint,
+            df_dictionary: dict[str, pd.DataFrame],
+        ):
+            for metric in df_dictionary.keys():
+                df = df_dictionary.get(metric, pd.DataFrame())
+                if not df.empty:
+                    line = df[df["endpoint_id"] == mep.metadata.uid]
+                    if not line.empty and metric in line:
+                        value = line[metric].item()
+                        if isinstance(value, pd.Timestamp):
+                            value = value.to_pydatetime()
+                        setattr(mep.status, metric, value)
+
+            return mep
+
+        return list(
+            map(
+                lambda mep: add_metrics(
+                    mep=mep,
+                    df_dictionary=metric_name_to_df,
+                ),
+                model_endpoint_objects,
+            )
+        )
 
     # Note: this function serves as a reference for checking the TSDB for the existence of a metric.
     #
     # def read_prediction_metric_for_endpoint_if_exists(
     #     self, endpoint_id: str
-    # ) -> typing.Optional[mm_schemas.ModelEndpointMonitoringMetric]:
+    # ) -> Optional[mm_schemas.ModelEndpointMonitoringMetric]:
     #     """
     #     Read the "invocations" metric for the provided model endpoint, and return the metric object
    #     if it exists.
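A possible calling pattern for the new `add_basic_metrics` (a sketch: the connector, endpoint list, and threadpool runner are placeholders; in the MLRun API server the caller supplies a real `run_in_threadpool`):

    import asyncio

    async def enrich(connector, endpoints):
        # Trivial stand-in for the caller-provided threadpool runner.
        async def run_in_threadpool(fn, *args, **kwargs):
            return fn(*args, **kwargs)

        return await connector.add_basic_metrics(
            model_endpoint_objects=endpoints,  # list[mlrun.common.schemas.ModelEndpoint]
            project="my-project",
            run_in_threadpool=run_in_threadpool,
            metric_list=["last_request", "error_count"],  # None selects all four metrics
        )

    # asyncio.run(enrich(connector, endpoints))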