mlrun 1.10.0rc18__py3-none-any.whl → 1.11.0rc16__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of mlrun might be problematic; see the registry's advisory page for more details.

Files changed (167)
  1. mlrun/__init__.py +24 -3
  2. mlrun/__main__.py +0 -4
  3. mlrun/artifacts/dataset.py +2 -2
  4. mlrun/artifacts/document.py +6 -1
  5. mlrun/artifacts/llm_prompt.py +21 -15
  6. mlrun/artifacts/model.py +3 -3
  7. mlrun/artifacts/plots.py +1 -1
  8. mlrun/{model_monitoring/db/tsdb/tdengine → auth}/__init__.py +2 -3
  9. mlrun/auth/nuclio.py +89 -0
  10. mlrun/auth/providers.py +429 -0
  11. mlrun/auth/utils.py +415 -0
  12. mlrun/common/constants.py +14 -0
  13. mlrun/common/model_monitoring/helpers.py +123 -0
  14. mlrun/common/runtimes/constants.py +28 -0
  15. mlrun/common/schemas/__init__.py +14 -3
  16. mlrun/common/schemas/alert.py +2 -2
  17. mlrun/common/schemas/api_gateway.py +3 -0
  18. mlrun/common/schemas/auth.py +12 -10
  19. mlrun/common/schemas/client_spec.py +4 -0
  20. mlrun/common/schemas/constants.py +25 -0
  21. mlrun/common/schemas/frontend_spec.py +1 -8
  22. mlrun/common/schemas/function.py +34 -0
  23. mlrun/common/schemas/hub.py +33 -20
  24. mlrun/common/schemas/model_monitoring/__init__.py +2 -1
  25. mlrun/common/schemas/model_monitoring/constants.py +12 -15
  26. mlrun/common/schemas/model_monitoring/functions.py +13 -4
  27. mlrun/common/schemas/model_monitoring/model_endpoints.py +11 -0
  28. mlrun/common/schemas/pipeline.py +1 -1
  29. mlrun/common/schemas/secret.py +17 -2
  30. mlrun/common/secrets.py +95 -1
  31. mlrun/common/types.py +10 -10
  32. mlrun/config.py +69 -19
  33. mlrun/data_types/infer.py +2 -2
  34. mlrun/datastore/__init__.py +12 -5
  35. mlrun/datastore/azure_blob.py +162 -47
  36. mlrun/datastore/base.py +274 -10
  37. mlrun/datastore/datastore.py +7 -2
  38. mlrun/datastore/datastore_profile.py +84 -22
  39. mlrun/datastore/model_provider/huggingface_provider.py +225 -41
  40. mlrun/datastore/model_provider/mock_model_provider.py +87 -0
  41. mlrun/datastore/model_provider/model_provider.py +206 -74
  42. mlrun/datastore/model_provider/openai_provider.py +226 -66
  43. mlrun/datastore/s3.py +39 -18
  44. mlrun/datastore/sources.py +1 -1
  45. mlrun/datastore/store_resources.py +4 -4
  46. mlrun/datastore/storeytargets.py +17 -12
  47. mlrun/datastore/targets.py +1 -1
  48. mlrun/datastore/utils.py +25 -6
  49. mlrun/datastore/v3io.py +1 -1
  50. mlrun/db/base.py +63 -32
  51. mlrun/db/httpdb.py +373 -153
  52. mlrun/db/nopdb.py +54 -21
  53. mlrun/errors.py +4 -2
  54. mlrun/execution.py +66 -25
  55. mlrun/feature_store/api.py +1 -1
  56. mlrun/feature_store/common.py +1 -1
  57. mlrun/feature_store/feature_vector_utils.py +1 -1
  58. mlrun/feature_store/steps.py +8 -6
  59. mlrun/frameworks/_common/utils.py +3 -3
  60. mlrun/frameworks/_dl_common/loggers/logger.py +1 -1
  61. mlrun/frameworks/_dl_common/loggers/tensorboard_logger.py +2 -1
  62. mlrun/frameworks/_ml_common/loggers/mlrun_logger.py +1 -1
  63. mlrun/frameworks/_ml_common/utils.py +2 -1
  64. mlrun/frameworks/auto_mlrun/auto_mlrun.py +4 -3
  65. mlrun/frameworks/lgbm/mlrun_interfaces/mlrun_interface.py +2 -1
  66. mlrun/frameworks/onnx/dataset.py +2 -1
  67. mlrun/frameworks/onnx/mlrun_interface.py +2 -1
  68. mlrun/frameworks/pytorch/callbacks/logging_callback.py +5 -4
  69. mlrun/frameworks/pytorch/callbacks/mlrun_logging_callback.py +2 -1
  70. mlrun/frameworks/pytorch/callbacks/tensorboard_logging_callback.py +2 -1
  71. mlrun/frameworks/pytorch/utils.py +2 -1
  72. mlrun/frameworks/sklearn/metric.py +2 -1
  73. mlrun/frameworks/tf_keras/callbacks/logging_callback.py +5 -4
  74. mlrun/frameworks/tf_keras/callbacks/mlrun_logging_callback.py +2 -1
  75. mlrun/frameworks/tf_keras/callbacks/tensorboard_logging_callback.py +2 -1
  76. mlrun/hub/__init__.py +52 -0
  77. mlrun/hub/base.py +142 -0
  78. mlrun/hub/module.py +172 -0
  79. mlrun/hub/step.py +113 -0
  80. mlrun/k8s_utils.py +105 -16
  81. mlrun/launcher/base.py +15 -7
  82. mlrun/launcher/local.py +4 -1
  83. mlrun/model.py +14 -4
  84. mlrun/model_monitoring/__init__.py +0 -1
  85. mlrun/model_monitoring/api.py +65 -28
  86. mlrun/model_monitoring/applications/__init__.py +1 -1
  87. mlrun/model_monitoring/applications/base.py +299 -128
  88. mlrun/model_monitoring/applications/context.py +2 -4
  89. mlrun/model_monitoring/controller.py +132 -58
  90. mlrun/model_monitoring/db/_schedules.py +38 -29
  91. mlrun/model_monitoring/db/_stats.py +6 -16
  92. mlrun/model_monitoring/db/tsdb/__init__.py +9 -7
  93. mlrun/model_monitoring/db/tsdb/base.py +29 -9
  94. mlrun/model_monitoring/db/tsdb/preaggregate.py +234 -0
  95. mlrun/model_monitoring/db/tsdb/stream_graph_steps.py +63 -0
  96. mlrun/model_monitoring/db/tsdb/timescaledb/queries/timescaledb_metrics_queries.py +414 -0
  97. mlrun/model_monitoring/db/tsdb/timescaledb/queries/timescaledb_predictions_queries.py +376 -0
  98. mlrun/model_monitoring/db/tsdb/timescaledb/queries/timescaledb_results_queries.py +590 -0
  99. mlrun/model_monitoring/db/tsdb/timescaledb/timescaledb_connection.py +434 -0
  100. mlrun/model_monitoring/db/tsdb/timescaledb/timescaledb_connector.py +541 -0
  101. mlrun/model_monitoring/db/tsdb/timescaledb/timescaledb_operations.py +808 -0
  102. mlrun/model_monitoring/db/tsdb/timescaledb/timescaledb_schema.py +502 -0
  103. mlrun/model_monitoring/db/tsdb/timescaledb/timescaledb_stream.py +163 -0
  104. mlrun/model_monitoring/db/tsdb/timescaledb/timescaledb_stream_graph_steps.py +60 -0
  105. mlrun/model_monitoring/db/tsdb/timescaledb/utils/timescaledb_dataframe_processor.py +141 -0
  106. mlrun/model_monitoring/db/tsdb/timescaledb/utils/timescaledb_query_builder.py +585 -0
  107. mlrun/model_monitoring/db/tsdb/timescaledb/writer_graph_steps.py +73 -0
  108. mlrun/model_monitoring/db/tsdb/v3io/stream_graph_steps.py +20 -9
  109. mlrun/model_monitoring/db/tsdb/v3io/v3io_connector.py +235 -51
  110. mlrun/model_monitoring/features_drift_table.py +2 -1
  111. mlrun/model_monitoring/helpers.py +30 -6
  112. mlrun/model_monitoring/stream_processing.py +34 -28
  113. mlrun/model_monitoring/writer.py +224 -4
  114. mlrun/package/__init__.py +2 -1
  115. mlrun/platforms/__init__.py +0 -43
  116. mlrun/platforms/iguazio.py +8 -4
  117. mlrun/projects/operations.py +17 -11
  118. mlrun/projects/pipelines.py +2 -2
  119. mlrun/projects/project.py +187 -123
  120. mlrun/run.py +95 -21
  121. mlrun/runtimes/__init__.py +2 -186
  122. mlrun/runtimes/base.py +103 -25
  123. mlrun/runtimes/constants.py +225 -0
  124. mlrun/runtimes/daskjob.py +5 -2
  125. mlrun/runtimes/databricks_job/databricks_runtime.py +2 -1
  126. mlrun/runtimes/local.py +5 -2
  127. mlrun/runtimes/mounts.py +20 -2
  128. mlrun/runtimes/nuclio/__init__.py +12 -7
  129. mlrun/runtimes/nuclio/api_gateway.py +36 -6
  130. mlrun/runtimes/nuclio/application/application.py +339 -40
  131. mlrun/runtimes/nuclio/function.py +222 -72
  132. mlrun/runtimes/nuclio/serving.py +132 -42
  133. mlrun/runtimes/pod.py +213 -21
  134. mlrun/runtimes/utils.py +49 -9
  135. mlrun/secrets.py +99 -14
  136. mlrun/serving/__init__.py +2 -0
  137. mlrun/serving/remote.py +84 -11
  138. mlrun/serving/routers.py +26 -44
  139. mlrun/serving/server.py +138 -51
  140. mlrun/serving/serving_wrapper.py +6 -2
  141. mlrun/serving/states.py +997 -283
  142. mlrun/serving/steps.py +62 -0
  143. mlrun/serving/system_steps.py +149 -95
  144. mlrun/serving/v2_serving.py +9 -10
  145. mlrun/track/trackers/mlflow_tracker.py +29 -31
  146. mlrun/utils/helpers.py +292 -94
  147. mlrun/utils/http.py +9 -2
  148. mlrun/utils/notifications/notification/base.py +18 -0
  149. mlrun/utils/notifications/notification/git.py +3 -5
  150. mlrun/utils/notifications/notification/mail.py +39 -16
  151. mlrun/utils/notifications/notification/slack.py +2 -4
  152. mlrun/utils/notifications/notification/webhook.py +2 -5
  153. mlrun/utils/notifications/notification_pusher.py +3 -3
  154. mlrun/utils/version/version.json +2 -2
  155. mlrun/utils/version/version.py +3 -4
  156. {mlrun-1.10.0rc18.dist-info → mlrun-1.11.0rc16.dist-info}/METADATA +63 -74
  157. {mlrun-1.10.0rc18.dist-info → mlrun-1.11.0rc16.dist-info}/RECORD +161 -143
  158. mlrun/api/schemas/__init__.py +0 -259
  159. mlrun/db/auth_utils.py +0 -152
  160. mlrun/model_monitoring/db/tsdb/tdengine/schemas.py +0 -344
  161. mlrun/model_monitoring/db/tsdb/tdengine/stream_graph_steps.py +0 -75
  162. mlrun/model_monitoring/db/tsdb/tdengine/tdengine_connection.py +0 -281
  163. mlrun/model_monitoring/db/tsdb/tdengine/tdengine_connector.py +0 -1266
  164. {mlrun-1.10.0rc18.dist-info → mlrun-1.11.0rc16.dist-info}/WHEEL +0 -0
  165. {mlrun-1.10.0rc18.dist-info → mlrun-1.11.0rc16.dist-info}/entry_points.txt +0 -0
  166. {mlrun-1.10.0rc18.dist-info → mlrun-1.11.0rc16.dist-info}/licenses/LICENSE +0 -0
  167. {mlrun-1.10.0rc18.dist-info → mlrun-1.11.0rc16.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,434 @@
1
+ # Copyright 2025 Iguazio
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import random
16
+ import time
17
+ from collections.abc import Callable
18
+ from typing import Any, Optional, Union
19
+
20
+ import pandas as pd
21
+ import psycopg
22
+ import semver
23
+ from psycopg_pool import ConnectionPool
24
+
25
+ import mlrun.errors
26
+ from mlrun.config import config
27
+ from mlrun.model_monitoring.db.tsdb.preaggregate import PreAggregateManager
28
+ from mlrun.utils import logger
29
+
30
+
31
class QueryResult:
    """Container for query results with field metadata.

    Holds the fetched rows (``data``, a list of tuples) together with the
    ordered column names (``fields``) so callers can rebuild a DataFrame or
    dict view of the result without re-querying cursor metadata.
    """

    def __init__(self, data: list[tuple], fields: list[str]):
        self.data = data
        self.fields = fields

    def __eq__(self, other: object) -> bool:
        # Return NotImplemented for foreign types so Python falls back to the
        # other operand's comparison (the previous implementation raised
        # AttributeError when compared against a non-QueryResult object).
        if not isinstance(other, QueryResult):
            return NotImplemented
        return self.data == other.data and self.fields == other.fields

    def __repr__(self) -> str:
        return f"QueryResult(rows={len(self.data)}, fields={self.fields})"
43
+
44
+
45
+ class Statement:
46
+ """
47
+ Represents a parameterized statement for TimescaleDB.
48
+
49
+ This class encapsulates SQL statements with parameters, providing a clean
50
+ interface
51
+ """
52
+
53
+ def __init__(
54
+ self,
55
+ sql: str,
56
+ parameters: Optional[Union[tuple, list, dict]] = None,
57
+ execute_many: bool = False,
58
+ ):
59
+ """
60
+ Initialize a parameterized statement.
61
+
62
+ :param sql: SQL query with parameter placeholders. Use %(name)s for named parameters
63
+ or %s for positional parameters.
64
+ :param parameters: Parameters for the SQL statement. Can be:
65
+ - tuple/list for positional parameters
66
+ - dict for named parameters
67
+ - list of tuples/dicts for execute_many=True
68
+ :param execute_many: If True, expects parameters to be a sequence of parameter sets
69
+ for batch execution using executemany()
70
+ """
71
+ self.sql = sql
72
+ self.parameters = parameters
73
+ self.execute_many = execute_many
74
+
75
+ def execute(self, cursor) -> None:
76
+ """Execute the statement using the provided cursor."""
77
+ if self.execute_many:
78
+ if not isinstance(self.parameters, list | tuple):
79
+ raise ValueError(
80
+ "execute_many=True requires parameters to be a sequence"
81
+ )
82
+ cursor.executemany(self.sql, self.parameters)
83
+ else:
84
+ cursor.execute(self.sql, self.parameters)
85
+
86
+
87
class TimescaleDBConnection:
    """
    TimescaleDB connection with shared connection pool and parameterized query support.

    Wraps a lazily created ``psycopg_pool.ConnectionPool`` and exposes a
    ``run()`` entry point that executes statements and queries with
    deadlock-aware retry logic. The first use verifies that the TimescaleDB
    extension is installed and meets ``MIN_TIMESCALEDB_VERSION``.
    """

    # TimescaleDB version requirements
    MIN_TIMESCALEDB_VERSION = (
        "2.7.0"  # Minimum version with finalized continuous aggregates
    )

    # Deadlock retry configuration
    MAX_DEADLOCK_RETRIES = 3  # Maximum deadlock-specific retry attempts

    def __init__(
        self,
        dsn: str,
        min_connections: int = 1,
        max_connections: int = 10,
        max_retries: int = 3,
        retry_delay: float = 1.0,
        autocommit: bool = False,
    ):
        """
        :param dsn: PostgreSQL/TimescaleDB connection string passed to the pool.
        :param min_connections: Minimum pool size.
        :param max_connections: Maximum pool size.
        :param max_retries: Retry budget for connection/interface errors
            (deadlocks use the separate ``MAX_DEADLOCK_RETRIES`` budget).
        :param retry_delay: Base delay (seconds) for the exponential backoff
            used on connection errors.
        :param autocommit: When True, run operations in autocommit mode and
            skip the explicit ``commit()`` after each operation.
        """
        self._dsn = dsn
        self._min_connections = min_connections
        self._max_connections = max_connections
        self._max_retries = max_retries
        self._retry_delay = retry_delay
        # Statements executed before every batch (e.g. session settings);
        # callers may append str or Statement entries.
        self.prefix_statements: list[Union[str, Statement]] = []
        self._autocommit = autocommit

        # Connection pools (lazy initialization)
        self._pool: Optional[ConnectionPool] = None
        self._timescaledb_version: Optional[str] = None
        self._version_checked: bool = False

    @property
    def pool(self) -> ConnectionPool:
        """Get or create the synchronous connection pool."""
        if self._pool is None:
            self._pool = ConnectionPool(
                conninfo=self._dsn,
                min_size=self._min_connections,
                max_size=self._max_connections,
                # Pool checkout timeout comes from mlrun config, not a ctor arg.
                timeout=float(
                    config.model_endpoint_monitoring.tsdb.connection_pool_timeout
                ),
            )
        return self._pool

    def close(self) -> None:
        """Close the connection pool if it exists."""
        if self._pool is not None:
            self._pool.close()
            self._pool = None

    def _parse_version(self, version_string: str) -> semver.VersionInfo:
        """Parse TimescaleDB version string using semver.

        :raises mlrun.errors.MLRunRuntimeError: if the string is not valid semver.
        """
        try:
            # Handle versions like "2.22.0", "2.7.1-dev", etc.
            # semver.VersionInfo.parse handles pre-release versions automatically
            return semver.VersionInfo.parse(version_string)
        except ValueError as e:
            raise mlrun.errors.MLRunRuntimeError(
                f"Invalid TimescaleDB version format: {version_string}"
            ) from e

    def _check_timescaledb_version(self) -> None:
        """Verify the TimescaleDB extension exists and meets the minimum version.

        Runs once per instance (guarded by ``_version_checked``); caches the
        detected version in ``_timescaledb_version``.

        :raises mlrun.errors.MLRunRuntimeError: if the version query fails, the
            extension is missing, or the installed version is too old.
        """
        if self._version_checked:
            return

        try:
            with self.pool.connection() as conn:
                with conn.cursor() as cursor:
                    # Check if TimescaleDB extension is installed
                    cursor.execute(
                        "SELECT extversion FROM pg_extension WHERE extname = %s",
                        ("timescaledb",),
                    )
                    result = cursor.fetchone()
        except psycopg.Error as e:
            raise mlrun.errors.MLRunRuntimeError(
                f"Failed to check TimescaleDB version: {e}"
            ) from e

        if not result:
            raise mlrun.errors.MLRunRuntimeError(
                "TimescaleDB extension is not installed"
            )

        self._timescaledb_version = result[0]

        # Version processing logic outside try/catch - not a database operation
        # _timescaledb_version is guaranteed to be non-None at this point
        current_version = self._parse_version(self._timescaledb_version)  # type: ignore[arg-type]
        min_version = self._parse_version(self.MIN_TIMESCALEDB_VERSION)

        if current_version < min_version:
            raise mlrun.errors.MLRunRuntimeError(
                f"TimescaleDB version {self._timescaledb_version} is not supported. "
                f"Minimum required version: {self.MIN_TIMESCALEDB_VERSION} "
                f"(required for finalized continuous aggregates)"
            )

        self._version_checked = True

    @property
    def timescaledb_version(self) -> Optional[str]:
        """Get the TimescaleDB version (triggers version check if not done)."""
        if not self._version_checked:
            self._check_timescaledb_version()
        return self._timescaledb_version

    def run(
        self,
        statements: Optional[Union[str, Statement, list[Union[str, Statement]]]] = None,
        query: Optional[Union[str, Statement]] = None,
    ) -> Optional[QueryResult]:
        """
        Execute statements and optionally return query results with deadlock-aware retry logic.

        Supports both string SQL and parameterized Statement objects.
        Uses deadlock-specific retry logic for optimal performance.

        NOTE(review): statements and the query run as two separate retried
        transactions, so a retried query does not re-run the statements.

        :param statements: SQL statements to execute. Can be:
                          - str: Simple SQL string
                          - Statement: Parameterized statement
                          - list: Mix of str and Statement objects
        :param query: Optional query to execute after statements. Can be str or Statement.
        :return: QueryResult if query provided, None otherwise
        """
        # Perform version check on first use
        if not self._version_checked:
            self._check_timescaledb_version()

        if statements := self._normalize_statements(statements):
            self._execute_with_retry(
                cursor_operation_callable=lambda cursor: self._execute_statements(
                    cursor, statements
                ),
                operation_name="statements",
            )

        # Execute query with retry logic for recoverable errors
        if query:
            return self._execute_with_retry(
                cursor_operation_callable=lambda cursor: self._execute_query(
                    cursor, query
                ),
                operation_name="query",
            )

        return None

    def _normalize_statements(
        self, statements: Optional[Union[str, Statement, list[Union[str, Statement]]]]
    ) -> list[Union[str, Statement]]:
        """Convert statements to a normalized list format (None becomes [])."""
        if statements is None:
            return []
        return [statements] if isinstance(statements, str | Statement) else statements

    def _execute_operation(
        self,
        statements: list[Union[str, Statement]],
        query: Optional[Union[str, Statement]],
    ) -> Optional[QueryResult]:
        """Execute a single database operation (statements + optional query).

        NOTE(review): not referenced by ``run()``, which goes through
        ``_execute_with_retry`` instead — confirm whether this non-retrying
        path still has callers.
        """
        with self.pool.connection() as conn:
            conn.autocommit = self._autocommit

            with conn.cursor() as cursor:
                self._execute_statements(cursor, statements)
                # Commit the statements before running the read query.
                if not self._autocommit:
                    conn.commit()
                return self._execute_query(cursor, query) if query else None

    def _execute_statements(
        self, cursor: psycopg.Cursor, statements: list[Union[str, Statement]]
    ) -> None:
        """Execute prefix statements and main statements."""
        # Execute prefix statements
        for stmt in self.prefix_statements:
            if isinstance(stmt, Statement):
                stmt.execute(cursor)
            else:
                cursor.execute(stmt)

        # Execute main statements
        for statement in statements:
            if isinstance(statement, Statement):
                statement.execute(cursor)
            else:
                cursor.execute(statement)

    def _execute_query(
        self, cursor: psycopg.Cursor, query: Union[str, Statement]
    ) -> QueryResult:
        """Execute a query and return formatted results.

        Returns an empty QueryResult when the cursor has no result description
        (e.g. the query produced no result set).
        """
        if isinstance(query, Statement):
            query.execute(cursor)
        else:
            cursor.execute(query)

        if cursor.description:
            field_names = [desc.name for desc in cursor.description]
            results = cursor.fetchall()
            # Normalize driver row objects to plain tuples.
            data = [tuple(row) for row in results]
            return QueryResult(data, field_names)
        else:
            return QueryResult([], [])

    def execute_with_fallback(
        self,
        pre_aggregate_manager: PreAggregateManager,
        pre_agg_query_builder: Callable[[], str],
        raw_query_builder: Callable[[], str],
        interval: Optional[str] = None,
        agg_funcs: Optional[list[str]] = None,
        column_mapping_rules: Optional[dict[str, list[str]]] = None,
        debug_name: str = "query",
    ) -> pd.DataFrame:
        """
        Execute a query with pre-aggregate optimization and automatic fallback.

        This method encapsulates the common pattern of trying pre-aggregate queries first,
        then falling back to raw data queries if the pre-aggregate fails.

        :param pre_aggregate_manager: Manager for pre-aggregate operations
        :param pre_agg_query_builder: Function that returns pre-aggregate query string
        :param raw_query_builder: Function that returns raw data query string
        :param interval: Time interval for aggregation
        :param agg_funcs: List of aggregation functions
        :param column_mapping_rules: Rules for mapping column names in pre-aggregate results
        :param debug_name: Name for debugging/logging purposes
        :return: DataFrame with query results
        """
        # Import locally to avoid circular dependency
        from mlrun.model_monitoring.db.tsdb.timescaledb.utils.timescaledb_dataframe_processor import (
            TimescaleDBDataFrameProcessor,
        )

        df_processor = TimescaleDBDataFrameProcessor()

        if pre_aggregate_manager.can_use_pre_aggregates(
            interval=interval, agg_funcs=agg_funcs
        ):
            try:
                # Try pre-aggregate query first
                query = pre_agg_query_builder()
                result = self.run(query=query)
                df = df_processor.from_query_result(result)

                if not df.empty and column_mapping_rules:
                    # Apply flexible column mapping for pre-aggregate results
                    mapping = df_processor.build_flexible_column_mapping(
                        df, column_mapping_rules
                    )
                    df = df_processor.apply_column_mapping(df, mapping)

                return df

            except Exception as e:
                # Broad catch is intentional: any pre-aggregate failure should
                # degrade to the raw-data path rather than surface to callers.
                logger.warning(
                    f"Pre-aggregate {debug_name} query failed, falling back to raw data",
                    error=mlrun.errors.err_to_str(e),
                )

        # Fallback to raw data query
        raw_query = raw_query_builder()
        result = self.run(query=raw_query)
        return df_processor.from_query_result(result)

    def _execute_with_retry(
        self,
        cursor_operation_callable: Callable[
            [psycopg.Cursor[Any]], Optional[QueryResult]
        ],
        operation_name: str,
    ) -> Optional[QueryResult]:
        """
        Generic retry wrapper for database operations.

        PostgreSQL Error Handling Strategy Matrix (Currently Implemented):

        | Category                    |Retry?| Timing           | Reason                           |
        |-----------------------------|------|------------------|----------------------------------|
        | DeadlockDetected            | Yes  | 0.1s, 0.2s, 0.4s | Auto-rollback, fast resolution   |
        | Other OperationalError      | Yes  | 1s, 2s, 4s       | Network/server recovery time     |
        | InterfaceError              | Yes  | 1s, 2s, 4s       | Client connection issues         |
        | All Other psycopg.Error     | No   | -                | Pass through without wrapping    |

        Note: PostgreSQL automatically rolls back failed transactions, so explicit
        rollback is only needed for DeadlockDetected where we retry the operation.

        Note: Unhandled errors are passed through without wrapping to preserve
        original exception types and stack traces for proper debugging.

        :param cursor_operation_callable: Function that takes a cursor and executes the operation
        :param operation_name: Name for logging (e.g., "statements", "query")
        :return: Result of cursor_operation_callable()
        :raises mlrun.errors.MLRunRuntimeError: once the relevant retry budget
            is exhausted (deadlock or connection).
        """
        # Separate counters: deadlocks get a fast, short budget while
        # connection problems get the slower configurable budget.
        deadlock_attempts = 0
        connection_attempts = 0

        while True:
            try:
                # Execute operation within a transaction
                with self.pool.connection() as conn:
                    conn.autocommit = self._autocommit
                    with conn.cursor() as cursor:
                        result = cursor_operation_callable(cursor)
                        if not self._autocommit:
                            conn.commit()
                        return result
            except (psycopg.OperationalError, psycopg.InterfaceError) as e:
                # Different retry limits and timing based on error type
                # (DeadlockDetected is a subclass of OperationalError).
                if isinstance(e, psycopg.errors.DeadlockDetected):
                    if deadlock_attempts >= self.MAX_DEADLOCK_RETRIES:
                        raise mlrun.errors.MLRunRuntimeError(
                            f"Database {operation_name} failed: deadlock persisted "
                            f"after {self.MAX_DEADLOCK_RETRIES} retries: {e}"
                        ) from e
                    # Fast retry for deadlocks: ~0.1s, ~0.2s, ~0.4s with jitter
                    delay = (2**deadlock_attempts) * 0.1 + random.uniform(0, 0.05)
                    error_type = "deadlock"
                    deadlock_attempts += 1
                else:
                    if connection_attempts >= self._max_retries:
                        raise mlrun.errors.MLRunRuntimeError(
                            f"Database {operation_name} failed after "
                            f"{self._max_retries} connection retries: {e}"
                        ) from e
                    # Slower retry for connection issues: 1s, 2s, 4s
                    delay = self._retry_delay * (2**connection_attempts)
                    error_type = "connection"
                    connection_attempts += 1

                logger.warning(
                    f"TimescaleDB {error_type} error in {operation_name}, retrying",
                    attempt=deadlock_attempts
                    if error_type == "deadlock"
                    else connection_attempts,
                    max_retries=self.MAX_DEADLOCK_RETRIES
                    if error_type == "deadlock"
                    else self._max_retries,
                    delay=delay,
                    error=mlrun.errors.err_to_str(e),
                )
                time.sleep(delay)