mlrun 1.10.0rc18__py3-none-any.whl → 1.11.0rc16__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of mlrun might be problematic.
- mlrun/__init__.py +24 -3
- mlrun/__main__.py +0 -4
- mlrun/artifacts/dataset.py +2 -2
- mlrun/artifacts/document.py +6 -1
- mlrun/artifacts/llm_prompt.py +21 -15
- mlrun/artifacts/model.py +3 -3
- mlrun/artifacts/plots.py +1 -1
- mlrun/{model_monitoring/db/tsdb/tdengine → auth}/__init__.py +2 -3
- mlrun/auth/nuclio.py +89 -0
- mlrun/auth/providers.py +429 -0
- mlrun/auth/utils.py +415 -0
- mlrun/common/constants.py +14 -0
- mlrun/common/model_monitoring/helpers.py +123 -0
- mlrun/common/runtimes/constants.py +28 -0
- mlrun/common/schemas/__init__.py +14 -3
- mlrun/common/schemas/alert.py +2 -2
- mlrun/common/schemas/api_gateway.py +3 -0
- mlrun/common/schemas/auth.py +12 -10
- mlrun/common/schemas/client_spec.py +4 -0
- mlrun/common/schemas/constants.py +25 -0
- mlrun/common/schemas/frontend_spec.py +1 -8
- mlrun/common/schemas/function.py +34 -0
- mlrun/common/schemas/hub.py +33 -20
- mlrun/common/schemas/model_monitoring/__init__.py +2 -1
- mlrun/common/schemas/model_monitoring/constants.py +12 -15
- mlrun/common/schemas/model_monitoring/functions.py +13 -4
- mlrun/common/schemas/model_monitoring/model_endpoints.py +11 -0
- mlrun/common/schemas/pipeline.py +1 -1
- mlrun/common/schemas/secret.py +17 -2
- mlrun/common/secrets.py +95 -1
- mlrun/common/types.py +10 -10
- mlrun/config.py +69 -19
- mlrun/data_types/infer.py +2 -2
- mlrun/datastore/__init__.py +12 -5
- mlrun/datastore/azure_blob.py +162 -47
- mlrun/datastore/base.py +274 -10
- mlrun/datastore/datastore.py +7 -2
- mlrun/datastore/datastore_profile.py +84 -22
- mlrun/datastore/model_provider/huggingface_provider.py +225 -41
- mlrun/datastore/model_provider/mock_model_provider.py +87 -0
- mlrun/datastore/model_provider/model_provider.py +206 -74
- mlrun/datastore/model_provider/openai_provider.py +226 -66
- mlrun/datastore/s3.py +39 -18
- mlrun/datastore/sources.py +1 -1
- mlrun/datastore/store_resources.py +4 -4
- mlrun/datastore/storeytargets.py +17 -12
- mlrun/datastore/targets.py +1 -1
- mlrun/datastore/utils.py +25 -6
- mlrun/datastore/v3io.py +1 -1
- mlrun/db/base.py +63 -32
- mlrun/db/httpdb.py +373 -153
- mlrun/db/nopdb.py +54 -21
- mlrun/errors.py +4 -2
- mlrun/execution.py +66 -25
- mlrun/feature_store/api.py +1 -1
- mlrun/feature_store/common.py +1 -1
- mlrun/feature_store/feature_vector_utils.py +1 -1
- mlrun/feature_store/steps.py +8 -6
- mlrun/frameworks/_common/utils.py +3 -3
- mlrun/frameworks/_dl_common/loggers/logger.py +1 -1
- mlrun/frameworks/_dl_common/loggers/tensorboard_logger.py +2 -1
- mlrun/frameworks/_ml_common/loggers/mlrun_logger.py +1 -1
- mlrun/frameworks/_ml_common/utils.py +2 -1
- mlrun/frameworks/auto_mlrun/auto_mlrun.py +4 -3
- mlrun/frameworks/lgbm/mlrun_interfaces/mlrun_interface.py +2 -1
- mlrun/frameworks/onnx/dataset.py +2 -1
- mlrun/frameworks/onnx/mlrun_interface.py +2 -1
- mlrun/frameworks/pytorch/callbacks/logging_callback.py +5 -4
- mlrun/frameworks/pytorch/callbacks/mlrun_logging_callback.py +2 -1
- mlrun/frameworks/pytorch/callbacks/tensorboard_logging_callback.py +2 -1
- mlrun/frameworks/pytorch/utils.py +2 -1
- mlrun/frameworks/sklearn/metric.py +2 -1
- mlrun/frameworks/tf_keras/callbacks/logging_callback.py +5 -4
- mlrun/frameworks/tf_keras/callbacks/mlrun_logging_callback.py +2 -1
- mlrun/frameworks/tf_keras/callbacks/tensorboard_logging_callback.py +2 -1
- mlrun/hub/__init__.py +52 -0
- mlrun/hub/base.py +142 -0
- mlrun/hub/module.py +172 -0
- mlrun/hub/step.py +113 -0
- mlrun/k8s_utils.py +105 -16
- mlrun/launcher/base.py +15 -7
- mlrun/launcher/local.py +4 -1
- mlrun/model.py +14 -4
- mlrun/model_monitoring/__init__.py +0 -1
- mlrun/model_monitoring/api.py +65 -28
- mlrun/model_monitoring/applications/__init__.py +1 -1
- mlrun/model_monitoring/applications/base.py +299 -128
- mlrun/model_monitoring/applications/context.py +2 -4
- mlrun/model_monitoring/controller.py +132 -58
- mlrun/model_monitoring/db/_schedules.py +38 -29
- mlrun/model_monitoring/db/_stats.py +6 -16
- mlrun/model_monitoring/db/tsdb/__init__.py +9 -7
- mlrun/model_monitoring/db/tsdb/base.py +29 -9
- mlrun/model_monitoring/db/tsdb/preaggregate.py +234 -0
- mlrun/model_monitoring/db/tsdb/stream_graph_steps.py +63 -0
- mlrun/model_monitoring/db/tsdb/timescaledb/queries/timescaledb_metrics_queries.py +414 -0
- mlrun/model_monitoring/db/tsdb/timescaledb/queries/timescaledb_predictions_queries.py +376 -0
- mlrun/model_monitoring/db/tsdb/timescaledb/queries/timescaledb_results_queries.py +590 -0
- mlrun/model_monitoring/db/tsdb/timescaledb/timescaledb_connection.py +434 -0
- mlrun/model_monitoring/db/tsdb/timescaledb/timescaledb_connector.py +541 -0
- mlrun/model_monitoring/db/tsdb/timescaledb/timescaledb_operations.py +808 -0
- mlrun/model_monitoring/db/tsdb/timescaledb/timescaledb_schema.py +502 -0
- mlrun/model_monitoring/db/tsdb/timescaledb/timescaledb_stream.py +163 -0
- mlrun/model_monitoring/db/tsdb/timescaledb/timescaledb_stream_graph_steps.py +60 -0
- mlrun/model_monitoring/db/tsdb/timescaledb/utils/timescaledb_dataframe_processor.py +141 -0
- mlrun/model_monitoring/db/tsdb/timescaledb/utils/timescaledb_query_builder.py +585 -0
- mlrun/model_monitoring/db/tsdb/timescaledb/writer_graph_steps.py +73 -0
- mlrun/model_monitoring/db/tsdb/v3io/stream_graph_steps.py +20 -9
- mlrun/model_monitoring/db/tsdb/v3io/v3io_connector.py +235 -51
- mlrun/model_monitoring/features_drift_table.py +2 -1
- mlrun/model_monitoring/helpers.py +30 -6
- mlrun/model_monitoring/stream_processing.py +34 -28
- mlrun/model_monitoring/writer.py +224 -4
- mlrun/package/__init__.py +2 -1
- mlrun/platforms/__init__.py +0 -43
- mlrun/platforms/iguazio.py +8 -4
- mlrun/projects/operations.py +17 -11
- mlrun/projects/pipelines.py +2 -2
- mlrun/projects/project.py +187 -123
- mlrun/run.py +95 -21
- mlrun/runtimes/__init__.py +2 -186
- mlrun/runtimes/base.py +103 -25
- mlrun/runtimes/constants.py +225 -0
- mlrun/runtimes/daskjob.py +5 -2
- mlrun/runtimes/databricks_job/databricks_runtime.py +2 -1
- mlrun/runtimes/local.py +5 -2
- mlrun/runtimes/mounts.py +20 -2
- mlrun/runtimes/nuclio/__init__.py +12 -7
- mlrun/runtimes/nuclio/api_gateway.py +36 -6
- mlrun/runtimes/nuclio/application/application.py +339 -40
- mlrun/runtimes/nuclio/function.py +222 -72
- mlrun/runtimes/nuclio/serving.py +132 -42
- mlrun/runtimes/pod.py +213 -21
- mlrun/runtimes/utils.py +49 -9
- mlrun/secrets.py +99 -14
- mlrun/serving/__init__.py +2 -0
- mlrun/serving/remote.py +84 -11
- mlrun/serving/routers.py +26 -44
- mlrun/serving/server.py +138 -51
- mlrun/serving/serving_wrapper.py +6 -2
- mlrun/serving/states.py +997 -283
- mlrun/serving/steps.py +62 -0
- mlrun/serving/system_steps.py +149 -95
- mlrun/serving/v2_serving.py +9 -10
- mlrun/track/trackers/mlflow_tracker.py +29 -31
- mlrun/utils/helpers.py +292 -94
- mlrun/utils/http.py +9 -2
- mlrun/utils/notifications/notification/base.py +18 -0
- mlrun/utils/notifications/notification/git.py +3 -5
- mlrun/utils/notifications/notification/mail.py +39 -16
- mlrun/utils/notifications/notification/slack.py +2 -4
- mlrun/utils/notifications/notification/webhook.py +2 -5
- mlrun/utils/notifications/notification_pusher.py +3 -3
- mlrun/utils/version/version.json +2 -2
- mlrun/utils/version/version.py +3 -4
- {mlrun-1.10.0rc18.dist-info → mlrun-1.11.0rc16.dist-info}/METADATA +63 -74
- {mlrun-1.10.0rc18.dist-info → mlrun-1.11.0rc16.dist-info}/RECORD +161 -143
- mlrun/api/schemas/__init__.py +0 -259
- mlrun/db/auth_utils.py +0 -152
- mlrun/model_monitoring/db/tsdb/tdengine/schemas.py +0 -344
- mlrun/model_monitoring/db/tsdb/tdengine/stream_graph_steps.py +0 -75
- mlrun/model_monitoring/db/tsdb/tdengine/tdengine_connection.py +0 -281
- mlrun/model_monitoring/db/tsdb/tdengine/tdengine_connector.py +0 -1266
- {mlrun-1.10.0rc18.dist-info → mlrun-1.11.0rc16.dist-info}/WHEEL +0 -0
- {mlrun-1.10.0rc18.dist-info → mlrun-1.11.0rc16.dist-info}/entry_points.txt +0 -0
- {mlrun-1.10.0rc18.dist-info → mlrun-1.11.0rc16.dist-info}/licenses/LICENSE +0 -0
- {mlrun-1.10.0rc18.dist-info → mlrun-1.11.0rc16.dist-info}/top_level.txt +0 -0
mlrun/model_monitoring/db/tsdb/timescaledb/utils/timescaledb_query_builder.py (new file)
@@ -0,0 +1,585 @@
# Copyright 2025 Iguazio
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import re
from datetime import datetime, timedelta
from typing import TYPE_CHECKING, Optional, Union

import mlrun.common.schemas.model_monitoring as mm_schemas
import mlrun.errors

if TYPE_CHECKING:
    import pandas as pd

# TimescaleDB interval pattern for parsing intervals like "1h", "10m", "1d", "1w", "1M"
_TIMESCALEDB_INTERVAL_PATTERN = re.compile(r"(\d+)([mhdwM])")


class TimescaleDBQueryBuilder:
    """Utility class for building common SQL query components."""

    @staticmethod
    def build_endpoint_filter(endpoint_ids: Optional[Union[str, list[str]]]) -> str:
        """
        Generate SQL filter for endpoint IDs.

        :param endpoint_ids: Single endpoint ID, list of endpoint IDs, or None for no filtering
        :return: SQL WHERE clause fragment for endpoint filtering, or empty string if None
        """
        if endpoint_ids is None:
            return ""
        if isinstance(endpoint_ids, str):
            return f"{mm_schemas.WriterEvent.ENDPOINT_ID}='{endpoint_ids}'"
        elif isinstance(endpoint_ids, list):
            endpoint_list = "', '".join(endpoint_ids)
            return f"{mm_schemas.WriterEvent.ENDPOINT_ID} IN ('{endpoint_list}')"
        else:
            raise mlrun.errors.MLRunInvalidArgumentError(
                "Invalid 'endpoint_ids' filter: must be a string or a list of strings"
            )

    @staticmethod
    def build_time_range_filter(
        start: datetime, end: datetime, time_column: str
    ) -> str:
        """
        Generate SQL filter for time range.

        :param start: Start datetime
        :param end: End datetime
        :param time_column: Name of the time column to filter on
        :return: SQL WHERE clause fragment for time filtering
        """
        return f"{time_column} >= '{start}' AND {time_column} <= '{end}'"

    @staticmethod
    def build_application_filter(app_names: Union[str, list[str]]) -> str:
        """
        Generate SQL filter for application names.

        :param app_names: Single application name or list of application names
        :return: SQL WHERE clause fragment for application filtering
        """
        if isinstance(app_names, str):
            return f"{mm_schemas.WriterEvent.APPLICATION_NAME} = '{app_names}'"
        elif isinstance(app_names, list):
            app_list = "', '".join(app_names)
            return f"{mm_schemas.WriterEvent.APPLICATION_NAME} IN ('{app_list}')"
        else:
            raise mlrun.errors.MLRunInvalidArgumentError(
                "Invalid 'app_names' filter: must be either a string or a list of strings"
            )

    @staticmethod
    def build_metrics_filter(
        metrics: Optional[list[mm_schemas.ModelEndpointMonitoringMetric]],
    ) -> str:
        """
        Generate SQL filter for metrics using both application_name and metric_name columns.

        :param metrics: List of ModelEndpointMonitoringMetric objects, or None for no filtering
        :return: SQL WHERE clause fragment for metrics filtering, or empty string if None
        """
        if metrics is None:
            return ""
        if not metrics:
            raise mlrun.errors.MLRunInvalidArgumentError("Metrics list cannot be empty")

        # Build filter that includes both application_name and metric_name
        # Format: (application_name = 'app1' AND metric_name = 'name1') OR
        #         (application_name = 'app2' AND metric_name = 'name2')
        conditions = []
        for metric in metrics:
            condition = (
                f"({mm_schemas.WriterEvent.APPLICATION_NAME} = '{metric.app}' "
                f"AND {mm_schemas.MetricData.METRIC_NAME} = '{metric.name}')"
            )
            conditions.append(condition)

        if len(conditions) == 1:
            return conditions[0]
        return " OR ".join(conditions)

    @staticmethod
    def build_results_filter(
        metrics: Optional[list[mm_schemas.ModelEndpointMonitoringMetric]],
    ) -> str:
        """
        Generate SQL filter for results using both application_name and result_name columns.

        :param metrics: List of ModelEndpointMonitoringMetric objects, or None for no filtering
        :return: SQL WHERE clause fragment for results filtering, or empty string if None
        """
        if metrics is None:
            return ""
        if not metrics:
            raise mlrun.errors.MLRunInvalidArgumentError("Metrics list cannot be empty")

        # Build filter that includes both application_name and result_name
        # Format: (application_name = 'app1' AND result_name = 'name1') OR
        #         (application_name = 'app2' AND result_name = 'name2')
        conditions = []
        for metric in metrics:
            condition = (
                f"({mm_schemas.WriterEvent.APPLICATION_NAME} = '{metric.app}' "
                f"AND {mm_schemas.ResultData.RESULT_NAME} = '{metric.name}')"
            )
            conditions.append(condition)

        if len(conditions) == 1:
            return conditions[0]
        return " OR ".join(conditions)

    @staticmethod
    def build_metrics_filter_from_names(metric_names: list[str]) -> str:
        """
        Generate SQL filter for metrics by name.

        :param metric_names: List of metric names
        :return: SQL WHERE clause fragment for metrics filtering
        """
        if not metric_names:
            raise mlrun.errors.MLRunInvalidArgumentError(
                "Metric names list cannot be empty"
            )

        if len(metric_names) == 1:
            return f"{mm_schemas.MetricData.METRIC_NAME} = '{metric_names[0]}'"
        metric_list = "', '".join(metric_names)
        return f"{mm_schemas.MetricData.METRIC_NAME} IN ('{metric_list}')"

    @staticmethod
    def combine_filters(filters: list[str]) -> Optional[str]:
        """
        Combine multiple filter conditions with AND operator.

        :param filters: List of filter condition strings
        :return: Combined filter string or None if no filters
        """
        if valid_filters := [f.strip() for f in filters if f.strip()]:
            return (
                valid_filters[0]
                if len(valid_filters) == 1
                else " AND ".join(valid_filters)
            )
        else:
            return None

    @staticmethod
    def interval_to_minutes(interval: str) -> Optional[int]:
        """
        Convert TimescaleDB interval string to minutes.

        Uses PostgreSQL/TimescaleDB fixed duration assumptions:
        - 1 month = 30 days = 43,200 minutes
        - 1 year = 365.25 days = 525,960 minutes

        This matches TimescaleDB's INTERVAL arithmetic behavior and is appropriate
        for duration calculations and optimal interval selection.

        :param interval: Interval string like "1h", "10m", "1d", "1w", "1M"
        :return: Duration in minutes, or None if invalid format
        """
        match = _TIMESCALEDB_INTERVAL_PATTERN.match(interval)
        if not match:
            return None

        amount, unit = int(match.group(1)), match.group(2)

        if unit == "m":  # minutes
            return amount
        elif unit == "h":  # hours
            return amount * 60
        elif unit == "d":  # days
            return amount * 1440
        elif unit == "w":  # weeks
            return amount * 10080
        elif unit == "M":  # months (PostgreSQL: 30 days)
            return amount * 43200
        else:
            return None

    @staticmethod
    def determine_optimal_interval(start: datetime, end: datetime) -> str:
        """
        Determine optimal interval for time-based aggregation based on time range.

        This method selects an appropriate interval from a comprehensive list of
        standard TimescaleDB intervals rather than simple time-based thresholds.
        This provides a better balance between query performance
        and data granularity by targeting optimal data point counts.

        :param start: Start time
        :param end: End time
        :return: Optimal interval string (in Python format like "1h", "1d")
        """
        # Comprehensive list of standard TimescaleDB intervals
        standard_intervals = [
            "1m",
            "5m",
            "10m",
            "15m",
            "30m",
            "1h",
            "2h",
            "6h",
            "12h",
            "1d",
            "3d",
            "1w",
            "1M",
        ]

        optimal = TimescaleDBQueryBuilder.determine_optimal_from_available(
            start, end, standard_intervals
        )

        # Fallback for edge cases where the algorithm doesn't find a suitable match.
        # Simple binary choice: smallest interval for short ranges, largest for long ranges
        if optimal is None:
            time_diff = end - start
            return "1m" if time_diff <= timedelta(days=30) else "1M"
        return optimal

    @staticmethod
    def determine_optimal_from_available(
        start: datetime, end: datetime, available_intervals: list[str]
    ) -> Optional[str]:
        """
        Determine optimal interval from available pre-aggregate intervals.

        Uses a formula-based approach to select intervals that provide reasonable data points
        (~50-200 range) for optimal visualization and query performance.

        :param start: Start time
        :param end: End time
        :param available_intervals: List of available interval strings (e.g., ["10m", "1h", "6h", "1d"])
        :return: Optimal interval string or None if no suitable intervals available
        """
        if not available_intervals:
            return None

        # Convert available intervals to (name, minutes) tuples using our centralized parsing
        available_with_minutes = []
        for interval in available_intervals:
            minutes = TimescaleDBQueryBuilder.interval_to_minutes(interval)
            if minutes is not None:
                available_with_minutes.append((interval, minutes))

        if not available_with_minutes:
            return None

        # Sort by duration (ascending)
        available_with_minutes.sort(key=lambda x: x[1])

        # Calculate time range in minutes
        time_diff_minutes = (end - start).total_seconds() / 60

        # Target ~100 data points for optimal visualization balance.
        # Accept intervals that give 20-500 data points (wider reasonable range)
        target_points = 100
        min_acceptable_points = 20
        max_acceptable_points = 500

        optimal_interval_minutes = time_diff_minutes / target_points
        min_interval_minutes = time_diff_minutes / max_acceptable_points
        max_interval_minutes = time_diff_minutes / min_acceptable_points

        # Find the best matching interval within acceptable range
        best_interval = None
        best_score = float("inf")

        for interval_name, interval_minutes in available_with_minutes:
            # Check if this interval is within acceptable range
            if min_interval_minutes <= interval_minutes <= max_interval_minutes:
                # Score by distance from optimal (closer to optimal = better)
                score = abs(interval_minutes - optimal_interval_minutes)
                if score < best_score:
                    best_score = score
                    best_interval = interval_name

        return best_interval

    @staticmethod
    def build_read_data_with_fallback(
        connection,
        pre_aggregate_manager,
        table_schema,
        start: "datetime",  # Use string to avoid import cycle
        end: "datetime",
        columns: list[str],
        filter_query: Optional[str],
        name_column: str,
        value_column: str,
        debug_name: str = "read_data",
        timestamp_column: Optional[str] = None,
    ) -> "pd.DataFrame":  # Use string to avoid import cycle
        """
        Build and execute read data query with pre-aggregate fallback pattern.

        This method deduplicates the common pattern used in both metrics and results
        queries for reading data with pre-aggregate optimization and fallback.

        :param connection: Database connection instance
        :param pre_aggregate_manager: Pre-aggregate handler for optimization
        :param table_schema: Table schema for query building
        :param start: Start datetime for query
        :param end: End datetime for query
        :param columns: List of columns to select
        :param filter_query: WHERE clause conditions
        :param name_column: Name of the metric/result name column
        :param value_column: Name of the metric/result value column
        :param debug_name: Name for debugging purposes
        :param timestamp_column: Optional timestamp column to use for time filtering
        :return: DataFrame with query results
        """

        def build_pre_agg_query():
            return table_schema._get_records_query(
                start=start,
                end=end,
                columns_to_filter=columns,
                filter_query=filter_query,
                use_pre_aggregates=True,
                timestamp_column=timestamp_column,
            )

        def build_raw_query():
            return table_schema._get_records_query(
                start=start,
                end=end,
                columns_to_filter=columns,
                filter_query=filter_query,
                timestamp_column=timestamp_column,
            )

        # Column mapping rules for pre-aggregate results
        import mlrun.common.schemas.model_monitoring as mm_schemas

        column_mapping_rules = {
            name_column: [name_column],
            value_column: [value_column],
            table_schema.time_column: [table_schema.time_column],
            mm_schemas.WriterEvent.APPLICATION_NAME: [
                mm_schemas.WriterEvent.APPLICATION_NAME
            ],
        }

        return connection.execute_with_fallback(
            pre_aggregate_manager,
            build_pre_agg_query,
            build_raw_query,
            interval=None,  # No specific interval for this query
            agg_funcs=None,
            column_mapping_rules=column_mapping_rules,
            debug_name=debug_name,
        )

    @staticmethod
    def prepare_time_range_and_interval(
        pre_aggregate_manager,
        start: Optional[datetime] = None,
        end: Optional[datetime] = None,
        interval: Optional[str] = None,
        auto_determine_interval: bool = True,
    ) -> tuple[datetime, datetime, str]:
        """
        Standardized time range and interval preparation for TimescaleDB queries.

        This helper eliminates the common pattern of:
        1. get_start_end()
        2. determine_optimal_interval() (optional)
        3. align_time_range()

        :param pre_aggregate_manager: PreAggregateManager instance
        :param start: Start datetime (optional)
        :param end: End datetime (optional)
        :param interval: Time interval (optional, auto-determined if None and auto_determine_interval=True)
        :param auto_determine_interval: Whether to auto-determine interval if not provided
        :return: Tuple of (aligned_start, aligned_end, interval) - interval is guaranteed to be valid
        """
        # Step 1: Get start/end times with defaults
        start, end = pre_aggregate_manager.get_start_end(start, end)

        # Step 2: Auto-determine optimal interval if requested and not provided
        if interval is None and auto_determine_interval:
            # First, try to use available pre-aggregate intervals if they exist
            available_intervals = (
                pre_aggregate_manager.config.aggregate_intervals
                if pre_aggregate_manager.config
                else None
            )

            if available_intervals:
                if optimal_from_preaggregate := (
                    TimescaleDBQueryBuilder.determine_optimal_from_available(
                        start, end, available_intervals
                    )
                ):
                    interval = optimal_from_preaggregate

            # If no suitable pre-aggregate interval found, use formula-based approach
            if interval is None:
                interval = TimescaleDBQueryBuilder.determine_optimal_interval(
                    start, end
                )

        # Step 3: Align times to interval boundaries
        start, end = pre_aggregate_manager.align_time_range(start, end, interval)

        return start, end, interval

    @staticmethod
    def prepare_time_range_with_validation(
        pre_aggregate_manager,
        start_iso: str,
        end_iso: str,
        interval: Optional[str] = None,
        agg_function: Optional[str] = None,
    ) -> tuple[datetime, datetime, Optional[str]]:
        """
        Specialized helper for time preparation with validation and ISO string conversion.

        This helper eliminates the pattern of:
        1. validate_interval_and_function()
        2. datetime.fromisoformat() conversion
        3. align_time_range()

        :param pre_aggregate_manager: PreAggregateManager instance
        :param start_iso: Start time as ISO string
        :param end_iso: End time as ISO string
        :param interval: Time interval (optional)
        :param agg_function: Aggregation function (optional)
        :return: Tuple of (aligned_start_dt, aligned_end_dt, interval)
        """
        # Step 1: Validate parameters using the pre-aggregate handler
        pre_aggregate_manager.validate_interval_and_function(interval, agg_function)

        # Step 2: Convert ISO strings to datetime objects
        start_dt, end_dt = (
            datetime.fromisoformat(start_iso),
            datetime.fromisoformat(end_iso),
        )

        # Step 3: Align times if interval is provided
        start_dt, end_dt = pre_aggregate_manager.align_time_range(
            start_dt, end_dt, interval
        )

        return start_dt, end_dt, interval

    @staticmethod
    def build_endpoint_aggregation_query(
        subquery: str,
        aggregation_columns: dict[str, str],
        group_by_column: str = mm_schemas.WriterEvent.ENDPOINT_ID,
        order_by_column: str = mm_schemas.WriterEvent.ENDPOINT_ID,
    ) -> str:
        """
        Build standardized outer query for endpoint-level aggregation over time buckets.

        This helper eliminates the repeated pattern of:
        SELECT endpoint_id, AGG(column) FROM (subquery) GROUP BY endpoint_id ORDER BY endpoint_id

        :param subquery: Inner query that provides time-bucketed data
        :param aggregation_columns: Dict of {result_column: "AGG(source_column)"} mappings
        :param group_by_column: Column to group by (default: endpoint_id)
        :param order_by_column: Column to order by (default: endpoint_id)
        :return: Complete SQL query string
        """
        # Build the SELECT columns list
        select_columns = [group_by_column] + [
            f"{agg_expr} AS {result_col}"
            for result_col, agg_expr in aggregation_columns.items()
        ]

        return f"""
            SELECT
                {', '.join(select_columns)}
            FROM ({subquery}) AS time_buckets
            GROUP BY {group_by_column}
            ORDER BY {order_by_column}
        """


class TimescaleDBNaming:
    """Utility class for TimescaleDB table and view naming conventions."""

    @staticmethod
    def get_agg_table_name(base_name: str, interval: str) -> str:
        """
        Generate aggregate table name with interval.

        :param base_name: Base table name
        :param interval: Time interval (e.g., '1h', '1d')
        :return: Aggregate table name (e.g., 'metrics_agg_1h')
        """
        return f"{base_name}_agg_{interval}"

    @staticmethod
    def get_cagg_view_name(base_name: str, interval: str) -> str:
        """
        Generate continuous aggregate view name with interval.

        :param base_name: Base table name
        :param interval: Time interval (e.g., '1h', '1d')
        :return: Continuous aggregate view name (e.g., 'metrics_cagg_1h')
        """
        return f"{base_name}_cagg_{interval}"

    @staticmethod
    def get_agg_pattern(base_pattern: str) -> str:
        """
        Generate SQL LIKE pattern for aggregate tables.

        :param base_pattern: Base pattern (e.g., 'metrics')
        :return: SQL LIKE pattern (e.g., 'metrics_agg_%')
        """
        return f"{base_pattern}_agg_%"

    @staticmethod
    def get_cagg_pattern(base_pattern: str) -> str:
        """
        Generate SQL LIKE pattern for continuous aggregate views.

        :param base_pattern: Base pattern (e.g., 'metrics')
        :return: SQL LIKE pattern (e.g., 'metrics_cagg_%')
        """
        return f"{base_pattern}_cagg_%"

    @staticmethod
    def get_all_aggregate_patterns(base_pattern: str) -> list[str]:
        """
        Generate both aggregate table and continuous aggregate view patterns.

        :param base_pattern: Base pattern (e.g., 'metrics')
        :return: List of patterns ['metrics_agg_%', 'metrics_cagg_%']
        """
        return [
            TimescaleDBNaming.get_agg_pattern(base_pattern),
            TimescaleDBNaming.get_cagg_pattern(base_pattern),
        ]

    @staticmethod
    def get_deletion_patterns(base_pattern: str) -> list[str]:
        """
        Generate all patterns needed for table deletion operations.

        :param base_pattern: Base pattern (e.g., 'metrics')
        :return: List of patterns [base_pattern, 'metrics_agg_%', 'metrics_cagg_%']
        """
        return [
            base_pattern,
            TimescaleDBNaming.get_agg_pattern(base_pattern),
            TimescaleDBNaming.get_cagg_pattern(base_pattern),
        ]
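
Since nearly everything in this new module is a pure @staticmethod, the helpers can be exercised without a database. The following is a minimal usage sketch, not part of the diff: the import path is inferred from the file listing above, the printed SQL fragments depend on what the mm_schemas column constants resolve to at runtime, and the interval assertions follow directly from the arithmetic in interval_to_minutes() and determine_optimal_from_available().

from datetime import datetime, timedelta

from mlrun.model_monitoring.db.tsdb.timescaledb.utils.timescaledb_query_builder import (
    TimescaleDBNaming,
    TimescaleDBQueryBuilder,
)

# Compose WHERE-clause fragments; empty fragments are dropped by combine_filters().
where = TimescaleDBQueryBuilder.combine_filters(
    [
        TimescaleDBQueryBuilder.build_endpoint_filter(["ep-1", "ep-2"]),
        TimescaleDBQueryBuilder.build_application_filter("my-app"),
        TimescaleDBQueryBuilder.build_endpoint_filter(None),  # "" -> filtered out
    ]
)
print(where)  # e.g. "endpoint_id IN ('ep-1', 'ep-2') AND application_name = 'my-app'"

# Interval parsing uses the fixed PostgreSQL-style durations documented in the module.
assert TimescaleDBQueryBuilder.interval_to_minutes("1w") == 10080
assert TimescaleDBQueryBuilder.interval_to_minutes("2x") is None  # unknown unit

# For a 7-day window, targeting ~100 points selects "2h" (10080 / 120 = 84 buckets).
end = datetime(2025, 1, 8)
start = end - timedelta(days=7)
assert TimescaleDBQueryBuilder.determine_optimal_interval(start, end) == "2h"

# The endpoint-aggregation wrapper builds the repeated outer GROUP BY query;
# explicit column names are passed here so the output doesn't depend on schema constants.
sql = TimescaleDBQueryBuilder.build_endpoint_aggregation_query(
    subquery="SELECT endpoint_id, latency FROM buckets",
    aggregation_columns={"avg_latency": "AVG(latency)"},
    group_by_column="endpoint_id",
    order_by_column="endpoint_id",
)
print(sql)  # SELECT endpoint_id, AVG(latency) AS avg_latency FROM (...) GROUP BY ...

# Naming helpers pair raw hypertables with their pre-aggregates.
assert TimescaleDBNaming.get_agg_table_name("metrics", "1h") == "metrics_agg_1h"
assert TimescaleDBNaming.get_deletion_patterns("metrics") == [
    "metrics",
    "metrics_agg_%",
    "metrics_cagg_%",
]

Note that the filter builders interpolate values directly into SQL strings, so the expected call sites pass internally generated endpoint IDs and application names rather than untrusted input.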
mlrun/model_monitoring/db/tsdb/timescaledb/writer_graph_steps.py (new file)
@@ -0,0 +1,73 @@
# Copyright 2025 Iguazio
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import json
from datetime import datetime

import mlrun.common.schemas.model_monitoring as mm_schemas
import mlrun.feature_store.steps
from mlrun.utils import logger


class ProcessBeforeTimescaleDBWriter(mlrun.feature_store.steps.MapClass):
    """
    Process the data before writing to TimescaleDB via the new async writer.

    This step combines functionality from the existing stream processor
    to create appropriate table names and format data for TimescaleDB writer targets.

    :returns: Event as a dictionary which will be written into the TimescaleDB Metrics/App Results tables.
    """

    def do(self, event):
        logger.info("Process event before writing to TimescaleDB writer", event=event)

        # Extract project from function URI (existing TimescaleDB pattern)
        if mm_schemas.EventFieldType.FUNCTION_URI in event:
            event[mm_schemas.EventFieldType.PROJECT] = event[
                mm_schemas.EventFieldType.FUNCTION_URI
            ].split("/")[0]

        # Handle custom metrics serialization (existing TimescaleDB pattern)
        event[mm_schemas.EventKeyMetrics.CUSTOM_METRICS] = json.dumps(
            event.get(mm_schemas.EventFieldType.METRICS, {})
        )

        # Handle time mapping (existing TimescaleDB pattern):
        # map WHEN field to END_INFER_TIME for predictions data from model serving
        if mm_schemas.StreamProcessingEvent.WHEN in event:
            event[mm_schemas.WriterEvent.END_INFER_TIME] = event[
                mm_schemas.StreamProcessingEvent.WHEN
            ]
        # For non-prediction events, use timestamp as END_INFER_TIME to maintain consistency
        elif mm_schemas.EventFieldType.TIMESTAMP in event:
            event[mm_schemas.WriterEvent.END_INFER_TIME] = event[
                mm_schemas.EventFieldType.TIMESTAMP
            ]

        # Handle START_INFER_TIME conversion
        if mm_schemas.WriterEvent.START_INFER_TIME in event and isinstance(
            event[mm_schemas.WriterEvent.START_INFER_TIME], str
        ):
            event[mm_schemas.WriterEvent.START_INFER_TIME] = datetime.fromisoformat(
                event[mm_schemas.WriterEvent.START_INFER_TIME]
            )

        # Create table column identifier;
        # TimescaleDB uses endpoint-based table organization
        event[mm_schemas.EventFieldType.TABLE_COLUMN] = (
            f"_{event.get(mm_schemas.EventFieldType.ENDPOINT_ID)}"
        )

        return event
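
For illustration only, the event transformation performed by do() can be sketched as a standalone function. The literal dictionary keys below ("function_uri", "when", "end_infer_time", and so on) are assumptions standing in for the mm_schemas constants the real step resolves at runtime; the control flow mirrors the step above.

import json
from datetime import datetime


def process_before_timescaledb_writer(event: dict) -> dict:
    # Hypothetical literal keys; the shipped step reads these from
    # mlrun.common.schemas.model_monitoring constants (assumed values).
    if "function_uri" in event:
        # Project is the first path segment of the function URI.
        event["project"] = event["function_uri"].split("/")[0]
    # Custom metrics are serialized to a JSON string column.
    event["custom_metrics"] = json.dumps(event.get("metrics", {}))
    # Prediction events carry "when"; other events fall back to "timestamp".
    if "when" in event:
        event["end_infer_time"] = event["when"]
    elif "timestamp" in event:
        event["end_infer_time"] = event["timestamp"]
    # ISO strings are parsed into datetime objects before writing.
    if isinstance(event.get("start_infer_time"), str):
        event["start_infer_time"] = datetime.fromisoformat(event["start_infer_time"])
    # Endpoint-based table organization: prefix the endpoint ID with "_".
    event["table_column"] = f"_{event.get('endpoint_id')}"
    return event


event = process_before_timescaledb_writer(
    {
        "function_uri": "my-project/serving-fn",
        "when": "2025-01-08T12:00:00",
        "endpoint_id": "abc123",
        "metrics": {"latency_avg": 3.2},
    }
)
print(event["project"])       # my-project
print(event["table_column"])  # _abc123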