mlrun 1.10.0rc4__py3-none-any.whl → 1.10.0rc6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mlrun/artifacts/model.py +3 -3
- mlrun/common/schemas/model_monitoring/__init__.py +1 -0
- mlrun/common/schemas/model_monitoring/constants.py +14 -2
- mlrun/common/schemas/model_monitoring/functions.py +66 -0
- mlrun/common/schemas/project.py +3 -0
- mlrun/config.py +7 -4
- mlrun/db/base.py +13 -0
- mlrun/db/httpdb.py +47 -0
- mlrun/db/nopdb.py +12 -0
- mlrun/launcher/client.py +23 -0
- mlrun/model_monitoring/applications/base.py +9 -5
- mlrun/model_monitoring/db/tsdb/base.py +30 -0
- mlrun/model_monitoring/db/tsdb/tdengine/tdengine_connection.py +118 -50
- mlrun/model_monitoring/db/tsdb/tdengine/tdengine_connector.py +117 -24
- mlrun/model_monitoring/db/tsdb/v3io/v3io_connector.py +106 -15
- mlrun/projects/project.py +40 -1
- mlrun/runtimes/base.py +0 -27
- mlrun/runtimes/daskjob.py +12 -5
- mlrun/runtimes/databricks_job/databricks_runtime.py +0 -2
- mlrun/runtimes/mpijob/abstract.py +0 -2
- mlrun/runtimes/mpijob/v1.py +0 -2
- mlrun/runtimes/nuclio/application/application.py +0 -5
- mlrun/runtimes/nuclio/function.py +0 -11
- mlrun/runtimes/nuclio/serving.py +0 -6
- mlrun/runtimes/pod.py +1 -3
- mlrun/runtimes/remotesparkjob.py +0 -2
- mlrun/runtimes/sparkjob/spark3job.py +0 -2
- mlrun/serving/states.py +16 -18
- mlrun/utils/helpers.py +17 -1
- mlrun/utils/version/version.json +2 -2
- {mlrun-1.10.0rc4.dist-info → mlrun-1.10.0rc6.dist-info}/METADATA +2 -1
- {mlrun-1.10.0rc4.dist-info → mlrun-1.10.0rc6.dist-info}/RECORD +36 -35
- {mlrun-1.10.0rc4.dist-info → mlrun-1.10.0rc6.dist-info}/WHEEL +0 -0
- {mlrun-1.10.0rc4.dist-info → mlrun-1.10.0rc6.dist-info}/entry_points.txt +0 -0
- {mlrun-1.10.0rc4.dist-info → mlrun-1.10.0rc6.dist-info}/licenses/LICENSE +0 -0
- {mlrun-1.10.0rc4.dist-info → mlrun-1.10.0rc6.dist-info}/top_level.txt +0 -0
mlrun/artifacts/model.py
CHANGED

@@ -187,7 +187,7 @@ class ModelArtifact(Artifact):
         :param model_url:      Remote model url.
         :param default_config: Default configuration for client building
                                Saved as a sub-dictionary under the parameter.
-        :param kwargs:
+        :param kwargs:         Arguments to pass to the artifact class.
         """
         if key or body or format or target_path:
             warnings.warn(
@@ -366,7 +366,7 @@ class ModelArtifact(Artifact):
     def before_log(self):
         if not self.spec.model_file and not self.spec.model_url:
             raise ValueError(
-                "ModelArtifact must have either model_file or model_url attributes"
+                "ModelArtifact must have either 'model_file' or 'model_url' attributes"
             )

         super().before_log()
@@ -479,7 +479,7 @@ def get_model(
     ] = None,
     suffix="",
 ) -> (str, ModelArtifact, dict):
-    """
+    """Return model file, model spec object, and dictionary of extra data items

     this function will get the model file, metadata, and extra data
     the returned model file is always local, when using remote urls
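To see the before_log() validation above in action, a brief sketch (the project name and file path are invented for illustration):

import mlrun

project = mlrun.get_or_create_project("demo-project")
# Omitting both model_file and model_url would raise the ValueError above;
# supplying model_file satisfies ModelArtifact.before_log().
project.log_model("my-model", model_file="model.pkl")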
mlrun/common/schemas/model_monitoring/constants.py
CHANGED

@@ -416,14 +416,22 @@ class ResultStatusApp(IntEnum):
     detected = 2


-class ModelMonitoringAppLabel:
+class ModelMonitoringLabel:
     KEY = mlrun.common.constants.MLRunInternalLabels.mlrun_type
-    VAL = "mlrun__model-monitoring-application"
+    VAL = ""

     def __str__(self) -> str:
         return f"{self.KEY}={self.VAL}"


+class ModelMonitoringAppLabel(ModelMonitoringLabel):
+    VAL = "mlrun__model-monitoring-application"
+
+
+class ModelMonitoringInfraLabel(ModelMonitoringLabel):
+    VAL = "mlrun__model-monitoring-infra"
+
+
 class HistogramDataDriftApplicationConstants:
     NAME = "histogram-data-drift"
     GENERAL_RESULT_NAME = "general_drift"
@@ -438,6 +446,10 @@ class SpecialApps:
     MLRUN_INFRA = "mlrun-infra"


+class ModelMonitoringLabels:
+    MLRUN_MODEL_MONITORING_INFRA = "mlrun-model-monitoring-infra"
+
+
 _RESERVED_FUNCTION_NAMES = MonitoringFunctionNames.list() + [SpecialApps.MLRUN_INFRA]
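For orientation, a minimal sketch of how the new label classes behave. The KEY value below is a stand-in, since the real one is resolved from MLRunInternalLabels.mlrun_type elsewhere in mlrun:

class ModelMonitoringLabel:
    KEY = "mlrun/type"  # stand-in for MLRunInternalLabels.mlrun_type
    VAL = ""

    def __str__(self) -> str:
        return f"{self.KEY}={self.VAL}"


class ModelMonitoringAppLabel(ModelMonitoringLabel):
    VAL = "mlrun__model-monitoring-application"


class ModelMonitoringInfraLabel(ModelMonitoringLabel):
    VAL = "mlrun__model-monitoring-infra"


# Each subclass yields a ready-made "key=value" selector string:
print(ModelMonitoringAppLabel())    # mlrun/type=mlrun__model-monitoring-application
print(ModelMonitoringInfraLabel())  # mlrun/type=mlrun__model-monitoring-infra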
mlrun/common/schemas/model_monitoring/functions.py
ADDED

@@ -0,0 +1,66 @@
+# Copyright 2025 Iguazio
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import enum
+from datetime import datetime
+from typing import Optional
+
+from pydantic.v1 import BaseModel
+
+
+class FunctionsType(enum.Enum):
+    APPLICATION = "application"
+    INFRA = "infra"
+
+
+class FunctionSummary(BaseModel):
+    """
+    Function summary model. Includes metadata about the function, such as its name, as well as statistical
+    metrics such as the number of detections and possible detections. A function summary can be from either a
+    model monitoring application (type "application") or an infrastructure function (type "infra").
+    """
+
+    type: FunctionsType
+    name: str
+    application_class: str
+    updated_time: datetime
+    status: Optional[str] = None
+    base_period: Optional[int] = None
+    stats: Optional[dict] = None
+
+    @classmethod
+    def from_function_dict(
+        cls,
+        func_dict: dict,
+        func_type=FunctionsType.APPLICATION,
+        base_period: Optional[int] = None,
+        stats: Optional[dict] = None,
+    ):
+        """
+        Create a FunctionSummary instance from a dictionary.
+        """
+
+        return cls(
+            type=func_type,
+            name=func_dict["metadata"]["name"],
+            application_class=""
+            if func_type != FunctionsType.APPLICATION
+            else func_dict["spec"]["graph"]["steps"]["PushToMonitoringWriter"]["after"][
+                0
+            ],
+            updated_time=func_dict["metadata"].get("updated"),
+            status=func_dict["status"].get("state"),
+            base_period=base_period,
+            stats=stats,
+        )
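A short sketch of how from_function_dict could be exercised; the input dict below is invented, but its nested keys mirror exactly what the method reads:

func_dict = {
    "metadata": {"name": "my-drift-app", "updated": "2025-06-01T12:00:00Z"},
    "spec": {
        "graph": {"steps": {"PushToMonitoringWriter": {"after": ["MyDriftAppClass"]}}}
    },
    "status": {"state": "ready"},
}

summary = FunctionSummary.from_function_dict(func_dict, base_period=10)
print(summary.name, summary.application_class, summary.status)
# my-drift-app MyDriftAppClass ready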
mlrun/common/schemas/project.py
CHANGED

@@ -145,6 +145,9 @@ class ProjectSummary(pydantic.v1.BaseModel):
     endpoint_alerts_count: int = 0
     job_alerts_count: int = 0
     other_alerts_count: int = 0
+    datasets_count: int = 0
+    documents_count: int = 0
+    llm_prompts_count: int = 0


 class IguazioProject(pydantic.v1.BaseModel):
mlrun/config.py
CHANGED

@@ -78,12 +78,12 @@ default_config = {
     "vendor_images_registry": "",
     # comma separated list of images that are in the specified images_registry, and therefore will be enriched with this
     # registry when used. default to mlrun/* which means any image which is of the mlrun repository (mlrun/mlrun,
-    # mlrun/ml-base, etc...)
+    # mlrun/mlrun-kfp, etc...)
     "images_to_enrich_registry": "^mlrun/*,^python:3.(9|11)$",
     "kfp_url": "",
     "kfp_ttl": "14400",  # KFP ttl in sec, after that completed PODs will be deleted
     "kfp_image": "mlrun/mlrun-kfp",  # image to use for KFP runner
-    "dask_kfp_image": "mlrun/ml-base",  # image to use for dask KFP runner
+    "dask_kfp_image": "mlrun/mlrun",  # image to use for dask KFP runner
     "igz_version": "",  # the version of the iguazio system the API is running on
     "iguazio_api_url": "",  # the url to iguazio api
     "spark_app_image": "",  # image to use for spark operator app runtime
@@ -234,7 +234,10 @@ default_config = {
                 "model_endpoint_creation": "600",
                 "model_endpoint_tsdb_leftovers": "900",
             },
-            "runtimes": {
+            "runtimes": {
+                "dask": "600",
+                "dask_cluster_start": "300",
+            },
             "push_notifications": "60",
         },
     },
@@ -284,7 +287,7 @@ default_config = {
         "serving": "mlrun/mlrun",
         "nuclio": "mlrun/mlrun",
         "remote": "mlrun/mlrun",
-        "dask": "mlrun/ml-base",
+        "dask": "mlrun/mlrun",
         "mpijob": "mlrun/mlrun",
         "application": "python",
     },
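The images_to_enrich_registry value is a comma-separated list of regular expressions; a rough illustration of how such matching works (this is a sketch, not the actual enrichment code):

import re

patterns = "^mlrun/*,^python:3.(9|11)$".split(",")

def should_enrich(image: str) -> bool:
    # An image is registry-enriched when any configured pattern matches it.
    return any(re.match(pattern, image) for pattern in patterns)

print(should_enrich("mlrun/mlrun"))            # True
print(should_enrich("python:3.9"))             # True
print(should_enrich("tensorflow/tensorflow"))  # False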
mlrun/db/base.py
CHANGED

@@ -1119,6 +1119,19 @@ class RunDBInterface(ABC):
     ) -> None:
         pass

+    @abstractmethod
+    def get_monitoring_function_summaries(
+        self,
+        project: str,
+        start: Optional[datetime.datetime] = None,
+        end: Optional[datetime.datetime] = None,
+        names: Optional[Union[list[str], str]] = None,
+        labels: Optional[Union[str, dict[str, Optional[str]], list[str]]] = None,
+        include_stats: bool = False,
+        include_infra: bool = True,
+    ) -> list[mlrun.common.schemas.model_monitoring.FunctionSummary]:
+        pass
+
     @abstractmethod
     def get_project_summary(self, project: str) -> mlrun.common.schemas.ProjectSummary:
         pass
mlrun/db/httpdb.py
CHANGED

@@ -50,6 +50,7 @@ from mlrun_pipelines.utils import compile_pipeline

 from ..artifacts import Artifact
 from ..common.schemas import AlertActivations
+from ..common.schemas.model_monitoring import FunctionSummary
 from ..config import config
 from ..datastore.datastore_profile import DatastoreProfile2Json
 from ..feature_store import FeatureSet, FeatureVector
@@ -4118,6 +4119,52 @@ class HTTPRunDB(RunDBInterface):
             params={**credentials, "replace_creds": replace_creds},
         )

+    def get_monitoring_function_summaries(
+        self,
+        project: str,
+        start: Optional[datetime] = None,
+        end: Optional[datetime] = None,
+        names: Optional[Union[list[str], str]] = None,
+        labels: Optional[Union[str, dict[str, Optional[str]], list[str]]] = None,
+        include_stats: bool = False,
+        include_infra: bool = True,
+    ) -> list[FunctionSummary]:
+        """
+        Get monitoring function summaries for the specified project.
+
+        :param project:       The name of the project.
+        :param start:         Start time for filtering the results (optional).
+        :param end:           End time for filtering the results (optional).
+        :param names:         List of function names to filter by (optional).
+        :param labels:        Labels to filter by (optional).
+        :param include_stats: Whether to include statistics in the response (default is False).
+        :param include_infra: whether to include model monitoring infrastructure functions (default is True).
+
+        :return: A list of FunctionSummary objects containing information about the monitoring functions.
+        """
+
+        path = f"projects/{project}/model-monitoring/function-summaries"
+        labels = self._parse_labels(labels)
+        if names and isinstance(names, str):
+            names = [names]
+        response = self.api_call(
+            method=mlrun.common.types.HTTPMethod.GET,
+            path=path,
+            params={
+                "start": datetime_to_iso(start),
+                "end": datetime_to_iso(end),
+                "name": names,
+                "label": labels,
+                "include-stats": include_stats,
+                "include-infra": include_infra,
+            },
+        )
+
+        results = []
+        for item in response.json():
+            results.append(FunctionSummary(**item))
+        return results
+
     def create_hub_source(
         self, source: Union[dict, mlrun.common.schemas.IndexedHubSource]
     ):
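A brief usage sketch of the new client call (the project and function names here are made up):

import mlrun

db = mlrun.get_run_db()
summaries = db.get_monitoring_function_summaries(
    project="my-project",
    names=["my-drift-app"],  # optional name filter
    include_stats=True,
    include_infra=False,
)
for summary in summaries:
    print(summary.name, summary.type, summary.status)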
mlrun/db/nopdb.py
CHANGED

@@ -893,6 +893,18 @@ class NopDB(RunDBInterface):
     ) -> None:
         pass

+    def get_monitoring_function_summaries(
+        self,
+        project: str,
+        start: Optional[datetime.datetime] = None,
+        end: Optional[datetime.datetime] = None,
+        names: Optional[Union[list[str], str]] = None,
+        labels: Optional[Union[str, dict[str, Optional[str]], list[str]]] = None,
+        include_stats: bool = False,
+        include_infra: bool = True,
+    ) -> [mlrun.common.schemas.model_monitoring.FunctionSummary]:
+        pass
+
     def generate_event(
         self, name: str, event_data: Union[dict, mlrun.common.schemas.Event], project=""
     ):
mlrun/launcher/client.py
CHANGED

@@ -12,6 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import abc
+import warnings
 from typing import Optional

 import IPython.display
@@ -23,6 +24,7 @@ import mlrun.lists
 import mlrun.model
 import mlrun.runtimes
 import mlrun.utils
+import mlrun.utils.version


 class ClientBaseLauncher(launcher.BaseLauncher, abc.ABC):
@@ -60,6 +62,27 @@ class ClientBaseLauncher(launcher.BaseLauncher, abc.ABC):
     ):
         image = mlrun.mlconf.function_defaults.image_by_kind.to_dict()[runtime.kind]

+        # Warn if user explicitly set the deprecated mlrun/ml-base image
+        if image and "mlrun/ml-base" in image:
+            client_version = mlrun.utils.version.Version().get()["version"]
+            auto_replaced = mlrun.utils.validate_component_version_compatibility(
+                "mlrun-client", "1.10.0", mlrun_client_version=client_version
+            )
+            message = (
+                "'mlrun/ml-base' image is deprecated in 1.10.0 and will be removed in 1.12.0, "
+                "use 'mlrun/mlrun' instead."
+            )
+            if auto_replaced:
+                message += (
+                    " Since your client version is >= 1.10.0, the image will be automatically "
+                    "replaced with mlrun/mlrun."
+                )
+            warnings.warn(
+                message,
+                # TODO: Remove this in 1.12.0
+                FutureWarning,
+            )
+
         # TODO: need a better way to decide whether a function requires a build
         if require_build and image and not runtime.spec.build.base_image:
             # when the function require build use the image as the base_image for the build
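A small self-contained sketch that mirrors the launcher's new check, showing how the FutureWarning surfaces (the helper below is hypothetical, not part of mlrun):

import warnings

def _warn_deprecated_image(image: str) -> None:
    # Mirrors the launcher's condition: warn when the deprecated image is used.
    if image and "mlrun/ml-base" in image:
        warnings.warn(
            "'mlrun/ml-base' image is deprecated in 1.10.0 and will be removed "
            "in 1.12.0, use 'mlrun/mlrun' instead.",
            FutureWarning,
        )

with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    _warn_deprecated_image("mlrun/ml-base")

print(caught[0].category.__name__, "-", caught[0].message)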
mlrun/model_monitoring/applications/base.py
CHANGED

@@ -409,8 +409,8 @@ class ModelMonitoringApplicationBase(MonitoringApplicationToDict, ABC):
         tag: Optional[str] = None,
         run_local: bool = True,
         auto_build: bool = True,
-        sample_data: Optional[pd.DataFrame] = None,
-        reference_data: Optional[pd.DataFrame] = None,
+        sample_data: Optional[Union[pd.DataFrame, str]] = None,
+        reference_data: Optional[Union[pd.DataFrame, str]] = None,
         image: Optional[str] = None,
         with_repo: Optional[bool] = False,
         class_handler: Optional[str] = None,
@@ -434,9 +434,11 @@ class ModelMonitoringApplicationBase(MonitoringApplicationToDict, ABC):
        :param tag:            Tag for the function.
        :param run_local:      Whether to run the function locally or remotely.
        :param auto_build:     Whether to auto build the function.
-       :param sample_data:    Pandas data-frame
+       :param sample_data:    Pandas data-frame or :py:class:`~mlrun.artifacts.dataset.DatasetArtifact` URI as
+                              the current dataset.
                               When set, it replaces the data read from the model endpoint's offline source.
-       :param reference_data: Pandas data-frame
+       :param reference_data: Pandas data-frame or :py:class:`~mlrun.artifacts.dataset.DatasetArtifact` URI as
+                              the reference dataset.
                               When set, its statistics override the model endpoint's feature statistics.
        :param image:          Docker image to run the job on (when running remotely).
        :param with_repo:      Whether to clone the current repo to the build source.
@@ -515,7 +517,9 @@ class ModelMonitoringApplicationBase(MonitoringApplicationToDict, ABC):
             (sample_data, "sample_data"),
             (reference_data, "reference_data"),
         ]:
-            if data is not None:
+            if isinstance(data, str):
+                inputs[identifier] = data
+            elif data is not None:
                 key = f"{job.metadata.name}_{identifier}"
                 inputs[identifier] = project.log_dataset(
                     key,
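A hedged usage sketch of the widened signature. This assumes the parameters belong to the class's evaluate helper (which the surrounding docstring suggests), and the store URIs are invented:

from mlrun.model_monitoring.applications import ModelMonitoringApplicationBase

class MyDriftApp(ModelMonitoringApplicationBase):
    def do_tracking(self, monitoring_context):
        # A real application would compute and return drift results here.
        ...

# Passing dataset URIs instead of DataFrames; per the change above, string
# inputs are forwarded as-is rather than logged as new dataset artifacts.
run_result = MyDriftApp.evaluate(
    sample_data="store://datasets/my-project/current#0:latest",
    reference_data="store://datasets/my-project/reference#0:latest",
)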
mlrun/model_monitoring/db/tsdb/base.py
CHANGED

@@ -328,6 +328,36 @@ class TSDBConnector(ABC):
         If an endpoint has not been invoked within the specified time range, it will not appear in the result.
         """

+    @abstractmethod
+    def count_results_by_status(
+        self,
+        start: Optional[Union[datetime, str]] = None,
+        end: Optional[Union[datetime, str]] = None,
+        endpoint_ids: Optional[Union[str, list[str]]] = None,
+        application_names: Optional[Union[str, list[str]]] = None,
+        result_status_list: Optional[list[int]] = None,
+    ) -> dict[tuple[str, int], int]:
+        """
+        Read results status from the TSDB and return a dictionary of results statuses by application name.
+
+        :param start:              The start time in which to read the results. By default, the last 24 hours are read.
+        :param end:                The end time in which to read the results. Default is the current time (now).
+        :param endpoint_ids:       Optional list of endpoint ids to filter the results by. By default, all
+                                   endpoint ids are included.
+        :param application_names:  Optional list of application names to filter the results by. By default, all
+                                   applications are included.
+        :param result_status_list: Optional list of result statuses to filter the results by. By default, all
+                                   result statuses are included.
+
+        :return: A dictionary where the key is a tuple of (application_name, result_status) and the value is the total
+                 number of results with that status for that application.
+                 For example:
+                 {
+                     ('app1', 1): 10,
+                     ('app1', 2): 5
+                 }
+        """
+
     async def add_basic_metrics(
         self,
         model_endpoint_objects: list[mlrun.common.schemas.ModelEndpoint],
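To make the return shape concrete, a small sketch that folds such a dictionary into per-application totals (the sample counts are hypothetical):

from collections import defaultdict

# Hypothetical count_results_by_status() output: (application_name, status) -> count
counts = {("app1", 1): 10, ("app1", 2): 5, ("app2", 0): 7}

totals = defaultdict(int)
detections = defaultdict(int)
for (app_name, status), n in counts.items():
    totals[app_name] += n
    if status == 2:  # ResultStatusApp.detected, per the constants above
        detections[app_name] += n

print(dict(totals))      # {'app1': 15, 'app2': 7}
print(dict(detections))  # {'app1': 5}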
mlrun/model_monitoring/db/tsdb/tdengine/tdengine_connection.py
CHANGED

@@ -11,8 +11,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-
-import traceback
+import time
 from collections.abc import Callable
 from enum import Enum
 from typing import Any, Final, Optional, Union
@@ -20,6 +19,9 @@ from typing import Any, Final, Optional, Union
 import taosws
 from taosws import TaosStmt

+import mlrun
+from mlrun.utils import logger
+

 class _StrEnum(str, Enum):
     pass
@@ -137,40 +139,99 @@ class Statement:
         return statement


-def _run(connection_string, prefix_statements, q, statements, query):
-    try:
-        conn = taosws.connect(connection_string)
-
-        for statement in prefix_statements + statements:
-            if isinstance(statement, Statement):
-                prepared_statement = statement.prepare(conn.statement())
-                prepared_statement.execute()
-            else:
-                conn.execute(statement)
-
-        if not query:
-            q.put(None)
-            return
-
-        res = conn.query(query)
-
-        # taosws.TaosField is not serializable
-        fields = [
-            Field(field.name(), field.type(), field.bytes()) for field in res.fields
-        ]
-
-        q.put(QueryResult(list(res), fields))
-    except Exception as e:
-        tb = traceback.format_exc()
-        q.put(ErrorResult(tb, e))
-
-
 class TDEngineConnection:
-    def __init__(self, connection_string):
+    def __init__(self, connection_string, max_retries=3, retry_delay=0.5):
         self._connection_string = connection_string
         self.prefix_statements = []
+        self._max_retries = max_retries
+        self._retry_delay = retry_delay

-        self._conn = taosws.connect(self._connection_string)
+        self._conn = self._create_connection()
+
+    def _create_connection(self):
+        """Create a new TDEngine connection."""
+        return taosws.connect(self._connection_string)
+
+    def _reconnect(self):
+        """Close current connection and create a new one."""
+        try:
+            if hasattr(self, "_conn") and self._conn:
+                self._conn.close()
+        except Exception as e:
+            logger.warning(f"Error closing connection during reconnect: {e}")
+
+        self._conn = self._create_connection()
+        logger.info("Successfully reconnected to TDEngine")
+
+    def _execute_with_retry(self, operation, operation_name, *args, **kwargs):
+        """
+        Execute an operation with retry logic for connection failures.
+
+        :param operation:      The function to execute
+        :param operation_name: Name of the operation for logging
+        :param args:           Arguments to pass to the operation
+        :param kwargs:         Keyword arguments to pass to the operation
+        :return: Result of the operation
+        """
+        last_exception = None
+
+        for attempt in range(self._max_retries + 1):  # +1 for initial attempt
+            try:
+                return operation(*args, **kwargs)
+
+            except taosws.Error as e:
+                last_exception = e
+
+                if attempt < self._max_retries:
+                    logger.warning(
+                        f"Connection error during {operation_name} "
+                        f"(attempt {attempt + 1}/{self._max_retries + 1}): {e}. "
+                        f"Retrying in {self._retry_delay} seconds..."
+                    )
+
+                    # Wait before retrying
+                    time.sleep(self._retry_delay)
+
+                    # Reconnect
+                    try:
+                        self._reconnect()
+                    except Exception as reconnect_error:
+                        logger.error(f"Failed to reconnect: {reconnect_error}")
+                        if attempt == self._max_retries - 1:
+                            # Last attempt, raise the reconnection error
+                            raise TDEngineError(
+                                f"Failed to reconnect after {operation_name} failure: {reconnect_error}"
+                            ) from reconnect_error
+                        continue
+                else:
+                    # Max retries exceeded
+                    logger.error(
+                        f"Max retries ({self._max_retries}) exceeded for {operation_name}"
+                    )
+                    break

+            except Exception as e:
+                # Non-TDEngine error, don't retry
+                raise TDEngineError(
+                    f"Unexpected error during {operation_name}: {e}"
+                ) from e
+
+        # If we get here, all retries failed
+        raise TDEngineError(
+            f"Failed to {operation_name} after {self._max_retries} retries: {last_exception}"
+        ) from last_exception
+
+    def _execute_statement(self, statement):
+        """Execute a single statement (string or Statement object)."""
+        if isinstance(statement, Statement):
+            prepared_statement = statement.prepare(self._conn.statement())
+            prepared_statement.execute()
+        else:
+            self._conn.execute(statement)
+
+    def _execute_query(self, query):
+        """Execute a query and return the result."""
+        return self._conn.query(query)

     def run(
         self,
@@ -181,33 +242,40 @@ class TDEngineConnection:
         if not isinstance(statements, list):
             statements = [statements]

-        for statement in self.prefix_statements + statements:
+        # Execute all statements with retry logic
+        all_statements = self.prefix_statements + statements
+        for i, statement in enumerate(all_statements):
+            operation_name = f"execute statement {i + 1}/{len(all_statements)}"
             if isinstance(statement, Statement):
-                try:
-                    prepared_statement = statement.prepare(self._conn.statement())
-                    prepared_statement.execute()
-                except taosws.Error as e:
-                    raise TDEngineError(
-                        f"Failed to run prepared statement `{self._conn.statement()}`: {e}"
-                    ) from e
+                operation_name += " (prepared)"
             else:
-                try:
-                    self._conn.execute(statement)
-                except taosws.Error as e:
-                    raise TDEngineError(
-                        f"Failed to run statement `{statement}`: {e}"
-                    ) from e
+                operation_name += f" `{statement}`"
+
+            self._execute_with_retry(self._execute_statement, operation_name, statement)

         if not query:
             return None

-        try:
-            res = self._conn.query(query)
-        except taosws.Error as e:
-            raise TDEngineError(f"Failed to run query `{query}`: {e}") from e
+        # Execute query with retry logic
+        res = self._execute_with_retry(
+            self._execute_query, f"execute query `{query}`", query
+        )

+        # Process results
         fields = [
             Field(field.name(), field.type(), field.bytes()) for field in res.fields
         ]

         return QueryResult(list(res), fields)
+
+    def close(self):
+        """Close the connection."""
+        try:
+            if self._conn:
+                self._conn.close()
+                logger.debug("TDEngine connection closed")
+            self._conn = None
+        except Exception as e:
+            logger.warning(
+                f"Error closing TDEngine connection: {mlrun.errors.err_to_str(e)}"
+            )