mlrun 1.8.0rc61__py3-none-any.whl → 1.9.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of mlrun might be problematic. Click here for more details.
- mlrun/__main__.py +13 -3
- mlrun/common/constants.py +7 -0
- mlrun/config.py +4 -4
- mlrun/datastore/datastore_profile.py +3 -3
- mlrun/db/httpdb.py +4 -2
- mlrun/frameworks/tf_keras/__init__.py +4 -4
- mlrun/frameworks/tf_keras/callbacks/logging_callback.py +23 -20
- mlrun/frameworks/tf_keras/model_handler.py +69 -9
- mlrun/frameworks/tf_keras/utils.py +12 -1
- mlrun/launcher/client.py +1 -1
- mlrun/model_monitoring/db/tsdb/tdengine/tdengine_connection.py +118 -50
- mlrun/model_monitoring/db/tsdb/tdengine/tdengine_connector.py +12 -13
- mlrun/projects/project.py +4 -1
- mlrun/runtimes/base.py +1 -1
- mlrun/runtimes/utils.py +24 -7
- mlrun/serving/v2_serving.py +9 -8
- mlrun/utils/helpers.py +72 -22
- mlrun/utils/notifications/notification/slack.py +5 -1
- mlrun/utils/notifications/notification_pusher.py +2 -1
- mlrun/utils/version/version.json +2 -2
- {mlrun-1.8.0rc61.dist-info → mlrun-1.9.0.dist-info}/METADATA +9 -8
- {mlrun-1.8.0rc61.dist-info → mlrun-1.9.0.dist-info}/RECORD +26 -26
- {mlrun-1.8.0rc61.dist-info → mlrun-1.9.0.dist-info}/WHEEL +1 -1
- {mlrun-1.8.0rc61.dist-info → mlrun-1.9.0.dist-info}/entry_points.txt +0 -0
- {mlrun-1.8.0rc61.dist-info → mlrun-1.9.0.dist-info}/licenses/LICENSE +0 -0
- {mlrun-1.8.0rc61.dist-info → mlrun-1.9.0.dist-info}/top_level.txt +0 -0
|
mlrun/model_monitoring/db/tsdb/tdengine/tdengine_connection.py
CHANGED
|
@@ -11,8 +11,7 @@
|
|
|
11
11
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
12
12
|
# See the License for the specific language governing permissions and
|
|
13
13
|
# limitations under the License.
|
|
14
|
-
|
|
15
|
-
import traceback
|
|
14
|
+
import time
|
|
16
15
|
from collections.abc import Callable
|
|
17
16
|
from enum import Enum
|
|
18
17
|
from typing import Any, Final, Optional, Union
|
|
@@ -20,6 +19,9 @@ from typing import Any, Final, Optional, Union
|
|
|
20
19
|
import taosws
|
|
21
20
|
from taosws import TaosStmt
|
|
22
21
|
|
|
22
|
+
import mlrun
|
|
23
|
+
from mlrun.utils import logger
|
|
24
|
+
|
|
23
25
|
|
|
24
26
|
class _StrEnum(str, Enum):
|
|
25
27
|
pass
|
|
@@ -137,40 +139,99 @@ class Statement:
|
|
|
137
139
|
return statement
|
|
138
140
|
|
|
139
141
|
|
|
140
|
-
def _run(connection_string, prefix_statements, q, statements, query):
|
|
141
|
-
try:
|
|
142
|
-
conn = taosws.connect(connection_string)
|
|
143
|
-
|
|
144
|
-
for statement in prefix_statements + statements:
|
|
145
|
-
if isinstance(statement, Statement):
|
|
146
|
-
prepared_statement = statement.prepare(conn.statement())
|
|
147
|
-
prepared_statement.execute()
|
|
148
|
-
else:
|
|
149
|
-
conn.execute(statement)
|
|
150
|
-
|
|
151
|
-
if not query:
|
|
152
|
-
q.put(None)
|
|
153
|
-
return
|
|
154
|
-
|
|
155
|
-
res = conn.query(query)
|
|
156
|
-
|
|
157
|
-
# taosws.TaosField is not serializable
|
|
158
|
-
fields = [
|
|
159
|
-
Field(field.name(), field.type(), field.bytes()) for field in res.fields
|
|
160
|
-
]
|
|
161
|
-
|
|
162
|
-
q.put(QueryResult(list(res), fields))
|
|
163
|
-
except Exception as e:
|
|
164
|
-
tb = traceback.format_exc()
|
|
165
|
-
q.put(ErrorResult(tb, e))
|
|
166
|
-
|
|
167
|
-
|
|
168
142
|
class TDEngineConnection:
|
|
169
|
-
def __init__(self, connection_string):
|
|
143
|
+
def __init__(self, connection_string, max_retries=3, retry_delay=0.5):
|
|
170
144
|
self._connection_string = connection_string
|
|
171
145
|
self.prefix_statements = []
|
|
146
|
+
self._max_retries = max_retries
|
|
147
|
+
self._retry_delay = retry_delay
|
|
172
148
|
|
|
173
|
-
self._conn =
|
|
149
|
+
self._conn = self._create_connection()
|
|
150
|
+
|
|
151
|
+
def _create_connection(self):
|
|
152
|
+
"""Create a new TDEngine connection."""
|
|
153
|
+
return taosws.connect(self._connection_string)
|
|
154
|
+
|
|
155
|
+
def _reconnect(self):
|
|
156
|
+
"""Close current connection and create a new one."""
|
|
157
|
+
try:
|
|
158
|
+
if hasattr(self, "_conn") and self._conn:
|
|
159
|
+
self._conn.close()
|
|
160
|
+
except Exception as e:
|
|
161
|
+
logger.warning(f"Error closing connection during reconnect: {e}")
|
|
162
|
+
|
|
163
|
+
self._conn = self._create_connection()
|
|
164
|
+
logger.info("Successfully reconnected to TDEngine")
|
|
165
|
+
|
|
166
|
+
def _execute_with_retry(self, operation, operation_name, *args, **kwargs):
|
|
167
|
+
"""
|
|
168
|
+
Execute an operation with retry logic for connection failures.
|
|
169
|
+
|
|
170
|
+
:param operation: The function to execute
|
|
171
|
+
:param operation_name: Name of the operation for logging
|
|
172
|
+
:param args: Arguments to pass to the operation
|
|
173
|
+
:param kwargs: Keyword arguments to pass to the operation
|
|
174
|
+
:return: Result of the operation
|
|
175
|
+
"""
|
|
176
|
+
last_exception = None
|
|
177
|
+
|
|
178
|
+
for attempt in range(self._max_retries + 1): # +1 for initial attempt
|
|
179
|
+
try:
|
|
180
|
+
return operation(*args, **kwargs)
|
|
181
|
+
|
|
182
|
+
except taosws.Error as e:
|
|
183
|
+
last_exception = e
|
|
184
|
+
|
|
185
|
+
if attempt < self._max_retries:
|
|
186
|
+
logger.warning(
|
|
187
|
+
f"Connection error during {operation_name} "
|
|
188
|
+
f"(attempt {attempt + 1}/{self._max_retries + 1}): {e}. "
|
|
189
|
+
f"Retrying in {self._retry_delay} seconds..."
|
|
190
|
+
)
|
|
191
|
+
|
|
192
|
+
# Wait before retrying
|
|
193
|
+
time.sleep(self._retry_delay)
|
|
194
|
+
|
|
195
|
+
# Reconnect
|
|
196
|
+
try:
|
|
197
|
+
self._reconnect()
|
|
198
|
+
except Exception as reconnect_error:
|
|
199
|
+
logger.error(f"Failed to reconnect: {reconnect_error}")
|
|
200
|
+
if attempt == self._max_retries - 1:
|
|
201
|
+
# Last attempt, raise the reconnection error
|
|
202
|
+
raise TDEngineError(
|
|
203
|
+
f"Failed to reconnect after {operation_name} failure: {reconnect_error}"
|
|
204
|
+
) from reconnect_error
|
|
205
|
+
continue
|
|
206
|
+
else:
|
|
207
|
+
# Max retries exceeded
|
|
208
|
+
logger.error(
|
|
209
|
+
f"Max retries ({self._max_retries}) exceeded for {operation_name}"
|
|
210
|
+
)
|
|
211
|
+
break
|
|
212
|
+
|
|
213
|
+
except Exception as e:
|
|
214
|
+
# Non-TDEngine error, don't retry
|
|
215
|
+
raise TDEngineError(
|
|
216
|
+
f"Unexpected error during {operation_name}: {e}"
|
|
217
|
+
) from e
|
|
218
|
+
|
|
219
|
+
# If we get here, all retries failed
|
|
220
|
+
raise TDEngineError(
|
|
221
|
+
f"Failed to {operation_name} after {self._max_retries} retries: {last_exception}"
|
|
222
|
+
) from last_exception
|
|
223
|
+
|
|
224
|
+
def _execute_statement(self, statement):
|
|
225
|
+
"""Execute a single statement (string or Statement object)."""
|
|
226
|
+
if isinstance(statement, Statement):
|
|
227
|
+
prepared_statement = statement.prepare(self._conn.statement())
|
|
228
|
+
prepared_statement.execute()
|
|
229
|
+
else:
|
|
230
|
+
self._conn.execute(statement)
|
|
231
|
+
|
|
232
|
+
def _execute_query(self, query):
|
|
233
|
+
"""Execute a query and return the result."""
|
|
234
|
+
return self._conn.query(query)
|
|
174
235
|
|
|
175
236
|
def run(
|
|
176
237
|
self,
|
|
@@ -181,33 +242,40 @@ class TDEngineConnection:
|
|
|
181
242
|
if not isinstance(statements, list):
|
|
182
243
|
statements = [statements]
|
|
183
244
|
|
|
184
|
-
|
|
245
|
+
# Execute all statements with retry logic
|
|
246
|
+
all_statements = self.prefix_statements + statements
|
|
247
|
+
for i, statement in enumerate(all_statements):
|
|
248
|
+
operation_name = f"execute statement {i + 1}/{len(all_statements)}"
|
|
185
249
|
if isinstance(statement, Statement):
|
|
186
|
-
|
|
187
|
-
prepared_statement = statement.prepare(self._conn.statement())
|
|
188
|
-
prepared_statement.execute()
|
|
189
|
-
except taosws.Error as e:
|
|
190
|
-
raise TDEngineError(
|
|
191
|
-
f"Failed to run prepared statement `{self._conn.statement()}`: {e}"
|
|
192
|
-
) from e
|
|
250
|
+
operation_name += " (prepared)"
|
|
193
251
|
else:
|
|
194
|
-
|
|
195
|
-
|
|
196
|
-
|
|
197
|
-
raise TDEngineError(
|
|
198
|
-
f"Failed to run statement `{statement}`: {e}"
|
|
199
|
-
) from e
|
|
252
|
+
operation_name += f" `{statement}`"
|
|
253
|
+
|
|
254
|
+
self._execute_with_retry(self._execute_statement, operation_name, statement)
|
|
200
255
|
|
|
201
256
|
if not query:
|
|
202
257
|
return None
|
|
203
258
|
|
|
204
|
-
|
|
205
|
-
|
|
206
|
-
|
|
207
|
-
|
|
259
|
+
# Execute query with retry logic
|
|
260
|
+
res = self._execute_with_retry(
|
|
261
|
+
self._execute_query, f"execute query `{query}`", query
|
|
262
|
+
)
|
|
208
263
|
|
|
264
|
+
# Process results
|
|
209
265
|
fields = [
|
|
210
266
|
Field(field.name(), field.type(), field.bytes()) for field in res.fields
|
|
211
267
|
]
|
|
212
268
|
|
|
213
269
|
return QueryResult(list(res), fields)
|
|
270
|
+
|
|
271
|
+
def close(self):
|
|
272
|
+
"""Close the connection."""
|
|
273
|
+
try:
|
|
274
|
+
if self._conn:
|
|
275
|
+
self._conn.close()
|
|
276
|
+
logger.debug("TDEngine connection closed")
|
|
277
|
+
self._conn = None
|
|
278
|
+
except Exception as e:
|
|
279
|
+
logger.warning(
|
|
280
|
+
f"Error closing TDEngine connection: {mlrun.errors.err_to_str(e)}"
|
|
281
|
+
)
|
|
mlrun/model_monitoring/db/tsdb/tdengine/tdengine_connector.py
CHANGED
|
@@ -12,8 +12,8 @@
|
|
|
12
12
|
# See the License for the specific language governing permissions and
|
|
13
13
|
# limitations under the License.
|
|
14
14
|
|
|
15
|
+
import threading
|
|
15
16
|
from datetime import datetime, timedelta
|
|
16
|
-
from threading import Lock
|
|
17
17
|
from typing import Callable, Final, Literal, Optional, Union
|
|
18
18
|
|
|
19
19
|
import pandas as pd
|
|
@@ -32,8 +32,8 @@ from mlrun.model_monitoring.db.tsdb.tdengine.tdengine_connection import (
|
|
|
32
32
|
from mlrun.model_monitoring.helpers import get_invocations_fqn
|
|
33
33
|
from mlrun.utils import logger
|
|
34
34
|
|
|
35
|
-
|
|
36
|
-
|
|
35
|
+
# Thread-local storage for connections
|
|
36
|
+
_thread_local = threading.local()
|
|
37
37
|
|
|
38
38
|
|
|
39
39
|
class TDEngineTimestampPrecision(mlrun.common.types.StrEnum):
|
|
@@ -76,16 +76,15 @@ class TDEngineConnector(TSDBConnector):
|
|
|
76
76
|
|
|
77
77
|
@property
|
|
78
78
|
def connection(self) -> TDEngineConnection:
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
return _connection
|
|
79
|
+
if not hasattr(_thread_local, "connection"):
|
|
80
|
+
_thread_local.connection = self._create_connection()
|
|
81
|
+
logger.debug(
|
|
82
|
+
"Created new TDEngine connection for thread",
|
|
83
|
+
project=self.project,
|
|
84
|
+
thread_name=threading.current_thread().name,
|
|
85
|
+
thread_id=threading.get_ident(),
|
|
86
|
+
)
|
|
87
|
+
return _thread_local.connection
|
|
89
88
|
|
|
90
89
|
def _create_connection(self) -> TDEngineConnection:
|
|
91
90
|
"""Establish a connection to the TSDB server."""
|
mlrun/projects/project.py
CHANGED
|
@@ -1408,7 +1408,10 @@ class MlrunProject(ModelObj):
|
|
|
1408
1408
|
https://apscheduler.readthedocs.io/en/3.x/modules/triggers/cron.html#module-apscheduler.triggers.cron
|
|
1409
1409
|
Note that "local" engine does not support this argument
|
|
1410
1410
|
:param ttl: Pipeline ttl in secs (after that the pods will be removed)
|
|
1411
|
-
:param image: Image for workflow runner job, only for scheduled and remote workflows
|
|
1411
|
+
:param image: Image for workflow runner job, only for scheduled and remote workflows.
|
|
1412
|
+
The image must have mlrun[kfp] installed which requires python 3.9.
|
|
1413
|
+
Therefore, the project default image will not be used for the workflow,
|
|
1414
|
+
and the image must be specified explicitly.
|
|
1412
1415
|
:param args: Argument values (key=value, ..)
|
|
1413
1416
|
"""
|
|
1414
1417
|
|
mlrun/runtimes/base.py
CHANGED
|
@@ -489,7 +489,7 @@ class BaseRuntime(ModelObj):
|
|
|
489
489
|
def _store_function(self, runspec, meta, db):
|
|
490
490
|
meta.labels["kind"] = self.kind
|
|
491
491
|
mlrun.runtimes.utils.enrich_run_labels(
|
|
492
|
-
meta.labels, [
|
|
492
|
+
meta.labels, [mlrun_constants.MLRunInternalLabels.owner]
|
|
493
493
|
)
|
|
494
494
|
if runspec.spec.output_path:
|
|
495
495
|
runspec.spec.output_path = runspec.spec.output_path.replace(
|
mlrun/runtimes/utils.py
CHANGED
|
@@ -11,6 +11,7 @@
|
|
|
11
11
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
12
12
|
# See the License for the specific language governing permissions and
|
|
13
13
|
# limitations under the License.
|
|
14
|
+
import enum
|
|
14
15
|
import getpass
|
|
15
16
|
import hashlib
|
|
16
17
|
import json
|
|
@@ -28,7 +29,6 @@ import mlrun.common.constants as mlrun_constants
|
|
|
28
29
|
import mlrun.common.schemas
|
|
29
30
|
import mlrun.utils.regex
|
|
30
31
|
from mlrun.artifacts import TableArtifact
|
|
31
|
-
from mlrun.common.runtimes.constants import RunLabels
|
|
32
32
|
from mlrun.config import config
|
|
33
33
|
from mlrun.errors import err_to_str
|
|
34
34
|
from mlrun.frameworks.parallel_coordinates import gen_pcp_plot
|
|
@@ -433,18 +433,35 @@ def enrich_function_from_dict(function, function_dict):
|
|
|
433
433
|
|
|
434
434
|
def enrich_run_labels(
|
|
435
435
|
labels: dict,
|
|
436
|
-
labels_to_enrich: Optional[list[
|
|
436
|
+
labels_to_enrich: Optional[list[mlrun_constants.MLRunInternalLabels]] = None,
|
|
437
437
|
):
|
|
438
|
+
"""
|
|
439
|
+
Enrich the run labels with the internal labels and the labels enrichment extension
|
|
440
|
+
:param labels: The run labels dict
|
|
441
|
+
:param labels_to_enrich: The label keys to enrich from MLRunInternalLabels.default_run_labels_to_enrich
|
|
442
|
+
:return: The enriched labels dict
|
|
443
|
+
"""
|
|
444
|
+
# Merge the labels with the labels enrichment extension
|
|
438
445
|
labels_enrichment = {
|
|
439
|
-
|
|
446
|
+
mlrun_constants.MLRunInternalLabels.owner: os.environ.get("V3IO_USERNAME")
|
|
447
|
+
or getpass.getuser(),
|
|
440
448
|
# TODO: remove this in 1.10.0
|
|
441
|
-
|
|
449
|
+
mlrun_constants.MLRunInternalLabels.v3io_user: os.environ.get("V3IO_USERNAME"),
|
|
442
450
|
}
|
|
443
|
-
|
|
451
|
+
|
|
452
|
+
# Resolve which label keys to enrich
|
|
453
|
+
if labels_to_enrich is None:
|
|
454
|
+
labels_to_enrich = (
|
|
455
|
+
mlrun_constants.MLRunInternalLabels.default_run_labels_to_enrich()
|
|
456
|
+
)
|
|
457
|
+
|
|
458
|
+
# Enrich labels
|
|
444
459
|
for label in labels_to_enrich:
|
|
460
|
+
if isinstance(label, enum.Enum):
|
|
461
|
+
label = label.value
|
|
445
462
|
enrichment = labels_enrichment.get(label)
|
|
446
|
-
if label
|
|
447
|
-
labels[label
|
|
463
|
+
if label not in labels and enrichment:
|
|
464
|
+
labels[label] = enrichment
|
|
448
465
|
return labels
|
|
449
466
|
|
|
450
467
|
|
mlrun/serving/v2_serving.py
CHANGED
|
@@ -384,15 +384,15 @@ class V2ModelServer(StepToDict):
|
|
|
384
384
|
return event
|
|
385
385
|
|
|
386
386
|
def logged_results(self, request: dict, response: dict, op: str):
|
|
387
|
-
"""
|
|
387
|
+
"""Hook for controlling which results are tracked by the model monitoring
|
|
388
388
|
|
|
389
|
-
|
|
390
|
-
|
|
391
|
-
for example in image classification calculate and track the RGB values vs the image bitmap
|
|
389
|
+
This hook allows controlling which input/output data is logged by the model monitoring.
|
|
390
|
+
It allows filtering out columns or adding custom values, and can also be used to monitor derived metrics,
|
|
391
|
+
for example in image classification to calculate and track the RGB values vs the image bitmap.
|
|
392
392
|
|
|
393
|
-
|
|
394
|
-
corresponding output values/arrays (the schema of the input/output fields is stored in the model object)
|
|
395
|
-
|
|
393
|
+
The request ["inputs"] holds a list of input values/arrays, the response ["outputs"] holds a list of
|
|
394
|
+
corresponding output values/arrays (the schema of the input/output fields is stored in the model object).
|
|
395
|
+
This method should return lists of alternative inputs and outputs which will be monitored.
|
|
396
396
|
|
|
397
397
|
:param request: predict/explain request, see model serving docs for details
|
|
398
398
|
:param response: result from the model predict/explain (after postprocess())
|
|
@@ -422,6 +422,7 @@ class V2ModelServer(StepToDict):
|
|
|
422
422
|
|
|
423
423
|
def predict(self, request: dict) -> list:
|
|
424
424
|
"""model prediction operation
|
|
425
|
+
|
|
425
426
|
:return: list with the model prediction results (can be multi-port) or list of lists for multiple predictions
|
|
426
427
|
"""
|
|
427
428
|
raise NotImplementedError()
|
|
@@ -436,7 +437,7 @@ class V2ModelServer(StepToDict):
|
|
|
436
437
|
where the internal list order is according to the ArtifactModel inputs.
|
|
437
438
|
|
|
438
439
|
:param request: event
|
|
439
|
-
:return:
|
|
440
|
+
:return: event body converting the inputs to be list of lists
|
|
440
441
|
"""
|
|
441
442
|
if self.model_spec and self.model_spec.inputs:
|
|
442
443
|
input_order = [feature.name for feature in self.model_spec.inputs]
|
mlrun/utils/helpers.py
CHANGED
|
@@ -876,16 +876,25 @@ def enrich_image_url(
|
|
|
876
876
|
client_version: Optional[str] = None,
|
|
877
877
|
client_python_version: Optional[str] = None,
|
|
878
878
|
) -> str:
|
|
879
|
+
image_url = image_url.strip()
|
|
880
|
+
|
|
881
|
+
# Add python version tag if needed
|
|
882
|
+
if image_url == "python" and client_python_version:
|
|
883
|
+
image_tag = ".".join(client_python_version.split(".")[:2])
|
|
884
|
+
image_url = f"python:{image_tag}"
|
|
885
|
+
|
|
879
886
|
client_version = _convert_python_package_version_to_image_tag(client_version)
|
|
880
887
|
server_version = _convert_python_package_version_to_image_tag(
|
|
881
888
|
mlrun.utils.version.Version().get()["version"]
|
|
882
889
|
)
|
|
883
|
-
image_url = image_url.strip()
|
|
884
890
|
mlrun_version = config.images_tag or client_version or server_version
|
|
885
|
-
tag = mlrun_version
|
|
886
|
-
|
|
887
|
-
|
|
888
|
-
|
|
891
|
+
tag = mlrun_version or ""
|
|
892
|
+
|
|
893
|
+
# TODO: Remove condition when mlrun/mlrun-kfp image is also supported
|
|
894
|
+
if "mlrun-kfp" not in image_url:
|
|
895
|
+
tag += resolve_image_tag_suffix(
|
|
896
|
+
mlrun_version=mlrun_version, python_version=client_python_version
|
|
897
|
+
)
|
|
889
898
|
|
|
890
899
|
# it's an mlrun image if the repository is mlrun
|
|
891
900
|
is_mlrun_image = image_url.startswith("mlrun/") or "/mlrun/" in image_url
|
|
@@ -917,7 +926,7 @@ def resolve_image_tag_suffix(
|
|
|
917
926
|
mlrun_version: Optional[str] = None, python_version: Optional[str] = None
|
|
918
927
|
) -> str:
|
|
919
928
|
"""
|
|
920
|
-
|
|
929
|
+
Resolves what suffix to be appended to the image tag
|
|
921
930
|
:param mlrun_version: the mlrun version
|
|
922
931
|
:param python_version: the requested python version
|
|
923
932
|
:return: the suffix to append to the image tag
|
|
@@ -929,19 +938,19 @@ def resolve_image_tag_suffix(
|
|
|
929
938
|
# mlrun version is higher than 1.3.0, but we can check the python version and if python version was passed it
|
|
930
939
|
# means it 1.3.0-rc or higher, so we can add the suffix of the python version.
|
|
931
940
|
if mlrun_version.startswith("0.0.0-") or "unstable" in mlrun_version:
|
|
932
|
-
if python_version.startswith("3.
|
|
933
|
-
return "-
|
|
941
|
+
if python_version.startswith("3.9"):
|
|
942
|
+
return "-py39"
|
|
934
943
|
return ""
|
|
935
944
|
|
|
936
|
-
# For mlrun 1.
|
|
937
|
-
# While the python 3.
|
|
938
|
-
# Python 3.
|
|
939
|
-
# and mlrun 1.
|
|
945
|
+
# For mlrun 1.9.x and 1.10.x, we support mlrun runtimes images with both python 3.9 and 3.11 images.
|
|
946
|
+
# While the python 3.11 images will continue to have no suffix, the python 3.9 images will have a '-py39' suffix.
|
|
947
|
+
# Python 3.10 images are not supported in mlrun 1.9.0, meaning that if the user has client with python 3.10
|
|
948
|
+
# and mlrun 1.9.x then the image will be pulled without a suffix (which is the python 3.11 image).
|
|
940
949
|
# using semver (x.y.z-X) to include rc versions as well
|
|
941
|
-
if semver.VersionInfo.parse("1.
|
|
950
|
+
if semver.VersionInfo.parse("1.11.0-X") > semver.VersionInfo.parse(
|
|
942
951
|
mlrun_version
|
|
943
|
-
) >= semver.VersionInfo.parse("1.
|
|
944
|
-
return "-
|
|
952
|
+
) >= semver.VersionInfo.parse("1.9.0-X") and python_version.startswith("3.9"):
|
|
953
|
+
return "-py39"
|
|
945
954
|
return ""
|
|
946
955
|
|
|
947
956
|
|
|
@@ -2088,22 +2097,60 @@ def join_urls(base_url: Optional[str], path: Optional[str]) -> str:
|
|
|
2088
2097
|
|
|
2089
2098
|
class Workflow:
|
|
2090
2099
|
@staticmethod
|
|
2091
|
-
def get_workflow_steps(
|
|
2100
|
+
def get_workflow_steps(
|
|
2101
|
+
db: "mlrun.db.RunDBInterface", workflow_id: str, project: str
|
|
2102
|
+
) -> list:
|
|
2092
2103
|
steps = []
|
|
2093
|
-
db = mlrun.get_run_db()
|
|
2094
2104
|
|
|
2095
2105
|
def _add_run_step(_step: mlrun_pipelines.models.PipelineStep):
|
|
2106
|
+
# on kfp 1.8 argo sets the pod hostname differently than what we have with kfp 2.5
|
|
2107
|
+
# therefore, the heuristic needs to change. what we do here is first trying against 1.8 conventions
|
|
2108
|
+
# and if we can't find it then falling back to 2.5
|
|
2096
2109
|
try:
|
|
2097
|
-
|
|
2110
|
+
# runner_pod = x-y-N
|
|
2111
|
+
_runs = db.list_runs(
|
|
2098
2112
|
project=project,
|
|
2099
2113
|
labels=f"{mlrun_constants.MLRunInternalLabels.runner_pod}={_step.node_name}",
|
|
2100
|
-
)
|
|
2114
|
+
)
|
|
2115
|
+
if not _runs:
|
|
2116
|
+
try:
|
|
2117
|
+
# x-y-N -> x-y, N
|
|
2118
|
+
node_name_initials, node_name_generated_id = (
|
|
2119
|
+
_step.node_name.rsplit("-", 1)
|
|
2120
|
+
)
|
|
2121
|
+
|
|
2122
|
+
except ValueError:
|
|
2123
|
+
# defensive programming, if the node name is not in the expected format
|
|
2124
|
+
node_name_initials = _step.node_name
|
|
2125
|
+
node_name_generated_id = ""
|
|
2126
|
+
|
|
2127
|
+
# compile the expected runner pod hostname as per kfp >= 2.4
|
|
2128
|
+
# x-y, Z, N -> runner_pod = x-y-Z-N
|
|
2129
|
+
runner_pod_value = "-".join(
|
|
2130
|
+
[
|
|
2131
|
+
node_name_initials,
|
|
2132
|
+
_step.display_name,
|
|
2133
|
+
node_name_generated_id,
|
|
2134
|
+
]
|
|
2135
|
+
).rstrip("-")
|
|
2136
|
+
logger.debug(
|
|
2137
|
+
"No run found for step, trying with different node name",
|
|
2138
|
+
step_node_name=runner_pod_value,
|
|
2139
|
+
)
|
|
2140
|
+
_runs = db.list_runs(
|
|
2141
|
+
project=project,
|
|
2142
|
+
labels=f"{mlrun_constants.MLRunInternalLabels.runner_pod}={runner_pod_value}",
|
|
2143
|
+
)
|
|
2144
|
+
|
|
2145
|
+
_run = _runs[0]
|
|
2101
2146
|
except IndexError:
|
|
2147
|
+
logger.warning("No run found for step", step=_step.to_dict())
|
|
2102
2148
|
_run = {
|
|
2103
2149
|
"metadata": {
|
|
2104
2150
|
"name": _step.display_name,
|
|
2105
2151
|
"project": project,
|
|
2106
2152
|
},
|
|
2153
|
+
"status": {},
|
|
2107
2154
|
}
|
|
2108
2155
|
_run["step_kind"] = _step.step_type
|
|
2109
2156
|
if _step.skipped:
|
|
@@ -2216,11 +2263,14 @@ class Workflow:
|
|
|
2216
2263
|
def _get_workflow_manifest(
|
|
2217
2264
|
workflow_id: str,
|
|
2218
2265
|
) -> typing.Optional[mlrun_pipelines.models.PipelineManifest]:
|
|
2219
|
-
kfp_client = mlrun_pipelines.utils.get_client(
|
|
2266
|
+
kfp_client = mlrun_pipelines.utils.get_client(
|
|
2267
|
+
url=mlrun.mlconf.kfp_url,
|
|
2268
|
+
namespace=mlrun.mlconf.namespace,
|
|
2269
|
+
)
|
|
2220
2270
|
|
|
2221
|
-
# arbitrary timeout of
|
|
2271
|
+
# arbitrary timeout of 30 seconds, the workflow should be done by now, however sometimes kfp takes a few
|
|
2222
2272
|
# seconds to update the workflow status
|
|
2223
|
-
kfp_run = kfp_client.wait_for_run_completion(workflow_id,
|
|
2273
|
+
kfp_run = kfp_client.wait_for_run_completion(workflow_id, 30)
|
|
2224
2274
|
if not kfp_run:
|
|
2225
2275
|
return None
|
|
2226
2276
|
|
|
mlrun/utils/notifications/notification/slack.py
CHANGED
|
@@ -16,6 +16,7 @@ import typing
|
|
|
16
16
|
|
|
17
17
|
import aiohttp
|
|
18
18
|
|
|
19
|
+
import mlrun.common.runtimes.constants as runtimes_constants
|
|
19
20
|
import mlrun.common.schemas
|
|
20
21
|
import mlrun.lists
|
|
21
22
|
import mlrun.utils.helpers
|
|
@@ -177,7 +178,10 @@ class SlackNotification(NotificationBase):
|
|
|
177
178
|
# Only show the URL if the run is not a function (serving or mlrun function)
|
|
178
179
|
kind = run.get("step_kind")
|
|
179
180
|
state = run["status"].get("state", "")
|
|
180
|
-
|
|
181
|
+
|
|
182
|
+
if state != runtimes_constants.RunStates.skipped and (
|
|
183
|
+
url and not kind or kind == "run"
|
|
184
|
+
):
|
|
181
185
|
line = f'<{url}|*{meta.get("name")}*>'
|
|
182
186
|
else:
|
|
183
187
|
line = meta.get("name")
|
|
mlrun/utils/notifications/notification_pusher.py
CHANGED
|
@@ -287,7 +287,8 @@ class NotificationPusher(_NotificationPusherBase):
|
|
|
287
287
|
)
|
|
288
288
|
project = run.metadata.project
|
|
289
289
|
workflow_id = run.status.results.get("workflow_id", None)
|
|
290
|
-
|
|
290
|
+
db = mlrun.get_run_db()
|
|
291
|
+
runs.extend(Workflow.get_workflow_steps(db, workflow_id, project))
|
|
291
292
|
|
|
292
293
|
message = (
|
|
293
294
|
self.messages.get(run.state(), "").format(resource=resource)
|
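mlrun/utils/version/version.json
CHANGED
|
(diff for this file is not shown in this listing)
|
{mlrun-1.8.0rc61.dist-info → mlrun-1.9.0.dist-info}/METADATA
CHANGED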
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: mlrun
|
|
3
|
-
Version: 1.
|
|
3
|
+
Version: 1.9.0
|
|
4
4
|
Summary: Tracking and config of machine learning runs
|
|
5
5
|
Home-page: https://github.com/mlrun/mlrun
|
|
6
6
|
Author: Yaron Haviv
|
|
@@ -15,6 +15,7 @@ Classifier: Operating System :: Microsoft :: Windows
|
|
|
15
15
|
Classifier: Operating System :: MacOS
|
|
16
16
|
Classifier: Programming Language :: Python :: 3
|
|
17
17
|
Classifier: Programming Language :: Python :: 3.9
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
18
19
|
Classifier: Programming Language :: Python
|
|
19
20
|
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
|
20
21
|
Classifier: Topic :: Software Development :: Libraries
|
|
@@ -35,7 +36,7 @@ Requires-Dist: pyarrow<17,>=10.0
|
|
|
35
36
|
Requires-Dist: pyyaml<7,>=6.0.2
|
|
36
37
|
Requires-Dist: requests~=2.32
|
|
37
38
|
Requires-Dist: tabulate~=0.8.6
|
|
38
|
-
Requires-Dist: v3io~=0.
|
|
39
|
+
Requires-Dist: v3io~=0.7.0
|
|
39
40
|
Requires-Dist: pydantic>=1.10.15
|
|
40
41
|
Requires-Dist: mergedeep~=1.3
|
|
41
42
|
Requires-Dist: v3io-frames~=0.10.14; python_version < "3.11"
|
|
@@ -44,15 +45,15 @@ Requires-Dist: semver~=3.0
|
|
|
44
45
|
Requires-Dist: dependency-injector~=4.41
|
|
45
46
|
Requires-Dist: fsspec<2024.7,>=2023.9.2
|
|
46
47
|
Requires-Dist: v3iofs~=0.1.17
|
|
47
|
-
Requires-Dist: storey~=1.
|
|
48
|
+
Requires-Dist: storey~=1.9.0
|
|
48
49
|
Requires-Dist: inflection~=0.5.0
|
|
49
50
|
Requires-Dist: python-dotenv~=1.0
|
|
50
51
|
Requires-Dist: setuptools>=75.2
|
|
51
52
|
Requires-Dist: deprecated~=1.2
|
|
52
53
|
Requires-Dist: jinja2>=3.1.3,~=3.1
|
|
53
54
|
Requires-Dist: orjson<4,>=3.9.15
|
|
54
|
-
Requires-Dist: mlrun-pipelines-kfp-common~=0.
|
|
55
|
-
Requires-Dist: mlrun-pipelines-kfp-v1-8~=0.
|
|
55
|
+
Requires-Dist: mlrun-pipelines-kfp-common~=0.4.4
|
|
56
|
+
Requires-Dist: mlrun-pipelines-kfp-v1-8~=0.4.3
|
|
56
57
|
Requires-Dist: docstring_parser~=0.16
|
|
57
58
|
Requires-Dist: aiosmtplib~=3.0
|
|
58
59
|
Provides-Extra: s3
|
|
@@ -102,7 +103,7 @@ Requires-Dist: taos-ws-py==0.3.2; extra == "tdengine"
|
|
|
102
103
|
Provides-Extra: snowflake
|
|
103
104
|
Requires-Dist: snowflake-connector-python~=3.7; extra == "snowflake"
|
|
104
105
|
Provides-Extra: kfp18
|
|
105
|
-
Requires-Dist: mlrun_pipelines_kfp_v1_8[kfp]>=0.
|
|
106
|
+
Requires-Dist: mlrun_pipelines_kfp_v1_8[kfp]>=0.4.0; python_version < "3.11" and extra == "kfp18"
|
|
106
107
|
Provides-Extra: api
|
|
107
108
|
Requires-Dist: uvicorn~=0.32.1; extra == "api"
|
|
108
109
|
Requires-Dist: dask-kubernetes~=0.11.0; extra == "api"
|
|
@@ -118,7 +119,7 @@ Requires-Dist: timelength~=1.1; extra == "api"
|
|
|
118
119
|
Requires-Dist: memray~=1.12; sys_platform != "win32" and extra == "api"
|
|
119
120
|
Requires-Dist: aiosmtplib~=3.0; extra == "api"
|
|
120
121
|
Requires-Dist: pydantic<2,>=1; extra == "api"
|
|
121
|
-
Requires-Dist: mlrun-pipelines-kfp-v1-8
|
|
122
|
+
Requires-Dist: mlrun-pipelines-kfp-v1-8~=0.4.3; extra == "api"
|
|
122
123
|
Requires-Dist: grpcio~=1.70.0; extra == "api"
|
|
123
124
|
Provides-Extra: all
|
|
124
125
|
Requires-Dist: adlfs==2023.9.0; extra == "all"
|
|
@@ -212,7 +213,7 @@ Requires-Dist: igz-mgmt~=0.4.1; extra == "complete-api"
|
|
|
212
213
|
Requires-Dist: kafka-python~=2.1.0; extra == "complete-api"
|
|
213
214
|
Requires-Dist: memray~=1.12; sys_platform != "win32" and extra == "complete-api"
|
|
214
215
|
Requires-Dist: mlflow~=2.16; extra == "complete-api"
|
|
215
|
-
Requires-Dist: mlrun-pipelines-kfp-v1-8
|
|
216
|
+
Requires-Dist: mlrun-pipelines-kfp-v1-8~=0.4.3; extra == "complete-api"
|
|
216
217
|
Requires-Dist: msrest~=0.6.21; extra == "complete-api"
|
|
217
218
|
Requires-Dist: objgraph~=3.6; extra == "complete-api"
|
|
218
219
|
Requires-Dist: oss2==2.18.1; extra == "complete-api"
|