mlrun 1.10.0rc16__py3-none-any.whl → 1.10.1rc4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of mlrun might be problematic. Click here for more details.
- mlrun/__init__.py +22 -2
- mlrun/artifacts/document.py +6 -1
- mlrun/artifacts/llm_prompt.py +21 -15
- mlrun/artifacts/model.py +3 -3
- mlrun/common/constants.py +9 -0
- mlrun/common/formatters/artifact.py +1 -0
- mlrun/common/model_monitoring/helpers.py +86 -0
- mlrun/common/schemas/__init__.py +2 -0
- mlrun/common/schemas/auth.py +2 -0
- mlrun/common/schemas/function.py +10 -0
- mlrun/common/schemas/hub.py +30 -18
- mlrun/common/schemas/model_monitoring/__init__.py +2 -0
- mlrun/common/schemas/model_monitoring/constants.py +30 -6
- mlrun/common/schemas/model_monitoring/functions.py +13 -4
- mlrun/common/schemas/model_monitoring/model_endpoints.py +11 -0
- mlrun/common/schemas/pipeline.py +1 -1
- mlrun/common/schemas/serving.py +3 -0
- mlrun/common/schemas/workflow.py +1 -0
- mlrun/common/secrets.py +22 -1
- mlrun/config.py +34 -21
- mlrun/datastore/__init__.py +11 -3
- mlrun/datastore/azure_blob.py +162 -47
- mlrun/datastore/base.py +265 -7
- mlrun/datastore/datastore.py +10 -5
- mlrun/datastore/datastore_profile.py +61 -5
- mlrun/datastore/model_provider/huggingface_provider.py +367 -0
- mlrun/datastore/model_provider/mock_model_provider.py +87 -0
- mlrun/datastore/model_provider/model_provider.py +211 -74
- mlrun/datastore/model_provider/openai_provider.py +243 -71
- mlrun/datastore/s3.py +24 -2
- mlrun/datastore/store_resources.py +4 -4
- mlrun/datastore/storeytargets.py +2 -3
- mlrun/datastore/utils.py +15 -3
- mlrun/db/base.py +27 -19
- mlrun/db/httpdb.py +57 -48
- mlrun/db/nopdb.py +25 -10
- mlrun/execution.py +55 -13
- mlrun/hub/__init__.py +15 -0
- mlrun/hub/module.py +181 -0
- mlrun/k8s_utils.py +105 -16
- mlrun/launcher/base.py +13 -6
- mlrun/launcher/local.py +2 -0
- mlrun/model.py +9 -3
- mlrun/model_monitoring/api.py +66 -27
- mlrun/model_monitoring/applications/__init__.py +1 -1
- mlrun/model_monitoring/applications/base.py +388 -138
- mlrun/model_monitoring/applications/context.py +2 -4
- mlrun/model_monitoring/applications/results.py +4 -7
- mlrun/model_monitoring/controller.py +239 -101
- mlrun/model_monitoring/db/_schedules.py +36 -13
- mlrun/model_monitoring/db/_stats.py +4 -3
- mlrun/model_monitoring/db/tsdb/base.py +29 -9
- mlrun/model_monitoring/db/tsdb/tdengine/schemas.py +4 -5
- mlrun/model_monitoring/db/tsdb/tdengine/tdengine_connector.py +154 -50
- mlrun/model_monitoring/db/tsdb/tdengine/writer_graph_steps.py +51 -0
- mlrun/model_monitoring/db/tsdb/v3io/stream_graph_steps.py +17 -4
- mlrun/model_monitoring/db/tsdb/v3io/v3io_connector.py +245 -51
- mlrun/model_monitoring/helpers.py +28 -5
- mlrun/model_monitoring/stream_processing.py +45 -14
- mlrun/model_monitoring/writer.py +220 -1
- mlrun/platforms/__init__.py +3 -2
- mlrun/platforms/iguazio.py +7 -3
- mlrun/projects/operations.py +16 -11
- mlrun/projects/pipelines.py +2 -2
- mlrun/projects/project.py +157 -69
- mlrun/run.py +97 -20
- mlrun/runtimes/__init__.py +18 -0
- mlrun/runtimes/base.py +14 -6
- mlrun/runtimes/daskjob.py +1 -0
- mlrun/runtimes/local.py +5 -2
- mlrun/runtimes/mounts.py +20 -2
- mlrun/runtimes/nuclio/__init__.py +1 -0
- mlrun/runtimes/nuclio/application/application.py +147 -17
- mlrun/runtimes/nuclio/function.py +72 -27
- mlrun/runtimes/nuclio/serving.py +102 -20
- mlrun/runtimes/pod.py +213 -21
- mlrun/runtimes/utils.py +49 -9
- mlrun/secrets.py +54 -13
- mlrun/serving/remote.py +79 -6
- mlrun/serving/routers.py +23 -41
- mlrun/serving/server.py +230 -40
- mlrun/serving/states.py +605 -232
- mlrun/serving/steps.py +62 -0
- mlrun/serving/system_steps.py +136 -81
- mlrun/serving/v2_serving.py +9 -10
- mlrun/utils/helpers.py +215 -83
- mlrun/utils/logger.py +3 -1
- mlrun/utils/notifications/notification/base.py +18 -0
- mlrun/utils/notifications/notification/git.py +2 -4
- mlrun/utils/notifications/notification/mail.py +38 -15
- mlrun/utils/notifications/notification/slack.py +2 -4
- mlrun/utils/notifications/notification/webhook.py +2 -5
- mlrun/utils/notifications/notification_pusher.py +1 -1
- mlrun/utils/version/version.json +2 -2
- {mlrun-1.10.0rc16.dist-info → mlrun-1.10.1rc4.dist-info}/METADATA +51 -50
- {mlrun-1.10.0rc16.dist-info → mlrun-1.10.1rc4.dist-info}/RECORD +100 -95
- mlrun/api/schemas/__init__.py +0 -259
- {mlrun-1.10.0rc16.dist-info → mlrun-1.10.1rc4.dist-info}/WHEEL +0 -0
- {mlrun-1.10.0rc16.dist-info → mlrun-1.10.1rc4.dist-info}/entry_points.txt +0 -0
- {mlrun-1.10.0rc16.dist-info → mlrun-1.10.1rc4.dist-info}/licenses/LICENSE +0 -0
- {mlrun-1.10.0rc16.dist-info → mlrun-1.10.1rc4.dist-info}/top_level.txt +0 -0
mlrun/common/secrets.py
CHANGED
|
@@ -11,10 +11,31 @@
|
|
|
11
11
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
12
12
|
# See the License for the specific language governing permissions and
|
|
13
13
|
# limitations under the License.
|
|
14
|
-
|
|
14
|
+
import re
|
|
15
15
|
from abc import ABC, abstractmethod
|
|
16
16
|
|
|
17
17
|
import mlrun.common.schemas
|
|
18
|
+
from mlrun.config import config as mlconf
|
|
19
|
+
|
|
20
|
+
_AUTH_SECRET_NAME_TEMPLATE = re.escape(
|
|
21
|
+
mlconf.secret_stores.kubernetes.auth_secret_name.format(
|
|
22
|
+
hashed_access_key="",
|
|
23
|
+
)
|
|
24
|
+
)
|
|
25
|
+
AUTH_SECRET_PATTERN = re.compile(f"^{_AUTH_SECRET_NAME_TEMPLATE}.*")
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def validate_not_forbidden_secret(secret_name: str) -> None:
|
|
29
|
+
"""
|
|
30
|
+
Forbid client-supplied references to internal MLRun auth/project secrets.
|
|
31
|
+
No-op when running inside the API server (API enrichments are allowed).
|
|
32
|
+
"""
|
|
33
|
+
if not secret_name or mlrun.config.is_running_as_api():
|
|
34
|
+
return
|
|
35
|
+
if AUTH_SECRET_PATTERN.match(secret_name):
|
|
36
|
+
raise mlrun.errors.MLRunInvalidArgumentError(
|
|
37
|
+
f"Forbidden secret '{secret_name}' matches MLRun auth-secret pattern."
|
|
38
|
+
)
|
|
18
39
|
|
|
19
40
|
|
|
20
41
|
class SecretProviderInterface(ABC):
|
mlrun/config.py
CHANGED
|
@@ -66,7 +66,6 @@ default_config = {
|
|
|
66
66
|
"nuclio_version": "",
|
|
67
67
|
"default_nuclio_runtime": "python:3.11",
|
|
68
68
|
"nest_asyncio_enabled": "", # enable import of nest_asyncio for corner cases with old jupyter, set "1"
|
|
69
|
-
"ui_url": "", # remote/external mlrun UI url (for hyperlinks) (This is deprecated in favor of the ui block)
|
|
70
69
|
"remote_host": "",
|
|
71
70
|
"api_base_version": "v1",
|
|
72
71
|
"version": "", # will be set to current version
|
|
@@ -107,7 +106,11 @@ default_config = {
|
|
|
107
106
|
"submit_timeout": "280", # timeout when submitting a new k8s resource
|
|
108
107
|
# runtimes cleanup interval in seconds
|
|
109
108
|
"runtimes_cleanup_interval": "300",
|
|
110
|
-
|
|
109
|
+
# disabled by default due to an internal bug in serving functions
|
|
110
|
+
# relying on a background task to hold the status for its model endpoints
|
|
111
|
+
# TODO: need to refine what/when we can delete the background tasks
|
|
112
|
+
# e.g: use labels or naming convention.
|
|
113
|
+
"background_task_cleanup_interval": "0",
|
|
111
114
|
"background_task_max_age": "21600", # 6 hours in seconds
|
|
112
115
|
"monitoring": {
|
|
113
116
|
"runs": {
|
|
@@ -194,6 +197,7 @@ default_config = {
|
|
|
194
197
|
"v3io_framesd": "http://framesd:8080",
|
|
195
198
|
"model_providers": {
|
|
196
199
|
"openai_default_model": "gpt-4o",
|
|
200
|
+
"huggingface_default_model": "microsoft/Phi-3-mini-4k-instruct",
|
|
197
201
|
},
|
|
198
202
|
# default node selector to be applied to all functions - json string base64 encoded format
|
|
199
203
|
"default_function_node_selector": "e30=",
|
|
@@ -250,7 +254,8 @@ default_config = {
|
|
|
250
254
|
},
|
|
251
255
|
"runtimes": {
|
|
252
256
|
"dask": "600",
|
|
253
|
-
|
|
257
|
+
# cluster start might take some time in case k8s needs to spin up new nodes
|
|
258
|
+
"dask_cluster_start": "600",
|
|
254
259
|
},
|
|
255
260
|
"push_notifications": "60",
|
|
256
261
|
},
|
|
@@ -298,6 +303,7 @@ default_config = {
|
|
|
298
303
|
"application": {
|
|
299
304
|
"default_sidecar_internal_port": 8050,
|
|
300
305
|
"default_authentication_mode": mlrun.common.schemas.APIGatewayAuthenticationMode.none,
|
|
306
|
+
"default_worker_number": 100,
|
|
301
307
|
},
|
|
302
308
|
},
|
|
303
309
|
# TODO: function defaults should be moved to the function spec config above
|
|
@@ -406,11 +412,7 @@ default_config = {
|
|
|
406
412
|
#
|
|
407
413
|
# if set to "nil" or "none", nothing would be set
|
|
408
414
|
"modes": (
|
|
409
|
-
"STRICT_TRANS_TABLES"
|
|
410
|
-
",NO_ZERO_IN_DATE"
|
|
411
|
-
",NO_ZERO_DATE"
|
|
412
|
-
",ERROR_FOR_DIVISION_BY_ZERO"
|
|
413
|
-
",NO_ENGINE_SUBSTITUTION",
|
|
415
|
+
"STRICT_TRANS_TABLES,NO_ZERO_IN_DATE,NO_ZERO_DATE,ERROR_FOR_DIVISION_BY_ZERO,NO_ENGINE_SUBSTITUTION"
|
|
414
416
|
)
|
|
415
417
|
},
|
|
416
418
|
},
|
|
@@ -647,6 +649,13 @@ default_config = {
|
|
|
647
649
|
"max_replicas": 1,
|
|
648
650
|
},
|
|
649
651
|
},
|
|
652
|
+
"writer_graph": {
|
|
653
|
+
"max_events": 1000,
|
|
654
|
+
"flush_after_seconds": 30,
|
|
655
|
+
"writer_version": "v1", # v1 is the sync version while v2 is async
|
|
656
|
+
"parquet_batching_max_events": 10,
|
|
657
|
+
"parquet_batching_timeout_secs": 30,
|
|
658
|
+
},
|
|
650
659
|
# Store prefixes are used to handle model monitoring storing policies based on project and kind, such as events,
|
|
651
660
|
# stream, and endpoints.
|
|
652
661
|
"store_prefixes": {
|
|
@@ -715,9 +724,8 @@ default_config = {
|
|
|
715
724
|
# Set false to avoid creating a global source (for example in a dark site)
|
|
716
725
|
"create": True,
|
|
717
726
|
"name": "default",
|
|
718
|
-
"description": "MLRun
|
|
727
|
+
"description": "MLRun hub",
|
|
719
728
|
"url": "https://mlrun.github.io/marketplace",
|
|
720
|
-
"object_type": "functions",
|
|
721
729
|
"channel": "master",
|
|
722
730
|
},
|
|
723
731
|
},
|
|
@@ -999,9 +1007,9 @@ class Config:
|
|
|
999
1007
|
)
|
|
1000
1008
|
|
|
1001
1009
|
@staticmethod
|
|
1002
|
-
def
|
|
1010
|
+
def get_default_hub_source_url_prefix(object_type) -> str:
|
|
1003
1011
|
default_source = config.hub.default_source
|
|
1004
|
-
return f"{default_source.url}/{
|
|
1012
|
+
return f"{default_source.url}/{object_type}/{default_source.channel}/"
|
|
1005
1013
|
|
|
1006
1014
|
@staticmethod
|
|
1007
1015
|
def decode_base64_config_and_load_to_object(
|
|
@@ -1242,6 +1250,19 @@ class Config:
|
|
|
1242
1250
|
"""
|
|
1243
1251
|
return self.is_running_on_iguazio()
|
|
1244
1252
|
|
|
1253
|
+
@staticmethod
|
|
1254
|
+
def get_run_retry_staleness_threshold_timedelta() -> timedelta:
|
|
1255
|
+
"""
|
|
1256
|
+
Get the staleness threshold in timedelta for run retries.
|
|
1257
|
+
This is used to determine if a run is stale and should be retried.
|
|
1258
|
+
|
|
1259
|
+
:return: The staleness threshold in timedelta.
|
|
1260
|
+
"""
|
|
1261
|
+
staleness_threshold = int(
|
|
1262
|
+
mlrun.mlconf.monitoring.runs.retry.staleness_threshold
|
|
1263
|
+
)
|
|
1264
|
+
return timedelta(minutes=staleness_threshold)
|
|
1265
|
+
|
|
1245
1266
|
def to_dict(self):
|
|
1246
1267
|
return copy.deepcopy(self._cfg)
|
|
1247
1268
|
|
|
@@ -1258,10 +1279,7 @@ class Config:
|
|
|
1258
1279
|
|
|
1259
1280
|
@staticmethod
|
|
1260
1281
|
def resolve_ui_url():
|
|
1261
|
-
|
|
1262
|
-
# since the config class is used in a "recursive" way, we can't use property like we used in other places
|
|
1263
|
-
# since the property will need to be url, which exists in other structs as well
|
|
1264
|
-
return config.ui.url or config.ui_url
|
|
1282
|
+
return config.ui.url
|
|
1265
1283
|
|
|
1266
1284
|
def is_api_running_on_k8s(self):
|
|
1267
1285
|
# determine if the API service is attached to K8s cluster
|
|
@@ -1548,7 +1566,6 @@ def read_env(env=None, prefix=env_prefix):
|
|
|
1548
1566
|
"https://mlrun-api.", "https://framesd."
|
|
1549
1567
|
)
|
|
1550
1568
|
|
|
1551
|
-
uisvc = env.get("MLRUN_UI_SERVICE_HOST")
|
|
1552
1569
|
igz_domain = env.get("IGZ_NAMESPACE_DOMAIN")
|
|
1553
1570
|
|
|
1554
1571
|
# workaround to try and detect IGZ domain
|
|
@@ -1574,10 +1591,6 @@ def read_env(env=None, prefix=env_prefix):
|
|
|
1574
1591
|
if config.get("nuclio_dashboard_url") == "disabled":
|
|
1575
1592
|
config["nuclio_dashboard_url"] = ""
|
|
1576
1593
|
|
|
1577
|
-
if uisvc and not config.get("ui_url"):
|
|
1578
|
-
if igz_domain:
|
|
1579
|
-
config["ui_url"] = f"https://mlrun-ui.{igz_domain}"
|
|
1580
|
-
|
|
1581
1594
|
if log_level := config.get("log_level"):
|
|
1582
1595
|
import mlrun.utils.logger
|
|
1583
1596
|
|
mlrun/datastore/__init__.py
CHANGED
|
@@ -39,10 +39,11 @@ __all__ = [
|
|
|
39
39
|
from urllib.parse import urlparse
|
|
40
40
|
|
|
41
41
|
import fsspec
|
|
42
|
+
import storey
|
|
42
43
|
|
|
43
44
|
import mlrun.datastore.wasbfs
|
|
44
45
|
from mlrun.datastore.datastore_profile import (
|
|
45
|
-
|
|
46
|
+
DatastoreProfileKafkaStream,
|
|
46
47
|
DatastoreProfileKafkaTarget,
|
|
47
48
|
DatastoreProfileV3io,
|
|
48
49
|
)
|
|
@@ -122,7 +123,7 @@ def get_stream_pusher(stream_path: str, **kwargs):
|
|
|
122
123
|
)
|
|
123
124
|
if isinstance(
|
|
124
125
|
datastore_profile,
|
|
125
|
-
(
|
|
126
|
+
(DatastoreProfileKafkaStream, DatastoreProfileKafkaTarget),
|
|
126
127
|
):
|
|
127
128
|
attributes = datastore_profile.attributes()
|
|
128
129
|
brokers = attributes.pop("brokers", None)
|
|
@@ -168,11 +169,12 @@ def get_stream_pusher(stream_path: str, **kwargs):
|
|
|
168
169
|
raise ValueError(f"unsupported stream path {stream_path}")
|
|
169
170
|
|
|
170
171
|
|
|
171
|
-
class _DummyStream:
|
|
172
|
+
class _DummyStream(storey.MapClass):
|
|
172
173
|
"""stream emulator for tests and debug"""
|
|
173
174
|
|
|
174
175
|
def __init__(self, event_list=None, **kwargs):
|
|
175
176
|
self.event_list = event_list or []
|
|
177
|
+
super().__init__(**kwargs)
|
|
176
178
|
|
|
177
179
|
def push(self, data, **kwargs):
|
|
178
180
|
if not isinstance(data, list):
|
|
@@ -180,3 +182,9 @@ class _DummyStream:
|
|
|
180
182
|
for item in data:
|
|
181
183
|
logger.info(f"dummy stream got event: {item}, kwargs={kwargs}")
|
|
182
184
|
self.event_list.append(item)
|
|
185
|
+
|
|
186
|
+
def do(self, event):
|
|
187
|
+
if not isinstance(event, list):
|
|
188
|
+
event = [event]
|
|
189
|
+
for item in event:
|
|
190
|
+
self.event_list.append(item)
|
mlrun/datastore/azure_blob.py
CHANGED
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
# Copyright
|
|
1
|
+
# Copyright 2025 Iguazio
|
|
2
2
|
#
|
|
3
3
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
4
4
|
# you may not use this file except in compliance with the License.
|
|
@@ -12,6 +12,7 @@
|
|
|
12
12
|
# See the License for the specific language governing permissions and
|
|
13
13
|
# limitations under the License.
|
|
14
14
|
|
|
15
|
+
import contextlib
|
|
15
16
|
import time
|
|
16
17
|
from pathlib import Path
|
|
17
18
|
from typing import Optional
|
|
@@ -30,6 +31,40 @@ from .base import DataStore, FileStats, make_datastore_schema_sanitizer
|
|
|
30
31
|
|
|
31
32
|
|
|
32
33
|
class AzureBlobStore(DataStore):
|
|
34
|
+
"""
|
|
35
|
+
Azure Blob Storage datastore implementation.
|
|
36
|
+
|
|
37
|
+
Supports multiple URL schemas: az://, wasbs://, wasb://
|
|
38
|
+
|
|
39
|
+
Supported Connection String Formats:
|
|
40
|
+
====================================
|
|
41
|
+
|
|
42
|
+
1. Account Key (Standard):
|
|
43
|
+
"DefaultEndpointsProtocol=https;AccountName=<account>;AccountKey=<key>;EndpointSuffix=core.windows.net"
|
|
44
|
+
|
|
45
|
+
2. SAS Token:
|
|
46
|
+
"BlobEndpoint=https://<account>.blob.core.windows.net/;SharedAccessSignature=<sas_token>"
|
|
47
|
+
|
|
48
|
+
3. Minimal BlobEndpoint:
|
|
49
|
+
"BlobEndpoint=https://<account>.blob.core.windows.net/;AccountName=<account>;AccountKey=<key>"
|
|
50
|
+
|
|
51
|
+
4. Custom Domain:
|
|
52
|
+
"BlobEndpoint=https://<account>.mydomain.com/;AccountName=<account>;AccountKey=<key>"
|
|
53
|
+
|
|
54
|
+
5. China/Government Cloud:
|
|
55
|
+
"DefaultEndpointsProtocol=https;AccountName=<account>;AccountKey=<key>;EndpointSuffix=core.chinacloudapi.cn"
|
|
56
|
+
|
|
57
|
+
6. Full Service Endpoints with SAS:
|
|
58
|
+
"BlobEndpoint=https://<account>.blob.core.windows.net/;QueueEndpoint=...;SharedAccessSignature=<sas>"
|
|
59
|
+
|
|
60
|
+
Authentication Methods:
|
|
61
|
+
======================
|
|
62
|
+
- Account Key (connection_string or storage_options)
|
|
63
|
+
- SAS Token (connection_string or storage_options)
|
|
64
|
+
- OAuth/Azure AD (storage_options: client_id, client_secret, tenant_id)
|
|
65
|
+
|
|
66
|
+
"""
|
|
67
|
+
|
|
33
68
|
using_bucket = True
|
|
34
69
|
max_concurrency = 100
|
|
35
70
|
max_blocksize = 1024 * 1024 * 4
|
|
@@ -40,6 +75,12 @@ class AzureBlobStore(DataStore):
|
|
|
40
75
|
def __init__(
|
|
41
76
|
self, parent, schema, name, endpoint="", secrets: Optional[dict] = None
|
|
42
77
|
):
|
|
78
|
+
# Extract container from WASBS endpoint before calling super()
|
|
79
|
+
self._container_from_endpoint = None
|
|
80
|
+
if schema in ["wasbs", "wasb"] and endpoint and "@" in endpoint:
|
|
81
|
+
# Handle container@host format
|
|
82
|
+
self._container_from_endpoint, endpoint = endpoint.split("@", 1)
|
|
83
|
+
|
|
43
84
|
super().__init__(parent, name, schema, endpoint, secrets=secrets)
|
|
44
85
|
self._service_client = None
|
|
45
86
|
self._storage_options = None
|
|
@@ -67,6 +108,34 @@ class AzureBlobStore(DataStore):
|
|
|
67
108
|
or self._get_secret_or_env("AZURE_STORAGE_SAS_TOKEN"),
|
|
68
109
|
credential=self._get_secret_or_env("credential"),
|
|
69
110
|
)
|
|
111
|
+
# Use container extracted from WASBS endpoint during initialization
|
|
112
|
+
if self._container_from_endpoint:
|
|
113
|
+
res["container"] = self._container_from_endpoint
|
|
114
|
+
|
|
115
|
+
# For az:// URLs, endpoint contains the container name
|
|
116
|
+
if not res.get("container") and self.kind in ["az"]:
|
|
117
|
+
if container := getattr(self, "endpoint", None):
|
|
118
|
+
res["container"] = container
|
|
119
|
+
|
|
120
|
+
# Last resort: For wasbs:// without container, check if connection string has BlobEndpoint with container
|
|
121
|
+
if not res.get("container") and self.kind in ["wasbs", "wasb"]:
|
|
122
|
+
connection_string = res.get("connection_string")
|
|
123
|
+
if connection_string and "BlobEndpoint=" in connection_string:
|
|
124
|
+
# Try to extract container from BlobEndpoint URL
|
|
125
|
+
for part in connection_string.split(";"):
|
|
126
|
+
if part.startswith("BlobEndpoint="):
|
|
127
|
+
blob_endpoint = part.split("=", 1)[1]
|
|
128
|
+
# Parse URL to get path component
|
|
129
|
+
from urllib.parse import urlparse
|
|
130
|
+
|
|
131
|
+
parsed = urlparse(blob_endpoint)
|
|
132
|
+
if parsed.path and parsed.path.strip("/"):
|
|
133
|
+
# Extract first path segment as container
|
|
134
|
+
path_parts = parsed.path.strip("/").split("/")
|
|
135
|
+
if path_parts[0]:
|
|
136
|
+
res["container"] = path_parts[0]
|
|
137
|
+
break
|
|
138
|
+
|
|
70
139
|
self._storage_options = self._sanitize_options(res)
|
|
71
140
|
return self._storage_options
|
|
72
141
|
|
|
@@ -165,7 +234,18 @@ class AzureBlobStore(DataStore):
|
|
|
165
234
|
# if called without passing dataitem - like in fset.purge_targets,
|
|
166
235
|
# key will include schema.
|
|
167
236
|
if not schema:
|
|
168
|
-
|
|
237
|
+
# For wasbs/wasb, the filesystem is scoped to the container, so we need to use
|
|
238
|
+
# the container name as the base path, not the hostname endpoint.
|
|
239
|
+
# For az://, endpoint already contains the container name.
|
|
240
|
+
if self.kind in ["wasbs", "wasb"]:
|
|
241
|
+
container = self.storage_options.get("container")
|
|
242
|
+
if container:
|
|
243
|
+
key = Path(container, key).as_posix()
|
|
244
|
+
else:
|
|
245
|
+
# If no container found, use endpoint (might be hostname, but better than nothing)
|
|
246
|
+
key = Path(self.endpoint, key).as_posix()
|
|
247
|
+
else:
|
|
248
|
+
key = Path(self.endpoint, key).as_posix()
|
|
169
249
|
return key
|
|
170
250
|
|
|
171
251
|
def upload(self, key, src_path):
|
|
@@ -229,18 +309,27 @@ class AzureBlobStore(DataStore):
|
|
|
229
309
|
st = self.storage_options
|
|
230
310
|
service = "blob"
|
|
231
311
|
primary_url = None
|
|
232
|
-
|
|
312
|
+
|
|
313
|
+
# Parse connection string (fills account_name/account_key or SAS)
|
|
314
|
+
connection_string = st.get("connection_string")
|
|
315
|
+
if connection_string:
|
|
233
316
|
primary_url, _, parsed_credential = parse_connection_str(
|
|
234
|
-
|
|
317
|
+
connection_string, credential=None, service=service
|
|
235
318
|
)
|
|
236
|
-
|
|
237
|
-
|
|
238
|
-
|
|
239
|
-
|
|
319
|
+
|
|
320
|
+
if isinstance(parsed_credential, str):
|
|
321
|
+
# SharedAccessSignature as raw string
|
|
322
|
+
parsed_credential = {"sas_token": parsed_credential}
|
|
323
|
+
|
|
324
|
+
for key in ["account_name", "account_key", "sas_token"]:
|
|
325
|
+
if parsed_value := parsed_credential.get(key):
|
|
326
|
+
# Only check for conflicts if storage options has a non-empty value for this key
|
|
327
|
+
existing_value = st.get(key)
|
|
328
|
+
if existing_value and existing_value != parsed_value:
|
|
240
329
|
if key == "account_name":
|
|
241
330
|
raise mlrun.errors.MLRunInvalidArgumentError(
|
|
242
|
-
f"Storage option for '{key}' is '{
|
|
243
|
-
|
|
331
|
+
f"Storage option for '{key}' is '{existing_value}', "
|
|
332
|
+
f"which does not match corresponding connection string '{parsed_value}'"
|
|
244
333
|
)
|
|
245
334
|
else:
|
|
246
335
|
raise mlrun.errors.MLRunInvalidArgumentError(
|
|
@@ -249,57 +338,83 @@ class AzureBlobStore(DataStore):
|
|
|
249
338
|
st[key] = parsed_value
|
|
250
339
|
|
|
251
340
|
account_name = st.get("account_name")
|
|
341
|
+
# Derive host (prefer connection string primary URL)
|
|
252
342
|
if primary_url:
|
|
253
343
|
if primary_url.startswith("http://"):
|
|
254
344
|
primary_url = primary_url[len("http://") :]
|
|
255
345
|
if primary_url.startswith("https://"):
|
|
256
346
|
primary_url = primary_url[len("https://") :]
|
|
257
|
-
|
|
347
|
+
# Remove any path components from the host
|
|
348
|
+
host = primary_url.split("/")[0]
|
|
258
349
|
elif account_name:
|
|
259
350
|
host = f"{account_name}.{service}.core.windows.net"
|
|
260
351
|
else:
|
|
352
|
+
# nothing to configure yet
|
|
261
353
|
return res
|
|
262
354
|
|
|
263
|
-
|
|
355
|
+
host = host.rstrip("/")
|
|
356
|
+
|
|
357
|
+
# Account key (optional; WASB supports it)
|
|
358
|
+
if "account_key" in st and st["account_key"]:
|
|
264
359
|
res[f"spark.hadoop.fs.azure.account.key.{host}"] = st["account_key"]
|
|
265
360
|
|
|
266
|
-
|
|
267
|
-
|
|
268
|
-
|
|
269
|
-
"org.apache.hadoop.fs.azurebfs.oauth2.ClientCredsTokenProvider"
|
|
270
|
-
)
|
|
271
|
-
if "client_id" in st:
|
|
272
|
-
res[f"spark.hadoop.fs.azure.account.oauth2.client.id.{host}"] = st[
|
|
273
|
-
"client_id"
|
|
274
|
-
]
|
|
275
|
-
if "client_secret" in st:
|
|
276
|
-
res[f"spark.hadoop.fs.azure.account.oauth2.client.secret.{host}"] = st[
|
|
277
|
-
"client_secret"
|
|
278
|
-
]
|
|
279
|
-
if "tenant_id" in st:
|
|
280
|
-
tenant_id = st["tenant_id"]
|
|
281
|
-
res[f"spark.hadoop.fs.azure.account.oauth2.client.endpoint.{host}"] = (
|
|
282
|
-
f"https://login.microsoftonline.com/{tenant_id}/oauth2/token"
|
|
283
|
-
)
|
|
361
|
+
# --- WASB + SAS (container-scoped key; no provider classes needed) ---
|
|
362
|
+
if "sas_token" in st and st["sas_token"]:
|
|
363
|
+
sas = st["sas_token"].lstrip("?")
|
|
284
364
|
|
|
285
|
-
|
|
286
|
-
|
|
287
|
-
|
|
288
|
-
|
|
289
|
-
|
|
290
|
-
|
|
365
|
+
container = st.get("container")
|
|
366
|
+
|
|
367
|
+
if container:
|
|
368
|
+
# fs.azure.sas.<container>.<account>.blob.core.windows.net = <sas>
|
|
369
|
+
res[f"spark.hadoop.fs.azure.sas.{container}.{host}"] = sas
|
|
370
|
+
|
|
371
|
+
else:
|
|
372
|
+
raise mlrun.errors.MLRunInvalidArgumentError(
|
|
373
|
+
"Container name is required for WASB SAS. "
|
|
374
|
+
"Set self.endpoint or storage_options['container']."
|
|
375
|
+
)
|
|
291
376
|
return res
|
|
292
377
|
|
|
293
378
|
@property
|
|
294
379
|
def spark_url(self):
|
|
295
|
-
|
|
296
|
-
|
|
297
|
-
|
|
298
|
-
|
|
299
|
-
|
|
300
|
-
|
|
301
|
-
|
|
302
|
-
|
|
303
|
-
|
|
304
|
-
|
|
305
|
-
|
|
380
|
+
# Build: wasbs://<container>@<host>
|
|
381
|
+
st = self.storage_options
|
|
382
|
+
service = "blob"
|
|
383
|
+
|
|
384
|
+
container = st.get("container")
|
|
385
|
+
|
|
386
|
+
if not container:
|
|
387
|
+
raise mlrun.errors.MLRunInvalidArgumentError(
|
|
388
|
+
"Container name is required to build the WASB URL. "
|
|
389
|
+
"Set storage_options['container'] or use datastore profile with container specified."
|
|
390
|
+
)
|
|
391
|
+
|
|
392
|
+
# Prefer host from connection string; else synthesize from account_name
|
|
393
|
+
host = None
|
|
394
|
+
account_name = st.get("account_name")
|
|
395
|
+
connection_string = st.get("connection_string")
|
|
396
|
+
|
|
397
|
+
if connection_string:
|
|
398
|
+
with contextlib.suppress(Exception):
|
|
399
|
+
primary_url, _, _ = parse_connection_str(
|
|
400
|
+
connection_string, credential=None, service=service
|
|
401
|
+
)
|
|
402
|
+
if primary_url.startswith("http://"):
|
|
403
|
+
primary_url = primary_url[len("http://") :]
|
|
404
|
+
if primary_url.startswith("https://"):
|
|
405
|
+
primary_url = primary_url[len("https://") :]
|
|
406
|
+
# Remove any path components from the host
|
|
407
|
+
host = primary_url.split("/")[0].rstrip("/")
|
|
408
|
+
if not host and account_name:
|
|
409
|
+
host = f"{account_name}.{service}.core.windows.net"
|
|
410
|
+
|
|
411
|
+
# For wasbs:// URLs where endpoint is already the host
|
|
412
|
+
if not host and self.kind in ["wasbs", "wasb"] and hasattr(self, "endpoint"):
|
|
413
|
+
host = getattr(self, "endpoint", None)
|
|
414
|
+
|
|
415
|
+
if not host:
|
|
416
|
+
raise mlrun.errors.MLRunInvalidArgumentError(
|
|
417
|
+
"account_name is required (or provide a connection_string) to build the WASB URL."
|
|
418
|
+
)
|
|
419
|
+
|
|
420
|
+
return f"wasbs://{container}@{host}"
|