mlrun 1.10.0rc16__py3-none-any.whl → 1.10.1rc4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of mlrun might be problematic. Click here for more details.

Files changed (101)
  1. mlrun/__init__.py +22 -2
  2. mlrun/artifacts/document.py +6 -1
  3. mlrun/artifacts/llm_prompt.py +21 -15
  4. mlrun/artifacts/model.py +3 -3
  5. mlrun/common/constants.py +9 -0
  6. mlrun/common/formatters/artifact.py +1 -0
  7. mlrun/common/model_monitoring/helpers.py +86 -0
  8. mlrun/common/schemas/__init__.py +2 -0
  9. mlrun/common/schemas/auth.py +2 -0
  10. mlrun/common/schemas/function.py +10 -0
  11. mlrun/common/schemas/hub.py +30 -18
  12. mlrun/common/schemas/model_monitoring/__init__.py +2 -0
  13. mlrun/common/schemas/model_monitoring/constants.py +30 -6
  14. mlrun/common/schemas/model_monitoring/functions.py +13 -4
  15. mlrun/common/schemas/model_monitoring/model_endpoints.py +11 -0
  16. mlrun/common/schemas/pipeline.py +1 -1
  17. mlrun/common/schemas/serving.py +3 -0
  18. mlrun/common/schemas/workflow.py +1 -0
  19. mlrun/common/secrets.py +22 -1
  20. mlrun/config.py +34 -21
  21. mlrun/datastore/__init__.py +11 -3
  22. mlrun/datastore/azure_blob.py +162 -47
  23. mlrun/datastore/base.py +265 -7
  24. mlrun/datastore/datastore.py +10 -5
  25. mlrun/datastore/datastore_profile.py +61 -5
  26. mlrun/datastore/model_provider/huggingface_provider.py +367 -0
  27. mlrun/datastore/model_provider/mock_model_provider.py +87 -0
  28. mlrun/datastore/model_provider/model_provider.py +211 -74
  29. mlrun/datastore/model_provider/openai_provider.py +243 -71
  30. mlrun/datastore/s3.py +24 -2
  31. mlrun/datastore/store_resources.py +4 -4
  32. mlrun/datastore/storeytargets.py +2 -3
  33. mlrun/datastore/utils.py +15 -3
  34. mlrun/db/base.py +27 -19
  35. mlrun/db/httpdb.py +57 -48
  36. mlrun/db/nopdb.py +25 -10
  37. mlrun/execution.py +55 -13
  38. mlrun/hub/__init__.py +15 -0
  39. mlrun/hub/module.py +181 -0
  40. mlrun/k8s_utils.py +105 -16
  41. mlrun/launcher/base.py +13 -6
  42. mlrun/launcher/local.py +2 -0
  43. mlrun/model.py +9 -3
  44. mlrun/model_monitoring/api.py +66 -27
  45. mlrun/model_monitoring/applications/__init__.py +1 -1
  46. mlrun/model_monitoring/applications/base.py +388 -138
  47. mlrun/model_monitoring/applications/context.py +2 -4
  48. mlrun/model_monitoring/applications/results.py +4 -7
  49. mlrun/model_monitoring/controller.py +239 -101
  50. mlrun/model_monitoring/db/_schedules.py +36 -13
  51. mlrun/model_monitoring/db/_stats.py +4 -3
  52. mlrun/model_monitoring/db/tsdb/base.py +29 -9
  53. mlrun/model_monitoring/db/tsdb/tdengine/schemas.py +4 -5
  54. mlrun/model_monitoring/db/tsdb/tdengine/tdengine_connector.py +154 -50
  55. mlrun/model_monitoring/db/tsdb/tdengine/writer_graph_steps.py +51 -0
  56. mlrun/model_monitoring/db/tsdb/v3io/stream_graph_steps.py +17 -4
  57. mlrun/model_monitoring/db/tsdb/v3io/v3io_connector.py +245 -51
  58. mlrun/model_monitoring/helpers.py +28 -5
  59. mlrun/model_monitoring/stream_processing.py +45 -14
  60. mlrun/model_monitoring/writer.py +220 -1
  61. mlrun/platforms/__init__.py +3 -2
  62. mlrun/platforms/iguazio.py +7 -3
  63. mlrun/projects/operations.py +16 -11
  64. mlrun/projects/pipelines.py +2 -2
  65. mlrun/projects/project.py +157 -69
  66. mlrun/run.py +97 -20
  67. mlrun/runtimes/__init__.py +18 -0
  68. mlrun/runtimes/base.py +14 -6
  69. mlrun/runtimes/daskjob.py +1 -0
  70. mlrun/runtimes/local.py +5 -2
  71. mlrun/runtimes/mounts.py +20 -2
  72. mlrun/runtimes/nuclio/__init__.py +1 -0
  73. mlrun/runtimes/nuclio/application/application.py +147 -17
  74. mlrun/runtimes/nuclio/function.py +72 -27
  75. mlrun/runtimes/nuclio/serving.py +102 -20
  76. mlrun/runtimes/pod.py +213 -21
  77. mlrun/runtimes/utils.py +49 -9
  78. mlrun/secrets.py +54 -13
  79. mlrun/serving/remote.py +79 -6
  80. mlrun/serving/routers.py +23 -41
  81. mlrun/serving/server.py +230 -40
  82. mlrun/serving/states.py +605 -232
  83. mlrun/serving/steps.py +62 -0
  84. mlrun/serving/system_steps.py +136 -81
  85. mlrun/serving/v2_serving.py +9 -10
  86. mlrun/utils/helpers.py +215 -83
  87. mlrun/utils/logger.py +3 -1
  88. mlrun/utils/notifications/notification/base.py +18 -0
  89. mlrun/utils/notifications/notification/git.py +2 -4
  90. mlrun/utils/notifications/notification/mail.py +38 -15
  91. mlrun/utils/notifications/notification/slack.py +2 -4
  92. mlrun/utils/notifications/notification/webhook.py +2 -5
  93. mlrun/utils/notifications/notification_pusher.py +1 -1
  94. mlrun/utils/version/version.json +2 -2
  95. {mlrun-1.10.0rc16.dist-info → mlrun-1.10.1rc4.dist-info}/METADATA +51 -50
  96. {mlrun-1.10.0rc16.dist-info → mlrun-1.10.1rc4.dist-info}/RECORD +100 -95
  97. mlrun/api/schemas/__init__.py +0 -259
  98. {mlrun-1.10.0rc16.dist-info → mlrun-1.10.1rc4.dist-info}/WHEEL +0 -0
  99. {mlrun-1.10.0rc16.dist-info → mlrun-1.10.1rc4.dist-info}/entry_points.txt +0 -0
  100. {mlrun-1.10.0rc16.dist-info → mlrun-1.10.1rc4.dist-info}/licenses/LICENSE +0 -0
  101. {mlrun-1.10.0rc16.dist-info → mlrun-1.10.1rc4.dist-info}/top_level.txt +0 -0
mlrun/common/secrets.py CHANGED
@@ -11,10 +11,31 @@
11
11
  # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
12
  # See the License for the specific language governing permissions and
13
13
  # limitations under the License.
14
-
14
+ import re
15
15
  from abc import ABC, abstractmethod
16
16
 
17
17
  import mlrun.common.schemas
18
+ from mlrun.config import config as mlconf
19
+
20
+ _AUTH_SECRET_NAME_TEMPLATE = re.escape(
21
+ mlconf.secret_stores.kubernetes.auth_secret_name.format(
22
+ hashed_access_key="",
23
+ )
24
+ )
25
+ AUTH_SECRET_PATTERN = re.compile(f"^{_AUTH_SECRET_NAME_TEMPLATE}.*")
26
+
27
+
28
+ def validate_not_forbidden_secret(secret_name: str) -> None:
29
+ """
30
+ Forbid client-supplied references to internal MLRun auth/project secrets.
31
+ No-op when running inside the API server (API enrichments are allowed).
32
+ """
33
+ if not secret_name or mlrun.config.is_running_as_api():
34
+ return
35
+ if AUTH_SECRET_PATTERN.match(secret_name):
36
+ raise mlrun.errors.MLRunInvalidArgumentError(
37
+ f"Forbidden secret '{secret_name}' matches MLRun auth-secret pattern."
38
+ )
18
39
 
19
40
 
20
41
  class SecretProviderInterface(ABC):
mlrun/config.py CHANGED
@@ -66,7 +66,6 @@ default_config = {
66
66
  "nuclio_version": "",
67
67
  "default_nuclio_runtime": "python:3.11",
68
68
  "nest_asyncio_enabled": "", # enable import of nest_asyncio for corner cases with old jupyter, set "1"
69
- "ui_url": "", # remote/external mlrun UI url (for hyperlinks) (This is deprecated in favor of the ui block)
70
69
  "remote_host": "",
71
70
  "api_base_version": "v1",
72
71
  "version": "", # will be set to current version
@@ -107,7 +106,11 @@ default_config = {
107
106
  "submit_timeout": "280", # timeout when submitting a new k8s resource
108
107
  # runtimes cleanup interval in seconds
109
108
  "runtimes_cleanup_interval": "300",
110
- "background_task_cleanup_interval": "86400", # 24 hours in seconds
109
+ # disabled by default due to an internal bug in serving functions
110
+ # relying on a background task to hold the status for its model endpoints
111
+ # TODO: need to refine what/when we can delete the background tasks
112
+ # e.g: use labels or naming convention.
113
+ "background_task_cleanup_interval": "0",
111
114
  "background_task_max_age": "21600", # 6 hours in seconds
112
115
  "monitoring": {
113
116
  "runs": {
@@ -194,6 +197,7 @@ default_config = {
194
197
  "v3io_framesd": "http://framesd:8080",
195
198
  "model_providers": {
196
199
  "openai_default_model": "gpt-4o",
200
+ "huggingface_default_model": "microsoft/Phi-3-mini-4k-instruct",
197
201
  },
198
202
  # default node selector to be applied to all functions - json string base64 encoded format
199
203
  "default_function_node_selector": "e30=",
@@ -250,7 +254,8 @@ default_config = {
250
254
  },
251
255
  "runtimes": {
252
256
  "dask": "600",
253
- "dask_cluster_start": "300",
257
+ # cluster start might take some time in case k8s needs to spin up new nodes
258
+ "dask_cluster_start": "600",
254
259
  },
255
260
  "push_notifications": "60",
256
261
  },
@@ -298,6 +303,7 @@ default_config = {
298
303
  "application": {
299
304
  "default_sidecar_internal_port": 8050,
300
305
  "default_authentication_mode": mlrun.common.schemas.APIGatewayAuthenticationMode.none,
306
+ "default_worker_number": 100,
301
307
  },
302
308
  },
303
309
  # TODO: function defaults should be moved to the function spec config above
@@ -406,11 +412,7 @@ default_config = {
406
412
  #
407
413
  # if set to "nil" or "none", nothing would be set
408
414
  "modes": (
409
- "STRICT_TRANS_TABLES"
410
- ",NO_ZERO_IN_DATE"
411
- ",NO_ZERO_DATE"
412
- ",ERROR_FOR_DIVISION_BY_ZERO"
413
- ",NO_ENGINE_SUBSTITUTION",
415
+ "STRICT_TRANS_TABLES,NO_ZERO_IN_DATE,NO_ZERO_DATE,ERROR_FOR_DIVISION_BY_ZERO,NO_ENGINE_SUBSTITUTION"
414
416
  )
415
417
  },
416
418
  },
@@ -647,6 +649,13 @@ default_config = {
647
649
  "max_replicas": 1,
648
650
  },
649
651
  },
652
+ "writer_graph": {
653
+ "max_events": 1000,
654
+ "flush_after_seconds": 30,
655
+ "writer_version": "v1", # v1 is the sync version while v2 is async
656
+ "parquet_batching_max_events": 10,
657
+ "parquet_batching_timeout_secs": 30,
658
+ },
650
659
  # Store prefixes are used to handle model monitoring storing policies based on project and kind, such as events,
651
660
  # stream, and endpoints.
652
661
  "store_prefixes": {
@@ -715,9 +724,8 @@ default_config = {
715
724
  # Set false to avoid creating a global source (for example in a dark site)
716
725
  "create": True,
717
726
  "name": "default",
718
- "description": "MLRun global function hub",
727
+ "description": "MLRun hub",
719
728
  "url": "https://mlrun.github.io/marketplace",
720
- "object_type": "functions",
721
729
  "channel": "master",
722
730
  },
723
731
  },
@@ -999,9 +1007,9 @@ class Config:
999
1007
  )
1000
1008
 
1001
1009
  @staticmethod
1002
- def get_default_hub_source() -> str:
1010
+ def get_default_hub_source_url_prefix(object_type) -> str:
1003
1011
  default_source = config.hub.default_source
1004
- return f"{default_source.url}/{default_source.object_type}/{default_source.channel}/"
1012
+ return f"{default_source.url}/{object_type}/{default_source.channel}/"
1005
1013
 
1006
1014
  @staticmethod
1007
1015
  def decode_base64_config_and_load_to_object(
@@ -1242,6 +1250,19 @@ class Config:
1242
1250
  """
1243
1251
  return self.is_running_on_iguazio()
1244
1252
 
1253
+ @staticmethod
1254
+ def get_run_retry_staleness_threshold_timedelta() -> timedelta:
1255
+ """
1256
+ Get the staleness threshold in timedelta for run retries.
1257
+ This is used to determine if a run is stale and should be retried.
1258
+
1259
+ :return: The staleness threshold in timedelta.
1260
+ """
1261
+ staleness_threshold = int(
1262
+ mlrun.mlconf.monitoring.runs.retry.staleness_threshold
1263
+ )
1264
+ return timedelta(minutes=staleness_threshold)
1265
+
1245
1266
  def to_dict(self):
1246
1267
  return copy.deepcopy(self._cfg)
1247
1268
 
@@ -1258,10 +1279,7 @@ class Config:
1258
1279
 
1259
1280
  @staticmethod
1260
1281
  def resolve_ui_url():
1261
- # ui_url is deprecated in favor of the ui.url (we created the ui block)
1262
- # since the config class is used in a "recursive" way, we can't use property like we used in other places
1263
- # since the property will need to be url, which exists in other structs as well
1264
- return config.ui.url or config.ui_url
1282
+ return config.ui.url
1265
1283
 
1266
1284
  def is_api_running_on_k8s(self):
1267
1285
  # determine if the API service is attached to K8s cluster
@@ -1548,7 +1566,6 @@ def read_env(env=None, prefix=env_prefix):
1548
1566
  "https://mlrun-api.", "https://framesd."
1549
1567
  )
1550
1568
 
1551
- uisvc = env.get("MLRUN_UI_SERVICE_HOST")
1552
1569
  igz_domain = env.get("IGZ_NAMESPACE_DOMAIN")
1553
1570
 
1554
1571
  # workaround to try and detect IGZ domain
@@ -1574,10 +1591,6 @@ def read_env(env=None, prefix=env_prefix):
1574
1591
  if config.get("nuclio_dashboard_url") == "disabled":
1575
1592
  config["nuclio_dashboard_url"] = ""
1576
1593
 
1577
- if uisvc and not config.get("ui_url"):
1578
- if igz_domain:
1579
- config["ui_url"] = f"https://mlrun-ui.{igz_domain}"
1580
-
1581
1594
  if log_level := config.get("log_level"):
1582
1595
  import mlrun.utils.logger
1583
1596
 
@@ -39,10 +39,11 @@ __all__ = [
39
39
  from urllib.parse import urlparse
40
40
 
41
41
  import fsspec
42
+ import storey
42
43
 
43
44
  import mlrun.datastore.wasbfs
44
45
  from mlrun.datastore.datastore_profile import (
45
- DatastoreProfileKafkaSource,
46
+ DatastoreProfileKafkaStream,
46
47
  DatastoreProfileKafkaTarget,
47
48
  DatastoreProfileV3io,
48
49
  )
@@ -122,7 +123,7 @@ def get_stream_pusher(stream_path: str, **kwargs):
122
123
  )
123
124
  if isinstance(
124
125
  datastore_profile,
125
- (DatastoreProfileKafkaSource, DatastoreProfileKafkaTarget),
126
+ (DatastoreProfileKafkaStream, DatastoreProfileKafkaTarget),
126
127
  ):
127
128
  attributes = datastore_profile.attributes()
128
129
  brokers = attributes.pop("brokers", None)
@@ -168,11 +169,12 @@ def get_stream_pusher(stream_path: str, **kwargs):
168
169
  raise ValueError(f"unsupported stream path {stream_path}")
169
170
 
170
171
 
171
- class _DummyStream:
172
+ class _DummyStream(storey.MapClass):
172
173
  """stream emulator for tests and debug"""
173
174
 
174
175
  def __init__(self, event_list=None, **kwargs):
175
176
  self.event_list = event_list or []
177
+ super().__init__(**kwargs)
176
178
 
177
179
  def push(self, data, **kwargs):
178
180
  if not isinstance(data, list):
@@ -180,3 +182,9 @@ class _DummyStream:
180
182
  for item in data:
181
183
  logger.info(f"dummy stream got event: {item}, kwargs={kwargs}")
182
184
  self.event_list.append(item)
185
+
186
+ def do(self, event):
187
+ if not isinstance(event, list):
188
+ event = [event]
189
+ for item in event:
190
+ self.event_list.append(item)
@@ -1,4 +1,4 @@
1
- # Copyright 2023 Iguazio
1
+ # Copyright 2025 Iguazio
2
2
  #
3
3
  # Licensed under the Apache License, Version 2.0 (the "License");
4
4
  # you may not use this file except in compliance with the License.
@@ -12,6 +12,7 @@
12
12
  # See the License for the specific language governing permissions and
13
13
  # limitations under the License.
14
14
 
15
+ import contextlib
15
16
  import time
16
17
  from pathlib import Path
17
18
  from typing import Optional
@@ -30,6 +31,40 @@ from .base import DataStore, FileStats, make_datastore_schema_sanitizer
30
31
 
31
32
 
32
33
  class AzureBlobStore(DataStore):
34
+ """
35
+ Azure Blob Storage datastore implementation.
36
+
37
+ Supports multiple URL schemas: az://, wasbs://, wasb://
38
+
39
+ Supported Connection String Formats:
40
+ ====================================
41
+
42
+ 1. Account Key (Standard):
43
+ "DefaultEndpointsProtocol=https;AccountName=<account>;AccountKey=<key>;EndpointSuffix=core.windows.net"
44
+
45
+ 2. SAS Token:
46
+ "BlobEndpoint=https://<account>.blob.core.windows.net/;SharedAccessSignature=<sas_token>"
47
+
48
+ 3. Minimal BlobEndpoint:
49
+ "BlobEndpoint=https://<account>.blob.core.windows.net/;AccountName=<account>;AccountKey=<key>"
50
+
51
+ 4. Custom Domain:
52
+ "BlobEndpoint=https://<account>.mydomain.com/;AccountName=<account>;AccountKey=<key>"
53
+
54
+ 5. China/Government Cloud:
55
+ "DefaultEndpointsProtocol=https;AccountName=<account>;AccountKey=<key>;EndpointSuffix=core.chinacloudapi.cn"
56
+
57
+ 6. Full Service Endpoints with SAS:
58
+ "BlobEndpoint=https://<account>.blob.core.windows.net/;QueueEndpoint=...;SharedAccessSignature=<sas>"
59
+
60
+ Authentication Methods:
61
+ ======================
62
+ - Account Key (connection_string or storage_options)
63
+ - SAS Token (connection_string or storage_options)
64
+ - OAuth/Azure AD (storage_options: client_id, client_secret, tenant_id)
65
+
66
+ """
67
+
33
68
  using_bucket = True
34
69
  max_concurrency = 100
35
70
  max_blocksize = 1024 * 1024 * 4
@@ -40,6 +75,12 @@ class AzureBlobStore(DataStore):
40
75
  def __init__(
41
76
  self, parent, schema, name, endpoint="", secrets: Optional[dict] = None
42
77
  ):
78
+ # Extract container from WASBS endpoint before calling super()
79
+ self._container_from_endpoint = None
80
+ if schema in ["wasbs", "wasb"] and endpoint and "@" in endpoint:
81
+ # Handle container@host format
82
+ self._container_from_endpoint, endpoint = endpoint.split("@", 1)
83
+
43
84
  super().__init__(parent, name, schema, endpoint, secrets=secrets)
44
85
  self._service_client = None
45
86
  self._storage_options = None
@@ -67,6 +108,34 @@ class AzureBlobStore(DataStore):
67
108
  or self._get_secret_or_env("AZURE_STORAGE_SAS_TOKEN"),
68
109
  credential=self._get_secret_or_env("credential"),
69
110
  )
111
+ # Use container extracted from WASBS endpoint during initialization
112
+ if self._container_from_endpoint:
113
+ res["container"] = self._container_from_endpoint
114
+
115
+ # For az:// URLs, endpoint contains the container name
116
+ if not res.get("container") and self.kind in ["az"]:
117
+ if container := getattr(self, "endpoint", None):
118
+ res["container"] = container
119
+
120
+ # Last resort: For wasbs:// without container, check if connection string has BlobEndpoint with container
121
+ if not res.get("container") and self.kind in ["wasbs", "wasb"]:
122
+ connection_string = res.get("connection_string")
123
+ if connection_string and "BlobEndpoint=" in connection_string:
124
+ # Try to extract container from BlobEndpoint URL
125
+ for part in connection_string.split(";"):
126
+ if part.startswith("BlobEndpoint="):
127
+ blob_endpoint = part.split("=", 1)[1]
128
+ # Parse URL to get path component
129
+ from urllib.parse import urlparse
130
+
131
+ parsed = urlparse(blob_endpoint)
132
+ if parsed.path and parsed.path.strip("/"):
133
+ # Extract first path segment as container
134
+ path_parts = parsed.path.strip("/").split("/")
135
+ if path_parts[0]:
136
+ res["container"] = path_parts[0]
137
+ break
138
+
70
139
  self._storage_options = self._sanitize_options(res)
71
140
  return self._storage_options
72
141
 
@@ -165,7 +234,18 @@ class AzureBlobStore(DataStore):
165
234
  # if called without passing dataitem - like in fset.purge_targets,
166
235
  # key will include schema.
167
236
  if not schema:
168
- key = Path(self.endpoint, key).as_posix()
237
+ # For wasbs/wasb, the filesystem is scoped to the container, so we need to use
238
+ # the container name as the base path, not the hostname endpoint.
239
+ # For az://, endpoint already contains the container name.
240
+ if self.kind in ["wasbs", "wasb"]:
241
+ container = self.storage_options.get("container")
242
+ if container:
243
+ key = Path(container, key).as_posix()
244
+ else:
245
+ # If no container found, use endpoint (might be hostname, but better than nothing)
246
+ key = Path(self.endpoint, key).as_posix()
247
+ else:
248
+ key = Path(self.endpoint, key).as_posix()
169
249
  return key
170
250
 
171
251
  def upload(self, key, src_path):
@@ -229,18 +309,27 @@ class AzureBlobStore(DataStore):
229
309
  st = self.storage_options
230
310
  service = "blob"
231
311
  primary_url = None
232
- if st.get("connection_string"):
312
+
313
+ # Parse connection string (fills account_name/account_key or SAS)
314
+ connection_string = st.get("connection_string")
315
+ if connection_string:
233
316
  primary_url, _, parsed_credential = parse_connection_str(
234
- st.get("connection_string"), credential=None, service=service
317
+ connection_string, credential=None, service=service
235
318
  )
236
- for key in ["account_name", "account_key"]:
237
- parsed_value = parsed_credential.get(key)
238
- if parsed_value:
239
- if key in st and st[key] != parsed_value:
319
+
320
+ if isinstance(parsed_credential, str):
321
+ # SharedAccessSignature as raw string
322
+ parsed_credential = {"sas_token": parsed_credential}
323
+
324
+ for key in ["account_name", "account_key", "sas_token"]:
325
+ if parsed_value := parsed_credential.get(key):
326
+ # Only check for conflicts if storage options has a non-empty value for this key
327
+ existing_value = st.get(key)
328
+ if existing_value and existing_value != parsed_value:
240
329
  if key == "account_name":
241
330
  raise mlrun.errors.MLRunInvalidArgumentError(
242
- f"Storage option for '{key}' is '{st[key]}',\
243
- which does not match corresponding connection string '{parsed_value}'"
331
+ f"Storage option for '{key}' is '{existing_value}', "
332
+ f"which does not match corresponding connection string '{parsed_value}'"
244
333
  )
245
334
  else:
246
335
  raise mlrun.errors.MLRunInvalidArgumentError(
@@ -249,57 +338,83 @@ class AzureBlobStore(DataStore):
249
338
  st[key] = parsed_value
250
339
 
251
340
  account_name = st.get("account_name")
341
+ # Derive host (prefer connection string primary URL)
252
342
  if primary_url:
253
343
  if primary_url.startswith("http://"):
254
344
  primary_url = primary_url[len("http://") :]
255
345
  if primary_url.startswith("https://"):
256
346
  primary_url = primary_url[len("https://") :]
257
- host = primary_url
347
+ # Remove any path components from the host
348
+ host = primary_url.split("/")[0]
258
349
  elif account_name:
259
350
  host = f"{account_name}.{service}.core.windows.net"
260
351
  else:
352
+ # nothing to configure yet
261
353
  return res
262
354
 
263
- if "account_key" in st:
355
+ host = host.rstrip("/")
356
+
357
+ # Account key (optional; WASB supports it)
358
+ if "account_key" in st and st["account_key"]:
264
359
  res[f"spark.hadoop.fs.azure.account.key.{host}"] = st["account_key"]
265
360
 
266
- if "client_secret" in st or "client_id" in st or "tenant_id" in st:
267
- res[f"spark.hadoop.fs.azure.account.auth.type.{host}"] = "OAuth"
268
- res[f"spark.hadoop.fs.azure.account.oauth.provider.type.{host}"] = (
269
- "org.apache.hadoop.fs.azurebfs.oauth2.ClientCredsTokenProvider"
270
- )
271
- if "client_id" in st:
272
- res[f"spark.hadoop.fs.azure.account.oauth2.client.id.{host}"] = st[
273
- "client_id"
274
- ]
275
- if "client_secret" in st:
276
- res[f"spark.hadoop.fs.azure.account.oauth2.client.secret.{host}"] = st[
277
- "client_secret"
278
- ]
279
- if "tenant_id" in st:
280
- tenant_id = st["tenant_id"]
281
- res[f"spark.hadoop.fs.azure.account.oauth2.client.endpoint.{host}"] = (
282
- f"https://login.microsoftonline.com/{tenant_id}/oauth2/token"
283
- )
361
+ # --- WASB + SAS (container-scoped key; no provider classes needed) ---
362
+ if "sas_token" in st and st["sas_token"]:
363
+ sas = st["sas_token"].lstrip("?")
284
364
 
285
- if "sas_token" in st:
286
- res[f"spark.hadoop.fs.azure.account.auth.type.{host}"] = "SAS"
287
- res[f"spark.hadoop.fs.azure.sas.token.provider.type.{host}"] = (
288
- "org.apache.hadoop.fs.azurebfs.sas.FixedSASTokenProvider"
289
- )
290
- res[f"spark.hadoop.fs.azure.sas.fixed.token.{host}"] = st["sas_token"]
365
+ container = st.get("container")
366
+
367
+ if container:
368
+ # fs.azure.sas.<container>.<account>.blob.core.windows.net = <sas>
369
+ res[f"spark.hadoop.fs.azure.sas.{container}.{host}"] = sas
370
+
371
+ else:
372
+ raise mlrun.errors.MLRunInvalidArgumentError(
373
+ "Container name is required for WASB SAS. "
374
+ "Set self.endpoint or storage_options['container']."
375
+ )
291
376
  return res
292
377
 
293
378
  @property
294
379
  def spark_url(self):
295
- spark_options = self.get_spark_options()
296
- url = f"wasbs://{self.endpoint}"
297
- prefix = "spark.hadoop.fs.azure.account.key."
298
- if spark_options:
299
- for key in spark_options:
300
- if key.startswith(prefix):
301
- account_key = key[len(prefix) :]
302
- if not url.endswith(account_key):
303
- url += f"@{account_key}"
304
- break
305
- return url
380
+ # Build: wasbs://<container>@<host>
381
+ st = self.storage_options
382
+ service = "blob"
383
+
384
+ container = st.get("container")
385
+
386
+ if not container:
387
+ raise mlrun.errors.MLRunInvalidArgumentError(
388
+ "Container name is required to build the WASB URL. "
389
+ "Set storage_options['container'] or use datastore profile with container specified."
390
+ )
391
+
392
+ # Prefer host from connection string; else synthesize from account_name
393
+ host = None
394
+ account_name = st.get("account_name")
395
+ connection_string = st.get("connection_string")
396
+
397
+ if connection_string:
398
+ with contextlib.suppress(Exception):
399
+ primary_url, _, _ = parse_connection_str(
400
+ connection_string, credential=None, service=service
401
+ )
402
+ if primary_url.startswith("http://"):
403
+ primary_url = primary_url[len("http://") :]
404
+ if primary_url.startswith("https://"):
405
+ primary_url = primary_url[len("https://") :]
406
+ # Remove any path components from the host
407
+ host = primary_url.split("/")[0].rstrip("/")
408
+ if not host and account_name:
409
+ host = f"{account_name}.{service}.core.windows.net"
410
+
411
+ # For wasbs:// URLs where endpoint is already the host
412
+ if not host and self.kind in ["wasbs", "wasb"] and hasattr(self, "endpoint"):
413
+ host = getattr(self, "endpoint", None)
414
+
415
+ if not host:
416
+ raise mlrun.errors.MLRunInvalidArgumentError(
417
+ "account_name is required (or provide a connection_string) to build the WASB URL."
418
+ )
419
+
420
+ return f"wasbs://{container}@{host}"