mlrun 1.10.0rc16__py3-none-any.whl → 1.10.0rc42__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of mlrun might be problematic.

Files changed (98)
  1. mlrun/__init__.py +22 -2
  2. mlrun/artifacts/document.py +6 -1
  3. mlrun/artifacts/llm_prompt.py +21 -15
  4. mlrun/artifacts/model.py +3 -3
  5. mlrun/common/constants.py +9 -0
  6. mlrun/common/formatters/artifact.py +1 -0
  7. mlrun/common/model_monitoring/helpers.py +86 -0
  8. mlrun/common/schemas/__init__.py +2 -0
  9. mlrun/common/schemas/auth.py +2 -0
  10. mlrun/common/schemas/function.py +10 -0
  11. mlrun/common/schemas/hub.py +30 -18
  12. mlrun/common/schemas/model_monitoring/__init__.py +2 -0
  13. mlrun/common/schemas/model_monitoring/constants.py +30 -6
  14. mlrun/common/schemas/model_monitoring/functions.py +13 -4
  15. mlrun/common/schemas/model_monitoring/model_endpoints.py +11 -0
  16. mlrun/common/schemas/pipeline.py +1 -1
  17. mlrun/common/schemas/serving.py +3 -0
  18. mlrun/common/schemas/workflow.py +1 -0
  19. mlrun/common/secrets.py +22 -1
  20. mlrun/config.py +32 -10
  21. mlrun/datastore/__init__.py +11 -3
  22. mlrun/datastore/azure_blob.py +162 -47
  23. mlrun/datastore/datastore.py +9 -4
  24. mlrun/datastore/datastore_profile.py +61 -5
  25. mlrun/datastore/model_provider/huggingface_provider.py +363 -0
  26. mlrun/datastore/model_provider/mock_model_provider.py +87 -0
  27. mlrun/datastore/model_provider/model_provider.py +211 -74
  28. mlrun/datastore/model_provider/openai_provider.py +243 -71
  29. mlrun/datastore/s3.py +24 -2
  30. mlrun/datastore/storeytargets.py +2 -3
  31. mlrun/datastore/utils.py +15 -3
  32. mlrun/db/base.py +27 -19
  33. mlrun/db/httpdb.py +57 -48
  34. mlrun/db/nopdb.py +25 -10
  35. mlrun/execution.py +55 -13
  36. mlrun/hub/__init__.py +15 -0
  37. mlrun/hub/module.py +181 -0
  38. mlrun/k8s_utils.py +105 -16
  39. mlrun/launcher/base.py +13 -6
  40. mlrun/launcher/local.py +2 -0
  41. mlrun/model.py +9 -3
  42. mlrun/model_monitoring/api.py +66 -27
  43. mlrun/model_monitoring/applications/__init__.py +1 -1
  44. mlrun/model_monitoring/applications/base.py +372 -136
  45. mlrun/model_monitoring/applications/context.py +2 -4
  46. mlrun/model_monitoring/applications/results.py +4 -7
  47. mlrun/model_monitoring/controller.py +239 -101
  48. mlrun/model_monitoring/db/_schedules.py +36 -13
  49. mlrun/model_monitoring/db/_stats.py +4 -3
  50. mlrun/model_monitoring/db/tsdb/base.py +29 -9
  51. mlrun/model_monitoring/db/tsdb/tdengine/schemas.py +4 -5
  52. mlrun/model_monitoring/db/tsdb/tdengine/tdengine_connector.py +154 -50
  53. mlrun/model_monitoring/db/tsdb/tdengine/writer_graph_steps.py +51 -0
  54. mlrun/model_monitoring/db/tsdb/v3io/stream_graph_steps.py +17 -4
  55. mlrun/model_monitoring/db/tsdb/v3io/v3io_connector.py +245 -51
  56. mlrun/model_monitoring/helpers.py +28 -5
  57. mlrun/model_monitoring/stream_processing.py +45 -14
  58. mlrun/model_monitoring/writer.py +220 -1
  59. mlrun/platforms/__init__.py +3 -2
  60. mlrun/platforms/iguazio.py +7 -3
  61. mlrun/projects/operations.py +6 -1
  62. mlrun/projects/pipelines.py +2 -2
  63. mlrun/projects/project.py +128 -45
  64. mlrun/run.py +94 -17
  65. mlrun/runtimes/__init__.py +18 -0
  66. mlrun/runtimes/base.py +14 -6
  67. mlrun/runtimes/daskjob.py +1 -0
  68. mlrun/runtimes/local.py +5 -2
  69. mlrun/runtimes/mounts.py +20 -2
  70. mlrun/runtimes/nuclio/__init__.py +1 -0
  71. mlrun/runtimes/nuclio/application/application.py +147 -17
  72. mlrun/runtimes/nuclio/function.py +70 -27
  73. mlrun/runtimes/nuclio/serving.py +85 -4
  74. mlrun/runtimes/pod.py +213 -21
  75. mlrun/runtimes/utils.py +49 -9
  76. mlrun/secrets.py +54 -13
  77. mlrun/serving/remote.py +79 -6
  78. mlrun/serving/routers.py +23 -41
  79. mlrun/serving/server.py +211 -40
  80. mlrun/serving/states.py +536 -156
  81. mlrun/serving/steps.py +62 -0
  82. mlrun/serving/system_steps.py +136 -81
  83. mlrun/serving/v2_serving.py +9 -10
  84. mlrun/utils/helpers.py +212 -82
  85. mlrun/utils/logger.py +3 -1
  86. mlrun/utils/notifications/notification/base.py +18 -0
  87. mlrun/utils/notifications/notification/git.py +2 -4
  88. mlrun/utils/notifications/notification/slack.py +2 -4
  89. mlrun/utils/notifications/notification/webhook.py +2 -5
  90. mlrun/utils/notifications/notification_pusher.py +1 -1
  91. mlrun/utils/version/version.json +2 -2
  92. {mlrun-1.10.0rc16.dist-info → mlrun-1.10.0rc42.dist-info}/METADATA +44 -45
  93. {mlrun-1.10.0rc16.dist-info → mlrun-1.10.0rc42.dist-info}/RECORD +97 -92
  94. mlrun/api/schemas/__init__.py +0 -259
  95. {mlrun-1.10.0rc16.dist-info → mlrun-1.10.0rc42.dist-info}/WHEEL +0 -0
  96. {mlrun-1.10.0rc16.dist-info → mlrun-1.10.0rc42.dist-info}/entry_points.txt +0 -0
  97. {mlrun-1.10.0rc16.dist-info → mlrun-1.10.0rc42.dist-info}/licenses/LICENSE +0 -0
  98. {mlrun-1.10.0rc16.dist-info → mlrun-1.10.0rc42.dist-info}/top_level.txt +0 -0
mlrun/common/secrets.py CHANGED
@@ -11,10 +11,31 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-
+import re
 from abc import ABC, abstractmethod
 
 import mlrun.common.schemas
+from mlrun.config import config as mlconf
+
+_AUTH_SECRET_NAME_TEMPLATE = re.escape(
+    mlconf.secret_stores.kubernetes.auth_secret_name.format(
+        hashed_access_key="",
+    )
+)
+AUTH_SECRET_PATTERN = re.compile(f"^{_AUTH_SECRET_NAME_TEMPLATE}.*")
+
+
+def validate_not_forbidden_secret(secret_name: str) -> None:
+    """
+    Forbid client-supplied references to internal MLRun auth/project secrets.
+    No-op when running inside the API server (API enrichments are allowed).
+    """
+    if not secret_name or mlrun.config.is_running_as_api():
+        return
+    if AUTH_SECRET_PATTERN.match(secret_name):
+        raise mlrun.errors.MLRunInvalidArgumentError(
+            f"Forbidden secret '{secret_name}' matches MLRun auth-secret pattern."
+        )
 
 
 class SecretProviderInterface(ABC):
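
A minimal sketch of how the new guard behaves from client code (the secret name below assumes the default template resolves to something like "auth-secrets.<hashed_access_key>"; adjust to your deployment's secret_stores.kubernetes.auth_secret_name):

    import mlrun.common.secrets
    import mlrun.errors

    # An ordinary user secret passes through silently
    mlrun.common.secrets.validate_not_forbidden_secret("my-app-secret")

    # A name matching the auth-secret pattern is rejected client-side
    try:
        mlrun.common.secrets.validate_not_forbidden_secret("auth-secrets.abc123")
    except mlrun.errors.MLRunInvalidArgumentError as exc:
        print(exc)  # Forbidden secret '...' matches MLRun auth-secret pattern.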
mlrun/config.py CHANGED
@@ -107,7 +107,11 @@ default_config = {
     "submit_timeout": "280",  # timeout when submitting a new k8s resource
     # runtimes cleanup interval in seconds
     "runtimes_cleanup_interval": "300",
-    "background_task_cleanup_interval": "86400",  # 24 hours in seconds
+    # disabled by default due to an internal bug in serving functions
+    # relying on a background task to hold the status for its model endpoints
+    # TODO: need to refine what/when we can delete the background tasks
+    # e.g: use labels or naming convention.
+    "background_task_cleanup_interval": "0",
     "background_task_max_age": "21600",  # 6 hours in seconds
     "monitoring": {
         "runs": {
@@ -194,6 +198,7 @@ default_config = {
     "v3io_framesd": "http://framesd:8080",
     "model_providers": {
         "openai_default_model": "gpt-4o",
+        "huggingface_default_model": "microsoft/Phi-3-mini-4k-instruct",
     },
     # default node selector to be applied to all functions - json string base64 encoded format
     "default_function_node_selector": "e30=",
@@ -250,7 +255,8 @@ default_config = {
     },
     "runtimes": {
         "dask": "600",
-        "dask_cluster_start": "300",
+        # cluster start might take some time in case k8s needs to spin up new nodes
+        "dask_cluster_start": "600",
     },
     "push_notifications": "60",
 },
@@ -298,6 +304,7 @@ default_config = {
     "application": {
         "default_sidecar_internal_port": 8050,
         "default_authentication_mode": mlrun.common.schemas.APIGatewayAuthenticationMode.none,
+        "default_worker_number": 10000,
     },
 },
 # TODO: function defaults should be moved to the function spec config above
@@ -406,11 +413,7 @@ default_config = {
     #
     # if set to "nil" or "none", nothing would be set
     "modes": (
-        "STRICT_TRANS_TABLES"
-        ",NO_ZERO_IN_DATE"
-        ",NO_ZERO_DATE"
-        ",ERROR_FOR_DIVISION_BY_ZERO"
-        ",NO_ENGINE_SUBSTITUTION",
+        "STRICT_TRANS_TABLES,NO_ZERO_IN_DATE,NO_ZERO_DATE,ERROR_FOR_DIVISION_BY_ZERO,NO_ENGINE_SUBSTITUTION"
     )
 },
 },
@@ -647,6 +650,13 @@ default_config = {
         "max_replicas": 1,
     },
 },
+"writer_graph": {
+    "max_events": 1000,
+    "flush_after_seconds": 30,
+    "writer_version": "v1",  # v1 is the sync version while v2 is async
+    "parquet_batching_max_events": 10,
+    "parquet_batching_timeout_secs": 30,
+},
 # Store prefixes are used to handle model monitoring storing policies based on project and kind, such as events,
 # stream, and endpoints.
 "store_prefixes": {
@@ -717,7 +727,6 @@ default_config = {
     "name": "default",
     "description": "MLRun global function hub",
     "url": "https://mlrun.github.io/marketplace",
-    "object_type": "functions",
     "channel": "master",
 },
@@ -999,9 +1008,9 @@ class Config:
         )
 
     @staticmethod
-    def get_default_hub_source() -> str:
+    def get_default_hub_source_url_prefix(object_type) -> str:
         default_source = config.hub.default_source
-        return f"{default_source.url}/{default_source.object_type}/{default_source.channel}/"
+        return f"{default_source.url}/{object_type}/{default_source.channel}/"
 
     @staticmethod
     def decode_base64_config_and_load_to_object(
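
With object_type no longer baked into the hub source config, callers pass it per request; a sketch of the resulting prefixes under the default source ("modules" is illustrative, suggested by the new mlrun/hub/module.py):

    from mlrun.config import config

    # -> "https://mlrun.github.io/marketplace/functions/master/"
    functions_prefix = config.get_default_hub_source_url_prefix("functions")

    # -> "https://mlrun.github.io/marketplace/modules/master/"
    modules_prefix = config.get_default_hub_source_url_prefix("modules")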
@@ -1242,6 +1251,19 @@ class Config:
         """
         return self.is_running_on_iguazio()
 
+    @staticmethod
+    def get_run_retry_staleness_threshold_timedelta() -> timedelta:
+        """
+        Get the staleness threshold in timedelta for run retries.
+        This is used to determine if a run is stale and should be retried.
+
+        :return: The staleness threshold in timedelta.
+        """
+        staleness_threshold = int(
+            mlrun.mlconf.monitoring.runs.retry.staleness_threshold
+        )
+        return timedelta(minutes=staleness_threshold)
+
     def to_dict(self):
         return copy.deepcopy(self._cfg)
 
mlrun/datastore/__init__.py CHANGED
@@ -39,10 +39,11 @@ __all__ = [
 from urllib.parse import urlparse
 
 import fsspec
+import storey
 
 import mlrun.datastore.wasbfs
 from mlrun.datastore.datastore_profile import (
-    DatastoreProfileKafkaSource,
+    DatastoreProfileKafkaStream,
     DatastoreProfileKafkaTarget,
     DatastoreProfileV3io,
 )
@@ -122,7 +123,7 @@ def get_stream_pusher(stream_path: str, **kwargs):
     )
     if isinstance(
         datastore_profile,
-        (DatastoreProfileKafkaSource, DatastoreProfileKafkaTarget),
+        (DatastoreProfileKafkaStream, DatastoreProfileKafkaTarget),
     ):
         attributes = datastore_profile.attributes()
         brokers = attributes.pop("brokers", None)
@@ -168,11 +169,12 @@ def get_stream_pusher(stream_path: str, **kwargs):
     raise ValueError(f"unsupported stream path {stream_path}")
 
 
-class _DummyStream:
+class _DummyStream(storey.MapClass):
     """stream emulator for tests and debug"""
 
     def __init__(self, event_list=None, **kwargs):
         self.event_list = event_list or []
+        super().__init__(**kwargs)
 
     def push(self, data, **kwargs):
         if not isinstance(data, list):
@@ -180,3 +182,9 @@ class _DummyStream:
         for item in data:
             logger.info(f"dummy stream got event: {item}, kwargs={kwargs}")
             self.event_list.append(item)
+
+    def do(self, event):
+        if not isinstance(event, list):
+            event = [event]
+        for item in event:
+            self.event_list.append(item)
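
A sketch of how the renamed profile flows through get_stream_pusher (broker and topic values are illustrative, and a reachable Kafka broker is assumed):

    from mlrun.datastore import get_stream_pusher
    from mlrun.datastore.datastore_profile import (
        DatastoreProfileKafkaStream,
        register_temporary_client_datastore_profile,
    )

    profile = DatastoreProfileKafkaStream(
        name="my-kafka", brokers="localhost:9092", topics="monitoring-events"
    )
    register_temporary_client_datastore_profile(profile)

    # resolves the profile, pops brokers/topics, and returns a Kafka pusher
    pusher = get_stream_pusher("ds://my-kafka")
    pusher.push({"kind": "event", "value": 1})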
mlrun/datastore/azure_blob.py CHANGED
@@ -1,4 +1,4 @@
-# Copyright 2023 Iguazio
+# Copyright 2025 Iguazio
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -12,6 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import contextlib
 import time
 from pathlib import Path
 from typing import Optional
@@ -30,6 +31,40 @@ from .base import DataStore, FileStats, make_datastore_schema_sanitizer
 
 
 class AzureBlobStore(DataStore):
+    """
+    Azure Blob Storage datastore implementation.
+
+    Supports multiple URL schemas: az://, wasbs://, wasb://
+
+    Supported Connection String Formats:
+    ====================================
+
+    1. Account Key (Standard):
+       "DefaultEndpointsProtocol=https;AccountName=<account>;AccountKey=<key>;EndpointSuffix=core.windows.net"
+
+    2. SAS Token:
+       "BlobEndpoint=https://<account>.blob.core.windows.net/;SharedAccessSignature=<sas_token>"
+
+    3. Minimal BlobEndpoint:
+       "BlobEndpoint=https://<account>.blob.core.windows.net/;AccountName=<account>;AccountKey=<key>"
+
+    4. Custom Domain:
+       "BlobEndpoint=https://<account>.mydomain.com/;AccountName=<account>;AccountKey=<key>"
+
+    5. China/Government Cloud:
+       "DefaultEndpointsProtocol=https;AccountName=<account>;AccountKey=<key>;EndpointSuffix=core.chinacloudapi.cn"
+
+    6. Full Service Endpoints with SAS:
+       "BlobEndpoint=https://<account>.blob.core.windows.net/;QueueEndpoint=...;SharedAccessSignature=<sas>"
+
+    Authentication Methods:
+    ======================
+    - Account Key (connection_string or storage_options)
+    - SAS Token (connection_string or storage_options)
+    - OAuth/Azure AD (storage_options: client_id, client_secret, tenant_id)
+
+    """
+
     using_bucket = True
     max_concurrency = 100
     max_blocksize = 1024 * 1024 * 4
@@ -40,6 +75,12 @@ class AzureBlobStore(DataStore):
     def __init__(
         self, parent, schema, name, endpoint="", secrets: Optional[dict] = None
     ):
+        # Extract container from WASBS endpoint before calling super()
+        self._container_from_endpoint = None
+        if schema in ["wasbs", "wasb"] and endpoint and "@" in endpoint:
+            # Handle container@host format
+            self._container_from_endpoint, endpoint = endpoint.split("@", 1)
+
         super().__init__(parent, name, schema, endpoint, secrets=secrets)
         self._service_client = None
         self._storage_options = None
@@ -67,6 +108,34 @@ class AzureBlobStore(DataStore):
             or self._get_secret_or_env("AZURE_STORAGE_SAS_TOKEN"),
             credential=self._get_secret_or_env("credential"),
         )
+        # Use container extracted from WASBS endpoint during initialization
+        if self._container_from_endpoint:
+            res["container"] = self._container_from_endpoint
+
+        # For az:// URLs, endpoint contains the container name
+        if not res.get("container") and self.kind in ["az"]:
+            if container := getattr(self, "endpoint", None):
+                res["container"] = container
+
+        # Last resort: For wasbs:// without container, check if connection string has BlobEndpoint with container
+        if not res.get("container") and self.kind in ["wasbs", "wasb"]:
+            connection_string = res.get("connection_string")
+            if connection_string and "BlobEndpoint=" in connection_string:
+                # Try to extract container from BlobEndpoint URL
+                for part in connection_string.split(";"):
+                    if part.startswith("BlobEndpoint="):
+                        blob_endpoint = part.split("=", 1)[1]
+                        # Parse URL to get path component
+                        from urllib.parse import urlparse

+                        parsed = urlparse(blob_endpoint)
+                        if parsed.path and parsed.path.strip("/"):
+                            # Extract first path segment as container
+                            path_parts = parsed.path.strip("/").split("/")
+                            if path_parts[0]:
+                                res["container"] = path_parts[0]
+                        break
+
         self._storage_options = self._sanitize_options(res)
         return self._storage_options
 
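The three fallbacks map onto the URL and credential shapes users actually pass; an illustrative sketch (account, container, and paths are placeholders, and credentials are assumed to come from the environment or project secrets):

    import mlrun

    # 1. wasbs://<container>@<host>/<path> - container parsed from the endpoint
    item = mlrun.get_dataitem("wasbs://mycontainer@myacct.blob.core.windows.net/data/x.csv")

    # 2. az://<container>/<path> - for az:// the endpoint is the container
    item = mlrun.get_dataitem("az://mycontainer/data/x.csv")

    # 3. wasbs:// with no container in the URL - as a last resort the container
    #    is taken from the BlobEndpoint path in the connection string, e.g.
    #    "BlobEndpoint=https://myacct.blob.core.windows.net/mycontainer;..."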
@@ -165,7 +234,18 @@ class AzureBlobStore(DataStore):
     # if called without passing dataitem - like in fset.purge_targets,
     # key will include schema.
     if not schema:
-        key = Path(self.endpoint, key).as_posix()
+        # For wasbs/wasb, the filesystem is scoped to the container, so we need to use
+        # the container name as the base path, not the hostname endpoint.
+        # For az://, endpoint already contains the container name.
+        if self.kind in ["wasbs", "wasb"]:
+            container = self.storage_options.get("container")
+            if container:
+                key = Path(container, key).as_posix()
+            else:
+                # If no container found, use endpoint (might be hostname, but better than nothing)
+                key = Path(self.endpoint, key).as_posix()
+        else:
+            key = Path(self.endpoint, key).as_posix()
     return key
 
 def upload(self, key, src_path):
@@ -229,18 +309,27 @@ class AzureBlobStore(DataStore):
         st = self.storage_options
         service = "blob"
         primary_url = None
-        if st.get("connection_string"):
+
+        # Parse connection string (fills account_name/account_key or SAS)
+        connection_string = st.get("connection_string")
+        if connection_string:
             primary_url, _, parsed_credential = parse_connection_str(
-                st.get("connection_string"), credential=None, service=service
+                connection_string, credential=None, service=service
             )
-            for key in ["account_name", "account_key"]:
-                parsed_value = parsed_credential.get(key)
-                if parsed_value:
-                    if key in st and st[key] != parsed_value:
+
+            if isinstance(parsed_credential, str):
+                # SharedAccessSignature as raw string
+                parsed_credential = {"sas_token": parsed_credential}
+
+            for key in ["account_name", "account_key", "sas_token"]:
+                if parsed_value := parsed_credential.get(key):
+                    # Only check for conflicts if storage options has a non-empty value for this key
+                    existing_value = st.get(key)
+                    if existing_value and existing_value != parsed_value:
                         if key == "account_name":
                             raise mlrun.errors.MLRunInvalidArgumentError(
-                                f"Storage option for '{key}' is '{st[key]}',\
-                                which does not match corresponding connection string '{parsed_value}'"
+                                f"Storage option for '{key}' is '{existing_value}', "
+                                f"which does not match corresponding connection string '{parsed_value}'"
                             )
                         else:
                             raise mlrun.errors.MLRunInvalidArgumentError(
@@ -249,57 +338,83 @@ class AzureBlobStore(DataStore):
                     st[key] = parsed_value
 
         account_name = st.get("account_name")
+        # Derive host (prefer connection string primary URL)
         if primary_url:
             if primary_url.startswith("http://"):
                 primary_url = primary_url[len("http://") :]
             if primary_url.startswith("https://"):
                 primary_url = primary_url[len("https://") :]
-            host = primary_url
+            # Remove any path components from the host
+            host = primary_url.split("/")[0]
         elif account_name:
             host = f"{account_name}.{service}.core.windows.net"
         else:
+            # nothing to configure yet
            return res
 
-        if "account_key" in st:
+        host = host.rstrip("/")
+
+        # Account key (optional; WASB supports it)
+        if "account_key" in st and st["account_key"]:
             res[f"spark.hadoop.fs.azure.account.key.{host}"] = st["account_key"]
 
-        if "client_secret" in st or "client_id" in st or "tenant_id" in st:
-            res[f"spark.hadoop.fs.azure.account.auth.type.{host}"] = "OAuth"
-            res[f"spark.hadoop.fs.azure.account.oauth.provider.type.{host}"] = (
-                "org.apache.hadoop.fs.azurebfs.oauth2.ClientCredsTokenProvider"
-            )
-            if "client_id" in st:
-                res[f"spark.hadoop.fs.azure.account.oauth2.client.id.{host}"] = st[
-                    "client_id"
-                ]
-            if "client_secret" in st:
-                res[f"spark.hadoop.fs.azure.account.oauth2.client.secret.{host}"] = st[
-                    "client_secret"
-                ]
-            if "tenant_id" in st:
-                tenant_id = st["tenant_id"]
-                res[f"spark.hadoop.fs.azure.account.oauth2.client.endpoint.{host}"] = (
-                    f"https://login.microsoftonline.com/{tenant_id}/oauth2/token"
-                )
+        # --- WASB + SAS (container-scoped key; no provider classes needed) ---
+        if "sas_token" in st and st["sas_token"]:
+            sas = st["sas_token"].lstrip("?")
 
-        if "sas_token" in st:
-            res[f"spark.hadoop.fs.azure.account.auth.type.{host}"] = "SAS"
-            res[f"spark.hadoop.fs.azure.sas.token.provider.type.{host}"] = (
-                "org.apache.hadoop.fs.azurebfs.sas.FixedSASTokenProvider"
-            )
-            res[f"spark.hadoop.fs.azure.sas.fixed.token.{host}"] = st["sas_token"]
+            container = st.get("container")
+
+            if container:
+                # fs.azure.sas.<container>.<account>.blob.core.windows.net = <sas>
+                res[f"spark.hadoop.fs.azure.sas.{container}.{host}"] = sas
+
+            else:
+                raise mlrun.errors.MLRunInvalidArgumentError(
+                    "Container name is required for WASB SAS. "
+                    "Set self.endpoint or storage_options['container']."
+                )
         return res
 
     @property
     def spark_url(self):
-        spark_options = self.get_spark_options()
-        url = f"wasbs://{self.endpoint}"
-        prefix = "spark.hadoop.fs.azure.account.key."
-        if spark_options:
-            for key in spark_options:
-                if key.startswith(prefix):
-                    account_key = key[len(prefix) :]
-                    if not url.endswith(account_key):
-                        url += f"@{account_key}"
-                    break
-        return url
+        # Build: wasbs://<container>@<host>
+        st = self.storage_options
+        service = "blob"
+
+        container = st.get("container")
+
+        if not container:
+            raise mlrun.errors.MLRunInvalidArgumentError(
+                "Container name is required to build the WASB URL. "
+                "Set storage_options['container'] or use datastore profile with container specified."
+            )
+
+        # Prefer host from connection string; else synthesize from account_name
+        host = None
+        account_name = st.get("account_name")
+        connection_string = st.get("connection_string")
+
+        if connection_string:
+            with contextlib.suppress(Exception):
+                primary_url, _, _ = parse_connection_str(
                    connection_string, credential=None, service=service
+                )
+                if primary_url.startswith("http://"):
+                    primary_url = primary_url[len("http://") :]
+                if primary_url.startswith("https://"):
+                    primary_url = primary_url[len("https://") :]
+                # Remove any path components from the host
+                host = primary_url.split("/")[0].rstrip("/")
+        if not host and account_name:
+            host = f"{account_name}.{service}.core.windows.net"
+
+        # For wasbs:// URLs where endpoint is already the host
+        if not host and self.kind in ["wasbs", "wasb"] and hasattr(self, "endpoint"):
+            host = getattr(self, "endpoint", None)
+
+        if not host:
+            raise mlrun.errors.MLRunInvalidArgumentError(
+                "account_name is required (or provide a connection_string) to build the WASB URL."
+            )
+
+        return f"wasbs://{container}@{host}"
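
A runnable sketch of the expected outputs for a SAS-based configuration, mirroring the logic above (account, container, and token values are illustrative):

    # inputs: storage_options with account_name, container, and sas_token
    account = "myacct"
    container = "mycontainer"
    sas_token = "?sv=2024-01-01&sig=abc"

    host = f"{account}.blob.core.windows.net"
    # get_spark_options() -> a container-scoped SAS key, leading "?" stripped,
    # with no Hadoop token-provider classes involved
    spark_options = {
        f"spark.hadoop.fs.azure.sas.{container}.{host}": sas_token.lstrip("?"),
    }
    # spark_url -> "wasbs://mycontainer@myacct.blob.core.windows.net"
    spark_url = f"wasbs://{container}@{host}"
    print(spark_options, spark_url)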
mlrun/datastore/datastore.py CHANGED
@@ -38,6 +38,8 @@ from ..utils import DB_SCHEMA, RunKeys
 from .base import DataItem, DataStore, HttpStore
 from .filestore import FileStore
 from .inmem import InMemoryStore
+from .model_provider.huggingface_provider import HuggingFaceProvider
+from .model_provider.mock_model_provider import MockModelProvider
 from .model_provider.openai_provider import OpenAIProvider
 from .store_resources import get_store_resource, is_store_uri
 from .v3io import V3ioStore
@@ -102,8 +104,11 @@ def schema_to_store(schema) -> DataStore.__subclasses__():
 def schema_to_model_provider(
     schema: str, raise_missing_schema_exception=True
 ) -> type[ModelProvider]:
-    # TODO add hugging face and http
-    schema_dict = {"openai": OpenAIProvider}
+    schema_dict = {
+        "openai": OpenAIProvider,
+        "huggingface": HuggingFaceProvider,
+        "mock": MockModelProvider,
+    }
     provider_class = schema_dict.get(schema, None)
     if not provider_class:
         if raise_missing_schema_exception:
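
A minimal sketch of resolving a provider class by schema (the new "mock" entry makes this testable without credentials):

    from mlrun.datastore.datastore import schema_to_model_provider

    provider_class = schema_to_model_provider("huggingface")  # -> HuggingFaceProvider

    # unknown schemas raise by default, or (apparently) return None with the flag off
    maybe = schema_to_model_provider("http", raise_missing_schema_exception=False)
    assert maybe is None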
@@ -247,7 +252,7 @@ class StoreManager:
 
     if schema == "ds":
         datastore_profile = datastore_profile_read(url, project_name, secrets)
-        secrets = merge(secrets or {}, datastore_profile.secrets() or {})
+        secrets = merge({}, secrets or {}, datastore_profile.secrets() or {})
         url = datastore_profile.url(subpath)
         schema, endpoint, parsed_url = parse_url(url)
         subpath = parsed_url.path
@@ -281,7 +286,7 @@ class StoreManager:
         endpoint, subpath
     )
     remote_client = remote_client_class(
-        self, schema, cache_key, parsed_url.netloc, secrets=secrets, **kwargs
+        self, schema, cache_key, endpoint, secrets=secrets, **kwargs
    )
     if not secrets and not mlrun.config.is_running_as_api():
         cache[cache_key] = remote_client
mlrun/datastore/datastore_profile.py CHANGED
@@ -19,6 +19,7 @@ import typing
 from urllib.parse import ParseResult, urlparse
 
 import pydantic.v1
+from deprecated import deprecated
 from mergedeep import merge
 
 import mlrun
@@ -138,6 +139,15 @@ class ConfigProfile(DatastoreProfile):
         return res
 
 
+# TODO: Remove in 1.12.0
+@deprecated(
+    version="1.10.0",
+    reason=(
+        "This class is deprecated from mlrun 1.10.0, and will be removed in 1.12.0. "
+        "Use `DatastoreProfileKafkaStream` instead."
+    ),
+    category=FutureWarning,
+)
 class DatastoreProfileKafkaTarget(DatastoreProfile):
     type: str = pydantic.v1.Field("kafka_target")
     _private_attributes = "kwargs_private"
@@ -158,8 +168,8 @@ class DatastoreProfileKafkaTarget(DatastoreProfile):
         return attributes
 
 
-class DatastoreProfileKafkaSource(DatastoreProfile):
-    type: str = pydantic.v1.Field("kafka_source")
+class DatastoreProfileKafkaStream(DatastoreProfile):
+    type: str = pydantic.v1.Field("kafka_stream")
     _private_attributes = ("kwargs_private", "sasl_user", "sasl_pass")
     brokers: typing.Union[str, list[str]]
     topics: typing.Union[str, list[str]]
@@ -198,6 +208,19 @@ class DatastoreProfileKafkaStream(DatastoreProfile):
         return attributes
 
 
+# TODO: Remove in 1.12.0
+@deprecated(
+    version="1.10.0",
+    reason=(
+        "This class is deprecated from mlrun 1.10.0, and will be removed in 1.12.0. "
+        "Use `DatastoreProfileKafkaStream` instead."
+    ),
+    category=FutureWarning,
+)
+class DatastoreProfileKafkaSource(DatastoreProfileKafkaStream):
+    type: str = pydantic.v1.Field("kafka_source")
+
+
 class DatastoreProfileV3io(DatastoreProfile):
     type: str = pydantic.v1.Field("v3io")
     v3io_access_key: typing.Optional[str] = None
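
Since DatastoreProfileKafkaSource is now a thin deprecated subclass, migrating is a rename; a sketch (broker/topic values illustrative):

    import warnings
    from mlrun.datastore.datastore_profile import (
        DatastoreProfileKafkaSource,
        DatastoreProfileKafkaStream,
    )

    # old name still constructs fine, but emits a FutureWarning
    with warnings.catch_warnings(record=True) as caught:
        warnings.simplefilter("always")
        DatastoreProfileKafkaSource(name="ev", brokers="localhost:9092", topics="t1")
    assert any(issubclass(w.category, FutureWarning) for w in caught)

    # new name: same fields, profile type becomes "kafka_stream"
    profile = DatastoreProfileKafkaStream(name="ev", brokers="localhost:9092", topics="t1")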
@@ -232,7 +255,7 @@ class DatastoreProfileS3(DatastoreProfile):
     if self.secret_key:
         res["AWS_SECRET_ACCESS_KEY"] = self.secret_key
     if self.endpoint_url:
-        res["S3_ENDPOINT_URL"] = self.endpoint_url
+        res["AWS_ENDPOINT_URL_S3"] = self.endpoint_url
     if self.force_non_anonymous:
         res["S3_NON_ANONYMOUS"] = self.force_non_anonymous
     if self.profile_name:
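
The rename matters because recent AWS SDKs read AWS_ENDPOINT_URL_S3 natively; a sketch of what secrets() now surfaces (endpoint and credentials are illustrative, e.g. a MinIO deployment):

    from mlrun.datastore.datastore_profile import DatastoreProfileS3

    profile = DatastoreProfileS3(
        name="minio",
        access_key_id="minio-user",
        secret_key="minio-pass",
        endpoint_url="http://minio.default.svc:9000",
    )
    assert profile.secrets()["AWS_ENDPOINT_URL_S3"] == "http://minio.default.svc:9000"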
@@ -333,7 +356,9 @@ class DatastoreProfileGCS(DatastoreProfile):
     # in gcs the path after schema is starts with bucket, wherefore it should not start with "/".
     subpath = subpath[1:]
     if self.bucket:
-        return f"gcs://{self.bucket}/{subpath}"
+        return (
+            f"gcs://{self.bucket}/{subpath}" if subpath else f"gcs://{self.bucket}"
+        )
     else:
         return f"gcs://{subpath}"
 
@@ -370,7 +395,11 @@ class DatastoreProfileAzureBlob(DatastoreProfile):
     # in azure the path after schema is starts with container, wherefore it should not start with "/".
     subpath = subpath[1:]
     if self.container:
-        return f"az://{self.container}/{subpath}"
+        return (
+            f"az://{self.container}/{subpath}"
+            if subpath
+            else f"az://{self.container}"
+        )
     else:
         return f"az://{subpath}"
 
@@ -486,6 +515,31 @@ class OpenAIProfile(DatastoreProfile):
         return f"{self.type}://{subpath.lstrip('/')}"
 
 
+class HuggingFaceProfile(DatastoreProfile):
+    type: str = pydantic.v1.Field("huggingface")
+    _private_attributes = ("token", "model_kwargs")
+    task: typing.Optional[str] = None
+    token: typing.Optional[str] = None
+    device: typing.Optional[typing.Union[int, str]] = None
+    device_map: typing.Union[str, dict[str, typing.Union[int, str]], None] = None
+    trust_remote_code: bool = None
+    model_kwargs: typing.Optional[dict[str, typing.Any]] = None
+
+    def secrets(self) -> dict:
+        keys = {
+            "HF_TASK": self.task,
+            "HF_TOKEN": self.token,
+            "HF_DEVICE": self.device,
+            "HF_DEVICE_MAP": self.device_map,
+            "HF_TRUST_REMOTE_CODE": self.trust_remote_code,
+            "HF_MODEL_KWARGS": self.model_kwargs,
+        }
+        return {k: v for k, v in keys.items() if v}
+
+    def url(self, subpath):
+        return f"{self.type}://{subpath.lstrip('/')}"
+
+
 _DATASTORE_TYPE_TO_PROFILE_CLASS: dict[str, type[DatastoreProfile]] = {
     "v3io": DatastoreProfileV3io,
     "s3": DatastoreProfileS3,
@@ -493,6 +547,7 @@ _DATASTORE_TYPE_TO_PROFILE_CLASS: dict[str, type[DatastoreProfile]] = {
     "basic": DatastoreProfileBasic,
     "kafka_target": DatastoreProfileKafkaTarget,
     "kafka_source": DatastoreProfileKafkaSource,
+    "kafka_stream": DatastoreProfileKafkaStream,
     "dbfs": DatastoreProfileDBFS,
     "gcs": DatastoreProfileGCS,
     "az": DatastoreProfileAzureBlob,
@@ -500,6 +555,7 @@ _DATASTORE_TYPE_TO_PROFILE_CLASS: dict[str, type[DatastoreProfile]] = {
     "taosws": DatastoreProfileTDEngine,
     "config": ConfigProfile,
     "openai": OpenAIProfile,
+    "huggingface": HuggingFaceProfile,
 }
 
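
A sketch of the new profile end to end (the model name is illustrative; the token is a placeholder and should come from a secret store):

    from mlrun.datastore.datastore_profile import (
        HuggingFaceProfile,
        register_temporary_client_datastore_profile,
    )

    profile = HuggingFaceProfile(
        name="hf-default",
        task="text-generation",
        token="hf_...",        # placeholder, never hardcode real tokens
        device_map="auto",
    )
    register_temporary_client_datastore_profile(profile)

    # secrets() surfaces only the fields that are set
    assert profile.secrets() == {
        "HF_TASK": "text-generation",
        "HF_TOKEN": "hf_...",
        "HF_DEVICE_MAP": "auto",
    }

    # the provider layer resolves models via huggingface:// URLs
    assert profile.url("microsoft/Phi-3-mini-4k-instruct") == (
        "huggingface://microsoft/Phi-3-mini-4k-instruct"
    )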