mlrun 1.10.0rc18__py3-none-any.whl → 1.11.0rc16__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of mlrun might be problematic.

Files changed (167)
  1. mlrun/__init__.py +24 -3
  2. mlrun/__main__.py +0 -4
  3. mlrun/artifacts/dataset.py +2 -2
  4. mlrun/artifacts/document.py +6 -1
  5. mlrun/artifacts/llm_prompt.py +21 -15
  6. mlrun/artifacts/model.py +3 -3
  7. mlrun/artifacts/plots.py +1 -1
  8. mlrun/{model_monitoring/db/tsdb/tdengine → auth}/__init__.py +2 -3
  9. mlrun/auth/nuclio.py +89 -0
  10. mlrun/auth/providers.py +429 -0
  11. mlrun/auth/utils.py +415 -0
  12. mlrun/common/constants.py +14 -0
  13. mlrun/common/model_monitoring/helpers.py +123 -0
  14. mlrun/common/runtimes/constants.py +28 -0
  15. mlrun/common/schemas/__init__.py +14 -3
  16. mlrun/common/schemas/alert.py +2 -2
  17. mlrun/common/schemas/api_gateway.py +3 -0
  18. mlrun/common/schemas/auth.py +12 -10
  19. mlrun/common/schemas/client_spec.py +4 -0
  20. mlrun/common/schemas/constants.py +25 -0
  21. mlrun/common/schemas/frontend_spec.py +1 -8
  22. mlrun/common/schemas/function.py +34 -0
  23. mlrun/common/schemas/hub.py +33 -20
  24. mlrun/common/schemas/model_monitoring/__init__.py +2 -1
  25. mlrun/common/schemas/model_monitoring/constants.py +12 -15
  26. mlrun/common/schemas/model_monitoring/functions.py +13 -4
  27. mlrun/common/schemas/model_monitoring/model_endpoints.py +11 -0
  28. mlrun/common/schemas/pipeline.py +1 -1
  29. mlrun/common/schemas/secret.py +17 -2
  30. mlrun/common/secrets.py +95 -1
  31. mlrun/common/types.py +10 -10
  32. mlrun/config.py +69 -19
  33. mlrun/data_types/infer.py +2 -2
  34. mlrun/datastore/__init__.py +12 -5
  35. mlrun/datastore/azure_blob.py +162 -47
  36. mlrun/datastore/base.py +274 -10
  37. mlrun/datastore/datastore.py +7 -2
  38. mlrun/datastore/datastore_profile.py +84 -22
  39. mlrun/datastore/model_provider/huggingface_provider.py +225 -41
  40. mlrun/datastore/model_provider/mock_model_provider.py +87 -0
  41. mlrun/datastore/model_provider/model_provider.py +206 -74
  42. mlrun/datastore/model_provider/openai_provider.py +226 -66
  43. mlrun/datastore/s3.py +39 -18
  44. mlrun/datastore/sources.py +1 -1
  45. mlrun/datastore/store_resources.py +4 -4
  46. mlrun/datastore/storeytargets.py +17 -12
  47. mlrun/datastore/targets.py +1 -1
  48. mlrun/datastore/utils.py +25 -6
  49. mlrun/datastore/v3io.py +1 -1
  50. mlrun/db/base.py +63 -32
  51. mlrun/db/httpdb.py +373 -153
  52. mlrun/db/nopdb.py +54 -21
  53. mlrun/errors.py +4 -2
  54. mlrun/execution.py +66 -25
  55. mlrun/feature_store/api.py +1 -1
  56. mlrun/feature_store/common.py +1 -1
  57. mlrun/feature_store/feature_vector_utils.py +1 -1
  58. mlrun/feature_store/steps.py +8 -6
  59. mlrun/frameworks/_common/utils.py +3 -3
  60. mlrun/frameworks/_dl_common/loggers/logger.py +1 -1
  61. mlrun/frameworks/_dl_common/loggers/tensorboard_logger.py +2 -1
  62. mlrun/frameworks/_ml_common/loggers/mlrun_logger.py +1 -1
  63. mlrun/frameworks/_ml_common/utils.py +2 -1
  64. mlrun/frameworks/auto_mlrun/auto_mlrun.py +4 -3
  65. mlrun/frameworks/lgbm/mlrun_interfaces/mlrun_interface.py +2 -1
  66. mlrun/frameworks/onnx/dataset.py +2 -1
  67. mlrun/frameworks/onnx/mlrun_interface.py +2 -1
  68. mlrun/frameworks/pytorch/callbacks/logging_callback.py +5 -4
  69. mlrun/frameworks/pytorch/callbacks/mlrun_logging_callback.py +2 -1
  70. mlrun/frameworks/pytorch/callbacks/tensorboard_logging_callback.py +2 -1
  71. mlrun/frameworks/pytorch/utils.py +2 -1
  72. mlrun/frameworks/sklearn/metric.py +2 -1
  73. mlrun/frameworks/tf_keras/callbacks/logging_callback.py +5 -4
  74. mlrun/frameworks/tf_keras/callbacks/mlrun_logging_callback.py +2 -1
  75. mlrun/frameworks/tf_keras/callbacks/tensorboard_logging_callback.py +2 -1
  76. mlrun/hub/__init__.py +52 -0
  77. mlrun/hub/base.py +142 -0
  78. mlrun/hub/module.py +172 -0
  79. mlrun/hub/step.py +113 -0
  80. mlrun/k8s_utils.py +105 -16
  81. mlrun/launcher/base.py +15 -7
  82. mlrun/launcher/local.py +4 -1
  83. mlrun/model.py +14 -4
  84. mlrun/model_monitoring/__init__.py +0 -1
  85. mlrun/model_monitoring/api.py +65 -28
  86. mlrun/model_monitoring/applications/__init__.py +1 -1
  87. mlrun/model_monitoring/applications/base.py +299 -128
  88. mlrun/model_monitoring/applications/context.py +2 -4
  89. mlrun/model_monitoring/controller.py +132 -58
  90. mlrun/model_monitoring/db/_schedules.py +38 -29
  91. mlrun/model_monitoring/db/_stats.py +6 -16
  92. mlrun/model_monitoring/db/tsdb/__init__.py +9 -7
  93. mlrun/model_monitoring/db/tsdb/base.py +29 -9
  94. mlrun/model_monitoring/db/tsdb/preaggregate.py +234 -0
  95. mlrun/model_monitoring/db/tsdb/stream_graph_steps.py +63 -0
  96. mlrun/model_monitoring/db/tsdb/timescaledb/queries/timescaledb_metrics_queries.py +414 -0
  97. mlrun/model_monitoring/db/tsdb/timescaledb/queries/timescaledb_predictions_queries.py +376 -0
  98. mlrun/model_monitoring/db/tsdb/timescaledb/queries/timescaledb_results_queries.py +590 -0
  99. mlrun/model_monitoring/db/tsdb/timescaledb/timescaledb_connection.py +434 -0
  100. mlrun/model_monitoring/db/tsdb/timescaledb/timescaledb_connector.py +541 -0
  101. mlrun/model_monitoring/db/tsdb/timescaledb/timescaledb_operations.py +808 -0
  102. mlrun/model_monitoring/db/tsdb/timescaledb/timescaledb_schema.py +502 -0
  103. mlrun/model_monitoring/db/tsdb/timescaledb/timescaledb_stream.py +163 -0
  104. mlrun/model_monitoring/db/tsdb/timescaledb/timescaledb_stream_graph_steps.py +60 -0
  105. mlrun/model_monitoring/db/tsdb/timescaledb/utils/timescaledb_dataframe_processor.py +141 -0
  106. mlrun/model_monitoring/db/tsdb/timescaledb/utils/timescaledb_query_builder.py +585 -0
  107. mlrun/model_monitoring/db/tsdb/timescaledb/writer_graph_steps.py +73 -0
  108. mlrun/model_monitoring/db/tsdb/v3io/stream_graph_steps.py +20 -9
  109. mlrun/model_monitoring/db/tsdb/v3io/v3io_connector.py +235 -51
  110. mlrun/model_monitoring/features_drift_table.py +2 -1
  111. mlrun/model_monitoring/helpers.py +30 -6
  112. mlrun/model_monitoring/stream_processing.py +34 -28
  113. mlrun/model_monitoring/writer.py +224 -4
  114. mlrun/package/__init__.py +2 -1
  115. mlrun/platforms/__init__.py +0 -43
  116. mlrun/platforms/iguazio.py +8 -4
  117. mlrun/projects/operations.py +17 -11
  118. mlrun/projects/pipelines.py +2 -2
  119. mlrun/projects/project.py +187 -123
  120. mlrun/run.py +95 -21
  121. mlrun/runtimes/__init__.py +2 -186
  122. mlrun/runtimes/base.py +103 -25
  123. mlrun/runtimes/constants.py +225 -0
  124. mlrun/runtimes/daskjob.py +5 -2
  125. mlrun/runtimes/databricks_job/databricks_runtime.py +2 -1
  126. mlrun/runtimes/local.py +5 -2
  127. mlrun/runtimes/mounts.py +20 -2
  128. mlrun/runtimes/nuclio/__init__.py +12 -7
  129. mlrun/runtimes/nuclio/api_gateway.py +36 -6
  130. mlrun/runtimes/nuclio/application/application.py +339 -40
  131. mlrun/runtimes/nuclio/function.py +222 -72
  132. mlrun/runtimes/nuclio/serving.py +132 -42
  133. mlrun/runtimes/pod.py +213 -21
  134. mlrun/runtimes/utils.py +49 -9
  135. mlrun/secrets.py +99 -14
  136. mlrun/serving/__init__.py +2 -0
  137. mlrun/serving/remote.py +84 -11
  138. mlrun/serving/routers.py +26 -44
  139. mlrun/serving/server.py +138 -51
  140. mlrun/serving/serving_wrapper.py +6 -2
  141. mlrun/serving/states.py +997 -283
  142. mlrun/serving/steps.py +62 -0
  143. mlrun/serving/system_steps.py +149 -95
  144. mlrun/serving/v2_serving.py +9 -10
  145. mlrun/track/trackers/mlflow_tracker.py +29 -31
  146. mlrun/utils/helpers.py +292 -94
  147. mlrun/utils/http.py +9 -2
  148. mlrun/utils/notifications/notification/base.py +18 -0
  149. mlrun/utils/notifications/notification/git.py +3 -5
  150. mlrun/utils/notifications/notification/mail.py +39 -16
  151. mlrun/utils/notifications/notification/slack.py +2 -4
  152. mlrun/utils/notifications/notification/webhook.py +2 -5
  153. mlrun/utils/notifications/notification_pusher.py +3 -3
  154. mlrun/utils/version/version.json +2 -2
  155. mlrun/utils/version/version.py +3 -4
  156. {mlrun-1.10.0rc18.dist-info → mlrun-1.11.0rc16.dist-info}/METADATA +63 -74
  157. {mlrun-1.10.0rc18.dist-info → mlrun-1.11.0rc16.dist-info}/RECORD +161 -143
  158. mlrun/api/schemas/__init__.py +0 -259
  159. mlrun/db/auth_utils.py +0 -152
  160. mlrun/model_monitoring/db/tsdb/tdengine/schemas.py +0 -344
  161. mlrun/model_monitoring/db/tsdb/tdengine/stream_graph_steps.py +0 -75
  162. mlrun/model_monitoring/db/tsdb/tdengine/tdengine_connection.py +0 -281
  163. mlrun/model_monitoring/db/tsdb/tdengine/tdengine_connector.py +0 -1266
  164. {mlrun-1.10.0rc18.dist-info → mlrun-1.11.0rc16.dist-info}/WHEEL +0 -0
  165. {mlrun-1.10.0rc18.dist-info → mlrun-1.11.0rc16.dist-info}/entry_points.txt +0 -0
  166. {mlrun-1.10.0rc18.dist-info → mlrun-1.11.0rc16.dist-info}/licenses/LICENSE +0 -0
  167. {mlrun-1.10.0rc18.dist-info → mlrun-1.11.0rc16.dist-info}/top_level.txt +0 -0
mlrun/config.py CHANGED
@@ -40,6 +40,7 @@ import yaml

 import mlrun.common.constants
 import mlrun.common.schemas
+import mlrun.common.types
 import mlrun.errors

 env_prefix = "MLRUN_"
@@ -66,7 +67,6 @@ default_config = {
     "nuclio_version": "",
     "default_nuclio_runtime": "python:3.11",
     "nest_asyncio_enabled": "",  # enable import of nest_asyncio for corner cases with old jupyter, set "1"
-    "ui_url": "",  # remote/external mlrun UI url (for hyperlinks) (This is deprecated in favor of the ui block)
     "remote_host": "",
     "api_base_version": "v1",
     "version": "",  # will be set to current version
@@ -85,7 +85,9 @@ default_config = {
     "kfp_image": "mlrun/mlrun-kfp",  # image to use for KFP runner
     "dask_kfp_image": "mlrun/mlrun",  # image to use for dask KFP runner
     "igz_version": "",  # the version of the iguazio system the API is running on
-    "iguazio_api_url": "",  # the url to iguazio api
+    "iguazio_api_url": "",  # the url to iguazio api (internal / external access with priority to internal)
+    "iguazio_api_url_ingress": "",  # the url to iguazio api ingress (for external access)
+    "iguazio_api_ssl_verify": True,  # verify ssl certificate of iguazio api
     "spark_app_image": "",  # image to use for spark operator app runtime
     "spark_app_image_tag": "",  # image tag to use for spark operator app runtime
     "spark_history_server_path": "",  # spark logs directory for spark history server
@@ -107,7 +109,11 @@ default_config = {
     "submit_timeout": "280",  # timeout when submitting a new k8s resource
     # runtimes cleanup interval in seconds
     "runtimes_cleanup_interval": "300",
-    "background_task_cleanup_interval": "86400",  # 24 hours in seconds
+    # disabled by default due to an internal bug in serving functions
+    # relying on a background task to hold the status for its model endpoints
+    # TODO: need to refine what/when we can delete the background tasks
+    # e.g: use labels or naming convention.
+    "background_task_cleanup_interval": "0",
     "background_task_max_age": "21600",  # 6 hours in seconds
     "monitoring": {
         "runs": {
@@ -251,7 +257,8 @@ default_config = {
         },
         "runtimes": {
             "dask": "600",
-            "dask_cluster_start": "300",
+            # cluster start might take some time in case k8s needs to spin up new nodes
+            "dask_cluster_start": "600",
         },
         "push_notifications": "60",
     },
@@ -299,6 +306,7 @@ default_config = {
         "application": {
             "default_sidecar_internal_port": 8050,
             "default_authentication_mode": mlrun.common.schemas.APIGatewayAuthenticationMode.none,
+            "default_worker_number": 10000,
         },
     },
     # TODO: function defaults should be moved to the function spec config above
@@ -416,11 +424,17 @@ default_config = {
         "allow_local_run": False,
     },
     "authentication": {
-        "mode": "none",  # one of none, basic, bearer, iguazio
+        "mode": "none",  # one of none, basic, bearer, iguazio, iguazio-v4
         "basic": {"username": "", "password": ""},
         "bearer": {"token": ""},
         "iguazio": {
             "session_verification_endpoint": "data_sessions/verifications/app_service",
+            "authentication_endpoint": "api/v1/authentication/refresh-access-token",
+        },
+        "service_account": {
+            # the following are the default values for k8s service accounts, but may be changed per deployment
+            "token_expiration_seconds": 600,
+            "token_path": "/var/run/secrets/kubernetes.io/serviceaccount/token",
         },
     },
     "nuclio": {
@@ -475,6 +489,10 @@ default_config = {
         },
         "authorization": {
             "mode": "none",  # one of none, opa
+            "namespaces": {
+                "resources": "",
+                "mgmt": "mgmt",
+            },
             "opa": {
                 "address": "",
                 "request_timeout": 10,
@@ -644,6 +662,13 @@ default_config = {
             "max_replicas": 1,
         },
     },
+    "writer_graph": {
+        "max_events": 1000,
+        "flush_after_seconds": 30,
+        "writer_version": "v2",  # v1 is the sync version while v2 is async
+        "parquet_batching_max_events": 10,
+        "parquet_batching_timeout_secs": 30,
+    },
     # Store prefixes are used to handle model monitoring storing policies based on project and kind, such as events,
     # stream, and endpoints.
     "store_prefixes": {
@@ -657,6 +682,15 @@ default_config = {
         "parquet_batching_max_events": 10_000,
         "parquet_batching_timeout_secs": timedelta(minutes=1).total_seconds(),
         "model_endpoint_creation_check_period": 15,
+        # TSDB (TimescaleDB) configuration
+        "tsdb": {
+            # When True, automatically create/generate database name using system_id if not explicitly
+            # specified in the connection string. When False, use the database from connection string as-is.
+            "auto_create_database": True,
+            # Connection pool timeout in seconds. This is the maximum time to wait for a connection
+            # from the pool before raising an error.
+            "connection_pool_timeout": 120,
+        },
     },
     "secret_stores": {
         # Use only in testing scenarios (such as integration tests) to avoid using k8s for secrets (will use in-memory
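
As with any mlrun config key, the new tsdb and writer_graph settings should be overridable through MLRUN_-prefixed environment variables, with nested keys joined by double underscores (see env_prefix at the top of this file). A hedged sketch; the parent block name model_endpoint_monitoring is assumed from prior releases:

import os

# set before importing mlrun so read_env() picks it up; the block name is an assumption
os.environ["MLRUN_MODEL_ENDPOINT_MONITORING__TSDB__CONNECTION_POOL_TIMEOUT"] = "300"

import mlrun

print(mlrun.mlconf.model_endpoint_monitoring.tsdb.connection_pool_timeout)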
@@ -712,9 +746,8 @@ default_config = {
         # Set false to avoid creating a global source (for example in a dark site)
         "create": True,
         "name": "default",
-        "description": "MLRun global function hub",
+        "description": "MLRun hub",
         "url": "https://mlrun.github.io/marketplace",
-        "object_type": "functions",
         "channel": "master",
     },
 },
@@ -856,6 +889,19 @@ default_config = {
         "enabled": False,
         "request_timeout": 5,
     },
+    "auth_with_oauth_token": {
+        "enabled": False,
+        "request_timeout": 5,
+        "refresh_threshold": 0.75,
+        # Default is empty. automatically set based on configuration (end client vs jupyter vs runtime, etc)
+        # can be set manually set using envvars
+        "token_file": "",
+        # Default is empty because if set, searches for the specific token name in the file, if empty, it will look
+        # for a token named "default", if "default" does not exist, it will use the first token in the file
+        "token_name": "",
+    },
+    # a runtime computed value. Do not set it manually.
+    "auth_token_endpoint": "",
     "services": {
         # The running service name. One of: "api", "alerts"
         "service_name": "api",
@@ -953,7 +999,7 @@ class Config:
         try:
             config_value.update(value)
         except AttributeError as exc:
-            if not isinstance(config_value, (dict, Config)):
+            if not isinstance(config_value, dict | Config):
                 raise ValueError(
                     f"Can not update `{key}` config. "
                     f"Expected a configuration but received {type(value)}"
@@ -996,9 +1042,9 @@
         )

     @staticmethod
-    def get_default_hub_source() -> str:
+    def get_default_hub_source_url_prefix(object_type) -> str:
         default_source = config.hub.default_source
-        return f"{default_source.url}/{default_source.object_type}/{default_source.channel}/"
+        return f"{default_source.url}/{object_type}/{default_source.channel}/"

     @staticmethod
     def decode_base64_config_and_load_to_object(
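
The hub source URL prefix is now parameterized by object type instead of reading a fixed object_type from config (note the matching removal of "object_type" from hub.default_source above). A sketch of the resulting behavior, assuming the default source values shown in that hunk; "modules" as a second object type is an assumption based on the new mlrun/hub package in this release:

from mlrun.config import config

prefix = config.get_default_hub_source_url_prefix("functions")
# -> "https://mlrun.github.io/marketplace/functions/master/"
prefix = config.get_default_hub_source_url_prefix("modules")  # hypothetical object type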
@@ -1268,10 +1314,7 @@

     @staticmethod
     def resolve_ui_url():
-        # ui_url is deprecated in favor of the ui.url (we created the ui block)
-        # since the config class is used in a "recursive" way, we can't use property like we used in other places
-        # since the property will need to be url, which exists in other structs as well
-        return config.ui.url or config.ui_url
+        return config.ui.url

     def is_api_running_on_k8s(self):
         # determine if the API service is attached to K8s cluster
@@ -1391,6 +1434,18 @@
             ver in mlrun.mlconf.ce.mode for ver in ["lite", "full"]
         )

+    def is_iguazio_mode(self):
+        return (
+            mlrun.mlconf.httpdb.authentication.mode
+            == mlrun.common.types.AuthenticationMode.IGUAZIO
+        )
+
+    def is_iguazio_v4_mode(self):
+        return (
+            config.httpdb.authentication.mode
+            == mlrun.common.types.AuthenticationMode.IGUAZIO_V4
+        )
+
     def is_explicit_ack_enabled(self) -> bool:
         return self.httpdb.nuclio.explicit_ack == "enabled" and (
             not self.nuclio_version
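
The two new helpers read the configured authentication mode, so callers can branch on the Iguazio flavor without string comparisons. A minimal usage sketch:

import mlrun

if mlrun.mlconf.is_iguazio_v4_mode():
    ...  # v4 deployments refresh tokens via api/v1/authentication/refresh-access-token
elif mlrun.mlconf.is_iguazio_mode():
    ...  # classic deployments verify sessions via data_sessions/verifications/app_service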
@@ -1558,7 +1613,6 @@ def read_env(env=None, prefix=env_prefix):
        "https://mlrun-api.", "https://framesd."
    )

-    uisvc = env.get("MLRUN_UI_SERVICE_HOST")
    igz_domain = env.get("IGZ_NAMESPACE_DOMAIN")

    # workaround to try and detect IGZ domain
@@ -1584,10 +1638,6 @@ def read_env(env=None, prefix=env_prefix):
    if config.get("nuclio_dashboard_url") == "disabled":
        config["nuclio_dashboard_url"] = ""

-    if uisvc and not config.get("ui_url"):
-        if igz_domain:
-            config["ui_url"] = f"https://mlrun-ui.{igz_domain}"
-
    if log_level := config.get("log_level"):
        import mlrun.utils.logger

mlrun/data_types/infer.py CHANGED
@@ -134,9 +134,9 @@ def get_df_stats(df, options, num_bins=None, sample_size=None):
     for col, values in df.describe(include="all", **kwargs).items():
         stats_dict = {}
         for stat, val in values.dropna().items():
-            if isinstance(val, (float, np.floating, np.float64)):
+            if isinstance(val, float | np.floating | np.float64):
                 stats_dict[stat] = float(val)
-            elif isinstance(val, (int, np.integer, np.int64)):
+            elif isinstance(val, int | np.integer | np.int64):
                 # boolean values are considered subclass of int
                 if isinstance(val, bool):
                     stats_dict[stat] = bool(val)
mlrun/datastore/__init__.py CHANGED
@@ -39,10 +39,11 @@ __all__ = [
 from urllib.parse import urlparse

 import fsspec
+import storey

 import mlrun.datastore.wasbfs
 from mlrun.datastore.datastore_profile import (
-    DatastoreProfileKafkaSource,
+    DatastoreProfileKafkaStream,
     DatastoreProfileKafkaTarget,
     DatastoreProfileV3io,
 )
@@ -58,7 +59,6 @@ from ..utils import logger
 from .base import DataItem
 from .datastore import StoreManager, in_memory_store, uri_to_ipython
 from .dbfs_store import DatabricksFileBugFixed, DatabricksFileSystemDisableCache
-from .s3 import parse_s3_bucket_and_key
 from .sources import (
     BigQuerySource,
     CSVSource,
@@ -74,7 +74,7 @@ from .store_resources import (
     parse_store_uri,
 )
 from .targets import CSVTarget, NoSqlTarget, ParquetTarget, StreamTarget
-from .utils import get_kafka_brokers_from_dict, parse_kafka_url
+from .utils import get_kafka_brokers_from_dict, parse_kafka_url, parse_s3_bucket_and_key

 store_manager = StoreManager()

@@ -122,7 +122,7 @@ def get_stream_pusher(stream_path: str, **kwargs):
     )
     if isinstance(
         datastore_profile,
-        (DatastoreProfileKafkaSource, DatastoreProfileKafkaTarget),
+        DatastoreProfileKafkaStream | DatastoreProfileKafkaTarget,
     ):
         attributes = datastore_profile.attributes()
         brokers = attributes.pop("brokers", None)
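
get_stream_pusher now matches the renamed DatastoreProfileKafkaStream profile (previously DatastoreProfileKafkaSource). A sketch of pushing through a registered profile, assuming the renamed class keeps its predecessor's brokers/topics fields; names and broker addresses are placeholders:

import mlrun
from mlrun.datastore.datastore_profile import (
    DatastoreProfileKafkaStream,
    register_temporary_client_datastore_profile,
)

profile = DatastoreProfileKafkaStream(
    name="my-kafka", brokers=["broker-1:9092"], topics=["events"]
)
register_temporary_client_datastore_profile(profile)
pusher = mlrun.datastore.get_stream_pusher("ds://my-kafka")
pusher.push({"event": "hello"})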
@@ -168,11 +168,12 @@ def get_stream_pusher(stream_path: str, **kwargs):
         raise ValueError(f"unsupported stream path {stream_path}")


-class _DummyStream:
+class _DummyStream(storey.MapClass):
     """stream emulator for tests and debug"""

     def __init__(self, event_list=None, **kwargs):
         self.event_list = event_list or []
+        super().__init__(**kwargs)

     def push(self, data, **kwargs):
         if not isinstance(data, list):
@@ -180,3 +181,9 @@ class _DummyStream:
         for item in data:
             logger.info(f"dummy stream got event: {item}, kwargs={kwargs}")
             self.event_list.append(item)
+
+    def do(self, event):
+        if not isinstance(event, list):
+            event = [event]
+        for item in event:
+            self.event_list.append(item)
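
With _DummyStream now subclassing storey.MapClass and implementing do(), it can be placed inside a storey flow like any other step. A minimal sketch of the pattern (plain storey, not mlrun code):

import storey

collected = []


class CollectStep(storey.MapClass):
    # like _DummyStream.do() above, record each event body and pass it on
    def do(self, event):
        collected.append(event)
        return event


controller = storey.build_flow(
    [storey.SyncEmitSource(), CollectStep(), storey.Complete()]
).run()
controller.emit("hello")
controller.terminate()
controller.await_termination()
print(collected)  # ['hello']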
mlrun/datastore/azure_blob.py CHANGED
@@ -1,4 +1,4 @@
-# Copyright 2023 Iguazio
+# Copyright 2025 Iguazio
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -12,6 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

+import contextlib
 import time
 from pathlib import Path
 from typing import Optional
@@ -30,6 +31,40 @@ from .base import DataStore, FileStats, make_datastore_schema_sanitizer


 class AzureBlobStore(DataStore):
+    """
+    Azure Blob Storage datastore implementation.
+
+    Supports multiple URL schemas: az://, wasbs://, wasb://
+
+    Supported Connection String Formats:
+    ====================================
+
+    1. Account Key (Standard):
+       "DefaultEndpointsProtocol=https;AccountName=<account>;AccountKey=<key>;EndpointSuffix=core.windows.net"
+
+    2. SAS Token:
+       "BlobEndpoint=https://<account>.blob.core.windows.net/;SharedAccessSignature=<sas_token>"
+
+    3. Minimal BlobEndpoint:
+       "BlobEndpoint=https://<account>.blob.core.windows.net/;AccountName=<account>;AccountKey=<key>"
+
+    4. Custom Domain:
+       "BlobEndpoint=https://<account>.mydomain.com/;AccountName=<account>;AccountKey=<key>"
+
+    5. China/Government Cloud:
+       "DefaultEndpointsProtocol=https;AccountName=<account>;AccountKey=<key>;EndpointSuffix=core.chinacloudapi.cn"
+
+    6. Full Service Endpoints with SAS:
+       "BlobEndpoint=https://<account>.blob.core.windows.net/;QueueEndpoint=...;SharedAccessSignature=<sas>"
+
+    Authentication Methods:
+    ======================
+    - Account Key (connection_string or storage_options)
+    - SAS Token (connection_string or storage_options)
+    - OAuth/Azure AD (storage_options: client_id, client_secret, tenant_id)
+
+    """
+
     using_bucket = True
     max_concurrency = 100
     max_blocksize = 1024 * 1024 * 4
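
The new docstring enumerates the accepted connection-string shapes. A hedged usage sketch passing one of them as a datastore secret; the account, key, container, and path are placeholders:

import mlrun

secrets = {
    "connection_string": (
        "DefaultEndpointsProtocol=https;AccountName=myaccount;"
        "AccountKey=<key>;EndpointSuffix=core.windows.net"
    )
}
item = mlrun.get_dataitem("az://mycontainer/path/data.csv", secrets=secrets)
df = item.as_df()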
@@ -40,6 +75,12 @@ class AzureBlobStore(DataStore):
     def __init__(
         self, parent, schema, name, endpoint="", secrets: Optional[dict] = None
     ):
+        # Extract container from WASBS endpoint before calling super()
+        self._container_from_endpoint = None
+        if schema in ["wasbs", "wasb"] and endpoint and "@" in endpoint:
+            # Handle container@host format
+            self._container_from_endpoint, endpoint = endpoint.split("@", 1)
+
         super().__init__(parent, name, schema, endpoint, secrets=secrets)
         self._service_client = None
         self._storage_options = None
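
The container@host endpoint form handled here comes from wasbs URLs. A tiny illustration of the split, with hypothetical names:

endpoint = "mycontainer@myaccount.blob.core.windows.net"
container, host = endpoint.split("@", 1)
assert container == "mycontainer"
assert host == "myaccount.blob.core.windows.net"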
@@ -67,6 +108,34 @@ class AzureBlobStore(DataStore):
             or self._get_secret_or_env("AZURE_STORAGE_SAS_TOKEN"),
             credential=self._get_secret_or_env("credential"),
         )
+        # Use container extracted from WASBS endpoint during initialization
+        if self._container_from_endpoint:
+            res["container"] = self._container_from_endpoint
+
+        # For az:// URLs, endpoint contains the container name
+        if not res.get("container") and self.kind in ["az"]:
+            if container := getattr(self, "endpoint", None):
+                res["container"] = container
+
+        # Last resort: For wasbs:// without container, check if connection string has BlobEndpoint with container
+        if not res.get("container") and self.kind in ["wasbs", "wasb"]:
+            connection_string = res.get("connection_string")
+            if connection_string and "BlobEndpoint=" in connection_string:
+                # Try to extract container from BlobEndpoint URL
+                for part in connection_string.split(";"):
+                    if part.startswith("BlobEndpoint="):
+                        blob_endpoint = part.split("=", 1)[1]
+                        # Parse URL to get path component
+                        from urllib.parse import urlparse

+                        parsed = urlparse(blob_endpoint)
+                        if parsed.path and parsed.path.strip("/"):
+                            # Extract first path segment as container
+                            path_parts = parsed.path.strip("/").split("/")
+                            if path_parts[0]:
+                                res["container"] = path_parts[0]
+                        break
+
         self._storage_options = self._sanitize_options(res)
         return self._storage_options

@@ -165,7 +234,18 @@
         # if called without passing dataitem - like in fset.purge_targets,
         # key will include schema.
         if not schema:
-            key = Path(self.endpoint, key).as_posix()
+            # For wasbs/wasb, the filesystem is scoped to the container, so we need to use
+            # the container name as the base path, not the hostname endpoint.
+            # For az://, endpoint already contains the container name.
+            if self.kind in ["wasbs", "wasb"]:
+                container = self.storage_options.get("container")
+                if container:
+                    key = Path(container, key).as_posix()
+                else:
+                    # If no container found, use endpoint (might be hostname, but better than nothing)
+                    key = Path(self.endpoint, key).as_posix()
+            else:
+                key = Path(self.endpoint, key).as_posix()
         return key

     def upload(self, key, src_path):
@@ -229,18 +309,27 @@
         st = self.storage_options
         service = "blob"
         primary_url = None
-        if st.get("connection_string"):
+
+        # Parse connection string (fills account_name/account_key or SAS)
+        connection_string = st.get("connection_string")
+        if connection_string:
             primary_url, _, parsed_credential = parse_connection_str(
-                st.get("connection_string"), credential=None, service=service
+                connection_string, credential=None, service=service
             )
-            for key in ["account_name", "account_key"]:
-                parsed_value = parsed_credential.get(key)
-                if parsed_value:
-                    if key in st and st[key] != parsed_value:
+
+            if isinstance(parsed_credential, str):
+                # SharedAccessSignature as raw string
+                parsed_credential = {"sas_token": parsed_credential}
+
+            for key in ["account_name", "account_key", "sas_token"]:
+                if parsed_value := parsed_credential.get(key):
+                    # Only check for conflicts if storage options has a non-empty value for this key
+                    existing_value = st.get(key)
+                    if existing_value and existing_value != parsed_value:
                         if key == "account_name":
                             raise mlrun.errors.MLRunInvalidArgumentError(
-                                f"Storage option for '{key}' is '{st[key]}',\
-                                which does not match corresponding connection string '{parsed_value}'"
+                                f"Storage option for '{key}' is '{existing_value}', "
+                                f"which does not match corresponding connection string '{parsed_value}'"
                             )
                         else:
                             raise mlrun.errors.MLRunInvalidArgumentError(
@@ -249,57 +338,83 @@
                     st[key] = parsed_value

         account_name = st.get("account_name")
+        # Derive host (prefer connection string primary URL)
         if primary_url:
             if primary_url.startswith("http://"):
                 primary_url = primary_url[len("http://") :]
             if primary_url.startswith("https://"):
                 primary_url = primary_url[len("https://") :]
-            host = primary_url
+            # Remove any path components from the host
+            host = primary_url.split("/")[0]
         elif account_name:
             host = f"{account_name}.{service}.core.windows.net"
         else:
+            # nothing to configure yet
             return res

-        if "account_key" in st:
+        host = host.rstrip("/")
+
+        # Account key (optional; WASB supports it)
+        if "account_key" in st and st["account_key"]:
             res[f"spark.hadoop.fs.azure.account.key.{host}"] = st["account_key"]

-        if "client_secret" in st or "client_id" in st or "tenant_id" in st:
-            res[f"spark.hadoop.fs.azure.account.auth.type.{host}"] = "OAuth"
-            res[f"spark.hadoop.fs.azure.account.oauth.provider.type.{host}"] = (
-                "org.apache.hadoop.fs.azurebfs.oauth2.ClientCredsTokenProvider"
-            )
-            if "client_id" in st:
-                res[f"spark.hadoop.fs.azure.account.oauth2.client.id.{host}"] = st[
-                    "client_id"
-                ]
-            if "client_secret" in st:
-                res[f"spark.hadoop.fs.azure.account.oauth2.client.secret.{host}"] = st[
-                    "client_secret"
-                ]
-            if "tenant_id" in st:
-                tenant_id = st["tenant_id"]
-                res[f"spark.hadoop.fs.azure.account.oauth2.client.endpoint.{host}"] = (
-                    f"https://login.microsoftonline.com/{tenant_id}/oauth2/token"
-                )
+        # --- WASB + SAS (container-scoped key; no provider classes needed) ---
+        if "sas_token" in st and st["sas_token"]:
+            sas = st["sas_token"].lstrip("?")

-        if "sas_token" in st:
-            res[f"spark.hadoop.fs.azure.account.auth.type.{host}"] = "SAS"
-            res[f"spark.hadoop.fs.azure.sas.token.provider.type.{host}"] = (
-                "org.apache.hadoop.fs.azurebfs.sas.FixedSASTokenProvider"
-            )
-            res[f"spark.hadoop.fs.azure.sas.fixed.token.{host}"] = st["sas_token"]
+            container = st.get("container")
+
+            if container:
+                # fs.azure.sas.<container>.<account>.blob.core.windows.net = <sas>
+                res[f"spark.hadoop.fs.azure.sas.{container}.{host}"] = sas
+
+            else:
+                raise mlrun.errors.MLRunInvalidArgumentError(
+                    "Container name is required for WASB SAS. "
+                    "Set self.endpoint or storage_options['container']."
+                )
         return res

     @property
     def spark_url(self):
-        spark_options = self.get_spark_options()
-        url = f"wasbs://{self.endpoint}"
-        prefix = "spark.hadoop.fs.azure.account.key."
-        if spark_options:
-            for key in spark_options:
-                if key.startswith(prefix):
-                    account_key = key[len(prefix) :]
-                    if not url.endswith(account_key):
-                        url += f"@{account_key}"
-                    break
-        return url
+        # Build: wasbs://<container>@<host>
+        st = self.storage_options
+        service = "blob"
+
+        container = st.get("container")
+
+        if not container:
+            raise mlrun.errors.MLRunInvalidArgumentError(
+                "Container name is required to build the WASB URL. "
+                "Set storage_options['container'] or use datastore profile with container specified."
+            )
+
+        # Prefer host from connection string; else synthesize from account_name
+        host = None
+        account_name = st.get("account_name")
+        connection_string = st.get("connection_string")
+
+        if connection_string:
+            with contextlib.suppress(Exception):
+                primary_url, _, _ = parse_connection_str(
+                    connection_string, credential=None, service=service
+                )
+                if primary_url.startswith("http://"):
+                    primary_url = primary_url[len("http://") :]
+                if primary_url.startswith("https://"):
+                    primary_url = primary_url[len("https://") :]
+                # Remove any path components from the host
+                host = primary_url.split("/")[0].rstrip("/")
+        if not host and account_name:
+            host = f"{account_name}.{service}.core.windows.net"
+
+        # For wasbs:// URLs where endpoint is already the host
+        if not host and self.kind in ["wasbs", "wasb"] and hasattr(self, "endpoint"):
+            host = getattr(self, "endpoint", None)
+
+        if not host:
+            raise mlrun.errors.MLRunInvalidArgumentError(
+                "account_name is required (or provide a connection_string) to build the WASB URL."
+            )
+
+        return f"wasbs://{container}@{host}"