mlrun 1.10.0rc18__py3-none-any.whl → 1.11.0rc16__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of mlrun might be problematic.
- mlrun/__init__.py +24 -3
- mlrun/__main__.py +0 -4
- mlrun/artifacts/dataset.py +2 -2
- mlrun/artifacts/document.py +6 -1
- mlrun/artifacts/llm_prompt.py +21 -15
- mlrun/artifacts/model.py +3 -3
- mlrun/artifacts/plots.py +1 -1
- mlrun/{model_monitoring/db/tsdb/tdengine → auth}/__init__.py +2 -3
- mlrun/auth/nuclio.py +89 -0
- mlrun/auth/providers.py +429 -0
- mlrun/auth/utils.py +415 -0
- mlrun/common/constants.py +14 -0
- mlrun/common/model_monitoring/helpers.py +123 -0
- mlrun/common/runtimes/constants.py +28 -0
- mlrun/common/schemas/__init__.py +14 -3
- mlrun/common/schemas/alert.py +2 -2
- mlrun/common/schemas/api_gateway.py +3 -0
- mlrun/common/schemas/auth.py +12 -10
- mlrun/common/schemas/client_spec.py +4 -0
- mlrun/common/schemas/constants.py +25 -0
- mlrun/common/schemas/frontend_spec.py +1 -8
- mlrun/common/schemas/function.py +34 -0
- mlrun/common/schemas/hub.py +33 -20
- mlrun/common/schemas/model_monitoring/__init__.py +2 -1
- mlrun/common/schemas/model_monitoring/constants.py +12 -15
- mlrun/common/schemas/model_monitoring/functions.py +13 -4
- mlrun/common/schemas/model_monitoring/model_endpoints.py +11 -0
- mlrun/common/schemas/pipeline.py +1 -1
- mlrun/common/schemas/secret.py +17 -2
- mlrun/common/secrets.py +95 -1
- mlrun/common/types.py +10 -10
- mlrun/config.py +69 -19
- mlrun/data_types/infer.py +2 -2
- mlrun/datastore/__init__.py +12 -5
- mlrun/datastore/azure_blob.py +162 -47
- mlrun/datastore/base.py +274 -10
- mlrun/datastore/datastore.py +7 -2
- mlrun/datastore/datastore_profile.py +84 -22
- mlrun/datastore/model_provider/huggingface_provider.py +225 -41
- mlrun/datastore/model_provider/mock_model_provider.py +87 -0
- mlrun/datastore/model_provider/model_provider.py +206 -74
- mlrun/datastore/model_provider/openai_provider.py +226 -66
- mlrun/datastore/s3.py +39 -18
- mlrun/datastore/sources.py +1 -1
- mlrun/datastore/store_resources.py +4 -4
- mlrun/datastore/storeytargets.py +17 -12
- mlrun/datastore/targets.py +1 -1
- mlrun/datastore/utils.py +25 -6
- mlrun/datastore/v3io.py +1 -1
- mlrun/db/base.py +63 -32
- mlrun/db/httpdb.py +373 -153
- mlrun/db/nopdb.py +54 -21
- mlrun/errors.py +4 -2
- mlrun/execution.py +66 -25
- mlrun/feature_store/api.py +1 -1
- mlrun/feature_store/common.py +1 -1
- mlrun/feature_store/feature_vector_utils.py +1 -1
- mlrun/feature_store/steps.py +8 -6
- mlrun/frameworks/_common/utils.py +3 -3
- mlrun/frameworks/_dl_common/loggers/logger.py +1 -1
- mlrun/frameworks/_dl_common/loggers/tensorboard_logger.py +2 -1
- mlrun/frameworks/_ml_common/loggers/mlrun_logger.py +1 -1
- mlrun/frameworks/_ml_common/utils.py +2 -1
- mlrun/frameworks/auto_mlrun/auto_mlrun.py +4 -3
- mlrun/frameworks/lgbm/mlrun_interfaces/mlrun_interface.py +2 -1
- mlrun/frameworks/onnx/dataset.py +2 -1
- mlrun/frameworks/onnx/mlrun_interface.py +2 -1
- mlrun/frameworks/pytorch/callbacks/logging_callback.py +5 -4
- mlrun/frameworks/pytorch/callbacks/mlrun_logging_callback.py +2 -1
- mlrun/frameworks/pytorch/callbacks/tensorboard_logging_callback.py +2 -1
- mlrun/frameworks/pytorch/utils.py +2 -1
- mlrun/frameworks/sklearn/metric.py +2 -1
- mlrun/frameworks/tf_keras/callbacks/logging_callback.py +5 -4
- mlrun/frameworks/tf_keras/callbacks/mlrun_logging_callback.py +2 -1
- mlrun/frameworks/tf_keras/callbacks/tensorboard_logging_callback.py +2 -1
- mlrun/hub/__init__.py +52 -0
- mlrun/hub/base.py +142 -0
- mlrun/hub/module.py +172 -0
- mlrun/hub/step.py +113 -0
- mlrun/k8s_utils.py +105 -16
- mlrun/launcher/base.py +15 -7
- mlrun/launcher/local.py +4 -1
- mlrun/model.py +14 -4
- mlrun/model_monitoring/__init__.py +0 -1
- mlrun/model_monitoring/api.py +65 -28
- mlrun/model_monitoring/applications/__init__.py +1 -1
- mlrun/model_monitoring/applications/base.py +299 -128
- mlrun/model_monitoring/applications/context.py +2 -4
- mlrun/model_monitoring/controller.py +132 -58
- mlrun/model_monitoring/db/_schedules.py +38 -29
- mlrun/model_monitoring/db/_stats.py +6 -16
- mlrun/model_monitoring/db/tsdb/__init__.py +9 -7
- mlrun/model_monitoring/db/tsdb/base.py +29 -9
- mlrun/model_monitoring/db/tsdb/preaggregate.py +234 -0
- mlrun/model_monitoring/db/tsdb/stream_graph_steps.py +63 -0
- mlrun/model_monitoring/db/tsdb/timescaledb/queries/timescaledb_metrics_queries.py +414 -0
- mlrun/model_monitoring/db/tsdb/timescaledb/queries/timescaledb_predictions_queries.py +376 -0
- mlrun/model_monitoring/db/tsdb/timescaledb/queries/timescaledb_results_queries.py +590 -0
- mlrun/model_monitoring/db/tsdb/timescaledb/timescaledb_connection.py +434 -0
- mlrun/model_monitoring/db/tsdb/timescaledb/timescaledb_connector.py +541 -0
- mlrun/model_monitoring/db/tsdb/timescaledb/timescaledb_operations.py +808 -0
- mlrun/model_monitoring/db/tsdb/timescaledb/timescaledb_schema.py +502 -0
- mlrun/model_monitoring/db/tsdb/timescaledb/timescaledb_stream.py +163 -0
- mlrun/model_monitoring/db/tsdb/timescaledb/timescaledb_stream_graph_steps.py +60 -0
- mlrun/model_monitoring/db/tsdb/timescaledb/utils/timescaledb_dataframe_processor.py +141 -0
- mlrun/model_monitoring/db/tsdb/timescaledb/utils/timescaledb_query_builder.py +585 -0
- mlrun/model_monitoring/db/tsdb/timescaledb/writer_graph_steps.py +73 -0
- mlrun/model_monitoring/db/tsdb/v3io/stream_graph_steps.py +20 -9
- mlrun/model_monitoring/db/tsdb/v3io/v3io_connector.py +235 -51
- mlrun/model_monitoring/features_drift_table.py +2 -1
- mlrun/model_monitoring/helpers.py +30 -6
- mlrun/model_monitoring/stream_processing.py +34 -28
- mlrun/model_monitoring/writer.py +224 -4
- mlrun/package/__init__.py +2 -1
- mlrun/platforms/__init__.py +0 -43
- mlrun/platforms/iguazio.py +8 -4
- mlrun/projects/operations.py +17 -11
- mlrun/projects/pipelines.py +2 -2
- mlrun/projects/project.py +187 -123
- mlrun/run.py +95 -21
- mlrun/runtimes/__init__.py +2 -186
- mlrun/runtimes/base.py +103 -25
- mlrun/runtimes/constants.py +225 -0
- mlrun/runtimes/daskjob.py +5 -2
- mlrun/runtimes/databricks_job/databricks_runtime.py +2 -1
- mlrun/runtimes/local.py +5 -2
- mlrun/runtimes/mounts.py +20 -2
- mlrun/runtimes/nuclio/__init__.py +12 -7
- mlrun/runtimes/nuclio/api_gateway.py +36 -6
- mlrun/runtimes/nuclio/application/application.py +339 -40
- mlrun/runtimes/nuclio/function.py +222 -72
- mlrun/runtimes/nuclio/serving.py +132 -42
- mlrun/runtimes/pod.py +213 -21
- mlrun/runtimes/utils.py +49 -9
- mlrun/secrets.py +99 -14
- mlrun/serving/__init__.py +2 -0
- mlrun/serving/remote.py +84 -11
- mlrun/serving/routers.py +26 -44
- mlrun/serving/server.py +138 -51
- mlrun/serving/serving_wrapper.py +6 -2
- mlrun/serving/states.py +997 -283
- mlrun/serving/steps.py +62 -0
- mlrun/serving/system_steps.py +149 -95
- mlrun/serving/v2_serving.py +9 -10
- mlrun/track/trackers/mlflow_tracker.py +29 -31
- mlrun/utils/helpers.py +292 -94
- mlrun/utils/http.py +9 -2
- mlrun/utils/notifications/notification/base.py +18 -0
- mlrun/utils/notifications/notification/git.py +3 -5
- mlrun/utils/notifications/notification/mail.py +39 -16
- mlrun/utils/notifications/notification/slack.py +2 -4
- mlrun/utils/notifications/notification/webhook.py +2 -5
- mlrun/utils/notifications/notification_pusher.py +3 -3
- mlrun/utils/version/version.json +2 -2
- mlrun/utils/version/version.py +3 -4
- {mlrun-1.10.0rc18.dist-info → mlrun-1.11.0rc16.dist-info}/METADATA +63 -74
- {mlrun-1.10.0rc18.dist-info → mlrun-1.11.0rc16.dist-info}/RECORD +161 -143
- mlrun/api/schemas/__init__.py +0 -259
- mlrun/db/auth_utils.py +0 -152
- mlrun/model_monitoring/db/tsdb/tdengine/schemas.py +0 -344
- mlrun/model_monitoring/db/tsdb/tdengine/stream_graph_steps.py +0 -75
- mlrun/model_monitoring/db/tsdb/tdengine/tdengine_connection.py +0 -281
- mlrun/model_monitoring/db/tsdb/tdengine/tdengine_connector.py +0 -1266
- {mlrun-1.10.0rc18.dist-info → mlrun-1.11.0rc16.dist-info}/WHEEL +0 -0
- {mlrun-1.10.0rc18.dist-info → mlrun-1.11.0rc16.dist-info}/entry_points.txt +0 -0
- {mlrun-1.10.0rc18.dist-info → mlrun-1.11.0rc16.dist-info}/licenses/LICENSE +0 -0
- {mlrun-1.10.0rc18.dist-info → mlrun-1.11.0rc16.dist-info}/top_level.txt +0 -0
mlrun/config.py
CHANGED

@@ -40,6 +40,7 @@ import yaml

 import mlrun.common.constants
 import mlrun.common.schemas
+import mlrun.common.types
 import mlrun.errors

 env_prefix = "MLRUN_"

@@ -66,7 +67,6 @@ default_config = {
     "nuclio_version": "",
     "default_nuclio_runtime": "python:3.11",
     "nest_asyncio_enabled": "",  # enable import of nest_asyncio for corner cases with old jupyter, set "1"
-    "ui_url": "",  # remote/external mlrun UI url (for hyperlinks) (This is deprecated in favor of the ui block)
     "remote_host": "",
     "api_base_version": "v1",
     "version": "",  # will be set to current version

@@ -85,7 +85,9 @@ default_config = {
     "kfp_image": "mlrun/mlrun-kfp",  # image to use for KFP runner
     "dask_kfp_image": "mlrun/mlrun",  # image to use for dask KFP runner
     "igz_version": "",  # the version of the iguazio system the API is running on
-    "iguazio_api_url": "",  # the url to iguazio api
+    "iguazio_api_url": "",  # the url to iguazio api (internal / external access with priority to internal)
+    "iguazio_api_url_ingress": "",  # the url to iguazio api ingress (for external access)
+    "iguazio_api_ssl_verify": True,  # verify ssl certificate of iguazio api
     "spark_app_image": "",  # image to use for spark operator app runtime
     "spark_app_image_tag": "",  # image tag to use for spark operator app runtime
     "spark_history_server_path": "",  # spark logs directory for spark history server

@@ -107,7 +109,11 @@ default_config = {
     "submit_timeout": "280",  # timeout when submitting a new k8s resource
     # runtimes cleanup interval in seconds
     "runtimes_cleanup_interval": "300",
-
+    # disabled by default due to an internal bug in serving functions
+    # relying on a background task to hold the status for its model endpoints
+    # TODO: need to refine what/when we can delete the background tasks
+    # e.g: use labels or naming convention.
+    "background_task_cleanup_interval": "0",
     "background_task_max_age": "21600",  # 6 hours in seconds
     "monitoring": {
         "runs": {

@@ -251,7 +257,8 @@ default_config = {
         },
         "runtimes": {
             "dask": "600",
-
+            # cluster start might take some time in case k8s needs to spin up new nodes
+            "dask_cluster_start": "600",
         },
         "push_notifications": "60",
     },

@@ -299,6 +306,7 @@ default_config = {
         "application": {
             "default_sidecar_internal_port": 8050,
             "default_authentication_mode": mlrun.common.schemas.APIGatewayAuthenticationMode.none,
+            "default_worker_number": 10000,
         },
     },
     # TODO: function defaults should be moved to the function spec config above

@@ -416,11 +424,17 @@ default_config = {
            "allow_local_run": False,
        },
        "authentication": {
-            "mode": "none",  # one of none, basic, bearer, iguazio
+            "mode": "none",  # one of none, basic, bearer, iguazio, iguazio-v4
            "basic": {"username": "", "password": ""},
            "bearer": {"token": ""},
            "iguazio": {
                "session_verification_endpoint": "data_sessions/verifications/app_service",
+                "authentication_endpoint": "api/v1/authentication/refresh-access-token",
+            },
+            "service_account": {
+                # the following are the default values for k8s service accounts, but may be changed per deployment
+                "token_expiration_seconds": 600,
+                "token_path": "/var/run/secrets/kubernetes.io/serviceaccount/token",
            },
        },
        "nuclio": {

@@ -475,6 +489,10 @@ default_config = {
        },
        "authorization": {
            "mode": "none",  # one of none, opa
+            "namespaces": {
+                "resources": "",
+                "mgmt": "mgmt",
+            },
            "opa": {
                "address": "",
                "request_timeout": 10,

@@ -644,6 +662,13 @@ default_config = {
                "max_replicas": 1,
            },
        },
+        "writer_graph": {
+            "max_events": 1000,
+            "flush_after_seconds": 30,
+            "writer_version": "v2",  # v1 is the sync version while v2 is async
+            "parquet_batching_max_events": 10,
+            "parquet_batching_timeout_secs": 30,
+        },
        # Store prefixes are used to handle model monitoring storing policies based on project and kind, such as events,
        # stream, and endpoints.
        "store_prefixes": {

@@ -657,6 +682,15 @@ default_config = {
        "parquet_batching_max_events": 10_000,
        "parquet_batching_timeout_secs": timedelta(minutes=1).total_seconds(),
        "model_endpoint_creation_check_period": 15,
+        # TSDB (TimescaleDB) configuration
+        "tsdb": {
+            # When True, automatically create/generate database name using system_id if not explicitly
+            # specified in the connection string. When False, use the database from connection string as-is.
+            "auto_create_database": True,
+            # Connection pool timeout in seconds. This is the maximum time to wait for a connection
+            # from the pool before raising an error.
+            "connection_pool_timeout": 120,
+        },
    },
    "secret_stores": {
        # Use only in testing scenarios (such as integration tests) to avoid using k8s for secrets (will use in-memory

@@ -712,9 +746,8 @@ default_config = {
        # Set false to avoid creating a global source (for example in a dark site)
        "create": True,
        "name": "default",
-        "description": "MLRun
+        "description": "MLRun hub",
        "url": "https://mlrun.github.io/marketplace",
-        "object_type": "functions",
        "channel": "master",
    },
 },

@@ -856,6 +889,19 @@ default_config = {
            "enabled": False,
            "request_timeout": 5,
        },
+        "auth_with_oauth_token": {
+            "enabled": False,
+            "request_timeout": 5,
+            "refresh_threshold": 0.75,
+            # Default is empty. automatically set based on configuration (end client vs jupyter vs runtime, etc)
+            # can be set manually set using envvars
+            "token_file": "",
+            # Default is empty because if set, searches for the specific token name in the file, if empty, it will look
+            # for a token named "default", if "default" does not exist, it will use the first token in the file
+            "token_name": "",
+        },
+        # a runtime computed value. Do not set it manually.
+        "auth_token_endpoint": "",
        "services": {
            # The running service name. One of: "api", "alerts"
            "service_name": "api",

@@ -953,7 +999,7 @@ class Config:
                try:
                    config_value.update(value)
                except AttributeError as exc:
-                    if not isinstance(config_value,
+                    if not isinstance(config_value, dict | Config):
                        raise ValueError(
                            f"Can not update `{key}` config. "
                            f"Expected a configuration but received {type(value)}"

@@ -996,9 +1042,9 @@ class Config:
        )

    @staticmethod
-    def
+    def get_default_hub_source_url_prefix(object_type) -> str:
        default_source = config.hub.default_source
-        return f"{default_source.url}/{
+        return f"{default_source.url}/{object_type}/{default_source.channel}/"

    @staticmethod
    def decode_base64_config_and_load_to_object(

@@ -1268,10 +1314,7 @@ class Config:

    @staticmethod
    def resolve_ui_url():
-
-        # since the config class is used in a "recursive" way, we can't use property like we used in other places
-        # since the property will need to be url, which exists in other structs as well
-        return config.ui.url or config.ui_url
+        return config.ui.url

    def is_api_running_on_k8s(self):
        # determine if the API service is attached to K8s cluster

@@ -1391,6 +1434,18 @@ class Config:
            ver in mlrun.mlconf.ce.mode for ver in ["lite", "full"]
        )

+    def is_iguazio_mode(self):
+        return (
+            mlrun.mlconf.httpdb.authentication.mode
+            == mlrun.common.types.AuthenticationMode.IGUAZIO
+        )
+
+    def is_iguazio_v4_mode(self):
+        return (
+            config.httpdb.authentication.mode
+            == mlrun.common.types.AuthenticationMode.IGUAZIO_V4
+        )
+
    def is_explicit_ack_enabled(self) -> bool:
        return self.httpdb.nuclio.explicit_ack == "enabled" and (
            not self.nuclio_version

@@ -1558,7 +1613,6 @@ def read_env(env=None, prefix=env_prefix):
        "https://mlrun-api.", "https://framesd."
    )

-    uisvc = env.get("MLRUN_UI_SERVICE_HOST")
    igz_domain = env.get("IGZ_NAMESPACE_DOMAIN")

    # workaround to try and detect IGZ domain

@@ -1584,10 +1638,6 @@ def read_env(env=None, prefix=env_prefix):
    if config.get("nuclio_dashboard_url") == "disabled":
        config["nuclio_dashboard_url"] = ""

-    if uisvc and not config.get("ui_url"):
-        if igz_domain:
-            config["ui_url"] = f"https://mlrun-ui.{igz_domain}"
-
    if log_level := config.get("log_level"):
        import mlrun.utils.logger
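The notable additions above are the `iguazio-v4` authentication mode, the service-account token defaults, and the two `is_iguazio*_mode` helpers. A minimal usage sketch, assuming MLRun's usual environment-variable mapping (the `MLRUN_` prefix with double underscores mirroring nesting in the config tree):

import mlrun

# Deployments typically select the mode via environment, e.g.
# MLRUN_HTTPDB__AUTHENTICATION__MODE=iguazio-v4
print(mlrun.mlconf.httpdb.authentication.mode)  # "none" by default

# The new helpers compare the configured mode against the AuthenticationMode enum
if mlrun.mlconf.is_iguazio_v4_mode():
    # per the defaults above, v4 refreshes access tokens via authentication_endpoint
    ...
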
mlrun/data_types/infer.py
CHANGED

@@ -134,9 +134,9 @@ def get_df_stats(df, options, num_bins=None, sample_size=None):
     for col, values in df.describe(include="all", **kwargs).items():
         stats_dict = {}
         for stat, val in values.dropna().items():
-            if isinstance(val,
+            if isinstance(val, float | np.floating | np.float64):
                 stats_dict[stat] = float(val)
-            elif isinstance(val,
+            elif isinstance(val, int | np.integer | np.int64):
                 # boolean values are considered subclass of int
                 if isinstance(val, bool):
                     stats_dict[stat] = bool(val)
mlrun/datastore/__init__.py
CHANGED

@@ -39,10 +39,11 @@ __all__ = [
 from urllib.parse import urlparse

 import fsspec
+import storey

 import mlrun.datastore.wasbfs
 from mlrun.datastore.datastore_profile import (
-
+    DatastoreProfileKafkaStream,
     DatastoreProfileKafkaTarget,
     DatastoreProfileV3io,
 )

@@ -58,7 +59,6 @@ from ..utils import logger
 from .base import DataItem
 from .datastore import StoreManager, in_memory_store, uri_to_ipython
 from .dbfs_store import DatabricksFileBugFixed, DatabricksFileSystemDisableCache
-from .s3 import parse_s3_bucket_and_key
 from .sources import (
     BigQuerySource,
     CSVSource,

@@ -74,7 +74,7 @@ from .store_resources import (
     parse_store_uri,
 )
 from .targets import CSVTarget, NoSqlTarget, ParquetTarget, StreamTarget
-from .utils import get_kafka_brokers_from_dict, parse_kafka_url
+from .utils import get_kafka_brokers_from_dict, parse_kafka_url, parse_s3_bucket_and_key

 store_manager = StoreManager()

@@ -122,7 +122,7 @@ def get_stream_pusher(stream_path: str, **kwargs):
     )
     if isinstance(
         datastore_profile,
-
+        DatastoreProfileKafkaStream | DatastoreProfileKafkaTarget,
     ):
         attributes = datastore_profile.attributes()
         brokers = attributes.pop("brokers", None)

@@ -168,11 +168,12 @@ def get_stream_pusher(stream_path: str, **kwargs):
     raise ValueError(f"unsupported stream path {stream_path}")


-class _DummyStream:
+class _DummyStream(storey.MapClass):
     """stream emulator for tests and debug"""

     def __init__(self, event_list=None, **kwargs):
         self.event_list = event_list or []
+        super().__init__(**kwargs)

     def push(self, data, **kwargs):
         if not isinstance(data, list):

@@ -180,3 +181,9 @@ class _DummyStream:
         for item in data:
             logger.info(f"dummy stream got event: {item}, kwargs={kwargs}")
             self.event_list.append(item)
+
+    def do(self, event):
+        if not isinstance(event, list):
+            event = [event]
+        for item in event:
+            self.event_list.append(item)
mlrun/datastore/azure_blob.py
CHANGED

@@ -1,4 +1,4 @@
-# Copyright
+# Copyright 2025 Iguazio
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.

@@ -12,6 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

+import contextlib
 import time
 from pathlib import Path
 from typing import Optional

@@ -30,6 +31,40 @@ from .base import DataStore, FileStats, make_datastore_schema_sanitizer


 class AzureBlobStore(DataStore):
+    """
+    Azure Blob Storage datastore implementation.
+
+    Supports multiple URL schemas: az://, wasbs://, wasb://
+
+    Supported Connection String Formats:
+    ====================================
+
+    1. Account Key (Standard):
+       "DefaultEndpointsProtocol=https;AccountName=<account>;AccountKey=<key>;EndpointSuffix=core.windows.net"
+
+    2. SAS Token:
+       "BlobEndpoint=https://<account>.blob.core.windows.net/;SharedAccessSignature=<sas_token>"
+
+    3. Minimal BlobEndpoint:
+       "BlobEndpoint=https://<account>.blob.core.windows.net/;AccountName=<account>;AccountKey=<key>"
+
+    4. Custom Domain:
+       "BlobEndpoint=https://<account>.mydomain.com/;AccountName=<account>;AccountKey=<key>"
+
+    5. China/Government Cloud:
+       "DefaultEndpointsProtocol=https;AccountName=<account>;AccountKey=<key>;EndpointSuffix=core.chinacloudapi.cn"
+
+    6. Full Service Endpoints with SAS:
+       "BlobEndpoint=https://<account>.blob.core.windows.net/;QueueEndpoint=...;SharedAccessSignature=<sas>"
+
+    Authentication Methods:
+    ======================
+    - Account Key (connection_string or storage_options)
+    - SAS Token (connection_string or storage_options)
+    - OAuth/Azure AD (storage_options: client_id, client_secret, tenant_id)
+
+    """
+
     using_bucket = True
     max_concurrency = 100
     max_blocksize = 1024 * 1024 * 4

@@ -40,6 +75,12 @@ class AzureBlobStore(DataStore):
     def __init__(
         self, parent, schema, name, endpoint="", secrets: Optional[dict] = None
     ):
+        # Extract container from WASBS endpoint before calling super()
+        self._container_from_endpoint = None
+        if schema in ["wasbs", "wasb"] and endpoint and "@" in endpoint:
+            # Handle container@host format
+            self._container_from_endpoint, endpoint = endpoint.split("@", 1)
+
         super().__init__(parent, name, schema, endpoint, secrets=secrets)
         self._service_client = None
         self._storage_options = None

@@ -67,6 +108,34 @@ class AzureBlobStore(DataStore):
             or self._get_secret_or_env("AZURE_STORAGE_SAS_TOKEN"),
             credential=self._get_secret_or_env("credential"),
         )
+        # Use container extracted from WASBS endpoint during initialization
+        if self._container_from_endpoint:
+            res["container"] = self._container_from_endpoint
+
+        # For az:// URLs, endpoint contains the container name
+        if not res.get("container") and self.kind in ["az"]:
+            if container := getattr(self, "endpoint", None):
+                res["container"] = container
+
+        # Last resort: For wasbs:// without container, check if connection string has BlobEndpoint with container
+        if not res.get("container") and self.kind in ["wasbs", "wasb"]:
+            connection_string = res.get("connection_string")
+            if connection_string and "BlobEndpoint=" in connection_string:
+                # Try to extract container from BlobEndpoint URL
+                for part in connection_string.split(";"):
+                    if part.startswith("BlobEndpoint="):
+                        blob_endpoint = part.split("=", 1)[1]
+                        # Parse URL to get path component
+                        from urllib.parse import urlparse

+                        parsed = urlparse(blob_endpoint)
+                        if parsed.path and parsed.path.strip("/"):
+                            # Extract first path segment as container
+                            path_parts = parsed.path.strip("/").split("/")
+                            if path_parts[0]:
+                                res["container"] = path_parts[0]
+                        break
+
         self._storage_options = self._sanitize_options(res)
         return self._storage_options

@@ -165,7 +234,18 @@ class AzureBlobStore(DataStore):
         # if called without passing dataitem - like in fset.purge_targets,
         # key will include schema.
         if not schema:
-
+            # For wasbs/wasb, the filesystem is scoped to the container, so we need to use
+            # the container name as the base path, not the hostname endpoint.
+            # For az://, endpoint already contains the container name.
+            if self.kind in ["wasbs", "wasb"]:
+                container = self.storage_options.get("container")
+                if container:
+                    key = Path(container, key).as_posix()
+                else:
+                    # If no container found, use endpoint (might be hostname, but better than nothing)
+                    key = Path(self.endpoint, key).as_posix()
+            else:
+                key = Path(self.endpoint, key).as_posix()
         return key

     def upload(self, key, src_path):

@@ -229,18 +309,27 @@ class AzureBlobStore(DataStore):
         st = self.storage_options
         service = "blob"
         primary_url = None
-
+
+        # Parse connection string (fills account_name/account_key or SAS)
+        connection_string = st.get("connection_string")
+        if connection_string:
             primary_url, _, parsed_credential = parse_connection_str(
-
+                connection_string, credential=None, service=service
             )
-
-
-
-
+
+            if isinstance(parsed_credential, str):
+                # SharedAccessSignature as raw string
+                parsed_credential = {"sas_token": parsed_credential}
+
+            for key in ["account_name", "account_key", "sas_token"]:
+                if parsed_value := parsed_credential.get(key):
+                    # Only check for conflicts if storage options has a non-empty value for this key
+                    existing_value = st.get(key)
+                    if existing_value and existing_value != parsed_value:
                         if key == "account_name":
                             raise mlrun.errors.MLRunInvalidArgumentError(
-                                f"Storage option for '{key}' is '{
-
+                                f"Storage option for '{key}' is '{existing_value}', "
+                                f"which does not match corresponding connection string '{parsed_value}'"
                             )
                         else:
                             raise mlrun.errors.MLRunInvalidArgumentError(

@@ -249,57 +338,83 @@ class AzureBlobStore(DataStore):
                     st[key] = parsed_value

         account_name = st.get("account_name")
+        # Derive host (prefer connection string primary URL)
         if primary_url:
             if primary_url.startswith("http://"):
                 primary_url = primary_url[len("http://") :]
             if primary_url.startswith("https://"):
                 primary_url = primary_url[len("https://") :]
-
+            # Remove any path components from the host
+            host = primary_url.split("/")[0]
         elif account_name:
             host = f"{account_name}.{service}.core.windows.net"
         else:
+            # nothing to configure yet
             return res

-
+        host = host.rstrip("/")
+
+        # Account key (optional; WASB supports it)
+        if "account_key" in st and st["account_key"]:
             res[f"spark.hadoop.fs.azure.account.key.{host}"] = st["account_key"]

-
-
-
-                "org.apache.hadoop.fs.azurebfs.oauth2.ClientCredsTokenProvider"
-            )
-        if "client_id" in st:
-            res[f"spark.hadoop.fs.azure.account.oauth2.client.id.{host}"] = st[
-                "client_id"
-            ]
-        if "client_secret" in st:
-            res[f"spark.hadoop.fs.azure.account.oauth2.client.secret.{host}"] = st[
-                "client_secret"
-            ]
-        if "tenant_id" in st:
-            tenant_id = st["tenant_id"]
-            res[f"spark.hadoop.fs.azure.account.oauth2.client.endpoint.{host}"] = (
-                f"https://login.microsoftonline.com/{tenant_id}/oauth2/token"
-            )
+        # --- WASB + SAS (container-scoped key; no provider classes needed) ---
+        if "sas_token" in st and st["sas_token"]:
+            sas = st["sas_token"].lstrip("?")

-
-
-
-
-
-
+            container = st.get("container")
+
+            if container:
+                # fs.azure.sas.<container>.<account>.blob.core.windows.net = <sas>
+                res[f"spark.hadoop.fs.azure.sas.{container}.{host}"] = sas
+
+            else:
+                raise mlrun.errors.MLRunInvalidArgumentError(
+                    "Container name is required for WASB SAS. "
+                    "Set self.endpoint or storage_options['container']."
+                )
         return res

     @property
     def spark_url(self):
-
-
-
-
-
-
-
-
-
-
-
+        # Build: wasbs://<container>@<host>
+        st = self.storage_options
+        service = "blob"
+
+        container = st.get("container")
+
+        if not container:
+            raise mlrun.errors.MLRunInvalidArgumentError(
+                "Container name is required to build the WASB URL. "
+                "Set storage_options['container'] or use datastore profile with container specified."
+            )
+
+        # Prefer host from connection string; else synthesize from account_name
+        host = None
+        account_name = st.get("account_name")
+        connection_string = st.get("connection_string")
+
+        if connection_string:
+            with contextlib.suppress(Exception):
+                primary_url, _, _ = parse_connection_str(
+                    connection_string, credential=None, service=service
+                )
+                if primary_url.startswith("http://"):
+                    primary_url = primary_url[len("http://") :]
+                if primary_url.startswith("https://"):
+                    primary_url = primary_url[len("https://") :]
+                # Remove any path components from the host
+                host = primary_url.split("/")[0].rstrip("/")
+        if not host and account_name:
+            host = f"{account_name}.{service}.core.windows.net"
+
+        # For wasbs:// URLs where endpoint is already the host
+        if not host and self.kind in ["wasbs", "wasb"] and hasattr(self, "endpoint"):
+            host = getattr(self, "endpoint", None)
+
+        if not host:
+            raise mlrun.errors.MLRunInvalidArgumentError(
+                "account_name is required (or provide a connection_string) to build the WASB URL."
+            )
+
+        return f"wasbs://{container}@{host}"