mlrun 1.6.4rc8__py3-none-any.whl → 1.7.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of mlrun might be problematic.
- mlrun/__init__.py +11 -1
- mlrun/__main__.py +40 -122
- mlrun/alerts/__init__.py +15 -0
- mlrun/alerts/alert.py +248 -0
- mlrun/api/schemas/__init__.py +5 -4
- mlrun/artifacts/__init__.py +8 -3
- mlrun/artifacts/base.py +47 -257
- mlrun/artifacts/dataset.py +11 -192
- mlrun/artifacts/manager.py +79 -47
- mlrun/artifacts/model.py +31 -159
- mlrun/artifacts/plots.py +23 -380
- mlrun/common/constants.py +74 -1
- mlrun/common/db/sql_session.py +5 -5
- mlrun/common/formatters/__init__.py +21 -0
- mlrun/common/formatters/artifact.py +45 -0
- mlrun/common/formatters/base.py +113 -0
- mlrun/common/formatters/feature_set.py +33 -0
- mlrun/common/formatters/function.py +46 -0
- mlrun/common/formatters/pipeline.py +53 -0
- mlrun/common/formatters/project.py +51 -0
- mlrun/common/formatters/run.py +29 -0
- mlrun/common/helpers.py +12 -3
- mlrun/common/model_monitoring/helpers.py +9 -5
- mlrun/{runtimes → common/runtimes}/constants.py +37 -9
- mlrun/common/schemas/__init__.py +31 -5
- mlrun/common/schemas/alert.py +202 -0
- mlrun/common/schemas/api_gateway.py +196 -0
- mlrun/common/schemas/artifact.py +25 -4
- mlrun/common/schemas/auth.py +16 -5
- mlrun/common/schemas/background_task.py +1 -1
- mlrun/common/schemas/client_spec.py +4 -2
- mlrun/common/schemas/common.py +7 -4
- mlrun/common/schemas/constants.py +3 -0
- mlrun/common/schemas/feature_store.py +74 -44
- mlrun/common/schemas/frontend_spec.py +15 -7
- mlrun/common/schemas/function.py +12 -1
- mlrun/common/schemas/hub.py +11 -18
- mlrun/common/schemas/memory_reports.py +2 -2
- mlrun/common/schemas/model_monitoring/__init__.py +20 -4
- mlrun/common/schemas/model_monitoring/constants.py +123 -42
- mlrun/common/schemas/model_monitoring/grafana.py +13 -9
- mlrun/common/schemas/model_monitoring/model_endpoints.py +101 -54
- mlrun/common/schemas/notification.py +71 -14
- mlrun/common/schemas/object.py +2 -2
- mlrun/{model_monitoring/controller_handler.py → common/schemas/pagination.py} +9 -12
- mlrun/common/schemas/pipeline.py +8 -1
- mlrun/common/schemas/project.py +69 -18
- mlrun/common/schemas/runs.py +7 -1
- mlrun/common/schemas/runtime_resource.py +8 -12
- mlrun/common/schemas/schedule.py +4 -4
- mlrun/common/schemas/tag.py +1 -2
- mlrun/common/schemas/workflow.py +12 -4
- mlrun/common/types.py +14 -1
- mlrun/config.py +154 -69
- mlrun/data_types/data_types.py +6 -1
- mlrun/data_types/spark.py +2 -2
- mlrun/data_types/to_pandas.py +67 -37
- mlrun/datastore/__init__.py +6 -8
- mlrun/datastore/alibaba_oss.py +131 -0
- mlrun/datastore/azure_blob.py +143 -42
- mlrun/datastore/base.py +102 -58
- mlrun/datastore/datastore.py +34 -13
- mlrun/datastore/datastore_profile.py +146 -20
- mlrun/datastore/dbfs_store.py +3 -7
- mlrun/datastore/filestore.py +1 -4
- mlrun/datastore/google_cloud_storage.py +97 -33
- mlrun/datastore/hdfs.py +56 -0
- mlrun/datastore/inmem.py +6 -3
- mlrun/datastore/redis.py +7 -2
- mlrun/datastore/s3.py +34 -12
- mlrun/datastore/snowflake_utils.py +45 -0
- mlrun/datastore/sources.py +303 -111
- mlrun/datastore/spark_utils.py +31 -2
- mlrun/datastore/store_resources.py +9 -7
- mlrun/datastore/storeytargets.py +151 -0
- mlrun/datastore/targets.py +453 -176
- mlrun/datastore/utils.py +72 -58
- mlrun/datastore/v3io.py +6 -1
- mlrun/db/base.py +274 -41
- mlrun/db/factory.py +1 -1
- mlrun/db/httpdb.py +893 -225
- mlrun/db/nopdb.py +291 -33
- mlrun/errors.py +36 -6
- mlrun/execution.py +115 -42
- mlrun/feature_store/__init__.py +0 -2
- mlrun/feature_store/api.py +65 -73
- mlrun/feature_store/common.py +7 -12
- mlrun/feature_store/feature_set.py +76 -55
- mlrun/feature_store/feature_vector.py +39 -31
- mlrun/feature_store/ingestion.py +7 -6
- mlrun/feature_store/retrieval/base.py +16 -11
- mlrun/feature_store/retrieval/dask_merger.py +2 -0
- mlrun/feature_store/retrieval/job.py +13 -4
- mlrun/feature_store/retrieval/local_merger.py +2 -0
- mlrun/feature_store/retrieval/spark_merger.py +24 -32
- mlrun/feature_store/steps.py +45 -34
- mlrun/features.py +11 -21
- mlrun/frameworks/_common/artifacts_library.py +9 -9
- mlrun/frameworks/_common/mlrun_interface.py +5 -5
- mlrun/frameworks/_common/model_handler.py +48 -48
- mlrun/frameworks/_common/plan.py +5 -6
- mlrun/frameworks/_common/producer.py +3 -4
- mlrun/frameworks/_common/utils.py +5 -5
- mlrun/frameworks/_dl_common/loggers/logger.py +6 -7
- mlrun/frameworks/_dl_common/loggers/mlrun_logger.py +9 -9
- mlrun/frameworks/_dl_common/loggers/tensorboard_logger.py +23 -47
- mlrun/frameworks/_ml_common/artifacts_library.py +1 -2
- mlrun/frameworks/_ml_common/loggers/logger.py +3 -4
- mlrun/frameworks/_ml_common/loggers/mlrun_logger.py +4 -5
- mlrun/frameworks/_ml_common/model_handler.py +24 -24
- mlrun/frameworks/_ml_common/pkl_model_server.py +2 -2
- mlrun/frameworks/_ml_common/plan.py +2 -2
- mlrun/frameworks/_ml_common/plans/calibration_curve_plan.py +2 -3
- mlrun/frameworks/_ml_common/plans/confusion_matrix_plan.py +2 -3
- mlrun/frameworks/_ml_common/plans/dataset_plan.py +3 -3
- mlrun/frameworks/_ml_common/plans/feature_importance_plan.py +3 -3
- mlrun/frameworks/_ml_common/plans/roc_curve_plan.py +4 -4
- mlrun/frameworks/_ml_common/utils.py +4 -4
- mlrun/frameworks/auto_mlrun/auto_mlrun.py +9 -9
- mlrun/frameworks/huggingface/model_server.py +4 -4
- mlrun/frameworks/lgbm/__init__.py +33 -33
- mlrun/frameworks/lgbm/callbacks/callback.py +2 -4
- mlrun/frameworks/lgbm/callbacks/logging_callback.py +4 -5
- mlrun/frameworks/lgbm/callbacks/mlrun_logging_callback.py +4 -5
- mlrun/frameworks/lgbm/mlrun_interfaces/booster_mlrun_interface.py +1 -3
- mlrun/frameworks/lgbm/mlrun_interfaces/mlrun_interface.py +6 -6
- mlrun/frameworks/lgbm/model_handler.py +10 -10
- mlrun/frameworks/lgbm/model_server.py +6 -6
- mlrun/frameworks/lgbm/utils.py +5 -5
- mlrun/frameworks/onnx/dataset.py +8 -8
- mlrun/frameworks/onnx/mlrun_interface.py +3 -3
- mlrun/frameworks/onnx/model_handler.py +6 -6
- mlrun/frameworks/onnx/model_server.py +7 -7
- mlrun/frameworks/parallel_coordinates.py +6 -6
- mlrun/frameworks/pytorch/__init__.py +18 -18
- mlrun/frameworks/pytorch/callbacks/callback.py +4 -5
- mlrun/frameworks/pytorch/callbacks/logging_callback.py +17 -17
- mlrun/frameworks/pytorch/callbacks/mlrun_logging_callback.py +11 -11
- mlrun/frameworks/pytorch/callbacks/tensorboard_logging_callback.py +23 -29
- mlrun/frameworks/pytorch/callbacks_handler.py +38 -38
- mlrun/frameworks/pytorch/mlrun_interface.py +20 -20
- mlrun/frameworks/pytorch/model_handler.py +17 -17
- mlrun/frameworks/pytorch/model_server.py +7 -7
- mlrun/frameworks/sklearn/__init__.py +13 -13
- mlrun/frameworks/sklearn/estimator.py +4 -4
- mlrun/frameworks/sklearn/metrics_library.py +14 -14
- mlrun/frameworks/sklearn/mlrun_interface.py +16 -9
- mlrun/frameworks/sklearn/model_handler.py +2 -2
- mlrun/frameworks/tf_keras/__init__.py +10 -7
- mlrun/frameworks/tf_keras/callbacks/logging_callback.py +15 -15
- mlrun/frameworks/tf_keras/callbacks/mlrun_logging_callback.py +11 -11
- mlrun/frameworks/tf_keras/callbacks/tensorboard_logging_callback.py +19 -23
- mlrun/frameworks/tf_keras/mlrun_interface.py +9 -11
- mlrun/frameworks/tf_keras/model_handler.py +14 -14
- mlrun/frameworks/tf_keras/model_server.py +6 -6
- mlrun/frameworks/xgboost/__init__.py +13 -13
- mlrun/frameworks/xgboost/model_handler.py +6 -6
- mlrun/k8s_utils.py +61 -17
- mlrun/launcher/__init__.py +1 -1
- mlrun/launcher/base.py +16 -15
- mlrun/launcher/client.py +13 -11
- mlrun/launcher/factory.py +1 -1
- mlrun/launcher/local.py +23 -13
- mlrun/launcher/remote.py +17 -10
- mlrun/lists.py +7 -6
- mlrun/model.py +478 -103
- mlrun/model_monitoring/__init__.py +1 -1
- mlrun/model_monitoring/api.py +163 -371
- mlrun/{runtimes/mpijob/v1alpha1.py → model_monitoring/applications/__init__.py} +9 -15
- mlrun/model_monitoring/applications/_application_steps.py +188 -0
- mlrun/model_monitoring/applications/base.py +108 -0
- mlrun/model_monitoring/applications/context.py +341 -0
- mlrun/model_monitoring/{evidently_application.py → applications/evidently_base.py} +27 -22
- mlrun/model_monitoring/applications/histogram_data_drift.py +354 -0
- mlrun/model_monitoring/applications/results.py +99 -0
- mlrun/model_monitoring/controller.py +131 -278
- mlrun/model_monitoring/db/__init__.py +18 -0
- mlrun/model_monitoring/db/stores/__init__.py +136 -0
- mlrun/model_monitoring/db/stores/base/__init__.py +15 -0
- mlrun/model_monitoring/db/stores/base/store.py +213 -0
- mlrun/model_monitoring/db/stores/sqldb/__init__.py +13 -0
- mlrun/model_monitoring/db/stores/sqldb/models/__init__.py +71 -0
- mlrun/model_monitoring/db/stores/sqldb/models/base.py +190 -0
- mlrun/model_monitoring/db/stores/sqldb/models/mysql.py +103 -0
- mlrun/model_monitoring/{stores/models/mysql.py → db/stores/sqldb/models/sqlite.py} +19 -13
- mlrun/model_monitoring/db/stores/sqldb/sql_store.py +659 -0
- mlrun/model_monitoring/db/stores/v3io_kv/__init__.py +13 -0
- mlrun/model_monitoring/db/stores/v3io_kv/kv_store.py +726 -0
- mlrun/model_monitoring/db/tsdb/__init__.py +105 -0
- mlrun/model_monitoring/db/tsdb/base.py +448 -0
- mlrun/model_monitoring/db/tsdb/helpers.py +30 -0
- mlrun/model_monitoring/db/tsdb/tdengine/__init__.py +15 -0
- mlrun/model_monitoring/db/tsdb/tdengine/schemas.py +279 -0
- mlrun/model_monitoring/db/tsdb/tdengine/stream_graph_steps.py +42 -0
- mlrun/model_monitoring/db/tsdb/tdengine/tdengine_connector.py +507 -0
- mlrun/model_monitoring/db/tsdb/v3io/__init__.py +15 -0
- mlrun/model_monitoring/db/tsdb/v3io/stream_graph_steps.py +158 -0
- mlrun/model_monitoring/db/tsdb/v3io/v3io_connector.py +849 -0
- mlrun/model_monitoring/features_drift_table.py +134 -106
- mlrun/model_monitoring/helpers.py +199 -55
- mlrun/model_monitoring/metrics/__init__.py +13 -0
- mlrun/model_monitoring/metrics/histogram_distance.py +127 -0
- mlrun/model_monitoring/model_endpoint.py +3 -2
- mlrun/model_monitoring/stream_processing.py +134 -398
- mlrun/model_monitoring/tracking_policy.py +9 -2
- mlrun/model_monitoring/writer.py +161 -125
- mlrun/package/__init__.py +6 -6
- mlrun/package/context_handler.py +5 -5
- mlrun/package/packager.py +7 -7
- mlrun/package/packagers/default_packager.py +8 -8
- mlrun/package/packagers/numpy_packagers.py +15 -15
- mlrun/package/packagers/pandas_packagers.py +5 -5
- mlrun/package/packagers/python_standard_library_packagers.py +10 -10
- mlrun/package/packagers_manager.py +19 -23
- mlrun/package/utils/_formatter.py +6 -6
- mlrun/package/utils/_pickler.py +2 -2
- mlrun/package/utils/_supported_format.py +4 -4
- mlrun/package/utils/log_hint_utils.py +2 -2
- mlrun/package/utils/type_hint_utils.py +4 -9
- mlrun/platforms/__init__.py +11 -10
- mlrun/platforms/iguazio.py +24 -203
- mlrun/projects/operations.py +52 -25
- mlrun/projects/pipelines.py +191 -197
- mlrun/projects/project.py +1227 -400
- mlrun/render.py +16 -19
- mlrun/run.py +209 -184
- mlrun/runtimes/__init__.py +83 -15
- mlrun/runtimes/base.py +51 -35
- mlrun/runtimes/daskjob.py +17 -10
- mlrun/runtimes/databricks_job/databricks_cancel_task.py +1 -1
- mlrun/runtimes/databricks_job/databricks_runtime.py +8 -7
- mlrun/runtimes/databricks_job/databricks_wrapper.py +1 -1
- mlrun/runtimes/funcdoc.py +1 -29
- mlrun/runtimes/function_reference.py +1 -1
- mlrun/runtimes/kubejob.py +34 -128
- mlrun/runtimes/local.py +40 -11
- mlrun/runtimes/mpijob/__init__.py +0 -20
- mlrun/runtimes/mpijob/abstract.py +9 -10
- mlrun/runtimes/mpijob/v1.py +1 -1
- mlrun/{model_monitoring/stores/models/sqlite.py → runtimes/nuclio/__init__.py} +7 -9
- mlrun/runtimes/nuclio/api_gateway.py +769 -0
- mlrun/runtimes/nuclio/application/__init__.py +15 -0
- mlrun/runtimes/nuclio/application/application.py +758 -0
- mlrun/runtimes/nuclio/application/reverse_proxy.go +95 -0
- mlrun/runtimes/{function.py → nuclio/function.py} +200 -83
- mlrun/runtimes/{nuclio.py → nuclio/nuclio.py} +6 -6
- mlrun/runtimes/{serving.py → nuclio/serving.py} +65 -68
- mlrun/runtimes/pod.py +281 -101
- mlrun/runtimes/remotesparkjob.py +12 -9
- mlrun/runtimes/sparkjob/spark3job.py +67 -51
- mlrun/runtimes/utils.py +41 -75
- mlrun/secrets.py +9 -5
- mlrun/serving/__init__.py +8 -1
- mlrun/serving/remote.py +2 -7
- mlrun/serving/routers.py +85 -69
- mlrun/serving/server.py +69 -44
- mlrun/serving/states.py +209 -36
- mlrun/serving/utils.py +22 -14
- mlrun/serving/v1_serving.py +6 -7
- mlrun/serving/v2_serving.py +133 -54
- mlrun/track/tracker.py +2 -1
- mlrun/track/tracker_manager.py +3 -3
- mlrun/track/trackers/mlflow_tracker.py +6 -2
- mlrun/utils/async_http.py +6 -8
- mlrun/utils/azure_vault.py +1 -1
- mlrun/utils/clones.py +1 -2
- mlrun/utils/condition_evaluator.py +3 -3
- mlrun/utils/db.py +21 -3
- mlrun/utils/helpers.py +405 -225
- mlrun/utils/http.py +3 -6
- mlrun/utils/logger.py +112 -16
- mlrun/utils/notifications/notification/__init__.py +17 -13
- mlrun/utils/notifications/notification/base.py +50 -2
- mlrun/utils/notifications/notification/console.py +2 -0
- mlrun/utils/notifications/notification/git.py +24 -1
- mlrun/utils/notifications/notification/ipython.py +3 -1
- mlrun/utils/notifications/notification/slack.py +96 -21
- mlrun/utils/notifications/notification/webhook.py +59 -2
- mlrun/utils/notifications/notification_pusher.py +149 -30
- mlrun/utils/regex.py +9 -0
- mlrun/utils/retryer.py +208 -0
- mlrun/utils/singleton.py +1 -1
- mlrun/utils/v3io_clients.py +4 -6
- mlrun/utils/version/version.json +2 -2
- mlrun/utils/version/version.py +2 -6
- mlrun-1.7.0.dist-info/METADATA +378 -0
- mlrun-1.7.0.dist-info/RECORD +351 -0
- {mlrun-1.6.4rc8.dist-info → mlrun-1.7.0.dist-info}/WHEEL +1 -1
- mlrun/feature_store/retrieval/conversion.py +0 -273
- mlrun/kfpops.py +0 -868
- mlrun/model_monitoring/application.py +0 -310
- mlrun/model_monitoring/batch.py +0 -1095
- mlrun/model_monitoring/prometheus.py +0 -219
- mlrun/model_monitoring/stores/__init__.py +0 -111
- mlrun/model_monitoring/stores/kv_model_endpoint_store.py +0 -576
- mlrun/model_monitoring/stores/model_endpoint_store.py +0 -147
- mlrun/model_monitoring/stores/models/__init__.py +0 -27
- mlrun/model_monitoring/stores/models/base.py +0 -84
- mlrun/model_monitoring/stores/sql_model_endpoint_store.py +0 -384
- mlrun/platforms/other.py +0 -306
- mlrun-1.6.4rc8.dist-info/METADATA +0 -272
- mlrun-1.6.4rc8.dist-info/RECORD +0 -314
- {mlrun-1.6.4rc8.dist-info → mlrun-1.7.0.dist-info}/LICENSE +0 -0
- {mlrun-1.6.4rc8.dist-info → mlrun-1.7.0.dist-info}/entry_points.txt +0 -0
- {mlrun-1.6.4rc8.dist-info → mlrun-1.7.0.dist-info}/top_level.txt +0 -0
mlrun/datastore/datastore_profile.py
CHANGED

@@ -16,6 +16,7 @@ import ast
 import base64
 import json
 import typing
+import warnings
 from urllib.parse import ParseResult, urlparse, urlunparse
 
 import pydantic
@@ -30,12 +31,13 @@ from ..secrets import get_secret_or_env
 class DatastoreProfile(pydantic.BaseModel):
     type: str
     name: str
-    _private_attributes:
+    _private_attributes: list = ()
 
     class Config:
         extra = pydantic.Extra.forbid
 
     @pydantic.validator("name")
+    @classmethod
     def lower_case(cls, v):
         return v.lower()
 
@@ -68,6 +70,9 @@ class TemporaryClientDatastoreProfiles(metaclass=mlrun.utils.singleton.Singleton
     def get(self, key):
         return self._data.get(key, None)
 
+    def remove(self, key):
+        self._data.pop(key, None)
+
 
 class DatastoreProfileBasic(DatastoreProfile):
     type: str = pydantic.Field("basic")
@@ -79,13 +84,37 @@ class DatastoreProfileBasic(DatastoreProfile):
 class DatastoreProfileKafkaTarget(DatastoreProfile):
     type: str = pydantic.Field("kafka_target")
     _private_attributes = "kwargs_private"
-    bootstrap_servers: str
+    bootstrap_servers: typing.Optional[str] = None
+    brokers: typing.Optional[str] = None
     topic: str
-    kwargs_public: typing.Optional[
-    kwargs_private: typing.Optional[
+    kwargs_public: typing.Optional[dict]
+    kwargs_private: typing.Optional[dict]
+
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+
+        if not self.brokers and not self.bootstrap_servers:
+            raise mlrun.errors.MLRunInvalidArgumentError(
+                "DatastoreProfileKafkaTarget requires the 'brokers' field to be set"
+            )
+
+        if self.bootstrap_servers:
+            if self.brokers:
+                raise mlrun.errors.MLRunInvalidArgumentError(
+                    "DatastoreProfileKafkaTarget cannot be created with both 'brokers' and 'bootstrap_servers'"
+                )
+            else:
+                self.brokers = self.bootstrap_servers
+                self.bootstrap_servers = None
+                warnings.warn(
+                    "'bootstrap_servers' parameter is deprecated in 1.7.0 and will be removed in 1.9.0, "
+                    "use 'brokers' instead.",
+                    # TODO: Remove this in 1.9.0
+                    FutureWarning,
+                )
 
     def attributes(self):
-        attributes = {"
+        attributes = {"brokers": self.brokers or self.bootstrap_servers}
         if self.kwargs_public:
             attributes = merge(attributes, self.kwargs_public)
         if self.kwargs_private:
@@ -96,15 +125,15 @@ class DatastoreProfileKafkaTarget(DatastoreProfile):
 class DatastoreProfileKafkaSource(DatastoreProfile):
     type: str = pydantic.Field("kafka_source")
     _private_attributes = ("kwargs_private", "sasl_user", "sasl_pass")
-    brokers: typing.Union[str,
-    topics: typing.Union[str,
+    brokers: typing.Union[str, list[str]]
+    topics: typing.Union[str, list[str]]
     group: typing.Optional[str] = "serving"
     initial_offset: typing.Optional[str] = "earliest"
-    partitions: typing.Optional[typing.Union[str,
+    partitions: typing.Optional[typing.Union[str, list[str]]]
     sasl_user: typing.Optional[str]
     sasl_pass: typing.Optional[str]
-    kwargs_public: typing.Optional[
-    kwargs_private: typing.Optional[
+    kwargs_public: typing.Optional[dict]
+    kwargs_private: typing.Optional[dict]
 
     def attributes(self):
         attributes = {}
@@ -132,6 +161,22 @@ class DatastoreProfileKafkaSource(DatastoreProfile):
         return attributes
 
 
+class DatastoreProfileV3io(DatastoreProfile):
+    type: str = pydantic.Field("v3io")
+    v3io_access_key: typing.Optional[str] = None
+    _private_attributes = "v3io_access_key"
+
+    def url(self, subpath):
+        subpath = subpath.lstrip("/")
+        return f"v3io:///{subpath}"
+
+    def secrets(self) -> dict:
+        res = {}
+        if self.v3io_access_key:
+            res["V3IO_ACCESS_KEY"] = self.v3io_access_key
+        return res
+
+
 class DatastoreProfileS3(DatastoreProfile):
     type: str = pydantic.Field("s3")
     _private_attributes = ("access_key_id", "secret_key")
@@ -141,6 +186,18 @@ class DatastoreProfileS3(DatastoreProfile):
     assume_role_arn: typing.Optional[str] = None
     access_key_id: typing.Optional[str] = None
     secret_key: typing.Optional[str] = None
+    bucket: typing.Optional[str] = None
+
+    @pydantic.validator("bucket")
+    @classmethod
+    def check_bucket(cls, v):
+        if not v:
+            warnings.warn(
+                "The 'bucket' attribute will be mandatory starting from version 1.9",
+                FutureWarning,
+                stacklevel=2,
+            )
+        return v
 
     def secrets(self) -> dict:
         res = {}
@@ -156,10 +213,16 @@ class DatastoreProfileS3(DatastoreProfile):
             res["AWS_PROFILE"] = self.profile_name
         if self.assume_role_arn:
             res["MLRUN_AWS_ROLE_ARN"] = self.assume_role_arn
-        return res
+        return res
 
     def url(self, subpath):
-
+        # TODO: There is an inconsistency with DatastoreProfileGCS. In DatastoreProfileGCS,
+        # we assume that the subpath can begin without a '/' character,
+        # while here we assume it always starts with one.
+        if self.bucket:
+            return f"s3://{self.bucket}{subpath}"
+        else:
+            return f"s3:/{subpath}"
 
 
 class DatastoreProfileRedis(DatastoreProfile):
@@ -199,7 +262,7 @@ class DatastoreProfileRedis(DatastoreProfile):
             res["REDIS_USER"] = self.username
         if self.password:
             res["REDIS_PASSWORD"] = self.password
-        return res
+        return res
 
     def url(self, subpath):
         return self.endpoint_url + subpath
@@ -220,26 +283,44 @@ class DatastoreProfileDBFS(DatastoreProfile):
             res["DATABRICKS_TOKEN"] = self.token
         if self.endpoint_url:
             res["DATABRICKS_HOST"] = self.endpoint_url
-        return res
+        return res
 
 
 class DatastoreProfileGCS(DatastoreProfile):
     type: str = pydantic.Field("gcs")
     _private_attributes = ("gcp_credentials",)
     credentials_path: typing.Optional[str] = None  # path to file.
-    gcp_credentials: typing.Optional[typing.Union[str,
+    gcp_credentials: typing.Optional[typing.Union[str, dict]] = None
+    bucket: typing.Optional[str] = None
+
+    @pydantic.validator("bucket")
+    @classmethod
+    def check_bucket(cls, v):
+        if not v:
+            warnings.warn(
+                "The 'bucket' attribute will be mandatory starting from version 1.9",
+                FutureWarning,
+                stacklevel=2,
+            )
+        return v
 
     @pydantic.validator("gcp_credentials", pre=True, always=True)
+    @classmethod
     def convert_dict_to_json(cls, v):
         if isinstance(v, dict):
            return json.dumps(v)
         return v
 
     def url(self, subpath) -> str:
+        # TODO: but there's something wrong with the subpath being assumed to not start with a slash here,
+        # but the opposite assumption is made in S3.
         if subpath.startswith("/"):
             # in gcs the path after schema is starts with bucket, wherefore it should not start with "/".
             subpath = subpath[1:]
-
+        if self.bucket:
+            return f"gcs://{self.bucket}/{subpath}"
+        else:
+            return f"gcs://{subpath}"
 
     def secrets(self) -> dict:
         res = {}
@@ -247,7 +328,7 @@ class DatastoreProfileGCS(DatastoreProfile):
             res["GOOGLE_APPLICATION_CREDENTIALS"] = self.credentials_path
         if self.gcp_credentials:
             res["GCP_CREDENTIALS"] = self.gcp_credentials
-        return res
+        return res
 
 
 class DatastoreProfileAzureBlob(DatastoreProfile):
@@ -267,12 +348,27 @@ class DatastoreProfileAzureBlob(DatastoreProfile):
     client_secret: typing.Optional[str] = None
     sas_token: typing.Optional[str] = None
     credential: typing.Optional[str] = None
+    container: typing.Optional[str] = None
+
+    @pydantic.validator("container")
+    @classmethod
+    def check_container(cls, v):
+        if not v:
+            warnings.warn(
+                "The 'container' attribute will be mandatory starting from version 1.9",
+                FutureWarning,
+                stacklevel=2,
+            )
+        return v
 
     def url(self, subpath) -> str:
         if subpath.startswith("/"):
-            # in azure the path after schema is starts with
+            # in azure the path after schema is starts with container, wherefore it should not start with "/".
             subpath = subpath[1:]
-
+        if self.container:
+            return f"az://{self.container}/{subpath}"
+        else:
+            return f"az://{subpath}"
 
     def secrets(self) -> dict:
         res = {}
@@ -292,7 +388,31 @@ class DatastoreProfileAzureBlob(DatastoreProfile):
             res["sas_token"] = self.sas_token
         if self.credential:
             res["credential"] = self.credential
-        return res
+        return res
+
+
+class DatastoreProfileHdfs(DatastoreProfile):
+    type: str = pydantic.Field("hdfs")
+    _private_attributes = "token"
+    host: typing.Optional[str] = None
+    port: typing.Optional[int] = None
+    http_port: typing.Optional[int] = None
+    user: typing.Optional[str] = None
+
+    def secrets(self) -> dict:
+        res = {}
+        if self.host:
+            res["HDFS_HOST"] = self.host
+        if self.port:
+            res["HDFS_PORT"] = self.port
+        if self.port:
+            res["HDFS_HTTP_PORT"] = self.http_port
+        if self.user:
+            res["HDFS_USER"] = self.user
+        return res or None
+
+    def url(self, subpath):
+        return f"webhdfs://{self.host}:{self.http_port}{subpath}"
 
 
 class DatastoreProfile2Json(pydantic.BaseModel):
@@ -346,6 +466,7 @@ class DatastoreProfile2Json(pydantic.BaseModel):
         decoded_dict = {k: safe_literal_eval(v) for k, v in decoded_dict.items()}
         datastore_type = decoded_dict.get("type")
         ds_profile_factory = {
+            "v3io": DatastoreProfileV3io,
             "s3": DatastoreProfileS3,
             "redis": DatastoreProfileRedis,
             "basic": DatastoreProfileBasic,
@@ -354,6 +475,7 @@ class DatastoreProfile2Json(pydantic.BaseModel):
             "dbfs": DatastoreProfileDBFS,
             "gcs": DatastoreProfileGCS,
             "az": DatastoreProfileAzureBlob,
+            "hdfs": DatastoreProfileHdfs,
         }
        if datastore_type in ds_profile_factory:
             return ds_profile_factory[datastore_type].parse_obj(decoded_dict)
@@ -418,3 +540,7 @@ def register_temporary_client_datastore_profile(profile: DatastoreProfile):
     It's beneficial for testing purposes.
     """
     TemporaryClientDatastoreProfiles().add(profile)
+
+
+def remove_temporary_client_datastore_profile(profile_name: str):
+    TemporaryClientDatastoreProfiles().remove(profile_name)
mlrun/datastore/dbfs_store.py
CHANGED

@@ -19,7 +19,7 @@ from fsspec.registry import get_filesystem_class
 
 import mlrun.errors
 
-from .base import DataStore, FileStats,
+from .base import DataStore, FileStats, make_datastore_schema_sanitizer
 
 
 class DatabricksFileBugFixed(DatabricksFile):
@@ -89,7 +89,7 @@ class DBFSStore(DataStore):
         """return fsspec file system object, if supported"""
         filesystem_class = get_filesystem_class(protocol=self.kind)
         if not self._filesystem:
-            self._filesystem =
+            self._filesystem = make_datastore_schema_sanitizer(
                 cls=filesystem_class,
                 using_bucket=False,
                 **self.get_storage_options(),
@@ -130,11 +130,7 @@ class DBFSStore(DataStore):
                 "Append mode not supported for Databricks file system"
             )
         # can not use append mode because it overrides data.
-        mode =
-        if isinstance(data, bytes):
-            mode += "b"
-        elif not isinstance(data, str):
-            raise TypeError(f"Unknown data type {type(data)}")
+        data, mode = self._prepare_put_data(data, append)
         with self.filesystem.open(key, mode) as f:
             f.write(data)
 
mlrun/datastore/filestore.py
CHANGED

@@ -66,9 +66,7 @@ class FileStore(DataStore):
         dir_to_create = path.dirname(self._join(key))
         if dir_to_create:
             self._ensure_directory(dir_to_create)
-        mode =
-        if isinstance(data, bytes):
-            mode = mode + "b"
+        data, mode = self._prepare_put_data(data, append)
         with open(self._join(key), mode) as fp:
             fp.write(data)
             fp.close()
@@ -105,4 +103,3 @@
                 return
             except FileExistsError:
                 time.sleep(0.1)
-                pass
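
Both DBFSStore.put and FileStore.put (and, further down, the GCS and Redis stores) now delegate their write-mode handling to a shared DataStore._prepare_put_data helper whose body lives in mlrun/datastore/base.py and is not shown in this diff. The sketch below is illustrative only, inferred from the per-store logic it replaces, and is not the actual mlrun implementation.

# Illustrative only: the real helper is defined on the DataStore base class in
# mlrun/datastore/base.py, whose body is not part of this diff.
class _DataStoreSketch:
    def _prepare_put_data(self, data, append=False):
        mode = "a" if append else "w"
        if isinstance(data, bytes):
            return data, mode + "b"  # binary payloads get a "b" mode suffix
        if isinstance(data, str):
            return data, mode
        raise TypeError(f"Unknown data type {type(data)}")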
mlrun/datastore/google_cloud_storage.py
CHANGED

@@ -12,51 +12,93 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import json
+import os
 from pathlib import Path
 
 from fsspec.registry import get_filesystem_class
+from google.auth.credentials import Credentials
+from google.cloud.storage import Client, transfer_manager
+from google.oauth2 import service_account
 
 import mlrun.errors
 from mlrun.utils import logger
 
-from .base import DataStore, FileStats,
+from .base import DataStore, FileStats, make_datastore_schema_sanitizer
 
 # Google storage objects will be represented with the following URL: gcs://<bucket name>/<path> or gs://...
 
 
 class GoogleCloudStorageStore(DataStore):
     using_bucket = True
+    workers = 8
+    chunk_size = 32 * 1024 * 1024
 
     def __init__(self, parent, schema, name, endpoint="", secrets: dict = None):
         super().__init__(parent, name, schema, endpoint, secrets=secrets)
+        self._storage_client = None
+        self._storage_options = None
+
+    @property
+    def storage_client(self):
+        if self._storage_client:
+            return self._storage_client
+
+        token = self._get_credentials().get("token")
+        access = "https://www.googleapis.com/auth/devstorage.full_control"
+        if isinstance(token, str):
+            if os.path.exists(token):
+                credentials = service_account.Credentials.from_service_account_file(
+                    token, scopes=[access]
+                )
+            else:
+                raise mlrun.errors.MLRunInvalidArgumentError(
+                    "gcsfs authentication file not found!"
+                )
+        elif isinstance(token, dict):
+            credentials = service_account.Credentials.from_service_account_info(
+                token, scopes=[access]
+            )
+        elif isinstance(token, Credentials):
+            credentials = token
+        else:
+            raise ValueError(f"Unsupported token type: {type(token)}")
+        self._storage_client = Client(credentials=credentials)
+        return self._storage_client
 
     @property
     def filesystem(self):
         """return fsspec file system object, if supported"""
-        if self._filesystem:
-
-
-
-
-
-
-        ) from exc
-        filesystem_class = get_filesystem_class(protocol=self.kind)
-        self._filesystem = makeDatastoreSchemaSanitizer(
-            filesystem_class,
-            using_bucket=self.using_bucket,
-            **self.get_storage_options(),
-        )
+        if not self._filesystem:
+            filesystem_class = get_filesystem_class(protocol=self.kind)
+            self._filesystem = make_datastore_schema_sanitizer(
+                filesystem_class,
+                using_bucket=self.using_bucket,
+                **self.storage_options,
+            )
         return self._filesystem
 
-
+    @property
+    def storage_options(self):
+        if self._storage_options:
+            return self._storage_options
+        credentials = self._get_credentials()
+        # due to caching problem introduced in gcsfs 2024.3.1 (ML-7636)
+        credentials["use_listings_cache"] = False
+        self._storage_options = credentials
+        return self._storage_options
+
+    def _get_credentials(self):
         credentials = self._get_secret_or_env(
             "GCP_CREDENTIALS"
         ) or self._get_secret_or_env("GOOGLE_APPLICATION_CREDENTIALS")
         if credentials:
             try:
-                # Try to handle credentials as a json connection string
-                token =
+                # Try to handle credentials as a json connection string or do nothing if already a dict
+                token = (
+                    credentials
+                    if isinstance(credentials, dict)
+                    else json.loads(credentials)
+                )
             except json.JSONDecodeError:
                 # If it's not json, handle it as a filename
                 token = credentials
@@ -67,6 +109,9 @@ class GoogleCloudStorageStore(DataStore):
             )
         return self._sanitize_storage_options(None)
 
+    def get_storage_options(self):
+        return self.storage_options
+
     def _make_path(self, key):
         key = key.strip("/")
         path = Path(self.endpoint, key).as_posix()
@@ -86,21 +131,34 @@
             raise mlrun.errors.MLRunInvalidArgumentError(
                 "Append mode not supported for Google cloud storage datastore"
             )
-
-        if isinstance(data, bytes):
-            mode = "wb"
-        elif isinstance(data, str):
-            mode = "w"
-        else:
-            raise TypeError(
-                "Data type unknown. Unable to put in Google cloud storage!"
-            )
+        data, mode = self._prepare_put_data(data, append)
         with self.filesystem.open(path, mode) as f:
             f.write(data)
 
     def upload(self, key, src_path):
-
-        self.
+        file_size = os.path.getsize(src_path)
+        united_path = self._make_path(key)
+
+        # Multiple upload limitation recommendations as described in
+        # https://cloud.google.com/storage/docs/multipart-uploads#storage-upload-object-chunks-python
+
+        if file_size <= self.chunk_size:
+            self.filesystem.put_file(src_path, united_path, overwrite=True)
+            return
+
+        bucket = self.storage_client.bucket(self.endpoint)
+        blob = bucket.blob(key.strip("/"))
+
+        try:
+            transfer_manager.upload_chunks_concurrently(
+                src_path, blob, chunk_size=self.chunk_size, max_workers=self.workers
+            )
+        except Exception as upload_chunks_concurrently_exception:
+            logger.warning(
+                f"gcs: failed to concurrently upload {src_path},"
+                f" exception: {upload_chunks_concurrently_exception}. Retrying with single part upload."
+            )
+            self.filesystem.put_file(src_path, united_path, overwrite=True)
 
     def stat(self, key):
         path = self._make_path(key)
@@ -129,16 +187,18 @@
 
     def rm(self, path, recursive=False, maxdepth=None):
         path = self._make_path(path)
-
+        # in order to raise an error in case of a connection error (ML-7056)
+        self.filesystem.exists(path)
+        super().rm(path, recursive=recursive, maxdepth=maxdepth)
 
     def get_spark_options(self):
-        res =
-        st = self.
+        res = {}
+        st = self._get_credentials()
         if "token" in st:
             res = {"spark.hadoop.google.cloud.auth.service.account.enable": "true"}
             if isinstance(st["token"], str):
                 # Token is a filename, read json from it
-                with open(st["token"]
+                with open(st["token"]) as file:
                     credentials = json.load(file)
             else:
                 # Token is a dictionary, use it directly
@@ -161,3 +221,7 @@
         if "client_id" in credentials:
             res["spark.hadoop.fs.gs.client.id"] = credentials["client_id"]
         return res
+
+    @property
+    def spark_url(self):
+        return f"gs://{self.endpoint}"
mlrun/datastore/hdfs.py
ADDED

@@ -0,0 +1,56 @@
+# Copyright 2024 Iguazio
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import os
+from urllib.parse import urlparse
+
+import fsspec
+
+from mlrun.datastore.base import DataStore
+
+
+class HdfsStore(DataStore):
+    def __init__(self, parent, schema, name, endpoint="", secrets: dict = None):
+        super().__init__(parent, name, schema, endpoint, secrets)
+
+        self.host = self._get_secret_or_env("HDFS_HOST")
+        self.port = self._get_secret_or_env("HDFS_PORT")
+        self.http_port = self._get_secret_or_env("HDFS_HTTP_PORT")
+        self.user = self._get_secret_or_env("HDFS_USER")
+        if not self.user:
+            self.user = os.environ.get("HADOOP_USER_NAME", os.environ.get("USER"))
+
+        self._filesystem = None
+
+    @property
+    def filesystem(self):
+        if not self._filesystem:
+            self._filesystem = fsspec.filesystem(
+                "webhdfs",
+                host=self.host,
+                port=self.http_port,
+                user=self.user,
+            )
+        return self._filesystem
+
+    @property
+    def url(self):
+        return f"webhdfs://{self.host}:{self.http_port}"
+
+    @property
+    def spark_url(self):
+        return f"hdfs://{self.host}:{self.port}"
+
+    def rm(self, url, recursive=False, maxdepth=None):
+        path = urlparse(url).path
+        self.filesystem.rm(path=path, recursive=recursive, maxdepth=maxdepth)
mlrun/datastore/inmem.py
CHANGED

@@ -72,7 +72,7 @@ class InMemoryStore(DataStore):
             if columns:
                 kwargs["usecols"] = columns
             reader = df_module.read_csv
-        elif
+        elif mlrun.utils.helpers.is_parquet_file(url, format):
             if columns:
                 kwargs["columns"] = columns
             reader = df_module.read_parquet
@@ -80,8 +80,11 @@
             reader = df_module.read_json
         else:
             raise mlrun.errors.MLRunInvalidArgumentError(f"file type unhandled {url}")
-        # InMemoryStore store
-        for field in ["time_column", "start_time", "end_time"]:
+        # InMemoryStore store – don't pass filters
+        for field in ["time_column", "start_time", "end_time", "additional_filters"]:
             kwargs.pop(field, None)
 
         return reader(item, **kwargs)
+
+    def rm(self, path, recursive=False, maxdepth=None):
+        self._items.pop(path, None)
mlrun/datastore/redis.py
CHANGED

@@ -31,7 +31,7 @@ class RedisStore(DataStore):
     """
 
     def __init__(self, parent, schema, name, endpoint="", secrets: dict = None):
-
+        redis_default_port = "6379"
         super().__init__(parent, name, schema, endpoint, secrets=secrets)
         self.headers = None
 
@@ -49,7 +49,7 @@
         user = self._get_secret_or_env("REDIS_USER", "", credentials_prefix)
         password = self._get_secret_or_env("REDIS_PASSWORD", "", credentials_prefix)
         host = parsed_endpoint.hostname
-        port = parsed_endpoint.port if parsed_endpoint.port else
+        port = parsed_endpoint.port if parsed_endpoint.port else redis_default_port
         schema = parsed_endpoint.scheme
         if user or password:
             endpoint = f"{schema}://{user}:{password}@{host}:{port}"
@@ -126,6 +126,7 @@
 
     def put(self, key, data, append=False):
         key = RedisStore.build_redis_key(key)
+        data, _ = self._prepare_put_data(data, append)
         if append:
             self.redis.append(key, data)
         else:
@@ -163,3 +164,7 @@
                 self.redis.delete(k)
         else:
             self.redis.delete(key)
+
+    @property
+    def spark_url(self):
+        return ""