mlrun 1.7.0rc5__py3-none-any.whl → 1.7.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mlrun/__init__.py +11 -1
- mlrun/__main__.py +39 -121
- mlrun/{datastore/helpers.py → alerts/__init__.py} +2 -5
- mlrun/alerts/alert.py +248 -0
- mlrun/api/schemas/__init__.py +4 -3
- mlrun/artifacts/__init__.py +8 -3
- mlrun/artifacts/base.py +39 -254
- mlrun/artifacts/dataset.py +9 -190
- mlrun/artifacts/manager.py +73 -46
- mlrun/artifacts/model.py +30 -158
- mlrun/artifacts/plots.py +23 -380
- mlrun/common/constants.py +73 -2
- mlrun/common/db/sql_session.py +3 -2
- mlrun/common/formatters/__init__.py +21 -0
- mlrun/common/formatters/artifact.py +46 -0
- mlrun/common/formatters/base.py +113 -0
- mlrun/common/formatters/feature_set.py +44 -0
- mlrun/common/formatters/function.py +46 -0
- mlrun/common/formatters/pipeline.py +53 -0
- mlrun/common/formatters/project.py +51 -0
- mlrun/common/formatters/run.py +29 -0
- mlrun/common/helpers.py +11 -1
- mlrun/{runtimes → common/runtimes}/constants.py +32 -4
- mlrun/common/schemas/__init__.py +21 -4
- mlrun/common/schemas/alert.py +202 -0
- mlrun/common/schemas/api_gateway.py +113 -2
- mlrun/common/schemas/artifact.py +28 -1
- mlrun/common/schemas/auth.py +11 -0
- mlrun/common/schemas/client_spec.py +2 -1
- mlrun/common/schemas/common.py +7 -4
- mlrun/common/schemas/constants.py +3 -0
- mlrun/common/schemas/feature_store.py +58 -28
- mlrun/common/schemas/frontend_spec.py +8 -0
- mlrun/common/schemas/function.py +11 -0
- mlrun/common/schemas/hub.py +7 -9
- mlrun/common/schemas/model_monitoring/__init__.py +21 -4
- mlrun/common/schemas/model_monitoring/constants.py +136 -42
- mlrun/common/schemas/model_monitoring/grafana.py +9 -5
- mlrun/common/schemas/model_monitoring/model_endpoints.py +89 -41
- mlrun/common/schemas/notification.py +69 -12
- mlrun/{runtimes/mpijob/v1alpha1.py → common/schemas/pagination.py} +10 -13
- mlrun/common/schemas/pipeline.py +7 -0
- mlrun/common/schemas/project.py +67 -16
- mlrun/common/schemas/runs.py +17 -0
- mlrun/common/schemas/schedule.py +1 -1
- mlrun/common/schemas/workflow.py +10 -2
- mlrun/common/types.py +14 -1
- mlrun/config.py +224 -58
- mlrun/data_types/data_types.py +11 -1
- mlrun/data_types/spark.py +5 -4
- mlrun/data_types/to_pandas.py +75 -34
- mlrun/datastore/__init__.py +8 -10
- mlrun/datastore/alibaba_oss.py +131 -0
- mlrun/datastore/azure_blob.py +131 -43
- mlrun/datastore/base.py +107 -47
- mlrun/datastore/datastore.py +17 -7
- mlrun/datastore/datastore_profile.py +91 -7
- mlrun/datastore/dbfs_store.py +3 -7
- mlrun/datastore/filestore.py +1 -3
- mlrun/datastore/google_cloud_storage.py +92 -32
- mlrun/datastore/hdfs.py +5 -0
- mlrun/datastore/inmem.py +6 -3
- mlrun/datastore/redis.py +3 -2
- mlrun/datastore/s3.py +30 -12
- mlrun/datastore/snowflake_utils.py +45 -0
- mlrun/datastore/sources.py +274 -59
- mlrun/datastore/spark_utils.py +30 -0
- mlrun/datastore/store_resources.py +9 -7
- mlrun/datastore/storeytargets.py +151 -0
- mlrun/datastore/targets.py +374 -102
- mlrun/datastore/utils.py +68 -5
- mlrun/datastore/v3io.py +28 -50
- mlrun/db/auth_utils.py +152 -0
- mlrun/db/base.py +231 -22
- mlrun/db/factory.py +1 -4
- mlrun/db/httpdb.py +864 -228
- mlrun/db/nopdb.py +268 -16
- mlrun/errors.py +35 -5
- mlrun/execution.py +111 -38
- mlrun/feature_store/__init__.py +0 -2
- mlrun/feature_store/api.py +46 -53
- mlrun/feature_store/common.py +6 -11
- mlrun/feature_store/feature_set.py +48 -23
- mlrun/feature_store/feature_vector.py +13 -2
- mlrun/feature_store/ingestion.py +7 -6
- mlrun/feature_store/retrieval/base.py +9 -4
- mlrun/feature_store/retrieval/dask_merger.py +2 -0
- mlrun/feature_store/retrieval/job.py +13 -4
- mlrun/feature_store/retrieval/local_merger.py +2 -0
- mlrun/feature_store/retrieval/spark_merger.py +24 -32
- mlrun/feature_store/steps.py +38 -19
- mlrun/features.py +6 -14
- mlrun/frameworks/_common/plan.py +3 -3
- mlrun/frameworks/_dl_common/loggers/tensorboard_logger.py +7 -12
- mlrun/frameworks/_ml_common/plan.py +1 -1
- mlrun/frameworks/auto_mlrun/auto_mlrun.py +2 -2
- mlrun/frameworks/lgbm/__init__.py +1 -1
- mlrun/frameworks/lgbm/callbacks/callback.py +2 -4
- mlrun/frameworks/lgbm/model_handler.py +1 -1
- mlrun/frameworks/parallel_coordinates.py +4 -4
- mlrun/frameworks/pytorch/__init__.py +2 -2
- mlrun/frameworks/sklearn/__init__.py +1 -1
- mlrun/frameworks/sklearn/mlrun_interface.py +13 -3
- mlrun/frameworks/tf_keras/__init__.py +5 -2
- mlrun/frameworks/tf_keras/callbacks/logging_callback.py +1 -1
- mlrun/frameworks/tf_keras/mlrun_interface.py +2 -2
- mlrun/frameworks/xgboost/__init__.py +1 -1
- mlrun/k8s_utils.py +57 -12
- mlrun/launcher/__init__.py +1 -1
- mlrun/launcher/base.py +6 -5
- mlrun/launcher/client.py +13 -11
- mlrun/launcher/factory.py +1 -1
- mlrun/launcher/local.py +15 -5
- mlrun/launcher/remote.py +10 -3
- mlrun/lists.py +6 -2
- mlrun/model.py +297 -48
- mlrun/model_monitoring/__init__.py +1 -1
- mlrun/model_monitoring/api.py +152 -357
- mlrun/model_monitoring/applications/__init__.py +10 -0
- mlrun/model_monitoring/applications/_application_steps.py +190 -0
- mlrun/model_monitoring/applications/base.py +108 -0
- mlrun/model_monitoring/applications/context.py +341 -0
- mlrun/model_monitoring/{evidently_application.py → applications/evidently_base.py} +27 -22
- mlrun/model_monitoring/applications/histogram_data_drift.py +227 -91
- mlrun/model_monitoring/applications/results.py +99 -0
- mlrun/model_monitoring/controller.py +130 -303
- mlrun/model_monitoring/{stores/models/sqlite.py → db/__init__.py} +5 -10
- mlrun/model_monitoring/db/stores/__init__.py +136 -0
- mlrun/model_monitoring/db/stores/base/__init__.py +15 -0
- mlrun/model_monitoring/db/stores/base/store.py +213 -0
- mlrun/model_monitoring/db/stores/sqldb/__init__.py +13 -0
- mlrun/model_monitoring/db/stores/sqldb/models/__init__.py +71 -0
- mlrun/model_monitoring/db/stores/sqldb/models/base.py +190 -0
- mlrun/model_monitoring/db/stores/sqldb/models/mysql.py +103 -0
- mlrun/model_monitoring/{stores/models/mysql.py → db/stores/sqldb/models/sqlite.py} +19 -13
- mlrun/model_monitoring/db/stores/sqldb/sql_store.py +659 -0
- mlrun/model_monitoring/db/stores/v3io_kv/__init__.py +13 -0
- mlrun/model_monitoring/db/stores/v3io_kv/kv_store.py +726 -0
- mlrun/model_monitoring/db/tsdb/__init__.py +105 -0
- mlrun/model_monitoring/db/tsdb/base.py +448 -0
- mlrun/model_monitoring/db/tsdb/helpers.py +30 -0
- mlrun/model_monitoring/db/tsdb/tdengine/__init__.py +15 -0
- mlrun/model_monitoring/db/tsdb/tdengine/schemas.py +298 -0
- mlrun/model_monitoring/db/tsdb/tdengine/stream_graph_steps.py +42 -0
- mlrun/model_monitoring/db/tsdb/tdengine/tdengine_connector.py +522 -0
- mlrun/model_monitoring/db/tsdb/v3io/__init__.py +15 -0
- mlrun/model_monitoring/db/tsdb/v3io/stream_graph_steps.py +158 -0
- mlrun/model_monitoring/db/tsdb/v3io/v3io_connector.py +849 -0
- mlrun/model_monitoring/features_drift_table.py +34 -22
- mlrun/model_monitoring/helpers.py +177 -39
- mlrun/model_monitoring/model_endpoint.py +3 -2
- mlrun/model_monitoring/stream_processing.py +165 -398
- mlrun/model_monitoring/tracking_policy.py +7 -1
- mlrun/model_monitoring/writer.py +161 -125
- mlrun/package/packagers/default_packager.py +2 -2
- mlrun/package/packagers_manager.py +1 -0
- mlrun/package/utils/_formatter.py +2 -2
- mlrun/platforms/__init__.py +11 -10
- mlrun/platforms/iguazio.py +67 -228
- mlrun/projects/__init__.py +6 -1
- mlrun/projects/operations.py +47 -20
- mlrun/projects/pipelines.py +396 -249
- mlrun/projects/project.py +1125 -414
- mlrun/render.py +28 -22
- mlrun/run.py +207 -180
- mlrun/runtimes/__init__.py +76 -11
- mlrun/runtimes/base.py +40 -14
- mlrun/runtimes/daskjob.py +9 -2
- mlrun/runtimes/databricks_job/databricks_runtime.py +1 -0
- mlrun/runtimes/databricks_job/databricks_wrapper.py +1 -1
- mlrun/runtimes/funcdoc.py +1 -29
- mlrun/runtimes/kubejob.py +34 -128
- mlrun/runtimes/local.py +39 -10
- mlrun/runtimes/mpijob/__init__.py +0 -20
- mlrun/runtimes/mpijob/abstract.py +8 -8
- mlrun/runtimes/mpijob/v1.py +1 -1
- mlrun/runtimes/nuclio/api_gateway.py +646 -177
- mlrun/runtimes/nuclio/application/__init__.py +15 -0
- mlrun/runtimes/nuclio/application/application.py +758 -0
- mlrun/runtimes/nuclio/application/reverse_proxy.go +95 -0
- mlrun/runtimes/nuclio/function.py +188 -68
- mlrun/runtimes/nuclio/serving.py +57 -60
- mlrun/runtimes/pod.py +191 -58
- mlrun/runtimes/remotesparkjob.py +11 -8
- mlrun/runtimes/sparkjob/spark3job.py +17 -18
- mlrun/runtimes/utils.py +40 -73
- mlrun/secrets.py +6 -2
- mlrun/serving/__init__.py +8 -1
- mlrun/serving/remote.py +2 -3
- mlrun/serving/routers.py +89 -64
- mlrun/serving/server.py +54 -26
- mlrun/serving/states.py +187 -56
- mlrun/serving/utils.py +19 -11
- mlrun/serving/v2_serving.py +136 -63
- mlrun/track/tracker.py +2 -1
- mlrun/track/trackers/mlflow_tracker.py +5 -0
- mlrun/utils/async_http.py +26 -6
- mlrun/utils/db.py +18 -0
- mlrun/utils/helpers.py +375 -105
- mlrun/utils/http.py +2 -2
- mlrun/utils/logger.py +75 -9
- mlrun/utils/notifications/notification/__init__.py +14 -10
- mlrun/utils/notifications/notification/base.py +48 -0
- mlrun/utils/notifications/notification/console.py +2 -0
- mlrun/utils/notifications/notification/git.py +24 -1
- mlrun/utils/notifications/notification/ipython.py +2 -0
- mlrun/utils/notifications/notification/slack.py +96 -21
- mlrun/utils/notifications/notification/webhook.py +63 -2
- mlrun/utils/notifications/notification_pusher.py +146 -16
- mlrun/utils/regex.py +9 -0
- mlrun/utils/retryer.py +3 -2
- mlrun/utils/v3io_clients.py +2 -3
- mlrun/utils/version/version.json +2 -2
- mlrun-1.7.2.dist-info/METADATA +390 -0
- mlrun-1.7.2.dist-info/RECORD +351 -0
- {mlrun-1.7.0rc5.dist-info → mlrun-1.7.2.dist-info}/WHEEL +1 -1
- mlrun/feature_store/retrieval/conversion.py +0 -271
- mlrun/kfpops.py +0 -868
- mlrun/model_monitoring/application.py +0 -310
- mlrun/model_monitoring/batch.py +0 -974
- mlrun/model_monitoring/controller_handler.py +0 -37
- mlrun/model_monitoring/prometheus.py +0 -216
- mlrun/model_monitoring/stores/__init__.py +0 -111
- mlrun/model_monitoring/stores/kv_model_endpoint_store.py +0 -574
- mlrun/model_monitoring/stores/model_endpoint_store.py +0 -145
- mlrun/model_monitoring/stores/models/__init__.py +0 -27
- mlrun/model_monitoring/stores/models/base.py +0 -84
- mlrun/model_monitoring/stores/sql_model_endpoint_store.py +0 -382
- mlrun/platforms/other.py +0 -305
- mlrun-1.7.0rc5.dist-info/METADATA +0 -269
- mlrun-1.7.0rc5.dist-info/RECORD +0 -323
- {mlrun-1.7.0rc5.dist-info → mlrun-1.7.2.dist-info}/LICENSE +0 -0
- {mlrun-1.7.0rc5.dist-info → mlrun-1.7.2.dist-info}/entry_points.txt +0 -0
- {mlrun-1.7.0rc5.dist-info → mlrun-1.7.2.dist-info}/top_level.txt +0 -0
mlrun/datastore/google_cloud_storage.py
CHANGED

@@ -12,51 +12,93 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import json
+import os
 from pathlib import Path
 
 from fsspec.registry import get_filesystem_class
+from google.auth.credentials import Credentials
+from google.cloud.storage import Client, transfer_manager
+from google.oauth2 import service_account
 
 import mlrun.errors
 from mlrun.utils import logger
 
-from .base import DataStore, FileStats, makeDatastoreSchemaSanitizer
+from .base import DataStore, FileStats, make_datastore_schema_sanitizer
 
 # Google storage objects will be represented with the following URL: gcs://<bucket name>/<path> or gs://...
 
 
 class GoogleCloudStorageStore(DataStore):
     using_bucket = True
+    workers = 8
+    chunk_size = 32 * 1024 * 1024
 
     def __init__(self, parent, schema, name, endpoint="", secrets: dict = None):
         super().__init__(parent, name, schema, endpoint, secrets=secrets)
+        self._storage_client = None
+        self._storage_options = None
+
+    @property
+    def storage_client(self):
+        if self._storage_client:
+            return self._storage_client
+
+        token = self._get_credentials().get("token")
+        access = "https://www.googleapis.com/auth/devstorage.full_control"
+        if isinstance(token, str):
+            if os.path.exists(token):
+                credentials = service_account.Credentials.from_service_account_file(
+                    token, scopes=[access]
+                )
+            else:
+                raise mlrun.errors.MLRunInvalidArgumentError(
+                    "gcsfs authentication file not found!"
+                )
+        elif isinstance(token, dict):
+            credentials = service_account.Credentials.from_service_account_info(
+                token, scopes=[access]
+            )
+        elif isinstance(token, Credentials):
+            credentials = token
+        else:
+            raise ValueError(f"Unsupported token type: {type(token)}")
+        self._storage_client = Client(credentials=credentials)
+        return self._storage_client
 
     @property
     def filesystem(self):
         """return fsspec file system object, if supported"""
-        if self._filesystem:
-            return self._filesystem
-        try:
-            import gcsfs  # noqa
-        except ImportError as exc:
-            raise ImportError(
-                "Google gcsfs not installed, run pip install gcsfs"
-            ) from exc
-        filesystem_class = get_filesystem_class(protocol=self.kind)
-        self._filesystem = makeDatastoreSchemaSanitizer(
-            filesystem_class,
-            using_bucket=self.using_bucket,
-            **self.get_storage_options(),
-        )
+        if not self._filesystem:
+            filesystem_class = get_filesystem_class(protocol=self.kind)
+            self._filesystem = make_datastore_schema_sanitizer(
+                filesystem_class,
+                using_bucket=self.using_bucket,
+                **self.storage_options,
+            )
         return self._filesystem
 
-    def get_storage_options(self):
+    @property
+    def storage_options(self):
+        if self._storage_options:
+            return self._storage_options
+        credentials = self._get_credentials()
+        # due to caching problem introduced in gcsfs 2024.3.1 (ML-7636)
+        credentials["use_listings_cache"] = False
+        self._storage_options = credentials
+        return self._storage_options
+
+    def _get_credentials(self):
         credentials = self._get_secret_or_env(
             "GCP_CREDENTIALS"
         ) or self._get_secret_or_env("GOOGLE_APPLICATION_CREDENTIALS")
         if credentials:
             try:
-                # Try to handle credentials as a json connection string
-                token = json.loads(credentials)
+                # Try to handle credentials as a json connection string or do nothing if already a dict
+                token = (
+                    credentials
+                    if isinstance(credentials, dict)
+                    else json.loads(credentials)
+                )
             except json.JSONDecodeError:
                 # If it's not json, handle it as a filename
                 token = credentials
@@ -67,6 +109,9 @@ class GoogleCloudStorageStore(DataStore):
         )
         return self._sanitize_storage_options(None)
 
+    def get_storage_options(self):
+        return self.storage_options
+
     def _make_path(self, key):
         key = key.strip("/")
         path = Path(self.endpoint, key).as_posix()
@@ -86,21 +131,34 @@ class GoogleCloudStorageStore(DataStore):
             raise mlrun.errors.MLRunInvalidArgumentError(
                 "Append mode not supported for Google cloud storage datastore"
             )
-
-        if isinstance(data, bytes):
-            mode = "wb"
-        elif isinstance(data, str):
-            mode = "w"
-        else:
-            raise TypeError(
-                "Data type unknown. Unable to put in Google cloud storage!"
-            )
+        data, mode = self._prepare_put_data(data, append)
         with self.filesystem.open(path, mode) as f:
             f.write(data)
 
     def upload(self, key, src_path):
-        path = self._make_path(key)
-        self.filesystem.put_file(src_path, path, overwrite=True)
+        file_size = os.path.getsize(src_path)
+        united_path = self._make_path(key)
+
+        # Multiple upload limitation recommendations as described in
+        # https://cloud.google.com/storage/docs/multipart-uploads#storage-upload-object-chunks-python
+
+        if file_size <= self.chunk_size:
+            self.filesystem.put_file(src_path, united_path, overwrite=True)
+            return
+
+        bucket = self.storage_client.bucket(self.endpoint)
+        blob = bucket.blob(key.strip("/"))
+
+        try:
+            transfer_manager.upload_chunks_concurrently(
+                src_path, blob, chunk_size=self.chunk_size, max_workers=self.workers
+            )
+        except Exception as upload_chunks_concurrently_exception:
+            logger.warning(
+                f"gcs: failed to concurrently upload {src_path},"
+                f" exception: {upload_chunks_concurrently_exception}. Retrying with single part upload."
+            )
+            self.filesystem.put_file(src_path, united_path, overwrite=True)
 
     def stat(self, key):
         path = self._make_path(key)
@@ -129,11 +187,13 @@ class GoogleCloudStorageStore(DataStore):
 
     def rm(self, path, recursive=False, maxdepth=None):
         path = self._make_path(path)
-        self.filesystem.rm(path=path, recursive=recursive, maxdepth=maxdepth)
+        # in order to raise an error in case of a connection error (ML-7056)
+        self.filesystem.exists(path)
+        super().rm(path, recursive=recursive, maxdepth=maxdepth)
 
     def get_spark_options(self):
-        res = None
-        st = self.get_storage_options()
+        res = {}
+        st = self._get_credentials()
         if "token" in st:
             res = {"spark.hadoop.google.cloud.auth.service.account.enable": "true"}
             if isinstance(st["token"], str):
mlrun/datastore/hdfs.py
CHANGED

@@ -12,6 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import os
+from urllib.parse import urlparse
 
 import fsspec
 
@@ -49,3 +50,7 @@ class HdfsStore(DataStore):
     @property
     def spark_url(self):
         return f"hdfs://{self.host}:{self.port}"
+
+    def rm(self, url, recursive=False, maxdepth=None):
+        path = urlparse(url).path
+        self.filesystem.rm(path=path, recursive=recursive, maxdepth=maxdepth)
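The new HdfsStore.rm parses the URL first because fsspec's HDFS filesystem expects a bare path rather than a full URL; urlparse strips the scheme and authority. A quick illustration (the namenode address is a placeholder):

from urllib.parse import urlparse

url = "hdfs://namenode:8020/data/sets/file.parquet"
print(urlparse(url).path)  # -> /data/sets/file.parquet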
mlrun/datastore/inmem.py
CHANGED

@@ -72,7 +72,7 @@ class InMemoryStore(DataStore):
             if columns:
                 kwargs["usecols"] = columns
             reader = df_module.read_csv
-        elif url.endswith(".parquet") or url.endswith(".pq") or format == "parquet":
+        elif mlrun.utils.helpers.is_parquet_file(url, format):
             if columns:
                 kwargs["columns"] = columns
             reader = df_module.read_parquet
@@ -80,8 +80,11 @@ class InMemoryStore(DataStore):
             reader = df_module.read_json
         else:
             raise mlrun.errors.MLRunInvalidArgumentError(f"file type unhandled {url}")
-        # InMemoryStore store
-        for field in ["time_column", "start_time", "end_time"]:
+        # InMemoryStore store – don't pass filters
+        for field in ["time_column", "start_time", "end_time", "additional_filters"]:
             kwargs.pop(field, None)
 
         return reader(item, **kwargs)
+
+    def rm(self, path, recursive=False, maxdepth=None):
+        self._items.pop(path, None)
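The kwarg-stripping loop exists because the store-level filter arguments (now including additional_filters) would be rejected by the pandas readers the memory store dispatches to. A small sketch of the same pattern, with made-up kwargs:

import io

import pandas as pd

kwargs = {"usecols": ["a"], "time_column": "t", "additional_filters": None}
for field in ["time_column", "start_time", "end_time", "additional_filters"]:
    kwargs.pop(field, None)  # read_csv/read_parquet don't accept these

df = pd.read_csv(io.StringIO("a,b\n1,2\n"), **kwargs)
print(df)  # only column "a" survives usecols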
mlrun/datastore/redis.py
CHANGED

@@ -31,7 +31,7 @@ class RedisStore(DataStore):
     """
 
     def __init__(self, parent, schema, name, endpoint="", secrets: dict = None):
-
+        redis_default_port = "6379"
         super().__init__(parent, name, schema, endpoint, secrets=secrets)
         self.headers = None
 
@@ -49,7 +49,7 @@ class RedisStore(DataStore):
         user = self._get_secret_or_env("REDIS_USER", "", credentials_prefix)
         password = self._get_secret_or_env("REDIS_PASSWORD", "", credentials_prefix)
         host = parsed_endpoint.hostname
-        port = parsed_endpoint.port if parsed_endpoint.port else "6379"
+        port = parsed_endpoint.port if parsed_endpoint.port else redis_default_port
         schema = parsed_endpoint.scheme
         if user or password:
             endpoint = f"{schema}://{user}:{password}@{host}:{port}"
 
@@ -126,6 +126,7 @@ class RedisStore(DataStore):
 
     def put(self, key, data, append=False):
         key = RedisStore.build_redis_key(key)
+        data, _ = self._prepare_put_data(data, append)
         if append:
             self.redis.append(key, data)
         else:
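The Redis, S3, and GCS put() methods now all normalize their input through a shared DataStore._prepare_put_data helper. Its body isn't shown in this diff; judging from the call sites (`data, mode = self._prepare_put_data(data, append)`) and from the isinstance block it replaced in the GCS store, a hypothetical reconstruction might look like this (not mlrun's actual code):

def _prepare_put_data(data, append):
    # hypothetical sketch: map payload type to a file-open mode
    if isinstance(data, bytes):
        return data, "ab" if append else "wb"
    elif isinstance(data, str):
        return data, "a" if append else "w"
    raise TypeError(f"Unable to put a value of type {type(data)}")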
mlrun/datastore/s3.py
CHANGED

@@ -15,11 +15,12 @@
 import time
 
 import boto3
+from boto3.s3.transfer import TransferConfig
 from fsspec.registry import get_filesystem_class
 
 import mlrun.errors
 
-from .base import DataStore, FileStats, get_range, makeDatastoreSchemaSanitizer
+from .base import DataStore, FileStats, get_range, make_datastore_schema_sanitizer
 
 
 class S3Store(DataStore):
@@ -35,11 +36,18 @@ class S3Store(DataStore):
 
         access_key_id = self._get_secret_or_env("AWS_ACCESS_KEY_ID")
         secret_key = self._get_secret_or_env("AWS_SECRET_ACCESS_KEY")
+        token_file = self._get_secret_or_env("AWS_CONTAINER_AUTHORIZATION_TOKEN_FILE")
        endpoint_url = self._get_secret_or_env("S3_ENDPOINT_URL")
         force_non_anonymous = self._get_secret_or_env("S3_NON_ANONYMOUS")
         profile_name = self._get_secret_or_env("AWS_PROFILE")
         assume_role_arn = self._get_secret_or_env("MLRUN_AWS_ROLE_ARN")
 
+        self.config = TransferConfig(
+            multipart_threshold=1024 * 1024 * 25,
+            max_concurrency=10,
+            multipart_chunksize=1024 * 1024 * 25,
+        )
+
         # If user asks to assume a role, this needs to go through the STS client and retrieve temporary creds
         if assume_role_arn:
             client = boto3.client(
@@ -87,14 +95,15 @@ class S3Store(DataStore):
             self.s3 = boto3.resource(
                 "s3", region_name=region, endpoint_url=endpoint_url
             )
-            # If not using credentials, boto will still attempt to sign the requests, and will fail any operations
-            # due to no credentials found. These commands disable signing and allow anonymous mode (same as
-            # anon in the storage_options when working with fsspec).
-            from botocore.handlers import disable_signing
-
-            self.s3.meta.client.meta.events.register(
-                "choose-signer.s3.*", disable_signing
-            )
+            if not token_file:
+                # If not using credentials, boto will still attempt to sign the requests, and will fail any operations
+                # due to no credentials found. These commands disable signing and allow anonymous mode (same as
+                # anon in the storage_options when working with fsspec).
+                from botocore.handlers import disable_signing
+
+                self.s3.meta.client.meta.events.register(
+                    "choose-signer.s3.*", disable_signing
+                )
 
     def get_spark_options(self):
         res = {}
@@ -119,7 +128,7 @@ class S3Store(DataStore):
         except ImportError as exc:
             raise ImportError("AWS s3fs not installed") from exc
         filesystem_class = get_filesystem_class(protocol=self.kind)
-        self._filesystem = makeDatastoreSchemaSanitizer(
+        self._filesystem = make_datastore_schema_sanitizer(
             filesystem_class,
             using_bucket=self.using_bucket,
             **self.get_storage_options(),
@@ -132,6 +141,7 @@ class S3Store(DataStore):
         endpoint_url = self._get_secret_or_env("S3_ENDPOINT_URL")
         access_key_id = self._get_secret_or_env("AWS_ACCESS_KEY_ID")
         secret = self._get_secret_or_env("AWS_SECRET_ACCESS_KEY")
+        token_file = self._get_secret_or_env("AWS_CONTAINER_AUTHORIZATION_TOKEN_FILE")
 
         if self._temp_credentials:
             access_key_id = self._temp_credentials["AccessKeyId"]
@@ -141,7 +151,7 @@ class S3Store(DataStore):
             token = None
 
         storage_options = dict(
-            anon=not (force_non_anonymous or (access_key_id and secret)),
+            anon=not (force_non_anonymous or (access_key_id and secret) or token_file),
             key=access_key_id,
             secret=secret,
             token=token,
@@ -166,7 +176,7 @@ class S3Store(DataStore):
 
     def upload(self, key, src_path):
         bucket, key = self.get_bucket_and_key(key)
-        self.s3.Object(bucket, key).put(Body=open(src_path, "rb"))
+        self.s3.Bucket(bucket).upload_file(src_path, key, Config=self.config)
 
     def get(self, key, size=None, offset=0):
         bucket, key = self.get_bucket_and_key(key)
@@ -176,6 +186,7 @@ class S3Store(DataStore):
         return obj.get()["Body"].read()
 
     def put(self, key, data, append=False):
+        data, _ = self._prepare_put_data(data, append)
         bucket, key = self.get_bucket_and_key(key)
         self.s3.Object(bucket, key).put(Body=data)
 
@@ -198,6 +209,13 @@ class S3Store(DataStore):
         bucket = self.s3.Bucket(bucket)
         return [obj.key[key_length:] for obj in bucket.objects.filter(Prefix=key)]
 
+    def rm(self, path, recursive=False, maxdepth=None):
+        bucket, key = self.get_bucket_and_key(path)
+        path = f"{bucket}/{key}"
+        # In order to raise an error if there is connection error, ML-7056.
+        self.filesystem.exists(path=path)
+        self.filesystem.rm(path=path, recursive=recursive, maxdepth=maxdepth)
+
 
 def parse_s3_bucket_and_key(s3_path):
     try:
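Two boto3 behaviors get wired up here: a TransferConfig that switches uploads to multipart above 25 MiB, and request signing disabled for anonymous access when neither credentials nor a web-identity token file are present. A minimal sketch of both under those assumptions; the bucket and file names are placeholders:

import boto3
from boto3.s3.transfer import TransferConfig
from botocore.handlers import disable_signing

config = TransferConfig(
    multipart_threshold=1024 * 1024 * 25,  # single-part below 25 MiB
    max_concurrency=10,
    multipart_chunksize=1024 * 1024 * 25,
)

s3 = boto3.resource("s3")
# anonymous mode: stop boto3 from trying (and failing) to sign requests
s3.meta.client.meta.events.register("choose-signer.s3.*", disable_signing)

# works only against a bucket that actually allows anonymous writes
s3.Bucket("my-public-bucket").upload_file("local.bin", "remote.bin", Config=config)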
mlrun/datastore/snowflake_utils.py
ADDED

@@ -0,0 +1,45 @@
+# Copyright 2024 Iguazio
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+import mlrun
+
+
+def get_snowflake_password():
+    key = "SNOWFLAKE_PASSWORD"
+    snowflake_password = mlrun.get_secret_or_env(key)
+
+    if not snowflake_password:
+        raise mlrun.errors.MLRunInvalidArgumentError(
+            f"No password provided. Set password using the {key} "
+            "project secret or environment variable."
+        )
+
+    return snowflake_password
+
+
+def get_snowflake_spark_options(attributes):
+    if not attributes:
+        return {}
+    return {
+        "format": "net.snowflake.spark.snowflake",
+        "sfURL": attributes.get("url"),
+        "sfUser": attributes.get("user"),
+        "sfPassword": get_snowflake_password(),
+        "sfDatabase": attributes.get("database"),
+        "sfSchema": attributes.get("db_schema"),
+        "sfWarehouse": attributes.get("warehouse"),
+        "application": "iguazio_platform",
+        "TIMESTAMP_TYPE_MAPPING": "TIMESTAMP_LTZ",
+    }