mlrun 1.6.0rc20__py3-none-any.whl → 1.6.0rc22__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of mlrun has been flagged as possibly problematic.
- mlrun/artifacts/base.py +6 -6
- mlrun/artifacts/dataset.py +15 -8
- mlrun/artifacts/manager.py +6 -3
- mlrun/artifacts/model.py +2 -2
- mlrun/artifacts/plots.py +8 -8
- mlrun/config.py +1 -1
- mlrun/data_types/to_pandas.py +1 -1
- mlrun/datastore/azure_blob.py +12 -16
- mlrun/datastore/base.py +32 -10
- mlrun/datastore/datastore_profile.py +4 -4
- mlrun/datastore/dbfs_store.py +12 -11
- mlrun/datastore/filestore.py +2 -1
- mlrun/datastore/google_cloud_storage.py +11 -10
- mlrun/datastore/redis.py +2 -1
- mlrun/datastore/s3.py +12 -15
- mlrun/datastore/sources.py +16 -11
- mlrun/datastore/targets.py +2 -13
- mlrun/datastore/v3io.py +18 -20
- mlrun/db/httpdb.py +76 -7
- mlrun/errors.py +4 -0
- mlrun/execution.py +13 -4
- mlrun/feature_store/api.py +3 -4
- mlrun/launcher/base.py +4 -4
- mlrun/lists.py +0 -6
- mlrun/model.py +8 -1
- mlrun/model_monitoring/api.py +9 -31
- mlrun/model_monitoring/batch.py +14 -13
- mlrun/model_monitoring/controller.py +100 -70
- mlrun/model_monitoring/controller_handler.py +1 -3
- mlrun/model_monitoring/helpers.py +65 -20
- mlrun/model_monitoring/stream_processing.py +0 -3
- mlrun/projects/operations.py +1 -1
- mlrun/projects/project.py +10 -4
- mlrun/runtimes/base.py +6 -1
- mlrun/runtimes/constants.py +11 -0
- mlrun/runtimes/databricks_job/databricks_runtime.py +7 -9
- mlrun/runtimes/kubejob.py +1 -1
- mlrun/runtimes/local.py +64 -53
- mlrun/runtimes/serving.py +8 -1
- mlrun/serving/routers.py +7 -20
- mlrun/serving/server.py +4 -14
- mlrun/serving/utils.py +0 -3
- mlrun/utils/helpers.py +10 -2
- mlrun/utils/logger.py +5 -5
- mlrun/utils/version/version.json +2 -2
- {mlrun-1.6.0rc20.dist-info → mlrun-1.6.0rc22.dist-info}/METADATA +5 -3
- {mlrun-1.6.0rc20.dist-info → mlrun-1.6.0rc22.dist-info}/RECORD +51 -51
- {mlrun-1.6.0rc20.dist-info → mlrun-1.6.0rc22.dist-info}/LICENSE +0 -0
- {mlrun-1.6.0rc20.dist-info → mlrun-1.6.0rc22.dist-info}/WHEEL +0 -0
- {mlrun-1.6.0rc20.dist-info → mlrun-1.6.0rc22.dist-info}/entry_points.txt +0 -0
- {mlrun-1.6.0rc20.dist-info → mlrun-1.6.0rc22.dist-info}/top_level.txt +0 -0
mlrun/artifacts/base.py
CHANGED
@@ -714,10 +714,10 @@ class LinkArtifact(Artifact):
         self._spec = self._verify_dict(spec, "spec", LinkArtifactSpec)


-# TODO: remove in 1.
+# TODO: remove in 1.7.0
 @deprecated(
     version="1.3.0",
-    reason="'LegacyArtifact' will be removed in 1.
+    reason="'LegacyArtifact' will be removed in 1.7.0, use 'Artifact' instead",
     category=FutureWarning,
 )
 class LegacyArtifact(ModelObj):
@@ -880,10 +880,10 @@ class LegacyArtifact(ModelObj):
         return generate_target_path(self, artifact_path, producer)


-# TODO: remove in 1.
+# TODO: remove in 1.7.0
 @deprecated(
     version="1.3.0",
-    reason="'LegacyDirArtifact' will be removed in 1.
+    reason="'LegacyDirArtifact' will be removed in 1.7.0, use 'DirArtifact' instead",
     category=FutureWarning,
 )
 class LegacyDirArtifact(LegacyArtifact):
@@ -916,10 +916,10 @@ class LegacyDirArtifact(LegacyArtifact):
         mlrun.datastore.store_manager.object(url=target).upload(file_path)


-# TODO: remove in 1.
+# TODO: remove in 1.7.0
 @deprecated(
     version="1.3.0",
-    reason="'LegacyLinkArtifact' will be removed in 1.
+    reason="'LegacyLinkArtifact' will be removed in 1.7.0, use 'LinkArtifact' instead",
     category=FutureWarning,
 )
 class LegacyLinkArtifact(LegacyArtifact):
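
For reference, a minimal standalone sketch (illustrative, not MLRun code) of the deprecation pattern these hunks update: the `deprecated` decorator emits the configured warning category when the legacy class is instantiated.

import warnings

from deprecated import deprecated


@deprecated(
    version="1.3.0",
    reason="'LegacyThing' will be removed in 1.7.0, use 'Thing' instead",
    category=FutureWarning,
)
class LegacyThing:  # hypothetical stand-in for the Legacy* artifact classes
    pass


with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    LegacyThing()  # instantiating the decorated class triggers the warning
    print(caught[0].category.__name__, caught[0].message)
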
mlrun/artifacts/dataset.py
CHANGED
@@ -283,14 +283,16 @@ class DatasetArtifact(Artifact):
         if artifact.spec.length > preview_rows_length and not ignore_preview_limits:
             preview_df = df.head(preview_rows_length)

-
-        # that way it wont create another index if one already there
-        preview_df = preview_df.reset_index(drop=True)
+        preview_df = preview_df.reset_index()
         artifact.status.header_original_length = len(preview_df.columns)
         if len(preview_df.columns) > max_preview_columns and not ignore_preview_limits:
             preview_df = preview_df.iloc[:, :max_preview_columns]
         artifact.spec.header = preview_df.columns.values.tolist()
         artifact.status.preview = preview_df.values.tolist()
+        # Table schema parsing doesn't require a column named "index"
+        # to align its output with previously generated header and preview data
+        if "index" in preview_df.columns:
+            preview_df.drop("index", axis=1, inplace=True)
         artifact.spec.schema = build_table_schema(preview_df)

         # set artifact stats if stats is explicitly set to true, or if stats is None and the dataframe is small
@@ -344,10 +346,10 @@ class DatasetArtifact(Artifact):
         self.status.stats = stats


-# TODO: remove in 1.
+# TODO: remove in 1.7.0
 @deprecated(
     version="1.3.0",
-    reason="'LegacyTableArtifact' will be removed in 1.
+    reason="'LegacyTableArtifact' will be removed in 1.7.0, use 'TableArtifact' instead",
     category=FutureWarning,
 )
 class LegacyTableArtifact(LegacyArtifact):
@@ -400,10 +402,10 @@ class LegacyTableArtifact(LegacyArtifact):
         return csv_buffer.getvalue()


-# TODO: remove in 1.
+# TODO: remove in 1.7.0
 @deprecated(
     version="1.3.0",
-    reason="'LegacyDatasetArtifact' will be removed in 1.
+    reason="'LegacyDatasetArtifact' will be removed in 1.7.0, use 'DatasetArtifact' instead",
     category=FutureWarning,
 )
 class LegacyDatasetArtifact(LegacyArtifact):
@@ -513,11 +515,16 @@ class LegacyDatasetArtifact(LegacyArtifact):

         if artifact.length > preview_rows_length and not ignore_preview_limits:
             preview_df = df.head(preview_rows_length)
-
+
+        preview_df = preview_df.reset_index()
         if len(preview_df.columns) > max_preview_columns and not ignore_preview_limits:
             preview_df = preview_df.iloc[:, :max_preview_columns]
         artifact.header = preview_df.columns.values.tolist()
         artifact.preview = preview_df.values.tolist()
+        # Table schema parsing doesn't require a column named "index"
+        # to align its output with previously generated header and preview data
+        if "index" in preview_df.columns:
+            preview_df.drop("index", axis=1, inplace=True)
         artifact.schema = build_table_schema(preview_df)
         if (
             stats
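
A small pandas sketch (illustrative only, not MLRun code) of the preview change above: `reset_index()` keeps the original index visible in the header and preview, and the synthetic "index" column is then dropped before the table schema is built.

import pandas as pd
from pandas.io.json import build_table_schema  # same helper the artifact code calls

df = pd.DataFrame({"a": [1, 2, 3]}, index=[10, 11, 12])

preview_df = df.head(2).reset_index()        # the index becomes a visible "index" column
header = preview_df.columns.values.tolist()  # ['index', 'a']
preview = preview_df.values.tolist()         # [[10, 1], [11, 2]]

# mirror the change above: drop the synthetic "index" column before building the schema
if "index" in preview_df.columns:
    preview_df.drop("index", axis=1, inplace=True)
schema = build_table_schema(preview_df)
print(header, preview, schema)
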
mlrun/artifacts/manager.py
CHANGED
@@ -66,7 +66,7 @@ artifact_types = {
     "bokeh": BokehArtifact,
 }

-# TODO - Remove this when legacy types are deleted in 1.
+# TODO - Remove this when legacy types are deleted in 1.7.0
 legacy_artifact_types = {
     "": LegacyArtifact,
     "dir": LegacyDirArtifact,
@@ -200,8 +200,11 @@ class ArtifactManager:
             # and receive back all the runs that are associated with his search result.
             db_key = producer.name + "_" + key
         else:
-            db_key
-
+            # if the db_key is not explicitly set on the item, we want to use the key as the db_key
+            # otherwise, we do not want to override it.
+            # this is mainly relevant for imported artifacts that have an explicit db_key value already set
+            db_key = item.db_key or key
+            item.db_key = db_key or ""
         item.viewer = viewer or item.viewer
         item.tree = producer.tag
         item.tag = tag or item.tag
mlrun/artifacts/model.py
CHANGED
@@ -390,10 +390,10 @@ class ModelArtifact(Artifact):
         return mlrun.get_dataitem(target_model_path).get()


-# TODO: remove in 1.
+# TODO: remove in 1.7.0
 @deprecated(
     version="1.3.0",
-    reason="'LegacyModelArtifact' will be removed in 1.
+    reason="'LegacyModelArtifact' will be removed in 1.7.0, use 'ModelArtifact' instead",
     category=FutureWarning,
 )
 class LegacyModelArtifact(LegacyArtifact):
mlrun/artifacts/plots.py
CHANGED
@@ -256,10 +256,10 @@ class PlotlyArtifact(Artifact):
         return self._figure.to_html()


-# TODO: remove in 1.
+# TODO: remove in 1.7.0
 @deprecated(
     version="1.3.0",
-    reason="'LegacyPlotArtifact' will be removed in 1.
+    reason="'LegacyPlotArtifact' will be removed in 1.7.0, use 'PlotArtifact' instead",
     category=FutureWarning,
 )
 class LegacyPlotArtifact(LegacyArtifact):
@@ -303,10 +303,10 @@ class LegacyPlotArtifact(LegacyArtifact):
         return self._TEMPLATE.format(self.description or self.key, self.key, data_uri)


-# TODO: remove in 1.
+# TODO: remove in 1.7.0
 @deprecated(
     version="1.3.0",
-    reason="'LegacyChartArtifact' will be removed in 1.
+    reason="'LegacyChartArtifact' will be removed in 1.7.0, use 'ChartArtifact' instead",
     category=FutureWarning,
 )
 class LegacyChartArtifact(LegacyArtifact):
@@ -377,10 +377,10 @@ class LegacyChartArtifact(LegacyArtifact):
         )


-# TODO: remove in 1.
+# TODO: remove in 1.7.0
 @deprecated(
     version="1.3.0",
-    reason="'LegacyBokehArtifact' will be removed in 1.
+    reason="'LegacyBokehArtifact' will be removed in 1.7.0, use 'BokehArtifact' instead",
     category=FutureWarning,
 )
 class LegacyBokehArtifact(LegacyArtifact):
@@ -433,10 +433,10 @@ class LegacyBokehArtifact(LegacyArtifact):
         return file_html(self._figure, CDN, self.key)


-# TODO: remove in 1.
+# TODO: remove in 1.7.0
 @deprecated(
     version="1.3.0",
-    reason="'LegacyPlotlyArtifact' will be removed in 1.
+    reason="'LegacyPlotlyArtifact' will be removed in 1.7.0, use 'PlotlyArtifact' instead",
     category=FutureWarning,
 )
 class LegacyPlotlyArtifact(LegacyArtifact):
mlrun/config.py
CHANGED
@@ -278,7 +278,7 @@ default_config = {
     "real_path": "",
     # comma delimited prefixes of paths allowed through the /files API (v3io & the real_path are always allowed).
     # These paths must be schemas (cannot be used for local files). For example "s3://mybucket,gcs://"
-    "allowed_file_paths": "s3://,gcs://,gs://,az://,dbfs://",
+    "allowed_file_paths": "s3://,gcs://,gs://,az://,dbfs://,ds://",
     "db_type": "sqldb",
     "max_workers": 64,
     # See mlrun.common.schemas.APIStates for options
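
A minimal sketch (hypothetical helper, not MLRun's actual /files validation) of how a comma-delimited prefix allow list like `allowed_file_paths` can be applied; the `ds://` entry added above lets datastore-profile URLs pass the same prefix check.

ALLOWED_FILE_PATHS = "s3://,gcs://,gs://,az://,dbfs://,ds://"


def is_path_allowed(path: str, allowed: str = ALLOWED_FILE_PATHS) -> bool:
    # split the comma-delimited config value into scheme prefixes and
    # accept any path that starts with one of them
    prefixes = [prefix.strip() for prefix in allowed.split(",") if prefix.strip()]
    return any(path.startswith(prefix) for prefix in prefixes)


print(is_path_allowed("ds://my-profile/some/key"))  # True with the new ds:// entry
print(is_path_allowed("/local/path/data.csv"))      # False, local paths are not in the list
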
mlrun/data_types/to_pandas.py
CHANGED
@@ -178,7 +178,7 @@ def toPandas(spark_df):
         if isinstance(field.dataType, IntegralType) and pandas_col.isnull().any():
             dtype[fieldIdx] = np.float64
         if isinstance(field.dataType, BooleanType) and pandas_col.isnull().any():
-            dtype[fieldIdx] =
+            dtype[fieldIdx] = object

     df = pd.DataFrame()
     for index, t in enumerate(dtype):
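
For context, a short pandas/NumPy sketch (not MLRun code) of why a nullable boolean column is mapped to `object` above: NumPy's bool dtype cannot represent missing values, so a plain bool cast would silently coerce the nulls.

import numpy as np
import pandas as pd

col = pd.Series([True, None, False])     # a boolean column that contains a null

print(col.dtype)                         # object: True/False/None stay as Python objects
print(col.isnull().any())                # True: the missing value is preserved

# forcing a NumPy bool dtype would coerce the null to False instead
print(np.array([True, None, False], dtype=bool))  # [ True False False]
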
mlrun/datastore/azure_blob.py
CHANGED
@@ -20,7 +20,6 @@ from azure.storage.blob._shared.base_client import parse_connection_str
 from fsspec.registry import get_filesystem_class

 import mlrun.errors
-from mlrun.errors import err_to_str

 from .base import DataStore, FileStats, makeDatastoreSchemaSanitizer

@@ -33,20 +32,16 @@ class AzureBlobStore(DataStore):

     def __init__(self, parent, schema, name, endpoint="", secrets: dict = None):
         super().__init__(parent, name, schema, endpoint, secrets=secrets)
-        self.get_filesystem()

-
+    @property
+    def filesystem(self):
         """return fsspec file system object, if supported"""
         if self._filesystem:
             return self._filesystem
         try:
             import adlfs  # noqa
         except ImportError as exc:
-
-            raise ImportError(
-                f"Azure adlfs not installed, run pip install adlfs, {err_to_str(exc)}"
-            )
-            return None
+            raise ImportError("Azure adlfs not installed") from exc
         # in order to support az and wasbs kinds.
         filesystem_class = get_filesystem_class(protocol=self.kind)
         self._filesystem = makeDatastoreSchemaSanitizer(
@@ -57,7 +52,7 @@ class AzureBlobStore(DataStore):
         return self._filesystem

     def get_storage_options(self):
-
+        res = dict(
             account_name=self._get_secret_or_env("account_name")
             or self._get_secret_or_env("AZURE_STORAGE_ACCOUNT_NAME"),
             account_key=self._get_secret_or_env("account_key")
@@ -74,6 +69,7 @@ class AzureBlobStore(DataStore):
             or self._get_secret_or_env("AZURE_STORAGE_SAS_TOKEN"),
             credential=self._get_secret_or_env("credential"),
         )
+        return self._sanitize_storage_options(res)

     def _convert_key_to_remote_path(self, key):
         key = key.strip("/")
@@ -86,12 +82,12 @@ class AzureBlobStore(DataStore):

     def upload(self, key, src_path):
         remote_path = self._convert_key_to_remote_path(key)
-        self.
+        self.filesystem.put_file(src_path, remote_path, overwrite=True)

     def get(self, key, size=None, offset=0):
         remote_path = self._convert_key_to_remote_path(key)
         end = offset + size if size else None
-        blob = self.
+        blob = self.filesystem.cat_file(remote_path, start=offset, end=end)
         return blob

     def put(self, key, data, append=False):
@@ -106,12 +102,12 @@ class AzureBlobStore(DataStore):
             mode = "w"
         else:
             raise TypeError("Data type unknown. Unable to put in Azure!")
-        with self.
+        with self.filesystem.open(remote_path, mode) as f:
             f.write(data)

     def stat(self, key):
         remote_path = self._convert_key_to_remote_path(key)
-        files = self.
+        files = self.filesystem.ls(remote_path, detail=True)
         if len(files) == 1 and files[0]["type"] == "file":
             size = files[0]["size"]
             modified = files[0]["last_modified"]
@@ -123,10 +119,10 @@ class AzureBlobStore(DataStore):

     def listdir(self, key):
         remote_path = self._convert_key_to_remote_path(key)
-        if self.
+        if self.filesystem.isfile(remote_path):
             return key
         remote_path = f"{remote_path}/**"
-        files = self.
+        files = self.filesystem.glob(remote_path)
         key_length = len(key)
         files = [
             f.split("/", 1)[1][key_length:] for f in files if len(f.split("/")) > 1
@@ -149,7 +145,7 @@ class AzureBlobStore(DataStore):
         for key in ["account_name", "account_key"]:
             parsed_value = parsed_credential.get(key)
             if parsed_value:
-                if
+                if key in st and st[key] != parsed_value:
                     if key == "account_name":
                         raise mlrun.errors.MLRunInvalidArgumentError(
                             f"Storage option for '{key}' is '{st[key]}',\
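
The recurring change in this and the following datastore modules replaces an eager `get_filesystem()` call in `__init__` with a lazily evaluated `filesystem` property. A minimal sketch of that pattern (illustrative class, not one of the MLRun stores):

import fsspec


class LazyFsStore:
    """Builds and caches an fsspec filesystem on first access instead of in __init__."""

    def __init__(self):
        self._filesystem = None

    @property
    def filesystem(self):
        # create the filesystem only when it is first needed, then reuse it
        if self._filesystem is None:
            self._filesystem = fsspec.filesystem("memory")
        return self._filesystem


store = LazyFsStore()                               # no filesystem is created yet
store.filesystem.pipe_file("/demo.txt", b"hello")   # first access builds it
print(store.filesystem.cat_file("/demo.txt"))       # b'hello'
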
mlrun/datastore/base.py
CHANGED
@@ -25,6 +25,7 @@ import pyarrow
 import pytz
 import requests
 import urllib3
+from deprecated import deprecated

 import mlrun.errors
 from mlrun.errors import err_to_str
@@ -71,16 +72,24 @@ class DataStore:
     def is_unstructured(self):
         return True

+    @staticmethod
+    def _sanitize_storage_options(options):
+        if not options:
+            return {}
+        options = {k: v for k, v in options.items() if v is not None and v != ""}
+        return options
+
     @staticmethod
     def _sanitize_url(url):
         """
         Extract only the schema, netloc, and path from an input URL if they exist,
         excluding parameters, query, or fragments.
         """
+        if not url:
+            raise mlrun.errors.MLRunInvalidArgumentError("Cannot parse an empty URL")
         parsed_url = urllib.parse.urlparse(url)
-        scheme = f"{parsed_url.scheme}:" if parsed_url.scheme else ""
         netloc = f"//{parsed_url.netloc}" if parsed_url.netloc else "//"
-        return f"{scheme}{netloc}{parsed_url.path}"
+        return f"{parsed_url.scheme}:{netloc}{parsed_url.path}"

     @staticmethod
     def uri_to_kfp(endpoint, subpath):
@@ -90,7 +99,18 @@ class DataStore:
     def uri_to_ipython(endpoint, subpath):
         return ""

-
+    # TODO: remove in 1.8.0
+    @deprecated(
+        version="1.8.0",
+        reason="'get_filesystem()' will be removed in 1.8.0, use "
+        "'filesystem' property instead",
+        category=FutureWarning,
+    )
+    def get_filesystem(self):
+        return self.filesystem
+
+    @property
+    def filesystem(self) -> Optional[fsspec.AbstractFileSystem]:
         """return fsspec file system object, if supported"""
         return None

@@ -106,10 +126,10 @@ class DataStore:

     def get_storage_options(self):
         """get fsspec storage options"""
-        return None
+        return self._sanitize_storage_options(None)

     def open(self, filepath, mode):
-        file_system = self.
+        file_system = self.filesystem
         return file_system.open(filepath, mode)

     def _join(self, key):
@@ -230,7 +250,7 @@ class DataStore:
         df_module = df_module or pd
         file_url = self._sanitize_url(url)
         is_csv, is_json, drop_time_column = False, False, False
-        file_system = self.
+        file_system = self.filesystem
         if file_url.endswith(".csv") or format == "csv":
             is_csv = True
             drop_time_column = False
@@ -355,7 +375,7 @@ class DataStore:
         }

     def rm(self, path, recursive=False, maxdepth=None):
-        self.
+        self.filesystem.rm(path=path, recursive=recursive, maxdepth=maxdepth)

     @staticmethod
     def _is_dd(df_module):
@@ -645,9 +665,10 @@ def http_head(url, headers=None, auth=None):
     return response.headers


-def http_put(url, data, headers=None, auth=None):
+def http_put(url, data, headers=None, auth=None, session=None):
     try:
-
+        put_api = session.put if session else requests.put
+        response = put_api(
             url, data=data, headers=headers, auth=auth, verify=verify_ssl
         )
     except OSError as exc:
@@ -671,7 +692,8 @@ class HttpStore(DataStore):
         self._enrich_https_token()
         self._validate_https_token()

-
+    @property
+    def filesystem(self):
         """return fsspec file system object, if supported"""
         if not self._filesystem:
             self._filesystem = fsspec.filesystem("http")
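
A standalone sketch of the two sanitization helpers in the hunks above (mirroring the added `_sanitize_storage_options` and the tightened `_sanitize_url`, without importing MLRun): empty or None storage options are dropped before they reach fsspec, and URL sanitization keeps only scheme, netloc, and path.

import urllib.parse


def sanitize_storage_options(options):
    # drop None/empty values so they don't override fsspec defaults
    if not options:
        return {}
    return {key: value for key, value in options.items() if value is not None and value != ""}


def sanitize_url(url):
    # keep scheme, netloc, and path; strip params, query, and fragment
    parsed = urllib.parse.urlparse(url)
    netloc = f"//{parsed.netloc}" if parsed.netloc else "//"
    return f"{parsed.scheme}:{netloc}{parsed.path}"


print(sanitize_storage_options({"token": "", "instance": None, "key": "abc"}))  # {'key': 'abc'}
print(sanitize_url("s3://bucket/path/file.parquet?versionId=7#frag"))           # s3://bucket/path/file.parquet
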
mlrun/datastore/datastore_profile.py
CHANGED
@@ -131,18 +131,18 @@ class DatastoreProfileKafkaSource(DatastoreProfile):

 class DatastoreProfileS3(DatastoreProfile):
     type: str = pydantic.Field("s3")
-    _private_attributes = ("
+    _private_attributes = ("access_key_id", "secret_key")
     endpoint_url: typing.Optional[str] = None
     force_non_anonymous: typing.Optional[str] = None
     profile_name: typing.Optional[str] = None
     assume_role_arn: typing.Optional[str] = None
-
+    access_key_id: typing.Optional[str] = None
     secret_key: typing.Optional[str] = None

     def secrets(self) -> dict:
         res = {}
-        if self.
-            res["AWS_ACCESS_KEY_ID"] = self.
+        if self.access_key_id:
+            res["AWS_ACCESS_KEY_ID"] = self.access_key_id
         if self.secret_key:
             res["AWS_SECRET_ACCESS_KEY"] = self.secret_key
         if self.endpoint_url:
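
A short sketch of the renamed S3 profile fields above, using a standalone pydantic model (a hypothetical stand-in for `DatastoreProfileS3`) to show how `access_key_id` and `secret_key` map to the standard AWS secret names.

import typing

import pydantic


class S3ProfileSketch(pydantic.BaseModel):
    # field names follow the diff above; this is not MLRun's own class
    access_key_id: typing.Optional[str] = None
    secret_key: typing.Optional[str] = None

    def secrets(self) -> dict:
        res = {}
        if self.access_key_id:
            res["AWS_ACCESS_KEY_ID"] = self.access_key_id
        if self.secret_key:
            res["AWS_SECRET_ACCESS_KEY"] = self.secret_key
        return res


profile = S3ProfileSketch(access_key_id="AKIA-example", secret_key="example-secret")
print(profile.secrets())  # {'AWS_ACCESS_KEY_ID': 'AKIA-example', 'AWS_SECRET_ACCESS_KEY': 'example-secret'}
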
mlrun/datastore/dbfs_store.py
CHANGED
@@ -83,9 +83,9 @@ class DatabricksFileSystemDisableCache(DatabricksFileSystem):
 class DBFSStore(DataStore):
     def __init__(self, parent, schema, name, endpoint="", secrets: dict = None):
         super().__init__(parent, name, schema, endpoint, secrets=secrets)
-        self.get_filesystem(silent=False)

-
+    @property
+    def filesystem(self):
         """return fsspec file system object, if supported"""
         filesystem_class = get_filesystem_class(protocol=self.kind)
         if not self._filesystem:
@@ -97,13 +97,14 @@ class DBFSStore(DataStore):
         return self._filesystem

     def get_storage_options(self):
-
+        res = dict(
             token=self._get_secret_or_env("DATABRICKS_TOKEN"),
             instance=self._get_secret_or_env("DATABRICKS_HOST"),
         )
+        return self._sanitize_storage_options(res)

     def _verify_filesystem_and_key(self, key: str):
-        if not self.
+        if not self.filesystem:
             raise mlrun.errors.MLRunInvalidArgumentError(
                 "Performing actions on data-item without a valid filesystem"
             )
@@ -120,7 +121,7 @@ class DBFSStore(DataStore):
             raise mlrun.errors.MLRunInvalidArgumentError("offset cannot be None")
         start = offset or None
         end = offset + size if size else None
-        return self.
+        return self.filesystem.cat_file(key, start=start, end=end)

     def put(self, key, data, append=False):
         self._verify_filesystem_and_key(key)
@@ -134,16 +135,16 @@ class DBFSStore(DataStore):
             mode += "b"
         elif not isinstance(data, str):
             raise TypeError(f"Unknown data type {type(data)}")
-        with self.
+        with self.filesystem.open(key, mode) as f:
             f.write(data)

     def upload(self, key: str, src_path: str):
         self._verify_filesystem_and_key(key)
-        self.
+        self.filesystem.put_file(src_path, key, overwrite=True)

     def stat(self, key: str):
         self._verify_filesystem_and_key(key)
-        file = self.
+        file = self.filesystem.stat(key)
         if file["type"] == "file":
             size = file["size"]
         elif file["type"] == "directory":
@@ -155,10 +156,10 @@ class DBFSStore(DataStore):
         Basic ls of file/dir - without recursion.
         """
         self._verify_filesystem_and_key(key)
-        if self.
+        if self.filesystem.isfile(key):
             return key
         remote_path = f"{key}/*"
-        files = self.
+        files = self.filesystem.glob(remote_path)
         # Get only the files and directories under key path, without the key path itself.
         # for example in a filesystem that has this path: /test_mlrun_dbfs_objects/test.txt
         # listdir with the input /test_mlrun_dbfs_objects as a key will return ['test.txt'].
@@ -170,4 +171,4 @@ class DBFSStore(DataStore):
             raise mlrun.errors.MLRunInvalidArgumentError(
                 "dbfs file system does not support maxdepth option in rm function"
             )
-        self.
+        self.filesystem.rm(path=path, recursive=recursive)
mlrun/datastore/filestore.py
CHANGED
@@ -47,7 +47,8 @@ class FileStore(DataStore):
         key = path.join(self._real_path, suffix)
         return path.join(self.subpath, key)

-
+    @property
+    def filesystem(self):
         """return fsspec file system object, if supported"""
         if not self._filesystem:
             self._filesystem = fsspec.filesystem("file")
mlrun/datastore/google_cloud_storage.py
CHANGED
@@ -30,7 +30,8 @@ class GoogleCloudStorageStore(DataStore):
     def __init__(self, parent, schema, name, endpoint="", secrets: dict = None):
         super().__init__(parent, name, schema, endpoint, secrets=secrets)

-
+    @property
+    def filesystem(self):
         """return fsspec file system object, if supported"""
         if self._filesystem:
             return self._filesystem
@@ -59,12 +60,12 @@ class GoogleCloudStorageStore(DataStore):
             except json.JSONDecodeError:
                 # If it's not json, handle it as a filename
                 token = credentials
-
+            return self._sanitize_storage_options(dict(token=token))
         else:
             logger.info(
                 "No GCS credentials available - auth will rely on auto-discovery of credentials"
             )
-            return None
+            return self._sanitize_storage_options(None)

     def _make_path(self, key):
         key = key.strip("/")
@@ -75,7 +76,7 @@ class GoogleCloudStorageStore(DataStore):
         path = self._make_path(key)

         end = offset + size if size else None
-        blob = self.
+        blob = self.filesystem.cat_file(path, start=offset, end=end)
         return blob

     def put(self, key, data, append=False):
@@ -94,17 +95,17 @@ class GoogleCloudStorageStore(DataStore):
             raise TypeError(
                 "Data type unknown. Unable to put in Google cloud storage!"
             )
-        with self.
+        with self.filesystem.open(path, mode) as f:
             f.write(data)

     def upload(self, key, src_path):
         path = self._make_path(key)
-        self.
+        self.filesystem.put_file(src_path, path, overwrite=True)

     def stat(self, key):
         path = self._make_path(key)

-        files = self.
+        files = self.filesystem.ls(path, detail=True)
         if len(files) == 1 and files[0]["type"] == "file":
             size = files[0]["size"]
             modified = files[0]["updated"]
@@ -116,10 +117,10 @@ class GoogleCloudStorageStore(DataStore):

     def listdir(self, key):
         path = self._make_path(key)
-        if self.
+        if self.filesystem.isfile(path):
             return key
         remote_path = f"{path}/**"
-        files = self.
+        files = self.filesystem.glob(remote_path)
         key_length = len(key)
         files = [
             f.split("/", 1)[1][key_length:] for f in files if len(f.split("/")) > 1
@@ -128,7 +129,7 @@ class GoogleCloudStorageStore(DataStore):

     def rm(self, path, recursive=False, maxdepth=None):
         path = self._make_path(path)
-        self.
+        self.filesystem.rm(path=path, recursive=recursive, maxdepth=maxdepth)

     def get_spark_options(self):
         res = None