mlrun 1.7.0rc28__py3-none-any.whl → 1.7.0rc55__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mlrun/__main__.py +4 -2
- mlrun/alerts/alert.py +75 -8
- mlrun/artifacts/base.py +1 -0
- mlrun/artifacts/manager.py +9 -2
- mlrun/common/constants.py +4 -1
- mlrun/common/db/sql_session.py +3 -2
- mlrun/common/formatters/__init__.py +1 -0
- mlrun/common/formatters/artifact.py +1 -0
- mlrun/{model_monitoring/application.py → common/formatters/feature_set.py} +20 -6
- mlrun/common/formatters/run.py +3 -0
- mlrun/common/helpers.py +0 -1
- mlrun/common/schemas/__init__.py +3 -1
- mlrun/common/schemas/alert.py +15 -12
- mlrun/common/schemas/api_gateway.py +6 -6
- mlrun/common/schemas/auth.py +5 -0
- mlrun/common/schemas/client_spec.py +0 -1
- mlrun/common/schemas/common.py +7 -4
- mlrun/common/schemas/frontend_spec.py +7 -0
- mlrun/common/schemas/function.py +7 -0
- mlrun/common/schemas/model_monitoring/__init__.py +4 -3
- mlrun/common/schemas/model_monitoring/constants.py +41 -26
- mlrun/common/schemas/model_monitoring/model_endpoints.py +23 -47
- mlrun/common/schemas/notification.py +69 -12
- mlrun/common/schemas/project.py +45 -12
- mlrun/common/schemas/workflow.py +10 -2
- mlrun/common/types.py +1 -0
- mlrun/config.py +91 -35
- mlrun/data_types/data_types.py +6 -1
- mlrun/data_types/spark.py +2 -2
- mlrun/data_types/to_pandas.py +57 -25
- mlrun/datastore/__init__.py +1 -0
- mlrun/datastore/alibaba_oss.py +3 -2
- mlrun/datastore/azure_blob.py +125 -37
- mlrun/datastore/base.py +42 -21
- mlrun/datastore/datastore.py +4 -2
- mlrun/datastore/datastore_profile.py +1 -1
- mlrun/datastore/dbfs_store.py +3 -7
- mlrun/datastore/filestore.py +1 -3
- mlrun/datastore/google_cloud_storage.py +85 -29
- mlrun/datastore/inmem.py +4 -1
- mlrun/datastore/redis.py +1 -0
- mlrun/datastore/s3.py +25 -12
- mlrun/datastore/sources.py +76 -4
- mlrun/datastore/spark_utils.py +30 -0
- mlrun/datastore/storeytargets.py +151 -0
- mlrun/datastore/targets.py +102 -131
- mlrun/datastore/v3io.py +1 -0
- mlrun/db/base.py +15 -6
- mlrun/db/httpdb.py +57 -28
- mlrun/db/nopdb.py +29 -5
- mlrun/errors.py +20 -3
- mlrun/execution.py +46 -5
- mlrun/feature_store/api.py +25 -1
- mlrun/feature_store/common.py +6 -11
- mlrun/feature_store/feature_vector.py +3 -1
- mlrun/feature_store/retrieval/job.py +4 -1
- mlrun/feature_store/retrieval/spark_merger.py +10 -39
- mlrun/feature_store/steps.py +8 -0
- mlrun/frameworks/_common/plan.py +3 -3
- mlrun/frameworks/_ml_common/plan.py +1 -1
- mlrun/frameworks/parallel_coordinates.py +2 -3
- mlrun/frameworks/sklearn/mlrun_interface.py +13 -3
- mlrun/k8s_utils.py +48 -2
- mlrun/launcher/client.py +6 -6
- mlrun/launcher/local.py +2 -2
- mlrun/model.py +215 -34
- mlrun/model_monitoring/api.py +38 -24
- mlrun/model_monitoring/applications/__init__.py +1 -2
- mlrun/model_monitoring/applications/_application_steps.py +60 -29
- mlrun/model_monitoring/applications/base.py +2 -174
- mlrun/model_monitoring/applications/context.py +197 -70
- mlrun/model_monitoring/applications/evidently_base.py +11 -85
- mlrun/model_monitoring/applications/histogram_data_drift.py +21 -16
- mlrun/model_monitoring/applications/results.py +4 -4
- mlrun/model_monitoring/controller.py +110 -282
- mlrun/model_monitoring/db/stores/__init__.py +8 -3
- mlrun/model_monitoring/db/stores/base/store.py +3 -0
- mlrun/model_monitoring/db/stores/sqldb/models/base.py +9 -7
- mlrun/model_monitoring/db/stores/sqldb/models/mysql.py +18 -3
- mlrun/model_monitoring/db/stores/sqldb/sql_store.py +43 -23
- mlrun/model_monitoring/db/stores/v3io_kv/kv_store.py +48 -35
- mlrun/model_monitoring/db/tsdb/__init__.py +7 -2
- mlrun/model_monitoring/db/tsdb/base.py +147 -15
- mlrun/model_monitoring/db/tsdb/tdengine/schemas.py +94 -55
- mlrun/model_monitoring/db/tsdb/tdengine/stream_graph_steps.py +0 -3
- mlrun/model_monitoring/db/tsdb/tdengine/tdengine_connector.py +144 -38
- mlrun/model_monitoring/db/tsdb/v3io/stream_graph_steps.py +44 -3
- mlrun/model_monitoring/db/tsdb/v3io/v3io_connector.py +246 -57
- mlrun/model_monitoring/helpers.py +70 -50
- mlrun/model_monitoring/stream_processing.py +96 -195
- mlrun/model_monitoring/writer.py +13 -5
- mlrun/package/packagers/default_packager.py +2 -2
- mlrun/projects/operations.py +16 -8
- mlrun/projects/pipelines.py +126 -115
- mlrun/projects/project.py +286 -129
- mlrun/render.py +3 -3
- mlrun/run.py +38 -19
- mlrun/runtimes/__init__.py +19 -8
- mlrun/runtimes/base.py +4 -1
- mlrun/runtimes/daskjob.py +1 -1
- mlrun/runtimes/funcdoc.py +1 -1
- mlrun/runtimes/kubejob.py +6 -6
- mlrun/runtimes/local.py +12 -5
- mlrun/runtimes/nuclio/api_gateway.py +68 -8
- mlrun/runtimes/nuclio/application/application.py +307 -70
- mlrun/runtimes/nuclio/function.py +63 -14
- mlrun/runtimes/nuclio/serving.py +10 -10
- mlrun/runtimes/pod.py +25 -19
- mlrun/runtimes/remotesparkjob.py +2 -5
- mlrun/runtimes/sparkjob/spark3job.py +16 -17
- mlrun/runtimes/utils.py +34 -0
- mlrun/serving/routers.py +2 -5
- mlrun/serving/server.py +37 -19
- mlrun/serving/states.py +30 -3
- mlrun/serving/v2_serving.py +44 -35
- mlrun/track/trackers/mlflow_tracker.py +5 -0
- mlrun/utils/async_http.py +1 -1
- mlrun/utils/db.py +18 -0
- mlrun/utils/helpers.py +150 -36
- mlrun/utils/http.py +1 -1
- mlrun/utils/notifications/notification/__init__.py +0 -1
- mlrun/utils/notifications/notification/webhook.py +8 -1
- mlrun/utils/notifications/notification_pusher.py +1 -1
- mlrun/utils/v3io_clients.py +2 -2
- mlrun/utils/version/version.json +2 -2
- {mlrun-1.7.0rc28.dist-info → mlrun-1.7.0rc55.dist-info}/METADATA +153 -66
- {mlrun-1.7.0rc28.dist-info → mlrun-1.7.0rc55.dist-info}/RECORD +131 -134
- {mlrun-1.7.0rc28.dist-info → mlrun-1.7.0rc55.dist-info}/WHEEL +1 -1
- mlrun/feature_store/retrieval/conversion.py +0 -271
- mlrun/model_monitoring/controller_handler.py +0 -37
- mlrun/model_monitoring/evidently_application.py +0 -20
- mlrun/model_monitoring/prometheus.py +0 -216
- {mlrun-1.7.0rc28.dist-info → mlrun-1.7.0rc55.dist-info}/LICENSE +0 -0
- {mlrun-1.7.0rc28.dist-info → mlrun-1.7.0rc55.dist-info}/entry_points.txt +0 -0
- {mlrun-1.7.0rc28.dist-info → mlrun-1.7.0rc55.dist-info}/top_level.txt +0 -0
mlrun/datastore/azure_blob.py
CHANGED
@@ -16,12 +16,13 @@ import time
 from pathlib import Path
 from urllib.parse import urlparse
 
+from azure.storage.blob import BlobServiceClient
 from azure.storage.blob._shared.base_client import parse_connection_str
 from fsspec.registry import get_filesystem_class
 
 import mlrun.errors
 
-from .base import DataStore, FileStats, makeDatastoreSchemaSanitizer
+from .base import DataStore, FileStats, make_datastore_schema_sanitizer
 
 # Azure blobs will be represented with the following URL: az://<container name>. The storage account is already
 # pointed to by the connection string, so the user is not expected to specify it in any way.
@@ -29,47 +30,131 @@ from .base import DataStore, FileStats, makeDatastoreSchemaSanitizer
 
 class AzureBlobStore(DataStore):
     using_bucket = True
+    max_concurrency = 100
+    max_blocksize = 1024 * 1024 * 4
+    max_single_put_size = (
+        1024 * 1024 * 8
+    )  # for service_client property only, does not affect filesystem
 
     def __init__(self, parent, schema, name, endpoint="", secrets: dict = None):
         super().__init__(parent, name, schema, endpoint, secrets=secrets)
+        self._service_client = None
+        self._storage_options = None
+
+    def get_storage_options(self):
+        return self.storage_options
+
+    @property
+    def storage_options(self):
+        if not self._storage_options:
+            res = dict(
+                account_name=self._get_secret_or_env("account_name")
+                or self._get_secret_or_env("AZURE_STORAGE_ACCOUNT_NAME"),
+                account_key=self._get_secret_or_env("account_key")
+                or self._get_secret_or_env("AZURE_STORAGE_ACCOUNT_KEY"),
+                connection_string=self._get_secret_or_env("connection_string")
+                or self._get_secret_or_env("AZURE_STORAGE_CONNECTION_STRING"),
+                tenant_id=self._get_secret_or_env("tenant_id")
+                or self._get_secret_or_env("AZURE_STORAGE_TENANT_ID"),
+                client_id=self._get_secret_or_env("client_id")
+                or self._get_secret_or_env("AZURE_STORAGE_CLIENT_ID"),
+                client_secret=self._get_secret_or_env("client_secret")
+                or self._get_secret_or_env("AZURE_STORAGE_CLIENT_SECRET"),
+                sas_token=self._get_secret_or_env("sas_token")
+                or self._get_secret_or_env("AZURE_STORAGE_SAS_TOKEN"),
+                credential=self._get_secret_or_env("credential"),
+            )
+            self._storage_options = self._sanitize_storage_options(res)
+        return self._storage_options
 
     @property
     def filesystem(self):
         """return fsspec file system object, if supported"""
-        if self._filesystem:
-            return self._filesystem
         try:
             import adlfs  # noqa
         except ImportError as exc:
             raise ImportError("Azure adlfs not installed") from exc
-        filesystem_class = …
-        …
+
+        if not self._filesystem:
+            # in order to support az and wasbs kinds
+            filesystem_class = get_filesystem_class(protocol=self.kind)
+            self._filesystem = make_datastore_schema_sanitizer(
+                filesystem_class,
+                using_bucket=self.using_bucket,
+                blocksize=self.max_blocksize,
+                **self.storage_options,
+            )
         return self._filesystem
 
-    …
+    @property
+    def service_client(self):
+        try:
+            import azure  # noqa
+        except ImportError as exc:
+            raise ImportError("Azure not installed") from exc
+
+        if not self._service_client:
+            self._do_connect()
+        return self._service_client
+
+    def _do_connect(self):
+        """
+        Creates a client for azure.
+        Raises MLRunInvalidArgumentError if none of the connection details are available.
+        Based on do_connect in AzureBlobFileSystem:
+        https://github.com/fsspec/adlfs/blob/2023.9.0/adlfs/spec.py#L422
+        """
+        from azure.identity import ClientSecretCredential
+
+        storage_options = self.storage_options
+        connection_string = storage_options.get("connection_string")
+        client_name = storage_options.get("account_name")
+        account_key = storage_options.get("account_key")
+        sas_token = storage_options.get("sas_token")
+        client_id = storage_options.get("client_id")
+        credential = storage_options.get("credential")
+
+        credential_from_client_id = None
+        if (
+            credential is None
+            and account_key is None
+            and sas_token is None
+            and client_id is not None
+        ):
+            credential_from_client_id = ClientSecretCredential(
+                tenant_id=storage_options.get("tenant_id"),
+                client_id=client_id,
+                client_secret=storage_options.get("client_secret"),
+            )
+        try:
+            if connection_string is not None:
+                self._service_client = BlobServiceClient.from_connection_string(
+                    conn_str=connection_string,
+                    max_block_size=self.max_blocksize,
+                    max_single_put_size=self.max_single_put_size,
+                )
+            elif client_name is not None:
+                account_url = f"https://{client_name}.blob.core.windows.net"
+                cred = credential_from_client_id or credential or account_key
+                if not cred and sas_token is not None:
+                    if not sas_token.startswith("?"):
+                        sas_token = f"?{sas_token}"
+                    account_url = account_url + sas_token
+                self._service_client = BlobServiceClient(
+                    account_url=account_url,
+                    credential=cred,
+                    max_block_size=self.max_blocksize,
+                    max_single_put_size=self.max_single_put_size,
+                )
+            else:
+                raise mlrun.errors.MLRunInvalidArgumentError(
+                    "Must provide either a connection_string or account_name with credentials"
+                )
+        except Exception as e:
+            raise mlrun.errors.MLRunInvalidArgumentError(
+                f"unable to connect to account for {e}"
+            )
 
     def _convert_key_to_remote_path(self, key):
         key = key.strip("/")
@@ -82,7 +167,15 @@ class AzureBlobStore(DataStore):
 
     def upload(self, key, src_path):
         remote_path = self._convert_key_to_remote_path(key)
-        …
+        container, remote_path = remote_path.split("/", 1)
+        container_client = self.service_client.get_container_client(container=container)
+        with open(file=src_path, mode="rb") as data:
+            container_client.upload_blob(
+                name=remote_path,
+                data=data,
+                overwrite=True,
+                max_concurrency=self.max_concurrency,
+            )
 
     def get(self, key, size=None, offset=0):
         remote_path = self._convert_key_to_remote_path(key)
@@ -96,12 +189,7 @@ class AzureBlobStore(DataStore):
                 "Append mode not supported for Azure blob datastore"
             )
         remote_path = self._convert_key_to_remote_path(key)
-        if isinstance(data, bytes):
-            mode = "wb"
-        elif isinstance(data, str):
-            mode = "w"
-        else:
-            raise TypeError("Data type unknown. Unable to put in Azure!")
+        data, mode = self._prepare_put_data(data, append)
        with self.filesystem.open(remote_path, mode) as f:
             f.write(data)
 
@@ -135,7 +223,7 @@ class AzureBlobStore(DataStore):
 
     def get_spark_options(self):
         res = {}
-        st = self.get_storage_options()
+        st = self.storage_options
         service = "blob"
         primary_url = None
         if st.get("connection_string"):
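Note: the new `_do_connect` resolves Azure credentials in a fixed order: an explicit `connection_string` wins, otherwise `account_name` is combined with a client-secret credential (only when no stronger secret is set), an explicit `credential` or `account_key`, or a SAS token appended to the account URL. A minimal sketch of that decision order follows; the function name and the returned labels are illustrative, not part of mlrun's API.

```python
def choose_azure_auth(options: dict) -> str:
    """Mirror the branch order of the new _do_connect (sketch, not mlrun API)."""
    if options.get("connection_string") is not None:
        return "from_connection_string"
    if options.get("account_name") is not None:
        if (
            options.get("credential") is None
            and options.get("account_key") is None
            and options.get("sas_token") is None
            and options.get("client_id") is not None
        ):
            # tenant_id/client_id/client_secret -> ClientSecretCredential
            return "account_url + ClientSecretCredential"
        cred = options.get("credential") or options.get("account_key")
        if cred:
            return "account_url + explicit credential or account key"
        if options.get("sas_token") is not None:
            # the SAS token is appended to the account URL ('?' added if missing)
            return "account_url?sas_token"
    raise ValueError("connection_string or account_name with credentials required")


print(choose_azure_auth({"connection_string": "..."}))                # from_connection_string
print(choose_azure_auth({"account_name": "a", "sas_token": "sv=x"}))  # account_url?sas_token
```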
mlrun/datastore/base.py
CHANGED
@@ -24,13 +24,12 @@ import pandas as pd
 import pyarrow
 import pytz
 import requests
-import urllib3
 from deprecated import deprecated
 
 import mlrun.config
 import mlrun.errors
 from mlrun.errors import err_to_str
-from mlrun.utils import StorePrefix, logger
+from mlrun.utils import StorePrefix, is_jupyter, logger
 
 from .store_resources import is_store_uri, parse_store_uri
 from .utils import filter_df_start_end_time, select_columns_from_df
@@ -157,6 +156,18 @@ class DataStore:
     def put(self, key, data, append=False):
         pass
 
+    def _prepare_put_data(self, data, append=False):
+        mode = "a" if append else "w"
+        if isinstance(data, bytearray):
+            data = bytes(data)
+
+        if isinstance(data, bytes):
+            return data, f"{mode}b"
+        elif isinstance(data, str):
+            return data, mode
+        else:
+            raise TypeError(f"Unable to put a value of type {type(self).__name__}")
+
     def stat(self, key):
         pass
 
@@ -215,6 +226,15 @@ class DataStore:
             raise mlrun.errors.MLRunInvalidArgumentError(
                 "When providing start_time or end_time, must provide time_column"
             )
+        if (
+            start_time
+            and end_time
+            and start_time.utcoffset() != end_time.utcoffset()
+        ):
+            raise mlrun.errors.MLRunInvalidArgumentError(
+                "start_time and end_time must have the same time zone"
+            )
+
         if start_time or end_time or additional_filters:
             partitions_time_attributes = find_partitions(url, file_system)
             set_filters(
@@ -232,13 +252,17 @@ class DataStore:
         ):
             raise ex
 
-
-        if start_time…
-            start_time_inner = start_time.replace(
-                …
+        start_time_inner = None
+        if start_time:
+            start_time_inner = start_time.replace(
+                tzinfo=None if start_time.tzinfo else pytz.utc
+            )
+
+        end_time_inner = None
+        if end_time:
+            end_time_inner = end_time.replace(
+                tzinfo=None if end_time.tzinfo else pytz.utc
+            )
 
         set_filters(
             partitions_time_attributes,
@@ -319,11 +343,7 @@ class DataStore:
             dfs.append(df_module.read_csv(*updated_args, **kwargs))
             return df_module.concat(dfs)
 
-        elif (
-            file_url.endswith(".parquet")
-            or file_url.endswith(".pq")
-            or format == "parquet"
-        ):
+        elif mlrun.utils.helpers.is_parquet_file(file_url, format):
             if columns:
                 kwargs["columns"] = columns
 
@@ -386,7 +406,10 @@ class DataStore:
         }
 
     def rm(self, path, recursive=False, maxdepth=None):
-        self.filesystem.rm(path=path, recursive=recursive, maxdepth=maxdepth)
+        try:
+            self.filesystem.rm(path=path, recursive=recursive, maxdepth=maxdepth)
+        except FileNotFoundError:
+            pass
 
     @staticmethod
     def _is_dd(df_module):
@@ -596,14 +619,14 @@ class DataItem:
         )
         return df
 
-    def show(self, format=None):
+    def show(self, format: Optional[str] = None) -> None:
         """show the data object content in Jupyter
 
         :param format: format to use (when there is no/wrong suffix), e.g. 'png'
         """
-        if not …
+        if not is_jupyter:
             logger.warning(
-                "Jupyter …
+                "Jupyter was not detected. `.show()` displays only inside Jupyter."
             )
             return
@@ -721,8 +744,6 @@ class HttpStore(DataStore):
 
         verify_ssl = mlconf.httpdb.http.verify
         try:
-            if not verify_ssl:
-                urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
             response = requests.get(url, headers=headers, auth=auth, verify=verify_ssl)
         except OSError as exc:
             raise OSError(f"error: cannot connect to {url}: {err_to_str(exc)}")
@@ -736,7 +757,7 @@
 # As an example, it converts an S3 URL 's3://s3bucket/path' to just 's3bucket/path'.
 # Since 'ds' schemas are not inherently processed by fsspec, we have adapted the _strip_protocol()
 # method specifically to strip away the 'ds' schema as required.
-def makeDatastoreSchemaSanitizer(cls, using_bucket=False, *args, **kwargs):
+def make_datastore_schema_sanitizer(cls, using_bucket=False, *args, **kwargs):
     if not issubclass(cls, fsspec.AbstractFileSystem):
         raise ValueError("Class must be a subclass of fsspec.AbstractFileSystem")
 
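Note: the new shared `_prepare_put_data` helper centralizes the type/mode handling that each store's `put` previously reimplemented. Below is a standalone restatement of its contract (bytearray is normalized to bytes, binary payloads get a `b` mode suffix). One divergence is deliberate: the released helper formats its TypeError with `type(self).__name__`, i.e. the store class name; the sketch reports the data type instead.

```python
def prepare_put_data(data, append=False):
    """Standalone copy of the DataStore._prepare_put_data contract."""
    mode = "a" if append else "w"
    if isinstance(data, bytearray):
        data = bytes(data)  # bytearray payloads are normalized to bytes
    if isinstance(data, bytes):
        return data, f"{mode}b"  # binary payloads open the file in binary mode
    if isinstance(data, str):
        return data, mode
    raise TypeError(f"Unable to put a value of type {type(data).__name__}")


assert prepare_put_data(b"x") == (b"x", "wb")
assert prepare_put_data(bytearray(b"x"), append=True) == (b"x", "ab")
assert prepare_put_data("x") == ("x", "w")
```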
mlrun/datastore/datastore.py
CHANGED
@@ -32,6 +32,8 @@ in_memory_store = InMemoryStore()
 
 
 def parse_url(url):
+    if url and url.startswith("v3io://") and not url.startswith("v3io:///"):
+        url = url.replace("v3io://", "v3io:///", 1)
     parsed_url = urlparse(url)
     schema = parsed_url.scheme.lower()
     endpoint = parsed_url.hostname
@@ -94,7 +96,7 @@ def schema_to_store(schema):
         from .dbfs_store import DBFSStore
 
         return DBFSStore
-    elif schema == "hdfs":
+    elif schema in ["hdfs", "webhdfs"]:
         from .hdfs import HdfsStore
 
         return HdfsStore
@@ -207,7 +209,7 @@ class StoreManager:
     ) -> (DataStore, str, str):
         schema, endpoint, parsed_url = parse_url(url)
         subpath = parsed_url.path
-        store_key = f"{schema}://{endpoint}"
+        store_key = f"{schema}://{endpoint}" if endpoint else f"{schema}://"
 
         if schema == "ds":
             datastore_profile = datastore_profile_read(url, project_name, secrets)
mlrun/datastore/datastore_profile.py
CHANGED
@@ -412,7 +412,7 @@ class DatastoreProfileHdfs(DatastoreProfile):
         return res or None
 
     def url(self, subpath):
-        return f"…
+        return f"webhdfs://{self.host}:{self.http_port}{subpath}"
 
 
 class DatastoreProfile2Json(pydantic.BaseModel):
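Note: the `parse_url` guard keeps `urlparse` from swallowing the first path segment of a `v3io://` URL as a hostname; rewriting to the empty-authority `v3io:///` form preserves the full path, and the related `store_key` change tolerates the empty hostname that results. A quick illustration using only the standard library:

```python
from urllib.parse import urlparse


def normalize_v3io(url: str) -> str:
    # mirror of the new parse_url guard: force the empty-authority form
    if url and url.startswith("v3io://") and not url.startswith("v3io:///"):
        url = url.replace("v3io://", "v3io:///", 1)
    return url


before = urlparse("v3io://users/admin/data.parquet")
after = urlparse(normalize_v3io("v3io://users/admin/data.parquet"))
print(before.hostname, before.path)  # users /admin/data.parquet
print(after.hostname, after.path)    # None /users/admin/data.parquet
```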
mlrun/datastore/dbfs_store.py
CHANGED
@@ -19,7 +19,7 @@ from fsspec.registry import get_filesystem_class
 
 import mlrun.errors
 
-from .base import DataStore, FileStats, makeDatastoreSchemaSanitizer
+from .base import DataStore, FileStats, make_datastore_schema_sanitizer
 
 
 class DatabricksFileBugFixed(DatabricksFile):
@@ -89,7 +89,7 @@ class DBFSStore(DataStore):
         """return fsspec file system object, if supported"""
         filesystem_class = get_filesystem_class(protocol=self.kind)
         if not self._filesystem:
-            self._filesystem = makeDatastoreSchemaSanitizer(
+            self._filesystem = make_datastore_schema_sanitizer(
                 cls=filesystem_class,
                 using_bucket=False,
                 **self.get_storage_options(),
@@ -130,11 +130,7 @@ class DBFSStore(DataStore):
                 "Append mode not supported for Databricks file system"
             )
         # can not use append mode because it overrides data.
-        mode = "w"
-        if isinstance(data, bytes):
-            mode += "b"
-        elif not isinstance(data, str):
-            raise TypeError(f"Unknown data type {type(data)}")
+        data, mode = self._prepare_put_data(data, append)
         with self.filesystem.open(key, mode) as f:
             f.write(data)
 
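Note: several stores (DBFS, Azure, GCS) now build their filesystem through the renamed `make_datastore_schema_sanitizer`. Per the comment retained in base.py, its job is to wrap an fsspec filesystem class so that custom `ds://...` URLs are stripped to the plain form fsspec expects. The sketch below conveys only that intent; the subclassing mechanism, the `SanitizedFS` name, and the bucket handling are assumptions, not mlrun's actual implementation.

```python
import fsspec
from fsspec.implementations.local import LocalFileSystem


def make_schema_sanitizer_sketch(cls, using_bucket=False, **kwargs):
    """Illustrative stand-in: strip a custom 'ds://' scheme before fsspec sees the path."""
    if not issubclass(cls, fsspec.AbstractFileSystem):
        raise ValueError("Class must be a subclass of fsspec.AbstractFileSystem")

    class SanitizedFS(cls):
        @classmethod
        def _strip_protocol(cls, path):
            if isinstance(path, str) and path.startswith("ds://"):
                # drop the custom scheme before normal fsspec processing
                path = path[len("ds://"):]
            return super()._strip_protocol(path)

    return SanitizedFS(**kwargs)


fs = make_schema_sanitizer_sketch(LocalFileSystem)
print(fs._strip_protocol("ds:///tmp/data.csv"))  # /tmp/data.csv
```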
mlrun/datastore/filestore.py
CHANGED
@@ -66,9 +66,7 @@ class FileStore(DataStore):
         dir_to_create = path.dirname(self._join(key))
         if dir_to_create:
             self._ensure_directory(dir_to_create)
-        mode = "a" if append else "w"
-        if isinstance(data, bytes):
-            mode = mode + "b"
+        data, mode = self._prepare_put_data(data, append)
         with open(self._join(key), mode) as fp:
             fp.write(data)
             fp.close()
mlrun/datastore/google_cloud_storage.py
CHANGED
@@ -12,44 +12,82 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import json
+import os
 from pathlib import Path
 
 from fsspec.registry import get_filesystem_class
+from google.auth.credentials import Credentials
+from google.cloud.storage import Client, transfer_manager
+from google.oauth2 import service_account
 
 import mlrun.errors
 from mlrun.utils import logger
 
-from .base import DataStore, FileStats, makeDatastoreSchemaSanitizer
+from .base import DataStore, FileStats, make_datastore_schema_sanitizer
 
 # Google storage objects will be represented with the following URL: gcs://<bucket name>/<path> or gs://...
 
 
 class GoogleCloudStorageStore(DataStore):
     using_bucket = True
+    workers = 8
+    chunk_size = 32 * 1024 * 1024
 
     def __init__(self, parent, schema, name, endpoint="", secrets: dict = None):
         super().__init__(parent, name, schema, endpoint, secrets=secrets)
+        self._storage_client = None
+        self._storage_options = None
+
+    @property
+    def storage_client(self):
+        if self._storage_client:
+            return self._storage_client
+
+        token = self._get_credentials().get("token")
+        access = "https://www.googleapis.com/auth/devstorage.full_control"
+        if isinstance(token, str):
+            if os.path.exists(token):
+                credentials = service_account.Credentials.from_service_account_file(
+                    token, scopes=[access]
+                )
+            else:
+                raise mlrun.errors.MLRunInvalidArgumentError(
+                    "gcsfs authentication file not found!"
+                )
+        elif isinstance(token, dict):
+            credentials = service_account.Credentials.from_service_account_info(
+                token, scopes=[access]
+            )
+        elif isinstance(token, Credentials):
+            credentials = token
+        else:
+            raise ValueError(f"Unsupported token type: {type(token)}")
+        self._storage_client = Client(credentials=credentials)
+        return self._storage_client
 
     @property
     def filesystem(self):
         """return fsspec file system object, if supported"""
-        if self._filesystem:
-            …
-            ) from exc
-        filesystem_class = get_filesystem_class(protocol=self.kind)
-        self._filesystem = makeDatastoreSchemaSanitizer(
-            filesystem_class,
-            using_bucket=self.using_bucket,
-            **self.get_storage_options(),
-        )
+        if not self._filesystem:
+            filesystem_class = get_filesystem_class(protocol=self.kind)
+            self._filesystem = make_datastore_schema_sanitizer(
+                filesystem_class,
+                using_bucket=self.using_bucket,
+                **self.storage_options,
+            )
         return self._filesystem
 
-    def get_storage_options(self):
+    @property
+    def storage_options(self):
+        if self._storage_options:
+            return self._storage_options
+        credentials = self._get_credentials()
+        # due to caching problem introduced in gcsfs 2024.3.1 (ML-7636)
+        credentials["use_listings_cache"] = False
+        self._storage_options = credentials
+        return self._storage_options
+
+    def _get_credentials(self):
         credentials = self._get_secret_or_env(
             "GCP_CREDENTIALS"
         ) or self._get_secret_or_env("GOOGLE_APPLICATION_CREDENTIALS")
@@ -71,6 +109,9 @@ class GoogleCloudStorageStore(DataStore):
             )
         return self._sanitize_storage_options(None)
 
+    def get_storage_options(self):
+        return self.storage_options
+
     def _make_path(self, key):
         key = key.strip("/")
         path = Path(self.endpoint, key).as_posix()
@@ -90,21 +131,34 @@ class GoogleCloudStorageStore(DataStore):
             raise mlrun.errors.MLRunInvalidArgumentError(
                 "Append mode not supported for Google cloud storage datastore"
             )
-        …
-        if isinstance(data, bytes):
-            mode = "wb"
-        elif isinstance(data, str):
-            mode = "w"
-        else:
-            raise TypeError(
-                "Data type unknown. Unable to put in Google cloud storage!"
-            )
+        data, mode = self._prepare_put_data(data, append)
         with self.filesystem.open(path, mode) as f:
             f.write(data)
 
     def upload(self, key, src_path):
-        …
-        self.…
+        file_size = os.path.getsize(src_path)
+        united_path = self._make_path(key)
+
+        # Multiple upload limitation recommendations as described in
+        # https://cloud.google.com/storage/docs/multipart-uploads#storage-upload-object-chunks-python
+
+        if file_size <= self.chunk_size:
+            self.filesystem.put_file(src_path, united_path, overwrite=True)
+            return
+
+        bucket = self.storage_client.bucket(self.endpoint)
+        blob = bucket.blob(key.strip("/"))
+
+        try:
+            transfer_manager.upload_chunks_concurrently(
+                src_path, blob, chunk_size=self.chunk_size, max_workers=self.workers
+            )
+        except Exception as upload_chunks_concurrently_exception:
+            logger.warning(
+                f"gcs: failed to concurrently upload {src_path},"
+                f" exception: {upload_chunks_concurrently_exception}. Retrying with single part upload."
+            )
+            self.filesystem.put_file(src_path, united_path, overwrite=True)
 
     def stat(self, key):
         path = self._make_path(key)
@@ -133,11 +187,13 @@ class GoogleCloudStorageStore(DataStore):
 
     def rm(self, path, recursive=False, maxdepth=None):
         path = self._make_path(path)
-        …
+        # in order to raise an error in case of a connection error (ML-7056)
+        self.filesystem.exists(path)
+        super().rm(path, recursive=recursive, maxdepth=maxdepth)
 
     def get_spark_options(self):
         res = {}
-        st = self.get_storage_options()
+        st = self._get_credentials()
         if "token" in st:
             res = {"spark.hadoop.google.cloud.auth.service.account.enable": "true"}
             if isinstance(st["token"], str):
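Note: the rewritten GCS `upload` switches strategy on file size: at or below `chunk_size` (32 MiB) it uses a plain fsspec `put_file`, above it it drives the google-cloud-storage `transfer_manager` multipart API, and any multipart failure falls back to a single-part upload. A no-network sketch of that threshold-and-fallback logic; the uploader callables are stand-ins, not real SDK calls:

```python
CHUNK_SIZE = 32 * 1024 * 1024  # mirrors GoogleCloudStorageStore.chunk_size
WORKERS = 8                    # mirrors GoogleCloudStorageStore.workers


def upload_with_fallback(file_size, single_part, multipart):
    """Choose multipart only for large files; fall back on any multipart error."""
    if file_size <= CHUNK_SIZE:
        single_part()
        return "single-part"
    try:
        multipart(chunk_size=CHUNK_SIZE, max_workers=WORKERS)
        return "multipart"
    except Exception:
        single_part()  # same behavior as the diff: retry once, single part
        return "multipart failed, retried single-part"


# a 33 MiB file takes the multipart path:
print(upload_with_fallback(33 * 1024 * 1024,
                           single_part=lambda: None,
                           multipart=lambda **kw: None))
```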
mlrun/datastore/inmem.py
CHANGED
@@ -72,7 +72,7 @@ class InMemoryStore(DataStore):
             if columns:
                 kwargs["usecols"] = columns
             reader = df_module.read_csv
-        elif url.endswith(".parquet") or url.endswith(".pq") or format == "parquet":
+        elif mlrun.utils.helpers.is_parquet_file(url, format):
             if columns:
                 kwargs["columns"] = columns
             reader = df_module.read_parquet
@@ -85,3 +85,6 @@ class InMemoryStore(DataStore):
             kwargs.pop(field, None)
 
         return reader(item, **kwargs)
+
+    def rm(self, path, recursive=False, maxdepth=None):
+        self._items.pop(path, None)
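Note: the new `InMemoryStore.rm` removes the object from the store's internal dict, and `dict.pop(path, None)` makes the delete idempotent. A sketch of the semantics:

```python
# _items stands in for the store's in-memory dict
items = {"results/df.parquet": b"payload"}
items.pop("results/df.parquet", None)  # removes the object
items.pop("results/df.parquet", None)  # second delete is a no-op, no KeyError
assert "results/df.parquet" not in items
```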
mlrun/datastore/redis.py
CHANGED