mlrun 1.7.0rc3__py3-none-any.whl → 1.7.0rc5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release. This version of mlrun might be problematic.
- mlrun/artifacts/manager.py +6 -1
- mlrun/common/constants.py +2 -0
- mlrun/common/model_monitoring/helpers.py +12 -6
- mlrun/common/schemas/__init__.py +11 -0
- mlrun/common/schemas/api_gateway.py +85 -0
- mlrun/common/schemas/auth.py +2 -2
- mlrun/common/schemas/client_spec.py +1 -0
- mlrun/common/schemas/common.py +40 -0
- mlrun/common/schemas/model_monitoring/constants.py +4 -1
- mlrun/common/schemas/project.py +2 -0
- mlrun/config.py +31 -17
- mlrun/datastore/azure_blob.py +22 -9
- mlrun/datastore/base.py +15 -25
- mlrun/datastore/datastore.py +19 -8
- mlrun/datastore/datastore_profile.py +47 -5
- mlrun/datastore/google_cloud_storage.py +10 -6
- mlrun/datastore/hdfs.py +51 -0
- mlrun/datastore/redis.py +4 -0
- mlrun/datastore/s3.py +4 -0
- mlrun/datastore/sources.py +29 -43
- mlrun/datastore/targets.py +59 -53
- mlrun/datastore/utils.py +2 -49
- mlrun/datastore/v3io.py +4 -0
- mlrun/db/base.py +50 -0
- mlrun/db/httpdb.py +121 -50
- mlrun/db/nopdb.py +13 -0
- mlrun/execution.py +3 -3
- mlrun/feature_store/feature_vector.py +2 -2
- mlrun/frameworks/tf_keras/callbacks/logging_callback.py +3 -3
- mlrun/frameworks/tf_keras/model_handler.py +7 -7
- mlrun/k8s_utils.py +10 -5
- mlrun/kfpops.py +19 -10
- mlrun/model.py +5 -0
- mlrun/model_monitoring/api.py +3 -3
- mlrun/model_monitoring/application.py +1 -1
- mlrun/model_monitoring/applications/__init__.py +13 -0
- mlrun/model_monitoring/applications/histogram_data_drift.py +218 -0
- mlrun/model_monitoring/batch.py +9 -111
- mlrun/model_monitoring/controller.py +73 -55
- mlrun/model_monitoring/controller_handler.py +13 -5
- mlrun/model_monitoring/features_drift_table.py +62 -53
- mlrun/model_monitoring/helpers.py +30 -21
- mlrun/model_monitoring/metrics/__init__.py +13 -0
- mlrun/model_monitoring/metrics/histogram_distance.py +127 -0
- mlrun/model_monitoring/stores/kv_model_endpoint_store.py +14 -14
- mlrun/model_monitoring/stores/sql_model_endpoint_store.py +0 -1
- mlrun/package/packagers/pandas_packagers.py +3 -3
- mlrun/package/utils/_archiver.py +3 -1
- mlrun/platforms/iguazio.py +8 -65
- mlrun/projects/pipelines.py +21 -11
- mlrun/projects/project.py +180 -42
- mlrun/run.py +1 -1
- mlrun/runtimes/base.py +25 -2
- mlrun/runtimes/kubejob.py +5 -3
- mlrun/runtimes/local.py +2 -2
- mlrun/runtimes/mpijob/abstract.py +6 -6
- mlrun/runtimes/nuclio/__init__.py +1 -0
- mlrun/runtimes/nuclio/api_gateway.py +300 -0
- mlrun/runtimes/nuclio/function.py +9 -9
- mlrun/runtimes/nuclio/serving.py +3 -3
- mlrun/runtimes/pod.py +3 -3
- mlrun/runtimes/sparkjob/spark3job.py +3 -3
- mlrun/serving/remote.py +4 -2
- mlrun/serving/server.py +2 -8
- mlrun/utils/async_http.py +3 -3
- mlrun/utils/helpers.py +27 -5
- mlrun/utils/http.py +3 -3
- mlrun/utils/logger.py +2 -2
- mlrun/utils/notifications/notification_pusher.py +6 -6
- mlrun/utils/version/version.json +2 -2
- {mlrun-1.7.0rc3.dist-info → mlrun-1.7.0rc5.dist-info}/METADATA +13 -16
- {mlrun-1.7.0rc3.dist-info → mlrun-1.7.0rc5.dist-info}/RECORD +76 -68
- {mlrun-1.7.0rc3.dist-info → mlrun-1.7.0rc5.dist-info}/WHEEL +1 -1
- {mlrun-1.7.0rc3.dist-info → mlrun-1.7.0rc5.dist-info}/LICENSE +0 -0
- {mlrun-1.7.0rc3.dist-info → mlrun-1.7.0rc5.dist-info}/entry_points.txt +0 -0
- {mlrun-1.7.0rc3.dist-info → mlrun-1.7.0rc5.dist-info}/top_level.txt +0 -0
mlrun/datastore/datastore_profile.py
CHANGED
@@ -132,6 +132,22 @@ class DatastoreProfileKafkaSource(DatastoreProfile):
         return attributes


+class DatastoreProfileV3io(DatastoreProfile):
+    type: str = pydantic.Field("v3io")
+    v3io_access_key: typing.Optional[str] = None
+    _private_attributes = "v3io_access_key"
+
+    def url(self, subpath):
+        subpath = subpath.lstrip("/")
+        return f"v3io:///{subpath}"
+
+    def secrets(self) -> dict:
+        res = {}
+        if self.v3io_access_key:
+            res["V3IO_ACCESS_KEY"] = self.v3io_access_key
+        return res
+
+
 class DatastoreProfileS3(DatastoreProfile):
     type: str = pydantic.Field("s3")
     _private_attributes = ("access_key_id", "secret_key")
@@ -156,7 +172,7 @@ class DatastoreProfileS3(DatastoreProfile):
             res["AWS_PROFILE"] = self.profile_name
         if self.assume_role_arn:
            res["MLRUN_AWS_ROLE_ARN"] = self.assume_role_arn
-        return res
+        return res

     def url(self, subpath):
         return f"s3:/{subpath}"
@@ -199,7 +215,7 @@ class DatastoreProfileRedis(DatastoreProfile):
             res["REDIS_USER"] = self.username
         if self.password:
             res["REDIS_PASSWORD"] = self.password
-        return res
+        return res

     def url(self, subpath):
         return self.endpoint_url + subpath
@@ -220,7 +236,7 @@ class DatastoreProfileDBFS(DatastoreProfile):
             res["DATABRICKS_TOKEN"] = self.token
         if self.endpoint_url:
             res["DATABRICKS_HOST"] = self.endpoint_url
-        return res
+        return res


 class DatastoreProfileGCS(DatastoreProfile):
@@ -247,7 +263,7 @@ class DatastoreProfileGCS(DatastoreProfile):
             res["GOOGLE_APPLICATION_CREDENTIALS"] = self.credentials_path
         if self.gcp_credentials:
             res["GCP_CREDENTIALS"] = self.gcp_credentials
-        return res
+        return res


 class DatastoreProfileAzureBlob(DatastoreProfile):
@@ -292,7 +308,31 @@ class DatastoreProfileAzureBlob(DatastoreProfile):
             res["sas_token"] = self.sas_token
         if self.credential:
             res["credential"] = self.credential
-        return res
+        return res
+
+
+class DatastoreProfileHdfs(DatastoreProfile):
+    type: str = pydantic.Field("hdfs")
+    _private_attributes = "token"
+    host: typing.Optional[str] = None
+    port: typing.Optional[int] = None
+    http_port: typing.Optional[int] = None
+    user: typing.Optional[str] = None
+
+    def secrets(self) -> dict:
+        res = {}
+        if self.host:
+            res["HDFS_HOST"] = self.host
+        if self.port:
+            res["HDFS_PORT"] = self.port
+        if self.port:
+            res["HDFS_HTTP_PORT"] = self.http_port
+        if self.user:
+            res["HDFS_USER"] = self.user
+        return res or None
+
+    def url(self, subpath):
+        return f"hdfs://{self.host}:{self.http_port}{subpath}"


 class DatastoreProfile2Json(pydantic.BaseModel):
@@ -346,6 +386,7 @@ class DatastoreProfile2Json(pydantic.BaseModel):
         decoded_dict = {k: safe_literal_eval(v) for k, v in decoded_dict.items()}
         datastore_type = decoded_dict.get("type")
         ds_profile_factory = {
+            "v3io": DatastoreProfileV3io,
             "s3": DatastoreProfileS3,
             "redis": DatastoreProfileRedis,
             "basic": DatastoreProfileBasic,
@@ -354,6 +395,7 @@ class DatastoreProfile2Json(pydantic.BaseModel):
             "dbfs": DatastoreProfileDBFS,
             "gcs": DatastoreProfileGCS,
             "az": DatastoreProfileAzureBlob,
+            "hdfs": DatastoreProfileHdfs,
         }
         if datastore_type in ds_profile_factory:
             return ds_profile_factory[datastore_type].parse_obj(decoded_dict)
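Note: the new DatastoreProfileV3io and DatastoreProfileHdfs classes plug into the existing ds:// datastore-profile mechanism. A minimal usage sketch, assuming the profile-registration helper available in recent mlrun releases; the profile names, endpoints, ports and access key below are placeholders, not values taken from this diff:

```python
import mlrun
from mlrun.datastore.datastore_profile import (
    DatastoreProfileHdfs,
    DatastoreProfileV3io,
    register_temporary_client_datastore_profile,
)

# Placeholder credentials and endpoints.
v3io_profile = DatastoreProfileV3io(name="my-v3io", v3io_access_key="<access-key>")
hdfs_profile = DatastoreProfileHdfs(
    name="my-hdfs", host="namenode.example.com", port=8020, http_port=9870, user="hdfs"
)

register_temporary_client_datastore_profile(v3io_profile)
register_temporary_client_datastore_profile(hdfs_profile)

# Profiles are then addressed through ds://<profile-name>/<path> URLs.
df = mlrun.get_dataitem("ds://my-hdfs/data/sample.csv").as_df()
```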
mlrun/datastore/google_cloud_storage.py
CHANGED
@@ -147,13 +147,13 @@ class GoogleCloudStorageStore(DataStore):
         if "project_id" in credentials:
             res["spark.hadoop.fs.gs.project.id"] = credentials["project_id"]
         if "private_key_id" in credentials:
-            res[
-                "spark.hadoop.fs.gs.auth.service.account.private.key.id"
-            ] = credentials["private_key_id"]
+            res["spark.hadoop.fs.gs.auth.service.account.private.key.id"] = (
+                credentials["private_key_id"]
+            )
         if "private_key" in credentials:
-            res[
-                "spark.hadoop.fs.gs.auth.service.account.private.key"
-            ] = credentials["private_key"]
+            res["spark.hadoop.fs.gs.auth.service.account.private.key"] = (
+                credentials["private_key"]
+            )
         if "client_email" in credentials:
             res["spark.hadoop.fs.gs.auth.service.account.email"] = credentials[
                 "client_email"
@@ -161,3 +161,7 @@
         if "client_id" in credentials:
             res["spark.hadoop.fs.gs.client.id"] = credentials["client_id"]
         return res
+
+    @property
+    def spark_url(self):
+        return f"gs://{self.endpoint}"
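Note: spark_url is a new property that this release adds across the datastores (S3Store, V3ioStore and the new HdfsStore follow below); Spark paths are composed as store.spark_url plus the path inside the store. A rough summary with placeholder endpoints:

```python
# Placeholder endpoints; each value mirrors the property added in this release.
spark_url_examples = {
    "GoogleCloudStorageStore": "gs://my-bucket",      # f"gs://{self.endpoint}"
    "S3Store": "s3a://my-bucket",                     # f"s3a://{self.endpoint}"
    "V3ioStore": "v3io:/",                            # constant prefix
    "HdfsStore": "hdfs://namenode.example.com:8020",  # f"hdfs://{self.host}:{self.port}"
}
print(spark_url_examples["S3Store"] + "/data.parquet")  # -> s3a://my-bucket/data.parquet
```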
mlrun/datastore/hdfs.py
ADDED
@@ -0,0 +1,51 @@
+# Copyright 2024 Iguazio
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import os
+
+import fsspec
+
+from mlrun.datastore.base import DataStore
+
+
+class HdfsStore(DataStore):
+    def __init__(self, parent, schema, name, endpoint="", secrets: dict = None):
+        super().__init__(parent, name, schema, endpoint, secrets)
+
+        self.host = self._get_secret_or_env("HDFS_HOST")
+        self.port = self._get_secret_or_env("HDFS_PORT")
+        self.http_port = self._get_secret_or_env("HDFS_HTTP_PORT")
+        self.user = self._get_secret_or_env("HDFS_USER")
+        if not self.user:
+            self.user = os.environ.get("HADOOP_USER_NAME", os.environ.get("USER"))
+
+        self._filesystem = None
+
+    @property
+    def filesystem(self):
+        if not self._filesystem:
+            self._filesystem = fsspec.filesystem(
+                "webhdfs",
+                host=self.host,
+                port=self.http_port,
+                user=self.user,
+            )
+        return self._filesystem
+
+    @property
+    def url(self):
+        return f"webhdfs://{self.host}:{self.http_port}"
+
+    @property
+    def spark_url(self):
+        return f"hdfs://{self.host}:{self.port}"
mlrun/datastore/redis.py
CHANGED
mlrun/datastore/s3.py
CHANGED
@@ -156,6 +156,10 @@ class S3Store(DataStore):

         return self._sanitize_storage_options(storage_options)

+    @property
+    def spark_url(self):
+        return f"s3a://{self.endpoint}"
+
     def get_bucket_and_key(self, key):
         path = self._join(key)[1:]
         return self.endpoint, path
mlrun/datastore/sources.py
CHANGED
@@ -39,7 +39,6 @@ from .utils import (
     _generate_sql_query_with_time_filter,
     filter_df_start_end_time,
     select_columns_from_df,
-    store_path_to_spark,
 )


@@ -193,14 +192,10 @@ class CSVSource(BaseSourceDriver):
             parse_dates.append(time_field)

         data_item = mlrun.store_manager.object(self.path)
-
-            store, path = mlrun.store_manager.get_or_create_store(self.path)
-            path = store.url + path
-        else:
-            path = data_item.url
+        store, path, url = mlrun.store_manager.get_or_create_store(self.path)

         return storey.CSVSource(
-            paths=path,
+            paths=url,  # unlike self.path, it already has store:// replaced
             build_dict=True,
             key_field=self.key_field or key_field,
             storage_options=data_item.store.get_storage_options(),
@@ -209,25 +204,17 @@
         )

     def get_spark_options(self):
-
-
-
-
-
-                "path": store_path_to_spark(path, storage_spark_options),
-                "format": "csv",
-                "header": "true",
-                "inferSchema": "true",
-            }
-
-            return {**result, **storage_spark_options}
-        else:
-            return {
-                "path": store_path_to_spark(self.path),
+        store, path, url = mlrun.store_manager.get_or_create_store(self.path)
+        spark_options = store.get_spark_options()
+        spark_options.update(
+            {
+                "path": url,
                 "format": "csv",
                 "header": "true",
                 "inferSchema": "true",
            }
+        )
+        return spark_options

     def to_spark_df(self, session, named_view=False, time_field=None, columns=None):
         import pyspark.sql.functions as funcs
@@ -357,14 +344,10 @@ class ParquetSource(BaseSourceDriver):
             attributes["context"] = context

         data_item = mlrun.store_manager.object(self.path)
-
-            store, path = mlrun.store_manager.get_or_create_store(self.path)
-            path = store.url + path
-        else:
-            path = data_item.url
+        store, path, url = mlrun.store_manager.get_or_create_store(self.path)

         return storey.ParquetSource(
-            paths=path,
+            paths=url,  # unlike self.path, it already has store:// replaced
             key_field=self.key_field or key_field,
             storage_options=data_item.store.get_storage_options(),
             end_filter=self.end_time,
@@ -374,20 +357,15 @@
         )

     def get_spark_options(self):
-
-
-
-
-
-                "path": store_path_to_spark(path, storage_spark_options),
-                "format": "parquet",
-            }
-            return {**result, **storage_spark_options}
-        else:
-            return {
-                "path": store_path_to_spark(self.path),
+        store, path, url = mlrun.store_manager.get_or_create_store(self.path)
+        spark_options = store.get_spark_options()
+        spark_options.update(
+            {
+                "path": store.spark_url + path,
                 "format": "parquet",
             }
+        )
+        return spark_options

     def to_dataframe(
         self,
@@ -875,8 +853,16 @@ class StreamSource(OnlineSource):
         super().__init__(name, attributes=attrs, **kwargs)

     def add_nuclio_trigger(self, function):
-
-
+        store, path, url = mlrun.store_manager.get_or_create_store(self.path)
+        if store.kind != "v3io":
+            raise mlrun.errors.MLRunInvalidArgumentError(
+                "Only profiles that reference the v3io datastore can be used with StreamSource"
+            )
+        path = "v3io:/" + path
+        storage_options = store.get_storage_options()
+        access_key = storage_options.get("v3io_access_key")
+        endpoint, stream_path = parse_path(url)
+        v3io_client = v3io.dataplane.Client(endpoint=endpoint, access_key=access_key)
         container, stream_path = split_path(stream_path)
         res = v3io_client.stream.create(
             container=container,
@@ -896,7 +882,7 @@
             kwargs["worker_allocation_mode"] = "static"

         function.add_v3io_stream_trigger(
-
+            path,
             self.name,
             self.attributes["group"],
             self.attributes["seek_to"],
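Note: the source changes above follow the new three-element return value of store_manager.get_or_create_store, whose third element is the fully resolved URL (ds:// and store:// already replaced). An illustrative sketch with a hypothetical registered profile and path:

```python
import mlrun

# Hypothetical ds:// path; assumes a "my-s3" profile has been registered.
store, subpath, url = mlrun.store_manager.get_or_create_store(
    "ds://my-s3/my-bucket/measurements.csv"
)
# store   -> the resolved datastore object (exposes get_storage_options / get_spark_options)
# subpath -> the path inside the store, e.g. "/measurements.csv"
# url     -> the concrete URL with the profile resolved, e.g. "s3://my-bucket/measurements.csv"
```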
mlrun/datastore/targets.py
CHANGED
@@ -29,7 +29,7 @@ import mlrun
 import mlrun.utils.helpers
 from mlrun.config import config
 from mlrun.model import DataSource, DataTarget, DataTargetBase, TargetPathObject
-from mlrun.utils import now_date
+from mlrun.utils import logger, now_date
 from mlrun.utils.helpers import to_parquet
 from mlrun.utils.v3io_clients import get_frames_client

@@ -43,7 +43,6 @@ from .utils import (
     filter_df_start_end_time,
     parse_kafka_url,
     select_columns_from_df,
-    store_path_to_spark,
 )


@@ -448,14 +447,11 @@ class BaseStoreTarget(DataTargetBase):
             if self.credentials_prefix
             else None
         )
-        store, resolved_store_path = mlrun.store_manager.get_or_create_store(
+        store, resolved_store_path, url = mlrun.store_manager.get_or_create_store(
             self.get_target_path(),
             credentials_prefix_secrets,
         )
-
-            return store, store.url + resolved_store_path
-        else:
-            return store, self.get_target_path()
+        return store, resolved_store_path, url

     def _get_column_list(self, features, timestamp_key, key_columns, with_type=False):
         result = []
@@ -504,7 +500,7 @@
             write_spark_dataframe_with_options(options, df, "overwrite")
         elif hasattr(df, "dask"):
             dask_options = self.get_dask_options()
-            store, target_path = self._get_store_and_path()
+            store, path_in_store, target_path = self._get_store_and_path()
             storage_options = store.get_storage_options()
             df = df.repartition(partition_size="100MB")
             try:
@@ -525,7 +521,7 @@
             except Exception as exc:
                 raise RuntimeError("Failed to write Dask Dataframe") from exc
         else:
-            store, target_path = self._get_store_and_path()
+            store, path_in_store, target_path = self._get_store_and_path()
             target_path = generate_path_with_chunk(self, chunk_id, target_path)
             file_system = store.filesystem
             if file_system.protocol == "file":
@@ -692,7 +688,7 @@
         raise NotImplementedError()

     def purge(self):
-        store, target_path = self._get_store_and_path()
+        store, path_in_store, target_path = self._get_store_and_path()
         store.rm(target_path, recursive=True)

     def as_df(
@@ -872,7 +868,7 @@ class ParquetTarget(BaseStoreTarget):
             for key_column in key_columns:
                 tuple_key_columns.append((key_column.name, key_column.value_type))

-        store, target_path = self._get_store_and_path()
+        store, path_in_store, target_path = self._get_store_and_path()

         storage_options = store.get_storage_options()
         if storage_options and self.storage_options:
@@ -925,27 +921,19 @@
             if unit == time_partitioning_granularity:
                 break

-
-
-
-
-
-            path = store.url + path
-            result = {
-                "path": store_path_to_spark(path, storage_spark_options),
-                "format": "parquet",
-            }
-            result = {**result, **storage_spark_options}
-        else:
-            result = {
-                "path": store_path_to_spark(self.get_target_path()),
+        store, path, url = self._get_store_and_path()
+        spark_options = store.get_spark_options()
+        spark_options.update(
+            {
+                "path": store.spark_url + path,
                 "format": "parquet",
             }
+        )
         for partition_col in self.partition_cols or []:
             partition_cols.append(partition_col)
         if partition_cols:
-
-        return result
+            spark_options["partitionBy"] = partition_cols
+        return spark_options

     def get_dask_options(self):
         return {"format": "parquet"}
@@ -1052,7 +1040,7 @@ class CSVTarget(BaseStoreTarget):
         column_list = self._get_column_list(
             features=features, timestamp_key=timestamp_key, key_columns=key_columns
         )
-        store, target_path = self._get_store_and_path()
+        store, path_in_store, target_path = self._get_store_and_path()
         graph.add_step(
             name=self.name or "CSVTarget",
             after=after,
@@ -1067,24 +1055,16 @@
         )

     def get_spark_options(self, key_column=None, timestamp_key=None, overwrite=True):
-
-
-
-
-
-            path = store.url + path
-            result = {
-                "path": store_path_to_spark(path, storage_spark_options),
-                "format": "csv",
-                "header": "true",
-            }
-            return {**result, **storage_spark_options}
-        else:
-            return {
-                "path": store_path_to_spark(self.get_target_path()),
+        store, path, url = self._get_store_and_path()
+        spark_options = store.get_spark_options()
+        spark_options.update(
+            {
+                "path": store.spark_url + path,
                 "format": "csv",
                 "header": "true",
             }
+        )
+        return spark_options

     def prepare_spark_df(self, df, key_columns, timestamp_key=None, spark_options=None):
         import pyspark.sql.functions as funcs
@@ -1209,7 +1189,11 @@ class NoSqlBaseTarget(BaseStoreTarget):
         df = df.copy(deep=False)
         access_key = self._get_credential("V3IO_ACCESS_KEY")

-
+        store, path_in_store, target_path = self._get_store_and_path()
+        storage_options = store.get_storage_options()
+        access_key = storage_options.get("v3io_access_key", access_key)
+
+        _, path_with_container = parse_path(target_path)
         container, path = split_path(path_with_container)

         frames_client = get_frames_client(
@@ -1227,17 +1211,31 @@ class NoSqlTarget(NoSqlBaseTarget):
     def get_table_object(self):
         from storey import Table, V3ioDriver

-
-        endpoint, uri = parse_path(
+        store, path_in_store, target_path = self._get_store_and_path()
+        endpoint, uri = parse_path(target_path)
+        storage_options = store.get_storage_options()
+        access_key = storage_options.get("v3io_access_key")
+
         return Table(
             uri,
-            V3ioDriver(webapi=endpoint or mlrun.mlconf.v3io_api),
+            V3ioDriver(webapi=endpoint or mlrun.mlconf.v3io_api, access_key=access_key),
             flush_interval_secs=mlrun.mlconf.feature_store.flush_interval,
         )

     def get_spark_options(self, key_column=None, timestamp_key=None, overwrite=True):
+        store, path_in_store, target_path = self._get_store_and_path()
+        storage_options = store.get_storage_options()
+        store_access_key = storage_options.get("v3io_access_key")
+        env_access_key = self._secrets.get(
+            "V3IO_ACCESS_KEY", os.getenv("V3IO_ACCESS_KEY")
+        )
+        if store_access_key and env_access_key and store_access_key != env_access_key:
+            logger.warning(
+                "The Spark v3io connector does not support access_key parameterization."
+                "Spark will disregard the store-provided key."
+            )
         spark_options = {
-            "path":
+            "path": store.spark_url + path_in_store,
             "format": "io.iguaz.v3io.spark.sql.kv",
         }
         if isinstance(key_column, list) and len(key_column) >= 1:
@@ -1330,10 +1328,10 @@ class RedisNoSqlTarget(NoSqlBaseTarget):
     def get_spark_options(self, key_column=None, timestamp_key=None, overwrite=True):
         endpoint, uri = self._get_server_endpoint()
         parsed_endpoint = urlparse(endpoint)
-
+        store, path_in_store, path = self._get_store_and_path()
         return {
             "key.column": "_spark_object_name",
-            "table": "{" +
+            "table": "{" + path_in_store,
             "format": "org.apache.spark.sql.redis",
             "host": parsed_endpoint.hostname,
             "port": parsed_endpoint.port,
@@ -1381,10 +1379,12 @@ class StreamTarget(BaseStoreTarget):
         from storey import V3ioDriver

         key_columns = list(key_columns.keys())
-        path = self.
+        store, path_in_store, path = self._get_store_and_path()
         if not path:
             raise mlrun.errors.MLRunInvalidArgumentError("StreamTarget requires a path")
         endpoint, uri = parse_path(path)
+        storage_options = store.get_storage_options()
+        access_key = storage_options.get("v3io_access_key")
         column_list = self._get_column_list(
             features=features, timestamp_key=timestamp_key, key_columns=key_columns
         )
@@ -1395,7 +1395,9 @@
             graph_shape="cylinder",
             class_name="storey.StreamTarget",
             columns=column_list,
-            storage=V3ioDriver(
+            storage=V3ioDriver(
+                webapi=endpoint or mlrun.mlconf.v3io_api, access_key=access_key
+            ),
             stream_path=uri,
             **self.attributes,
         )
@@ -1531,7 +1533,11 @@ class TSDBTarget(BaseStoreTarget):
             key_column = [key_column]
         new_index.extend(key_column)

-
+        store, path_in_store, target_path = self._get_store_and_path()
+        storage_options = store.get_storage_options()
+        access_key = storage_options.get("v3io_access_key", access_key)
+
+        _, path_with_container = parse_path(target_path)
         container, path = split_path(path_with_container)

         frames_client = get_frames_client(
mlrun/datastore/utils.py
CHANGED
@@ -15,7 +15,7 @@
 import tarfile
 import tempfile
 import typing
-from urllib.parse import parse_qs, urlparse, urlunparse
+from urllib.parse import parse_qs, urlparse

 import pandas as pd
 import semver
@@ -23,53 +23,6 @@ import semver
 import mlrun.datastore


-def store_path_to_spark(path, spark_options=None):
-    schemas = ["redis://", "rediss://", "ds://"]
-    if any(path.startswith(schema) for schema in schemas):
-        url = urlparse(path)
-        if url.path:
-            path = url.path
-    elif path.startswith("gcs://"):
-        path = "gs:" + path[len("gcs:") :]
-    elif path.startswith("v3io:///"):
-        path = "v3io:" + path[len("v3io:/") :]
-    elif path.startswith("az://"):
-        account_key = None
-        path = "wasbs:" + path[len("az:") :]
-        prefix = "spark.hadoop.fs.azure.account.key."
-        if spark_options:
-            for key in spark_options:
-                if key.startswith(prefix):
-                    account_key = key[len(prefix) :]
-                    break
-        if account_key:
-            # transfer "wasb://basket/some/path" to wasb://basket@account_key.blob.core.windows.net/some/path
-            parsed_url = urlparse(path)
-            new_netloc = f"{parsed_url.hostname}@{account_key}"
-            path = urlunparse(
-                (
-                    parsed_url.scheme,
-                    new_netloc,
-                    parsed_url.path,
-                    parsed_url.params,
-                    parsed_url.query,
-                    parsed_url.fragment,
-                )
-            )
-    elif path.startswith("s3://"):
-        if path.startswith("s3:///"):
-            # 's3:///' not supported since mlrun 0.9.0 should use s3:// instead
-            from mlrun.errors import MLRunInvalidArgumentError
-
-            valid_path = "s3:" + path[len("s3:/") :]
-            raise MLRunInvalidArgumentError(
-                f"'s3:///' is not supported, try using 's3://' instead.\nE.g: '{valid_path}'"
-            )
-        else:
-            path = "s3a:" + path[len("s3:") :]
-    return path
-
-
 def parse_kafka_url(url: str, bootstrap_servers: list = None) -> tuple[str, list]:
     """Generating Kafka topic and adjusting a list of bootstrap servers.

@@ -105,7 +58,7 @@ def upload_tarball(source_dir, target, secrets=None):
     with tarfile.open(mode="w:gz", fileobj=temp_fh) as tar:
         tar.add(source_dir, arcname="")
     stores = mlrun.datastore.store_manager.set(secrets)
-    datastore, subpath = stores.get_or_create_store(target)
+    datastore, subpath, url = stores.get_or_create_store(target)
     datastore.upload(subpath, temp_fh.name)

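Note: store_path_to_spark() and its per-scheme rewriting (gcs to gs, az to wasbs, s3 to s3a, v3io:/// to v3io:) are gone; callers now resolve the store and build the Spark path from its spark_url. A sketch of the replacement pattern with a hypothetical bucket:

```python
import mlrun

# Hypothetical URL; the exact endpoint/subpath split depends on the store type.
store, path_in_store, url = mlrun.store_manager.get_or_create_store("s3://my-bucket/data.parquet")
spark_path = store.spark_url + path_in_store  # e.g. "s3a://my-bucket" + "/data.parquet"
```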
mlrun/datastore/v3io.py
CHANGED
@@ -79,6 +79,10 @@ class V3ioStore(DataStore):
         schema = "https" if self.secure else "http"
         return f"{schema}://{self.endpoint}"

+    @property
+    def spark_url(self):
+        return "v3io:/"
+
     @property
     def filesystem(self):
         """return fsspec file system object, if supported"""