mlrun 1.7.0rc3__py3-none-any.whl → 1.7.0rc5__py3-none-any.whl

This diff compares publicly available package versions as released to their public registry. It is provided for informational purposes only and reflects the changes between the two versions as published.

This release has been flagged as potentially problematic.

Files changed (76)
  1. mlrun/artifacts/manager.py +6 -1
  2. mlrun/common/constants.py +2 -0
  3. mlrun/common/model_monitoring/helpers.py +12 -6
  4. mlrun/common/schemas/__init__.py +11 -0
  5. mlrun/common/schemas/api_gateway.py +85 -0
  6. mlrun/common/schemas/auth.py +2 -2
  7. mlrun/common/schemas/client_spec.py +1 -0
  8. mlrun/common/schemas/common.py +40 -0
  9. mlrun/common/schemas/model_monitoring/constants.py +4 -1
  10. mlrun/common/schemas/project.py +2 -0
  11. mlrun/config.py +31 -17
  12. mlrun/datastore/azure_blob.py +22 -9
  13. mlrun/datastore/base.py +15 -25
  14. mlrun/datastore/datastore.py +19 -8
  15. mlrun/datastore/datastore_profile.py +47 -5
  16. mlrun/datastore/google_cloud_storage.py +10 -6
  17. mlrun/datastore/hdfs.py +51 -0
  18. mlrun/datastore/redis.py +4 -0
  19. mlrun/datastore/s3.py +4 -0
  20. mlrun/datastore/sources.py +29 -43
  21. mlrun/datastore/targets.py +59 -53
  22. mlrun/datastore/utils.py +2 -49
  23. mlrun/datastore/v3io.py +4 -0
  24. mlrun/db/base.py +50 -0
  25. mlrun/db/httpdb.py +121 -50
  26. mlrun/db/nopdb.py +13 -0
  27. mlrun/execution.py +3 -3
  28. mlrun/feature_store/feature_vector.py +2 -2
  29. mlrun/frameworks/tf_keras/callbacks/logging_callback.py +3 -3
  30. mlrun/frameworks/tf_keras/model_handler.py +7 -7
  31. mlrun/k8s_utils.py +10 -5
  32. mlrun/kfpops.py +19 -10
  33. mlrun/model.py +5 -0
  34. mlrun/model_monitoring/api.py +3 -3
  35. mlrun/model_monitoring/application.py +1 -1
  36. mlrun/model_monitoring/applications/__init__.py +13 -0
  37. mlrun/model_monitoring/applications/histogram_data_drift.py +218 -0
  38. mlrun/model_monitoring/batch.py +9 -111
  39. mlrun/model_monitoring/controller.py +73 -55
  40. mlrun/model_monitoring/controller_handler.py +13 -5
  41. mlrun/model_monitoring/features_drift_table.py +62 -53
  42. mlrun/model_monitoring/helpers.py +30 -21
  43. mlrun/model_monitoring/metrics/__init__.py +13 -0
  44. mlrun/model_monitoring/metrics/histogram_distance.py +127 -0
  45. mlrun/model_monitoring/stores/kv_model_endpoint_store.py +14 -14
  46. mlrun/model_monitoring/stores/sql_model_endpoint_store.py +0 -1
  47. mlrun/package/packagers/pandas_packagers.py +3 -3
  48. mlrun/package/utils/_archiver.py +3 -1
  49. mlrun/platforms/iguazio.py +8 -65
  50. mlrun/projects/pipelines.py +21 -11
  51. mlrun/projects/project.py +180 -42
  52. mlrun/run.py +1 -1
  53. mlrun/runtimes/base.py +25 -2
  54. mlrun/runtimes/kubejob.py +5 -3
  55. mlrun/runtimes/local.py +2 -2
  56. mlrun/runtimes/mpijob/abstract.py +6 -6
  57. mlrun/runtimes/nuclio/__init__.py +1 -0
  58. mlrun/runtimes/nuclio/api_gateway.py +300 -0
  59. mlrun/runtimes/nuclio/function.py +9 -9
  60. mlrun/runtimes/nuclio/serving.py +3 -3
  61. mlrun/runtimes/pod.py +3 -3
  62. mlrun/runtimes/sparkjob/spark3job.py +3 -3
  63. mlrun/serving/remote.py +4 -2
  64. mlrun/serving/server.py +2 -8
  65. mlrun/utils/async_http.py +3 -3
  66. mlrun/utils/helpers.py +27 -5
  67. mlrun/utils/http.py +3 -3
  68. mlrun/utils/logger.py +2 -2
  69. mlrun/utils/notifications/notification_pusher.py +6 -6
  70. mlrun/utils/version/version.json +2 -2
  71. {mlrun-1.7.0rc3.dist-info → mlrun-1.7.0rc5.dist-info}/METADATA +13 -16
  72. {mlrun-1.7.0rc3.dist-info → mlrun-1.7.0rc5.dist-info}/RECORD +76 -68
  73. {mlrun-1.7.0rc3.dist-info → mlrun-1.7.0rc5.dist-info}/WHEEL +1 -1
  74. {mlrun-1.7.0rc3.dist-info → mlrun-1.7.0rc5.dist-info}/LICENSE +0 -0
  75. {mlrun-1.7.0rc3.dist-info → mlrun-1.7.0rc5.dist-info}/entry_points.txt +0 -0
  76. {mlrun-1.7.0rc3.dist-info → mlrun-1.7.0rc5.dist-info}/top_level.txt +0 -0
mlrun/datastore/datastore_profile.py CHANGED
@@ -132,6 +132,22 @@ class DatastoreProfileKafkaSource(DatastoreProfile):
         return attributes


+class DatastoreProfileV3io(DatastoreProfile):
+    type: str = pydantic.Field("v3io")
+    v3io_access_key: typing.Optional[str] = None
+    _private_attributes = "v3io_access_key"
+
+    def url(self, subpath):
+        subpath = subpath.lstrip("/")
+        return f"v3io:///{subpath}"
+
+    def secrets(self) -> dict:
+        res = {}
+        if self.v3io_access_key:
+            res["V3IO_ACCESS_KEY"] = self.v3io_access_key
+        return res
+
+
 class DatastoreProfileS3(DatastoreProfile):
     type: str = pydantic.Field("s3")
     _private_attributes = ("access_key_id", "secret_key")
@@ -156,7 +172,7 @@ class DatastoreProfileS3(DatastoreProfile):
             res["AWS_PROFILE"] = self.profile_name
         if self.assume_role_arn:
             res["MLRUN_AWS_ROLE_ARN"] = self.assume_role_arn
-        return res if res else None
+        return res

     def url(self, subpath):
         return f"s3:/{subpath}"
@@ -199,7 +215,7 @@ class DatastoreProfileRedis(DatastoreProfile):
             res["REDIS_USER"] = self.username
         if self.password:
             res["REDIS_PASSWORD"] = self.password
-        return res if res else None
+        return res

     def url(self, subpath):
         return self.endpoint_url + subpath
@@ -220,7 +236,7 @@ class DatastoreProfileDBFS(DatastoreProfile):
             res["DATABRICKS_TOKEN"] = self.token
         if self.endpoint_url:
             res["DATABRICKS_HOST"] = self.endpoint_url
-        return res if res else None
+        return res


 class DatastoreProfileGCS(DatastoreProfile):
@@ -247,7 +263,7 @@ class DatastoreProfileGCS(DatastoreProfile):
             res["GOOGLE_APPLICATION_CREDENTIALS"] = self.credentials_path
         if self.gcp_credentials:
             res["GCP_CREDENTIALS"] = self.gcp_credentials
-        return res if res else None
+        return res


 class DatastoreProfileAzureBlob(DatastoreProfile):
@@ -292,7 +308,31 @@ class DatastoreProfileAzureBlob(DatastoreProfile):
             res["sas_token"] = self.sas_token
         if self.credential:
             res["credential"] = self.credential
-        return res if res else None
+        return res
+
+
+class DatastoreProfileHdfs(DatastoreProfile):
+    type: str = pydantic.Field("hdfs")
+    _private_attributes = "token"
+    host: typing.Optional[str] = None
+    port: typing.Optional[int] = None
+    http_port: typing.Optional[int] = None
+    user: typing.Optional[str] = None
+
+    def secrets(self) -> dict:
+        res = {}
+        if self.host:
+            res["HDFS_HOST"] = self.host
+        if self.port:
+            res["HDFS_PORT"] = self.port
+        if self.port:
+            res["HDFS_HTTP_PORT"] = self.http_port
+        if self.user:
+            res["HDFS_USER"] = self.user
+        return res or None
+
+    def url(self, subpath):
+        return f"hdfs://{self.host}:{self.http_port}{subpath}"


 class DatastoreProfile2Json(pydantic.BaseModel):
@@ -346,6 +386,7 @@ class DatastoreProfile2Json(pydantic.BaseModel):
         decoded_dict = {k: safe_literal_eval(v) for k, v in decoded_dict.items()}
         datastore_type = decoded_dict.get("type")
         ds_profile_factory = {
+            "v3io": DatastoreProfileV3io,
             "s3": DatastoreProfileS3,
             "redis": DatastoreProfileRedis,
             "basic": DatastoreProfileBasic,
@@ -354,6 +395,7 @@ class DatastoreProfile2Json(pydantic.BaseModel):
             "dbfs": DatastoreProfileDBFS,
             "gcs": DatastoreProfileGCS,
             "az": DatastoreProfileAzureBlob,
+            "hdfs": DatastoreProfileHdfs,
         }
         if datastore_type in ds_profile_factory:
             return ds_profile_factory[datastore_type].parse_obj(decoded_dict)
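
The hunk above introduces v3io and HDFS datastore profiles and wires them into the ds:// profile factory. Below is a minimal sketch of how such profiles could be created and registered on the client side; the profile names and credential values are hypothetical, and it assumes the existing register_temporary_client_datastore_profile helper in this module accepts the new classes.

# Sketch only -- hypothetical profile names and credentials.
from mlrun.datastore.datastore_profile import (
    DatastoreProfileHdfs,
    DatastoreProfileV3io,
    register_temporary_client_datastore_profile,
)

# v3io profile: secrets() injects V3IO_ACCESS_KEY for the run/function.
v3io_profile = DatastoreProfileV3io(name="my-v3io", v3io_access_key="<access-key>")

# HDFS profile: url() builds "hdfs://<host>:<http_port><subpath>".
hdfs_profile = DatastoreProfileHdfs(
    name="my-hdfs", host="namenode.example.com", port=8020, http_port=9870, user="hdfs"
)

register_temporary_client_datastore_profile(v3io_profile)
register_temporary_client_datastore_profile(hdfs_profile)

# Data can then be addressed as ds://my-v3io/<path> or ds://my-hdfs/<path>.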
mlrun/datastore/google_cloud_storage.py CHANGED
@@ -147,13 +147,13 @@ class GoogleCloudStorageStore(DataStore):
         if "project_id" in credentials:
             res["spark.hadoop.fs.gs.project.id"] = credentials["project_id"]
         if "private_key_id" in credentials:
-            res[
-                "spark.hadoop.fs.gs.auth.service.account.private.key.id"
-            ] = credentials["private_key_id"]
+            res["spark.hadoop.fs.gs.auth.service.account.private.key.id"] = (
+                credentials["private_key_id"]
+            )
         if "private_key" in credentials:
-            res[
-                "spark.hadoop.fs.gs.auth.service.account.private.key"
-            ] = credentials["private_key"]
+            res["spark.hadoop.fs.gs.auth.service.account.private.key"] = (
+                credentials["private_key"]
+            )
         if "client_email" in credentials:
             res["spark.hadoop.fs.gs.auth.service.account.email"] = credentials[
                 "client_email"
@@ -161,3 +161,7 @@ class GoogleCloudStorageStore(DataStore):
         if "client_id" in credentials:
             res["spark.hadoop.fs.gs.client.id"] = credentials["client_id"]
         return res
+
+    @property
+    def spark_url(self):
+        return f"gs://{self.endpoint}"
mlrun/datastore/hdfs.py ADDED
@@ -0,0 +1,51 @@
+# Copyright 2024 Iguazio
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import os
+
+import fsspec
+
+from mlrun.datastore.base import DataStore
+
+
+class HdfsStore(DataStore):
+    def __init__(self, parent, schema, name, endpoint="", secrets: dict = None):
+        super().__init__(parent, name, schema, endpoint, secrets)
+
+        self.host = self._get_secret_or_env("HDFS_HOST")
+        self.port = self._get_secret_or_env("HDFS_PORT")
+        self.http_port = self._get_secret_or_env("HDFS_HTTP_PORT")
+        self.user = self._get_secret_or_env("HDFS_USER")
+        if not self.user:
+            self.user = os.environ.get("HADOOP_USER_NAME", os.environ.get("USER"))
+
+        self._filesystem = None
+
+    @property
+    def filesystem(self):
+        if not self._filesystem:
+            self._filesystem = fsspec.filesystem(
+                "webhdfs",
+                host=self.host,
+                port=self.http_port,
+                user=self.user,
+            )
+        return self._filesystem
+
+    @property
+    def url(self):
+        return f"webhdfs://{self.host}:{self.http_port}"
+
+    @property
+    def spark_url(self):
+        return f"hdfs://{self.host}:{self.port}"
mlrun/datastore/redis.py CHANGED
@@ -163,3 +163,7 @@ class RedisStore(DataStore):
                 self.redis.delete(k)
         else:
             self.redis.delete(key)
+
+    @property
+    def spark_url(self):
+        return ""
mlrun/datastore/s3.py CHANGED
@@ -156,6 +156,10 @@ class S3Store(DataStore):

         return self._sanitize_storage_options(storage_options)

+    @property
+    def spark_url(self):
+        return f"s3a://{self.endpoint}"
+
     def get_bucket_and_key(self, key):
         path = self._join(key)[1:]
         return self.endpoint, path
mlrun/datastore/sources.py CHANGED
@@ -39,7 +39,6 @@ from .utils import (
     _generate_sql_query_with_time_filter,
     filter_df_start_end_time,
     select_columns_from_df,
-    store_path_to_spark,
 )


@@ -193,14 +192,10 @@ class CSVSource(BaseSourceDriver):
             parse_dates.append(time_field)

         data_item = mlrun.store_manager.object(self.path)
-        if self.path and self.path.startswith("ds://"):
-            store, path = mlrun.store_manager.get_or_create_store(self.path)
-            path = store.url + path
-        else:
-            path = data_item.url
+        store, path, url = mlrun.store_manager.get_or_create_store(self.path)

         return storey.CSVSource(
-            paths=path,  # unlike self.path, it already has store:// replaced
+            paths=url,  # unlike self.path, it already has store:// replaced
             build_dict=True,
             key_field=self.key_field or key_field,
             storage_options=data_item.store.get_storage_options(),
@@ -209,25 +204,17 @@
         )

     def get_spark_options(self):
-        if self.path and self.path.startswith("ds://"):
-            store, path = mlrun.store_manager.get_or_create_store(self.path)
-            storage_spark_options = store.get_spark_options()
-            path = store.url + path
-            result = {
-                "path": store_path_to_spark(path, storage_spark_options),
-                "format": "csv",
-                "header": "true",
-                "inferSchema": "true",
-            }
-
-            return {**result, **storage_spark_options}
-        else:
-            return {
-                "path": store_path_to_spark(self.path),
+        store, path, url = mlrun.store_manager.get_or_create_store(self.path)
+        spark_options = store.get_spark_options()
+        spark_options.update(
+            {
+                "path": url,
                 "format": "csv",
                 "header": "true",
                 "inferSchema": "true",
             }
+        )
+        return spark_options

     def to_spark_df(self, session, named_view=False, time_field=None, columns=None):
         import pyspark.sql.functions as funcs
@@ -357,14 +344,10 @@ class ParquetSource(BaseSourceDriver):
             attributes["context"] = context

         data_item = mlrun.store_manager.object(self.path)
-        if self.path and self.path.startswith("ds://"):
-            store, path = mlrun.store_manager.get_or_create_store(self.path)
-            path = store.url + path
-        else:
-            path = data_item.url
+        store, path, url = mlrun.store_manager.get_or_create_store(self.path)

         return storey.ParquetSource(
-            paths=path,  # unlike self.path, it already has store:// replaced
+            paths=url,  # unlike self.path, it already has store:// replaced
             key_field=self.key_field or key_field,
             storage_options=data_item.store.get_storage_options(),
             end_filter=self.end_time,
@@ -374,20 +357,15 @@
         )

     def get_spark_options(self):
-        if self.path and self.path.startswith("ds://"):
-            store, path = mlrun.store_manager.get_or_create_store(self.path)
-            storage_spark_options = store.get_spark_options()
-            path = store.url + path
-            result = {
-                "path": store_path_to_spark(path, storage_spark_options),
-                "format": "parquet",
-            }
-            return {**result, **storage_spark_options}
-        else:
-            return {
-                "path": store_path_to_spark(self.path),
+        store, path, url = mlrun.store_manager.get_or_create_store(self.path)
+        spark_options = store.get_spark_options()
+        spark_options.update(
+            {
+                "path": store.spark_url + path,
                 "format": "parquet",
             }
+        )
+        return spark_options

     def to_dataframe(
         self,
@@ -875,8 +853,16 @@ class StreamSource(OnlineSource):
         super().__init__(name, attributes=attrs, **kwargs)

     def add_nuclio_trigger(self, function):
-        endpoint, stream_path = parse_path(self.path)
-        v3io_client = v3io.dataplane.Client(endpoint=endpoint)
+        store, path, url = mlrun.store_manager.get_or_create_store(self.path)
+        if store.kind != "v3io":
+            raise mlrun.errors.MLRunInvalidArgumentError(
+                "Only profiles that reference the v3io datastore can be used with StreamSource"
+            )
+        path = "v3io:/" + path
+        storage_options = store.get_storage_options()
+        access_key = storage_options.get("v3io_access_key")
+        endpoint, stream_path = parse_path(url)
+        v3io_client = v3io.dataplane.Client(endpoint=endpoint, access_key=access_key)
         container, stream_path = split_path(stream_path)
         res = v3io_client.stream.create(
             container=container,
@@ -896,7 +882,7 @@
             kwargs["worker_allocation_mode"] = "static"

         function.add_v3io_stream_trigger(
-            self.path,
+            path,
             self.name,
             self.attributes["group"],
             self.attributes["seek_to"],
mlrun/datastore/targets.py CHANGED
@@ -29,7 +29,7 @@ import mlrun
 import mlrun.utils.helpers
 from mlrun.config import config
 from mlrun.model import DataSource, DataTarget, DataTargetBase, TargetPathObject
-from mlrun.utils import now_date
+from mlrun.utils import logger, now_date
 from mlrun.utils.helpers import to_parquet
 from mlrun.utils.v3io_clients import get_frames_client

@@ -43,7 +43,6 @@ from .utils import (
     filter_df_start_end_time,
     parse_kafka_url,
     select_columns_from_df,
-    store_path_to_spark,
 )


@@ -448,14 +447,11 @@ class BaseStoreTarget(DataTargetBase):
             if self.credentials_prefix
             else None
         )
-        store, resolved_store_path = mlrun.store_manager.get_or_create_store(
+        store, resolved_store_path, url = mlrun.store_manager.get_or_create_store(
             self.get_target_path(),
             credentials_prefix_secrets,
         )
-        if self.get_target_path() and self.get_target_path().startswith("ds://"):
-            return store, store.url + resolved_store_path
-        else:
-            return store, self.get_target_path()
+        return store, resolved_store_path, url

     def _get_column_list(self, features, timestamp_key, key_columns, with_type=False):
         result = []
@@ -504,7 +500,7 @@
             write_spark_dataframe_with_options(options, df, "overwrite")
         elif hasattr(df, "dask"):
             dask_options = self.get_dask_options()
-            store, target_path = self._get_store_and_path()
+            store, path_in_store, target_path = self._get_store_and_path()
             storage_options = store.get_storage_options()
             df = df.repartition(partition_size="100MB")
             try:
@@ -525,7 +521,7 @@
             except Exception as exc:
                 raise RuntimeError("Failed to write Dask Dataframe") from exc
         else:
-            store, target_path = self._get_store_and_path()
+            store, path_in_store, target_path = self._get_store_and_path()
             target_path = generate_path_with_chunk(self, chunk_id, target_path)
             file_system = store.filesystem
             if file_system.protocol == "file":
@@ -692,7 +688,7 @@
         raise NotImplementedError()

     def purge(self):
-        store, target_path = self._get_store_and_path()
+        store, path_in_store, target_path = self._get_store_and_path()
         store.rm(target_path, recursive=True)

     def as_df(
@@ -872,7 +868,7 @@ class ParquetTarget(BaseStoreTarget):
         for key_column in key_columns:
             tuple_key_columns.append((key_column.name, key_column.value_type))

-        store, target_path = self._get_store_and_path()
+        store, path_in_store, target_path = self._get_store_and_path()

         storage_options = store.get_storage_options()
         if storage_options and self.storage_options:
@@ -925,27 +921,19 @@
             if unit == time_partitioning_granularity:
                 break

-        if self.path and self.path.startswith("ds://"):
-            store, path = mlrun.store_manager.get_or_create_store(
-                self.get_target_path()
-            )
-            storage_spark_options = store.get_spark_options()
-            path = store.url + path
-            result = {
-                "path": store_path_to_spark(path, storage_spark_options),
-                "format": "parquet",
-            }
-            result = {**result, **storage_spark_options}
-        else:
-            result = {
-                "path": store_path_to_spark(self.get_target_path()),
+        store, path, url = self._get_store_and_path()
+        spark_options = store.get_spark_options()
+        spark_options.update(
+            {
+                "path": store.spark_url + path,
                 "format": "parquet",
             }
+        )
         for partition_col in self.partition_cols or []:
             partition_cols.append(partition_col)
         if partition_cols:
-            result["partitionBy"] = partition_cols
-        return result
+            spark_options["partitionBy"] = partition_cols
+        return spark_options

     def get_dask_options(self):
         return {"format": "parquet"}
@@ -1052,7 +1040,7 @@ class CSVTarget(BaseStoreTarget):
         column_list = self._get_column_list(
             features=features, timestamp_key=timestamp_key, key_columns=key_columns
         )
-        store, target_path = self._get_store_and_path()
+        store, path_in_store, target_path = self._get_store_and_path()
         graph.add_step(
             name=self.name or "CSVTarget",
             after=after,
@@ -1067,24 +1055,16 @@
         )

     def get_spark_options(self, key_column=None, timestamp_key=None, overwrite=True):
-        if self.path and self.path.startswith("ds://"):
-            store, path = mlrun.store_manager.get_or_create_store(
-                self.get_target_path()
-            )
-            storage_spark_options = store.get_spark_options()
-            path = store.url + path
-            result = {
-                "path": store_path_to_spark(path, storage_spark_options),
-                "format": "csv",
-                "header": "true",
-            }
-            return {**result, **storage_spark_options}
-        else:
-            return {
-                "path": store_path_to_spark(self.get_target_path()),
+        store, path, url = self._get_store_and_path()
+        spark_options = store.get_spark_options()
+        spark_options.update(
+            {
+                "path": store.spark_url + path,
                 "format": "csv",
                 "header": "true",
             }
+        )
+        return spark_options

     def prepare_spark_df(self, df, key_columns, timestamp_key=None, spark_options=None):
         import pyspark.sql.functions as funcs
@@ -1209,7 +1189,11 @@ class NoSqlBaseTarget(BaseStoreTarget):
         df = df.copy(deep=False)
         access_key = self._get_credential("V3IO_ACCESS_KEY")

-        _, path_with_container = parse_path(self.get_target_path())
+        store, path_in_store, target_path = self._get_store_and_path()
+        storage_options = store.get_storage_options()
+        access_key = storage_options.get("v3io_access_key", access_key)
+
+        _, path_with_container = parse_path(target_path)
         container, path = split_path(path_with_container)

         frames_client = get_frames_client(
@@ -1227,17 +1211,31 @@ class NoSqlTarget(NoSqlBaseTarget):
     def get_table_object(self):
         from storey import Table, V3ioDriver

-        # TODO use options/cred
-        endpoint, uri = parse_path(self.get_target_path())
+        store, path_in_store, target_path = self._get_store_and_path()
+        endpoint, uri = parse_path(target_path)
+        storage_options = store.get_storage_options()
+        access_key = storage_options.get("v3io_access_key")
+
         return Table(
             uri,
-            V3ioDriver(webapi=endpoint or mlrun.mlconf.v3io_api),
+            V3ioDriver(webapi=endpoint or mlrun.mlconf.v3io_api, access_key=access_key),
             flush_interval_secs=mlrun.mlconf.feature_store.flush_interval,
         )

     def get_spark_options(self, key_column=None, timestamp_key=None, overwrite=True):
+        store, path_in_store, target_path = self._get_store_and_path()
+        storage_options = store.get_storage_options()
+        store_access_key = storage_options.get("v3io_access_key")
+        env_access_key = self._secrets.get(
+            "V3IO_ACCESS_KEY", os.getenv("V3IO_ACCESS_KEY")
+        )
+        if store_access_key and env_access_key and store_access_key != env_access_key:
+            logger.warning(
+                "The Spark v3io connector does not support access_key parameterization."
+                "Spark will disregard the store-provided key."
+            )
         spark_options = {
-            "path": store_path_to_spark(self.get_target_path()),
+            "path": store.spark_url + path_in_store,
             "format": "io.iguaz.v3io.spark.sql.kv",
         }
         if isinstance(key_column, list) and len(key_column) >= 1:
@@ -1330,10 +1328,10 @@ class RedisNoSqlTarget(NoSqlBaseTarget):
     def get_spark_options(self, key_column=None, timestamp_key=None, overwrite=True):
         endpoint, uri = self._get_server_endpoint()
         parsed_endpoint = urlparse(endpoint)
-
+        store, path_in_store, path = self._get_store_and_path()
         return {
             "key.column": "_spark_object_name",
-            "table": "{" + store_path_to_spark(self.get_target_path()),
+            "table": "{" + path_in_store,
             "format": "org.apache.spark.sql.redis",
             "host": parsed_endpoint.hostname,
             "port": parsed_endpoint.port,
@@ -1381,10 +1379,12 @@ class StreamTarget(BaseStoreTarget):
        from storey import V3ioDriver

        key_columns = list(key_columns.keys())
-        path = self.get_target_path()
+        store, path_in_store, path = self._get_store_and_path()
        if not path:
            raise mlrun.errors.MLRunInvalidArgumentError("StreamTarget requires a path")
        endpoint, uri = parse_path(path)
+        storage_options = store.get_storage_options()
+        access_key = storage_options.get("v3io_access_key")
        column_list = self._get_column_list(
            features=features, timestamp_key=timestamp_key, key_columns=key_columns
        )
@@ -1395,7 +1395,9 @@
            graph_shape="cylinder",
            class_name="storey.StreamTarget",
            columns=column_list,
-            storage=V3ioDriver(webapi=endpoint or mlrun.mlconf.v3io_api),
+            storage=V3ioDriver(
+                webapi=endpoint or mlrun.mlconf.v3io_api, access_key=access_key
+            ),
            stream_path=uri,
            **self.attributes,
        )
@@ -1531,7 +1533,11 @@ class TSDBTarget(BaseStoreTarget):
            key_column = [key_column]
        new_index.extend(key_column)

-        _, path_with_container = parse_path(self.get_target_path())
+        store, path_in_store, target_path = self._get_store_and_path()
+        storage_options = store.get_storage_options()
+        access_key = storage_options.get("v3io_access_key", access_key)
+
+        _, path_with_container = parse_path(target_path)
        container, path = split_path(path_with_container)

        frames_client = get_frames_client(
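
A recurring pattern in the target changes above: when the target path comes from a ds:// profile, the v3io access key is read from the store's storage options and passed to the storey V3ioDriver and the frames client, instead of relying only on the V3IO_ACCESS_KEY credential. A minimal sketch of that lookup; the profile name and path are hypothetical.

# Sketch of the shared credential lookup added to the v3io-backed targets.
import mlrun

store, path_in_store, target_path = mlrun.store_manager.get_or_create_store(
    "ds://my-v3io/projects/demo/nosql/sets/my-set"  # hypothetical target path
)
storage_options = store.get_storage_options()
access_key = storage_options.get("v3io_access_key")  # None -> fall back to the env credential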
mlrun/datastore/utils.py CHANGED
@@ -15,7 +15,7 @@
 import tarfile
 import tempfile
 import typing
-from urllib.parse import parse_qs, urlparse, urlunparse
+from urllib.parse import parse_qs, urlparse

 import pandas as pd
 import semver
@@ -23,53 +23,6 @@ import semver
 import mlrun.datastore


-def store_path_to_spark(path, spark_options=None):
-    schemas = ["redis://", "rediss://", "ds://"]
-    if any(path.startswith(schema) for schema in schemas):
-        url = urlparse(path)
-        if url.path:
-            path = url.path
-    elif path.startswith("gcs://"):
-        path = "gs:" + path[len("gcs:") :]
-    elif path.startswith("v3io:///"):
-        path = "v3io:" + path[len("v3io:/") :]
-    elif path.startswith("az://"):
-        account_key = None
-        path = "wasbs:" + path[len("az:") :]
-        prefix = "spark.hadoop.fs.azure.account.key."
-        if spark_options:
-            for key in spark_options:
-                if key.startswith(prefix):
-                    account_key = key[len(prefix) :]
-                    break
-        if account_key:
-            # transfer "wasb://basket/some/path" to wasb://basket@account_key.blob.core.windows.net/some/path
-            parsed_url = urlparse(path)
-            new_netloc = f"{parsed_url.hostname}@{account_key}"
-            path = urlunparse(
-                (
-                    parsed_url.scheme,
-                    new_netloc,
-                    parsed_url.path,
-                    parsed_url.params,
-                    parsed_url.query,
-                    parsed_url.fragment,
-                )
-            )
-    elif path.startswith("s3://"):
-        if path.startswith("s3:///"):
-            # 's3:///' not supported since mlrun 0.9.0 should use s3:// instead
-            from mlrun.errors import MLRunInvalidArgumentError
-
-            valid_path = "s3:" + path[len("s3:/") :]
-            raise MLRunInvalidArgumentError(
-                f"'s3:///' is not supported, try using 's3://' instead.\nE.g: '{valid_path}'"
-            )
-        else:
-            path = "s3a:" + path[len("s3:") :]
-    return path
-
-
 def parse_kafka_url(url: str, bootstrap_servers: list = None) -> tuple[str, list]:
     """Generating Kafka topic and adjusting a list of bootstrap servers.

@@ -105,7 +58,7 @@ def upload_tarball(source_dir, target, secrets=None):
     with tarfile.open(mode="w:gz", fileobj=temp_fh) as tar:
         tar.add(source_dir, arcname="")
     stores = mlrun.datastore.store_manager.set(secrets)
-    datastore, subpath = stores.get_or_create_store(target)
+    datastore, subpath, url = stores.get_or_create_store(target)
     datastore.upload(subpath, temp_fh.name)

mlrun/datastore/v3io.py CHANGED
@@ -79,6 +79,10 @@ class V3ioStore(DataStore):
        schema = "https" if self.secure else "http"
        return f"{schema}://{self.endpoint}"

+    @property
+    def spark_url(self):
+        return "v3io:/"
+
    @property
    def filesystem(self):
        """return fsspec file system object, if supported"""