mlrun 1.6.0rc6__py3-none-any.whl → 1.6.0rc8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of mlrun might be problematic.

Files changed (50)
  1. mlrun/__main__.py +32 -31
  2. mlrun/common/schemas/auth.py +2 -0
  3. mlrun/common/schemas/workflow.py +2 -0
  4. mlrun/config.py +3 -3
  5. mlrun/datastore/base.py +9 -3
  6. mlrun/datastore/datastore.py +10 -7
  7. mlrun/datastore/datastore_profile.py +19 -2
  8. mlrun/datastore/dbfs_store.py +6 -6
  9. mlrun/datastore/s3.py +6 -2
  10. mlrun/datastore/sources.py +12 -2
  11. mlrun/datastore/targets.py +43 -20
  12. mlrun/db/httpdb.py +22 -0
  13. mlrun/feature_store/feature_set.py +5 -2
  14. mlrun/feature_store/retrieval/spark_merger.py +7 -1
  15. mlrun/kfpops.py +1 -1
  16. mlrun/launcher/client.py +1 -6
  17. mlrun/launcher/remote.py +5 -3
  18. mlrun/model.py +2 -2
  19. mlrun/model_monitoring/batch_application.py +61 -94
  20. mlrun/package/packager.py +115 -89
  21. mlrun/package/packagers/default_packager.py +66 -65
  22. mlrun/package/packagers/numpy_packagers.py +109 -62
  23. mlrun/package/packagers/pandas_packagers.py +12 -23
  24. mlrun/package/packagers/python_standard_library_packagers.py +35 -57
  25. mlrun/package/packagers_manager.py +16 -13
  26. mlrun/package/utils/_pickler.py +8 -18
  27. mlrun/package/utils/_supported_format.py +1 -1
  28. mlrun/projects/pipelines.py +63 -4
  29. mlrun/projects/project.py +34 -11
  30. mlrun/runtimes/__init__.py +6 -0
  31. mlrun/runtimes/base.py +12 -1
  32. mlrun/runtimes/daskjob.py +73 -5
  33. mlrun/runtimes/databricks_job/databricks_runtime.py +2 -0
  34. mlrun/runtimes/function.py +53 -4
  35. mlrun/runtimes/kubejob.py +1 -1
  36. mlrun/runtimes/local.py +9 -9
  37. mlrun/runtimes/pod.py +1 -1
  38. mlrun/runtimes/remotesparkjob.py +1 -0
  39. mlrun/runtimes/serving.py +11 -1
  40. mlrun/runtimes/sparkjob/spark3job.py +4 -1
  41. mlrun/runtimes/utils.py +1 -46
  42. mlrun/utils/helpers.py +1 -17
  43. mlrun/utils/notifications/notification_pusher.py +27 -6
  44. mlrun/utils/version/version.json +2 -2
  45. {mlrun-1.6.0rc6.dist-info → mlrun-1.6.0rc8.dist-info}/METADATA +7 -6
  46. {mlrun-1.6.0rc6.dist-info → mlrun-1.6.0rc8.dist-info}/RECORD +50 -50
  47. {mlrun-1.6.0rc6.dist-info → mlrun-1.6.0rc8.dist-info}/WHEEL +1 -1
  48. {mlrun-1.6.0rc6.dist-info → mlrun-1.6.0rc8.dist-info}/LICENSE +0 -0
  49. {mlrun-1.6.0rc6.dist-info → mlrun-1.6.0rc8.dist-info}/entry_points.txt +0 -0
  50. {mlrun-1.6.0rc6.dist-info → mlrun-1.6.0rc8.dist-info}/top_level.txt +0 -0
mlrun/__main__.py CHANGED
@@ -152,7 +152,7 @@ def main():
  @click.option("--schedule", help="cron schedule")
  @click.option("--from-env", is_flag=True, help="read the spec from the env var")
  @click.option("--dump", is_flag=True, help="dump run results as YAML")
- @click.option("--image", default="mlrun/mlrun", help="container image")
+ @click.option("--image", default="", help="container image (defaults to mlrun/mlrun)")
  @click.option("--kind", default="", help="serverless runtime kind")
  @click.option("--source", default="", help="source code archive/git")
  @click.option("--local", is_flag=True, help="run the task locally (ignore runtime)")
@@ -289,7 +289,7 @@ def run(
  exit(1)
  else:
  kind = kind or "job"
- runtime = {"kind": kind, "spec": {"image": image}}
+ runtime = {"kind": kind, "spec": {"image": image or "mlrun/mlrun"}}

  if kind not in ["", "local", "dask"] and url:
  if url_file and path.isfile(url_file):
@@ -303,7 +303,7 @@ def run(
  elif runtime:
  runtime = py_eval(runtime)
  if not isinstance(runtime, dict):
- print(f"runtime parameter must be a dict, not {type(runtime)}")
+ print(f"Runtime parameter must be a dict, not {type(runtime)}")
  exit(1)
  else:
  runtime = {}
@@ -317,7 +317,7 @@ def run(
  get_in(runtime, "spec.build.origin_filename", origin_file)
  )
  if kfp:
- print(f"code:\n{code}\n")
+ print(f"Code:\n{code}\n")
  suffix = pathlib.Path(url_file).suffix if url else ".py"

  # * is a placeholder for the url file when we want to use url args and let mlrun resolve the url file
@@ -340,7 +340,7 @@ def run(
  url = f"bash {url_file} {url_args}".strip()
  else:
  print(
- "error, command must be specified with '{codefile}' in it "
+ "Error: command must be specified with '{codefile}' in it "
  "(to determine the position of the code file)"
  )
  exit(1)
@@ -365,8 +365,9 @@ def run(

  if run_args:
  update_in(runtime, "spec.args", list(run_args))
- if image:
- update_in(runtime, "spec.image", image)
+
+ update_in(runtime, "spec.image", image or "mlrun/mlrun", replace=bool(image))
+
  set_item(runobj.spec, handler, "handler")
  set_item(runobj.spec, param, "parameters", fill_params(param))

@@ -427,7 +428,7 @@ def run(
  if resp and dump:
  print(resp.to_yaml())
  except RunError as err:
- print(f"runtime error: {err_to_str(err)}")
+ print(f"Runtime error: {err_to_str(err)}")
  exit(1)


@@ -499,7 +500,7 @@ def build(
  if runtime:
  runtime = py_eval(runtime)
  if not isinstance(runtime, dict):
- print(f"runtime parameter must be a dict, not {type(runtime)}")
+ print(f"Runtime parameter must be a dict, not {type(runtime)}")
  exit(1)
  if kfp:
  print("Runtime:")
@@ -514,7 +515,7 @@ def build(
  func = import_function(func_url)

  else:
- print("please specify the function path or url")
+ print("Error: Function path or url are required")
  exit(1)

  meta = func.metadata
@@ -531,12 +532,12 @@ def build(

  if source.endswith(".py"):
  if not path.isfile(source):
- print(f"source file doesnt exist ({source})")
+ print(f"Source file doesnt exist ({source})")
  exit(1)
  with open(source) as fp:
  body = fp.read()
  based = b64encode(body.encode("utf-8")).decode("utf-8")
- logger.info(f"packing code at {source}")
+ logger.info(f"Packing code at {source}")
  b.functionSourceCode = based
  func.spec.command = ""
  else:
@@ -562,13 +563,13 @@ def build(
  )

  if hasattr(func, "deploy"):
- logger.info("remote deployment started")
+ logger.info("Remote deployment started")
  try:
  func.deploy(
  with_mlrun=with_mlrun, watch=not silent, is_kfp=kfp, skip_deployed=skip
  )
  except Exception as err:
- print(f"deploy error, {err_to_str(err)}")
+ print(f"Deploy error, {err_to_str(err)}")
  exit(1)

  state = func.status.state
@@ -583,9 +584,9 @@ def build(
  fp.write(full_image)
  print("full image path = ", full_image)

- print(f"function built, state={state} image={image}")
+ print(f"Function built, state={state} image={image}")
  else:
- print("function does not have a deploy() method")
+ print("Function does not have a deploy() method")
  exit(1)


@@ -644,7 +645,7 @@ def deploy(
  else:
  runtime = {}
  if not isinstance(runtime, dict):
- print(f"runtime parameter must be a dict, not {type(runtime)}")
+ print(f"Runtime parameter must be a dict, not {type(runtime)}")
  exit(1)

  if verbose:
@@ -682,7 +683,7 @@ def deploy(
  print(f"deploy error: {err_to_str(err)}")
  exit(1)

- print(f"function deployed, address={addr}")
+ print(f"Function deployed, address={addr}")
  with open("/tmp/output", "w") as fp:
  fp.write(addr)
  with open("/tmp/name", "w") as fp:
@@ -715,7 +716,7 @@ def get(kind, name, selector, namespace, uid, project, tag, db, extra_args):
  if db:
  mlconf.dbpath = db
  if not project:
- print("warning, project parameter was not specified using default !")
+ print("Warning, project parameter was not specified using default !")
  if kind.startswith("po"):
  print("Unsupported, use 'get runtimes' instead")
  return
@@ -793,7 +794,7 @@ def get(kind, name, selector, namespace, uid, project, tag, db, extra_args):
  elif kind.startswith("workflow"):
  run_db = get_run_db()
  if project == "*":
- print("warning, reading workflows for all projects may take a long time !")
+ print("Warning, reading workflows for all projects may take a long time !")
  pipelines = run_db.list_pipelines(project=project, page_size=200)
  pipe_runs = pipelines.runs
  while pipelines.next_page_token is not None:
@@ -820,7 +821,7 @@ def get(kind, name, selector, namespace, uid, project, tag, db, extra_args):

  else:
  print(
- "currently only get runs | runtimes | workflows | artifacts | func [name] | runtime are supported"
+ "Currently only get runs | runtimes | workflows | artifacts | func [name] | runtime are supported"
  )


@@ -905,7 +906,7 @@ def db(
  )
  pid = child.pid
  print(
- f"background pid: {pid}, logs written to mlrun-stdout.log and mlrun-stderr.log, use:\n"
+ f"Background pid: {pid}, logs written to mlrun-stdout.log and mlrun-stderr.log, use:\n"
  f"`kill {pid}` (linux/mac) or `taskkill /pid {pid} /t /f` (windows), to kill the mlrun service process"
  )
  else:
@@ -923,7 +924,7 @@ def db(
  dotenv.set_key(filename, "MLRUN_MOCK_NUCLIO_DEPLOYMENT", "auto", quote_mode="")
  if pid:
  dotenv.set_key(filename, "MLRUN_SERVICE_PID", str(pid), quote_mode="")
- print(f"updated configuration in {update_env} .env file")
+ print(f"Updated configuration in {update_env} .env file")


  @main.command()
@@ -951,7 +952,7 @@ def logs(uid, project, offset, db, watch):
  print(text.decode())

  if state:
- print(f"final state: {state}")
+ print(f"Final state: {state}")


  @main.command()
@@ -1119,7 +1120,7 @@ def project(
  if arguments:
  args = fill_params(arguments)

- print(f"running workflow {run} file: {workflow_path}")
+ print(f"Running workflow {run} file: {workflow_path}")
  gitops = (
  git_issue
  or environ.get("GITHUB_EVENT_PATH")
@@ -1158,7 +1159,7 @@ def project(
  exit(1)

  elif sync:
- print("saving project functions to db ..")
+ print("Saving project functions to db ..")
  proj.sync_functions(save=True)


@@ -1295,7 +1296,7 @@ def show_or_set_config(
  if not op or op == "get":
  # print out the configuration (default or based on the specified env/api)
  if env_file and not path.isfile(path.expanduser(env_file)):
- print(f"error, env file {env_file} does not exist")
+ print(f"Error: Env file {env_file} does not exist")
  exit(1)
  if env_file or api:
  mlrun.set_environment(
@@ -1315,7 +1316,7 @@ def show_or_set_config(
  f".env file {filename} not found, creating new and setting configuration"
  )
  else:
- print(f"updating configuration in .env file {filename}")
+ print(f"Updating configuration in .env file {filename}")
  env_dict = {
  "MLRUN_DBPATH": api,
  "MLRUN_ARTIFACT_PATH": artifact_path,
@@ -1331,7 +1332,7 @@ def show_or_set_config(
  if env_file:
  # if its not the default file print the usage details
  print(
- f"to use the {env_file} .env file add the following to your development environment:\n"
+ f"To use the {env_file} .env file add the following to your development environment:\n"
  f"MLRUN_ENV_FILE={env_file}"
  )

@@ -1340,11 +1341,11 @@ def show_or_set_config(
  if not path.isfile(filename):
  print(f".env file {filename} not found")
  else:
- print(f"deleting .env file {filename}")
+ print(f"Deleting .env file {filename}")
  remove(filename)

  else:
- print(f"Error, unsupported config option {op}")
+ print(f"Error: Unsupported config option {op}")


  def fill_params(params, params_dict=None):
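
Note on the --image changes above: the CLI default is now empty, so an image already present on the runtime spec is no longer overwritten, and "mlrun/mlrun" is applied only as a last resort (see the image or "mlrun/mlrun" fallback and update_in(..., replace=bool(image)) in the hunks). A minimal sketch of that resolution order in plain Python, illustrative only (resolve_image is a hypothetical helper, not part of mlrun):

    def resolve_image(cli_image: str, runtime_spec_image: str = "") -> str:
        # An explicit --image always wins.
        if cli_image:
            return cli_image
        # Otherwise keep whatever image the runtime spec already carries.
        if runtime_spec_image:
            return runtime_spec_image
        # Only when nothing was provided does the CLI fall back to the default image.
        return "mlrun/mlrun"

    assert resolve_image("") == "mlrun/mlrun"
    assert resolve_image("", "myrepo/custom:latest") == "myrepo/custom:latest"
    assert resolve_image("myrepo/cli:1.0", "myrepo/custom:latest") == "myrepo/cli:1.0"
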
mlrun/common/schemas/auth.py CHANGED
@@ -59,6 +59,7 @@ class AuthorizationResourceTypes(mlrun.common.types.StrEnum):
  hub_source = "hub-source"
  workflow = "workflow"
  datastore_profile = "datastore-profile"
+ api_gateways = "api-gateways"

  def to_resource_string(
  self,
@@ -94,6 +95,7 @@ class AuthorizationResourceTypes(mlrun.common.types.StrEnum):
  AuthorizationResourceTypes.hub_source: "/marketplace/sources",
  # workflow define how to run a pipeline and can be considered as the specification of a pipeline.
  AuthorizationResourceTypes.workflow: "/projects/{project_name}/workflows/{resource_name}",
+ AuthorizationResourceTypes.api_gateways: "/projects/{project_name}/api-gateways",
  }[self].format(project_name=project_name, resource_name=resource_name)


mlrun/common/schemas/workflow.py CHANGED
@@ -16,6 +16,7 @@ import typing

  import pydantic

+ from .notification import Notification
  from .schedule import ScheduleCronTrigger


@@ -40,6 +41,7 @@ class WorkflowRequest(pydantic.BaseModel):
  source: typing.Optional[str] = None
  run_name: typing.Optional[str] = None
  namespace: typing.Optional[str] = None
+ notifications: typing.Optional[typing.List[Notification]] = None


  class WorkflowResponse(pydantic.BaseModel):
mlrun/config.py CHANGED
@@ -462,7 +462,7 @@ default_config = {
  "default_http_sink_app": "http://nuclio-{project}-{application_name}.mlrun.svc.cluster.local:8080",
  "batch_processing_function_branch": "master",
  "parquet_batching_max_events": 10_000,
- "parquet_batching_timeout_secs": timedelta(minutes=30).total_seconds(),
+ "parquet_batching_timeout_secs": timedelta(minutes=1).total_seconds(),
  # See mlrun.model_monitoring.stores.ModelEndpointStoreType for available options
  "store_type": "v3io-nosql",
  "endpoint_store_connection": "",
@@ -1016,9 +1016,9 @@ class Config:
  mock_nuclio = not mlrun.mlconf.is_nuclio_detected()
  return True if mock_nuclio and force_mock is None else force_mock

- def get_v3io_access_key(self):
+ def get_v3io_access_key(self) -> typing.Optional[str]:
  # Get v3io access key from the environment
- return os.environ.get("V3IO_ACCESS_KEY")
+ return os.getenv("V3IO_ACCESS_KEY")

  def get_model_monitoring_file_target_path(
  self,
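
Note: get_v3io_access_key() is now annotated as Optional[str]; it only reads V3IO_ACCESS_KEY from the process environment, so callers should expect None when the variable is unset. A short usage sketch, assuming a client environment where mlrun is importable (the error handling is illustrative):

    import mlrun

    # Returns the value of V3IO_ACCESS_KEY, or None when it is not set.
    access_key = mlrun.mlconf.get_v3io_access_key()
    if access_key is None:
        raise RuntimeError("V3IO_ACCESS_KEY is not set in the environment")
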
mlrun/datastore/base.py CHANGED
@@ -49,6 +49,8 @@ class FileStats:


  class DataStore:
+ using_bucket = False
+
  def __init__(self, parent, name, kind, endpoint="", secrets: dict = None):
  self._parent = parent
  self.kind = kind
@@ -303,7 +305,9 @@ class DataStore:
  storage_options = self.get_storage_options()
  if url.startswith("ds://"):
  parsed_url = urllib.parse.urlparse(url)
- url = parsed_url.path[1:]
+ url = parsed_url.path
+ if self.using_bucket:
+ url = url[1:]
  # Pass the underlying file system
  kwargs["filesystem"] = file_system
  elif storage_options:
@@ -707,7 +711,7 @@ class HttpStore(DataStore):
  # As an example, it converts an S3 URL 's3://s3bucket/path' to just 's3bucket/path'.
  # Since 'ds' schemas are not inherently processed by fsspec, we have adapted the _strip_protocol()
  # method specifically to strip away the 'ds' schema as required.
- def makeDatastoreSchemaSanitizer(cls, *args, **kwargs):
+ def makeDatastoreSchemaSanitizer(cls, using_bucket=False, *args, **kwargs):
  if not issubclass(cls, fsspec.AbstractFileSystem):
  raise ValueError("Class must be a subclass of fsspec.AbstractFileSystem")

@@ -716,7 +720,9 @@ def makeDatastoreSchemaSanitizer(cls, *args, **kwargs):
  def _strip_protocol(cls, url):
  if url.startswith("ds://"):
  parsed_url = urlparse(url)
- url = parsed_url.path[1:]
+ url = parsed_url.path
+ if using_bucket:
+ url = url[1:]
  return super()._strip_protocol(url)

  return DatastoreSchemaSanitizer(*args, **kwargs)
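
Note: the new using_bucket flag decides whether the leading "/" is stripped from the path of a ds:// URL. Bucket-based stores such as S3 expect "bucket/path" with no leading slash, while non-bucket stores such as DBFS keep the absolute path. A standalone sketch of that stripping logic, using urllib directly rather than the mlrun classes (the example profile names and paths are assumptions):

    from urllib.parse import urlparse

    def strip_ds_url(url: str, using_bucket: bool) -> str:
        # Mirrors the _strip_protocol behaviour above for "ds://profile/..." URLs.
        if url.startswith("ds://"):
            path = urlparse(url).path  # e.g. "/my-bucket/data/file.parquet"
            return path[1:] if using_bucket else path
        return url

    # Bucket-style stores (S3) drop the leading slash, non-bucket stores (DBFS) keep it.
    print(strip_ds_url("ds://s3_profile/my-bucket/data/file.parquet", using_bucket=True))   # my-bucket/data/file.parquet
    print(strip_ds_url("ds://dbfs_profile/data/file.parquet", using_bucket=False))          # /data/file.parquet
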
mlrun/datastore/datastore.py CHANGED
@@ -194,18 +194,18 @@ class StoreManager:

  if schema == "ds":
  profile_name = endpoint
- datastore = TemporaryClientDatastoreProfiles().get(profile_name)
- if not datastore:
+ datastore_profile = TemporaryClientDatastoreProfiles().get(profile_name)
+ if not datastore_profile:
  project_name = urlparse(url).username or mlrun.mlconf.default_project
- datastore = mlrun.db.get_run_db(
+ datastore_profile = mlrun.db.get_run_db(
  secrets=self._secrets
  ).get_datastore_profile(profile_name, project_name)

- if secrets and datastore.secrets():
- secrets = merge(secrets, datastore.secrets())
+ if secrets and datastore_profile.secrets():
+ secrets = merge(secrets, datastore_profile.secrets())
  else:
- secrets = secrets or datastore.secrets()
- url = datastore.url(subpath)
+ secrets = secrets or datastore_profile.secrets()
+ url = datastore_profile.url(subpath)
  schema, endpoint, parsed_url = parse_url(url)
  subpath = parsed_url.path

@@ -233,3 +233,6 @@ class StoreManager:
  self._stores[store_key] = store
  # in file stores in windows path like c:\a\b the drive letter is dropped from the path, so we return the url
  return store, url if store.kind == "file" else subpath
+
+ def reset_secrets(self):
+ self._secrets = {}
mlrun/datastore/datastore_profile.py CHANGED
@@ -48,11 +48,9 @@ class DatastoreProfile(pydantic.BaseModel):
  )
  return full_key

- @classmethod
  def secrets(self) -> dict:
  return None

- @classmethod
  def url(self, subpath) -> str:
  return None

@@ -204,6 +202,24 @@ class DatastoreProfileRedis(DatastoreProfile):
  return self.endpoint_url + subpath


+ class DatastoreProfileDBFS(DatastoreProfile):
+ type: str = pydantic.Field("dbfs")
+ _private_attributes = ("token",)
+ endpoint_url: typing.Optional[str] = None # host
+ token: typing.Optional[str] = None
+
+ def url(self, subpath) -> str:
+ return f"dbfs://{subpath}"
+
+ def secrets(self) -> dict:
+ res = {}
+ if self.token:
+ res["DATABRICKS_TOKEN"] = self.token
+ if self.endpoint_url:
+ res["DATABRICKS_HOST"] = self.endpoint_url
+ return res if res else None
+
+
  class DatastoreProfile2Json(pydantic.BaseModel):
  @staticmethod
  def _to_json(attributes):
@@ -260,6 +276,7 @@ class DatastoreProfile2Json(pydantic.BaseModel):
  "basic": DatastoreProfileBasic,
  "kafka_target": DatastoreProfileKafkaTarget,
  "kafka_source": DatastoreProfileKafkaSource,
+ "dbfs": DatastoreProfileDBFS,
  }
  if datastore_type in ds_profile_factory:
  return ds_profile_factory[datastore_type].parse_obj(decoded_dict)
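
Note: the new DatastoreProfileDBFS carries the Databricks host and token as secrets and maps a ds:// profile URL onto dbfs://. A hedged usage sketch; the name field and the register_temporary_client_datastore_profile helper are assumed from the existing profile API and may differ:

    import os
    from mlrun.datastore.datastore_profile import (
        DatastoreProfileDBFS,
        register_temporary_client_datastore_profile,  # assumed client-side registration helper
    )

    # Assumed example values; endpoint_url/token become the DATABRICKS_HOST/DATABRICKS_TOKEN secrets.
    profile = DatastoreProfileDBFS(
        name="dbfs-lab",  # assumed field inherited from the base profile
        endpoint_url=os.environ["DATABRICKS_HOST"],
        token=os.environ["DATABRICKS_TOKEN"],
    )
    register_temporary_client_datastore_profile(profile)

    # Data can then be addressed as ds://dbfs-lab/<path>, which the profile resolves to dbfs://<path>.
    url = "ds://dbfs-lab/mnt/data/iris.parquet"
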
mlrun/datastore/dbfs_store.py CHANGED
@@ -14,12 +14,11 @@

  import pathlib

- import fsspec
  from fsspec.implementations.dbfs import DatabricksFile, DatabricksFileSystem

  import mlrun.errors

- from .base import DataStore, FileStats
+ from .base import DataStore, FileStats, makeDatastoreSchemaSanitizer


  class DatabricksFileBugFixed(DatabricksFile):
@@ -83,15 +82,16 @@ class DatabricksFileSystemDisableCache(DatabricksFileSystem):
  class DBFSStore(DataStore):
  def __init__(self, parent, schema, name, endpoint="", secrets: dict = None):
  super().__init__(parent, name, schema, endpoint, secrets=secrets)
- if not endpoint:
- endpoint = self._get_secret_or_env("DATABRICKS_HOST")
- self.endpoint = endpoint
  self.get_filesystem(silent=False)

  def get_filesystem(self, silent=True):
  """return fsspec file system object, if supported"""
  if not self._filesystem:
- self._filesystem = fsspec.filesystem("dbfs", **self.get_storage_options())
+ self._filesystem = makeDatastoreSchemaSanitizer(
+ cls=DatabricksFileSystemDisableCache,
+ using_bucket=False,
+ **self.get_storage_options(),
+ )
  return self._filesystem

  def get_storage_options(self):
mlrun/datastore/s3.py CHANGED
@@ -22,6 +22,8 @@ from .base import DataStore, FileStats, get_range, makeDatastoreSchemaSanitizer


  class S3Store(DataStore):
+ using_bucket = True
+
  def __init__(self, parent, schema, name, endpoint="", secrets: dict = None):
  super().__init__(parent, name, schema, endpoint, secrets)
  # will be used in case user asks to assume a role and work through fsspec
@@ -108,7 +110,9 @@ class S3Store(DataStore):
  return None

  self._filesystem = makeDatastoreSchemaSanitizer(
- s3fs.S3FileSystem, **self.get_storage_options()
+ s3fs.S3FileSystem,
+ using_bucket=self.using_bucket,
+ **self.get_storage_options(),
  )
  return self._filesystem

@@ -173,7 +177,7 @@ class S3Store(DataStore):
  if not key.endswith("/"):
  key += "/"
  # Object names is S3 are not fully following filesystem semantics - they do not start with /, even for
- # "absolute paths". Therefore, we are are removing leading / from path filter.
+ # "absolute paths". Therefore, we are removing leading / from path filter.
  if key.startswith("/"):
  key = key[1:]
  key_length = len(key)
mlrun/datastore/sources.py CHANGED
@@ -177,9 +177,14 @@ class CSVSource(BaseSourceDriver):
  parse_dates.append(time_field)

  data_item = mlrun.store_manager.object(self.path)
+ if self.path.startswith("ds://"):
+ store, path = mlrun.store_manager.get_or_create_store(self.path)
+ path = store.url + path
+ else:
+ path = data_item.url

  return storey.CSVSource(
- paths=data_item.url, # unlike self.path, it already has store:// replaced
+ paths=path, # unlike self.path, it already has store:// replaced
  build_dict=True,
  key_field=self.key_field or key_field,
  storage_options=data_item.store.get_storage_options(),
@@ -323,9 +328,14 @@ class ParquetSource(BaseSourceDriver):
  attributes["context"] = context

  data_item = mlrun.store_manager.object(self.path)
+ if self.path.startswith("ds://"):
+ store, path = mlrun.store_manager.get_or_create_store(self.path)
+ path = store.url + path
+ else:
+ path = data_item.url

  return storey.ParquetSource(
- paths=data_item.url, # unlike self.path, it already has store:// replaced
+ paths=path, # unlike self.path, it already has store:// replaced
  key_field=self.key_field or key_field,
  storage_options=data_item.store.get_storage_options(),
  end_filter=self.end_time,
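
Note: both source changes above resolve a ds:// path to the concrete URL of the backing store before handing it to storey, rather than relying on data_item.url. A sketch of that resolution step in isolation (the ds://s3_profile/... path and the resulting s3:// form are assumed examples):

    import mlrun

    path = "ds://s3_profile/my-bucket/data/measurements.csv"  # assumed profile-based source path

    if path.startswith("ds://"):
        # get_or_create_store() returns the backing store plus the in-store path,
        # so store.url + sub_path yields the real URL, e.g. s3://my-bucket/data/measurements.csv.
        store, sub_path = mlrun.store_manager.get_or_create_store(path)
        resolved = store.url + sub_path
    else:
        resolved = mlrun.store_manager.object(path).url
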