mlrun 1.6.0rc6__py3-none-any.whl → 1.6.0rc8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release. This version of mlrun might be problematic.
- mlrun/__main__.py +32 -31
- mlrun/common/schemas/auth.py +2 -0
- mlrun/common/schemas/workflow.py +2 -0
- mlrun/config.py +3 -3
- mlrun/datastore/base.py +9 -3
- mlrun/datastore/datastore.py +10 -7
- mlrun/datastore/datastore_profile.py +19 -2
- mlrun/datastore/dbfs_store.py +6 -6
- mlrun/datastore/s3.py +6 -2
- mlrun/datastore/sources.py +12 -2
- mlrun/datastore/targets.py +43 -20
- mlrun/db/httpdb.py +22 -0
- mlrun/feature_store/feature_set.py +5 -2
- mlrun/feature_store/retrieval/spark_merger.py +7 -1
- mlrun/kfpops.py +1 -1
- mlrun/launcher/client.py +1 -6
- mlrun/launcher/remote.py +5 -3
- mlrun/model.py +2 -2
- mlrun/model_monitoring/batch_application.py +61 -94
- mlrun/package/packager.py +115 -89
- mlrun/package/packagers/default_packager.py +66 -65
- mlrun/package/packagers/numpy_packagers.py +109 -62
- mlrun/package/packagers/pandas_packagers.py +12 -23
- mlrun/package/packagers/python_standard_library_packagers.py +35 -57
- mlrun/package/packagers_manager.py +16 -13
- mlrun/package/utils/_pickler.py +8 -18
- mlrun/package/utils/_supported_format.py +1 -1
- mlrun/projects/pipelines.py +63 -4
- mlrun/projects/project.py +34 -11
- mlrun/runtimes/__init__.py +6 -0
- mlrun/runtimes/base.py +12 -1
- mlrun/runtimes/daskjob.py +73 -5
- mlrun/runtimes/databricks_job/databricks_runtime.py +2 -0
- mlrun/runtimes/function.py +53 -4
- mlrun/runtimes/kubejob.py +1 -1
- mlrun/runtimes/local.py +9 -9
- mlrun/runtimes/pod.py +1 -1
- mlrun/runtimes/remotesparkjob.py +1 -0
- mlrun/runtimes/serving.py +11 -1
- mlrun/runtimes/sparkjob/spark3job.py +4 -1
- mlrun/runtimes/utils.py +1 -46
- mlrun/utils/helpers.py +1 -17
- mlrun/utils/notifications/notification_pusher.py +27 -6
- mlrun/utils/version/version.json +2 -2
- {mlrun-1.6.0rc6.dist-info → mlrun-1.6.0rc8.dist-info}/METADATA +7 -6
- {mlrun-1.6.0rc6.dist-info → mlrun-1.6.0rc8.dist-info}/RECORD +50 -50
- {mlrun-1.6.0rc6.dist-info → mlrun-1.6.0rc8.dist-info}/WHEEL +1 -1
- {mlrun-1.6.0rc6.dist-info → mlrun-1.6.0rc8.dist-info}/LICENSE +0 -0
- {mlrun-1.6.0rc6.dist-info → mlrun-1.6.0rc8.dist-info}/entry_points.txt +0 -0
- {mlrun-1.6.0rc6.dist-info → mlrun-1.6.0rc8.dist-info}/top_level.txt +0 -0
mlrun/__main__.py
CHANGED

@@ -152,7 +152,7 @@ def main():
 @click.option("--schedule", help="cron schedule")
 @click.option("--from-env", is_flag=True, help="read the spec from the env var")
 @click.option("--dump", is_flag=True, help="dump run results as YAML")
-@click.option("--image", default="
+@click.option("--image", default="", help="container image (defaults to mlrun/mlrun)")
 @click.option("--kind", default="", help="serverless runtime kind")
 @click.option("--source", default="", help="source code archive/git")
 @click.option("--local", is_flag=True, help="run the task locally (ignore runtime)")
@@ -289,7 +289,7 @@ def run(
         exit(1)
     else:
         kind = kind or "job"
-        runtime = {"kind": kind, "spec": {"image": image}}
+        runtime = {"kind": kind, "spec": {"image": image or "mlrun/mlrun"}}

     if kind not in ["", "local", "dask"] and url:
         if url_file and path.isfile(url_file):
@@ -303,7 +303,7 @@ def run(
     elif runtime:
         runtime = py_eval(runtime)
         if not isinstance(runtime, dict):
-            print(f"
+            print(f"Runtime parameter must be a dict, not {type(runtime)}")
             exit(1)
     else:
         runtime = {}
@@ -317,7 +317,7 @@ def run(
             get_in(runtime, "spec.build.origin_filename", origin_file)
         )
         if kfp:
-            print(f"
+            print(f"Code:\n{code}\n")
         suffix = pathlib.Path(url_file).suffix if url else ".py"

     # * is a placeholder for the url file when we want to use url args and let mlrun resolve the url file
@@ -340,7 +340,7 @@ def run(
             url = f"bash {url_file} {url_args}".strip()
         else:
             print(
-                "
+                "Error: command must be specified with '{codefile}' in it "
                 "(to determine the position of the code file)"
             )
             exit(1)
@@ -365,8 +365,9 @@ def run(

     if run_args:
         update_in(runtime, "spec.args", list(run_args))
-
-
+
+    update_in(runtime, "spec.image", image or "mlrun/mlrun", replace=bool(image))
+
     set_item(runobj.spec, handler, "handler")
     set_item(runobj.spec, param, "parameters", fill_params(param))

@@ -427,7 +428,7 @@ def run(
         if resp and dump:
             print(resp.to_yaml())
     except RunError as err:
-        print(f"
+        print(f"Runtime error: {err_to_str(err)}")
         exit(1)


@@ -499,7 +500,7 @@ def build(
     if runtime:
         runtime = py_eval(runtime)
         if not isinstance(runtime, dict):
-            print(f"
+            print(f"Runtime parameter must be a dict, not {type(runtime)}")
             exit(1)
         if kfp:
             print("Runtime:")
@@ -514,7 +515,7 @@ def build(
         func = import_function(func_url)

     else:
-        print("
+        print("Error: Function path or url are required")
         exit(1)

     meta = func.metadata
@@ -531,12 +532,12 @@ def build(

     if source.endswith(".py"):
         if not path.isfile(source):
-            print(f"
+            print(f"Source file doesnt exist ({source})")
             exit(1)
         with open(source) as fp:
             body = fp.read()
         based = b64encode(body.encode("utf-8")).decode("utf-8")
-        logger.info(f"
+        logger.info(f"Packing code at {source}")
         b.functionSourceCode = based
         func.spec.command = ""
     else:
@@ -562,13 +563,13 @@ def build(
     )

     if hasattr(func, "deploy"):
-        logger.info("
+        logger.info("Remote deployment started")
         try:
             func.deploy(
                 with_mlrun=with_mlrun, watch=not silent, is_kfp=kfp, skip_deployed=skip
             )
         except Exception as err:
-            print(f"
+            print(f"Deploy error, {err_to_str(err)}")
             exit(1)

         state = func.status.state
@@ -583,9 +584,9 @@ def build(
             fp.write(full_image)
         print("full image path = ", full_image)

-        print(f"
+        print(f"Function built, state={state} image={image}")
     else:
-        print("
+        print("Function does not have a deploy() method")
         exit(1)


@@ -644,7 +645,7 @@ def deploy(
     else:
         runtime = {}
     if not isinstance(runtime, dict):
-        print(f"
+        print(f"Runtime parameter must be a dict, not {type(runtime)}")
         exit(1)

     if verbose:
@@ -682,7 +683,7 @@ def deploy(
         print(f"deploy error: {err_to_str(err)}")
         exit(1)

-    print(f"
+    print(f"Function deployed, address={addr}")
     with open("/tmp/output", "w") as fp:
         fp.write(addr)
     with open("/tmp/name", "w") as fp:
@@ -715,7 +716,7 @@ def get(kind, name, selector, namespace, uid, project, tag, db, extra_args):
     if db:
         mlconf.dbpath = db
     if not project:
-        print("
+        print("Warning, project parameter was not specified using default !")
     if kind.startswith("po"):
         print("Unsupported, use 'get runtimes' instead")
         return
@@ -793,7 +794,7 @@ def get(kind, name, selector, namespace, uid, project, tag, db, extra_args):
     elif kind.startswith("workflow"):
         run_db = get_run_db()
         if project == "*":
-            print("
+            print("Warning, reading workflows for all projects may take a long time !")
         pipelines = run_db.list_pipelines(project=project, page_size=200)
         pipe_runs = pipelines.runs
         while pipelines.next_page_token is not None:
@@ -820,7 +821,7 @@ def get(kind, name, selector, namespace, uid, project, tag, db, extra_args):

     else:
         print(
-            "
+            "Currently only get runs | runtimes | workflows | artifacts | func [name] | runtime are supported"
         )


@@ -905,7 +906,7 @@ def db(
         )
         pid = child.pid
         print(
-            f"
+            f"Background pid: {pid}, logs written to mlrun-stdout.log and mlrun-stderr.log, use:\n"
            f"`kill {pid}` (linux/mac) or `taskkill /pid {pid} /t /f` (windows), to kill the mlrun service process"
         )
     else:
@@ -923,7 +924,7 @@ def db(
     dotenv.set_key(filename, "MLRUN_MOCK_NUCLIO_DEPLOYMENT", "auto", quote_mode="")
     if pid:
         dotenv.set_key(filename, "MLRUN_SERVICE_PID", str(pid), quote_mode="")
-    print(f"
+    print(f"Updated configuration in {update_env} .env file")


 @main.command()
@@ -951,7 +952,7 @@ def logs(uid, project, offset, db, watch):
         print(text.decode())

     if state:
-        print(f"
+        print(f"Final state: {state}")


 @main.command()
@@ -1119,7 +1120,7 @@ def project(
     if arguments:
         args = fill_params(arguments)

-    print(f"
+    print(f"Running workflow {run} file: {workflow_path}")
     gitops = (
         git_issue
         or environ.get("GITHUB_EVENT_PATH")
@@ -1158,7 +1159,7 @@ def project(
             exit(1)

     elif sync:
-        print("
+        print("Saving project functions to db ..")
         proj.sync_functions(save=True)


@@ -1295,7 +1296,7 @@ def show_or_set_config(
     if not op or op == "get":
         # print out the configuration (default or based on the specified env/api)
         if env_file and not path.isfile(path.expanduser(env_file)):
-            print(f"
+            print(f"Error: Env file {env_file} does not exist")
             exit(1)
         if env_file or api:
             mlrun.set_environment(
@@ -1315,7 +1316,7 @@ def show_or_set_config(
                 f".env file {filename} not found, creating new and setting configuration"
             )
         else:
-            print(f"
+            print(f"Updating configuration in .env file {filename}")
         env_dict = {
             "MLRUN_DBPATH": api,
             "MLRUN_ARTIFACT_PATH": artifact_path,
@@ -1331,7 +1332,7 @@ def show_or_set_config(
         if env_file:
             # if its not the default file print the usage details
             print(
-                f"
+                f"To use the {env_file} .env file add the following to your development environment:\n"
                 f"MLRUN_ENV_FILE={env_file}"
             )

@@ -1340,11 +1341,11 @@ def show_or_set_config(
         if not path.isfile(filename):
             print(f".env file {filename} not found")
         else:
-            print(f"
+            print(f"Deleting .env file {filename}")
             remove(filename)

     else:
-        print(f"Error
+        print(f"Error: Unsupported config option {op}")


 def fill_params(params, params_dict=None):
mlrun/common/schemas/auth.py
CHANGED

@@ -59,6 +59,7 @@ class AuthorizationResourceTypes(mlrun.common.types.StrEnum):
     hub_source = "hub-source"
     workflow = "workflow"
     datastore_profile = "datastore-profile"
+    api_gateways = "api-gateways"

     def to_resource_string(
         self,
@@ -94,6 +95,7 @@ class AuthorizationResourceTypes(mlrun.common.types.StrEnum):
             AuthorizationResourceTypes.hub_source: "/marketplace/sources",
             # workflow define how to run a pipeline and can be considered as the specification of a pipeline.
             AuthorizationResourceTypes.workflow: "/projects/{project_name}/workflows/{resource_name}",
+            AuthorizationResourceTypes.api_gateways: "/projects/{project_name}/api-gateways",
         }[self].format(project_name=project_name, resource_name=resource_name)
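For orientation, a hedged sketch of how the new enum member resolves to an authorization path; only the format string and the `project_name`/`resource_name` keywords are visible in this diff, so the exact call signature is an assumption:

# Illustrative only: resolving the new api_gateways resource type.
from mlrun.common.schemas.auth import AuthorizationResourceTypes

path = AuthorizationResourceTypes.api_gateways.to_resource_string(
    project_name="my-project", resource_name=""  # resource_name is unused by this format string
)
# expected: "/projects/my-project/api-gateways"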
mlrun/common/schemas/workflow.py
CHANGED

@@ -16,6 +16,7 @@ import typing

 import pydantic

+from .notification import Notification
 from .schedule import ScheduleCronTrigger


@@ -40,6 +41,7 @@ class WorkflowRequest(pydantic.BaseModel):
     source: typing.Optional[str] = None
     run_name: typing.Optional[str] = None
     namespace: typing.Optional[str] = None
+    notifications: typing.Optional[typing.List[Notification]] = None


 class WorkflowResponse(pydantic.BaseModel):
mlrun/config.py
CHANGED

@@ -462,7 +462,7 @@ default_config = {
     "default_http_sink_app": "http://nuclio-{project}-{application_name}.mlrun.svc.cluster.local:8080",
     "batch_processing_function_branch": "master",
     "parquet_batching_max_events": 10_000,
-    "parquet_batching_timeout_secs": timedelta(minutes=
+    "parquet_batching_timeout_secs": timedelta(minutes=1).total_seconds(),
     # See mlrun.model_monitoring.stores.ModelEndpointStoreType for available options
     "store_type": "v3io-nosql",
     "endpoint_store_connection": "",
@@ -1016,9 +1016,9 @@ class Config:
         mock_nuclio = not mlrun.mlconf.is_nuclio_detected()
         return True if mock_nuclio and force_mock is None else force_mock

-    def get_v3io_access_key(self):
+    def get_v3io_access_key(self) -> typing.Optional[str]:
         # Get v3io access key from the environment
-        return os.
+        return os.getenv("V3IO_ACCESS_KEY")

     def get_model_monitoring_file_target_path(
         self,
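The `get_v3io_access_key` change makes the environment lookup explicit and adds a return-type annotation. A short usage sketch:

# Sketch: the method now returns the V3IO_ACCESS_KEY env var value, or None when unset.
import os

import mlrun

os.environ["V3IO_ACCESS_KEY"] = "example-token"  # illustrative value
assert mlrun.mlconf.get_v3io_access_key() == "example-token"

del os.environ["V3IO_ACCESS_KEY"]
assert mlrun.mlconf.get_v3io_access_key() is None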
mlrun/datastore/base.py
CHANGED

@@ -49,6 +49,8 @@ class FileStats:


 class DataStore:
+    using_bucket = False
+
     def __init__(self, parent, name, kind, endpoint="", secrets: dict = None):
         self._parent = parent
         self.kind = kind
@@ -303,7 +305,9 @@ class DataStore:
         storage_options = self.get_storage_options()
         if url.startswith("ds://"):
             parsed_url = urllib.parse.urlparse(url)
-            url = parsed_url.path
+            url = parsed_url.path
+            if self.using_bucket:
+                url = url[1:]
             # Pass the underlying file system
             kwargs["filesystem"] = file_system
         elif storage_options:
@@ -707,7 +711,7 @@ class HttpStore(DataStore):
 # As an example, it converts an S3 URL 's3://s3bucket/path' to just 's3bucket/path'.
 # Since 'ds' schemas are not inherently processed by fsspec, we have adapted the _strip_protocol()
 # method specifically to strip away the 'ds' schema as required.
-def makeDatastoreSchemaSanitizer(cls, *args, **kwargs):
+def makeDatastoreSchemaSanitizer(cls, using_bucket=False, *args, **kwargs):
     if not issubclass(cls, fsspec.AbstractFileSystem):
         raise ValueError("Class must be a subclass of fsspec.AbstractFileSystem")

@@ -716,7 +720,9 @@ def makeDatastoreSchemaSanitizer(cls, using_bucket=False, *args, **kwargs):
     def _strip_protocol(cls, url):
         if url.startswith("ds://"):
             parsed_url = urlparse(url)
-            url = parsed_url.path
+            url = parsed_url.path
+            if using_bucket:
+                url = url[1:]
         return super()._strip_protocol(url)

     return DatastoreSchemaSanitizer(*args, **kwargs)
mlrun/datastore/datastore.py
CHANGED

@@ -194,18 +194,18 @@ class StoreManager:

         if schema == "ds":
             profile_name = endpoint
-
-            if not
+            datastore_profile = TemporaryClientDatastoreProfiles().get(profile_name)
+            if not datastore_profile:
                 project_name = urlparse(url).username or mlrun.mlconf.default_project
-
+                datastore_profile = mlrun.db.get_run_db(
                     secrets=self._secrets
                 ).get_datastore_profile(profile_name, project_name)

-            if secrets and
-                secrets = merge(secrets,
+            if secrets and datastore_profile.secrets():
+                secrets = merge(secrets, datastore_profile.secrets())
             else:
-                secrets = secrets or
-                url =
+                secrets = secrets or datastore_profile.secrets()
+            url = datastore_profile.url(subpath)
             schema, endpoint, parsed_url = parse_url(url)
             subpath = parsed_url.path
@@ -233,3 +233,6 @@ class StoreManager:
         self._stores[store_key] = store
         # in file stores in windows path like c:\a\b the drive letter is dropped from the path, so we return the url
         return store, url if store.kind == "file" else subpath
+
+    def reset_secrets(self):
+        self._secrets = {}
mlrun/datastore/datastore_profile.py
CHANGED

@@ -48,11 +48,9 @@ class DatastoreProfile(pydantic.BaseModel):
         )
         return full_key

-    @classmethod
     def secrets(self) -> dict:
         return None

-    @classmethod
     def url(self, subpath) -> str:
         return None

@@ -204,6 +202,24 @@ class DatastoreProfileRedis(DatastoreProfile):
         return self.endpoint_url + subpath


+class DatastoreProfileDBFS(DatastoreProfile):
+    type: str = pydantic.Field("dbfs")
+    _private_attributes = ("token",)
+    endpoint_url: typing.Optional[str] = None  # host
+    token: typing.Optional[str] = None
+
+    def url(self, subpath) -> str:
+        return f"dbfs://{subpath}"
+
+    def secrets(self) -> dict:
+        res = {}
+        if self.token:
+            res["DATABRICKS_TOKEN"] = self.token
+        if self.endpoint_url:
+            res["DATABRICKS_HOST"] = self.endpoint_url
+        return res if res else None
+
+
 class DatastoreProfile2Json(pydantic.BaseModel):
     @staticmethod
     def _to_json(attributes):
@@ -260,6 +276,7 @@ class DatastoreProfile2Json(pydantic.BaseModel):
         "basic": DatastoreProfileBasic,
         "kafka_target": DatastoreProfileKafkaTarget,
         "kafka_source": DatastoreProfileKafkaSource,
+        "dbfs": DatastoreProfileDBFS,
     }
     if datastore_type in ds_profile_factory:
         return ds_profile_factory[datastore_type].parse_obj(decoded_dict)
mlrun/datastore/dbfs_store.py
CHANGED

@@ -14,12 +14,11 @@

 import pathlib

-import fsspec
 from fsspec.implementations.dbfs import DatabricksFile, DatabricksFileSystem

 import mlrun.errors

-from .base import DataStore, FileStats
+from .base import DataStore, FileStats, makeDatastoreSchemaSanitizer


 class DatabricksFileBugFixed(DatabricksFile):
@@ -83,15 +82,16 @@ class DatabricksFileSystemDisableCache(DatabricksFileSystem):
 class DBFSStore(DataStore):
     def __init__(self, parent, schema, name, endpoint="", secrets: dict = None):
         super().__init__(parent, name, schema, endpoint, secrets=secrets)
-        if not endpoint:
-            endpoint = self._get_secret_or_env("DATABRICKS_HOST")
-        self.endpoint = endpoint
         self.get_filesystem(silent=False)

     def get_filesystem(self, silent=True):
         """return fsspec file system object, if supported"""
         if not self._filesystem:
-            self._filesystem =
+            self._filesystem = makeDatastoreSchemaSanitizer(
+                cls=DatabricksFileSystemDisableCache,
+                using_bucket=False,
+                **self.get_storage_options(),
+            )
         return self._filesystem

     def get_storage_options(self):
mlrun/datastore/s3.py
CHANGED

@@ -22,6 +22,8 @@ from .base import DataStore, FileStats, get_range, makeDatastoreSchemaSanitizer


 class S3Store(DataStore):
+    using_bucket = True
+
     def __init__(self, parent, schema, name, endpoint="", secrets: dict = None):
         super().__init__(parent, name, schema, endpoint, secrets)
         # will be used in case user asks to assume a role and work through fsspec
@@ -108,7 +110,9 @@ class S3Store(DataStore):
             return None

         self._filesystem = makeDatastoreSchemaSanitizer(
-            s3fs.S3FileSystem,
+            s3fs.S3FileSystem,
+            using_bucket=self.using_bucket,
+            **self.get_storage_options(),
         )
         return self._filesystem

@@ -173,7 +177,7 @@ class S3Store(DataStore):
         if not key.endswith("/"):
             key += "/"
         # Object names is S3 are not fully following filesystem semantics - they do not start with /, even for
-        # "absolute paths". Therefore, we are
+        # "absolute paths". Therefore, we are removing leading / from path filter.
         if key.startswith("/"):
             key = key[1:]
         key_length = len(key)
mlrun/datastore/sources.py
CHANGED

@@ -177,9 +177,14 @@ class CSVSource(BaseSourceDriver):
         parse_dates.append(time_field)

         data_item = mlrun.store_manager.object(self.path)
+        if self.path.startswith("ds://"):
+            store, path = mlrun.store_manager.get_or_create_store(self.path)
+            path = store.url + path
+        else:
+            path = data_item.url

         return storey.CSVSource(
-            paths=
+            paths=path,  # unlike self.path, it already has store:// replaced
             build_dict=True,
             key_field=self.key_field or key_field,
             storage_options=data_item.store.get_storage_options(),
@@ -323,9 +328,14 @@ class ParquetSource(BaseSourceDriver):
         attributes["context"] = context

         data_item = mlrun.store_manager.object(self.path)
+        if self.path.startswith("ds://"):
+            store, path = mlrun.store_manager.get_or_create_store(self.path)
+            path = store.url + path
+        else:
+            path = data_item.url

         return storey.ParquetSource(
-            paths=
+            paths=path,  # unlike self.path, it already has store:// replaced
             key_field=self.key_field or key_field,
             storage_options=data_item.store.get_storage_options(),
             end_filter=self.end_time,
|