mlrun 1.4.0rc25__py3-none-any.whl → 1.5.0rc2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of mlrun might be problematic. Click here for more details.
- mlrun/__init__.py +2 -35
- mlrun/__main__.py +3 -41
- mlrun/api/api/api.py +6 -0
- mlrun/api/api/endpoints/feature_store.py +0 -4
- mlrun/api/api/endpoints/files.py +14 -2
- mlrun/api/api/endpoints/frontend_spec.py +2 -1
- mlrun/api/api/endpoints/functions.py +95 -59
- mlrun/api/api/endpoints/grafana_proxy.py +9 -9
- mlrun/api/api/endpoints/logs.py +17 -3
- mlrun/api/api/endpoints/model_endpoints.py +3 -2
- mlrun/api/api/endpoints/pipelines.py +1 -5
- mlrun/api/api/endpoints/projects.py +88 -0
- mlrun/api/api/endpoints/runs.py +48 -6
- mlrun/api/api/endpoints/submit.py +2 -1
- mlrun/api/api/endpoints/workflows.py +355 -0
- mlrun/api/api/utils.py +3 -4
- mlrun/api/crud/__init__.py +1 -0
- mlrun/api/crud/client_spec.py +6 -2
- mlrun/api/crud/feature_store.py +5 -0
- mlrun/api/crud/model_monitoring/__init__.py +1 -0
- mlrun/api/crud/model_monitoring/deployment.py +497 -0
- mlrun/api/crud/model_monitoring/grafana.py +96 -42
- mlrun/api/crud/model_monitoring/helpers.py +159 -0
- mlrun/api/crud/model_monitoring/model_endpoints.py +202 -476
- mlrun/api/crud/notifications.py +9 -4
- mlrun/api/crud/pipelines.py +6 -11
- mlrun/api/crud/projects.py +2 -2
- mlrun/api/crud/runtime_resources.py +4 -3
- mlrun/api/crud/runtimes/nuclio/helpers.py +5 -1
- mlrun/api/crud/secrets.py +21 -0
- mlrun/api/crud/workflows.py +352 -0
- mlrun/api/db/base.py +16 -1
- mlrun/api/db/init_db.py +2 -4
- mlrun/api/db/session.py +1 -1
- mlrun/api/db/sqldb/db.py +129 -31
- mlrun/api/db/sqldb/models/models_mysql.py +15 -1
- mlrun/api/db/sqldb/models/models_sqlite.py +16 -2
- mlrun/api/launcher.py +38 -6
- mlrun/api/main.py +3 -2
- mlrun/api/rundb/__init__.py +13 -0
- mlrun/{db → api/rundb}/sqldb.py +36 -84
- mlrun/api/runtime_handlers/__init__.py +56 -0
- mlrun/api/runtime_handlers/base.py +1247 -0
- mlrun/api/runtime_handlers/daskjob.py +209 -0
- mlrun/api/runtime_handlers/kubejob.py +37 -0
- mlrun/api/runtime_handlers/mpijob.py +147 -0
- mlrun/api/runtime_handlers/remotesparkjob.py +29 -0
- mlrun/api/runtime_handlers/sparkjob.py +148 -0
- mlrun/api/schemas/__init__.py +17 -6
- mlrun/api/utils/builder.py +1 -4
- mlrun/api/utils/clients/chief.py +14 -0
- mlrun/api/utils/clients/iguazio.py +33 -33
- mlrun/api/utils/clients/nuclio.py +2 -2
- mlrun/api/utils/periodic.py +9 -2
- mlrun/api/utils/projects/follower.py +14 -7
- mlrun/api/utils/projects/leader.py +2 -1
- mlrun/api/utils/projects/remotes/nop_follower.py +2 -2
- mlrun/api/utils/projects/remotes/nop_leader.py +2 -2
- mlrun/api/utils/runtimes/__init__.py +14 -0
- mlrun/api/utils/runtimes/nuclio.py +43 -0
- mlrun/api/utils/scheduler.py +98 -15
- mlrun/api/utils/singletons/db.py +5 -1
- mlrun/api/utils/singletons/project_member.py +4 -1
- mlrun/api/utils/singletons/scheduler.py +1 -1
- mlrun/artifacts/base.py +6 -6
- mlrun/artifacts/dataset.py +4 -4
- mlrun/artifacts/manager.py +2 -3
- mlrun/artifacts/model.py +2 -2
- mlrun/artifacts/plots.py +8 -8
- mlrun/common/db/__init__.py +14 -0
- mlrun/common/helpers.py +37 -0
- mlrun/{mlutils → common/model_monitoring}/__init__.py +3 -2
- mlrun/common/model_monitoring/helpers.py +69 -0
- mlrun/common/schemas/__init__.py +13 -1
- mlrun/common/schemas/auth.py +4 -1
- mlrun/common/schemas/client_spec.py +1 -1
- mlrun/common/schemas/function.py +17 -0
- mlrun/common/schemas/model_monitoring/__init__.py +48 -0
- mlrun/common/{model_monitoring.py → schemas/model_monitoring/constants.py} +11 -23
- mlrun/common/schemas/model_monitoring/grafana.py +55 -0
- mlrun/common/schemas/{model_endpoints.py → model_monitoring/model_endpoints.py} +32 -65
- mlrun/common/schemas/notification.py +1 -0
- mlrun/common/schemas/object.py +4 -0
- mlrun/common/schemas/project.py +1 -0
- mlrun/common/schemas/regex.py +1 -1
- mlrun/common/schemas/runs.py +1 -8
- mlrun/common/schemas/schedule.py +1 -8
- mlrun/common/schemas/workflow.py +54 -0
- mlrun/config.py +45 -42
- mlrun/datastore/__init__.py +21 -0
- mlrun/datastore/base.py +1 -1
- mlrun/datastore/datastore.py +9 -0
- mlrun/datastore/dbfs_store.py +168 -0
- mlrun/datastore/helpers.py +18 -0
- mlrun/datastore/sources.py +1 -0
- mlrun/datastore/store_resources.py +2 -5
- mlrun/datastore/v3io.py +1 -2
- mlrun/db/__init__.py +4 -68
- mlrun/db/base.py +12 -0
- mlrun/db/factory.py +65 -0
- mlrun/db/httpdb.py +175 -20
- mlrun/db/nopdb.py +4 -2
- mlrun/execution.py +4 -2
- mlrun/feature_store/__init__.py +1 -0
- mlrun/feature_store/api.py +1 -2
- mlrun/feature_store/common.py +2 -1
- mlrun/feature_store/feature_set.py +1 -11
- mlrun/feature_store/feature_vector.py +340 -2
- mlrun/feature_store/ingestion.py +5 -10
- mlrun/feature_store/retrieval/base.py +118 -104
- mlrun/feature_store/retrieval/dask_merger.py +17 -10
- mlrun/feature_store/retrieval/job.py +4 -1
- mlrun/feature_store/retrieval/local_merger.py +18 -18
- mlrun/feature_store/retrieval/spark_merger.py +21 -14
- mlrun/feature_store/retrieval/storey_merger.py +22 -16
- mlrun/kfpops.py +3 -9
- mlrun/launcher/base.py +57 -53
- mlrun/launcher/client.py +5 -4
- mlrun/launcher/factory.py +24 -13
- mlrun/launcher/local.py +6 -6
- mlrun/launcher/remote.py +4 -4
- mlrun/lists.py +0 -11
- mlrun/model.py +11 -17
- mlrun/model_monitoring/__init__.py +2 -22
- mlrun/model_monitoring/features_drift_table.py +1 -1
- mlrun/model_monitoring/helpers.py +22 -210
- mlrun/model_monitoring/model_endpoint.py +1 -1
- mlrun/model_monitoring/model_monitoring_batch.py +127 -50
- mlrun/model_monitoring/prometheus.py +219 -0
- mlrun/model_monitoring/stores/__init__.py +16 -11
- mlrun/model_monitoring/stores/kv_model_endpoint_store.py +95 -23
- mlrun/model_monitoring/stores/models/mysql.py +47 -29
- mlrun/model_monitoring/stores/models/sqlite.py +47 -29
- mlrun/model_monitoring/stores/sql_model_endpoint_store.py +31 -19
- mlrun/model_monitoring/{stream_processing_fs.py → stream_processing.py} +206 -64
- mlrun/model_monitoring/tracking_policy.py +104 -0
- mlrun/package/packager.py +6 -8
- mlrun/package/packagers/default_packager.py +121 -10
- mlrun/package/packagers/numpy_packagers.py +1 -1
- mlrun/platforms/__init__.py +0 -2
- mlrun/platforms/iguazio.py +0 -56
- mlrun/projects/pipelines.py +53 -159
- mlrun/projects/project.py +10 -37
- mlrun/render.py +1 -1
- mlrun/run.py +8 -124
- mlrun/runtimes/__init__.py +6 -42
- mlrun/runtimes/base.py +29 -1249
- mlrun/runtimes/daskjob.py +2 -198
- mlrun/runtimes/funcdoc.py +0 -9
- mlrun/runtimes/function.py +25 -29
- mlrun/runtimes/kubejob.py +5 -29
- mlrun/runtimes/local.py +1 -1
- mlrun/runtimes/mpijob/__init__.py +2 -2
- mlrun/runtimes/mpijob/abstract.py +10 -1
- mlrun/runtimes/mpijob/v1.py +0 -76
- mlrun/runtimes/mpijob/v1alpha1.py +1 -74
- mlrun/runtimes/nuclio.py +3 -2
- mlrun/runtimes/pod.py +28 -18
- mlrun/runtimes/remotesparkjob.py +1 -15
- mlrun/runtimes/serving.py +14 -6
- mlrun/runtimes/sparkjob/__init__.py +0 -1
- mlrun/runtimes/sparkjob/abstract.py +4 -131
- mlrun/runtimes/utils.py +0 -26
- mlrun/serving/routers.py +7 -7
- mlrun/serving/server.py +11 -8
- mlrun/serving/states.py +7 -1
- mlrun/serving/v2_serving.py +6 -6
- mlrun/utils/helpers.py +23 -42
- mlrun/utils/notifications/notification/__init__.py +4 -0
- mlrun/utils/notifications/notification/webhook.py +61 -0
- mlrun/utils/notifications/notification_pusher.py +5 -25
- mlrun/utils/regex.py +7 -2
- mlrun/utils/version/version.json +2 -2
- {mlrun-1.4.0rc25.dist-info → mlrun-1.5.0rc2.dist-info}/METADATA +26 -25
- {mlrun-1.4.0rc25.dist-info → mlrun-1.5.0rc2.dist-info}/RECORD +180 -158
- {mlrun-1.4.0rc25.dist-info → mlrun-1.5.0rc2.dist-info}/WHEEL +1 -1
- mlrun/mlutils/data.py +0 -160
- mlrun/mlutils/models.py +0 -78
- mlrun/mlutils/plots.py +0 -902
- mlrun/utils/model_monitoring.py +0 -249
- /mlrun/{api/db/sqldb/session.py → common/db/sql_session.py} +0 -0
- {mlrun-1.4.0rc25.dist-info → mlrun-1.5.0rc2.dist-info}/LICENSE +0 -0
- {mlrun-1.4.0rc25.dist-info → mlrun-1.5.0rc2.dist-info}/entry_points.txt +0 -0
- {mlrun-1.4.0rc25.dist-info → mlrun-1.5.0rc2.dist-info}/top_level.txt +0 -0
mlrun/config.py
CHANGED
|
@@ -27,8 +27,8 @@ import copy
|
|
|
27
27
|
import json
|
|
28
28
|
import os
|
|
29
29
|
import typing
|
|
30
|
-
import urllib.parse
|
|
31
30
|
from collections.abc import Mapping
|
|
31
|
+
from datetime import timedelta
|
|
32
32
|
from distutils.util import strtobool
|
|
33
33
|
from os.path import expanduser
|
|
34
34
|
from threading import Lock
|
|
@@ -149,7 +149,7 @@ default_config = {
|
|
|
149
149
|
"timeout_mode": "enabled",
|
|
150
150
|
# timeout in seconds to wait for background task to be updated / finished by the worker responsible for the task
|
|
151
151
|
"default_timeouts": {
|
|
152
|
-
"operations": {"migrations": "3600"},
|
|
152
|
+
"operations": {"migrations": "3600", "load_project": "60"},
|
|
153
153
|
"runtimes": {"dask": "600"},
|
|
154
154
|
},
|
|
155
155
|
},
|
|
@@ -286,6 +286,7 @@ default_config = {
|
|
|
286
286
|
# - mlrun.runtimes.constants.NuclioIngressAddTemplatedIngressModes
|
|
287
287
|
# - mlrun.runtimes.function.enrich_function_with_ingress
|
|
288
288
|
"add_templated_ingress_host_mode": "never",
|
|
289
|
+
"explicit_ack": "enabled",
|
|
289
290
|
},
|
|
290
291
|
"logs": {
|
|
291
292
|
"decode": {
|
|
@@ -416,7 +417,8 @@ default_config = {
|
|
|
416
417
|
"default_http_sink": "http://nuclio-{project}-model-monitoring-stream.mlrun.svc.cluster.local:8080",
|
|
417
418
|
"batch_processing_function_branch": "master",
|
|
418
419
|
"parquet_batching_max_events": 10000,
|
|
419
|
-
|
|
420
|
+
"parquet_batching_timeout_secs": timedelta(minutes=30).total_seconds(),
|
|
421
|
+
# See mlrun.model_monitoring.stores.ModelEndpointStoreType for available options
|
|
420
422
|
"store_type": "v3io-nosql",
|
|
421
423
|
"endpoint_store_connection": "",
|
|
422
424
|
},
|
|
@@ -456,7 +458,7 @@ default_config = {
|
|
|
456
458
|
},
|
|
457
459
|
"default_targets": "parquet,nosql",
|
|
458
460
|
"default_job_image": "mlrun/mlrun",
|
|
459
|
-
"flush_interval":
|
|
461
|
+
"flush_interval": None,
|
|
460
462
|
},
|
|
461
463
|
"ui": {
|
|
462
464
|
"projects_prefix": "projects", # The UI link prefix for projects
|
|
@@ -515,7 +517,11 @@ default_config = {
|
|
|
515
517
|
"debug": {
|
|
516
518
|
"expose_internal_api_endpoints": False,
|
|
517
519
|
},
|
|
518
|
-
"
|
|
520
|
+
"workflows": {
|
|
521
|
+
"default_workflow_runner_name": "workflow-runner-{}",
|
|
522
|
+
# Default timeout seconds for retrieving workflow id after execution:
|
|
523
|
+
"timeouts": {"local": 120, "kfp": 30},
|
|
524
|
+
},
|
|
519
525
|
"log_collector": {
|
|
520
526
|
"address": "localhost:8282",
|
|
521
527
|
# log collection mode can be one of: "sidecar", "legacy", "best-effort"
|
|
@@ -775,7 +781,6 @@ class Config:
|
|
|
775
781
|
return semver.VersionInfo.parse(f"{semver_compatible_igz_version}.0")
|
|
776
782
|
|
|
777
783
|
def verify_security_context_enrichment_mode_is_allowed(self):
|
|
778
|
-
|
|
779
784
|
# TODO: move SecurityContextEnrichmentModes to a different package so that we could use it here without
|
|
780
785
|
# importing mlrun.api
|
|
781
786
|
if config.function.spec.security_context.enrichment_mode == "disabled":
|
|
@@ -932,36 +937,6 @@ class Config:
|
|
|
932
937
|
# when dbpath is set we want to connect to it which will sync configuration from it to the client
|
|
933
938
|
mlrun.db.get_run_db(value, force_reconnect=True)
|
|
934
939
|
|
|
935
|
-
@property
|
|
936
|
-
def iguazio_api_url(self):
|
|
937
|
-
"""
|
|
938
|
-
we want to be able to run with old versions of the service who runs the API (which doesn't configure this
|
|
939
|
-
value) so we're doing best effort to try and resolve it from other configurations
|
|
940
|
-
TODO: Remove this hack when 0.6.x is old enough
|
|
941
|
-
"""
|
|
942
|
-
if not self._iguazio_api_url:
|
|
943
|
-
if self.httpdb.builder.docker_registry and self.igz_version:
|
|
944
|
-
return self._extract_iguazio_api_from_docker_registry_url()
|
|
945
|
-
return self._iguazio_api_url
|
|
946
|
-
|
|
947
|
-
def _extract_iguazio_api_from_docker_registry_url(self):
|
|
948
|
-
docker_registry_url = self.httpdb.builder.docker_registry
|
|
949
|
-
# add schema otherwise parsing go wrong
|
|
950
|
-
if "://" not in docker_registry_url:
|
|
951
|
-
docker_registry_url = f"http://{docker_registry_url}"
|
|
952
|
-
parsed_registry_url = urllib.parse.urlparse(docker_registry_url)
|
|
953
|
-
registry_hostname = parsed_registry_url.hostname
|
|
954
|
-
# replace the first domain section (app service name) with dashboard
|
|
955
|
-
first_dot_index = registry_hostname.find(".")
|
|
956
|
-
if first_dot_index < 0:
|
|
957
|
-
# if not found it's not the format we know - can't resolve the api url from the registry url
|
|
958
|
-
return ""
|
|
959
|
-
return f"https://dashboard{registry_hostname[first_dot_index:]}"
|
|
960
|
-
|
|
961
|
-
@iguazio_api_url.setter
|
|
962
|
-
def iguazio_api_url(self, value):
|
|
963
|
-
self._iguazio_api_url = value
|
|
964
|
-
|
|
965
940
|
def is_api_running_on_k8s(self):
|
|
966
941
|
# determine if the API service is attached to K8s cluster
|
|
967
942
|
# when there is a cluster the .namespace is set
|
|
@@ -1044,6 +1019,40 @@ class Config:
|
|
|
1044
1019
|
ver in mlrun.mlconf.ce.mode for ver in ["lite", "full"]
|
|
1045
1020
|
)
|
|
1046
1021
|
|
|
1022
|
+
def get_s3_storage_options(self) -> typing.Dict[str, typing.Any]:
|
|
1023
|
+
"""
|
|
1024
|
+
Generate storage options dictionary as required for handling S3 path in fsspec. The model monitoring stream
|
|
1025
|
+
graph uses this method for generating the storage options for S3 parquet target path.
|
|
1026
|
+
:return: A storage options dictionary in which each key-value pair represents a particular configuration,
|
|
1027
|
+
such as endpoint_url or aws access key.
|
|
1028
|
+
"""
|
|
1029
|
+
key = mlrun.get_secret_or_env("AWS_ACCESS_KEY_ID")
|
|
1030
|
+
secret = mlrun.get_secret_or_env("AWS_SECRET_ACCESS_KEY")
|
|
1031
|
+
|
|
1032
|
+
force_non_anonymous = mlrun.get_secret_or_env("S3_NON_ANONYMOUS")
|
|
1033
|
+
profile = mlrun.get_secret_or_env("AWS_PROFILE")
|
|
1034
|
+
|
|
1035
|
+
storage_options = dict(
|
|
1036
|
+
anon=not (force_non_anonymous or (key and secret)),
|
|
1037
|
+
key=key,
|
|
1038
|
+
secret=secret,
|
|
1039
|
+
)
|
|
1040
|
+
|
|
1041
|
+
endpoint_url = mlrun.get_secret_or_env("S3_ENDPOINT_URL")
|
|
1042
|
+
if endpoint_url:
|
|
1043
|
+
client_kwargs = {"endpoint_url": endpoint_url}
|
|
1044
|
+
storage_options["client_kwargs"] = client_kwargs
|
|
1045
|
+
|
|
1046
|
+
if profile:
|
|
1047
|
+
storage_options["profile"] = profile
|
|
1048
|
+
|
|
1049
|
+
return storage_options
|
|
1050
|
+
|
|
1051
|
+
def is_explicit_ack(self) -> bool:
|
|
1052
|
+
return self.httpdb.nuclio.explicit_ack == "enabled" and (
|
|
1053
|
+
not self.nuclio_version or self.nuclio_version >= "1.11.20"
|
|
1054
|
+
)
|
|
1055
|
+
|
|
1047
1056
|
|
|
1048
1057
|
# Global configuration
|
|
1049
1058
|
config = Config.from_dict(default_config)
|
|
@@ -1091,12 +1100,6 @@ def _do_populate(env=None, skip_errors=False):
|
|
|
1091
1100
|
if data:
|
|
1092
1101
|
config.update(data, skip_errors=skip_errors)
|
|
1093
1102
|
|
|
1094
|
-
# HACK to enable config property to both have dynamic default and to use the value from dict/env like other
|
|
1095
|
-
# configurations - we just need a key in the dict that is different than the property name, so simply adding prefix
|
|
1096
|
-
# underscore
|
|
1097
|
-
config._cfg["_iguazio_api_url"] = config._cfg["iguazio_api_url"]
|
|
1098
|
-
del config._cfg["iguazio_api_url"]
|
|
1099
|
-
|
|
1100
1103
|
_validate_config(config)
|
|
1101
1104
|
|
|
1102
1105
|
|
mlrun/datastore/__init__.py
CHANGED
|
@@ -29,8 +29,12 @@ __all__ = [
|
|
|
29
29
|
"StreamSource",
|
|
30
30
|
"KafkaSource",
|
|
31
31
|
"RedisStore",
|
|
32
|
+
"DatabricksFileSystemDisableCache",
|
|
33
|
+
"DatabricksFileBugFixed",
|
|
32
34
|
]
|
|
33
35
|
|
|
36
|
+
import fsspec
|
|
37
|
+
|
|
34
38
|
import mlrun.datastore.wasbfs
|
|
35
39
|
|
|
36
40
|
from ..platforms.iguazio import (
|
|
@@ -42,6 +46,7 @@ from ..platforms.iguazio import (
|
|
|
42
46
|
from ..utils import logger
|
|
43
47
|
from .base import DataItem
|
|
44
48
|
from .datastore import StoreManager, in_memory_store, uri_to_ipython
|
|
49
|
+
from .dbfs_store import DatabricksFileBugFixed, DatabricksFileSystemDisableCache
|
|
45
50
|
from .s3 import parse_s3_bucket_and_key
|
|
46
51
|
from .sources import (
|
|
47
52
|
BigQuerySource,
|
|
@@ -62,6 +67,22 @@ from .utils import parse_kafka_url
|
|
|
62
67
|
|
|
63
68
|
store_manager = StoreManager()
|
|
64
69
|
|
|
70
|
+
if hasattr(fsspec, "register_implementation"):
|
|
71
|
+
fsspec.register_implementation(
|
|
72
|
+
"dbfs", DatabricksFileSystemDisableCache, clobber=True
|
|
73
|
+
)
|
|
74
|
+
else:
|
|
75
|
+
from fsspec.registry import known_implementations
|
|
76
|
+
|
|
77
|
+
known_implementations["dbfs"] = {
|
|
78
|
+
"class": "mlrun.datastore.dbfs_store.DatabricksFileSystemDisableCache",
|
|
79
|
+
"err": "Please make sure your fsspec version supports dbfs",
|
|
80
|
+
}
|
|
81
|
+
|
|
82
|
+
del known_implementations
|
|
83
|
+
|
|
84
|
+
del fsspec # clear the module namespace
|
|
85
|
+
|
|
65
86
|
|
|
66
87
|
def set_in_memory_item(key, value):
|
|
67
88
|
item = store_manager.object(f"memory://{key}")
|
mlrun/datastore/base.py
CHANGED
|
@@ -261,7 +261,7 @@ class DataStore:
|
|
|
261
261
|
updated_args = [f"{base_path}/{filename}"]
|
|
262
262
|
updated_args.extend(args[1:])
|
|
263
263
|
dfs.append(df_module.read_csv(*updated_args, **kwargs))
|
|
264
|
-
return
|
|
264
|
+
return df_module.concat(dfs)
|
|
265
265
|
|
|
266
266
|
elif (
|
|
267
267
|
file_url.endswith(".parquet")
|
mlrun/datastore/datastore.py
CHANGED
|
@@ -86,6 +86,10 @@ def schema_to_store(schema):
|
|
|
86
86
|
"Google cloud storage packages are missing, use pip install mlrun[google-cloud-storage]"
|
|
87
87
|
)
|
|
88
88
|
return GoogleCloudStorageStore
|
|
89
|
+
elif schema == "dbfs":
|
|
90
|
+
from .dbfs_store import DBFSStore
|
|
91
|
+
|
|
92
|
+
return DBFSStore
|
|
89
93
|
else:
|
|
90
94
|
raise ValueError(f"unsupported store scheme ({schema})")
|
|
91
95
|
|
|
@@ -175,6 +179,11 @@ class StoreManager:
|
|
|
175
179
|
)
|
|
176
180
|
|
|
177
181
|
store, subpath = self.get_or_create_store(url, secrets=secrets)
|
|
182
|
+
schema, endpoint, parsed_url = parse_url(url)
|
|
183
|
+
# TODO: Modify the URL replacement to be outside of the dataitem. Dataitem class should
|
|
184
|
+
# be implemented as a generic class.
|
|
185
|
+
if endpoint and schema == "dbfs":
|
|
186
|
+
url = url.replace(endpoint, "", 1)
|
|
178
187
|
return DataItem(key, store, subpath, url, meta=meta, artifact_url=artifact_url)
|
|
179
188
|
|
|
180
189
|
def get_or_create_store(self, url, secrets: dict = None) -> (DataStore, str):
|
mlrun/datastore/dbfs_store.py
ADDED
|
@@ -0,0 +1,168 @@
|
|
|
1
|
+
# Copyright 2023 Iguazio
|
|
2
|
+
#
|
|
3
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
4
|
+
# you may not use this file except in compliance with the License.
|
|
5
|
+
# You may obtain a copy of the License at
|
|
6
|
+
#
|
|
7
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
8
|
+
#
|
|
9
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
10
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
11
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
12
|
+
# See the License for the specific language governing permissions and
|
|
13
|
+
# limitations under the License.
|
|
14
|
+
|
|
15
|
+
import pathlib
|
|
16
|
+
|
|
17
|
+
import fsspec
|
|
18
|
+
from fsspec.implementations.dbfs import DatabricksFile, DatabricksFileSystem
|
|
19
|
+
|
|
20
|
+
import mlrun.errors
|
|
21
|
+
|
|
22
|
+
from .base import DataStore, FileStats
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
class DatabricksFileBugFixed(DatabricksFile):
|
|
26
|
+
"""Overrides DatabricksFile to add the following fix: https://github.com/fsspec/filesystem_spec/pull/1278"""
|
|
27
|
+
|
|
28
|
+
def _upload_chunk(self, final=False):
|
|
29
|
+
"""Internal function to add a chunk of data to a started upload"""
|
|
30
|
+
self.buffer.seek(0)
|
|
31
|
+
data = self.buffer.getvalue()
|
|
32
|
+
|
|
33
|
+
data_chunks = [
|
|
34
|
+
data[start:end] for start, end in self._to_sized_blocks(end=len(data))
|
|
35
|
+
]
|
|
36
|
+
|
|
37
|
+
for data_chunk in data_chunks:
|
|
38
|
+
self.fs._add_data(handle=self.handle, data=data_chunk)
|
|
39
|
+
|
|
40
|
+
if final:
|
|
41
|
+
self.fs._close_handle(handle=self.handle)
|
|
42
|
+
return True
|
|
43
|
+
|
|
44
|
+
def _fetch_range(self, start, end):
|
|
45
|
+
"""Internal function to download a block of data"""
|
|
46
|
+
return_buffer = b""
|
|
47
|
+
for chunk_start, chunk_end in self._to_sized_blocks(start, end):
|
|
48
|
+
return_buffer += self.fs._get_data(
|
|
49
|
+
path=self.path, start=chunk_start, end=chunk_end
|
|
50
|
+
)
|
|
51
|
+
|
|
52
|
+
return return_buffer
|
|
53
|
+
|
|
54
|
+
def _to_sized_blocks(self, start=0, end=100):
|
|
55
|
+
"""Helper function to split a range from 0 to total_length into blocksizes"""
|
|
56
|
+
for data_chunk in range(start, end, self.blocksize):
|
|
57
|
+
data_start = data_chunk
|
|
58
|
+
data_end = min(end, data_chunk + self.blocksize)
|
|
59
|
+
yield data_start, data_end
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
class DatabricksFileSystemDisableCache(DatabricksFileSystem):
|
|
63
|
+
root_marker = "/"
|
|
64
|
+
protocol = "dbfs"
|
|
65
|
+
|
|
66
|
+
def _open(self, path, mode="rb", block_size="default", **kwargs):
|
|
67
|
+
"""
|
|
68
|
+
Overwrite the base class method to make sure to create a DBFile.
|
|
69
|
+
All arguments are copied from the base method.
|
|
70
|
+
|
|
71
|
+
Only the default blocksize is allowed.
|
|
72
|
+
"""
|
|
73
|
+
return DatabricksFileBugFixed(
|
|
74
|
+
self, path, mode=mode, block_size=block_size, **kwargs
|
|
75
|
+
)
|
|
76
|
+
|
|
77
|
+
# _ls_from_cache is not working properly, so we disable it.
|
|
78
|
+
def _ls_from_cache(self, path):
|
|
79
|
+
pass
|
|
80
|
+
|
|
81
|
+
|
|
82
|
+
# dbfs objects will be represented with the following URL: dbfs://<path>
|
|
83
|
+
class DBFSStore(DataStore):
|
|
84
|
+
def __init__(self, parent, schema, name, endpoint="", secrets: dict = None):
|
|
85
|
+
super().__init__(parent, name, schema, endpoint, secrets=secrets)
|
|
86
|
+
self.get_filesystem(silent=False)
|
|
87
|
+
|
|
88
|
+
def get_filesystem(self, silent=True):
|
|
89
|
+
"""return fsspec file system object, if supported"""
|
|
90
|
+
if not self._filesystem:
|
|
91
|
+
self._filesystem = fsspec.filesystem("dbfs", **self.get_storage_options())
|
|
92
|
+
return self._filesystem
|
|
93
|
+
|
|
94
|
+
def get_storage_options(self):
|
|
95
|
+
return dict(
|
|
96
|
+
token=self._get_secret_or_env("DATABRICKS_TOKEN"), instance=self.endpoint
|
|
97
|
+
)
|
|
98
|
+
|
|
99
|
+
def _verify_filesystem_and_key(self, key: str):
|
|
100
|
+
if not self._filesystem:
|
|
101
|
+
raise mlrun.errors.MLRunInvalidArgumentError(
|
|
102
|
+
"Performing actions on data-item without a valid filesystem"
|
|
103
|
+
)
|
|
104
|
+
if not key.startswith("/"):
|
|
105
|
+
raise mlrun.errors.MLRunInvalidArgumentError(
|
|
106
|
+
"Invalid key parameter - key must start with '/'"
|
|
107
|
+
)
|
|
108
|
+
|
|
109
|
+
def get(self, key: str, size=None, offset=0) -> bytes:
|
|
110
|
+
self._verify_filesystem_and_key(key)
|
|
111
|
+
if size is not None and size <= 0:
|
|
112
|
+
raise mlrun.errors.MLRunInvalidArgumentError(
|
|
113
|
+
"size cannot be negative or zero"
|
|
114
|
+
)
|
|
115
|
+
start = offset or None
|
|
116
|
+
end = offset + size if size is not None else None
|
|
117
|
+
return self._filesystem.cat_file(key, start=start, end=end)
|
|
118
|
+
|
|
119
|
+
def put(self, key, data, append=False):
|
|
120
|
+
|
|
121
|
+
self._verify_filesystem_and_key(key)
|
|
122
|
+
if append:
|
|
123
|
+
raise mlrun.errors.MLRunInvalidArgumentError(
|
|
124
|
+
"Append mode not supported for Databricks file system"
|
|
125
|
+
)
|
|
126
|
+
# can not use append mode because it overrides data.
|
|
127
|
+
mode = "w"
|
|
128
|
+
if isinstance(data, bytes):
|
|
129
|
+
mode += "b"
|
|
130
|
+
elif not isinstance(data, str):
|
|
131
|
+
raise TypeError(f"Unknown data type {type(data)}")
|
|
132
|
+
with self._filesystem.open(key, mode) as f:
|
|
133
|
+
f.write(data)
|
|
134
|
+
|
|
135
|
+
def upload(self, key: str, src_path: str):
|
|
136
|
+
self._verify_filesystem_and_key(key)
|
|
137
|
+
self._filesystem.put_file(src_path, key, overwrite=True)
|
|
138
|
+
|
|
139
|
+
def stat(self, key: str):
|
|
140
|
+
self._verify_filesystem_and_key(key)
|
|
141
|
+
file = self._filesystem.stat(key)
|
|
142
|
+
if file["type"] == "file":
|
|
143
|
+
size = file["size"]
|
|
144
|
+
elif file["type"] == "directory":
|
|
145
|
+
raise FileNotFoundError("Operation expects a file not a directory!")
|
|
146
|
+
return FileStats(size, None)
|
|
147
|
+
|
|
148
|
+
def listdir(self, key: str):
|
|
149
|
+
"""
|
|
150
|
+
Basic ls of file/dir - without recursion.
|
|
151
|
+
"""
|
|
152
|
+
self._verify_filesystem_and_key(key)
|
|
153
|
+
if self._filesystem.isfile(key):
|
|
154
|
+
return key
|
|
155
|
+
remote_path = f"{key}/*"
|
|
156
|
+
files = self._filesystem.glob(remote_path)
|
|
157
|
+
# Get only the files and directories under key path, without the key path itself.
|
|
158
|
+
# for example in a filesystem that has this path: /test_mlrun_dbfs_objects/test.txt
|
|
159
|
+
# listdir with the input /test_mlrun_dbfs_objects as a key will return ['test.txt'].
|
|
160
|
+
files = [pathlib.Path(file).name for file in files if "/" in file]
|
|
161
|
+
return files
|
|
162
|
+
|
|
163
|
+
def rm(self, path, recursive=False, maxdepth=None):
|
|
164
|
+
if maxdepth:
|
|
165
|
+
raise mlrun.errors.MLRunInvalidArgumentError(
|
|
166
|
+
"dbfs file system does not support maxdepth option in rm function"
|
|
167
|
+
)
|
|
168
|
+
self.get_filesystem().rm(path=path, recursive=recursive)
|
mlrun/datastore/helpers.py
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
# Copyright 2023 Iguazio
|
|
2
|
+
#
|
|
3
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
4
|
+
# you may not use this file except in compliance with the License.
|
|
5
|
+
# You may obtain a copy of the License at
|
|
6
|
+
#
|
|
7
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
8
|
+
#
|
|
9
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
10
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
11
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
12
|
+
# See the License for the specific language governing permissions and
|
|
13
|
+
# limitations under the License.
|
|
14
|
+
#
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
ONE_GB = 1024 * 1024 * 1024
|
|
18
|
+
ONE_MB = 1024 * 1024
|
mlrun/datastore/sources.py
CHANGED
mlrun/datastore/store_resources.py
CHANGED
|
@@ -16,12 +16,9 @@
|
|
|
16
16
|
|
|
17
17
|
import mlrun
|
|
18
18
|
from mlrun.config import config
|
|
19
|
-
from mlrun.utils.helpers import (
|
|
20
|
-
is_legacy_artifact,
|
|
21
|
-
parse_artifact_uri,
|
|
22
|
-
parse_versioned_object_uri,
|
|
23
|
-
)
|
|
19
|
+
from mlrun.utils.helpers import is_legacy_artifact, parse_artifact_uri
|
|
24
20
|
|
|
21
|
+
from ..common.helpers import parse_versioned_object_uri
|
|
25
22
|
from ..platforms.iguazio import parse_path
|
|
26
23
|
from ..utils import DB_SCHEMA, StorePrefix
|
|
27
24
|
from .targets import get_online_target
|
mlrun/datastore/v3io.py
CHANGED
|
@@ -22,6 +22,7 @@ import fsspec
|
|
|
22
22
|
import v3io.dataplane
|
|
23
23
|
|
|
24
24
|
import mlrun
|
|
25
|
+
from mlrun.datastore.helpers import ONE_GB, ONE_MB
|
|
25
26
|
|
|
26
27
|
from ..platforms.iguazio import parse_path, split_path
|
|
27
28
|
from .base import (
|
|
@@ -36,8 +37,6 @@ from .base import (
|
|
|
36
37
|
)
|
|
37
38
|
|
|
38
39
|
V3IO_LOCAL_ROOT = "v3io"
|
|
39
|
-
ONE_GB = 1024 * 1024 * 1024
|
|
40
|
-
ONE_MB = 1024 * 1024
|
|
41
40
|
|
|
42
41
|
|
|
43
42
|
class V3ioStore(DataStore):
|
mlrun/db/__init__.py
CHANGED
|
@@ -12,14 +12,9 @@
|
|
|
12
12
|
# See the License for the specific language governing permissions and
|
|
13
13
|
# limitations under the License.
|
|
14
14
|
from os import environ
|
|
15
|
-
from urllib.parse import urlparse
|
|
16
15
|
|
|
17
16
|
from ..config import config
|
|
18
|
-
from ..platforms import add_or_refresh_credentials
|
|
19
|
-
from ..utils import logger
|
|
20
17
|
from .base import RunDBError, RunDBInterface # noqa
|
|
21
|
-
from .nopdb import NopDB
|
|
22
|
-
from .sqldb import SQLDB
|
|
23
18
|
|
|
24
19
|
|
|
25
20
|
def get_or_set_dburl(default=""):
|
|
@@ -29,69 +24,10 @@ def get_or_set_dburl(default=""):
|
|
|
29
24
|
return config.dbpath
|
|
30
25
|
|
|
31
26
|
|
|
32
|
-
def get_httpdb_kwargs(host, username, password):
|
|
33
|
-
username = username or config.httpdb.user
|
|
34
|
-
password = password or config.httpdb.password
|
|
35
|
-
|
|
36
|
-
username, password, token = add_or_refresh_credentials(
|
|
37
|
-
host, username, password, config.httpdb.token
|
|
38
|
-
)
|
|
39
|
-
|
|
40
|
-
return {
|
|
41
|
-
"user": username,
|
|
42
|
-
"password": password,
|
|
43
|
-
"token": token,
|
|
44
|
-
}
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
_run_db = None
|
|
48
|
-
_last_db_url = None
|
|
49
|
-
|
|
50
|
-
|
|
51
27
|
def get_run_db(url="", secrets=None, force_reconnect=False):
|
|
52
28
|
"""Returns the runtime database"""
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
if not url:
|
|
56
|
-
url = get_or_set_dburl("./")
|
|
57
|
-
|
|
58
|
-
if (
|
|
59
|
-
_last_db_url is not None
|
|
60
|
-
and url == _last_db_url
|
|
61
|
-
and _run_db
|
|
62
|
-
and not force_reconnect
|
|
63
|
-
):
|
|
64
|
-
return _run_db
|
|
65
|
-
_last_db_url = url
|
|
66
|
-
|
|
67
|
-
parsed_url = urlparse(url)
|
|
68
|
-
scheme = parsed_url.scheme.lower()
|
|
69
|
-
kwargs = {}
|
|
70
|
-
if "://" not in str(url) or scheme in ["file", "s3", "v3io", "v3ios"]:
|
|
71
|
-
logger.warning(
|
|
72
|
-
"Could not detect path to API server, not connected to API server!"
|
|
73
|
-
)
|
|
74
|
-
logger.warning(
|
|
75
|
-
"MLRUN_DBPATH is not set. Set this environment variable to the URL of the API server"
|
|
76
|
-
" in order to connect"
|
|
77
|
-
)
|
|
78
|
-
cls = NopDB
|
|
79
|
-
|
|
80
|
-
elif scheme in ("http", "https"):
|
|
81
|
-
# import here to avoid circular imports
|
|
82
|
-
from .httpdb import HTTPRunDB
|
|
83
|
-
|
|
84
|
-
cls = HTTPRunDB
|
|
85
|
-
kwargs = get_httpdb_kwargs(
|
|
86
|
-
parsed_url.hostname, parsed_url.username, parsed_url.password
|
|
87
|
-
)
|
|
88
|
-
endpoint = parsed_url.hostname
|
|
89
|
-
if parsed_url.port:
|
|
90
|
-
endpoint += f":{parsed_url.port}"
|
|
91
|
-
url = f"{parsed_url.scheme}://{endpoint}{parsed_url.path}"
|
|
92
|
-
else:
|
|
93
|
-
cls = SQLDB
|
|
29
|
+
# import here to avoid circular import
|
|
30
|
+
import mlrun.db.factory
|
|
94
31
|
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
return _run_db
|
|
32
|
+
run_db_factory = mlrun.db.factory.RunDBFactory()
|
|
33
|
+
return run_db_factory.create_run_db(url, secrets, force_reconnect)
|
mlrun/db/base.py
CHANGED
|
@@ -621,3 +621,15 @@ class RunDBInterface(ABC):
|
|
|
621
621
|
notifications: typing.List[mlrun.model.Notification],
|
|
622
622
|
):
|
|
623
623
|
pass
|
|
624
|
+
|
|
625
|
+
def store_run_notifications(
|
|
626
|
+
self,
|
|
627
|
+
notification_objects: typing.List[mlrun.model.Notification],
|
|
628
|
+
run_uid: str,
|
|
629
|
+
project: str = None,
|
|
630
|
+
mask_params: bool = True,
|
|
631
|
+
):
|
|
632
|
+
pass
|
|
633
|
+
|
|
634
|
+
def watch_log(self, uid, project="", watch=True, offset=0):
|
|
635
|
+
pass
|
mlrun/db/factory.py
ADDED
|
@@ -0,0 +1,65 @@
|
|
|
1
|
+
# Copyright 2023 MLRun Authors
|
|
2
|
+
#
|
|
3
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
4
|
+
# you may not use this file except in compliance with the License.
|
|
5
|
+
# You may obtain a copy of the License at
|
|
6
|
+
#
|
|
7
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
8
|
+
#
|
|
9
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
10
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
11
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
12
|
+
# See the License for the specific language governing permissions and
|
|
13
|
+
# limitations under the License.
|
|
14
|
+
from dependency_injector import containers, providers
|
|
15
|
+
|
|
16
|
+
import mlrun.db
|
|
17
|
+
import mlrun.db.httpdb
|
|
18
|
+
import mlrun.db.nopdb
|
|
19
|
+
import mlrun.utils.singleton
|
|
20
|
+
from mlrun.utils import logger
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
class RunDBFactory(
|
|
24
|
+
metaclass=mlrun.utils.singleton.AbstractSingleton,
|
|
25
|
+
):
|
|
26
|
+
def __init__(self):
|
|
27
|
+
self._run_db = None
|
|
28
|
+
self._last_db_url = None
|
|
29
|
+
self._rundb_container = RunDBContainer()
|
|
30
|
+
|
|
31
|
+
def create_run_db(self, url="", secrets=None, force_reconnect=False):
|
|
32
|
+
"""Returns the runtime database"""
|
|
33
|
+
if not url:
|
|
34
|
+
url = mlrun.db.get_or_set_dburl("./")
|
|
35
|
+
|
|
36
|
+
if (
|
|
37
|
+
self._last_db_url is not None
|
|
38
|
+
and url == self._last_db_url
|
|
39
|
+
and self._run_db
|
|
40
|
+
and not force_reconnect
|
|
41
|
+
):
|
|
42
|
+
return self._run_db
|
|
43
|
+
|
|
44
|
+
self._last_db_url = url
|
|
45
|
+
|
|
46
|
+
if "://" not in str(url):
|
|
47
|
+
logger.warning(
|
|
48
|
+
"Could not detect path to API server, not connected to API server!"
|
|
49
|
+
)
|
|
50
|
+
logger.warning(
|
|
51
|
+
"MLRUN_DBPATH is misconfigured. Set this environment variable to the URL of the API server"
|
|
52
|
+
" in order to connect"
|
|
53
|
+
)
|
|
54
|
+
self._run_db = self._rundb_container.nop(url)
|
|
55
|
+
|
|
56
|
+
else:
|
|
57
|
+
self._run_db = self._rundb_container.run_db(url)
|
|
58
|
+
|
|
59
|
+
self._run_db.connect(secrets=secrets)
|
|
60
|
+
return self._run_db
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
class RunDBContainer(containers.DeclarativeContainer):
|
|
64
|
+
nop = providers.Factory(mlrun.db.nopdb.NopDB)
|
|
65
|
+
run_db = providers.Factory(mlrun.db.httpdb.HTTPRunDB)
|