mlrun 1.7.0rc3__py3-none-any.whl → 1.7.0rc4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of mlrun might be problematic.
- mlrun/artifacts/manager.py +6 -1
- mlrun/common/constants.py +1 -0
- mlrun/common/model_monitoring/helpers.py +12 -6
- mlrun/common/schemas/__init__.py +1 -0
- mlrun/common/schemas/client_spec.py +1 -0
- mlrun/common/schemas/common.py +40 -0
- mlrun/common/schemas/model_monitoring/constants.py +4 -1
- mlrun/common/schemas/project.py +2 -0
- mlrun/config.py +20 -15
- mlrun/datastore/azure_blob.py +22 -9
- mlrun/datastore/base.py +15 -25
- mlrun/datastore/datastore.py +19 -8
- mlrun/datastore/datastore_profile.py +47 -5
- mlrun/datastore/google_cloud_storage.py +10 -6
- mlrun/datastore/hdfs.py +51 -0
- mlrun/datastore/redis.py +4 -0
- mlrun/datastore/s3.py +4 -0
- mlrun/datastore/sources.py +29 -43
- mlrun/datastore/targets.py +58 -48
- mlrun/datastore/utils.py +2 -49
- mlrun/datastore/v3io.py +4 -0
- mlrun/db/base.py +34 -0
- mlrun/db/httpdb.py +71 -42
- mlrun/execution.py +3 -3
- mlrun/feature_store/feature_vector.py +2 -2
- mlrun/frameworks/tf_keras/callbacks/logging_callback.py +3 -3
- mlrun/frameworks/tf_keras/model_handler.py +7 -7
- mlrun/k8s_utils.py +10 -5
- mlrun/kfpops.py +19 -10
- mlrun/model.py +5 -0
- mlrun/model_monitoring/api.py +3 -3
- mlrun/model_monitoring/application.py +1 -1
- mlrun/model_monitoring/applications/__init__.py +13 -0
- mlrun/model_monitoring/applications/histogram_data_drift.py +218 -0
- mlrun/model_monitoring/batch.py +9 -111
- mlrun/model_monitoring/controller.py +73 -55
- mlrun/model_monitoring/controller_handler.py +13 -5
- mlrun/model_monitoring/features_drift_table.py +62 -53
- mlrun/model_monitoring/helpers.py +30 -21
- mlrun/model_monitoring/metrics/__init__.py +13 -0
- mlrun/model_monitoring/metrics/histogram_distance.py +127 -0
- mlrun/model_monitoring/stores/kv_model_endpoint_store.py +14 -14
- mlrun/model_monitoring/stores/sql_model_endpoint_store.py +0 -1
- mlrun/package/packagers/pandas_packagers.py +3 -3
- mlrun/package/utils/_archiver.py +3 -1
- mlrun/platforms/iguazio.py +8 -65
- mlrun/projects/pipelines.py +21 -11
- mlrun/projects/project.py +121 -42
- mlrun/runtimes/base.py +21 -2
- mlrun/runtimes/kubejob.py +5 -3
- mlrun/runtimes/local.py +2 -2
- mlrun/runtimes/mpijob/abstract.py +6 -6
- mlrun/runtimes/nuclio/function.py +9 -9
- mlrun/runtimes/nuclio/serving.py +3 -3
- mlrun/runtimes/pod.py +3 -3
- mlrun/runtimes/sparkjob/spark3job.py +3 -3
- mlrun/serving/remote.py +4 -2
- mlrun/serving/server.py +2 -8
- mlrun/utils/async_http.py +3 -3
- mlrun/utils/helpers.py +27 -5
- mlrun/utils/http.py +3 -3
- mlrun/utils/notifications/notification_pusher.py +6 -6
- mlrun/utils/version/version.json +2 -2
- {mlrun-1.7.0rc3.dist-info → mlrun-1.7.0rc4.dist-info}/METADATA +13 -16
- {mlrun-1.7.0rc3.dist-info → mlrun-1.7.0rc4.dist-info}/RECORD +69 -63
- {mlrun-1.7.0rc3.dist-info → mlrun-1.7.0rc4.dist-info}/LICENSE +0 -0
- {mlrun-1.7.0rc3.dist-info → mlrun-1.7.0rc4.dist-info}/WHEEL +0 -0
- {mlrun-1.7.0rc3.dist-info → mlrun-1.7.0rc4.dist-info}/entry_points.txt +0 -0
- {mlrun-1.7.0rc3.dist-info → mlrun-1.7.0rc4.dist-info}/top_level.txt +0 -0
mlrun/artifacts/manager.py
CHANGED
@@ -17,7 +17,11 @@ from os.path import exists, isdir
 from urllib.parse import urlparse
 
 import mlrun.config
-from mlrun.utils.helpers import
+from mlrun.utils.helpers import (
+    get_local_file_schema,
+    template_artifact_path,
+    validate_inline_artifact_body_size,
+)
 
 from ..utils import (
     is_legacy_artifact,
@@ -212,6 +216,7 @@ class ArtifactManager:
         target_path = target_path or item.target_path
 
         validate_artifact_key_name(key, "artifact.key")
+        validate_inline_artifact_body_size(item.spec.inline)
         src_path = local_path or item.src_path  # TODO: remove src_path
         self.ensure_artifact_source_file_exists(item=item, path=src_path, body=body)
         if format == "html" or (src_path and pathlib.Path(src_path).suffix == "html"):
mlrun/common/constants.py
CHANGED
@@ -16,6 +16,7 @@ import sys
 import typing
 
 import mlrun.common
+import mlrun.common.schemas.model_monitoring.constants as mm_constants
 from mlrun.common.schemas.model_monitoring import (
     EndpointUID,
     FunctionURI,
@@ -64,7 +65,7 @@ def parse_model_endpoint_store_prefix(store_prefix: str):
 
 
 def parse_monitoring_stream_path(
-    stream_uri: str, project: str,
+    stream_uri: str, project: str, function_name: str = None
 ):
     if stream_uri.startswith("kafka://"):
         if "?topic" in stream_uri:
@@ -72,23 +73,28 @@ def parse_monitoring_stream_path(
                 "Custom kafka topic is not allowed"
             )
         # Add topic to stream kafka uri
-        if
+        if (
+            function_name is None
+            or function_name == mm_constants.MonitoringFunctionNames.STREAM
+        ):
             stream_uri += f"?topic=monitoring_stream_{project}"
         else:
-            stream_uri += f"?topic=monitoring_stream_{project}_{
+            stream_uri += f"?topic=monitoring_stream_{project}_{function_name}"
 
     elif stream_uri.startswith("v3io://") and mlrun.mlconf.is_ce_mode():
         # V3IO is not supported in CE mode, generating a default http stream path
-        if
+        if function_name is None:
             stream_uri = (
                 mlrun.mlconf.model_endpoint_monitoring.default_http_sink.format(
-                    project=project
+                    project=project, namespace=mlrun.mlconf.namespace
                 )
             )
         else:
             stream_uri = (
                 mlrun.mlconf.model_endpoint_monitoring.default_http_sink_app.format(
-                    project=project,
+                    project=project,
+                    application_name=function_name,
+                    namespace=mlrun.mlconf.namespace,
                 )
             )
     return stream_uri
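The net effect of the new function_name argument is per-function routing of the monitoring stream. A rough sketch of the resulting values, based only on the hunk above (broker, project, and application names are invented placeholders, and the helper's exact module path is not shown in this rendering):

# default stream function -> shared project topic
parse_monitoring_stream_path("kafka://broker:9092", "my-proj")
# -> "kafka://broker:9092?topic=monitoring_stream_my-proj"

# a named monitoring application -> its own topic
parse_monitoring_stream_path("kafka://broker:9092", "my-proj", "my-app")
# -> "kafka://broker:9092?topic=monitoring_stream_my-proj_my-app"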
mlrun/common/schemas/__init__.py
CHANGED
mlrun/common/schemas/client_spec.py
CHANGED

@@ -29,6 +29,7 @@ class ClientSpec(pydantic.BaseModel):
     ui_url: typing.Optional[str]
     artifact_path: typing.Optional[str]
     feature_store_data_prefixes: typing.Optional[dict[str, str]]
+    feature_store_default_targets: typing.Optional[str]
     spark_app_image: typing.Optional[str]
     spark_app_image_tag: typing.Optional[str]
     spark_history_server_path: typing.Optional[str]
mlrun/common/schemas/common.py
ADDED

@@ -0,0 +1,40 @@
+# Copyright 2023 Iguazio
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+import typing
+
+import pydantic
+
+
+class ImageBuilder(pydantic.BaseModel):
+    functionSourceCode: typing.Optional[str] = None
+    codeEntryType: typing.Optional[str] = None
+    codeEntryAttributes: typing.Optional[str] = None
+    source: typing.Optional[str] = None
+    code_origin: typing.Optional[str] = None
+    origin_filename: typing.Optional[str] = None
+    image: typing.Optional[str] = None
+    base_image: typing.Optional[str] = None
+    commands: typing.Optional[list] = None
+    extra: typing.Optional[str] = None
+    extra_args: typing.Optional[dict] = None
+    builder_env: typing.Optional[dict] = None
+    secret: typing.Optional[str] = None
+    registry: typing.Optional[str] = None
+    load_source_on_run: typing.Optional[bool] = None
+    with_mlrun: typing.Optional[bool] = None
+    auto_build: typing.Optional[bool] = None
+    build_pod: typing.Optional[str] = None
+    requirements: typing.Optional[list] = None
+    source_code_target_dir: typing.Optional[str] = None
mlrun/common/schemas/model_monitoring/constants.py
CHANGED

@@ -181,7 +181,7 @@ class MonitoringFunctionNames:
     WRITER = "model-monitoring-writer"
     BATCH = "model-monitoring-batch"
     APPLICATION_CONTROLLER = "model-monitoring-controller"
-    STREAM =
+    STREAM = "model-monitoring-stream"
 
     @staticmethod
     def all():
@@ -289,3 +289,6 @@ class ModelMonitoringAppLabel:
 
 class ControllerPolicy:
     BASE_PERIOD = "base_period"
+
+
+MLRUN_HISTOGRAM_DATA_DRIFT_APP_NAME = "histogram-data-drift"
mlrun/common/schemas/project.py
CHANGED
@@ -19,6 +19,7 @@ import pydantic
 
 import mlrun.common.types
 
+from .common import ImageBuilder
 from .object import ObjectKind, ObjectStatus
 
 
@@ -85,6 +86,7 @@ class ProjectSpec(pydantic.BaseModel):
     desired_state: typing.Optional[ProjectDesiredState] = ProjectDesiredState.online
     custom_packagers: typing.Optional[list[tuple[str, bool]]] = None
     default_image: typing.Optional[str] = None
+    build: typing.Optional[ImageBuilder] = None
 
     class Config:
         extra = pydantic.Extra.allow
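ProjectSpec now carries the image-build configuration as a typed, optional ImageBuilder object. A minimal sketch of constructing it (the field names come from the new schema above; the concrete values are invented, and the remaining ProjectSpec fields are assumed to stay optional):

from mlrun.common.schemas.common import ImageBuilder
from mlrun.common.schemas.project import ProjectSpec

# invented values, only the field names are taken from the diff above
spec = ProjectSpec(
    default_image="mlrun/mlrun",
    build=ImageBuilder(
        base_image="mlrun/mlrun",
        commands=["pip install scikit-learn"],
        with_mlrun=False,
    ),
)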
mlrun/config.py
CHANGED
@@ -287,6 +287,12 @@ default_config = {
     "state": "online",
     "retry_api_call_on_exception": "enabled",
     "http_connection_timeout_keep_alive": 11,
+    # http client used by httpdb
+    "http": {
+        # when True, the client will verify the server's TLS
+        # set to False for backwards compatibility.
+        "verify": False,
+    },
     "db": {
         "commit_retry_timeout": 30,
         "commit_retry_interval": 3,
@@ -484,8 +490,8 @@ default_config = {
         "offline_storage_path": "model-endpoints/{kind}",
         # Default http path that points to the monitoring stream nuclio function. Will be used as a stream path
         # when the user is working in CE environment and has not provided any stream path.
-        "default_http_sink": "http://nuclio-{project}-model-monitoring-stream.
-        "default_http_sink_app": "http://nuclio-{project}-{application_name}.
+        "default_http_sink": "http://nuclio-{project}-model-monitoring-stream.{namespace}.svc.cluster.local:8080",
+        "default_http_sink_app": "http://nuclio-{project}-{application_name}.{namespace}.svc.cluster.local:8080",
         "batch_processing_function_branch": "master",
         "parquet_batching_max_events": 10_000,
         "parquet_batching_timeout_secs": timedelta(minutes=1).total_seconds(),
@@ -605,7 +611,7 @@ default_config = {
     "workflows": {
         "default_workflow_runner_name": "workflow-runner-{}",
         # Default timeout seconds for retrieving workflow id after execution:
-        "timeouts": {"local": 120, "kfp": 30, "remote":
+        "timeouts": {"local": 120, "kfp": 30, "remote": 90},
     },
     "log_collector": {
         "address": "localhost:8282",
@@ -957,10 +963,10 @@ class Config:
                 with_gpu = (
                     with_gpu_requests if requirement == "requests" else with_gpu_limits
                 )
-                resources[
-
-
-
+                resources[requirement] = (
+                    self.get_default_function_pod_requirement_resources(
+                        requirement, with_gpu
+                    )
                 )
         return resources
 
@@ -1053,7 +1059,7 @@ class Config:
         kind: str = "",
         target: str = "online",
         artifact_path: str = None,
-
+        function_name: str = None,
     ) -> str:
         """Get the full path from the configuration based on the provided project and kind.
 
@@ -1068,7 +1074,7 @@ class Config:
                                 artifact path instead.
         :param artifact_path:   Optional artifact path that will be used as a relative path. If not provided, the
                                 relative artifact path will be taken from the global MLRun artifact path.
-        :param
+        :param function_name:   Application name, None for model_monitoring_stream.
 
         :return:                Full configured path for the provided kind.
         """
@@ -1082,20 +1088,19 @@ class Config:
             return store_prefix_dict[kind].format(project=project)
 
         if (
-
+            function_name
+            and function_name
            != mlrun.common.schemas.model_monitoring.constants.MonitoringFunctionNames.STREAM
         ):
             return mlrun.mlconf.model_endpoint_monitoring.store_prefixes.user_space.format(
                 project=project,
                 kind=kind
-                if
-                else f"{kind}-{
+                if function_name is None
+                else f"{kind}-{function_name.lower()}",
             )
         return mlrun.mlconf.model_endpoint_monitoring.store_prefixes.default.format(
             project=project,
-            kind=kind
-            if application_name is None
-            else f"{kind}-{application_name.lower()}",
+            kind=kind,
         )
 
         # Get the current offline path from the configuration
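The monitoring http sink templates now take the namespace as well, so the CE stream paths resolve to fully qualified in-cluster service URLs. A quick illustration of how the new template resolves (project and namespace values are made up):

template = "http://nuclio-{project}-model-monitoring-stream.{namespace}.svc.cluster.local:8080"
print(template.format(project="my-proj", namespace="mlrun"))
# http://nuclio-my-proj-model-monitoring-stream.mlrun.svc.cluster.local:8080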
mlrun/datastore/azure_blob.py
CHANGED
@@ -175,9 +175,9 @@ class AzureBlobStore(DataStore):
 
         if "client_secret" in st or "client_id" in st or "tenant_id" in st:
             res[f"spark.hadoop.fs.azure.account.auth.type.{host}"] = "OAuth"
-            res[
-
-
+            res[f"spark.hadoop.fs.azure.account.oauth.provider.type.{host}"] = (
+                "org.apache.hadoop.fs.azurebfs.oauth2.ClientCredsTokenProvider"
+            )
             if "client_id" in st:
                 res[f"spark.hadoop.fs.azure.account.oauth2.client.id.{host}"] = st[
                     "client_id"
@@ -188,14 +188,27 @@ class AzureBlobStore(DataStore):
                 ]
             if "tenant_id" in st:
                 tenant_id = st["tenant_id"]
-                res[
-                    f"
-
+                res[f"spark.hadoop.fs.azure.account.oauth2.client.endpoint.{host}"] = (
+                    f"https://login.microsoftonline.com/{tenant_id}/oauth2/token"
+                )
 
         if "sas_token" in st:
             res[f"spark.hadoop.fs.azure.account.auth.type.{host}"] = "SAS"
-            res[
-
-
+            res[f"spark.hadoop.fs.azure.sas.token.provider.type.{host}"] = (
+                "org.apache.hadoop.fs.azurebfs.sas.FixedSASTokenProvider"
+            )
             res[f"spark.hadoop.fs.azure.sas.fixed.token.{host}"] = st["sas_token"]
         return res
+
+    @property
+    def spark_url(self):
+        spark_options = self.get_spark_options()
+        url = f"wasbs://{self.endpoint}"
+        prefix = "spark.hadoop.fs.azure.account.key."
+        if spark_options:
+            for key in spark_options:
+                if key.startswith(prefix):
+                    account_key = key[len(prefix) :]
+                    url += f"@{account_key}"
+                    break
+        return url
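The new AzureBlobStore.spark_url builds a wasbs URL by pulling the storage account host out of whichever account-key entry is present in the Spark options. A standalone sketch of that lookup with invented container and account names:

# invented container ("mycontainer") and account host
spark_options = {
    "spark.hadoop.fs.azure.account.key.myaccount.blob.core.windows.net": "<key>",
}
prefix = "spark.hadoop.fs.azure.account.key."
url = "wasbs://mycontainer"  # endpoint is the container name
for key in spark_options:
    if key.startswith(prefix):
        url += f"@{key[len(prefix):]}"
        break
print(url)  # wasbs://mycontainer@myaccount.blob.core.windows.net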
mlrun/datastore/base.py
CHANGED
@@ -147,6 +147,10 @@ class DataStore:
     def url(self):
         return f"{self.kind}://{self.endpoint}"
 
+    @property
+    def spark_url(self):
+        return self.url
+
     def get(self, key, size=None, offset=0):
         pass
 
@@ -320,31 +324,17 @@ class DataStore:
             raise Exception(f"File type unhandled {url}")
 
         if file_system:
-
-
-
-
-
-
-
-
-
-
-
-            # Pass the underlying file system
-            kwargs["filesystem"] = file_system
-        elif storage_options:
-            kwargs["storage_options"] = storage_options
-            df = reader(url, **kwargs)
-        else:
-            file = url
-            # Workaround for ARROW-12472 affecting pyarrow 3.x and 4.x.
-            if file_system.protocol != "file":
-                # If not dir, use file_system.open() to avoid regression when pandas < 1.2 and does not
-                # support the storage_options parameter.
-                file = file_system.open(url)
-
-            df = reader(file, **kwargs)
+            storage_options = self.get_storage_options()
+            if url.startswith("ds://"):
+                parsed_url = urllib.parse.urlparse(url)
+                url = parsed_url.path
+                if self.using_bucket:
+                    url = url[1:]
+                # Pass the underlying file system
+                kwargs["filesystem"] = file_system
+            elif storage_options:
+                kwargs["storage_options"] = storage_options
+            df = reader(url, **kwargs)
         else:
             temp_file = tempfile.NamedTemporaryFile(delete=False)
             self.download(self._join(subpath), temp_file.name)
mlrun/datastore/datastore.py
CHANGED
@@ -94,6 +94,10 @@ def schema_to_store(schema):
         from .dbfs_store import DBFSStore
 
         return DBFSStore
+    elif schema == "hdfs":
+        from .hdfs import HdfsStore
+
+        return HdfsStore
     else:
         raise ValueError(f"unsupported store scheme ({schema})")
 
@@ -170,7 +174,7 @@ class StoreManager:
             raise mlrun.errors.MLRunInvalidArgumentError(
                 f"resource {url} does not have a valid/persistent offline target"
             )
-        return resource, target
+        return resource, target or ""
 
     def object(
         self, url, key="", project="", allow_empty_resources=None, secrets: dict = None
@@ -182,14 +186,21 @@ class StoreManager:
             url, project, allow_empty_resources, secrets
         )
 
-        store, subpath = self.get_or_create_store(
+        store, subpath, url = self.get_or_create_store(
             url, secrets=secrets, project_name=project
         )
-        return DataItem(
+        return DataItem(
+            key,
+            store,
+            subpath,
+            url,
+            meta=meta,
+            artifact_url=artifact_url,
+        )
 
     def get_or_create_store(
         self, url, secrets: dict = None, project_name=""
-    ) -> (DataStore, str):
+    ) -> (DataStore, str, str):
         schema, endpoint, parsed_url = parse_url(url)
         subpath = parsed_url.path
         store_key = f"{schema}://{endpoint}"
@@ -206,17 +217,17 @@ class StoreManager:
 
         if schema == "memory":
             subpath = url[len("memory://") :]
-            return in_memory_store, subpath
+            return in_memory_store, subpath, url
 
         if not schema and endpoint:
             if endpoint in self._stores.keys():
-                return self._stores[endpoint], subpath
+                return self._stores[endpoint], subpath, url
             else:
                 raise ValueError(f"no such store ({endpoint})")
 
         if not secrets and not mlrun.config.is_running_as_api():
             if store_key in self._stores.keys():
-                return self._stores[store_key], subpath
+                return self._stores[store_key], subpath, url
 
         # support u/p embedding in url (as done in redis) by setting netloc as the "endpoint" parameter
         # when running on server we don't cache the datastore, because there are multiple users and we don't want to
@@ -227,7 +238,7 @@ class StoreManager:
         if not secrets and not mlrun.config.is_running_as_api():
             self._stores[store_key] = store
         # in file stores in windows path like c:\a\b the drive letter is dropped from the path, so we return the url
-        return store, url if store.kind == "file" else subpath
+        return store, url if store.kind == "file" else subpath, url
 
     def reset_secrets(self):
         self._secrets = {}
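Because get_or_create_store now returns a (store, subpath, url) triple, callers unpack three values; returning the url as well is what lets file stores keep the Windows drive letter noted in the comment above. A hypothetical caller mirroring the new contract (the module-level store_manager handle and the S3 path are illustrative assumptions):

import mlrun.datastore

# assumption: mlrun.datastore.store_manager is the module-level StoreManager
store, subpath, url = mlrun.datastore.store_manager.get_or_create_store(
    "s3://my-bucket/path/to/file.parquet"
)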
mlrun/datastore/datastore_profile.py
CHANGED

@@ -132,6 +132,22 @@ class DatastoreProfileKafkaSource(DatastoreProfile):
         return attributes
 
 
+class DatastoreProfileV3io(DatastoreProfile):
+    type: str = pydantic.Field("v3io")
+    v3io_access_key: typing.Optional[str] = None
+    _private_attributes = "v3io_access_key"
+
+    def url(self, subpath):
+        subpath = subpath.lstrip("/")
+        return f"v3io:///{subpath}"
+
+    def secrets(self) -> dict:
+        res = {}
+        if self.v3io_access_key:
+            res["V3IO_ACCESS_KEY"] = self.v3io_access_key
+        return res
+
+
 class DatastoreProfileS3(DatastoreProfile):
     type: str = pydantic.Field("s3")
     _private_attributes = ("access_key_id", "secret_key")
@@ -156,7 +172,7 @@ class DatastoreProfileS3(DatastoreProfile):
             res["AWS_PROFILE"] = self.profile_name
         if self.assume_role_arn:
             res["MLRUN_AWS_ROLE_ARN"] = self.assume_role_arn
-        return res
+        return res
 
     def url(self, subpath):
         return f"s3:/{subpath}"
@@ -199,7 +215,7 @@ class DatastoreProfileRedis(DatastoreProfile):
             res["REDIS_USER"] = self.username
         if self.password:
             res["REDIS_PASSWORD"] = self.password
-        return res
+        return res
 
     def url(self, subpath):
         return self.endpoint_url + subpath
@@ -220,7 +236,7 @@ class DatastoreProfileDBFS(DatastoreProfile):
             res["DATABRICKS_TOKEN"] = self.token
         if self.endpoint_url:
             res["DATABRICKS_HOST"] = self.endpoint_url
-        return res
+        return res
 
 
 class DatastoreProfileGCS(DatastoreProfile):
@@ -247,7 +263,7 @@ class DatastoreProfileGCS(DatastoreProfile):
             res["GOOGLE_APPLICATION_CREDENTIALS"] = self.credentials_path
         if self.gcp_credentials:
             res["GCP_CREDENTIALS"] = self.gcp_credentials
-        return res
+        return res
 
 
 class DatastoreProfileAzureBlob(DatastoreProfile):
@@ -292,7 +308,31 @@ class DatastoreProfileAzureBlob(DatastoreProfile):
             res["sas_token"] = self.sas_token
         if self.credential:
             res["credential"] = self.credential
-        return res
+        return res
+
+
+class DatastoreProfileHdfs(DatastoreProfile):
+    type: str = pydantic.Field("hdfs")
+    _private_attributes = "token"
+    host: typing.Optional[str] = None
+    port: typing.Optional[int] = None
+    http_port: typing.Optional[int] = None
+    user: typing.Optional[str] = None
+
+    def secrets(self) -> dict:
+        res = {}
+        if self.host:
+            res["HDFS_HOST"] = self.host
+        if self.port:
+            res["HDFS_PORT"] = self.port
+        if self.port:
+            res["HDFS_HTTP_PORT"] = self.http_port
+        if self.user:
+            res["HDFS_USER"] = self.user
+        return res or None
+
+    def url(self, subpath):
+        return f"hdfs://{self.host}:{self.http_port}{subpath}"
 
 
 class DatastoreProfile2Json(pydantic.BaseModel):
@@ -346,6 +386,7 @@ class DatastoreProfile2Json(pydantic.BaseModel):
         decoded_dict = {k: safe_literal_eval(v) for k, v in decoded_dict.items()}
         datastore_type = decoded_dict.get("type")
         ds_profile_factory = {
+            "v3io": DatastoreProfileV3io,
             "s3": DatastoreProfileS3,
             "redis": DatastoreProfileRedis,
             "basic": DatastoreProfileBasic,
@@ -354,6 +395,7 @@ class DatastoreProfile2Json(pydantic.BaseModel):
             "dbfs": DatastoreProfileDBFS,
             "gcs": DatastoreProfileGCS,
             "az": DatastoreProfileAzureBlob,
+            "hdfs": DatastoreProfileHdfs,
         }
         if datastore_type in ds_profile_factory:
             return ds_profile_factory[datastore_type].parse_obj(decoded_dict)
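The new v3io and hdfs profiles plug into the same factory as the existing ones, so they are constructed with connection details, registered on the client, and then addressed through ds:// URLs or the profile's url() helper. A minimal sketch with invented names and hosts (the name field and the client-side registration step are assumptions based on how the other profiles are used, not shown in this diff):

from mlrun.datastore.datastore_profile import (
    DatastoreProfileHdfs,
    DatastoreProfileV3io,
)

# invented connection details
hdfs_profile = DatastoreProfileHdfs(
    name="my-hdfs", host="hdfs-namenode", port=8020, http_port=9870, user="hdfs"
)
v3io_profile = DatastoreProfileV3io(name="my-v3io", v3io_access_key="<access-key>")

print(hdfs_profile.url("/data/sample.parquet"))
# hdfs://hdfs-namenode:9870/data/sample.parquet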
mlrun/datastore/google_cloud_storage.py
CHANGED

@@ -147,13 +147,13 @@ class GoogleCloudStorageStore(DataStore):
         if "project_id" in credentials:
             res["spark.hadoop.fs.gs.project.id"] = credentials["project_id"]
         if "private_key_id" in credentials:
-            res[
-                "
-
+            res["spark.hadoop.fs.gs.auth.service.account.private.key.id"] = (
+                credentials["private_key_id"]
+            )
         if "private_key" in credentials:
-            res[
-                "
-
+            res["spark.hadoop.fs.gs.auth.service.account.private.key"] = (
+                credentials["private_key"]
+            )
         if "client_email" in credentials:
             res["spark.hadoop.fs.gs.auth.service.account.email"] = credentials[
                 "client_email"
@@ -161,3 +161,7 @@ class GoogleCloudStorageStore(DataStore):
         if "client_id" in credentials:
             res["spark.hadoop.fs.gs.client.id"] = credentials["client_id"]
         return res
+
+    @property
+    def spark_url(self):
+        return f"gs://{self.endpoint}"
mlrun/datastore/hdfs.py
ADDED
@@ -0,0 +1,51 @@
+# Copyright 2024 Iguazio
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import os
+
+import fsspec
+
+from mlrun.datastore.base import DataStore
+
+
+class HdfsStore(DataStore):
+    def __init__(self, parent, schema, name, endpoint="", secrets: dict = None):
+        super().__init__(parent, name, schema, endpoint, secrets)
+
+        self.host = self._get_secret_or_env("HDFS_HOST")
+        self.port = self._get_secret_or_env("HDFS_PORT")
+        self.http_port = self._get_secret_or_env("HDFS_HTTP_PORT")
+        self.user = self._get_secret_or_env("HDFS_USER")
+        if not self.user:
+            self.user = os.environ.get("HADOOP_USER_NAME", os.environ.get("USER"))
+
+        self._filesystem = None
+
+    @property
+    def filesystem(self):
+        if not self._filesystem:
+            self._filesystem = fsspec.filesystem(
+                "webhdfs",
+                host=self.host,
+                port=self.http_port,
+                user=self.user,
+            )
+        return self._filesystem
+
+    @property
+    def url(self):
+        return f"webhdfs://{self.host}:{self.http_port}"
+
+    @property
+    def spark_url(self):
+        return f"hdfs://{self.host}:{self.port}"
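HdfsStore delegates all I/O to fsspec's WebHDFS filesystem, so the store is roughly equivalent to the following direct fsspec usage (host, port, user, and path are placeholders):

import fsspec

# placeholders for an actual cluster; WebHDFS listens on the HTTP port
fs = fsspec.filesystem("webhdfs", host="hdfs-namenode", port=9870, user="hdfs")
with fs.open("/data/sample.csv", "rb") as f:
    head = f.read(1024)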
mlrun/datastore/redis.py
CHANGED
mlrun/datastore/s3.py
CHANGED
@@ -156,6 +156,10 @@ class S3Store(DataStore):
 
         return self._sanitize_storage_options(storage_options)
 
+    @property
+    def spark_url(self):
+        return f"s3a://{self.endpoint}"
+
     def get_bucket_and_key(self, key):
         path = self._join(key)[1:]
         return self.endpoint, path