mlrun 1.7.0rc2__py3-none-any.whl → 1.7.0rc4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of mlrun might be problematic.
Files changed (70)
  1. mlrun/artifacts/manager.py +6 -1
  2. mlrun/common/constants.py +1 -0
  3. mlrun/common/model_monitoring/helpers.py +12 -6
  4. mlrun/common/schemas/__init__.py +1 -0
  5. mlrun/common/schemas/client_spec.py +1 -0
  6. mlrun/common/schemas/common.py +40 -0
  7. mlrun/common/schemas/model_monitoring/constants.py +4 -1
  8. mlrun/common/schemas/project.py +2 -0
  9. mlrun/config.py +20 -16
  10. mlrun/datastore/azure_blob.py +22 -9
  11. mlrun/datastore/base.py +15 -25
  12. mlrun/datastore/datastore.py +19 -8
  13. mlrun/datastore/datastore_profile.py +47 -5
  14. mlrun/datastore/google_cloud_storage.py +10 -6
  15. mlrun/datastore/hdfs.py +51 -0
  16. mlrun/datastore/redis.py +4 -0
  17. mlrun/datastore/s3.py +4 -0
  18. mlrun/datastore/sources.py +31 -50
  19. mlrun/datastore/targets.py +58 -48
  20. mlrun/datastore/utils.py +2 -49
  21. mlrun/datastore/v3io.py +4 -0
  22. mlrun/db/base.py +34 -0
  23. mlrun/db/httpdb.py +71 -42
  24. mlrun/execution.py +3 -3
  25. mlrun/feature_store/feature_vector.py +2 -2
  26. mlrun/frameworks/tf_keras/callbacks/logging_callback.py +3 -3
  27. mlrun/frameworks/tf_keras/model_handler.py +7 -7
  28. mlrun/k8s_utils.py +10 -5
  29. mlrun/kfpops.py +19 -10
  30. mlrun/model.py +5 -0
  31. mlrun/model_monitoring/api.py +3 -3
  32. mlrun/model_monitoring/application.py +1 -1
  33. mlrun/model_monitoring/applications/__init__.py +13 -0
  34. mlrun/model_monitoring/applications/histogram_data_drift.py +218 -0
  35. mlrun/model_monitoring/batch.py +9 -111
  36. mlrun/model_monitoring/controller.py +73 -55
  37. mlrun/model_monitoring/controller_handler.py +13 -5
  38. mlrun/model_monitoring/features_drift_table.py +62 -53
  39. mlrun/model_monitoring/helpers.py +30 -21
  40. mlrun/model_monitoring/metrics/__init__.py +13 -0
  41. mlrun/model_monitoring/metrics/histogram_distance.py +127 -0
  42. mlrun/model_monitoring/stores/kv_model_endpoint_store.py +14 -14
  43. mlrun/model_monitoring/stores/sql_model_endpoint_store.py +0 -1
  44. mlrun/package/packagers/pandas_packagers.py +3 -3
  45. mlrun/package/utils/_archiver.py +3 -1
  46. mlrun/platforms/iguazio.py +8 -65
  47. mlrun/projects/pipelines.py +21 -11
  48. mlrun/projects/project.py +121 -42
  49. mlrun/runtimes/base.py +21 -2
  50. mlrun/runtimes/kubejob.py +5 -3
  51. mlrun/runtimes/local.py +2 -2
  52. mlrun/runtimes/mpijob/abstract.py +6 -6
  53. mlrun/runtimes/nuclio/function.py +9 -9
  54. mlrun/runtimes/nuclio/serving.py +3 -3
  55. mlrun/runtimes/pod.py +3 -3
  56. mlrun/runtimes/sparkjob/spark3job.py +3 -3
  57. mlrun/serving/remote.py +4 -2
  58. mlrun/serving/server.py +15 -18
  59. mlrun/serving/states.py +27 -12
  60. mlrun/utils/async_http.py +3 -3
  61. mlrun/utils/helpers.py +27 -5
  62. mlrun/utils/http.py +3 -3
  63. mlrun/utils/notifications/notification_pusher.py +6 -6
  64. mlrun/utils/version/version.json +2 -2
  65. {mlrun-1.7.0rc2.dist-info → mlrun-1.7.0rc4.dist-info}/METADATA +13 -16
  66. {mlrun-1.7.0rc2.dist-info → mlrun-1.7.0rc4.dist-info}/RECORD +70 -64
  67. {mlrun-1.7.0rc2.dist-info → mlrun-1.7.0rc4.dist-info}/LICENSE +0 -0
  68. {mlrun-1.7.0rc2.dist-info → mlrun-1.7.0rc4.dist-info}/WHEEL +0 -0
  69. {mlrun-1.7.0rc2.dist-info → mlrun-1.7.0rc4.dist-info}/entry_points.txt +0 -0
  70. {mlrun-1.7.0rc2.dist-info → mlrun-1.7.0rc4.dist-info}/top_level.txt +0 -0
mlrun/artifacts/manager.py CHANGED
@@ -17,7 +17,11 @@ from os.path import exists, isdir
 from urllib.parse import urlparse
 
 import mlrun.config
-from mlrun.utils.helpers import get_local_file_schema, template_artifact_path
+from mlrun.utils.helpers import (
+    get_local_file_schema,
+    template_artifact_path,
+    validate_inline_artifact_body_size,
+)
 
 from ..utils import (
     is_legacy_artifact,
@@ -212,6 +216,7 @@ class ArtifactManager:
         target_path = target_path or item.target_path
 
         validate_artifact_key_name(key, "artifact.key")
+        validate_inline_artifact_body_size(item.spec.inline)
         src_path = local_path or item.src_path  # TODO: remove src_path
         self.ensure_artifact_source_file_exists(item=item, path=src_path, body=body)
         if format == "html" or (src_path and pathlib.Path(src_path).suffix == "html"):
mlrun/common/constants.py CHANGED
@@ -13,3 +13,4 @@
 # limitations under the License.
 #
 IMAGE_NAME_ENRICH_REGISTRY_PREFIX = "."  # prefix for image name to enrich with registry
+MYSQL_MEDIUMBLOB_SIZE_BYTES = 16 * 1024 * 1024
mlrun/common/model_monitoring/helpers.py CHANGED
@@ -16,6 +16,7 @@ import sys
 import typing
 
 import mlrun.common
+import mlrun.common.schemas.model_monitoring.constants as mm_constants
 from mlrun.common.schemas.model_monitoring import (
     EndpointUID,
     FunctionURI,
@@ -64,7 +65,7 @@ def parse_model_endpoint_store_prefix(store_prefix: str):
 
 
 def parse_monitoring_stream_path(
-    stream_uri: str, project: str, application_name: str = None
+    stream_uri: str, project: str, function_name: str = None
 ):
     if stream_uri.startswith("kafka://"):
         if "?topic" in stream_uri:
@@ -72,23 +73,28 @@ def parse_monitoring_stream_path(
                 "Custom kafka topic is not allowed"
             )
         # Add topic to stream kafka uri
-        if application_name is None:
+        if (
+            function_name is None
+            or function_name == mm_constants.MonitoringFunctionNames.STREAM
+        ):
             stream_uri += f"?topic=monitoring_stream_{project}"
         else:
-            stream_uri += f"?topic=monitoring_stream_{project}_{application_name}"
+            stream_uri += f"?topic=monitoring_stream_{project}_{function_name}"
 
     elif stream_uri.startswith("v3io://") and mlrun.mlconf.is_ce_mode():
         # V3IO is not supported in CE mode, generating a default http stream path
-        if application_name is None:
+        if function_name is None:
             stream_uri = (
                 mlrun.mlconf.model_endpoint_monitoring.default_http_sink.format(
-                    project=project
+                    project=project, namespace=mlrun.mlconf.namespace
                 )
             )
         else:
             stream_uri = (
                 mlrun.mlconf.model_endpoint_monitoring.default_http_sink_app.format(
-                    project=project, application_name=application_name
+                    project=project,
+                    application_name=function_name,
+                    namespace=mlrun.mlconf.namespace,
                 )
             )
     return stream_uri
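Taken together, this hunk renames the application_name parameter to function_name and makes the topic and sink naming namespace-aware. A minimal sketch of the resulting names, assuming a hypothetical project "fraud" and the default "mlrun" namespace (the strings below only mirror the formatting logic shown above):

    project = "fraud"
    namespace = "mlrun"  # taken from mlrun.mlconf.namespace at runtime

    # Kafka: the stream function (or no function at all) shares the project-wide topic,
    # while any other monitoring function gets its own per-function topic.
    stream_topic = f"monitoring_stream_{project}"
    writer_topic = f"monitoring_stream_{project}_model-monitoring-writer"

    # CE-mode v3io fallback: the default http sink now embeds the configured namespace.
    default_sink = f"http://nuclio-{project}-model-monitoring-stream.{namespace}.svc.cluster.local:8080"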
mlrun/common/schemas/__init__.py CHANGED
@@ -43,6 +43,7 @@ from .clusterization_spec import (
     ClusterizationSpec,
     WaitForChiefToReachOnlineStateFeatureFlag,
 )
+from .common import ImageBuilder
 from .constants import (
     APIStates,
     ClusterizationRole,
mlrun/common/schemas/client_spec.py CHANGED
@@ -29,6 +29,7 @@ class ClientSpec(pydantic.BaseModel):
     ui_url: typing.Optional[str]
     artifact_path: typing.Optional[str]
     feature_store_data_prefixes: typing.Optional[dict[str, str]]
+    feature_store_default_targets: typing.Optional[str]
     spark_app_image: typing.Optional[str]
     spark_app_image_tag: typing.Optional[str]
     spark_history_server_path: typing.Optional[str]
mlrun/common/schemas/common.py ADDED
@@ -0,0 +1,40 @@
+# Copyright 2023 Iguazio
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+import typing
+
+import pydantic
+
+
+class ImageBuilder(pydantic.BaseModel):
+    functionSourceCode: typing.Optional[str] = None
+    codeEntryType: typing.Optional[str] = None
+    codeEntryAttributes: typing.Optional[str] = None
+    source: typing.Optional[str] = None
+    code_origin: typing.Optional[str] = None
+    origin_filename: typing.Optional[str] = None
+    image: typing.Optional[str] = None
+    base_image: typing.Optional[str] = None
+    commands: typing.Optional[list] = None
+    extra: typing.Optional[str] = None
+    extra_args: typing.Optional[dict] = None
+    builder_env: typing.Optional[dict] = None
+    secret: typing.Optional[str] = None
+    registry: typing.Optional[str] = None
+    load_source_on_run: typing.Optional[bool] = None
+    with_mlrun: typing.Optional[bool] = None
+    auto_build: typing.Optional[bool] = None
+    build_pod: typing.Optional[str] = None
+    requirements: typing.Optional[list] = None
+    source_code_target_dir: typing.Optional[str] = None
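A minimal usage sketch for the new schema, assuming it is imported through mlrun.common.schemas as wired up in the __init__.py change above (the field values are illustrative, not taken from the diff):

    from mlrun.common.schemas import ImageBuilder

    # every field is optional, so a partial build spec validates cleanly
    build = ImageBuilder(
        base_image="mlrun/mlrun",
        commands=["pip install scikit-learn"],
        requirements=["plotly"],
    )
    print(build.dict(exclude_none=True))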
mlrun/common/schemas/model_monitoring/constants.py CHANGED
@@ -181,7 +181,7 @@ class MonitoringFunctionNames:
     WRITER = "model-monitoring-writer"
     BATCH = "model-monitoring-batch"
     APPLICATION_CONTROLLER = "model-monitoring-controller"
-    STREAM = None
+    STREAM = "model-monitoring-stream"
 
     @staticmethod
     def all():
@@ -289,3 +289,6 @@ class ModelMonitoringAppLabel:
 
 class ControllerPolicy:
     BASE_PERIOD = "base_period"
+
+
+MLRUN_HISTOGRAM_DATA_DRIFT_APP_NAME = "histogram-data-drift"
mlrun/common/schemas/project.py CHANGED
@@ -19,6 +19,7 @@ import pydantic
 
 import mlrun.common.types
 
+from .common import ImageBuilder
 from .object import ObjectKind, ObjectStatus
 
 
@@ -85,6 +86,7 @@ class ProjectSpec(pydantic.BaseModel):
     desired_state: typing.Optional[ProjectDesiredState] = ProjectDesiredState.online
     custom_packagers: typing.Optional[list[tuple[str, bool]]] = None
     default_image: typing.Optional[str] = None
+    build: typing.Optional[ImageBuilder] = None
 
     class Config:
         extra = pydantic.Extra.allow
mlrun/config.py CHANGED
@@ -149,7 +149,6 @@ default_config = {
         "url": "",
     },
     "v3io_framesd": "http://framesd:8080",
-    "datastore": {"async_source_mode": "disabled"},
     # default node selector to be applied to all functions - json string base64 encoded format
     "default_function_node_selector": "e30=",
     # default priority class to be applied to functions running on k8s cluster
@@ -288,6 +287,12 @@ default_config = {
         "state": "online",
         "retry_api_call_on_exception": "enabled",
         "http_connection_timeout_keep_alive": 11,
+        # http client used by httpdb
+        "http": {
+            # when True, the client will verify the server's TLS
+            # set to False for backwards compatibility.
+            "verify": False,
+        },
         "db": {
             "commit_retry_timeout": 30,
             "commit_retry_interval": 3,
@@ -485,8 +490,8 @@ default_config = {
         "offline_storage_path": "model-endpoints/{kind}",
         # Default http path that points to the monitoring stream nuclio function. Will be used as a stream path
         # when the user is working in CE environment and has not provided any stream path.
-        "default_http_sink": "http://nuclio-{project}-model-monitoring-stream.mlrun.svc.cluster.local:8080",
-        "default_http_sink_app": "http://nuclio-{project}-{application_name}.mlrun.svc.cluster.local:8080",
+        "default_http_sink": "http://nuclio-{project}-model-monitoring-stream.{namespace}.svc.cluster.local:8080",
+        "default_http_sink_app": "http://nuclio-{project}-{application_name}.{namespace}.svc.cluster.local:8080",
         "batch_processing_function_branch": "master",
         "parquet_batching_max_events": 10_000,
         "parquet_batching_timeout_secs": timedelta(minutes=1).total_seconds(),
@@ -606,7 +611,7 @@ default_config = {
     "workflows": {
         "default_workflow_runner_name": "workflow-runner-{}",
         # Default timeout seconds for retrieving workflow id after execution:
-        "timeouts": {"local": 120, "kfp": 30, "remote": 30},
+        "timeouts": {"local": 120, "kfp": 30, "remote": 90},
     },
     "log_collector": {
         "address": "localhost:8282",
@@ -958,10 +963,10 @@ class Config:
             with_gpu = (
                 with_gpu_requests if requirement == "requests" else with_gpu_limits
             )
-            resources[
-                requirement
-            ] = self.get_default_function_pod_requirement_resources(
-                requirement, with_gpu
+            resources[requirement] = (
+                self.get_default_function_pod_requirement_resources(
+                    requirement, with_gpu
+                )
             )
         return resources
 
@@ -1054,7 +1059,7 @@ class Config:
         kind: str = "",
         target: str = "online",
         artifact_path: str = None,
-        application_name: str = None,
+        function_name: str = None,
     ) -> str:
         """Get the full path from the configuration based on the provided project and kind.
 
@@ -1069,7 +1074,7 @@ class Config:
                               artifact path instead.
         :param artifact_path: Optional artifact path that will be used as a relative path. If not provided, the
                               relative artifact path will be taken from the global MLRun artifact path.
-        :param application_name: Application name, None for model_monitoring_stream.
+        :param function_name: Application name, None for model_monitoring_stream.
 
         :return: Full configured path for the provided kind.
         """
@@ -1083,20 +1088,19 @@ class Config:
             return store_prefix_dict[kind].format(project=project)
 
         if (
-            application_name
+            function_name
+            and function_name
             != mlrun.common.schemas.model_monitoring.constants.MonitoringFunctionNames.STREAM
         ):
             return mlrun.mlconf.model_endpoint_monitoring.store_prefixes.user_space.format(
                 project=project,
                 kind=kind
-                if application_name is None
-                else f"{kind}-{application_name.lower()}",
+                if function_name is None
+                else f"{kind}-{function_name.lower()}",
             )
         return mlrun.mlconf.model_endpoint_monitoring.store_prefixes.default.format(
             project=project,
-            kind=kind
-            if application_name is None
-            else f"{kind}-{application_name.lower()}",
+            kind=kind,
        )
 
         # Get the current offline path from the configuration
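A minimal sketch of opting in to the new client-side TLS verification flag; the attribute path mirrors the httpdb.http.verify entry added above, and setting it through mlconf at runtime is just one way to override the default:

    import mlrun

    # default is False for backwards compatibility
    mlrun.mlconf.httpdb.http.verify = True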
mlrun/datastore/azure_blob.py CHANGED
@@ -175,9 +175,9 @@ class AzureBlobStore(DataStore):
 
         if "client_secret" in st or "client_id" in st or "tenant_id" in st:
             res[f"spark.hadoop.fs.azure.account.auth.type.{host}"] = "OAuth"
-            res[
-                f"spark.hadoop.fs.azure.account.oauth.provider.type.{host}"
-            ] = "org.apache.hadoop.fs.azurebfs.oauth2.ClientCredsTokenProvider"
+            res[f"spark.hadoop.fs.azure.account.oauth.provider.type.{host}"] = (
+                "org.apache.hadoop.fs.azurebfs.oauth2.ClientCredsTokenProvider"
+            )
             if "client_id" in st:
                 res[f"spark.hadoop.fs.azure.account.oauth2.client.id.{host}"] = st[
                     "client_id"
@@ -188,14 +188,27 @@ class AzureBlobStore(DataStore):
                 ]
             if "tenant_id" in st:
                 tenant_id = st["tenant_id"]
-                res[
-                    f"spark.hadoop.fs.azure.account.oauth2.client.endpoint.{host}"
-                ] = f"https://login.microsoftonline.com/{tenant_id}/oauth2/token"
+                res[f"spark.hadoop.fs.azure.account.oauth2.client.endpoint.{host}"] = (
+                    f"https://login.microsoftonline.com/{tenant_id}/oauth2/token"
+                )
 
         if "sas_token" in st:
             res[f"spark.hadoop.fs.azure.account.auth.type.{host}"] = "SAS"
-            res[
-                f"spark.hadoop.fs.azure.sas.token.provider.type.{host}"
-            ] = "org.apache.hadoop.fs.azurebfs.sas.FixedSASTokenProvider"
+            res[f"spark.hadoop.fs.azure.sas.token.provider.type.{host}"] = (
+                "org.apache.hadoop.fs.azurebfs.sas.FixedSASTokenProvider"
+            )
             res[f"spark.hadoop.fs.azure.sas.fixed.token.{host}"] = st["sas_token"]
         return res
+
+    @property
+    def spark_url(self):
+        spark_options = self.get_spark_options()
+        url = f"wasbs://{self.endpoint}"
+        prefix = "spark.hadoop.fs.azure.account.key."
+        if spark_options:
+            for key in spark_options:
+                if key.startswith(prefix):
+                    account_key = key[len(prefix) :]
+                    url += f"@{account_key}"
+                    break
+        return url
mlrun/datastore/base.py CHANGED
@@ -147,6 +147,10 @@ class DataStore:
     def url(self):
         return f"{self.kind}://{self.endpoint}"
 
+    @property
+    def spark_url(self):
+        return self.url
+
     def get(self, key, size=None, offset=0):
         pass
 
@@ -320,31 +324,17 @@ class DataStore:
             raise Exception(f"File type unhandled {url}")
 
         if file_system:
-            if (
-                self.supports_isdir()
-                and file_system.isdir(file_url)
-                or self._is_dd(df_module)
-            ):
-                storage_options = self.get_storage_options()
-                if url.startswith("ds://"):
-                    parsed_url = urllib.parse.urlparse(url)
-                    url = parsed_url.path
-                    if self.using_bucket:
-                        url = url[1:]
-                    # Pass the underlying file system
-                    kwargs["filesystem"] = file_system
-                elif storage_options:
-                    kwargs["storage_options"] = storage_options
-                df = reader(url, **kwargs)
-            else:
-                file = url
-                # Workaround for ARROW-12472 affecting pyarrow 3.x and 4.x.
-                if file_system.protocol != "file":
-                    # If not dir, use file_system.open() to avoid regression when pandas < 1.2 and does not
-                    # support the storage_options parameter.
-                    file = file_system.open(url)
-
-                df = reader(file, **kwargs)
+            storage_options = self.get_storage_options()
+            if url.startswith("ds://"):
+                parsed_url = urllib.parse.urlparse(url)
+                url = parsed_url.path
+                if self.using_bucket:
+                    url = url[1:]
+                # Pass the underlying file system
+                kwargs["filesystem"] = file_system
+            elif storage_options:
+                kwargs["storage_options"] = storage_options
+            df = reader(url, **kwargs)
         else:
             temp_file = tempfile.NamedTemporaryFile(delete=False)
             self.download(self._join(subpath), temp_file.name)
mlrun/datastore/datastore.py CHANGED
@@ -94,6 +94,10 @@ def schema_to_store(schema):
         from .dbfs_store import DBFSStore
 
         return DBFSStore
+    elif schema == "hdfs":
+        from .hdfs import HdfsStore
+
+        return HdfsStore
     else:
         raise ValueError(f"unsupported store scheme ({schema})")
 
@@ -170,7 +174,7 @@ class StoreManager:
             raise mlrun.errors.MLRunInvalidArgumentError(
                 f"resource {url} does not have a valid/persistent offline target"
             )
-        return resource, target
+        return resource, target or ""
 
     def object(
         self, url, key="", project="", allow_empty_resources=None, secrets: dict = None
@@ -182,14 +186,21 @@ class StoreManager:
                 url, project, allow_empty_resources, secrets
             )
 
-        store, subpath = self.get_or_create_store(
+        store, subpath, url = self.get_or_create_store(
             url, secrets=secrets, project_name=project
         )
-        return DataItem(key, store, subpath, url, meta=meta, artifact_url=artifact_url)
+        return DataItem(
+            key,
+            store,
+            subpath,
+            url,
+            meta=meta,
+            artifact_url=artifact_url,
+        )
 
     def get_or_create_store(
         self, url, secrets: dict = None, project_name=""
-    ) -> (DataStore, str):
+    ) -> (DataStore, str, str):
         schema, endpoint, parsed_url = parse_url(url)
         subpath = parsed_url.path
         store_key = f"{schema}://{endpoint}"
@@ -206,17 +217,17 @@ class StoreManager:
 
         if schema == "memory":
             subpath = url[len("memory://") :]
-            return in_memory_store, subpath
+            return in_memory_store, subpath, url
 
         if not schema and endpoint:
             if endpoint in self._stores.keys():
-                return self._stores[endpoint], subpath
+                return self._stores[endpoint], subpath, url
             else:
                 raise ValueError(f"no such store ({endpoint})")
 
         if not secrets and not mlrun.config.is_running_as_api():
             if store_key in self._stores.keys():
-                return self._stores[store_key], subpath
+                return self._stores[store_key], subpath, url
 
         # support u/p embedding in url (as done in redis) by setting netloc as the "endpoint" parameter
         # when running on server we don't cache the datastore, because there are multiple users and we don't want to
@@ -227,7 +238,7 @@ class StoreManager:
         if not secrets and not mlrun.config.is_running_as_api():
             self._stores[store_key] = store
         # in file stores in windows path like c:\a\b the drive letter is dropped from the path, so we return the url
-        return store, url if store.kind == "file" else subpath
+        return store, url if store.kind == "file" else subpath, url
 
     def reset_secrets(self):
         self._secrets = {}
mlrun/datastore/datastore_profile.py CHANGED
@@ -132,6 +132,22 @@ class DatastoreProfileKafkaSource(DatastoreProfile):
         return attributes
 
 
+class DatastoreProfileV3io(DatastoreProfile):
+    type: str = pydantic.Field("v3io")
+    v3io_access_key: typing.Optional[str] = None
+    _private_attributes = "v3io_access_key"
+
+    def url(self, subpath):
+        subpath = subpath.lstrip("/")
+        return f"v3io:///{subpath}"
+
+    def secrets(self) -> dict:
+        res = {}
+        if self.v3io_access_key:
+            res["V3IO_ACCESS_KEY"] = self.v3io_access_key
+        return res
+
+
 class DatastoreProfileS3(DatastoreProfile):
     type: str = pydantic.Field("s3")
     _private_attributes = ("access_key_id", "secret_key")
@@ -156,7 +172,7 @@ class DatastoreProfileS3(DatastoreProfile):
             res["AWS_PROFILE"] = self.profile_name
         if self.assume_role_arn:
             res["MLRUN_AWS_ROLE_ARN"] = self.assume_role_arn
-        return res if res else None
+        return res
 
     def url(self, subpath):
         return f"s3:/{subpath}"
@@ -199,7 +215,7 @@ class DatastoreProfileRedis(DatastoreProfile):
             res["REDIS_USER"] = self.username
         if self.password:
             res["REDIS_PASSWORD"] = self.password
-        return res if res else None
+        return res
 
     def url(self, subpath):
         return self.endpoint_url + subpath
@@ -220,7 +236,7 @@ class DatastoreProfileDBFS(DatastoreProfile):
             res["DATABRICKS_TOKEN"] = self.token
         if self.endpoint_url:
             res["DATABRICKS_HOST"] = self.endpoint_url
-        return res if res else None
+        return res
 
 
 class DatastoreProfileGCS(DatastoreProfile):
@@ -247,7 +263,7 @@ class DatastoreProfileGCS(DatastoreProfile):
             res["GOOGLE_APPLICATION_CREDENTIALS"] = self.credentials_path
         if self.gcp_credentials:
             res["GCP_CREDENTIALS"] = self.gcp_credentials
-        return res if res else None
+        return res
 
 
 class DatastoreProfileAzureBlob(DatastoreProfile):
@@ -292,7 +308,31 @@ class DatastoreProfileAzureBlob(DatastoreProfile):
             res["sas_token"] = self.sas_token
         if self.credential:
             res["credential"] = self.credential
-        return res if res else None
+        return res
+
+
+class DatastoreProfileHdfs(DatastoreProfile):
+    type: str = pydantic.Field("hdfs")
+    _private_attributes = "token"
+    host: typing.Optional[str] = None
+    port: typing.Optional[int] = None
+    http_port: typing.Optional[int] = None
+    user: typing.Optional[str] = None
+
+    def secrets(self) -> dict:
+        res = {}
+        if self.host:
+            res["HDFS_HOST"] = self.host
+        if self.port:
+            res["HDFS_PORT"] = self.port
+        if self.port:
+            res["HDFS_HTTP_PORT"] = self.http_port
+        if self.user:
+            res["HDFS_USER"] = self.user
+        return res or None
+
+    def url(self, subpath):
+        return f"hdfs://{self.host}:{self.http_port}{subpath}"
 
 
 class DatastoreProfile2Json(pydantic.BaseModel):
@@ -346,6 +386,7 @@ class DatastoreProfile2Json(pydantic.BaseModel):
         decoded_dict = {k: safe_literal_eval(v) for k, v in decoded_dict.items()}
         datastore_type = decoded_dict.get("type")
         ds_profile_factory = {
+            "v3io": DatastoreProfileV3io,
             "s3": DatastoreProfileS3,
             "redis": DatastoreProfileRedis,
             "basic": DatastoreProfileBasic,
@@ -354,6 +395,7 @@ class DatastoreProfile2Json(pydantic.BaseModel):
             "dbfs": DatastoreProfileDBFS,
             "gcs": DatastoreProfileGCS,
             "az": DatastoreProfileAzureBlob,
+            "hdfs": DatastoreProfileHdfs,
         }
         if datastore_type in ds_profile_factory:
            return ds_profile_factory[datastore_type].parse_obj(decoded_dict)
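A minimal sketch of the two new profile types, assuming the base profile's name field and placeholder endpoints; the url() results follow directly from the implementations above (including the use of http_port in the HDFS URL):

    from mlrun.datastore.datastore_profile import (
        DatastoreProfileHdfs,
        DatastoreProfileV3io,
    )

    v3io_profile = DatastoreProfileV3io(name="my-v3io", v3io_access_key="<access-key>")
    print(v3io_profile.url("/projects/data.parquet"))  # v3io:///projects/data.parquet

    hdfs_profile = DatastoreProfileHdfs(
        name="my-hdfs", host="namenode.example.com", port=8020, http_port=9870
    )
    print(hdfs_profile.url("/data/set.csv"))  # hdfs://namenode.example.com:9870/data/set.csv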
mlrun/datastore/google_cloud_storage.py CHANGED
@@ -147,13 +147,13 @@ class GoogleCloudStorageStore(DataStore):
         if "project_id" in credentials:
             res["spark.hadoop.fs.gs.project.id"] = credentials["project_id"]
         if "private_key_id" in credentials:
-            res[
-                "spark.hadoop.fs.gs.auth.service.account.private.key.id"
-            ] = credentials["private_key_id"]
+            res["spark.hadoop.fs.gs.auth.service.account.private.key.id"] = (
+                credentials["private_key_id"]
+            )
         if "private_key" in credentials:
-            res[
-                "spark.hadoop.fs.gs.auth.service.account.private.key"
-            ] = credentials["private_key"]
+            res["spark.hadoop.fs.gs.auth.service.account.private.key"] = (
+                credentials["private_key"]
+            )
         if "client_email" in credentials:
             res["spark.hadoop.fs.gs.auth.service.account.email"] = credentials[
                 "client_email"
@@ -161,3 +161,7 @@ class GoogleCloudStorageStore(DataStore):
         if "client_id" in credentials:
             res["spark.hadoop.fs.gs.client.id"] = credentials["client_id"]
         return res
+
+    @property
+    def spark_url(self):
+        return f"gs://{self.endpoint}"
mlrun/datastore/hdfs.py ADDED
@@ -0,0 +1,51 @@
+# Copyright 2024 Iguazio
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import os
+
+import fsspec
+
+from mlrun.datastore.base import DataStore
+
+
+class HdfsStore(DataStore):
+    def __init__(self, parent, schema, name, endpoint="", secrets: dict = None):
+        super().__init__(parent, name, schema, endpoint, secrets)
+
+        self.host = self._get_secret_or_env("HDFS_HOST")
+        self.port = self._get_secret_or_env("HDFS_PORT")
+        self.http_port = self._get_secret_or_env("HDFS_HTTP_PORT")
+        self.user = self._get_secret_or_env("HDFS_USER")
+        if not self.user:
+            self.user = os.environ.get("HADOOP_USER_NAME", os.environ.get("USER"))
+
+        self._filesystem = None
+
+    @property
+    def filesystem(self):
+        if not self._filesystem:
+            self._filesystem = fsspec.filesystem(
+                "webhdfs",
+                host=self.host,
+                port=self.http_port,
+                user=self.user,
+            )
+        return self._filesystem
+
+    @property
+    def url(self):
+        return f"webhdfs://{self.host}:{self.http_port}"
+
+    @property
+    def spark_url(self):
+        return f"hdfs://{self.host}:{self.port}"
mlrun/datastore/redis.py CHANGED
@@ -163,3 +163,7 @@ class RedisStore(DataStore):
                 self.redis.delete(k)
         else:
             self.redis.delete(key)
+
+    @property
+    def spark_url(self):
+        return ""
mlrun/datastore/s3.py CHANGED
@@ -156,6 +156,10 @@ class S3Store(DataStore):
 
         return self._sanitize_storage_options(storage_options)
 
+    @property
+    def spark_url(self):
+        return f"s3a://{self.endpoint}"
+
     def get_bucket_and_key(self, key):
         path = self._join(key)[1:]
         return self.endpoint, path
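For reference, a short sketch of the scheme translation the new spark_url properties perform across the stores touched in this release (bucket and host names are placeholders):

    # Scheme translation performed by the new spark_url properties (illustrative endpoints).
    spark_urls = {
        "s3://my-bucket": "s3a://my-bucket",      # S3Store
        "gcs://my-bucket": "gs://my-bucket",      # GoogleCloudStorageStore
        "webhdfs://nn:9870": "hdfs://nn:8020",    # HdfsStore (data port, not the WebHDFS port)
        "redis://cache:6379": "",                 # RedisStore exposes no Spark URL
    }
    print(spark_urls)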