mlrun 1.7.0rc2__py3-none-any.whl → 1.7.0rc4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of mlrun might be problematic.
Files changed (70)
  1. mlrun/artifacts/manager.py +6 -1
  2. mlrun/common/constants.py +1 -0
  3. mlrun/common/model_monitoring/helpers.py +12 -6
  4. mlrun/common/schemas/__init__.py +1 -0
  5. mlrun/common/schemas/client_spec.py +1 -0
  6. mlrun/common/schemas/common.py +40 -0
  7. mlrun/common/schemas/model_monitoring/constants.py +4 -1
  8. mlrun/common/schemas/project.py +2 -0
  9. mlrun/config.py +20 -16
  10. mlrun/datastore/azure_blob.py +22 -9
  11. mlrun/datastore/base.py +15 -25
  12. mlrun/datastore/datastore.py +19 -8
  13. mlrun/datastore/datastore_profile.py +47 -5
  14. mlrun/datastore/google_cloud_storage.py +10 -6
  15. mlrun/datastore/hdfs.py +51 -0
  16. mlrun/datastore/redis.py +4 -0
  17. mlrun/datastore/s3.py +4 -0
  18. mlrun/datastore/sources.py +31 -50
  19. mlrun/datastore/targets.py +58 -48
  20. mlrun/datastore/utils.py +2 -49
  21. mlrun/datastore/v3io.py +4 -0
  22. mlrun/db/base.py +34 -0
  23. mlrun/db/httpdb.py +71 -42
  24. mlrun/execution.py +3 -3
  25. mlrun/feature_store/feature_vector.py +2 -2
  26. mlrun/frameworks/tf_keras/callbacks/logging_callback.py +3 -3
  27. mlrun/frameworks/tf_keras/model_handler.py +7 -7
  28. mlrun/k8s_utils.py +10 -5
  29. mlrun/kfpops.py +19 -10
  30. mlrun/model.py +5 -0
  31. mlrun/model_monitoring/api.py +3 -3
  32. mlrun/model_monitoring/application.py +1 -1
  33. mlrun/model_monitoring/applications/__init__.py +13 -0
  34. mlrun/model_monitoring/applications/histogram_data_drift.py +218 -0
  35. mlrun/model_monitoring/batch.py +9 -111
  36. mlrun/model_monitoring/controller.py +73 -55
  37. mlrun/model_monitoring/controller_handler.py +13 -5
  38. mlrun/model_monitoring/features_drift_table.py +62 -53
  39. mlrun/model_monitoring/helpers.py +30 -21
  40. mlrun/model_monitoring/metrics/__init__.py +13 -0
  41. mlrun/model_monitoring/metrics/histogram_distance.py +127 -0
  42. mlrun/model_monitoring/stores/kv_model_endpoint_store.py +14 -14
  43. mlrun/model_monitoring/stores/sql_model_endpoint_store.py +0 -1
  44. mlrun/package/packagers/pandas_packagers.py +3 -3
  45. mlrun/package/utils/_archiver.py +3 -1
  46. mlrun/platforms/iguazio.py +8 -65
  47. mlrun/projects/pipelines.py +21 -11
  48. mlrun/projects/project.py +121 -42
  49. mlrun/runtimes/base.py +21 -2
  50. mlrun/runtimes/kubejob.py +5 -3
  51. mlrun/runtimes/local.py +2 -2
  52. mlrun/runtimes/mpijob/abstract.py +6 -6
  53. mlrun/runtimes/nuclio/function.py +9 -9
  54. mlrun/runtimes/nuclio/serving.py +3 -3
  55. mlrun/runtimes/pod.py +3 -3
  56. mlrun/runtimes/sparkjob/spark3job.py +3 -3
  57. mlrun/serving/remote.py +4 -2
  58. mlrun/serving/server.py +15 -18
  59. mlrun/serving/states.py +27 -12
  60. mlrun/utils/async_http.py +3 -3
  61. mlrun/utils/helpers.py +27 -5
  62. mlrun/utils/http.py +3 -3
  63. mlrun/utils/notifications/notification_pusher.py +6 -6
  64. mlrun/utils/version/version.json +2 -2
  65. {mlrun-1.7.0rc2.dist-info → mlrun-1.7.0rc4.dist-info}/METADATA +13 -16
  66. {mlrun-1.7.0rc2.dist-info → mlrun-1.7.0rc4.dist-info}/RECORD +70 -64
  67. {mlrun-1.7.0rc2.dist-info → mlrun-1.7.0rc4.dist-info}/LICENSE +0 -0
  68. {mlrun-1.7.0rc2.dist-info → mlrun-1.7.0rc4.dist-info}/WHEEL +0 -0
  69. {mlrun-1.7.0rc2.dist-info → mlrun-1.7.0rc4.dist-info}/entry_points.txt +0 -0
  70. {mlrun-1.7.0rc2.dist-info → mlrun-1.7.0rc4.dist-info}/top_level.txt +0 -0
mlrun/artifacts/manager.py CHANGED
@@ -17,7 +17,11 @@ from os.path import exists, isdir
 from urllib.parse import urlparse
 
 import mlrun.config
-from mlrun.utils.helpers import get_local_file_schema, template_artifact_path
+from mlrun.utils.helpers import (
+    get_local_file_schema,
+    template_artifact_path,
+    validate_inline_artifact_body_size,
+)
 
 from ..utils import (
     is_legacy_artifact,
@@ -212,6 +216,7 @@ class ArtifactManager:
         target_path = target_path or item.target_path
 
         validate_artifact_key_name(key, "artifact.key")
+        validate_inline_artifact_body_size(item.spec.inline)
         src_path = local_path or item.src_path  # TODO: remove src_path
         self.ensure_artifact_source_file_exists(item=item, path=src_path, body=body)
         if format == "html" or (src_path and pathlib.Path(src_path).suffix == "html"):
mlrun/common/constants.py CHANGED
@@ -13,3 +13,4 @@
 # limitations under the License.
 #
 IMAGE_NAME_ENRICH_REGISTRY_PREFIX = "."  # prefix for image name to enrich with registry
+MYSQL_MEDIUMBLOB_SIZE_BYTES = 16 * 1024 * 1024
mlrun/common/model_monitoring/helpers.py CHANGED
@@ -16,6 +16,7 @@ import sys
 import typing
 
 import mlrun.common
+import mlrun.common.schemas.model_monitoring.constants as mm_constants
 from mlrun.common.schemas.model_monitoring import (
     EndpointUID,
     FunctionURI,
@@ -64,7 +65,7 @@ def parse_model_endpoint_store_prefix(store_prefix: str):
 
 
 def parse_monitoring_stream_path(
-    stream_uri: str, project: str, application_name: str = None
+    stream_uri: str, project: str, function_name: str = None
 ):
     if stream_uri.startswith("kafka://"):
         if "?topic" in stream_uri:
@@ -72,23 +73,28 @@ def parse_monitoring_stream_path(
                 "Custom kafka topic is not allowed"
             )
         # Add topic to stream kafka uri
-        if application_name is None:
+        if (
+            function_name is None
+            or function_name == mm_constants.MonitoringFunctionNames.STREAM
+        ):
             stream_uri += f"?topic=monitoring_stream_{project}"
         else:
-            stream_uri += f"?topic=monitoring_stream_{project}_{application_name}"
+            stream_uri += f"?topic=monitoring_stream_{project}_{function_name}"
 
     elif stream_uri.startswith("v3io://") and mlrun.mlconf.is_ce_mode():
         # V3IO is not supported in CE mode, generating a default http stream path
-        if application_name is None:
+        if function_name is None:
             stream_uri = (
                 mlrun.mlconf.model_endpoint_monitoring.default_http_sink.format(
-                    project=project
+                    project=project, namespace=mlrun.mlconf.namespace
                 )
             )
         else:
             stream_uri = (
                 mlrun.mlconf.model_endpoint_monitoring.default_http_sink_app.format(
-                    project=project, application_name=application_name
+                    project=project,
+                    application_name=function_name,
+                    namespace=mlrun.mlconf.namespace,
                 )
             )
     return stream_uri
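Taken together, this hunk renames the application_name parameter to function_name and makes the topic and sink naming namespace-aware. A minimal sketch of the resulting names, assuming a hypothetical project "fraud" and the default "mlrun" namespace (the strings below only mirror the formatting logic shown above):

    project = "fraud"
    namespace = "mlrun"  # taken from mlrun.mlconf.namespace at runtime

    # Kafka: the stream function (or no function at all) shares the project-wide topic,
    # while any other monitoring function gets its own per-function topic.
    stream_topic = f"monitoring_stream_{project}"
    writer_topic = f"monitoring_stream_{project}_model-monitoring-writer"

    # CE-mode v3io fallback: the default http sink now embeds the configured namespace.
    default_sink = f"http://nuclio-{project}-model-monitoring-stream.{namespace}.svc.cluster.local:8080"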
mlrun/common/schemas/__init__.py CHANGED
@@ -43,6 +43,7 @@ from .clusterization_spec import (
     ClusterizationSpec,
     WaitForChiefToReachOnlineStateFeatureFlag,
 )
+from .common import ImageBuilder
 from .constants import (
     APIStates,
     ClusterizationRole,
mlrun/common/schemas/client_spec.py CHANGED
@@ -29,6 +29,7 @@ class ClientSpec(pydantic.BaseModel):
     ui_url: typing.Optional[str]
     artifact_path: typing.Optional[str]
     feature_store_data_prefixes: typing.Optional[dict[str, str]]
+    feature_store_default_targets: typing.Optional[str]
     spark_app_image: typing.Optional[str]
     spark_app_image_tag: typing.Optional[str]
     spark_history_server_path: typing.Optional[str]
mlrun/common/schemas/common.py ADDED
@@ -0,0 +1,40 @@
+# Copyright 2023 Iguazio
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+import typing
+
+import pydantic
+
+
+class ImageBuilder(pydantic.BaseModel):
+    functionSourceCode: typing.Optional[str] = None
+    codeEntryType: typing.Optional[str] = None
+    codeEntryAttributes: typing.Optional[str] = None
+    source: typing.Optional[str] = None
+    code_origin: typing.Optional[str] = None
+    origin_filename: typing.Optional[str] = None
+    image: typing.Optional[str] = None
+    base_image: typing.Optional[str] = None
+    commands: typing.Optional[list] = None
+    extra: typing.Optional[str] = None
+    extra_args: typing.Optional[dict] = None
+    builder_env: typing.Optional[dict] = None
+    secret: typing.Optional[str] = None
+    registry: typing.Optional[str] = None
+    load_source_on_run: typing.Optional[bool] = None
+    with_mlrun: typing.Optional[bool] = None
+    auto_build: typing.Optional[bool] = None
+    build_pod: typing.Optional[str] = None
+    requirements: typing.Optional[list] = None
+    source_code_target_dir: typing.Optional[str] = None
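A minimal usage sketch for the new schema, assuming it is imported through mlrun.common.schemas as wired up in the __init__.py change above (the field values are illustrative, not taken from the diff):

    from mlrun.common.schemas import ImageBuilder

    # every field is optional, so a partial build spec validates cleanly
    build = ImageBuilder(
        base_image="mlrun/mlrun",
        commands=["pip install scikit-learn"],
        requirements=["plotly"],
    )
    print(build.dict(exclude_none=True))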
mlrun/common/schemas/model_monitoring/constants.py CHANGED
@@ -181,7 +181,7 @@ class MonitoringFunctionNames:
     WRITER = "model-monitoring-writer"
     BATCH = "model-monitoring-batch"
     APPLICATION_CONTROLLER = "model-monitoring-controller"
-    STREAM = None
+    STREAM = "model-monitoring-stream"
 
     @staticmethod
     def all():
@@ -289,3 +289,6 @@ class ModelMonitoringAppLabel:
 
 class ControllerPolicy:
     BASE_PERIOD = "base_period"
+
+
+MLRUN_HISTOGRAM_DATA_DRIFT_APP_NAME = "histogram-data-drift"
mlrun/common/schemas/project.py CHANGED
@@ -19,6 +19,7 @@ import pydantic
 
 import mlrun.common.types
 
+from .common import ImageBuilder
 from .object import ObjectKind, ObjectStatus
 
 
@@ -85,6 +86,7 @@ class ProjectSpec(pydantic.BaseModel):
     desired_state: typing.Optional[ProjectDesiredState] = ProjectDesiredState.online
     custom_packagers: typing.Optional[list[tuple[str, bool]]] = None
     default_image: typing.Optional[str] = None
+    build: typing.Optional[ImageBuilder] = None
 
     class Config:
         extra = pydantic.Extra.allow
mlrun/config.py CHANGED
@@ -149,7 +149,6 @@ default_config = {
         "url": "",
     },
     "v3io_framesd": "http://framesd:8080",
-    "datastore": {"async_source_mode": "disabled"},
     # default node selector to be applied to all functions - json string base64 encoded format
     "default_function_node_selector": "e30=",
     # default priority class to be applied to functions running on k8s cluster
@@ -288,6 +287,12 @@ default_config = {
         "state": "online",
         "retry_api_call_on_exception": "enabled",
         "http_connection_timeout_keep_alive": 11,
+        # http client used by httpdb
+        "http": {
+            # when True, the client will verify the server's TLS
+            # set to False for backwards compatibility.
+            "verify": False,
+        },
         "db": {
             "commit_retry_timeout": 30,
             "commit_retry_interval": 3,
@@ -485,8 +490,8 @@ default_config = {
         "offline_storage_path": "model-endpoints/{kind}",
         # Default http path that points to the monitoring stream nuclio function. Will be used as a stream path
         # when the user is working in CE environment and has not provided any stream path.
-        "default_http_sink": "http://nuclio-{project}-model-monitoring-stream.mlrun.svc.cluster.local:8080",
-        "default_http_sink_app": "http://nuclio-{project}-{application_name}.mlrun.svc.cluster.local:8080",
+        "default_http_sink": "http://nuclio-{project}-model-monitoring-stream.{namespace}.svc.cluster.local:8080",
+        "default_http_sink_app": "http://nuclio-{project}-{application_name}.{namespace}.svc.cluster.local:8080",
         "batch_processing_function_branch": "master",
         "parquet_batching_max_events": 10_000,
         "parquet_batching_timeout_secs": timedelta(minutes=1).total_seconds(),
@@ -606,7 +611,7 @@ default_config = {
     "workflows": {
         "default_workflow_runner_name": "workflow-runner-{}",
         # Default timeout seconds for retrieving workflow id after execution:
-        "timeouts": {"local": 120, "kfp": 30, "remote": 30},
+        "timeouts": {"local": 120, "kfp": 30, "remote": 90},
     },
     "log_collector": {
         "address": "localhost:8282",
@@ -958,10 +963,10 @@ class Config:
             with_gpu = (
                 with_gpu_requests if requirement == "requests" else with_gpu_limits
             )
-            resources[
-                requirement
-            ] = self.get_default_function_pod_requirement_resources(
-                requirement, with_gpu
+            resources[requirement] = (
+                self.get_default_function_pod_requirement_resources(
+                    requirement, with_gpu
+                )
             )
         return resources
 
@@ -1054,7 +1059,7 @@ class Config:
         kind: str = "",
         target: str = "online",
         artifact_path: str = None,
-        application_name: str = None,
+        function_name: str = None,
     ) -> str:
         """Get the full path from the configuration based on the provided project and kind.
 
@@ -1069,7 +1074,7 @@ class Config:
                               artifact path instead.
         :param artifact_path: Optional artifact path that will be used as a relative path. If not provided, the
                               relative artifact path will be taken from the global MLRun artifact path.
-        :param application_name: Application name, None for model_monitoring_stream.
+        :param function_name: Application name, None for model_monitoring_stream.
 
         :return: Full configured path for the provided kind.
         """
@@ -1083,20 +1088,19 @@ class Config:
             return store_prefix_dict[kind].format(project=project)
 
         if (
-            application_name
+            function_name
+            and function_name
             != mlrun.common.schemas.model_monitoring.constants.MonitoringFunctionNames.STREAM
         ):
             return mlrun.mlconf.model_endpoint_monitoring.store_prefixes.user_space.format(
                 project=project,
                 kind=kind
-                if application_name is None
-                else f"{kind}-{application_name.lower()}",
+                if function_name is None
+                else f"{kind}-{function_name.lower()}",
             )
         return mlrun.mlconf.model_endpoint_monitoring.store_prefixes.default.format(
             project=project,
-            kind=kind
-            if application_name is None
-            else f"{kind}-{application_name.lower()}",
+            kind=kind,
        )
 
         # Get the current offline path from the configuration
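A minimal sketch of opting in to the new client-side TLS verification flag; the attribute path mirrors the httpdb.http.verify entry added above, and setting it through mlconf at runtime is just one way to override the default:

    import mlrun

    # default is False for backwards compatibility
    mlrun.mlconf.httpdb.http.verify = True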
mlrun/datastore/azure_blob.py CHANGED
@@ -175,9 +175,9 @@ class AzureBlobStore(DataStore):
 
         if "client_secret" in st or "client_id" in st or "tenant_id" in st:
             res[f"spark.hadoop.fs.azure.account.auth.type.{host}"] = "OAuth"
-            res[
-                f"spark.hadoop.fs.azure.account.oauth.provider.type.{host}"
-            ] = "org.apache.hadoop.fs.azurebfs.oauth2.ClientCredsTokenProvider"
+            res[f"spark.hadoop.fs.azure.account.oauth.provider.type.{host}"] = (
+                "org.apache.hadoop.fs.azurebfs.oauth2.ClientCredsTokenProvider"
+            )
             if "client_id" in st:
                 res[f"spark.hadoop.fs.azure.account.oauth2.client.id.{host}"] = st[
                     "client_id"
@@ -188,14 +188,27 @@ class AzureBlobStore(DataStore):
                 ]
             if "tenant_id" in st:
                 tenant_id = st["tenant_id"]
-                res[
-                    f"spark.hadoop.fs.azure.account.oauth2.client.endpoint.{host}"
-                ] = f"https://login.microsoftonline.com/{tenant_id}/oauth2/token"
+                res[f"spark.hadoop.fs.azure.account.oauth2.client.endpoint.{host}"] = (
+                    f"https://login.microsoftonline.com/{tenant_id}/oauth2/token"
+                )
 
         if "sas_token" in st:
             res[f"spark.hadoop.fs.azure.account.auth.type.{host}"] = "SAS"
-            res[
-                f"spark.hadoop.fs.azure.sas.token.provider.type.{host}"
-            ] = "org.apache.hadoop.fs.azurebfs.sas.FixedSASTokenProvider"
+            res[f"spark.hadoop.fs.azure.sas.token.provider.type.{host}"] = (
+                "org.apache.hadoop.fs.azurebfs.sas.FixedSASTokenProvider"
+            )
             res[f"spark.hadoop.fs.azure.sas.fixed.token.{host}"] = st["sas_token"]
         return res
+
+    @property
+    def spark_url(self):
+        spark_options = self.get_spark_options()
+        url = f"wasbs://{self.endpoint}"
+        prefix = "spark.hadoop.fs.azure.account.key."
+        if spark_options:
+            for key in spark_options:
+                if key.startswith(prefix):
+                    account_key = key[len(prefix) :]
+                    url += f"@{account_key}"
+                    break
+        return url
mlrun/datastore/base.py CHANGED
@@ -147,6 +147,10 @@ class DataStore:
     def url(self):
         return f"{self.kind}://{self.endpoint}"
 
+    @property
+    def spark_url(self):
+        return self.url
+
     def get(self, key, size=None, offset=0):
         pass
 
@@ -320,31 +324,17 @@ class DataStore:
             raise Exception(f"File type unhandled {url}")
 
         if file_system:
-            if (
-                self.supports_isdir()
-                and file_system.isdir(file_url)
-                or self._is_dd(df_module)
-            ):
-                storage_options = self.get_storage_options()
-                if url.startswith("ds://"):
-                    parsed_url = urllib.parse.urlparse(url)
-                    url = parsed_url.path
-                    if self.using_bucket:
-                        url = url[1:]
-                    # Pass the underlying file system
-                    kwargs["filesystem"] = file_system
-                elif storage_options:
-                    kwargs["storage_options"] = storage_options
-                df = reader(url, **kwargs)
-            else:
-                file = url
-                # Workaround for ARROW-12472 affecting pyarrow 3.x and 4.x.
-                if file_system.protocol != "file":
-                    # If not dir, use file_system.open() to avoid regression when pandas < 1.2 and does not
-                    # support the storage_options parameter.
-                    file = file_system.open(url)
-
-                df = reader(file, **kwargs)
+            storage_options = self.get_storage_options()
+            if url.startswith("ds://"):
+                parsed_url = urllib.parse.urlparse(url)
+                url = parsed_url.path
+                if self.using_bucket:
+                    url = url[1:]
+                # Pass the underlying file system
+                kwargs["filesystem"] = file_system
+            elif storage_options:
+                kwargs["storage_options"] = storage_options
+            df = reader(url, **kwargs)
         else:
             temp_file = tempfile.NamedTemporaryFile(delete=False)
             self.download(self._join(subpath), temp_file.name)
mlrun/datastore/datastore.py CHANGED
@@ -94,6 +94,10 @@ def schema_to_store(schema):
         from .dbfs_store import DBFSStore
 
         return DBFSStore
+    elif schema == "hdfs":
+        from .hdfs import HdfsStore
+
+        return HdfsStore
     else:
         raise ValueError(f"unsupported store scheme ({schema})")
 
@@ -170,7 +174,7 @@ class StoreManager:
             raise mlrun.errors.MLRunInvalidArgumentError(
                 f"resource {url} does not have a valid/persistent offline target"
             )
-        return resource, target
+        return resource, target or ""
 
     def object(
         self, url, key="", project="", allow_empty_resources=None, secrets: dict = None
@@ -182,14 +186,21 @@ class StoreManager:
                 url, project, allow_empty_resources, secrets
             )
 
-        store, subpath = self.get_or_create_store(
+        store, subpath, url = self.get_or_create_store(
             url, secrets=secrets, project_name=project
         )
-        return DataItem(key, store, subpath, url, meta=meta, artifact_url=artifact_url)
+        return DataItem(
+            key,
+            store,
+            subpath,
+            url,
+            meta=meta,
+            artifact_url=artifact_url,
+        )
 
     def get_or_create_store(
         self, url, secrets: dict = None, project_name=""
-    ) -> (DataStore, str):
+    ) -> (DataStore, str, str):
         schema, endpoint, parsed_url = parse_url(url)
         subpath = parsed_url.path
         store_key = f"{schema}://{endpoint}"
@@ -206,17 +217,17 @@ class StoreManager:
 
         if schema == "memory":
             subpath = url[len("memory://") :]
-            return in_memory_store, subpath
+            return in_memory_store, subpath, url
 
         if not schema and endpoint:
             if endpoint in self._stores.keys():
-                return self._stores[endpoint], subpath
+                return self._stores[endpoint], subpath, url
             else:
                 raise ValueError(f"no such store ({endpoint})")
 
         if not secrets and not mlrun.config.is_running_as_api():
             if store_key in self._stores.keys():
-                return self._stores[store_key], subpath
+                return self._stores[store_key], subpath, url
 
         # support u/p embedding in url (as done in redis) by setting netloc as the "endpoint" parameter
         # when running on server we don't cache the datastore, because there are multiple users and we don't want to
@@ -227,7 +238,7 @@ class StoreManager:
         if not secrets and not mlrun.config.is_running_as_api():
             self._stores[store_key] = store
         # in file stores in windows path like c:\a\b the drive letter is dropped from the path, so we return the url
-        return store, url if store.kind == "file" else subpath
+        return store, url if store.kind == "file" else subpath, url
 
     def reset_secrets(self):
         self._secrets = {}
mlrun/datastore/datastore_profile.py CHANGED
@@ -132,6 +132,22 @@ class DatastoreProfileKafkaSource(DatastoreProfile):
         return attributes
 
 
+class DatastoreProfileV3io(DatastoreProfile):
+    type: str = pydantic.Field("v3io")
+    v3io_access_key: typing.Optional[str] = None
+    _private_attributes = "v3io_access_key"
+
+    def url(self, subpath):
+        subpath = subpath.lstrip("/")
+        return f"v3io:///{subpath}"
+
+    def secrets(self) -> dict:
+        res = {}
+        if self.v3io_access_key:
+            res["V3IO_ACCESS_KEY"] = self.v3io_access_key
+        return res
+
+
 class DatastoreProfileS3(DatastoreProfile):
     type: str = pydantic.Field("s3")
     _private_attributes = ("access_key_id", "secret_key")
@@ -156,7 +172,7 @@ class DatastoreProfileS3(DatastoreProfile):
             res["AWS_PROFILE"] = self.profile_name
         if self.assume_role_arn:
             res["MLRUN_AWS_ROLE_ARN"] = self.assume_role_arn
-        return res if res else None
+        return res
 
     def url(self, subpath):
         return f"s3:/{subpath}"
@@ -199,7 +215,7 @@ class DatastoreProfileRedis(DatastoreProfile):
             res["REDIS_USER"] = self.username
         if self.password:
             res["REDIS_PASSWORD"] = self.password
-        return res if res else None
+        return res
 
     def url(self, subpath):
         return self.endpoint_url + subpath
@@ -220,7 +236,7 @@ class DatastoreProfileDBFS(DatastoreProfile):
             res["DATABRICKS_TOKEN"] = self.token
         if self.endpoint_url:
             res["DATABRICKS_HOST"] = self.endpoint_url
-        return res if res else None
+        return res
 
 
 class DatastoreProfileGCS(DatastoreProfile):
@@ -247,7 +263,7 @@ class DatastoreProfileGCS(DatastoreProfile):
             res["GOOGLE_APPLICATION_CREDENTIALS"] = self.credentials_path
         if self.gcp_credentials:
             res["GCP_CREDENTIALS"] = self.gcp_credentials
-        return res if res else None
+        return res
 
 
 class DatastoreProfileAzureBlob(DatastoreProfile):
@@ -292,7 +308,31 @@ class DatastoreProfileAzureBlob(DatastoreProfile):
             res["sas_token"] = self.sas_token
         if self.credential:
             res["credential"] = self.credential
-        return res if res else None
+        return res
+
+
+class DatastoreProfileHdfs(DatastoreProfile):
+    type: str = pydantic.Field("hdfs")
+    _private_attributes = "token"
+    host: typing.Optional[str] = None
+    port: typing.Optional[int] = None
+    http_port: typing.Optional[int] = None
+    user: typing.Optional[str] = None
+
+    def secrets(self) -> dict:
+        res = {}
+        if self.host:
+            res["HDFS_HOST"] = self.host
+        if self.port:
+            res["HDFS_PORT"] = self.port
+        if self.port:
+            res["HDFS_HTTP_PORT"] = self.http_port
+        if self.user:
+            res["HDFS_USER"] = self.user
+        return res or None
+
+    def url(self, subpath):
+        return f"hdfs://{self.host}:{self.http_port}{subpath}"
 
 
 class DatastoreProfile2Json(pydantic.BaseModel):
@@ -346,6 +386,7 @@ class DatastoreProfile2Json(pydantic.BaseModel):
         decoded_dict = {k: safe_literal_eval(v) for k, v in decoded_dict.items()}
         datastore_type = decoded_dict.get("type")
         ds_profile_factory = {
+            "v3io": DatastoreProfileV3io,
             "s3": DatastoreProfileS3,
             "redis": DatastoreProfileRedis,
             "basic": DatastoreProfileBasic,
@@ -354,6 +395,7 @@ class DatastoreProfile2Json(pydantic.BaseModel):
             "dbfs": DatastoreProfileDBFS,
             "gcs": DatastoreProfileGCS,
             "az": DatastoreProfileAzureBlob,
+            "hdfs": DatastoreProfileHdfs,
         }
         if datastore_type in ds_profile_factory:
            return ds_profile_factory[datastore_type].parse_obj(decoded_dict)
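A minimal sketch of the two new profile types, assuming the base profile's name field and placeholder endpoints; the url() results follow directly from the implementations above (including the use of http_port in the HDFS URL):

    from mlrun.datastore.datastore_profile import (
        DatastoreProfileHdfs,
        DatastoreProfileV3io,
    )

    v3io_profile = DatastoreProfileV3io(name="my-v3io", v3io_access_key="<access-key>")
    print(v3io_profile.url("/projects/data.parquet"))  # v3io:///projects/data.parquet

    hdfs_profile = DatastoreProfileHdfs(
        name="my-hdfs", host="namenode.example.com", port=8020, http_port=9870
    )
    print(hdfs_profile.url("/data/set.csv"))  # hdfs://namenode.example.com:9870/data/set.csv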
mlrun/datastore/google_cloud_storage.py CHANGED
@@ -147,13 +147,13 @@ class GoogleCloudStorageStore(DataStore):
         if "project_id" in credentials:
             res["spark.hadoop.fs.gs.project.id"] = credentials["project_id"]
         if "private_key_id" in credentials:
-            res[
-                "spark.hadoop.fs.gs.auth.service.account.private.key.id"
-            ] = credentials["private_key_id"]
+            res["spark.hadoop.fs.gs.auth.service.account.private.key.id"] = (
+                credentials["private_key_id"]
+            )
         if "private_key" in credentials:
-            res[
-                "spark.hadoop.fs.gs.auth.service.account.private.key"
-            ] = credentials["private_key"]
+            res["spark.hadoop.fs.gs.auth.service.account.private.key"] = (
+                credentials["private_key"]
+            )
         if "client_email" in credentials:
             res["spark.hadoop.fs.gs.auth.service.account.email"] = credentials[
                 "client_email"
@@ -161,3 +161,7 @@ class GoogleCloudStorageStore(DataStore):
         if "client_id" in credentials:
             res["spark.hadoop.fs.gs.client.id"] = credentials["client_id"]
         return res
+
+    @property
+    def spark_url(self):
+        return f"gs://{self.endpoint}"
mlrun/datastore/hdfs.py ADDED
@@ -0,0 +1,51 @@
+# Copyright 2024 Iguazio
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import os
+
+import fsspec
+
+from mlrun.datastore.base import DataStore
+
+
+class HdfsStore(DataStore):
+    def __init__(self, parent, schema, name, endpoint="", secrets: dict = None):
+        super().__init__(parent, name, schema, endpoint, secrets)
+
+        self.host = self._get_secret_or_env("HDFS_HOST")
+        self.port = self._get_secret_or_env("HDFS_PORT")
+        self.http_port = self._get_secret_or_env("HDFS_HTTP_PORT")
+        self.user = self._get_secret_or_env("HDFS_USER")
+        if not self.user:
+            self.user = os.environ.get("HADOOP_USER_NAME", os.environ.get("USER"))
+
+        self._filesystem = None
+
+    @property
+    def filesystem(self):
+        if not self._filesystem:
+            self._filesystem = fsspec.filesystem(
+                "webhdfs",
+                host=self.host,
+                port=self.http_port,
+                user=self.user,
+            )
+        return self._filesystem
+
+    @property
+    def url(self):
+        return f"webhdfs://{self.host}:{self.http_port}"
+
+    @property
+    def spark_url(self):
+        return f"hdfs://{self.host}:{self.port}"
mlrun/datastore/redis.py CHANGED
@@ -163,3 +163,7 @@ class RedisStore(DataStore):
                 self.redis.delete(k)
         else:
             self.redis.delete(key)
+
+    @property
+    def spark_url(self):
+        return ""
mlrun/datastore/s3.py CHANGED
@@ -156,6 +156,10 @@ class S3Store(DataStore):
 
         return self._sanitize_storage_options(storage_options)
 
+    @property
+    def spark_url(self):
+        return f"s3a://{self.endpoint}"
+
     def get_bucket_and_key(self, key):
         path = self._join(key)[1:]
         return self.endpoint, path
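For reference, a short sketch of the scheme translation the new spark_url properties perform across the stores touched in this release (bucket and host names are placeholders):

    # Scheme translation performed by the new spark_url properties (illustrative endpoints).
    spark_urls = {
        "s3://my-bucket": "s3a://my-bucket",      # S3Store
        "gcs://my-bucket": "gs://my-bucket",      # GoogleCloudStorageStore
        "webhdfs://nn:9870": "hdfs://nn:8020",    # HdfsStore (data port, not the WebHDFS port)
        "redis://cache:6379": "",                 # RedisStore exposes no Spark URL
    }
    print(spark_urls)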