mlrun 1.7.0rc38__py3-none-any.whl → 1.7.0rc41__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mlrun/alerts/alert.py +30 -27
- mlrun/common/constants.py +3 -0
- mlrun/common/helpers.py +0 -1
- mlrun/common/schemas/alert.py +3 -0
- mlrun/common/schemas/model_monitoring/model_endpoints.py +0 -1
- mlrun/common/schemas/notification.py +1 -0
- mlrun/config.py +1 -1
- mlrun/data_types/to_pandas.py +9 -9
- mlrun/datastore/alibaba_oss.py +3 -2
- mlrun/datastore/azure_blob.py +7 -9
- mlrun/datastore/base.py +13 -1
- mlrun/datastore/dbfs_store.py +3 -7
- mlrun/datastore/filestore.py +1 -3
- mlrun/datastore/google_cloud_storage.py +84 -29
- mlrun/datastore/redis.py +1 -0
- mlrun/datastore/s3.py +3 -2
- mlrun/datastore/sources.py +54 -0
- mlrun/datastore/storeytargets.py +147 -0
- mlrun/datastore/targets.py +76 -122
- mlrun/datastore/v3io.py +1 -0
- mlrun/db/httpdb.py +6 -1
- mlrun/errors.py +8 -0
- mlrun/execution.py +7 -0
- mlrun/feature_store/api.py +5 -0
- mlrun/feature_store/retrieval/job.py +1 -0
- mlrun/model.py +24 -3
- mlrun/model_monitoring/api.py +10 -2
- mlrun/model_monitoring/applications/_application_steps.py +52 -34
- mlrun/model_monitoring/applications/context.py +206 -70
- mlrun/model_monitoring/applications/histogram_data_drift.py +15 -13
- mlrun/model_monitoring/controller.py +15 -12
- mlrun/model_monitoring/db/stores/sqldb/sql_store.py +17 -8
- mlrun/model_monitoring/db/stores/v3io_kv/kv_store.py +19 -9
- mlrun/model_monitoring/db/tsdb/tdengine/schemas.py +85 -47
- mlrun/model_monitoring/db/tsdb/tdengine/tdengine_connector.py +46 -10
- mlrun/model_monitoring/db/tsdb/v3io/v3io_connector.py +38 -24
- mlrun/model_monitoring/helpers.py +54 -18
- mlrun/model_monitoring/stream_processing.py +10 -29
- mlrun/projects/pipelines.py +19 -30
- mlrun/projects/project.py +86 -67
- mlrun/run.py +8 -6
- mlrun/runtimes/__init__.py +4 -0
- mlrun/runtimes/nuclio/api_gateway.py +18 -0
- mlrun/runtimes/nuclio/application/application.py +150 -59
- mlrun/runtimes/nuclio/function.py +5 -11
- mlrun/runtimes/nuclio/serving.py +2 -2
- mlrun/runtimes/utils.py +16 -0
- mlrun/serving/routers.py +1 -1
- mlrun/serving/server.py +19 -5
- mlrun/serving/states.py +8 -0
- mlrun/serving/v2_serving.py +34 -26
- mlrun/utils/helpers.py +33 -2
- mlrun/utils/version/version.json +2 -2
- {mlrun-1.7.0rc38.dist-info → mlrun-1.7.0rc41.dist-info}/METADATA +9 -12
- {mlrun-1.7.0rc38.dist-info → mlrun-1.7.0rc41.dist-info}/RECORD +59 -58
- {mlrun-1.7.0rc38.dist-info → mlrun-1.7.0rc41.dist-info}/WHEEL +1 -1
- {mlrun-1.7.0rc38.dist-info → mlrun-1.7.0rc41.dist-info}/LICENSE +0 -0
- {mlrun-1.7.0rc38.dist-info → mlrun-1.7.0rc41.dist-info}/entry_points.txt +0 -0
- {mlrun-1.7.0rc38.dist-info → mlrun-1.7.0rc41.dist-info}/top_level.txt +0 -0
mlrun/alerts/alert.py
CHANGED
@@ -62,6 +62,7 @@ class AlertConfig(ModelObj):

         # create an alert on endpoint_id, which will be triggered to slack if there is a "data_drift_detected" event
         # 3 times in the next hour.
+
         from mlrun.alerts import AlertConfig
         import mlrun.common.schemas.alert as alert_objects

@@ -94,29 +95,29 @@ class AlertConfig(ModelObj):
         )
         project.store_alert_config(alert_data)

-        :param project:
+        :param project: Name of the project to associate the alert with
-        :param name:
+        :param name: Name of the alert
-        :param template:
-                         template.
+        :param template: Optional parameter that allows creating an alert based on a predefined template.
+                         You can pass either an AlertTemplate object or a string (the template name).
+                         If a template is used, many fields of the alert will be auto-generated based on the
+                         template.However, you still need to provide the following fields:
                          `name`, `project`, `entity`, `notifications`
-        :param description:
+        :param description: Description of the alert
-        :param summary:
+        :param summary: Summary of the alert, will be sent in the generated notifications
-        :param severity:
+        :param severity: Severity of the alert
-        :param trigger:
+        :param trigger: The events that will trigger this alert, may be a simple trigger based on events or
                          complex trigger which is based on a prometheus alert
-        :param criteria:
+        :param criteria: When the alert will be triggered based on the specified number of events within the
                          defined time period.
-        :param reset_policy:
+        :param reset_policy: When to clear the alert. May be "manual" for manual reset of the alert, or
                          "auto" if the criteria contains a time period
-        :param notifications:
+        :param notifications: List of notifications to invoke once the alert is triggered
-        :param entities:
-                         identify a given entity in the system
+        :param entities: Entities that the event relates to. The entity object will contain fields that
+                         uniquely identify a given entity in the system
-        :param id:
+        :param id: Internal id of the alert (user should not supply it)
-        :param state:
+        :param state: State of the alert, may be active/inactive (user should not supply it)
-        :param created:
+        :param created: When the alert is created (user should not supply it)
-        :param count:
+        :param count: Internal counter of the alert (user should not supply it)
         """
         self.project = project
         self.name = name

@@ -137,8 +138,8 @@ class AlertConfig(ModelObj):
         self._apply_template(template)

     def validate_required_fields(self):
-        if not self.
-            raise mlrun.errors.
+        if not self.name:
+            raise mlrun.errors.MLRunInvalidArgumentError("Alert name must be provided")

     def _serialize_field(
         self, struct: dict, field_name: str = None, strip: bool = False

@@ -237,9 +238,11 @@ class AlertConfig(ModelObj):
         db = mlrun.get_run_db()
         template = db.get_alert_template(template)

-        #
-        self.
-        self.
-        self.
+        # Apply parameters from the template to the AlertConfig object only if they are not already specified by the
+        # user in the current configuration.
+        # User-provided parameters will take precedence over corresponding template values
+        self.summary = self.summary or template.summary
+        self.severity = self.severity or template.severity
+        self.criteria = self.criteria or template.criteria
+        self.trigger = self.trigger or template.trigger
+        self.reset_policy = self.reset_policy or template.reset_policy
mlrun/common/constants.py
CHANGED
@@ -65,6 +65,9 @@ class MLRunInternalLabels:
     task_name = f"{MLRUN_LABEL_PREFIX}task-name"
     resource_name = f"{MLRUN_LABEL_PREFIX}resource_name"
     created = f"{MLRUN_LABEL_PREFIX}created"
+    producer_type = f"{MLRUN_LABEL_PREFIX}producer-type"
+    app_name = f"{MLRUN_LABEL_PREFIX}app-name"
+    endpoint_id = f"{MLRUN_LABEL_PREFIX}endpoint-id"
     host = "host"
     job_type = "job-type"
     kind = "kind"
mlrun/common/helpers.py
CHANGED
mlrun/common/schemas/alert.py
CHANGED
@@ -23,6 +23,7 @@ from mlrun.common.types import StrEnum

 class EventEntityKind(StrEnum):
     MODEL_ENDPOINT_RESULT = "model-endpoint-result"
+    MODEL_MONITORING_APPLICATION = "model-monitoring-application"
     JOB = "job"

@@ -43,6 +44,7 @@ class EventKind(StrEnum):
     SYSTEM_PERFORMANCE_SUSPECTED = "system_performance_suspected"
     MM_APP_ANOMALY_DETECTED = "mm_app_anomaly_detected"
     MM_APP_ANOMALY_SUSPECTED = "mm_app_anomaly_suspected"
+    MM_APP_FAILED = "mm_app_failed"
     FAILED = "failed"

@@ -57,6 +59,7 @@ _event_kind_entity_map = {
     EventKind.SYSTEM_PERFORMANCE_SUSPECTED: [EventEntityKind.MODEL_ENDPOINT_RESULT],
     EventKind.MM_APP_ANOMALY_DETECTED: [EventEntityKind.MODEL_ENDPOINT_RESULT],
     EventKind.MM_APP_ANOMALY_SUSPECTED: [EventEntityKind.MODEL_ENDPOINT_RESULT],
+    EventKind.MM_APP_FAILED: [EventEntityKind.MODEL_MONITORING_APPLICATION],
     EventKind.FAILED: [EventEntityKind.JOB],
 }
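
For context, a small sketch of how the extended `_event_kind_entity_map` can be used to check that an event is raised on a permitted entity kind; the `validate_event_entity` helper is hypothetical and not part of this diff.

from mlrun.common.schemas.alert import (
    EventEntityKind,
    EventKind,
    _event_kind_entity_map,
)

def validate_event_entity(kind: EventKind, entity_kind: EventEntityKind) -> None:
    # Hypothetical check: per the mapping above, an MM_APP_FAILED event is only
    # valid for a model-monitoring-application entity.
    if entity_kind not in _event_kind_entity_map[kind]:
        raise ValueError(f"{entity_kind} is not allowed for event kind {kind}")

validate_event_entity(
    EventKind.MM_APP_FAILED, EventEntityKind.MODEL_MONITORING_APPLICATION
)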
mlrun/common/schemas/model_monitoring/model_endpoints.py
CHANGED
@@ -21,7 +21,6 @@ from typing import Any, NamedTuple, Optional
 from pydantic import BaseModel, Field, validator
 from pydantic.main import Extra

-import mlrun.common.model_monitoring
 import mlrun.common.types

 from ..object import ObjectKind, ObjectSpec, ObjectStatus
mlrun/common/schemas/notification.py
CHANGED
@@ -52,6 +52,7 @@ class NotificationLimits(enum.Enum):
 class Notification(pydantic.BaseModel):
     """
     Notification object schema
+
     :param kind: notification implementation kind - slack, webhook, etc.
     :param name: for logging and identification
     :param message: message content in the notification
mlrun/config.py
CHANGED
@@ -863,7 +863,7 @@ class Config:
                 f"Unable to decode {attribute_path}"
             )
         parsed_attribute_value = json.loads(decoded_attribute_value)
-        if
+        if not isinstance(parsed_attribute_value, expected_type):
             raise mlrun.errors.MLRunInvalidArgumentTypeError(
                 f"Expected type {expected_type}, got {type(parsed_attribute_value)}"
             )
mlrun/data_types/to_pandas.py
CHANGED
@@ -21,7 +21,7 @@ import semver

 def _toPandas(spark_df):
     """
-    Modified version of spark DataFrame.toPandas()
+    Modified version of spark DataFrame.toPandas() -
     https://github.com/apache/spark/blob/v3.2.3/python/pyspark/sql/pandas/conversion.py#L35

     The original code (which is only replaced in pyspark 3.5.0) fails with Pandas 2 installed, with the following error:

@@ -223,21 +223,21 @@ def _to_corrected_pandas_type(dt):
         TimestampType,
     )

-    if
+    if isinstance(dt, ByteType):
         return np.int8
-    elif
+    elif isinstance(dt, ShortType):
         return np.int16
-    elif
+    elif isinstance(dt, IntegerType):
         return np.int32
-    elif
+    elif isinstance(dt, LongType):
         return np.int64
-    elif
+    elif isinstance(dt, FloatType):
         return np.float32
-    elif
+    elif isinstance(dt, DoubleType):
         return np.float64
-    elif
+    elif isinstance(dt, BooleanType):
         return bool
-    elif
+    elif isinstance(dt, TimestampType):
         return "datetime64[ns]"
     else:
         return None
mlrun/datastore/alibaba_oss.py
CHANGED
@@ -22,7 +22,7 @@ from fsspec.registry import get_filesystem_class

 import mlrun.errors

-from .base import DataStore, FileStats,
+from .base import DataStore, FileStats, make_datastore_schema_sanitizer


 class OSSStore(DataStore):

@@ -53,7 +53,7 @@ class OSSStore(DataStore):
         except ImportError as exc:
             raise ImportError("ALIBABA ossfs not installed") from exc
         filesystem_class = get_filesystem_class(protocol=self.kind)
-        self._filesystem =
+        self._filesystem = make_datastore_schema_sanitizer(
             filesystem_class,
             using_bucket=self.using_bucket,
             **self.get_storage_options(),

@@ -85,6 +85,7 @@ class OSSStore(DataStore):
         return oss.get_object(key).read()

     def put(self, key, data, append=False):
+        data, _ = self._prepare_put_data(data, append)
         bucket, key = self.get_bucket_and_key(key)
         oss = oss2.Bucket(self.auth, self.endpoint_url, bucket)
         oss.put_object(key, data)
mlrun/datastore/azure_blob.py
CHANGED
@@ -22,7 +22,7 @@ from fsspec.registry import get_filesystem_class

 import mlrun.errors

-from .base import DataStore, FileStats,
+from .base import DataStore, FileStats, make_datastore_schema_sanitizer

 # Azure blobs will be represented with the following URL: az://<container name>. The storage account is already
 # pointed to by the connection string, so the user is not expected to specify it in any way.

@@ -41,6 +41,9 @@ class AzureBlobStore(DataStore):
         self._service_client = None
         self._storage_options = None

+    def get_storage_options(self):
+        return self.storage_options
+
     @property
     def storage_options(self):
         if not self._storage_options:

@@ -75,7 +78,7 @@ class AzureBlobStore(DataStore):
         if not self._filesystem:
             # in order to support az and wasbs kinds
             filesystem_class = get_filesystem_class(protocol=self.kind)
-            self._filesystem =
+            self._filesystem = make_datastore_schema_sanitizer(
                 filesystem_class,
                 using_bucket=self.using_bucket,
                 blocksize=self.max_blocksize,

@@ -186,12 +189,7 @@ class AzureBlobStore(DataStore):
                 "Append mode not supported for Azure blob datastore"
             )
         remote_path = self._convert_key_to_remote_path(key)
-            mode = "wb"
-        elif isinstance(data, str):
-            mode = "w"
-        else:
-            raise TypeError("Data type unknown. Unable to put in Azure!")
+        data, mode = self._prepare_put_data(data, append)
         with self.filesystem.open(remote_path, mode) as f:
             f.write(data)

@@ -225,7 +223,7 @@ class AzureBlobStore(DataStore):

     def get_spark_options(self):
         res = {}
-        st = self.storage_options
+        st = self.storage_options
         service = "blob"
         primary_url = None
         if st.get("connection_string"):
mlrun/datastore/base.py
CHANGED
@@ -157,6 +157,18 @@ class DataStore:
     def put(self, key, data, append=False):
         pass

+    def _prepare_put_data(self, data, append=False):
+        mode = "a" if append else "w"
+        if isinstance(data, bytearray):
+            data = bytes(data)
+
+        if isinstance(data, bytes):
+            return data, f"{mode}b"
+        elif isinstance(data, str):
+            return data, mode
+        else:
+            raise TypeError(f"Unable to put a value of type {type(self).__name__}")
+
     def stat(self, key):
         pass

@@ -748,7 +760,7 @@ class HttpStore(DataStore):
 # As an example, it converts an S3 URL 's3://s3bucket/path' to just 's3bucket/path'.
 # Since 'ds' schemas are not inherently processed by fsspec, we have adapted the _strip_protocol()
 # method specifically to strip away the 'ds' schema as required.
-def
+def make_datastore_schema_sanitizer(cls, using_bucket=False, *args, **kwargs):
     if not issubclass(cls, fsspec.AbstractFileSystem):
         raise ValueError("Class must be a subclass of fsspec.AbstractFileSystem")

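
To show what the new shared helper buys the concrete stores, here is a minimal sketch of a `put()` delegating to `_prepare_put_data`; the `LocalStoreSketch` class and file path are hypothetical, only the helper's contract comes from the diff above. Note that the error message in the diff interpolates `type(self).__name__` (the store class) rather than the type of the rejected data; the sketch reports the data type instead.

import os
import tempfile

class LocalStoreSketch:
    # Re-statement of the helper's contract: normalize bytearray to bytes and
    # derive the open() mode ("w"/"a", plus "b" for binary payloads).
    def _prepare_put_data(self, data, append=False):
        mode = "a" if append else "w"
        if isinstance(data, bytearray):
            data = bytes(data)
        if isinstance(data, bytes):
            return data, f"{mode}b"
        elif isinstance(data, str):
            return data, mode
        raise TypeError(f"Unable to put a value of type {type(data).__name__}")

    def put(self, key, data, append=False):
        data, mode = self._prepare_put_data(data, append)
        with open(key, mode) as fp:
            fp.write(data)

store = LocalStoreSketch()
path = os.path.join(tempfile.gettempdir(), "prepare_put_demo.txt")
store.put(path, "hello ")               # text payload, mode "w"
store.put(path, b"world", append=True)  # bytes payload, mode "ab"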
mlrun/datastore/dbfs_store.py
CHANGED
@@ -19,7 +19,7 @@ from fsspec.registry import get_filesystem_class

 import mlrun.errors

-from .base import DataStore, FileStats,
+from .base import DataStore, FileStats, make_datastore_schema_sanitizer


 class DatabricksFileBugFixed(DatabricksFile):

@@ -89,7 +89,7 @@ class DBFSStore(DataStore):
         """return fsspec file system object, if supported"""
         filesystem_class = get_filesystem_class(protocol=self.kind)
         if not self._filesystem:
-            self._filesystem =
+            self._filesystem = make_datastore_schema_sanitizer(
                 cls=filesystem_class,
                 using_bucket=False,
                 **self.get_storage_options(),

@@ -130,11 +130,7 @@ class DBFSStore(DataStore):
                 "Append mode not supported for Databricks file system"
             )
         # can not use append mode because it overrides data.
-        mode =
-        if isinstance(data, bytes):
-            mode += "b"
-        elif not isinstance(data, str):
-            raise TypeError(f"Unknown data type {type(data)}")
+        data, mode = self._prepare_put_data(data, append)
         with self.filesystem.open(key, mode) as f:
             f.write(data)
mlrun/datastore/filestore.py
CHANGED
@@ -66,9 +66,7 @@ class FileStore(DataStore):
         dir_to_create = path.dirname(self._join(key))
         if dir_to_create:
             self._ensure_directory(dir_to_create)
-        mode =
-        if isinstance(data, bytes):
-            mode = mode + "b"
+        data, mode = self._prepare_put_data(data, append)
         with open(self._join(key), mode) as fp:
             fp.write(data)
             fp.close()
@@ -12,44 +12,82 @@
|
|
|
12
12
|
# See the License for the specific language governing permissions and
|
|
13
13
|
# limitations under the License.
|
|
14
14
|
import json
|
|
15
|
+
import os
|
|
15
16
|
from pathlib import Path
|
|
16
17
|
|
|
17
18
|
from fsspec.registry import get_filesystem_class
|
|
19
|
+
from google.auth.credentials import Credentials
|
|
20
|
+
from google.cloud.storage import Client, transfer_manager
|
|
21
|
+
from google.oauth2 import service_account
|
|
18
22
|
|
|
19
23
|
import mlrun.errors
|
|
20
24
|
from mlrun.utils import logger
|
|
21
25
|
|
|
22
|
-
from .base import DataStore, FileStats,
|
|
26
|
+
from .base import DataStore, FileStats, make_datastore_schema_sanitizer
|
|
23
27
|
|
|
24
28
|
# Google storage objects will be represented with the following URL: gcs://<bucket name>/<path> or gs://...
|
|
25
29
|
|
|
26
30
|
|
|
27
31
|
class GoogleCloudStorageStore(DataStore):
|
|
28
32
|
using_bucket = True
|
|
33
|
+
workers = 8
|
|
34
|
+
chunk_size = 32 * 1024 * 1024
|
|
29
35
|
|
|
30
36
|
def __init__(self, parent, schema, name, endpoint="", secrets: dict = None):
|
|
31
37
|
super().__init__(parent, name, schema, endpoint, secrets=secrets)
|
|
38
|
+
self._storage_client = None
|
|
39
|
+
self._storage_options = None
|
|
40
|
+
|
|
41
|
+
@property
|
|
42
|
+
def storage_client(self):
|
|
43
|
+
if self._storage_client:
|
|
44
|
+
return self._storage_client
|
|
45
|
+
|
|
46
|
+
token = self._get_credentials().get("token")
|
|
47
|
+
access = "https://www.googleapis.com/auth/devstorage.full_control"
|
|
48
|
+
if isinstance(token, str):
|
|
49
|
+
if os.path.exists(token):
|
|
50
|
+
credentials = service_account.Credentials.from_service_account_file(
|
|
51
|
+
token, scopes=[access]
|
|
52
|
+
)
|
|
53
|
+
else:
|
|
54
|
+
raise mlrun.errors.MLRunInvalidArgumentError(
|
|
55
|
+
"gcsfs authentication file not found!"
|
|
56
|
+
)
|
|
57
|
+
elif isinstance(token, dict):
|
|
58
|
+
credentials = service_account.Credentials.from_service_account_info(
|
|
59
|
+
token, scopes=[access]
|
|
60
|
+
)
|
|
61
|
+
elif isinstance(token, Credentials):
|
|
62
|
+
credentials = token
|
|
63
|
+
else:
|
|
64
|
+
raise ValueError(f"Unsupported token type: {type(token)}")
|
|
65
|
+
self._storage_client = Client(credentials=credentials)
|
|
66
|
+
return self._storage_client
|
|
32
67
|
|
|
33
68
|
@property
|
|
34
69
|
def filesystem(self):
|
|
35
70
|
"""return fsspec file system object, if supported"""
|
|
36
|
-
if self._filesystem:
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
) from exc
|
|
44
|
-
filesystem_class = get_filesystem_class(protocol=self.kind)
|
|
45
|
-
self._filesystem = makeDatastoreSchemaSanitizer(
|
|
46
|
-
filesystem_class,
|
|
47
|
-
using_bucket=self.using_bucket,
|
|
48
|
-
**self.get_storage_options(),
|
|
49
|
-
)
|
|
71
|
+
if not self._filesystem:
|
|
72
|
+
filesystem_class = get_filesystem_class(protocol=self.kind)
|
|
73
|
+
self._filesystem = make_datastore_schema_sanitizer(
|
|
74
|
+
filesystem_class,
|
|
75
|
+
using_bucket=self.using_bucket,
|
|
76
|
+
**self.storage_options,
|
|
77
|
+
)
|
|
50
78
|
return self._filesystem
|
|
51
79
|
|
|
52
|
-
|
|
80
|
+
@property
|
|
81
|
+
def storage_options(self):
|
|
82
|
+
if self._storage_options:
|
|
83
|
+
return self._storage_options
|
|
84
|
+
credentials = self._get_credentials()
|
|
85
|
+
# due to caching problem introduced in gcsfs 2024.3.1 (ML-7636)
|
|
86
|
+
credentials["use_listings_cache"] = False
|
|
87
|
+
self._storage_options = credentials
|
|
88
|
+
return self._storage_options
|
|
89
|
+
|
|
90
|
+
def _get_credentials(self):
|
|
53
91
|
credentials = self._get_secret_or_env(
|
|
54
92
|
"GCP_CREDENTIALS"
|
|
55
93
|
) or self._get_secret_or_env("GOOGLE_APPLICATION_CREDENTIALS")
|
|
@@ -71,6 +109,9 @@ class GoogleCloudStorageStore(DataStore):
|
|
|
71
109
|
)
|
|
72
110
|
return self._sanitize_storage_options(None)
|
|
73
111
|
|
|
112
|
+
def get_storage_options(self):
|
|
113
|
+
return self.storage_options
|
|
114
|
+
|
|
74
115
|
def _make_path(self, key):
|
|
75
116
|
key = key.strip("/")
|
|
76
117
|
path = Path(self.endpoint, key).as_posix()
|
|
@@ -90,21 +131,34 @@ class GoogleCloudStorageStore(DataStore):
|
|
|
90
131
|
raise mlrun.errors.MLRunInvalidArgumentError(
|
|
91
132
|
"Append mode not supported for Google cloud storage datastore"
|
|
92
133
|
)
|
|
93
|
-
|
|
94
|
-
if isinstance(data, bytes):
|
|
95
|
-
mode = "wb"
|
|
96
|
-
elif isinstance(data, str):
|
|
97
|
-
mode = "w"
|
|
98
|
-
else:
|
|
99
|
-
raise TypeError(
|
|
100
|
-
"Data type unknown. Unable to put in Google cloud storage!"
|
|
101
|
-
)
|
|
134
|
+
data, mode = self._prepare_put_data(data, append)
|
|
102
135
|
with self.filesystem.open(path, mode) as f:
|
|
103
136
|
f.write(data)
|
|
104
137
|
|
|
105
138
|
def upload(self, key, src_path):
|
|
106
|
-
|
|
107
|
-
self.
|
|
139
|
+
file_size = os.path.getsize(src_path)
|
|
140
|
+
united_path = self._make_path(key)
|
|
141
|
+
|
|
142
|
+
# Multiple upload limitation recommendations as described in
|
|
143
|
+
# https://cloud.google.com/storage/docs/multipart-uploads#storage-upload-object-chunks-python
|
|
144
|
+
|
|
145
|
+
if file_size <= self.chunk_size:
|
|
146
|
+
self.filesystem.put_file(src_path, united_path, overwrite=True)
|
|
147
|
+
return
|
|
148
|
+
|
|
149
|
+
bucket = self.storage_client.bucket(self.endpoint)
|
|
150
|
+
blob = bucket.blob(key.strip("/"))
|
|
151
|
+
|
|
152
|
+
try:
|
|
153
|
+
transfer_manager.upload_chunks_concurrently(
|
|
154
|
+
src_path, blob, chunk_size=self.chunk_size, max_workers=self.workers
|
|
155
|
+
)
|
|
156
|
+
except Exception as upload_chunks_concurrently_exception:
|
|
157
|
+
logger.warning(
|
|
158
|
+
f"gcs: failed to concurrently upload {src_path},"
|
|
159
|
+
f" exception: {upload_chunks_concurrently_exception}. Retrying with single part upload."
|
|
160
|
+
)
|
|
161
|
+
self.filesystem.put_file(src_path, united_path, overwrite=True)
|
|
108
162
|
|
|
109
163
|
def stat(self, key):
|
|
110
164
|
path = self._make_path(key)
|
|
@@ -133,12 +187,13 @@ class GoogleCloudStorageStore(DataStore):
|
|
|
133
187
|
|
|
134
188
|
def rm(self, path, recursive=False, maxdepth=None):
|
|
135
189
|
path = self._make_path(path)
|
|
190
|
+
# in order to raise an error in case of a connection error (ML-7056)
|
|
136
191
|
self.filesystem.exists(path)
|
|
137
|
-
|
|
192
|
+
super().rm(path, recursive=recursive, maxdepth=maxdepth)
|
|
138
193
|
|
|
139
194
|
def get_spark_options(self):
|
|
140
195
|
res = {}
|
|
141
|
-
st = self.
|
|
196
|
+
st = self._get_credentials()
|
|
142
197
|
if "token" in st:
|
|
143
198
|
res = {"spark.hadoop.google.cloud.auth.service.account.enable": "true"}
|
|
144
199
|
if isinstance(st["token"], str):
|
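
A hedged sketch of the upload strategy introduced above, written against the google-cloud-storage client directly: small files take a single-part upload, large files go through `transfer_manager.upload_chunks_concurrently`, and any failure falls back to a single-part upload. The bucket name, object key, and `upload_file` wrapper are illustrative; the 32 MiB chunk size and 8 workers mirror the class attributes in the diff.

import os

from google.cloud.storage import Client, transfer_manager

CHUNK_SIZE = 32 * 1024 * 1024  # 32 MiB, as in the diff
MAX_WORKERS = 8


def upload_file(client: Client, bucket_name: str, key: str, src_path: str):
    bucket = client.bucket(bucket_name)
    blob = bucket.blob(key.strip("/"))
    if os.path.getsize(src_path) <= CHUNK_SIZE:
        # Small file: a single-part upload is sufficient.
        blob.upload_from_filename(src_path)
        return
    try:
        # Large file: parallel multipart upload.
        transfer_manager.upload_chunks_concurrently(
            src_path, blob, chunk_size=CHUNK_SIZE, max_workers=MAX_WORKERS
        )
    except Exception:
        # Fall back to a plain single-part upload, as the datastore does.
        blob.upload_from_filename(src_path)


# upload_file(Client(), "my-bucket", "models/model.pkl", "/tmp/model.pkl")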
mlrun/datastore/redis.py
CHANGED
mlrun/datastore/s3.py
CHANGED
@@ -20,7 +20,7 @@ from fsspec.registry import get_filesystem_class

 import mlrun.errors

-from .base import DataStore, FileStats, get_range,
+from .base import DataStore, FileStats, get_range, make_datastore_schema_sanitizer


 class S3Store(DataStore):

@@ -126,7 +126,7 @@ class S3Store(DataStore):
         except ImportError as exc:
             raise ImportError("AWS s3fs not installed") from exc
         filesystem_class = get_filesystem_class(protocol=self.kind)
-        self._filesystem =
+        self._filesystem = make_datastore_schema_sanitizer(
             filesystem_class,
             using_bucket=self.using_bucket,
             **self.get_storage_options(),

@@ -183,6 +183,7 @@ class S3Store(DataStore):
         return obj.get()["Body"].read()

     def put(self, key, data, append=False):
+        data, _ = self._prepare_put_data(data, append)
         bucket, key = self.get_bucket_and_key(key)
         self.s3.Object(bucket, key).put(Body=data)
mlrun/datastore/sources.py
CHANGED
@@ -32,6 +32,7 @@ from mlrun.config import config
 from mlrun.datastore.snowflake_utils import get_snowflake_spark_options
 from mlrun.datastore.utils import transform_list_filters_to_tuple
 from mlrun.secrets import SecretsStore
+from mlrun.utils import logger

 from ..model import DataSource
 from ..platforms.iguazio import parse_path

@@ -1163,6 +1164,59 @@ class KafkaSource(OnlineSource):
             "to a Spark dataframe is not possible, as this operation is not supported by Spark"
         )

+    def create_topics(
+        self,
+        num_partitions: int = 4,
+        replication_factor: int = 1,
+        topics: list[str] = None,
+    ):
+        """
+        Create Kafka topics with the specified number of partitions and replication factor.
+
+        :param num_partitions:     number of partitions for the topics
+        :param replication_factor: replication factor for the topics
+        :param topics:             list of topic names to create, if None,
+                                   the topics will be taken from the source attributes
+        """
+        from kafka.admin import KafkaAdminClient, NewTopic
+
+        brokers = self.attributes.get("brokers")
+        if not brokers:
+            raise mlrun.errors.MLRunInvalidArgumentError(
+                "brokers must be specified in the KafkaSource attributes"
+            )
+        topics = topics or self.attributes.get("topics")
+        if not topics:
+            raise mlrun.errors.MLRunInvalidArgumentError(
+                "topics must be specified in the KafkaSource attributes"
+            )
+        new_topics = [
+            NewTopic(topic, num_partitions, replication_factor) for topic in topics
+        ]
+        kafka_admin = KafkaAdminClient(
+            bootstrap_servers=brokers,
+            sasl_mechanism=self.attributes.get("sasl", {}).get("sasl_mechanism"),
+            sasl_plain_username=self.attributes.get("sasl", {}).get("username"),
+            sasl_plain_password=self.attributes.get("sasl", {}).get("password"),
+            sasl_kerberos_service_name=self.attributes.get("sasl", {}).get(
+                "sasl_kerberos_service_name", "kafka"
+            ),
+            sasl_kerberos_domain_name=self.attributes.get("sasl", {}).get(
+                "sasl_kerberos_domain_name"
+            ),
+            sasl_oauth_token_provider=self.attributes.get("sasl", {}).get("mechanism"),
+        )
+        try:
+            kafka_admin.create_topics(new_topics)
+        finally:
+            kafka_admin.close()
+        logger.info(
+            "Kafka topics created successfully",
+            topics=topics,
+            num_partitions=num_partitions,
+            replication_factor=replication_factor,
+        )
+

 class SQLSource(BaseSourceDriver):
     kind = "sqldb"