mlrun 1.7.0rc5__py3-none-any.whl → 1.7.2__py3-none-any.whl
This diff compares publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between those versions as they appear in their public registries.
Potentially problematic release.
- mlrun/__init__.py +11 -1
- mlrun/__main__.py +39 -121
- mlrun/{datastore/helpers.py → alerts/__init__.py} +2 -5
- mlrun/alerts/alert.py +248 -0
- mlrun/api/schemas/__init__.py +4 -3
- mlrun/artifacts/__init__.py +8 -3
- mlrun/artifacts/base.py +39 -254
- mlrun/artifacts/dataset.py +9 -190
- mlrun/artifacts/manager.py +73 -46
- mlrun/artifacts/model.py +30 -158
- mlrun/artifacts/plots.py +23 -380
- mlrun/common/constants.py +73 -2
- mlrun/common/db/sql_session.py +3 -2
- mlrun/common/formatters/__init__.py +21 -0
- mlrun/common/formatters/artifact.py +46 -0
- mlrun/common/formatters/base.py +113 -0
- mlrun/common/formatters/feature_set.py +44 -0
- mlrun/common/formatters/function.py +46 -0
- mlrun/common/formatters/pipeline.py +53 -0
- mlrun/common/formatters/project.py +51 -0
- mlrun/common/formatters/run.py +29 -0
- mlrun/common/helpers.py +11 -1
- mlrun/{runtimes → common/runtimes}/constants.py +32 -4
- mlrun/common/schemas/__init__.py +21 -4
- mlrun/common/schemas/alert.py +202 -0
- mlrun/common/schemas/api_gateway.py +113 -2
- mlrun/common/schemas/artifact.py +28 -1
- mlrun/common/schemas/auth.py +11 -0
- mlrun/common/schemas/client_spec.py +2 -1
- mlrun/common/schemas/common.py +7 -4
- mlrun/common/schemas/constants.py +3 -0
- mlrun/common/schemas/feature_store.py +58 -28
- mlrun/common/schemas/frontend_spec.py +8 -0
- mlrun/common/schemas/function.py +11 -0
- mlrun/common/schemas/hub.py +7 -9
- mlrun/common/schemas/model_monitoring/__init__.py +21 -4
- mlrun/common/schemas/model_monitoring/constants.py +136 -42
- mlrun/common/schemas/model_monitoring/grafana.py +9 -5
- mlrun/common/schemas/model_monitoring/model_endpoints.py +89 -41
- mlrun/common/schemas/notification.py +69 -12
- mlrun/{runtimes/mpijob/v1alpha1.py → common/schemas/pagination.py} +10 -13
- mlrun/common/schemas/pipeline.py +7 -0
- mlrun/common/schemas/project.py +67 -16
- mlrun/common/schemas/runs.py +17 -0
- mlrun/common/schemas/schedule.py +1 -1
- mlrun/common/schemas/workflow.py +10 -2
- mlrun/common/types.py +14 -1
- mlrun/config.py +224 -58
- mlrun/data_types/data_types.py +11 -1
- mlrun/data_types/spark.py +5 -4
- mlrun/data_types/to_pandas.py +75 -34
- mlrun/datastore/__init__.py +8 -10
- mlrun/datastore/alibaba_oss.py +131 -0
- mlrun/datastore/azure_blob.py +131 -43
- mlrun/datastore/base.py +107 -47
- mlrun/datastore/datastore.py +17 -7
- mlrun/datastore/datastore_profile.py +91 -7
- mlrun/datastore/dbfs_store.py +3 -7
- mlrun/datastore/filestore.py +1 -3
- mlrun/datastore/google_cloud_storage.py +92 -32
- mlrun/datastore/hdfs.py +5 -0
- mlrun/datastore/inmem.py +6 -3
- mlrun/datastore/redis.py +3 -2
- mlrun/datastore/s3.py +30 -12
- mlrun/datastore/snowflake_utils.py +45 -0
- mlrun/datastore/sources.py +274 -59
- mlrun/datastore/spark_utils.py +30 -0
- mlrun/datastore/store_resources.py +9 -7
- mlrun/datastore/storeytargets.py +151 -0
- mlrun/datastore/targets.py +374 -102
- mlrun/datastore/utils.py +68 -5
- mlrun/datastore/v3io.py +28 -50
- mlrun/db/auth_utils.py +152 -0
- mlrun/db/base.py +231 -22
- mlrun/db/factory.py +1 -4
- mlrun/db/httpdb.py +864 -228
- mlrun/db/nopdb.py +268 -16
- mlrun/errors.py +35 -5
- mlrun/execution.py +111 -38
- mlrun/feature_store/__init__.py +0 -2
- mlrun/feature_store/api.py +46 -53
- mlrun/feature_store/common.py +6 -11
- mlrun/feature_store/feature_set.py +48 -23
- mlrun/feature_store/feature_vector.py +13 -2
- mlrun/feature_store/ingestion.py +7 -6
- mlrun/feature_store/retrieval/base.py +9 -4
- mlrun/feature_store/retrieval/dask_merger.py +2 -0
- mlrun/feature_store/retrieval/job.py +13 -4
- mlrun/feature_store/retrieval/local_merger.py +2 -0
- mlrun/feature_store/retrieval/spark_merger.py +24 -32
- mlrun/feature_store/steps.py +38 -19
- mlrun/features.py +6 -14
- mlrun/frameworks/_common/plan.py +3 -3
- mlrun/frameworks/_dl_common/loggers/tensorboard_logger.py +7 -12
- mlrun/frameworks/_ml_common/plan.py +1 -1
- mlrun/frameworks/auto_mlrun/auto_mlrun.py +2 -2
- mlrun/frameworks/lgbm/__init__.py +1 -1
- mlrun/frameworks/lgbm/callbacks/callback.py +2 -4
- mlrun/frameworks/lgbm/model_handler.py +1 -1
- mlrun/frameworks/parallel_coordinates.py +4 -4
- mlrun/frameworks/pytorch/__init__.py +2 -2
- mlrun/frameworks/sklearn/__init__.py +1 -1
- mlrun/frameworks/sklearn/mlrun_interface.py +13 -3
- mlrun/frameworks/tf_keras/__init__.py +5 -2
- mlrun/frameworks/tf_keras/callbacks/logging_callback.py +1 -1
- mlrun/frameworks/tf_keras/mlrun_interface.py +2 -2
- mlrun/frameworks/xgboost/__init__.py +1 -1
- mlrun/k8s_utils.py +57 -12
- mlrun/launcher/__init__.py +1 -1
- mlrun/launcher/base.py +6 -5
- mlrun/launcher/client.py +13 -11
- mlrun/launcher/factory.py +1 -1
- mlrun/launcher/local.py +15 -5
- mlrun/launcher/remote.py +10 -3
- mlrun/lists.py +6 -2
- mlrun/model.py +297 -48
- mlrun/model_monitoring/__init__.py +1 -1
- mlrun/model_monitoring/api.py +152 -357
- mlrun/model_monitoring/applications/__init__.py +10 -0
- mlrun/model_monitoring/applications/_application_steps.py +190 -0
- mlrun/model_monitoring/applications/base.py +108 -0
- mlrun/model_monitoring/applications/context.py +341 -0
- mlrun/model_monitoring/{evidently_application.py → applications/evidently_base.py} +27 -22
- mlrun/model_monitoring/applications/histogram_data_drift.py +227 -91
- mlrun/model_monitoring/applications/results.py +99 -0
- mlrun/model_monitoring/controller.py +130 -303
- mlrun/model_monitoring/{stores/models/sqlite.py → db/__init__.py} +5 -10
- mlrun/model_monitoring/db/stores/__init__.py +136 -0
- mlrun/model_monitoring/db/stores/base/__init__.py +15 -0
- mlrun/model_monitoring/db/stores/base/store.py +213 -0
- mlrun/model_monitoring/db/stores/sqldb/__init__.py +13 -0
- mlrun/model_monitoring/db/stores/sqldb/models/__init__.py +71 -0
- mlrun/model_monitoring/db/stores/sqldb/models/base.py +190 -0
- mlrun/model_monitoring/db/stores/sqldb/models/mysql.py +103 -0
- mlrun/model_monitoring/{stores/models/mysql.py → db/stores/sqldb/models/sqlite.py} +19 -13
- mlrun/model_monitoring/db/stores/sqldb/sql_store.py +659 -0
- mlrun/model_monitoring/db/stores/v3io_kv/__init__.py +13 -0
- mlrun/model_monitoring/db/stores/v3io_kv/kv_store.py +726 -0
- mlrun/model_monitoring/db/tsdb/__init__.py +105 -0
- mlrun/model_monitoring/db/tsdb/base.py +448 -0
- mlrun/model_monitoring/db/tsdb/helpers.py +30 -0
- mlrun/model_monitoring/db/tsdb/tdengine/__init__.py +15 -0
- mlrun/model_monitoring/db/tsdb/tdengine/schemas.py +298 -0
- mlrun/model_monitoring/db/tsdb/tdengine/stream_graph_steps.py +42 -0
- mlrun/model_monitoring/db/tsdb/tdengine/tdengine_connector.py +522 -0
- mlrun/model_monitoring/db/tsdb/v3io/__init__.py +15 -0
- mlrun/model_monitoring/db/tsdb/v3io/stream_graph_steps.py +158 -0
- mlrun/model_monitoring/db/tsdb/v3io/v3io_connector.py +849 -0
- mlrun/model_monitoring/features_drift_table.py +34 -22
- mlrun/model_monitoring/helpers.py +177 -39
- mlrun/model_monitoring/model_endpoint.py +3 -2
- mlrun/model_monitoring/stream_processing.py +165 -398
- mlrun/model_monitoring/tracking_policy.py +7 -1
- mlrun/model_monitoring/writer.py +161 -125
- mlrun/package/packagers/default_packager.py +2 -2
- mlrun/package/packagers_manager.py +1 -0
- mlrun/package/utils/_formatter.py +2 -2
- mlrun/platforms/__init__.py +11 -10
- mlrun/platforms/iguazio.py +67 -228
- mlrun/projects/__init__.py +6 -1
- mlrun/projects/operations.py +47 -20
- mlrun/projects/pipelines.py +396 -249
- mlrun/projects/project.py +1125 -414
- mlrun/render.py +28 -22
- mlrun/run.py +207 -180
- mlrun/runtimes/__init__.py +76 -11
- mlrun/runtimes/base.py +40 -14
- mlrun/runtimes/daskjob.py +9 -2
- mlrun/runtimes/databricks_job/databricks_runtime.py +1 -0
- mlrun/runtimes/databricks_job/databricks_wrapper.py +1 -1
- mlrun/runtimes/funcdoc.py +1 -29
- mlrun/runtimes/kubejob.py +34 -128
- mlrun/runtimes/local.py +39 -10
- mlrun/runtimes/mpijob/__init__.py +0 -20
- mlrun/runtimes/mpijob/abstract.py +8 -8
- mlrun/runtimes/mpijob/v1.py +1 -1
- mlrun/runtimes/nuclio/api_gateway.py +646 -177
- mlrun/runtimes/nuclio/application/__init__.py +15 -0
- mlrun/runtimes/nuclio/application/application.py +758 -0
- mlrun/runtimes/nuclio/application/reverse_proxy.go +95 -0
- mlrun/runtimes/nuclio/function.py +188 -68
- mlrun/runtimes/nuclio/serving.py +57 -60
- mlrun/runtimes/pod.py +191 -58
- mlrun/runtimes/remotesparkjob.py +11 -8
- mlrun/runtimes/sparkjob/spark3job.py +17 -18
- mlrun/runtimes/utils.py +40 -73
- mlrun/secrets.py +6 -2
- mlrun/serving/__init__.py +8 -1
- mlrun/serving/remote.py +2 -3
- mlrun/serving/routers.py +89 -64
- mlrun/serving/server.py +54 -26
- mlrun/serving/states.py +187 -56
- mlrun/serving/utils.py +19 -11
- mlrun/serving/v2_serving.py +136 -63
- mlrun/track/tracker.py +2 -1
- mlrun/track/trackers/mlflow_tracker.py +5 -0
- mlrun/utils/async_http.py +26 -6
- mlrun/utils/db.py +18 -0
- mlrun/utils/helpers.py +375 -105
- mlrun/utils/http.py +2 -2
- mlrun/utils/logger.py +75 -9
- mlrun/utils/notifications/notification/__init__.py +14 -10
- mlrun/utils/notifications/notification/base.py +48 -0
- mlrun/utils/notifications/notification/console.py +2 -0
- mlrun/utils/notifications/notification/git.py +24 -1
- mlrun/utils/notifications/notification/ipython.py +2 -0
- mlrun/utils/notifications/notification/slack.py +96 -21
- mlrun/utils/notifications/notification/webhook.py +63 -2
- mlrun/utils/notifications/notification_pusher.py +146 -16
- mlrun/utils/regex.py +9 -0
- mlrun/utils/retryer.py +3 -2
- mlrun/utils/v3io_clients.py +2 -3
- mlrun/utils/version/version.json +2 -2
- mlrun-1.7.2.dist-info/METADATA +390 -0
- mlrun-1.7.2.dist-info/RECORD +351 -0
- {mlrun-1.7.0rc5.dist-info → mlrun-1.7.2.dist-info}/WHEEL +1 -1
- mlrun/feature_store/retrieval/conversion.py +0 -271
- mlrun/kfpops.py +0 -868
- mlrun/model_monitoring/application.py +0 -310
- mlrun/model_monitoring/batch.py +0 -974
- mlrun/model_monitoring/controller_handler.py +0 -37
- mlrun/model_monitoring/prometheus.py +0 -216
- mlrun/model_monitoring/stores/__init__.py +0 -111
- mlrun/model_monitoring/stores/kv_model_endpoint_store.py +0 -574
- mlrun/model_monitoring/stores/model_endpoint_store.py +0 -145
- mlrun/model_monitoring/stores/models/__init__.py +0 -27
- mlrun/model_monitoring/stores/models/base.py +0 -84
- mlrun/model_monitoring/stores/sql_model_endpoint_store.py +0 -382
- mlrun/platforms/other.py +0 -305
- mlrun-1.7.0rc5.dist-info/METADATA +0 -269
- mlrun-1.7.0rc5.dist-info/RECORD +0 -323
- {mlrun-1.7.0rc5.dist-info → mlrun-1.7.2.dist-info}/LICENSE +0 -0
- {mlrun-1.7.0rc5.dist-info → mlrun-1.7.2.dist-info}/entry_points.txt +0 -0
- {mlrun-1.7.0rc5.dist-info → mlrun-1.7.2.dist-info}/top_level.txt +0 -0
mlrun/datastore/targets.py
CHANGED
@@ -17,6 +17,7 @@ import os
 import random
 import sys
 import time
+import warnings
 from collections import Counter
 from copy import copy
 from typing import Any, Optional, Union
@@ -28,6 +29,11 @@ from mergedeep import merge
 import mlrun
 import mlrun.utils.helpers
 from mlrun.config import config
+from mlrun.datastore.snowflake_utils import (
+    get_snowflake_password,
+    get_snowflake_spark_options,
+)
+from mlrun.datastore.utils import transform_list_filters_to_tuple
 from mlrun.model import DataSource, DataTarget, DataTargetBase, TargetPathObject
 from mlrun.utils import logger, now_date
 from mlrun.utils.helpers import to_parquet
@@ -41,7 +47,6 @@ from .spark_utils import spark_session_update_hadoop_options
 from .utils import (
     _generate_sql_query_with_time_filter,
     filter_df_start_end_time,
-    parse_kafka_url,
     select_columns_from_df,
 )
 
@@ -57,6 +62,7 @@ class TargetTypes:
     dataframe = "dataframe"
     custom = "custom"
     sql = "sql"
+    snowflake = "snowflake"
 
     @staticmethod
     def all():
@@ -71,6 +77,7 @@ class TargetTypes:
             TargetTypes.dataframe,
             TargetTypes.custom,
             TargetTypes.sql,
+            TargetTypes.snowflake,
         ]
 
 
@@ -78,11 +85,14 @@ def generate_target_run_id():
     return f"{round(time.time() * 1000)}_{random.randint(0, 999)}"
 
 
-def write_spark_dataframe_with_options(spark_options, df, mode):
+def write_spark_dataframe_with_options(spark_options, df, mode, write_format=None):
     non_hadoop_spark_options = spark_session_update_hadoop_options(
         df.sql_ctx.sparkSession, spark_options
     )
-    df.write.mode(mode).save(**non_hadoop_spark_options)
+    if write_format:
+        df.write.format(write_format).mode(mode).save(**non_hadoop_spark_options)
+    else:
+        df.write.mode(mode).save(**non_hadoop_spark_options)
 
 
 def default_target_names():
@@ -379,6 +389,7 @@ class BaseStoreTarget(DataTargetBase):
     is_offline = False
     support_spark = False
     support_storey = False
+    support_pandas = False
     support_append = False
 
     def __init__(
@@ -428,6 +439,12 @@ class BaseStoreTarget(DataTargetBase):
         self.storage_options = storage_options
         self.schema = schema or {}
         self.credentials_prefix = credentials_prefix
+        if credentials_prefix:
+            warnings.warn(
+                "The 'credentials_prefix' parameter is deprecated and will be removed in "
+                "1.9.0. Please use datastore profiles instead.",
+                FutureWarning,
+            )
 
         self._target = None
         self._resource = None
@@ -497,7 +514,10 @@ class BaseStoreTarget(DataTargetBase):
             options = self.get_spark_options(key_column, timestamp_key)
             options.update(kwargs)
             df = self.prepare_spark_df(df, key_column, timestamp_key, options)
-            write_spark_dataframe_with_options(options, df, "overwrite")
+            write_format = options.pop("format", None)
+            write_spark_dataframe_with_options(
+                options, df, "overwrite", write_format=write_format
+            )
         elif hasattr(df, "dask"):
             dask_options = self.get_dask_options()
             store, path_in_store, target_path = self._get_store_and_path()
@@ -524,15 +544,18 @@ class BaseStoreTarget(DataTargetBase):
             store, path_in_store, target_path = self._get_store_and_path()
             target_path = generate_path_with_chunk(self, chunk_id, target_path)
             file_system = store.filesystem
-            if file_system.protocol == "file":
+            if (
+                file_system.protocol == "file"
+                # fsspec 2023.10.0 changed protocol from "file" to ("file", "local")
+                or isinstance(file_system.protocol, (tuple, list))
+                and "file" in file_system.protocol
+            ):
                 dir = os.path.dirname(target_path)
                 if dir:
                     os.makedirs(dir, exist_ok=True)
             target_df = df
             partition_cols = None  # single parquet file
-            if not target_path.endswith(
-                ".pq"
-            ):  # directory
+            if not mlrun.utils.helpers.is_parquet_file(target_path):  # directory
                 partition_cols = []
                 if timestamp_key and (
                     self.partitioned or self.time_partitioning_granularity
@@ -641,6 +664,29 @@ class BaseStoreTarget(DataTargetBase):
     def _target_path_object(self):
         """return the actual/computed target path"""
         is_single_file = hasattr(self, "is_single_file") and self.is_single_file()
+
+        if self._resource and self.path:
+            parsed_url = urlparse(self.path)
+            # When the URL consists only from scheme and endpoint and no path,
+            # make a default path for DS and redis targets.
+            # Also ignore KafkaTarget when it uses the ds scheme (no default path for KafkaTarget)
+            if (
+                not isinstance(self, KafkaTarget)
+                and parsed_url.scheme in ["ds", "redis", "rediss"]
+                and (not parsed_url.path or parsed_url.path == "/")
+            ):
+                return TargetPathObject(
+                    _get_target_path(
+                        self,
+                        self._resource,
+                        self.run_id is not None,
+                        netloc=parsed_url.netloc,
+                        scheme=parsed_url.scheme,
+                    ),
+                    self.run_id,
+                    is_single_file,
+                )
+
         return self.get_path() or (
             TargetPathObject(
                 _get_target_path(self, self._resource, self.run_id is not None),
@@ -657,6 +703,7 @@ class BaseStoreTarget(DataTargetBase):
             self.kind, self.name, self.get_target_templated_path()
         )
         target = self._target
+        target.attributes = self.attributes
         target.run_id = self.run_id
         target.status = status or target.status or "created"
         target.updated = now_date().isoformat()
@@ -685,11 +732,25 @@ class BaseStoreTarget(DataTargetBase):
         timestamp_key=None,
         featureset_status=None,
     ):
+        if not self.support_storey:
+            raise mlrun.errors.MLRunRuntimeError(
+                f"{type(self).__name__} does not support storey engine"
+            )
         raise NotImplementedError()
 
     def purge(self):
+        """
+        Delete the files of the target.
+
+        Do not use this function directly from the sdk. Use FeatureSet.purge_targets.
+        """
         store, path_in_store, target_path = self._get_store_and_path()
-        store.rm(path_in_store, recursive=True)
+        if path_in_store not in ["", "/"]:
+            store.rm(path_in_store, recursive=True)
+        else:
+            raise mlrun.errors.MLRunInvalidArgumentError(
+                "Unable to delete target. Please Use purge_targets from FeatureSet object."
+            )
 
     def as_df(
         self,
@@ -699,9 +760,15 @@ class BaseStoreTarget(DataTargetBase):
         start_time=None,
         end_time=None,
         time_column=None,
+        additional_filters=None,
         **kwargs,
     ):
         """return the target data as dataframe"""
+        if not self.support_pandas:
+            raise NotImplementedError()
+        mlrun.utils.helpers.additional_filters_warning(
+            additional_filters, self.__class__
+        )
         return mlrun.get_dataitem(self.get_target_path()).as_df(
             columns=columns,
             df_module=df_module,
@@ -713,14 +780,22 @@ class BaseStoreTarget(DataTargetBase):
 
     def get_spark_options(self, key_column=None, timestamp_key=None, overwrite=True):
         # options used in spark.read.load(**options)
+        if not self.support_spark:
+            raise mlrun.errors.MLRunRuntimeError(
+                f"{type(self).__name__} does not support spark engine"
+            )
         raise NotImplementedError()
 
-    def prepare_spark_df(self, df, key_columns, timestamp_key=None, spark_options={}):
+    def prepare_spark_df(self, df, key_columns, timestamp_key=None, spark_options=None):
         return df
 
     def get_dask_options(self):
         raise NotImplementedError()
 
+    @property
+    def source_spark_attributes(self) -> dict:
+        return {}
+
 
 class ParquetTarget(BaseStoreTarget):
     """Parquet target storage driver, used to materialize feature set/vector data into parquet files.
@@ -752,6 +827,7 @@ class ParquetTarget(BaseStoreTarget):
     support_spark = True
     support_storey = True
     support_dask = True
+    support_pandas = True
     support_append = True
 
     def __init__(
@@ -857,10 +933,9 @@ class ParquetTarget(BaseStoreTarget):
             if time_unit == time_partitioning_granularity:
                 break
 
-        if (
-            not self.partitioned
-            and not self.get_target_path().endswith(".parquet")
-            and not self.get_target_path().endswith(".pq")
+        target_path = self.get_target_path()
+        if not self.partitioned and not mlrun.utils.helpers.is_parquet_file(
+            target_path
         ):
             partition_cols = []
 
@@ -868,25 +943,16 @@ class ParquetTarget(BaseStoreTarget):
         for key_column in key_columns:
             tuple_key_columns.append((key_column.name, key_column.value_type))
 
-        store, path_in_store, target_path = self._get_store_and_path()
-
-        storage_options = store.get_storage_options()
-        if storage_options and self.storage_options:
-            storage_options = merge(storage_options, self.storage_options)
-        else:
-            storage_options = storage_options or self.storage_options
-
         step = graph.add_step(
             name=self.name or "ParquetTarget",
             after=after,
             graph_shape="cylinder",
-            class_name="storey.ParquetTarget",
+            class_name="mlrun.datastore.storeytargets.ParquetStoreyTarget",
             path=target_path,
             columns=column_list,
             index_cols=tuple_key_columns,
             partition_cols=partition_cols,
             time_field=timestamp_key,
-            storage_options=storage_options,
             max_events=self.max_events,
             flush_after_seconds=self.flush_after_seconds,
             update_last_written=featureset_status.update_last_written_for_target,
@@ -946,6 +1012,7 @@ class ParquetTarget(BaseStoreTarget):
         start_time=None,
         end_time=None,
         time_column=None,
+        additional_filters=None,
         **kwargs,
     ):
         """return the target data as dataframe"""
@@ -956,6 +1023,7 @@ class ParquetTarget(BaseStoreTarget):
             start_time=start_time,
             end_time=end_time,
             time_column=time_column,
+            additional_filters=transform_list_filters_to_tuple(additional_filters),
             **kwargs,
         )
         if not columns:
@@ -977,9 +1045,7 @@ class ParquetTarget(BaseStoreTarget):
         return result
 
     def is_single_file(self):
-        if self.path:
-            return self.path.endswith(".parquet") or self.path.endswith(".pq")
-        return False
+        return mlrun.utils.helpers.is_parquet_file(self.path)
 
     def prepare_spark_df(self, df, key_columns, timestamp_key=None, spark_options=None):
         # If partitioning by time, add the necessary columns
@@ -1019,6 +1085,7 @@ class CSVTarget(BaseStoreTarget):
     is_offline = True
     support_spark = True
     support_storey = True
+    support_pandas = True
 
     @staticmethod
     def _write_dataframe(df, storage_options, target_path, partition_cols, **kwargs):
@@ -1040,17 +1107,16 @@ class CSVTarget(BaseStoreTarget):
         column_list = self._get_column_list(
             features=features, timestamp_key=timestamp_key, key_columns=key_columns
         )
-        store, path_in_store, target_path = self._get_store_and_path()
+        target_path = self.get_target_path()
         graph.add_step(
             name=self.name or "CSVTarget",
             after=after,
             graph_shape="cylinder",
-            class_name="storey.CSVTarget",
+            class_name="mlrun.datastore.storeytargets.CSVStoreyTarget",
             path=target_path,
             columns=column_list,
             header=True,
             index_cols=key_columns,
-            storage_options=store.get_storage_options(),
             **self.attributes,
         )
 
@@ -1070,7 +1136,8 @@ class CSVTarget(BaseStoreTarget):
         import pyspark.sql.functions as funcs
 
         for col_name, col_type in df.dtypes:
-            if col_type == "timestamp":
+            # covers TimestampType and TimestampNTZType, which was added in PySpark 3.4.0
+            if col_type.startswith("timestamp"):
                 # df.write.csv saves timestamps with millisecond precision, but we want microsecond precision
                 # for compatibility with storey.
                 df = df.withColumn(
@@ -1086,8 +1153,12 @@ class CSVTarget(BaseStoreTarget):
         start_time=None,
         end_time=None,
         time_column=None,
+        additional_filters=None,
         **kwargs,
     ):
+        mlrun.utils.helpers.additional_filters_warning(
+            additional_filters, self.__class__
+        )
         df = super().as_df(
             columns=columns,
             df_module=df_module,
@@ -1108,6 +1179,134 @@ class CSVTarget(BaseStoreTarget):
         return True
 
 
+class SnowflakeTarget(BaseStoreTarget):
+    """
+    :param attributes: A dictionary of attributes for Snowflake connection; will be overridden by database parameters
+        if they exist.
+    :param url: Snowflake hostname, in the format: <account_name>.<region>.snowflakecomputing.com
+    :param user: Snowflake user for login
+    :param db_schema: Database schema
+    :param database: Database name
+    :param warehouse: Snowflake warehouse name
+    :param table_name: Snowflake table name
+    """
+
+    support_spark = True
+    support_append = True
+    is_offline = True
+    kind = TargetTypes.snowflake
+
+    def __init__(
+        self,
+        name: str = "",
+        path=None,
+        attributes: dict[str, str] = None,
+        after_step=None,
+        columns=None,
+        partitioned: bool = False,
+        key_bucketing_number: Optional[int] = None,
+        partition_cols: Optional[list[str]] = None,
+        time_partitioning_granularity: Optional[str] = None,
+        max_events: Optional[int] = None,
+        flush_after_seconds: Optional[int] = None,
+        storage_options: dict[str, str] = None,
+        schema: dict[str, Any] = None,
+        credentials_prefix=None,
+        url: str = None,
+        user: str = None,
+        db_schema: str = None,
+        database: str = None,
+        warehouse: str = None,
+        table_name: str = None,
+    ):
+        attributes = attributes or {}
+        if url:
+            attributes["url"] = url
+        if user:
+            attributes["user"] = user
+        if database:
+            attributes["database"] = database
+        if db_schema:
+            attributes["db_schema"] = db_schema
+        if warehouse:
+            attributes["warehouse"] = warehouse
+        if table_name:
+            attributes["table"] = table_name
+
+        super().__init__(
+            name,
+            path,
+            attributes,
+            after_step,
+            list(schema.keys()) if schema else columns,
+            partitioned,
+            key_bucketing_number,
+            partition_cols,
+            time_partitioning_granularity,
+            max_events=max_events,
+            flush_after_seconds=flush_after_seconds,
+            storage_options=storage_options,
+            schema=schema,
+            credentials_prefix=credentials_prefix,
+        )
+
+    def get_spark_options(self, key_column=None, timestamp_key=None, overwrite=True):
+        spark_options = get_snowflake_spark_options(self.attributes)
+        spark_options["dbtable"] = self.attributes.get("table")
+        return spark_options
+
+    def purge(self):
+        import snowflake.connector
+
+        missing = [
+            key
+            for key in ["database", "db_schema", "table", "url", "user", "warehouse"]
+            if self.attributes.get(key) is None
+        ]
+        if missing:
+            raise mlrun.errors.MLRunRuntimeError(
+                f"Can't purge Snowflake target, "
+                f"some attributes are missing: {', '.join(missing)}"
+            )
+        account = self.attributes["url"].replace(".snowflakecomputing.com", "")
+
+        with snowflake.connector.connect(
+            account=account,
+            user=self.attributes["user"],
+            password=get_snowflake_password(),
+            warehouse=self.attributes["warehouse"],
+        ) as snowflake_connector:
+            drop_statement = (
+                f"DROP TABLE IF EXISTS {self.attributes['database']}.{self.attributes['db_schema']}"
+                f".{self.attributes['table']}"
+            )
+            snowflake_connector.execute_string(drop_statement)
+
+    def as_df(
+        self,
+        columns=None,
+        df_module=None,
+        entities=None,
+        start_time=None,
+        end_time=None,
+        time_column=None,
+        additional_filters=None,
+        **kwargs,
+    ):
+        raise mlrun.errors.MLRunRuntimeError(
+            f"{type(self).__name__} does not support pandas engine"
+        )
+
+    @property
+    def source_spark_attributes(self) -> dict:
+        keys = ["url", "user", "database", "db_schema", "warehouse"]
+        attributes = self.attributes or {}
+        snowflake_dict = {key: attributes.get(key) for key in keys}
+        table = attributes.get("table")
+        snowflake_dict["query"] = f"SELECT * from {table}" if table else None
+        return snowflake_dict
+
+
 class NoSqlBaseTarget(BaseStoreTarget):
     is_table = True
     is_online = True
@@ -1132,6 +1331,19 @@ class NoSqlBaseTarget(BaseStoreTarget):
         timestamp_key=None,
         featureset_status=None,
     ):
+        table, column_list = self._get_table_and_columns(features, key_columns)
+
+        graph.add_step(
+            name=self.name or self.writer_step_name,
+            after=after,
+            graph_shape="cylinder",
+            class_name="mlrun.datastore.storeytargets.NoSqlStoreyTarget",
+            columns=column_list,
+            table=table,
+            **self.attributes,
+        )
+
+    def _get_table_and_columns(self, features, key_columns):
         key_columns = list(key_columns.keys())
         table = self._resource.uri
         column_list = self._get_column_list(
@@ -1150,15 +1362,7 @@ class NoSqlBaseTarget(BaseStoreTarget):
             col for col in column_list if col[0] not in aggregate_features
         ]
 
-        graph.add_step(
-            name=self.name or self.writer_step_name,
-            after=after,
-            graph_shape="cylinder",
-            class_name="storey.NoSqlTarget",
-            columns=column_list,
-            table=table,
-            **self.attributes,
-        )
+        return table, column_list
 
     def prepare_spark_df(self, df, key_columns, timestamp_key=None, spark_options=None):
         raise NotImplementedError()
@@ -1169,9 +1373,6 @@ class NoSqlBaseTarget(BaseStoreTarget):
     def get_dask_options(self):
         return {"format": "csv"}
 
-    def as_df(self, columns=None, df_module=None, **kwargs):
-        raise NotImplementedError()
-
     def write_dataframe(
         self, df, key_column=None, timestamp_key=None, chunk_id=0, **kwargs
     ):
@@ -1179,7 +1380,10 @@ class NoSqlBaseTarget(BaseStoreTarget):
             options = self.get_spark_options(key_column, timestamp_key)
             options.update(kwargs)
             df = self.prepare_spark_df(df)
-            write_spark_dataframe_with_options(options, df, "overwrite")
+            write_format = options.pop("format", None)
+            write_spark_dataframe_with_options(
+                options, df, "overwrite", write_format=write_format
+            )
         else:
             # To prevent modification of the original dataframe and make sure
             # that the last event of a key is the one being persisted
@@ -1281,11 +1485,9 @@ class RedisNoSqlTarget(NoSqlBaseTarget):
     support_spark = True
     writer_step_name = "RedisNoSqlTarget"
 
-
-
-
-    def _get_server_endpoint(self):
-        endpoint, uri = parse_path(self.get_target_path())
+    @staticmethod
+    def get_server_endpoint(path, credentials_prefix=None):
+        endpoint, uri = parse_path(path)
         endpoint = endpoint or mlrun.mlconf.redis.url
         if endpoint.startswith("ds://"):
             datastore_profile = datastore_profile_read(endpoint)
@@ -1302,8 +1504,15 @@ class RedisNoSqlTarget(NoSqlBaseTarget):
             raise mlrun.errors.MLRunInvalidArgumentError(
                 "Provide Redis username and password only via secrets"
             )
-        user = mlrun.get_secret_or_env("REDIS_USER", default="")
-        password = mlrun.get_secret_or_env("REDIS_PASSWORD", default="")
+        credentials_prefix = credentials_prefix or mlrun.get_secret_or_env(
+            key="CREDENTIALS_PREFIX"
+        )
+        user = mlrun.get_secret_or_env(
+            "REDIS_USER", default="", prefix=credentials_prefix
+        )
+        password = mlrun.get_secret_or_env(
+            "REDIS_PASSWORD", default="", prefix=credentials_prefix
+        )
         host = parsed_endpoint.hostname
         port = parsed_endpoint.port if parsed_endpoint.port else "6379"
         scheme = parsed_endpoint.scheme
@@ -1317,7 +1526,9 @@ class RedisNoSqlTarget(NoSqlBaseTarget):
         from storey import Table
         from storey.redis_driver import RedisDriver
 
-        endpoint, uri = self._get_server_endpoint()
+        endpoint, uri = self.get_server_endpoint(
+            self.get_target_path(), self.credentials_prefix
+        )
 
         return Table(
             uri,
@@ -1326,7 +1537,9 @@ class RedisNoSqlTarget(NoSqlBaseTarget):
         )
 
     def get_spark_options(self, key_column=None, timestamp_key=None, overwrite=True):
-        endpoint, uri = self._get_server_endpoint()
+        endpoint, uri = self.get_server_endpoint(
+            self.get_target_path(), self.credentials_prefix
+        )
         parsed_endpoint = urlparse(endpoint)
         store, path_in_store, path = self._get_store_and_path()
         return {
@@ -1358,6 +1571,29 @@ class RedisNoSqlTarget(NoSqlBaseTarget):
 
         return df
 
+    def add_writer_step(
+        self,
+        graph,
+        after,
+        features,
+        key_columns=None,
+        timestamp_key=None,
+        featureset_status=None,
+    ):
+        table, column_list = self._get_table_and_columns(features, key_columns)
+
+        graph.add_step(
+            path=self.get_target_path(),
+            name=self.name or self.writer_step_name,
+            after=after,
+            graph_shape="cylinder",
+            class_name="mlrun.datastore.storeytargets.RedisNoSqlStoreyTarget",
+            columns=column_list,
+            table=table,
+            credentials_prefix=self.credentials_prefix,
+            **self.attributes,
+        )
+
 
 class StreamTarget(BaseStoreTarget):
     kind = TargetTypes.stream
@@ -1376,37 +1612,46 @@ class StreamTarget(BaseStoreTarget):
         timestamp_key=None,
         featureset_status=None,
     ):
-        from storey import V3ioDriver
-
         key_columns = list(key_columns.keys())
-
-        if not path:
-            raise mlrun.errors.MLRunInvalidArgumentError("StreamTarget requires a path")
-        endpoint, uri = parse_path(path)
-        storage_options = store.get_storage_options()
-        access_key = storage_options.get("v3io_access_key")
+
         column_list = self._get_column_list(
             features=features, timestamp_key=timestamp_key, key_columns=key_columns
         )
+        stream_path = self.get_target_path()
+        if not stream_path:
+            raise mlrun.errors.MLRunInvalidArgumentError("StreamTarget requires a path")
 
         graph.add_step(
             name=self.name or "StreamTarget",
             after=after,
             graph_shape="cylinder",
-            class_name="storey.StreamTarget",
+            class_name="mlrun.datastore.storeytargets.StreamStoreyTarget",
             columns=column_list,
-            storage=V3ioDriver(
-                webapi=endpoint or mlrun.mlconf.v3io_api, access_key=access_key
-            ),
-            stream_path=uri,
+            stream_path=stream_path,
             **self.attributes,
         )
 
-    def as_df(self, columns=None, df_module=None, **kwargs):
-        raise NotImplementedError()
-
 
 class KafkaTarget(BaseStoreTarget):
+    """
+    Kafka target storage driver, used to write data into kafka topics.
+
+    example::
+
+        # define target
+        kafka_target = KafkaTarget(
+            name="kafka", path="my_topic", brokers="localhost:9092"
+        )
+
+        # ingest
+        stocks_set.ingest(stocks, [kafka_target])
+
+    :param name: target name
+    :param path: topic name e.g. "my_topic"
+    :param after_step: optional, after what step in the graph to add the target
+    :param columns: optional, which columns from data to write
+    :param bootstrap_servers: Deprecated. Use the brokers parameter instead
+    :param producer_options: additional configurations for kafka producer
+    :param brokers: kafka broker as represented by a host:port pair, or a list of kafka brokers, e.g.
+        "localhost:9092", or ["kafka-broker-1:9092", "kafka-broker-2:9092"]
+    """
+
     kind = TargetTypes.kafka
     is_table = False
     is_online = False
@@ -1419,11 +1664,27 @@ class KafkaTarget(BaseStoreTarget):
         *args,
         bootstrap_servers=None,
         producer_options=None,
+        brokers=None,
         **kwargs,
     ):
         attrs = {}
-        if bootstrap_servers:
-            attrs["bootstrap_servers"] = bootstrap_servers
+
+        # TODO: Remove this in 1.9.0
+        if bootstrap_servers:
+            if brokers:
+                raise mlrun.errors.MLRunInvalidArgumentError(
+                    "KafkaTarget cannot be created with both the 'brokers' parameter and the deprecated "
+                    "'bootstrap_servers' parameter. Please use 'brokers' only."
+                )
+            warnings.warn(
+                "'bootstrap_servers' parameter is deprecated in 1.7.0 and will be removed in 1.9.0, "
+                "use 'brokers' instead.",
+                FutureWarning,
+            )
+            brokers = bootstrap_servers
+
+        if brokers:
+            attrs["brokers"] = brokers
         if producer_options is not None:
             attrs["producer_options"] = producer_options
 
@@ -1442,37 +1703,21 @@ class KafkaTarget(BaseStoreTarget):
         column_list = self._get_column_list(
             features=features, timestamp_key=timestamp_key, key_columns=key_columns
         )
-        if self.path and self.path.startswith("ds://"):
-            datastore_profile = datastore_profile_read(self.path)
-            attributes = datastore_profile.attributes()
-            bootstrap_servers = attributes.pop("bootstrap_servers", None)
-            topic = datastore_profile.topic
-        else:
-            attributes = copy(self.attributes)
-            bootstrap_servers = attributes.pop("bootstrap_servers", None)
-            topic, bootstrap_servers = parse_kafka_url(
-                self.get_target_path(), bootstrap_servers
-            )
+        path = self.get_target_path()
 
-        if not topic:
-            raise mlrun.errors.MLRunInvalidArgumentError(
-                "KafkaTarget requires a path (topic)"
-            )
+        if not path:
+            raise mlrun.errors.MLRunInvalidArgumentError("KafkaTarget requires a path")
 
         graph.add_step(
             name=self.name or "KafkaTarget",
             after=after,
             graph_shape="cylinder",
-            class_name="storey.KafkaTarget",
+            class_name="mlrun.datastore.storeytargets.KafkaStoreyTarget",
             columns=column_list,
-            topic=topic,
-            brokers=bootstrap_servers,
-            **attributes,
+            path=path,
+            attributes=self.attributes,
         )
 
-    def as_df(self, columns=None, df_module=None, **kwargs):
-        raise NotImplementedError()
-
     def purge(self):
         pass
@@ -1507,7 +1752,7 @@ class TSDBTarget(BaseStoreTarget):
 
         graph.add_step(
             name=self.name or "TSDBTarget",
-            class_name="storey.TSDBTarget",
+            class_name="mlrun.datastore.storeytargets.TSDBStoreyTarget",
             after=after,
             graph_shape="cylinder",
             path=uri,
@@ -1517,9 +1762,6 @@ class TSDBTarget(BaseStoreTarget):
             **self.attributes,
         )
 
-    def as_df(self, columns=None, df_module=None, **kwargs):
-        raise NotImplementedError()
-
     def write_dataframe(
         self, df, key_column=None, timestamp_key=None, chunk_id=0, **kwargs
     ):
@@ -1557,6 +1799,7 @@ class CustomTarget(BaseStoreTarget):
     is_online = False
     support_spark = False
     support_storey = True
+    support_pandas = True
 
     def __init__(
         self,
@@ -1592,6 +1835,7 @@ class CustomTarget(BaseStoreTarget):
 class DFTarget(BaseStoreTarget):
     kind = TargetTypes.dataframe
     support_storey = True
+    support_pandas = True
 
     def __init__(self, *args, name="dataframe", **kwargs):
         self._df = None
@@ -1628,11 +1872,16 @@ class DFTarget(BaseStoreTarget):
         self,
         columns=None,
         df_module=None,
+        entities=None,
         start_time=None,
         end_time=None,
         time_column=None,
+        additional_filters=None,
         **kwargs,
     ):
+        mlrun.utils.helpers.additional_filters_warning(
+            additional_filters, self.__class__
+        )
         return select_columns_from_df(
             filter_df_start_end_time(
                 self._df,
@@ -1649,6 +1898,7 @@ class SQLTarget(BaseStoreTarget):
     is_online = True
     support_spark = False
     support_storey = True
+    support_pandas = True
 
     def __init__(
         self,
@@ -1791,7 +2041,7 @@ class SQLTarget(BaseStoreTarget):
             name=self.name or "SqlTarget",
             after=after,
             graph_shape="cylinder",
-            class_name="storey.NoSqlTarget",
+            class_name="mlrun.datastore.storeytargets.NoSqlStoreyTarget",
             columns=column_list,
             header=True,
             table=table,
@@ -1807,6 +2057,7 @@ class SQLTarget(BaseStoreTarget):
         start_time=None,
         end_time=None,
         time_column=None,
+        additional_filters=None,
         **kwargs,
     ):
         try:
@@ -1815,6 +2066,10 @@ class SQLTarget(BaseStoreTarget):
         except (ModuleNotFoundError, ImportError) as exc:
             self._raise_sqlalchemy_import_error(exc)
 
+        mlrun.utils.helpers.additional_filters_warning(
+            additional_filters, self.__class__
+        )
+
         db_path, table_name, _, _, _, _ = self._parse_url()
         engine = sqlalchemy.create_engine(db_path)
         parse_dates: Optional[list[str]] = self.attributes.get("parse_dates")
@@ -1904,7 +2159,7 @@ class SQLTarget(BaseStoreTarget):
                 raise ValueError(f"Table named {table_name} is not exist")
 
             elif not table_exists and create_table:
-
+                type_to_sql_type = {
                     int: sqlalchemy.Integer,
                     str: sqlalchemy.String(self.attributes.get("varchar_len")),
                     datetime.datetime: sqlalchemy.dialects.mysql.DATETIME(fsp=6),
@@ -1917,7 +2172,7 @@ class SQLTarget(BaseStoreTarget):
                 # creat new table with the given name
                 columns = []
                 for col, col_type in self.schema.items():
-                    col_type_sql =
+                    col_type_sql = type_to_sql_type.get(col_type)
                     if col_type_sql is None:
                         raise TypeError(
                             f"'{col_type}' unsupported type for column '{col}'"
@@ -1957,10 +2212,11 @@ kind_to_driver = {
     TargetTypes.tsdb: TSDBTarget,
     TargetTypes.custom: CustomTarget,
     TargetTypes.sql: SQLTarget,
+    TargetTypes.snowflake: SnowflakeTarget,
 }
 
 
-def _get_target_path(driver, resource, run_id_mode=False):
+def _get_target_path(driver, resource, run_id_mode=False, netloc=None, scheme=""):
     """return the default target path given the resource and target kind"""
     kind = driver.kind
     suffix = driver.suffix
@@ -1977,11 +2233,27 @@ def _get_target_path(driver, resource, run_id_mode=False):
     )
     name = resource.metadata.name
     project = resource.metadata.project or mlrun.mlconf.default_project
-    data_prefix = get_default_prefix_for_target(kind).format(
+
+    default_kind_name = kind
+    if scheme == "ds":
+        # "dsnosql" is not an actual target like Parquet or Redis; rather, it serves
+        # as a placeholder that can be used in any specified target
+        default_kind_name = "dsnosql"
+    if scheme == "redis" or scheme == "rediss":
+        default_kind_name = TargetTypes.redisnosql
+
+    netloc = netloc or ""
+    data_prefix = get_default_prefix_for_target(default_kind_name).format(
+        ds_profile_name=netloc,  # In case of ds profile, set its the name
+        authority=netloc,  # In case of redis, replace {authority} with netloc
         project=project,
         kind=kind,
         name=name,
     )
+
+    if scheme == "rediss":
+        data_prefix = data_prefix.replace("redis://", "rediss://", 1)
+
     # todo: handle ver tag changes, may need to copy files?
     if not run_id_mode:
         version = resource.metadata.tag