mlrun 1.6.4rc2__py3-none-any.whl → 1.7.0rc20__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of mlrun might be problematic.

Files changed (291)
  1. mlrun/__init__.py +11 -1
  2. mlrun/__main__.py +26 -112
  3. mlrun/alerts/__init__.py +15 -0
  4. mlrun/alerts/alert.py +144 -0
  5. mlrun/api/schemas/__init__.py +5 -4
  6. mlrun/artifacts/__init__.py +8 -3
  7. mlrun/artifacts/base.py +46 -257
  8. mlrun/artifacts/dataset.py +11 -192
  9. mlrun/artifacts/manager.py +47 -48
  10. mlrun/artifacts/model.py +31 -159
  11. mlrun/artifacts/plots.py +23 -380
  12. mlrun/common/constants.py +69 -0
  13. mlrun/common/db/sql_session.py +2 -3
  14. mlrun/common/formatters/__init__.py +19 -0
  15. mlrun/common/formatters/artifact.py +21 -0
  16. mlrun/common/formatters/base.py +78 -0
  17. mlrun/common/formatters/function.py +41 -0
  18. mlrun/common/formatters/pipeline.py +53 -0
  19. mlrun/common/formatters/project.py +51 -0
  20. mlrun/common/helpers.py +1 -2
  21. mlrun/common/model_monitoring/helpers.py +9 -5
  22. mlrun/{runtimes → common/runtimes}/constants.py +37 -9
  23. mlrun/common/schemas/__init__.py +24 -4
  24. mlrun/common/schemas/alert.py +203 -0
  25. mlrun/common/schemas/api_gateway.py +148 -0
  26. mlrun/common/schemas/artifact.py +18 -8
  27. mlrun/common/schemas/auth.py +11 -5
  28. mlrun/common/schemas/background_task.py +1 -1
  29. mlrun/common/schemas/client_spec.py +4 -1
  30. mlrun/common/schemas/feature_store.py +16 -16
  31. mlrun/common/schemas/frontend_spec.py +8 -7
  32. mlrun/common/schemas/function.py +5 -1
  33. mlrun/common/schemas/hub.py +11 -18
  34. mlrun/common/schemas/memory_reports.py +2 -2
  35. mlrun/common/schemas/model_monitoring/__init__.py +18 -3
  36. mlrun/common/schemas/model_monitoring/constants.py +83 -26
  37. mlrun/common/schemas/model_monitoring/grafana.py +13 -9
  38. mlrun/common/schemas/model_monitoring/model_endpoints.py +99 -16
  39. mlrun/common/schemas/notification.py +4 -4
  40. mlrun/common/schemas/object.py +2 -2
  41. mlrun/{runtimes/mpijob/v1alpha1.py → common/schemas/pagination.py} +10 -13
  42. mlrun/common/schemas/pipeline.py +1 -10
  43. mlrun/common/schemas/project.py +24 -23
  44. mlrun/common/schemas/runtime_resource.py +8 -12
  45. mlrun/common/schemas/schedule.py +3 -3
  46. mlrun/common/schemas/tag.py +1 -2
  47. mlrun/common/schemas/workflow.py +2 -2
  48. mlrun/common/types.py +7 -1
  49. mlrun/config.py +54 -17
  50. mlrun/data_types/to_pandas.py +10 -12
  51. mlrun/datastore/__init__.py +5 -8
  52. mlrun/datastore/alibaba_oss.py +130 -0
  53. mlrun/datastore/azure_blob.py +17 -5
  54. mlrun/datastore/base.py +62 -39
  55. mlrun/datastore/datastore.py +28 -9
  56. mlrun/datastore/datastore_profile.py +146 -20
  57. mlrun/datastore/filestore.py +0 -1
  58. mlrun/datastore/google_cloud_storage.py +6 -2
  59. mlrun/datastore/hdfs.py +56 -0
  60. mlrun/datastore/inmem.py +2 -2
  61. mlrun/datastore/redis.py +6 -2
  62. mlrun/datastore/s3.py +9 -0
  63. mlrun/datastore/snowflake_utils.py +43 -0
  64. mlrun/datastore/sources.py +201 -96
  65. mlrun/datastore/spark_utils.py +1 -2
  66. mlrun/datastore/store_resources.py +7 -7
  67. mlrun/datastore/targets.py +358 -104
  68. mlrun/datastore/utils.py +72 -58
  69. mlrun/datastore/v3io.py +5 -1
  70. mlrun/db/base.py +185 -35
  71. mlrun/db/factory.py +1 -1
  72. mlrun/db/httpdb.py +614 -179
  73. mlrun/db/nopdb.py +210 -26
  74. mlrun/errors.py +12 -1
  75. mlrun/execution.py +41 -24
  76. mlrun/feature_store/__init__.py +0 -2
  77. mlrun/feature_store/api.py +40 -72
  78. mlrun/feature_store/common.py +1 -1
  79. mlrun/feature_store/feature_set.py +76 -55
  80. mlrun/feature_store/feature_vector.py +28 -30
  81. mlrun/feature_store/ingestion.py +7 -6
  82. mlrun/feature_store/retrieval/base.py +16 -11
  83. mlrun/feature_store/retrieval/conversion.py +11 -13
  84. mlrun/feature_store/retrieval/dask_merger.py +2 -0
  85. mlrun/feature_store/retrieval/job.py +9 -3
  86. mlrun/feature_store/retrieval/local_merger.py +2 -0
  87. mlrun/feature_store/retrieval/spark_merger.py +34 -24
  88. mlrun/feature_store/steps.py +37 -34
  89. mlrun/features.py +9 -20
  90. mlrun/frameworks/_common/artifacts_library.py +9 -9
  91. mlrun/frameworks/_common/mlrun_interface.py +5 -5
  92. mlrun/frameworks/_common/model_handler.py +48 -48
  93. mlrun/frameworks/_common/plan.py +2 -3
  94. mlrun/frameworks/_common/producer.py +3 -4
  95. mlrun/frameworks/_common/utils.py +5 -5
  96. mlrun/frameworks/_dl_common/loggers/logger.py +6 -7
  97. mlrun/frameworks/_dl_common/loggers/mlrun_logger.py +9 -9
  98. mlrun/frameworks/_dl_common/loggers/tensorboard_logger.py +23 -47
  99. mlrun/frameworks/_ml_common/artifacts_library.py +1 -2
  100. mlrun/frameworks/_ml_common/loggers/logger.py +3 -4
  101. mlrun/frameworks/_ml_common/loggers/mlrun_logger.py +4 -5
  102. mlrun/frameworks/_ml_common/model_handler.py +24 -24
  103. mlrun/frameworks/_ml_common/pkl_model_server.py +2 -2
  104. mlrun/frameworks/_ml_common/plan.py +1 -1
  105. mlrun/frameworks/_ml_common/plans/calibration_curve_plan.py +2 -3
  106. mlrun/frameworks/_ml_common/plans/confusion_matrix_plan.py +2 -3
  107. mlrun/frameworks/_ml_common/plans/dataset_plan.py +3 -3
  108. mlrun/frameworks/_ml_common/plans/feature_importance_plan.py +3 -3
  109. mlrun/frameworks/_ml_common/plans/roc_curve_plan.py +4 -4
  110. mlrun/frameworks/_ml_common/utils.py +4 -4
  111. mlrun/frameworks/auto_mlrun/auto_mlrun.py +9 -9
  112. mlrun/frameworks/huggingface/model_server.py +4 -4
  113. mlrun/frameworks/lgbm/__init__.py +33 -33
  114. mlrun/frameworks/lgbm/callbacks/callback.py +2 -4
  115. mlrun/frameworks/lgbm/callbacks/logging_callback.py +4 -5
  116. mlrun/frameworks/lgbm/callbacks/mlrun_logging_callback.py +4 -5
  117. mlrun/frameworks/lgbm/mlrun_interfaces/booster_mlrun_interface.py +1 -3
  118. mlrun/frameworks/lgbm/mlrun_interfaces/mlrun_interface.py +6 -6
  119. mlrun/frameworks/lgbm/model_handler.py +10 -10
  120. mlrun/frameworks/lgbm/model_server.py +6 -6
  121. mlrun/frameworks/lgbm/utils.py +5 -5
  122. mlrun/frameworks/onnx/dataset.py +8 -8
  123. mlrun/frameworks/onnx/mlrun_interface.py +3 -3
  124. mlrun/frameworks/onnx/model_handler.py +6 -6
  125. mlrun/frameworks/onnx/model_server.py +7 -7
  126. mlrun/frameworks/parallel_coordinates.py +4 -3
  127. mlrun/frameworks/pytorch/__init__.py +18 -18
  128. mlrun/frameworks/pytorch/callbacks/callback.py +4 -5
  129. mlrun/frameworks/pytorch/callbacks/logging_callback.py +17 -17
  130. mlrun/frameworks/pytorch/callbacks/mlrun_logging_callback.py +11 -11
  131. mlrun/frameworks/pytorch/callbacks/tensorboard_logging_callback.py +23 -29
  132. mlrun/frameworks/pytorch/callbacks_handler.py +38 -38
  133. mlrun/frameworks/pytorch/mlrun_interface.py +20 -20
  134. mlrun/frameworks/pytorch/model_handler.py +17 -17
  135. mlrun/frameworks/pytorch/model_server.py +7 -7
  136. mlrun/frameworks/sklearn/__init__.py +13 -13
  137. mlrun/frameworks/sklearn/estimator.py +4 -4
  138. mlrun/frameworks/sklearn/metrics_library.py +14 -14
  139. mlrun/frameworks/sklearn/mlrun_interface.py +3 -6
  140. mlrun/frameworks/sklearn/model_handler.py +2 -2
  141. mlrun/frameworks/tf_keras/__init__.py +10 -7
  142. mlrun/frameworks/tf_keras/callbacks/logging_callback.py +15 -15
  143. mlrun/frameworks/tf_keras/callbacks/mlrun_logging_callback.py +11 -11
  144. mlrun/frameworks/tf_keras/callbacks/tensorboard_logging_callback.py +19 -23
  145. mlrun/frameworks/tf_keras/mlrun_interface.py +9 -11
  146. mlrun/frameworks/tf_keras/model_handler.py +14 -14
  147. mlrun/frameworks/tf_keras/model_server.py +6 -6
  148. mlrun/frameworks/xgboost/__init__.py +13 -13
  149. mlrun/frameworks/xgboost/model_handler.py +6 -6
  150. mlrun/k8s_utils.py +14 -16
  151. mlrun/launcher/__init__.py +1 -1
  152. mlrun/launcher/base.py +16 -15
  153. mlrun/launcher/client.py +8 -6
  154. mlrun/launcher/factory.py +1 -1
  155. mlrun/launcher/local.py +17 -11
  156. mlrun/launcher/remote.py +16 -10
  157. mlrun/lists.py +7 -6
  158. mlrun/model.py +238 -73
  159. mlrun/model_monitoring/__init__.py +1 -1
  160. mlrun/model_monitoring/api.py +138 -315
  161. mlrun/model_monitoring/application.py +5 -296
  162. mlrun/model_monitoring/applications/__init__.py +24 -0
  163. mlrun/model_monitoring/applications/_application_steps.py +157 -0
  164. mlrun/model_monitoring/applications/base.py +282 -0
  165. mlrun/model_monitoring/applications/context.py +214 -0
  166. mlrun/model_monitoring/applications/evidently_base.py +211 -0
  167. mlrun/model_monitoring/applications/histogram_data_drift.py +349 -0
  168. mlrun/model_monitoring/applications/results.py +99 -0
  169. mlrun/model_monitoring/controller.py +104 -84
  170. mlrun/model_monitoring/controller_handler.py +13 -5
  171. mlrun/model_monitoring/db/__init__.py +18 -0
  172. mlrun/model_monitoring/{stores → db/stores}/__init__.py +43 -36
  173. mlrun/model_monitoring/db/stores/base/__init__.py +15 -0
  174. mlrun/model_monitoring/{stores/model_endpoint_store.py → db/stores/base/store.py} +64 -40
  175. mlrun/model_monitoring/db/stores/sqldb/__init__.py +13 -0
  176. mlrun/model_monitoring/db/stores/sqldb/models/__init__.py +71 -0
  177. mlrun/model_monitoring/{stores → db/stores/sqldb}/models/base.py +109 -5
  178. mlrun/model_monitoring/db/stores/sqldb/models/mysql.py +88 -0
  179. mlrun/model_monitoring/{stores/models/mysql.py → db/stores/sqldb/models/sqlite.py} +19 -13
  180. mlrun/model_monitoring/db/stores/sqldb/sql_store.py +684 -0
  181. mlrun/model_monitoring/db/stores/v3io_kv/__init__.py +13 -0
  182. mlrun/model_monitoring/{stores/kv_model_endpoint_store.py → db/stores/v3io_kv/kv_store.py} +310 -165
  183. mlrun/model_monitoring/db/tsdb/__init__.py +100 -0
  184. mlrun/model_monitoring/db/tsdb/base.py +329 -0
  185. mlrun/model_monitoring/db/tsdb/helpers.py +30 -0
  186. mlrun/model_monitoring/db/tsdb/tdengine/__init__.py +15 -0
  187. mlrun/model_monitoring/db/tsdb/tdengine/schemas.py +240 -0
  188. mlrun/model_monitoring/db/tsdb/tdengine/stream_graph_steps.py +45 -0
  189. mlrun/model_monitoring/db/tsdb/tdengine/tdengine_connector.py +397 -0
  190. mlrun/model_monitoring/db/tsdb/v3io/__init__.py +15 -0
  191. mlrun/model_monitoring/db/tsdb/v3io/stream_graph_steps.py +117 -0
  192. mlrun/model_monitoring/db/tsdb/v3io/v3io_connector.py +630 -0
  193. mlrun/model_monitoring/evidently_application.py +6 -118
  194. mlrun/model_monitoring/features_drift_table.py +134 -106
  195. mlrun/model_monitoring/helpers.py +127 -28
  196. mlrun/model_monitoring/metrics/__init__.py +13 -0
  197. mlrun/model_monitoring/metrics/histogram_distance.py +127 -0
  198. mlrun/model_monitoring/model_endpoint.py +3 -2
  199. mlrun/model_monitoring/prometheus.py +1 -4
  200. mlrun/model_monitoring/stream_processing.py +62 -231
  201. mlrun/model_monitoring/tracking_policy.py +9 -2
  202. mlrun/model_monitoring/writer.py +152 -124
  203. mlrun/package/__init__.py +6 -6
  204. mlrun/package/context_handler.py +5 -5
  205. mlrun/package/packager.py +7 -7
  206. mlrun/package/packagers/default_packager.py +6 -6
  207. mlrun/package/packagers/numpy_packagers.py +15 -15
  208. mlrun/package/packagers/pandas_packagers.py +5 -5
  209. mlrun/package/packagers/python_standard_library_packagers.py +10 -10
  210. mlrun/package/packagers_manager.py +19 -23
  211. mlrun/package/utils/_formatter.py +6 -6
  212. mlrun/package/utils/_pickler.py +2 -2
  213. mlrun/package/utils/_supported_format.py +4 -4
  214. mlrun/package/utils/log_hint_utils.py +2 -2
  215. mlrun/package/utils/type_hint_utils.py +4 -9
  216. mlrun/platforms/__init__.py +11 -10
  217. mlrun/platforms/iguazio.py +24 -203
  218. mlrun/projects/operations.py +35 -21
  219. mlrun/projects/pipelines.py +68 -99
  220. mlrun/projects/project.py +830 -266
  221. mlrun/render.py +3 -11
  222. mlrun/run.py +162 -166
  223. mlrun/runtimes/__init__.py +62 -7
  224. mlrun/runtimes/base.py +39 -32
  225. mlrun/runtimes/daskjob.py +8 -8
  226. mlrun/runtimes/databricks_job/databricks_cancel_task.py +1 -1
  227. mlrun/runtimes/databricks_job/databricks_runtime.py +7 -7
  228. mlrun/runtimes/databricks_job/databricks_wrapper.py +1 -1
  229. mlrun/runtimes/funcdoc.py +0 -28
  230. mlrun/runtimes/function_reference.py +1 -1
  231. mlrun/runtimes/kubejob.py +28 -122
  232. mlrun/runtimes/local.py +6 -3
  233. mlrun/runtimes/mpijob/__init__.py +0 -20
  234. mlrun/runtimes/mpijob/abstract.py +9 -10
  235. mlrun/runtimes/mpijob/v1.py +1 -1
  236. mlrun/{model_monitoring/stores/models/sqlite.py → runtimes/nuclio/__init__.py} +7 -9
  237. mlrun/runtimes/nuclio/api_gateway.py +709 -0
  238. mlrun/runtimes/nuclio/application/__init__.py +15 -0
  239. mlrun/runtimes/nuclio/application/application.py +523 -0
  240. mlrun/runtimes/nuclio/application/reverse_proxy.go +95 -0
  241. mlrun/runtimes/{function.py → nuclio/function.py} +112 -73
  242. mlrun/runtimes/{nuclio.py → nuclio/nuclio.py} +6 -6
  243. mlrun/runtimes/{serving.py → nuclio/serving.py} +45 -51
  244. mlrun/runtimes/pod.py +286 -88
  245. mlrun/runtimes/remotesparkjob.py +2 -2
  246. mlrun/runtimes/sparkjob/spark3job.py +51 -34
  247. mlrun/runtimes/utils.py +7 -75
  248. mlrun/secrets.py +9 -5
  249. mlrun/serving/remote.py +2 -7
  250. mlrun/serving/routers.py +13 -10
  251. mlrun/serving/server.py +22 -26
  252. mlrun/serving/states.py +99 -25
  253. mlrun/serving/utils.py +3 -3
  254. mlrun/serving/v1_serving.py +6 -7
  255. mlrun/serving/v2_serving.py +59 -20
  256. mlrun/track/tracker.py +2 -1
  257. mlrun/track/tracker_manager.py +3 -3
  258. mlrun/track/trackers/mlflow_tracker.py +1 -2
  259. mlrun/utils/async_http.py +5 -7
  260. mlrun/utils/azure_vault.py +1 -1
  261. mlrun/utils/clones.py +1 -2
  262. mlrun/utils/condition_evaluator.py +3 -3
  263. mlrun/utils/db.py +3 -3
  264. mlrun/utils/helpers.py +183 -197
  265. mlrun/utils/http.py +2 -5
  266. mlrun/utils/logger.py +76 -14
  267. mlrun/utils/notifications/notification/__init__.py +17 -12
  268. mlrun/utils/notifications/notification/base.py +14 -2
  269. mlrun/utils/notifications/notification/console.py +2 -0
  270. mlrun/utils/notifications/notification/git.py +3 -1
  271. mlrun/utils/notifications/notification/ipython.py +3 -1
  272. mlrun/utils/notifications/notification/slack.py +101 -21
  273. mlrun/utils/notifications/notification/webhook.py +11 -1
  274. mlrun/utils/notifications/notification_pusher.py +155 -30
  275. mlrun/utils/retryer.py +208 -0
  276. mlrun/utils/singleton.py +1 -1
  277. mlrun/utils/v3io_clients.py +2 -4
  278. mlrun/utils/version/version.json +2 -2
  279. mlrun/utils/version/version.py +2 -6
  280. {mlrun-1.6.4rc2.dist-info → mlrun-1.7.0rc20.dist-info}/METADATA +31 -19
  281. mlrun-1.7.0rc20.dist-info/RECORD +353 -0
  282. mlrun/kfpops.py +0 -868
  283. mlrun/model_monitoring/batch.py +0 -1095
  284. mlrun/model_monitoring/stores/models/__init__.py +0 -27
  285. mlrun/model_monitoring/stores/sql_model_endpoint_store.py +0 -384
  286. mlrun/platforms/other.py +0 -306
  287. mlrun-1.6.4rc2.dist-info/RECORD +0 -314
  288. {mlrun-1.6.4rc2.dist-info → mlrun-1.7.0rc20.dist-info}/LICENSE +0 -0
  289. {mlrun-1.6.4rc2.dist-info → mlrun-1.7.0rc20.dist-info}/WHEEL +0 -0
  290. {mlrun-1.6.4rc2.dist-info → mlrun-1.7.0rc20.dist-info}/entry_points.txt +0 -0
  291. {mlrun-1.6.4rc2.dist-info → mlrun-1.7.0rc20.dist-info}/top_level.txt +0 -0
mlrun/datastore/redis.py CHANGED
@@ -31,7 +31,7 @@ class RedisStore(DataStore):
     """
 
     def __init__(self, parent, schema, name, endpoint="", secrets: dict = None):
-        REDIS_DEFAULT_PORT = "6379"
+        redis_default_port = "6379"
         super().__init__(parent, name, schema, endpoint, secrets=secrets)
         self.headers = None
 
@@ -49,7 +49,7 @@ class RedisStore(DataStore):
         user = self._get_secret_or_env("REDIS_USER", "", credentials_prefix)
         password = self._get_secret_or_env("REDIS_PASSWORD", "", credentials_prefix)
         host = parsed_endpoint.hostname
-        port = parsed_endpoint.port if parsed_endpoint.port else REDIS_DEFAULT_PORT
+        port = parsed_endpoint.port if parsed_endpoint.port else redis_default_port
         schema = parsed_endpoint.scheme
         if user or password:
             endpoint = f"{schema}://{user}:{password}@{host}:{port}"
@@ -163,3 +163,7 @@ class RedisStore(DataStore):
                 self.redis.delete(k)
         else:
             self.redis.delete(key)
+
+    @property
+    def spark_url(self):
+        return ""
mlrun/datastore/s3.py CHANGED
@@ -156,6 +156,10 @@ class S3Store(DataStore):
 
         return self._sanitize_storage_options(storage_options)
 
+    @property
+    def spark_url(self):
+        return f"s3a://{self.endpoint}"
+
     def get_bucket_and_key(self, key):
         path = self._join(key)[1:]
         return self.endpoint, path
@@ -194,6 +198,11 @@
         bucket = self.s3.Bucket(bucket)
         return [obj.key[key_length:] for obj in bucket.objects.filter(Prefix=key)]
 
+    def rm(self, path, recursive=False, maxdepth=None):
+        bucket, key = self.get_bucket_and_key(path)
+        path = f"{bucket}/{key}"
+        self.filesystem.rm(path=path, recursive=recursive, maxdepth=maxdepth)
+
 
 def parse_s3_bucket_and_key(s3_path):
     try:
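The new spark_url property is consumed by the reworked get_spark_options() implementations in mlrun/datastore/sources.py (further down in this diff), which now build the Spark read path as store.spark_url + path instead of going through store_path_to_spark(). A minimal sketch of that composition, using a hypothetical bucket and key:

    # illustrative only; the bucket and key are placeholders
    store, path, _ = mlrun.store_manager.get_or_create_store("s3://my-bucket/data/sales.parquet")
    spark_path = store.spark_url + path  # roughly "s3a://my-bucket" + "/data/sales.parquet"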
mlrun/datastore/snowflake_utils.py ADDED
@@ -0,0 +1,43 @@
+# Copyright 2024 Iguazio
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+import mlrun
+
+
+def get_snowflake_password():
+    key = "SNOWFLAKE_PASSWORD"
+    snowflake_password = mlrun.get_secret_or_env(key)
+
+    if not snowflake_password:
+        raise mlrun.errors.MLRunInvalidArgumentError(
+            f"No password provided. Set password using the {key} "
+            "project secret or environment variable."
+        )
+
+    return snowflake_password
+
+
+def get_snowflake_spark_options(attributes):
+    return {
+        "format": "net.snowflake.spark.snowflake",
+        "sfURL": attributes.get("url"),
+        "sfUser": attributes.get("user"),
+        "sfPassword": get_snowflake_password(),
+        "sfDatabase": attributes.get("database"),
+        "sfSchema": attributes.get("schema"),
+        "sfWarehouse": attributes.get("warehouse"),
+        "application": "iguazio_platform",
+        "TIMESTAMP_TYPE_MAPPING": "TIMESTAMP_LTZ",
+    }
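For orientation, a minimal sketch of how these new helpers are meant to be consumed; SnowflakeSource.get_spark_options() in mlrun/datastore/sources.py (below) does essentially this. The attribute values here are placeholders, and SNOWFLAKE_PASSWORD must be available as a project secret or environment variable:

    from mlrun.datastore.snowflake_utils import get_snowflake_spark_options

    attributes = {
        "url": "<account>.snowflakecomputing.com",  # placeholder values
        "user": "analyst",
        "database": "mydb",
        "schema": "public",
        "warehouse": "compute_wh",
    }
    spark_options = get_snowflake_spark_options(attributes)
    spark_options["query"] = "select * from my_table"  # SnowflakeSource adds the query itself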
mlrun/datastore/sources.py CHANGED
@@ -12,12 +12,13 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import json
+import operator
 import os
 import warnings
 from base64 import b64encode
 from copy import copy
 from datetime import datetime
-from typing import Dict, List, Optional, Union
+from typing import Optional, Union
 
 import pandas as pd
 import semver
@@ -27,9 +28,11 @@ from nuclio import KafkaTrigger
 from nuclio.config import split_path
 
 import mlrun
+from mlrun.config import config
+from mlrun.datastore.snowflake_utils import get_snowflake_spark_options
+from mlrun.datastore.utils import transform_list_filters_to_tuple
 from mlrun.secrets import SecretsStore
 
-from ..config import config
 from ..model import DataSource
 from ..platforms.iguazio import parse_path
 from ..utils import get_class, is_explicit_ack_supported
@@ -39,7 +42,6 @@ from .utils import (
     _generate_sql_query_with_time_filter,
     filter_df_start_end_time,
     select_columns_from_df,
-    store_path_to_spark,
 )
 
 
@@ -102,8 +104,12 @@ class BaseSourceDriver(DataSource):
         start_time=None,
         end_time=None,
         time_field=None,
+        additional_filters=None,
     ):
         """return the source data as dataframe"""
+        mlrun.utils.helpers.additional_filters_warning(
+            additional_filters, self.__class__
+        )
         return mlrun.store_manager.object(url=self.path).as_df(
             columns=columns,
             df_module=df_module,
@@ -114,7 +120,11 @@
 
     def to_spark_df(self, session, named_view=False, time_field=None, columns=None):
         if self.support_spark:
-            df = load_spark_dataframe_with_options(session, self.get_spark_options())
+            spark_options = self.get_spark_options()
+            spark_format = spark_options.pop("format", None)
+            df = load_spark_dataframe_with_options(
+                session, spark_options, format=spark_format
+            )
             if named_view:
                 df.createOrReplaceTempView(self.name)
             return self._filter_spark_df(df, time_field, columns)
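As a rough mental model only (not mlrun's actual helper), the reworked to_spark_df() above now pops "format" out of get_spark_options() and passes the remaining entries as reader options; assuming load_spark_dataframe_with_options() wraps session.read, the call amounts to something like:

    # assumption: an approximation of the helper's behaviour, not its real implementation
    def load_spark_dataframe_with_options(session, spark_options, format=None):
        reader = session.read
        if format:
            reader = reader.format(format)
        # the "path" entry is picked up from the remaining options
        return reader.options(**spark_options).load()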
@@ -170,10 +180,10 @@ class CSVSource(BaseSourceDriver):
         self,
         name: str = "",
         path: str = None,
-        attributes: Dict[str, str] = None,
+        attributes: dict[str, object] = None,
         key_field: str = None,
         schedule: str = None,
-        parse_dates: Union[None, int, str, List[int], List[str]] = None,
+        parse_dates: Union[None, int, str, list[int], list[str]] = None,
         **kwargs,
     ):
         super().__init__(name, path, attributes, key_field, schedule=schedule, **kwargs)
@@ -193,14 +203,10 @@
             parse_dates.append(time_field)
 
         data_item = mlrun.store_manager.object(self.path)
-        if self.path and self.path.startswith("ds://"):
-            store, path = mlrun.store_manager.get_or_create_store(self.path)
-            path = store.url + path
-        else:
-            path = data_item.url
+        store, path, url = mlrun.store_manager.get_or_create_store(self.path)
 
         return storey.CSVSource(
-            paths=path,  # unlike self.path, it already has store:// replaced
+            paths=url,  # unlike self.path, it already has store:// replaced
             build_dict=True,
             key_field=self.key_field or key_field,
             storage_options=data_item.store.get_storage_options(),
@@ -209,25 +215,17 @@
         )
 
     def get_spark_options(self):
-        if self.path and self.path.startswith("ds://"):
-            store, path = mlrun.store_manager.get_or_create_store(self.path)
-            storage_spark_options = store.get_spark_options()
-            path = store.url + path
-            result = {
-                "path": store_path_to_spark(path, storage_spark_options),
-                "format": "csv",
-                "header": "true",
-                "inferSchema": "true",
-            }
-
-            return {**result, **storage_spark_options}
-        else:
-            return {
-                "path": store_path_to_spark(self.path),
+        store, path, _ = mlrun.store_manager.get_or_create_store(self.path)
+        spark_options = store.get_spark_options()
+        spark_options.update(
+            {
+                "path": store.spark_url + path,
                 "format": "csv",
                 "header": "true",
                 "inferSchema": "true",
             }
+        )
+        return spark_options
 
     def to_spark_df(self, session, named_view=False, time_field=None, columns=None):
         import pyspark.sql.functions as funcs
@@ -253,7 +251,11 @@
         start_time=None,
         end_time=None,
         time_field=None,
+        additional_filters=None,
     ):
+        mlrun.utils.helpers.additional_filters_warning(
+            additional_filters, self.__class__
+        )
         reader_args = self.attributes.get("reader_args", {})
         return mlrun.store_manager.object(url=self.path).as_df(
             columns=columns,
@@ -289,6 +291,12 @@ class ParquetSource(BaseSourceDriver):
     :parameter start_time: filters out data before this time
     :parameter end_time: filters out data after this time
     :parameter attributes: additional parameters to pass to storey.
+    :param additional_filters: List of additional_filter conditions as tuples.
+        Each tuple should be in the format (column_name, operator, value).
+        Supported operators: "=", ">=", "<=", ">", "<".
+        Example: [("Product", "=", "Computer")]
+        For all supported filters, please see:
+        https://arrow.apache.org/docs/python/generated/pyarrow.parquet.ParquetDataset.html
     """
 
     kind = "parquet"
@@ -299,13 +307,19 @@
         self,
         name: str = "",
         path: str = None,
-        attributes: Dict[str, str] = None,
+        attributes: dict[str, object] = None,
         key_field: str = None,
         time_field: str = None,
         schedule: str = None,
         start_time: Optional[Union[datetime, str]] = None,
         end_time: Optional[Union[datetime, str]] = None,
+        additional_filters: Optional[list[Union[tuple, list]]] = None,
     ):
+        if additional_filters:
+            attributes = copy(attributes) or {}
+            additional_filters = transform_list_filters_to_tuple(additional_filters)
+            attributes["additional_filters"] = additional_filters
+
         super().__init__(
             name,
             path,
@@ -333,6 +347,10 @@
     def end_time(self, end_time):
         self._end_time = self._convert_to_datetime(end_time)
 
+    @property
+    def additional_filters(self):
+        return self.attributes.get("additional_filters")
+
     @staticmethod
     def _convert_to_datetime(time):
         if time and isinstance(time, str):
@@ -349,45 +367,48 @@
         start_time=None,
         end_time=None,
         context=None,
+        additional_filters=None,
     ):
         import storey
 
-        attributes = self.attributes or {}
+        attributes = copy(self.attributes)
+        attributes.pop("additional_filters", None)
         if context:
             attributes["context"] = context
-
+        additional_filters = transform_list_filters_to_tuple(additional_filters)
         data_item = mlrun.store_manager.object(self.path)
-        if self.path and self.path.startswith("ds://"):
-            store, path = mlrun.store_manager.get_or_create_store(self.path)
-            path = store.url + path
-        else:
-            path = data_item.url
-
+        store, path, url = mlrun.store_manager.get_or_create_store(self.path)
         return storey.ParquetSource(
-            paths=path,  # unlike self.path, it already has store:// replaced
+            paths=url,  # unlike self.path, it already has store:// replaced
             key_field=self.key_field or key_field,
             storage_options=data_item.store.get_storage_options(),
             end_filter=self.end_time,
             start_filter=self.start_time,
             filter_column=self.time_field or time_field,
+            additional_filters=self.additional_filters or additional_filters,
             **attributes,
         )
 
+    @classmethod
+    def from_dict(cls, struct=None, fields=None, deprecated_fields: dict = None):
+        new_obj = super().from_dict(
+            struct=struct, fields=fields, deprecated_fields=deprecated_fields
+        )
+        new_obj.attributes["additional_filters"] = transform_list_filters_to_tuple(
+            new_obj.additional_filters
+        )
+        return new_obj
+
     def get_spark_options(self):
-        if self.path and self.path.startswith("ds://"):
-            store, path = mlrun.store_manager.get_or_create_store(self.path)
-            storage_spark_options = store.get_spark_options()
-            path = store.url + path
-            result = {
-                "path": store_path_to_spark(path, storage_spark_options),
-                "format": "parquet",
-            }
-            return {**result, **storage_spark_options}
-        else:
-            return {
-                "path": store_path_to_spark(self.path),
+        store, path, _ = mlrun.store_manager.get_or_create_store(self.path)
+        spark_options = store.get_spark_options()
+        spark_options.update(
+            {
+                "path": store.spark_url + path,
                 "format": "parquet",
             }
+        )
+        return spark_options
 
     def to_dataframe(
         self,
@@ -397,8 +418,10 @@
         start_time=None,
         end_time=None,
         time_field=None,
+        additional_filters=None,
     ):
         reader_args = self.attributes.get("reader_args", {})
+        additional_filters = transform_list_filters_to_tuple(additional_filters)
         return mlrun.store_manager.object(url=self.path).as_df(
             columns=columns,
             df_module=df_module,
@@ -406,9 +429,88 @@
             end_time=end_time or self.end_time,
             time_column=time_field or self.time_field,
             format="parquet",
+            additional_filters=additional_filters or self.additional_filters,
             **reader_args,
         )
 
+    def _build_spark_additional_filters(self, column_types: dict):
+        if not self.additional_filters:
+            return None
+        from pyspark.sql.functions import col, isnan, lit
+
+        operators = {
+            "==": operator.eq,
+            "=": operator.eq,
+            ">": operator.gt,
+            "<": operator.lt,
+            ">=": operator.ge,
+            "<=": operator.le,
+            "!=": operator.ne,
+        }
+
+        spark_filter = None
+        new_filter = lit(True)
+        for filter_tuple in self.additional_filters:
+            if not filter_tuple:
+                continue
+            col_name, op, value = filter_tuple
+            if op.lower() in ("in", "not in") and isinstance(value, (list, tuple, set)):
+                none_exists = False
+                value = list(value)
+                for sub_value in value:
+                    if sub_value is None:
+                        value.remove(sub_value)
+                        none_exists = True
+                if none_exists:
+                    filter_nan = column_types[col_name] not in ("timestamp", "date")
+                    if value:
+                        if op.lower() == "in":
+                            new_filter = (
+                                col(col_name).isin(value) | col(col_name).isNull()
+                            )
+                            if filter_nan:
+                                new_filter = new_filter | isnan(col(col_name))
+
+                        else:
+                            new_filter = (
+                                ~col(col_name).isin(value) & ~col(col_name).isNull()
+                            )
+                            if filter_nan:
+                                new_filter = new_filter & ~isnan(col(col_name))
+                    else:
+                        if op.lower() == "in":
+                            new_filter = col(col_name).isNull()
+                            if filter_nan:
+                                new_filter = new_filter | isnan(col(col_name))
+                        else:
+                            new_filter = ~col(col_name).isNull()
+                            if filter_nan:
+                                new_filter = new_filter & ~isnan(col(col_name))
+            else:
+                if op.lower() == "in":
+                    new_filter = col(col_name).isin(value)
+                elif op.lower() == "not in":
+                    new_filter = ~col(col_name).isin(value)
+                elif op in operators:
+                    new_filter = operators[op](col(col_name), value)
+                else:
+                    raise mlrun.errors.MLRunInvalidArgumentError(
+                        f"unsupported filter operator: {op}"
+                    )
+            if spark_filter is not None:
+                spark_filter = spark_filter & new_filter
+            else:
+                spark_filter = new_filter
+        return spark_filter
+
+    def _filter_spark_df(self, df, time_field=None, columns=None):
+        spark_additional_filters = self._build_spark_additional_filters(
+            column_types=dict(df.dtypes)
+        )
+        if spark_additional_filters is not None:
+            df = df.filter(spark_additional_filters)
+        return super()._filter_spark_df(df=df, time_field=time_field, columns=columns)
+
 
 class BigQuerySource(BaseSourceDriver):
     """
@@ -423,12 +525,17 @@ class BigQuerySource(BaseSourceDriver):
 
         # use sql query
         query_string = "SELECT * FROM `the-psf.pypi.downloads20210328` LIMIT 5000"
-        source = BigQuerySource("bq1", query=query_string,
-                                gcp_project="my_project",
-                                materialization_dataset="dataviews")
+        source = BigQuerySource(
+            "bq1",
+            query=query_string,
+            gcp_project="my_project",
+            materialization_dataset="dataviews",
+        )
 
         # read a table
-        source = BigQuerySource("bq2", table="the-psf.pypi.downloads20210328", gcp_project="my_project")
+        source = BigQuerySource(
+            "bq2", table="the-psf.pypi.downloads20210328", gcp_project="my_project"
+        )
 
 
     :parameter name: source name
@@ -531,10 +638,15 @@
         start_time=None,
         end_time=None,
         time_field=None,
+        additional_filters=None,
     ):
         from google.cloud import bigquery
         from google.cloud.bigquery_storage_v1 import BigQueryReadClient
 
+        mlrun.utils.helpers.additional_filters_warning(
+            additional_filters, self.__class__
+        )
+
         def schema_to_dtypes(schema):
             from mlrun.data_types.data_types import gbq_to_pandas_dtype
 
@@ -574,7 +686,6 @@
         else:
             df = rows_iterator.to_dataframe(dtypes=dtypes)
 
-        # TODO : filter as part of the query
         return select_columns_from_df(
             filter_df_start_end_time(
                 df,
@@ -695,32 +806,10 @@ class SnowflakeSource(BaseSourceDriver):
             **kwargs,
         )
 
-    def _get_password(self):
-        key = "SNOWFLAKE_PASSWORD"
-        snowflake_password = os.getenv(key) or os.getenv(
-            SecretsStore.k8s_env_variable_name_for_secret(key)
-        )
-
-        if not snowflake_password:
-            raise mlrun.errors.MLRunInvalidArgumentError(
-                "No password provided. Set password using the SNOWFLAKE_PASSWORD "
-                "project secret or environment variable."
-            )
-
-        return snowflake_password
-
     def get_spark_options(self):
-        return {
-            "format": "net.snowflake.spark.snowflake",
-            "query": self.attributes.get("query"),
-            "sfURL": self.attributes.get("url"),
-            "sfUser": self.attributes.get("user"),
-            "sfPassword": self._get_password(),
-            "sfDatabase": self.attributes.get("database"),
-            "sfSchema": self.attributes.get("schema"),
-            "sfWarehouse": self.attributes.get("warehouse"),
-            "application": "iguazio_platform",
-        }
+        spark_options = get_snowflake_spark_options(self.attributes)
+        spark_options["query"] = self.attributes.get("query")
+        return spark_options
 
 
 class CustomSource(BaseSourceDriver):
@@ -774,7 +863,19 @@ class DataFrameSource:
             context=self.context or context,
         )
 
-    def to_dataframe(self, **kwargs):
+    def to_dataframe(
+        self,
+        columns=None,
+        df_module=None,
+        entities=None,
+        start_time=None,
+        end_time=None,
+        time_field=None,
+        additional_filters=None,
+    ):
+        mlrun.utils.helpers.additional_filters_warning(
+            additional_filters, self.__class__
+        )
         return self._df
 
     def is_iterator(self):
@@ -800,7 +901,7 @@ class OnlineSource(BaseSourceDriver):
         self,
         name: str = None,
         path: str = None,
-        attributes: Dict[str, object] = None,
+        attributes: dict[str, object] = None,
         key_field: str = None,
         time_field: str = None,
         workers: int = None,
@@ -812,16 +913,12 @@
     def to_step(self, key_field=None, time_field=None, context=None):
         import storey
 
-        source_class = (
-            storey.AsyncEmitSource
-            if config.datastore.async_source_mode == "enabled"
-            else storey.SyncEmitSource
-        )
         source_args = self.attributes.get("source_args", {})
         explicit_ack = (
             is_explicit_ack_supported(context) and mlrun.mlconf.is_explicit_ack()
         )
-        src_class = source_class(
+        # TODO: Change to AsyncEmitSource once we can drop support for nuclio<1.12.10
+        src_class = storey.SyncEmitSource(
             context=context,
             key_field=self.key_field or key_field,
             full_event=True,
@@ -848,8 +945,6 @@
 
 
 class StreamSource(OnlineSource):
-    """Sets stream source for the flow. If stream doesn't exist it will create it"""
-
     kind = "v3ioStream"
 
     def __init__(
@@ -863,7 +958,7 @@
         **kwargs,
     ):
         """
-        Sets stream source for the flow. If stream doesn't exist it will create it
+        Sets the stream source for the flow. If the stream doesn't exist it will create it.
 
         :param name: stream name. Default "stream"
         :param group: consumer group. Default "serving"
@@ -882,8 +977,15 @@
         super().__init__(name, attributes=attrs, **kwargs)
 
     def add_nuclio_trigger(self, function):
-        endpoint, stream_path = parse_path(self.path)
-        v3io_client = v3io.dataplane.Client(endpoint=endpoint)
+        store, _, url = mlrun.store_manager.get_or_create_store(self.path)
+        if store.kind != "v3io":
+            raise mlrun.errors.MLRunInvalidArgumentError(
+                "Only profiles that reference the v3io datastore can be used with StreamSource"
+            )
+        storage_options = store.get_storage_options()
+        access_key = storage_options.get("v3io_access_key")
+        endpoint, stream_path = parse_path(url)
+        v3io_client = v3io.dataplane.Client(endpoint=endpoint, access_key=access_key)
         container, stream_path = split_path(stream_path)
         res = v3io_client.stream.create(
             container=container,
@@ -903,7 +1005,7 @@
             kwargs["worker_allocation_mode"] = "static"
 
         function.add_v3io_stream_trigger(
-            self.path,
+            url,
             self.name,
             self.attributes["group"],
             self.attributes["seek_to"],
@@ -915,8 +1017,6 @@
 
 
 class KafkaSource(OnlineSource):
-    """Sets kafka source for the flow"""
-
     kind = "kafka"
 
     def __init__(
@@ -970,6 +1070,7 @@
         start_time=None,
         end_time=None,
         time_field=None,
+        additional_filters=None,
     ):
         raise mlrun.MLRunInvalidArgumentError(
             "KafkaSource does not support batch processing"
@@ -1047,7 +1148,7 @@ class SQLSource(BaseSourceDriver):
         db_url: str = None,
         table_name: str = None,
         spark_options: dict = None,
-        parse_dates: List[str] = None,
+        parse_dates: list[str] = None,
         **kwargs,
     ):
         """
@@ -1110,9 +1211,13 @@
         start_time=None,
         end_time=None,
         time_field=None,
+        additional_filters=None,
     ):
         import sqlalchemy as sqlalchemy
 
+        mlrun.utils.helpers.additional_filters_warning(
+            additional_filters, self.__class__
+        )
         db_path = self.attributes.get("db_path")
         table_name = self.attributes.get("table_name")
         parse_dates = self.attributes.get("parse_dates")
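A short usage sketch of the new additional_filters support added to ParquetSource above, following the tuple format documented in its docstring; the path and column names are placeholders:

    from mlrun.datastore.sources import ParquetSource

    source = ParquetSource(
        "sales",
        path="s3://my-bucket/sales.parquet",  # placeholder path
        additional_filters=[("Product", "=", "Computer"), ("Price", ">=", 100)],
    )
    df = source.to_dataframe()  # filters are passed down to the parquet reader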
mlrun/datastore/spark_utils.py CHANGED
@@ -12,12 +12,11 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from typing import Dict
 
 import mlrun
 
 
-def spark_session_update_hadoop_options(session, spark_options) -> Dict[str, str]:
+def spark_session_update_hadoop_options(session, spark_options) -> dict[str, str]:
     hadoop_conf = session.sparkContext._jsc.hadoopConfiguration()
     non_hadoop_spark_options = {}
22