mlrun 1.7.0rc4__py3-none-any.whl → 1.7.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of mlrun might be problematic.

Files changed (235)
  1. mlrun/__init__.py +11 -1
  2. mlrun/__main__.py +39 -121
  3. mlrun/{datastore/helpers.py → alerts/__init__.py} +2 -5
  4. mlrun/alerts/alert.py +248 -0
  5. mlrun/api/schemas/__init__.py +4 -3
  6. mlrun/artifacts/__init__.py +8 -3
  7. mlrun/artifacts/base.py +39 -254
  8. mlrun/artifacts/dataset.py +9 -190
  9. mlrun/artifacts/manager.py +73 -46
  10. mlrun/artifacts/model.py +30 -158
  11. mlrun/artifacts/plots.py +23 -380
  12. mlrun/common/constants.py +73 -1
  13. mlrun/common/db/sql_session.py +3 -2
  14. mlrun/common/formatters/__init__.py +21 -0
  15. mlrun/common/formatters/artifact.py +46 -0
  16. mlrun/common/formatters/base.py +113 -0
  17. mlrun/common/formatters/feature_set.py +44 -0
  18. mlrun/common/formatters/function.py +46 -0
  19. mlrun/common/formatters/pipeline.py +53 -0
  20. mlrun/common/formatters/project.py +51 -0
  21. mlrun/common/formatters/run.py +29 -0
  22. mlrun/common/helpers.py +11 -1
  23. mlrun/{runtimes → common/runtimes}/constants.py +32 -4
  24. mlrun/common/schemas/__init__.py +31 -4
  25. mlrun/common/schemas/alert.py +202 -0
  26. mlrun/common/schemas/api_gateway.py +196 -0
  27. mlrun/common/schemas/artifact.py +28 -1
  28. mlrun/common/schemas/auth.py +13 -2
  29. mlrun/common/schemas/client_spec.py +2 -1
  30. mlrun/common/schemas/common.py +7 -4
  31. mlrun/common/schemas/constants.py +3 -0
  32. mlrun/common/schemas/feature_store.py +58 -28
  33. mlrun/common/schemas/frontend_spec.py +8 -0
  34. mlrun/common/schemas/function.py +11 -0
  35. mlrun/common/schemas/hub.py +7 -9
  36. mlrun/common/schemas/model_monitoring/__init__.py +21 -4
  37. mlrun/common/schemas/model_monitoring/constants.py +136 -42
  38. mlrun/common/schemas/model_monitoring/grafana.py +9 -5
  39. mlrun/common/schemas/model_monitoring/model_endpoints.py +89 -41
  40. mlrun/common/schemas/notification.py +69 -12
  41. mlrun/{runtimes/mpijob/v1alpha1.py → common/schemas/pagination.py} +10 -13
  42. mlrun/common/schemas/pipeline.py +7 -0
  43. mlrun/common/schemas/project.py +67 -16
  44. mlrun/common/schemas/runs.py +17 -0
  45. mlrun/common/schemas/schedule.py +1 -1
  46. mlrun/common/schemas/workflow.py +10 -2
  47. mlrun/common/types.py +14 -1
  48. mlrun/config.py +233 -58
  49. mlrun/data_types/data_types.py +11 -1
  50. mlrun/data_types/spark.py +5 -4
  51. mlrun/data_types/to_pandas.py +75 -34
  52. mlrun/datastore/__init__.py +8 -10
  53. mlrun/datastore/alibaba_oss.py +131 -0
  54. mlrun/datastore/azure_blob.py +131 -43
  55. mlrun/datastore/base.py +107 -47
  56. mlrun/datastore/datastore.py +17 -7
  57. mlrun/datastore/datastore_profile.py +91 -7
  58. mlrun/datastore/dbfs_store.py +3 -7
  59. mlrun/datastore/filestore.py +1 -3
  60. mlrun/datastore/google_cloud_storage.py +92 -32
  61. mlrun/datastore/hdfs.py +5 -0
  62. mlrun/datastore/inmem.py +6 -3
  63. mlrun/datastore/redis.py +3 -2
  64. mlrun/datastore/s3.py +30 -12
  65. mlrun/datastore/snowflake_utils.py +45 -0
  66. mlrun/datastore/sources.py +274 -59
  67. mlrun/datastore/spark_utils.py +30 -0
  68. mlrun/datastore/store_resources.py +9 -7
  69. mlrun/datastore/storeytargets.py +151 -0
  70. mlrun/datastore/targets.py +387 -119
  71. mlrun/datastore/utils.py +68 -5
  72. mlrun/datastore/v3io.py +28 -50
  73. mlrun/db/auth_utils.py +152 -0
  74. mlrun/db/base.py +245 -20
  75. mlrun/db/factory.py +1 -4
  76. mlrun/db/httpdb.py +909 -231
  77. mlrun/db/nopdb.py +279 -14
  78. mlrun/errors.py +35 -5
  79. mlrun/execution.py +111 -38
  80. mlrun/feature_store/__init__.py +0 -2
  81. mlrun/feature_store/api.py +46 -53
  82. mlrun/feature_store/common.py +6 -11
  83. mlrun/feature_store/feature_set.py +48 -23
  84. mlrun/feature_store/feature_vector.py +13 -2
  85. mlrun/feature_store/ingestion.py +7 -6
  86. mlrun/feature_store/retrieval/base.py +9 -4
  87. mlrun/feature_store/retrieval/dask_merger.py +2 -0
  88. mlrun/feature_store/retrieval/job.py +13 -4
  89. mlrun/feature_store/retrieval/local_merger.py +2 -0
  90. mlrun/feature_store/retrieval/spark_merger.py +24 -32
  91. mlrun/feature_store/steps.py +38 -19
  92. mlrun/features.py +6 -14
  93. mlrun/frameworks/_common/plan.py +3 -3
  94. mlrun/frameworks/_dl_common/loggers/tensorboard_logger.py +7 -12
  95. mlrun/frameworks/_ml_common/plan.py +1 -1
  96. mlrun/frameworks/auto_mlrun/auto_mlrun.py +2 -2
  97. mlrun/frameworks/lgbm/__init__.py +1 -1
  98. mlrun/frameworks/lgbm/callbacks/callback.py +2 -4
  99. mlrun/frameworks/lgbm/model_handler.py +1 -1
  100. mlrun/frameworks/parallel_coordinates.py +4 -4
  101. mlrun/frameworks/pytorch/__init__.py +2 -2
  102. mlrun/frameworks/sklearn/__init__.py +1 -1
  103. mlrun/frameworks/sklearn/mlrun_interface.py +13 -3
  104. mlrun/frameworks/tf_keras/__init__.py +5 -2
  105. mlrun/frameworks/tf_keras/callbacks/logging_callback.py +1 -1
  106. mlrun/frameworks/tf_keras/mlrun_interface.py +2 -2
  107. mlrun/frameworks/xgboost/__init__.py +1 -1
  108. mlrun/k8s_utils.py +57 -12
  109. mlrun/launcher/__init__.py +1 -1
  110. mlrun/launcher/base.py +6 -5
  111. mlrun/launcher/client.py +13 -11
  112. mlrun/launcher/factory.py +1 -1
  113. mlrun/launcher/local.py +15 -5
  114. mlrun/launcher/remote.py +10 -3
  115. mlrun/lists.py +6 -2
  116. mlrun/model.py +297 -48
  117. mlrun/model_monitoring/__init__.py +1 -1
  118. mlrun/model_monitoring/api.py +152 -357
  119. mlrun/model_monitoring/applications/__init__.py +10 -0
  120. mlrun/model_monitoring/applications/_application_steps.py +190 -0
  121. mlrun/model_monitoring/applications/base.py +108 -0
  122. mlrun/model_monitoring/applications/context.py +341 -0
  123. mlrun/model_monitoring/{evidently_application.py → applications/evidently_base.py} +27 -22
  124. mlrun/model_monitoring/applications/histogram_data_drift.py +227 -91
  125. mlrun/model_monitoring/applications/results.py +99 -0
  126. mlrun/model_monitoring/controller.py +130 -303
  127. mlrun/model_monitoring/{stores/models/sqlite.py → db/__init__.py} +5 -10
  128. mlrun/model_monitoring/db/stores/__init__.py +136 -0
  129. mlrun/model_monitoring/db/stores/base/__init__.py +15 -0
  130. mlrun/model_monitoring/db/stores/base/store.py +213 -0
  131. mlrun/model_monitoring/db/stores/sqldb/__init__.py +13 -0
  132. mlrun/model_monitoring/db/stores/sqldb/models/__init__.py +71 -0
  133. mlrun/model_monitoring/db/stores/sqldb/models/base.py +190 -0
  134. mlrun/model_monitoring/db/stores/sqldb/models/mysql.py +103 -0
  135. mlrun/model_monitoring/{stores/models/mysql.py → db/stores/sqldb/models/sqlite.py} +19 -13
  136. mlrun/model_monitoring/db/stores/sqldb/sql_store.py +659 -0
  137. mlrun/model_monitoring/db/stores/v3io_kv/__init__.py +13 -0
  138. mlrun/model_monitoring/db/stores/v3io_kv/kv_store.py +726 -0
  139. mlrun/model_monitoring/db/tsdb/__init__.py +105 -0
  140. mlrun/model_monitoring/db/tsdb/base.py +448 -0
  141. mlrun/model_monitoring/db/tsdb/helpers.py +30 -0
  142. mlrun/model_monitoring/db/tsdb/tdengine/__init__.py +15 -0
  143. mlrun/model_monitoring/db/tsdb/tdengine/schemas.py +298 -0
  144. mlrun/model_monitoring/db/tsdb/tdengine/stream_graph_steps.py +42 -0
  145. mlrun/model_monitoring/db/tsdb/tdengine/tdengine_connector.py +522 -0
  146. mlrun/model_monitoring/db/tsdb/v3io/__init__.py +15 -0
  147. mlrun/model_monitoring/db/tsdb/v3io/stream_graph_steps.py +158 -0
  148. mlrun/model_monitoring/db/tsdb/v3io/v3io_connector.py +849 -0
  149. mlrun/model_monitoring/features_drift_table.py +34 -22
  150. mlrun/model_monitoring/helpers.py +177 -39
  151. mlrun/model_monitoring/model_endpoint.py +3 -2
  152. mlrun/model_monitoring/stream_processing.py +165 -398
  153. mlrun/model_monitoring/tracking_policy.py +7 -1
  154. mlrun/model_monitoring/writer.py +161 -125
  155. mlrun/package/packagers/default_packager.py +2 -2
  156. mlrun/package/packagers_manager.py +1 -0
  157. mlrun/package/utils/_formatter.py +2 -2
  158. mlrun/platforms/__init__.py +11 -10
  159. mlrun/platforms/iguazio.py +67 -228
  160. mlrun/projects/__init__.py +6 -1
  161. mlrun/projects/operations.py +47 -20
  162. mlrun/projects/pipelines.py +396 -249
  163. mlrun/projects/project.py +1176 -406
  164. mlrun/render.py +28 -22
  165. mlrun/run.py +208 -181
  166. mlrun/runtimes/__init__.py +76 -11
  167. mlrun/runtimes/base.py +54 -24
  168. mlrun/runtimes/daskjob.py +9 -2
  169. mlrun/runtimes/databricks_job/databricks_runtime.py +1 -0
  170. mlrun/runtimes/databricks_job/databricks_wrapper.py +1 -1
  171. mlrun/runtimes/funcdoc.py +1 -29
  172. mlrun/runtimes/kubejob.py +34 -128
  173. mlrun/runtimes/local.py +39 -10
  174. mlrun/runtimes/mpijob/__init__.py +0 -20
  175. mlrun/runtimes/mpijob/abstract.py +8 -8
  176. mlrun/runtimes/mpijob/v1.py +1 -1
  177. mlrun/runtimes/nuclio/__init__.py +1 -0
  178. mlrun/runtimes/nuclio/api_gateway.py +769 -0
  179. mlrun/runtimes/nuclio/application/__init__.py +15 -0
  180. mlrun/runtimes/nuclio/application/application.py +758 -0
  181. mlrun/runtimes/nuclio/application/reverse_proxy.go +95 -0
  182. mlrun/runtimes/nuclio/function.py +188 -68
  183. mlrun/runtimes/nuclio/serving.py +57 -60
  184. mlrun/runtimes/pod.py +191 -58
  185. mlrun/runtimes/remotesparkjob.py +11 -8
  186. mlrun/runtimes/sparkjob/spark3job.py +17 -18
  187. mlrun/runtimes/utils.py +40 -73
  188. mlrun/secrets.py +6 -2
  189. mlrun/serving/__init__.py +8 -1
  190. mlrun/serving/remote.py +2 -3
  191. mlrun/serving/routers.py +89 -64
  192. mlrun/serving/server.py +54 -26
  193. mlrun/serving/states.py +187 -56
  194. mlrun/serving/utils.py +19 -11
  195. mlrun/serving/v2_serving.py +136 -63
  196. mlrun/track/tracker.py +2 -1
  197. mlrun/track/trackers/mlflow_tracker.py +5 -0
  198. mlrun/utils/async_http.py +26 -6
  199. mlrun/utils/db.py +18 -0
  200. mlrun/utils/helpers.py +375 -105
  201. mlrun/utils/http.py +2 -2
  202. mlrun/utils/logger.py +75 -9
  203. mlrun/utils/notifications/notification/__init__.py +14 -10
  204. mlrun/utils/notifications/notification/base.py +48 -0
  205. mlrun/utils/notifications/notification/console.py +2 -0
  206. mlrun/utils/notifications/notification/git.py +24 -1
  207. mlrun/utils/notifications/notification/ipython.py +2 -0
  208. mlrun/utils/notifications/notification/slack.py +96 -21
  209. mlrun/utils/notifications/notification/webhook.py +63 -2
  210. mlrun/utils/notifications/notification_pusher.py +146 -16
  211. mlrun/utils/regex.py +9 -0
  212. mlrun/utils/retryer.py +3 -2
  213. mlrun/utils/v3io_clients.py +2 -3
  214. mlrun/utils/version/version.json +2 -2
  215. mlrun-1.7.2.dist-info/METADATA +390 -0
  216. mlrun-1.7.2.dist-info/RECORD +351 -0
  217. {mlrun-1.7.0rc4.dist-info → mlrun-1.7.2.dist-info}/WHEEL +1 -1
  218. mlrun/feature_store/retrieval/conversion.py +0 -271
  219. mlrun/kfpops.py +0 -868
  220. mlrun/model_monitoring/application.py +0 -310
  221. mlrun/model_monitoring/batch.py +0 -974
  222. mlrun/model_monitoring/controller_handler.py +0 -37
  223. mlrun/model_monitoring/prometheus.py +0 -216
  224. mlrun/model_monitoring/stores/__init__.py +0 -111
  225. mlrun/model_monitoring/stores/kv_model_endpoint_store.py +0 -574
  226. mlrun/model_monitoring/stores/model_endpoint_store.py +0 -145
  227. mlrun/model_monitoring/stores/models/__init__.py +0 -27
  228. mlrun/model_monitoring/stores/models/base.py +0 -84
  229. mlrun/model_monitoring/stores/sql_model_endpoint_store.py +0 -382
  230. mlrun/platforms/other.py +0 -305
  231. mlrun-1.7.0rc4.dist-info/METADATA +0 -269
  232. mlrun-1.7.0rc4.dist-info/RECORD +0 -321
  233. {mlrun-1.7.0rc4.dist-info → mlrun-1.7.2.dist-info}/LICENSE +0 -0
  234. {mlrun-1.7.0rc4.dist-info → mlrun-1.7.2.dist-info}/entry_points.txt +0 -0
  235. {mlrun-1.7.0rc4.dist-info → mlrun-1.7.2.dist-info}/top_level.txt +0 -0
mlrun/feature_store/feature_set.py CHANGED
@@ -337,7 +337,10 @@ class FeatureSet(ModelObj):
     example::
 
         import mlrun.feature_store as fstore
-        ticks = fstore.FeatureSet("ticks", entities=["stock"], timestamp_key="timestamp")
+
+        ticks = fstore.FeatureSet(
+            "ticks", entities=["stock"], timestamp_key="timestamp"
+        )
         ticks.ingest(df)
 
     :param name: name of the feature set
@@ -625,12 +628,12 @@ class FeatureSet(ModelObj):
 
            import mlrun.feature_store as fstore
 
-            ticks = fstore.FeatureSet("ticks",
-                                      entities=["stock"],
-                                      timestamp_key="timestamp")
-            ticks.add_entity("country",
-                             mlrun.data_types.ValueType.STRING,
-                             description="stock country")
+            ticks = fstore.FeatureSet(
+                "ticks", entities=["stock"], timestamp_key="timestamp"
+            )
+            ticks.add_entity(
+                "country", mlrun.data_types.ValueType.STRING, description="stock country"
+            )
            ticks.add_entity("year", mlrun.data_types.ValueType.INT16)
            ticks.save()
 
@@ -650,13 +653,23 @@ class FeatureSet(ModelObj):
            import mlrun.feature_store as fstore
            from mlrun.features import Feature
 
-            ticks = fstore.FeatureSet("ticks",
-                                      entities=["stock"],
-                                      timestamp_key="timestamp")
-            ticks.add_feature(Feature(value_type=mlrun.data_types.ValueType.STRING,
-                                      description="client consistency"),"ABC01")
-            ticks.add_feature(Feature(value_type=mlrun.data_types.ValueType.FLOAT,
-                                      description="client volatility"),"SAB")
+            ticks = fstore.FeatureSet(
+                "ticks", entities=["stock"], timestamp_key="timestamp"
+            )
+            ticks.add_feature(
+                Feature(
+                    value_type=mlrun.data_types.ValueType.STRING,
+                    description="client consistency",
+                ),
+                "ABC01",
+            )
+            ticks.add_feature(
+                Feature(
+                    value_type=mlrun.data_types.ValueType.FLOAT,
+                    description="client volatility",
+                ),
+                "SAB",
+            )
            ticks.save()
 
        :param feature: setting of Feature
@@ -860,15 +873,18 @@ class FeatureSet(ModelObj):
        example::
 
            import mlrun.feature_store as fstore
+
            ...
-            ticks = fstore.FeatureSet("ticks",
-                                      entities=["stock"],
-                                      timestamp_key="timestamp")
-            ticks.add_aggregation(name='priceN',
-                                  column='price',
-                                  operations=['avg'],
-                                  windows=['1d'],
-                                  period='1h')
+            ticks = fstore.FeatureSet(
+                "ticks", entities=["stock"], timestamp_key="timestamp"
+            )
+            ticks.add_aggregation(
+                name="priceN",
+                column="price",
+                operations=["avg"],
+                windows=["1d"],
+                period="1h",
+            )
            ticks.plot(rankdir="LR", with_targets=True)
 
        :param filename: target filepath for the graph image (None for the notebook)
@@ -901,6 +917,7 @@ class FeatureSet(ModelObj):
        start_time=None,
        end_time=None,
        time_column=None,
+        additional_filters=None,
        **kwargs,
    ):
        """return featureset (offline) data as dataframe
@@ -912,6 +929,12 @@ class FeatureSet(ModelObj):
        :param end_time: filter by end time
        :param time_column: specify the time column name in the file
        :param kwargs: additional reader (csv, parquet, ..) args
+        :param additional_filters: List of additional_filter conditions as tuples.
+            Each tuple should be in the format (column_name, operator, value).
+            Supported operators: "=", ">=", "<=", ">", "<".
+            Example: [("Product", "=", "Computer")]
+            For all supported filters, please see:
+            https://arrow.apache.org/docs/python/generated/pyarrow.parquet.ParquetDataset.html
        :return: DataFrame
        """
        entities = list(self.spec.entities.keys())
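
A minimal usage sketch of the new additional_filters argument on FeatureSet.to_dataframe (the feature set URI and column names below are hypothetical; the filter tuples follow the format documented above):

    import mlrun.feature_store as fstore

    # assumes a saved feature set with a parquet offline target and a "Product" column
    sales_set = fstore.get_feature_set("store-sales/sales")
    df = sales_set.to_dataframe(
        columns=["Product", "price"],
        additional_filters=[("Product", "=", "Computer")],
    )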
@@ -930,6 +953,7 @@ class FeatureSet(ModelObj):
                start_time=start_time,
                end_time=end_time,
                time_field=time_column,
+                additional_filters=additional_filters,
                **kwargs,
            )
        # to_dataframe() can sometimes return an iterator of dataframes instead of one dataframe
@@ -949,6 +973,7 @@ class FeatureSet(ModelObj):
            start_time=start_time,
            end_time=end_time,
            time_column=time_column,
+            additional_filters=additional_filters,
            **kwargs,
        )
        return result
@@ -1005,7 +1030,7 @@ class FeatureSet(ModelObj):
            df = stocks_set.ingest(stocks, infer_options=fstore.InferOptions.default())
 
            # for running as remote job
-            config = RunConfig(image='mlrun/mlrun')
+            config = RunConfig(image="mlrun/mlrun")
            df = ingest(stocks_set, stocks, run_config=config)
 
            # specify source and targets
mlrun/feature_store/feature_vector.py CHANGED
@@ -486,6 +486,7 @@ class FeatureVector(ModelObj):
    example::
 
        import mlrun.feature_store as fstore
+
        features = ["quotes.bid", "quotes.asks_sum_5h as asks_5h", "stocks.*"]
        vector = fstore.FeatureVector("my-vec", features)
 
@@ -740,6 +741,7 @@ class FeatureVector(ModelObj):
        order_by: Union[str, list[str]] = None,
        spark_service: str = None,
        timestamp_for_filtering: Union[str, dict[str, str]] = None,
+        additional_filters: list = None,
    ):
        """retrieve offline feature vector results
 
@@ -796,6 +798,12 @@ class FeatureVector(ModelObj):
            By default, the filter executes on the timestamp_key of each feature set.
            Note: the time filtering is performed on each feature set before the
            merge process using start_time and end_time params.
+        :param additional_filters: List of additional_filter conditions as tuples.
+            Each tuple should be in the format (column_name, operator, value).
+            Supported operators: "=", ">=", "<=", ">", "<".
+            Example: [("Product", "=", "Computer")]
+            For all supported filters, please see:
+            https://arrow.apache.org/docs/python/generated/pyarrow.parquet.ParquetDataset.html
 
        """
 
@@ -816,6 +824,7 @@ class FeatureVector(ModelObj):
            order_by,
            spark_service,
            timestamp_for_filtering,
+            additional_filters,
        )
 
    def get_online_feature_service(
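
A corresponding sketch for feature vectors, where additional_filters is documented and forwarded to the offline merger as shown above (the vector URI and filter values are hypothetical):

    import mlrun.feature_store as fstore

    vector = fstore.get_feature_vector("store-sales/my-vec")
    resp = vector.get_offline_features(
        additional_filters=[("Product", "=", "Computer")],
    )
    df = resp.to_dataframe()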
@@ -852,7 +861,7 @@ class FeatureVector(ModelObj):
 
        Example::
 
-            svc = vector_uri.get_online_feature_service(entity_keys=['ticker'])
+            svc = vector_uri.get_online_feature_service(entity_keys=["ticker"])
            try:
                resp = svc.get([{"ticker": "GOOG"}, {"ticker": "MSFT"}])
                print(resp)
@@ -1077,7 +1086,9 @@ class OfflineVectorResponse:
    def to_dataframe(self, to_pandas=True):
        """return result as dataframe"""
        if self.status != "completed":
-            raise mlrun.errors.MLRunTaskNotReady("feature vector dataset is not ready")
+            raise mlrun.errors.MLRunTaskNotReadyError(
+                "feature vector dataset is not ready"
+            )
        return self._merger.get_df(to_pandas=to_pandas)
 
    def to_parquet(self, target_path, **kw):
mlrun/feature_store/ingestion.py CHANGED
@@ -17,6 +17,7 @@ import uuid
 import pandas as pd
 
 import mlrun
+import mlrun.common.constants as mlrun_constants
 from mlrun.datastore.sources import get_source_from_dict, get_source_step
 from mlrun.datastore.targets import (
     add_target_steps,
@@ -263,13 +264,13 @@ def run_ingestion_job(name, featureset, run_config, schedule=None, spark_service
        out_path=featureset.spec.output_path,
    )
    task.spec.secret_sources = run_config.secret_sources
-    task.set_label("job-type", "feature-ingest").set_label(
-        "feature-set", featureset.uri
-    )
+    task.set_label(
+        mlrun_constants.MLRunInternalLabels.job_type, "feature-ingest"
+    ).set_label("feature-set", featureset.uri)
    if run_config.owner:
-        task.set_label("owner", run_config.owner).set_label(
-            "v3io_user", run_config.owner
-        )
+        task.set_label(
+            mlrun_constants.MLRunInternalLabels.owner, run_config.owner
+        ).set_label(mlrun_constants.MLRunInternalLabels.v3io_user, run_config.owner)
 
    # set run UID and save in the feature set status (linking the features et to the job)
    task.metadata.uid = uuid.uuid4().hex
mlrun/feature_store/retrieval/base.py CHANGED
@@ -88,6 +88,7 @@ class BaseMerger(abc.ABC):
        update_stats=None,
        query=None,
        order_by=None,
+        additional_filters=None,
    ):
        self._target = target
 
@@ -134,6 +135,7 @@ class BaseMerger(abc.ABC):
            timestamp_for_filtering=timestamp_for_filtering,
            query=query,
            order_by=order_by,
+            additional_filters=additional_filters,
        )
 
    def _write_to_offline_target(self, timestamp_key=None):
@@ -186,6 +188,7 @@ class BaseMerger(abc.ABC):
        timestamp_for_filtering=None,
        query=None,
        order_by=None,
+        additional_filters=None,
    ):
        self._create_engine_env()
 
@@ -212,7 +215,7 @@ class BaseMerger(abc.ABC):
                feature_sets.append(None)
                join_types.append(None)
 
-        filtered = False
+        timestamp_filtered = False
        for step in join_graph.steps:
            name = step.right_feature_set_name
            feature_set = feature_set_objects[name]
@@ -250,7 +253,7 @@ class BaseMerger(abc.ABC):
            if self._drop_indexes:
                self._append_drop_column(time_column)
            if (start_time or end_time) and time_column:
-                filtered = True
+                timestamp_filtered = True
 
            df = self._get_engine_df(
                feature_set,
@@ -259,6 +262,7 @@ class BaseMerger(abc.ABC):
                start_time if time_column else None,
                end_time if time_column else None,
                time_column,
+                additional_filters,
            )
 
            fs_entities_and_timestamp = list(feature_set.spec.entities.keys())
@@ -302,8 +306,8 @@ class BaseMerger(abc.ABC):
                new_columns.append((column, alias))
            self._update_alias(dictionary={name: alias for name, alias in new_columns})
 
-        # None of the feature sets was filtered as required
-        if not filtered and (start_time or end_time):
+        # None of the feature sets was timestamp filtered as required
+        if not timestamp_filtered and (start_time or end_time):
            raise mlrun.errors.MLRunRuntimeError(
                "start_time and end_time can only be provided in conjunction with "
                "a timestamp column, or when the at least one feature_set has a timestamp key"
@@ -755,6 +759,7 @@ class BaseMerger(abc.ABC):
        start_time: typing.Union[str, datetime] = None,
        end_time: typing.Union[str, datetime] = None,
        time_column: typing.Optional[str] = None,
+        additional_filters=None,
    ):
        """
        Return the feature_set data frame according to the args
mlrun/feature_store/retrieval/dask_merger.py CHANGED
@@ -145,6 +145,7 @@ class DaskFeatureMerger(BaseMerger):
        start_time=None,
        end_time=None,
        time_column=None,
+        additional_filters=None,
    ):
        import dask.dataframe as dd
 
@@ -155,6 +156,7 @@ class DaskFeatureMerger(BaseMerger):
            end_time=end_time,
            time_column=time_column,
            index=False,
+            additional_filters=additional_filters,
        )
 
        return self._reset_index(df).persist()
mlrun/feature_store/retrieval/job.py CHANGED
@@ -15,6 +15,7 @@
 import uuid
 
 import mlrun
+import mlrun.common.constants as mlrun_constants
 from mlrun.config import config as mlconf
 from mlrun.model import DataTargetBase, new_task
 from mlrun.runtimes.function_reference import FunctionReference
@@ -42,6 +43,7 @@ def run_merge_job(
    start_time=None,
    end_time=None,
    timestamp_for_filtering=None,
+    additional_filters=None,
 ):
    name = vector.metadata.name
    if not target or not hasattr(target, "to_dict"):
@@ -116,11 +118,14 @@
            "end_time": end_time,
            "timestamp_for_filtering": timestamp_for_filtering,
            "engine_args": engine_args,
+            "additional_filters": additional_filters,
        },
        inputs={"entity_rows": entity_rows} if entity_rows is not None else {},
    )
    task.spec.secret_sources = run_config.secret_sources
-    task.set_label("job-type", "feature-merge").set_label("feature-vector", vector.uri)
+    task.set_label(
+        mlrun_constants.MLRunInternalLabels.job_type, "feature-merge"
+    ).set_label(mlrun_constants.MLRunInternalLabels.feature_vector, vector.uri)
    task.metadata.uid = uuid.uuid4().hex
    vector.status.run_uri = task.metadata.uid
    vector.save()
@@ -151,7 +156,9 @@ class RemoteVectorResponse:
 
    def _is_ready(self):
        if self.status != "completed":
-            raise mlrun.errors.MLRunTaskNotReady("feature vector dataset is not ready")
+            raise mlrun.errors.MLRunTaskNotReadyError(
+                "feature vector dataset is not ready"
+            )
        self.vector.reload()
 
    def to_dataframe(self, columns=None, df_module=None, **kwargs):
@@ -176,6 +183,7 @@ class RemoteVectorResponse:
        file_format = kwargs.get("format")
        if not file_format:
            file_format = self.run.status.results["target"]["kind"]
+
        df = mlrun.get_dataitem(self.target_uri).as_df(
            columns=columns, df_module=df_module, format=file_format, **kwargs
        )
@@ -196,7 +204,8 @@ import mlrun.feature_store.retrieval
 from mlrun.datastore.targets import get_target_driver
 def merge_handler(context, vector_uri, target, entity_rows=None,
                   entity_timestamp_column=None, drop_columns=None, with_indexes=None, query=None,
-                  engine_args=None, order_by=None, start_time=None, end_time=None, timestamp_for_filtering=None):
+                  engine_args=None, order_by=None, start_time=None, end_time=None, timestamp_for_filtering=None,
+                  additional_filters=None):
    vector = context.get_store_resource(vector_uri)
    store_target = get_target_driver(target, vector)
    if entity_rows:
@@ -206,7 +215,7 @@ def merge_handler(context, vector_uri, target, entity_rows=None,
    merger = mlrun.feature_store.retrieval.{{{engine}}}(vector, **(engine_args or {}))
    merger.start(entity_rows, entity_timestamp_column, store_target, drop_columns, with_indexes=with_indexes,
                 query=query, order_by=order_by, start_time=start_time, end_time=end_time,
-                 timestamp_for_filtering=timestamp_for_filtering)
+                 timestamp_for_filtering=timestamp_for_filtering, additional_filters=additional_filters)
 
    target = vector.status.targets[store_target.name].to_dict()
    context.log_result('feature_vector', vector.uri)
mlrun/feature_store/retrieval/local_merger.py CHANGED
@@ -114,12 +114,14 @@ class LocalFeatureMerger(BaseMerger):
        start_time=None,
        end_time=None,
        time_column=None,
+        additional_filters=None,
    ):
        df = feature_set.to_dataframe(
            columns=column_names,
            start_time=start_time,
            end_time=end_time,
            time_column=time_column,
+            additional_filters=additional_filters,
        )
        if df.index.names[0]:
            df.reset_index(inplace=True)
mlrun/feature_store/retrieval/spark_merger.py CHANGED
@@ -12,16 +12,17 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #
-import pandas as pd
-import semver
+
 
 import mlrun
+from mlrun.data_types.to_pandas import spark_df_to_pandas
+from mlrun.datastore.sources import ParquetSource
 from mlrun.datastore.targets import get_offline_target
+from mlrun.runtimes import RemoteSparkRuntime
+from mlrun.runtimes.sparkjob import Spark3Runtime
+from mlrun.utils.helpers import additional_filters_warning
 
-from ...runtimes import RemoteSparkRuntime
-from ...runtimes.sparkjob import Spark3Runtime
 from .base import BaseMerger
-from .conversion import PandasConversionMixin
 
 
 class SparkFeatureMerger(BaseMerger):
@@ -166,29 +167,7 @@ class SparkFeatureMerger(BaseMerger):
    def get_df(self, to_pandas=True):
        if to_pandas:
            if self._pandas_df is None:
-                df = self._result_df
-                # as of pyspark 3.2.3, toPandas fails to convert timestamps unless we work around the issue
-                # when we upgrade pyspark, we should check whether this workaround is still necessary
-                # see https://stackoverflow.com/questions/76389694/transforming-pyspark-to-pandas-dataframe
-                if semver.parse(pd.__version__)["major"] >= 2:
-                    import pyspark.sql.functions as pyspark_functions
-
-                    type_conversion_dict = {}
-                    for field in df.schema.fields:
-                        if str(field.dataType) == "TimestampType":
-                            df = df.withColumn(
-                                field.name,
-                                pyspark_functions.date_format(
-                                    pyspark_functions.to_timestamp(field.name),
-                                    "yyyy-MM-dd'T'HH:mm:ss.SSSSSSSSS",
-                                ),
-                            )
-                            type_conversion_dict[field.name] = "datetime64[ns]"
-                    df = PandasConversionMixin.toPandas(df)
-                    if type_conversion_dict:
-                        df = df.astype(type_conversion_dict)
-                else:
-                    df = PandasConversionMixin.toPandas(df)
+                df = spark_df_to_pandas(self._result_df)
                self._pandas_df = df
                self._set_indexes(self._pandas_df)
            return self._pandas_df
@@ -209,9 +188,13 @@ class SparkFeatureMerger(BaseMerger):
 
        if self.spark is None:
            # create spark context
-            self.spark = SparkSession.builder.appName(
-                f"vector-merger-{self.vector.metadata.name}"
-            ).getOrCreate()
+            self.spark = (
+                SparkSession.builder.appName(
+                    f"vector-merger-{self.vector.metadata.name}"
+                )
+                .config("spark.driver.memory", "2g")
+                .getOrCreate()
+            )
 
    def _get_engine_df(
        self,
@@ -221,6 +204,7 @@ class SparkFeatureMerger(BaseMerger):
        start_time=None,
        end_time=None,
        time_column=None,
+        additional_filters=None,
    ):
        source_kwargs = {}
        if feature_set.spec.passthrough:
@@ -231,6 +215,7 @@ class SparkFeatureMerger(BaseMerger):
            source_kind = feature_set.spec.source.kind
            source_path = feature_set.spec.source.path
            source_kwargs.update(feature_set.spec.source.attributes)
+            source_kwargs.pop("additional_filters", None)
        else:
            target = get_offline_target(feature_set)
            if not target:
@@ -239,17 +224,24 @@ class SparkFeatureMerger(BaseMerger):
                )
            source_kind = target.kind
            source_path = target.get_target_path()
-
+            source_kwargs = target.source_spark_attributes
        # handling case where there are multiple feature sets and user creates vector where
        # entity_timestamp_column is from a specific feature set (can't be entity timestamp)
        source_driver = mlrun.datastore.sources.source_kind_to_driver[source_kind]
 
+        if source_driver != ParquetSource:
+            additional_filters_warning(additional_filters, source_driver)
+            additional_filters = None
+        additional_filters_dict = (
+            {"additional_filters": additional_filters} if additional_filters else {}
+        )
        source = source_driver(
            name=self.vector.metadata.name,
            path=source_path,
            time_field=time_column,
            start_time=start_time,
            end_time=end_time,
+            **additional_filters_dict,
            **source_kwargs,
        )
 
mlrun/feature_store/steps.py CHANGED
@@ -162,13 +162,19 @@ class MapValues(StepToDict, MLRunStep):
        example::
 
            # replace the value "U" with '0' in the age column
-            graph.to(MapValues(mapping={'age': {'U': '0'}}, with_original_features=True))
+            graph.to(MapValues(mapping={"age": {"U": "0"}}, with_original_features=True))
 
            # replace integers, example
-            graph.to(MapValues(mapping={'not': {0: 1, 1: 0}}))
+            graph.to(MapValues(mapping={"not": {0: 1, 1: 0}}))
 
            # replace by range, use -inf and inf for extended range
-            graph.to(MapValues(mapping={'numbers': {'ranges': {'negative': [-inf, 0], 'positive': [0, inf]}}}))
+            graph.to(
+                MapValues(
+                    mapping={
+                        "numbers": {"ranges": {"negative": [-inf, 0], "positive": [0, inf]}}
+                    }
+                )
+            )
 
        :param mapping: a dict with entry per column and the associated old/new values map
        :param with_original_features: set to True to keep the original features
@@ -424,8 +430,10 @@ class OneHotEncoder(StepToDict, MLRunStep):
 
        example::
 
-            mapping = {'category': ['food', 'health', 'transportation'],
-                       'gender': ['male', 'female']}
+            mapping = {
+                "category": ["food", "health", "transportation"],
+                "gender": ["male", "female"],
+            }
            graph.to(OneHotEncoder(mapping=one_hot_encoder_mapping))
 
        :param mapping: a dict of per column categories (to map to binary fields)
@@ -542,10 +550,12 @@ class DateExtractor(StepToDict, MLRunStep):
 
            # (taken from the fraud-detection end-to-end feature store demo)
            # Define the Transactions FeatureSet
-            transaction_set = fstore.FeatureSet("transactions",
-                                                entities=[fstore.Entity("source")],
-                                                timestamp_key='timestamp',
-                                                description="transactions feature set")
+            transaction_set = fstore.FeatureSet(
+                "transactions",
+                entities=[fstore.Entity("source")],
+                timestamp_key="timestamp",
+                description="transactions feature set",
+            )
 
            # Get FeatureSet computation graph
            transaction_graph = transaction_set.graph
@@ -553,11 +563,11 @@ class DateExtractor(StepToDict, MLRunStep):
            # Add the custom `DateExtractor` step
            # to the computation graph
            transaction_graph.to(
-                class_name='DateExtractor',
-                name='Extract Dates',
-                parts = ['hour', 'day_of_week'],
-                timestamp_col = 'timestamp',
-                )
+                class_name="DateExtractor",
+                name="Extract Dates",
+                parts=["hour", "day_of_week"],
+                timestamp_col="timestamp",
+            )
 
        :param parts: list of pandas style date-time parts you want to extract.
        :param timestamp_col: The name of the column containing the timestamps to extract from,
@@ -694,11 +704,12 @@ class DropFeatures(StepToDict, MLRunStep):
 
        example::
 
-            feature_set = fstore.FeatureSet("fs-new",
-                                            entities=[fstore.Entity("id")],
-                                            description="feature set",
-                                            engine="pandas",
-                                            )
+            feature_set = fstore.FeatureSet(
+                "fs-new",
+                entities=[fstore.Entity("id")],
+                description="feature set",
+                engine="pandas",
+            )
            # Pre-processing graph steps
            feature_set.graph.to(DropFeatures(features=["age"]))
            df_pandas = feature_set.ingest(data)
@@ -732,3 +743,11 @@ class DropFeatures(StepToDict, MLRunStep):
            raise mlrun.errors.MLRunInvalidArgumentError(
                f"DropFeatures can only drop features, not entities: {dropped_entities}"
            )
+        if feature_set.spec.label_column in features:
+            raise mlrun.errors.MLRunInvalidArgumentError(
+                f"DropFeatures can not drop label_column: {feature_set.spec.label_column}"
+            )
+        if feature_set.spec.timestamp_key in features:
+            raise mlrun.errors.MLRunInvalidArgumentError(
+                f"DropFeatures can not drop timestamp_key: {feature_set.spec.timestamp_key}"
+            )
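
With the checks added above, DropFeatures now rejects the feature set's label_column and timestamp_key in addition to entities. A sketch of the expected behavior, assuming the error surfaces during ingestion (the feature set name and data here are hypothetical):

    import pandas as pd

    import mlrun
    import mlrun.feature_store as fstore
    from mlrun.feature_store.steps import DropFeatures

    df = pd.DataFrame(
        {
            "id": [1, 2],
            "timestamp": pd.to_datetime(["2024-01-01", "2024-01-02"]),
            "age": [30, 40],
        }
    )
    fset = fstore.FeatureSet(
        "drop-demo",
        entities=[fstore.Entity("id")],
        timestamp_key="timestamp",
        engine="pandas",
    )
    # dropping the timestamp_key is now rejected
    fset.graph.to(DropFeatures(features=["timestamp"]))
    try:
        fset.ingest(df)
    except mlrun.errors.MLRunInvalidArgumentError as err:
        print(f"rejected as expected: {err}")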
mlrun/features.py CHANGED
@@ -100,7 +100,8 @@ class Feature(ModelObj):
        :param name: name of the feature
        :param validator: feature validation policy
        :param default: default value
-        :param labels: a set of key/value labels (tags)
+        :param labels: a set of key/value labels (tags). Labels can be used to filter featues, for example,
+            in the UI Feature store page.
        """
        self.name = name or ""
        if isinstance(value_type, ValueType):
@@ -238,10 +239,7 @@ class Validator(ModelObj):
            from mlrun.features import Validator
 
            # Add validator to the feature 'bid' with check type
-            quotes_set["bid"].validator = Validator(
-                check_type=True,
-                severity="info"
-            )
+            quotes_set["bid"].validator = Validator(check_type=True, severity="info")
 
        :param check_type: check feature type e.g. True, False
        :param severity: severity name e.g. info, warning, etc.
@@ -280,10 +278,7 @@ class MinMaxValidator(Validator):
 
            # Add validator to the feature 'bid', where valid
            # minimal value is 52
-            quotes_set["bid"].validator = MinMaxValidator(
-                min=52,
-                severity="info"
-            )
+            quotes_set["bid"].validator = MinMaxValidator(min=52, severity="info")
 
        :param check_type: check feature type e.g. True, False
        :param severity: severity name e.g. info, warning, etc.
@@ -344,9 +339,7 @@ class MinMaxLenValidator(Validator):
            # Add length validator to the feature 'ticker', where valid
            # minimal length is 1 and maximal length is 10
            quotes_set["ticker"].validator = MinMaxLenValidator(
-                min=1,
-                max=10,
-                severity="info"
+                min=1, max=10, severity="info"
            )
 
        :param check_type: check feature type e.g. True, False
@@ -408,8 +401,7 @@ class RegexValidator(Validator):
            # expression '(\b[A-Za-z]{1}[0-9]{7}\b)' where valid values are
            # e.g. A1234567, z9874563, etc.
            quotes_set["name"].validator = RegexValidator(
-                regex=r"(\b[A-Za-z]{1}[0-9]{7}\b)",
-                severity="info"
+                regex=r"(\b[A-Za-z]{1}[0-9]{7}\b)", severity="info"
            )
 
        :param check_type: check feature type e.g. True, False