mlrun 1.6.4rc7__py3-none-any.whl → 1.7.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of mlrun has been flagged as possibly problematic.
- mlrun/__init__.py +11 -1
- mlrun/__main__.py +40 -122
- mlrun/alerts/__init__.py +15 -0
- mlrun/alerts/alert.py +248 -0
- mlrun/api/schemas/__init__.py +5 -4
- mlrun/artifacts/__init__.py +8 -3
- mlrun/artifacts/base.py +47 -257
- mlrun/artifacts/dataset.py +11 -192
- mlrun/artifacts/manager.py +79 -47
- mlrun/artifacts/model.py +31 -159
- mlrun/artifacts/plots.py +23 -380
- mlrun/common/constants.py +74 -1
- mlrun/common/db/sql_session.py +5 -5
- mlrun/common/formatters/__init__.py +21 -0
- mlrun/common/formatters/artifact.py +45 -0
- mlrun/common/formatters/base.py +113 -0
- mlrun/common/formatters/feature_set.py +33 -0
- mlrun/common/formatters/function.py +46 -0
- mlrun/common/formatters/pipeline.py +53 -0
- mlrun/common/formatters/project.py +51 -0
- mlrun/common/formatters/run.py +29 -0
- mlrun/common/helpers.py +12 -3
- mlrun/common/model_monitoring/helpers.py +9 -5
- mlrun/{runtimes → common/runtimes}/constants.py +37 -9
- mlrun/common/schemas/__init__.py +31 -5
- mlrun/common/schemas/alert.py +202 -0
- mlrun/common/schemas/api_gateway.py +196 -0
- mlrun/common/schemas/artifact.py +25 -4
- mlrun/common/schemas/auth.py +16 -5
- mlrun/common/schemas/background_task.py +1 -1
- mlrun/common/schemas/client_spec.py +4 -2
- mlrun/common/schemas/common.py +7 -4
- mlrun/common/schemas/constants.py +3 -0
- mlrun/common/schemas/feature_store.py +74 -44
- mlrun/common/schemas/frontend_spec.py +15 -7
- mlrun/common/schemas/function.py +12 -1
- mlrun/common/schemas/hub.py +11 -18
- mlrun/common/schemas/memory_reports.py +2 -2
- mlrun/common/schemas/model_monitoring/__init__.py +20 -4
- mlrun/common/schemas/model_monitoring/constants.py +123 -42
- mlrun/common/schemas/model_monitoring/grafana.py +13 -9
- mlrun/common/schemas/model_monitoring/model_endpoints.py +101 -54
- mlrun/common/schemas/notification.py +71 -14
- mlrun/common/schemas/object.py +2 -2
- mlrun/{model_monitoring/controller_handler.py → common/schemas/pagination.py} +9 -12
- mlrun/common/schemas/pipeline.py +8 -1
- mlrun/common/schemas/project.py +69 -18
- mlrun/common/schemas/runs.py +7 -1
- mlrun/common/schemas/runtime_resource.py +8 -12
- mlrun/common/schemas/schedule.py +4 -4
- mlrun/common/schemas/tag.py +1 -2
- mlrun/common/schemas/workflow.py +12 -4
- mlrun/common/types.py +14 -1
- mlrun/config.py +154 -69
- mlrun/data_types/data_types.py +6 -1
- mlrun/data_types/spark.py +2 -2
- mlrun/data_types/to_pandas.py +67 -37
- mlrun/datastore/__init__.py +6 -8
- mlrun/datastore/alibaba_oss.py +131 -0
- mlrun/datastore/azure_blob.py +143 -42
- mlrun/datastore/base.py +102 -58
- mlrun/datastore/datastore.py +34 -13
- mlrun/datastore/datastore_profile.py +146 -20
- mlrun/datastore/dbfs_store.py +3 -7
- mlrun/datastore/filestore.py +1 -4
- mlrun/datastore/google_cloud_storage.py +97 -33
- mlrun/datastore/hdfs.py +56 -0
- mlrun/datastore/inmem.py +6 -3
- mlrun/datastore/redis.py +7 -2
- mlrun/datastore/s3.py +34 -12
- mlrun/datastore/snowflake_utils.py +45 -0
- mlrun/datastore/sources.py +303 -111
- mlrun/datastore/spark_utils.py +31 -2
- mlrun/datastore/store_resources.py +9 -7
- mlrun/datastore/storeytargets.py +151 -0
- mlrun/datastore/targets.py +453 -176
- mlrun/datastore/utils.py +72 -58
- mlrun/datastore/v3io.py +6 -1
- mlrun/db/base.py +274 -41
- mlrun/db/factory.py +1 -1
- mlrun/db/httpdb.py +893 -225
- mlrun/db/nopdb.py +291 -33
- mlrun/errors.py +36 -6
- mlrun/execution.py +115 -42
- mlrun/feature_store/__init__.py +0 -2
- mlrun/feature_store/api.py +65 -73
- mlrun/feature_store/common.py +7 -12
- mlrun/feature_store/feature_set.py +76 -55
- mlrun/feature_store/feature_vector.py +39 -31
- mlrun/feature_store/ingestion.py +7 -6
- mlrun/feature_store/retrieval/base.py +16 -11
- mlrun/feature_store/retrieval/dask_merger.py +2 -0
- mlrun/feature_store/retrieval/job.py +13 -4
- mlrun/feature_store/retrieval/local_merger.py +2 -0
- mlrun/feature_store/retrieval/spark_merger.py +24 -32
- mlrun/feature_store/steps.py +45 -34
- mlrun/features.py +11 -21
- mlrun/frameworks/_common/artifacts_library.py +9 -9
- mlrun/frameworks/_common/mlrun_interface.py +5 -5
- mlrun/frameworks/_common/model_handler.py +48 -48
- mlrun/frameworks/_common/plan.py +5 -6
- mlrun/frameworks/_common/producer.py +3 -4
- mlrun/frameworks/_common/utils.py +5 -5
- mlrun/frameworks/_dl_common/loggers/logger.py +6 -7
- mlrun/frameworks/_dl_common/loggers/mlrun_logger.py +9 -9
- mlrun/frameworks/_dl_common/loggers/tensorboard_logger.py +23 -47
- mlrun/frameworks/_ml_common/artifacts_library.py +1 -2
- mlrun/frameworks/_ml_common/loggers/logger.py +3 -4
- mlrun/frameworks/_ml_common/loggers/mlrun_logger.py +4 -5
- mlrun/frameworks/_ml_common/model_handler.py +24 -24
- mlrun/frameworks/_ml_common/pkl_model_server.py +2 -2
- mlrun/frameworks/_ml_common/plan.py +2 -2
- mlrun/frameworks/_ml_common/plans/calibration_curve_plan.py +2 -3
- mlrun/frameworks/_ml_common/plans/confusion_matrix_plan.py +2 -3
- mlrun/frameworks/_ml_common/plans/dataset_plan.py +3 -3
- mlrun/frameworks/_ml_common/plans/feature_importance_plan.py +3 -3
- mlrun/frameworks/_ml_common/plans/roc_curve_plan.py +4 -4
- mlrun/frameworks/_ml_common/utils.py +4 -4
- mlrun/frameworks/auto_mlrun/auto_mlrun.py +9 -9
- mlrun/frameworks/huggingface/model_server.py +4 -4
- mlrun/frameworks/lgbm/__init__.py +33 -33
- mlrun/frameworks/lgbm/callbacks/callback.py +2 -4
- mlrun/frameworks/lgbm/callbacks/logging_callback.py +4 -5
- mlrun/frameworks/lgbm/callbacks/mlrun_logging_callback.py +4 -5
- mlrun/frameworks/lgbm/mlrun_interfaces/booster_mlrun_interface.py +1 -3
- mlrun/frameworks/lgbm/mlrun_interfaces/mlrun_interface.py +6 -6
- mlrun/frameworks/lgbm/model_handler.py +10 -10
- mlrun/frameworks/lgbm/model_server.py +6 -6
- mlrun/frameworks/lgbm/utils.py +5 -5
- mlrun/frameworks/onnx/dataset.py +8 -8
- mlrun/frameworks/onnx/mlrun_interface.py +3 -3
- mlrun/frameworks/onnx/model_handler.py +6 -6
- mlrun/frameworks/onnx/model_server.py +7 -7
- mlrun/frameworks/parallel_coordinates.py +6 -6
- mlrun/frameworks/pytorch/__init__.py +18 -18
- mlrun/frameworks/pytorch/callbacks/callback.py +4 -5
- mlrun/frameworks/pytorch/callbacks/logging_callback.py +17 -17
- mlrun/frameworks/pytorch/callbacks/mlrun_logging_callback.py +11 -11
- mlrun/frameworks/pytorch/callbacks/tensorboard_logging_callback.py +23 -29
- mlrun/frameworks/pytorch/callbacks_handler.py +38 -38
- mlrun/frameworks/pytorch/mlrun_interface.py +20 -20
- mlrun/frameworks/pytorch/model_handler.py +17 -17
- mlrun/frameworks/pytorch/model_server.py +7 -7
- mlrun/frameworks/sklearn/__init__.py +13 -13
- mlrun/frameworks/sklearn/estimator.py +4 -4
- mlrun/frameworks/sklearn/metrics_library.py +14 -14
- mlrun/frameworks/sklearn/mlrun_interface.py +16 -9
- mlrun/frameworks/sklearn/model_handler.py +2 -2
- mlrun/frameworks/tf_keras/__init__.py +10 -7
- mlrun/frameworks/tf_keras/callbacks/logging_callback.py +15 -15
- mlrun/frameworks/tf_keras/callbacks/mlrun_logging_callback.py +11 -11
- mlrun/frameworks/tf_keras/callbacks/tensorboard_logging_callback.py +19 -23
- mlrun/frameworks/tf_keras/mlrun_interface.py +9 -11
- mlrun/frameworks/tf_keras/model_handler.py +14 -14
- mlrun/frameworks/tf_keras/model_server.py +6 -6
- mlrun/frameworks/xgboost/__init__.py +13 -13
- mlrun/frameworks/xgboost/model_handler.py +6 -6
- mlrun/k8s_utils.py +61 -17
- mlrun/launcher/__init__.py +1 -1
- mlrun/launcher/base.py +16 -15
- mlrun/launcher/client.py +13 -11
- mlrun/launcher/factory.py +1 -1
- mlrun/launcher/local.py +23 -13
- mlrun/launcher/remote.py +17 -10
- mlrun/lists.py +7 -6
- mlrun/model.py +478 -103
- mlrun/model_monitoring/__init__.py +1 -1
- mlrun/model_monitoring/api.py +163 -371
- mlrun/{runtimes/mpijob/v1alpha1.py → model_monitoring/applications/__init__.py} +9 -15
- mlrun/model_monitoring/applications/_application_steps.py +188 -0
- mlrun/model_monitoring/applications/base.py +108 -0
- mlrun/model_monitoring/applications/context.py +341 -0
- mlrun/model_monitoring/{evidently_application.py → applications/evidently_base.py} +27 -22
- mlrun/model_monitoring/applications/histogram_data_drift.py +354 -0
- mlrun/model_monitoring/applications/results.py +99 -0
- mlrun/model_monitoring/controller.py +131 -278
- mlrun/model_monitoring/db/__init__.py +18 -0
- mlrun/model_monitoring/db/stores/__init__.py +136 -0
- mlrun/model_monitoring/db/stores/base/__init__.py +15 -0
- mlrun/model_monitoring/db/stores/base/store.py +213 -0
- mlrun/model_monitoring/db/stores/sqldb/__init__.py +13 -0
- mlrun/model_monitoring/db/stores/sqldb/models/__init__.py +71 -0
- mlrun/model_monitoring/db/stores/sqldb/models/base.py +190 -0
- mlrun/model_monitoring/db/stores/sqldb/models/mysql.py +103 -0
- mlrun/model_monitoring/{stores/models/mysql.py → db/stores/sqldb/models/sqlite.py} +19 -13
- mlrun/model_monitoring/db/stores/sqldb/sql_store.py +659 -0
- mlrun/model_monitoring/db/stores/v3io_kv/__init__.py +13 -0
- mlrun/model_monitoring/db/stores/v3io_kv/kv_store.py +726 -0
- mlrun/model_monitoring/db/tsdb/__init__.py +105 -0
- mlrun/model_monitoring/db/tsdb/base.py +448 -0
- mlrun/model_monitoring/db/tsdb/helpers.py +30 -0
- mlrun/model_monitoring/db/tsdb/tdengine/__init__.py +15 -0
- mlrun/model_monitoring/db/tsdb/tdengine/schemas.py +279 -0
- mlrun/model_monitoring/db/tsdb/tdengine/stream_graph_steps.py +42 -0
- mlrun/model_monitoring/db/tsdb/tdengine/tdengine_connector.py +507 -0
- mlrun/model_monitoring/db/tsdb/v3io/__init__.py +15 -0
- mlrun/model_monitoring/db/tsdb/v3io/stream_graph_steps.py +158 -0
- mlrun/model_monitoring/db/tsdb/v3io/v3io_connector.py +849 -0
- mlrun/model_monitoring/features_drift_table.py +134 -106
- mlrun/model_monitoring/helpers.py +199 -55
- mlrun/model_monitoring/metrics/__init__.py +13 -0
- mlrun/model_monitoring/metrics/histogram_distance.py +127 -0
- mlrun/model_monitoring/model_endpoint.py +3 -2
- mlrun/model_monitoring/stream_processing.py +131 -398
- mlrun/model_monitoring/tracking_policy.py +9 -2
- mlrun/model_monitoring/writer.py +161 -125
- mlrun/package/__init__.py +6 -6
- mlrun/package/context_handler.py +5 -5
- mlrun/package/packager.py +7 -7
- mlrun/package/packagers/default_packager.py +8 -8
- mlrun/package/packagers/numpy_packagers.py +15 -15
- mlrun/package/packagers/pandas_packagers.py +5 -5
- mlrun/package/packagers/python_standard_library_packagers.py +10 -10
- mlrun/package/packagers_manager.py +19 -23
- mlrun/package/utils/_formatter.py +6 -6
- mlrun/package/utils/_pickler.py +2 -2
- mlrun/package/utils/_supported_format.py +4 -4
- mlrun/package/utils/log_hint_utils.py +2 -2
- mlrun/package/utils/type_hint_utils.py +4 -9
- mlrun/platforms/__init__.py +11 -10
- mlrun/platforms/iguazio.py +24 -203
- mlrun/projects/operations.py +52 -25
- mlrun/projects/pipelines.py +191 -197
- mlrun/projects/project.py +1227 -400
- mlrun/render.py +16 -19
- mlrun/run.py +209 -184
- mlrun/runtimes/__init__.py +83 -15
- mlrun/runtimes/base.py +51 -35
- mlrun/runtimes/daskjob.py +17 -10
- mlrun/runtimes/databricks_job/databricks_cancel_task.py +1 -1
- mlrun/runtimes/databricks_job/databricks_runtime.py +8 -7
- mlrun/runtimes/databricks_job/databricks_wrapper.py +1 -1
- mlrun/runtimes/funcdoc.py +1 -29
- mlrun/runtimes/function_reference.py +1 -1
- mlrun/runtimes/kubejob.py +34 -128
- mlrun/runtimes/local.py +40 -11
- mlrun/runtimes/mpijob/__init__.py +0 -20
- mlrun/runtimes/mpijob/abstract.py +9 -10
- mlrun/runtimes/mpijob/v1.py +1 -1
- mlrun/{model_monitoring/stores/models/sqlite.py → runtimes/nuclio/__init__.py} +7 -9
- mlrun/runtimes/nuclio/api_gateway.py +769 -0
- mlrun/runtimes/nuclio/application/__init__.py +15 -0
- mlrun/runtimes/nuclio/application/application.py +758 -0
- mlrun/runtimes/nuclio/application/reverse_proxy.go +95 -0
- mlrun/runtimes/{function.py → nuclio/function.py} +200 -83
- mlrun/runtimes/{nuclio.py → nuclio/nuclio.py} +6 -6
- mlrun/runtimes/{serving.py → nuclio/serving.py} +65 -68
- mlrun/runtimes/pod.py +281 -101
- mlrun/runtimes/remotesparkjob.py +12 -9
- mlrun/runtimes/sparkjob/spark3job.py +67 -51
- mlrun/runtimes/utils.py +41 -75
- mlrun/secrets.py +9 -5
- mlrun/serving/__init__.py +8 -1
- mlrun/serving/remote.py +2 -7
- mlrun/serving/routers.py +85 -69
- mlrun/serving/server.py +69 -44
- mlrun/serving/states.py +209 -36
- mlrun/serving/utils.py +22 -14
- mlrun/serving/v1_serving.py +6 -7
- mlrun/serving/v2_serving.py +129 -54
- mlrun/track/tracker.py +2 -1
- mlrun/track/tracker_manager.py +3 -3
- mlrun/track/trackers/mlflow_tracker.py +6 -2
- mlrun/utils/async_http.py +6 -8
- mlrun/utils/azure_vault.py +1 -1
- mlrun/utils/clones.py +1 -2
- mlrun/utils/condition_evaluator.py +3 -3
- mlrun/utils/db.py +21 -3
- mlrun/utils/helpers.py +405 -225
- mlrun/utils/http.py +3 -6
- mlrun/utils/logger.py +112 -16
- mlrun/utils/notifications/notification/__init__.py +17 -13
- mlrun/utils/notifications/notification/base.py +50 -2
- mlrun/utils/notifications/notification/console.py +2 -0
- mlrun/utils/notifications/notification/git.py +24 -1
- mlrun/utils/notifications/notification/ipython.py +3 -1
- mlrun/utils/notifications/notification/slack.py +96 -21
- mlrun/utils/notifications/notification/webhook.py +59 -2
- mlrun/utils/notifications/notification_pusher.py +149 -30
- mlrun/utils/regex.py +9 -0
- mlrun/utils/retryer.py +208 -0
- mlrun/utils/singleton.py +1 -1
- mlrun/utils/v3io_clients.py +4 -6
- mlrun/utils/version/version.json +2 -2
- mlrun/utils/version/version.py +2 -6
- mlrun-1.7.0.dist-info/METADATA +378 -0
- mlrun-1.7.0.dist-info/RECORD +351 -0
- {mlrun-1.6.4rc7.dist-info → mlrun-1.7.0.dist-info}/WHEEL +1 -1
- mlrun/feature_store/retrieval/conversion.py +0 -273
- mlrun/kfpops.py +0 -868
- mlrun/model_monitoring/application.py +0 -310
- mlrun/model_monitoring/batch.py +0 -1095
- mlrun/model_monitoring/prometheus.py +0 -219
- mlrun/model_monitoring/stores/__init__.py +0 -111
- mlrun/model_monitoring/stores/kv_model_endpoint_store.py +0 -576
- mlrun/model_monitoring/stores/model_endpoint_store.py +0 -147
- mlrun/model_monitoring/stores/models/__init__.py +0 -27
- mlrun/model_monitoring/stores/models/base.py +0 -84
- mlrun/model_monitoring/stores/sql_model_endpoint_store.py +0 -384
- mlrun/platforms/other.py +0 -306
- mlrun-1.6.4rc7.dist-info/METADATA +0 -272
- mlrun-1.6.4rc7.dist-info/RECORD +0 -314
- {mlrun-1.6.4rc7.dist-info → mlrun-1.7.0.dist-info}/LICENSE +0 -0
- {mlrun-1.6.4rc7.dist-info → mlrun-1.7.0.dist-info}/entry_points.txt +0 -0
- {mlrun-1.6.4rc7.dist-info → mlrun-1.7.0.dist-info}/top_level.txt +0 -0
mlrun/feature_store/retrieval/base.py
CHANGED

@@ -88,6 +88,7 @@ class BaseMerger(abc.ABC):
         update_stats=None,
         query=None,
         order_by=None,
+        additional_filters=None,
     ):
         self._target = target

@@ -134,6 +135,7 @@ class BaseMerger(abc.ABC):
             timestamp_for_filtering=timestamp_for_filtering,
             query=query,
             order_by=order_by,
+            additional_filters=additional_filters,
         )

     def _write_to_offline_target(self, timestamp_key=None):
@@ -186,6 +188,7 @@ class BaseMerger(abc.ABC):
         timestamp_for_filtering=None,
         query=None,
         order_by=None,
+        additional_filters=None,
     ):
         self._create_engine_env()

@@ -212,7 +215,7 @@ class BaseMerger(abc.ABC):
             feature_sets.append(None)
             join_types.append(None)

-        filtered = False
+        timestamp_filtered = False
         for step in join_graph.steps:
             name = step.right_feature_set_name
             feature_set = feature_set_objects[name]
@@ -250,7 +253,7 @@ class BaseMerger(abc.ABC):
             if self._drop_indexes:
                 self._append_drop_column(time_column)
             if (start_time or end_time) and time_column:
-                filtered = True
+                timestamp_filtered = True

             df = self._get_engine_df(
                 feature_set,
@@ -259,6 +262,7 @@ class BaseMerger(abc.ABC):
                 start_time if time_column else None,
                 end_time if time_column else None,
                 time_column,
+                additional_filters,
             )

             fs_entities_and_timestamp = list(feature_set.spec.entities.keys())
@@ -302,8 +306,8 @@ class BaseMerger(abc.ABC):
             new_columns.append((column, alias))
         self._update_alias(dictionary={name: alias for name, alias in new_columns})

-        # None of the feature sets was filtered as required
-        if not filtered and (start_time or end_time):
+        # None of the feature sets was timestamp filtered as required
+        if not timestamp_filtered and (start_time or end_time):
             raise mlrun.errors.MLRunRuntimeError(
                 "start_time and end_time can only be provided in conjunction with "
                 "a timestamp column, or when the at least one feature_set has a timestamp key"
@@ -540,8 +544,8 @@ class BaseMerger(abc.ABC):
         self,
         name: str,
         order: int,
-        left_keys: typing.List[str] = None,
-        right_keys: typing.List[str] = None,
+        left_keys: list[str] = None,
+        right_keys: list[str] = None,
     ):
         self.name = name
         self.left_keys = left_keys if left_keys is not None else []
@@ -750,11 +754,12 @@ class BaseMerger(abc.ABC):
     def _get_engine_df(
         self,
         feature_set: FeatureSet,
-        feature_set_name: typing.List[str],
-        column_names: typing.List[str] = None,
+        feature_set_name: list[str],
+        column_names: list[str] = None,
         start_time: typing.Union[str, datetime] = None,
         end_time: typing.Union[str, datetime] = None,
         time_column: typing.Optional[str] = None,
+        additional_filters=None,
     ):
         """
         Return the feature_set data frame according to the args
@@ -773,8 +778,8 @@ class BaseMerger(abc.ABC):
     def _rename_columns_and_select(
         self,
         df,
-        rename_col_dict: typing.Dict[str, str],
-        columns: typing.List[str] = None,
+        rename_col_dict: dict[str, str],
+        columns: list[str] = None,
     ):
         """
         rename the columns of the df according to rename_col_dict, and select only `columns` if it is not none
@@ -801,7 +806,7 @@ class BaseMerger(abc.ABC):
         """
         raise NotImplementedError

-    def _order_by(self, order_by_active: typing.List[str]):
+    def _order_by(self, order_by_active: list[str]):
         """
         Order by `order_by_active` along all axis.
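The thread running through these retrieval hunks is a new `additional_filters` argument, plumbed from the public request surface down to each engine's `_get_engine_df`. A minimal usage sketch, assuming the argument is exposed through `mlrun.feature_store.get_offline_features` as the merge path above suggests; the vector name, feature reference, and filter tuple are illustrative, with the `(column, operator, value)` tuple format following the pandas/pyarrow filter convention:

    import mlrun.feature_store as fstore

    # illustrative feature vector; "stocks.*" is an assumed feature reference
    vector = fstore.FeatureVector("stocks-vector", ["stocks.*"])

    resp = fstore.get_offline_features(
        vector,
        # extra row-level predicates, applied on top of start/end time filtering
        additional_filters=[("exchange", "=", "NASDAQ")],
    )
    df = resp.to_dataframe()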
mlrun/feature_store/retrieval/dask_merger.py
CHANGED

@@ -145,6 +145,7 @@ class DaskFeatureMerger(BaseMerger):
         start_time=None,
         end_time=None,
         time_column=None,
+        additional_filters=None,
     ):
         import dask.dataframe as dd

@@ -155,6 +156,7 @@ class DaskFeatureMerger(BaseMerger):
             end_time=end_time,
             time_column=time_column,
             index=False,
+            additional_filters=additional_filters,
         )

         return self._reset_index(df).persist()
mlrun/feature_store/retrieval/job.py
CHANGED

@@ -15,6 +15,7 @@
 import uuid

 import mlrun
+import mlrun.common.constants as mlrun_constants
 from mlrun.config import config as mlconf
 from mlrun.model import DataTargetBase, new_task
 from mlrun.runtimes.function_reference import FunctionReference
@@ -42,6 +43,7 @@ def run_merge_job(
     start_time=None,
     end_time=None,
     timestamp_for_filtering=None,
+    additional_filters=None,
 ):
     name = vector.metadata.name
     if not target or not hasattr(target, "to_dict"):
@@ -116,11 +118,14 @@ def run_merge_job(
             "end_time": end_time,
             "timestamp_for_filtering": timestamp_for_filtering,
             "engine_args": engine_args,
+            "additional_filters": additional_filters,
         },
         inputs={"entity_rows": entity_rows} if entity_rows is not None else {},
     )
     task.spec.secret_sources = run_config.secret_sources
-    task.set_label("job-type", "feature-merge").set_label("feature-vector", vector.uri)
+    task.set_label(
+        mlrun_constants.MLRunInternalLabels.job_type, "feature-merge"
+    ).set_label(mlrun_constants.MLRunInternalLabels.feature_vector, vector.uri)
     task.metadata.uid = uuid.uuid4().hex
     vector.status.run_uri = task.metadata.uid
     vector.save()
@@ -151,7 +156,9 @@ class RemoteVectorResponse:

     def _is_ready(self):
         if self.status != "completed":
-            raise mlrun.errors.MLRunTaskNotReady("feature vector dataset is not ready")
+            raise mlrun.errors.MLRunTaskNotReadyError(
+                "feature vector dataset is not ready"
+            )
         self.vector.reload()

     def to_dataframe(self, columns=None, df_module=None, **kwargs):
@@ -176,6 +183,7 @@ class RemoteVectorResponse:
         file_format = kwargs.get("format")
         if not file_format:
             file_format = self.run.status.results["target"]["kind"]
+
         df = mlrun.get_dataitem(self.target_uri).as_df(
             columns=columns, df_module=df_module, format=file_format, **kwargs
         )
@@ -196,7 +204,8 @@ import mlrun.feature_store.retrieval
 from mlrun.datastore.targets import get_target_driver
 def merge_handler(context, vector_uri, target, entity_rows=None,
                   entity_timestamp_column=None, drop_columns=None, with_indexes=None, query=None,
-                  engine_args=None, order_by=None, start_time=None, end_time=None, timestamp_for_filtering=None):
+                  engine_args=None, order_by=None, start_time=None, end_time=None, timestamp_for_filtering=None,
+                  additional_filters=None):
     vector = context.get_store_resource(vector_uri)
     store_target = get_target_driver(target, vector)
     if entity_rows:
@@ -206,7 +215,7 @@ def merge_handler(context, vector_uri, target, entity_rows=None,
     merger = mlrun.feature_store.retrieval.{{{engine}}}(vector, **(engine_args or {}))
     merger.start(entity_rows, entity_timestamp_column, store_target, drop_columns, with_indexes=with_indexes,
                  query=query, order_by=order_by, start_time=start_time, end_time=end_time,
-                 timestamp_for_filtering=timestamp_for_filtering)
+                 timestamp_for_filtering=timestamp_for_filtering, additional_filters=additional_filters)

     target = vector.status.targets[store_target.name].to_dict()
     context.log_result('feature_vector', vector.uri)
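Two client-visible consequences fall out of this file: merge-job labels now come from `mlrun.common.constants.MLRunInternalLabels` rather than string literals, and the not-ready exception carries the renamed `MLRunTaskNotReadyError` class. A polling sketch for a remote merge, where `resp` stands in for the `RemoteVectorResponse` returned by the job:

    import time

    import mlrun.errors

    while True:
        try:
            df = resp.to_dataframe()  # raises while the merge run has not completed
            break
        except mlrun.errors.MLRunTaskNotReadyError:
            time.sleep(5)  # the feature-vector dataset is not ready yet; retry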
mlrun/feature_store/retrieval/local_merger.py
CHANGED

@@ -114,12 +114,14 @@ class LocalFeatureMerger(BaseMerger):
         start_time=None,
         end_time=None,
         time_column=None,
+        additional_filters=None,
     ):
         df = feature_set.to_dataframe(
             columns=column_names,
             start_time=start_time,
             end_time=end_time,
             time_column=time_column,
+            additional_filters=additional_filters,
         )
         if df.index.names[0]:
             df.reset_index(inplace=True)
mlrun/feature_store/retrieval/spark_merger.py
CHANGED

@@ -12,16 +12,17 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #
-import pandas as pd
-import semver
+

 import mlrun
+from mlrun.data_types.to_pandas import spark_df_to_pandas
+from mlrun.datastore.sources import ParquetSource
 from mlrun.datastore.targets import get_offline_target
+from mlrun.runtimes import RemoteSparkRuntime
+from mlrun.runtimes.sparkjob import Spark3Runtime
+from mlrun.utils.helpers import additional_filters_warning

-from ...runtimes import RemoteSparkRuntime
-from ...runtimes.sparkjob import Spark3Runtime
 from .base import BaseMerger
-from .conversion import PandasConversionMixin


 class SparkFeatureMerger(BaseMerger):
@@ -166,29 +167,7 @@ class SparkFeatureMerger(BaseMerger):
     def get_df(self, to_pandas=True):
         if to_pandas:
             if self._pandas_df is None:
-                df = self._result_df
-                # as of pyspark 3.2.3, toPandas fails to convert timestamps unless we work around the issue
-                # when we upgrade pyspark, we should check whether this workaround is still necessary
-                # see https://stackoverflow.com/questions/76389694/transforming-pyspark-to-pandas-dataframe
-                if semver.parse(pd.__version__)["major"] >= 2:
-                    import pyspark.sql.functions as pyspark_functions
-
-                    type_conversion_dict = {}
-                    for field in df.schema.fields:
-                        if str(field.dataType) == "TimestampType":
-                            df = df.withColumn(
-                                field.name,
-                                pyspark_functions.date_format(
-                                    pyspark_functions.to_timestamp(field.name),
-                                    "yyyy-MM-dd'T'HH:mm:ss.SSSSSSSSS",
-                                ),
-                            )
-                            type_conversion_dict[field.name] = "datetime64[ns]"
-                    df = PandasConversionMixin.toPandas(df)
-                    if type_conversion_dict:
-                        df = df.astype(type_conversion_dict)
-                else:
-                    df = PandasConversionMixin.toPandas(df)
+                df = spark_df_to_pandas(self._result_df)
                 self._pandas_df = df
                 self._set_indexes(self._pandas_df)
             return self._pandas_df
@@ -209,9 +188,13 @@ class SparkFeatureMerger(BaseMerger):

         if self.spark is None:
             # create spark context
-            self.spark = SparkSession.builder.appName(
-                f"vector-merger-{self.vector.metadata.name}"
-            ).getOrCreate()
+            self.spark = (
+                SparkSession.builder.appName(
+                    f"vector-merger-{self.vector.metadata.name}"
+                )
+                .config("spark.driver.memory", "2g")
+                .getOrCreate()
+            )

     def _get_engine_df(
         self,
@@ -221,6 +204,7 @@ class SparkFeatureMerger(BaseMerger):
         start_time=None,
         end_time=None,
         time_column=None,
+        additional_filters=None,
     ):
         source_kwargs = {}
         if feature_set.spec.passthrough:
@@ -231,6 +215,7 @@ class SparkFeatureMerger(BaseMerger):
             source_kind = feature_set.spec.source.kind
             source_path = feature_set.spec.source.path
             source_kwargs.update(feature_set.spec.source.attributes)
+            source_kwargs.pop("additional_filters", None)
         else:
             target = get_offline_target(feature_set)
             if not target:
@@ -239,17 +224,24 @@ class SparkFeatureMerger(BaseMerger):
             )
             source_kind = target.kind
             source_path = target.get_target_path()
-
+            source_kwargs = target.source_spark_attributes
         # handling case where there are multiple feature sets and user creates vector where
         # entity_timestamp_column is from a specific feature set (can't be entity timestamp)
         source_driver = mlrun.datastore.sources.source_kind_to_driver[source_kind]

+        if source_driver != ParquetSource:
+            additional_filters_warning(additional_filters, source_driver)
+            additional_filters = None
+        additional_filters_dict = (
+            {"additional_filters": additional_filters} if additional_filters else {}
+        )
         source = source_driver(
             name=self.vector.metadata.name,
             path=source_path,
             time_field=time_column,
             start_time=start_time,
             end_time=end_time,
+            **additional_filters_dict,
             **source_kwargs,
         )
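Note the guard in the last hunk: on the spark engine, `additional_filters` is honored only when the source resolves to a `ParquetSource`; for any other source kind the merger calls `additional_filters_warning` and discards the filters. So a spark merge along these lines (a sketch reusing the illustrative `vector` and filter format from the earlier example) only filters rows when the feature sets are backed by parquet:

    resp = fstore.get_offline_features(
        vector,
        engine="spark",
        # dropped, with a warning, for non-parquet sources
        additional_filters=[("amount", ">", 100)],
    )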
mlrun/feature_store/steps.py
CHANGED

@@ -16,7 +16,7 @@ import math
 import re
 import uuid
 from collections import OrderedDict
-from typing import Any, Dict, List, Optional, Union
+from typing import Any, Optional, Union

 import numpy as np
 import pandas as pd
@@ -92,8 +92,6 @@ class MLRunStep(MapClass):


 class FeaturesetValidator(StepToDict, MLRunStep):
-    """Validate feature values according to the feature set validation policy"""
-
     def __init__(self, featureset=None, columns=None, name=None, **kwargs):
         """Validate feature values according to the feature set validation policy

@@ -152,11 +150,9 @@ class FeaturesetValidator(StepToDict, MLRunStep):


 class MapValues(StepToDict, MLRunStep):
-    """Map column values to new values"""
-
     def __init__(
         self,
-        mapping: Dict[str, Dict[Union[str, int, bool], Any]],
+        mapping: dict[str, dict[Union[str, int, bool], Any]],
         with_original_features: bool = False,
         suffix: str = "mapped",
         **kwargs,
@@ -166,13 +162,19 @@ class MapValues(StepToDict, MLRunStep):
         example::

             # replace the value "U" with '0' in the age column
-            graph.to(MapValues(mapping={'age': {'U': '0'}}, with_original_features=True))
+            graph.to(MapValues(mapping={"age": {"U": "0"}}, with_original_features=True))

             # replace integers, example
-            graph.to(MapValues(mapping={'not': {0: 1, 1: 0}}))
+            graph.to(MapValues(mapping={"not": {0: 1, 1: 0}}))

             # replace by range, use -inf and inf for extended range
-            graph.to(MapValues(mapping={'numbers': {'ranges': {'negative': [-inf, 0], 'positive': [0, inf]}}}))
+            graph.to(
+                MapValues(
+                    mapping={
+                        "numbers": {"ranges": {"negative": [-inf, 0], "positive": [0, inf]}}
+                    }
+                )
+            )

         :param mapping: a dict with entry per column and the associated old/new values map
         :param with_original_features: set to True to keep the original features
@@ -377,7 +379,7 @@ class Imputer(StepToDict, MLRunStep):
         self,
         method: str = "avg",
         default_value=None,
-        mapping: Dict[str, Any] = None,
+        mapping: dict[str, Any] = None,
         **kwargs,
     ):
         """Replace None values with default values
@@ -423,13 +425,15 @@ class Imputer(StepToDict, MLRunStep):


 class OneHotEncoder(StepToDict, MLRunStep):
-    def __init__(self, mapping: Dict[str, List[Union[int, str]]], **kwargs):
+    def __init__(self, mapping: dict[str, list[Union[int, str]]], **kwargs):
         """Create new binary fields, one per category (one hot encoded)

         example::

-            mapping = {'category': ['food', 'health', 'transportation'],
-                       'gender': ['male', 'female']}
+            mapping = {
+                "category": ["food", "health", "transportation"],
+                "gender": ["male", "female"],
+            }
             graph.to(OneHotEncoder(mapping=one_hot_encoder_mapping))

         :param mapping: a dict of per column categories (to map to binary fields)
@@ -510,15 +514,13 @@ class OneHotEncoder(StepToDict, MLRunStep):


 class DateExtractor(StepToDict, MLRunStep):
-    """Date Extractor allows you to extract a date-time component"""
-
     def __init__(
         self,
-        parts: Union[Dict[str, str], List[str]],
+        parts: Union[dict[str, str], list[str]],
         timestamp_col: str = None,
         **kwargs,
     ):
-        """Date Extractor
+        """Date Extractor extracts a date-time component into new columns

         The extracted date part will appear as `<timestamp_col>_<date_part>` feature.

@@ -548,10 +550,12 @@ class DateExtractor(StepToDict, MLRunStep):

             # (taken from the fraud-detection end-to-end feature store demo)
             # Define the Transactions FeatureSet
-            transaction_set = fstore.FeatureSet("transactions",
-                                                entities=[fstore.Entity("source")],
-                                                timestamp_key="timestamp",
-                                                description="transactions feature set")
+            transaction_set = fstore.FeatureSet(
+                "transactions",
+                entities=[fstore.Entity("source")],
+                timestamp_key="timestamp",
+                description="transactions feature set",
+            )

             # Get FeatureSet computation graph
             transaction_graph = transaction_set.graph
@@ -559,11 +563,11 @@ class DateExtractor(StepToDict, MLRunStep):
             # Add the custom `DateExtractor` step
             # to the computation graph
             transaction_graph.to(
-                class_name='DateExtractor',
-                name='Extract Dates',
-                parts=['hour', 'day_of_week'],
-                timestamp_col='timestamp',
-            )
+                class_name="DateExtractor",
+                name="Extract Dates",
+                parts=["hour", "day_of_week"],
+                timestamp_col="timestamp",
+            )

         :param parts: list of pandas style date-time parts you want to extract.
         :param timestamp_col: The name of the column containing the timestamps to extract from,
@@ -629,8 +633,6 @@ class DateExtractor(StepToDict, MLRunStep):


 class SetEventMetadata(MapClass):
-    """Set the event metadata (id and key) from the event body"""
-
     def __init__(
         self,
         id_path: Optional[str] = None,
@@ -695,18 +697,19 @@ class SetEventMetadata(MapClass):


 class DropFeatures(StepToDict, MLRunStep):
-    def __init__(self, features: List[str], **kwargs):
+    def __init__(self, features: list[str], **kwargs):
         """Drop all the features from feature list

         :param features: string list of the features names to drop

         example::

-            feature_set = fstore.FeatureSet("fs-new",
-                                            entities=[fstore.Entity("id")],
-                                            description="feature set",
-                                            engine="pandas",
-                                            )
+            feature_set = fstore.FeatureSet(
+                "fs-new",
+                entities=[fstore.Entity("id")],
+                description="feature set",
+                engine="pandas",
+            )
             # Pre-processing graph steps
             feature_set.graph.to(DropFeatures(features=["age"]))
             df_pandas = feature_set.ingest(data)
@@ -740,3 +743,11 @@ class DropFeatures(StepToDict, MLRunStep):
             raise mlrun.errors.MLRunInvalidArgumentError(
                 f"DropFeatures can only drop features, not entities: {dropped_entities}"
             )
+        if feature_set.spec.label_column in features:
+            raise mlrun.errors.MLRunInvalidArgumentError(
+                f"DropFeatures can not drop label_column: {feature_set.spec.label_column}"
+            )
+        if feature_set.spec.timestamp_key in features:
+            raise mlrun.errors.MLRunInvalidArgumentError(
+                f"DropFeatures can not drop timestamp_key: {feature_set.spec.timestamp_key}"
+            )
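The new tail of the `DropFeatures` validation means a graph step can no longer drop the feature set's label column or timestamp key; the error surfaces early instead of producing a dataset that breaks downstream training. A sketch of the failure mode, with illustrative names:

    import mlrun.feature_store as fstore
    from mlrun.feature_store.steps import DropFeatures

    feature_set = fstore.FeatureSet(
        "fs-new",
        entities=[fstore.Entity("id")],
        timestamp_key="timestamp",
        engine="pandas",
    )
    feature_set.graph.to(DropFeatures(features=["timestamp"]))
    # ingestion is now expected to fail with:
    # MLRunInvalidArgumentError: DropFeatures can not drop timestamp_key: timestamp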
mlrun/features.py
CHANGED

@@ -14,7 +14,7 @@
 #
 import math
 import re
-from typing import Dict, List, Optional, Union
+from typing import Optional, Union

 from .data_types import ValueType, python_type_to_value_type
 from .errors import MLRunRuntimeError, err_to_str
@@ -44,7 +44,7 @@ class Entity(ModelObj):
         name: str = None,
         value_type: Union[ValueType, str] = None,
         description: str = None,
-        labels: Optional[Dict[str, str]] = None,
+        labels: Optional[dict[str, str]] = None,
     ):
         """data entity (index key)

@@ -65,8 +65,6 @@ class Entity(ModelObj):


 class Feature(ModelObj):
-    """data feature"""
-
     _dict_fields = [
         "name",
         "description",
@@ -82,13 +80,13 @@ class Feature(ModelObj):
     def __init__(
         self,
         value_type: Union[ValueType, str] = None,
-        dims: List[int] = None,
+        dims: list[int] = None,
         description: str = None,
         aggregate: bool = None,
         name: str = None,
         validator=None,
         default: str = None,
-        labels: Dict[str, str] = None,
+        labels: dict[str, str] = None,
     ):
         """data feature

@@ -102,7 +100,8 @@ class Feature(ModelObj):
         :param name: name of the feature
         :param validator: feature validation policy
         :param default: default value
-        :param labels: a set of key/value labels (tags)
+        :param labels: a set of key/value labels (tags). Labels can be used to filter features, for example,
+            in the UI Feature store page.
         """
         self.name = name or ""
         if isinstance(value_type, ValueType):
@@ -240,10 +239,7 @@ class Validator(ModelObj):
             from mlrun.features import Validator

             # Add validator to the feature 'bid' with check type
-            quotes_set["bid"].validator = Validator(
-                check_type=True,
-                severity="info"
-            )
+            quotes_set["bid"].validator = Validator(check_type=True, severity="info")

         :param check_type: check feature type e.g. True, False
         :param severity: severity name e.g. info, warning, etc.
@@ -282,10 +278,7 @@ class MinMaxValidator(Validator):

             # Add validator to the feature 'bid', where valid
             # minimal value is 52
-            quotes_set["bid"].validator = MinMaxValidator(
-                min=52,
-                severity="info"
-            )
+            quotes_set["bid"].validator = MinMaxValidator(min=52, severity="info")

         :param check_type: check feature type e.g. True, False
         :param severity: severity name e.g. info, warning, etc.
@@ -346,9 +339,7 @@ class MinMaxLenValidator(Validator):
             # Add length validator to the feature 'ticker', where valid
             # minimal length is 1 and maximal length is 10
             quotes_set["ticker"].validator = MinMaxLenValidator(
-                min=1,
-                max=10,
-                severity="info"
+                min=1, max=10, severity="info"
             )

         :param check_type: check feature type e.g. True, False
@@ -410,8 +401,7 @@ class RegexValidator(Validator):
             # expression '(\b[A-Za-z]{1}[0-9]{7}\b)' where valid values are
             # e.g. A1234567, z9874563, etc.
             quotes_set["name"].validator = RegexValidator(
-                regex=r"(\b[A-Za-z]{1}[0-9]{7}\b)",
-                severity="info"
+                regex=r"(\b[A-Za-z]{1}[0-9]{7}\b)", severity="info"
             )

         :param check_type: check feature type e.g. True, False
@@ -445,7 +435,7 @@ class RegexValidator(Validator):

     @classmethod
     def from_dict(cls, struct=None, fields=None, deprecated_fields: dict = None):
-        new_obj = super(RegexValidator, cls).from_dict(
+        new_obj = super().from_dict(
             struct=struct, fields=fields, deprecated_fields=deprecated_fields
         )
         if hasattr(new_obj, "regex"):
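The signature changes here (and in steps.py and the merger classes above) belong to a package-wide migration from `typing.Dict`/`typing.List` annotations to the PEP 585 builtin generics, which implies the wheel now targets Python 3.9+. Call sites are unaffected, since plain dicts and lists were always accepted at runtime:

    from mlrun.features import Feature

    feature = Feature(
        value_type="float",
        description="bid price",
        labels={"owner": "data-team"},  # annotated as dict[str, str] in 1.7.0
    )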
mlrun/frameworks/_common/artifacts_library.py
CHANGED

@@ -13,7 +13,7 @@
 # limitations under the License.
 #
 from abc import ABC, abstractmethod
-from typing import Dict, List, Type, Union
+from typing import Union

 import mlrun

@@ -39,12 +39,12 @@ class ArtifactsLibrary(ABC):
     @classmethod
     def get_plans(
         cls,
-        artifacts: Union[List[Plan], Dict[str, dict], List[str]] = None,
+        artifacts: Union[list[Plan], dict[str, dict], list[str]] = None,
         context: mlrun.MLClientCtx = None,
         include_default: bool = True,
         # custom_plans: dict = None, :param custom_plans: Custom user plans objects to initialize from.
         **default_kwargs,
-    ) -> List[Plan]:
+    ) -> list[Plan]:
         """
         Get plans for a run. The plans will be taken from the provided artifacts / configuration via code, from provided
         configuration via MLRun context and if the 'include_default' is True, from the framework artifact library's
@@ -97,7 +97,7 @@ class ArtifactsLibrary(ABC):

     @classmethod
     @abstractmethod
-    def default(cls, **kwargs) -> List[Plan]:
+    def default(cls, **kwargs) -> list[Plan]:
         """
         Get the default artifacts plans list of this framework's library.

@@ -106,7 +106,7 @@ class ArtifactsLibrary(ABC):
         pass

     @classmethod
-    def _get_library_plans(cls) -> Dict[str, Type[Plan]]:
+    def _get_library_plans(cls) -> dict[str, type[Plan]]:
         """
         Get all the supported plans in this library.

@@ -120,8 +120,8 @@ class ArtifactsLibrary(ABC):

     @staticmethod
     def _from_dict(
-        requested_plans: Dict[str, dict], available_plans: Dict[str, Type[Plan]]
-    ) -> List[Plan]:
+        requested_plans: dict[str, dict], available_plans: dict[str, type[Plan]]
+    ) -> list[Plan]:
         """
         Initialize a list of plans from a given configuration dictionary. The configuration is expected to be a
         dictionary of plans and their initialization parameters in the following format:
@@ -162,8 +162,8 @@ class ArtifactsLibrary(ABC):

     @staticmethod
     def _from_list(
-        requested_plans: List[str], available_plans: Dict[str, Type[Plan]]
-    ) -> List[Plan]:
+        requested_plans: list[str], available_plans: dict[str, type[Plan]]
+    ) -> list[Plan]:
         """
         Initialize a list of plans from a given configuration list. The configuration is expected to be a list of plans
         names to be initialized with their default configuration.