mlrun 1.7.0rc14__py3-none-any.whl → 1.7.0rc22__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of mlrun might be problematic.

Files changed (160)
  1. mlrun/__init__.py +10 -1
  2. mlrun/__main__.py +23 -111
  3. mlrun/alerts/__init__.py +15 -0
  4. mlrun/alerts/alert.py +169 -0
  5. mlrun/api/schemas/__init__.py +4 -3
  6. mlrun/artifacts/__init__.py +8 -3
  7. mlrun/artifacts/base.py +36 -253
  8. mlrun/artifacts/dataset.py +9 -190
  9. mlrun/artifacts/manager.py +46 -42
  10. mlrun/artifacts/model.py +9 -141
  11. mlrun/artifacts/plots.py +14 -375
  12. mlrun/common/constants.py +65 -3
  13. mlrun/common/formatters/__init__.py +19 -0
  14. mlrun/{runtimes/mpijob/v1alpha1.py → common/formatters/artifact.py} +6 -14
  15. mlrun/common/formatters/base.py +113 -0
  16. mlrun/common/formatters/function.py +46 -0
  17. mlrun/common/formatters/pipeline.py +53 -0
  18. mlrun/common/formatters/project.py +51 -0
  19. mlrun/{runtimes → common/runtimes}/constants.py +32 -4
  20. mlrun/common/schemas/__init__.py +10 -5
  21. mlrun/common/schemas/alert.py +92 -11
  22. mlrun/common/schemas/api_gateway.py +56 -0
  23. mlrun/common/schemas/artifact.py +15 -5
  24. mlrun/common/schemas/auth.py +2 -0
  25. mlrun/common/schemas/client_spec.py +1 -0
  26. mlrun/common/schemas/frontend_spec.py +1 -0
  27. mlrun/common/schemas/function.py +4 -0
  28. mlrun/common/schemas/model_monitoring/__init__.py +15 -3
  29. mlrun/common/schemas/model_monitoring/constants.py +58 -7
  30. mlrun/common/schemas/model_monitoring/grafana.py +9 -5
  31. mlrun/common/schemas/model_monitoring/model_endpoints.py +86 -2
  32. mlrun/common/schemas/pipeline.py +0 -9
  33. mlrun/common/schemas/project.py +5 -11
  34. mlrun/common/types.py +1 -0
  35. mlrun/config.py +30 -9
  36. mlrun/data_types/to_pandas.py +9 -9
  37. mlrun/datastore/base.py +41 -9
  38. mlrun/datastore/datastore.py +6 -2
  39. mlrun/datastore/datastore_profile.py +56 -4
  40. mlrun/datastore/inmem.py +2 -2
  41. mlrun/datastore/redis.py +2 -2
  42. mlrun/datastore/s3.py +5 -0
  43. mlrun/datastore/sources.py +147 -7
  44. mlrun/datastore/store_resources.py +7 -7
  45. mlrun/datastore/targets.py +110 -42
  46. mlrun/datastore/utils.py +42 -0
  47. mlrun/db/base.py +54 -10
  48. mlrun/db/httpdb.py +282 -79
  49. mlrun/db/nopdb.py +52 -10
  50. mlrun/errors.py +11 -0
  51. mlrun/execution.py +26 -9
  52. mlrun/feature_store/__init__.py +0 -2
  53. mlrun/feature_store/api.py +12 -47
  54. mlrun/feature_store/feature_set.py +9 -0
  55. mlrun/feature_store/feature_vector.py +8 -0
  56. mlrun/feature_store/ingestion.py +7 -6
  57. mlrun/feature_store/retrieval/base.py +9 -4
  58. mlrun/feature_store/retrieval/conversion.py +9 -9
  59. mlrun/feature_store/retrieval/dask_merger.py +2 -0
  60. mlrun/feature_store/retrieval/job.py +9 -3
  61. mlrun/feature_store/retrieval/local_merger.py +2 -0
  62. mlrun/feature_store/retrieval/spark_merger.py +16 -0
  63. mlrun/frameworks/__init__.py +6 -0
  64. mlrun/frameworks/_dl_common/loggers/tensorboard_logger.py +7 -12
  65. mlrun/frameworks/parallel_coordinates.py +2 -1
  66. mlrun/frameworks/tf_keras/__init__.py +4 -1
  67. mlrun/k8s_utils.py +10 -11
  68. mlrun/launcher/base.py +4 -3
  69. mlrun/launcher/client.py +5 -3
  70. mlrun/launcher/local.py +12 -2
  71. mlrun/launcher/remote.py +9 -2
  72. mlrun/lists.py +6 -2
  73. mlrun/model.py +47 -21
  74. mlrun/model_monitoring/__init__.py +1 -1
  75. mlrun/model_monitoring/api.py +42 -18
  76. mlrun/model_monitoring/application.py +5 -305
  77. mlrun/model_monitoring/applications/__init__.py +11 -0
  78. mlrun/model_monitoring/applications/_application_steps.py +157 -0
  79. mlrun/model_monitoring/applications/base.py +280 -0
  80. mlrun/model_monitoring/applications/context.py +214 -0
  81. mlrun/model_monitoring/applications/evidently_base.py +211 -0
  82. mlrun/model_monitoring/applications/histogram_data_drift.py +132 -91
  83. mlrun/model_monitoring/applications/results.py +99 -0
  84. mlrun/model_monitoring/controller.py +3 -1
  85. mlrun/model_monitoring/db/__init__.py +2 -0
  86. mlrun/model_monitoring/db/stores/__init__.py +0 -2
  87. mlrun/model_monitoring/db/stores/base/store.py +22 -37
  88. mlrun/model_monitoring/db/stores/sqldb/models/__init__.py +43 -21
  89. mlrun/model_monitoring/db/stores/sqldb/models/base.py +39 -8
  90. mlrun/model_monitoring/db/stores/sqldb/models/mysql.py +27 -7
  91. mlrun/model_monitoring/db/stores/sqldb/models/sqlite.py +5 -0
  92. mlrun/model_monitoring/db/stores/sqldb/sql_store.py +246 -224
  93. mlrun/model_monitoring/db/stores/v3io_kv/kv_store.py +232 -216
  94. mlrun/model_monitoring/db/tsdb/__init__.py +100 -0
  95. mlrun/model_monitoring/db/tsdb/base.py +316 -0
  96. mlrun/model_monitoring/db/tsdb/helpers.py +30 -0
  97. mlrun/model_monitoring/db/tsdb/tdengine/__init__.py +15 -0
  98. mlrun/model_monitoring/db/tsdb/tdengine/schemas.py +240 -0
  99. mlrun/model_monitoring/db/tsdb/tdengine/stream_graph_steps.py +45 -0
  100. mlrun/model_monitoring/db/tsdb/tdengine/tdengine_connector.py +401 -0
  101. mlrun/model_monitoring/db/tsdb/v3io/__init__.py +15 -0
  102. mlrun/model_monitoring/db/tsdb/v3io/stream_graph_steps.py +117 -0
  103. mlrun/model_monitoring/db/tsdb/v3io/v3io_connector.py +658 -0
  104. mlrun/model_monitoring/evidently_application.py +6 -118
  105. mlrun/model_monitoring/helpers.py +63 -1
  106. mlrun/model_monitoring/model_endpoint.py +3 -2
  107. mlrun/model_monitoring/stream_processing.py +57 -216
  108. mlrun/model_monitoring/writer.py +134 -124
  109. mlrun/package/__init__.py +13 -1
  110. mlrun/package/packagers/__init__.py +6 -1
  111. mlrun/package/utils/_formatter.py +2 -2
  112. mlrun/platforms/__init__.py +10 -9
  113. mlrun/platforms/iguazio.py +21 -202
  114. mlrun/projects/operations.py +24 -12
  115. mlrun/projects/pipelines.py +79 -102
  116. mlrun/projects/project.py +271 -103
  117. mlrun/render.py +15 -14
  118. mlrun/run.py +16 -46
  119. mlrun/runtimes/__init__.py +6 -3
  120. mlrun/runtimes/base.py +14 -7
  121. mlrun/runtimes/daskjob.py +1 -0
  122. mlrun/runtimes/databricks_job/databricks_runtime.py +1 -0
  123. mlrun/runtimes/databricks_job/databricks_wrapper.py +1 -1
  124. mlrun/runtimes/funcdoc.py +0 -28
  125. mlrun/runtimes/kubejob.py +2 -1
  126. mlrun/runtimes/local.py +12 -3
  127. mlrun/runtimes/mpijob/__init__.py +0 -20
  128. mlrun/runtimes/mpijob/v1.py +1 -1
  129. mlrun/runtimes/nuclio/api_gateway.py +194 -84
  130. mlrun/runtimes/nuclio/application/application.py +170 -8
  131. mlrun/runtimes/nuclio/function.py +39 -49
  132. mlrun/runtimes/pod.py +16 -36
  133. mlrun/runtimes/remotesparkjob.py +9 -3
  134. mlrun/runtimes/sparkjob/spark3job.py +1 -1
  135. mlrun/runtimes/utils.py +6 -45
  136. mlrun/serving/__init__.py +8 -1
  137. mlrun/serving/server.py +2 -1
  138. mlrun/serving/states.py +51 -8
  139. mlrun/serving/utils.py +19 -11
  140. mlrun/serving/v2_serving.py +5 -1
  141. mlrun/track/tracker.py +2 -1
  142. mlrun/utils/async_http.py +25 -5
  143. mlrun/utils/helpers.py +157 -83
  144. mlrun/utils/logger.py +39 -7
  145. mlrun/utils/notifications/notification/__init__.py +14 -9
  146. mlrun/utils/notifications/notification/base.py +1 -1
  147. mlrun/utils/notifications/notification/slack.py +34 -7
  148. mlrun/utils/notifications/notification/webhook.py +1 -1
  149. mlrun/utils/notifications/notification_pusher.py +147 -16
  150. mlrun/utils/regex.py +9 -0
  151. mlrun/utils/v3io_clients.py +0 -1
  152. mlrun/utils/version/version.json +2 -2
  153. {mlrun-1.7.0rc14.dist-info → mlrun-1.7.0rc22.dist-info}/METADATA +14 -6
  154. {mlrun-1.7.0rc14.dist-info → mlrun-1.7.0rc22.dist-info}/RECORD +158 -138
  155. mlrun/kfpops.py +0 -865
  156. mlrun/platforms/other.py +0 -305
  157. {mlrun-1.7.0rc14.dist-info → mlrun-1.7.0rc22.dist-info}/LICENSE +0 -0
  158. {mlrun-1.7.0rc14.dist-info → mlrun-1.7.0rc22.dist-info}/WHEEL +0 -0
  159. {mlrun-1.7.0rc14.dist-info → mlrun-1.7.0rc22.dist-info}/entry_points.txt +0 -0
  160. {mlrun-1.7.0rc14.dist-info → mlrun-1.7.0rc22.dist-info}/top_level.txt +0 -0
mlrun/db/nopdb.py CHANGED
@@ -16,6 +16,9 @@
 import datetime
 from typing import Optional, Union
 
+import mlrun.alerts
+import mlrun.common.formatters
+import mlrun.common.runtimes.constants
 import mlrun.common.schemas
 import mlrun.errors
 
@@ -79,7 +82,10 @@ class NopDB(RunDBInterface):
         uid: Optional[Union[str, list[str]]] = None,
         project: Optional[str] = None,
         labels: Optional[Union[str, list[str]]] = None,
-        state: Optional[str] = None,
+        state: Optional[
+            mlrun.common.runtimes.constants.RunStates
+        ] = None,  # Backward compatibility
+        states: Optional[list[mlrun.common.runtimes.constants.RunStates]] = None,
         sort: bool = True,
         last: int = 0,
         iter: bool = False,
@@ -128,7 +134,18 @@ class NopDB(RunDBInterface):
     ):
         pass
 
-    def del_artifact(self, key, tag="", project="", tree=None, uid=None):
+    def del_artifact(
+        self,
+        key,
+        tag="",
+        project="",
+        tree=None,
+        uid=None,
+        deletion_strategy: mlrun.common.schemas.artifact.ArtifactsDeletionStrategies = (
+            mlrun.common.schemas.artifact.ArtifactsDeletionStrategies.metadata_only
+        ),
+        secrets: dict = None,
+    ):
         pass
 
     def del_artifacts(self, name="", project="", tag="", labels=None):
@@ -196,7 +213,7 @@ class NopDB(RunDBInterface):
     def list_projects(
         self,
         owner: str = None,
-        format_: mlrun.common.schemas.ProjectsFormat = mlrun.common.schemas.ProjectsFormat.name_only,
+        format_: mlrun.common.formatters.ProjectFormat = mlrun.common.formatters.ProjectFormat.name_only,
         labels: list[str] = None,
         state: mlrun.common.schemas.ProjectState = None,
     ) -> mlrun.common.schemas.ProjectsOutput:
@@ -351,8 +368,8 @@ class NopDB(RunDBInterface):
         namespace: str = None,
         timeout: int = 30,
         format_: Union[
-            str, mlrun.common.schemas.PipelinesFormat
-        ] = mlrun.common.schemas.PipelinesFormat.summary,
+            str, mlrun.common.formatters.PipelineFormat
+        ] = mlrun.common.formatters.PipelineFormat.summary,
         project: str = None,
     ):
         pass
@@ -365,8 +382,8 @@ class NopDB(RunDBInterface):
         page_token: str = "",
         filter_: str = "",
         format_: Union[
-            str, mlrun.common.schemas.PipelinesFormat
-        ] = mlrun.common.schemas.PipelinesFormat.metadata_only,
+            str, mlrun.common.formatters.PipelineFormat
+        ] = mlrun.common.formatters.PipelineFormat.metadata_only,
         page_size: int = None,
     ) -> mlrun.common.schemas.PipelinesOutput:
         pass
@@ -508,8 +525,11 @@ class NopDB(RunDBInterface):
 
     def store_api_gateway(
         self,
-        project: str,
-        api_gateway: mlrun.runtimes.nuclio.APIGateway,
+        api_gateway: Union[
+            mlrun.common.schemas.APIGateway,
+            mlrun.runtimes.nuclio.api_gateway.APIGateway,
+        ],
+        project: str = None,
     ) -> mlrun.common.schemas.APIGateway:
         pass
 
@@ -658,6 +678,22 @@ class NopDB(RunDBInterface):
     ) -> None:
         pass
 
+    def disable_model_monitoring(
+        self,
+        project: str,
+        delete_resources: bool = True,
+        delete_stream_function: bool = False,
+        delete_histogram_data_drift_app: bool = True,
+        delete_user_applications: bool = False,
+        user_application_list: list[str] = None,
+    ) -> bool:
+        pass
+
+    def delete_model_monitoring_function(
+        self, project: str, functions: list[str]
+    ) -> bool:
+        pass
+
     def deploy_histogram_data_drift_app(
         self, project: str, image: str = "mlrun/mlrun"
     ) -> None:
@@ -671,7 +707,7 @@ class NopDB(RunDBInterface):
     def store_alert_config(
         self,
         alert_name: str,
-        alert_data: Union[dict, mlrun.common.schemas.AlertConfig],
+        alert_data: Union[dict, mlrun.alerts.alert.AlertConfig],
         project="",
     ):
         pass
@@ -687,3 +723,9 @@ class NopDB(RunDBInterface):
 
     def reset_alert_config(self, alert_name: str, project=""):
         pass
+
+    def get_alert_template(self, template_name: str):
+        pass
+
+    def list_alert_templates(self):
+        pass
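The `del_artifact` signature above now carries a deletion strategy; this no-op stub only mirrors the run-DB interface. A minimal sketch of how a caller might use it against a real deployment (where `mlrun.get_run_db()` resolves to `HTTPRunDB`); the project and artifact key are illustrative:

    import mlrun
    from mlrun.common.schemas.artifact import ArtifactsDeletionStrategies

    db = mlrun.get_run_db()  # HTTPRunDB on a real deployment; NopDB when no DB is configured
    # delete only the artifact record and keep the underlying data (illustrative names)
    db.del_artifact(
        key="my-model",
        project="my-project",
        deletion_strategy=ArtifactsDeletionStrategies.metadata_only,
    )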
mlrun/errors.py CHANGED
@@ -155,6 +155,10 @@ class MLRunNotFoundError(MLRunHTTPStatusError):
     error_status_code = HTTPStatus.NOT_FOUND.value
 
 
+class MLRunPaginationEndOfResultsError(MLRunNotFoundError):
+    pass
+
+
 class MLRunBadRequestError(MLRunHTTPStatusError):
     error_status_code = HTTPStatus.BAD_REQUEST.value
 
@@ -183,6 +187,10 @@ class MLRunInternalServerError(MLRunHTTPStatusError):
     error_status_code = HTTPStatus.INTERNAL_SERVER_ERROR.value
 
 
+class MLRunNotImplementedServerError(MLRunHTTPStatusError):
+    error_status_code = HTTPStatus.NOT_IMPLEMENTED.value
+
+
 class MLRunServiceUnavailableError(MLRunHTTPStatusError):
     error_status_code = HTTPStatus.SERVICE_UNAVAILABLE.value
 
@@ -234,4 +242,7 @@ STATUS_ERRORS = {
     HTTPStatus.PRECONDITION_FAILED.value: MLRunPreconditionFailedError,
     HTTPStatus.INTERNAL_SERVER_ERROR.value: MLRunInternalServerError,
     HTTPStatus.SERVICE_UNAVAILABLE.value: MLRunServiceUnavailableError,
+    HTTPStatus.NOT_IMPLEMENTED.value: MLRunNotImplementedServerError,
 }
+
+EXPECTED_ERRORS = (MLRunPaginationEndOfResultsError,)
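A minimal sketch of how the additions above behave, based only on the classes and mappings in this diff: HTTP 501 responses now resolve to the new server error through STATUS_ERRORS, and pagination exhaustion is a "not found" subtype that callers can treat as expected rather than as a failure:

    from http import HTTPStatus

    import mlrun.errors

    # 501 responses map to the new exception class
    assert (
        mlrun.errors.STATUS_ERRORS[HTTPStatus.NOT_IMPLEMENTED.value]
        is mlrun.errors.MLRunNotImplementedServerError
    )

    # end-of-pagination is an expected condition, not a failure
    try:
        raise mlrun.errors.MLRunPaginationEndOfResultsError("no more pages")
    except mlrun.errors.EXPECTED_ERRORS:
        pass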
mlrun/execution.py CHANGED
@@ -22,6 +22,7 @@ import yaml
 from dateutil import parser
 
 import mlrun
+import mlrun.common.constants as mlrun_constants
 from mlrun.artifacts import ModelArtifact
 from mlrun.datastore.store_resources import get_store_resource
 from mlrun.errors import MLRunInvalidArgumentError
@@ -110,6 +111,7 @@ class MLClientCtx:
 
         self._project_object = None
         self._allow_empty_resources = None
+        self._reset_on_run = None
 
     def __enter__(self):
         return self
@@ -129,7 +131,9 @@ class MLClientCtx:
     @property
     def tag(self):
         """Run tag (uid or workflow id if exists)"""
-        return self._labels.get("workflow") or self._uid
+        return (
+            self._labels.get(mlrun_constants.MLRunInternalLabels.workflow) or self._uid
+        )
 
     @property
     def state(self):
@@ -329,8 +333,10 @@ class MLClientCtx:
             "uri": uri,
             "owner": get_in(self._labels, "owner"),
         }
-        if "workflow" in self._labels:
-            resp["workflow"] = self._labels["workflow"]
+        if mlrun_constants.MLRunInternalLabels.workflow in self._labels:
+            resp[mlrun_constants.MLRunInternalLabels.workflow] = self._labels[
+                mlrun_constants.MLRunInternalLabels.workflow
+            ]
         return resp
 
     @classmethod
@@ -384,6 +390,7 @@ class MLClientCtx:
         self._state_thresholds = spec.get(
             "state_thresholds", self._state_thresholds
         )
+        self._reset_on_run = spec.get("reset_on_run", self._reset_on_run)
 
         self._init_dbs(rundb)
 
@@ -396,7 +403,7 @@ class MLClientCtx:
             self._set_input(k, v)
 
         if host and not is_api:
-            self.set_label("host", host)
+            self.set_label(mlrun_constants.MLRunInternalLabels.host, host)
 
         start = get_in(attrs, "status.start_time")
         if start:
@@ -990,10 +997,15 @@ class MLClientCtx:
         # If it's a OpenMPI job, get the global rank and compare to the logging rank (worker) set in MLRun's
        # configuration:
         labels = self.labels
-        if "host" in labels and labels.get("kind", "job") == "mpijob":
+        if (
+            mlrun_constants.MLRunInternalLabels.host in labels
+            and labels.get(mlrun_constants.MLRunInternalLabels.kind, "job") == "mpijob"
+        ):
             # The host (pod name) of each worker is created by k8s, and by default it uses the rank number as the id in
             # the following template: ...-worker-<rank>
-            rank = int(labels["host"].rsplit("-", 1)[1])
+            rank = int(
+                labels[mlrun_constants.MLRunInternalLabels.host].rsplit("-", 1)[1]
+            )
             return rank == mlrun.mlconf.packagers.logging_worker
 
         # Single worker is always the logging worker:
@@ -1029,9 +1041,14 @@ class MLClientCtx:
             "status.last_update": to_date_str(self._last_update),
         }
 
-        # completion of runs is not decided by the execution as there may be
-        # multiple executions for a single run (e.g. mpi)
-        if self._state != "completed":
+        # Completion of runs is decided by the API runs monitoring as there may be
+        # multiple executions for a single run (e.g. mpi).
+        # For kinds that are not monitored by the API (local) we allow changing the state.
+        run_kind = self.labels.get(mlrun_constants.MLRunInternalLabels.kind, "")
+        if (
+            mlrun.runtimes.RuntimeKinds.is_local_runtime(run_kind)
+            or self._state != "completed"
+        ):
             struct["status.state"] = self._state
 
         if self.is_logging_worker():
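The `is_logging_worker` change keeps the same rank-from-pod-name logic, only reading the host through the shared label constant. A standalone sketch of that parsing, with an illustrative pod name:

    # OpenMPI worker pods are named "...-worker-<rank>", so the global rank is the final suffix
    host = "train-mpijob-worker-3"  # illustrative value of the "host" label
    rank = int(host.rsplit("-", 1)[1])
    assert rank == 3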
mlrun/feature_store/__init__.py CHANGED
@@ -19,7 +19,6 @@ __all__ = [
     "get_online_feature_service",
     "ingest",
     "preview",
-    "deploy_ingestion_service",
     "deploy_ingestion_service_v2",
     "delete_feature_set",
     "delete_feature_vector",
@@ -41,7 +40,6 @@ from ..features import Entity, Feature
 from .api import (
     delete_feature_set,
     delete_feature_vector,
-    deploy_ingestion_service,
     deploy_ingestion_service_v2,
     get_feature_set,
     get_feature_vector,
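With `deploy_ingestion_service` removed from the public API (its body is dropped from api.py below), callers move to the v2 entry point. A hedged sketch assuming an existing feature set and an HTTP source; per the `_deploy_ingestion_service_v2` return shown below, the call yields the deployment endpoint and the deployed function object (URI is illustrative):

    import mlrun.feature_store as fstore
    from mlrun.datastore.sources import HTTPSource

    fset = fstore.get_feature_set("store://feature-sets/my-project/sales")  # illustrative URI
    endpoint, function = fstore.deploy_ingestion_service_v2(fset, source=HTTPSource())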
mlrun/feature_store/api.py CHANGED
@@ -113,6 +113,7 @@ def get_offline_features(
     order_by: Union[str, list[str]] = None,
     spark_service: str = None,
     timestamp_for_filtering: Union[str, dict[str, str]] = None,
+    additional_filters: list = None,
 ):
     """retrieve offline feature vector results
 
@@ -175,6 +176,13 @@ def get_offline_features(
                                        By default, the filter executes on the timestamp_key of each feature set.
                                        Note: the time filtering is performed on each feature set before the
                                        merge process using start_time and end_time params.
+    :param additional_filters: List of additional_filter conditions as tuples.
+                               Each tuple should be in the format (column_name, operator, value).
+                               Supported operators: "=", ">=", "<=", ">", "<".
+                               Example: [("Product", "=", "Computer")]
+                               For all supported filters, please see:
+                               https://arrow.apache.org/docs/python/generated/pyarrow.parquet.ParquetDataset.html
+
 
     """
     return _get_offline_features(
@@ -194,6 +202,7 @@ def get_offline_features(
         order_by,
         spark_service,
         timestamp_for_filtering,
+        additional_filters,
     )
 
 
@@ -214,6 +223,7 @@ def _get_offline_features(
     order_by: Union[str, list[str]] = None,
     spark_service: str = None,
     timestamp_for_filtering: Union[str, dict[str, str]] = None,
+    additional_filters=None,
 ) -> Union[OfflineVectorResponse, RemoteVectorResponse]:
     if entity_rows is None and entity_timestamp_column is not None:
         raise mlrun.errors.MLRunInvalidArgumentError(
@@ -252,6 +262,7 @@ def _get_offline_features(
             start_time=start_time,
             end_time=end_time,
             timestamp_for_filtering=timestamp_for_filtering,
+            additional_filters=additional_filters,
         )
 
     merger = merger_engine(feature_vector, **(engine_args or {}))
@@ -267,6 +278,7 @@ def _get_offline_features(
         update_stats=update_stats,
         query=query,
         order_by=order_by,
+        additional_filters=additional_filters,
     )
 
 
@@ -1005,53 +1017,6 @@ def _deploy_ingestion_service_v2(
     return function.deploy(), function
 
 
-@deprecated(
-    version="1.5.0",
-    reason="'deploy_ingestion_service' will be removed in 1.7.0, use 'deploy_ingestion_service_v2' instead",
-    category=FutureWarning,
-)
-def deploy_ingestion_service(
-    featureset: Union[FeatureSet, str],
-    source: DataSource = None,
-    targets: list[DataTargetBase] = None,
-    name: str = None,
-    run_config: RunConfig = None,
-    verbose=False,
-) -> str:
-    """Start real-time ingestion service using nuclio function
-
-    Deploy a real-time function implementing feature ingestion pipeline
-    the source maps to Nuclio event triggers (http, kafka, v3io stream, etc.)
-
-    the `run_config` parameter allow specifying the function and job configuration,
-    see: :py:class:`~mlrun.feature_store.RunConfig`
-
-    example::
-
-        source = HTTPSource()
-        func = mlrun.code_to_function("ingest", kind="serving").apply(mount_v3io())
-        config = RunConfig(function=func)
-        my_set.deploy_ingestion_service(source, run_config=config)
-
-    :param featureset:   feature set object or uri
-    :param source:       data source object describing the online or offline source
-    :param targets:      list of data target objects
-    :param name:         name for the job/function
-    :param run_config:   service runtime configuration (function object/uri, resources, etc..)
-    :param verbose:      verbose log
-
-    :return: URL to access the deployed ingestion service
-    """
-    endpoint, _ = featureset.deploy_ingestion_service(
-        source=source,
-        targets=targets,
-        name=name,
-        run_config=run_config,
-        verbose=verbose,
-    )
-    return endpoint
-
-
 def _ingest_with_spark(
     spark=None,
     featureset: Union[FeatureSet, str] = None,
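The new `additional_filters` argument threads from `get_offline_features` down to the merger engines. A minimal usage sketch; the feature-vector URI is illustrative, and against a real project the call returns an `OfflineVectorResponse`:

    import mlrun.feature_store as fstore

    resp = fstore.get_offline_features(
        "store://feature-vectors/my-project/sales-vector",  # illustrative URI
        additional_filters=[("Product", "=", "Computer")],  # (column, operator, value) tuples
    )
    df = resp.to_dataframe()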
mlrun/feature_store/feature_set.py CHANGED
@@ -917,6 +917,7 @@ class FeatureSet(ModelObj):
         start_time=None,
         end_time=None,
         time_column=None,
+        additional_filters=None,
         **kwargs,
     ):
         """return featureset (offline) data as dataframe
@@ -928,6 +929,12 @@ class FeatureSet(ModelObj):
         :param end_time:    filter by end time
         :param time_column: specify the time column name in the file
         :param kwargs:      additional reader (csv, parquet, ..) args
+        :param additional_filters: List of additional_filter conditions as tuples.
+                                   Each tuple should be in the format (column_name, operator, value).
+                                   Supported operators: "=", ">=", "<=", ">", "<".
+                                   Example: [("Product", "=", "Computer")]
+                                   For all supported filters, please see:
+                                   https://arrow.apache.org/docs/python/generated/pyarrow.parquet.ParquetDataset.html
         :return: DataFrame
         """
         entities = list(self.spec.entities.keys())
@@ -946,6 +953,7 @@ class FeatureSet(ModelObj):
                 start_time=start_time,
                 end_time=end_time,
                 time_field=time_column,
+                additional_filters=additional_filters,
                 **kwargs,
             )
             # to_dataframe() can sometimes return an iterator of dataframes instead of one dataframe
@@ -965,6 +973,7 @@ class FeatureSet(ModelObj):
             start_time=start_time,
             end_time=end_time,
             time_column=time_column,
+            additional_filters=additional_filters,
             **kwargs,
         )
         return result
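`FeatureSet.to_dataframe` accepts the same filter tuples directly; a short sketch with illustrative names, assuming the feature set already exists:

    import mlrun.feature_store as fstore

    fset = fstore.get_feature_set("store://feature-sets/my-project/sales")  # illustrative URI
    df = fset.to_dataframe(
        columns=["Product", "Price"],
        additional_filters=[("Price", ">=", 100)],  # pushed down to the offline reader
    )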
mlrun/feature_store/feature_vector.py CHANGED
@@ -741,6 +741,7 @@ class FeatureVector(ModelObj):
         order_by: Union[str, list[str]] = None,
         spark_service: str = None,
         timestamp_for_filtering: Union[str, dict[str, str]] = None,
+        additional_filters: list = None,
     ):
         """retrieve offline feature vector results
 
@@ -797,6 +798,12 @@ class FeatureVector(ModelObj):
                                            By default, the filter executes on the timestamp_key of each feature set.
                                            Note: the time filtering is performed on each feature set before the
                                            merge process using start_time and end_time params.
+        :param additional_filters: List of additional_filter conditions as tuples.
+                                   Each tuple should be in the format (column_name, operator, value).
+                                   Supported operators: "=", ">=", "<=", ">", "<".
+                                   Example: [("Product", "=", "Computer")]
+                                   For all supported filters, please see:
+                                   https://arrow.apache.org/docs/python/generated/pyarrow.parquet.ParquetDataset.html
 
         """
 
@@ -817,6 +824,7 @@ class FeatureVector(ModelObj):
             order_by,
             spark_service,
             timestamp_for_filtering,
+            additional_filters,
         )
 
     def get_online_feature_service(
mlrun/feature_store/ingestion.py CHANGED
@@ -17,6 +17,7 @@ import uuid
 import pandas as pd
 
 import mlrun
+import mlrun.common.constants as mlrun_constants
 from mlrun.datastore.sources import get_source_from_dict, get_source_step
 from mlrun.datastore.targets import (
     add_target_steps,
@@ -263,13 +264,13 @@ def run_ingestion_job(name, featureset, run_config, schedule=None, spark_service
         out_path=featureset.spec.output_path,
     )
     task.spec.secret_sources = run_config.secret_sources
-    task.set_label("job-type", "feature-ingest").set_label(
-        "feature-set", featureset.uri
-    )
+    task.set_label(
+        mlrun_constants.MLRunInternalLabels.job_type, "feature-ingest"
+    ).set_label("feature-set", featureset.uri)
     if run_config.owner:
-        task.set_label("owner", run_config.owner).set_label(
-            "v3io_user", run_config.owner
-        )
+        task.set_label(
+            mlrun_constants.MLRunInternalLabels.owner, run_config.owner
+        ).set_label(mlrun_constants.MLRunInternalLabels.v3io_user, run_config.owner)
 
     # set run UID and save in the feature set status (linking the features et to the job)
     task.metadata.uid = uuid.uuid4().hex
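The label keys that were previously raw strings now come from the shared constants module. A tiny sketch of the same pattern outside the ingestion job (the task name is illustrative):

    import mlrun
    import mlrun.common.constants as mlrun_constants

    task = mlrun.new_task("ingest-sales")  # illustrative task
    task.set_label(mlrun_constants.MLRunInternalLabels.job_type, "feature-ingest")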
mlrun/feature_store/retrieval/base.py CHANGED
@@ -88,6 +88,7 @@ class BaseMerger(abc.ABC):
         update_stats=None,
         query=None,
         order_by=None,
+        additional_filters=None,
     ):
         self._target = target
 
@@ -134,6 +135,7 @@ class BaseMerger(abc.ABC):
             timestamp_for_filtering=timestamp_for_filtering,
             query=query,
             order_by=order_by,
+            additional_filters=additional_filters,
         )
 
     def _write_to_offline_target(self, timestamp_key=None):
@@ -186,6 +188,7 @@ class BaseMerger(abc.ABC):
         timestamp_for_filtering=None,
         query=None,
         order_by=None,
+        additional_filters=None,
     ):
         self._create_engine_env()
 
@@ -212,7 +215,7 @@ class BaseMerger(abc.ABC):
                 feature_sets.append(None)
                 join_types.append(None)
 
-        filtered = False
+        timestamp_filtered = False
         for step in join_graph.steps:
             name = step.right_feature_set_name
             feature_set = feature_set_objects[name]
@@ -250,7 +253,7 @@ class BaseMerger(abc.ABC):
             if self._drop_indexes:
                 self._append_drop_column(time_column)
             if (start_time or end_time) and time_column:
-                filtered = True
+                timestamp_filtered = True
 
             df = self._get_engine_df(
                 feature_set,
@@ -259,6 +262,7 @@ class BaseMerger(abc.ABC):
                 start_time if time_column else None,
                 end_time if time_column else None,
                 time_column,
+                additional_filters,
             )
 
             fs_entities_and_timestamp = list(feature_set.spec.entities.keys())
@@ -302,8 +306,8 @@ class BaseMerger(abc.ABC):
                 new_columns.append((column, alias))
             self._update_alias(dictionary={name: alias for name, alias in new_columns})
 
-        # None of the feature sets was filtered as required
-        if not filtered and (start_time or end_time):
+        # None of the feature sets was timestamp filtered as required
+        if not timestamp_filtered and (start_time or end_time):
             raise mlrun.errors.MLRunRuntimeError(
                 "start_time and end_time can only be provided in conjunction with "
                 "a timestamp column, or when the at least one feature_set has a timestamp key"
@@ -755,6 +759,7 @@ class BaseMerger(abc.ABC):
         start_time: typing.Union[str, datetime] = None,
         end_time: typing.Union[str, datetime] = None,
         time_column: typing.Optional[str] = None,
+        additional_filters=None,
     ):
         """
         Return the feature_set data frame according to the args
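As the docstrings above note, the filter tuples follow the pyarrow.parquet filter convention. This standalone sketch (illustrative file path, independent of MLRun) shows what such a filter does at read time:

    import pandas as pd
    import pyarrow.parquet as pq

    pd.DataFrame({"Product": ["Computer", "Desk"], "Price": [1200, 300]}).to_parquet(
        "/tmp/sales.parquet"  # illustrative path
    )
    table = pq.read_table("/tmp/sales.parquet", filters=[("Product", "=", "Computer")])
    print(table.to_pandas())  # only the "Computer" row survives the filter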
mlrun/feature_store/retrieval/conversion.py CHANGED
@@ -79,10 +79,10 @@ class PandasConversionMixin:
                 msg = (
                     "toPandas attempted Arrow optimization because "
                     "'spark.sql.execution.arrow.pyspark.enabled' is set to true; however, "
-                    "failed by the reason below:\n %s\n"
+                    f"failed by the reason below:\n {e}\n"
                     "Attempting non-optimization as "
                     "'spark.sql.execution.arrow.pyspark.fallback.enabled' is set to "
-                    "true." % str(e)
+                    "true."
                 )
                 warnings.warn(msg)
                 use_arrow = False
@@ -92,7 +92,7 @@ class PandasConversionMixin:
                     "'spark.sql.execution.arrow.pyspark.enabled' is set to true, but has "
                     "reached the error below and will not continue because automatic fallback "
                     "with 'spark.sql.execution.arrow.pyspark.fallback.enabled' has been set to "
-                    "false.\n %s" % str(e)
+                    f"false.\n {e}"
                 )
                 warnings.warn(msg)
                 raise
@@ -158,7 +158,7 @@ class PandasConversionMixin:
                     "reached the error below and can not continue. Note that "
                     "'spark.sql.execution.arrow.pyspark.fallback.enabled' does not have an "
                     "effect on failures in the middle of "
-                    "computation.\n %s" % str(e)
+                    f"computation.\n {e}"
                 )
                 warnings.warn(msg)
                 raise
@@ -168,10 +168,10 @@ class PandasConversionMixin:
         column_counter = Counter(self.columns)
 
         dtype = [None] * len(self.schema)
-        for fieldIdx, field in enumerate(self.schema):
+        for field_idx, field in enumerate(self.schema):
             # For duplicate column name, we use `iloc` to access it.
             if column_counter[field.name] > 1:
-                pandas_col = pdf.iloc[:, fieldIdx]
+                pandas_col = pdf.iloc[:, field_idx]
             else:
                 pandas_col = pdf[field.name]
 
@@ -187,12 +187,12 @@ class PandasConversionMixin:
                 and field.nullable
                 and pandas_col.isnull().any()
             ):
-                dtype[fieldIdx] = pandas_type
+                dtype[field_idx] = pandas_type
             # Ensure we fall back to nullable numpy types, even when whole column is null:
             if isinstance(field.dataType, IntegralType) and pandas_col.isnull().any():
-                dtype[fieldIdx] = np.float64
+                dtype[field_idx] = np.float64
             if isinstance(field.dataType, BooleanType) and pandas_col.isnull().any():
-                dtype[fieldIdx] = object
+                dtype[field_idx] = object
 
         df = pd.DataFrame()
         for index, t in enumerate(dtype):
mlrun/feature_store/retrieval/dask_merger.py CHANGED
@@ -145,6 +145,7 @@ class DaskFeatureMerger(BaseMerger):
         start_time=None,
         end_time=None,
         time_column=None,
+        additional_filters=None,
     ):
         import dask.dataframe as dd
 
@@ -155,6 +156,7 @@ class DaskFeatureMerger(BaseMerger):
             end_time=end_time,
             time_column=time_column,
             index=False,
+            additional_filters=additional_filters,
         )
 
         return self._reset_index(df).persist()
mlrun/feature_store/retrieval/job.py CHANGED
@@ -15,6 +15,7 @@
 import uuid
 
 import mlrun
+import mlrun.common.constants as mlrun_constants
 from mlrun.config import config as mlconf
 from mlrun.model import DataTargetBase, new_task
 from mlrun.runtimes.function_reference import FunctionReference
@@ -42,6 +43,7 @@ def run_merge_job(
     start_time=None,
     end_time=None,
     timestamp_for_filtering=None,
+    additional_filters=None,
 ):
     name = vector.metadata.name
     if not target or not hasattr(target, "to_dict"):
@@ -116,11 +118,14 @@ def run_merge_job(
             "end_time": end_time,
             "timestamp_for_filtering": timestamp_for_filtering,
             "engine_args": engine_args,
+            "additional_filters": additional_filters,
         },
         inputs={"entity_rows": entity_rows} if entity_rows is not None else {},
     )
     task.spec.secret_sources = run_config.secret_sources
-    task.set_label("job-type", "feature-merge").set_label("feature-vector", vector.uri)
+    task.set_label(
+        mlrun_constants.MLRunInternalLabels.job_type, "feature-merge"
+    ).set_label(mlrun_constants.MLRunInternalLabels.feature_vector, vector.uri)
     task.metadata.uid = uuid.uuid4().hex
     vector.status.run_uri = task.metadata.uid
     vector.save()
@@ -196,7 +201,8 @@ import mlrun.feature_store.retrieval
 from mlrun.datastore.targets import get_target_driver
 def merge_handler(context, vector_uri, target, entity_rows=None,
                   entity_timestamp_column=None, drop_columns=None, with_indexes=None, query=None,
-                  engine_args=None, order_by=None, start_time=None, end_time=None, timestamp_for_filtering=None):
+                  engine_args=None, order_by=None, start_time=None, end_time=None, timestamp_for_filtering=None,
+                  additional_filters=None):
     vector = context.get_store_resource(vector_uri)
     store_target = get_target_driver(target, vector)
     if entity_rows:
@@ -206,7 +212,7 @@ def merge_handler(context, vector_uri, target, entity_rows=None,
     merger = mlrun.feature_store.retrieval.{{{engine}}}(vector, **(engine_args or {}))
     merger.start(entity_rows, entity_timestamp_column, store_target, drop_columns, with_indexes=with_indexes,
                  query=query, order_by=order_by, start_time=start_time, end_time=end_time,
-                 timestamp_for_filtering=timestamp_for_filtering)
+                 timestamp_for_filtering=timestamp_for_filtering, additional_filters=additional_filters)
 
     target = vector.status.targets[store_target.name].to_dict()
     context.log_result('feature_vector', vector.uri)
mlrun/feature_store/retrieval/local_merger.py CHANGED
@@ -114,12 +114,14 @@ class LocalFeatureMerger(BaseMerger):
         start_time=None,
         end_time=None,
         time_column=None,
+        additional_filters=None,
     ):
         df = feature_set.to_dataframe(
             columns=column_names,
             start_time=start_time,
             end_time=end_time,
             time_column=time_column,
+            additional_filters=additional_filters,
         )
         if df.index.names[0]:
             df.reset_index(inplace=True)