mlrun 1.6.4rc2__py3-none-any.whl → 1.7.0rc20__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mlrun/__init__.py +11 -1
- mlrun/__main__.py +26 -112
- mlrun/alerts/__init__.py +15 -0
- mlrun/alerts/alert.py +144 -0
- mlrun/api/schemas/__init__.py +5 -4
- mlrun/artifacts/__init__.py +8 -3
- mlrun/artifacts/base.py +46 -257
- mlrun/artifacts/dataset.py +11 -192
- mlrun/artifacts/manager.py +47 -48
- mlrun/artifacts/model.py +31 -159
- mlrun/artifacts/plots.py +23 -380
- mlrun/common/constants.py +69 -0
- mlrun/common/db/sql_session.py +2 -3
- mlrun/common/formatters/__init__.py +19 -0
- mlrun/common/formatters/artifact.py +21 -0
- mlrun/common/formatters/base.py +78 -0
- mlrun/common/formatters/function.py +41 -0
- mlrun/common/formatters/pipeline.py +53 -0
- mlrun/common/formatters/project.py +51 -0
- mlrun/common/helpers.py +1 -2
- mlrun/common/model_monitoring/helpers.py +9 -5
- mlrun/{runtimes → common/runtimes}/constants.py +37 -9
- mlrun/common/schemas/__init__.py +24 -4
- mlrun/common/schemas/alert.py +203 -0
- mlrun/common/schemas/api_gateway.py +148 -0
- mlrun/common/schemas/artifact.py +18 -8
- mlrun/common/schemas/auth.py +11 -5
- mlrun/common/schemas/background_task.py +1 -1
- mlrun/common/schemas/client_spec.py +4 -1
- mlrun/common/schemas/feature_store.py +16 -16
- mlrun/common/schemas/frontend_spec.py +8 -7
- mlrun/common/schemas/function.py +5 -1
- mlrun/common/schemas/hub.py +11 -18
- mlrun/common/schemas/memory_reports.py +2 -2
- mlrun/common/schemas/model_monitoring/__init__.py +18 -3
- mlrun/common/schemas/model_monitoring/constants.py +83 -26
- mlrun/common/schemas/model_monitoring/grafana.py +13 -9
- mlrun/common/schemas/model_monitoring/model_endpoints.py +99 -16
- mlrun/common/schemas/notification.py +4 -4
- mlrun/common/schemas/object.py +2 -2
- mlrun/{runtimes/mpijob/v1alpha1.py → common/schemas/pagination.py} +10 -13
- mlrun/common/schemas/pipeline.py +1 -10
- mlrun/common/schemas/project.py +24 -23
- mlrun/common/schemas/runtime_resource.py +8 -12
- mlrun/common/schemas/schedule.py +3 -3
- mlrun/common/schemas/tag.py +1 -2
- mlrun/common/schemas/workflow.py +2 -2
- mlrun/common/types.py +7 -1
- mlrun/config.py +54 -17
- mlrun/data_types/to_pandas.py +10 -12
- mlrun/datastore/__init__.py +5 -8
- mlrun/datastore/alibaba_oss.py +130 -0
- mlrun/datastore/azure_blob.py +17 -5
- mlrun/datastore/base.py +62 -39
- mlrun/datastore/datastore.py +28 -9
- mlrun/datastore/datastore_profile.py +146 -20
- mlrun/datastore/filestore.py +0 -1
- mlrun/datastore/google_cloud_storage.py +6 -2
- mlrun/datastore/hdfs.py +56 -0
- mlrun/datastore/inmem.py +2 -2
- mlrun/datastore/redis.py +6 -2
- mlrun/datastore/s3.py +9 -0
- mlrun/datastore/snowflake_utils.py +43 -0
- mlrun/datastore/sources.py +201 -96
- mlrun/datastore/spark_utils.py +1 -2
- mlrun/datastore/store_resources.py +7 -7
- mlrun/datastore/targets.py +358 -104
- mlrun/datastore/utils.py +72 -58
- mlrun/datastore/v3io.py +5 -1
- mlrun/db/base.py +185 -35
- mlrun/db/factory.py +1 -1
- mlrun/db/httpdb.py +614 -179
- mlrun/db/nopdb.py +210 -26
- mlrun/errors.py +12 -1
- mlrun/execution.py +41 -24
- mlrun/feature_store/__init__.py +0 -2
- mlrun/feature_store/api.py +40 -72
- mlrun/feature_store/common.py +1 -1
- mlrun/feature_store/feature_set.py +76 -55
- mlrun/feature_store/feature_vector.py +28 -30
- mlrun/feature_store/ingestion.py +7 -6
- mlrun/feature_store/retrieval/base.py +16 -11
- mlrun/feature_store/retrieval/conversion.py +11 -13
- mlrun/feature_store/retrieval/dask_merger.py +2 -0
- mlrun/feature_store/retrieval/job.py +9 -3
- mlrun/feature_store/retrieval/local_merger.py +2 -0
- mlrun/feature_store/retrieval/spark_merger.py +34 -24
- mlrun/feature_store/steps.py +37 -34
- mlrun/features.py +9 -20
- mlrun/frameworks/_common/artifacts_library.py +9 -9
- mlrun/frameworks/_common/mlrun_interface.py +5 -5
- mlrun/frameworks/_common/model_handler.py +48 -48
- mlrun/frameworks/_common/plan.py +2 -3
- mlrun/frameworks/_common/producer.py +3 -4
- mlrun/frameworks/_common/utils.py +5 -5
- mlrun/frameworks/_dl_common/loggers/logger.py +6 -7
- mlrun/frameworks/_dl_common/loggers/mlrun_logger.py +9 -9
- mlrun/frameworks/_dl_common/loggers/tensorboard_logger.py +23 -47
- mlrun/frameworks/_ml_common/artifacts_library.py +1 -2
- mlrun/frameworks/_ml_common/loggers/logger.py +3 -4
- mlrun/frameworks/_ml_common/loggers/mlrun_logger.py +4 -5
- mlrun/frameworks/_ml_common/model_handler.py +24 -24
- mlrun/frameworks/_ml_common/pkl_model_server.py +2 -2
- mlrun/frameworks/_ml_common/plan.py +1 -1
- mlrun/frameworks/_ml_common/plans/calibration_curve_plan.py +2 -3
- mlrun/frameworks/_ml_common/plans/confusion_matrix_plan.py +2 -3
- mlrun/frameworks/_ml_common/plans/dataset_plan.py +3 -3
- mlrun/frameworks/_ml_common/plans/feature_importance_plan.py +3 -3
- mlrun/frameworks/_ml_common/plans/roc_curve_plan.py +4 -4
- mlrun/frameworks/_ml_common/utils.py +4 -4
- mlrun/frameworks/auto_mlrun/auto_mlrun.py +9 -9
- mlrun/frameworks/huggingface/model_server.py +4 -4
- mlrun/frameworks/lgbm/__init__.py +33 -33
- mlrun/frameworks/lgbm/callbacks/callback.py +2 -4
- mlrun/frameworks/lgbm/callbacks/logging_callback.py +4 -5
- mlrun/frameworks/lgbm/callbacks/mlrun_logging_callback.py +4 -5
- mlrun/frameworks/lgbm/mlrun_interfaces/booster_mlrun_interface.py +1 -3
- mlrun/frameworks/lgbm/mlrun_interfaces/mlrun_interface.py +6 -6
- mlrun/frameworks/lgbm/model_handler.py +10 -10
- mlrun/frameworks/lgbm/model_server.py +6 -6
- mlrun/frameworks/lgbm/utils.py +5 -5
- mlrun/frameworks/onnx/dataset.py +8 -8
- mlrun/frameworks/onnx/mlrun_interface.py +3 -3
- mlrun/frameworks/onnx/model_handler.py +6 -6
- mlrun/frameworks/onnx/model_server.py +7 -7
- mlrun/frameworks/parallel_coordinates.py +4 -3
- mlrun/frameworks/pytorch/__init__.py +18 -18
- mlrun/frameworks/pytorch/callbacks/callback.py +4 -5
- mlrun/frameworks/pytorch/callbacks/logging_callback.py +17 -17
- mlrun/frameworks/pytorch/callbacks/mlrun_logging_callback.py +11 -11
- mlrun/frameworks/pytorch/callbacks/tensorboard_logging_callback.py +23 -29
- mlrun/frameworks/pytorch/callbacks_handler.py +38 -38
- mlrun/frameworks/pytorch/mlrun_interface.py +20 -20
- mlrun/frameworks/pytorch/model_handler.py +17 -17
- mlrun/frameworks/pytorch/model_server.py +7 -7
- mlrun/frameworks/sklearn/__init__.py +13 -13
- mlrun/frameworks/sklearn/estimator.py +4 -4
- mlrun/frameworks/sklearn/metrics_library.py +14 -14
- mlrun/frameworks/sklearn/mlrun_interface.py +3 -6
- mlrun/frameworks/sklearn/model_handler.py +2 -2
- mlrun/frameworks/tf_keras/__init__.py +10 -7
- mlrun/frameworks/tf_keras/callbacks/logging_callback.py +15 -15
- mlrun/frameworks/tf_keras/callbacks/mlrun_logging_callback.py +11 -11
- mlrun/frameworks/tf_keras/callbacks/tensorboard_logging_callback.py +19 -23
- mlrun/frameworks/tf_keras/mlrun_interface.py +9 -11
- mlrun/frameworks/tf_keras/model_handler.py +14 -14
- mlrun/frameworks/tf_keras/model_server.py +6 -6
- mlrun/frameworks/xgboost/__init__.py +13 -13
- mlrun/frameworks/xgboost/model_handler.py +6 -6
- mlrun/k8s_utils.py +14 -16
- mlrun/launcher/__init__.py +1 -1
- mlrun/launcher/base.py +16 -15
- mlrun/launcher/client.py +8 -6
- mlrun/launcher/factory.py +1 -1
- mlrun/launcher/local.py +17 -11
- mlrun/launcher/remote.py +16 -10
- mlrun/lists.py +7 -6
- mlrun/model.py +238 -73
- mlrun/model_monitoring/__init__.py +1 -1
- mlrun/model_monitoring/api.py +138 -315
- mlrun/model_monitoring/application.py +5 -296
- mlrun/model_monitoring/applications/__init__.py +24 -0
- mlrun/model_monitoring/applications/_application_steps.py +157 -0
- mlrun/model_monitoring/applications/base.py +282 -0
- mlrun/model_monitoring/applications/context.py +214 -0
- mlrun/model_monitoring/applications/evidently_base.py +211 -0
- mlrun/model_monitoring/applications/histogram_data_drift.py +349 -0
- mlrun/model_monitoring/applications/results.py +99 -0
- mlrun/model_monitoring/controller.py +104 -84
- mlrun/model_monitoring/controller_handler.py +13 -5
- mlrun/model_monitoring/db/__init__.py +18 -0
- mlrun/model_monitoring/{stores → db/stores}/__init__.py +43 -36
- mlrun/model_monitoring/db/stores/base/__init__.py +15 -0
- mlrun/model_monitoring/{stores/model_endpoint_store.py → db/stores/base/store.py} +64 -40
- mlrun/model_monitoring/db/stores/sqldb/__init__.py +13 -0
- mlrun/model_monitoring/db/stores/sqldb/models/__init__.py +71 -0
- mlrun/model_monitoring/{stores → db/stores/sqldb}/models/base.py +109 -5
- mlrun/model_monitoring/db/stores/sqldb/models/mysql.py +88 -0
- mlrun/model_monitoring/{stores/models/mysql.py → db/stores/sqldb/models/sqlite.py} +19 -13
- mlrun/model_monitoring/db/stores/sqldb/sql_store.py +684 -0
- mlrun/model_monitoring/db/stores/v3io_kv/__init__.py +13 -0
- mlrun/model_monitoring/{stores/kv_model_endpoint_store.py → db/stores/v3io_kv/kv_store.py} +310 -165
- mlrun/model_monitoring/db/tsdb/__init__.py +100 -0
- mlrun/model_monitoring/db/tsdb/base.py +329 -0
- mlrun/model_monitoring/db/tsdb/helpers.py +30 -0
- mlrun/model_monitoring/db/tsdb/tdengine/__init__.py +15 -0
- mlrun/model_monitoring/db/tsdb/tdengine/schemas.py +240 -0
- mlrun/model_monitoring/db/tsdb/tdengine/stream_graph_steps.py +45 -0
- mlrun/model_monitoring/db/tsdb/tdengine/tdengine_connector.py +397 -0
- mlrun/model_monitoring/db/tsdb/v3io/__init__.py +15 -0
- mlrun/model_monitoring/db/tsdb/v3io/stream_graph_steps.py +117 -0
- mlrun/model_monitoring/db/tsdb/v3io/v3io_connector.py +630 -0
- mlrun/model_monitoring/evidently_application.py +6 -118
- mlrun/model_monitoring/features_drift_table.py +134 -106
- mlrun/model_monitoring/helpers.py +127 -28
- mlrun/model_monitoring/metrics/__init__.py +13 -0
- mlrun/model_monitoring/metrics/histogram_distance.py +127 -0
- mlrun/model_monitoring/model_endpoint.py +3 -2
- mlrun/model_monitoring/prometheus.py +1 -4
- mlrun/model_monitoring/stream_processing.py +62 -231
- mlrun/model_monitoring/tracking_policy.py +9 -2
- mlrun/model_monitoring/writer.py +152 -124
- mlrun/package/__init__.py +6 -6
- mlrun/package/context_handler.py +5 -5
- mlrun/package/packager.py +7 -7
- mlrun/package/packagers/default_packager.py +6 -6
- mlrun/package/packagers/numpy_packagers.py +15 -15
- mlrun/package/packagers/pandas_packagers.py +5 -5
- mlrun/package/packagers/python_standard_library_packagers.py +10 -10
- mlrun/package/packagers_manager.py +19 -23
- mlrun/package/utils/_formatter.py +6 -6
- mlrun/package/utils/_pickler.py +2 -2
- mlrun/package/utils/_supported_format.py +4 -4
- mlrun/package/utils/log_hint_utils.py +2 -2
- mlrun/package/utils/type_hint_utils.py +4 -9
- mlrun/platforms/__init__.py +11 -10
- mlrun/platforms/iguazio.py +24 -203
- mlrun/projects/operations.py +35 -21
- mlrun/projects/pipelines.py +68 -99
- mlrun/projects/project.py +830 -266
- mlrun/render.py +3 -11
- mlrun/run.py +162 -166
- mlrun/runtimes/__init__.py +62 -7
- mlrun/runtimes/base.py +39 -32
- mlrun/runtimes/daskjob.py +8 -8
- mlrun/runtimes/databricks_job/databricks_cancel_task.py +1 -1
- mlrun/runtimes/databricks_job/databricks_runtime.py +7 -7
- mlrun/runtimes/databricks_job/databricks_wrapper.py +1 -1
- mlrun/runtimes/funcdoc.py +0 -28
- mlrun/runtimes/function_reference.py +1 -1
- mlrun/runtimes/kubejob.py +28 -122
- mlrun/runtimes/local.py +6 -3
- mlrun/runtimes/mpijob/__init__.py +0 -20
- mlrun/runtimes/mpijob/abstract.py +9 -10
- mlrun/runtimes/mpijob/v1.py +1 -1
- mlrun/{model_monitoring/stores/models/sqlite.py → runtimes/nuclio/__init__.py} +7 -9
- mlrun/runtimes/nuclio/api_gateway.py +709 -0
- mlrun/runtimes/nuclio/application/__init__.py +15 -0
- mlrun/runtimes/nuclio/application/application.py +523 -0
- mlrun/runtimes/nuclio/application/reverse_proxy.go +95 -0
- mlrun/runtimes/{function.py → nuclio/function.py} +112 -73
- mlrun/runtimes/{nuclio.py → nuclio/nuclio.py} +6 -6
- mlrun/runtimes/{serving.py → nuclio/serving.py} +45 -51
- mlrun/runtimes/pod.py +286 -88
- mlrun/runtimes/remotesparkjob.py +2 -2
- mlrun/runtimes/sparkjob/spark3job.py +51 -34
- mlrun/runtimes/utils.py +7 -75
- mlrun/secrets.py +9 -5
- mlrun/serving/remote.py +2 -7
- mlrun/serving/routers.py +13 -10
- mlrun/serving/server.py +22 -26
- mlrun/serving/states.py +99 -25
- mlrun/serving/utils.py +3 -3
- mlrun/serving/v1_serving.py +6 -7
- mlrun/serving/v2_serving.py +59 -20
- mlrun/track/tracker.py +2 -1
- mlrun/track/tracker_manager.py +3 -3
- mlrun/track/trackers/mlflow_tracker.py +1 -2
- mlrun/utils/async_http.py +5 -7
- mlrun/utils/azure_vault.py +1 -1
- mlrun/utils/clones.py +1 -2
- mlrun/utils/condition_evaluator.py +3 -3
- mlrun/utils/db.py +3 -3
- mlrun/utils/helpers.py +183 -197
- mlrun/utils/http.py +2 -5
- mlrun/utils/logger.py +76 -14
- mlrun/utils/notifications/notification/__init__.py +17 -12
- mlrun/utils/notifications/notification/base.py +14 -2
- mlrun/utils/notifications/notification/console.py +2 -0
- mlrun/utils/notifications/notification/git.py +3 -1
- mlrun/utils/notifications/notification/ipython.py +3 -1
- mlrun/utils/notifications/notification/slack.py +101 -21
- mlrun/utils/notifications/notification/webhook.py +11 -1
- mlrun/utils/notifications/notification_pusher.py +155 -30
- mlrun/utils/retryer.py +208 -0
- mlrun/utils/singleton.py +1 -1
- mlrun/utils/v3io_clients.py +2 -4
- mlrun/utils/version/version.json +2 -2
- mlrun/utils/version/version.py +2 -6
- {mlrun-1.6.4rc2.dist-info → mlrun-1.7.0rc20.dist-info}/METADATA +31 -19
- mlrun-1.7.0rc20.dist-info/RECORD +353 -0
- mlrun/kfpops.py +0 -868
- mlrun/model_monitoring/batch.py +0 -1095
- mlrun/model_monitoring/stores/models/__init__.py +0 -27
- mlrun/model_monitoring/stores/sql_model_endpoint_store.py +0 -384
- mlrun/platforms/other.py +0 -306
- mlrun-1.6.4rc2.dist-info/RECORD +0 -314
- {mlrun-1.6.4rc2.dist-info → mlrun-1.7.0rc20.dist-info}/LICENSE +0 -0
- {mlrun-1.6.4rc2.dist-info → mlrun-1.7.0rc20.dist-info}/WHEEL +0 -0
- {mlrun-1.6.4rc2.dist-info → mlrun-1.7.0rc20.dist-info}/entry_points.txt +0 -0
- {mlrun-1.6.4rc2.dist-info → mlrun-1.7.0rc20.dist-info}/top_level.txt +0 -0
mlrun/feature_store/feature_vector.py
CHANGED

```diff
@@ -17,7 +17,7 @@ import typing
 from copy import copy
 from datetime import datetime
 from enum import Enum
-from typing import
+from typing import Union
 
 import numpy as np
 import pandas as pd
@@ -69,18 +69,16 @@ class FeatureVectorSpec(ModelObj):
         self._entity_fields: ObjectList = None
         self._entity_source: DataSource = None
         self._function: FunctionReference = None
-        self._relations:
+        self._relations: dict[str, ObjectDict] = None
         self._join_graph: JoinGraph = None
 
         self.description = description
-        self.features:
+        self.features: list[str] = features or []
         self.entity_source = entity_source
         self.entity_fields = entity_fields or []
         self.graph = graph
         self.join_graph = join_graph
-        self.relations:
-            relations or {}
-        )
+        self.relations: dict[str, dict[str, Union[Entity, str]]] = relations or {}
         self.timestamp_field = timestamp_field
         self.label_feature = label_feature
         self.with_indexes = with_indexes
@@ -97,12 +95,12 @@ class FeatureVectorSpec(ModelObj):
         self._entity_source = self._verify_dict(source, "entity_source", DataSource)
 
     @property
-    def entity_fields(self) ->
+    def entity_fields(self) -> list[Feature]:
         """the schema/metadata for the entity source fields"""
         return self._entity_fields
 
     @entity_fields.setter
-    def entity_fields(self, entity_fields:
+    def entity_fields(self, entity_fields: list[Feature]):
         self._entity_fields = ObjectList.from_list(Feature, entity_fields)
 
     @property
@@ -125,14 +123,12 @@ class FeatureVectorSpec(ModelObj):
         self._function = self._verify_dict(function, "function", FunctionReference)
 
     @property
-    def relations(self) ->
+    def relations(self) -> dict[str, ObjectDict]:
         """feature set relations dict"""
         return self._relations
 
     @relations.setter
-    def relations(
-        self, relations: typing.Dict[str, typing.Dict[str, Union[Entity, str]]]
-    ):
+    def relations(self, relations: dict[str, dict[str, Union[Entity, str]]]):
         temp_relations = {}
         for fs_name, relation in relations.items():
             for col, ent in relation.items():
@@ -179,29 +175,29 @@ class FeatureVectorStatus(ModelObj):
         self.stats = stats or {}
         self.index_keys = index_keys
         self.preview = preview or []
-        self.features:
+        self.features: list[Feature] = features or []
         self.run_uri = run_uri
         self.timestamp_key = timestamp_key
 
     @property
-    def targets(self) ->
+    def targets(self) -> list[DataTarget]:
         """list of material storage targets + their status/path"""
         return self._targets
 
     @targets.setter
-    def targets(self, targets:
+    def targets(self, targets: list[DataTarget]):
         self._targets = ObjectList.from_list(DataTarget, targets)
 
     def update_target(self, target: DataTarget):
         self._targets.update(target)
 
     @property
-    def features(self) ->
+    def features(self) -> list[Feature]:
         """list of features (result of joining features from the source feature sets)"""
         return self._features
 
     @features.setter
-    def features(self, features:
+    def features(self, features: list[Feature]):
         self._features = ObjectList.from_list(Feature, features)
 
 
@@ -378,7 +374,7 @@ class _JoinStep(ModelObj):
         name: str = None,
         left_step_name: str = None,
         right_step_name: str = None,
-        left_feature_set_names: Union[str,
+        left_feature_set_names: Union[str, list[str]] = None,
         right_feature_set_name: str = None,
         join_type: str = "inner",
         asof_join: bool = False,
@@ -388,7 +384,8 @@ class _JoinStep(ModelObj):
         self.right_step_name = right_step_name
         self.left_feature_set_names = (
             left_feature_set_names
-            if
+            if left_feature_set_names is None
+            or isinstance(left_feature_set_names, list)
             else [left_feature_set_names]
         )
         self.right_feature_set_name = right_feature_set_name
@@ -402,7 +399,7 @@ class _JoinStep(ModelObj):
         self,
         feature_set_objects: ObjectList,
         vector,
-        entity_rows_keys:
+        entity_rows_keys: list[str] = None,
     ):
         if feature_set_objects[self.right_feature_set_name].is_connectable_to_df(
             entity_rows_keys
@@ -482,21 +479,22 @@ class FeatureVector(ModelObj):
         description=None,
         with_indexes=None,
         join_graph: JoinGraph = None,
-        relations:
+        relations: dict[str, dict[str, Union[Entity, str]]] = None,
     ):
         """Feature vector, specify selected features, their metadata and material views
 
         example::
 
             import mlrun.feature_store as fstore
+
            features = ["quotes.bid", "quotes.asks_sum_5h as asks_5h", "stocks.*"]
            vector = fstore.FeatureVector("my-vec", features)
 
            # get the vector as a dataframe
-            df =
+            df = vector.get_offline_features().to_dataframe()
 
            # return an online/real-time feature service
-            svc =
+            svc = vector.get_online_feature_service(impute_policy={"*": "$mean"})
            resp = svc.get([{"stock": "GOOG"}])
 
         :param name: List of names of targets to delete (default: delete all ingested targets)
@@ -732,7 +730,7 @@ class FeatureVector(ModelObj):
         entity_timestamp_column: str = None,
         target: DataTargetBase = None,
         run_config: RunConfig = None,
-        drop_columns:
+        drop_columns: list[str] = None,
         start_time: Union[str, datetime] = None,
         end_time: Union[str, datetime] = None,
         with_indexes: bool = False,
@@ -740,9 +738,9 @@ class FeatureVector(ModelObj):
         engine: str = None,
         engine_args: dict = None,
         query: str = None,
-        order_by: Union[str,
+        order_by: Union[str, list[str]] = None,
         spark_service: str = None,
-        timestamp_for_filtering: Union[str,
+        timestamp_for_filtering: Union[str, dict[str, str]] = None,
     ):
         """retrieve offline feature vector results
 
@@ -827,7 +825,7 @@ class FeatureVector(ModelObj):
         fixed_window_type: FixedWindowType = FixedWindowType.LastClosedWindow,
         impute_policy: dict = None,
         update_stats: bool = False,
-        entity_keys:
+        entity_keys: list[str] = None,
     ):
         """initialize and return online feature vector service api,
         returns :py:class:`~mlrun.feature_store.OnlineVectorService`
@@ -855,7 +853,7 @@ class FeatureVector(ModelObj):
 
         Example::
 
-            svc = vector_uri.get_online_feature_service(entity_keys=[
+            svc = vector_uri.get_online_feature_service(entity_keys=["ticker"])
            try:
                resp = svc.get([{"ticker": "GOOG"}, {"ticker": "MSFT"}])
                print(resp)
@@ -910,7 +908,7 @@ class OnlineVectorService:
         graph,
         index_columns,
         impute_policy: dict = None,
-        requested_columns:
+        requested_columns: list[str] = None,
     ):
         self.vector = vector
         self.impute_policy = impute_policy or {}
@@ -966,7 +964,7 @@ class OnlineVectorService:
         """vector merger function status (ready, running, error)"""
         return "ready"
 
-    def get(self, entity_rows:
+    def get(self, entity_rows: list[Union[dict, list]], as_list=False):
         """get feature vector given the provided entity inputs
 
         take a list of input vectors/rows and return a list of enriched feature vectors
```
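The docstring changes above reflect that offline retrieval and the online feature service are now invoked as methods of the `FeatureVector` object rather than through module-level feature-store calls. A minimal sketch of the new-style usage, assuming the referenced feature sets (`quotes`, `stocks`) already exist in the project; the feature names and entity values are illustrative:

```python
import mlrun.feature_store as fstore

# define a vector over existing feature sets (feature names are illustrative)
features = ["quotes.bid", "quotes.asks_sum_5h as asks_5h", "stocks.*"]
vector = fstore.FeatureVector("my-vec", features)
vector.save()

# offline: materialize the vector into a dataframe via the method-based API
df = vector.get_offline_features().to_dataframe()

# online: open a real-time feature service, query it, then release it
svc = vector.get_online_feature_service(impute_policy={"*": "$mean"})
try:
    resp = svc.get([{"stock": "GOOG"}])
    print(resp)
finally:
    svc.close()
```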
mlrun/feature_store/ingestion.py
CHANGED

```diff
@@ -17,6 +17,7 @@ import uuid
 import pandas as pd
 
 import mlrun
+import mlrun.common.constants as mlrun_constants
 from mlrun.datastore.sources import get_source_from_dict, get_source_step
 from mlrun.datastore.targets import (
     add_target_steps,
@@ -263,13 +264,13 @@ def run_ingestion_job(name, featureset, run_config, schedule=None, spark_service
         out_path=featureset.spec.output_path,
     )
     task.spec.secret_sources = run_config.secret_sources
-    task.set_label(
-        "feature-
-    )
+    task.set_label(
+        mlrun_constants.MLRunInternalLabels.job_type, "feature-ingest"
+    ).set_label("feature-set", featureset.uri)
     if run_config.owner:
-        task.set_label(
-
-        )
+        task.set_label(
+            mlrun_constants.MLRunInternalLabels.owner, run_config.owner
+        ).set_label(mlrun_constants.MLRunInternalLabels.v3io_user, run_config.owner)
 
     # set run UID and save in the feature set status (linking the features et to the job)
     task.metadata.uid = uuid.uuid4().hex
```
mlrun/feature_store/retrieval/base.py
CHANGED

```diff
@@ -88,6 +88,7 @@ class BaseMerger(abc.ABC):
         update_stats=None,
         query=None,
         order_by=None,
+        additional_filters=None,
     ):
         self._target = target
 
@@ -134,6 +135,7 @@ class BaseMerger(abc.ABC):
             timestamp_for_filtering=timestamp_for_filtering,
             query=query,
             order_by=order_by,
+            additional_filters=additional_filters,
         )
 
     def _write_to_offline_target(self, timestamp_key=None):
@@ -186,6 +188,7 @@ class BaseMerger(abc.ABC):
         timestamp_for_filtering=None,
         query=None,
         order_by=None,
+        additional_filters=None,
     ):
         self._create_engine_env()
 
@@ -212,7 +215,7 @@ class BaseMerger(abc.ABC):
             feature_sets.append(None)
             join_types.append(None)
 
-
+        timestamp_filtered = False
         for step in join_graph.steps:
             name = step.right_feature_set_name
             feature_set = feature_set_objects[name]
@@ -250,7 +253,7 @@ class BaseMerger(abc.ABC):
             if self._drop_indexes:
                 self._append_drop_column(time_column)
             if (start_time or end_time) and time_column:
-
+                timestamp_filtered = True
 
             df = self._get_engine_df(
                 feature_set,
@@ -259,6 +262,7 @@ class BaseMerger(abc.ABC):
                 start_time if time_column else None,
                 end_time if time_column else None,
                 time_column,
+                additional_filters,
             )
 
             fs_entities_and_timestamp = list(feature_set.spec.entities.keys())
@@ -302,8 +306,8 @@ class BaseMerger(abc.ABC):
                 new_columns.append((column, alias))
             self._update_alias(dictionary={name: alias for name, alias in new_columns})
 
-        # None of the feature sets was filtered as required
-        if not
+        # None of the feature sets was timestamp filtered as required
+        if not timestamp_filtered and (start_time or end_time):
             raise mlrun.errors.MLRunRuntimeError(
                 "start_time and end_time can only be provided in conjunction with "
                 "a timestamp column, or when the at least one feature_set has a timestamp key"
@@ -540,8 +544,8 @@ class BaseMerger(abc.ABC):
         self,
         name: str,
         order: int,
-        left_keys:
-        right_keys:
+        left_keys: list[str] = None,
+        right_keys: list[str] = None,
     ):
         self.name = name
         self.left_keys = left_keys if left_keys is not None else []
@@ -750,11 +754,12 @@ class BaseMerger(abc.ABC):
     def _get_engine_df(
         self,
         feature_set: FeatureSet,
-        feature_set_name:
-        column_names:
+        feature_set_name: list[str],
+        column_names: list[str] = None,
         start_time: typing.Union[str, datetime] = None,
         end_time: typing.Union[str, datetime] = None,
         time_column: typing.Optional[str] = None,
+        additional_filters=None,
     ):
         """
         Return the feature_set data frame according to the args
@@ -773,8 +778,8 @@ class BaseMerger(abc.ABC):
     def _rename_columns_and_select(
         self,
         df,
-        rename_col_dict:
-        columns:
+        rename_col_dict: dict[str, str],
+        columns: list[str] = None,
     ):
         """
         rename the columns of the df according to rename_col_dict, and select only `columns` if it is not none
@@ -801,7 +806,7 @@ class BaseMerger(abc.ABC):
         """
         raise NotImplementedError
 
-    def _order_by(self, order_by_active:
+    def _order_by(self, order_by_active: list[str]):
         """
         Order by `order_by_active` along all axis.
 
```
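The new `additional_filters` argument is threaded from offline retrieval through `BaseMerger` down to each engine's `_get_engine_df`, so extra row-level filters can be pushed into the offline read alongside the existing time and query options. A minimal sketch of how it might be passed, assuming parquet-style `(column, op, value)` filter tuples; the vector URI, column name, and threshold are illustrative:

```python
import mlrun.feature_store as fstore

# load an existing feature vector (URI is illustrative)
vector = fstore.get_feature_vector("store://feature-vectors/my-project/my-vec")

# push an extra row filter into the offline merge, in addition to the usual
# start_time/end_time and query arguments (filter format assumed to follow
# parquet-style (column, op, value) tuples)
resp = vector.get_offline_features(
    additional_filters=[("bid", ">", 100)],
)
df = resp.to_dataframe()
```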
```diff
@@ -19,7 +19,7 @@ from collections import Counter
 # np.bool -> bool and np.object -> object fix backported from pyspark v3.3.3.
 
 
-class PandasConversionMixin
+class PandasConversionMixin:
     """
     Min-in for the conversion from Spark to pandas. Currently, only :class:`DataFrame`
     can use this class.
@@ -79,10 +79,10 @@ class PandasConversionMixin(object):
             msg = (
                 "toPandas attempted Arrow optimization because "
                 "'spark.sql.execution.arrow.pyspark.enabled' is set to true; however, "
-                "failed by the reason below:\n
+                f"failed by the reason below:\n {e}\n"
                 "Attempting non-optimization as "
                 "'spark.sql.execution.arrow.pyspark.fallback.enabled' is set to "
-                "true."
+                "true."
             )
             warnings.warn(msg)
             use_arrow = False
@@ -92,7 +92,7 @@ class PandasConversionMixin(object):
                 "'spark.sql.execution.arrow.pyspark.enabled' is set to true, but has "
                 "reached the error below and will not continue because automatic fallback "
                 "with 'spark.sql.execution.arrow.pyspark.fallback.enabled' has been set to "
-                "false.\n
+                f"false.\n {e}"
             )
             warnings.warn(msg)
             raise
@@ -108,9 +108,7 @@ class PandasConversionMixin(object):
             )
 
             # Rename columns to avoid duplicated column names.
-            tmp_column_names = [
-                "col_{}".format(i) for i in range(len(self.columns))
-            ]
+            tmp_column_names = [f"col_{i}" for i in range(len(self.columns))]
             self_destruct = self.sql_ctx._conf.arrowPySparkSelfDestructEnabled()
             batches = self.toDF(*tmp_column_names)._collect_as_arrow(
                 split_batches=self_destruct
@@ -160,7 +158,7 @@ class PandasConversionMixin(object):
                     "reached the error below and can not continue. Note that "
                     "'spark.sql.execution.arrow.pyspark.fallback.enabled' does not have an "
                     "effect on failures in the middle of "
-                    "computation.\n
+                    f"computation.\n {e}"
                 )
                 warnings.warn(msg)
                 raise
@@ -170,10 +168,10 @@ class PandasConversionMixin(object):
         column_counter = Counter(self.columns)
 
         dtype = [None] * len(self.schema)
-        for
+        for field_idx, field in enumerate(self.schema):
             # For duplicate column name, we use `iloc` to access it.
             if column_counter[field.name] > 1:
-                pandas_col = pdf.iloc[:,
+                pandas_col = pdf.iloc[:, field_idx]
             else:
                 pandas_col = pdf[field.name]
 
@@ -189,12 +187,12 @@ class PandasConversionMixin(object):
                 and field.nullable
                 and pandas_col.isnull().any()
             ):
-                dtype[
+                dtype[field_idx] = pandas_type
             # Ensure we fall back to nullable numpy types, even when whole column is null:
             if isinstance(field.dataType, IntegralType) and pandas_col.isnull().any():
-                dtype[
+                dtype[field_idx] = np.float64
             if isinstance(field.dataType, BooleanType) and pandas_col.isnull().any():
-                dtype[
+                dtype[field_idx] = object
 
         df = pd.DataFrame()
         for index, t in enumerate(dtype):
```
mlrun/feature_store/retrieval/dask_merger.py
CHANGED

```diff
@@ -145,6 +145,7 @@ class DaskFeatureMerger(BaseMerger):
         start_time=None,
         end_time=None,
         time_column=None,
+        additional_filters=None,
     ):
         import dask.dataframe as dd
 
@@ -155,6 +156,7 @@ class DaskFeatureMerger(BaseMerger):
             end_time=end_time,
             time_column=time_column,
             index=False,
+            additional_filters=additional_filters,
         )
 
         return self._reset_index(df).persist()
```
mlrun/feature_store/retrieval/job.py
CHANGED

```diff
@@ -15,6 +15,7 @@
 import uuid
 
 import mlrun
+import mlrun.common.constants as mlrun_constants
 from mlrun.config import config as mlconf
 from mlrun.model import DataTargetBase, new_task
 from mlrun.runtimes.function_reference import FunctionReference
@@ -42,6 +43,7 @@ def run_merge_job(
     start_time=None,
     end_time=None,
     timestamp_for_filtering=None,
+    additional_filters=None,
 ):
     name = vector.metadata.name
     if not target or not hasattr(target, "to_dict"):
@@ -116,11 +118,14 @@ def run_merge_job(
             "end_time": end_time,
             "timestamp_for_filtering": timestamp_for_filtering,
             "engine_args": engine_args,
+            "additional_filters": additional_filters,
         },
         inputs={"entity_rows": entity_rows} if entity_rows is not None else {},
     )
     task.spec.secret_sources = run_config.secret_sources
-    task.set_label(
+    task.set_label(
+        mlrun_constants.MLRunInternalLabels.job_type, "feature-merge"
+    ).set_label(mlrun_constants.MLRunInternalLabels.feature_vector, vector.uri)
     task.metadata.uid = uuid.uuid4().hex
     vector.status.run_uri = task.metadata.uid
     vector.save()
@@ -196,7 +201,8 @@ import mlrun.feature_store.retrieval
 from mlrun.datastore.targets import get_target_driver
 def merge_handler(context, vector_uri, target, entity_rows=None,
                   entity_timestamp_column=None, drop_columns=None, with_indexes=None, query=None,
-                  engine_args=None, order_by=None, start_time=None, end_time=None, timestamp_for_filtering=None
+                  engine_args=None, order_by=None, start_time=None, end_time=None, timestamp_for_filtering=None,
+                  additional_filters=None):
     vector = context.get_store_resource(vector_uri)
     store_target = get_target_driver(target, vector)
     if entity_rows:
@@ -206,7 +212,7 @@ def merge_handler(context, vector_uri, target, entity_rows=None,
     merger = mlrun.feature_store.retrieval.{{{engine}}}(vector, **(engine_args or {}))
     merger.start(entity_rows, entity_timestamp_column, store_target, drop_columns, with_indexes=with_indexes,
                  query=query, order_by=order_by, start_time=start_time, end_time=end_time,
-                 timestamp_for_filtering=timestamp_for_filtering)
+                 timestamp_for_filtering=timestamp_for_filtering, additional_filters=additional_filters)
 
     target = vector.status.targets[store_target.name].to_dict()
     context.log_result('feature_vector', vector.uri)
```
mlrun/feature_store/retrieval/local_merger.py
CHANGED

```diff
@@ -114,12 +114,14 @@ class LocalFeatureMerger(BaseMerger):
         start_time=None,
         end_time=None,
         time_column=None,
+        additional_filters=None,
     ):
         df = feature_set.to_dataframe(
             columns=column_names,
             start_time=start_time,
             end_time=end_time,
             time_column=time_column,
+            additional_filters=additional_filters,
         )
         if df.index.names[0]:
             df.reset_index(inplace=True)
```
mlrun/feature_store/retrieval/spark_merger.py
CHANGED

```diff
@@ -12,6 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #
+
 import pandas as pd
 import semver
 
@@ -24,6 +25,32 @@ from .base import BaseMerger
 from .conversion import PandasConversionMixin
 
 
+def spark_df_to_pandas(spark_df):
+    # as of pyspark 3.2.3, toPandas fails to convert timestamps unless we work around the issue
+    # when we upgrade pyspark, we should check whether this workaround is still necessary
+    # see https://stackoverflow.com/questions/76389694/transforming-pyspark-to-pandas-dataframe
+    if semver.parse(pd.__version__)["major"] >= 2:
+        import pyspark.sql.functions as pyspark_functions
+
+        type_conversion_dict = {}
+        for field in spark_df.schema.fields:
+            if str(field.dataType) == "TimestampType":
+                spark_df = spark_df.withColumn(
+                    field.name,
+                    pyspark_functions.date_format(
+                        pyspark_functions.to_timestamp(field.name),
+                        "yyyy-MM-dd'T'HH:mm:ss.SSSSSSSSS",
+                    ),
+                )
+                type_conversion_dict[field.name] = "datetime64[ns]"
+        df = PandasConversionMixin.toPandas(spark_df)
+        if type_conversion_dict:
+            df = df.astype(type_conversion_dict)
+        return df
+    else:
+        return PandasConversionMixin.toPandas(spark_df)
+
+
 class SparkFeatureMerger(BaseMerger):
     engine = "spark"
     support_offline = True
@@ -166,29 +193,7 @@ class SparkFeatureMerger(BaseMerger):
     def get_df(self, to_pandas=True):
         if to_pandas:
             if self._pandas_df is None:
-                df = self._result_df
-                # as of pyspark 3.2.3, toPandas fails to convert timestamps unless we work around the issue
-                # when we upgrade pyspark, we should check whether this workaround is still necessary
-                # see https://stackoverflow.com/questions/76389694/transforming-pyspark-to-pandas-dataframe
-                if semver.parse(pd.__version__)["major"] >= 2:
-                    import pyspark.sql.functions as pyspark_functions
-
-                    type_conversion_dict = {}
-                    for field in df.schema.fields:
-                        if str(field.dataType) == "TimestampType":
-                            df = df.withColumn(
-                                field.name,
-                                pyspark_functions.date_format(
-                                    pyspark_functions.to_timestamp(field.name),
-                                    "yyyy-MM-dd'T'HH:mm:ss.SSSSSSSSS",
-                                ),
-                            )
-                            type_conversion_dict[field.name] = "datetime64[ns]"
-                    df = PandasConversionMixin.toPandas(df)
-                    if type_conversion_dict:
-                        df = df.astype(type_conversion_dict)
-                else:
-                    df = PandasConversionMixin.toPandas(df)
+                df = spark_df_to_pandas(self._result_df)
                 self._pandas_df = df
                 self._set_indexes(self._pandas_df)
             return self._pandas_df
@@ -221,7 +226,12 @@ class SparkFeatureMerger(BaseMerger):
         start_time=None,
         end_time=None,
         time_column=None,
+        additional_filters=None,
     ):
+        mlrun.utils.helpers.additional_filters_warning(
+            additional_filters, self.__class__
+        )
+
         source_kwargs = {}
         if feature_set.spec.passthrough:
             if not feature_set.spec.source:
@@ -243,13 +253,13 @@ class SparkFeatureMerger(BaseMerger):
         # handling case where there are multiple feature sets and user creates vector where
         # entity_timestamp_column is from a specific feature set (can't be entity timestamp)
         source_driver = mlrun.datastore.sources.source_kind_to_driver[source_kind]
-
         source = source_driver(
             name=self.vector.metadata.name,
             path=source_path,
             time_field=time_column,
             start_time=start_time,
            end_time=end_time,
+            additional_filters=additional_filters,
             **source_kwargs,
         )
```
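The timestamp-conversion workaround that used to live inline in `SparkFeatureMerger.get_df` is now the module-level helper `spark_df_to_pandas`, so other Spark code paths can reuse it. A minimal sketch of calling it directly, assuming the import path matches the file shown above and that an active Spark session is available; the sample data is illustrative:

```python
import datetime

from pyspark.sql import SparkSession

from mlrun.feature_store.retrieval.spark_merger import spark_df_to_pandas

spark = SparkSession.builder.master("local[1]").appName("to-pandas-demo").getOrCreate()

# a tiny Spark dataframe with a timestamp column (illustrative data)
spark_df = spark.createDataFrame(
    [("GOOG", datetime.datetime(2024, 1, 1, 12, 30))],
    ["ticker", "time"],
)

# converts via the backported PandasConversionMixin; on pandas>=2 it round-trips
# TimestampType columns through strings so they land as datetime64[ns]
pdf = spark_df_to_pandas(spark_df)
print(pdf.dtypes)
```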