mlrun 1.7.0rc4__py3-none-any.whl → 1.7.0rc20__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of mlrun might be problematic.

Files changed (200)
  1. mlrun/__init__.py +11 -1
  2. mlrun/__main__.py +25 -111
  3. mlrun/{datastore/helpers.py → alerts/__init__.py} +2 -5
  4. mlrun/alerts/alert.py +144 -0
  5. mlrun/api/schemas/__init__.py +4 -3
  6. mlrun/artifacts/__init__.py +8 -3
  7. mlrun/artifacts/base.py +38 -254
  8. mlrun/artifacts/dataset.py +9 -190
  9. mlrun/artifacts/manager.py +41 -47
  10. mlrun/artifacts/model.py +30 -158
  11. mlrun/artifacts/plots.py +23 -380
  12. mlrun/common/constants.py +68 -0
  13. mlrun/common/formatters/__init__.py +19 -0
  14. mlrun/{model_monitoring/stores/models/sqlite.py → common/formatters/artifact.py} +6 -8
  15. mlrun/common/formatters/base.py +78 -0
  16. mlrun/common/formatters/function.py +41 -0
  17. mlrun/common/formatters/pipeline.py +53 -0
  18. mlrun/common/formatters/project.py +51 -0
  19. mlrun/{runtimes → common/runtimes}/constants.py +32 -4
  20. mlrun/common/schemas/__init__.py +25 -4
  21. mlrun/common/schemas/alert.py +203 -0
  22. mlrun/common/schemas/api_gateway.py +148 -0
  23. mlrun/common/schemas/artifact.py +15 -5
  24. mlrun/common/schemas/auth.py +8 -2
  25. mlrun/common/schemas/client_spec.py +2 -0
  26. mlrun/common/schemas/frontend_spec.py +1 -0
  27. mlrun/common/schemas/function.py +4 -0
  28. mlrun/common/schemas/hub.py +7 -9
  29. mlrun/common/schemas/model_monitoring/__init__.py +19 -3
  30. mlrun/common/schemas/model_monitoring/constants.py +96 -26
  31. mlrun/common/schemas/model_monitoring/grafana.py +9 -5
  32. mlrun/common/schemas/model_monitoring/model_endpoints.py +86 -2
  33. mlrun/{runtimes/mpijob/v1alpha1.py → common/schemas/pagination.py} +10 -13
  34. mlrun/common/schemas/pipeline.py +0 -9
  35. mlrun/common/schemas/project.py +22 -21
  36. mlrun/common/types.py +7 -1
  37. mlrun/config.py +87 -19
  38. mlrun/data_types/data_types.py +4 -0
  39. mlrun/data_types/to_pandas.py +9 -9
  40. mlrun/datastore/__init__.py +5 -8
  41. mlrun/datastore/alibaba_oss.py +130 -0
  42. mlrun/datastore/azure_blob.py +4 -5
  43. mlrun/datastore/base.py +69 -30
  44. mlrun/datastore/datastore.py +10 -2
  45. mlrun/datastore/datastore_profile.py +90 -6
  46. mlrun/datastore/google_cloud_storage.py +1 -1
  47. mlrun/datastore/hdfs.py +5 -0
  48. mlrun/datastore/inmem.py +2 -2
  49. mlrun/datastore/redis.py +2 -2
  50. mlrun/datastore/s3.py +5 -0
  51. mlrun/datastore/snowflake_utils.py +43 -0
  52. mlrun/datastore/sources.py +172 -44
  53. mlrun/datastore/store_resources.py +7 -7
  54. mlrun/datastore/targets.py +285 -41
  55. mlrun/datastore/utils.py +68 -5
  56. mlrun/datastore/v3io.py +27 -50
  57. mlrun/db/auth_utils.py +152 -0
  58. mlrun/db/base.py +149 -14
  59. mlrun/db/factory.py +1 -1
  60. mlrun/db/httpdb.py +608 -178
  61. mlrun/db/nopdb.py +191 -7
  62. mlrun/errors.py +11 -0
  63. mlrun/execution.py +37 -20
  64. mlrun/feature_store/__init__.py +0 -2
  65. mlrun/feature_store/api.py +21 -52
  66. mlrun/feature_store/feature_set.py +48 -23
  67. mlrun/feature_store/feature_vector.py +2 -1
  68. mlrun/feature_store/ingestion.py +7 -6
  69. mlrun/feature_store/retrieval/base.py +9 -4
  70. mlrun/feature_store/retrieval/conversion.py +9 -9
  71. mlrun/feature_store/retrieval/dask_merger.py +2 -0
  72. mlrun/feature_store/retrieval/job.py +9 -3
  73. mlrun/feature_store/retrieval/local_merger.py +2 -0
  74. mlrun/feature_store/retrieval/spark_merger.py +34 -24
  75. mlrun/feature_store/steps.py +30 -19
  76. mlrun/features.py +4 -13
  77. mlrun/frameworks/_dl_common/loggers/tensorboard_logger.py +7 -12
  78. mlrun/frameworks/auto_mlrun/auto_mlrun.py +2 -2
  79. mlrun/frameworks/lgbm/__init__.py +1 -1
  80. mlrun/frameworks/lgbm/callbacks/callback.py +2 -4
  81. mlrun/frameworks/lgbm/model_handler.py +1 -1
  82. mlrun/frameworks/parallel_coordinates.py +2 -1
  83. mlrun/frameworks/pytorch/__init__.py +2 -2
  84. mlrun/frameworks/sklearn/__init__.py +1 -1
  85. mlrun/frameworks/tf_keras/__init__.py +5 -2
  86. mlrun/frameworks/tf_keras/callbacks/logging_callback.py +1 -1
  87. mlrun/frameworks/tf_keras/mlrun_interface.py +2 -2
  88. mlrun/frameworks/xgboost/__init__.py +1 -1
  89. mlrun/k8s_utils.py +10 -11
  90. mlrun/launcher/__init__.py +1 -1
  91. mlrun/launcher/base.py +6 -5
  92. mlrun/launcher/client.py +8 -6
  93. mlrun/launcher/factory.py +1 -1
  94. mlrun/launcher/local.py +9 -3
  95. mlrun/launcher/remote.py +9 -3
  96. mlrun/lists.py +6 -2
  97. mlrun/model.py +58 -19
  98. mlrun/model_monitoring/__init__.py +1 -1
  99. mlrun/model_monitoring/api.py +127 -301
  100. mlrun/model_monitoring/application.py +5 -296
  101. mlrun/model_monitoring/applications/__init__.py +11 -0
  102. mlrun/model_monitoring/applications/_application_steps.py +157 -0
  103. mlrun/model_monitoring/applications/base.py +282 -0
  104. mlrun/model_monitoring/applications/context.py +214 -0
  105. mlrun/model_monitoring/applications/evidently_base.py +211 -0
  106. mlrun/model_monitoring/applications/histogram_data_drift.py +224 -93
  107. mlrun/model_monitoring/applications/results.py +99 -0
  108. mlrun/model_monitoring/controller.py +30 -36
  109. mlrun/model_monitoring/db/__init__.py +18 -0
  110. mlrun/model_monitoring/{stores → db/stores}/__init__.py +43 -36
  111. mlrun/model_monitoring/db/stores/base/__init__.py +15 -0
  112. mlrun/model_monitoring/{stores/model_endpoint_store.py → db/stores/base/store.py} +58 -32
  113. mlrun/model_monitoring/db/stores/sqldb/__init__.py +13 -0
  114. mlrun/model_monitoring/db/stores/sqldb/models/__init__.py +71 -0
  115. mlrun/model_monitoring/{stores → db/stores/sqldb}/models/base.py +109 -5
  116. mlrun/model_monitoring/db/stores/sqldb/models/mysql.py +88 -0
  117. mlrun/model_monitoring/{stores/models/mysql.py → db/stores/sqldb/models/sqlite.py} +19 -13
  118. mlrun/model_monitoring/db/stores/sqldb/sql_store.py +684 -0
  119. mlrun/model_monitoring/db/stores/v3io_kv/__init__.py +13 -0
  120. mlrun/model_monitoring/{stores/kv_model_endpoint_store.py → db/stores/v3io_kv/kv_store.py} +302 -155
  121. mlrun/model_monitoring/db/tsdb/__init__.py +100 -0
  122. mlrun/model_monitoring/db/tsdb/base.py +329 -0
  123. mlrun/model_monitoring/db/tsdb/helpers.py +30 -0
  124. mlrun/model_monitoring/db/tsdb/tdengine/__init__.py +15 -0
  125. mlrun/model_monitoring/db/tsdb/tdengine/schemas.py +240 -0
  126. mlrun/model_monitoring/db/tsdb/tdengine/stream_graph_steps.py +45 -0
  127. mlrun/model_monitoring/db/tsdb/tdengine/tdengine_connector.py +397 -0
  128. mlrun/model_monitoring/db/tsdb/v3io/__init__.py +15 -0
  129. mlrun/model_monitoring/db/tsdb/v3io/stream_graph_steps.py +117 -0
  130. mlrun/model_monitoring/db/tsdb/v3io/v3io_connector.py +630 -0
  131. mlrun/model_monitoring/evidently_application.py +6 -118
  132. mlrun/model_monitoring/features_drift_table.py +34 -22
  133. mlrun/model_monitoring/helpers.py +100 -7
  134. mlrun/model_monitoring/model_endpoint.py +3 -2
  135. mlrun/model_monitoring/stream_processing.py +93 -228
  136. mlrun/model_monitoring/tracking_policy.py +7 -1
  137. mlrun/model_monitoring/writer.py +152 -124
  138. mlrun/package/packagers_manager.py +1 -0
  139. mlrun/package/utils/_formatter.py +2 -2
  140. mlrun/platforms/__init__.py +11 -10
  141. mlrun/platforms/iguazio.py +21 -202
  142. mlrun/projects/operations.py +30 -16
  143. mlrun/projects/pipelines.py +92 -99
  144. mlrun/projects/project.py +757 -268
  145. mlrun/render.py +15 -14
  146. mlrun/run.py +160 -162
  147. mlrun/runtimes/__init__.py +55 -3
  148. mlrun/runtimes/base.py +33 -19
  149. mlrun/runtimes/databricks_job/databricks_wrapper.py +1 -1
  150. mlrun/runtimes/funcdoc.py +0 -28
  151. mlrun/runtimes/kubejob.py +28 -122
  152. mlrun/runtimes/local.py +5 -2
  153. mlrun/runtimes/mpijob/__init__.py +0 -20
  154. mlrun/runtimes/mpijob/abstract.py +8 -8
  155. mlrun/runtimes/mpijob/v1.py +1 -1
  156. mlrun/runtimes/nuclio/__init__.py +1 -0
  157. mlrun/runtimes/nuclio/api_gateway.py +709 -0
  158. mlrun/runtimes/nuclio/application/__init__.py +15 -0
  159. mlrun/runtimes/nuclio/application/application.py +523 -0
  160. mlrun/runtimes/nuclio/application/reverse_proxy.go +95 -0
  161. mlrun/runtimes/nuclio/function.py +98 -58
  162. mlrun/runtimes/nuclio/serving.py +36 -42
  163. mlrun/runtimes/pod.py +196 -45
  164. mlrun/runtimes/remotesparkjob.py +1 -1
  165. mlrun/runtimes/sparkjob/spark3job.py +1 -1
  166. mlrun/runtimes/utils.py +6 -73
  167. mlrun/secrets.py +6 -2
  168. mlrun/serving/remote.py +2 -3
  169. mlrun/serving/routers.py +7 -4
  170. mlrun/serving/server.py +7 -8
  171. mlrun/serving/states.py +73 -43
  172. mlrun/serving/v2_serving.py +8 -7
  173. mlrun/track/tracker.py +2 -1
  174. mlrun/utils/async_http.py +25 -5
  175. mlrun/utils/helpers.py +141 -75
  176. mlrun/utils/http.py +1 -1
  177. mlrun/utils/logger.py +39 -7
  178. mlrun/utils/notifications/notification/__init__.py +14 -9
  179. mlrun/utils/notifications/notification/base.py +12 -0
  180. mlrun/utils/notifications/notification/console.py +2 -0
  181. mlrun/utils/notifications/notification/git.py +3 -1
  182. mlrun/utils/notifications/notification/ipython.py +2 -0
  183. mlrun/utils/notifications/notification/slack.py +101 -21
  184. mlrun/utils/notifications/notification/webhook.py +11 -1
  185. mlrun/utils/notifications/notification_pusher.py +147 -16
  186. mlrun/utils/retryer.py +3 -2
  187. mlrun/utils/v3io_clients.py +0 -1
  188. mlrun/utils/version/version.json +2 -2
  189. {mlrun-1.7.0rc4.dist-info → mlrun-1.7.0rc20.dist-info}/METADATA +33 -18
  190. mlrun-1.7.0rc20.dist-info/RECORD +353 -0
  191. {mlrun-1.7.0rc4.dist-info → mlrun-1.7.0rc20.dist-info}/WHEEL +1 -1
  192. mlrun/kfpops.py +0 -868
  193. mlrun/model_monitoring/batch.py +0 -974
  194. mlrun/model_monitoring/stores/models/__init__.py +0 -27
  195. mlrun/model_monitoring/stores/sql_model_endpoint_store.py +0 -382
  196. mlrun/platforms/other.py +0 -305
  197. mlrun-1.7.0rc4.dist-info/RECORD +0 -321
  198. {mlrun-1.7.0rc4.dist-info → mlrun-1.7.0rc20.dist-info}/LICENSE +0 -0
  199. {mlrun-1.7.0rc4.dist-info → mlrun-1.7.0rc20.dist-info}/entry_points.txt +0 -0
  200. {mlrun-1.7.0rc4.dist-info → mlrun-1.7.0rc20.dist-info}/top_level.txt +0 -0
mlrun/feature_store/feature_set.py CHANGED
@@ -337,7 +337,10 @@ class FeatureSet(ModelObj):
  example::

  import mlrun.feature_store as fstore
- ticks = fstore.FeatureSet("ticks", entities=["stock"], timestamp_key="timestamp")
+
+ ticks = fstore.FeatureSet(
+ "ticks", entities=["stock"], timestamp_key="timestamp"
+ )
  ticks.ingest(df)

  :param name: name of the feature set
@@ -625,12 +628,12 @@ class FeatureSet(ModelObj):

  import mlrun.feature_store as fstore

- ticks = fstore.FeatureSet("ticks",
- entities=["stock"],
- timestamp_key="timestamp")
- ticks.add_entity("country",
- mlrun.data_types.ValueType.STRING,
- description="stock country")
+ ticks = fstore.FeatureSet(
+ "ticks", entities=["stock"], timestamp_key="timestamp"
+ )
+ ticks.add_entity(
+ "country", mlrun.data_types.ValueType.STRING, description="stock country"
+ )
  ticks.add_entity("year", mlrun.data_types.ValueType.INT16)
  ticks.save()

@@ -650,13 +653,23 @@ class FeatureSet(ModelObj):
  import mlrun.feature_store as fstore
  from mlrun.features import Feature

- ticks = fstore.FeatureSet("ticks",
- entities=["stock"],
- timestamp_key="timestamp")
- ticks.add_feature(Feature(value_type=mlrun.data_types.ValueType.STRING,
- description="client consistency"),"ABC01")
- ticks.add_feature(Feature(value_type=mlrun.data_types.ValueType.FLOAT,
- description="client volatility"),"SAB")
+ ticks = fstore.FeatureSet(
+ "ticks", entities=["stock"], timestamp_key="timestamp"
+ )
+ ticks.add_feature(
+ Feature(
+ value_type=mlrun.data_types.ValueType.STRING,
+ description="client consistency",
+ ),
+ "ABC01",
+ )
+ ticks.add_feature(
+ Feature(
+ value_type=mlrun.data_types.ValueType.FLOAT,
+ description="client volatility",
+ ),
+ "SAB",
+ )
  ticks.save()

  :param feature: setting of Feature
@@ -860,15 +873,18 @@ class FeatureSet(ModelObj):
  example::

  import mlrun.feature_store as fstore
+
  ...
- ticks = fstore.FeatureSet("ticks",
- entities=["stock"],
- timestamp_key="timestamp")
- ticks.add_aggregation(name='priceN',
- column='price',
- operations=['avg'],
- windows=['1d'],
- period='1h')
+ ticks = fstore.FeatureSet(
+ "ticks", entities=["stock"], timestamp_key="timestamp"
+ )
+ ticks.add_aggregation(
+ name="priceN",
+ column="price",
+ operations=["avg"],
+ windows=["1d"],
+ period="1h",
+ )
  ticks.plot(rankdir="LR", with_targets=True)

  :param filename: target filepath for the graph image (None for the notebook)
@@ -901,6 +917,7 @@ class FeatureSet(ModelObj):
  start_time=None,
  end_time=None,
  time_column=None,
+ additional_filters=None,
  **kwargs,
  ):
  """return featureset (offline) data as dataframe
@@ -912,6 +929,12 @@ class FeatureSet(ModelObj):
  :param end_time: filter by end time
  :param time_column: specify the time column name in the file
  :param kwargs: additional reader (csv, parquet, ..) args
+ :param additional_filters: List of additional_filter conditions as tuples.
+ Each tuple should be in the format (column_name, operator, value).
+ Supported operators: "=", ">=", "<=", ">", "<".
+ Example: [("Product", "=", "Computer")]
+ For all supported filters, please see:
+ https://arrow.apache.org/docs/python/generated/pyarrow.parquet.ParquetDataset.html
  :return: DataFrame
  """
  entities = list(self.spec.entities.keys())
@@ -930,6 +953,7 @@ class FeatureSet(ModelObj):
  start_time=start_time,
  end_time=end_time,
  time_field=time_column,
+ additional_filters=additional_filters,
  **kwargs,
  )
  # to_dataframe() can sometimes return an iterator of dataframes instead of one dataframe
@@ -949,6 +973,7 @@ class FeatureSet(ModelObj):
  start_time=start_time,
  end_time=end_time,
  time_column=time_column,
+ additional_filters=additional_filters,
  **kwargs,
  )
  return result
@@ -1005,7 +1030,7 @@ class FeatureSet(ModelObj):
  df = stocks_set.ingest(stocks, infer_options=fstore.InferOptions.default())

  # for running as remote job
- config = RunConfig(image='mlrun/mlrun')
+ config = RunConfig(image="mlrun/mlrun")
  df = ingest(stocks_set, stocks, run_config=config)

  # specify source and targets
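The hunks above add an additional_filters argument to FeatureSet.to_dataframe(), with each filter expressed as a (column_name, operator, value) tuple. A minimal usage sketch under assumed data (the "sales" feature set, its columns, and the sample rows below are illustrative, not taken from this diff):

import pandas as pd

import mlrun.feature_store as fstore

# Hypothetical data used only to demonstrate the new argument.
df = pd.DataFrame(
    {
        "customer": ["a", "b"],
        "Product": ["Computer", "Phone"],
        "price": [1200.0, 800.0],
        "timestamp": pd.to_datetime(["2024-01-01", "2024-01-02"]),
    }
)

sales_set = fstore.FeatureSet(
    "sales", entities=["customer"], timestamp_key="timestamp"
)
sales_set.ingest(df)

# Filters use the documented (column, operator, value) tuples with
# "=", ">=", "<=", ">", "<" as the supported operators.
computers = sales_set.to_dataframe(
    additional_filters=[("Product", "=", "Computer")]
)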
mlrun/feature_store/feature_vector.py CHANGED
@@ -486,6 +486,7 @@ class FeatureVector(ModelObj):
  example::

  import mlrun.feature_store as fstore
+
  features = ["quotes.bid", "quotes.asks_sum_5h as asks_5h", "stocks.*"]
  vector = fstore.FeatureVector("my-vec", features)

@@ -852,7 +853,7 @@ class FeatureVector(ModelObj):

  Example::

- svc = vector_uri.get_online_feature_service(entity_keys=['ticker'])
+ svc = vector_uri.get_online_feature_service(entity_keys=["ticker"])
  try:
  resp = svc.get([{"ticker": "GOOG"}, {"ticker": "MSFT"}])
  print(resp)
mlrun/feature_store/ingestion.py CHANGED
@@ -17,6 +17,7 @@ import uuid
  import pandas as pd

  import mlrun
+ import mlrun.common.constants as mlrun_constants
  from mlrun.datastore.sources import get_source_from_dict, get_source_step
  from mlrun.datastore.targets import (
  add_target_steps,
@@ -263,13 +264,13 @@ def run_ingestion_job(name, featureset, run_config, schedule=None, spark_service
  out_path=featureset.spec.output_path,
  )
  task.spec.secret_sources = run_config.secret_sources
- task.set_label("job-type", "feature-ingest").set_label(
- "feature-set", featureset.uri
- )
+ task.set_label(
+ mlrun_constants.MLRunInternalLabels.job_type, "feature-ingest"
+ ).set_label("feature-set", featureset.uri)
  if run_config.owner:
- task.set_label("owner", run_config.owner).set_label(
- "v3io_user", run_config.owner
- )
+ task.set_label(
+ mlrun_constants.MLRunInternalLabels.owner, run_config.owner
+ ).set_label(mlrun_constants.MLRunInternalLabels.v3io_user, run_config.owner)

  # set run UID and save in the feature set status (linking the features et to the job)
  task.metadata.uid = uuid.uuid4().hex
mlrun/feature_store/retrieval/base.py CHANGED
@@ -88,6 +88,7 @@ class BaseMerger(abc.ABC):
  update_stats=None,
  query=None,
  order_by=None,
+ additional_filters=None,
  ):
  self._target = target

@@ -134,6 +135,7 @@ class BaseMerger(abc.ABC):
  timestamp_for_filtering=timestamp_for_filtering,
  query=query,
  order_by=order_by,
+ additional_filters=additional_filters,
  )

  def _write_to_offline_target(self, timestamp_key=None):
@@ -186,6 +188,7 @@ class BaseMerger(abc.ABC):
  timestamp_for_filtering=None,
  query=None,
  order_by=None,
+ additional_filters=None,
  ):
  self._create_engine_env()

@@ -212,7 +215,7 @@ class BaseMerger(abc.ABC):
  feature_sets.append(None)
  join_types.append(None)

- filtered = False
+ timestamp_filtered = False
  for step in join_graph.steps:
  name = step.right_feature_set_name
  feature_set = feature_set_objects[name]
@@ -250,7 +253,7 @@ class BaseMerger(abc.ABC):
  if self._drop_indexes:
  self._append_drop_column(time_column)
  if (start_time or end_time) and time_column:
- filtered = True
+ timestamp_filtered = True

  df = self._get_engine_df(
  feature_set,
@@ -259,6 +262,7 @@ class BaseMerger(abc.ABC):
  start_time if time_column else None,
  end_time if time_column else None,
  time_column,
+ additional_filters,
  )
  fs_entities_and_timestamp = list(feature_set.spec.entities.keys())

@@ -302,8 +306,8 @@ class BaseMerger(abc.ABC):
  new_columns.append((column, alias))
  self._update_alias(dictionary={name: alias for name, alias in new_columns})

- # None of the feature sets was filtered as required
- if not filtered and (start_time or end_time):
+ # None of the feature sets was timestamp filtered as required
+ if not timestamp_filtered and (start_time or end_time):
  raise mlrun.errors.MLRunRuntimeError(
  "start_time and end_time can only be provided in conjunction with "
  "a timestamp column, or when the at least one feature_set has a timestamp key"
@@ -755,6 +759,7 @@ class BaseMerger(abc.ABC):
  start_time: typing.Union[str, datetime] = None,
  end_time: typing.Union[str, datetime] = None,
  time_column: typing.Optional[str] = None,
+ additional_filters=None,
  ):
  """
  Return the feature_set data frame according to the args
mlrun/data_types/to_pandas.py CHANGED
@@ -79,10 +79,10 @@ class PandasConversionMixin:
  msg = (
  "toPandas attempted Arrow optimization because "
  "'spark.sql.execution.arrow.pyspark.enabled' is set to true; however, "
- "failed by the reason below:\n %s\n"
+ f"failed by the reason below:\n {e}\n"
  "Attempting non-optimization as "
  "'spark.sql.execution.arrow.pyspark.fallback.enabled' is set to "
- "true." % str(e)
+ "true."
  )
  warnings.warn(msg)
  use_arrow = False
@@ -92,7 +92,7 @@ class PandasConversionMixin:
  "'spark.sql.execution.arrow.pyspark.enabled' is set to true, but has "
  "reached the error below and will not continue because automatic fallback "
  "with 'spark.sql.execution.arrow.pyspark.fallback.enabled' has been set to "
- "false.\n %s" % str(e)
+ f"false.\n {e}"
  )
  warnings.warn(msg)
  raise
@@ -158,7 +158,7 @@ class PandasConversionMixin:
  "reached the error below and can not continue. Note that "
  "'spark.sql.execution.arrow.pyspark.fallback.enabled' does not have an "
  "effect on failures in the middle of "
- "computation.\n %s" % str(e)
+ f"computation.\n {e}"
  )
  warnings.warn(msg)
  raise
@@ -168,10 +168,10 @@ class PandasConversionMixin:
  column_counter = Counter(self.columns)

  dtype = [None] * len(self.schema)
- for fieldIdx, field in enumerate(self.schema):
+ for field_idx, field in enumerate(self.schema):
  # For duplicate column name, we use `iloc` to access it.
  if column_counter[field.name] > 1:
- pandas_col = pdf.iloc[:, fieldIdx]
+ pandas_col = pdf.iloc[:, field_idx]
  else:
  pandas_col = pdf[field.name]

@@ -187,12 +187,12 @@ class PandasConversionMixin:
  and field.nullable
  and pandas_col.isnull().any()
  ):
- dtype[fieldIdx] = pandas_type
+ dtype[field_idx] = pandas_type
  # Ensure we fall back to nullable numpy types, even when whole column is null:
  if isinstance(field.dataType, IntegralType) and pandas_col.isnull().any():
- dtype[fieldIdx] = np.float64
+ dtype[field_idx] = np.float64
  if isinstance(field.dataType, BooleanType) and pandas_col.isnull().any():
- dtype[fieldIdx] = object
+ dtype[field_idx] = object

  df = pd.DataFrame()
  for index, t in enumerate(dtype):
mlrun/feature_store/retrieval/dask_merger.py CHANGED
@@ -145,6 +145,7 @@ class DaskFeatureMerger(BaseMerger):
  start_time=None,
  end_time=None,
  time_column=None,
+ additional_filters=None,
  ):
  import dask.dataframe as dd

@@ -155,6 +156,7 @@ class DaskFeatureMerger(BaseMerger):
  end_time=end_time,
  time_column=time_column,
  index=False,
+ additional_filters=additional_filters,
  )

  return self._reset_index(df).persist()
mlrun/feature_store/retrieval/job.py CHANGED
@@ -15,6 +15,7 @@
  import uuid

  import mlrun
+ import mlrun.common.constants as mlrun_constants
  from mlrun.config import config as mlconf
  from mlrun.model import DataTargetBase, new_task
  from mlrun.runtimes.function_reference import FunctionReference
@@ -42,6 +43,7 @@ def run_merge_job(
  start_time=None,
  end_time=None,
  timestamp_for_filtering=None,
+ additional_filters=None,
  ):
  name = vector.metadata.name
  if not target or not hasattr(target, "to_dict"):
@@ -116,11 +118,14 @@ def run_merge_job(
  "end_time": end_time,
  "timestamp_for_filtering": timestamp_for_filtering,
  "engine_args": engine_args,
+ "additional_filters": additional_filters,
  },
  inputs={"entity_rows": entity_rows} if entity_rows is not None else {},
  )
  task.spec.secret_sources = run_config.secret_sources
- task.set_label("job-type", "feature-merge").set_label("feature-vector", vector.uri)
+ task.set_label(
+ mlrun_constants.MLRunInternalLabels.job_type, "feature-merge"
+ ).set_label(mlrun_constants.MLRunInternalLabels.feature_vector, vector.uri)
  task.metadata.uid = uuid.uuid4().hex
  vector.status.run_uri = task.metadata.uid
  vector.save()
@@ -196,7 +201,8 @@ import mlrun.feature_store.retrieval
  from mlrun.datastore.targets import get_target_driver
  def merge_handler(context, vector_uri, target, entity_rows=None,
  entity_timestamp_column=None, drop_columns=None, with_indexes=None, query=None,
- engine_args=None, order_by=None, start_time=None, end_time=None, timestamp_for_filtering=None):
+ engine_args=None, order_by=None, start_time=None, end_time=None, timestamp_for_filtering=None,
+ additional_filters=None):
  vector = context.get_store_resource(vector_uri)
  store_target = get_target_driver(target, vector)
  if entity_rows:
@@ -206,7 +212,7 @@ def merge_handler(context, vector_uri, target, entity_rows=None,
  merger = mlrun.feature_store.retrieval.{{{engine}}}(vector, **(engine_args or {}))
  merger.start(entity_rows, entity_timestamp_column, store_target, drop_columns, with_indexes=with_indexes,
  query=query, order_by=order_by, start_time=start_time, end_time=end_time,
- timestamp_for_filtering=timestamp_for_filtering)
+ timestamp_for_filtering=timestamp_for_filtering, additional_filters=additional_filters)

  target = vector.status.targets[store_target.name].to_dict()
  context.log_result('feature_vector', vector.uri)
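run_merge_job and the generated merge_handler now forward additional_filters into the merger's start() call, so offline feature-vector retrieval can push the same tuple filters down to the source. A hedged sketch, assuming the vector-level get_offline_features() wrapper exposes the parameter the same way (the vector name and feature references are placeholders, building on the illustrative "sales" feature set shown earlier):

import mlrun.feature_store as fstore

# Hypothetical vector; features reference the illustrative "sales" feature set.
vector = fstore.FeatureVector("sales-vec", ["sales.price", "sales.Product"])
vector.save()

# Assumption: the public offline-retrieval entry point forwards
# additional_filters to run_merge_job / merger.start() as shown above.
resp = vector.get_offline_features(
    additional_filters=[("Product", "=", "Computer")],
)
filtered_df = resp.to_dataframe()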
mlrun/feature_store/retrieval/local_merger.py CHANGED
@@ -114,12 +114,14 @@ class LocalFeatureMerger(BaseMerger):
  start_time=None,
  end_time=None,
  time_column=None,
+ additional_filters=None,
  ):
  df = feature_set.to_dataframe(
  columns=column_names,
  start_time=start_time,
  end_time=end_time,
  time_column=time_column,
+ additional_filters=additional_filters,
  )
  if df.index.names[0]:
  df.reset_index(inplace=True)
mlrun/feature_store/retrieval/spark_merger.py CHANGED
@@ -12,6 +12,7 @@
  # See the License for the specific language governing permissions and
  # limitations under the License.
  #
+
  import pandas as pd
  import semver

@@ -24,6 +25,32 @@ from .base import BaseMerger
  from .conversion import PandasConversionMixin


+ def spark_df_to_pandas(spark_df):
+ # as of pyspark 3.2.3, toPandas fails to convert timestamps unless we work around the issue
+ # when we upgrade pyspark, we should check whether this workaround is still necessary
+ # see https://stackoverflow.com/questions/76389694/transforming-pyspark-to-pandas-dataframe
+ if semver.parse(pd.__version__)["major"] >= 2:
+ import pyspark.sql.functions as pyspark_functions
+
+ type_conversion_dict = {}
+ for field in spark_df.schema.fields:
+ if str(field.dataType) == "TimestampType":
+ spark_df = spark_df.withColumn(
+ field.name,
+ pyspark_functions.date_format(
+ pyspark_functions.to_timestamp(field.name),
+ "yyyy-MM-dd'T'HH:mm:ss.SSSSSSSSS",
+ ),
+ )
+ type_conversion_dict[field.name] = "datetime64[ns]"
+ df = PandasConversionMixin.toPandas(spark_df)
+ if type_conversion_dict:
+ df = df.astype(type_conversion_dict)
+ return df
+ else:
+ return PandasConversionMixin.toPandas(spark_df)
+
+
  class SparkFeatureMerger(BaseMerger):
  engine = "spark"
  support_offline = True
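The timestamp workaround that previously lived inside get_df() is now the module-level helper spark_df_to_pandas() added above; it reformats TimestampType columns before toPandas() when pandas 2.x is installed. A minimal sketch of calling it directly (the local Spark session and sample rows are illustrative):

import datetime

from pyspark.sql import SparkSession

from mlrun.feature_store.retrieval.spark_merger import spark_df_to_pandas

spark = SparkSession.builder.master("local[1]").appName("demo").getOrCreate()
spark_df = spark.createDataFrame(
    [("GOOG", datetime.datetime(2024, 1, 1, 9, 30))],
    ["ticker", "timestamp"],
)

# With pandas >= 2 the helper renders timestamps as strings and casts them
# back to datetime64[ns]; otherwise it falls through to a plain toPandas().
pandas_df = spark_df_to_pandas(spark_df)
spark.stop()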
@@ -166,29 +193,7 @@ class SparkFeatureMerger(BaseMerger):
  def get_df(self, to_pandas=True):
  if to_pandas:
  if self._pandas_df is None:
- df = self._result_df
- # as of pyspark 3.2.3, toPandas fails to convert timestamps unless we work around the issue
- # when we upgrade pyspark, we should check whether this workaround is still necessary
- # see https://stackoverflow.com/questions/76389694/transforming-pyspark-to-pandas-dataframe
- if semver.parse(pd.__version__)["major"] >= 2:
- import pyspark.sql.functions as pyspark_functions
-
- type_conversion_dict = {}
- for field in df.schema.fields:
- if str(field.dataType) == "TimestampType":
- df = df.withColumn(
- field.name,
- pyspark_functions.date_format(
- pyspark_functions.to_timestamp(field.name),
- "yyyy-MM-dd'T'HH:mm:ss.SSSSSSSSS",
- ),
- )
- type_conversion_dict[field.name] = "datetime64[ns]"
- df = PandasConversionMixin.toPandas(df)
- if type_conversion_dict:
- df = df.astype(type_conversion_dict)
- else:
- df = PandasConversionMixin.toPandas(df)
+ df = spark_df_to_pandas(self._result_df)
  self._pandas_df = df
  self._set_indexes(self._pandas_df)
  return self._pandas_df
@@ -221,7 +226,12 @@ class SparkFeatureMerger(BaseMerger):
  start_time=None,
  end_time=None,
  time_column=None,
+ additional_filters=None,
  ):
+ mlrun.utils.helpers.additional_filters_warning(
+ additional_filters, self.__class__
+ )
+
  source_kwargs = {}
  if feature_set.spec.passthrough:
  if not feature_set.spec.source:
@@ -243,13 +253,13 @@ class SparkFeatureMerger(BaseMerger):
  # handling case where there are multiple feature sets and user creates vector where
  # entity_timestamp_column is from a specific feature set (can't be entity timestamp)
  source_driver = mlrun.datastore.sources.source_kind_to_driver[source_kind]
-
  source = source_driver(
  name=self.vector.metadata.name,
  path=source_path,
  time_field=time_column,
  start_time=start_time,
  end_time=end_time,
+ additional_filters=additional_filters,
  **source_kwargs,
  )

mlrun/feature_store/steps.py CHANGED
@@ -162,13 +162,19 @@ class MapValues(StepToDict, MLRunStep):
  example::

  # replace the value "U" with '0' in the age column
- graph.to(MapValues(mapping={'age': {'U': '0'}}, with_original_features=True))
+ graph.to(MapValues(mapping={"age": {"U": "0"}}, with_original_features=True))

  # replace integers, example
- graph.to(MapValues(mapping={'not': {0: 1, 1: 0}}))
+ graph.to(MapValues(mapping={"not": {0: 1, 1: 0}}))

  # replace by range, use -inf and inf for extended range
- graph.to(MapValues(mapping={'numbers': {'ranges': {'negative': [-inf, 0], 'positive': [0, inf]}}}))
+ graph.to(
+ MapValues(
+ mapping={
+ "numbers": {"ranges": {"negative": [-inf, 0], "positive": [0, inf]}}
+ }
+ )
+ )

  :param mapping: a dict with entry per column and the associated old/new values map
  :param with_original_features: set to True to keep the original features
@@ -424,8 +430,10 @@ class OneHotEncoder(StepToDict, MLRunStep):

  example::

- mapping = {'category': ['food', 'health', 'transportation'],
- 'gender': ['male', 'female']}
+ mapping = {
+ "category": ["food", "health", "transportation"],
+ "gender": ["male", "female"],
+ }
  graph.to(OneHotEncoder(mapping=one_hot_encoder_mapping))

  :param mapping: a dict of per column categories (to map to binary fields)
@@ -542,10 +550,12 @@ class DateExtractor(StepToDict, MLRunStep):

  # (taken from the fraud-detection end-to-end feature store demo)
  # Define the Transactions FeatureSet
- transaction_set = fstore.FeatureSet("transactions",
- entities=[fstore.Entity("source")],
- timestamp_key='timestamp',
- description="transactions feature set")
+ transaction_set = fstore.FeatureSet(
+ "transactions",
+ entities=[fstore.Entity("source")],
+ timestamp_key="timestamp",
+ description="transactions feature set",
+ )

  # Get FeatureSet computation graph
  transaction_graph = transaction_set.graph
@@ -553,11 +563,11 @@ class DateExtractor(StepToDict, MLRunStep):
  # Add the custom `DateExtractor` step
  # to the computation graph
  transaction_graph.to(
- class_name='DateExtractor',
- name='Extract Dates',
- parts = ['hour', 'day_of_week'],
- timestamp_col = 'timestamp',
- )
+ class_name="DateExtractor",
+ name="Extract Dates",
+ parts=["hour", "day_of_week"],
+ timestamp_col="timestamp",
+ )

  :param parts: list of pandas style date-time parts you want to extract.
  :param timestamp_col: The name of the column containing the timestamps to extract from,
@@ -694,11 +704,12 @@ class DropFeatures(StepToDict, MLRunStep):

  example::

- feature_set = fstore.FeatureSet("fs-new",
- entities=[fstore.Entity("id")],
- description="feature set",
- engine="pandas",
- )
+ feature_set = fstore.FeatureSet(
+ "fs-new",
+ entities=[fstore.Entity("id")],
+ description="feature set",
+ engine="pandas",
+ )
  # Pre-processing graph steps
  feature_set.graph.to(DropFeatures(features=["age"]))
  df_pandas = feature_set.ingest(data)
mlrun/features.py CHANGED
@@ -238,10 +238,7 @@ class Validator(ModelObj):
  from mlrun.features import Validator

  # Add validator to the feature 'bid' with check type
- quotes_set["bid"].validator = Validator(
- check_type=True,
- severity="info"
- )
+ quotes_set["bid"].validator = Validator(check_type=True, severity="info")

  :param check_type: check feature type e.g. True, False
  :param severity: severity name e.g. info, warning, etc.
@@ -280,10 +277,7 @@ class MinMaxValidator(Validator):

  # Add validator to the feature 'bid', where valid
  # minimal value is 52
- quotes_set["bid"].validator = MinMaxValidator(
- min=52,
- severity="info"
- )
+ quotes_set["bid"].validator = MinMaxValidator(min=52, severity="info")

  :param check_type: check feature type e.g. True, False
  :param severity: severity name e.g. info, warning, etc.
@@ -344,9 +338,7 @@ class MinMaxLenValidator(Validator):
  # Add length validator to the feature 'ticker', where valid
  # minimal length is 1 and maximal length is 10
  quotes_set["ticker"].validator = MinMaxLenValidator(
- min=1,
- max=10,
- severity="info"
+ min=1, max=10, severity="info"
  )

  :param check_type: check feature type e.g. True, False
@@ -408,8 +400,7 @@ class RegexValidator(Validator):
  # expression '(\b[A-Za-z]{1}[0-9]{7}\b)' where valid values are
  # e.g. A1234567, z9874563, etc.
  quotes_set["name"].validator = RegexValidator(
- regex=r"(\b[A-Za-z]{1}[0-9]{7}\b)",
- severity="info"
+ regex=r"(\b[A-Za-z]{1}[0-9]{7}\b)", severity="info"
  )

  :param check_type: check feature type e.g. True, False
mlrun/frameworks/_dl_common/loggers/tensorboard_logger.py CHANGED
@@ -547,9 +547,9 @@ class TensorboardLogger(Logger, Generic[DLTypes.WeightType]):
  "inputs",
  "parameters",
  ]:
- text += "\n * **{}**: {}".format(
- property_name.capitalize(),
- self._markdown_print(value=property_value, tabs=2),
+ text += (
+ f"\n * **{property_name.capitalize()}**: "
+ f"{self._markdown_print(value=property_value, tabs=2)}"
  )
  else:
  for property_name, property_value in self._extract_epoch_results().items():
@@ -614,13 +614,8 @@ class TensorboardLogger(Logger, Generic[DLTypes.WeightType]):
  :return: The generated link.
  """
  return (
- '<a href="{}/{}/{}/jobs/monitor/{}/overview" target="_blank">{}</a>'.format(
- config.resolve_ui_url(),
- config.ui.projects_prefix,
- context.project,
- context.uid,
- link_text,
- )
+ f'<a href="{config.resolve_ui_url()}/{config.ui.projects_prefix}/{context.project}'
+ f'/jobs/monitor/{context.uid}/overview" target="_blank">{link_text}</a>'
  )

  @staticmethod
@@ -653,13 +648,13 @@ class TensorboardLogger(Logger, Generic[DLTypes.WeightType]):
  if isinstance(value, list):
  if len(value) == 0:
  return ""
- text = "\n" + yaml.dump(value)
+ text = "\n" + yaml.safe_dump(value)
  text = " \n".join([" " * tabs + line for line in text.splitlines()])
  return text
  if isinstance(value, dict):
  if len(value) == 0:
  return ""
- text = yaml.dump(value)
+ text = yaml.safe_dump(value)
  text = " \n".join(
  [" " * tabs + "- " + line for line in text.splitlines()]
  )