mlrun 1.6.4rc2__py3-none-any.whl → 1.7.0rc20__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of mlrun might be problematic.

Files changed (291)
  1. mlrun/__init__.py +11 -1
  2. mlrun/__main__.py +26 -112
  3. mlrun/alerts/__init__.py +15 -0
  4. mlrun/alerts/alert.py +144 -0
  5. mlrun/api/schemas/__init__.py +5 -4
  6. mlrun/artifacts/__init__.py +8 -3
  7. mlrun/artifacts/base.py +46 -257
  8. mlrun/artifacts/dataset.py +11 -192
  9. mlrun/artifacts/manager.py +47 -48
  10. mlrun/artifacts/model.py +31 -159
  11. mlrun/artifacts/plots.py +23 -380
  12. mlrun/common/constants.py +69 -0
  13. mlrun/common/db/sql_session.py +2 -3
  14. mlrun/common/formatters/__init__.py +19 -0
  15. mlrun/common/formatters/artifact.py +21 -0
  16. mlrun/common/formatters/base.py +78 -0
  17. mlrun/common/formatters/function.py +41 -0
  18. mlrun/common/formatters/pipeline.py +53 -0
  19. mlrun/common/formatters/project.py +51 -0
  20. mlrun/common/helpers.py +1 -2
  21. mlrun/common/model_monitoring/helpers.py +9 -5
  22. mlrun/{runtimes → common/runtimes}/constants.py +37 -9
  23. mlrun/common/schemas/__init__.py +24 -4
  24. mlrun/common/schemas/alert.py +203 -0
  25. mlrun/common/schemas/api_gateway.py +148 -0
  26. mlrun/common/schemas/artifact.py +18 -8
  27. mlrun/common/schemas/auth.py +11 -5
  28. mlrun/common/schemas/background_task.py +1 -1
  29. mlrun/common/schemas/client_spec.py +4 -1
  30. mlrun/common/schemas/feature_store.py +16 -16
  31. mlrun/common/schemas/frontend_spec.py +8 -7
  32. mlrun/common/schemas/function.py +5 -1
  33. mlrun/common/schemas/hub.py +11 -18
  34. mlrun/common/schemas/memory_reports.py +2 -2
  35. mlrun/common/schemas/model_monitoring/__init__.py +18 -3
  36. mlrun/common/schemas/model_monitoring/constants.py +83 -26
  37. mlrun/common/schemas/model_monitoring/grafana.py +13 -9
  38. mlrun/common/schemas/model_monitoring/model_endpoints.py +99 -16
  39. mlrun/common/schemas/notification.py +4 -4
  40. mlrun/common/schemas/object.py +2 -2
  41. mlrun/{runtimes/mpijob/v1alpha1.py → common/schemas/pagination.py} +10 -13
  42. mlrun/common/schemas/pipeline.py +1 -10
  43. mlrun/common/schemas/project.py +24 -23
  44. mlrun/common/schemas/runtime_resource.py +8 -12
  45. mlrun/common/schemas/schedule.py +3 -3
  46. mlrun/common/schemas/tag.py +1 -2
  47. mlrun/common/schemas/workflow.py +2 -2
  48. mlrun/common/types.py +7 -1
  49. mlrun/config.py +54 -17
  50. mlrun/data_types/to_pandas.py +10 -12
  51. mlrun/datastore/__init__.py +5 -8
  52. mlrun/datastore/alibaba_oss.py +130 -0
  53. mlrun/datastore/azure_blob.py +17 -5
  54. mlrun/datastore/base.py +62 -39
  55. mlrun/datastore/datastore.py +28 -9
  56. mlrun/datastore/datastore_profile.py +146 -20
  57. mlrun/datastore/filestore.py +0 -1
  58. mlrun/datastore/google_cloud_storage.py +6 -2
  59. mlrun/datastore/hdfs.py +56 -0
  60. mlrun/datastore/inmem.py +2 -2
  61. mlrun/datastore/redis.py +6 -2
  62. mlrun/datastore/s3.py +9 -0
  63. mlrun/datastore/snowflake_utils.py +43 -0
  64. mlrun/datastore/sources.py +201 -96
  65. mlrun/datastore/spark_utils.py +1 -2
  66. mlrun/datastore/store_resources.py +7 -7
  67. mlrun/datastore/targets.py +358 -104
  68. mlrun/datastore/utils.py +72 -58
  69. mlrun/datastore/v3io.py +5 -1
  70. mlrun/db/base.py +185 -35
  71. mlrun/db/factory.py +1 -1
  72. mlrun/db/httpdb.py +614 -179
  73. mlrun/db/nopdb.py +210 -26
  74. mlrun/errors.py +12 -1
  75. mlrun/execution.py +41 -24
  76. mlrun/feature_store/__init__.py +0 -2
  77. mlrun/feature_store/api.py +40 -72
  78. mlrun/feature_store/common.py +1 -1
  79. mlrun/feature_store/feature_set.py +76 -55
  80. mlrun/feature_store/feature_vector.py +28 -30
  81. mlrun/feature_store/ingestion.py +7 -6
  82. mlrun/feature_store/retrieval/base.py +16 -11
  83. mlrun/feature_store/retrieval/conversion.py +11 -13
  84. mlrun/feature_store/retrieval/dask_merger.py +2 -0
  85. mlrun/feature_store/retrieval/job.py +9 -3
  86. mlrun/feature_store/retrieval/local_merger.py +2 -0
  87. mlrun/feature_store/retrieval/spark_merger.py +34 -24
  88. mlrun/feature_store/steps.py +37 -34
  89. mlrun/features.py +9 -20
  90. mlrun/frameworks/_common/artifacts_library.py +9 -9
  91. mlrun/frameworks/_common/mlrun_interface.py +5 -5
  92. mlrun/frameworks/_common/model_handler.py +48 -48
  93. mlrun/frameworks/_common/plan.py +2 -3
  94. mlrun/frameworks/_common/producer.py +3 -4
  95. mlrun/frameworks/_common/utils.py +5 -5
  96. mlrun/frameworks/_dl_common/loggers/logger.py +6 -7
  97. mlrun/frameworks/_dl_common/loggers/mlrun_logger.py +9 -9
  98. mlrun/frameworks/_dl_common/loggers/tensorboard_logger.py +23 -47
  99. mlrun/frameworks/_ml_common/artifacts_library.py +1 -2
  100. mlrun/frameworks/_ml_common/loggers/logger.py +3 -4
  101. mlrun/frameworks/_ml_common/loggers/mlrun_logger.py +4 -5
  102. mlrun/frameworks/_ml_common/model_handler.py +24 -24
  103. mlrun/frameworks/_ml_common/pkl_model_server.py +2 -2
  104. mlrun/frameworks/_ml_common/plan.py +1 -1
  105. mlrun/frameworks/_ml_common/plans/calibration_curve_plan.py +2 -3
  106. mlrun/frameworks/_ml_common/plans/confusion_matrix_plan.py +2 -3
  107. mlrun/frameworks/_ml_common/plans/dataset_plan.py +3 -3
  108. mlrun/frameworks/_ml_common/plans/feature_importance_plan.py +3 -3
  109. mlrun/frameworks/_ml_common/plans/roc_curve_plan.py +4 -4
  110. mlrun/frameworks/_ml_common/utils.py +4 -4
  111. mlrun/frameworks/auto_mlrun/auto_mlrun.py +9 -9
  112. mlrun/frameworks/huggingface/model_server.py +4 -4
  113. mlrun/frameworks/lgbm/__init__.py +33 -33
  114. mlrun/frameworks/lgbm/callbacks/callback.py +2 -4
  115. mlrun/frameworks/lgbm/callbacks/logging_callback.py +4 -5
  116. mlrun/frameworks/lgbm/callbacks/mlrun_logging_callback.py +4 -5
  117. mlrun/frameworks/lgbm/mlrun_interfaces/booster_mlrun_interface.py +1 -3
  118. mlrun/frameworks/lgbm/mlrun_interfaces/mlrun_interface.py +6 -6
  119. mlrun/frameworks/lgbm/model_handler.py +10 -10
  120. mlrun/frameworks/lgbm/model_server.py +6 -6
  121. mlrun/frameworks/lgbm/utils.py +5 -5
  122. mlrun/frameworks/onnx/dataset.py +8 -8
  123. mlrun/frameworks/onnx/mlrun_interface.py +3 -3
  124. mlrun/frameworks/onnx/model_handler.py +6 -6
  125. mlrun/frameworks/onnx/model_server.py +7 -7
  126. mlrun/frameworks/parallel_coordinates.py +4 -3
  127. mlrun/frameworks/pytorch/__init__.py +18 -18
  128. mlrun/frameworks/pytorch/callbacks/callback.py +4 -5
  129. mlrun/frameworks/pytorch/callbacks/logging_callback.py +17 -17
  130. mlrun/frameworks/pytorch/callbacks/mlrun_logging_callback.py +11 -11
  131. mlrun/frameworks/pytorch/callbacks/tensorboard_logging_callback.py +23 -29
  132. mlrun/frameworks/pytorch/callbacks_handler.py +38 -38
  133. mlrun/frameworks/pytorch/mlrun_interface.py +20 -20
  134. mlrun/frameworks/pytorch/model_handler.py +17 -17
  135. mlrun/frameworks/pytorch/model_server.py +7 -7
  136. mlrun/frameworks/sklearn/__init__.py +13 -13
  137. mlrun/frameworks/sklearn/estimator.py +4 -4
  138. mlrun/frameworks/sklearn/metrics_library.py +14 -14
  139. mlrun/frameworks/sklearn/mlrun_interface.py +3 -6
  140. mlrun/frameworks/sklearn/model_handler.py +2 -2
  141. mlrun/frameworks/tf_keras/__init__.py +10 -7
  142. mlrun/frameworks/tf_keras/callbacks/logging_callback.py +15 -15
  143. mlrun/frameworks/tf_keras/callbacks/mlrun_logging_callback.py +11 -11
  144. mlrun/frameworks/tf_keras/callbacks/tensorboard_logging_callback.py +19 -23
  145. mlrun/frameworks/tf_keras/mlrun_interface.py +9 -11
  146. mlrun/frameworks/tf_keras/model_handler.py +14 -14
  147. mlrun/frameworks/tf_keras/model_server.py +6 -6
  148. mlrun/frameworks/xgboost/__init__.py +13 -13
  149. mlrun/frameworks/xgboost/model_handler.py +6 -6
  150. mlrun/k8s_utils.py +14 -16
  151. mlrun/launcher/__init__.py +1 -1
  152. mlrun/launcher/base.py +16 -15
  153. mlrun/launcher/client.py +8 -6
  154. mlrun/launcher/factory.py +1 -1
  155. mlrun/launcher/local.py +17 -11
  156. mlrun/launcher/remote.py +16 -10
  157. mlrun/lists.py +7 -6
  158. mlrun/model.py +238 -73
  159. mlrun/model_monitoring/__init__.py +1 -1
  160. mlrun/model_monitoring/api.py +138 -315
  161. mlrun/model_monitoring/application.py +5 -296
  162. mlrun/model_monitoring/applications/__init__.py +24 -0
  163. mlrun/model_monitoring/applications/_application_steps.py +157 -0
  164. mlrun/model_monitoring/applications/base.py +282 -0
  165. mlrun/model_monitoring/applications/context.py +214 -0
  166. mlrun/model_monitoring/applications/evidently_base.py +211 -0
  167. mlrun/model_monitoring/applications/histogram_data_drift.py +349 -0
  168. mlrun/model_monitoring/applications/results.py +99 -0
  169. mlrun/model_monitoring/controller.py +104 -84
  170. mlrun/model_monitoring/controller_handler.py +13 -5
  171. mlrun/model_monitoring/db/__init__.py +18 -0
  172. mlrun/model_monitoring/{stores → db/stores}/__init__.py +43 -36
  173. mlrun/model_monitoring/db/stores/base/__init__.py +15 -0
  174. mlrun/model_monitoring/{stores/model_endpoint_store.py → db/stores/base/store.py} +64 -40
  175. mlrun/model_monitoring/db/stores/sqldb/__init__.py +13 -0
  176. mlrun/model_monitoring/db/stores/sqldb/models/__init__.py +71 -0
  177. mlrun/model_monitoring/{stores → db/stores/sqldb}/models/base.py +109 -5
  178. mlrun/model_monitoring/db/stores/sqldb/models/mysql.py +88 -0
  179. mlrun/model_monitoring/{stores/models/mysql.py → db/stores/sqldb/models/sqlite.py} +19 -13
  180. mlrun/model_monitoring/db/stores/sqldb/sql_store.py +684 -0
  181. mlrun/model_monitoring/db/stores/v3io_kv/__init__.py +13 -0
  182. mlrun/model_monitoring/{stores/kv_model_endpoint_store.py → db/stores/v3io_kv/kv_store.py} +310 -165
  183. mlrun/model_monitoring/db/tsdb/__init__.py +100 -0
  184. mlrun/model_monitoring/db/tsdb/base.py +329 -0
  185. mlrun/model_monitoring/db/tsdb/helpers.py +30 -0
  186. mlrun/model_monitoring/db/tsdb/tdengine/__init__.py +15 -0
  187. mlrun/model_monitoring/db/tsdb/tdengine/schemas.py +240 -0
  188. mlrun/model_monitoring/db/tsdb/tdengine/stream_graph_steps.py +45 -0
  189. mlrun/model_monitoring/db/tsdb/tdengine/tdengine_connector.py +397 -0
  190. mlrun/model_monitoring/db/tsdb/v3io/__init__.py +15 -0
  191. mlrun/model_monitoring/db/tsdb/v3io/stream_graph_steps.py +117 -0
  192. mlrun/model_monitoring/db/tsdb/v3io/v3io_connector.py +630 -0
  193. mlrun/model_monitoring/evidently_application.py +6 -118
  194. mlrun/model_monitoring/features_drift_table.py +134 -106
  195. mlrun/model_monitoring/helpers.py +127 -28
  196. mlrun/model_monitoring/metrics/__init__.py +13 -0
  197. mlrun/model_monitoring/metrics/histogram_distance.py +127 -0
  198. mlrun/model_monitoring/model_endpoint.py +3 -2
  199. mlrun/model_monitoring/prometheus.py +1 -4
  200. mlrun/model_monitoring/stream_processing.py +62 -231
  201. mlrun/model_monitoring/tracking_policy.py +9 -2
  202. mlrun/model_monitoring/writer.py +152 -124
  203. mlrun/package/__init__.py +6 -6
  204. mlrun/package/context_handler.py +5 -5
  205. mlrun/package/packager.py +7 -7
  206. mlrun/package/packagers/default_packager.py +6 -6
  207. mlrun/package/packagers/numpy_packagers.py +15 -15
  208. mlrun/package/packagers/pandas_packagers.py +5 -5
  209. mlrun/package/packagers/python_standard_library_packagers.py +10 -10
  210. mlrun/package/packagers_manager.py +19 -23
  211. mlrun/package/utils/_formatter.py +6 -6
  212. mlrun/package/utils/_pickler.py +2 -2
  213. mlrun/package/utils/_supported_format.py +4 -4
  214. mlrun/package/utils/log_hint_utils.py +2 -2
  215. mlrun/package/utils/type_hint_utils.py +4 -9
  216. mlrun/platforms/__init__.py +11 -10
  217. mlrun/platforms/iguazio.py +24 -203
  218. mlrun/projects/operations.py +35 -21
  219. mlrun/projects/pipelines.py +68 -99
  220. mlrun/projects/project.py +830 -266
  221. mlrun/render.py +3 -11
  222. mlrun/run.py +162 -166
  223. mlrun/runtimes/__init__.py +62 -7
  224. mlrun/runtimes/base.py +39 -32
  225. mlrun/runtimes/daskjob.py +8 -8
  226. mlrun/runtimes/databricks_job/databricks_cancel_task.py +1 -1
  227. mlrun/runtimes/databricks_job/databricks_runtime.py +7 -7
  228. mlrun/runtimes/databricks_job/databricks_wrapper.py +1 -1
  229. mlrun/runtimes/funcdoc.py +0 -28
  230. mlrun/runtimes/function_reference.py +1 -1
  231. mlrun/runtimes/kubejob.py +28 -122
  232. mlrun/runtimes/local.py +6 -3
  233. mlrun/runtimes/mpijob/__init__.py +0 -20
  234. mlrun/runtimes/mpijob/abstract.py +9 -10
  235. mlrun/runtimes/mpijob/v1.py +1 -1
  236. mlrun/{model_monitoring/stores/models/sqlite.py → runtimes/nuclio/__init__.py} +7 -9
  237. mlrun/runtimes/nuclio/api_gateway.py +709 -0
  238. mlrun/runtimes/nuclio/application/__init__.py +15 -0
  239. mlrun/runtimes/nuclio/application/application.py +523 -0
  240. mlrun/runtimes/nuclio/application/reverse_proxy.go +95 -0
  241. mlrun/runtimes/{function.py → nuclio/function.py} +112 -73
  242. mlrun/runtimes/{nuclio.py → nuclio/nuclio.py} +6 -6
  243. mlrun/runtimes/{serving.py → nuclio/serving.py} +45 -51
  244. mlrun/runtimes/pod.py +286 -88
  245. mlrun/runtimes/remotesparkjob.py +2 -2
  246. mlrun/runtimes/sparkjob/spark3job.py +51 -34
  247. mlrun/runtimes/utils.py +7 -75
  248. mlrun/secrets.py +9 -5
  249. mlrun/serving/remote.py +2 -7
  250. mlrun/serving/routers.py +13 -10
  251. mlrun/serving/server.py +22 -26
  252. mlrun/serving/states.py +99 -25
  253. mlrun/serving/utils.py +3 -3
  254. mlrun/serving/v1_serving.py +6 -7
  255. mlrun/serving/v2_serving.py +59 -20
  256. mlrun/track/tracker.py +2 -1
  257. mlrun/track/tracker_manager.py +3 -3
  258. mlrun/track/trackers/mlflow_tracker.py +1 -2
  259. mlrun/utils/async_http.py +5 -7
  260. mlrun/utils/azure_vault.py +1 -1
  261. mlrun/utils/clones.py +1 -2
  262. mlrun/utils/condition_evaluator.py +3 -3
  263. mlrun/utils/db.py +3 -3
  264. mlrun/utils/helpers.py +183 -197
  265. mlrun/utils/http.py +2 -5
  266. mlrun/utils/logger.py +76 -14
  267. mlrun/utils/notifications/notification/__init__.py +17 -12
  268. mlrun/utils/notifications/notification/base.py +14 -2
  269. mlrun/utils/notifications/notification/console.py +2 -0
  270. mlrun/utils/notifications/notification/git.py +3 -1
  271. mlrun/utils/notifications/notification/ipython.py +3 -1
  272. mlrun/utils/notifications/notification/slack.py +101 -21
  273. mlrun/utils/notifications/notification/webhook.py +11 -1
  274. mlrun/utils/notifications/notification_pusher.py +155 -30
  275. mlrun/utils/retryer.py +208 -0
  276. mlrun/utils/singleton.py +1 -1
  277. mlrun/utils/v3io_clients.py +2 -4
  278. mlrun/utils/version/version.json +2 -2
  279. mlrun/utils/version/version.py +2 -6
  280. {mlrun-1.6.4rc2.dist-info → mlrun-1.7.0rc20.dist-info}/METADATA +31 -19
  281. mlrun-1.7.0rc20.dist-info/RECORD +353 -0
  282. mlrun/kfpops.py +0 -868
  283. mlrun/model_monitoring/batch.py +0 -1095
  284. mlrun/model_monitoring/stores/models/__init__.py +0 -27
  285. mlrun/model_monitoring/stores/sql_model_endpoint_store.py +0 -384
  286. mlrun/platforms/other.py +0 -306
  287. mlrun-1.6.4rc2.dist-info/RECORD +0 -314
  288. {mlrun-1.6.4rc2.dist-info → mlrun-1.7.0rc20.dist-info}/LICENSE +0 -0
  289. {mlrun-1.6.4rc2.dist-info → mlrun-1.7.0rc20.dist-info}/WHEEL +0 -0
  290. {mlrun-1.6.4rc2.dist-info → mlrun-1.7.0rc20.dist-info}/entry_points.txt +0 -0
  291. {mlrun-1.6.4rc2.dist-info → mlrun-1.7.0rc20.dist-info}/top_level.txt +0 -0
mlrun/datastore/base.py CHANGED
@@ -144,6 +144,10 @@ class DataStore:
     def url(self):
         return f"{self.kind}://{self.endpoint}"
 
+    @property
+    def spark_url(self):
+        return self.url
+
     def get(self, key, size=None, offset=0):
         pass
 
@@ -175,11 +179,23 @@ class DataStore:
         return {}
 
     @staticmethod
-    def _parquet_reader(df_module, url, file_system, time_column, start_time, end_time):
+    def _parquet_reader(
+        df_module,
+        url,
+        file_system,
+        time_column,
+        start_time,
+        end_time,
+        additional_filters,
+    ):
         from storey.utils import find_filters, find_partitions
 
         def set_filters(
-            partitions_time_attributes, start_time_inner, end_time_inner, kwargs
+            partitions_time_attributes,
+            start_time_inner,
+            end_time_inner,
+            filters_inner,
+            kwargs,
         ):
             filters = []
             find_filters(
@@ -189,20 +205,23 @@ class DataStore:
                 filters,
                 time_column,
             )
+            if filters and filters_inner:
+                filters[0] += filters_inner
+
             kwargs["filters"] = filters
 
         def reader(*args, **kwargs):
-            if start_time or end_time:
-                if time_column is None:
-                    raise mlrun.errors.MLRunInvalidArgumentError(
-                        "When providing start_time or end_time, must provide time_column"
-                    )
-
+            if time_column is None and (start_time or end_time):
+                raise mlrun.errors.MLRunInvalidArgumentError(
+                    "When providing start_time or end_time, must provide time_column"
+                )
+            if start_time or end_time or additional_filters:
                 partitions_time_attributes = find_partitions(url, file_system)
                 set_filters(
                     partitions_time_attributes,
                     start_time,
                     end_time,
+                    additional_filters,
                     kwargs,
                 )
                 try:
@@ -213,6 +232,7 @@ class DataStore:
                 ):
                     raise ex
 
+                # TODO: fix timezone issue (ML-6308)
                 if start_time.tzinfo:
                     start_time_inner = start_time.replace(tzinfo=None)
                     end_time_inner = end_time.replace(tzinfo=None)
@@ -224,6 +244,7 @@ class DataStore:
                     partitions_time_attributes,
                     start_time_inner,
                     end_time_inner,
+                    additional_filters,
                     kwargs,
                 )
             return df_module.read_parquet(*args, **kwargs)
@@ -242,6 +263,7 @@ class DataStore:
         start_time=None,
         end_time=None,
         time_column=None,
+        additional_filters=None,
         **kwargs,
     ):
         df_module = df_module or pd
@@ -306,7 +328,13 @@ class DataStore:
             kwargs["columns"] = columns
 
             reader = self._parquet_reader(
-                df_module, url, file_system, time_column, start_time, end_time
+                df_module,
+                url,
+                file_system,
+                time_column,
+                start_time,
+                end_time,
+                additional_filters,
             )
 
         elif file_url.endswith(".json") or format == "json":
@@ -317,31 +345,17 @@ class DataStore:
             raise Exception(f"File type unhandled {url}")
 
         if file_system:
-            if (
-                self.supports_isdir()
-                and file_system.isdir(file_url)
-                or self._is_dd(df_module)
-            ):
-                storage_options = self.get_storage_options()
-                if url.startswith("ds://"):
-                    parsed_url = urllib.parse.urlparse(url)
-                    url = parsed_url.path
-                    if self.using_bucket:
-                        url = url[1:]
-                    # Pass the underlying file system
-                    kwargs["filesystem"] = file_system
-                elif storage_options:
-                    kwargs["storage_options"] = storage_options
-                df = reader(url, **kwargs)
-            else:
-                file = url
-                # Workaround for ARROW-12472 affecting pyarrow 3.x and 4.x.
-                if file_system.protocol != "file":
-                    # If not dir, use file_system.open() to avoid regression when pandas < 1.2 and does not
-                    # support the storage_options parameter.
-                    file = file_system.open(url)
-
-                df = reader(file, **kwargs)
+            storage_options = self.get_storage_options()
+            if url.startswith("ds://"):
+                parsed_url = urllib.parse.urlparse(url)
+                url = parsed_url.path
+                if self.using_bucket:
+                    url = url[1:]
+                # Pass the underlying file system
+                kwargs["filesystem"] = file_system
+            elif storage_options:
+                kwargs["storage_options"] = storage_options
+            df = reader(url, **kwargs)
         else:
             temp_file = tempfile.NamedTemporaryFile(delete=False)
             self.download(self._join(subpath), temp_file.name)
@@ -399,14 +413,15 @@ class DataItem:
 
 
         # reading run results using DataItem (run.artifact())
-        train_run = train_iris_func.run(inputs={'dataset': dataset},
-                                        params={'label_column': 'label'})
+        train_run = train_iris_func.run(
+            inputs={"dataset": dataset}, params={"label_column": "label"}
+        )
 
-        train_run.artifact('confusion-matrix').show()
-        test_set = train_run.artifact('test_set').as_df()
+        train_run.artifact("confusion-matrix").show()
+        test_set = train_run.artifact("test_set").as_df()
 
         # create and use DataItem from uri
-        data = mlrun.get_dataitem('http://xyz/data.json').get()
+        data = mlrun.get_dataitem("http://xyz/data.json").get()
     """
 
     def __init__(
@@ -548,6 +563,7 @@ class DataItem:
         time_column=None,
         start_time=None,
         end_time=None,
+        additional_filters=None,
         **kwargs,
     ):
         """return a dataframe object (generated from the dataitem).
@@ -559,6 +575,12 @@ class DataItem:
         :param end_time:    filters out data after this time
         :param time_column: Store timestamp_key will be used if None.
                             The results will be filtered by this column and start_time & end_time.
+        :param additional_filters: List of additional_filter conditions as tuples.
+                            Each tuple should be in the format (column_name, operator, value).
+                            Supported operators: "=", ">=", "<=", ">", "<".
+                            Example: [("Product", "=", "Computer")]
+                            For all supported filters, please see:
+                            https://arrow.apache.org/docs/python/generated/pyarrow.parquet.ParquetDataset.html
         """
         df = self._store.as_df(
             self._url,
@@ -569,6 +591,7 @@ class DataItem:
             time_column=time_column,
             start_time=start_time,
             end_time=end_time,
+            additional_filters=additional_filters,
             **kwargs,
         )
         return df
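
The new `additional_filters` parameter threads from `DataItem.as_df()` through `DataStore.as_df()` into the parquet reader, where the tuples are appended to the time-based partition filters before `read_parquet` is called. A minimal caller-side sketch; the URL and column names here are hypothetical:

```python
from datetime import datetime

import mlrun

# Hypothetical artifact URL and column names, for illustration only.
item = mlrun.get_dataitem("s3://my-bucket/sales.parquet")

# Each tuple is (column_name, operator, value); supported operators are
# "=", ">=", "<=", ">", "<" (pyarrow ParquetDataset filter syntax).
df = item.as_df(
    time_column="timestamp",
    start_time=datetime(2024, 1, 1),
    end_time=datetime(2024, 6, 30),
    additional_filters=[("Product", "=", "Computer")],
)
```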

mlrun/datastore/datastore.py CHANGED
@@ -94,6 +94,14 @@ def schema_to_store(schema):
         from .dbfs_store import DBFSStore
 
         return DBFSStore
+    elif schema == "hdfs":
+        from .hdfs import HdfsStore
+
+        return HdfsStore
+    elif schema == "oss":
+        from .alibaba_oss import OSSStore
+
+        return OSSStore
     else:
         raise ValueError(f"unsupported store scheme ({schema})")
 
@@ -170,7 +178,7 @@ class StoreManager:
             raise mlrun.errors.MLRunInvalidArgumentError(
                 f"resource {url} does not have a valid/persistent offline target"
             )
-        return resource, target
+        return resource, target or ""
 
     def object(
         self, url, key="", project="", allow_empty_resources=None, secrets: dict = None
@@ -182,14 +190,21 @@ class StoreManager:
             url, project, allow_empty_resources, secrets
         )
 
-        store, subpath = self.get_or_create_store(
+        store, subpath, url = self.get_or_create_store(
             url, secrets=secrets, project_name=project
         )
-        return DataItem(key, store, subpath, url, meta=meta, artifact_url=artifact_url)
+        return DataItem(
+            key,
+            store,
+            subpath,
+            url,
+            meta=meta,
+            artifact_url=artifact_url,
+        )
 
     def get_or_create_store(
         self, url, secrets: dict = None, project_name=""
-    ) -> (DataStore, str):
+    ) -> (DataStore, str, str):
         schema, endpoint, parsed_url = parse_url(url)
         subpath = parsed_url.path
         store_key = f"{schema}://{endpoint}"
@@ -206,17 +221,22 @@ class StoreManager:
 
         if schema == "memory":
             subpath = url[len("memory://") :]
-            return in_memory_store, subpath
+            return in_memory_store, subpath, url
+
+        elif schema in get_local_file_schema():
+            # parse_url() will drop the windows drive-letter from the path for url like "c:\a\b".
+            # As a workaround, we set subpath to the url.
+            subpath = url.replace("file://", "", 1)
 
         if not schema and endpoint:
             if endpoint in self._stores.keys():
-                return self._stores[endpoint], subpath
+                return self._stores[endpoint], subpath, url
             else:
                 raise ValueError(f"no such store ({endpoint})")
 
         if not secrets and not mlrun.config.is_running_as_api():
             if store_key in self._stores.keys():
-                return self._stores[store_key], subpath
+                return self._stores[store_key], subpath, url
 
         # support u/p embedding in url (as done in redis) by setting netloc as the "endpoint" parameter
         # when running on server we don't cache the datastore, because there are multiple users and we don't want to
@@ -226,8 +246,7 @@ class StoreManager:
         )
         if not secrets and not mlrun.config.is_running_as_api():
             self._stores[store_key] = store
-        # in file stores in windows path like c:\a\b the drive letter is dropped from the path, so we return the url
-        return store, url if store.kind == "file" else subpath
+        return store, subpath, url
 
     def reset_secrets(self):
         self._secrets = {}
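
`get_or_create_store()` now returns a `(store, subpath, url)` triple instead of `(store, subpath)`, moving the Windows drive-letter workaround into the lookup itself rather than the return value. A sketch of unpacking the new signature; this is internal API, and `store_manager` is assumed to be the module-level `StoreManager` instance:

```python
import mlrun.datastore

# Callers that unpacked two values must be updated to unpack three.
store, subpath, url = mlrun.datastore.store_manager.get_or_create_store(
    "s3://my-bucket/data.csv"
)
```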

mlrun/datastore/datastore_profile.py CHANGED
@@ -16,6 +16,7 @@ import ast
 import base64
 import json
 import typing
+import warnings
 from urllib.parse import ParseResult, urlparse, urlunparse
 
 import pydantic
@@ -30,12 +31,13 @@ from ..secrets import get_secret_or_env
 class DatastoreProfile(pydantic.BaseModel):
     type: str
     name: str
-    _private_attributes: typing.List = ()
+    _private_attributes: list = ()
 
     class Config:
         extra = pydantic.Extra.forbid
 
     @pydantic.validator("name")
+    @classmethod
     def lower_case(cls, v):
         return v.lower()
 
@@ -68,6 +70,9 @@ class TemporaryClientDatastoreProfiles(metaclass=mlrun.utils.singleton.Singleton
     def get(self, key):
         return self._data.get(key, None)
 
+    def remove(self, key):
+        self._data.pop(key, None)
+
 
 class DatastoreProfileBasic(DatastoreProfile):
     type: str = pydantic.Field("basic")
@@ -79,13 +84,37 @@ class DatastoreProfileBasic(DatastoreProfile):
 class DatastoreProfileKafkaTarget(DatastoreProfile):
     type: str = pydantic.Field("kafka_target")
     _private_attributes = "kwargs_private"
-    bootstrap_servers: str
+    bootstrap_servers: typing.Optional[str] = None
+    brokers: typing.Optional[str] = None
     topic: str
-    kwargs_public: typing.Optional[typing.Dict]
-    kwargs_private: typing.Optional[typing.Dict]
+    kwargs_public: typing.Optional[dict]
+    kwargs_private: typing.Optional[dict]
+
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+
+        if not self.brokers and not self.bootstrap_servers:
+            raise mlrun.errors.MLRunInvalidArgumentError(
+                "DatastoreProfileKafkaTarget requires the 'brokers' field to be set"
+            )
+
+        if self.bootstrap_servers:
+            if self.brokers:
+                raise mlrun.errors.MLRunInvalidArgumentError(
+                    "DatastoreProfileKafkaTarget cannot be created with both 'brokers' and 'bootstrap_servers'"
+                )
+            else:
+                self.brokers = self.bootstrap_servers
+                self.bootstrap_servers = None
+                warnings.warn(
+                    "'bootstrap_servers' parameter is deprecated in 1.7.0 and will be removed in 1.9.0, "
+                    "use 'brokers' instead.",
+                    # TODO: Remove this in 1.9.0
+                    FutureWarning,
+                )
 
     def attributes(self):
-        attributes = {"bootstrap_servers": self.bootstrap_servers}
+        attributes = {"brokers": self.brokers or self.bootstrap_servers}
         if self.kwargs_public:
             attributes = merge(attributes, self.kwargs_public)
         if self.kwargs_private:
@@ -96,15 +125,15 @@ class DatastoreProfileKafkaTarget(DatastoreProfile):
 class DatastoreProfileKafkaSource(DatastoreProfile):
     type: str = pydantic.Field("kafka_source")
     _private_attributes = ("kwargs_private", "sasl_user", "sasl_pass")
-    brokers: typing.Union[str, typing.List[str]]
-    topics: typing.Union[str, typing.List[str]]
+    brokers: typing.Union[str, list[str]]
+    topics: typing.Union[str, list[str]]
     group: typing.Optional[str] = "serving"
     initial_offset: typing.Optional[str] = "earliest"
-    partitions: typing.Optional[typing.Union[str, typing.List[str]]]
+    partitions: typing.Optional[typing.Union[str, list[str]]]
     sasl_user: typing.Optional[str]
     sasl_pass: typing.Optional[str]
-    kwargs_public: typing.Optional[typing.Dict]
-    kwargs_private: typing.Optional[typing.Dict]
+    kwargs_public: typing.Optional[dict]
+    kwargs_private: typing.Optional[dict]
 
     def attributes(self):
         attributes = {}
@@ -132,6 +161,22 @@ class DatastoreProfileKafkaSource(DatastoreProfile):
         return attributes
 
 
+class DatastoreProfileV3io(DatastoreProfile):
+    type: str = pydantic.Field("v3io")
+    v3io_access_key: typing.Optional[str] = None
+    _private_attributes = "v3io_access_key"
+
+    def url(self, subpath):
+        subpath = subpath.lstrip("/")
+        return f"v3io:///{subpath}"
+
+    def secrets(self) -> dict:
+        res = {}
+        if self.v3io_access_key:
+            res["V3IO_ACCESS_KEY"] = self.v3io_access_key
+        return res
+
+
 class DatastoreProfileS3(DatastoreProfile):
     type: str = pydantic.Field("s3")
     _private_attributes = ("access_key_id", "secret_key")
@@ -141,6 +186,18 @@ class DatastoreProfileS3(DatastoreProfile):
     assume_role_arn: typing.Optional[str] = None
     access_key_id: typing.Optional[str] = None
     secret_key: typing.Optional[str] = None
+    bucket: typing.Optional[str] = None
+
+    @pydantic.validator("bucket")
+    @classmethod
+    def check_bucket(cls, v):
+        if not v:
+            warnings.warn(
+                "The 'bucket' attribute will be mandatory starting from version 1.9",
+                FutureWarning,
+                stacklevel=2,
+            )
+        return v
 
     def secrets(self) -> dict:
         res = {}
@@ -156,10 +213,16 @@ class DatastoreProfileS3(DatastoreProfile):
             res["AWS_PROFILE"] = self.profile_name
         if self.assume_role_arn:
             res["MLRUN_AWS_ROLE_ARN"] = self.assume_role_arn
-        return res if res else None
+        return res
 
     def url(self, subpath):
-        return f"s3:/{subpath}"
+        # TODO: There is an inconsistency with DatastoreProfileGCS. In DatastoreProfileGCS,
+        # we assume that the subpath can begin without a '/' character,
+        # while here we assume it always starts with one.
+        if self.bucket:
+            return f"s3://{self.bucket}{subpath}"
+        else:
+            return f"s3:/{subpath}"
 
 
 class DatastoreProfileRedis(DatastoreProfile):
@@ -199,7 +262,7 @@ class DatastoreProfileRedis(DatastoreProfile):
             res["REDIS_USER"] = self.username
         if self.password:
             res["REDIS_PASSWORD"] = self.password
-        return res if res else None
+        return res
 
     def url(self, subpath):
         return self.endpoint_url + subpath
@@ -220,26 +283,44 @@ class DatastoreProfileDBFS(DatastoreProfile):
             res["DATABRICKS_TOKEN"] = self.token
         if self.endpoint_url:
             res["DATABRICKS_HOST"] = self.endpoint_url
-        return res if res else None
+        return res
 
 
 class DatastoreProfileGCS(DatastoreProfile):
     type: str = pydantic.Field("gcs")
     _private_attributes = ("gcp_credentials",)
     credentials_path: typing.Optional[str] = None  # path to file.
-    gcp_credentials: typing.Optional[typing.Union[str, typing.Dict]] = None
+    gcp_credentials: typing.Optional[typing.Union[str, dict]] = None
+    bucket: typing.Optional[str] = None
+
+    @pydantic.validator("bucket")
+    @classmethod
+    def check_bucket(cls, v):
+        if not v:
+            warnings.warn(
+                "The 'bucket' attribute will be mandatory starting from version 1.9",
+                FutureWarning,
+                stacklevel=2,
+            )
+        return v
 
     @pydantic.validator("gcp_credentials", pre=True, always=True)
+    @classmethod
     def convert_dict_to_json(cls, v):
         if isinstance(v, dict):
             return json.dumps(v)
         return v
 
     def url(self, subpath) -> str:
+        # TODO: but there's something wrong with the subpath being assumed to not start with a slash here,
+        # but the opposite assumption is made in S3.
         if subpath.startswith("/"):
             # in gcs the path after schema is starts with bucket, wherefore it should not start with "/".
             subpath = subpath[1:]
-        return f"gcs://{subpath}"
+        if self.bucket:
+            return f"gcs://{self.bucket}/{subpath}"
+        else:
+            return f"gcs://{subpath}"
 
     def secrets(self) -> dict:
         res = {}
@@ -247,7 +328,7 @@ class DatastoreProfileGCS(DatastoreProfile):
             res["GOOGLE_APPLICATION_CREDENTIALS"] = self.credentials_path
         if self.gcp_credentials:
             res["GCP_CREDENTIALS"] = self.gcp_credentials
-        return res if res else None
+        return res
 
 
 class DatastoreProfileAzureBlob(DatastoreProfile):
@@ -267,12 +348,27 @@ class DatastoreProfileAzureBlob(DatastoreProfile):
     client_secret: typing.Optional[str] = None
     sas_token: typing.Optional[str] = None
     credential: typing.Optional[str] = None
+    container: typing.Optional[str] = None
+
+    @pydantic.validator("container")
+    @classmethod
+    def check_container(cls, v):
+        if not v:
+            warnings.warn(
+                "The 'container' attribute will be mandatory starting from version 1.9",
+                FutureWarning,
+                stacklevel=2,
+            )
+        return v
 
     def url(self, subpath) -> str:
         if subpath.startswith("/"):
-            # in azure the path after schema is starts with bucket, wherefore it should not start with "/".
+            # in azure the path after schema is starts with container, wherefore it should not start with "/".
             subpath = subpath[1:]
-        return f"az://{subpath}"
+        if self.container:
+            return f"az://{self.container}/{subpath}"
+        else:
+            return f"az://{subpath}"
 
     def secrets(self) -> dict:
         res = {}
@@ -292,7 +388,31 @@ class DatastoreProfileAzureBlob(DatastoreProfile):
             res["sas_token"] = self.sas_token
         if self.credential:
             res["credential"] = self.credential
-        return res if res else None
+        return res
+
+
+class DatastoreProfileHdfs(DatastoreProfile):
+    type: str = pydantic.Field("hdfs")
+    _private_attributes = "token"
+    host: typing.Optional[str] = None
+    port: typing.Optional[int] = None
+    http_port: typing.Optional[int] = None
+    user: typing.Optional[str] = None
+
+    def secrets(self) -> dict:
+        res = {}
+        if self.host:
+            res["HDFS_HOST"] = self.host
+        if self.port:
+            res["HDFS_PORT"] = self.port
+        if self.port:
+            res["HDFS_HTTP_PORT"] = self.http_port
+        if self.user:
+            res["HDFS_USER"] = self.user
+        return res or None
+
+    def url(self, subpath):
+        return f"hdfs://{self.host}:{self.http_port}{subpath}"
 
 
 class DatastoreProfile2Json(pydantic.BaseModel):
@@ -346,6 +466,7 @@ class DatastoreProfile2Json(pydantic.BaseModel):
         decoded_dict = {k: safe_literal_eval(v) for k, v in decoded_dict.items()}
         datastore_type = decoded_dict.get("type")
         ds_profile_factory = {
+            "v3io": DatastoreProfileV3io,
             "s3": DatastoreProfileS3,
             "redis": DatastoreProfileRedis,
             "basic": DatastoreProfileBasic,
@@ -354,6 +475,7 @@ class DatastoreProfile2Json(pydantic.BaseModel):
             "dbfs": DatastoreProfileDBFS,
             "gcs": DatastoreProfileGCS,
             "az": DatastoreProfileAzureBlob,
+            "hdfs": DatastoreProfileHdfs,
         }
         if datastore_type in ds_profile_factory:
             return ds_profile_factory[datastore_type].parse_obj(decoded_dict)
@@ -418,3 +540,7 @@ def register_temporary_client_datastore_profile(profile: DatastoreProfile):
     It's beneficial for testing purposes.
     """
     TemporaryClientDatastoreProfiles().add(profile)
+
+
+def remove_temporary_client_datastore_profile(profile_name: str):
+    TemporaryClientDatastoreProfiles().remove(profile_name)
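
Taken together, these profile changes add `v3io` and `hdfs` profile types, optional `bucket`/`container` attributes that warn until they become mandatory in 1.9, a `brokers` field replacing the deprecated `bootstrap_servers`, and a matching `remove_temporary_client_datastore_profile()` helper. A hedged client-side sketch with made-up names and placeholder credentials:

```python
from mlrun.datastore.datastore_profile import (
    DatastoreProfileKafkaTarget,
    DatastoreProfileS3,
    register_temporary_client_datastore_profile,
    remove_temporary_client_datastore_profile,
)

# 'bucket' is optional for now but emits a FutureWarning when omitted.
s3_profile = DatastoreProfileS3(
    name="my-s3",
    bucket="my-bucket",
    access_key_id="<access-key-id>",  # placeholder credentials
    secret_key="<secret-key>",
)
assert s3_profile.url("/path/file.parquet") == "s3://my-bucket/path/file.parquet"

# 'bootstrap_servers' still works but warns; 'brokers' is the new spelling.
kafka_profile = DatastoreProfileKafkaTarget(
    name="my-kafka", brokers="broker1:9092", topic="events"
)

register_temporary_client_datastore_profile(s3_profile)
# ... use ds://my-s3/... URLs in sources and targets ...
remove_temporary_client_datastore_profile("my-s3")
```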

mlrun/datastore/filestore.py CHANGED
@@ -105,4 +105,3 @@ class FileStore(DataStore):
                 return
             except FileExistsError:
                 time.sleep(0.1)
-                pass

mlrun/datastore/google_cloud_storage.py CHANGED
@@ -132,13 +132,13 @@ class GoogleCloudStorageStore(DataStore):
         self.filesystem.rm(path=path, recursive=recursive, maxdepth=maxdepth)
 
     def get_spark_options(self):
-        res = None
+        res = {}
         st = self.get_storage_options()
         if "token" in st:
             res = {"spark.hadoop.google.cloud.auth.service.account.enable": "true"}
             if isinstance(st["token"], str):
                 # Token is a filename, read json from it
-                with open(st["token"], "r") as file:
+                with open(st["token"]) as file:
                     credentials = json.load(file)
             else:
                 # Token is a dictionary, use it directly
@@ -161,3 +161,7 @@ class GoogleCloudStorageStore(DataStore):
         if "client_id" in credentials:
             res["spark.hadoop.fs.gs.client.id"] = credentials["client_id"]
         return res
+
+    @property
+    def spark_url(self):
+        return f"gs://{self.endpoint}"

mlrun/datastore/hdfs.py ADDED
@@ -0,0 +1,56 @@
+# Copyright 2024 Iguazio
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import os
+from urllib.parse import urlparse
+
+import fsspec
+
+from mlrun.datastore.base import DataStore
+
+
+class HdfsStore(DataStore):
+    def __init__(self, parent, schema, name, endpoint="", secrets: dict = None):
+        super().__init__(parent, name, schema, endpoint, secrets)
+
+        self.host = self._get_secret_or_env("HDFS_HOST")
+        self.port = self._get_secret_or_env("HDFS_PORT")
+        self.http_port = self._get_secret_or_env("HDFS_HTTP_PORT")
+        self.user = self._get_secret_or_env("HDFS_USER")
+        if not self.user:
+            self.user = os.environ.get("HADOOP_USER_NAME", os.environ.get("USER"))
+
+        self._filesystem = None
+
+    @property
+    def filesystem(self):
+        if not self._filesystem:
+            self._filesystem = fsspec.filesystem(
+                "webhdfs",
+                host=self.host,
+                port=self.http_port,
+                user=self.user,
+            )
+        return self._filesystem
+
+    @property
+    def url(self):
+        return f"webhdfs://{self.host}:{self.http_port}"
+
+    @property
+    def spark_url(self):
+        return f"hdfs://{self.host}:{self.port}"
+
+    def rm(self, url, recursive=False, maxdepth=None):
+        path = urlparse(url).path
+        self.filesystem.rm(path=path, recursive=recursive, maxdepth=maxdepth)
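
The new `HdfsStore` resolves its connection details from secrets or environment variables and talks to the cluster over WebHDFS (fsspec's `webhdfs` filesystem), while `spark_url` points Spark at the native `hdfs://` RPC port. A sketch pairing it with the new `hdfs` profile; host and ports are placeholders:

```python
import mlrun
from mlrun.datastore.datastore_profile import (
    DatastoreProfileHdfs,
    register_temporary_client_datastore_profile,
)

# http_port is the WebHDFS (REST) port used for reads and writes;
# port is the native RPC port that spark_url uses.
profile = DatastoreProfileHdfs(
    name="my-hdfs",
    host="namenode.example.com",
    port=8020,
    http_port=9870,
    user="hdfs",
)
register_temporary_client_datastore_profile(profile)

df = mlrun.get_dataitem("ds://my-hdfs/data/sales.parquet").as_df()
```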
mlrun/datastore/inmem.py CHANGED
@@ -80,8 +80,8 @@ class InMemoryStore(DataStore):
             reader = df_module.read_json
         else:
             raise mlrun.errors.MLRunInvalidArgumentError(f"file type unhandled {url}")
-        # InMemoryStore store do not filter on time
-        for field in ["time_column", "start_time", "end_time"]:
+        # InMemoryStore store don't pass filters
+        for field in ["time_column", "start_time", "end_time", "additional_filters"]:
             kwargs.pop(field, None)
 
         return reader(item, **kwargs)