PyPI - mlrun - Versions diffs - 1.6.4rc7__py3-none-any.whl → 1.7.0__py3-none-any.whl - Mend

mlrun 1.6.4rc7__py3-none-any.whl → 1.7.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of mlrun might be problematic. Click here for more details.

Files changed (305)

mlrun/__init__.py +11 -1
mlrun/__main__.py +40 -122
mlrun/alerts/__init__.py +15 -0
mlrun/alerts/alert.py +248 -0
mlrun/api/schemas/__init__.py +5 -4
mlrun/artifacts/__init__.py +8 -3
mlrun/artifacts/base.py +47 -257
mlrun/artifacts/dataset.py +11 -192
mlrun/artifacts/manager.py +79 -47
mlrun/artifacts/model.py +31 -159
mlrun/artifacts/plots.py +23 -380
mlrun/common/constants.py +74 -1
mlrun/common/db/sql_session.py +5 -5
mlrun/common/formatters/__init__.py +21 -0
mlrun/common/formatters/artifact.py +45 -0
mlrun/common/formatters/base.py +113 -0
mlrun/common/formatters/feature_set.py +33 -0
mlrun/common/formatters/function.py +46 -0
mlrun/common/formatters/pipeline.py +53 -0
mlrun/common/formatters/project.py +51 -0
mlrun/common/formatters/run.py +29 -0
mlrun/common/helpers.py +12 -3
mlrun/common/model_monitoring/helpers.py +9 -5
mlrun/{runtimes → common/runtimes}/constants.py +37 -9
mlrun/common/schemas/__init__.py +31 -5
mlrun/common/schemas/alert.py +202 -0
mlrun/common/schemas/api_gateway.py +196 -0
mlrun/common/schemas/artifact.py +25 -4
mlrun/common/schemas/auth.py +16 -5
mlrun/common/schemas/background_task.py +1 -1
mlrun/common/schemas/client_spec.py +4 -2
mlrun/common/schemas/common.py +7 -4
mlrun/common/schemas/constants.py +3 -0
mlrun/common/schemas/feature_store.py +74 -44
mlrun/common/schemas/frontend_spec.py +15 -7
mlrun/common/schemas/function.py +12 -1
mlrun/common/schemas/hub.py +11 -18
mlrun/common/schemas/memory_reports.py +2 -2
mlrun/common/schemas/model_monitoring/__init__.py +20 -4
mlrun/common/schemas/model_monitoring/constants.py +123 -42
mlrun/common/schemas/model_monitoring/grafana.py +13 -9
mlrun/common/schemas/model_monitoring/model_endpoints.py +101 -54
mlrun/common/schemas/notification.py +71 -14
mlrun/common/schemas/object.py +2 -2
mlrun/{model_monitoring/controller_handler.py → common/schemas/pagination.py} +9 -12
mlrun/common/schemas/pipeline.py +8 -1
mlrun/common/schemas/project.py +69 -18
mlrun/common/schemas/runs.py +7 -1
mlrun/common/schemas/runtime_resource.py +8 -12
mlrun/common/schemas/schedule.py +4 -4
mlrun/common/schemas/tag.py +1 -2
mlrun/common/schemas/workflow.py +12 -4
mlrun/common/types.py +14 -1
mlrun/config.py +154 -69
mlrun/data_types/data_types.py +6 -1
mlrun/data_types/spark.py +2 -2
mlrun/data_types/to_pandas.py +67 -37
mlrun/datastore/__init__.py +6 -8
mlrun/datastore/alibaba_oss.py +131 -0
mlrun/datastore/azure_blob.py +143 -42
mlrun/datastore/base.py +102 -58
mlrun/datastore/datastore.py +34 -13
mlrun/datastore/datastore_profile.py +146 -20
mlrun/datastore/dbfs_store.py +3 -7
mlrun/datastore/filestore.py +1 -4
mlrun/datastore/google_cloud_storage.py +97 -33
mlrun/datastore/hdfs.py +56 -0
mlrun/datastore/inmem.py +6 -3
mlrun/datastore/redis.py +7 -2
mlrun/datastore/s3.py +34 -12
mlrun/datastore/snowflake_utils.py +45 -0
mlrun/datastore/sources.py +303 -111
mlrun/datastore/spark_utils.py +31 -2
mlrun/datastore/store_resources.py +9 -7
mlrun/datastore/storeytargets.py +151 -0
mlrun/datastore/targets.py +453 -176
mlrun/datastore/utils.py +72 -58
mlrun/datastore/v3io.py +6 -1
mlrun/db/base.py +274 -41
mlrun/db/factory.py +1 -1
mlrun/db/httpdb.py +893 -225
mlrun/db/nopdb.py +291 -33
mlrun/errors.py +36 -6
mlrun/execution.py +115 -42
mlrun/feature_store/__init__.py +0 -2
mlrun/feature_store/api.py +65 -73
mlrun/feature_store/common.py +7 -12
mlrun/feature_store/feature_set.py +76 -55
mlrun/feature_store/feature_vector.py +39 -31
mlrun/feature_store/ingestion.py +7 -6
mlrun/feature_store/retrieval/base.py +16 -11
mlrun/feature_store/retrieval/dask_merger.py +2 -0
mlrun/feature_store/retrieval/job.py +13 -4
mlrun/feature_store/retrieval/local_merger.py +2 -0
mlrun/feature_store/retrieval/spark_merger.py +24 -32
mlrun/feature_store/steps.py +45 -34
mlrun/features.py +11 -21
mlrun/frameworks/_common/artifacts_library.py +9 -9
mlrun/frameworks/_common/mlrun_interface.py +5 -5
mlrun/frameworks/_common/model_handler.py +48 -48
mlrun/frameworks/_common/plan.py +5 -6
mlrun/frameworks/_common/producer.py +3 -4
mlrun/frameworks/_common/utils.py +5 -5
mlrun/frameworks/_dl_common/loggers/logger.py +6 -7
mlrun/frameworks/_dl_common/loggers/mlrun_logger.py +9 -9
mlrun/frameworks/_dl_common/loggers/tensorboard_logger.py +23 -47
mlrun/frameworks/_ml_common/artifacts_library.py +1 -2
mlrun/frameworks/_ml_common/loggers/logger.py +3 -4
mlrun/frameworks/_ml_common/loggers/mlrun_logger.py +4 -5
mlrun/frameworks/_ml_common/model_handler.py +24 -24
mlrun/frameworks/_ml_common/pkl_model_server.py +2 -2
mlrun/frameworks/_ml_common/plan.py +2 -2
mlrun/frameworks/_ml_common/plans/calibration_curve_plan.py +2 -3
mlrun/frameworks/_ml_common/plans/confusion_matrix_plan.py +2 -3
mlrun/frameworks/_ml_common/plans/dataset_plan.py +3 -3
mlrun/frameworks/_ml_common/plans/feature_importance_plan.py +3 -3
mlrun/frameworks/_ml_common/plans/roc_curve_plan.py +4 -4
mlrun/frameworks/_ml_common/utils.py +4 -4
mlrun/frameworks/auto_mlrun/auto_mlrun.py +9 -9
mlrun/frameworks/huggingface/model_server.py +4 -4
mlrun/frameworks/lgbm/__init__.py +33 -33
mlrun/frameworks/lgbm/callbacks/callback.py +2 -4
mlrun/frameworks/lgbm/callbacks/logging_callback.py +4 -5
mlrun/frameworks/lgbm/callbacks/mlrun_logging_callback.py +4 -5
mlrun/frameworks/lgbm/mlrun_interfaces/booster_mlrun_interface.py +1 -3
mlrun/frameworks/lgbm/mlrun_interfaces/mlrun_interface.py +6 -6
mlrun/frameworks/lgbm/model_handler.py +10 -10
mlrun/frameworks/lgbm/model_server.py +6 -6
mlrun/frameworks/lgbm/utils.py +5 -5
mlrun/frameworks/onnx/dataset.py +8 -8
mlrun/frameworks/onnx/mlrun_interface.py +3 -3
mlrun/frameworks/onnx/model_handler.py +6 -6
mlrun/frameworks/onnx/model_server.py +7 -7
mlrun/frameworks/parallel_coordinates.py +6 -6
mlrun/frameworks/pytorch/__init__.py +18 -18
mlrun/frameworks/pytorch/callbacks/callback.py +4 -5
mlrun/frameworks/pytorch/callbacks/logging_callback.py +17 -17
mlrun/frameworks/pytorch/callbacks/mlrun_logging_callback.py +11 -11
mlrun/frameworks/pytorch/callbacks/tensorboard_logging_callback.py +23 -29
mlrun/frameworks/pytorch/callbacks_handler.py +38 -38
mlrun/frameworks/pytorch/mlrun_interface.py +20 -20
mlrun/frameworks/pytorch/model_handler.py +17 -17
mlrun/frameworks/pytorch/model_server.py +7 -7
mlrun/frameworks/sklearn/__init__.py +13 -13
mlrun/frameworks/sklearn/estimator.py +4 -4
mlrun/frameworks/sklearn/metrics_library.py +14 -14
mlrun/frameworks/sklearn/mlrun_interface.py +16 -9
mlrun/frameworks/sklearn/model_handler.py +2 -2
mlrun/frameworks/tf_keras/__init__.py +10 -7
mlrun/frameworks/tf_keras/callbacks/logging_callback.py +15 -15
mlrun/frameworks/tf_keras/callbacks/mlrun_logging_callback.py +11 -11
mlrun/frameworks/tf_keras/callbacks/tensorboard_logging_callback.py +19 -23
mlrun/frameworks/tf_keras/mlrun_interface.py +9 -11
mlrun/frameworks/tf_keras/model_handler.py +14 -14
mlrun/frameworks/tf_keras/model_server.py +6 -6
mlrun/frameworks/xgboost/__init__.py +13 -13
mlrun/frameworks/xgboost/model_handler.py +6 -6
mlrun/k8s_utils.py +61 -17
mlrun/launcher/__init__.py +1 -1
mlrun/launcher/base.py +16 -15
mlrun/launcher/client.py +13 -11
mlrun/launcher/factory.py +1 -1
mlrun/launcher/local.py +23 -13
mlrun/launcher/remote.py +17 -10
mlrun/lists.py +7 -6
mlrun/model.py +478 -103
mlrun/model_monitoring/__init__.py +1 -1
mlrun/model_monitoring/api.py +163 -371
mlrun/{runtimes/mpijob/v1alpha1.py → model_monitoring/applications/__init__.py} +9 -15
mlrun/model_monitoring/applications/_application_steps.py +188 -0
mlrun/model_monitoring/applications/base.py +108 -0
mlrun/model_monitoring/applications/context.py +341 -0
mlrun/model_monitoring/{evidently_application.py → applications/evidently_base.py} +27 -22
mlrun/model_monitoring/applications/histogram_data_drift.py +354 -0
mlrun/model_monitoring/applications/results.py +99 -0
mlrun/model_monitoring/controller.py +131 -278
mlrun/model_monitoring/db/__init__.py +18 -0
mlrun/model_monitoring/db/stores/__init__.py +136 -0
mlrun/model_monitoring/db/stores/base/__init__.py +15 -0
mlrun/model_monitoring/db/stores/base/store.py +213 -0
mlrun/model_monitoring/db/stores/sqldb/__init__.py +13 -0
mlrun/model_monitoring/db/stores/sqldb/models/__init__.py +71 -0
mlrun/model_monitoring/db/stores/sqldb/models/base.py +190 -0
mlrun/model_monitoring/db/stores/sqldb/models/mysql.py +103 -0
mlrun/model_monitoring/{stores/models/mysql.py → db/stores/sqldb/models/sqlite.py} +19 -13
mlrun/model_monitoring/db/stores/sqldb/sql_store.py +659 -0
mlrun/model_monitoring/db/stores/v3io_kv/__init__.py +13 -0
mlrun/model_monitoring/db/stores/v3io_kv/kv_store.py +726 -0
mlrun/model_monitoring/db/tsdb/__init__.py +105 -0
mlrun/model_monitoring/db/tsdb/base.py +448 -0
mlrun/model_monitoring/db/tsdb/helpers.py +30 -0
mlrun/model_monitoring/db/tsdb/tdengine/__init__.py +15 -0
mlrun/model_monitoring/db/tsdb/tdengine/schemas.py +279 -0
mlrun/model_monitoring/db/tsdb/tdengine/stream_graph_steps.py +42 -0
mlrun/model_monitoring/db/tsdb/tdengine/tdengine_connector.py +507 -0
mlrun/model_monitoring/db/tsdb/v3io/__init__.py +15 -0
mlrun/model_monitoring/db/tsdb/v3io/stream_graph_steps.py +158 -0
mlrun/model_monitoring/db/tsdb/v3io/v3io_connector.py +849 -0
mlrun/model_monitoring/features_drift_table.py +134 -106
mlrun/model_monitoring/helpers.py +199 -55
mlrun/model_monitoring/metrics/__init__.py +13 -0
mlrun/model_monitoring/metrics/histogram_distance.py +127 -0
mlrun/model_monitoring/model_endpoint.py +3 -2
mlrun/model_monitoring/stream_processing.py +131 -398
mlrun/model_monitoring/tracking_policy.py +9 -2
mlrun/model_monitoring/writer.py +161 -125
mlrun/package/__init__.py +6 -6
mlrun/package/context_handler.py +5 -5
mlrun/package/packager.py +7 -7
mlrun/package/packagers/default_packager.py +8 -8
mlrun/package/packagers/numpy_packagers.py +15 -15
mlrun/package/packagers/pandas_packagers.py +5 -5
mlrun/package/packagers/python_standard_library_packagers.py +10 -10
mlrun/package/packagers_manager.py +19 -23
mlrun/package/utils/_formatter.py +6 -6
mlrun/package/utils/_pickler.py +2 -2
mlrun/package/utils/_supported_format.py +4 -4
mlrun/package/utils/log_hint_utils.py +2 -2
mlrun/package/utils/type_hint_utils.py +4 -9
mlrun/platforms/__init__.py +11 -10
mlrun/platforms/iguazio.py +24 -203
mlrun/projects/operations.py +52 -25
mlrun/projects/pipelines.py +191 -197
mlrun/projects/project.py +1227 -400
mlrun/render.py +16 -19
mlrun/run.py +209 -184
mlrun/runtimes/__init__.py +83 -15
mlrun/runtimes/base.py +51 -35
mlrun/runtimes/daskjob.py +17 -10
mlrun/runtimes/databricks_job/databricks_cancel_task.py +1 -1
mlrun/runtimes/databricks_job/databricks_runtime.py +8 -7
mlrun/runtimes/databricks_job/databricks_wrapper.py +1 -1
mlrun/runtimes/funcdoc.py +1 -29
mlrun/runtimes/function_reference.py +1 -1
mlrun/runtimes/kubejob.py +34 -128
mlrun/runtimes/local.py +40 -11
mlrun/runtimes/mpijob/__init__.py +0 -20
mlrun/runtimes/mpijob/abstract.py +9 -10
mlrun/runtimes/mpijob/v1.py +1 -1
mlrun/{model_monitoring/stores/models/sqlite.py → runtimes/nuclio/__init__.py} +7 -9
mlrun/runtimes/nuclio/api_gateway.py +769 -0
mlrun/runtimes/nuclio/application/__init__.py +15 -0
mlrun/runtimes/nuclio/application/application.py +758 -0
mlrun/runtimes/nuclio/application/reverse_proxy.go +95 -0
mlrun/runtimes/{function.py → nuclio/function.py} +200 -83
mlrun/runtimes/{nuclio.py → nuclio/nuclio.py} +6 -6
mlrun/runtimes/{serving.py → nuclio/serving.py} +65 -68
mlrun/runtimes/pod.py +281 -101
mlrun/runtimes/remotesparkjob.py +12 -9
mlrun/runtimes/sparkjob/spark3job.py +67 -51
mlrun/runtimes/utils.py +41 -75
mlrun/secrets.py +9 -5
mlrun/serving/__init__.py +8 -1
mlrun/serving/remote.py +2 -7
mlrun/serving/routers.py +85 -69
mlrun/serving/server.py +69 -44
mlrun/serving/states.py +209 -36
mlrun/serving/utils.py +22 -14
mlrun/serving/v1_serving.py +6 -7
mlrun/serving/v2_serving.py +129 -54
mlrun/track/tracker.py +2 -1
mlrun/track/tracker_manager.py +3 -3
mlrun/track/trackers/mlflow_tracker.py +6 -2
mlrun/utils/async_http.py +6 -8
mlrun/utils/azure_vault.py +1 -1
mlrun/utils/clones.py +1 -2
mlrun/utils/condition_evaluator.py +3 -3
mlrun/utils/db.py +21 -3
mlrun/utils/helpers.py +405 -225
mlrun/utils/http.py +3 -6
mlrun/utils/logger.py +112 -16
mlrun/utils/notifications/notification/__init__.py +17 -13
mlrun/utils/notifications/notification/base.py +50 -2
mlrun/utils/notifications/notification/console.py +2 -0
mlrun/utils/notifications/notification/git.py +24 -1
mlrun/utils/notifications/notification/ipython.py +3 -1
mlrun/utils/notifications/notification/slack.py +96 -21
mlrun/utils/notifications/notification/webhook.py +59 -2
mlrun/utils/notifications/notification_pusher.py +149 -30
mlrun/utils/regex.py +9 -0
mlrun/utils/retryer.py +208 -0
mlrun/utils/singleton.py +1 -1
mlrun/utils/v3io_clients.py +4 -6
mlrun/utils/version/version.json +2 -2
mlrun/utils/version/version.py +2 -6
mlrun-1.7.0.dist-info/METADATA +378 -0
mlrun-1.7.0.dist-info/RECORD +351 -0
{mlrun-1.6.4rc7.dist-info → mlrun-1.7.0.dist-info}/WHEEL +1 -1
mlrun/feature_store/retrieval/conversion.py +0 -273
mlrun/kfpops.py +0 -868
mlrun/model_monitoring/application.py +0 -310
mlrun/model_monitoring/batch.py +0 -1095
mlrun/model_monitoring/prometheus.py +0 -219
mlrun/model_monitoring/stores/__init__.py +0 -111
mlrun/model_monitoring/stores/kv_model_endpoint_store.py +0 -576
mlrun/model_monitoring/stores/model_endpoint_store.py +0 -147
mlrun/model_monitoring/stores/models/__init__.py +0 -27
mlrun/model_monitoring/stores/models/base.py +0 -84
mlrun/model_monitoring/stores/sql_model_endpoint_store.py +0 -384
mlrun/platforms/other.py +0 -306
mlrun-1.6.4rc7.dist-info/METADATA +0 -272
mlrun-1.6.4rc7.dist-info/RECORD +0 -314
{mlrun-1.6.4rc7.dist-info → mlrun-1.7.0.dist-info}/LICENSE +0 -0
{mlrun-1.6.4rc7.dist-info → mlrun-1.7.0.dist-info}/entry_points.txt +0 -0
{mlrun-1.6.4rc7.dist-info → mlrun-1.7.0.dist-info}/top_level.txt +0 -0

mlrun/datastore/base.py CHANGED Viewed

@@ -24,13 +24,12 @@ import pandas as pd
 import pyarrow
 import pytz
 import requests
-import urllib3
 from deprecated import deprecated
 import mlrun.config
 import mlrun.errors
 from mlrun.errors import err_to_str
-from mlrun.utils import StorePrefix, is_ipython, logger
+from mlrun.utils import StorePrefix, is_jupyter, logger
 from .store_resources import is_store_uri, parse_store_uri
 from .utils import filter_df_start_end_time, select_columns_from_df
@@ -144,6 +143,10 @@ class DataStore:
     def url(self):
         return f"{self.kind}://{self.endpoint}"
+    @property
+    def spark_url(self):
+        return self.url
     def get(self, key, size=None, offset=0):
         pass
@@ -153,6 +156,18 @@ class DataStore:
     def put(self, key, data, append=False):
         pass
+    def _prepare_put_data(self, data, append=False):
+        mode = "a" if append else "w"
+        if isinstance(data, bytearray):
+            data = bytes(data)
+        if isinstance(data, bytes):
+            return data, f"{mode}b"
+        elif isinstance(data, str):
+            return data, mode
+        else:
+            raise TypeError(f"Unable to put a value of type {type(self).__name__}")
     def stat(self, key):
         pass
@@ -175,11 +190,23 @@ class DataStore:
         return {}
     @staticmethod
-    def _parquet_reader(df_module, url, file_system, time_column, start_time, end_time):
+    def _parquet_reader(
+        df_module,
+        url,
+        file_system,
+        time_column,
+        start_time,
+        end_time,
+        additional_filters,
+    ):
         from storey.utils import find_filters, find_partitions
         def set_filters(
-            partitions_time_attributes, start_time_inner, end_time_inner, kwargs
+            partitions_time_attributes,
+            start_time_inner,
+            end_time_inner,
+            filters_inner,
+            kwargs,
         ):
             filters = []
             find_filters(
@@ -189,20 +216,32 @@ class DataStore:
                 filters,
                 time_column,
             )
+            if filters and filters_inner:
+                filters[0] += filters_inner
             kwargs["filters"] = filters
         def reader(*args, **kwargs):
-            if start_time or end_time:
-                if time_column is None:
-                    raise mlrun.errors.MLRunInvalidArgumentError(
-                        "When providing start_time or end_time, must provide time_column"
-                    )
+            if time_column is None and (start_time or end_time):
+                raise mlrun.errors.MLRunInvalidArgumentError(
+                    "When providing start_time or end_time, must provide time_column"
+                )
+            if (
+                start_time
+                and end_time
+                and start_time.utcoffset() != end_time.utcoffset()
+            ):
+                raise mlrun.errors.MLRunInvalidArgumentError(
+                    "start_time and end_time must have the same time zone"
+                )
+            if start_time or end_time or additional_filters:
                 partitions_time_attributes = find_partitions(url, file_system)
                 set_filters(
                     partitions_time_attributes,
                     start_time,
                     end_time,
+                    additional_filters,
                     kwargs,
                 )
                 try:
@@ -213,17 +252,23 @@ class DataStore:
                     ):
                         raise ex
-                    if start_time.tzinfo:
-                        start_time_inner = start_time.replace(tzinfo=None)
-                        end_time_inner = end_time.replace(tzinfo=None)
-                    else:
-                        start_time_inner = start_time.replace(tzinfo=pytz.utc)
-                        end_time_inner = end_time.replace(tzinfo=pytz.utc)
+                    start_time_inner = None
+                    if start_time:
+                        start_time_inner = start_time.replace(
+                            tzinfo=None if start_time.tzinfo else pytz.utc
+                        )
+                    end_time_inner = None
+                    if end_time:
+                        end_time_inner = end_time.replace(
+                            tzinfo=None if end_time.tzinfo else pytz.utc
+                        )
                     set_filters(
                         partitions_time_attributes,
                         start_time_inner,
                         end_time_inner,
+                        additional_filters,
                         kwargs,
                     )
                     return df_module.read_parquet(*args, **kwargs)
@@ -242,6 +287,7 @@ class DataStore:
         start_time=None,
         end_time=None,
         time_column=None,
+        additional_filters=None,
         **kwargs,
     ):
         df_module = df_module or pd
@@ -297,16 +343,18 @@ class DataStore:
                                 dfs.append(df_module.read_csv(*updated_args, **kwargs))
                         return df_module.concat(dfs)
-        elif (
-            file_url.endswith(".parquet")
-            or file_url.endswith(".pq")
-            or format == "parquet"
-        ):
+        elif mlrun.utils.helpers.is_parquet_file(file_url, format):
             if columns:
                 kwargs["columns"] = columns
             reader = self._parquet_reader(
-                df_module, url, file_system, time_column, start_time, end_time
+                df_module,
+                url,
+                file_system,
+                time_column,
+                start_time,
+                end_time,
+                additional_filters,
             )
         elif file_url.endswith(".json") or format == "json":
@@ -317,31 +365,17 @@ class DataStore:
             raise Exception(f"File type unhandled {url}")
         if file_system:
-            if (
-                self.supports_isdir()
-                and file_system.isdir(file_url)
-                or self._is_dd(df_module)
-            ):
-                storage_options = self.get_storage_options()
-                if url.startswith("ds://"):
-                    parsed_url = urllib.parse.urlparse(url)
-                    url = parsed_url.path
-                    if self.using_bucket:
-                        url = url[1:]
-                    # Pass the underlying file system
-                    kwargs["filesystem"] = file_system
-                elif storage_options:
-                    kwargs["storage_options"] = storage_options
-                df = reader(url, **kwargs)
-            else:
-                file = url
-                # Workaround for ARROW-12472 affecting pyarrow 3.x and 4.x.
-                if file_system.protocol != "file":
-                    # If not dir, use file_system.open() to avoid regression when pandas < 1.2 and does not
-                    # support the storage_options parameter.
-                    file = file_system.open(url)
-                df = reader(file, **kwargs)
+            storage_options = self.get_storage_options()
+            if url.startswith("ds://"):
+                parsed_url = urllib.parse.urlparse(url)
+                url = parsed_url.path
+                if self.using_bucket:
+                    url = url[1:]
+                # Pass the underlying file system
+                kwargs["filesystem"] = file_system
+            elif storage_options:
+                kwargs["storage_options"] = storage_options
+            df = reader(url, **kwargs)
         else:
             temp_file = tempfile.NamedTemporaryFile(delete=False)
             self.download(self._join(subpath), temp_file.name)
@@ -372,7 +406,10 @@ class DataStore:
         }
     def rm(self, path, recursive=False, maxdepth=None):
-        self.filesystem.rm(path=path, recursive=recursive, maxdepth=maxdepth)
+        try:
+            self.filesystem.rm(path=path, recursive=recursive, maxdepth=maxdepth)
+        except FileNotFoundError:
+            pass
     @staticmethod
     def _is_dd(df_module):
@@ -399,14 +436,15 @@ class DataItem:
         # reading run results using DataItem (run.artifact())
-        train_run = train_iris_func.run(inputs={'dataset': dataset},
-                                        params={'label_column': 'label'})
+        train_run = train_iris_func.run(
+            inputs={"dataset": dataset}, params={"label_column": "label"}
+        )
-        train_run.artifact('confusion-matrix').show()
-        test_set = train_run.artifact('test_set').as_df()
+        train_run.artifact("confusion-matrix").show()
+        test_set = train_run.artifact("test_set").as_df()
         # create and use DataItem from uri
-        data = mlrun.get_dataitem('http://xyz/data.json').get()
+        data = mlrun.get_dataitem("http://xyz/data.json").get()
     """
     def __init__(
@@ -548,6 +586,7 @@ class DataItem:
         time_column=None,
         start_time=None,
         end_time=None,
+        additional_filters=None,
         **kwargs,
     ):
         """return a dataframe object (generated from the dataitem).
@@ -559,6 +598,12 @@ class DataItem:
         :param end_time:    filters out data after this time
         :param time_column: Store timestamp_key will be used if None.
                             The results will be filtered by this column and start_time & end_time.
+        :param additional_filters: List of additional_filter conditions as tuples.
+                                    Each tuple should be in the format (column_name, operator, value).
+                                    Supported operators: "=", ">=", "<=", ">", "<".
+                                    Example: [("Product", "=", "Computer")]
+                                    For all supported filters, please see:
+                                    https://arrow.apache.org/docs/python/generated/pyarrow.parquet.ParquetDataset.html
         """
         df = self._store.as_df(
             self._url,
@@ -569,18 +614,19 @@ class DataItem:
             time_column=time_column,
             start_time=start_time,
             end_time=end_time,
+            additional_filters=additional_filters,
             **kwargs,
         )
         return df
-    def show(self, format=None):
+    def show(self, format: Optional[str] = None) -> None:
         """show the data object content in Jupyter
         :param format: format to use (when there is no/wrong suffix), e.g. 'png'
         """
-        if not is_ipython:
+        if not is_jupyter:
             logger.warning(
-                "Jupyter/IPython was not detected, .show() will only display inside Jupyter"
+                "Jupyter was not detected. `.show()` displays only inside Jupyter."
             )
             return
@@ -698,8 +744,6 @@ class HttpStore(DataStore):
         verify_ssl = mlconf.httpdb.http.verify
         try:
-            if not verify_ssl:
-                urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
             response = requests.get(url, headers=headers, auth=auth, verify=verify_ssl)
         except OSError as exc:
             raise OSError(f"error: cannot connect to {url}: {err_to_str(exc)}")
@@ -713,7 +757,7 @@ class HttpStore(DataStore):
 # As an example, it converts an S3 URL 's3://s3bucket/path' to just 's3bucket/path'.
 # Since 'ds' schemas are not inherently processed by fsspec, we have adapted the _strip_protocol()
 # method specifically to strip away the 'ds' schema as required.
-def makeDatastoreSchemaSanitizer(cls, using_bucket=False, *args, **kwargs):
+def make_datastore_schema_sanitizer(cls, using_bucket=False, *args, **kwargs):
     if not issubclass(cls, fsspec.AbstractFileSystem):
         raise ValueError("Class must be a subclass of fsspec.AbstractFileSystem")

mlrun/datastore/datastore.py CHANGED Viewed

@@ -21,7 +21,7 @@ from mlrun.datastore.datastore_profile import datastore_profile_read
 from mlrun.errors import err_to_str
 from mlrun.utils.helpers import get_local_file_schema
-from ..utils import DB_SCHEMA, run_keys
+from ..utils import DB_SCHEMA, RunKeys
 from .base import DataItem, DataStore, HttpStore
 from .filestore import FileStore
 from .inmem import InMemoryStore
@@ -32,6 +32,8 @@ in_memory_store = InMemoryStore()
 def parse_url(url):
+    if url and url.startswith("v3io://") and not url.startswith("v3io:///"):
+        url = url.replace("v3io://", "v3io:///", 1)
     parsed_url = urlparse(url)
     schema = parsed_url.scheme.lower()
     endpoint = parsed_url.hostname
@@ -94,6 +96,14 @@ def schema_to_store(schema):
         from .dbfs_store import DBFSStore
         return DBFSStore
+    elif schema in ["hdfs", "webhdfs"]:
+        from .hdfs import HdfsStore
+        return HdfsStore
+    elif schema == "oss":
+        from .alibaba_oss import OSSStore
+        return OSSStore
     else:
         raise ValueError(f"unsupported store scheme ({schema})")
@@ -125,7 +135,7 @@ class StoreManager:
         return self._db
     def from_dict(self, struct: dict):
-        stor_list = struct.get(run_keys.data_stores)
+        stor_list = struct.get(RunKeys.data_stores)
         if stor_list and isinstance(stor_list, list):
             for stor in stor_list:
                 schema, endpoint, parsed_url = parse_url(stor.get("url"))
@@ -137,7 +147,7 @@ class StoreManager:
                 self._stores[stor["name"]] = new_stor
     def to_dict(self, struct):
-        struct[run_keys.data_stores] = [
+        struct[RunKeys.data_stores] = [
             stor.to_dict() for stor in self._stores.values() if stor.from_spec
         ]
@@ -170,7 +180,7 @@ class StoreManager:
             raise mlrun.errors.MLRunInvalidArgumentError(
                 f"resource {url} does not have a valid/persistent offline target"
             )
-        return resource, target
+        return resource, target or ""
     def object(
         self, url, key="", project="", allow_empty_resources=None, secrets: dict = None
@@ -182,17 +192,24 @@ class StoreManager:
                 url, project, allow_empty_resources, secrets
             )
-        store, subpath = self.get_or_create_store(
+        store, subpath, url = self.get_or_create_store(
             url, secrets=secrets, project_name=project
         )
-        return DataItem(key, store, subpath, url, meta=meta, artifact_url=artifact_url)
+        return DataItem(
+            key,
+            store,
+            subpath,
+            url,
+            meta=meta,
+            artifact_url=artifact_url,
+        )
     def get_or_create_store(
         self, url, secrets: dict = None, project_name=""
-    ) -> (DataStore, str):
+    ) -> (DataStore, str, str):
         schema, endpoint, parsed_url = parse_url(url)
         subpath = parsed_url.path
-        store_key = f"{schema}://{endpoint}"
+        store_key = f"{schema}://{endpoint}" if endpoint else f"{schema}://"
         if schema == "ds":
             datastore_profile = datastore_profile_read(url, project_name, secrets)
@@ -206,17 +223,22 @@ class StoreManager:
         if schema == "memory":
             subpath = url[len("memory://") :]
-            return in_memory_store, subpath
+            return in_memory_store, subpath, url
+        elif schema in get_local_file_schema():
+            # parse_url() will drop the windows drive-letter from the path for url like "c:\a\b".
+            # As a workaround, we set subpath to the url.
+            subpath = url.replace("file://", "", 1)
         if not schema and endpoint:
             if endpoint in self._stores.keys():
-                return self._stores[endpoint], subpath
+                return self._stores[endpoint], subpath, url
             else:
                 raise ValueError(f"no such store ({endpoint})")
         if not secrets and not mlrun.config.is_running_as_api():
             if store_key in self._stores.keys():
-                return self._stores[store_key], subpath
+                return self._stores[store_key], subpath, url
         # support u/p embedding in url (as done in redis) by setting netloc as the "endpoint" parameter
         # when running on server we don't cache the datastore, because there are multiple users and we don't want to
@@ -226,8 +248,7 @@ class StoreManager:
         )
         if not secrets and not mlrun.config.is_running_as_api():
             self._stores[store_key] = store
-        # in file stores in windows path like c:\a\b the drive letter is dropped from the path, so we return the url
-        return store, url if store.kind == "file" else subpath
+        return store, subpath, url
     def reset_secrets(self):
         self._secrets = {}

mlrun 1.6.4rc7__py3-none-any.whl → 1.7.0__py3-none-any.whl

Potentially problematic release.

mlrun 1.6.4rc7__py3-none-any.whl → 1.7.0__py3-none-any.whl