mlrun 1.7.0rc4__py3-none-any.whl → 1.7.2__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as they appear in their public registry. It is provided for informational purposes only.

Potentially problematic release: this version of mlrun might be problematic.

Files changed (235)
  1. mlrun/__init__.py +11 -1
  2. mlrun/__main__.py +39 -121
  3. mlrun/{datastore/helpers.py → alerts/__init__.py} +2 -5
  4. mlrun/alerts/alert.py +248 -0
  5. mlrun/api/schemas/__init__.py +4 -3
  6. mlrun/artifacts/__init__.py +8 -3
  7. mlrun/artifacts/base.py +39 -254
  8. mlrun/artifacts/dataset.py +9 -190
  9. mlrun/artifacts/manager.py +73 -46
  10. mlrun/artifacts/model.py +30 -158
  11. mlrun/artifacts/plots.py +23 -380
  12. mlrun/common/constants.py +73 -1
  13. mlrun/common/db/sql_session.py +3 -2
  14. mlrun/common/formatters/__init__.py +21 -0
  15. mlrun/common/formatters/artifact.py +46 -0
  16. mlrun/common/formatters/base.py +113 -0
  17. mlrun/common/formatters/feature_set.py +44 -0
  18. mlrun/common/formatters/function.py +46 -0
  19. mlrun/common/formatters/pipeline.py +53 -0
  20. mlrun/common/formatters/project.py +51 -0
  21. mlrun/common/formatters/run.py +29 -0
  22. mlrun/common/helpers.py +11 -1
  23. mlrun/{runtimes → common/runtimes}/constants.py +32 -4
  24. mlrun/common/schemas/__init__.py +31 -4
  25. mlrun/common/schemas/alert.py +202 -0
  26. mlrun/common/schemas/api_gateway.py +196 -0
  27. mlrun/common/schemas/artifact.py +28 -1
  28. mlrun/common/schemas/auth.py +13 -2
  29. mlrun/common/schemas/client_spec.py +2 -1
  30. mlrun/common/schemas/common.py +7 -4
  31. mlrun/common/schemas/constants.py +3 -0
  32. mlrun/common/schemas/feature_store.py +58 -28
  33. mlrun/common/schemas/frontend_spec.py +8 -0
  34. mlrun/common/schemas/function.py +11 -0
  35. mlrun/common/schemas/hub.py +7 -9
  36. mlrun/common/schemas/model_monitoring/__init__.py +21 -4
  37. mlrun/common/schemas/model_monitoring/constants.py +136 -42
  38. mlrun/common/schemas/model_monitoring/grafana.py +9 -5
  39. mlrun/common/schemas/model_monitoring/model_endpoints.py +89 -41
  40. mlrun/common/schemas/notification.py +69 -12
  41. mlrun/{runtimes/mpijob/v1alpha1.py → common/schemas/pagination.py} +10 -13
  42. mlrun/common/schemas/pipeline.py +7 -0
  43. mlrun/common/schemas/project.py +67 -16
  44. mlrun/common/schemas/runs.py +17 -0
  45. mlrun/common/schemas/schedule.py +1 -1
  46. mlrun/common/schemas/workflow.py +10 -2
  47. mlrun/common/types.py +14 -1
  48. mlrun/config.py +233 -58
  49. mlrun/data_types/data_types.py +11 -1
  50. mlrun/data_types/spark.py +5 -4
  51. mlrun/data_types/to_pandas.py +75 -34
  52. mlrun/datastore/__init__.py +8 -10
  53. mlrun/datastore/alibaba_oss.py +131 -0
  54. mlrun/datastore/azure_blob.py +131 -43
  55. mlrun/datastore/base.py +107 -47
  56. mlrun/datastore/datastore.py +17 -7
  57. mlrun/datastore/datastore_profile.py +91 -7
  58. mlrun/datastore/dbfs_store.py +3 -7
  59. mlrun/datastore/filestore.py +1 -3
  60. mlrun/datastore/google_cloud_storage.py +92 -32
  61. mlrun/datastore/hdfs.py +5 -0
  62. mlrun/datastore/inmem.py +6 -3
  63. mlrun/datastore/redis.py +3 -2
  64. mlrun/datastore/s3.py +30 -12
  65. mlrun/datastore/snowflake_utils.py +45 -0
  66. mlrun/datastore/sources.py +274 -59
  67. mlrun/datastore/spark_utils.py +30 -0
  68. mlrun/datastore/store_resources.py +9 -7
  69. mlrun/datastore/storeytargets.py +151 -0
  70. mlrun/datastore/targets.py +387 -119
  71. mlrun/datastore/utils.py +68 -5
  72. mlrun/datastore/v3io.py +28 -50
  73. mlrun/db/auth_utils.py +152 -0
  74. mlrun/db/base.py +245 -20
  75. mlrun/db/factory.py +1 -4
  76. mlrun/db/httpdb.py +909 -231
  77. mlrun/db/nopdb.py +279 -14
  78. mlrun/errors.py +35 -5
  79. mlrun/execution.py +111 -38
  80. mlrun/feature_store/__init__.py +0 -2
  81. mlrun/feature_store/api.py +46 -53
  82. mlrun/feature_store/common.py +6 -11
  83. mlrun/feature_store/feature_set.py +48 -23
  84. mlrun/feature_store/feature_vector.py +13 -2
  85. mlrun/feature_store/ingestion.py +7 -6
  86. mlrun/feature_store/retrieval/base.py +9 -4
  87. mlrun/feature_store/retrieval/dask_merger.py +2 -0
  88. mlrun/feature_store/retrieval/job.py +13 -4
  89. mlrun/feature_store/retrieval/local_merger.py +2 -0
  90. mlrun/feature_store/retrieval/spark_merger.py +24 -32
  91. mlrun/feature_store/steps.py +38 -19
  92. mlrun/features.py +6 -14
  93. mlrun/frameworks/_common/plan.py +3 -3
  94. mlrun/frameworks/_dl_common/loggers/tensorboard_logger.py +7 -12
  95. mlrun/frameworks/_ml_common/plan.py +1 -1
  96. mlrun/frameworks/auto_mlrun/auto_mlrun.py +2 -2
  97. mlrun/frameworks/lgbm/__init__.py +1 -1
  98. mlrun/frameworks/lgbm/callbacks/callback.py +2 -4
  99. mlrun/frameworks/lgbm/model_handler.py +1 -1
  100. mlrun/frameworks/parallel_coordinates.py +4 -4
  101. mlrun/frameworks/pytorch/__init__.py +2 -2
  102. mlrun/frameworks/sklearn/__init__.py +1 -1
  103. mlrun/frameworks/sklearn/mlrun_interface.py +13 -3
  104. mlrun/frameworks/tf_keras/__init__.py +5 -2
  105. mlrun/frameworks/tf_keras/callbacks/logging_callback.py +1 -1
  106. mlrun/frameworks/tf_keras/mlrun_interface.py +2 -2
  107. mlrun/frameworks/xgboost/__init__.py +1 -1
  108. mlrun/k8s_utils.py +57 -12
  109. mlrun/launcher/__init__.py +1 -1
  110. mlrun/launcher/base.py +6 -5
  111. mlrun/launcher/client.py +13 -11
  112. mlrun/launcher/factory.py +1 -1
  113. mlrun/launcher/local.py +15 -5
  114. mlrun/launcher/remote.py +10 -3
  115. mlrun/lists.py +6 -2
  116. mlrun/model.py +297 -48
  117. mlrun/model_monitoring/__init__.py +1 -1
  118. mlrun/model_monitoring/api.py +152 -357
  119. mlrun/model_monitoring/applications/__init__.py +10 -0
  120. mlrun/model_monitoring/applications/_application_steps.py +190 -0
  121. mlrun/model_monitoring/applications/base.py +108 -0
  122. mlrun/model_monitoring/applications/context.py +341 -0
  123. mlrun/model_monitoring/{evidently_application.py → applications/evidently_base.py} +27 -22
  124. mlrun/model_monitoring/applications/histogram_data_drift.py +227 -91
  125. mlrun/model_monitoring/applications/results.py +99 -0
  126. mlrun/model_monitoring/controller.py +130 -303
  127. mlrun/model_monitoring/{stores/models/sqlite.py → db/__init__.py} +5 -10
  128. mlrun/model_monitoring/db/stores/__init__.py +136 -0
  129. mlrun/model_monitoring/db/stores/base/__init__.py +15 -0
  130. mlrun/model_monitoring/db/stores/base/store.py +213 -0
  131. mlrun/model_monitoring/db/stores/sqldb/__init__.py +13 -0
  132. mlrun/model_monitoring/db/stores/sqldb/models/__init__.py +71 -0
  133. mlrun/model_monitoring/db/stores/sqldb/models/base.py +190 -0
  134. mlrun/model_monitoring/db/stores/sqldb/models/mysql.py +103 -0
  135. mlrun/model_monitoring/{stores/models/mysql.py → db/stores/sqldb/models/sqlite.py} +19 -13
  136. mlrun/model_monitoring/db/stores/sqldb/sql_store.py +659 -0
  137. mlrun/model_monitoring/db/stores/v3io_kv/__init__.py +13 -0
  138. mlrun/model_monitoring/db/stores/v3io_kv/kv_store.py +726 -0
  139. mlrun/model_monitoring/db/tsdb/__init__.py +105 -0
  140. mlrun/model_monitoring/db/tsdb/base.py +448 -0
  141. mlrun/model_monitoring/db/tsdb/helpers.py +30 -0
  142. mlrun/model_monitoring/db/tsdb/tdengine/__init__.py +15 -0
  143. mlrun/model_monitoring/db/tsdb/tdengine/schemas.py +298 -0
  144. mlrun/model_monitoring/db/tsdb/tdengine/stream_graph_steps.py +42 -0
  145. mlrun/model_monitoring/db/tsdb/tdengine/tdengine_connector.py +522 -0
  146. mlrun/model_monitoring/db/tsdb/v3io/__init__.py +15 -0
  147. mlrun/model_monitoring/db/tsdb/v3io/stream_graph_steps.py +158 -0
  148. mlrun/model_monitoring/db/tsdb/v3io/v3io_connector.py +849 -0
  149. mlrun/model_monitoring/features_drift_table.py +34 -22
  150. mlrun/model_monitoring/helpers.py +177 -39
  151. mlrun/model_monitoring/model_endpoint.py +3 -2
  152. mlrun/model_monitoring/stream_processing.py +165 -398
  153. mlrun/model_monitoring/tracking_policy.py +7 -1
  154. mlrun/model_monitoring/writer.py +161 -125
  155. mlrun/package/packagers/default_packager.py +2 -2
  156. mlrun/package/packagers_manager.py +1 -0
  157. mlrun/package/utils/_formatter.py +2 -2
  158. mlrun/platforms/__init__.py +11 -10
  159. mlrun/platforms/iguazio.py +67 -228
  160. mlrun/projects/__init__.py +6 -1
  161. mlrun/projects/operations.py +47 -20
  162. mlrun/projects/pipelines.py +396 -249
  163. mlrun/projects/project.py +1176 -406
  164. mlrun/render.py +28 -22
  165. mlrun/run.py +208 -181
  166. mlrun/runtimes/__init__.py +76 -11
  167. mlrun/runtimes/base.py +54 -24
  168. mlrun/runtimes/daskjob.py +9 -2
  169. mlrun/runtimes/databricks_job/databricks_runtime.py +1 -0
  170. mlrun/runtimes/databricks_job/databricks_wrapper.py +1 -1
  171. mlrun/runtimes/funcdoc.py +1 -29
  172. mlrun/runtimes/kubejob.py +34 -128
  173. mlrun/runtimes/local.py +39 -10
  174. mlrun/runtimes/mpijob/__init__.py +0 -20
  175. mlrun/runtimes/mpijob/abstract.py +8 -8
  176. mlrun/runtimes/mpijob/v1.py +1 -1
  177. mlrun/runtimes/nuclio/__init__.py +1 -0
  178. mlrun/runtimes/nuclio/api_gateway.py +769 -0
  179. mlrun/runtimes/nuclio/application/__init__.py +15 -0
  180. mlrun/runtimes/nuclio/application/application.py +758 -0
  181. mlrun/runtimes/nuclio/application/reverse_proxy.go +95 -0
  182. mlrun/runtimes/nuclio/function.py +188 -68
  183. mlrun/runtimes/nuclio/serving.py +57 -60
  184. mlrun/runtimes/pod.py +191 -58
  185. mlrun/runtimes/remotesparkjob.py +11 -8
  186. mlrun/runtimes/sparkjob/spark3job.py +17 -18
  187. mlrun/runtimes/utils.py +40 -73
  188. mlrun/secrets.py +6 -2
  189. mlrun/serving/__init__.py +8 -1
  190. mlrun/serving/remote.py +2 -3
  191. mlrun/serving/routers.py +89 -64
  192. mlrun/serving/server.py +54 -26
  193. mlrun/serving/states.py +187 -56
  194. mlrun/serving/utils.py +19 -11
  195. mlrun/serving/v2_serving.py +136 -63
  196. mlrun/track/tracker.py +2 -1
  197. mlrun/track/trackers/mlflow_tracker.py +5 -0
  198. mlrun/utils/async_http.py +26 -6
  199. mlrun/utils/db.py +18 -0
  200. mlrun/utils/helpers.py +375 -105
  201. mlrun/utils/http.py +2 -2
  202. mlrun/utils/logger.py +75 -9
  203. mlrun/utils/notifications/notification/__init__.py +14 -10
  204. mlrun/utils/notifications/notification/base.py +48 -0
  205. mlrun/utils/notifications/notification/console.py +2 -0
  206. mlrun/utils/notifications/notification/git.py +24 -1
  207. mlrun/utils/notifications/notification/ipython.py +2 -0
  208. mlrun/utils/notifications/notification/slack.py +96 -21
  209. mlrun/utils/notifications/notification/webhook.py +63 -2
  210. mlrun/utils/notifications/notification_pusher.py +146 -16
  211. mlrun/utils/regex.py +9 -0
  212. mlrun/utils/retryer.py +3 -2
  213. mlrun/utils/v3io_clients.py +2 -3
  214. mlrun/utils/version/version.json +2 -2
  215. mlrun-1.7.2.dist-info/METADATA +390 -0
  216. mlrun-1.7.2.dist-info/RECORD +351 -0
  217. {mlrun-1.7.0rc4.dist-info → mlrun-1.7.2.dist-info}/WHEEL +1 -1
  218. mlrun/feature_store/retrieval/conversion.py +0 -271
  219. mlrun/kfpops.py +0 -868
  220. mlrun/model_monitoring/application.py +0 -310
  221. mlrun/model_monitoring/batch.py +0 -974
  222. mlrun/model_monitoring/controller_handler.py +0 -37
  223. mlrun/model_monitoring/prometheus.py +0 -216
  224. mlrun/model_monitoring/stores/__init__.py +0 -111
  225. mlrun/model_monitoring/stores/kv_model_endpoint_store.py +0 -574
  226. mlrun/model_monitoring/stores/model_endpoint_store.py +0 -145
  227. mlrun/model_monitoring/stores/models/__init__.py +0 -27
  228. mlrun/model_monitoring/stores/models/base.py +0 -84
  229. mlrun/model_monitoring/stores/sql_model_endpoint_store.py +0 -382
  230. mlrun/platforms/other.py +0 -305
  231. mlrun-1.7.0rc4.dist-info/METADATA +0 -269
  232. mlrun-1.7.0rc4.dist-info/RECORD +0 -321
  233. {mlrun-1.7.0rc4.dist-info → mlrun-1.7.2.dist-info}/LICENSE +0 -0
  234. {mlrun-1.7.0rc4.dist-info → mlrun-1.7.2.dist-info}/entry_points.txt +0 -0
  235. {mlrun-1.7.0rc4.dist-info → mlrun-1.7.2.dist-info}/top_level.txt +0 -0
mlrun/datastore/base.py CHANGED
@@ -24,20 +24,16 @@ import pandas as pd
 import pyarrow
 import pytz
 import requests
-import urllib3
 from deprecated import deprecated

+import mlrun.config
 import mlrun.errors
 from mlrun.errors import err_to_str
-from mlrun.utils import StorePrefix, is_ipython, logger
+from mlrun.utils import StorePrefix, is_jupyter, logger

 from .store_resources import is_store_uri, parse_store_uri
 from .utils import filter_df_start_end_time, select_columns_from_df

-verify_ssl = False
-if not verify_ssl:
-    urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
-

 class FileStats:
     def __init__(self, size, modified, content_type=None):
@@ -160,6 +156,18 @@ class DataStore:
     def put(self, key, data, append=False):
         pass

+    def _prepare_put_data(self, data, append=False):
+        mode = "a" if append else "w"
+        if isinstance(data, bytearray):
+            data = bytes(data)
+
+        if isinstance(data, bytes):
+            return data, f"{mode}b"
+        elif isinstance(data, str):
+            return data, mode
+        else:
+            raise TypeError(f"Unable to put a value of type {type(self).__name__}")
+
     def stat(self, key):
         pass

@@ -182,11 +190,23 @@ class DataStore:
         return {}

     @staticmethod
-    def _parquet_reader(df_module, url, file_system, time_column, start_time, end_time):
+    def _parquet_reader(
+        df_module,
+        url,
+        file_system,
+        time_column,
+        start_time,
+        end_time,
+        additional_filters,
+    ):
         from storey.utils import find_filters, find_partitions

         def set_filters(
-            partitions_time_attributes, start_time_inner, end_time_inner, kwargs
+            partitions_time_attributes,
+            start_time_inner,
+            end_time_inner,
+            filters_inner,
+            kwargs,
         ):
             filters = []
             find_filters(
@@ -196,20 +216,32 @@ class DataStore:
                 filters,
                 time_column,
             )
+            if filters and filters_inner:
+                filters[0] += filters_inner
+
             kwargs["filters"] = filters

         def reader(*args, **kwargs):
-            if start_time or end_time:
-                if time_column is None:
-                    raise mlrun.errors.MLRunInvalidArgumentError(
-                        "When providing start_time or end_time, must provide time_column"
-                    )
+            if time_column is None and (start_time or end_time):
+                raise mlrun.errors.MLRunInvalidArgumentError(
+                    "When providing start_time or end_time, must provide time_column"
+                )
+            if (
+                start_time
+                and end_time
+                and start_time.utcoffset() != end_time.utcoffset()
+            ):
+                raise mlrun.errors.MLRunInvalidArgumentError(
+                    "start_time and end_time must have the same time zone"
+                )

+            if start_time or end_time or additional_filters:
                 partitions_time_attributes = find_partitions(url, file_system)
                 set_filters(
                     partitions_time_attributes,
                     start_time,
                     end_time,
+                    additional_filters,
                     kwargs,
                 )
                 try:
@@ -220,17 +252,23 @@ class DataStore:
                     ):
                         raise ex

-                    if start_time.tzinfo:
-                        start_time_inner = start_time.replace(tzinfo=None)
-                        end_time_inner = end_time.replace(tzinfo=None)
-                    else:
-                        start_time_inner = start_time.replace(tzinfo=pytz.utc)
-                        end_time_inner = end_time.replace(tzinfo=pytz.utc)
+                    start_time_inner = None
+                    if start_time:
+                        start_time_inner = start_time.replace(
+                            tzinfo=None if start_time.tzinfo else pytz.utc
+                        )
+
+                    end_time_inner = None
+                    if end_time:
+                        end_time_inner = end_time.replace(
+                            tzinfo=None if end_time.tzinfo else pytz.utc
+                        )

                     set_filters(
                         partitions_time_attributes,
                         start_time_inner,
                         end_time_inner,
+                        additional_filters,
                         kwargs,
                     )
                     return df_module.read_parquet(*args, **kwargs)
@@ -249,6 +287,7 @@ class DataStore:
         start_time=None,
         end_time=None,
         time_column=None,
+        additional_filters=None,
         **kwargs,
     ):
         df_module = df_module or pd
@@ -304,16 +343,18 @@ class DataStore:
             dfs.append(df_module.read_csv(*updated_args, **kwargs))
             return df_module.concat(dfs)

-        elif (
-            file_url.endswith(".parquet")
-            or file_url.endswith(".pq")
-            or format == "parquet"
-        ):
+        elif mlrun.utils.helpers.is_parquet_file(file_url, format):
            if columns:
                kwargs["columns"] = columns

            reader = self._parquet_reader(
-                df_module, url, file_system, time_column, start_time, end_time
+                df_module,
+                url,
+                file_system,
+                time_column,
+                start_time,
+                end_time,
+                additional_filters,
            )

        elif file_url.endswith(".json") or format == "json":
@@ -365,7 +406,10 @@ class DataStore:
         }

     def rm(self, path, recursive=False, maxdepth=None):
-        self.filesystem.rm(path=path, recursive=recursive, maxdepth=maxdepth)
+        try:
+            self.filesystem.rm(path=path, recursive=recursive, maxdepth=maxdepth)
+        except FileNotFoundError:
+            pass

     @staticmethod
     def _is_dd(df_module):
@@ -392,14 +436,15 @@ class DataItem:


        # reading run results using DataItem (run.artifact())
-       train_run = train_iris_func.run(inputs={'dataset': dataset},
-                                       params={'label_column': 'label'})
+       train_run = train_iris_func.run(
+           inputs={"dataset": dataset}, params={"label_column": "label"}
+       )

-       train_run.artifact('confusion-matrix').show()
-       test_set = train_run.artifact('test_set').as_df()
+       train_run.artifact("confusion-matrix").show()
+       test_set = train_run.artifact("test_set").as_df()

        # create and use DataItem from uri
-       data = mlrun.get_dataitem('http://xyz/data.json').get()
+       data = mlrun.get_dataitem("http://xyz/data.json").get()
    """

    def __init__(
@@ -541,6 +586,7 @@ class DataItem:
         time_column=None,
         start_time=None,
         end_time=None,
+        additional_filters=None,
         **kwargs,
     ):
         """return a dataframe object (generated from the dataitem).
@@ -552,6 +598,12 @@ class DataItem:
         :param end_time: filters out data after this time
         :param time_column: Store timestamp_key will be used if None.
                             The results will be filtered by this column and start_time & end_time.
+        :param additional_filters: List of additional_filter conditions as tuples.
+                                   Each tuple should be in the format (column_name, operator, value).
+                                   Supported operators: "=", ">=", "<=", ">", "<".
+                                   Example: [("Product", "=", "Computer")]
+                                   For all supported filters, please see:
+                                   https://arrow.apache.org/docs/python/generated/pyarrow.parquet.ParquetDataset.html
         """
         df = self._store.as_df(
             self._url,
@@ -562,18 +614,19 @@ class DataItem:
             time_column=time_column,
             start_time=start_time,
             end_time=end_time,
+            additional_filters=additional_filters,
             **kwargs,
         )
         return df

-    def show(self, format=None):
+    def show(self, format: Optional[str] = None) -> None:
         """show the data object content in Jupyter

         :param format: format to use (when there is no/wrong suffix), e.g. 'png'
         """
-        if not is_ipython:
+        if not is_jupyter:
            logger.warning(
-                "Jupyter/IPython was not detected, .show() will only display inside Jupyter"
+                "Jupyter was not detected. `.show()` displays only inside Jupyter."
            )
            return

@@ -633,17 +686,6 @@ def basic_auth_header(user, password):
     return {"Authorization": authstr}


-def http_get(url, headers=None, auth=None):
-    try:
-        response = requests.get(url, headers=headers, auth=auth, verify=verify_ssl)
-    except OSError as exc:
-        raise OSError(f"error: cannot connect to {url}: {err_to_str(exc)}")
-
-    mlrun.errors.raise_for_status(response)
-
-    return response.content
-
-
 class HttpStore(DataStore):
     def __init__(self, parent, schema, name, endpoint="", secrets: dict = None):
         super().__init__(parent, name, schema, endpoint, secrets)
@@ -671,7 +713,7 @@ class HttpStore(DataStore):
         raise ValueError("unimplemented")

     def get(self, key, size=None, offset=0):
-        data = http_get(self.url + self._join(key), self._headers, self.auth)
+        data = self._http_get(self.url + self._join(key), self._headers, self.auth)
         if offset:
             data = data[offset:]
         if size:
@@ -691,13 +733,31 @@ class HttpStore(DataStore):
                 f"schema as it is not secure and is not recommended."
             )

+    def _http_get(
+        self,
+        url,
+        headers=None,
+        auth=None,
+    ):
+        # import here to prevent import cycle
+        from mlrun.config import config as mlconf
+
+        verify_ssl = mlconf.httpdb.http.verify
+        try:
+            response = requests.get(url, headers=headers, auth=auth, verify=verify_ssl)
+        except OSError as exc:
+            raise OSError(f"error: cannot connect to {url}: {err_to_str(exc)}")
+
+        mlrun.errors.raise_for_status(response)
+        return response.content
+

 # This wrapper class is designed to extract the 'ds' schema and profile name from URL-formatted paths.
 # Within fsspec, the AbstractFileSystem::_strip_protocol() internal method is used to handle complete URL paths.
 # As an example, it converts an S3 URL 's3://s3bucket/path' to just 's3bucket/path'.
 # Since 'ds' schemas are not inherently processed by fsspec, we have adapted the _strip_protocol()
 # method specifically to strip away the 'ds' schema as required.
-def makeDatastoreSchemaSanitizer(cls, using_bucket=False, *args, **kwargs):
+def make_datastore_schema_sanitizer(cls, using_bucket=False, *args, **kwargs):
     if not issubclass(cls, fsspec.AbstractFileSystem):
         raise ValueError("Class must be a subclass of fsspec.AbstractFileSystem")

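The thread running through the base.py changes above is the new additional_filters argument: DataItem.as_df() and DataStore.as_df() now accept it and forward it to _parquet_reader(), where the tuples are merged into the filters passed to read_parquet(). A minimal usage sketch based on the docstring added in this diff; the artifact URL is hypothetical:

import mlrun

# Hypothetical parquet dataset; any store supported by mlrun.get_dataitem() works.
item = mlrun.get_dataitem("s3://my-bucket/sales.parquet")

# Keep only rows where Product == "Computer". Supported operators, per the
# docstring above: "=", ">=", "<=", ">", "<".
df = item.as_df(additional_filters=[("Product", "=", "Computer")])
print(df.head())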
mlrun/datastore/datastore.py CHANGED
@@ -21,7 +21,7 @@ from mlrun.datastore.datastore_profile import datastore_profile_read
 from mlrun.errors import err_to_str
 from mlrun.utils.helpers import get_local_file_schema

-from ..utils import DB_SCHEMA, run_keys
+from ..utils import DB_SCHEMA, RunKeys
 from .base import DataItem, DataStore, HttpStore
 from .filestore import FileStore
 from .inmem import InMemoryStore
@@ -32,6 +32,8 @@ in_memory_store = InMemoryStore()


 def parse_url(url):
+    if url and url.startswith("v3io://") and not url.startswith("v3io:///"):
+        url = url.replace("v3io://", "v3io:///", 1)
     parsed_url = urlparse(url)
     schema = parsed_url.scheme.lower()
     endpoint = parsed_url.hostname
@@ -94,10 +96,14 @@ def schema_to_store(schema):
        from .dbfs_store import DBFSStore

        return DBFSStore
-    elif schema == "hdfs":
+    elif schema in ["hdfs", "webhdfs"]:
        from .hdfs import HdfsStore

        return HdfsStore
+    elif schema == "oss":
+        from .alibaba_oss import OSSStore
+
+        return OSSStore
    else:
        raise ValueError(f"unsupported store scheme ({schema})")

@@ -129,7 +135,7 @@ class StoreManager:
         return self._db

     def from_dict(self, struct: dict):
-        stor_list = struct.get(run_keys.data_stores)
+        stor_list = struct.get(RunKeys.data_stores)
         if stor_list and isinstance(stor_list, list):
             for stor in stor_list:
                 schema, endpoint, parsed_url = parse_url(stor.get("url"))
@@ -141,7 +147,7 @@ class StoreManager:
             self._stores[stor["name"]] = new_stor

     def to_dict(self, struct):
-        struct[run_keys.data_stores] = [
+        struct[RunKeys.data_stores] = [
             stor.to_dict() for stor in self._stores.values() if stor.from_spec
         ]

@@ -203,7 +209,7 @@ class StoreManager:
     ) -> (DataStore, str, str):
         schema, endpoint, parsed_url = parse_url(url)
         subpath = parsed_url.path
-        store_key = f"{schema}://{endpoint}"
+        store_key = f"{schema}://{endpoint}" if endpoint else f"{schema}://"

         if schema == "ds":
             datastore_profile = datastore_profile_read(url, project_name, secrets)
@@ -219,6 +225,11 @@ class StoreManager:
             subpath = url[len("memory://") :]
             return in_memory_store, subpath, url

+        elif schema in get_local_file_schema():
+            # parse_url() will drop the windows drive-letter from the path for url like "c:\a\b".
+            # As a workaround, we set subpath to the url.
+            subpath = url.replace("file://", "", 1)
+
         if not schema and endpoint:
             if endpoint in self._stores.keys():
                 return self._stores[endpoint], subpath, url
@@ -237,8 +248,7 @@ class StoreManager:
         )
         if not secrets and not mlrun.config.is_running_as_api():
             self._stores[store_key] = store
-        # in file stores in windows path like c:\a\b the drive letter is dropped from the path, so we return the url
-        return store, url if store.kind == "file" else subpath, url
+        return store, subpath, url

     def reset_secrets(self):
         self._secrets = {}
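The v3io rewrite added to parse_url() forces an empty authority component, so urlparse() no longer treats the first path segment of a v3io URL (the data container) as a hostname. A standalone sketch of the same rewrite, using a helper name of our own, to illustrate the effect:

from urllib.parse import urlparse

def normalize_v3io(url: str) -> str:
    # Same rewrite as the new lines in parse_url() above: force an empty
    # netloc so the container stays part of the path.
    if url and url.startswith("v3io://") and not url.startswith("v3io:///"):
        url = url.replace("v3io://", "v3io:///", 1)
    return url

parsed = urlparse(normalize_v3io("v3io://projects/demo/artifacts/data.csv"))
print(parsed.hostname)  # None - no host component is parsed
print(parsed.path)      # /projects/demo/artifacts/data.csv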
mlrun/datastore/datastore_profile.py CHANGED
@@ -16,6 +16,7 @@ import ast
 import base64
 import json
 import typing
+import warnings
 from urllib.parse import ParseResult, urlparse, urlunparse

 import pydantic
@@ -36,6 +37,7 @@ class DatastoreProfile(pydantic.BaseModel):
         extra = pydantic.Extra.forbid

     @pydantic.validator("name")
+    @classmethod
     def lower_case(cls, v):
         return v.lower()

@@ -68,6 +70,9 @@ class TemporaryClientDatastoreProfiles(metaclass=mlrun.utils.singleton.Singleton
     def get(self, key):
         return self._data.get(key, None)

+    def remove(self, key):
+        self._data.pop(key, None)
+

 class DatastoreProfileBasic(DatastoreProfile):
     type: str = pydantic.Field("basic")
@@ -79,13 +84,37 @@ class DatastoreProfileBasic(DatastoreProfile):
 class DatastoreProfileKafkaTarget(DatastoreProfile):
     type: str = pydantic.Field("kafka_target")
     _private_attributes = "kwargs_private"
-    bootstrap_servers: str
+    bootstrap_servers: typing.Optional[str] = None
+    brokers: typing.Optional[str] = None
     topic: str
     kwargs_public: typing.Optional[dict]
     kwargs_private: typing.Optional[dict]

+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+
+        if not self.brokers and not self.bootstrap_servers:
+            raise mlrun.errors.MLRunInvalidArgumentError(
+                "DatastoreProfileKafkaTarget requires the 'brokers' field to be set"
+            )
+
+        if self.bootstrap_servers:
+            if self.brokers:
+                raise mlrun.errors.MLRunInvalidArgumentError(
+                    "DatastoreProfileKafkaTarget cannot be created with both 'brokers' and 'bootstrap_servers'"
+                )
+            else:
+                self.brokers = self.bootstrap_servers
+                self.bootstrap_servers = None
+                warnings.warn(
+                    "'bootstrap_servers' parameter is deprecated in 1.7.0 and will be removed in 1.9.0, "
+                    "use 'brokers' instead.",
+                    # TODO: Remove this in 1.9.0
+                    FutureWarning,
+                )
+
     def attributes(self):
-        attributes = {"bootstrap_servers": self.bootstrap_servers}
+        attributes = {"brokers": self.brokers or self.bootstrap_servers}
         if self.kwargs_public:
             attributes = merge(attributes, self.kwargs_public)
         if self.kwargs_private:
@@ -157,6 +186,18 @@ class DatastoreProfileS3(DatastoreProfile):
     assume_role_arn: typing.Optional[str] = None
     access_key_id: typing.Optional[str] = None
     secret_key: typing.Optional[str] = None
+    bucket: typing.Optional[str] = None
+
+    @pydantic.validator("bucket")
+    @classmethod
+    def check_bucket(cls, v):
+        if not v:
+            warnings.warn(
+                "The 'bucket' attribute will be mandatory starting from version 1.9",
+                FutureWarning,
+                stacklevel=2,
+            )
+        return v

     def secrets(self) -> dict:
         res = {}
@@ -175,7 +216,13 @@ class DatastoreProfileS3(DatastoreProfile):
         return res

     def url(self, subpath):
-        return f"s3:/{subpath}"
+        # TODO: There is an inconsistency with DatastoreProfileGCS. In DatastoreProfileGCS,
+        # we assume that the subpath can begin without a '/' character,
+        # while here we assume it always starts with one.
+        if self.bucket:
+            return f"s3://{self.bucket}{subpath}"
+        else:
+            return f"s3:/{subpath}"


 class DatastoreProfileRedis(DatastoreProfile):
@@ -244,18 +291,36 @@ class DatastoreProfileGCS(DatastoreProfile):
     _private_attributes = ("gcp_credentials",)
     credentials_path: typing.Optional[str] = None  # path to file.
     gcp_credentials: typing.Optional[typing.Union[str, dict]] = None
+    bucket: typing.Optional[str] = None
+
+    @pydantic.validator("bucket")
+    @classmethod
+    def check_bucket(cls, v):
+        if not v:
+            warnings.warn(
+                "The 'bucket' attribute will be mandatory starting from version 1.9",
+                FutureWarning,
+                stacklevel=2,
+            )
+        return v

     @pydantic.validator("gcp_credentials", pre=True, always=True)
+    @classmethod
     def convert_dict_to_json(cls, v):
         if isinstance(v, dict):
             return json.dumps(v)
         return v

     def url(self, subpath) -> str:
+        # TODO: but there's something wrong with the subpath being assumed to not start with a slash here,
+        # but the opposite assumption is made in S3.
         if subpath.startswith("/"):
             # in gcs the path after schema is starts with bucket, wherefore it should not start with "/".
             subpath = subpath[1:]
-        return f"gcs://{subpath}"
+        if self.bucket:
+            return f"gcs://{self.bucket}/{subpath}"
+        else:
+            return f"gcs://{subpath}"

     def secrets(self) -> dict:
         res = {}
@@ -283,12 +348,27 @@ class DatastoreProfileAzureBlob(DatastoreProfile):
     client_secret: typing.Optional[str] = None
     sas_token: typing.Optional[str] = None
     credential: typing.Optional[str] = None
+    container: typing.Optional[str] = None
+
+    @pydantic.validator("container")
+    @classmethod
+    def check_container(cls, v):
+        if not v:
+            warnings.warn(
+                "The 'container' attribute will be mandatory starting from version 1.9",
+                FutureWarning,
+                stacklevel=2,
+            )
+        return v

     def url(self, subpath) -> str:
         if subpath.startswith("/"):
-            # in azure the path after schema is starts with bucket, wherefore it should not start with "/".
+            # in azure the path after schema is starts with container, wherefore it should not start with "/".
             subpath = subpath[1:]
-        return f"az://{subpath}"
+        if self.container:
+            return f"az://{self.container}/{subpath}"
+        else:
+            return f"az://{subpath}"

     def secrets(self) -> dict:
         res = {}
@@ -332,7 +412,7 @@ class DatastoreProfileHdfs(DatastoreProfile):
         return res or None

     def url(self, subpath):
-        return f"hdfs://{self.host}:{self.http_port}{subpath}"
+        return f"webhdfs://{self.host}:{self.http_port}{subpath}"


 class DatastoreProfile2Json(pydantic.BaseModel):
@@ -460,3 +540,7 @@ def register_temporary_client_datastore_profile(profile: DatastoreProfile):
     It's beneficial for testing purposes.
     """
     TemporaryClientDatastoreProfiles().add(profile)
+
+
+def remove_temporary_client_datastore_profile(profile_name: str):
+    TemporaryClientDatastoreProfiles().remove(profile_name)
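Taken together, the datastore_profile.py changes add an optional bucket (or container) field to the S3, GCS, and Azure profiles, deprecate DatastoreProfileKafkaTarget's bootstrap_servers in favor of brokers, and expose remove_temporary_client_datastore_profile(). A usage sketch based only on the fields visible in this diff; the profile names and values are illustrative:

from mlrun.datastore.datastore_profile import (
    DatastoreProfileKafkaTarget,
    DatastoreProfileS3,
    register_temporary_client_datastore_profile,
    remove_temporary_client_datastore_profile,
)

# 'brokers' replaces the deprecated 'bootstrap_servers' argument.
kafka_profile = DatastoreProfileKafkaTarget(
    name="my-kafka", brokers="broker1:9092", topic="events"
)

# With the new 'bucket' field, the profile URL carries the bucket itself,
# so ds:// paths only need to supply the object key.
s3_profile = DatastoreProfileS3(name="my-s3", bucket="my-bucket")
print(s3_profile.url("/datasets/iris.parquet"))  # s3://my-bucket/datasets/iris.parquet

# Temporary (client-only) registration, now with a matching removal call.
register_temporary_client_datastore_profile(s3_profile)
remove_temporary_client_datastore_profile("my-s3")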
mlrun/datastore/dbfs_store.py CHANGED
@@ -19,7 +19,7 @@ from fsspec.registry import get_filesystem_class

 import mlrun.errors

-from .base import DataStore, FileStats, makeDatastoreSchemaSanitizer
+from .base import DataStore, FileStats, make_datastore_schema_sanitizer


 class DatabricksFileBugFixed(DatabricksFile):
@@ -89,7 +89,7 @@ class DBFSStore(DataStore):
         """return fsspec file system object, if supported"""
         filesystem_class = get_filesystem_class(protocol=self.kind)
         if not self._filesystem:
-            self._filesystem = makeDatastoreSchemaSanitizer(
+            self._filesystem = make_datastore_schema_sanitizer(
                 cls=filesystem_class,
                 using_bucket=False,
                 **self.get_storage_options(),
@@ -130,11 +130,7 @@ class DBFSStore(DataStore):
                 "Append mode not supported for Databricks file system"
             )
         # can not use append mode because it overrides data.
-        mode = "w"
-        if isinstance(data, bytes):
-            mode += "b"
-        elif not isinstance(data, str):
-            raise TypeError(f"Unknown data type {type(data)}")
+        data, mode = self._prepare_put_data(data, append)
         with self.filesystem.open(key, mode) as f:
             f.write(data)
mlrun/datastore/filestore.py CHANGED
@@ -66,9 +66,7 @@ class FileStore(DataStore):
         dir_to_create = path.dirname(self._join(key))
         if dir_to_create:
             self._ensure_directory(dir_to_create)
-        mode = "a" if append else "w"
-        if isinstance(data, bytes):
-            mode = mode + "b"
+        data, mode = self._prepare_put_data(data, append)
         with open(self._join(key), mode) as fp:
             fp.write(data)
             fp.close()
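Both put() implementations above now delegate mode selection to the shared DataStore._prepare_put_data() helper introduced in base.py, which converts bytearray input to bytes and picks a text or binary file mode. A standalone re-implementation of that logic, shown only to make the behaviour concrete (error message simplified relative to the helper):

def prepare_put_data(data, append=False):
    # Mirrors DataStore._prepare_put_data() from the base.py hunk above.
    mode = "a" if append else "w"
    if isinstance(data, bytearray):
        data = bytes(data)
    if isinstance(data, bytes):
        return data, f"{mode}b"
    if isinstance(data, str):
        return data, mode
    raise TypeError(f"Unable to put a value of type {type(data).__name__}")

print(prepare_put_data("text")[1])               # w
print(prepare_put_data(b"blob", append=True)[1]) # ab
print(prepare_put_data(bytearray(b"xy"))[1])     # wb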