mlrun 1.7.0rc28__py3-none-any.whl → 1.7.0rc55__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of mlrun might be problematic.
Files changed (135)
  1. mlrun/__main__.py +4 -2
  2. mlrun/alerts/alert.py +75 -8
  3. mlrun/artifacts/base.py +1 -0
  4. mlrun/artifacts/manager.py +9 -2
  5. mlrun/common/constants.py +4 -1
  6. mlrun/common/db/sql_session.py +3 -2
  7. mlrun/common/formatters/__init__.py +1 -0
  8. mlrun/common/formatters/artifact.py +1 -0
  9. mlrun/{model_monitoring/application.py → common/formatters/feature_set.py} +20 -6
  10. mlrun/common/formatters/run.py +3 -0
  11. mlrun/common/helpers.py +0 -1
  12. mlrun/common/schemas/__init__.py +3 -1
  13. mlrun/common/schemas/alert.py +15 -12
  14. mlrun/common/schemas/api_gateway.py +6 -6
  15. mlrun/common/schemas/auth.py +5 -0
  16. mlrun/common/schemas/client_spec.py +0 -1
  17. mlrun/common/schemas/common.py +7 -4
  18. mlrun/common/schemas/frontend_spec.py +7 -0
  19. mlrun/common/schemas/function.py +7 -0
  20. mlrun/common/schemas/model_monitoring/__init__.py +4 -3
  21. mlrun/common/schemas/model_monitoring/constants.py +41 -26
  22. mlrun/common/schemas/model_monitoring/model_endpoints.py +23 -47
  23. mlrun/common/schemas/notification.py +69 -12
  24. mlrun/common/schemas/project.py +45 -12
  25. mlrun/common/schemas/workflow.py +10 -2
  26. mlrun/common/types.py +1 -0
  27. mlrun/config.py +91 -35
  28. mlrun/data_types/data_types.py +6 -1
  29. mlrun/data_types/spark.py +2 -2
  30. mlrun/data_types/to_pandas.py +57 -25
  31. mlrun/datastore/__init__.py +1 -0
  32. mlrun/datastore/alibaba_oss.py +3 -2
  33. mlrun/datastore/azure_blob.py +125 -37
  34. mlrun/datastore/base.py +42 -21
  35. mlrun/datastore/datastore.py +4 -2
  36. mlrun/datastore/datastore_profile.py +1 -1
  37. mlrun/datastore/dbfs_store.py +3 -7
  38. mlrun/datastore/filestore.py +1 -3
  39. mlrun/datastore/google_cloud_storage.py +85 -29
  40. mlrun/datastore/inmem.py +4 -1
  41. mlrun/datastore/redis.py +1 -0
  42. mlrun/datastore/s3.py +25 -12
  43. mlrun/datastore/sources.py +76 -4
  44. mlrun/datastore/spark_utils.py +30 -0
  45. mlrun/datastore/storeytargets.py +151 -0
  46. mlrun/datastore/targets.py +102 -131
  47. mlrun/datastore/v3io.py +1 -0
  48. mlrun/db/base.py +15 -6
  49. mlrun/db/httpdb.py +57 -28
  50. mlrun/db/nopdb.py +29 -5
  51. mlrun/errors.py +20 -3
  52. mlrun/execution.py +46 -5
  53. mlrun/feature_store/api.py +25 -1
  54. mlrun/feature_store/common.py +6 -11
  55. mlrun/feature_store/feature_vector.py +3 -1
  56. mlrun/feature_store/retrieval/job.py +4 -1
  57. mlrun/feature_store/retrieval/spark_merger.py +10 -39
  58. mlrun/feature_store/steps.py +8 -0
  59. mlrun/frameworks/_common/plan.py +3 -3
  60. mlrun/frameworks/_ml_common/plan.py +1 -1
  61. mlrun/frameworks/parallel_coordinates.py +2 -3
  62. mlrun/frameworks/sklearn/mlrun_interface.py +13 -3
  63. mlrun/k8s_utils.py +48 -2
  64. mlrun/launcher/client.py +6 -6
  65. mlrun/launcher/local.py +2 -2
  66. mlrun/model.py +215 -34
  67. mlrun/model_monitoring/api.py +38 -24
  68. mlrun/model_monitoring/applications/__init__.py +1 -2
  69. mlrun/model_monitoring/applications/_application_steps.py +60 -29
  70. mlrun/model_monitoring/applications/base.py +2 -174
  71. mlrun/model_monitoring/applications/context.py +197 -70
  72. mlrun/model_monitoring/applications/evidently_base.py +11 -85
  73. mlrun/model_monitoring/applications/histogram_data_drift.py +21 -16
  74. mlrun/model_monitoring/applications/results.py +4 -4
  75. mlrun/model_monitoring/controller.py +110 -282
  76. mlrun/model_monitoring/db/stores/__init__.py +8 -3
  77. mlrun/model_monitoring/db/stores/base/store.py +3 -0
  78. mlrun/model_monitoring/db/stores/sqldb/models/base.py +9 -7
  79. mlrun/model_monitoring/db/stores/sqldb/models/mysql.py +18 -3
  80. mlrun/model_monitoring/db/stores/sqldb/sql_store.py +43 -23
  81. mlrun/model_monitoring/db/stores/v3io_kv/kv_store.py +48 -35
  82. mlrun/model_monitoring/db/tsdb/__init__.py +7 -2
  83. mlrun/model_monitoring/db/tsdb/base.py +147 -15
  84. mlrun/model_monitoring/db/tsdb/tdengine/schemas.py +94 -55
  85. mlrun/model_monitoring/db/tsdb/tdengine/stream_graph_steps.py +0 -3
  86. mlrun/model_monitoring/db/tsdb/tdengine/tdengine_connector.py +144 -38
  87. mlrun/model_monitoring/db/tsdb/v3io/stream_graph_steps.py +44 -3
  88. mlrun/model_monitoring/db/tsdb/v3io/v3io_connector.py +246 -57
  89. mlrun/model_monitoring/helpers.py +70 -50
  90. mlrun/model_monitoring/stream_processing.py +96 -195
  91. mlrun/model_monitoring/writer.py +13 -5
  92. mlrun/package/packagers/default_packager.py +2 -2
  93. mlrun/projects/operations.py +16 -8
  94. mlrun/projects/pipelines.py +126 -115
  95. mlrun/projects/project.py +286 -129
  96. mlrun/render.py +3 -3
  97. mlrun/run.py +38 -19
  98. mlrun/runtimes/__init__.py +19 -8
  99. mlrun/runtimes/base.py +4 -1
  100. mlrun/runtimes/daskjob.py +1 -1
  101. mlrun/runtimes/funcdoc.py +1 -1
  102. mlrun/runtimes/kubejob.py +6 -6
  103. mlrun/runtimes/local.py +12 -5
  104. mlrun/runtimes/nuclio/api_gateway.py +68 -8
  105. mlrun/runtimes/nuclio/application/application.py +307 -70
  106. mlrun/runtimes/nuclio/function.py +63 -14
  107. mlrun/runtimes/nuclio/serving.py +10 -10
  108. mlrun/runtimes/pod.py +25 -19
  109. mlrun/runtimes/remotesparkjob.py +2 -5
  110. mlrun/runtimes/sparkjob/spark3job.py +16 -17
  111. mlrun/runtimes/utils.py +34 -0
  112. mlrun/serving/routers.py +2 -5
  113. mlrun/serving/server.py +37 -19
  114. mlrun/serving/states.py +30 -3
  115. mlrun/serving/v2_serving.py +44 -35
  116. mlrun/track/trackers/mlflow_tracker.py +5 -0
  117. mlrun/utils/async_http.py +1 -1
  118. mlrun/utils/db.py +18 -0
  119. mlrun/utils/helpers.py +150 -36
  120. mlrun/utils/http.py +1 -1
  121. mlrun/utils/notifications/notification/__init__.py +0 -1
  122. mlrun/utils/notifications/notification/webhook.py +8 -1
  123. mlrun/utils/notifications/notification_pusher.py +1 -1
  124. mlrun/utils/v3io_clients.py +2 -2
  125. mlrun/utils/version/version.json +2 -2
  126. {mlrun-1.7.0rc28.dist-info → mlrun-1.7.0rc55.dist-info}/METADATA +153 -66
  127. {mlrun-1.7.0rc28.dist-info → mlrun-1.7.0rc55.dist-info}/RECORD +131 -134
  128. {mlrun-1.7.0rc28.dist-info → mlrun-1.7.0rc55.dist-info}/WHEEL +1 -1
  129. mlrun/feature_store/retrieval/conversion.py +0 -271
  130. mlrun/model_monitoring/controller_handler.py +0 -37
  131. mlrun/model_monitoring/evidently_application.py +0 -20
  132. mlrun/model_monitoring/prometheus.py +0 -216
  133. {mlrun-1.7.0rc28.dist-info → mlrun-1.7.0rc55.dist-info}/LICENSE +0 -0
  134. {mlrun-1.7.0rc28.dist-info → mlrun-1.7.0rc55.dist-info}/entry_points.txt +0 -0
  135. {mlrun-1.7.0rc28.dist-info → mlrun-1.7.0rc55.dist-info}/top_level.txt +0 -0
mlrun/config.py CHANGED
@@ -27,6 +27,7 @@ import copy
 import json
 import os
 import typing
+import warnings
 from collections.abc import Mapping
 from datetime import timedelta
 from distutils.util import strtobool
@@ -35,6 +36,7 @@ from threading import Lock
 
 import dotenv
 import semver
+import urllib3.exceptions
 import yaml
 
 import mlrun.common.constants
@@ -52,6 +54,11 @@ default_config = {
     "kubernetes": {
         "kubeconfig_path": "",  # local path to kubeconfig file (for development purposes),
         # empty by default as the API already running inside k8s cluster
+        "pagination": {
+            # pagination config for interacting with k8s API
+            "list_pods_limit": 200,
+            "list_crd_objects_limit": 200,
+        },
     },
     "dbpath": "",  # db/api url
     # url to nuclio dashboard api (can be with user & token, e.g. https://username:password@dashboard-url.com)
@@ -64,11 +71,15 @@ default_config = {
     "api_base_version": "v1",
     "version": "",  # will be set to current version
     "images_tag": "",  # tag to use with mlrun images e.g. mlrun/mlrun (defaults to version)
-    "images_registry": "",  # registry to use with mlrun images e.g. quay.io/ (defaults to empty, for dockerhub)
+    # registry to use with mlrun images that start with "mlrun/" e.g. quay.io/ (defaults to empty, for dockerhub)
+    "images_registry": "",
+    # registry to use with non-mlrun images (don't start with "mlrun/") specified in 'images_to_enrich_registry'
+    # defaults to empty, for dockerhub
+    "vendor_images_registry": "",
     # comma separated list of images that are in the specified images_registry, and therefore will be enriched with this
     # registry when used. default to mlrun/* which means any image which is of the mlrun repository (mlrun/mlrun,
     # mlrun/ml-base, etc...)
-    "images_to_enrich_registry": "^mlrun/*",
+    "images_to_enrich_registry": "^mlrun/*,python:3.9",
     "kfp_url": "",
     "kfp_ttl": "14400",  # KFP ttl in sec, after that completed PODs will be deleted
     "kfp_image": "mlrun/mlrun",  # image to use for KFP runner (defaults to mlrun/mlrun)
@@ -104,7 +115,12 @@ default_config = {
             # max number of parallel abort run jobs in runs monitoring
             "concurrent_abort_stale_runs_workers": 10,
             "list_runs_time_period_in_days": 7,  # days
-        }
+        },
+        "projects": {
+            "summaries": {
+                "cache_interval": "30",
+            },
+        },
     },
     "crud": {
         "runs": {
@@ -138,6 +154,11 @@ default_config = {
         "datasets": {
             "max_preview_columns": 100,
         },
+        "limits": {
+            "max_chunk_size": 1024 * 1024 * 1,  # 1MB
+            "max_preview_size": 1024 * 1024 * 10,  # 10MB
+            "max_download_size": 1024 * 1024 * 100,  # 100MB
+        },
     },
     # FIXME: Adding these defaults here so we won't need to patch the "installing component" (provazio-controller) to
     # configure this values on field systems, for newer system this will be configured correctly
@@ -238,7 +259,7 @@ default_config = {
         },
         "application": {
             "default_sidecar_internal_port": 8050,
-            "default_authentication_mode": "accessKey",
+            "default_authentication_mode": mlrun.common.schemas.APIGatewayAuthenticationMode.none,
         },
     },
     # TODO: function defaults should be moved to the function spec config above
@@ -250,7 +271,7 @@ default_config = {
         "remote": "mlrun/mlrun",
         "dask": "mlrun/ml-base",
         "mpijob": "mlrun/mlrun",
-        "application": "python:3.9-slim",
+        "application": "python:3.9",
     },
     # see enrich_function_preemption_spec for more info,
     # and mlrun.common.schemas.function.PreemptionModes for available options
@@ -265,6 +286,16 @@ default_config = {
             "url": "",
             "service": "mlrun-api-chief",
             "port": 8080,
+            "feature_gates": {
+                "scheduler": "enabled",
+                "project_sync": "enabled",
+                "cleanup": "enabled",
+                "runs_monitoring": "enabled",
+                "pagination_cache": "enabled",
+                "project_summaries": "enabled",
+                "start_logs": "enabled",
+                "stop_logs": "enabled",
+            },
         },
         "worker": {
             "sync_with_chief": {
@@ -302,7 +333,7 @@ default_config = {
         "http": {
             # when True, the client will verify the server's TLS
             # set to False for backwards compatibility.
-            "verify": False,
+            "verify": True,
         },
         "db": {
             "commit_retry_timeout": 30,
@@ -433,7 +464,6 @@ default_config = {
     "followers": "",
     # This is used as the interval for the sync loop both when mlrun is leader and follower
     "periodic_sync_interval": "1 minute",
-    "counters_cache_ttl": "2 minutes",
     "project_owners_cache_ttl": "30 seconds",
     # access key to be used when the leader is iguazio and polling is done from it
     "iguazio_access_key": "",
@@ -462,10 +492,10 @@ default_config = {
     # pip install <requirement_specifier>, e.g. mlrun==0.5.4, mlrun~=0.5,
     # git+https://github.com/mlrun/mlrun@development. by default uses the version
     "mlrun_version_specifier": "",
-    "kaniko_image": "gcr.io/kaniko-project/executor:v1.21.1",  # kaniko builder image
+    "kaniko_image": "gcr.io/kaniko-project/executor:v1.23.2",  # kaniko builder image
     "kaniko_init_container_image": "alpine:3.18",
     # image for kaniko init container when docker registry is ECR
-    "kaniko_aws_cli_image": "amazon/aws-cli:2.7.10",
+    "kaniko_aws_cli_image": "amazon/aws-cli:2.17.16",
     # kaniko sometimes fails to get filesystem from image, this is a workaround to retry the process
     # a known issue in Kaniko - https://github.com/GoogleContainerTools/kaniko/issues/1717
     "kaniko_image_fs_extraction_retries": "3",
@@ -509,7 +539,6 @@ default_config = {
         "store_prefixes": {
             "default": "v3io:///users/pipelines/{project}/model-endpoints/{kind}",
             "user_space": "v3io:///projects/{project}/model-endpoints/{kind}",
-            "stream": "",  # TODO: Delete in 1.9.0
             "monitoring_application": "v3io:///users/pipelines/{project}/monitoring-apps/",
         },
         # Offline storage path can be either relative or a full path. This path is used for general offline data
@@ -522,7 +551,6 @@ default_config = {
         "parquet_batching_max_events": 10_000,
         "parquet_batching_timeout_secs": timedelta(minutes=1).total_seconds(),
         # See mlrun.model_monitoring.db.stores.ObjectStoreFactory for available options
-        "store_type": "v3io-nosql",  # TODO: Delete in 1.9.0
         "endpoint_store_connection": "",
         # See mlrun.model_monitoring.db.tsdb.ObjectTSDBFactory for available options
         "tsdb_connection": "",
@@ -706,7 +734,7 @@ default_config = {
     "grafana_url": "",
     "alerts": {
         # supported modes: "enabled", "disabled".
-        "mode": "enabled",
+        "mode": "disabled",
         # maximum number of alerts we allow to be configured.
         # user will get an error when exceeding this
         "max_allowed": 10000,
@@ -768,7 +796,21 @@ class Config:
         for key, value in cfg.items():
             if hasattr(self, key):
                 if isinstance(value, dict):
-                    getattr(self, key).update(value)
+                    # ignore the `skip_errors` flag here
+                    # if the key does not align with what mlrun config expects it is a user
+                    # input error that can lead to unexpected behavior.
+                    # raise the exception to ensure configuration is loaded correctly and do not
+                    # ignore any errors.
+                    config_value = getattr(self, key)
+                    try:
+                        config_value.update(value)
+                    except AttributeError as exc:
+                        if not isinstance(config_value, (dict, Config)):
+                            raise ValueError(
+                                f"Can not update `{key}` config. "
+                                f"Expected a configuration but received {type(value)}"
+                            ) from exc
+                        raise exc
                 else:
                     try:
                         setattr(self, key, value)
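
The practical effect of the stricter `Config.update`: passing a dict for a key whose existing value is a scalar now raises a descriptive `ValueError` instead of a bare `AttributeError`. A hypothetical sketch (key names are illustrative):

from mlrun.config import config

# Merging a dict into a nested section works as before:
config.update({"httpdb": {"http": {"verify": True}}})

# A dict supplied where the existing value is a plain string (a user input
# error) now fails loudly:
try:
    config.update({"dbpath": {"unexpected": "nesting"}})
except ValueError as err:
    print(err)  # Can not update `dbpath` config. Expected a configuration but received <class 'dict'>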
@@ -840,7 +882,7 @@ class Config:
                 f"Unable to decode {attribute_path}"
             )
         parsed_attribute_value = json.loads(decoded_attribute_value)
-        if type(parsed_attribute_value) != expected_type:
+        if not isinstance(parsed_attribute_value, expected_type):
             raise mlrun.errors.MLRunInvalidArgumentTypeError(
                 f"Expected type {expected_type}, got {type(parsed_attribute_value)}"
             )
@@ -1032,6 +1074,14 @@ class Config:
                 resource_requirement.pop(gpu)
         return resource_requirement
 
+    def force_api_gateway_ssl_redirect(self):
+        """
+        Get the default value for the ssl_redirect configuration.
+        In Iguazio we always want to redirect to HTTPS, in other cases we don't.
+        :return: True if we should redirect to HTTPS, False otherwise.
+        """
+        return self.is_running_on_iguazio()
+
     def to_dict(self):
         return copy.deepcopy(self._cfg)
 
@@ -1064,6 +1114,9 @@ class Config:
         # importing here to avoid circular dependency
         import mlrun.db
 
+        # It ensures that SSL verification is set before establishing a connection
+        _configure_ssl_verification(self.httpdb.http.verify)
+
         # when dbpath is set we want to connect to it which will sync configuration from it to the client
         mlrun.db.get_run_db(value, force_reconnect=True)
 
@@ -1092,10 +1145,10 @@ class Config:
         project: str = "",
         kind: str = "",
         target: str = "online",
-        artifact_path: str = None,
-        function_name: str = None,
+        artifact_path: typing.Optional[str] = None,
+        function_name: typing.Optional[str] = None,
         **kwargs,
-    ) -> typing.Union[str, list[str]]:
+    ) -> str:
         """Get the full path from the configuration based on the provided project and kind.
 
         :param project: Project name.
@@ -1111,8 +1164,7 @@ class Config:
                               relative artifact path will be taken from the global MLRun artifact path.
         :param function_name: Application name, None for model_monitoring_stream.
 
-        :return: Full configured path for the provided kind. Can be either a single path
-                 or a list of paths in the case of the online model monitoring stream path.
+        :return: Full configured path for the provided kind.
         """
 
         if target != "offline":
@@ -1133,17 +1185,11 @@ class Config:
                 if function_name is None
                 else f"{kind}-{function_name.lower()}",
             )
-        elif kind == "stream":  # return list for mlrun<1.6.3 BC
-            return [
-                mlrun.mlconf.model_endpoint_monitoring.store_prefixes.default.format(
-                    project=project,
-                    kind=kind,
-                ),  # old stream uri (pipelines) for BC ML-6043
-                mlrun.mlconf.model_endpoint_monitoring.store_prefixes.user_space.format(
-                    project=project,
-                    kind=kind,
-                ),  # new stream uri (projects)
-            ]
+        elif kind == "stream":
+            return mlrun.mlconf.model_endpoint_monitoring.store_prefixes.user_space.format(
+                project=project,
+                kind=kind,
+            )
         else:
             return mlrun.mlconf.model_endpoint_monitoring.store_prefixes.default.format(
                 project=project,
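
With the mlrun<1.6.3 backward-compatibility branch gone, `kind == "stream"` resolves to a single URI built from the `user_space` prefix (see the `store_prefixes` hunk above). A quick sketch of the resolution:

# The user_space template from this diff's store_prefixes section.
user_space = "v3io:///projects/{project}/model-endpoints/{kind}"
print(user_space.format(project="my-project", kind="stream"))
# v3io:///projects/my-project/model-endpoints/stream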
@@ -1206,12 +1252,11 @@
 
         return storage_options
 
-    def is_explicit_ack(self, version=None) -> bool:
-        if not version:
-            version = self.nuclio_version
+    def is_explicit_ack_enabled(self) -> bool:
         return self.httpdb.nuclio.explicit_ack == "enabled" and (
-            not version
-            or semver.VersionInfo.parse(version) >= semver.VersionInfo.parse("1.12.10")
+            not self.nuclio_version
+            or semver.VersionInfo.parse(self.nuclio_version)
+            >= semver.VersionInfo.parse("1.12.10")
         )
 
 
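The renamed `is_explicit_ack_enabled` drops the `version` parameter and always gates on the configured `nuclio_version`. A standalone sketch of the semver gate it applies:

import semver

# Explicit ack needs Nuclio >= 1.12.10; an empty/unknown version also passes.
for version in ("1.12.9", "1.12.10", "1.13.0"):
    meets = semver.VersionInfo.parse(version) >= semver.VersionInfo.parse("1.12.10")
    print(version, meets)  # False, True, True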
@@ -1261,6 +1306,7 @@ def _do_populate(env=None, skip_errors=False):
     if data:
         config.update(data, skip_errors=skip_errors)
 
+    _configure_ssl_verification(config.httpdb.http.verify)
     _validate_config(config)
 
 
@@ -1320,6 +1366,16 @@ def _convert_str(value, typ):
     return typ(value)
 
 
+def _configure_ssl_verification(verify_ssl: bool) -> None:
+    """Configure SSL verification warnings based on the setting."""
+    if not verify_ssl:
+        urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
+    else:
+        # If the user changes the `verify` setting to `True` at runtime using `mlrun.set_env_from_file` after
+        # importing `mlrun`, we need to reload the `mlrun` configuration and enable this warning.
+        warnings.simplefilter("default", urllib3.exceptions.InsecureRequestWarning)
+
+
 def read_env(env=None, prefix=env_prefix):
     """Read configuration from environment"""
     env = os.environ if env is None else env
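
In isolation, the new helper only flips urllib3's warning filter. A minimal sketch of both directions:

import warnings

import urllib3
import urllib3.exceptions

# verify=False: silence the per-request InsecureRequestWarning.
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

# verify=True again (e.g. after mlrun.set_env_from_file at runtime): restore
# the default filter so the warning fires once more.
warnings.simplefilter("default", urllib3.exceptions.InsecureRequestWarning)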
mlrun/data_types/data_types.py CHANGED
@@ -70,6 +70,11 @@ def pa_type_to_value_type(type_):
     if isinstance(type_, TimestampType):
         return ValueType.DATETIME
 
+    # pandas category type translates to pyarrow DictionaryType
+    # we need to unpack the value type (ML-7868)
+    if isinstance(type_, pyarrow.DictionaryType):
+        type_ = type_.value_type
+
     type_map = {
         pyarrow.bool_(): ValueType.BOOL,
         pyarrow.int64(): ValueType.INT64,
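
Why the unpacking matters (ML-7868): a pandas `category` column arrives in pyarrow as a `DictionaryType`, and the underlying type sits in its `value_type`. A small sketch:

import pandas as pd
import pyarrow as pa

df = pd.DataFrame({"color": pd.Categorical(["red", "green", "red"])})
field_type = pa.Table.from_pandas(df).schema.field("color").type

print(isinstance(field_type, pa.DictionaryType))  # True
print(field_type.value_type)                      # string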
@@ -139,7 +144,7 @@ def gbq_to_pandas_dtype(gbq_type):
         "BOOL": "bool",
         "FLOAT": "float64",
         "INTEGER": pd.Int64Dtype(),
-        "TIMESTAMP": "datetime64[ns]",
+        "TIMESTAMP": "datetime64[ns, UTC]",
     }
     return type_map.get(gbq_type, "object")
 
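The TIMESTAMP mapping becomes timezone-aware because BigQuery TIMESTAMP values are UTC instants; plain `datetime64[ns]` silently dropped the zone. For example:

import pandas as pd

s = pd.to_datetime(pd.Series(["2024-01-01T00:00:00Z"]), utc=True)
print(s.dtype)  # datetime64[ns, UTC], matching the new mapping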
mlrun/data_types/spark.py CHANGED
@@ -20,10 +20,10 @@ import pytz
 from pyspark.sql.functions import to_utc_timestamp
 from pyspark.sql.types import BooleanType, DoubleType, TimestampType
 
+from mlrun.feature_store.retrieval.spark_merger import spark_df_to_pandas
 from mlrun.utils import logger
 
 from .data_types import InferOptions, spark_to_value_type
-from .to_pandas import toPandas
 
 try:
     import pyspark.sql.functions as funcs
@@ -75,7 +75,7 @@ def get_df_preview_spark(df, preview_lines=20):
     """capture preview data from spark df"""
     df = df.limit(preview_lines)
 
-    result_dict = toPandas(df).to_dict(orient="split")
+    result_dict = spark_df_to_pandas(df).to_dict(orient="split")
     return [result_dict["columns"], *result_dict["data"]]
 
 
mlrun/data_types/to_pandas.py CHANGED
@@ -15,23 +15,13 @@
 import warnings
 from collections import Counter
 
-from pyspark.sql.types import (
-    BooleanType,
-    ByteType,
-    DoubleType,
-    FloatType,
-    IntegerType,
-    IntegralType,
-    LongType,
-    MapType,
-    ShortType,
-    TimestampType,
-)
-
-
-def toPandas(spark_df):
+import pandas as pd
+import semver
+
+
+def _to_pandas(spark_df):
     """
-    Modified version of spark DataFrame.toPandas()
+    Modified version of spark DataFrame.toPandas() -
     https://github.com/apache/spark/blob/v3.2.3/python/pyspark/sql/pandas/conversion.py#L35
 
     The original code (which is only replaced in pyspark 3.5.0) fails with Pandas 2 installed, with the following error:
@@ -40,6 +30,12 @@ def toPandas(spark_df):
     This modification adds the missing unit to the dtype.
     """
     from pyspark.sql.dataframe import DataFrame
+    from pyspark.sql.types import (
+        BooleanType,
+        IntegralType,
+        MapType,
+        TimestampType,
+    )
 
     assert isinstance(spark_df, DataFrame)
 
@@ -48,7 +44,6 @@
     require_minimum_pandas_version()
 
     import numpy as np
-    import pandas as pd
 
     timezone = spark_df.sql_ctx._conf.sessionLocalTimeZone()
 
@@ -217,22 +212,59 @@
 
 def _to_corrected_pandas_type(dt):
     import numpy as np
+    from pyspark.sql.types import (
+        BooleanType,
+        ByteType,
+        DoubleType,
+        FloatType,
+        IntegerType,
+        LongType,
+        ShortType,
+        TimestampType,
+    )
 
-    if type(dt) == ByteType:
+    if isinstance(dt, ByteType):
         return np.int8
-    elif type(dt) == ShortType:
+    elif isinstance(dt, ShortType):
         return np.int16
-    elif type(dt) == IntegerType:
+    elif isinstance(dt, IntegerType):
         return np.int32
-    elif type(dt) == LongType:
+    elif isinstance(dt, LongType):
         return np.int64
-    elif type(dt) == FloatType:
+    elif isinstance(dt, FloatType):
         return np.float32
-    elif type(dt) == DoubleType:
+    elif isinstance(dt, DoubleType):
         return np.float64
-    elif type(dt) == BooleanType:
+    elif isinstance(dt, BooleanType):
         return bool
-    elif type(dt) == TimestampType:
+    elif isinstance(dt, TimestampType):
         return "datetime64[ns]"
     else:
         return None
+
+
+def spark_df_to_pandas(spark_df):
+    # as of pyspark 3.2.3, toPandas fails to convert timestamps unless we work around the issue
+    # when we upgrade pyspark, we should check whether this workaround is still necessary
+    # see https://stackoverflow.com/questions/76389694/transforming-pyspark-to-pandas-dataframe
+    if semver.parse(pd.__version__)["major"] >= 2:
+        import pyspark.sql.functions as pyspark_functions
+
+        type_conversion_dict = {}
+        for field in spark_df.schema.fields:
+            if str(field.dataType) == "TimestampType":
+                spark_df = spark_df.withColumn(
+                    field.name,
+                    pyspark_functions.date_format(
+                        pyspark_functions.to_timestamp(field.name),
+                        "yyyy-MM-dd'T'HH:mm:ss.SSSSSSSSS",
+                    ),
+                )
+                type_conversion_dict[field.name] = "datetime64[ns]"
+
+        df = _to_pandas(spark_df)
+        if type_conversion_dict:
+            df = df.astype(type_conversion_dict)
+        return df
+    else:
+        return _to_pandas(spark_df)
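
A hedged usage sketch of the new helper under Pandas 2 (assumes a local SparkSession, which is not part of this diff):

from datetime import datetime

from pyspark.sql import SparkSession

spark = SparkSession.builder.master("local[1]").getOrCreate()
df = spark.createDataFrame([(1, datetime(2024, 1, 1))], ["id", "ts"])

# Timestamp columns are formatted to strings on the Spark side, then cast
# back to datetime64[ns] in pandas, avoiding the Pandas 2 conversion error.
pandas_df = spark_df_to_pandas(df)
print(pandas_df.dtypes)  # id int64, ts datetime64[ns]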
mlrun/datastore/__init__.py CHANGED
@@ -117,6 +117,7 @@ def get_stream_pusher(stream_path: str, **kwargs):
         return OutputStream(stream_path, **kwargs)
     elif stream_path.startswith("v3io"):
         endpoint, stream_path = parse_path(stream_path)
+        endpoint = kwargs.pop("endpoint", None) or endpoint
         return OutputStream(stream_path, endpoint=endpoint, **kwargs)
     elif stream_path.startswith("dummy://"):
         return _DummyStream(**kwargs)
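
The added line lets an explicit `endpoint` kwarg win over the endpoint parsed from a `v3io` path. A hedged usage sketch (assuming, per the file list, this hunk is in mlrun/datastore/__init__.py; the URLs are illustrative and pushing requires a reachable stream backend):

from mlrun.datastore import get_stream_pusher

# The explicit endpoint kwarg now overrides the endpoint parsed from the path.
pusher = get_stream_pusher(
    "v3io://default-webapi:8081/projects/my-project/model-endpoints/stream",
    endpoint="https://other-webapi.example.com",  # illustrative override
)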
mlrun/datastore/alibaba_oss.py CHANGED
@@ -22,7 +22,7 @@ from fsspec.registry import get_filesystem_class
 
 import mlrun.errors
 
-from .base import DataStore, FileStats, makeDatastoreSchemaSanitizer
+from .base import DataStore, FileStats, make_datastore_schema_sanitizer
 
 
 class OSSStore(DataStore):
@@ -53,7 +53,7 @@ class OSSStore(DataStore):
         except ImportError as exc:
             raise ImportError("ALIBABA ossfs not installed") from exc
         filesystem_class = get_filesystem_class(protocol=self.kind)
-        self._filesystem = makeDatastoreSchemaSanitizer(
+        self._filesystem = make_datastore_schema_sanitizer(
             filesystem_class,
             using_bucket=self.using_bucket,
             **self.get_storage_options(),
@@ -85,6 +85,7 @@ class OSSStore(DataStore):
         return oss.get_object(key).read()
 
     def put(self, key, data, append=False):
+        data, _ = self._prepare_put_data(data, append)
         bucket, key = self.get_bucket_and_key(key)
         oss = oss2.Bucket(self.auth, self.endpoint_url, bucket)
         oss.put_object(key, data)