mlrun 1.7.0rc29__py3-none-any.whl → 1.7.0rc31__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

This version of mlrun has been flagged as potentially problematic.

Files changed (39):
  1. mlrun/common/constants.py +1 -1
  2. mlrun/common/formatters/artifact.py +1 -0
  3. mlrun/common/schemas/model_monitoring/constants.py +5 -1
  4. mlrun/common/schemas/project.py +10 -9
  5. mlrun/config.py +21 -2
  6. mlrun/data_types/spark.py +2 -2
  7. mlrun/data_types/to_pandas.py +48 -16
  8. mlrun/datastore/__init__.py +1 -0
  9. mlrun/datastore/base.py +20 -8
  10. mlrun/datastore/datastore.py +4 -2
  11. mlrun/datastore/datastore_profile.py +1 -1
  12. mlrun/datastore/google_cloud_storage.py +1 -0
  13. mlrun/datastore/inmem.py +3 -0
  14. mlrun/datastore/s3.py +2 -0
  15. mlrun/datastore/sources.py +14 -0
  16. mlrun/datastore/targets.py +11 -1
  17. mlrun/db/base.py +1 -0
  18. mlrun/db/httpdb.py +10 -2
  19. mlrun/db/nopdb.py +1 -0
  20. mlrun/feature_store/retrieval/spark_merger.py +3 -32
  21. mlrun/model.py +1 -5
  22. mlrun/model_monitoring/api.py +3 -3
  23. mlrun/model_monitoring/controller.py +57 -73
  24. mlrun/model_monitoring/db/stores/sqldb/sql_store.py +8 -2
  25. mlrun/model_monitoring/db/stores/v3io_kv/kv_store.py +3 -0
  26. mlrun/model_monitoring/helpers.py +6 -12
  27. mlrun/model_monitoring/writer.py +1 -2
  28. mlrun/projects/project.py +16 -0
  29. mlrun/run.py +5 -5
  30. mlrun/runtimes/base.py +1 -1
  31. mlrun/utils/version/version.json +2 -2
  32. {mlrun-1.7.0rc29.dist-info → mlrun-1.7.0rc31.dist-info}/METADATA +6 -6
  33. {mlrun-1.7.0rc29.dist-info → mlrun-1.7.0rc31.dist-info}/RECORD +37 -39
  34. {mlrun-1.7.0rc29.dist-info → mlrun-1.7.0rc31.dist-info}/WHEEL +1 -1
  35. mlrun/feature_store/retrieval/conversion.py +0 -271
  36. mlrun/model_monitoring/controller_handler.py +0 -37
  37. {mlrun-1.7.0rc29.dist-info → mlrun-1.7.0rc31.dist-info}/LICENSE +0 -0
  38. {mlrun-1.7.0rc29.dist-info → mlrun-1.7.0rc31.dist-info}/entry_points.txt +0 -0
  39. {mlrun-1.7.0rc29.dist-info → mlrun-1.7.0rc31.dist-info}/top_level.txt +0 -0
mlrun/common/constants.py CHANGED
@@ -11,7 +11,6 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-#
 
 IMAGE_NAME_ENRICH_REGISTRY_PREFIX = "."  # prefix for image name to enrich with registry
 MLRUN_SERVING_CONF = "serving-conf"
@@ -70,6 +69,7 @@ class MLRunInternalLabels:
     job_type = "job-type"
     kind = "kind"
     component = "component"
+    mlrun_type = "mlrun__type"
 
     owner = "owner"
     v3io_user = "v3io_user"
mlrun/common/formatters/artifact.py CHANGED
@@ -37,6 +37,7 @@ class ArtifactFormat(ObjectFormat, mlrun.common.types.StrEnum):
        "spec.db_key",
        "spec.size",
        "spec.framework",
+       "spec.algorithm",
        "spec.metrics",
        "spec.target_path",
    ]
mlrun/common/schemas/model_monitoring/constants.py CHANGED
@@ -17,6 +17,7 @@ from dataclasses import dataclass
 from enum import Enum, IntEnum
 from typing import Optional
 
+import mlrun.common.constants
 import mlrun.common.helpers
 from mlrun.common.types import StrEnum
 
@@ -354,7 +355,7 @@ class ResultStatusApp(IntEnum):
 
 
 class ModelMonitoringAppLabel:
-    KEY = "mlrun__type"
+    KEY = mlrun.common.constants.MLRunInternalLabels.mlrun_type
     VAL = "mlrun__model-monitoring-application"
 
     def __str__(self) -> str:
@@ -377,3 +378,6 @@ class PredictionsQueryConstants:
 
 class SpecialApps:
     MLRUN_INFRA = "mlrun-infra"
+
+
+_RESERVED_FUNCTION_NAMES = MonitoringFunctionNames.list() + [SpecialApps.MLRUN_INFRA]
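For reference, a minimal sketch of how the new reserved-name list could be inspected (assumes mlrun 1.7.0rc31 is installed; the exact values depend on MonitoringFunctionNames in the installed version):

    from mlrun.common.schemas.model_monitoring import constants as mm_constants

    # The module-level list now covers the infra app name in addition to the
    # monitoring function names, so user applications cannot reuse any of them.
    print(mm_constants._RESERVED_FUNCTION_NAMES)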
mlrun/common/schemas/project.py CHANGED
@@ -114,18 +114,19 @@ class ProjectOwner(pydantic.BaseModel):
 
 class ProjectSummary(pydantic.BaseModel):
     name: str
-    files_count: int
-    feature_sets_count: int
-    models_count: int
-    runs_completed_recent_count: int
-    runs_failed_recent_count: int
-    runs_running_count: int
-    distinct_schedules_count: int
-    distinct_scheduled_jobs_pending_count: int
-    distinct_scheduled_pipelines_pending_count: int
+    files_count: int = 0
+    feature_sets_count: int = 0
+    models_count: int = 0
+    runs_completed_recent_count: int = 0
+    runs_failed_recent_count: int = 0
+    runs_running_count: int = 0
+    distinct_schedules_count: int = 0
+    distinct_scheduled_jobs_pending_count: int = 0
+    distinct_scheduled_pipelines_pending_count: int = 0
     pipelines_completed_recent_count: typing.Optional[int] = None
     pipelines_failed_recent_count: typing.Optional[int] = None
     pipelines_running_count: typing.Optional[int] = None
+    updated: typing.Optional[datetime.datetime] = None
 
 
 class IguazioProject(pydantic.BaseModel):
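To illustrate the schema change above, a minimal sketch (assumes mlrun 1.7.0rc31 is installed; the project name is hypothetical): the count fields now default to 0 and `updated` is optional, so a bare summary can be constructed from the name alone.

    from mlrun.common.schemas.project import ProjectSummary

    # All count fields default to 0 and `updated` defaults to None until the
    # counters are actually computed and cached.
    summary = ProjectSummary(name="my-project")
    print(summary.files_count, summary.updated)  # -> 0 None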
mlrun/config.py CHANGED
@@ -52,6 +52,11 @@ default_config = {
     "kubernetes": {
         "kubeconfig_path": "",  # local path to kubeconfig file (for development purposes),
         # empty by default as the API already running inside k8s cluster
+        "pagination": {
+            # pagination config for interacting with k8s API
+            "list_pods_limit": 200,
+            "list_crd_objects_limit": 200,
+        },
     },
     "dbpath": "",  # db/api url
     # url to nuclio dashboard api (can be with user & token, e.g. https://username:password@dashboard-url.com)
@@ -108,7 +113,12 @@ default_config = {
            # max number of parallel abort run jobs in runs monitoring
            "concurrent_abort_stale_runs_workers": 10,
            "list_runs_time_period_in_days": 7,  # days
-        }
+        },
+        "projects": {
+            "summaries": {
+                "cache_interval": "30",
+            },
+        },
     },
     "crud": {
         "runs": {
@@ -269,6 +279,16 @@ default_config = {
         "url": "",
         "service": "mlrun-api-chief",
         "port": 8080,
+        "feature_gates": {
+            "scheduler": "enabled",
+            "project_sync": "enabled",
+            "cleanup": "enabled",
+            "runs_monitoring": "enabled",
+            "pagination_cache": "enabled",
+            "project_summaries": "enabled",
+            "start_logs": "enabled",
+            "stop_logs": "enabled",
+        },
     },
     "worker": {
         "sync_with_chief": {
@@ -437,7 +457,6 @@ default_config = {
     "followers": "",
     # This is used as the interval for the sync loop both when mlrun is leader and follower
     "periodic_sync_interval": "1 minute",
-    "counters_cache_ttl": "2 minutes",
     "project_owners_cache_ttl": "30 seconds",
     # access key to be used when the leader is iguazio and polling is done from it
     "iguazio_access_key": "",
mlrun/data_types/spark.py CHANGED
@@ -20,10 +20,10 @@ import pytz
 from pyspark.sql.functions import to_utc_timestamp
 from pyspark.sql.types import BooleanType, DoubleType, TimestampType
 
+from mlrun.feature_store.retrieval.spark_merger import spark_df_to_pandas
 from mlrun.utils import logger
 
 from .data_types import InferOptions, spark_to_value_type
-from .to_pandas import toPandas
 
 try:
     import pyspark.sql.functions as funcs
@@ -75,7 +75,7 @@ def get_df_preview_spark(df, preview_lines=20):
     """capture preview data from spark df"""
     df = df.limit(preview_lines)
 
-    result_dict = toPandas(df).to_dict(orient="split")
+    result_dict = spark_df_to_pandas(df).to_dict(orient="split")
     return [result_dict["columns"], *result_dict["data"]]
 
 
mlrun/data_types/to_pandas.py CHANGED
@@ -15,21 +15,11 @@
 import warnings
 from collections import Counter
 
-from pyspark.sql.types import (
-    BooleanType,
-    ByteType,
-    DoubleType,
-    FloatType,
-    IntegerType,
-    IntegralType,
-    LongType,
-    MapType,
-    ShortType,
-    TimestampType,
-)
-
-
-def toPandas(spark_df):
+import pandas as pd
+import semver
+
+
+def _toPandas(spark_df):
     """
     Modified version of spark DataFrame.toPandas() –
     https://github.com/apache/spark/blob/v3.2.3/python/pyspark/sql/pandas/conversion.py#L35
@@ -40,6 +30,12 @@ def toPandas(spark_df):
     This modification adds the missing unit to the dtype.
     """
     from pyspark.sql.dataframe import DataFrame
+    from pyspark.sql.types import (
+        BooleanType,
+        IntegralType,
+        MapType,
+        TimestampType,
+    )
 
     assert isinstance(spark_df, DataFrame)
 
@@ -48,7 +44,6 @@
     require_minimum_pandas_version()
 
     import numpy as np
-    import pandas as pd
 
     timezone = spark_df.sql_ctx._conf.sessionLocalTimeZone()
 
@@ -217,6 +212,16 @@
 
 def _to_corrected_pandas_type(dt):
     import numpy as np
+    from pyspark.sql.types import (
+        BooleanType,
+        ByteType,
+        DoubleType,
+        FloatType,
+        IntegerType,
+        LongType,
+        ShortType,
+        TimestampType,
+    )
 
     if type(dt) == ByteType:
         return np.int8
@@ -236,3 +241,30 @@ def _to_corrected_pandas_type(dt):
         return "datetime64[ns]"
     else:
         return None
+
+
+def spark_df_to_pandas(spark_df):
+    # as of pyspark 3.2.3, toPandas fails to convert timestamps unless we work around the issue
+    # when we upgrade pyspark, we should check whether this workaround is still necessary
+    # see https://stackoverflow.com/questions/76389694/transforming-pyspark-to-pandas-dataframe
+    if semver.parse(pd.__version__)["major"] >= 2:
+        import pyspark.sql.functions as pyspark_functions
+
+        type_conversion_dict = {}
+        for field in spark_df.schema.fields:
+            if str(field.dataType) == "TimestampType":
+                spark_df = spark_df.withColumn(
+                    field.name,
+                    pyspark_functions.date_format(
+                        pyspark_functions.to_timestamp(field.name),
+                        "yyyy-MM-dd'T'HH:mm:ss.SSSSSSSSS",
+                    ),
+                )
+                type_conversion_dict[field.name] = "datetime64[ns]"
+
+        df = _toPandas(spark_df)
+        if type_conversion_dict:
+            df = df.astype(type_conversion_dict)
+        return df
+    else:
+        return _toPandas(spark_df)
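For orientation, a minimal sketch of the relocated helper in use (assumes pyspark, pandas, semver and mlrun 1.7.0rc31 are installed and a local Spark session can be created; the sample data and column names are illustrative):

    from datetime import datetime

    from pyspark.sql import SparkSession

    from mlrun.data_types.to_pandas import spark_df_to_pandas

    spark = SparkSession.builder.master("local[1]").getOrCreate()
    sdf = spark.createDataFrame([(1, datetime(2024, 1, 1, 12, 0))], ["id", "ts"])

    # On pandas >= 2 the timestamp column goes through the string round-trip shown
    # in the diff and comes back as datetime64[ns]; on pandas < 2 the plain
    # conversion path is used.
    pdf = spark_df_to_pandas(sdf)
    print(pdf.dtypes)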
mlrun/datastore/__init__.py CHANGED
@@ -117,6 +117,7 @@ def get_stream_pusher(stream_path: str, **kwargs):
         return OutputStream(stream_path, **kwargs)
     elif stream_path.startswith("v3io"):
         endpoint, stream_path = parse_path(stream_path)
+        endpoint = kwargs.pop("endpoint", None) or endpoint
         return OutputStream(stream_path, endpoint=endpoint, **kwargs)
     elif stream_path.startswith("dummy://"):
         return _DummyStream(**kwargs)
mlrun/datastore/base.py CHANGED
@@ -215,6 +215,11 @@ class DataStore:
             raise mlrun.errors.MLRunInvalidArgumentError(
                 "When providing start_time or end_time, must provide time_column"
             )
+        if start_time and end_time and start_time.tzinfo != end_time.tzinfo:
+            raise mlrun.errors.MLRunInvalidArgumentError(
+                "start_time and end_time must have the same time zone"
+            )
+
         if start_time or end_time or additional_filters:
             partitions_time_attributes = find_partitions(url, file_system)
             set_filters(
@@ -232,13 +237,17 @@
             ):
                 raise ex
 
-            # TODO: fix timezone issue (ML-6308)
-            if start_time.tzinfo:
-                start_time_inner = start_time.replace(tzinfo=None)
-                end_time_inner = end_time.replace(tzinfo=None)
-            else:
-                start_time_inner = start_time.replace(tzinfo=pytz.utc)
-                end_time_inner = end_time.replace(tzinfo=pytz.utc)
+            start_time_inner = None
+            if start_time:
+                start_time_inner = start_time.replace(
+                    tzinfo=None if start_time.tzinfo else pytz.utc
+                )
+
+            end_time_inner = None
+            if end_time:
+                end_time_inner = end_time.replace(
+                    tzinfo=None if end_time.tzinfo else pytz.utc
+                )
 
             set_filters(
                 partitions_time_attributes,
@@ -382,7 +391,10 @@ class DataStore:
         }
 
     def rm(self, path, recursive=False, maxdepth=None):
-        self.filesystem.rm(path=path, recursive=recursive, maxdepth=maxdepth)
+        try:
+            self.filesystem.rm(path=path, recursive=recursive, maxdepth=maxdepth)
+        except FileNotFoundError:
+            pass
 
     @staticmethod
     def _is_dd(df_module):
mlrun/datastore/datastore.py CHANGED
@@ -32,6 +32,8 @@ in_memory_store = InMemoryStore()
 
 
 def parse_url(url):
+    if url and url.startswith("v3io://") and not url.startswith("v3io:///"):
+        url = url.replace("v3io://", "v3io:///", 1)
     parsed_url = urlparse(url)
     schema = parsed_url.scheme.lower()
     endpoint = parsed_url.hostname
@@ -94,7 +96,7 @@ def schema_to_store(schema):
         from .dbfs_store import DBFSStore
 
         return DBFSStore
-    elif schema == "hdfs":
+    elif schema in ["hdfs", "webhdfs"]:
         from .hdfs import HdfsStore
 
         return HdfsStore
@@ -207,7 +209,7 @@ class StoreManager:
     ) -> (DataStore, str, str):
         schema, endpoint, parsed_url = parse_url(url)
         subpath = parsed_url.path
-        store_key = f"{schema}://{endpoint}"
+        store_key = f"{schema}://{endpoint}" if endpoint else f"{schema}://"
 
         if schema == "ds":
             datastore_profile = datastore_profile_read(url, project_name, secrets)
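A minimal sketch of the new v3io URL normalization in parse_url (assumes mlrun 1.7.0rc31 is installed; the path is illustrative):

    from mlrun.datastore.datastore import parse_url

    # "v3io://container/..." is rewritten to "v3io:///container/..." before parsing,
    # so the first path segment stays part of the path instead of being misread as a
    # hostname, and the store key falls back to "v3io://".
    schema, endpoint, parsed_url = parse_url("v3io://projects/demo/artifacts/data.parquet")
    print(schema, endpoint, parsed_url.path)  # -> v3io None /projects/demo/artifacts/data.parquet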
mlrun/datastore/datastore_profile.py CHANGED
@@ -412,7 +412,7 @@ class DatastoreProfileHdfs(DatastoreProfile):
         return res or None
 
     def url(self, subpath):
-        return f"hdfs://{self.host}:{self.http_port}{subpath}"
+        return f"webhdfs://{self.host}:{self.http_port}{subpath}"
 
 
 class DatastoreProfile2Json(pydantic.BaseModel):
mlrun/datastore/google_cloud_storage.py CHANGED
@@ -133,6 +133,7 @@ class GoogleCloudStorageStore(DataStore):
 
     def rm(self, path, recursive=False, maxdepth=None):
         path = self._make_path(path)
+        self.filesystem.exists(path)
         self.filesystem.rm(path=path, recursive=recursive, maxdepth=maxdepth)
 
     def get_spark_options(self):
mlrun/datastore/inmem.py CHANGED
@@ -85,3 +85,6 @@ class InMemoryStore(DataStore):
             kwargs.pop(field, None)
 
         return reader(item, **kwargs)
+
+    def rm(self, path, recursive=False, maxdepth=None):
+        self._items.pop(path, None)
mlrun/datastore/s3.py CHANGED
@@ -201,6 +201,8 @@ class S3Store(DataStore):
     def rm(self, path, recursive=False, maxdepth=None):
         bucket, key = self.get_bucket_and_key(path)
         path = f"{bucket}/{key}"
+        # In order to raise an error if there is connection error, ML-7056.
+        self.filesystem.exists(path=path)
         self.filesystem.rm(path=path, recursive=recursive, maxdepth=maxdepth)
 
 
mlrun/datastore/sources.py CHANGED
@@ -826,6 +826,20 @@ class SnowflakeSource(BaseSourceDriver):
         spark_options["query"] = self.attributes.get("query")
         return spark_options
 
+    def to_dataframe(
+        self,
+        columns=None,
+        df_module=None,
+        entities=None,
+        start_time=None,
+        end_time=None,
+        time_field=None,
+        additional_filters=None,
+    ):
+        raise mlrun.errors.MLRunRuntimeError(
+            f"{type(self).__name__} supports only spark engine"
+        )
+
 
 class CustomSource(BaseSourceDriver):
     kind = "custom"
mlrun/datastore/targets.py CHANGED
@@ -726,6 +726,10 @@ class BaseStoreTarget(DataTargetBase):
         timestamp_key=None,
         featureset_status=None,
     ):
+        if not self.support_storey:
+            raise mlrun.errors.MLRunRuntimeError(
+                f"{type(self).__name__} does not support storey engine"
+            )
         raise NotImplementedError()
 
     def purge(self):
@@ -768,6 +772,10 @@ class BaseStoreTarget(DataTargetBase):
 
     def get_spark_options(self, key_column=None, timestamp_key=None, overwrite=True):
         # options used in spark.read.load(**options)
+        if not self.support_spark:
+            raise mlrun.errors.MLRunRuntimeError(
+                f"{type(self).__name__} does not support spark engine"
+            )
         raise NotImplementedError()
 
     def prepare_spark_df(self, df, key_columns, timestamp_key=None, spark_options=None):
@@ -1283,7 +1291,9 @@ class SnowflakeTarget(BaseStoreTarget):
         additional_filters=None,
         **kwargs,
     ):
-        raise NotImplementedError()
+        raise mlrun.errors.MLRunRuntimeError(
+            f"{type(self).__name__} does not support storey engine"
+        )
 
     @property
     def source_spark_attributes(self) -> dict:
mlrun/db/base.py CHANGED
@@ -925,5 +925,6 @@ class RunDBInterface(ABC):
         self,
         project: str,
         credentials: dict[str, str],
+        replace_creds: bool,
     ) -> None:
         pass
mlrun/db/httpdb.py CHANGED
@@ -1253,13 +1253,17 @@ class HTTPRunDB(RunDBInterface):
             function_name=name,
         )
 
-    def list_functions(self, name=None, project=None, tag=None, labels=None):
+    def list_functions(
+        self, name=None, project=None, tag=None, labels=None, since=None, until=None
+    ):
         """Retrieve a list of functions, filtered by specific criteria.
 
         :param name: Return only functions with a specific name.
         :param project: Return functions belonging to this project. If not specified, the default project is used.
         :param tag: Return function versions with specific tags.
         :param labels: Return functions that have specific labels assigned to them.
+        :param since: Return functions updated after this date (as datetime object).
+        :param until: Return functions updated before this date (as datetime object).
         :returns: List of function objects (as dictionary).
         """
         project = project or config.default_project
@@ -1267,6 +1271,8 @@ class HTTPRunDB(RunDBInterface):
             "name": name,
             "tag": tag,
             "label": labels or [],
+            "since": datetime_to_iso(since),
+            "until": datetime_to_iso(until),
         }
         error = "list functions"
         path = f"projects/{project}/functions"
@@ -3546,17 +3552,19 @@ class HTTPRunDB(RunDBInterface):
         self,
         project: str,
         credentials: dict[str, str],
+        replace_creds: bool,
     ) -> None:
         """
        Set the credentials for the model monitoring application.
 
        :param project: Project name.
        :param credentials: Credentials to set.
+       :param replace_creds: If True, will override the existing credentials.
        """
        self.api_call(
            method=mlrun.common.types.HTTPMethod.POST,
            path=f"projects/{project}/model-monitoring/set-model-monitoring-credentials",
-           params={**credentials},
+           params={**credentials, "replace_creds": replace_creds},
        )
 
     def create_hub_source(
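A minimal sketch of the new since/until filters on list_functions (assumes a reachable MLRun API is configured via dbpath; the project name is hypothetical):

    from datetime import datetime, timedelta

    import mlrun

    db = mlrun.get_run_db()
    # Only functions updated in the last day; `until` can bound the range from above.
    recent_functions = db.list_functions(
        project="my-project",
        since=datetime.now() - timedelta(days=1),
    )
    print(len(recent_functions))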
mlrun/db/nopdb.py CHANGED
@@ -738,6 +738,7 @@ class NopDB(RunDBInterface):
         self,
         project: str,
         credentials: dict[str, str],
+        replace_creds: bool,
     ) -> None:
         pass
 
mlrun/feature_store/retrieval/spark_merger.py CHANGED
@@ -13,45 +13,16 @@
 # limitations under the License.
 #
 
-import pandas as pd
-import semver
 
 import mlrun
+from mlrun.data_types.to_pandas import spark_df_to_pandas
 from mlrun.datastore.sources import ParquetSource
 from mlrun.datastore.targets import get_offline_target
+from mlrun.runtimes import RemoteSparkRuntime
+from mlrun.runtimes.sparkjob import Spark3Runtime
 from mlrun.utils.helpers import additional_filters_warning
 
-from ...runtimes import RemoteSparkRuntime
-from ...runtimes.sparkjob import Spark3Runtime
 from .base import BaseMerger
-from .conversion import PandasConversionMixin
-
-
-def spark_df_to_pandas(spark_df):
-    # as of pyspark 3.2.3, toPandas fails to convert timestamps unless we work around the issue
-    # when we upgrade pyspark, we should check whether this workaround is still necessary
-    # see https://stackoverflow.com/questions/76389694/transforming-pyspark-to-pandas-dataframe
-    if semver.parse(pd.__version__)["major"] >= 2:
-        import pyspark.sql.functions as pyspark_functions
-
-        type_conversion_dict = {}
-        for field in spark_df.schema.fields:
-            if str(field.dataType) == "TimestampType":
-                spark_df = spark_df.withColumn(
-                    field.name,
-                    pyspark_functions.date_format(
-                        pyspark_functions.to_timestamp(field.name),
-                        "yyyy-MM-dd'T'HH:mm:ss.SSSSSSSSS",
-                    ),
-                )
-                type_conversion_dict[field.name] = "datetime64[ns]"
-
-        df = PandasConversionMixin.toPandas(spark_df)
-        if type_conversion_dict:
-            df = df.astype(type_conversion_dict)
-        return df
-    else:
-        return PandasConversionMixin.toPandas(spark_df)
 
 
 class SparkFeatureMerger(BaseMerger):
mlrun/model.py CHANGED
@@ -753,10 +753,6 @@ class Notification(ModelObj):
             raise mlrun.errors.MLRunInvalidArgumentError(
                 "Both 'secret_params' and 'params' are empty, at least one must be defined."
             )
-        if secret_params and params and secret_params != params:
-            raise mlrun.errors.MLRunInvalidArgumentError(
-                "Both 'secret_params' and 'params' are defined but they contain different values"
-            )
 
         notification_class.validate_params(secret_params or params)
 
@@ -1315,7 +1311,7 @@ class RunTemplate(ModelObj):
 
             task.with_input("data", "/file-dir/path/to/file")
             task.with_input("data", "s3://<bucket>/path/to/file")
-            task.with_input("data", "v3io://[<remote-host>]/<data-container>/path/to/file")
+            task.with_input("data", "v3io://<data-container>/path/to/file")
         """
         if not self.spec.inputs:
             self.spec.inputs = {}
mlrun/model_monitoring/api.py CHANGED
@@ -569,10 +569,10 @@ def _create_model_monitoring_function_base(
         "please use `ModelMonitoringApplicationBaseV2`. It will be removed in 1.9.0.",
         FutureWarning,
     )
-    if name in mm_constants.MonitoringFunctionNames.list():
+    if name in mm_constants._RESERVED_FUNCTION_NAMES:
         raise mlrun.errors.MLRunInvalidArgumentError(
-            f"An application cannot have the following names: "
-            f"{mm_constants.MonitoringFunctionNames.list()}"
+            "An application cannot have the following names: "
+            f"{mm_constants._RESERVED_FUNCTION_NAMES}"
         )
     if func is None:
         func = ""