mlrun 1.10.0rc42__py3-none-any.whl → 1.10.1rc4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.
mlrun/config.py CHANGED
@@ -66,7 +66,6 @@ default_config = {
  "nuclio_version": "",
  "default_nuclio_runtime": "python:3.11",
  "nest_asyncio_enabled": "", # enable import of nest_asyncio for corner cases with old jupyter, set "1"
- "ui_url": "", # remote/external mlrun UI url (for hyperlinks) (This is deprecated in favor of the ui block)
  "remote_host": "",
  "api_base_version": "v1",
  "version": "", # will be set to current version
@@ -304,7 +303,7 @@ default_config = {
  "application": {
  "default_sidecar_internal_port": 8050,
  "default_authentication_mode": mlrun.common.schemas.APIGatewayAuthenticationMode.none,
- "default_worker_number": 10000,
+ "default_worker_number": 100,
  },
  },
  # TODO: function defaults should be moved to the function spec config above
@@ -725,7 +724,7 @@ default_config = {
  # Set false to avoid creating a global source (for example in a dark site)
  "create": True,
  "name": "default",
- "description": "MLRun global function hub",
+ "description": "MLRun hub",
  "url": "https://mlrun.github.io/marketplace",
  "channel": "master",
  },
@@ -1280,10 +1279,7 @@ class Config:

  @staticmethod
  def resolve_ui_url():
- # ui_url is deprecated in favor of the ui.url (we created the ui block)
- # since the config class is used in a "recursive" way, we can't use property like we used in other places
- # since the property will need to be url, which exists in other structs as well
- return config.ui.url or config.ui_url
+ return config.ui.url

  def is_api_running_on_k8s(self):
  # determine if the API service is attached to K8s cluster
@@ -1570,7 +1566,6 @@ def read_env(env=None, prefix=env_prefix):
  "https://mlrun-api.", "https://framesd."
  )

- uisvc = env.get("MLRUN_UI_SERVICE_HOST")
  igz_domain = env.get("IGZ_NAMESPACE_DOMAIN")

  # workaround to try and detect IGZ domain
@@ -1596,10 +1591,6 @@ def read_env(env=None, prefix=env_prefix):
  if config.get("nuclio_dashboard_url") == "disabled":
  config["nuclio_dashboard_url"] = ""

- if uisvc and not config.get("ui_url"):
- if igz_domain:
- config["ui_url"] = f"https://mlrun-ui.{igz_domain}"
-
  if log_level := config.get("log_level"):
  import mlrun.utils.logger

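With the deprecated top-level ui_url key removed, the UI address is resolved only from the ui.url block. A minimal sketch of the new lookup, assuming the standard mlrun.mlconf accessor for this Config object (the URL value below is made up):

    import mlrun

    # MLRUN_UI_SERVICE_HOST auto-detection and the ui_url key no longer apply
    mlrun.mlconf.ui.url = "https://mlrun-ui.example.com"
    print(mlrun.mlconf.resolve_ui_url())  # -> https://mlrun-ui.example.com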
mlrun/datastore/base.py CHANGED
@@ -11,11 +11,14 @@
  # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  # See the License for the specific language governing permissions and
  # limitations under the License.
+ import datetime
+ import os
+ import os.path
  import tempfile
  import urllib.parse
  from base64 import b64encode
  from copy import copy
- from os import path, remove
+ from types import ModuleType
  from typing import Optional, Union
  from urllib.parse import urlparse

@@ -156,6 +159,195 @@ class DataStore(BaseRemoteClient):
  def get_spark_options(self, path=None):
  return {}

+ @staticmethod
+ def _is_directory_in_range(
+ start_time: Optional[datetime.datetime],
+ end_time: Optional[datetime.datetime],
+ year: int,
+ month: Optional[int] = None,
+ day: Optional[int] = None,
+ hour: Optional[int] = None,
+ **kwargs,
+ ):
+ """Check if a partition directory (year=.., month=.., etc.) is in the time range."""
+ from dateutil.relativedelta import relativedelta
+
+ partition_start = datetime.datetime(
+ year=year,
+ month=month or 1,
+ day=day or 1,
+ hour=hour or 0,
+ tzinfo=start_time.tzinfo if start_time else end_time.tzinfo,
+ )
+ partition_end = (
+ partition_start
+ + relativedelta(
+ years=1 if month is None else 0,
+ months=1 if day is None and month is not None else 0,
+ days=1 if hour is None and day is not None else 0,
+ hours=1 if hour is not None else 0,
+ )
+ - datetime.timedelta(microseconds=1)
+ )
+
+ if (end_time and end_time < partition_start) or (
+ start_time and start_time > partition_end
+ ):
+ return False
+ return True
+
+ @staticmethod
+ def _list_partition_paths_helper(
+ paths: list[str],
+ start_time: Optional[datetime.datetime],
+ end_time: Optional[datetime.datetime],
+ current_path: str,
+ partition_level: str,
+ filesystem,
+ ):
+ directory_split = current_path.rsplit("/", 1)
+ time_unit = None
+ directory_start, directory_end = "", ""
+ if len(directory_split) == 2:
+ directory_start, directory_end = directory_split
+ time_unit = directory_end.split("=")[0] if "=" in directory_end else None
+
+ if not time_unit and directory_end.endswith((".parquet", ".pq")):
+ paths.append(directory_start.rstrip("/"))
+ return
+ elif time_unit and time_unit == partition_level:
+ paths.append(current_path.rstrip("/"))
+ return
+
+ directories = filesystem.ls(current_path, detail=True)
+ if len(directories) == 0:
+ return
+ for directory in directories:
+ current_path = directory["name"]
+ parts = [p for p in current_path.split("/") if "=" in p]
+ kwargs = {}
+ for part in parts:
+ key, value = part.split("=", 1)
+ if value.isdigit():
+ value = int(value)
+ kwargs[key] = value
+ if DataStore._is_directory_in_range(start_time, end_time, **kwargs):
+ DataStore._list_partition_paths_helper(
+ paths,
+ start_time,
+ end_time,
+ current_path,
+ partition_level,
+ filesystem,
+ )
+
+ @staticmethod
+ def _list_partitioned_paths(
+ base_url: str,
+ start_time: Optional[datetime.datetime],
+ end_time: Optional[datetime.datetime],
+ partition_level: str,
+ filesystem,
+ ):
+ paths = []
+ parsed_base_url = urlparse(base_url)
+ base_path = parsed_base_url.path
+
+ if parsed_base_url.scheme not in ["v3io", "v3ios"]:
+ base_path = parsed_base_url.netloc + base_path
+
+ DataStore._list_partition_paths_helper(
+ paths, start_time, end_time, base_path, partition_level, filesystem
+ )
+ paths = [
+ DataStore._reconstruct_path_from_base_url(parsed_base_url, path)
+ for path in paths
+ ]
+ return paths
+
+ @staticmethod
+ def _reconstruct_path_from_base_url(
+ parsed_base_url: urllib.parse.ParseResult, returned_path: str
+ ) -> str:
+ scheme = parsed_base_url.scheme
+ authority = parsed_base_url.netloc
+ returned_path = returned_path.lstrip("/")
+ if scheme == "v3io":
+ return f"{scheme}://{authority}/{returned_path}"
+ else:
+ return f"{scheme}://{returned_path}"
+
+ @staticmethod
+ def _clean_filters_for_partitions(
+ filters: list[list[tuple]],
+ partition_keys: list[str],
+ ):
+ """
+ Remove partition keys from filters.
+
+ :param filters: pandas-style filters
+ Example: [[('year','=',2025),('month','=',11),('timestamp','>',ts1)]]
+ :param partition_keys: partition columns handled via directory
+
+ :return list of list of tuples: cleaned filters without partition keys
+ """
+ cleaned_filters = []
+ for group in filters:
+ new_group = [f for f in group if f[0] not in partition_keys]
+ if new_group:
+ cleaned_filters.append(new_group)
+ return cleaned_filters
+
+ @staticmethod
+ def _read_partitioned_parquet(
+ base_url: str,
+ start_time: Optional[datetime.datetime],
+ end_time: Optional[datetime.datetime],
+ partition_keys: list[str],
+ df_module: ModuleType,
+ filesystem: fsspec.AbstractFileSystem,
+ **kwargs,
+ ):
+ """
+ Reads only the relevant partitions and concatenates the results.
+ Note that partition_keys cannot be empty.
+ """
+ logger.debug(f"Starting partition discovery process for {base_url}")
+
+ paths = DataStore._list_partitioned_paths(
+ base_url,
+ start_time,
+ end_time,
+ partition_keys[-1],
+ filesystem,
+ )
+
+ dfs = []
+ for current_path in paths:
+ try:
+ kwargs["filters"] = DataStore._clean_filters_for_partitions(
+ kwargs["filters"], partition_keys
+ )
+ df = df_module.read_parquet(current_path, **kwargs)
+ logger.debug(
+ "Finished reading DataFrame from subpath",
+ url=current_path,
+ )
+ dfs.append(df)
+ except FileNotFoundError as e:
+ # Skip partitions that don't exist or have no data
+ logger.warning(
+ "Failed to read DataFrame", url=current_path, exception=e
+ )
+
+ final_df = pd.concat(dfs) if dfs else pd.DataFrame()
+ logger.debug(
+ "Finished reading partitioned parquet files",
+ url=base_url,
+ columns=final_df.columns,
+ )
+ return final_df
+
  @staticmethod
  def _parquet_reader(
  df_module,
@@ -165,6 +357,7 @@ class DataStore(BaseRemoteClient):
  start_time,
  end_time,
  additional_filters,
+ optimize_discovery,
  ):
  from storey.utils import find_filters, find_partitions

@@ -203,7 +396,10 @@ class DataStore(BaseRemoteClient):
  )

  if start_time or end_time or additional_filters:
- partitions_time_attributes = find_partitions(url, file_system)
+ partitions_time_attributes, partitions = find_partitions(
+ url, file_system, True
+ )
+ logger.debug("Partitioned parquet read", partitions=partitions)
  set_filters(
  partitions_time_attributes,
  start_time,
@@ -211,8 +407,28 @@ class DataStore(BaseRemoteClient):
  additional_filters,
  kwargs,
  )
+
  try:
- return df_module.read_parquet(*args, **kwargs)
+ if (
+ optimize_discovery
+ and partitions_time_attributes
+ and DataStore._verify_path_partition_level(
+ urlparse(url).path, partitions
+ )
+ and (start_time or end_time)
+ ):
+ return DataStore._read_partitioned_parquet(
+ url,
+ start_time,
+ end_time,
+ partitions_time_attributes,
+ df_module,
+ file_system,
+ **kwargs,
+ )
+
+ else:
+ return df_module.read_parquet(*args, **kwargs)
  except pyarrow.lib.ArrowInvalid as ex:
  if not str(ex).startswith(
  "Cannot compare timestamp with timezone to timestamp without timezone"
@@ -238,7 +454,24 @@ class DataStore(BaseRemoteClient):
  additional_filters,
  kwargs,
  )
- return df_module.read_parquet(*args, **kwargs)
+ if (
+ optimize_discovery
+ and partitions_time_attributes
+ and DataStore._verify_path_partition_level(
+ urlparse(url).path, partitions
+ )
+ ):
+ return DataStore._read_partitioned_parquet(
+ url,
+ start_time_inner,
+ end_time_inner,
+ partitions_time_attributes,
+ df_module,
+ file_system,
+ **kwargs,
+ )
+ else:
+ return df_module.read_parquet(*args, **kwargs)
  else:
  return df_module.read_parquet(*args, **kwargs)

@@ -261,6 +494,10 @@ class DataStore(BaseRemoteClient):
  file_url = self._sanitize_url(url)
  is_csv, is_json, drop_time_column = False, False, False
  file_system = self.filesystem
+
+ # Feature flag optimize partition discovery by providing specific partition levels urls to the parquet reader
+ optimize_discovery = kwargs.pop("optimize_discovery", True)
+
  if file_url.endswith(".csv") or format == "csv":
  is_csv = True
  drop_time_column = False
@@ -322,6 +559,7 @@ class DataStore(BaseRemoteClient):
  start_time,
  end_time,
  additional_filters,
+ optimize_discovery,
  )

  elif file_url.endswith(".json") or format == "json":
@@ -347,7 +585,7 @@ class DataStore(BaseRemoteClient):
  temp_file = tempfile.NamedTemporaryFile(delete=False)
  self.download(self._join(subpath), temp_file.name)
  df = reader(temp_file.name, **kwargs)
- remove(temp_file.name)
+ os.remove(temp_file.name)

  if is_json or is_csv:
  # for parquet file the time filtering is executed in `reader`
@@ -387,6 +625,26 @@ class DataStore(BaseRemoteClient):
  except ImportError:
  return False

+ @staticmethod
+ def _verify_path_partition_level(base_path: str, partitions: list[str]) -> bool:
+ if not partitions:
+ return False
+
+ path_parts = base_path.strip("/").split("/")
+ path_parts = [part.split("=")[0] for part in path_parts if "=" in part]
+ if "hour" in partitions:
+ hour_index = partitions.index("hour")
+ else:
+ return False
+ for i, part in enumerate(partitions):
+ if not (
+ part in path_parts
+ or part in ["year", "month", "day", "hour"]
+ or i > hour_index
+ ):
+ return False
+ return True
+

  class DataItem:
  """Data input/output class abstracting access to various local/remote data sources
@@ -439,7 +697,7 @@ class DataItem:
  @property
  def suffix(self):
  """DataItem suffix (file extension) e.g. '.png'"""
- _, file_ext = path.splitext(self._path)
+ _, file_ext = os.path.splitext(self._path)
  return file_ext

  @property
@@ -548,7 +806,7 @@ class DataItem:
  return

  if self._local_path:
- remove(self._local_path)
+ os.remove(self._local_path)
  self._local_path = ""

  def as_df(
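The helper methods added above prune hive-style partition directories (year=/month=/day=/hour=) against the requested time range before any parquet file is opened, instead of handing the whole tree to read_parquet. A minimal usage sketch, assuming a time-partitioned parquet dataset at a made-up URL and the existing DataItem.as_df time-filter parameters; optimize_discovery is the new feature flag popped from kwargs above and is on by default:

    import datetime
    import mlrun

    di = mlrun.get_dataitem("s3://my-bucket/events/")  # hypothetical partitioned dataset
    df = di.as_df(
        start_time=datetime.datetime(2025, 11, 1, tzinfo=datetime.timezone.utc),
        end_time=datetime.datetime(2025, 11, 2, tzinfo=datetime.timezone.utc),
        time_column="timestamp",
        # optimize_discovery=False,  # forwarded via kwargs to fall back to a full read_parquet scan
    )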
@@ -47,7 +47,7 @@ from .v3io import V3ioStore
  in_memory_store = InMemoryStore()


- def schema_to_store(schema) -> DataStore.__subclasses__():
+ def schema_to_store(schema) -> type[DataStore]:
  # import store classes inside to enable making their dependencies optional (package extras)

  if not schema or schema in get_local_file_schema():
@@ -11,7 +11,7 @@
  # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  # See the License for the specific language governing permissions and
  # limitations under the License.
-
+ import threading
  from typing import TYPE_CHECKING, Any, Optional, Union

  import mlrun
@@ -41,6 +41,9 @@ class HuggingFaceProvider(ModelProvider):
  into memory for inference. Ensure you have the required CPU/GPU and memory to use this operation.
  """

+ # locks for threading use cases
+ _client_lock = threading.Lock()
+
  def __init__(
  self,
  parent,
@@ -224,7 +227,8 @@ class HuggingFaceProvider(ModelProvider):

  self.options["model_kwargs"] = self.options.get("model_kwargs", {})
  self.options["model_kwargs"]["local_files_only"] = True
- self._client = pipeline(model=self.model, **self.options)
+ with self._client_lock:
+ self._client = pipeline(model=self.model, **self.options)
  self._expected_operation_type = Pipeline
  except ImportError as exc:
  raise ImportError("transformers package is not installed") from exc
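The class-level _client_lock serializes Hugging Face pipeline construction when several threads trigger the lazy client setup at once. A generic sketch of that guard-with-a-lock pattern, not the provider's actual code (the holder class and placeholder client below are illustrative only):

    import threading

    class LazyClientHolder:
        # one lock shared across instances, as with HuggingFaceProvider._client_lock
        _client_lock = threading.Lock()

        def __init__(self):
            self._client = None

        def get_client(self):
            if self._client is None:
                with self._client_lock:
                    if self._client is None:  # re-check after acquiring the lock
                        self._client = object()  # stands in for pipeline(model=..., **options)
            return self._client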
@@ -76,9 +76,9 @@ class ResourceCache:
  return self._tabels[uri]

  if uri.startswith("v3io://") or uri.startswith("v3ios://"):
- endpoint, uri = parse_path(uri)
+ endpoint, path = parse_path(uri)
  self._tabels[uri] = Table(
- uri,
+ path,
  V3ioDriver(webapi=endpoint or mlrun.mlconf.v3io_api),
  flush_interval_secs=mlrun.mlconf.feature_store.flush_interval,
  )
@@ -87,10 +87,10 @@
  if uri.startswith("redis://") or uri.startswith("rediss://"):
  from storey.redis_driver import RedisDriver

- endpoint, uri = parse_path(uri)
+ endpoint, path = parse_path(uri)
  endpoint = endpoint or mlrun.mlconf.redis.url
  self._tabels[uri] = Table(
- uri,
+ path,
  RedisDriver(redis_url=endpoint, key_prefix="/"),
  flush_interval_secs=mlrun.mlconf.feature_store.flush_interval,
  )
@@ -850,6 +850,11 @@ class ModelMonitoringApplicationBase(MonitoringApplicationToDict, ABC):
  * ``base_period``, ``int``
  * ``write_output``, ``bool``
  * ``existing_data_handling``, ``str``
+ * ``_init_args``, ``dict`` - the arguments for the application class constructor
+ (equivalent to ``class_arguments``)
+
+ See :py:meth:`~ModelMonitoringApplicationBase.evaluate` for more details
+ about these inputs and params.

  For Git sources, add the source archive to the returned job and change the handler:

@@ -928,6 +933,7 @@ class ModelMonitoringApplicationBase(MonitoringApplicationToDict, ABC):
  image: Optional[str] = None,
  with_repo: Optional[bool] = False,
  class_handler: Optional[str] = None,
+ class_arguments: Optional[dict[str, Any]] = None,
  requirements: Optional[Union[str, list[str]]] = None,
  requirements_file: str = "",
  endpoints: Union[list[tuple[str, str]], list[str], Literal["all"], None] = None,
@@ -963,7 +969,10 @@ class ModelMonitoringApplicationBase(MonitoringApplicationToDict, ABC):
  You do not need to have a model endpoint to use this option.
  :param image: Docker image to run the job on (when running remotely).
  :param with_repo: Whether to clone the current repo to the build source.
- :param class_handler: The relative path to the class, useful when using Git sources or code from images.
+ :param class_handler: The relative path to the application class, useful when using Git sources or code
+ from images.
+ :param class_arguments: The arguments for the application class constructor. These are passed to the
+ class ``__init__``. The values must be JSON-serializable.
  :param requirements: List of Python requirements to be installed in the image.
  :param requirements_file: Path to a Python requirements file to be installed in the image.
  :param endpoints: The model endpoints to get the data from. The options are:
@@ -1041,7 +1050,9 @@ class ModelMonitoringApplicationBase(MonitoringApplicationToDict, ABC):
  project=project,
  )

- params: dict[str, Union[list, str, int, None, ds_profile.DatastoreProfile]] = {}
+ params: dict[
+ str, Union[list, dict, str, int, None, ds_profile.DatastoreProfile]
+ ] = {}
  if endpoints:
  params["endpoints"] = endpoints
  if sample_data is None:
@@ -1077,6 +1088,9 @@ class ModelMonitoringApplicationBase(MonitoringApplicationToDict, ABC):
  )
  params["stream_profile"] = stream_profile

+ if class_arguments:
+ params["_init_args"] = class_arguments
+
  inputs: dict[str, str] = {}
  for data, identifier in [
  (sample_data, "sample_data"),
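The new class_arguments option is forwarded to the job as params["_init_args"] and ends up in the application's constructor. A hedged sketch of how it could be used; the app class, its threshold argument, and the call site are illustrative only, and the name of the classmethod being extended is not visible in this hunk:

    from mlrun.model_monitoring.applications import ModelMonitoringApplicationBase

    class ThresholdApp(ModelMonitoringApplicationBase):
        def __init__(self, threshold: float = 0.5):  # hypothetical constructor argument
            self.threshold = threshold

        def do_tracking(self, monitoring_context):
            ...  # application logic using self.threshold

    # class_arguments must be JSON-serializable; it reaches __init__ via params["_init_args"], e.g.:
    # ThresholdApp.<classmethod shown above>(..., class_arguments={"threshold": 0.9})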
@@ -85,17 +85,17 @@ def run_function(
  ) -> Union[mlrun.model.RunObject, mlrun_pipelines.models.PipelineNodeWrapper]:
  """Run a local or remote task as part of a local/kubeflow pipeline

- run_function() allow you to execute a function locally, on a remote cluster, or as part of an automated workflow
- function can be specified as an object or by name (str), when the function is specified by name it is looked up
- in the current project eliminating the need to redefine/edit functions.
+ run_function() allows you to execute a function locally, on a remote cluster, or as part of an automated workflow.
+ The function can be specified as an object or by name (str). When the function is specified by name it is looked up
+ in the current project, eliminating the need to redefine/edit functions.

- when functions run as part of a workflow/pipeline (project.run()) some attributes can be set at the run level,
+ When functions run as part of a workflow/pipeline (project.run()) some attributes can be set at the run level,
  e.g. local=True will run all the functions locally, setting artifact_path will direct all outputs to the same path.
- project runs provide additional notifications/reporting and exception handling.
- inside a Kubeflow pipeline (KFP) run_function() generates KFP node (see PipelineNodeWrapper) which forms a DAG
- some behavior may differ between regular runs and deferred KFP runs.
+ Project runs provide additional notifications/reporting and exception handling.
+ Inside a Kubeflow pipeline (KFP) run_function() generates KFP node (see PipelineNodeWrapper) which forms a DAG.
+ Some behavior may differ between regular runs and deferred KFP runs.

- example (use with function object)::
+ Example (use with function object)::

  LABELS = "is_error"
  MODEL_CLASS = "sklearn.ensemble.RandomForestClassifier"
@@ -107,7 +107,7 @@ def run_function(
  inputs={"dataset": DATA_PATH},
  )

- example (use with project)::
+ Example (use with project)::

  # create a project with two functions (local and from hub)
  project = mlrun.new_project(project_name, "./proj)
@@ -119,7 +119,7 @@ def run_function(
  run2 = run_function("train", params={"label_columns": LABELS, "model_class": MODEL_CLASS},
  inputs={"dataset": run1.outputs["data"]})

- example (use in pipeline)::
+ Example (use in pipeline)::

  @dsl.pipeline(name="test pipeline", description="test")
  def my_pipe(url=""):