zenml-nightly 0.58.1.dev20240610__py3-none-any.whl → 0.58.2.dev20240612__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (91) hide show
  1. README.md +1 -1
  2. RELEASE_NOTES.md +24 -0
  3. zenml/VERSION +1 -1
  4. zenml/artifact_stores/base_artifact_store.py +1 -0
  5. zenml/config/server_config.py +3 -0
  6. zenml/constants.py +1 -0
  7. zenml/integrations/airflow/__init__.py +1 -1
  8. zenml/integrations/gcp/artifact_stores/gcp_artifact_store.py +6 -0
  9. zenml/integrations/gcp/flavors/gcp_artifact_store_flavor.py +1 -0
  10. zenml/integrations/kubernetes/flavors/kubernetes_orchestrator_flavor.py +4 -0
  11. zenml/integrations/kubernetes/orchestrators/kubernetes_orchestrator_entrypoint.py +8 -1
  12. zenml/logging/__init__.py +5 -0
  13. zenml/logging/step_logging.py +253 -25
  14. zenml/orchestrators/dag_runner.py +18 -3
  15. zenml/utils/pipeline_docker_image_builder.py +1 -4
  16. zenml/zen_server/cloud_utils.py +20 -2
  17. zenml/zen_server/dashboard/assets/{404-D5p6PIdn.js → 404-CDPQCl4D.js} +1 -1
  18. zenml/zen_server/dashboard/assets/{@reactflow-CegZ5GV3.js → @reactflow-CHBapDaj.js} +1 -1
  19. zenml/zen_server/dashboard/assets/{AwarenessChannel-DDpU6zHx.js → AwarenessChannel-nXGpmj_f.js} +1 -1
  20. zenml/zen_server/dashboard/assets/{Cards-wfOUm_Ae.js → Cards-nwsvQLVS.js} +1 -1
  21. zenml/zen_server/dashboard/assets/{CodeSnippet-iinvcx17.js → CodeSnippet-BidtnWOi.js} +2 -2
  22. zenml/zen_server/dashboard/assets/{Commands-DGnWeAWF.js → Commands-DuIWKg_Q.js} +1 -1
  23. zenml/zen_server/dashboard/assets/{CopyButton-eUfW9370.js → CopyButton-B_YSm-Ds.js} +2 -2
  24. zenml/zen_server/dashboard/assets/{CsvVizualization-CoIkmrjr.js → CsvVizualization-BOuez-fG.js} +1 -1
  25. zenml/zen_server/dashboard/assets/{Error-CDMUBgpN.js → Error-B6M0dPph.js} +1 -1
  26. zenml/zen_server/dashboard/assets/{Helpbox-BV73V0J6.js → Helpbox-BQoqCm04.js} +1 -1
  27. zenml/zen_server/dashboard/assets/{Infobox-DG7zmtut.js → Infobox-Ce9mefqU.js} +1 -1
  28. zenml/zen_server/dashboard/assets/{InlineAvatar-BiKh3XC5.js → InlineAvatar-DGf3dVhV.js} +1 -1
  29. zenml/zen_server/dashboard/assets/{PageHeader-D-u0obgg.js → PageHeader-DGaemzjc.js} +1 -1
  30. zenml/zen_server/dashboard/assets/{Pagination-ZYqHJ5gE.js → Pagination-DVYfBCCc.js} +1 -1
  31. zenml/zen_server/dashboard/assets/{SetPassword-Dq6iUfpW.js → SetPassword-B5s7DJug.js} +1 -1
  32. zenml/zen_server/dashboard/assets/SuccessStep-ZzczaM7g.js +1 -0
  33. zenml/zen_server/dashboard/assets/{UpdatePasswordSchemas-4FyPPBY9.js → UpdatePasswordSchemas-DnM-c11H.js} +1 -1
  34. zenml/zen_server/dashboard/assets/adam-e-y0WnB_.webp +0 -0
  35. zenml/zen_server/dashboard/assets/alex-DcCuDHPg.webp +0 -0
  36. zenml/zen_server/dashboard/assets/baris-C0ZrZ10g.webp +0 -0
  37. zenml/zen_server/dashboard/assets/{cloud-only-DsILLhXk.js → cloud-only-Ba_ShBR5.js} +1 -1
  38. zenml/zen_server/dashboard/assets/hamza-NKKOZz1I.webp +0 -0
  39. zenml/zen_server/dashboard/assets/index-CWJ3xbIf.css +1 -0
  40. zenml/zen_server/dashboard/assets/{index-BhYPVFKa.js → index-QORVVTMN.js} +3 -3
  41. zenml/zen_server/dashboard/assets/{login-mutation-CDARn8rx.js → login-mutation-wzzl23C6.js} +1 -1
  42. zenml/zen_server/dashboard/assets/{not-found-D1_I0ubu.js → not-found-Dh2la7kh.js} +1 -1
  43. zenml/zen_server/dashboard/assets/{page-CaDkuI7b.js → page-AQKopn_4.js} +1 -1
  44. zenml/zen_server/dashboard/assets/{page-7IP7WH5_.js → page-B-5jAKoO.js} +1 -1
  45. zenml/zen_server/dashboard/assets/{page-ByiUk9rA.js → page-B-vWk8a6.js} +1 -1
  46. zenml/zen_server/dashboard/assets/{page-B5Y_HW80.js → page-B0BrqfS8.js} +1 -1
  47. zenml/zen_server/dashboard/assets/{page-Ca8Zw2SO.js → page-BQxVFlUl.js} +1 -1
  48. zenml/zen_server/dashboard/assets/{page-bpP11sGS.js → page-BW6Ket3a.js} +1 -1
  49. zenml/zen_server/dashboard/assets/{page-BQT1Zxsp.js → page-Bi5AI0S7.js} +1 -1
  50. zenml/zen_server/dashboard/assets/{page-CK0iF8U_.js → page-BmkSiYeQ.js} +1 -1
  51. zenml/zen_server/dashboard/assets/{page-BR7WTzLa.js → page-ByrHy6Ss.js} +1 -1
  52. zenml/zen_server/dashboard/assets/{page-C9lMl0g8.js → page-BzVZGExK.js} +1 -1
  53. zenml/zen_server/dashboard/assets/{page-Bwrw_wb_.js → page-CPtY4Kv_.js} +1 -1
  54. zenml/zen_server/dashboard/assets/{page-D_Vj_UH1.js → page-CmmukLsl.js} +1 -1
  55. zenml/zen_server/dashboard/assets/page-CuT1SUik.js +1 -0
  56. zenml/zen_server/dashboard/assets/{page-DPrgvGj6.js → page-D2D-7qyr.js} +3 -3
  57. zenml/zen_server/dashboard/assets/{page-DDY5j-6S.js → page-DAQQyLxT.js} +1 -1
  58. zenml/zen_server/dashboard/assets/{page-QPP3iIQH.js → page-DHkUMl_E.js} +1 -1
  59. zenml/zen_server/dashboard/assets/{page-DIz9_5Du.js → page-DZCbwOEs.js} +1 -1
  60. zenml/zen_server/dashboard/assets/page-DdaIt20-.js +1 -0
  61. zenml/zen_server/dashboard/assets/{page-zYQJvPVh.js → page-LqLs24Ot.js} +1 -1
  62. zenml/zen_server/dashboard/assets/{page-DB_mi8or.js → page-lebv0c7C.js} +1 -1
  63. zenml/zen_server/dashboard/assets/{page-CQTaUp7q.js → page-yN4rZ-ZS.js} +1 -1
  64. zenml/zen_server/dashboard/assets/stefan-B08Ftbba.webp +0 -0
  65. zenml/zen_server/dashboard/assets/{update-server-settings-mutation-SaWcyAnk.js → update-server-settings-mutation-0Wgz8pUE.js} +1 -1
  66. zenml/zen_server/dashboard/assets/{url-ZKNs861m.js → url-6_xv0WJS.js} +1 -1
  67. zenml/zen_server/dashboard/index.html +4 -4
  68. zenml/zen_server/dashboard_legacy/asset-manifest.json +4 -4
  69. zenml/zen_server/dashboard_legacy/index.html +1 -1
  70. zenml/zen_server/dashboard_legacy/{precache-manifest.8e59f98d08e9c4c7cb3ef9f0bab7093f.js → precache-manifest.f4abc5b7cfa7d90c1caf5521918e29a8.js} +4 -4
  71. zenml/zen_server/dashboard_legacy/service-worker.js +1 -1
  72. zenml/zen_server/dashboard_legacy/static/js/main.ac2f17d0.chunk.js +2 -0
  73. zenml/zen_server/dashboard_legacy/static/js/{main.a238a4d2.chunk.js.map → main.ac2f17d0.chunk.js.map} +1 -1
  74. zenml/zen_server/deploy/helm/Chart.yaml +1 -1
  75. zenml/zen_server/deploy/helm/README.md +2 -2
  76. zenml/zen_server/deploy/helm/templates/_environment.tpl +3 -0
  77. zenml/zen_server/deploy/helm/templates/server-secret.yaml +6 -0
  78. zenml/zen_server/deploy/helm/values.yaml +21 -0
  79. zenml/zen_server/routers/steps_endpoints.py +9 -15
  80. zenml/zen_server/zen_server_api.py +17 -8
  81. zenml/zen_stores/migrations/versions/0.58.2_release.py +23 -0
  82. {zenml_nightly-0.58.1.dev20240610.dist-info → zenml_nightly-0.58.2.dev20240612.dist-info}/METADATA +2 -2
  83. {zenml_nightly-0.58.1.dev20240610.dist-info → zenml_nightly-0.58.2.dev20240612.dist-info}/RECORD +86 -80
  84. zenml/zen_server/dashboard/assets/SuccessStep-BHhPYxz9.js +0 -1
  85. zenml/zen_server/dashboard/assets/index-CRZ5qzG3.css +0 -1
  86. zenml/zen_server/dashboard/assets/page-BmuIfr11.js +0 -1
  87. zenml/zen_server/dashboard/assets/page-xA0WcjLa.js +0 -1
  88. zenml/zen_server/dashboard_legacy/static/js/main.a238a4d2.chunk.js +0 -2
  89. {zenml_nightly-0.58.1.dev20240610.dist-info → zenml_nightly-0.58.2.dev20240612.dist-info}/LICENSE +0 -0
  90. {zenml_nightly-0.58.1.dev20240610.dist-info → zenml_nightly-0.58.2.dev20240612.dist-info}/WHEEL +0 -0
  91. {zenml_nightly-0.58.1.dev20240610.dist-info → zenml_nightly-0.58.2.dev20240612.dist-info}/entry_points.txt +0 -0
README.md CHANGED
@@ -289,7 +289,7 @@ the Apache License Version 2.0.
289
289
  <a href="https://github.com/zenml-io/zenml-projects">Projects Showcase</a>
290
290
  <br />
291
291
  <br />
292
- 🎉 Version 0.58.1 is out. Check out the release notes
292
+ 🎉 Version 0.58.2 is out. Check out the release notes
293
293
  <a href="https://github.com/zenml-io/zenml/releases">here</a>.
294
294
  <br />
295
295
  🖥️ Download our VS Code Extension <a href="https://marketplace.visualstudio.com/items?itemName=ZenML.zenml-vscode">here</a>.
RELEASE_NOTES.md CHANGED
@@ -1,5 +1,29 @@
1
1
  <!-- markdown-link-check-disable -->
2
2
 
3
+ # 0.58.2
4
+
5
+ The 0.58.2 minor release is packed with a set of improvements to ZenML logging and the ZenML Server.
6
+
7
+ With this release ZenML logging will:
8
+ - Offer pagination of the logs during fetching via REST API
9
+ - Store the full logs history on GCS Artifact Stores
10
+ - Be performant when running logging-heavy tasks, like TQDM logging or logging of training in any deep learning framework (also TQDM-backed)
11
+
12
+ ## What's Changed
13
+ * update test-migrations.sh with latest versions by @safoinme in https://github.com/zenml-io/zenml/pull/2757
14
+ * Fix overriding expiration date for api tokens by @schustmi in https://github.com/zenml-io/zenml/pull/2753
15
+ * Step logs pagination by @schustmi in https://github.com/zenml-io/zenml/pull/2731
16
+ * Fix broken links (round 2) by @strickvl in https://github.com/zenml-io/zenml/pull/2760
17
+ * Remove default system flag in docker UV by @avishniakov in https://github.com/zenml-io/zenml/pull/2764
18
+ * Another batch of small fixes and expansions by @AlexejPenner in https://github.com/zenml-io/zenml/pull/2762
19
+ * Server scalability improvements by @stefannica in https://github.com/zenml-io/zenml/pull/2752
20
+ * Add option to start parallel kubernetes steps with delay by @schustmi in https://github.com/zenml-io/zenml/pull/2758
21
+ * Move `thread_limiter` to app startup event by @avishniakov in https://github.com/zenml-io/zenml/pull/2765
22
+ * Logging performance improvements and GCP logging fix by @avishniakov in https://github.com/zenml-io/zenml/pull/2755
23
+
24
+
25
+ **Full Changelog**: https://github.com/zenml-io/zenml/compare/0.58.1...0.58.2
26
+
3
27
  # 0.58.1
4
28
 
5
29
  The 0.58.1 release brings a set of minor enhancements and bugfixes to the ZenML framework, such as the ability to delete all versions of a pipeline using the Client/CLI, providing greater flexibility and control over pipeline management. Users can now specify Python package installer arguments. Furthermore, a fix has been implemented for the Sentencepiece tokenizer materializer.
zenml/VERSION CHANGED
@@ -1 +1 @@
1
- 0.58.1.dev20240610
1
+ 0.58.2.dev20240612
@@ -171,6 +171,7 @@ class BaseArtifactStoreConfig(StackComponentConfig):
171
171
  path: str
172
172
 
173
173
  SUPPORTED_SCHEMES: ClassVar[Set[str]]
174
+ IS_IMMUTABLE_FILESYSTEM: ClassVar[bool] = False
174
175
 
175
176
  @root_validator(skip_on_failure=True)
176
177
  def _ensure_artifact_store(cls, values: Dict[str, Any]) -> Any:
@@ -39,6 +39,7 @@ from zenml.constants import (
39
39
  DEFAULT_ZENML_SERVER_SECURE_HEADERS_REFERRER,
40
40
  DEFAULT_ZENML_SERVER_SECURE_HEADERS_XFO,
41
41
  DEFAULT_ZENML_SERVER_SECURE_HEADERS_XXP,
42
+ DEFAULT_ZENML_SERVER_THREAD_POOL_SIZE,
42
43
  DEFAULT_ZENML_SERVER_USE_LEGACY_DASHBOARD,
43
44
  ENV_ZENML_SERVER_PREFIX,
44
45
  )
@@ -301,6 +302,8 @@ class ServerConfiguration(BaseModel):
301
302
  display_updates: bool = True
302
303
  auto_activate: bool = False
303
304
 
305
+ thread_pool_size: int = DEFAULT_ZENML_SERVER_THREAD_POOL_SIZE
306
+
304
307
  _deployment_id: Optional[UUID] = None
305
308
 
306
309
  @root_validator(pre=True)
zenml/constants.py CHANGED
@@ -230,6 +230,7 @@ STEP_SOURCE_PARAMETER_NAME = "step_source"
230
230
 
231
231
  # Server settings
232
232
  DEFAULT_ZENML_SERVER_NAME = "default"
233
+ DEFAULT_ZENML_SERVER_THREAD_POOL_SIZE = 40
233
234
  DEFAULT_ZENML_JWT_TOKEN_LEEWAY = 10
234
235
  DEFAULT_ZENML_JWT_TOKEN_ALGORITHM = "HS256"
235
236
  DEFAULT_ZENML_AUTH_SCHEME = AuthScheme.OAUTH2_PASSWORD_BEARER
@@ -17,7 +17,7 @@ The Airflow integration sub-module powers an alternative to the local
17
17
  orchestrator. You can enable it by registering the Airflow orchestrator with
18
18
  the CLI tool, then bootstrap using the ``zenml orchestrator up`` command.
19
19
  """
20
- from typing import List, Optional, Type
20
+ from typing import List, Type
21
21
 
22
22
  from zenml.integrations.constants import AIRFLOW
23
23
  from zenml.integrations.integration import Integration
@@ -35,9 +35,11 @@ from zenml.integrations.gcp.flavors.gcp_artifact_store_flavor import (
35
35
  GCPArtifactStoreConfig,
36
36
  )
37
37
  from zenml.io.fileio import convert_to_str
38
+ from zenml.logger import get_logger
38
39
  from zenml.secret.schemas import GCPSecretSchema
39
40
  from zenml.stack.authentication_mixin import AuthenticationMixin
40
41
 
42
+ logger = get_logger(__name__)
41
43
  PathType = Union[bytes, str]
42
44
 
43
45
 
@@ -109,6 +111,10 @@ class GCPArtifactStore(BaseArtifactStore, AuthenticationMixin):
109
111
  Returns:
110
112
  A file-like object that can be used to read or write to the file.
111
113
  """
114
+ if mode in ("a", "ab"):
115
+ logger.warning(
116
+ "GCS Filesystem is immutable, so append mode will overwrite existing files."
117
+ )
112
118
  return self.filesystem.open(path=path, mode=mode)
113
119
 
114
120
  def copyfile(
@@ -36,6 +36,7 @@ class GCPArtifactStoreConfig(
36
36
  """Configuration for GCP Artifact Store."""
37
37
 
38
38
  SUPPORTED_SCHEMES: ClassVar[Set[str]] = {GCP_PATH_PREFIX}
39
+ IS_IMMUTABLE_FILESYSTEM: ClassVar[bool] = True
39
40
 
40
41
 
41
42
  class GCPArtifactStoreFlavor(BaseArtifactStoreFlavor):
@@ -82,6 +82,9 @@ class KubernetesOrchestratorConfig( # type: ignore[misc] # https://github.com/p
82
82
  containers).
83
83
  skip_local_validations: If `True`, the local validations will be
84
84
  skipped.
85
+ parallel_step_startup_waiting_period: How long to wait in between
86
+ starting parallel steps. This can be used to distribute server
87
+ load when running pipelines with a huge amount of parallel steps.
85
88
  """
86
89
 
87
90
  incluster: bool = False
@@ -89,6 +92,7 @@ class KubernetesOrchestratorConfig( # type: ignore[misc] # https://github.com/p
89
92
  kubernetes_namespace: str = "zenml"
90
93
  local: bool = False
91
94
  skip_local_validations: bool = False
95
+ parallel_step_startup_waiting_period: Optional[float] = None
92
96
 
93
97
  @property
94
98
  def is_remote(self) -> bool:
@@ -142,7 +142,14 @@ def main() -> None:
142
142
  )
143
143
  logger.info(f"Pod of step `{step_name}` completed.")
144
144
 
145
- ThreadedDagRunner(dag=pipeline_dag, run_fn=run_step_on_kubernetes).run()
145
+ parallel_node_startup_waiting_period = (
146
+ orchestrator.config.parallel_step_startup_waiting_period or 0.0
147
+ )
148
+ ThreadedDagRunner(
149
+ dag=pipeline_dag,
150
+ run_fn=run_step_on_kubernetes,
151
+ parallel_node_startup_waiting_period=parallel_node_startup_waiting_period,
152
+ ).run()
146
153
 
147
154
  logger.info("Orchestration pod completed.")
148
155
 
zenml/logging/__init__.py CHANGED
@@ -12,8 +12,13 @@
12
12
  # or implied. See the License for the specific language governing
13
13
  # permissions and limitations under the License.
14
14
 
15
+ """Logging utilities."""
16
+
15
17
  # How many seconds to wait before uploading logs to the artifact store
16
18
  STEP_LOGS_STORAGE_INTERVAL_SECONDS: int = 15
17
19
 
18
20
  # How many messages to buffer before uploading logs to the artifact store
19
21
  STEP_LOGS_STORAGE_MAX_MESSAGES: int = 100
22
+
23
+ # How often to merge logs into a single file
24
+ STEP_LOGS_STORAGE_MERGE_INTERVAL_SECONDS: int = 10 * 60
@@ -19,22 +19,31 @@ import sys
19
19
  import time
20
20
  from contextvars import ContextVar
21
21
  from types import TracebackType
22
- from typing import Any, Callable, List, Optional, Type
23
- from uuid import uuid4
22
+ from typing import Any, Callable, List, Optional, Type, Union
23
+ from uuid import UUID, uuid4
24
24
 
25
25
  from zenml.artifact_stores import BaseArtifactStore
26
+ from zenml.artifacts.utils import (
27
+ _load_artifact_store,
28
+ _load_file_from_artifact_store,
29
+ )
26
30
  from zenml.client import Client
31
+ from zenml.exceptions import DoesNotExistException
27
32
  from zenml.logger import get_logger
28
33
  from zenml.logging import (
29
34
  STEP_LOGS_STORAGE_INTERVAL_SECONDS,
30
35
  STEP_LOGS_STORAGE_MAX_MESSAGES,
36
+ STEP_LOGS_STORAGE_MERGE_INTERVAL_SECONDS,
31
37
  )
38
+ from zenml.zen_stores.base_zen_store import BaseZenStore
32
39
 
33
40
  # Get the logger
34
41
  logger = get_logger(__name__)
35
42
 
36
43
  redirected: ContextVar[bool] = ContextVar("redirected", default=False)
37
44
 
45
+ LOGS_EXTENSION = ".log"
46
+
38
47
 
39
48
  def remove_ansi_escape_codes(text: str) -> str:
40
49
  """Auxiliary function to remove ANSI escape codes from a given string.
@@ -54,7 +63,7 @@ def prepare_logs_uri(
54
63
  step_name: str,
55
64
  log_key: Optional[str] = None,
56
65
  ) -> str:
57
- """Generates and prepares a URI for the log file for a step.
66
+ """Generates and prepares a URI for the log file or folder for a step.
58
67
 
59
68
  Args:
60
69
  artifact_store: The artifact store on which the artifact will be stored.
@@ -62,7 +71,7 @@ def prepare_logs_uri(
62
71
  log_key: The unique identification key of the log file.
63
72
 
64
73
  Returns:
65
- The URI of the logs file.
74
+ The URI of the log storage (file or folder).
66
75
  """
67
76
  if log_key is None:
68
77
  log_key = str(uuid4())
@@ -78,13 +87,119 @@ def prepare_logs_uri(
78
87
  artifact_store.makedirs(logs_base_uri)
79
88
 
80
89
  # Delete the file if it already exists
81
- logs_uri = os.path.join(logs_base_uri, f"{log_key}.log")
82
- if artifact_store.exists(logs_uri):
83
- logger.warning(
84
- f"Logs file {logs_uri} already exists! Removing old log file..."
90
+ if artifact_store.config.IS_IMMUTABLE_FILESYSTEM:
91
+ logs_uri_folder = os.path.join(logs_base_uri, log_key)
92
+ if artifact_store.exists(logs_uri_folder):
93
+ logger.warning(
94
+ f"Logs directory {logs_uri_folder} already exists! Removing old log directory..."
95
+ )
96
+ artifact_store.rmtree(logs_uri_folder)
97
+
98
+ artifact_store.makedirs(logs_uri_folder)
99
+ return logs_uri_folder
100
+ else:
101
+ logs_uri = os.path.join(logs_base_uri, f"{log_key}.log")
102
+ if artifact_store.exists(logs_uri):
103
+ logger.warning(
104
+ f"Logs file {logs_uri} already exists! Removing old log file..."
105
+ )
106
+ artifact_store.remove(logs_uri)
107
+ return logs_uri
108
+
109
+
110
+ def fetch_logs(
111
+ zen_store: "BaseZenStore",
112
+ artifact_store_id: Union[str, UUID],
113
+ logs_uri: str,
114
+ offset: int = 0,
115
+ length: int = 1024 * 1024 * 16, # Default to 16MiB of data
116
+ ) -> str:
117
+ """Fetches the logs from the artifact store.
118
+
119
+ Args:
120
+ zen_store: The store in which the artifact is stored.
121
+ artifact_store_id: The ID of the artifact store.
122
+ logs_uri: The URI of the artifact.
123
+ offset: The offset from which to start reading.
124
+ length: The amount of bytes that should be read.
125
+
126
+ Returns:
127
+ The logs as a string.
128
+
129
+ Raises:
130
+ DoesNotExistException: If the artifact does not exist in the artifact
131
+ store.
132
+ """
133
+
134
+ def _read_file(
135
+ uri: str, offset: int = 0, length: Optional[int] = None
136
+ ) -> str:
137
+ return str(
138
+ _load_file_from_artifact_store(
139
+ uri,
140
+ artifact_store=artifact_store,
141
+ mode="rb",
142
+ offset=offset,
143
+ length=length,
144
+ ).decode()
85
145
  )
86
- artifact_store.remove(logs_uri)
87
- return logs_uri
146
+
147
+ artifact_store = _load_artifact_store(artifact_store_id, zen_store)
148
+ if not artifact_store.isdir(logs_uri):
149
+ return _read_file(logs_uri, offset, length)
150
+ else:
151
+ files = artifact_store.listdir(logs_uri)
152
+ if len(files) == 1:
153
+ return _read_file(
154
+ os.path.join(logs_uri, str(files[0])), offset, length
155
+ )
156
+ else:
157
+ is_negative_offset = offset < 0
158
+ files.sort(reverse=is_negative_offset)
159
+
160
+ # search for the first file we need to read
161
+ latest_file_id = 0
162
+ for i, file in enumerate(files):
163
+ file_size: int = artifact_store.size(
164
+ os.path.join(logs_uri, str(file))
165
+ ) # type: ignore[assignment]
166
+
167
+ if is_negative_offset:
168
+ if file_size >= -offset:
169
+ latest_file_id = -(i + 1)
170
+ break
171
+ else:
172
+ offset += file_size
173
+ else:
174
+ if file_size > offset:
175
+ latest_file_id = i
176
+ break
177
+ else:
178
+ offset -= file_size
179
+
180
+ # read the files according to pre-filtering
181
+ files.sort()
182
+ ret = []
183
+ for file in files[latest_file_id:]:
184
+ ret.append(
185
+ _read_file(
186
+ os.path.join(logs_uri, str(file)),
187
+ offset,
188
+ length,
189
+ )
190
+ )
191
+ offset = 0
192
+ length -= len(ret[-1])
193
+ if length <= 0:
194
+ # stop further reading, if the whole length is already read
195
+ break
196
+
197
+ if not ret:
198
+ raise DoesNotExistException(
199
+ f"Folder '{logs_uri}' is empty in artifact store "
200
+ f"'{artifact_store.name}'."
201
+ )
202
+ return "".join(ret)
88
203
 
89
204
 
90
205
  class StepLogsStorage:
@@ -95,25 +210,46 @@ class StepLogsStorage:
95
210
  logs_uri: str,
96
211
  max_messages: int = STEP_LOGS_STORAGE_MAX_MESSAGES,
97
212
  time_interval: int = STEP_LOGS_STORAGE_INTERVAL_SECONDS,
213
+ merge_files_interval: int = STEP_LOGS_STORAGE_MERGE_INTERVAL_SECONDS,
98
214
  ) -> None:
99
215
  """Initialization.
100
216
 
101
217
  Args:
102
- logs_uri: the target URI to store the logs.
218
+ logs_uri: the URI of the log file or folder.
103
219
  max_messages: the maximum number of messages to save in the buffer.
104
220
  time_interval: the amount of seconds before the buffer gets saved
105
221
  automatically.
222
+ merge_files_interval: the amount of seconds before the created files
223
+ get merged into a single file.
106
224
  """
107
225
  # Parameters
108
226
  self.logs_uri = logs_uri
109
227
  self.max_messages = max_messages
110
228
  self.time_interval = time_interval
229
+ self.merge_files_interval = merge_files_interval
111
230
 
112
231
  # State
113
232
  self.buffer: List[str] = []
114
233
  self.disabled_buffer: List[str] = []
115
234
  self.last_save_time = time.time()
116
235
  self.disabled = False
236
+ self._artifact_store: Optional["BaseArtifactStore"] = None
237
+
238
+ # Immutable filesystems state
239
+ self.last_merge_time = time.time()
240
+ self.log_files_not_merged: List[str] = []
241
+ self.next_merged_file_name: str = f"{time.time()}{LOGS_EXTENSION}"
242
+
243
+ @property
244
+ def artifact_store(self) -> "BaseArtifactStore":
245
+ """Returns the active artifact store.
246
+
247
+ Returns:
248
+ The active artifact store.
249
+ """
250
+ if self._artifact_store is None:
251
+ self._artifact_store = Client().active_stack.artifact_store
252
+ return self._artifact_store
117
253
 
118
254
  def write(self, text: str) -> None:
119
255
  """Main write method.
@@ -126,29 +262,60 @@ class StepLogsStorage:
126
262
 
127
263
  if not self.disabled:
128
264
  self.buffer.append(text)
265
+ self.save_to_file()
129
266
 
130
- if (
131
- len(self.buffer) >= self.max_messages
132
- or time.time() - self.last_save_time >= self.time_interval
133
- ):
134
- self.save_to_file()
267
+ @property
268
+ def _is_write_needed(self) -> bool:
269
+ """Checks whether the buffer needs to be written to disk.
135
270
 
136
- def save_to_file(self) -> None:
137
- """Method to save the buffer to the given URI."""
138
- if not self.disabled:
271
+ Returns:
272
+ whether the buffer needs to be written to disk.
273
+ """
274
+ return (
275
+ len(self.buffer) >= self.max_messages
276
+ or time.time() - self.last_save_time >= self.time_interval
277
+ )
278
+
279
+ def save_to_file(self, force: bool = False) -> None:
280
+ """Method to save the buffer to the given URI.
281
+
282
+ Args:
283
+ force: whether to force a save even if the write conditions not met.
284
+ """
285
+ if not self.disabled and (self._is_write_needed or force):
139
286
  # IMPORTANT: keep this as the first code line in this method! The
140
287
  # code that follows might still emit logging messages, which will
141
288
  # end up triggering this method again, causing an infinite loop.
142
289
  self.disabled = True
143
290
 
144
- artifact_store = Client().active_stack.artifact_store
145
291
  try:
146
292
  if self.buffer:
147
- with artifact_store.open(self.logs_uri, "a") as file:
148
- for message in self.buffer:
149
- file.write(
150
- remove_ansi_escape_codes(message) + "\n"
293
+ if self.artifact_store.config.IS_IMMUTABLE_FILESYSTEM:
294
+ if not self.log_files_not_merged:
295
+ self.next_merged_file_name = (
296
+ f"{time.time()}{LOGS_EXTENSION}"
151
297
  )
298
+ log_file_ = f"{time.time()}{LOGS_EXTENSION}"
299
+ self.log_files_not_merged.append(log_file_)
300
+ with self.artifact_store.open(
301
+ os.path.join(
302
+ self.logs_uri,
303
+ log_file_,
304
+ ),
305
+ "w",
306
+ ) as file:
307
+ for message in self.buffer:
308
+ file.write(
309
+ remove_ansi_escape_codes(message) + "\n"
310
+ )
311
+ else:
312
+ with self.artifact_store.open(
313
+ self.logs_uri, "a"
314
+ ) as file:
315
+ for message in self.buffer:
316
+ file.write(
317
+ remove_ansi_escape_codes(message) + "\n"
318
+ )
152
319
 
153
320
  except (OSError, IOError) as e:
154
321
  # This exception can be raised if there are issues with the
@@ -161,6 +328,66 @@ class StepLogsStorage:
161
328
  self.last_save_time = time.time()
162
329
 
163
330
  self.disabled = False
331
+ # merge created files on a given interval (defaults to 10 minutes)
332
+ # only runs on Immutable Filesystems
333
+ if (
334
+ self.artifact_store.config.IS_IMMUTABLE_FILESYSTEM
335
+ and time.time() - self.last_merge_time > self.merge_files_interval
336
+ ):
337
+ try:
338
+ self.merge_log_files(
339
+ self.next_merged_file_name, self.log_files_not_merged
340
+ )
341
+ except (OSError, IOError) as e:
342
+ logger.error(f"Error while trying to roll up logs: {e}")
343
+ else:
344
+ self.log_files_not_merged = []
345
+ finally:
346
+ self.last_merge_time = time.time()
347
+
348
+ def merge_log_files(
349
+ self,
350
+ file_name: Optional[str] = None,
351
+ files: Optional[List[str]] = None,
352
+ ) -> None:
353
+ """Merges all log files into one in the given URI.
354
+
355
+ Called on the logging context exit.
356
+
357
+ Args:
358
+ file_name: The name of the merged log file.
359
+ files: The list of log files to merge.
360
+ """
361
+ if self.artifact_store.config.IS_IMMUTABLE_FILESYSTEM:
362
+ files_ = files or self.artifact_store.listdir(self.logs_uri)
363
+ file_name_ = file_name or f"full_log{LOGS_EXTENSION}"
364
+ if len(files_) > 1:
365
+ files_.sort()
366
+ logger.debug("Log files count: %s", len(files_))
367
+
368
+ try:
369
+ # dump all logs to a local file first
370
+ with self.artifact_store.open(
371
+ os.path.join(self.logs_uri, file_name_), "w"
372
+ ) as merged_file:
373
+ for file in files_:
374
+ merged_file.write(
375
+ str(
376
+ _load_file_from_artifact_store(
377
+ os.path.join(self.logs_uri, str(file)),
378
+ artifact_store=self.artifact_store,
379
+ mode="r",
380
+ )
381
+ )
382
+ )
383
+ except Exception as e:
384
+ logger.warning(f"Failed to merge log files. {e}")
385
+ else:
386
+ # clean up left over files
387
+ for file in files_:
388
+ self.artifact_store.remove(
389
+ os.path.join(self.logs_uri, str(file))
390
+ )
164
391
 
165
392
 
166
393
  class StepLogsStorageContext:
@@ -213,7 +440,8 @@ class StepLogsStorageContext:
213
440
 
214
441
  Restores the `write` method of both stderr and stdout.
215
442
  """
216
- self.storage.save_to_file()
443
+ self.storage.save_to_file(force=True)
444
+ self.storage.merge_log_files()
217
445
 
218
446
  setattr(sys.stdout, "write", self.stdout_write)
219
447
  setattr(sys.stdout, "flush", self.stdout_flush)
@@ -14,6 +14,7 @@
14
14
  """DAG (Directed Acyclic Graph) Runners."""
15
15
 
16
16
  import threading
17
+ import time
17
18
  from collections import defaultdict
18
19
  from enum import Enum
19
20
  from typing import Any, Callable, Dict, List
@@ -66,7 +67,10 @@ class ThreadedDagRunner:
66
67
  """
67
68
 
68
69
  def __init__(
69
- self, dag: Dict[str, List[str]], run_fn: Callable[[str], Any]
70
+ self,
71
+ dag: Dict[str, List[str]],
72
+ run_fn: Callable[[str], Any],
73
+ parallel_node_startup_waiting_period: float = 0.0,
70
74
  ) -> None:
71
75
  """Define attributes and initialize all nodes in waiting state.
72
76
 
@@ -75,7 +79,12 @@ class ThreadedDagRunner:
75
79
  E.g.: [(1->2), (1->3), (2->4), (3->4)] should be represented as
76
80
  `dag={2: [1], 3: [1], 4: [2, 3]}`
77
81
  run_fn: A function `run_fn(node)` that runs a single node
82
+ parallel_node_startup_waiting_period: Delay in seconds to wait in
83
+ between starting parallel nodes.
78
84
  """
85
+ self.parallel_node_startup_waiting_period = (
86
+ parallel_node_startup_waiting_period
87
+ )
79
88
  self.dag = dag
80
89
  self.reversed_dag = reverse_dag(dag)
81
90
  self.run_fn = run_fn
@@ -154,9 +163,12 @@ class ThreadedDagRunner:
154
163
  self.node_states[node] = NodeStatus.COMPLETED
155
164
 
156
165
  # Run downstream nodes.
157
- threads = []
166
+ threads: List[threading.Thread] = []
158
167
  for downstram_node in self.reversed_dag[node]:
159
168
  if self._can_run(downstram_node):
169
+ if threads and self.parallel_node_startup_waiting_period > 0:
170
+ time.sleep(self.parallel_node_startup_waiting_period)
171
+
160
172
  thread = self._run_node_in_thread(downstram_node)
161
173
  threads.append(thread)
162
174
 
@@ -173,9 +185,12 @@ class ThreadedDagRunner:
173
185
  # Run all nodes that can be started immediately.
174
186
  # These will, in turn, start other nodes once all of their respective
175
187
  # upstream nodes have completed.
176
- threads = []
188
+ threads: List[threading.Thread] = []
177
189
  for node in self.nodes:
178
190
  if self._can_run(node):
191
+ if threads and self.parallel_node_startup_waiting_period > 0:
192
+ time.sleep(self.parallel_node_startup_waiting_period)
193
+
179
194
  thread = self._run_node_in_thread(node)
180
195
  threads.append(thread)
181
196
 
@@ -70,10 +70,7 @@ PIP_DEFAULT_ARGS = {
70
70
  "no-cache-dir": None,
71
71
  "default-timeout": 60,
72
72
  }
73
- UV_DEFAULT_ARGS = {
74
- "no-cache-dir": None,
75
- "system": None,
76
- }
73
+ UV_DEFAULT_ARGS = {"no-cache-dir": None}
77
74
 
78
75
 
79
76
  class PipelineDockerImageBuilder:
@@ -8,6 +8,7 @@ from pydantic import BaseModel, validator
8
8
  from requests.adapters import HTTPAdapter, Retry
9
9
 
10
10
  from zenml.exceptions import SubscriptionUpgradeRequiredError
11
+ from zenml.zen_server.utils import server_config
11
12
 
12
13
  ZENML_CLOUD_RBAC_ENV_PREFIX = "ZENML_CLOUD_"
13
14
 
@@ -99,7 +100,7 @@ class ZenMLCloudSession:
99
100
  raise SubscriptionUpgradeRequiredError(response.json())
100
101
  else:
101
102
  raise RuntimeError(
102
- f"Failed with the following error {response.json()}"
103
+ f"Failed with the following error {response} {response.text}"
103
104
  )
104
105
 
105
106
  return response
@@ -154,12 +155,29 @@ class ZenMLCloudSession:
154
155
  A requests session with the authentication token.
155
156
  """
156
157
  if self._session is None:
158
+ # Set up the session's connection pool size to match the server's
159
+ # thread pool size. This allows the server to cache one connection
160
+ # per thread, which means we can keep connections open for longer
161
+ # and avoid the overhead of setting up a new connection for each
162
+ # request.
163
+ conn_pool_size = server_config().thread_pool_size
164
+
157
165
  self._session = requests.Session()
158
166
  token = self._fetch_auth_token()
159
167
  self._session.headers.update({"Authorization": "Bearer " + token})
160
168
 
161
169
  retries = Retry(total=5, backoff_factor=0.1)
162
- self._session.mount("https://", HTTPAdapter(max_retries=retries))
170
+ self._session.mount(
171
+ "https://",
172
+ HTTPAdapter(
173
+ max_retries=retries,
174
+ # We only use one connection pool to be cached because we
175
+ # only communicate with one remote server (the control
176
+ # plane)
177
+ pool_connections=1,
178
+ pool_maxsize=conn_pool_size,
179
+ ),
180
+ )
163
181
 
164
182
  return self._session
165
183