zenml-nightly 0.58.1.dev20240610__py3-none-any.whl → 0.58.2.dev20240612__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (91) hide show
  1. README.md +1 -1
  2. RELEASE_NOTES.md +24 -0
  3. zenml/VERSION +1 -1
  4. zenml/artifact_stores/base_artifact_store.py +1 -0
  5. zenml/config/server_config.py +3 -0
  6. zenml/constants.py +1 -0
  7. zenml/integrations/airflow/__init__.py +1 -1
  8. zenml/integrations/gcp/artifact_stores/gcp_artifact_store.py +6 -0
  9. zenml/integrations/gcp/flavors/gcp_artifact_store_flavor.py +1 -0
  10. zenml/integrations/kubernetes/flavors/kubernetes_orchestrator_flavor.py +4 -0
  11. zenml/integrations/kubernetes/orchestrators/kubernetes_orchestrator_entrypoint.py +8 -1
  12. zenml/logging/__init__.py +5 -0
  13. zenml/logging/step_logging.py +253 -25
  14. zenml/orchestrators/dag_runner.py +18 -3
  15. zenml/utils/pipeline_docker_image_builder.py +1 -4
  16. zenml/zen_server/cloud_utils.py +20 -2
  17. zenml/zen_server/dashboard/assets/{404-D5p6PIdn.js → 404-CDPQCl4D.js} +1 -1
  18. zenml/zen_server/dashboard/assets/{@reactflow-CegZ5GV3.js → @reactflow-CHBapDaj.js} +1 -1
  19. zenml/zen_server/dashboard/assets/{AwarenessChannel-DDpU6zHx.js → AwarenessChannel-nXGpmj_f.js} +1 -1
  20. zenml/zen_server/dashboard/assets/{Cards-wfOUm_Ae.js → Cards-nwsvQLVS.js} +1 -1
  21. zenml/zen_server/dashboard/assets/{CodeSnippet-iinvcx17.js → CodeSnippet-BidtnWOi.js} +2 -2
  22. zenml/zen_server/dashboard/assets/{Commands-DGnWeAWF.js → Commands-DuIWKg_Q.js} +1 -1
  23. zenml/zen_server/dashboard/assets/{CopyButton-eUfW9370.js → CopyButton-B_YSm-Ds.js} +2 -2
  24. zenml/zen_server/dashboard/assets/{CsvVizualization-CoIkmrjr.js → CsvVizualization-BOuez-fG.js} +1 -1
  25. zenml/zen_server/dashboard/assets/{Error-CDMUBgpN.js → Error-B6M0dPph.js} +1 -1
  26. zenml/zen_server/dashboard/assets/{Helpbox-BV73V0J6.js → Helpbox-BQoqCm04.js} +1 -1
  27. zenml/zen_server/dashboard/assets/{Infobox-DG7zmtut.js → Infobox-Ce9mefqU.js} +1 -1
  28. zenml/zen_server/dashboard/assets/{InlineAvatar-BiKh3XC5.js → InlineAvatar-DGf3dVhV.js} +1 -1
  29. zenml/zen_server/dashboard/assets/{PageHeader-D-u0obgg.js → PageHeader-DGaemzjc.js} +1 -1
  30. zenml/zen_server/dashboard/assets/{Pagination-ZYqHJ5gE.js → Pagination-DVYfBCCc.js} +1 -1
  31. zenml/zen_server/dashboard/assets/{SetPassword-Dq6iUfpW.js → SetPassword-B5s7DJug.js} +1 -1
  32. zenml/zen_server/dashboard/assets/SuccessStep-ZzczaM7g.js +1 -0
  33. zenml/zen_server/dashboard/assets/{UpdatePasswordSchemas-4FyPPBY9.js → UpdatePasswordSchemas-DnM-c11H.js} +1 -1
  34. zenml/zen_server/dashboard/assets/adam-e-y0WnB_.webp +0 -0
  35. zenml/zen_server/dashboard/assets/alex-DcCuDHPg.webp +0 -0
  36. zenml/zen_server/dashboard/assets/baris-C0ZrZ10g.webp +0 -0
  37. zenml/zen_server/dashboard/assets/{cloud-only-DsILLhXk.js → cloud-only-Ba_ShBR5.js} +1 -1
  38. zenml/zen_server/dashboard/assets/hamza-NKKOZz1I.webp +0 -0
  39. zenml/zen_server/dashboard/assets/index-CWJ3xbIf.css +1 -0
  40. zenml/zen_server/dashboard/assets/{index-BhYPVFKa.js → index-QORVVTMN.js} +3 -3
  41. zenml/zen_server/dashboard/assets/{login-mutation-CDARn8rx.js → login-mutation-wzzl23C6.js} +1 -1
  42. zenml/zen_server/dashboard/assets/{not-found-D1_I0ubu.js → not-found-Dh2la7kh.js} +1 -1
  43. zenml/zen_server/dashboard/assets/{page-CaDkuI7b.js → page-AQKopn_4.js} +1 -1
  44. zenml/zen_server/dashboard/assets/{page-7IP7WH5_.js → page-B-5jAKoO.js} +1 -1
  45. zenml/zen_server/dashboard/assets/{page-ByiUk9rA.js → page-B-vWk8a6.js} +1 -1
  46. zenml/zen_server/dashboard/assets/{page-B5Y_HW80.js → page-B0BrqfS8.js} +1 -1
  47. zenml/zen_server/dashboard/assets/{page-Ca8Zw2SO.js → page-BQxVFlUl.js} +1 -1
  48. zenml/zen_server/dashboard/assets/{page-bpP11sGS.js → page-BW6Ket3a.js} +1 -1
  49. zenml/zen_server/dashboard/assets/{page-BQT1Zxsp.js → page-Bi5AI0S7.js} +1 -1
  50. zenml/zen_server/dashboard/assets/{page-CK0iF8U_.js → page-BmkSiYeQ.js} +1 -1
  51. zenml/zen_server/dashboard/assets/{page-BR7WTzLa.js → page-ByrHy6Ss.js} +1 -1
  52. zenml/zen_server/dashboard/assets/{page-C9lMl0g8.js → page-BzVZGExK.js} +1 -1
  53. zenml/zen_server/dashboard/assets/{page-Bwrw_wb_.js → page-CPtY4Kv_.js} +1 -1
  54. zenml/zen_server/dashboard/assets/{page-D_Vj_UH1.js → page-CmmukLsl.js} +1 -1
  55. zenml/zen_server/dashboard/assets/page-CuT1SUik.js +1 -0
  56. zenml/zen_server/dashboard/assets/{page-DPrgvGj6.js → page-D2D-7qyr.js} +3 -3
  57. zenml/zen_server/dashboard/assets/{page-DDY5j-6S.js → page-DAQQyLxT.js} +1 -1
  58. zenml/zen_server/dashboard/assets/{page-QPP3iIQH.js → page-DHkUMl_E.js} +1 -1
  59. zenml/zen_server/dashboard/assets/{page-DIz9_5Du.js → page-DZCbwOEs.js} +1 -1
  60. zenml/zen_server/dashboard/assets/page-DdaIt20-.js +1 -0
  61. zenml/zen_server/dashboard/assets/{page-zYQJvPVh.js → page-LqLs24Ot.js} +1 -1
  62. zenml/zen_server/dashboard/assets/{page-DB_mi8or.js → page-lebv0c7C.js} +1 -1
  63. zenml/zen_server/dashboard/assets/{page-CQTaUp7q.js → page-yN4rZ-ZS.js} +1 -1
  64. zenml/zen_server/dashboard/assets/stefan-B08Ftbba.webp +0 -0
  65. zenml/zen_server/dashboard/assets/{update-server-settings-mutation-SaWcyAnk.js → update-server-settings-mutation-0Wgz8pUE.js} +1 -1
  66. zenml/zen_server/dashboard/assets/{url-ZKNs861m.js → url-6_xv0WJS.js} +1 -1
  67. zenml/zen_server/dashboard/index.html +4 -4
  68. zenml/zen_server/dashboard_legacy/asset-manifest.json +4 -4
  69. zenml/zen_server/dashboard_legacy/index.html +1 -1
  70. zenml/zen_server/dashboard_legacy/{precache-manifest.8e59f98d08e9c4c7cb3ef9f0bab7093f.js → precache-manifest.f4abc5b7cfa7d90c1caf5521918e29a8.js} +4 -4
  71. zenml/zen_server/dashboard_legacy/service-worker.js +1 -1
  72. zenml/zen_server/dashboard_legacy/static/js/main.ac2f17d0.chunk.js +2 -0
  73. zenml/zen_server/dashboard_legacy/static/js/{main.a238a4d2.chunk.js.map → main.ac2f17d0.chunk.js.map} +1 -1
  74. zenml/zen_server/deploy/helm/Chart.yaml +1 -1
  75. zenml/zen_server/deploy/helm/README.md +2 -2
  76. zenml/zen_server/deploy/helm/templates/_environment.tpl +3 -0
  77. zenml/zen_server/deploy/helm/templates/server-secret.yaml +6 -0
  78. zenml/zen_server/deploy/helm/values.yaml +21 -0
  79. zenml/zen_server/routers/steps_endpoints.py +9 -15
  80. zenml/zen_server/zen_server_api.py +17 -8
  81. zenml/zen_stores/migrations/versions/0.58.2_release.py +23 -0
  82. {zenml_nightly-0.58.1.dev20240610.dist-info → zenml_nightly-0.58.2.dev20240612.dist-info}/METADATA +2 -2
  83. {zenml_nightly-0.58.1.dev20240610.dist-info → zenml_nightly-0.58.2.dev20240612.dist-info}/RECORD +86 -80
  84. zenml/zen_server/dashboard/assets/SuccessStep-BHhPYxz9.js +0 -1
  85. zenml/zen_server/dashboard/assets/index-CRZ5qzG3.css +0 -1
  86. zenml/zen_server/dashboard/assets/page-BmuIfr11.js +0 -1
  87. zenml/zen_server/dashboard/assets/page-xA0WcjLa.js +0 -1
  88. zenml/zen_server/dashboard_legacy/static/js/main.a238a4d2.chunk.js +0 -2
  89. {zenml_nightly-0.58.1.dev20240610.dist-info → zenml_nightly-0.58.2.dev20240612.dist-info}/LICENSE +0 -0
  90. {zenml_nightly-0.58.1.dev20240610.dist-info → zenml_nightly-0.58.2.dev20240612.dist-info}/WHEEL +0 -0
  91. {zenml_nightly-0.58.1.dev20240610.dist-info → zenml_nightly-0.58.2.dev20240612.dist-info}/entry_points.txt +0 -0
README.md CHANGED
@@ -289,7 +289,7 @@ the Apache License Version 2.0.
289
289
  <a href="https://github.com/zenml-io/zenml-projects">Projects Showcase</a>
290
290
  <br />
291
291
  <br />
292
- 🎉 Version 0.58.1 is out. Check out the release notes
292
+ 🎉 Version 0.58.2 is out. Check out the release notes
293
293
  <a href="https://github.com/zenml-io/zenml/releases">here</a>.
294
294
  <br />
295
295
  🖥️ Download our VS Code Extension <a href="https://marketplace.visualstudio.com/items?itemName=ZenML.zenml-vscode">here</a>.
RELEASE_NOTES.md CHANGED
@@ -1,5 +1,29 @@
1
1
  <!-- markdown-link-check-disable -->
2
2
 
3
+ # 0.58.2
4
+
5
+ The 0.58.2 minor release is packed with a set of improvements to ZenML logging and the ZenML Server.
6
+
7
+ With this release ZenML logging will:
8
+ - Offer pagination of the logs during fetching via REST API
9
+ - Store the full logs history on GCS Artifact Stores
10
+ - Be performant when running logging-heavy tasks, like TQDM logging or logging of training in any deep learning framework (also TQDM-backed)
11
+
12
+ ## What's Changed
13
+ * update test-migrations.sh with latest versions by @safoinme in https://github.com/zenml-io/zenml/pull/2757
14
+ * Fix overriding expiration date for api tokens by @schustmi in https://github.com/zenml-io/zenml/pull/2753
15
+ * Step logs pagination by @schustmi in https://github.com/zenml-io/zenml/pull/2731
16
+ * Fix broken links (round 2) by @strickvl in https://github.com/zenml-io/zenml/pull/2760
17
+ * Remove default system flag in docker UV by @avishniakov in https://github.com/zenml-io/zenml/pull/2764
18
+ * Another batch of small fixes and expansions by @AlexejPenner in https://github.com/zenml-io/zenml/pull/2762
19
+ * Server scalability improvements by @stefannica in https://github.com/zenml-io/zenml/pull/2752
20
+ * Add option to start parallel kubernetes steps with delay by @schustmi in https://github.com/zenml-io/zenml/pull/2758
21
+ * Move `thread_limiter` to app startup event by @avishniakov in https://github.com/zenml-io/zenml/pull/2765
22
+ * Logging performance improvements and GCP logging fix by @avishniakov in https://github.com/zenml-io/zenml/pull/2755
23
+
24
+
25
+ **Full Changelog**: https://github.com/zenml-io/zenml/compare/0.58.1...0.58.2
26
+
3
27
  # 0.58.1
4
28
 
5
29
  The 0.58.1 release brings a set of minor enhancements and bugfixes to the ZenML framework, such as the ability to delete all versions of a pipeline using the Client/CLI, providing greater flexibility and control over pipeline management. Users can now specify Python package installer arguments. Furthermore, a fix has been implemented for the Sentencepiece tokenizer materializer.
zenml/VERSION CHANGED
@@ -1 +1 @@
1
- 0.58.1.dev20240610
1
+ 0.58.2.dev20240612
@@ -171,6 +171,7 @@ class BaseArtifactStoreConfig(StackComponentConfig):
171
171
  path: str
172
172
 
173
173
  SUPPORTED_SCHEMES: ClassVar[Set[str]]
174
+ IS_IMMUTABLE_FILESYSTEM: ClassVar[bool] = False
174
175
 
175
176
  @root_validator(skip_on_failure=True)
176
177
  def _ensure_artifact_store(cls, values: Dict[str, Any]) -> Any:
@@ -39,6 +39,7 @@ from zenml.constants import (
39
39
  DEFAULT_ZENML_SERVER_SECURE_HEADERS_REFERRER,
40
40
  DEFAULT_ZENML_SERVER_SECURE_HEADERS_XFO,
41
41
  DEFAULT_ZENML_SERVER_SECURE_HEADERS_XXP,
42
+ DEFAULT_ZENML_SERVER_THREAD_POOL_SIZE,
42
43
  DEFAULT_ZENML_SERVER_USE_LEGACY_DASHBOARD,
43
44
  ENV_ZENML_SERVER_PREFIX,
44
45
  )
@@ -301,6 +302,8 @@ class ServerConfiguration(BaseModel):
301
302
  display_updates: bool = True
302
303
  auto_activate: bool = False
303
304
 
305
+ thread_pool_size: int = DEFAULT_ZENML_SERVER_THREAD_POOL_SIZE
306
+
304
307
  _deployment_id: Optional[UUID] = None
305
308
 
306
309
  @root_validator(pre=True)
zenml/constants.py CHANGED
@@ -230,6 +230,7 @@ STEP_SOURCE_PARAMETER_NAME = "step_source"
230
230
 
231
231
  # Server settings
232
232
  DEFAULT_ZENML_SERVER_NAME = "default"
233
+ DEFAULT_ZENML_SERVER_THREAD_POOL_SIZE = 40
233
234
  DEFAULT_ZENML_JWT_TOKEN_LEEWAY = 10
234
235
  DEFAULT_ZENML_JWT_TOKEN_ALGORITHM = "HS256"
235
236
  DEFAULT_ZENML_AUTH_SCHEME = AuthScheme.OAUTH2_PASSWORD_BEARER
@@ -17,7 +17,7 @@ The Airflow integration sub-module powers an alternative to the local
17
17
  orchestrator. You can enable it by registering the Airflow orchestrator with
18
18
  the CLI tool, then bootstrap using the ``zenml orchestrator up`` command.
19
19
  """
20
- from typing import List, Optional, Type
20
+ from typing import List, Type
21
21
 
22
22
  from zenml.integrations.constants import AIRFLOW
23
23
  from zenml.integrations.integration import Integration
@@ -35,9 +35,11 @@ from zenml.integrations.gcp.flavors.gcp_artifact_store_flavor import (
35
35
  GCPArtifactStoreConfig,
36
36
  )
37
37
  from zenml.io.fileio import convert_to_str
38
+ from zenml.logger import get_logger
38
39
  from zenml.secret.schemas import GCPSecretSchema
39
40
  from zenml.stack.authentication_mixin import AuthenticationMixin
40
41
 
42
+ logger = get_logger(__name__)
41
43
  PathType = Union[bytes, str]
42
44
 
43
45
 
@@ -109,6 +111,10 @@ class GCPArtifactStore(BaseArtifactStore, AuthenticationMixin):
109
111
  Returns:
110
112
  A file-like object that can be used to read or write to the file.
111
113
  """
114
+ if mode in ("a", "ab"):
115
+ logger.warning(
116
+ "GCS Filesystem is immutable, so append mode will overwrite existing files."
117
+ )
112
118
  return self.filesystem.open(path=path, mode=mode)
113
119
 
114
120
  def copyfile(
@@ -36,6 +36,7 @@ class GCPArtifactStoreConfig(
36
36
  """Configuration for GCP Artifact Store."""
37
37
 
38
38
  SUPPORTED_SCHEMES: ClassVar[Set[str]] = {GCP_PATH_PREFIX}
39
+ IS_IMMUTABLE_FILESYSTEM: ClassVar[bool] = True
39
40
 
40
41
 
41
42
  class GCPArtifactStoreFlavor(BaseArtifactStoreFlavor):
@@ -82,6 +82,9 @@ class KubernetesOrchestratorConfig( # type: ignore[misc] # https://github.com/p
82
82
  containers).
83
83
  skip_local_validations: If `True`, the local validations will be
84
84
  skipped.
85
+ parallel_step_startup_waiting_period: How long to wait in between
86
+ starting parallel steps. This can be used to distribute server
87
+ load when running pipelines with a huge amount of parallel steps.
85
88
  """
86
89
 
87
90
  incluster: bool = False
@@ -89,6 +92,7 @@ class KubernetesOrchestratorConfig( # type: ignore[misc] # https://github.com/p
89
92
  kubernetes_namespace: str = "zenml"
90
93
  local: bool = False
91
94
  skip_local_validations: bool = False
95
+ parallel_step_startup_waiting_period: Optional[float] = None
92
96
 
93
97
  @property
94
98
  def is_remote(self) -> bool:
@@ -142,7 +142,14 @@ def main() -> None:
142
142
  )
143
143
  logger.info(f"Pod of step `{step_name}` completed.")
144
144
 
145
- ThreadedDagRunner(dag=pipeline_dag, run_fn=run_step_on_kubernetes).run()
145
+ parallel_node_startup_waiting_period = (
146
+ orchestrator.config.parallel_step_startup_waiting_period or 0.0
147
+ )
148
+ ThreadedDagRunner(
149
+ dag=pipeline_dag,
150
+ run_fn=run_step_on_kubernetes,
151
+ parallel_node_startup_waiting_period=parallel_node_startup_waiting_period,
152
+ ).run()
146
153
 
147
154
  logger.info("Orchestration pod completed.")
148
155
 
zenml/logging/__init__.py CHANGED
@@ -12,8 +12,13 @@
12
12
  # or implied. See the License for the specific language governing
13
13
  # permissions and limitations under the License.
14
14
 
15
+ """Logging utilities."""
16
+
15
17
  # How many seconds to wait before uploading logs to the artifact store
16
18
  STEP_LOGS_STORAGE_INTERVAL_SECONDS: int = 15
17
19
 
18
20
  # How many messages to buffer before uploading logs to the artifact store
19
21
  STEP_LOGS_STORAGE_MAX_MESSAGES: int = 100
22
+
23
+ # How often to merge logs into a single file
24
+ STEP_LOGS_STORAGE_MERGE_INTERVAL_SECONDS: int = 10 * 60
@@ -19,22 +19,31 @@ import sys
19
19
  import time
20
20
  from contextvars import ContextVar
21
21
  from types import TracebackType
22
- from typing import Any, Callable, List, Optional, Type
23
- from uuid import uuid4
22
+ from typing import Any, Callable, List, Optional, Type, Union
23
+ from uuid import UUID, uuid4
24
24
 
25
25
  from zenml.artifact_stores import BaseArtifactStore
26
+ from zenml.artifacts.utils import (
27
+ _load_artifact_store,
28
+ _load_file_from_artifact_store,
29
+ )
26
30
  from zenml.client import Client
31
+ from zenml.exceptions import DoesNotExistException
27
32
  from zenml.logger import get_logger
28
33
  from zenml.logging import (
29
34
  STEP_LOGS_STORAGE_INTERVAL_SECONDS,
30
35
  STEP_LOGS_STORAGE_MAX_MESSAGES,
36
+ STEP_LOGS_STORAGE_MERGE_INTERVAL_SECONDS,
31
37
  )
38
+ from zenml.zen_stores.base_zen_store import BaseZenStore
32
39
 
33
40
  # Get the logger
34
41
  logger = get_logger(__name__)
35
42
 
36
43
  redirected: ContextVar[bool] = ContextVar("redirected", default=False)
37
44
 
45
+ LOGS_EXTENSION = ".log"
46
+
38
47
 
39
48
  def remove_ansi_escape_codes(text: str) -> str:
40
49
  """Auxiliary function to remove ANSI escape codes from a given string.
@@ -54,7 +63,7 @@ def prepare_logs_uri(
54
63
  step_name: str,
55
64
  log_key: Optional[str] = None,
56
65
  ) -> str:
57
- """Generates and prepares a URI for the log file for a step.
66
+ """Generates and prepares a URI for the log file or folder for a step.
58
67
 
59
68
  Args:
60
69
  artifact_store: The artifact store on which the artifact will be stored.
@@ -62,7 +71,7 @@ def prepare_logs_uri(
62
71
  log_key: The unique identification key of the log file.
63
72
 
64
73
  Returns:
65
- The URI of the logs file.
74
+ The URI of the log storage (file or folder).
66
75
  """
67
76
  if log_key is None:
68
77
  log_key = str(uuid4())
@@ -78,13 +87,119 @@ def prepare_logs_uri(
78
87
  artifact_store.makedirs(logs_base_uri)
79
88
 
80
89
  # Delete the file if it already exists
81
- logs_uri = os.path.join(logs_base_uri, f"{log_key}.log")
82
- if artifact_store.exists(logs_uri):
83
- logger.warning(
84
- f"Logs file {logs_uri} already exists! Removing old log file..."
90
+ if artifact_store.config.IS_IMMUTABLE_FILESYSTEM:
91
+ logs_uri_folder = os.path.join(logs_base_uri, log_key)
92
+ if artifact_store.exists(logs_uri_folder):
93
+ logger.warning(
94
+ f"Logs directory {logs_uri_folder} already exists! Removing old log directory..."
95
+ )
96
+ artifact_store.rmtree(logs_uri_folder)
97
+
98
+ artifact_store.makedirs(logs_uri_folder)
99
+ return logs_uri_folder
100
+ else:
101
+ logs_uri = os.path.join(logs_base_uri, f"{log_key}.log")
102
+ if artifact_store.exists(logs_uri):
103
+ logger.warning(
104
+ f"Logs file {logs_uri} already exists! Removing old log file..."
105
+ )
106
+ artifact_store.remove(logs_uri)
107
+ return logs_uri
108
+
109
+
110
+ def fetch_logs(
111
+ zen_store: "BaseZenStore",
112
+ artifact_store_id: Union[str, UUID],
113
+ logs_uri: str,
114
+ offset: int = 0,
115
+ length: int = 1024 * 1024 * 16, # Default to 16MiB of data
116
+ ) -> str:
117
+ """Fetches the logs from the artifact store.
118
+
119
+ Args:
120
+ zen_store: The store in which the artifact is stored.
121
+ artifact_store_id: The ID of the artifact store.
122
+ logs_uri: The URI of the artifact.
123
+ offset: The offset from which to start reading.
124
+ length: The amount of bytes that should be read.
125
+
126
+ Returns:
127
+ The logs as a string.
128
+
129
+ Raises:
130
+ DoesNotExistException: If the artifact does not exist in the artifact
131
+ store.
132
+ """
133
+
134
+ def _read_file(
135
+ uri: str, offset: int = 0, length: Optional[int] = None
136
+ ) -> str:
137
+ return str(
138
+ _load_file_from_artifact_store(
139
+ uri,
140
+ artifact_store=artifact_store,
141
+ mode="rb",
142
+ offset=offset,
143
+ length=length,
144
+ ).decode()
85
145
  )
86
- artifact_store.remove(logs_uri)
87
- return logs_uri
146
+
147
+ artifact_store = _load_artifact_store(artifact_store_id, zen_store)
148
+ if not artifact_store.isdir(logs_uri):
149
+ return _read_file(logs_uri, offset, length)
150
+ else:
151
+ files = artifact_store.listdir(logs_uri)
152
+ if len(files) == 1:
153
+ return _read_file(
154
+ os.path.join(logs_uri, str(files[0])), offset, length
155
+ )
156
+ else:
157
+ is_negative_offset = offset < 0
158
+ files.sort(reverse=is_negative_offset)
159
+
160
+ # search for the first file we need to read
161
+ latest_file_id = 0
162
+ for i, file in enumerate(files):
163
+ file_size: int = artifact_store.size(
164
+ os.path.join(logs_uri, str(file))
165
+ ) # type: ignore[assignment]
166
+
167
+ if is_negative_offset:
168
+ if file_size >= -offset:
169
+ latest_file_id = -(i + 1)
170
+ break
171
+ else:
172
+ offset += file_size
173
+ else:
174
+ if file_size > offset:
175
+ latest_file_id = i
176
+ break
177
+ else:
178
+ offset -= file_size
179
+
180
+ # read the files according to pre-filtering
181
+ files.sort()
182
+ ret = []
183
+ for file in files[latest_file_id:]:
184
+ ret.append(
185
+ _read_file(
186
+ os.path.join(logs_uri, str(file)),
187
+ offset,
188
+ length,
189
+ )
190
+ )
191
+ offset = 0
192
+ length -= len(ret[-1])
193
+ if length <= 0:
194
+ # stop further reading, if the whole length is already read
195
+ break
196
+
197
+ if not ret:
198
+ raise DoesNotExistException(
199
+ f"Folder '{logs_uri}' is empty in artifact store "
200
+ f"'{artifact_store.name}'."
201
+ )
202
+ return "".join(ret)
88
203
 
89
204
 
90
205
  class StepLogsStorage:
@@ -95,25 +210,46 @@ class StepLogsStorage:
95
210
  logs_uri: str,
96
211
  max_messages: int = STEP_LOGS_STORAGE_MAX_MESSAGES,
97
212
  time_interval: int = STEP_LOGS_STORAGE_INTERVAL_SECONDS,
213
+ merge_files_interval: int = STEP_LOGS_STORAGE_MERGE_INTERVAL_SECONDS,
98
214
  ) -> None:
99
215
  """Initialization.
100
216
 
101
217
  Args:
102
- logs_uri: the target URI to store the logs.
218
+ logs_uri: the URI of the log file or folder.
103
219
  max_messages: the maximum number of messages to save in the buffer.
104
220
  time_interval: the amount of seconds before the buffer gets saved
105
221
  automatically.
222
+ merge_files_interval: the amount of seconds before the created files
223
+ get merged into a single file.
106
224
  """
107
225
  # Parameters
108
226
  self.logs_uri = logs_uri
109
227
  self.max_messages = max_messages
110
228
  self.time_interval = time_interval
229
+ self.merge_files_interval = merge_files_interval
111
230
 
112
231
  # State
113
232
  self.buffer: List[str] = []
114
233
  self.disabled_buffer: List[str] = []
115
234
  self.last_save_time = time.time()
116
235
  self.disabled = False
236
+ self._artifact_store: Optional["BaseArtifactStore"] = None
237
+
238
+ # Immutable filesystems state
239
+ self.last_merge_time = time.time()
240
+ self.log_files_not_merged: List[str] = []
241
+ self.next_merged_file_name: str = f"{time.time()}{LOGS_EXTENSION}"
242
+
243
+ @property
244
+ def artifact_store(self) -> "BaseArtifactStore":
245
+ """Returns the active artifact store.
246
+
247
+ Returns:
248
+ The active artifact store.
249
+ """
250
+ if self._artifact_store is None:
251
+ self._artifact_store = Client().active_stack.artifact_store
252
+ return self._artifact_store
117
253
 
118
254
  def write(self, text: str) -> None:
119
255
  """Main write method.
@@ -126,29 +262,60 @@ class StepLogsStorage:
126
262
 
127
263
  if not self.disabled:
128
264
  self.buffer.append(text)
265
+ self.save_to_file()
129
266
 
130
- if (
131
- len(self.buffer) >= self.max_messages
132
- or time.time() - self.last_save_time >= self.time_interval
133
- ):
134
- self.save_to_file()
267
+ @property
268
+ def _is_write_needed(self) -> bool:
269
+ """Checks whether the buffer needs to be written to disk.
135
270
 
136
- def save_to_file(self) -> None:
137
- """Method to save the buffer to the given URI."""
138
- if not self.disabled:
271
+ Returns:
272
+ whether the buffer needs to be written to disk.
273
+ """
274
+ return (
275
+ len(self.buffer) >= self.max_messages
276
+ or time.time() - self.last_save_time >= self.time_interval
277
+ )
278
+
279
+ def save_to_file(self, force: bool = False) -> None:
280
+ """Method to save the buffer to the given URI.
281
+
282
+ Args:
283
+ force: whether to force a save even if the write conditions not met.
284
+ """
285
+ if not self.disabled and (self._is_write_needed or force):
139
286
  # IMPORTANT: keep this as the first code line in this method! The
140
287
  # code that follows might still emit logging messages, which will
141
288
  # end up triggering this method again, causing an infinite loop.
142
289
  self.disabled = True
143
290
 
144
- artifact_store = Client().active_stack.artifact_store
145
291
  try:
146
292
  if self.buffer:
147
- with artifact_store.open(self.logs_uri, "a") as file:
148
- for message in self.buffer:
149
- file.write(
150
- remove_ansi_escape_codes(message) + "\n"
293
+ if self.artifact_store.config.IS_IMMUTABLE_FILESYSTEM:
294
+ if not self.log_files_not_merged:
295
+ self.next_merged_file_name = (
296
+ f"{time.time()}{LOGS_EXTENSION}"
151
297
  )
298
+ log_file_ = f"{time.time()}{LOGS_EXTENSION}"
299
+ self.log_files_not_merged.append(log_file_)
300
+ with self.artifact_store.open(
301
+ os.path.join(
302
+ self.logs_uri,
303
+ log_file_,
304
+ ),
305
+ "w",
306
+ ) as file:
307
+ for message in self.buffer:
308
+ file.write(
309
+ remove_ansi_escape_codes(message) + "\n"
310
+ )
311
+ else:
312
+ with self.artifact_store.open(
313
+ self.logs_uri, "a"
314
+ ) as file:
315
+ for message in self.buffer:
316
+ file.write(
317
+ remove_ansi_escape_codes(message) + "\n"
318
+ )
152
319
 
153
320
  except (OSError, IOError) as e:
154
321
  # This exception can be raised if there are issues with the
@@ -161,6 +328,66 @@ class StepLogsStorage:
161
328
  self.last_save_time = time.time()
162
329
 
163
330
  self.disabled = False
331
+ # merge created files on a given interval (defaults to 10 minutes)
332
+ # only runs on Immutable Filesystems
333
+ if (
334
+ self.artifact_store.config.IS_IMMUTABLE_FILESYSTEM
335
+ and time.time() - self.last_merge_time > self.merge_files_interval
336
+ ):
337
+ try:
338
+ self.merge_log_files(
339
+ self.next_merged_file_name, self.log_files_not_merged
340
+ )
341
+ except (OSError, IOError) as e:
342
+ logger.error(f"Error while trying to roll up logs: {e}")
343
+ else:
344
+ self.log_files_not_merged = []
345
+ finally:
346
+ self.last_merge_time = time.time()
347
+
348
+ def merge_log_files(
349
+ self,
350
+ file_name: Optional[str] = None,
351
+ files: Optional[List[str]] = None,
352
+ ) -> None:
353
+ """Merges all log files into one in the given URI.
354
+
355
+ Called on the logging context exit.
356
+
357
+ Args:
358
+ file_name: The name of the merged log file.
359
+ files: The list of log files to merge.
360
+ """
361
+ if self.artifact_store.config.IS_IMMUTABLE_FILESYSTEM:
362
+ files_ = files or self.artifact_store.listdir(self.logs_uri)
363
+ file_name_ = file_name or f"full_log{LOGS_EXTENSION}"
364
+ if len(files_) > 1:
365
+ files_.sort()
366
+ logger.debug("Log files count: %s", len(files_))
367
+
368
+ try:
369
+ # dump all logs to a local file first
370
+ with self.artifact_store.open(
371
+ os.path.join(self.logs_uri, file_name_), "w"
372
+ ) as merged_file:
373
+ for file in files_:
374
+ merged_file.write(
375
+ str(
376
+ _load_file_from_artifact_store(
377
+ os.path.join(self.logs_uri, str(file)),
378
+ artifact_store=self.artifact_store,
379
+ mode="r",
380
+ )
381
+ )
382
+ )
383
+ except Exception as e:
384
+ logger.warning(f"Failed to merge log files. {e}")
385
+ else:
386
+ # clean up left over files
387
+ for file in files_:
388
+ self.artifact_store.remove(
389
+ os.path.join(self.logs_uri, str(file))
390
+ )
164
391
 
165
392
 
166
393
  class StepLogsStorageContext:
@@ -213,7 +440,8 @@ class StepLogsStorageContext:
213
440
 
214
441
  Restores the `write` method of both stderr and stdout.
215
442
  """
216
- self.storage.save_to_file()
443
+ self.storage.save_to_file(force=True)
444
+ self.storage.merge_log_files()
217
445
 
218
446
  setattr(sys.stdout, "write", self.stdout_write)
219
447
  setattr(sys.stdout, "flush", self.stdout_flush)
@@ -14,6 +14,7 @@
14
14
  """DAG (Directed Acyclic Graph) Runners."""
15
15
 
16
16
  import threading
17
+ import time
17
18
  from collections import defaultdict
18
19
  from enum import Enum
19
20
  from typing import Any, Callable, Dict, List
@@ -66,7 +67,10 @@ class ThreadedDagRunner:
66
67
  """
67
68
 
68
69
  def __init__(
69
- self, dag: Dict[str, List[str]], run_fn: Callable[[str], Any]
70
+ self,
71
+ dag: Dict[str, List[str]],
72
+ run_fn: Callable[[str], Any],
73
+ parallel_node_startup_waiting_period: float = 0.0,
70
74
  ) -> None:
71
75
  """Define attributes and initialize all nodes in waiting state.
72
76
 
@@ -75,7 +79,12 @@ class ThreadedDagRunner:
75
79
  E.g.: [(1->2), (1->3), (2->4), (3->4)] should be represented as
76
80
  `dag={2: [1], 3: [1], 4: [2, 3]}`
77
81
  run_fn: A function `run_fn(node)` that runs a single node
82
+ parallel_node_startup_waiting_period: Delay in seconds to wait in
83
+ between starting parallel nodes.
78
84
  """
85
+ self.parallel_node_startup_waiting_period = (
86
+ parallel_node_startup_waiting_period
87
+ )
79
88
  self.dag = dag
80
89
  self.reversed_dag = reverse_dag(dag)
81
90
  self.run_fn = run_fn
@@ -154,9 +163,12 @@ class ThreadedDagRunner:
154
163
  self.node_states[node] = NodeStatus.COMPLETED
155
164
 
156
165
  # Run downstream nodes.
157
- threads = []
166
+ threads: List[threading.Thread] = []
158
167
  for downstram_node in self.reversed_dag[node]:
159
168
  if self._can_run(downstram_node):
169
+ if threads and self.parallel_node_startup_waiting_period > 0:
170
+ time.sleep(self.parallel_node_startup_waiting_period)
171
+
160
172
  thread = self._run_node_in_thread(downstram_node)
161
173
  threads.append(thread)
162
174
 
@@ -173,9 +185,12 @@ class ThreadedDagRunner:
173
185
  # Run all nodes that can be started immediately.
174
186
  # These will, in turn, start other nodes once all of their respective
175
187
  # upstream nodes have completed.
176
- threads = []
188
+ threads: List[threading.Thread] = []
177
189
  for node in self.nodes:
178
190
  if self._can_run(node):
191
+ if threads and self.parallel_node_startup_waiting_period > 0:
192
+ time.sleep(self.parallel_node_startup_waiting_period)
193
+
179
194
  thread = self._run_node_in_thread(node)
180
195
  threads.append(thread)
181
196
 
@@ -70,10 +70,7 @@ PIP_DEFAULT_ARGS = {
70
70
  "no-cache-dir": None,
71
71
  "default-timeout": 60,
72
72
  }
73
- UV_DEFAULT_ARGS = {
74
- "no-cache-dir": None,
75
- "system": None,
76
- }
73
+ UV_DEFAULT_ARGS = {"no-cache-dir": None}
77
74
 
78
75
 
79
76
  class PipelineDockerImageBuilder:
@@ -8,6 +8,7 @@ from pydantic import BaseModel, validator
8
8
  from requests.adapters import HTTPAdapter, Retry
9
9
 
10
10
  from zenml.exceptions import SubscriptionUpgradeRequiredError
11
+ from zenml.zen_server.utils import server_config
11
12
 
12
13
  ZENML_CLOUD_RBAC_ENV_PREFIX = "ZENML_CLOUD_"
13
14
 
@@ -99,7 +100,7 @@ class ZenMLCloudSession:
99
100
  raise SubscriptionUpgradeRequiredError(response.json())
100
101
  else:
101
102
  raise RuntimeError(
102
- f"Failed with the following error {response.json()}"
103
+ f"Failed with the following error {response} {response.text}"
103
104
  )
104
105
 
105
106
  return response
@@ -154,12 +155,29 @@ class ZenMLCloudSession:
154
155
  A requests session with the authentication token.
155
156
  """
156
157
  if self._session is None:
158
+ # Set up the session's connection pool size to match the server's
159
+ # thread pool size. This allows the server to cache one connection
160
+ # per thread, which means we can keep connections open for longer
161
+ # and avoid the overhead of setting up a new connection for each
162
+ # request.
163
+ conn_pool_size = server_config().thread_pool_size
164
+
157
165
  self._session = requests.Session()
158
166
  token = self._fetch_auth_token()
159
167
  self._session.headers.update({"Authorization": "Bearer " + token})
160
168
 
161
169
  retries = Retry(total=5, backoff_factor=0.1)
162
- self._session.mount("https://", HTTPAdapter(max_retries=retries))
170
+ self._session.mount(
171
+ "https://",
172
+ HTTPAdapter(
173
+ max_retries=retries,
174
+ # We only use one connection pool to be cached because we
175
+ # only communicate with one remote server (the control
176
+ # plane)
177
+ pool_connections=1,
178
+ pool_maxsize=conn_pool_size,
179
+ ),
180
+ )
163
181
 
164
182
  return self._session
165
183