zenml-nightly 0.58.1.dev20240608__py3-none-any.whl → 0.58.2.dev20240611__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- README.md +1 -1
- RELEASE_NOTES.md +24 -0
- zenml/VERSION +1 -1
- zenml/artifact_stores/base_artifact_store.py +1 -0
- zenml/config/server_config.py +3 -0
- zenml/constants.py +1 -0
- zenml/integrations/airflow/__init__.py +1 -1
- zenml/integrations/gcp/artifact_stores/gcp_artifact_store.py +6 -0
- zenml/integrations/gcp/flavors/gcp_artifact_store_flavor.py +1 -0
- zenml/integrations/kubernetes/flavors/kubernetes_orchestrator_flavor.py +4 -0
- zenml/integrations/kubernetes/orchestrators/kubernetes_orchestrator_entrypoint.py +8 -1
- zenml/logging/__init__.py +5 -0
- zenml/logging/step_logging.py +253 -25
- zenml/orchestrators/dag_runner.py +18 -3
- zenml/utils/pipeline_docker_image_builder.py +1 -4
- zenml/zen_server/cloud_utils.py +20 -2
- zenml/zen_server/dashboard/assets/{404-D5p6PIdn.js → 404-CDPQCl4D.js} +1 -1
- zenml/zen_server/dashboard/assets/{@reactflow-CegZ5GV3.js → @reactflow-CHBapDaj.js} +1 -1
- zenml/zen_server/dashboard/assets/{AwarenessChannel-DDpU6zHx.js → AwarenessChannel-nXGpmj_f.js} +1 -1
- zenml/zen_server/dashboard/assets/{Cards-wfOUm_Ae.js → Cards-nwsvQLVS.js} +1 -1
- zenml/zen_server/dashboard/assets/{CodeSnippet-iinvcx17.js → CodeSnippet-BidtnWOi.js} +2 -2
- zenml/zen_server/dashboard/assets/{Commands-DGnWeAWF.js → Commands-DuIWKg_Q.js} +1 -1
- zenml/zen_server/dashboard/assets/{CopyButton-eUfW9370.js → CopyButton-B_YSm-Ds.js} +2 -2
- zenml/zen_server/dashboard/assets/{CsvVizualization-CoIkmrjr.js → CsvVizualization-BOuez-fG.js} +1 -1
- zenml/zen_server/dashboard/assets/{Error-CDMUBgpN.js → Error-B6M0dPph.js} +1 -1
- zenml/zen_server/dashboard/assets/{Helpbox-BV73V0J6.js → Helpbox-BQoqCm04.js} +1 -1
- zenml/zen_server/dashboard/assets/{Infobox-DG7zmtut.js → Infobox-Ce9mefqU.js} +1 -1
- zenml/zen_server/dashboard/assets/{InlineAvatar-BiKh3XC5.js → InlineAvatar-DGf3dVhV.js} +1 -1
- zenml/zen_server/dashboard/assets/{PageHeader-D-u0obgg.js → PageHeader-DGaemzjc.js} +1 -1
- zenml/zen_server/dashboard/assets/{Pagination-ZYqHJ5gE.js → Pagination-DVYfBCCc.js} +1 -1
- zenml/zen_server/dashboard/assets/{SetPassword-Dq6iUfpW.js → SetPassword-B5s7DJug.js} +1 -1
- zenml/zen_server/dashboard/assets/SuccessStep-ZzczaM7g.js +1 -0
- zenml/zen_server/dashboard/assets/{UpdatePasswordSchemas-4FyPPBY9.js → UpdatePasswordSchemas-DnM-c11H.js} +1 -1
- zenml/zen_server/dashboard/assets/adam-e-y0WnB_.webp +0 -0
- zenml/zen_server/dashboard/assets/alex-DcCuDHPg.webp +0 -0
- zenml/zen_server/dashboard/assets/baris-C0ZrZ10g.webp +0 -0
- zenml/zen_server/dashboard/assets/{cloud-only-DsILLhXk.js → cloud-only-Ba_ShBR5.js} +1 -1
- zenml/zen_server/dashboard/assets/hamza-NKKOZz1I.webp +0 -0
- zenml/zen_server/dashboard/assets/index-CWJ3xbIf.css +1 -0
- zenml/zen_server/dashboard/assets/{index-BhYPVFKa.js → index-QORVVTMN.js} +3 -3
- zenml/zen_server/dashboard/assets/{login-mutation-CDARn8rx.js → login-mutation-wzzl23C6.js} +1 -1
- zenml/zen_server/dashboard/assets/{not-found-D1_I0ubu.js → not-found-Dh2la7kh.js} +1 -1
- zenml/zen_server/dashboard/assets/{page-CaDkuI7b.js → page-AQKopn_4.js} +1 -1
- zenml/zen_server/dashboard/assets/{page-7IP7WH5_.js → page-B-5jAKoO.js} +1 -1
- zenml/zen_server/dashboard/assets/{page-ByiUk9rA.js → page-B-vWk8a6.js} +1 -1
- zenml/zen_server/dashboard/assets/{page-B5Y_HW80.js → page-B0BrqfS8.js} +1 -1
- zenml/zen_server/dashboard/assets/{page-Ca8Zw2SO.js → page-BQxVFlUl.js} +1 -1
- zenml/zen_server/dashboard/assets/{page-bpP11sGS.js → page-BW6Ket3a.js} +1 -1
- zenml/zen_server/dashboard/assets/{page-BQT1Zxsp.js → page-Bi5AI0S7.js} +1 -1
- zenml/zen_server/dashboard/assets/{page-CK0iF8U_.js → page-BmkSiYeQ.js} +1 -1
- zenml/zen_server/dashboard/assets/{page-BR7WTzLa.js → page-ByrHy6Ss.js} +1 -1
- zenml/zen_server/dashboard/assets/{page-C9lMl0g8.js → page-BzVZGExK.js} +1 -1
- zenml/zen_server/dashboard/assets/{page-Bwrw_wb_.js → page-CPtY4Kv_.js} +1 -1
- zenml/zen_server/dashboard/assets/{page-D_Vj_UH1.js → page-CmmukLsl.js} +1 -1
- zenml/zen_server/dashboard/assets/page-CuT1SUik.js +1 -0
- zenml/zen_server/dashboard/assets/{page-DPrgvGj6.js → page-D2D-7qyr.js} +3 -3
- zenml/zen_server/dashboard/assets/{page-DDY5j-6S.js → page-DAQQyLxT.js} +1 -1
- zenml/zen_server/dashboard/assets/{page-QPP3iIQH.js → page-DHkUMl_E.js} +1 -1
- zenml/zen_server/dashboard/assets/{page-DIz9_5Du.js → page-DZCbwOEs.js} +1 -1
- zenml/zen_server/dashboard/assets/page-DdaIt20-.js +1 -0
- zenml/zen_server/dashboard/assets/{page-zYQJvPVh.js → page-LqLs24Ot.js} +1 -1
- zenml/zen_server/dashboard/assets/{page-DB_mi8or.js → page-lebv0c7C.js} +1 -1
- zenml/zen_server/dashboard/assets/{page-CQTaUp7q.js → page-yN4rZ-ZS.js} +1 -1
- zenml/zen_server/dashboard/assets/stefan-B08Ftbba.webp +0 -0
- zenml/zen_server/dashboard/assets/{update-server-settings-mutation-SaWcyAnk.js → update-server-settings-mutation-0Wgz8pUE.js} +1 -1
- zenml/zen_server/dashboard/assets/{url-ZKNs861m.js → url-6_xv0WJS.js} +1 -1
- zenml/zen_server/dashboard/index.html +4 -4
- zenml/zen_server/dashboard_legacy/asset-manifest.json +4 -4
- zenml/zen_server/dashboard_legacy/index.html +1 -1
- zenml/zen_server/dashboard_legacy/{precache-manifest.8e59f98d08e9c4c7cb3ef9f0bab7093f.js → precache-manifest.f4abc5b7cfa7d90c1caf5521918e29a8.js} +4 -4
- zenml/zen_server/dashboard_legacy/service-worker.js +1 -1
- zenml/zen_server/dashboard_legacy/static/js/main.ac2f17d0.chunk.js +2 -0
- zenml/zen_server/dashboard_legacy/static/js/{main.a238a4d2.chunk.js.map → main.ac2f17d0.chunk.js.map} +1 -1
- zenml/zen_server/deploy/helm/Chart.yaml +1 -1
- zenml/zen_server/deploy/helm/README.md +2 -2
- zenml/zen_server/deploy/helm/templates/_environment.tpl +3 -0
- zenml/zen_server/deploy/helm/templates/server-secret.yaml +6 -0
- zenml/zen_server/deploy/helm/values.yaml +21 -0
- zenml/zen_server/routers/steps_endpoints.py +9 -15
- zenml/zen_server/zen_server_api.py +17 -8
- zenml/zen_stores/migrations/versions/0.58.2_release.py +23 -0
- {zenml_nightly-0.58.1.dev20240608.dist-info → zenml_nightly-0.58.2.dev20240611.dist-info}/METADATA +2 -2
- {zenml_nightly-0.58.1.dev20240608.dist-info → zenml_nightly-0.58.2.dev20240611.dist-info}/RECORD +86 -80
- zenml/zen_server/dashboard/assets/SuccessStep-BHhPYxz9.js +0 -1
- zenml/zen_server/dashboard/assets/index-CRZ5qzG3.css +0 -1
- zenml/zen_server/dashboard/assets/page-BmuIfr11.js +0 -1
- zenml/zen_server/dashboard/assets/page-xA0WcjLa.js +0 -1
- zenml/zen_server/dashboard_legacy/static/js/main.a238a4d2.chunk.js +0 -2
- {zenml_nightly-0.58.1.dev20240608.dist-info → zenml_nightly-0.58.2.dev20240611.dist-info}/LICENSE +0 -0
- {zenml_nightly-0.58.1.dev20240608.dist-info → zenml_nightly-0.58.2.dev20240611.dist-info}/WHEEL +0 -0
- {zenml_nightly-0.58.1.dev20240608.dist-info → zenml_nightly-0.58.2.dev20240611.dist-info}/entry_points.txt +0 -0
README.md
CHANGED
@@ -289,7 +289,7 @@ the Apache License Version 2.0.
|
|
289
289
|
<a href="https://github.com/zenml-io/zenml-projects">Projects Showcase</a>
|
290
290
|
<br />
|
291
291
|
<br />
|
292
|
-
🎉 Version 0.58.1 is out. Check out the release notes
|
292
|
+
🎉 Version 0.58.2 is out. Check out the release notes
|
293
293
|
<a href="https://github.com/zenml-io/zenml/releases">here</a>.
|
294
294
|
<br />
|
295
295
|
🖥️ Download our VS Code Extension <a href="https://marketplace.visualstudio.com/items?itemName=ZenML.zenml-vscode">here</a>.
|
RELEASE_NOTES.md
CHANGED
@@ -1,5 +1,29 @@
|
|
1
1
|
<!-- markdown-link-check-disable -->
|
2
2
|
|
3
|
+
# 0.58.2
|
4
|
+
|
5
|
+
The 0.58.2 minor release is packed with a set of improvements to the ZenML logging and ZenML Server.
|
6
|
+
|
7
|
+
With this release ZenML logging will:
|
8
|
+
- Offer pagination of the logs during fetching via REST API
|
9
|
+
- Store the full logs history on GCS Artifact Stores
|
10
|
+
- Be performant running logging-heavy tasks, like TQDM logging or logging of training in any Deep Learning framework (also TQDM-backed)
|
11
|
+
|
12
|
+
## What's Changed
|
13
|
+
* update test-migrations.sh with latest versions by @safoinme in https://github.com/zenml-io/zenml/pull/2757
|
14
|
+
* Fix overriding expiration date for api tokens by @schustmi in https://github.com/zenml-io/zenml/pull/2753
|
15
|
+
* Step logs pagination by @schustmi in https://github.com/zenml-io/zenml/pull/2731
|
16
|
+
* Fix broken links (round 2) by @strickvl in https://github.com/zenml-io/zenml/pull/2760
|
17
|
+
* Remove default system flag in docker UV by @avishniakov in https://github.com/zenml-io/zenml/pull/2764
|
18
|
+
* Another batch of small fixes and expansions by @AlexejPenner in https://github.com/zenml-io/zenml/pull/2762
|
19
|
+
* Server scalability improvements by @stefannica in https://github.com/zenml-io/zenml/pull/2752
|
20
|
+
* Add option to start parallel kubernetes steps with delay by @schustmi in https://github.com/zenml-io/zenml/pull/2758
|
21
|
+
* Move `thread_limiter` to app startup event by @avishniakov in https://github.com/zenml-io/zenml/pull/2765
|
22
|
+
* Logging performance improvements and GCP logging fix by @avishniakov in https://github.com/zenml-io/zenml/pull/2755
|
23
|
+
|
24
|
+
|
25
|
+
**Full Changelog**: https://github.com/zenml-io/zenml/compare/0.58.1...0.58.2
|
26
|
+
|
3
27
|
# 0.58.1
|
4
28
|
|
5
29
|
The 0.58.1 release brings a set of minor enhancement and bugfix to the ZenML framework, such as the ability to delete all versions of a pipeline using the Client/CLI, providing greater flexibility and control over pipeline management. Users can now specify Python package installer arguments. Furthermore, a fix has been implemented for the Sentencepiece tokenizer materializer.
|
zenml/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.58.1.dev20240608
|
1
|
+
0.58.2.dev20240611
|
@@ -171,6 +171,7 @@ class BaseArtifactStoreConfig(StackComponentConfig):
|
|
171
171
|
path: str
|
172
172
|
|
173
173
|
SUPPORTED_SCHEMES: ClassVar[Set[str]]
|
174
|
+
IS_IMMUTABLE_FILESYSTEM: ClassVar[bool] = False
|
174
175
|
|
175
176
|
@root_validator(skip_on_failure=True)
|
176
177
|
def _ensure_artifact_store(cls, values: Dict[str, Any]) -> Any:
|
zenml/config/server_config.py
CHANGED
@@ -39,6 +39,7 @@ from zenml.constants import (
|
|
39
39
|
DEFAULT_ZENML_SERVER_SECURE_HEADERS_REFERRER,
|
40
40
|
DEFAULT_ZENML_SERVER_SECURE_HEADERS_XFO,
|
41
41
|
DEFAULT_ZENML_SERVER_SECURE_HEADERS_XXP,
|
42
|
+
DEFAULT_ZENML_SERVER_THREAD_POOL_SIZE,
|
42
43
|
DEFAULT_ZENML_SERVER_USE_LEGACY_DASHBOARD,
|
43
44
|
ENV_ZENML_SERVER_PREFIX,
|
44
45
|
)
|
@@ -301,6 +302,8 @@ class ServerConfiguration(BaseModel):
|
|
301
302
|
display_updates: bool = True
|
302
303
|
auto_activate: bool = False
|
303
304
|
|
305
|
+
thread_pool_size: int = DEFAULT_ZENML_SERVER_THREAD_POOL_SIZE
|
306
|
+
|
304
307
|
_deployment_id: Optional[UUID] = None
|
305
308
|
|
306
309
|
@root_validator(pre=True)
|
zenml/constants.py
CHANGED
@@ -230,6 +230,7 @@ STEP_SOURCE_PARAMETER_NAME = "step_source"
|
|
230
230
|
|
231
231
|
# Server settings
|
232
232
|
DEFAULT_ZENML_SERVER_NAME = "default"
|
233
|
+
DEFAULT_ZENML_SERVER_THREAD_POOL_SIZE = 40
|
233
234
|
DEFAULT_ZENML_JWT_TOKEN_LEEWAY = 10
|
234
235
|
DEFAULT_ZENML_JWT_TOKEN_ALGORITHM = "HS256"
|
235
236
|
DEFAULT_ZENML_AUTH_SCHEME = AuthScheme.OAUTH2_PASSWORD_BEARER
|
@@ -17,7 +17,7 @@ The Airflow integration sub-module powers an alternative to the local
|
|
17
17
|
orchestrator. You can enable it by registering the Airflow orchestrator with
|
18
18
|
the CLI tool, then bootstrap using the ``zenml orchestrator up`` command.
|
19
19
|
"""
|
20
|
-
from typing import List,
|
20
|
+
from typing import List, Type
|
21
21
|
|
22
22
|
from zenml.integrations.constants import AIRFLOW
|
23
23
|
from zenml.integrations.integration import Integration
|
@@ -35,9 +35,11 @@ from zenml.integrations.gcp.flavors.gcp_artifact_store_flavor import (
|
|
35
35
|
GCPArtifactStoreConfig,
|
36
36
|
)
|
37
37
|
from zenml.io.fileio import convert_to_str
|
38
|
+
from zenml.logger import get_logger
|
38
39
|
from zenml.secret.schemas import GCPSecretSchema
|
39
40
|
from zenml.stack.authentication_mixin import AuthenticationMixin
|
40
41
|
|
42
|
+
logger = get_logger(__name__)
|
41
43
|
PathType = Union[bytes, str]
|
42
44
|
|
43
45
|
|
@@ -109,6 +111,10 @@ class GCPArtifactStore(BaseArtifactStore, AuthenticationMixin):
|
|
109
111
|
Returns:
|
110
112
|
A file-like object that can be used to read or write to the file.
|
111
113
|
"""
|
114
|
+
if mode in ("a", "ab"):
|
115
|
+
logger.warning(
|
116
|
+
"GCS Filesystem is immutable, so append mode will overwrite existing files."
|
117
|
+
)
|
112
118
|
return self.filesystem.open(path=path, mode=mode)
|
113
119
|
|
114
120
|
def copyfile(
|
@@ -82,6 +82,9 @@ class KubernetesOrchestratorConfig( # type: ignore[misc] # https://github.com/p
|
|
82
82
|
containers).
|
83
83
|
skip_local_validations: If `True`, the local validations will be
|
84
84
|
skipped.
|
85
|
+
parallel_step_startup_waiting_period: How long to wait in between
|
86
|
+
starting parallel steps. This can be used to distribute server
|
87
|
+
load when running pipelines with a huge amount of parallel steps.
|
85
88
|
"""
|
86
89
|
|
87
90
|
incluster: bool = False
|
@@ -89,6 +92,7 @@ class KubernetesOrchestratorConfig( # type: ignore[misc] # https://github.com/p
|
|
89
92
|
kubernetes_namespace: str = "zenml"
|
90
93
|
local: bool = False
|
91
94
|
skip_local_validations: bool = False
|
95
|
+
parallel_step_startup_waiting_period: Optional[float] = None
|
92
96
|
|
93
97
|
@property
|
94
98
|
def is_remote(self) -> bool:
|
@@ -142,7 +142,14 @@ def main() -> None:
|
|
142
142
|
)
|
143
143
|
logger.info(f"Pod of step `{step_name}` completed.")
|
144
144
|
|
145
|
-
|
145
|
+
parallel_node_startup_waiting_period = (
|
146
|
+
orchestrator.config.parallel_step_startup_waiting_period or 0.0
|
147
|
+
)
|
148
|
+
ThreadedDagRunner(
|
149
|
+
dag=pipeline_dag,
|
150
|
+
run_fn=run_step_on_kubernetes,
|
151
|
+
parallel_node_startup_waiting_period=parallel_node_startup_waiting_period,
|
152
|
+
).run()
|
146
153
|
|
147
154
|
logger.info("Orchestration pod completed.")
|
148
155
|
|
zenml/logging/__init__.py
CHANGED
@@ -12,8 +12,13 @@
|
|
12
12
|
# or implied. See the License for the specific language governing
|
13
13
|
# permissions and limitations under the License.
|
14
14
|
|
15
|
+
"""Logging utilities."""
|
16
|
+
|
15
17
|
# How many seconds to wait before uploading logs to the artifact store
|
16
18
|
STEP_LOGS_STORAGE_INTERVAL_SECONDS: int = 15
|
17
19
|
|
18
20
|
# How many messages to buffer before uploading logs to the artifact store
|
19
21
|
STEP_LOGS_STORAGE_MAX_MESSAGES: int = 100
|
22
|
+
|
23
|
+
# How often to merge logs into a single file
|
24
|
+
STEP_LOGS_STORAGE_MERGE_INTERVAL_SECONDS: int = 10 * 60
|
zenml/logging/step_logging.py
CHANGED
@@ -19,22 +19,31 @@ import sys
|
|
19
19
|
import time
|
20
20
|
from contextvars import ContextVar
|
21
21
|
from types import TracebackType
|
22
|
-
from typing import Any, Callable, List, Optional, Type
|
23
|
-
from uuid import uuid4
|
22
|
+
from typing import Any, Callable, List, Optional, Type, Union
|
23
|
+
from uuid import UUID, uuid4
|
24
24
|
|
25
25
|
from zenml.artifact_stores import BaseArtifactStore
|
26
|
+
from zenml.artifacts.utils import (
|
27
|
+
_load_artifact_store,
|
28
|
+
_load_file_from_artifact_store,
|
29
|
+
)
|
26
30
|
from zenml.client import Client
|
31
|
+
from zenml.exceptions import DoesNotExistException
|
27
32
|
from zenml.logger import get_logger
|
28
33
|
from zenml.logging import (
|
29
34
|
STEP_LOGS_STORAGE_INTERVAL_SECONDS,
|
30
35
|
STEP_LOGS_STORAGE_MAX_MESSAGES,
|
36
|
+
STEP_LOGS_STORAGE_MERGE_INTERVAL_SECONDS,
|
31
37
|
)
|
38
|
+
from zenml.zen_stores.base_zen_store import BaseZenStore
|
32
39
|
|
33
40
|
# Get the logger
|
34
41
|
logger = get_logger(__name__)
|
35
42
|
|
36
43
|
redirected: ContextVar[bool] = ContextVar("redirected", default=False)
|
37
44
|
|
45
|
+
LOGS_EXTENSION = ".log"
|
46
|
+
|
38
47
|
|
39
48
|
def remove_ansi_escape_codes(text: str) -> str:
|
40
49
|
"""Auxiliary function to remove ANSI escape codes from a given string.
|
@@ -54,7 +63,7 @@ def prepare_logs_uri(
|
|
54
63
|
step_name: str,
|
55
64
|
log_key: Optional[str] = None,
|
56
65
|
) -> str:
|
57
|
-
"""Generates and prepares a URI for the log file for a step.
|
66
|
+
"""Generates and prepares a URI for the log file or folder for a step.
|
58
67
|
|
59
68
|
Args:
|
60
69
|
artifact_store: The artifact store on which the artifact will be stored.
|
@@ -62,7 +71,7 @@ def prepare_logs_uri(
|
|
62
71
|
log_key: The unique identification key of the log file.
|
63
72
|
|
64
73
|
Returns:
|
65
|
-
The URI of the
|
74
|
+
The URI of the log storage (file or folder).
|
66
75
|
"""
|
67
76
|
if log_key is None:
|
68
77
|
log_key = str(uuid4())
|
@@ -78,13 +87,119 @@ def prepare_logs_uri(
|
|
78
87
|
artifact_store.makedirs(logs_base_uri)
|
79
88
|
|
80
89
|
# Delete the file if it already exists
|
81
|
-
|
82
|
-
|
83
|
-
|
84
|
-
|
90
|
+
if artifact_store.config.IS_IMMUTABLE_FILESYSTEM:
|
91
|
+
logs_uri_folder = os.path.join(logs_base_uri, log_key)
|
92
|
+
if artifact_store.exists(logs_uri_folder):
|
93
|
+
logger.warning(
|
94
|
+
f"Logs directory {logs_uri_folder} already exists! Removing old log directory..."
|
95
|
+
)
|
96
|
+
artifact_store.rmtree(logs_uri_folder)
|
97
|
+
|
98
|
+
artifact_store.makedirs(logs_uri_folder)
|
99
|
+
return logs_uri_folder
|
100
|
+
else:
|
101
|
+
logs_uri = os.path.join(logs_base_uri, f"{log_key}.log")
|
102
|
+
if artifact_store.exists(logs_uri):
|
103
|
+
logger.warning(
|
104
|
+
f"Logs file {logs_uri} already exists! Removing old log file..."
|
105
|
+
)
|
106
|
+
artifact_store.remove(logs_uri)
|
107
|
+
return logs_uri
|
108
|
+
|
109
|
+
|
110
|
+
def fetch_logs(
|
111
|
+
zen_store: "BaseZenStore",
|
112
|
+
artifact_store_id: Union[str, UUID],
|
113
|
+
logs_uri: str,
|
114
|
+
offset: int = 0,
|
115
|
+
length: int = 1024 * 1024 * 16, # Default to 16MiB of data
|
116
|
+
) -> str:
|
117
|
+
"""Fetches the logs from the artifact store.
|
118
|
+
|
119
|
+
Args:
|
120
|
+
zen_store: The store in which the artifact is stored.
|
121
|
+
artifact_store_id: The ID of the artifact store.
|
122
|
+
logs_uri: The URI of the artifact.
|
123
|
+
offset: The offset from which to start reading.
|
124
|
+
length: The amount of bytes that should be read.
|
125
|
+
|
126
|
+
Returns:
|
127
|
+
The logs as a string.
|
128
|
+
|
129
|
+
Raises:
|
130
|
+
DoesNotExistException: If the artifact does not exist in the artifact
|
131
|
+
store.
|
132
|
+
"""
|
133
|
+
|
134
|
+
def _read_file(
|
135
|
+
uri: str, offset: int = 0, length: Optional[int] = None
|
136
|
+
) -> str:
|
137
|
+
return str(
|
138
|
+
_load_file_from_artifact_store(
|
139
|
+
uri,
|
140
|
+
artifact_store=artifact_store,
|
141
|
+
mode="rb",
|
142
|
+
offset=offset,
|
143
|
+
length=length,
|
144
|
+
).decode()
|
85
145
|
)
|
86
|
-
|
87
|
-
|
146
|
+
|
147
|
+
artifact_store = _load_artifact_store(artifact_store_id, zen_store)
|
148
|
+
if not artifact_store.isdir(logs_uri):
|
149
|
+
return _read_file(logs_uri, offset, length)
|
150
|
+
else:
|
151
|
+
files = artifact_store.listdir(logs_uri)
|
152
|
+
if len(files) == 1:
|
153
|
+
return _read_file(
|
154
|
+
os.path.join(logs_uri, str(files[0])), offset, length
|
155
|
+
)
|
156
|
+
else:
|
157
|
+
is_negative_offset = offset < 0
|
158
|
+
files.sort(reverse=is_negative_offset)
|
159
|
+
|
160
|
+
# search for the first file we need to read
|
161
|
+
latest_file_id = 0
|
162
|
+
for i, file in enumerate(files):
|
163
|
+
file_size: int = artifact_store.size(
|
164
|
+
os.path.join(logs_uri, str(file))
|
165
|
+
) # type: ignore[assignment]
|
166
|
+
|
167
|
+
if is_negative_offset:
|
168
|
+
if file_size >= -offset:
|
169
|
+
latest_file_id = -(i + 1)
|
170
|
+
break
|
171
|
+
else:
|
172
|
+
offset += file_size
|
173
|
+
else:
|
174
|
+
if file_size > offset:
|
175
|
+
latest_file_id = i
|
176
|
+
break
|
177
|
+
else:
|
178
|
+
offset -= file_size
|
179
|
+
|
180
|
+
# read the files according to pre-filtering
|
181
|
+
files.sort()
|
182
|
+
ret = []
|
183
|
+
for file in files[latest_file_id:]:
|
184
|
+
ret.append(
|
185
|
+
_read_file(
|
186
|
+
os.path.join(logs_uri, str(file)),
|
187
|
+
offset,
|
188
|
+
length,
|
189
|
+
)
|
190
|
+
)
|
191
|
+
offset = 0
|
192
|
+
length -= len(ret[-1])
|
193
|
+
if length <= 0:
|
194
|
+
# stop further reading, if the whole length is already read
|
195
|
+
break
|
196
|
+
|
197
|
+
if not ret:
|
198
|
+
raise DoesNotExistException(
|
199
|
+
f"Folder '{logs_uri}' is empty in artifact store "
|
200
|
+
f"'{artifact_store.name}'."
|
201
|
+
)
|
202
|
+
return "".join(ret)
|
88
203
|
|
89
204
|
|
90
205
|
class StepLogsStorage:
|
@@ -95,25 +210,46 @@ class StepLogsStorage:
|
|
95
210
|
logs_uri: str,
|
96
211
|
max_messages: int = STEP_LOGS_STORAGE_MAX_MESSAGES,
|
97
212
|
time_interval: int = STEP_LOGS_STORAGE_INTERVAL_SECONDS,
|
213
|
+
merge_files_interval: int = STEP_LOGS_STORAGE_MERGE_INTERVAL_SECONDS,
|
98
214
|
) -> None:
|
99
215
|
"""Initialization.
|
100
216
|
|
101
217
|
Args:
|
102
|
-
logs_uri: the
|
218
|
+
logs_uri: the URI of the log file or folder.
|
103
219
|
max_messages: the maximum number of messages to save in the buffer.
|
104
220
|
time_interval: the amount of seconds before the buffer gets saved
|
105
221
|
automatically.
|
222
|
+
merge_files_interval: the amount of seconds before the created files
|
223
|
+
get merged into a single file.
|
106
224
|
"""
|
107
225
|
# Parameters
|
108
226
|
self.logs_uri = logs_uri
|
109
227
|
self.max_messages = max_messages
|
110
228
|
self.time_interval = time_interval
|
229
|
+
self.merge_files_interval = merge_files_interval
|
111
230
|
|
112
231
|
# State
|
113
232
|
self.buffer: List[str] = []
|
114
233
|
self.disabled_buffer: List[str] = []
|
115
234
|
self.last_save_time = time.time()
|
116
235
|
self.disabled = False
|
236
|
+
self._artifact_store: Optional["BaseArtifactStore"] = None
|
237
|
+
|
238
|
+
# Immutable filesystems state
|
239
|
+
self.last_merge_time = time.time()
|
240
|
+
self.log_files_not_merged: List[str] = []
|
241
|
+
self.next_merged_file_name: str = f"{time.time()}{LOGS_EXTENSION}"
|
242
|
+
|
243
|
+
@property
|
244
|
+
def artifact_store(self) -> "BaseArtifactStore":
|
245
|
+
"""Returns the active artifact store.
|
246
|
+
|
247
|
+
Returns:
|
248
|
+
The active artifact store.
|
249
|
+
"""
|
250
|
+
if self._artifact_store is None:
|
251
|
+
self._artifact_store = Client().active_stack.artifact_store
|
252
|
+
return self._artifact_store
|
117
253
|
|
118
254
|
def write(self, text: str) -> None:
|
119
255
|
"""Main write method.
|
@@ -126,29 +262,60 @@ class StepLogsStorage:
|
|
126
262
|
|
127
263
|
if not self.disabled:
|
128
264
|
self.buffer.append(text)
|
265
|
+
self.save_to_file()
|
129
266
|
|
130
|
-
|
131
|
-
|
132
|
-
|
133
|
-
):
|
134
|
-
self.save_to_file()
|
267
|
+
@property
|
268
|
+
def _is_write_needed(self) -> bool:
|
269
|
+
"""Checks whether the buffer needs to be written to disk.
|
135
270
|
|
136
|
-
|
137
|
-
|
138
|
-
|
271
|
+
Returns:
|
272
|
+
whether the buffer needs to be written to disk.
|
273
|
+
"""
|
274
|
+
return (
|
275
|
+
len(self.buffer) >= self.max_messages
|
276
|
+
or time.time() - self.last_save_time >= self.time_interval
|
277
|
+
)
|
278
|
+
|
279
|
+
def save_to_file(self, force: bool = False) -> None:
|
280
|
+
"""Method to save the buffer to the given URI.
|
281
|
+
|
282
|
+
Args:
|
283
|
+
force: whether to force a save even if the write conditions not met.
|
284
|
+
"""
|
285
|
+
if not self.disabled and (self._is_write_needed or force):
|
139
286
|
# IMPORTANT: keep this as the first code line in this method! The
|
140
287
|
# code that follows might still emit logging messages, which will
|
141
288
|
# end up triggering this method again, causing an infinite loop.
|
142
289
|
self.disabled = True
|
143
290
|
|
144
|
-
artifact_store = Client().active_stack.artifact_store
|
145
291
|
try:
|
146
292
|
if self.buffer:
|
147
|
-
|
148
|
-
|
149
|
-
|
150
|
-
|
293
|
+
if self.artifact_store.config.IS_IMMUTABLE_FILESYSTEM:
|
294
|
+
if not self.log_files_not_merged:
|
295
|
+
self.next_merged_file_name = (
|
296
|
+
f"{time.time()}{LOGS_EXTENSION}"
|
151
297
|
)
|
298
|
+
log_file_ = f"{time.time()}{LOGS_EXTENSION}"
|
299
|
+
self.log_files_not_merged.append(log_file_)
|
300
|
+
with self.artifact_store.open(
|
301
|
+
os.path.join(
|
302
|
+
self.logs_uri,
|
303
|
+
log_file_,
|
304
|
+
),
|
305
|
+
"w",
|
306
|
+
) as file:
|
307
|
+
for message in self.buffer:
|
308
|
+
file.write(
|
309
|
+
remove_ansi_escape_codes(message) + "\n"
|
310
|
+
)
|
311
|
+
else:
|
312
|
+
with self.artifact_store.open(
|
313
|
+
self.logs_uri, "a"
|
314
|
+
) as file:
|
315
|
+
for message in self.buffer:
|
316
|
+
file.write(
|
317
|
+
remove_ansi_escape_codes(message) + "\n"
|
318
|
+
)
|
152
319
|
|
153
320
|
except (OSError, IOError) as e:
|
154
321
|
# This exception can be raised if there are issues with the
|
@@ -161,6 +328,66 @@ class StepLogsStorage:
|
|
161
328
|
self.last_save_time = time.time()
|
162
329
|
|
163
330
|
self.disabled = False
|
331
|
+
# merge created files on a given interval (defaults to 10 minutes)
|
332
|
+
# only runs on Immutable Filesystems
|
333
|
+
if (
|
334
|
+
self.artifact_store.config.IS_IMMUTABLE_FILESYSTEM
|
335
|
+
and time.time() - self.last_merge_time > self.merge_files_interval
|
336
|
+
):
|
337
|
+
try:
|
338
|
+
self.merge_log_files(
|
339
|
+
self.next_merged_file_name, self.log_files_not_merged
|
340
|
+
)
|
341
|
+
except (OSError, IOError) as e:
|
342
|
+
logger.error(f"Error while trying to roll up logs: {e}")
|
343
|
+
else:
|
344
|
+
self.log_files_not_merged = []
|
345
|
+
finally:
|
346
|
+
self.last_merge_time = time.time()
|
347
|
+
|
348
|
+
def merge_log_files(
|
349
|
+
self,
|
350
|
+
file_name: Optional[str] = None,
|
351
|
+
files: Optional[List[str]] = None,
|
352
|
+
) -> None:
|
353
|
+
"""Merges all log files into one in the given URI.
|
354
|
+
|
355
|
+
Called on the logging context exit.
|
356
|
+
|
357
|
+
Args:
|
358
|
+
file_name: The name of the merged log file.
|
359
|
+
files: The list of log files to merge.
|
360
|
+
"""
|
361
|
+
if self.artifact_store.config.IS_IMMUTABLE_FILESYSTEM:
|
362
|
+
files_ = files or self.artifact_store.listdir(self.logs_uri)
|
363
|
+
file_name_ = file_name or f"full_log{LOGS_EXTENSION}"
|
364
|
+
if len(files_) > 1:
|
365
|
+
files_.sort()
|
366
|
+
logger.debug("Log files count: %s", len(files_))
|
367
|
+
|
368
|
+
try:
|
369
|
+
# dump all logs to a local file first
|
370
|
+
with self.artifact_store.open(
|
371
|
+
os.path.join(self.logs_uri, file_name_), "w"
|
372
|
+
) as merged_file:
|
373
|
+
for file in files_:
|
374
|
+
merged_file.write(
|
375
|
+
str(
|
376
|
+
_load_file_from_artifact_store(
|
377
|
+
os.path.join(self.logs_uri, str(file)),
|
378
|
+
artifact_store=self.artifact_store,
|
379
|
+
mode="r",
|
380
|
+
)
|
381
|
+
)
|
382
|
+
)
|
383
|
+
except Exception as e:
|
384
|
+
logger.warning(f"Failed to merge log files. {e}")
|
385
|
+
else:
|
386
|
+
# clean up left over files
|
387
|
+
for file in files_:
|
388
|
+
self.artifact_store.remove(
|
389
|
+
os.path.join(self.logs_uri, str(file))
|
390
|
+
)
|
164
391
|
|
165
392
|
|
166
393
|
class StepLogsStorageContext:
|
@@ -213,7 +440,8 @@ class StepLogsStorageContext:
|
|
213
440
|
|
214
441
|
Restores the `write` method of both stderr and stdout.
|
215
442
|
"""
|
216
|
-
self.storage.save_to_file()
|
443
|
+
self.storage.save_to_file(force=True)
|
444
|
+
self.storage.merge_log_files()
|
217
445
|
|
218
446
|
setattr(sys.stdout, "write", self.stdout_write)
|
219
447
|
setattr(sys.stdout, "flush", self.stdout_flush)
|
@@ -14,6 +14,7 @@
|
|
14
14
|
"""DAG (Directed Acyclic Graph) Runners."""
|
15
15
|
|
16
16
|
import threading
|
17
|
+
import time
|
17
18
|
from collections import defaultdict
|
18
19
|
from enum import Enum
|
19
20
|
from typing import Any, Callable, Dict, List
|
@@ -66,7 +67,10 @@ class ThreadedDagRunner:
|
|
66
67
|
"""
|
67
68
|
|
68
69
|
def __init__(
|
69
|
-
self,
|
70
|
+
self,
|
71
|
+
dag: Dict[str, List[str]],
|
72
|
+
run_fn: Callable[[str], Any],
|
73
|
+
parallel_node_startup_waiting_period: float = 0.0,
|
70
74
|
) -> None:
|
71
75
|
"""Define attributes and initialize all nodes in waiting state.
|
72
76
|
|
@@ -75,7 +79,12 @@ class ThreadedDagRunner:
|
|
75
79
|
E.g.: [(1->2), (1->3), (2->4), (3->4)] should be represented as
|
76
80
|
`dag={2: [1], 3: [1], 4: [2, 3]}`
|
77
81
|
run_fn: A function `run_fn(node)` that runs a single node
|
82
|
+
parallel_node_startup_waiting_period: Delay in seconds to wait in
|
83
|
+
between starting parallel nodes.
|
78
84
|
"""
|
85
|
+
self.parallel_node_startup_waiting_period = (
|
86
|
+
parallel_node_startup_waiting_period
|
87
|
+
)
|
79
88
|
self.dag = dag
|
80
89
|
self.reversed_dag = reverse_dag(dag)
|
81
90
|
self.run_fn = run_fn
|
@@ -154,9 +163,12 @@ class ThreadedDagRunner:
|
|
154
163
|
self.node_states[node] = NodeStatus.COMPLETED
|
155
164
|
|
156
165
|
# Run downstream nodes.
|
157
|
-
threads = []
|
166
|
+
threads: List[threading.Thread] = []
|
158
167
|
for downstram_node in self.reversed_dag[node]:
|
159
168
|
if self._can_run(downstram_node):
|
169
|
+
if threads and self.parallel_node_startup_waiting_period > 0:
|
170
|
+
time.sleep(self.parallel_node_startup_waiting_period)
|
171
|
+
|
160
172
|
thread = self._run_node_in_thread(downstram_node)
|
161
173
|
threads.append(thread)
|
162
174
|
|
@@ -173,9 +185,12 @@ class ThreadedDagRunner:
|
|
173
185
|
# Run all nodes that can be started immediately.
|
174
186
|
# These will, in turn, start other nodes once all of their respective
|
175
187
|
# upstream nodes have completed.
|
176
|
-
threads = []
|
188
|
+
threads: List[threading.Thread] = []
|
177
189
|
for node in self.nodes:
|
178
190
|
if self._can_run(node):
|
191
|
+
if threads and self.parallel_node_startup_waiting_period > 0:
|
192
|
+
time.sleep(self.parallel_node_startup_waiting_period)
|
193
|
+
|
179
194
|
thread = self._run_node_in_thread(node)
|
180
195
|
threads.append(thread)
|
181
196
|
|
zenml/zen_server/cloud_utils.py
CHANGED
@@ -8,6 +8,7 @@ from pydantic import BaseModel, validator
|
|
8
8
|
from requests.adapters import HTTPAdapter, Retry
|
9
9
|
|
10
10
|
from zenml.exceptions import SubscriptionUpgradeRequiredError
|
11
|
+
from zenml.zen_server.utils import server_config
|
11
12
|
|
12
13
|
ZENML_CLOUD_RBAC_ENV_PREFIX = "ZENML_CLOUD_"
|
13
14
|
|
@@ -99,7 +100,7 @@ class ZenMLCloudSession:
|
|
99
100
|
raise SubscriptionUpgradeRequiredError(response.json())
|
100
101
|
else:
|
101
102
|
raise RuntimeError(
|
102
|
-
f"Failed with the following error {response.
|
103
|
+
f"Failed with the following error {response} {response.text}"
|
103
104
|
)
|
104
105
|
|
105
106
|
return response
|
@@ -154,12 +155,29 @@ class ZenMLCloudSession:
|
|
154
155
|
A requests session with the authentication token.
|
155
156
|
"""
|
156
157
|
if self._session is None:
|
158
|
+
# Set up the session's connection pool size to match the server's
|
159
|
+
# thread pool size. This allows the server to cache one connection
|
160
|
+
# per thread, which means we can keep connections open for longer
|
161
|
+
# and avoid the overhead of setting up a new connection for each
|
162
|
+
# request.
|
163
|
+
conn_pool_size = server_config().thread_pool_size
|
164
|
+
|
157
165
|
self._session = requests.Session()
|
158
166
|
token = self._fetch_auth_token()
|
159
167
|
self._session.headers.update({"Authorization": "Bearer " + token})
|
160
168
|
|
161
169
|
retries = Retry(total=5, backoff_factor=0.1)
|
162
|
-
self._session.mount(
|
170
|
+
self._session.mount(
|
171
|
+
"https://",
|
172
|
+
HTTPAdapter(
|
173
|
+
max_retries=retries,
|
174
|
+
# We only use one connection pool to be cached because we
|
175
|
+
# only communicate with one remote server (the control
|
176
|
+
# plane)
|
177
|
+
pool_connections=1,
|
178
|
+
pool_maxsize=conn_pool_size,
|
179
|
+
),
|
180
|
+
)
|
163
181
|
|
164
182
|
return self._session
|
165
183
|
|