amzn-sagemaker-checkpointing 1.0.9__tar.gz → 1.0.11__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of amzn-sagemaker-checkpointing might be problematic. Click here for more details.
- {amzn_sagemaker_checkpointing-1.0.9 → amzn_sagemaker_checkpointing-1.0.11}/PKG-INFO +6 -1
- {amzn_sagemaker_checkpointing-1.0.9 → amzn_sagemaker_checkpointing-1.0.11}/README.md +5 -0
- {amzn_sagemaker_checkpointing-1.0.9 → amzn_sagemaker_checkpointing-1.0.11}/pyproject.toml +1 -1
- {amzn_sagemaker_checkpointing-1.0.9 → amzn_sagemaker_checkpointing-1.0.11}/src/amzn_sagemaker_checkpointing/checkpointing/filesystem/filesystem.py +117 -59
- {amzn_sagemaker_checkpointing-1.0.9 → amzn_sagemaker_checkpointing-1.0.11}/.crux_dry_run_build +0 -0
- {amzn_sagemaker_checkpointing-1.0.9 → amzn_sagemaker_checkpointing-1.0.11}/.gitignore +0 -0
- {amzn_sagemaker_checkpointing-1.0.9 → amzn_sagemaker_checkpointing-1.0.11}/DEVELOPING.md +0 -0
- {amzn_sagemaker_checkpointing-1.0.9 → amzn_sagemaker_checkpointing-1.0.11}/LICENSE.txt +0 -0
- {amzn_sagemaker_checkpointing-1.0.9 → amzn_sagemaker_checkpointing-1.0.11}/brazil.ion +0 -0
- {amzn_sagemaker_checkpointing-1.0.9 → amzn_sagemaker_checkpointing-1.0.11}/requirements/requirements-build-tools.txt +0 -0
- {amzn_sagemaker_checkpointing-1.0.9 → amzn_sagemaker_checkpointing-1.0.11}/requirements/requirements-hatch-build.txt +0 -0
- {amzn_sagemaker_checkpointing-1.0.9 → amzn_sagemaker_checkpointing-1.0.11}/requirements/requirements-hatch-static-analysis.txt +0 -0
- {amzn_sagemaker_checkpointing-1.0.9 → amzn_sagemaker_checkpointing-1.0.11}/requirements/requirements-hatch-test.py3.11.txt +0 -0
- {amzn_sagemaker_checkpointing-1.0.9 → amzn_sagemaker_checkpointing-1.0.11}/requirements/requirements-hatch-test.py3.12.txt +0 -0
- {amzn_sagemaker_checkpointing-1.0.9 → amzn_sagemaker_checkpointing-1.0.11}/requirements.txt +0 -0
- {amzn_sagemaker_checkpointing-1.0.9 → amzn_sagemaker_checkpointing-1.0.11}/src/amzn_sagemaker_checkpointing/__init__.py +0 -0
- {amzn_sagemaker_checkpointing-1.0.9 → amzn_sagemaker_checkpointing-1.0.11}/src/amzn_sagemaker_checkpointing/checkpointing/filesystem/__init__.py +0 -0
- {amzn_sagemaker_checkpointing-1.0.9 → amzn_sagemaker_checkpointing-1.0.11}/src/amzn_sagemaker_checkpointing/checkpointing/filesystem/exceptions.py +0 -0
- {amzn_sagemaker_checkpointing-1.0.9 → amzn_sagemaker_checkpointing-1.0.11}/src/amzn_sagemaker_checkpointing/config/__init__.py +0 -0
- {amzn_sagemaker_checkpointing-1.0.9 → amzn_sagemaker_checkpointing-1.0.11}/src/amzn_sagemaker_checkpointing/config/in_memory_client.py +0 -0
- {amzn_sagemaker_checkpointing-1.0.9 → amzn_sagemaker_checkpointing-1.0.11}/src/amzn_sagemaker_checkpointing/config/sagemaker_checkpoint_config.py +0 -0
- {amzn_sagemaker_checkpointing-1.0.9 → amzn_sagemaker_checkpointing-1.0.11}/src/amzn_sagemaker_checkpointing/py.typed +0 -0
- {amzn_sagemaker_checkpointing-1.0.9 → amzn_sagemaker_checkpointing-1.0.11}/src/amzn_sagemaker_checkpointing/storage/__init__.py +0 -0
- {amzn_sagemaker_checkpointing-1.0.9 → amzn_sagemaker_checkpointing-1.0.11}/src/amzn_sagemaker_checkpointing/storage/clients/__init__.py +0 -0
- {amzn_sagemaker_checkpointing-1.0.9 → amzn_sagemaker_checkpointing-1.0.11}/src/amzn_sagemaker_checkpointing/storage/clients/inmemory/__init__.py +0 -0
- {amzn_sagemaker_checkpointing-1.0.9 → amzn_sagemaker_checkpointing-1.0.11}/src/amzn_sagemaker_checkpointing/storage/clients/inmemory/checksum.py +0 -0
- {amzn_sagemaker_checkpointing-1.0.9 → amzn_sagemaker_checkpointing-1.0.11}/src/amzn_sagemaker_checkpointing/storage/clients/inmemory/exceptions.py +0 -0
- {amzn_sagemaker_checkpointing-1.0.9 → amzn_sagemaker_checkpointing-1.0.11}/src/amzn_sagemaker_checkpointing/storage/clients/inmemory/inmemory_client.py +0 -0
- {amzn_sagemaker_checkpointing-1.0.9 → amzn_sagemaker_checkpointing-1.0.11}/src/amzn_sagemaker_checkpointing/storage/clients/inmemory/models.py +0 -0
- {amzn_sagemaker_checkpointing-1.0.9 → amzn_sagemaker_checkpointing-1.0.11}/src/amzn_sagemaker_checkpointing/storage/clients/local/disk_fs.py +0 -0
- {amzn_sagemaker_checkpointing-1.0.9 → amzn_sagemaker_checkpointing-1.0.11}/src/amzn_sagemaker_checkpointing/storage/clients/s3/__init__.py +0 -0
- {amzn_sagemaker_checkpointing-1.0.9 → amzn_sagemaker_checkpointing-1.0.11}/src/amzn_sagemaker_checkpointing/storage/clients/s3/s3_client.py +0 -0
- {amzn_sagemaker_checkpointing-1.0.9 → amzn_sagemaker_checkpointing-1.0.11}/src/amzn_sagemaker_checkpointing/storage/clients/s3/s3_client_manager.py +0 -0
- {amzn_sagemaker_checkpointing-1.0.9 → amzn_sagemaker_checkpointing-1.0.11}/src/amzn_sagemaker_checkpointing/utils/logging_utils.py +0 -0
- {amzn_sagemaker_checkpointing-1.0.9 → amzn_sagemaker_checkpointing-1.0.11}/src/scripts/test_inmemory_client.py +0 -0
- {amzn_sagemaker_checkpointing-1.0.9 → amzn_sagemaker_checkpointing-1.0.11}/tests/amzn_sagemaker_checkpointing/checkpointing/filesystem/test_filesystem.py +0 -0
- {amzn_sagemaker_checkpointing-1.0.9 → amzn_sagemaker_checkpointing-1.0.11}/tests/amzn_sagemaker_checkpointing/storage/clients/inmemory/checksum_test.py +0 -0
- {amzn_sagemaker_checkpointing-1.0.9 → amzn_sagemaker_checkpointing-1.0.11}/tests/amzn_sagemaker_checkpointing/storage/clients/inmemory/test_inmemory_client.py +0 -0
- {amzn_sagemaker_checkpointing-1.0.9 → amzn_sagemaker_checkpointing-1.0.11}/tests/amzn_sagemaker_checkpointing/storage/clients/s3/test_s3_client.py +0 -0
- {amzn_sagemaker_checkpointing-1.0.9 → amzn_sagemaker_checkpointing-1.0.11}/tests/test_dummy.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: amzn-sagemaker-checkpointing
|
|
3
|
-
Version: 1.0.9
|
|
3
|
+
Version: 1.0.11
|
|
4
4
|
Summary: Amazon SageMaker Checkpointing Library
|
|
5
5
|
License: Apache 2.0
|
|
6
6
|
License-File: LICENSE.txt
|
|
@@ -95,7 +95,12 @@ following to your S3 bucket policy
|
|
|
95
95
|
```
|
|
96
96
|
|
|
97
97
|
## Installation
|
|
98
|
+
### Prerequisites
|
|
99
|
+
```bash
|
|
100
|
+
pip install s3torchconnector tenacity torch boto3 botocore
|
|
101
|
+
```
|
|
98
102
|
|
|
103
|
+
### SageMaker Checkpointing Library
|
|
99
104
|
```bash
|
|
100
105
|
pip install amzn-sagemaker-checkpointing
|
|
101
106
|
```
|
|
@@ -82,7 +82,12 @@ following to your S3 bucket policy
|
|
|
82
82
|
```
|
|
83
83
|
|
|
84
84
|
## Installation
|
|
85
|
+
### Prerequisites
|
|
86
|
+
```bash
|
|
87
|
+
pip install s3torchconnector tenacity torch boto3 botocore
|
|
88
|
+
```
|
|
85
89
|
|
|
90
|
+
### SageMaker Checkpointing Library
|
|
86
91
|
```bash
|
|
87
92
|
pip install amzn-sagemaker-checkpointing
|
|
88
93
|
```
|
|
@@ -19,6 +19,7 @@ import pickle
|
|
|
19
19
|
import threading
|
|
20
20
|
import time
|
|
21
21
|
from dataclasses import dataclass
|
|
22
|
+
from enum import Enum
|
|
22
23
|
from logging import FileHandler
|
|
23
24
|
from typing import Any, Union
|
|
24
25
|
|
|
@@ -46,9 +47,6 @@ from torch.futures import Future
|
|
|
46
47
|
from amzn_sagemaker_checkpointing.config.sagemaker_checkpoint_config import (
|
|
47
48
|
SageMakerCheckpointConfig,
|
|
48
49
|
)
|
|
49
|
-
from amzn_sagemaker_checkpointing.storage.clients.inmemory.exceptions import (
|
|
50
|
-
InMemoryServerError,
|
|
51
|
-
)
|
|
52
50
|
from amzn_sagemaker_checkpointing.storage.clients.inmemory.inmemory_client import (
|
|
53
51
|
InMemoryCheckpointClient,
|
|
54
52
|
)
|
|
@@ -80,6 +78,15 @@ class _SageMakerStorageInfo:
|
|
|
80
78
|
offset: int
|
|
81
79
|
length: int
|
|
82
80
|
|
|
81
|
+
class StorageTier(Enum):
|
|
82
|
+
IN_MEMORY = 0
|
|
83
|
+
S3 = 1
|
|
84
|
+
|
|
85
|
+
def __str__(self):
|
|
86
|
+
return {
|
|
87
|
+
0: "IN_MEMORY",
|
|
88
|
+
1: "S3"
|
|
89
|
+
}[self.value]
|
|
83
90
|
|
|
84
91
|
def _get_step_val(step: int, path: str | os.PathLike) -> int:
|
|
85
92
|
"""
|
|
@@ -791,51 +798,42 @@ class SageMakerTieredStorageReader(StorageReader):
|
|
|
791
798
|
|
|
792
799
|
def read_metadata(self) -> Metadata:
|
|
793
800
|
"""
|
|
794
|
-
Retrieve and deserialize checkpoint metadata
|
|
801
|
+
Retrieve and deserialize checkpoint metadata.
|
|
795
802
|
|
|
796
803
|
Returns
|
|
797
804
|
-------
|
|
798
805
|
Metadata
|
|
799
806
|
Metadata object containing checkpoint information.
|
|
800
|
-
|
|
801
|
-
Raises
|
|
802
|
-
------
|
|
803
|
-
RuntimeError
|
|
804
|
-
If metadata retrieval fails.
|
|
807
|
+
(or) empty Metadata if not available
|
|
805
808
|
"""
|
|
806
|
-
|
|
807
|
-
|
|
808
|
-
self.step
|
|
809
|
-
|
|
810
|
-
|
|
811
|
-
|
|
812
|
-
|
|
813
|
-
|
|
814
|
-
|
|
815
|
-
|
|
816
|
-
|
|
817
|
-
|
|
818
|
-
|
|
819
|
-
|
|
820
|
-
|
|
821
|
-
|
|
822
|
-
|
|
823
|
-
|
|
824
|
-
|
|
825
|
-
|
|
826
|
-
|
|
827
|
-
|
|
828
|
-
|
|
829
|
-
|
|
830
|
-
|
|
831
|
-
|
|
832
|
-
|
|
833
|
-
|
|
834
|
-
self.logger.info(
|
|
835
|
-
f"[Rank {self.rank}] Step {self.step}: Successfully read metadata from S3, size={len(metadata_buffer)} bytes"
|
|
836
|
-
)
|
|
837
|
-
return pickle.loads(metadata_buffer)
|
|
838
|
-
return Metadata({})
|
|
809
|
+
metadata = Metadata({})
|
|
810
|
+
try:
|
|
811
|
+
if self.step is not None:
|
|
812
|
+
self.logger.info(f"[Rank {self.rank}] Step {self.step}: "
|
|
813
|
+
"reading metadata for configured step")
|
|
814
|
+
metadata = self._read_metadata_for_step(self.step)
|
|
815
|
+
else:
|
|
816
|
+
latest_step_all_tiers = self._get_latest_step_all_tiers()
|
|
817
|
+
for latest_step, tier in latest_step_all_tiers:
|
|
818
|
+
if tier == StorageTier.IN_MEMORY:
|
|
819
|
+
self.logger.info(f"[Rank {self.rank}] Attempting to read "
|
|
820
|
+
f"metadata from memory for {latest_step}")
|
|
821
|
+
step_metadata = self._read_metadata_from_memory(latest_step)
|
|
822
|
+
elif tier == StorageTier.S3:
|
|
823
|
+
self.logger.info(f"[Rank {self.rank}] Attempting to read "
|
|
824
|
+
f"metadata from S3 for {latest_step}")
|
|
825
|
+
step_metadata = self._read_metadata_from_s3(latest_step)
|
|
826
|
+
if step_metadata is not None:
|
|
827
|
+
metadata = step_metadata
|
|
828
|
+
self.step = latest_step
|
|
829
|
+
self.logger.info(f"[Rank {self.rank}] Metadata "
|
|
830
|
+
f"read from step {latest_step} of {tier} tier")
|
|
831
|
+
break
|
|
832
|
+
if self.step is None:
|
|
833
|
+
self.logger.error(f"[Rank {self.rank}] No checkpoints to read metadata")
|
|
834
|
+
except Exception as e:
|
|
835
|
+
self.logger.error(f"[Rank {self.rank}] Step {self.step}: read_metadata failed: {e}")
|
|
836
|
+
return metadata
|
|
839
837
|
|
|
840
838
|
def read_data(self, plan: LoadPlan, planner: LoadPlanner) -> Future[None]:
|
|
841
839
|
"""
|
|
@@ -1083,23 +1081,6 @@ class SageMakerTieredStorageReader(StorageReader):
|
|
|
1083
1081
|
"""
|
|
1084
1082
|
return True
|
|
1085
1083
|
|
|
1086
|
-
def _find_latest_complete_step_across_tiers(self) -> int | None:
|
|
1087
|
-
"""Find latest step from both storage tiers."""
|
|
1088
|
-
memory_step = self.client.get_latest_checkpoints(limit=1)
|
|
1089
|
-
s3_step = self._find_latest_complete_step()
|
|
1090
|
-
latest_step = None
|
|
1091
|
-
if not memory_step:
|
|
1092
|
-
latest_step = s3_step
|
|
1093
|
-
elif not s3_step:
|
|
1094
|
-
latest_step = memory_step[0]
|
|
1095
|
-
else:
|
|
1096
|
-
latest_step = max(memory_step[0], s3_step)
|
|
1097
|
-
self.logger.info(
|
|
1098
|
-
f"[Rank {self.rank}] Step {self.step}: Latest steps: "
|
|
1099
|
-
f"memory:{memory_step}, s3:{s3_step}, across_tiers:{latest_step}"
|
|
1100
|
-
)
|
|
1101
|
-
return latest_step
|
|
1102
|
-
|
|
1103
1084
|
def _try_read_md_from_memory(self, step: int) -> bytes | None:
|
|
1104
1085
|
"""Try reading metadata from in-memory storage."""
|
|
1105
1086
|
try:
|
|
@@ -1252,3 +1233,80 @@ class SageMakerTieredStorageReader(StorageReader):
|
|
|
1252
1233
|
f"[Rank {self.rank}] Failed to read item {item_index} from step {step}: {e}"
|
|
1253
1234
|
)
|
|
1254
1235
|
return None
|
|
1236
|
+
|
|
1237
|
+
def _read_metadata_from_memory(self, step) -> Metadata | None:
|
|
1238
|
+
metadata = None
|
|
1239
|
+
try:
|
|
1240
|
+
metadata_buffer = self._try_read_md_from_memory(step)
|
|
1241
|
+
if metadata_buffer:
|
|
1242
|
+
self.logger.info(
|
|
1243
|
+
f"[Rank {self.rank}] Step {step}: Successfully read metadata from memory, "
|
|
1244
|
+
f"size={len(metadata_buffer)} bytes"
|
|
1245
|
+
)
|
|
1246
|
+
metadata = pickle.loads(metadata_buffer)
|
|
1247
|
+
else:
|
|
1248
|
+
self.logger.info(
|
|
1249
|
+
f"[Rank {self.rank}] Step {step}: "
|
|
1250
|
+
f"In-memory metadata not found"
|
|
1251
|
+
)
|
|
1252
|
+
except Exception as e:
|
|
1253
|
+
self.logger.error(f"[Rank {self.rank}] Step {step}: _read_metadata_from_memory failed: {e}")
|
|
1254
|
+
return metadata
|
|
1255
|
+
|
|
1256
|
+
def _read_metadata_from_s3(self, step) -> Metadata | None:
|
|
1257
|
+
metadata = None
|
|
1258
|
+
try:
|
|
1259
|
+
if self.s3_base_path:
|
|
1260
|
+
self.logger.info(
|
|
1261
|
+
f"[Rank {self.rank}] Step {step}: Attempting metadata read from S3"
|
|
1262
|
+
)
|
|
1263
|
+
metadata_buffer = self._try_read_md_from_s3(step)
|
|
1264
|
+
if metadata_buffer:
|
|
1265
|
+
self.logger.info(f"[Rank {self.rank}] Step {step}: "
|
|
1266
|
+
f"Successfully read metadata from size={len(metadata_buffer)} bytes")
|
|
1267
|
+
metadata = pickle.loads(metadata_buffer)
|
|
1268
|
+
else:
|
|
1269
|
+
self.logger.info(
|
|
1270
|
+
f"[Rank {self.rank}] Step {step}: "
|
|
1271
|
+
"S3 metadata not found")
|
|
1272
|
+
else:
|
|
1273
|
+
self.logger.info(
|
|
1274
|
+
f"[Rank {self.rank}] Step {step}: Unable to read metadata "
|
|
1275
|
+
"as S3 path is not provided"
|
|
1276
|
+
)
|
|
1277
|
+
except Exception as e:
|
|
1278
|
+
self.logger.error(f"[Rank {self.rank}] Step {step}: _read_metadata_from_s3 failed: {e}")
|
|
1279
|
+
return metadata
|
|
1280
|
+
|
|
1281
|
+
def _read_metadata_for_step(self, step) -> Metadata:
|
|
1282
|
+
metadata = Metadata({})
|
|
1283
|
+
try:
|
|
1284
|
+
in_memory_metadata = self._read_metadata_from_memory(step)
|
|
1285
|
+
if in_memory_metadata is not None:
|
|
1286
|
+
metadata = in_memory_metadata
|
|
1287
|
+
else:
|
|
1288
|
+
s3_metadata = self._read_metadata_from_s3(step)
|
|
1289
|
+
if s3_metadata is not None:
|
|
1290
|
+
metadata = s3_metadata
|
|
1291
|
+
except Exception as e:
|
|
1292
|
+
self.logger.error(f"[Rank {self.rank}] Step {step}: _read_metadata_for_step failed: {e}")
|
|
1293
|
+
return metadata
|
|
1294
|
+
|
|
1295
|
+
def _get_latest_step_all_tiers(self) -> list[tuple[int, StorageTier]]:
|
|
1296
|
+
latest_step_all_tiers = []
|
|
1297
|
+
try:
|
|
1298
|
+
memory_steps = self.client.get_latest_checkpoints(limit=3)
|
|
1299
|
+
if memory_steps:
|
|
1300
|
+
latest_step_all_tiers = [(step, StorageTier.IN_MEMORY) for step in memory_steps]
|
|
1301
|
+
except Exception as e:
|
|
1302
|
+
self.logger.error(f"[Rank {self.rank}]: Failed to get memory steps: {e}")
|
|
1303
|
+
try:
|
|
1304
|
+
s3_step = self._find_latest_complete_step()
|
|
1305
|
+
if s3_step:
|
|
1306
|
+
latest_step_all_tiers.append((s3_step, StorageTier.S3))
|
|
1307
|
+
except Exception as e:
|
|
1308
|
+
self.logger.error(f"[Rank {self.rank}]: Failed to get S3 step: {e}")
|
|
1309
|
+
|
|
1310
|
+
latest_step_all_tiers.sort(key=lambda tier_step: (-tier_step[0], tier_step[1].value))
|
|
1311
|
+
self.logger.info(f"[Rank {self.rank}] Latest steps across tiers: {latest_step_all_tiers}")
|
|
1312
|
+
return latest_step_all_tiers
|
{amzn_sagemaker_checkpointing-1.0.9 → amzn_sagemaker_checkpointing-1.0.11}/.crux_dry_run_build
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{amzn_sagemaker_checkpointing-1.0.9 → amzn_sagemaker_checkpointing-1.0.11}/tests/test_dummy.py
RENAMED
|
File without changes
|