amzn-sagemaker-checkpointing 1.0.10__tar.gz → 1.0.11__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of amzn-sagemaker-checkpointing might be problematic. Click here for more details.

Files changed (40)
  1. {amzn_sagemaker_checkpointing-1.0.10 → amzn_sagemaker_checkpointing-1.0.11}/PKG-INFO +3 -3
  2. {amzn_sagemaker_checkpointing-1.0.10 → amzn_sagemaker_checkpointing-1.0.11}/README.md +2 -2
  3. {amzn_sagemaker_checkpointing-1.0.10 → amzn_sagemaker_checkpointing-1.0.11}/pyproject.toml +1 -1
  4. {amzn_sagemaker_checkpointing-1.0.10 → amzn_sagemaker_checkpointing-1.0.11}/src/amzn_sagemaker_checkpointing/checkpointing/filesystem/filesystem.py +117 -59
  5. {amzn_sagemaker_checkpointing-1.0.10 → amzn_sagemaker_checkpointing-1.0.11}/.crux_dry_run_build +0 -0
  6. {amzn_sagemaker_checkpointing-1.0.10 → amzn_sagemaker_checkpointing-1.0.11}/.gitignore +0 -0
  7. {amzn_sagemaker_checkpointing-1.0.10 → amzn_sagemaker_checkpointing-1.0.11}/DEVELOPING.md +0 -0
  8. {amzn_sagemaker_checkpointing-1.0.10 → amzn_sagemaker_checkpointing-1.0.11}/LICENSE.txt +0 -0
  9. {amzn_sagemaker_checkpointing-1.0.10 → amzn_sagemaker_checkpointing-1.0.11}/brazil.ion +0 -0
  10. {amzn_sagemaker_checkpointing-1.0.10 → amzn_sagemaker_checkpointing-1.0.11}/requirements/requirements-build-tools.txt +0 -0
  11. {amzn_sagemaker_checkpointing-1.0.10 → amzn_sagemaker_checkpointing-1.0.11}/requirements/requirements-hatch-build.txt +0 -0
  12. {amzn_sagemaker_checkpointing-1.0.10 → amzn_sagemaker_checkpointing-1.0.11}/requirements/requirements-hatch-static-analysis.txt +0 -0
  13. {amzn_sagemaker_checkpointing-1.0.10 → amzn_sagemaker_checkpointing-1.0.11}/requirements/requirements-hatch-test.py3.11.txt +0 -0
  14. {amzn_sagemaker_checkpointing-1.0.10 → amzn_sagemaker_checkpointing-1.0.11}/requirements/requirements-hatch-test.py3.12.txt +0 -0
  15. {amzn_sagemaker_checkpointing-1.0.10 → amzn_sagemaker_checkpointing-1.0.11}/requirements.txt +0 -0
  16. {amzn_sagemaker_checkpointing-1.0.10 → amzn_sagemaker_checkpointing-1.0.11}/src/amzn_sagemaker_checkpointing/__init__.py +0 -0
  17. {amzn_sagemaker_checkpointing-1.0.10 → amzn_sagemaker_checkpointing-1.0.11}/src/amzn_sagemaker_checkpointing/checkpointing/filesystem/__init__.py +0 -0
  18. {amzn_sagemaker_checkpointing-1.0.10 → amzn_sagemaker_checkpointing-1.0.11}/src/amzn_sagemaker_checkpointing/checkpointing/filesystem/exceptions.py +0 -0
  19. {amzn_sagemaker_checkpointing-1.0.10 → amzn_sagemaker_checkpointing-1.0.11}/src/amzn_sagemaker_checkpointing/config/__init__.py +0 -0
  20. {amzn_sagemaker_checkpointing-1.0.10 → amzn_sagemaker_checkpointing-1.0.11}/src/amzn_sagemaker_checkpointing/config/in_memory_client.py +0 -0
  21. {amzn_sagemaker_checkpointing-1.0.10 → amzn_sagemaker_checkpointing-1.0.11}/src/amzn_sagemaker_checkpointing/config/sagemaker_checkpoint_config.py +0 -0
  22. {amzn_sagemaker_checkpointing-1.0.10 → amzn_sagemaker_checkpointing-1.0.11}/src/amzn_sagemaker_checkpointing/py.typed +0 -0
  23. {amzn_sagemaker_checkpointing-1.0.10 → amzn_sagemaker_checkpointing-1.0.11}/src/amzn_sagemaker_checkpointing/storage/__init__.py +0 -0
  24. {amzn_sagemaker_checkpointing-1.0.10 → amzn_sagemaker_checkpointing-1.0.11}/src/amzn_sagemaker_checkpointing/storage/clients/__init__.py +0 -0
  25. {amzn_sagemaker_checkpointing-1.0.10 → amzn_sagemaker_checkpointing-1.0.11}/src/amzn_sagemaker_checkpointing/storage/clients/inmemory/__init__.py +0 -0
  26. {amzn_sagemaker_checkpointing-1.0.10 → amzn_sagemaker_checkpointing-1.0.11}/src/amzn_sagemaker_checkpointing/storage/clients/inmemory/checksum.py +0 -0
  27. {amzn_sagemaker_checkpointing-1.0.10 → amzn_sagemaker_checkpointing-1.0.11}/src/amzn_sagemaker_checkpointing/storage/clients/inmemory/exceptions.py +0 -0
  28. {amzn_sagemaker_checkpointing-1.0.10 → amzn_sagemaker_checkpointing-1.0.11}/src/amzn_sagemaker_checkpointing/storage/clients/inmemory/inmemory_client.py +0 -0
  29. {amzn_sagemaker_checkpointing-1.0.10 → amzn_sagemaker_checkpointing-1.0.11}/src/amzn_sagemaker_checkpointing/storage/clients/inmemory/models.py +0 -0
  30. {amzn_sagemaker_checkpointing-1.0.10 → amzn_sagemaker_checkpointing-1.0.11}/src/amzn_sagemaker_checkpointing/storage/clients/local/disk_fs.py +0 -0
  31. {amzn_sagemaker_checkpointing-1.0.10 → amzn_sagemaker_checkpointing-1.0.11}/src/amzn_sagemaker_checkpointing/storage/clients/s3/__init__.py +0 -0
  32. {amzn_sagemaker_checkpointing-1.0.10 → amzn_sagemaker_checkpointing-1.0.11}/src/amzn_sagemaker_checkpointing/storage/clients/s3/s3_client.py +0 -0
  33. {amzn_sagemaker_checkpointing-1.0.10 → amzn_sagemaker_checkpointing-1.0.11}/src/amzn_sagemaker_checkpointing/storage/clients/s3/s3_client_manager.py +0 -0
  34. {amzn_sagemaker_checkpointing-1.0.10 → amzn_sagemaker_checkpointing-1.0.11}/src/amzn_sagemaker_checkpointing/utils/logging_utils.py +0 -0
  35. {amzn_sagemaker_checkpointing-1.0.10 → amzn_sagemaker_checkpointing-1.0.11}/src/scripts/test_inmemory_client.py +0 -0
  36. {amzn_sagemaker_checkpointing-1.0.10 → amzn_sagemaker_checkpointing-1.0.11}/tests/amzn_sagemaker_checkpointing/checkpointing/filesystem/test_filesystem.py +0 -0
  37. {amzn_sagemaker_checkpointing-1.0.10 → amzn_sagemaker_checkpointing-1.0.11}/tests/amzn_sagemaker_checkpointing/storage/clients/inmemory/checksum_test.py +0 -0
  38. {amzn_sagemaker_checkpointing-1.0.10 → amzn_sagemaker_checkpointing-1.0.11}/tests/amzn_sagemaker_checkpointing/storage/clients/inmemory/test_inmemory_client.py +0 -0
  39. {amzn_sagemaker_checkpointing-1.0.10 → amzn_sagemaker_checkpointing-1.0.11}/tests/amzn_sagemaker_checkpointing/storage/clients/s3/test_s3_client.py +0 -0
  40. {amzn_sagemaker_checkpointing-1.0.10 → amzn_sagemaker_checkpointing-1.0.11}/tests/test_dummy.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: amzn-sagemaker-checkpointing
3
- Version: 1.0.10
3
+ Version: 1.0.11
4
4
  Summary: Amazon SageMaker Checkpointing Library
5
5
  License: Apache 2.0
6
6
  License-File: LICENSE.txt
@@ -95,12 +95,12 @@ following to your S3 bucket policy
95
95
  ```
96
96
 
97
97
  ## Installation
98
- ### PreRequisites
98
+ ### Prerequisites
99
99
  ```bash
100
100
  pip install s3torchconnector tenacity torch boto3 botocore
101
101
  ```
102
102
 
103
- ### Install amzn-sagemaker-checkpointing library
103
+ ### SageMaker Checkpointing Library
104
104
  ```bash
105
105
  pip install amzn-sagemaker-checkpointing
106
106
  ```
@@ -82,12 +82,12 @@ following to your S3 bucket policy
82
82
  ```
83
83
 
84
84
  ## Installation
85
- ### PreRequisites
85
+ ### Prerequisites
86
86
  ```bash
87
87
  pip install s3torchconnector tenacity torch boto3 botocore
88
88
  ```
89
89
 
90
- ### Install amzn-sagemaker-checkpointing library
90
+ ### SageMaker Checkpointing Library
91
91
  ```bash
92
92
  pip install amzn-sagemaker-checkpointing
93
93
  ```
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
4
4
 
5
5
  [project]
6
6
  name = "amzn-sagemaker-checkpointing"
7
- version = "1.0.10"
7
+ version = "1.0.11"
8
8
  description = "Amazon SageMaker Checkpointing Library"
9
9
  readme = "README.md"
10
10
  license = { "text" = "Apache 2.0" }
@@ -19,6 +19,7 @@ import pickle
19
19
  import threading
20
20
  import time
21
21
  from dataclasses import dataclass
22
+ from enum import Enum
22
23
  from logging import FileHandler
23
24
  from typing import Any, Union
24
25
 
@@ -46,9 +47,6 @@ from torch.futures import Future
46
47
  from amzn_sagemaker_checkpointing.config.sagemaker_checkpoint_config import (
47
48
  SageMakerCheckpointConfig,
48
49
  )
49
- from amzn_sagemaker_checkpointing.storage.clients.inmemory.exceptions import (
50
- InMemoryServerError,
51
- )
52
50
  from amzn_sagemaker_checkpointing.storage.clients.inmemory.inmemory_client import (
53
51
  InMemoryCheckpointClient,
54
52
  )
@@ -80,6 +78,15 @@ class _SageMakerStorageInfo:
80
78
  offset: int
81
79
  length: int
82
80
 
81
+ class StorageTier(Enum):
82
+ IN_MEMORY = 0
83
+ S3 = 1
84
+
85
+ def __str__(self):
86
+ return {
87
+ 0: "IN_MEMORY",
88
+ 1: "S3"
89
+ }[self.value]
83
90
 
84
91
  def _get_step_val(step: int, path: str | os.PathLike) -> int:
85
92
  """
@@ -791,51 +798,42 @@ class SageMakerTieredStorageReader(StorageReader):
791
798
 
792
799
  def read_metadata(self) -> Metadata:
793
800
  """
794
- Retrieve and deserialize checkpoint metadata from the in-memory storage.
801
+ Retrieve and deserialize checkpoint metadata.
795
802
 
796
803
  Returns
797
804
  -------
798
805
  Metadata
799
806
  Metadata object containing checkpoint information.
800
-
801
- Raises
802
- ------
803
- RuntimeError
804
- If metadata retrieval fails.
807
+ (or) empty Metadata if not available
805
808
  """
806
- # Use provided step or find latest available
807
- if self.step is None:
808
- self.step = self._find_latest_complete_step_across_tiers()
809
-
810
- if not self.step:
811
- self.logger.info(
812
- f"[Rank {self.rank}] Step {self.step}: No checkpoints found"
813
- )
814
- return Metadata({})
815
-
816
- # Try in-memory first (faster)
817
- metadata_buffer = self._try_read_md_from_memory(self.step)
818
- if metadata_buffer:
819
- self.logger.info(
820
- f"[Rank {self.rank}] Step {self.step}: Successfully read metadata from memory, size={len(metadata_buffer)} bytes"
821
- )
822
- return pickle.loads(metadata_buffer)
823
-
824
- self.logger.info(
825
- f"[Rank {self.rank}] Step {self.step}: In-memory metadata not found"
826
- )
827
- # Fallback to S3
828
- if self.s3_base_path:
829
- self.logger.info(
830
- f"[Rank {self.rank}] Step {self.step}: Attempting metadata read from S3"
831
- )
832
- metadata_buffer = self._try_read_md_from_s3(self.step)
833
- if metadata_buffer:
834
- self.logger.info(
835
- f"[Rank {self.rank}] Step {self.step}: Successfully read metadata from S3, size={len(metadata_buffer)} bytes"
836
- )
837
- return pickle.loads(metadata_buffer)
838
- return Metadata({})
809
+ metadata = Metadata({})
810
+ try:
811
+ if self.step is not None:
812
+ self.logger.info(f"[Rank {self.rank}] Step {self.step}: "
813
+ "reading metadata for configured step")
814
+ metadata = self._read_metadata_for_step(self.step)
815
+ else:
816
+ latest_step_all_tiers = self._get_latest_step_all_tiers()
817
+ for latest_step, tier in latest_step_all_tiers:
818
+ if tier == StorageTier.IN_MEMORY:
819
+ self.logger.info(f"[Rank {self.rank}] Attempting to read "
820
+ f"metadata from memory for {latest_step}")
821
+ step_metadata = self._read_metadata_from_memory(latest_step)
822
+ elif tier == StorageTier.S3:
823
+ self.logger.info(f"[Rank {self.rank}] Attempting to read "
824
+ f"metadata from S3 for {latest_step}")
825
+ step_metadata = self._read_metadata_from_s3(latest_step)
826
+ if step_metadata is not None:
827
+ metadata = step_metadata
828
+ self.step = latest_step
829
+ self.logger.info(f"[Rank {self.rank}] Metadata "
830
+ f"read from step {latest_step} of {tier} tier")
831
+ break
832
+ if self.step is None:
833
+ self.logger.error(f"[Rank {self.rank}] No checkpoints to read metadata")
834
+ except Exception as e:
835
+ self.logger.error(f"[Rank {self.rank}] Step {self.step}: read_metadata failed: {e}")
836
+ return metadata
839
837
 
840
838
  def read_data(self, plan: LoadPlan, planner: LoadPlanner) -> Future[None]:
841
839
  """
@@ -1083,23 +1081,6 @@ class SageMakerTieredStorageReader(StorageReader):
1083
1081
  """
1084
1082
  return True
1085
1083
 
1086
- def _find_latest_complete_step_across_tiers(self) -> int | None:
1087
- """Find latest step from both storage tiers."""
1088
- memory_step = self.client.get_latest_checkpoints(limit=1)
1089
- s3_step = self._find_latest_complete_step()
1090
- latest_step = None
1091
- if not memory_step:
1092
- latest_step = s3_step
1093
- elif not s3_step:
1094
- latest_step = memory_step[0]
1095
- else:
1096
- latest_step = max(memory_step[0], s3_step)
1097
- self.logger.info(
1098
- f"[Rank {self.rank}] Step {self.step}: Latest steps: "
1099
- f"memory:{memory_step}, s3:{s3_step}, across_tiers:{latest_step}"
1100
- )
1101
- return latest_step
1102
-
1103
1084
  def _try_read_md_from_memory(self, step: int) -> bytes | None:
1104
1085
  """Try reading metadata from in-memory storage."""
1105
1086
  try:
@@ -1252,3 +1233,80 @@ class SageMakerTieredStorageReader(StorageReader):
1252
1233
  f"[Rank {self.rank}] Failed to read item {item_index} from step {step}: {e}"
1253
1234
  )
1254
1235
  return None
1236
+
1237
+ def _read_metadata_from_memory(self, step) -> Metadata | None:
1238
+ metadata = None
1239
+ try:
1240
+ metadata_buffer = self._try_read_md_from_memory(step)
1241
+ if metadata_buffer:
1242
+ self.logger.info(
1243
+ f"[Rank {self.rank}] Step {step}: Successfully read metadata from memory, "
1244
+ f"size={len(metadata_buffer)} bytes"
1245
+ )
1246
+ metadata = pickle.loads(metadata_buffer)
1247
+ else:
1248
+ self.logger.info(
1249
+ f"[Rank {self.rank}] Step {step}: "
1250
+ f"In-memory metadata not found"
1251
+ )
1252
+ except Exception as e:
1253
+ self.logger.error(f"[Rank {self.rank}] Step {step}: _read_metadata_from_memory failed: {e}")
1254
+ return metadata
1255
+
1256
+ def _read_metadata_from_s3(self, step) -> Metadata | None:
1257
+ metadata = None
1258
+ try:
1259
+ if self.s3_base_path:
1260
+ self.logger.info(
1261
+ f"[Rank {self.rank}] Step {step}: Attempting metadata read from S3"
1262
+ )
1263
+ metadata_buffer = self._try_read_md_from_s3(step)
1264
+ if metadata_buffer:
1265
+ self.logger.info(f"[Rank {self.rank}] Step {step}: "
1266
+ f"Successfully read metadata from size={len(metadata_buffer)} bytes")
1267
+ metadata = pickle.loads(metadata_buffer)
1268
+ else:
1269
+ self.logger.info(
1270
+ f"[Rank {self.rank}] Step {step}: "
1271
+ "S3 metadata not found")
1272
+ else:
1273
+ self.logger.info(
1274
+ f"[Rank {self.rank}] Step {step}: Unable to read metadata "
1275
+ "as S3 path is not provided"
1276
+ )
1277
+ except Exception as e:
1278
+ self.logger.error(f"[Rank {self.rank}] Step {step}: _read_metadata_from_s3 failed: {e}")
1279
+ return metadata
1280
+
1281
+ def _read_metadata_for_step(self, step) -> Metadata:
1282
+ metadata = Metadata({})
1283
+ try:
1284
+ in_memory_metadata = self._read_metadata_from_memory(step)
1285
+ if in_memory_metadata is not None:
1286
+ metadata = in_memory_metadata
1287
+ else:
1288
+ s3_metadata = self._read_metadata_from_s3(step)
1289
+ if s3_metadata is not None:
1290
+ metadata = s3_metadata
1291
+ except Exception as e:
1292
+ self.logger.error(f"[Rank {self.rank}] Step {step}: _read_metadata_for_step failed: {e}")
1293
+ return metadata
1294
+
1295
+ def _get_latest_step_all_tiers(self) -> list[tuple[int, StorageTier]]:
1296
+ latest_step_all_tiers = []
1297
+ try:
1298
+ memory_steps = self.client.get_latest_checkpoints(limit=3)
1299
+ if memory_steps:
1300
+ latest_step_all_tiers = [(step, StorageTier.IN_MEMORY) for step in memory_steps]
1301
+ except Exception as e:
1302
+ self.logger.error(f"[Rank {self.rank}]: Failed to get memory steps: {e}")
1303
+ try:
1304
+ s3_step = self._find_latest_complete_step()
1305
+ if s3_step:
1306
+ latest_step_all_tiers.append((s3_step, StorageTier.S3))
1307
+ except Exception as e:
1308
+ self.logger.error(f"[Rank {self.rank}]: Failed to get S3 step: {e}")
1309
+
1310
+ latest_step_all_tiers.sort(key=lambda tier_step: (-tier_step[0], tier_step[1].value))
1311
+ self.logger.info(f"[Rank {self.rank}] Latest steps across tiers: {latest_step_all_tiers}")
1312
+ return latest_step_all_tiers