nshtrainer 0.28.0__py3-none-any.whl → 0.29.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- nshtrainer/_checkpoint/metadata.py +2 -2
- nshtrainer/_checkpoint/saver.py +2 -2
- nshtrainer/_hf_hub.py +16 -4
- nshtrainer/callbacks/checkpoint/_base.py +4 -14
- nshtrainer/util/_environment_info.py +2 -2
- nshtrainer/util/path.py +4 -1
- {nshtrainer-0.28.0.dist-info → nshtrainer-0.29.1.dist-info}/METADATA +1 -1
- {nshtrainer-0.28.0.dist-info → nshtrainer-0.29.1.dist-info}/RECORD +9 -9
- {nshtrainer-0.28.0.dist-info → nshtrainer-0.29.1.dist-info}/WHEEL +0 -0
nshtrainer/_checkpoint/metadata.py
CHANGED
|
@@ -117,7 +117,7 @@ def _write_checkpoint_metadata(
|
|
|
117
117
|
try:
|
|
118
118
|
metadata_path.write_text(metadata.model_dump_json(indent=4), encoding="utf-8")
|
|
119
119
|
except Exception:
|
|
120
|
-
log.
|
|
120
|
+
log.warning(f"Failed to write metadata to {metadata_path}", exc_info=True)
|
|
121
121
|
return None
|
|
122
122
|
|
|
123
123
|
log.debug(f"Checkpoint metadata written to {metadata_path}")
|
|
@@ -129,7 +129,7 @@ def _remove_checkpoint_metadata(checkpoint_path: Path):
|
|
|
129
129
|
try:
|
|
130
130
|
path.unlink(missing_ok=True)
|
|
131
131
|
except Exception:
|
|
132
|
-
log.
|
|
132
|
+
log.warning(f"Failed to remove {path}", exc_info=True)
|
|
133
133
|
else:
|
|
134
134
|
log.debug(f"Removed {path}")
|
|
135
135
|
|
nshtrainer/_checkpoint/saver.py
CHANGED
|
@@ -25,11 +25,11 @@ def _link_checkpoint(
|
|
|
25
25
|
try:
|
|
26
26
|
if linkpath.exists():
|
|
27
27
|
if linkpath.is_dir():
|
|
28
|
-
shutil.rmtree(linkpath
|
|
28
|
+
shutil.rmtree(linkpath)
|
|
29
29
|
else:
|
|
30
30
|
linkpath.unlink(missing_ok=True)
|
|
31
31
|
except Exception:
|
|
32
|
-
log.
|
|
32
|
+
log.warning(f"Failed to remove {linkpath}", exc_info=True)
|
|
33
33
|
|
|
34
34
|
if metadata:
|
|
35
35
|
_remove_checkpoint_metadata(linkpath)
|
nshtrainer/_hf_hub.py
CHANGED
|
@@ -179,7 +179,9 @@ class HFHubCallback(NTCallbackBase):
|
|
|
179
179
|
try:
|
|
180
180
|
yield
|
|
181
181
|
except Exception:
|
|
182
|
-
log.
|
|
182
|
+
log.warning(
|
|
183
|
+
f"Failed to {opeartion}, repo_id={self._repo_id}", exc_info=True
|
|
184
|
+
)
|
|
183
185
|
else:
|
|
184
186
|
log.debug(f"Successfully {opeartion}, repo_id={self._repo_id}")
|
|
185
187
|
|
|
@@ -261,9 +263,13 @@ class HFHubCallback(NTCallbackBase):
|
|
|
261
263
|
)
|
|
262
264
|
log.info(f"Created new repository '{self.repo_id}'.")
|
|
263
265
|
except Exception:
|
|
264
|
-
log.
|
|
266
|
+
log.warning(
|
|
267
|
+
f"Failed to create repository '{self.repo_id}'", exc_info=True
|
|
268
|
+
)
|
|
265
269
|
except Exception:
|
|
266
|
-
log.
|
|
270
|
+
log.warning(
|
|
271
|
+
f"Error checking repository '{self.repo_id}'", exc_info=True
|
|
272
|
+
)
|
|
267
273
|
|
|
268
274
|
def _save_config(self, root_config: "BaseConfig"):
|
|
269
275
|
with self._with_error_handling("upload config"):
|
|
@@ -300,9 +306,15 @@ class HFHubCallback(NTCallbackBase):
|
|
|
300
306
|
|
|
301
307
|
def _save_file(self, p: _Upload):
|
|
302
308
|
with self._with_error_handling("save file"):
|
|
309
|
+
# First, read the file into memory.
|
|
310
|
+
# We do this to avoid issues with
|
|
311
|
+
# the file being moved or deleted.
|
|
312
|
+
with p.local_path.open("rb") as f:
|
|
313
|
+
data = f.read()
|
|
314
|
+
|
|
303
315
|
# Upload the checkpoint files to the repository
|
|
304
316
|
self.api.upload_file(
|
|
305
|
-
path_or_fileobj=
|
|
317
|
+
path_or_fileobj=data,
|
|
306
318
|
path_in_repo=str(p.path_in_repo),
|
|
307
319
|
repo_id=self.repo_id,
|
|
308
320
|
repo_type="model",
|
nshtrainer/callbacks/checkpoint/_base.py
CHANGED
|
@@ -9,7 +9,7 @@ from lightning.pytorch import Trainer
|
|
|
9
9
|
from lightning.pytorch.callbacks import Checkpoint
|
|
10
10
|
from typing_extensions import TypeVar, override
|
|
11
11
|
|
|
12
|
-
from ..._checkpoint.metadata import CheckpointMetadata
|
|
12
|
+
from ..._checkpoint.metadata import CheckpointMetadata
|
|
13
13
|
from ..._checkpoint.saver import _link_checkpoint, _remove_checkpoint
|
|
14
14
|
from ..base import CallbackConfigBase
|
|
15
15
|
|
|
@@ -155,29 +155,19 @@ class CheckpointBase(Checkpoint, ABC, Generic[TConfig]):
|
|
|
155
155
|
trainer.save_checkpoint(filepath, self.config.save_weights_only)
|
|
156
156
|
|
|
157
157
|
if trainer.is_global_zero:
|
|
158
|
+
# Remove old checkpoints
|
|
159
|
+
self.remove_old_checkpoints(trainer)
|
|
160
|
+
|
|
158
161
|
# Create the latest symlink
|
|
159
162
|
if (symlink_filename := self.symlink_path()) is not None:
|
|
160
163
|
symlink_path = self.dirpath / symlink_filename
|
|
161
164
|
_link_checkpoint(filepath, symlink_path, metadata=True)
|
|
162
165
|
log.debug(f"Created latest symlink: {symlink_path}")
|
|
163
166
|
|
|
164
|
-
# Remove old checkpoints
|
|
165
|
-
self.remove_old_checkpoints(trainer)
|
|
166
|
-
|
|
167
167
|
# Barrier to ensure all processes have saved the checkpoint,
|
|
168
168
|
# deleted the old checkpoints, and created the symlink before continuing
|
|
169
169
|
trainer.strategy.barrier()
|
|
170
170
|
|
|
171
|
-
# Call the on save checkpoint callback for the symlink (if it exists)
|
|
172
|
-
if (symlink_filename := self.symlink_path()) is not None:
|
|
173
|
-
from ... import _callback
|
|
174
|
-
|
|
175
|
-
symlink_path = self.dirpath / symlink_filename
|
|
176
|
-
symlink_metadata_path = _metadata_path(symlink_path)
|
|
177
|
-
_callback._call_on_checkpoint_saved(
|
|
178
|
-
trainer, symlink_path, symlink_metadata_path
|
|
179
|
-
)
|
|
180
|
-
|
|
181
171
|
def _should_skip_saving_checkpoint(self, trainer: Trainer) -> bool:
|
|
182
172
|
from lightning.pytorch.trainer.states import TrainerFn
|
|
183
173
|
|
nshtrainer/util/_environment_info.py
CHANGED
|
@@ -434,7 +434,7 @@ class EnvironmentPackageConfig(C.Config):
|
|
|
434
434
|
requires=requires,
|
|
435
435
|
)
|
|
436
436
|
except Exception:
|
|
437
|
-
log.
|
|
437
|
+
log.warning(f"Error processing package {dist.name}", exc_info=True)
|
|
438
438
|
|
|
439
439
|
except ImportError:
|
|
440
440
|
log.warning(
|
|
@@ -673,7 +673,7 @@ class GitRepositoryConfig(C.Config):
|
|
|
673
673
|
except git.InvalidGitRepositoryError:
|
|
674
674
|
draft.is_git_repo = False
|
|
675
675
|
except Exception:
|
|
676
|
-
log.
|
|
676
|
+
log.warning("Failed to get Git repository information", exc_info=True)
|
|
677
677
|
draft.is_git_repo = None
|
|
678
678
|
|
|
679
679
|
return draft.finalize()
|
nshtrainer/util/path.py
CHANGED
|
@@ -97,7 +97,10 @@ def try_symlink_or_copy(
|
|
|
97
97
|
symlink_target, target_is_directory=target_is_directory
|
|
98
98
|
)
|
|
99
99
|
except Exception:
|
|
100
|
-
log.
|
|
100
|
+
log.warning(
|
|
101
|
+
f"Failed to create symlink or copy {file_path} to {link_path}",
|
|
102
|
+
exc_info=True,
|
|
103
|
+
)
|
|
101
104
|
return False
|
|
102
105
|
else:
|
|
103
106
|
log.debug(f"Created symlink or copied {file_path} to {link_path}")
|
{nshtrainer-0.28.0.dist-info → nshtrainer-0.29.1.dist-info}/RECORD
CHANGED
|
@@ -1,16 +1,16 @@
|
|
|
1
1
|
nshtrainer/__init__.py,sha256=39loiLLXbaGiozEsAn8mPHopxaPsek8JsgR9DD2gxtY,583
|
|
2
2
|
nshtrainer/_callback.py,sha256=A1zLsTy4b_wOYnInLLXGSRdHzT2yNa6mPEql-ozm0u0,1013
|
|
3
3
|
nshtrainer/_checkpoint/loader.py,sha256=5vjg-OFChXJjgiOVv8vnV8nwTscfdDtEdxQRz6uPfDE,14158
|
|
4
|
-
nshtrainer/_checkpoint/metadata.py,sha256=
|
|
5
|
-
nshtrainer/_checkpoint/saver.py,sha256=
|
|
4
|
+
nshtrainer/_checkpoint/metadata.py,sha256=5D4PgKodzhLsmQvuF3xxkH49epKaegxi4wh_ImDTtns,4737
|
|
5
|
+
nshtrainer/_checkpoint/saver.py,sha256=MbX_WjkDtHHAf9Ms-KXDlknkjiPXVoGIe2ciO28AdZ0,1264
|
|
6
6
|
nshtrainer/_experimental/__init__.py,sha256=pEXPyI184UuDHvfh4p9Kg9nQZQZI41e4_HvNd4BK-yg,81
|
|
7
|
-
nshtrainer/_hf_hub.py,sha256=
|
|
7
|
+
nshtrainer/_hf_hub.py,sha256=0bkXkqhve5D1onMW-fCfuvVKlTn0i6jv_6uMNgZ7OHQ,12974
|
|
8
8
|
nshtrainer/callbacks/__init__.py,sha256=4qocBDzQbLLhhbIEfvbA3SQB_Dy9ZJH7keMwPay-ZS8,2359
|
|
9
9
|
nshtrainer/callbacks/_throughput_monitor_callback.py,sha256=aJo_11rc4lo0IYOd-kHmPDtzdC4ctgXyRudkRJqH4m4,23184
|
|
10
10
|
nshtrainer/callbacks/actsave.py,sha256=qbnaKts4_dvjPeAaPtv7Ds12_vEWzaHUfg_--49NB9I,4041
|
|
11
11
|
nshtrainer/callbacks/base.py,sha256=NpjeKmonJ1Kaz5_39XSn3LlDwvbGjk6WV8BpHSNCvI4,3508
|
|
12
12
|
nshtrainer/callbacks/checkpoint/__init__.py,sha256=g-3zIthupERKqWZQw-A_busQPaPRkto6iHBV-M7nK1Y,527
|
|
13
|
-
nshtrainer/callbacks/checkpoint/_base.py,sha256=
|
|
13
|
+
nshtrainer/callbacks/checkpoint/_base.py,sha256=vvlwuD-20NozYVIolGGShmUdkkNYeuwN6xCoFnK4GiU,6157
|
|
14
14
|
nshtrainer/callbacks/checkpoint/best_checkpoint.py,sha256=8BHgLAd3Tuzf5sup0guEAKF1jJiAwYsjdKBFYZw98ac,2171
|
|
15
15
|
nshtrainer/callbacks/checkpoint/last_checkpoint.py,sha256=CWWv0cSwQ1VAX26N7hAyMxbNCk26Keh39oQguBEK5To,1102
|
|
16
16
|
nshtrainer/callbacks/checkpoint/on_exception_checkpoint.py,sha256=ctT88EGT22_t_6tr5r7Sfo43cuve6XeroBnBYRMPOus,3372
|
|
@@ -79,14 +79,14 @@ nshtrainer/trainer/_runtime_callback.py,sha256=sd2cUdRJG-UCdQr9ruZvEYpNGNF1t2W2f
|
|
|
79
79
|
nshtrainer/trainer/checkpoint_connector.py,sha256=r0ir4xYSdf_jebM0x09qaO6nJsvsiRQDyM0fs80ppOQ,2347
|
|
80
80
|
nshtrainer/trainer/signal_connector.py,sha256=2EzkVktlasl8PgWAKNLDZRUMY__gRlDy1HdinAU-tfU,10740
|
|
81
81
|
nshtrainer/trainer/trainer.py,sha256=L4nYXq6Gts2sS9CQGenwEcvMET4L5vO5c60KM5Hm8Do,17544
|
|
82
|
-
nshtrainer/util/_environment_info.py,sha256=
|
|
82
|
+
nshtrainer/util/_environment_info.py,sha256=CFUUZYjXhBLWGc0jtPNOaZgYMueUDEHpEaWFA1f3GoY,24213
|
|
83
83
|
nshtrainer/util/_useful_types.py,sha256=dwZokFkIe7M5i2GR3nQ9A1lhGw06DMAFfH5atyquqSA,8000
|
|
84
84
|
nshtrainer/util/environment.py,sha256=AeW_kLl-N70wmb6L_JLz1wRj0kA70xs6RCmc9iUqczE,4159
|
|
85
|
-
nshtrainer/util/path.py,sha256=
|
|
85
|
+
nshtrainer/util/path.py,sha256=VkpuhR4GaZtSFBVqbGAvfjcrU-PR8xwiGzzwFNOWP9c,2995
|
|
86
86
|
nshtrainer/util/seed.py,sha256=Or2wMPsnQxfnZ2xfBiyMcHFIUt3tGTNeMMyOEanCkqs,280
|
|
87
87
|
nshtrainer/util/slurm.py,sha256=rofIU26z3SdL79SF45tNez6juou1cyDLz07oXEZb9Hg,1566
|
|
88
88
|
nshtrainer/util/typed.py,sha256=NGuDkDzFlc1fAoaXjOFZVbmj0mRFjsQi1E_hPa7Bn5U,128
|
|
89
89
|
nshtrainer/util/typing_utils.py,sha256=8ptjSSLZxlmy4FY6lzzkoGoF5fGNClo8-B_c0XHQaNU,385
|
|
90
|
-
nshtrainer-0.
|
|
91
|
-
nshtrainer-0.
|
|
92
|
-
nshtrainer-0.
|
|
90
|
+
nshtrainer-0.29.1.dist-info/METADATA,sha256=Qck1QY1pNnjQH9zLMyAMKVVvYMovEeIyP5zV7VlZios,916
|
|
91
|
+
nshtrainer-0.29.1.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
|
|
92
|
+
nshtrainer-0.29.1.dist-info/RECORD,,
|
|
File without changes
|