nshtrainer 0.18.0__py3-none-any.whl → 0.18.2__py3-none-any.whl
This diff compares the contents of two publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in their respective public registries.
- nshtrainer/_checkpoint/metadata.py +3 -3
- nshtrainer/_hf_hub.py +17 -1
- nshtrainer/trainer/trainer.py +1 -1
- {nshtrainer-0.18.0.dist-info → nshtrainer-0.18.2.dist-info}/METADATA +1 -1
- {nshtrainer-0.18.0.dist-info → nshtrainer-0.18.2.dist-info}/RECORD +6 -6
- {nshtrainer-0.18.0.dist-info → nshtrainer-0.18.2.dist-info}/WHEEL +0 -0
nshtrainer/_checkpoint/metadata.py
CHANGED
|
@@ -111,11 +111,11 @@ def _write_checkpoint_metadata(
|
|
|
111
111
|
try:
|
|
112
112
|
metadata_path.write_text(metadata.model_dump_json(indent=4), encoding="utf-8")
|
|
113
113
|
except Exception:
|
|
114
|
-
log.exception(f"Failed to write metadata to {
|
|
114
|
+
log.exception(f"Failed to write metadata to {metadata_path}")
|
|
115
115
|
return None
|
|
116
116
|
|
|
117
|
-
log.debug(f"Checkpoint metadata written to {
|
|
118
|
-
return
|
|
117
|
+
log.debug(f"Checkpoint metadata written to {metadata_path}")
|
|
118
|
+
return metadata_path
|
|
119
119
|
|
|
120
120
|
|
|
121
121
|
def _remove_checkpoint_metadata(checkpoint_path: Path):
|
nshtrainer/_hf_hub.py
CHANGED
|
@@ -1,3 +1,4 @@
|
|
|
1
|
+
import io
|
|
1
2
|
import logging
|
|
2
3
|
import os
|
|
3
4
|
from pathlib import Path
|
|
@@ -299,7 +300,22 @@ def _save_checkpoint_files(
|
|
|
299
300
|
# Resolve the repository name
|
|
300
301
|
repo_name = _repo_name(api, root_config)
|
|
301
302
|
|
|
303
|
+
# Let's read all the files to memory right now,
|
|
304
|
+
# in case they get used/removed by other processes.
|
|
305
|
+
# Read all the files to memory
|
|
306
|
+
file_contents: list[bytes | None] = []
|
|
302
307
|
for p in paths:
|
|
308
|
+
try:
|
|
309
|
+
with open(p, "rb") as f:
|
|
310
|
+
file_contents.append(f.read())
|
|
311
|
+
except IOError as e:
|
|
312
|
+
log.warning(f"Failed to read checkpoint file {p}: {str(e)}")
|
|
313
|
+
file_contents.append(None)
|
|
314
|
+
|
|
315
|
+
for p, contents in zip(paths, file_contents):
|
|
316
|
+
if contents is None:
|
|
317
|
+
continue
|
|
318
|
+
|
|
303
319
|
try:
|
|
304
320
|
relative_path = p.relative_to(checkpoint_dir)
|
|
305
321
|
except ValueError:
|
|
@@ -314,7 +330,7 @@ def _save_checkpoint_files(
|
|
|
314
330
|
# Upload the checkpoint file to the repository
|
|
315
331
|
try:
|
|
316
332
|
api.upload_file(
|
|
317
|
-
path_or_fileobj=
|
|
333
|
+
path_or_fileobj=io.BytesIO(contents),
|
|
318
334
|
path_in_repo=str(path_in_repo),
|
|
319
335
|
repo_id=repo_name,
|
|
320
336
|
repo_type="model",
|
nshtrainer/trainer/trainer.py
CHANGED
|
@@ -426,7 +426,7 @@ class Trainer(LightningTrainer):
|
|
|
426
426
|
metadata_path = _write_checkpoint_metadata(self, lm, filepath)
|
|
427
427
|
|
|
428
428
|
# If HF Hub is enabled, then we upload
|
|
429
|
-
if hparams.trainer.hf_hub:
|
|
429
|
+
if hparams.trainer.hf_hub and self.is_global_zero:
|
|
430
430
|
from .._hf_hub import _save_checkpoint_files
|
|
431
431
|
|
|
432
432
|
files = [f for f in (filepath, metadata_path) if f is not None]
|
{nshtrainer-0.18.0.dist-info → nshtrainer-0.18.2.dist-info}/RECORD
CHANGED
|
@@ -1,9 +1,9 @@
|
|
|
1
1
|
nshtrainer/__init__.py,sha256=39loiLLXbaGiozEsAn8mPHopxaPsek8JsgR9DD2gxtY,583
|
|
2
2
|
nshtrainer/_checkpoint/loader.py,sha256=myFObRsPdb8jBncMK73vjr5FDJIfKhF86Ec_kSjXtwg,13837
|
|
3
|
-
nshtrainer/_checkpoint/metadata.py,sha256=
|
|
3
|
+
nshtrainer/_checkpoint/metadata.py,sha256=p5e7dhVPpOGrXeuesq_7Y_RHi5lguzDAR_UXtMJXzWU,5175
|
|
4
4
|
nshtrainer/_checkpoint/saver.py,sha256=DkbCH0YeOJ71m32vAARiQdGBf0hvwwdoAV8LOFGy-0Y,1428
|
|
5
5
|
nshtrainer/_experimental/__init__.py,sha256=pEXPyI184UuDHvfh4p9Kg9nQZQZI41e4_HvNd4BK-yg,81
|
|
6
|
-
nshtrainer/_hf_hub.py,sha256=
|
|
6
|
+
nshtrainer/_hf_hub.py,sha256=Py9_8ADvMCFPaJzeE7bxm8Mgs3mEMkyWJ4pDEccTGt8,11230
|
|
7
7
|
nshtrainer/callbacks/__init__.py,sha256=4qocBDzQbLLhhbIEfvbA3SQB_Dy9ZJH7keMwPay-ZS8,2359
|
|
8
8
|
nshtrainer/callbacks/_throughput_monitor_callback.py,sha256=aJo_11rc4lo0IYOd-kHmPDtzdC4ctgXyRudkRJqH4m4,23184
|
|
9
9
|
nshtrainer/callbacks/actsave.py,sha256=qbnaKts4_dvjPeAaPtv7Ds12_vEWzaHUfg_--49NB9I,4041
|
|
@@ -77,7 +77,7 @@ nshtrainer/trainer/__init__.py,sha256=P2rmr8oBVTHk-HJHYPcUwWqDEArMbPR4_rPpATbWK3
|
|
|
77
77
|
nshtrainer/trainer/_runtime_callback.py,sha256=sd2cUdRJG-UCdQr9ruZvEYpNGNF1t2W2fuxwwVlQD9E,4164
|
|
78
78
|
nshtrainer/trainer/checkpoint_connector.py,sha256=F2tkHogbMAa5U7335sm77sZBkjEDa5v46XbJCH9Mg6c,2167
|
|
79
79
|
nshtrainer/trainer/signal_connector.py,sha256=2EzkVktlasl8PgWAKNLDZRUMY__gRlDy1HdinAU-tfU,10740
|
|
80
|
-
nshtrainer/trainer/trainer.py,sha256=
|
|
80
|
+
nshtrainer/trainer/trainer.py,sha256=TTtVkgSB_ekgDlHg24d58Vzddtkpp6ZHOTVprXdXMH0,17503
|
|
81
81
|
nshtrainer/util/_environment_info.py,sha256=gIdq9TJgzGCdcVzZxjHcwYasJ_HmEGVHbvE-KJVVtWs,24187
|
|
82
82
|
nshtrainer/util/_useful_types.py,sha256=dwZokFkIe7M5i2GR3nQ9A1lhGw06DMAFfH5atyquqSA,8000
|
|
83
83
|
nshtrainer/util/environment.py,sha256=AeW_kLl-N70wmb6L_JLz1wRj0kA70xs6RCmc9iUqczE,4159
|
|
@@ -85,6 +85,6 @@ nshtrainer/util/seed.py,sha256=Or2wMPsnQxfnZ2xfBiyMcHFIUt3tGTNeMMyOEanCkqs,280
|
|
|
85
85
|
nshtrainer/util/slurm.py,sha256=rofIU26z3SdL79SF45tNez6juou1cyDLz07oXEZb9Hg,1566
|
|
86
86
|
nshtrainer/util/typed.py,sha256=NGuDkDzFlc1fAoaXjOFZVbmj0mRFjsQi1E_hPa7Bn5U,128
|
|
87
87
|
nshtrainer/util/typing_utils.py,sha256=8ptjSSLZxlmy4FY6lzzkoGoF5fGNClo8-B_c0XHQaNU,385
|
|
88
|
-
nshtrainer-0.18.0.dist-info/METADATA,sha256=
|
|
89
|
-
nshtrainer-0.18.0.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
|
|
90
|
-
nshtrainer-0.18.0.dist-info/RECORD,,
|
|
88
|
+
nshtrainer-0.18.2.dist-info/METADATA,sha256=vev96DaxCnqJOAvvGrGOJ37OpWNFLrCdtGPN-kpnvO4,935
|
|
89
|
+
nshtrainer-0.18.2.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
|
|
90
|
+
nshtrainer-0.18.2.dist-info/RECORD,,
|
|
File without changes
|