nshtrainer 0.18.0__py3-none-any.whl → 0.18.2__py3-none-any.whl

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the differences between the package versions as they appear in their respective public registries.
@@ -111,11 +111,11 @@ def _write_checkpoint_metadata(
111
111
  try:
112
112
  metadata_path.write_text(metadata.model_dump_json(indent=4), encoding="utf-8")
113
113
  except Exception:
114
- log.exception(f"Failed to write metadata to {checkpoint_path}")
114
+ log.exception(f"Failed to write metadata to {metadata_path}")
115
115
  return None
116
116
 
117
- log.debug(f"Checkpoint metadata written to {checkpoint_path}")
118
- return checkpoint_path
117
+ log.debug(f"Checkpoint metadata written to {metadata_path}")
118
+ return metadata_path
119
119
 
120
120
 
121
121
  def _remove_checkpoint_metadata(checkpoint_path: Path):
nshtrainer/_hf_hub.py CHANGED
@@ -1,3 +1,4 @@
1
+ import io
1
2
  import logging
2
3
  import os
3
4
  from pathlib import Path
@@ -299,7 +300,22 @@ def _save_checkpoint_files(
299
300
  # Resolve the repository name
300
301
  repo_name = _repo_name(api, root_config)
301
302
 
303
+ # Let's read all the files to memory right now,
304
+ # in case they get used/removed by other processes.
305
+ # Read all the files to memory
306
+ file_contents: list[bytes | None] = []
302
307
  for p in paths:
308
+ try:
309
+ with open(p, "rb") as f:
310
+ file_contents.append(f.read())
311
+ except IOError as e:
312
+ log.warning(f"Failed to read checkpoint file {p}: {str(e)}")
313
+ file_contents.append(None)
314
+
315
+ for p, contents in zip(paths, file_contents):
316
+ if contents is None:
317
+ continue
318
+
303
319
  try:
304
320
  relative_path = p.relative_to(checkpoint_dir)
305
321
  except ValueError:
@@ -314,7 +330,7 @@ def _save_checkpoint_files(
314
330
  # Upload the checkpoint file to the repository
315
331
  try:
316
332
  api.upload_file(
317
- path_or_fileobj=str(p.resolve().absolute()),
333
+ path_or_fileobj=io.BytesIO(contents),
318
334
  path_in_repo=str(path_in_repo),
319
335
  repo_id=repo_name,
320
336
  repo_type="model",
@@ -426,7 +426,7 @@ class Trainer(LightningTrainer):
426
426
  metadata_path = _write_checkpoint_metadata(self, lm, filepath)
427
427
 
428
428
  # If HF Hub is enabled, then we upload
429
- if hparams.trainer.hf_hub:
429
+ if hparams.trainer.hf_hub and self.is_global_zero:
430
430
  from .._hf_hub import _save_checkpoint_files
431
431
 
432
432
  files = [f for f in (filepath, metadata_path) if f is not None]
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: nshtrainer
3
- Version: 0.18.0
3
+ Version: 0.18.2
4
4
  Summary:
5
5
  Author: Nima Shoghi
6
6
  Author-email: nimashoghi@gmail.com
@@ -1,9 +1,9 @@
1
1
  nshtrainer/__init__.py,sha256=39loiLLXbaGiozEsAn8mPHopxaPsek8JsgR9DD2gxtY,583
2
2
  nshtrainer/_checkpoint/loader.py,sha256=myFObRsPdb8jBncMK73vjr5FDJIfKhF86Ec_kSjXtwg,13837
3
- nshtrainer/_checkpoint/metadata.py,sha256=_9dBLJSCgi3H98-HJLgwVr8U7yHxbQA5VB9ZYMYjFj0,5181
3
+ nshtrainer/_checkpoint/metadata.py,sha256=p5e7dhVPpOGrXeuesq_7Y_RHi5lguzDAR_UXtMJXzWU,5175
4
4
  nshtrainer/_checkpoint/saver.py,sha256=DkbCH0YeOJ71m32vAARiQdGBf0hvwwdoAV8LOFGy-0Y,1428
5
5
  nshtrainer/_experimental/__init__.py,sha256=pEXPyI184UuDHvfh4p9Kg9nQZQZI41e4_HvNd4BK-yg,81
6
- nshtrainer/_hf_hub.py,sha256=b1Na0-SyOM5xlJCH8cqjk0ggEVCPMI_z770c32JIQRY,10701
6
+ nshtrainer/_hf_hub.py,sha256=Py9_8ADvMCFPaJzeE7bxm8Mgs3mEMkyWJ4pDEccTGt8,11230
7
7
  nshtrainer/callbacks/__init__.py,sha256=4qocBDzQbLLhhbIEfvbA3SQB_Dy9ZJH7keMwPay-ZS8,2359
8
8
  nshtrainer/callbacks/_throughput_monitor_callback.py,sha256=aJo_11rc4lo0IYOd-kHmPDtzdC4ctgXyRudkRJqH4m4,23184
9
9
  nshtrainer/callbacks/actsave.py,sha256=qbnaKts4_dvjPeAaPtv7Ds12_vEWzaHUfg_--49NB9I,4041
@@ -77,7 +77,7 @@ nshtrainer/trainer/__init__.py,sha256=P2rmr8oBVTHk-HJHYPcUwWqDEArMbPR4_rPpATbWK3
77
77
  nshtrainer/trainer/_runtime_callback.py,sha256=sd2cUdRJG-UCdQr9ruZvEYpNGNF1t2W2fuxwwVlQD9E,4164
78
78
  nshtrainer/trainer/checkpoint_connector.py,sha256=F2tkHogbMAa5U7335sm77sZBkjEDa5v46XbJCH9Mg6c,2167
79
79
  nshtrainer/trainer/signal_connector.py,sha256=2EzkVktlasl8PgWAKNLDZRUMY__gRlDy1HdinAU-tfU,10740
80
- nshtrainer/trainer/trainer.py,sha256=xJBl8C-9SVT1ppmxTVwT1PIN8vZmE1erpKtKlsX2-8Y,17479
80
+ nshtrainer/trainer/trainer.py,sha256=TTtVkgSB_ekgDlHg24d58Vzddtkpp6ZHOTVprXdXMH0,17503
81
81
  nshtrainer/util/_environment_info.py,sha256=gIdq9TJgzGCdcVzZxjHcwYasJ_HmEGVHbvE-KJVVtWs,24187
82
82
  nshtrainer/util/_useful_types.py,sha256=dwZokFkIe7M5i2GR3nQ9A1lhGw06DMAFfH5atyquqSA,8000
83
83
  nshtrainer/util/environment.py,sha256=AeW_kLl-N70wmb6L_JLz1wRj0kA70xs6RCmc9iUqczE,4159
@@ -85,6 +85,6 @@ nshtrainer/util/seed.py,sha256=Or2wMPsnQxfnZ2xfBiyMcHFIUt3tGTNeMMyOEanCkqs,280
85
85
  nshtrainer/util/slurm.py,sha256=rofIU26z3SdL79SF45tNez6juou1cyDLz07oXEZb9Hg,1566
86
86
  nshtrainer/util/typed.py,sha256=NGuDkDzFlc1fAoaXjOFZVbmj0mRFjsQi1E_hPa7Bn5U,128
87
87
  nshtrainer/util/typing_utils.py,sha256=8ptjSSLZxlmy4FY6lzzkoGoF5fGNClo8-B_c0XHQaNU,385
88
- nshtrainer-0.18.0.dist-info/METADATA,sha256=uKcju9SCdP6M3h-GjX0OOcpd52_cNThmUPmMYUpBIk4,935
89
- nshtrainer-0.18.0.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
90
- nshtrainer-0.18.0.dist-info/RECORD,,
88
+ nshtrainer-0.18.2.dist-info/METADATA,sha256=vev96DaxCnqJOAvvGrGOJ37OpWNFLrCdtGPN-kpnvO4,935
89
+ nshtrainer-0.18.2.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
90
+ nshtrainer-0.18.2.dist-info/RECORD,,