wavedl-1.4.5-py3-none-any.whl → wavedl-1.4.6-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- wavedl/__init__.py +1 -1
- wavedl/hpc.py +11 -2
- wavedl/hpo.py +51 -2
- wavedl/test.py +13 -7
- wavedl/train.py +27 -3
- {wavedl-1.4.5.dist-info → wavedl-1.4.6.dist-info}/METADATA +13 -11
- {wavedl-1.4.5.dist-info → wavedl-1.4.6.dist-info}/RECORD +11 -11
- {wavedl-1.4.5.dist-info → wavedl-1.4.6.dist-info}/LICENSE +0 -0
- {wavedl-1.4.5.dist-info → wavedl-1.4.6.dist-info}/WHEEL +0 -0
- {wavedl-1.4.5.dist-info → wavedl-1.4.6.dist-info}/entry_points.txt +0 -0
- {wavedl-1.4.5.dist-info → wavedl-1.4.6.dist-info}/top_level.txt +0 -0
wavedl/__init__.py
CHANGED
wavedl/hpc.py
CHANGED
|
@@ -174,7 +174,9 @@ Environment Variables:
|
|
|
174
174
|
return args, remaining
|
|
175
175
|
|
|
176
176
|
|
|
177
|
-
def print_summary(
|
|
177
|
+
def print_summary(
|
|
178
|
+
exit_code: int, wandb_enabled: bool, wandb_mode: str, wandb_dir: str
|
|
179
|
+
) -> None:
|
|
178
180
|
"""Print post-training summary and instructions."""
|
|
179
181
|
print()
|
|
180
182
|
print("=" * 40)
|
|
@@ -183,7 +185,8 @@ def print_summary(exit_code: int, wandb_mode: str, wandb_dir: str) -> None:
|
|
|
183
185
|
print("✅ Training completed successfully!")
|
|
184
186
|
print("=" * 40)
|
|
185
187
|
|
|
186
|
-
if
|
|
188
|
+
# Only show WandB sync instructions if user enabled wandb
|
|
189
|
+
if wandb_enabled and wandb_mode == "offline":
|
|
187
190
|
print()
|
|
188
191
|
print("📊 WandB Sync Instructions:")
|
|
189
192
|
print(" From the login node, run:")
|
|
@@ -237,6 +240,10 @@ def main() -> int:
|
|
|
237
240
|
f"--dynamo_backend={args.dynamo_backend}",
|
|
238
241
|
]
|
|
239
242
|
|
|
243
|
+
# Explicitly set multi_gpu to suppress accelerate auto-detection warning
|
|
244
|
+
if num_gpus > 1:
|
|
245
|
+
cmd.append("--multi_gpu")
|
|
246
|
+
|
|
240
247
|
# Add multi-node networking args if specified (required for some clusters)
|
|
241
248
|
if args.main_process_ip:
|
|
242
249
|
cmd.append(f"--main_process_ip={args.main_process_ip}")
|
|
@@ -263,8 +270,10 @@ def main() -> int:
|
|
|
263
270
|
exit_code = 130
|
|
264
271
|
|
|
265
272
|
# Print summary
|
|
273
|
+
wandb_enabled = "--wandb" in train_args
|
|
266
274
|
print_summary(
|
|
267
275
|
exit_code,
|
|
276
|
+
wandb_enabled,
|
|
268
277
|
os.environ.get("WANDB_MODE", "offline"),
|
|
269
278
|
os.environ.get("WANDB_DIR", "/tmp/wandb"),
|
|
270
279
|
)
|
wavedl/hpo.py
CHANGED
|
@@ -31,7 +31,7 @@ try:
|
|
|
31
31
|
import optuna
|
|
32
32
|
from optuna.trial import TrialState
|
|
33
33
|
except ImportError:
|
|
34
|
-
print("Error: Optuna not installed. Run: pip install
|
|
34
|
+
print("Error: Optuna not installed. Run: pip install wavedl")
|
|
35
35
|
sys.exit(1)
|
|
36
36
|
|
|
37
37
|
|
|
@@ -147,6 +147,32 @@ def create_objective(args):
|
|
|
147
147
|
cmd.extend(["--output_dir", tmpdir])
|
|
148
148
|
history_file = Path(tmpdir) / "training_history.csv"
|
|
149
149
|
|
|
150
|
+
# GPU isolation for parallel trials: assign each trial to a specific GPU
|
|
151
|
+
# This prevents multiple trials from competing for all GPUs
|
|
152
|
+
env = None
|
|
153
|
+
if args.n_jobs > 1:
|
|
154
|
+
import os
|
|
155
|
+
|
|
156
|
+
# Detect available GPUs
|
|
157
|
+
n_gpus = 1
|
|
158
|
+
try:
|
|
159
|
+
import subprocess as sp
|
|
160
|
+
|
|
161
|
+
result_gpu = sp.run(
|
|
162
|
+
["nvidia-smi", "--list-gpus"],
|
|
163
|
+
capture_output=True,
|
|
164
|
+
text=True,
|
|
165
|
+
)
|
|
166
|
+
if result_gpu.returncode == 0:
|
|
167
|
+
n_gpus = len(result_gpu.stdout.strip().split("\n"))
|
|
168
|
+
except Exception:
|
|
169
|
+
pass
|
|
170
|
+
|
|
171
|
+
# Assign trial to a specific GPU (round-robin)
|
|
172
|
+
gpu_id = trial.number % n_gpus
|
|
173
|
+
env = os.environ.copy()
|
|
174
|
+
env["CUDA_VISIBLE_DEVICES"] = str(gpu_id)
|
|
175
|
+
|
|
150
176
|
# Run training
|
|
151
177
|
try:
|
|
152
178
|
result = subprocess.run(
|
|
@@ -155,6 +181,7 @@ def create_objective(args):
|
|
|
155
181
|
text=True,
|
|
156
182
|
timeout=args.timeout,
|
|
157
183
|
cwd=Path(__file__).parent,
|
|
184
|
+
env=env,
|
|
158
185
|
)
|
|
159
186
|
|
|
160
187
|
# Read best val_loss from training_history.csv (reliable machine-readable)
|
|
@@ -248,7 +275,10 @@ Examples:
|
|
|
248
275
|
"--n_trials", type=int, default=50, help="Number of HPO trials (default: 50)"
|
|
249
276
|
)
|
|
250
277
|
parser.add_argument(
|
|
251
|
-
"--n_jobs",
|
|
278
|
+
"--n_jobs",
|
|
279
|
+
type=int,
|
|
280
|
+
default=-1,
|
|
281
|
+
help="Parallel trials (-1 = auto-detect GPUs, default: -1)",
|
|
252
282
|
)
|
|
253
283
|
parser.add_argument(
|
|
254
284
|
"--quick",
|
|
@@ -315,11 +345,30 @@ Examples:
|
|
|
315
345
|
|
|
316
346
|
args = parser.parse_args()
|
|
317
347
|
|
|
348
|
+
# Convert to absolute path (child processes may run in different cwd)
|
|
349
|
+
args.data_path = str(Path(args.data_path).resolve())
|
|
350
|
+
|
|
318
351
|
# Validate data path
|
|
319
352
|
if not Path(args.data_path).exists():
|
|
320
353
|
print(f"Error: Data file not found: {args.data_path}")
|
|
321
354
|
sys.exit(1)
|
|
322
355
|
|
|
356
|
+
# Auto-detect GPUs for n_jobs if not specified
|
|
357
|
+
if args.n_jobs == -1:
|
|
358
|
+
try:
|
|
359
|
+
result_gpu = subprocess.run(
|
|
360
|
+
["nvidia-smi", "--list-gpus"],
|
|
361
|
+
capture_output=True,
|
|
362
|
+
text=True,
|
|
363
|
+
)
|
|
364
|
+
if result_gpu.returncode == 0:
|
|
365
|
+
args.n_jobs = max(1, len(result_gpu.stdout.strip().split("\n")))
|
|
366
|
+
else:
|
|
367
|
+
args.n_jobs = 1
|
|
368
|
+
except Exception:
|
|
369
|
+
args.n_jobs = 1
|
|
370
|
+
print(f"Auto-detected {args.n_jobs} GPU(s) for parallel trials")
|
|
371
|
+
|
|
323
372
|
# Create study
|
|
324
373
|
print("=" * 60)
|
|
325
374
|
print("WaveDL Hyperparameter Optimization")
|
wavedl/test.py
CHANGED
|
@@ -366,13 +366,19 @@ def load_checkpoint(
|
|
|
366
366
|
logging.info(f" Building model: {model_name}")
|
|
367
367
|
model = build_model(model_name, in_shape=in_shape, out_size=out_size)
|
|
368
368
|
|
|
369
|
-
# Load weights (
|
|
370
|
-
weight_path =
|
|
371
|
-
|
|
372
|
-
|
|
373
|
-
|
|
374
|
-
|
|
375
|
-
|
|
369
|
+
# Load weights (check multiple formats in order of preference)
|
|
370
|
+
weight_path = None
|
|
371
|
+
for fname in ["model.safetensors", "model.bin", "pytorch_model.bin"]:
|
|
372
|
+
candidate = checkpoint_dir / fname
|
|
373
|
+
if candidate.exists():
|
|
374
|
+
weight_path = candidate
|
|
375
|
+
break
|
|
376
|
+
|
|
377
|
+
if weight_path is None:
|
|
378
|
+
raise FileNotFoundError(
|
|
379
|
+
f"No model weights found in {checkpoint_dir}. "
|
|
380
|
+
f"Expected one of: model.safetensors, model.bin, pytorch_model.bin"
|
|
381
|
+
)
|
|
376
382
|
|
|
377
383
|
if HAS_SAFETENSORS and weight_path.suffix == ".safetensors":
|
|
378
384
|
state_dict = load_safetensors(str(weight_path))
|
wavedl/train.py
CHANGED
|
@@ -148,6 +148,24 @@ torch.set_float32_matmul_precision("high") # Use TF32 for float32 ops
|
|
|
148
148
|
torch.backends.cudnn.benchmark = True
|
|
149
149
|
|
|
150
150
|
|
|
151
|
+
# ==============================================================================
|
|
152
|
+
# LOGGING UTILITIES
|
|
153
|
+
# ==============================================================================
|
|
154
|
+
from contextlib import contextmanager
|
|
155
|
+
|
|
156
|
+
|
|
157
|
+
@contextmanager
|
|
158
|
+
def suppress_accelerate_logging():
|
|
159
|
+
"""Temporarily suppress accelerate's verbose checkpoint save messages."""
|
|
160
|
+
accelerate_logger = logging.getLogger("accelerate.checkpointing")
|
|
161
|
+
original_level = accelerate_logger.level
|
|
162
|
+
accelerate_logger.setLevel(logging.WARNING)
|
|
163
|
+
try:
|
|
164
|
+
yield
|
|
165
|
+
finally:
|
|
166
|
+
accelerate_logger.setLevel(original_level)
|
|
167
|
+
|
|
168
|
+
|
|
151
169
|
# ==============================================================================
|
|
152
170
|
# ARGUMENT PARSING
|
|
153
171
|
# ==============================================================================
|
|
@@ -1033,7 +1051,8 @@ def main():
|
|
|
1033
1051
|
# Step 3: Save checkpoint with all ranks participating
|
|
1034
1052
|
if is_best_epoch:
|
|
1035
1053
|
ckpt_dir = os.path.join(args.output_dir, "best_checkpoint")
|
|
1036
|
-
|
|
1054
|
+
with suppress_accelerate_logging():
|
|
1055
|
+
accelerator.save_state(ckpt_dir, safe_serialization=False)
|
|
1037
1056
|
|
|
1038
1057
|
# Step 4: Rank 0 handles metadata and updates tracking variables
|
|
1039
1058
|
if accelerator.is_main_process:
|
|
@@ -1096,7 +1115,8 @@ def main():
|
|
|
1096
1115
|
if periodic_checkpoint_needed:
|
|
1097
1116
|
ckpt_name = f"epoch_{epoch + 1}_checkpoint"
|
|
1098
1117
|
ckpt_dir = os.path.join(args.output_dir, ckpt_name)
|
|
1099
|
-
|
|
1118
|
+
with suppress_accelerate_logging():
|
|
1119
|
+
accelerator.save_state(ckpt_dir, safe_serialization=False)
|
|
1100
1120
|
|
|
1101
1121
|
if accelerator.is_main_process:
|
|
1102
1122
|
with open(os.path.join(ckpt_dir, "training_meta.pkl"), "wb") as f:
|
|
@@ -1147,7 +1167,11 @@ def main():
|
|
|
1147
1167
|
|
|
1148
1168
|
except KeyboardInterrupt:
|
|
1149
1169
|
logger.warning("Training interrupted. Saving emergency checkpoint...")
|
|
1150
|
-
|
|
1170
|
+
with suppress_accelerate_logging():
|
|
1171
|
+
accelerator.save_state(
|
|
1172
|
+
os.path.join(args.output_dir, "interrupted_checkpoint"),
|
|
1173
|
+
safe_serialization=False,
|
|
1174
|
+
)
|
|
1151
1175
|
|
|
1152
1176
|
except Exception as e:
|
|
1153
1177
|
logger.error(f"Critical error: {e}", exc_info=True)
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.2
|
|
2
2
|
Name: wavedl
|
|
3
|
-
Version: 1.4.
|
|
3
|
+
Version: 1.4.6
|
|
4
4
|
Summary: A Scalable Deep Learning Framework for Wave-Based Inverse Problems
|
|
5
5
|
Author: Ductho Le
|
|
6
6
|
License: MIT
|
|
@@ -49,7 +49,7 @@ Requires-Dist: triton>=2.0.0; sys_platform == "linux"
|
|
|
49
49
|
|
|
50
50
|
### A Scalable Deep Learning Framework for Wave-Based Inverse Problems
|
|
51
51
|
|
|
52
|
-
[](https://www.python.org/downloads/)
|
|
53
53
|
[](https://pytorch.org/)
|
|
54
54
|
[](https://huggingface.co/docs/accelerate/)
|
|
55
55
|
<br>
|
|
@@ -57,7 +57,7 @@ Requires-Dist: triton>=2.0.0; sys_platform == "linux"
|
|
|
57
57
|
[](https://github.com/ductho-le/WaveDL/actions/workflows/lint.yml)
|
|
58
58
|
[](https://colab.research.google.com/github/ductho-le/WaveDL/blob/main/notebooks/demo.ipynb)
|
|
59
59
|
<br>
|
|
60
|
-
[](https://pypistats.org/packages/wavedl)
|
|
61
61
|
[](LICENSE)
|
|
62
62
|
[](https://doi.org/10.5281/zenodo.18012338)
|
|
63
63
|
|
|
@@ -734,18 +734,20 @@ Automatically find the best training configuration using [Optuna](https://optuna
|
|
|
734
734
|
|
|
735
735
|
**Run HPO:**
|
|
736
736
|
|
|
737
|
-
You specify which models to search and how many trials to run:
|
|
738
737
|
```bash
|
|
739
|
-
#
|
|
740
|
-
|
|
738
|
+
# Basic HPO (auto-detects GPUs for parallel trials)
|
|
739
|
+
wavedl-hpo --data_path train.npz --models cnn --n_trials 100
|
|
741
740
|
|
|
742
|
-
# Search
|
|
743
|
-
|
|
741
|
+
# Search multiple models
|
|
742
|
+
wavedl-hpo --data_path train.npz --models cnn resnet18 efficientnet_b0 --n_trials 200
|
|
744
743
|
|
|
745
|
-
#
|
|
746
|
-
|
|
744
|
+
# Quick mode (fewer parameters, faster)
|
|
745
|
+
wavedl-hpo --data_path train.npz --models cnn --n_trials 50 --quick
|
|
747
746
|
```
|
|
748
747
|
|
|
748
|
+
> [!TIP]
|
|
749
|
+
> **Auto GPU Detection**: HPO automatically detects available GPUs and runs one trial per GPU in parallel. On a 4-GPU system, 4 trials run simultaneously. Use `--n_jobs 1` to force serial execution.
|
|
750
|
+
|
|
749
751
|
**Train with best parameters**
|
|
750
752
|
|
|
751
753
|
After HPO completes, it prints the optimal command:
|
|
@@ -784,7 +786,7 @@ accelerate launch -m wavedl.train --data_path train.npz --model cnn --lr 3.2e-4
|
|
|
784
786
|
| `--optimizers` | all 6 | Optimizers to search |
|
|
785
787
|
| `--schedulers` | all 8 | Schedulers to search |
|
|
786
788
|
| `--losses` | all 6 | Losses to search |
|
|
787
|
-
| `--n_jobs` |
|
|
789
|
+
| `--n_jobs` | `-1` | Parallel trials (-1 = auto-detect GPUs) |
|
|
788
790
|
| `--max_epochs` | `50` | Max epochs per trial |
|
|
789
791
|
| `--output` | `hpo_results.json` | Output file |
|
|
790
792
|
|
|
@@ -1,8 +1,8 @@
|
|
|
1
|
-
wavedl/__init__.py,sha256=
|
|
2
|
-
wavedl/hpc.py,sha256
|
|
3
|
-
wavedl/hpo.py,sha256=
|
|
4
|
-
wavedl/test.py,sha256=
|
|
5
|
-
wavedl/train.py,sha256=
|
|
1
|
+
wavedl/__init__.py,sha256=ItdZLt3f7sbtAMgiwUtGwwG5Cko4tPLugC_OVhfHMno,1177
|
|
2
|
+
wavedl/hpc.py,sha256=-iOjjKkXPcV_quj4vAsMBJN_zWKtD1lMRfIZZBhyGms,8756
|
|
3
|
+
wavedl/hpo.py,sha256=JQvwPgiVHj3sB9Wombn1QO4ammpuo0QAMpRee0LjkuI,14731
|
|
4
|
+
wavedl/test.py,sha256=oWGSSC7178loqOxwti-oDXUVogOqbwHL__GfoXSE5Ss,37846
|
|
5
|
+
wavedl/train.py,sha256=9l4aVW1Jd1Sq6yBr8BOoVIKUYmxASDO8XK6BqEkLLWs,50151
|
|
6
6
|
wavedl/models/__init__.py,sha256=lfSohEnAUztO14nuwayMJhPjpgySzRN3jGiyAUuBmAU,3206
|
|
7
7
|
wavedl/models/_template.py,sha256=J_D8taSPmV8lBaucN_vU-WiG98iFr7CJrZVNNX_Tdts,4600
|
|
8
8
|
wavedl/models/base.py,sha256=T9iDF9IQM2MYucG_ggQd31rieUkB2fob-nkHyNIl2ak,7337
|
|
@@ -29,9 +29,9 @@ wavedl/utils/losses.py,sha256=5762M-TBC_hz6uyj1NPbU1vZeFOJQq7fR3-j7OygJRo,7254
|
|
|
29
29
|
wavedl/utils/metrics.py,sha256=mkCpqZwl_XUpNvA5Ekjf7y-HqApafR7eR6EuA8cBdM8,37287
|
|
30
30
|
wavedl/utils/optimizers.py,sha256=PyIkJ_hRhFi_Fio81Gy5YQNhcME0JUUEl8OTSyu-0RA,6323
|
|
31
31
|
wavedl/utils/schedulers.py,sha256=e6Sf0yj8VOqkdwkUHLMyUfGfHKTX4NMr-zfgxWqCTYI,7659
|
|
32
|
-
wavedl-1.4.
|
|
33
|
-
wavedl-1.4.
|
|
34
|
-
wavedl-1.4.
|
|
35
|
-
wavedl-1.4.
|
|
36
|
-
wavedl-1.4.
|
|
37
|
-
wavedl-1.4.
|
|
32
|
+
wavedl-1.4.6.dist-info/LICENSE,sha256=cEUCvcvH-9BT9Y-CNGY__PwWONCKu9zsoIqWA-NeHJ4,1066
|
|
33
|
+
wavedl-1.4.6.dist-info/METADATA,sha256=Hnot8ui2oksCz2UXhj3FHd_Z9MtoP8MJyiMzC6eWq5s,42453
|
|
34
|
+
wavedl-1.4.6.dist-info/WHEEL,sha256=beeZ86-EfXScwlR_HKu4SllMC9wUEj_8Z_4FJ3egI2w,91
|
|
35
|
+
wavedl-1.4.6.dist-info/entry_points.txt,sha256=f1RNDkXFZwBzrBzTMFocJ6xhfTvTmaEDTi5YyDEUaF8,140
|
|
36
|
+
wavedl-1.4.6.dist-info/top_level.txt,sha256=ccneUt3D5Qzbh3bsBSSrq9bqrhGiogcWKY24ZC4Q6Xw,7
|
|
37
|
+
wavedl-1.4.6.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|