wavedl 1.4.1.tar.gz → 1.4.2.tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {wavedl-1.4.1/src/wavedl.egg-info → wavedl-1.4.2}/PKG-INFO +1 -1
- {wavedl-1.4.1 → wavedl-1.4.2}/src/wavedl/__init__.py +1 -1
- {wavedl-1.4.1 → wavedl-1.4.2}/src/wavedl/hpo.py +46 -19
- {wavedl-1.4.1 → wavedl-1.4.2}/src/wavedl/train.py +20 -13
- {wavedl-1.4.1 → wavedl-1.4.2}/src/wavedl/utils/data.py +79 -2
- {wavedl-1.4.1 → wavedl-1.4.2/src/wavedl.egg-info}/PKG-INFO +1 -1
- {wavedl-1.4.1 → wavedl-1.4.2}/LICENSE +0 -0
- {wavedl-1.4.1 → wavedl-1.4.2}/README.md +0 -0
- {wavedl-1.4.1 → wavedl-1.4.2}/pyproject.toml +0 -0
- {wavedl-1.4.1 → wavedl-1.4.2}/setup.cfg +0 -0
- {wavedl-1.4.1 → wavedl-1.4.2}/src/wavedl/hpc.py +0 -0
- {wavedl-1.4.1 → wavedl-1.4.2}/src/wavedl/models/__init__.py +0 -0
- {wavedl-1.4.1 → wavedl-1.4.2}/src/wavedl/models/_template.py +0 -0
- {wavedl-1.4.1 → wavedl-1.4.2}/src/wavedl/models/base.py +0 -0
- {wavedl-1.4.1 → wavedl-1.4.2}/src/wavedl/models/cnn.py +0 -0
- {wavedl-1.4.1 → wavedl-1.4.2}/src/wavedl/models/convnext.py +0 -0
- {wavedl-1.4.1 → wavedl-1.4.2}/src/wavedl/models/densenet.py +0 -0
- {wavedl-1.4.1 → wavedl-1.4.2}/src/wavedl/models/efficientnet.py +0 -0
- {wavedl-1.4.1 → wavedl-1.4.2}/src/wavedl/models/efficientnetv2.py +0 -0
- {wavedl-1.4.1 → wavedl-1.4.2}/src/wavedl/models/mobilenetv3.py +0 -0
- {wavedl-1.4.1 → wavedl-1.4.2}/src/wavedl/models/registry.py +0 -0
- {wavedl-1.4.1 → wavedl-1.4.2}/src/wavedl/models/regnet.py +0 -0
- {wavedl-1.4.1 → wavedl-1.4.2}/src/wavedl/models/resnet.py +0 -0
- {wavedl-1.4.1 → wavedl-1.4.2}/src/wavedl/models/resnet3d.py +0 -0
- {wavedl-1.4.1 → wavedl-1.4.2}/src/wavedl/models/swin.py +0 -0
- {wavedl-1.4.1 → wavedl-1.4.2}/src/wavedl/models/tcn.py +0 -0
- {wavedl-1.4.1 → wavedl-1.4.2}/src/wavedl/models/unet.py +0 -0
- {wavedl-1.4.1 → wavedl-1.4.2}/src/wavedl/models/vit.py +0 -0
- {wavedl-1.4.1 → wavedl-1.4.2}/src/wavedl/test.py +0 -0
- {wavedl-1.4.1 → wavedl-1.4.2}/src/wavedl/utils/__init__.py +0 -0
- {wavedl-1.4.1 → wavedl-1.4.2}/src/wavedl/utils/config.py +0 -0
- {wavedl-1.4.1 → wavedl-1.4.2}/src/wavedl/utils/cross_validation.py +0 -0
- {wavedl-1.4.1 → wavedl-1.4.2}/src/wavedl/utils/distributed.py +0 -0
- {wavedl-1.4.1 → wavedl-1.4.2}/src/wavedl/utils/losses.py +0 -0
- {wavedl-1.4.1 → wavedl-1.4.2}/src/wavedl/utils/metrics.py +0 -0
- {wavedl-1.4.1 → wavedl-1.4.2}/src/wavedl/utils/optimizers.py +0 -0
- {wavedl-1.4.1 → wavedl-1.4.2}/src/wavedl/utils/schedulers.py +0 -0
- {wavedl-1.4.1 → wavedl-1.4.2}/src/wavedl.egg-info/SOURCES.txt +0 -0
- {wavedl-1.4.1 → wavedl-1.4.2}/src/wavedl.egg-info/dependency_links.txt +0 -0
- {wavedl-1.4.1 → wavedl-1.4.2}/src/wavedl.egg-info/entry_points.txt +0 -0
- {wavedl-1.4.1 → wavedl-1.4.2}/src/wavedl.egg-info/requires.txt +0 -0
- {wavedl-1.4.1 → wavedl-1.4.2}/src/wavedl.egg-info/top_level.txt +0 -0
--- wavedl-1.4.1/src/wavedl/hpo.py
+++ wavedl-1.4.2/src/wavedl/hpo.py

@@ -145,6 +145,7 @@ def create_objective(args):
         # Use temporary directory for trial output
         with tempfile.TemporaryDirectory() as tmpdir:
             cmd.extend(["--output_dir", tmpdir])
+            history_file = Path(tmpdir) / "training_history.csv"
 
             # Run training
             try:
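The new history_file path only exists for the lifetime of the temporary directory, so it has to be consumed inside the same with block that runs the trial. A minimal, self-contained illustration of that constraint (not taken from wavedl):

import tempfile
from pathlib import Path

with tempfile.TemporaryDirectory() as tmpdir:
    history_file = Path(tmpdir) / "training_history.csv"
    history_file.write_text("epoch,val_loss\n1,0.5\n2,0.3\n")
    print(history_file.exists())   # True: the CSV is readable here

print(history_file.exists())       # False: tmpdir and the CSV are gone on exit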
@@ -156,29 +157,55 @@ def create_objective(args):
                     cwd=Path(__file__).parent,
                 )
 
-                #
-                # Look for "Best val_loss: X.XXXX" in stdout
+                # Read best val_loss from training_history.csv (reliable machine-readable)
                 val_loss = None
-                # ... 16 removed lines (old stdout-parsing logic) not rendered in this diff ...
+                if history_file.exists():
+                    try:
+                        import csv
+
+                        with open(history_file) as f:
+                            reader = csv.DictReader(f)
+                            val_losses = []
+                            for row in reader:
+                                if "val_loss" in row:
+                                    try:
+                                        val_losses.append(float(row["val_loss"]))
+                                    except (ValueError, TypeError):
+                                        pass
+                            if val_losses:
+                                val_loss = min(val_losses)  # Best (minimum) val_loss
+                    except Exception as e:
+                        print(f"Trial {trial.number}: Error reading history: {e}")
+
+                if val_loss is None:
+                    # Fallback: parse stdout for training log format
+                    # Pattern: "epoch | train_loss | val_loss | ..."
+                    # Use regex to avoid false positives from unrelated lines
+                    import re
+
+                    # Match lines like: "  42 | 0.0123 | 0.0156 | ..."
+                    log_pattern = re.compile(
+                        r"^\s*\d+\s*\|\s*[\d.]+\s*\|\s*([\d.]+)\s*\|"
+                    )
+                    val_losses_stdout = []
+                    for line in result.stdout.split("\n"):
+                        match = log_pattern.match(line)
+                        if match:
+                            try:
+                                val_losses_stdout.append(float(match.group(1)))
+                            except ValueError:
+                                continue
+                    if val_losses_stdout:
+                        val_loss = min(val_losses_stdout)
 
                 if val_loss is None:
                     # Training failed or no loss found
-                    print(f"Trial {trial.number}: Training failed")
+                    print(f"Trial {trial.number}: Training failed (no val_loss found)")
+                    if result.returncode != 0:
+                        # Show last few lines of stderr for debugging
+                        stderr_lines = result.stderr.strip().split("\n")[-3:]
+                        for line in stderr_lines:
+                            print(f"  stderr: {line}")
                     return float("inf")
 
                 print(f"Trial {trial.number}: val_loss={val_loss:.6f}")
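Stripped of the trial plumbing, the new selection logic prefers the machine-readable history CSV and only falls back to scraping the console log. A sketch of that two-stage parse, assuming (as the diff implies) a val_loss column in training_history.csv and pipe-separated epoch rows on stdout; the helper name is illustrative and not part of wavedl:

import csv
import re
from pathlib import Path

LOG_ROW = re.compile(r"^\s*\d+\s*\|\s*[\d.]+\s*\|\s*([\d.]+)\s*\|")

def best_val_loss(history_file: Path, stdout: str) -> float | None:
    """Prefer the CSV written by training; otherwise scrape the console log."""
    losses = []
    if history_file.exists():
        with open(history_file) as f:
            for row in csv.DictReader(f):
                try:
                    losses.append(float(row["val_loss"]))
                except (KeyError, ValueError, TypeError):
                    continue
    if not losses:
        # Fallback: match rows like "  42 | 0.0123 | 0.0156 | ..." and take the val column.
        for line in stdout.splitlines():
            m = LOG_ROW.match(line)
            if m:
                try:
                    losses.append(float(m.group(1)))
                except ValueError:
                    continue
    return min(losses) if losses else None

print(best_val_loss(Path("training_history.csv"), "  1 | 0.0421 | 0.0378 | ..."))  # 0.0378 if no CSV is present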
--- wavedl-1.4.1/src/wavedl/train.py
+++ wavedl-1.4.2/src/wavedl/train.py

@@ -851,7 +851,7 @@ def main():
         val_mae_sum = torch.zeros(out_dim, device=accelerator.device)
         val_samples = 0
 
-        # Accumulate predictions locally
+        # Accumulate predictions locally ON CPU to prevent GPU OOM
         local_preds = []
         local_targets = []
 
@@ -867,17 +867,19 @@ def main():
             mae_batch = torch.abs((pred - y) * phys_scale).sum(dim=0)
             val_mae_sum += mae_batch
 
-            # Store
-            local_preds.append(pred)
-            local_targets.append(y)
+            # Store on CPU (critical for large val sets)
+            local_preds.append(pred.detach().cpu())
+            local_targets.append(y.detach().cpu())
 
-        #
-
-
-        all_preds = accelerator.gather_for_metrics(all_local_preds)
-        all_targets = accelerator.gather_for_metrics(all_local_targets)
+        # Concatenate locally on CPU (no GPU memory spike)
+        cpu_preds = torch.cat(local_preds)
+        cpu_targets = torch.cat(local_targets)
 
-        #
+        # Gather to rank 0 only via gather_object (avoids all-gather to every rank)
+        # gather_object returns list of objects from each rank: [(preds0, targs0), (preds1, targs1), ...]
+        gathered = accelerator.gather_object((cpu_preds, cpu_targets))
+
+        # Synchronize validation metrics (scalars only - efficient)
         val_loss_scalar = val_loss_sum.item()
         val_metrics = torch.cat(
             [
@@ -900,9 +902,14 @@ def main():
 
         # ==================== LOGGING & CHECKPOINTING ====================
         if accelerator.is_main_process:
-            #
-
-
+            # Concatenate gathered tensors from all ranks (only on rank 0)
+            # gathered is list of tuples: [(preds_rank0, targs_rank0), (preds_rank1, targs_rank1), ...]
+            all_preds = torch.cat([item[0] for item in gathered])
+            all_targets = torch.cat([item[1] for item in gathered])
+
+            # Scientific metrics - cast to float32 before numpy
+            y_pred = all_preds.float().numpy()
+            y_true = all_targets.float().numpy()
 
             # Trim DDP padding
             real_len = len(val_dl.dataset)
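The three train.py hunks implement one idea: keep per-batch validation outputs on the CPU, gather the per-rank CPU tensors with accelerator.gather_object, and only materialize the full arrays on the main process. A single-process sketch of the memory pattern with toy shapes (the tensors and the final metric are made up; the cross-rank gather itself is exactly the gather_object call shown above):

import torch

device = "cuda" if torch.cuda.is_available() else "cpu"
local_preds, local_targets = [], []

for _ in range(3):                              # stand-in for the validation loop
    pred = torch.rand(8, 4, device=device)      # model output for one batch
    y = torch.rand(8, 4, device=device)         # matching targets
    # Moving each batch to the CPU right away keeps GPU memory flat regardless
    # of how large the validation set is.
    local_preds.append(pred.detach().cpu())
    local_targets.append(y.detach().cpu())

# Concatenation also happens on the CPU, so there is no GPU memory spike.
cpu_preds = torch.cat(local_preds)
cpu_targets = torch.cat(local_targets)

# Final metrics work on float32 NumPy arrays, as in the diff.
y_pred = cpu_preds.float().numpy()
y_true = cpu_targets.float().numpy()
print("MAE:", float(abs(y_pred - y_true).mean()))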
--- wavedl-1.4.1/src/wavedl/utils/data.py
+++ wavedl-1.4.2/src/wavedl/utils/data.py

@@ -735,7 +735,7 @@ def load_test_data(
     try:
         inp, outp = source.load(path)
     except KeyError:
-        # Try with just inputs if outputs not found
+        # Try with just inputs if outputs not found (inference-only mode)
         if format == "npz":
             data = np.load(path, allow_pickle=True)
             keys = list(data.keys())
@@ -751,6 +751,54 @@ def load_test_data(
                 )
             out_key = DataSource._find_key(keys, custom_output_keys)
             outp = data[out_key] if out_key else None
+        elif format == "hdf5":
+            # HDF5: input-only loading for inference
+            with h5py.File(path, "r") as f:
+                keys = list(f.keys())
+                inp_key = DataSource._find_key(keys, custom_input_keys)
+                if inp_key is None:
+                    raise KeyError(
+                        f"Input key not found. Tried: {custom_input_keys}. Found: {keys}"
+                    )
+                # Check size - load_test_data is eager, large files should use DataLoader
+                n_samples = f[inp_key].shape[0]
+                if n_samples > 100000:
+                    raise ValueError(
+                        f"Dataset has {n_samples:,} samples. load_test_data() loads "
+                        f"everything into RAM which may cause OOM. For large inference "
+                        f"sets, use a DataLoader with HDF5Source.load_mmap() instead."
+                    )
+                inp = f[inp_key][:]
+                out_key = DataSource._find_key(keys, custom_output_keys)
+                outp = f[out_key][:] if out_key else None
+        elif format == "mat":
+            # MAT v7.3: input-only loading with proper sparse handling
+            mat_source = MATSource()
+            with h5py.File(path, "r") as f:
+                keys = list(f.keys())
+                inp_key = DataSource._find_key(keys, custom_input_keys)
+                if inp_key is None:
+                    raise KeyError(
+                        f"Input key not found. Tried: {custom_input_keys}. Found: {keys}"
+                    )
+                # Check size - load_test_data is eager, large files should use DataLoader
+                n_samples = f[inp_key].shape[-1]  # MAT is transposed
+                if n_samples > 100000:
+                    raise ValueError(
+                        f"Dataset has {n_samples:,} samples. load_test_data() loads "
+                        f"everything into RAM which may cause OOM. For large inference "
+                        f"sets, use a DataLoader with MATSource.load_mmap() instead."
+                    )
+                # Use _load_dataset for sparse support and proper transpose
+                inp = mat_source._load_dataset(f, inp_key)
+                out_key = DataSource._find_key(keys, custom_output_keys)
+                if out_key:
+                    outp = mat_source._load_dataset(f, out_key)
+                    # Handle 1D outputs that become (1, N) after transpose
+                    if outp.ndim == 2 and outp.shape[0] == 1:
+                        outp = outp.T
+                else:
+                    outp = None
         else:
             raise
 
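A self-contained sketch of the input-only loading idea added for HDF5: find the first matching input key, refuse to eagerly load very large sets, and return None for the outputs. The candidate key names and demo file below are illustrative; only the 100,000-sample guard mirrors the diff:

import h5py
import numpy as np

INPUT_KEYS = ("x", "inputs", "waves")   # hypothetical candidate key names
MAX_EAGER_SAMPLES = 100_000             # same threshold as the diff

def load_inputs_only(path):
    with h5py.File(path, "r") as f:
        keys = list(f.keys())
        inp_key = next((k for k in INPUT_KEYS if k in keys), None)
        if inp_key is None:
            raise KeyError(f"Input key not found. Tried: {INPUT_KEYS}. Found: {keys}")
        if f[inp_key].shape[0] > MAX_EAGER_SAMPLES:
            raise ValueError("Too many samples for eager loading; use a memory-mapped loader.")
        return f[inp_key][:], None      # inference-only: no targets

# Tiny demo file so the sketch actually runs.
with h5py.File("demo_inference.h5", "w") as f:
    f.create_dataset("waves", data=np.random.rand(16, 128).astype("float32"))

inp, outp = load_inputs_only("demo_inference.h5")
print(inp.shape, outp)                  # (16, 128) None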
@@ -949,6 +997,15 @@ def prepare_data(
         with open(META_FILE, "rb") as f:
             meta = pickle.load(f)
             cached_data_path = meta.get("data_path", None)
+            cached_file_size = meta.get("file_size", None)
+            cached_file_mtime = meta.get("file_mtime", None)
+
+            # Get current file stats
+            current_stats = os.stat(args.data_path)
+            current_size = current_stats.st_size
+            current_mtime = current_stats.st_mtime
+
+            # Check if data path changed
             if cached_data_path != os.path.abspath(args.data_path):
                 if accelerator.is_main_process:
                     logger.warning(
@@ -958,6 +1015,23 @@ def prepare_data(
                         f"  Invalidating cache and regenerating..."
                     )
                 cache_exists = False
+            # Check if file was modified (size or mtime changed)
+            elif cached_file_size is not None and cached_file_size != current_size:
+                if accelerator.is_main_process:
+                    logger.warning(
+                        f"⚠️ Data file size changed!\n"
+                        f"   Cached size: {cached_file_size:,} bytes\n"
+                        f"   Current size: {current_size:,} bytes\n"
+                        f"   Invalidating cache and regenerating..."
+                    )
+                cache_exists = False
+            elif cached_file_mtime is not None and cached_file_mtime != current_mtime:
+                if accelerator.is_main_process:
+                    logger.warning(
+                        "⚠️ Data file was modified!\n"
+                        "   Cache may be stale, regenerating..."
+                    )
+                cache_exists = False
     except Exception:
         cache_exists = False
 
@@ -1053,13 +1127,16 @@ def prepare_data(
                 f"  Shape Detected: {full_shape} [{dim_type}] | Output Dim: {out_dim}"
             )
 
-        # Save metadata (including data path for cache validation)
+        # Save metadata (including data path, size, mtime for cache validation)
+        file_stats = os.stat(args.data_path)
         with open(META_FILE, "wb") as f:
             pickle.dump(
                 {
                     "shape": full_shape,
                     "out_dim": out_dim,
                     "data_path": os.path.abspath(args.data_path),
+                    "file_size": file_stats.st_size,
+                    "file_mtime": file_stats.st_mtime,
                 },
                 f,
             )
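Taken together, the prepare_data hunks implement a simple staleness rule: the preprocessing cache is reused only if the data file's absolute path, size, and mtime all match what was recorded when the cache was written. A standalone sketch of that rule (file names and helper functions are made up):

import os
import pickle

META_FILE = "cache_meta.pkl"

def save_meta(data_path):
    stats = os.stat(data_path)
    with open(META_FILE, "wb") as f:
        pickle.dump(
            {
                "data_path": os.path.abspath(data_path),
                "file_size": stats.st_size,
                "file_mtime": stats.st_mtime,
            },
            f,
        )

def cache_is_valid(data_path):
    try:
        with open(META_FILE, "rb") as f:
            meta = pickle.load(f)
        stats = os.stat(data_path)
        return (
            meta.get("data_path") == os.path.abspath(data_path)
            and meta.get("file_size") == stats.st_size
            and meta.get("file_mtime") == stats.st_mtime
        )
    except Exception:
        return False

# Demo: record stats for a file, then append a byte and watch the cache invalidate.
with open("data.bin", "wb") as f:
    f.write(b"\x00" * 1024)
save_meta("data.bin")
print(cache_is_valid("data.bin"))   # True
with open("data.bin", "ab") as f:
    f.write(b"\x00")
print(cache_is_valid("data.bin"))   # False: size (and mtime) changed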