discontinuum 1.0.3__py3-none-any.whl → 1.0.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
discontinuum/_version.py CHANGED
@@ -17,5 +17,5 @@ __version__: str
 __version_tuple__: VERSION_TUPLE
 version_tuple: VERSION_TUPLE
 
-__version__ = version = '1.0.3'
-__version_tuple__ = version_tuple = (1, 0, 3)
+__version__ = version = '1.0.5'
+__version_tuple__ = version_tuple = (1, 0, 5)
discontinuum/engines/gpytorch.py CHANGED
@@ -49,7 +49,11 @@ class MarginalGPyTorch(BaseModel):
         target: Dataset,
         target_unc: Dataset = None,
         iterations: int = 100,
-        optimizer: str = "adam",
+        optimizer: str = "adamw",
+        learning_rate: float = None,
+        early_stopping: bool = False,
+        patience: int = 60,
+        gradient_noise: bool = False,
     ):
         """Fit the model to data.
 
@@ -64,7 +68,15 @@ class MarginalGPyTorch(BaseModel):
         iterations : int, optional
             Number of iterations for optimization. The default is 100.
         optimizer : str, optional
-            Optimization method. The default is "adam".
+            Optimization method. Supported: "adam", "adamw". The default is "adamw".
+        learning_rate : float, optional
+            Learning rate for optimization. If None, uses adaptive defaults.
+        early_stopping : bool, optional
+            Whether to use early stopping. The default is False.
+        patience : int, optional
+            Number of iterations to wait without improvement before stopping. The default is 60.
+        gradient_noise : bool, optional
+            Whether to inject Gaussian noise into gradients each step (std = 0.1 × current learning rate). The default is False.
         """
         self.is_fitted = True
         # setup data manager (self.dm)
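
For orientation, the expanded signature might be exercised like this. This is a sketch only: `RatingGPMarginalGPyTorch` comes from this package, but the constructor call and the `covariates`/`target` datasets are assumptions, not part of this diff.

```python
from rating_gp.models.gpytorch import RatingGPMarginalGPyTorch

model = RatingGPMarginalGPyTorch()   # constructor arguments assumed
model.fit(
    covariates=covariates,           # xarray Datasets prepared elsewhere (assumed)
    target=target,
    iterations=500,
    optimizer="adamw",               # new default; "adam" remains supported
    learning_rate=None,              # None -> 0.1 for either optimizer
    early_stopping=True,             # stop after `patience` non-improving steps at a small LR
    patience=60,
    gradient_noise=False,            # optionally perturb gradients with std = 0.1 * current LR
)
```
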
@@ -86,26 +98,143 @@ class MarginalGPyTorch(BaseModel):
         self.model.train()
         self.likelihood.train()
 
-        # Use the adam optimizer
-        if optimizer == "adam":
-            optimizer = torch.optim.Adam(self.model.parameters(), lr=0.05)  # default previously lr=0.1
+        if learning_rate is None:
+            if optimizer == "adam":
+                learning_rate = 0.1  # Aggressive default for faster convergence
+            elif optimizer == "adamw":
+                learning_rate = 0.1
+
+        if optimizer == "adamw":
+            optimizer_obj = torch.optim.AdamW(
+                self.model.parameters(),
+                lr=learning_rate,
+                betas=(0.9, 0.999),
+                eps=1e-8,
+                weight_decay=1e-2  # Stronger regularization for AdamW
+            )
+        elif optimizer == "adam":
+            optimizer_obj = torch.optim.Adam(
+                self.model.parameters(),
+                lr=learning_rate,
+                betas=(0.9, 0.999),
+                eps=1e-8,
+                weight_decay=1e-4  # Lighter regularization for Adam
+            )
         else:
-            raise NotImplementedError("Only Adam optimizer is implemented")
+            raise NotImplementedError(f"Only 'adam' and 'adamw' optimizers are supported. Got '{optimizer}'.")
+
+        # Use ReduceLROnPlateau for more stable learning rate adaptation
+        scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
+            optimizer_obj,
+            mode='min',
+            factor=0.5,  # Reduce LR by half
+            patience=max(2, patience),
+            threshold=1e-4,
+            min_lr=1e-5
+        )
 
         # "Loss" for GPs - the marginal log likelihood
         mll = gpytorch.mlls.ExactMarginalLogLikelihood(self.likelihood, self.model)
 
-        pbar = tqdm.tqdm(range(iterations), ncols=70)
+        # Training loop with stability features
+        pbar = tqdm.tqdm(range(iterations), ncols=100)  # Wider progress bar
+        jitter = 1e-6  # Dynamic jitter for numerical stability
+        best_loss = float('inf')
+        patience_counter = 0
+        min_lr_for_early_stop = 2e-5  # Stop if patience is exceeded and LR is below this
+
         for i in pbar:
-            # Zero gradients from previous iteration
-            optimizer.zero_grad()
-            # Output from model
+            # Adam/AdamW optimizer with stability features
+            optimizer_obj.zero_grad()
             output = self.model(train_x)
-            # Calc loss and backprop gradients
-            loss = -mll(output, train_y)
+
+            # Attempt loss calculation with dynamic jitter
+            try:
+                with gpytorch.settings.cholesky_jitter(jitter):
+                    loss = -mll(output, train_y)
+            except Exception as e:
+                # Increase jitter if numerical issues occur
+                jitter = min(jitter * 10, 1e-2)
+                current_lr = optimizer_obj.param_groups[0]['lr']
+                pbar.set_postfix_str(
+                    f'lr={current_lr:.1e} jitter={jitter:.1e} | Numerical issue - increasing jitter'
+                )
+                continue
+
+            # Check for NaN loss
+            if torch.isnan(loss) or torch.isinf(loss):
+                current_lr = optimizer_obj.param_groups[0]['lr']
+                pbar.set_postfix_str(
+                    f'lr={current_lr:.1e} jitter={jitter:.1e} | NaN/Inf loss detected - skipping step'
+                )
+                continue
+
             loss.backward()
-            pbar.set_postfix(loss=loss.item())
-            optimizer.step()
+
+            # Get current learning rate before gradient noise injection
+            current_lr = optimizer_obj.param_groups[0]['lr']
+
+            # Gradient noise injection (if enabled)
+            if gradient_noise:
+                gradient_noise_scale = 0.1
+                adaptive_noise = gradient_noise_scale * current_lr
+                for param in self.model.parameters():
+                    if param.grad is not None:
+                        noise = torch.normal(mean=0.0, std=adaptive_noise, size=param.grad.shape, device=param.grad.device)
+                        param.grad.add_(noise)
+
+            # Gradient clipping for stability
+            torch.nn.utils.clip_grad_norm_(self.model.parameters(), max_norm=1.0)
+
+            # Check for NaN gradients
+            has_nan_grad = False
+            for param in self.model.parameters():
+                if param.grad is not None and torch.isnan(param.grad).any():
+                    has_nan_grad = True
+                    break
+
+            if has_nan_grad:
+                # Don't update scheduler on NaN gradients - this prevents rapid LR decay.
+                # The scheduler should only respond to actual optimization progress.
+                current_lr = optimizer_obj.param_groups[0]['lr']
+
+                # Update best loss tracking (loss is still valid, just gradients are NaN)
+                if loss.item() < best_loss:
+                    best_loss = loss.item()
+                    patience_counter = 0
+                else:
+                    patience_counter += 1
+
+                # Display comprehensive info even with NaN gradients, skip normal progress update
+                pbar.set_postfix_str(
+                    f'loss={loss.item():.4f} lr={current_lr:.1e} jitter={jitter:.1e} best={best_loss:.4f} | NaN gradients - skipping step'
+                )
+                continue
+
+            optimizer_obj.step()
+
+            # Update learning rate scheduler for Adam/AdamW
+            scheduler.step(loss.item())
+            current_lr = optimizer_obj.param_groups[0]['lr']
+
+            # Early stopping check (more aggressive)
+            if loss.item() < best_loss:
+                best_loss = loss.item()
+                patience_counter = 0
+            else:
+                patience_counter += 1
+
+            # Only update progress bar if not skipped above
+            if not has_nan_grad:
+                progress_info = f'loss={loss.item():.4f} lr={current_lr:.1e} jitter={jitter:.1e} best={best_loss:.4f}'
+                if early_stopping:
+                    progress_info += f' patience={patience_counter}/{patience}'
+                pbar.set_postfix_str(progress_info)
+
+            if early_stopping and patience_counter >= patience and current_lr <= min_lr_for_early_stop:
+                print(f"\nEarly stopping triggered after {i+1} iterations")
+                print(f"Best loss: {best_loss:.6f}")
+                break
 
     @is_fitted
     def predict(self,
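
One consequence of these constants is worth spelling out: early stopping requires both `patience` non-improving iterations and a learning rate at or below `min_lr_for_early_stop = 2e-5`, which the plateau scheduler only reaches after many halvings. A quick back-of-the-envelope check (plain Python working through the constants in the diff; nothing here is part of the package):

```python
# Starting from lr=0.1, ReduceLROnPlateau halves the LR (factor=0.5, min_lr=1e-5)
# each time the loss plateaus for max(2, patience) iterations.
lr, halvings = 0.1, 0
while lr > 2e-5:            # early-stopping gate: min_lr_for_early_stop = 2e-5
    lr = max(lr * 0.5, 1e-5)
    halvings += 1
print(halvings, lr)         # 13 halvings, lr ~= 1.2e-5
```

With the default `patience=60`, each halving needs roughly 60 plateau iterations, so in practice early stopping cannot fire until well past the default 100 iterations.
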
discontinuum-1.0.3.dist-info/METADATA → discontinuum-1.0.5.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: discontinuum
-Version: 1.0.3
+Version: 1.0.5
 Summary: Estimate discontinuous timeseries from continuous covariates.
 Maintainer-email: Timothy Hodson <thodson@usgs.gov>
 License: License
@@ -124,11 +124,12 @@ However, LOADEST has several serious limitations
 the more flexible Weighted Regression on Time Discharge and Season (WRTDS),
 which allows the relation between target and covariate to vary through time.
 `loadest-gp` takes the WRTDS idea and reimplements it as a GP.
-Try it out in the [loadest-gp demo](https://code.usgs.gov/wma/uncertainty/discontinuum/-/blob/main/docs/source/notebooks/loadest-gp-demo.ipynb).
+github/thodson-usgs/discontinuum/blob/main/docs/source/notebooks/loadest-gp-demo.ipynb
+Try it out in the [loadest-gp demo](https://github.com/thodson-usgs/discontinuum/blob/main/docs/source/notebooks/loadest-gp-demo.ipynb).
 
 ### rating-gp
 `rating-gp` is a Gaussian-process model for estimating river flow from stage time series.
-Try it out in the [rating-gp demo](https://code.usgs.gov/wma/uncertainty/discontinuum/-/blob/main/docs/source/notebooks/rating-gp-demo.ipynb).
+Try it out in the [rating-gp demo](https://github.com/thodson-usgs/discontinuum/blob/main/docs/source/notebooks/rating-gp-demo.ipynb).
 
 ## Engines
 Currently, the only supported engines are the marginal likelihood implementation in `pymc` and `gpytorch`.
discontinuum-1.0.3.dist-info/RECORD → discontinuum-1.0.5.dist-info/RECORD CHANGED
@@ -1,17 +1,17 @@
 discontinuum/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-discontinuum/_version.py,sha256=jWOT7x0QtPxvOokiMtAg9lJ_WkkL0NXT4lwG0yqXne0,511
+discontinuum/_version.py,sha256=cafGA7j4exK_daS29O0wW2JM2p5b8FwYDLqAYJR0jWo,511
 discontinuum/data_manager.py,sha256=LiZoPR0nnu7YAUfh5L1ZDRfaS3dgfVIELXIHkzUKyBg,4416
 discontinuum/pipeline.py,sha256=1avuZnFai-b3HmihcpZ8M3WFNQ8lXAFSNTrnfl2NrY0,10074
 discontinuum/plot.py,sha256=eZQS6-Ydq8FFcEukPtNuDVB-weV6lHyWMyJ1hqTkVrU,2969
 discontinuum/utils.py,sha256=07hIHQk_oDlkjz7tasgBjqqPOC6D0iNcy0eu-88aNbM,1540
 discontinuum/engines/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 discontinuum/engines/base.py,sha256=OlHd4ssIQoWvYHKoVqk5fKAVBcKsIIkR4ul9iNBvaYg,2396
-discontinuum/engines/gpytorch.py,sha256=oJMNvNAwKwxQyt3j-QyRE-pjkYDv4i-qqhQfimNQ2HQ,8654
+discontinuum/engines/gpytorch.py,sha256=kRyAgCfxjKZbAJhJGViaDU_y8NO8sW4rSWRyEQlomHo,14383
 discontinuum/engines/pymc.py,sha256=phbtE-3UCSVcP1MhbXwAHIWDZWDr56wK9U7aRt-w-2o,5961
 discontinuum/providers/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 discontinuum/providers/base.py,sha256=Yn2EHS1b4fYl09-m2MYuf2P9VRUXAP-WDpSoZrCbRvY,720
 discontinuum/tests/test_pipeline.py,sha256=_FhkGxbFIxNb35lGaIdZk7Zjgs6CkxEF3gFUX3PE8EU,918
-discontinuum-1.0.3.dist-info/licenses/LICENSE.md,sha256=XElVHHnS2uQ15M_Z2giPH1vmeWMzdpGQ48ItkuZurVA,1650
+discontinuum-1.0.5.dist-info/licenses/LICENSE.md,sha256=XElVHHnS2uQ15M_Z2giPH1vmeWMzdpGQ48ItkuZurVA,1650
 loadest_gp/__init__.py,sha256=YISfvbc7Zy2y0BOxS1A2KzqxyoNJTz0EnLMnRW6iVT8,740
 loadest_gp/plot.py,sha256=x2PK7vBCc44dX9lu5YV-rvw1u4pvXSLdcrTSvYLiHMA,2595
 loadest_gp/utils.py,sha256=m5QaqR_0JiuRXPfryH8nI5lODp8PqvQla5C05WDN3LY,2772
@@ -25,11 +25,11 @@ rating_gp/pipeline.py,sha256=1HgxN6DD3ZL5lhUb3DK2in2IXiml7W4Ja272GBMTc08,1884
 rating_gp/plot.py,sha256=CJphwqWWAfIY22j5Oz5DRwj7TcQCRyIQvM79_3KEdlc,9635
 rating_gp/models/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 rating_gp/models/base.py,sha256=e2Kq644I88YLHWPNA0qyRgitF5wimdLW4618vKX-o_s,1474
-rating_gp/models/gpytorch.py,sha256=OUMKIdBgPjAxInYttKzGN2ou2zlc9V7BTUf4SuTVvwY,6043
+rating_gp/models/gpytorch.py,sha256=eFHwtnW44GZ1zz0fLx5REbzIWwnb_x_uq-cGjDcHyWs,6907
 rating_gp/models/kernels.py,sha256=3xg2mhY3aEgjI3r5vyAll9MA4c3M5UKqRi3FApNhJJQ,11579
 rating_gp/providers/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 rating_gp/providers/usgs.py,sha256=KmKYN3c8Mi-ly2l6X80WT3taEhqCPXeEcRNi9HvbJmY,8134
-discontinuum-1.0.3.dist-info/METADATA,sha256=RukIH_49qEGFmjgaqaN2wTUwR4SPpD9zUWeHNhPosZI,6231
-discontinuum-1.0.3.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-discontinuum-1.0.3.dist-info/top_level.txt,sha256=mwU_PSFrZYSJrBgqIuTJTo7Pp9ODDv6XdDed7kAagXM,34
-discontinuum-1.0.3.dist-info/RECORD,,
+discontinuum-1.0.5.dist-info/METADATA,sha256=GDh-fscmNYYMmXXoUE5Xd1QafuKEOPjgZjlssmjqGVg,6302
+discontinuum-1.0.5.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+discontinuum-1.0.5.dist-info/top_level.txt,sha256=mwU_PSFrZYSJrBgqIuTJTo7Pp9ODDv6XdDed7kAagXM,34
+discontinuum-1.0.5.dist-info/RECORD,,
rating_gp/models/gpytorch.py CHANGED
@@ -71,8 +71,9 @@ class RatingGPMarginalGPyTorch(
         # noise, *and* you did not specify noise. This is treated as a no-op."
         self.likelihood = gpytorch.likelihoods.FixedNoiseGaussianLikelihood(
             noise=noise,
+            #learn_additional_noise=False,
             learn_additional_noise=True,
-            noise_prior=gpytorch.priors.HalfNormalPrior(scale=0.01),
+            noise_prior=gpytorch.priors.HalfNormalPrior(scale=0.005),
         )
 
         model = ExactGPModel(X, y, self.likelihood)
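
For reference, the new likelihood configuration looks like this in isolation. This is a sketch: the `noise` tensor below is synthetic, whereas in the package it comes from the observed measurement uncertainty.

```python
import torch
import gpytorch

noise = torch.full((100,), 0.01)  # per-observation noise variance (illustrative)
likelihood = gpytorch.likelihoods.FixedNoiseGaussianLikelihood(
    noise=noise,
    learn_additional_noise=True,
    # Halving the prior scale (0.01 -> 0.005) pulls the learned *extra* noise
    # harder toward zero, i.e. the model leans more on the supplied noise.
    noise_prior=gpytorch.priors.HalfNormalPrior(scale=0.005),
)
```
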
@@ -109,17 +110,20 @@ class ExactGPModel(gpytorch.models.ExactGP):
         # + stage * time kernel only at low stage with smaller time length.
         # Note that stage gets transformed to q, so the kernel is actually
         # q * time
+        b_min = np.quantile(train_y, 0.10)
+        b_max = np.quantile(train_y, 0.90)
         self.covar_module = (
-            (self.cov_stage()
+            (self.cov_stage(ls_prior=GammaPrior(concentration=1, rate=1))
              * self.cov_time(ls_prior=GammaPrior(concentration=1, rate=1)))
-            + (self.cov_stage(ls_prior=GammaPrior(concentration=1, rate=2))
+            + (self.cov_stage(ls_prior=GammaPrior(concentration=3, rate=1))
                * self.cov_time(ls_prior=GammaPrior(concentration=2, rate=5))
                * SigmoidKernel(
                    active_dims=self.stage_dim,
                    # a_prior=NormalPrior(loc=20, scale=1),
+                   # b_prior=NormalPrior(loc=0.5, scale=0.2),
                    b_constraint=gpytorch.constraints.Interval(
-                       train_y.min(),
-                       train_y.max(),
+                       b_min,
+                       b_max,
                    ),
                )
            )
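
Constraining the sigmoid switch point `b` to the 10th–90th percentile of `train_y` (rather than its min/max) keeps the low-stage/high-stage transition inside the bulk of the data. A quick illustration with synthetic values (nothing here comes from the package):

```python
import numpy as np

rng = np.random.default_rng(0)
train_y = rng.normal(loc=2.0, scale=0.5, size=500)  # synthetic targets

old_bounds = (train_y.min(), train_y.max())                            # 1.0.3 constraint
new_bounds = (np.quantile(train_y, 0.10), np.quantile(train_y, 0.90))  # 1.0.5 constraint
print(old_bounds)  # roughly (0.5, 3.5): stretched by the most extreme observations
print(new_bounds)  # roughly (1.4, 2.6): ignores the tails
```
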
@@ -141,11 +145,12 @@ class ExactGPModel(gpytorch.models.ExactGP):
 
     def cov_stage(self, ls_prior=None):
         eta = HalfNormalPrior(scale=1)
-
+
         return ScaleKernel(
             MaternKernel(
                 active_dims=self.stage_dim,
                 lengthscale_prior=ls_prior,
+                nu=2.5,  # Smoother kernel (was nu=1.5)
             ),
             outputscale_prior=eta,
         )
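
The `nu` parameter controls how rough Matérn sample paths are (`nu=1.5` is once mean-square differentiable, `nu=2.5` twice), so raising it smooths the stage response. A small sketch for comparing the two, independent of the package:

```python
import torch
from gpytorch.kernels import MaternKernel

x = torch.linspace(0, 1, 200).unsqueeze(-1)
for nu in (1.5, 2.5):
    kernel = MaternKernel(nu=nu)
    kernel.lengthscale = 0.2
    # .to_dense() on gpytorch >= 1.9; older releases use .evaluate()
    K = kernel(x).to_dense() + 1e-6 * torch.eye(200)  # jitter for a valid Cholesky
    sample = torch.distributions.MultivariateNormal(torch.zeros(200), K).sample()
    # Mean step-to-step change is typically smaller for nu=2.5 at equal lengthscale.
    print(nu, float(sample.diff().abs().mean()))
```
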
@@ -153,14 +158,31 @@ class ExactGPModel(gpytorch.models.ExactGP):
     def cov_time(self, ls_prior=None):
         eta = HalfNormalPrior(scale=1)
 
-        return ScaleKernel(
+        # Base Matern kernel for long-term trends
+        base_kernel = ScaleKernel(
             MaternKernel(
                 active_dims=self.time_dim,
                 lengthscale_prior=ls_prior,
+                nu=1.5,  # was 2.5
             ),
             outputscale_prior=eta,
         )
 
+        # Periodic performs better than a locally periodic kernel
+        periodic_kernel = ScaleKernel(
+            gpytorch.kernels.PeriodicKernel(
+                active_dims=self.time_dim,
+                period_length_prior=NormalPrior(loc=1.0, scale=0.1),  # ~1 year
+                lengthscale_prior=GammaPrior(concentration=2, rate=4),
+            ),
+            outputscale_prior=HalfNormalPrior(scale=0.5),
+        )
+
+        return base_kernel + periodic_kernel
+
+
+
+
 
     def cov_stagetime(self):
         eta = HalfNormalPrior(scale=1)