discontinuum 1.0.3__py3-none-any.whl → 1.0.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
discontinuum/_version.py CHANGED
@@ -17,5 +17,5 @@ __version__: str
 __version_tuple__: VERSION_TUPLE
 version_tuple: VERSION_TUPLE
 
-__version__ = version = '1.0.3'
-__version_tuple__ = version_tuple = (1, 0, 3)
+__version__ = version = '1.0.5'
+__version_tuple__ = version_tuple = (1, 0, 5)
discontinuum/engines/gpytorch.py CHANGED
@@ -49,7 +49,11 @@ class MarginalGPyTorch(BaseModel):
         target: Dataset,
         target_unc: Dataset = None,
         iterations: int = 100,
-        optimizer: str = "adam",
+        optimizer: str = "adamw",
+        learning_rate: float = None,
+        early_stopping: bool = False,
+        patience: int = 60,
+        gradient_noise: bool = False,
     ):
         """Fit the model to data.
 
@@ -64,7 +68,15 @@ class MarginalGPyTorch(BaseModel):
         iterations : int, optional
             Number of iterations for optimization. The default is 100.
         optimizer : str, optional
-            Optimization method. The default is "adam".
+            Optimization method. Supported: "adam", "adamw". The default is "adamw".
+        learning_rate : float, optional
+            Learning rate for optimization. If None, uses adaptive defaults.
+        early_stopping : bool, optional
+            Whether to use early stopping. The default is False.
+        patience : int, optional
+            Number of iterations to wait without improvement before stopping. The default is 60.
+        gradient_noise : bool, optional
+            Whether to inject Gaussian noise into gradients each step (std = 0.1 × current learning rate). The default is False.
         """
         self.is_fitted = True
         # setup data manager (self.dm)
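
For orientation, the expanded signature might be exercised like this. This is a sketch only: `RatingGPMarginalGPyTorch` comes from this package, but the constructor call and the `covariates`/`target` datasets are assumptions, not part of this diff.

```python
from rating_gp.models.gpytorch import RatingGPMarginalGPyTorch

model = RatingGPMarginalGPyTorch()   # constructor arguments assumed
model.fit(
    covariates=covariates,           # xarray Datasets prepared elsewhere (assumed)
    target=target,
    iterations=500,
    optimizer="adamw",               # new default; "adam" remains supported
    learning_rate=None,              # None -> 0.1 for either optimizer
    early_stopping=True,             # stop after `patience` non-improving steps at a small LR
    patience=60,
    gradient_noise=False,            # optionally perturb gradients with std = 0.1 * current LR
)
```
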
@@ -86,26 +98,143 @@ class MarginalGPyTorch(BaseModel):
         self.model.train()
         self.likelihood.train()
 
-        # Use the adam optimizer
-        if optimizer == "adam":
-            optimizer = torch.optim.Adam(self.model.parameters(), lr=0.05)  # default previously lr=0.1
+        if learning_rate is None:
+            if optimizer == "adam":
+                learning_rate = 0.1  # Aggressive default for faster convergence
+            elif optimizer == "adamw":
+                learning_rate = 0.1
+
+        if optimizer == "adamw":
+            optimizer_obj = torch.optim.AdamW(
+                self.model.parameters(),
+                lr=learning_rate,
+                betas=(0.9, 0.999),
+                eps=1e-8,
+                weight_decay=1e-2  # Stronger regularization for AdamW
+            )
+        elif optimizer == "adam":
+            optimizer_obj = torch.optim.Adam(
+                self.model.parameters(),
+                lr=learning_rate,
+                betas=(0.9, 0.999),
+                eps=1e-8,
+                weight_decay=1e-4  # Lighter regularization for Adam
+            )
         else:
-            raise NotImplementedError("Only Adam optimizer is implemented")
+            raise NotImplementedError(f"Only 'adam' and 'adamw' optimizers are supported. Got '{optimizer}'.")
+
+        # Use ReduceLROnPlateau for more stable learning rate adaptation
+        scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
+            optimizer_obj,
+            mode='min',
+            factor=0.5,  # Reduce LR by half
+            patience=max(2, patience),
+            threshold=1e-4,
+            min_lr=1e-5
+        )
 
         # "Loss" for GPs - the marginal log likelihood
         mll = gpytorch.mlls.ExactMarginalLogLikelihood(self.likelihood, self.model)
 
-        pbar = tqdm.tqdm(range(iterations), ncols=70)
+        # Training loop with stability features
+        pbar = tqdm.tqdm(range(iterations), ncols=100)  # Wider progress bar
+        jitter = 1e-6  # Dynamic jitter for numerical stability
+        best_loss = float('inf')
+        patience_counter = 0
+        min_lr_for_early_stop = 2e-5  # Stop if patience is exceeded and LR is below this
+
         for i in pbar:
-            # Zero gradients from previous iteration
-            optimizer.zero_grad()
-            # Output from model
+            # Adam/AdamW optimizer with stability features
+            optimizer_obj.zero_grad()
             output = self.model(train_x)
-            # Calc loss and backprop gradients
-            loss = -mll(output, train_y)
+
+            # Attempt loss calculation with dynamic jitter
+            try:
+                with gpytorch.settings.cholesky_jitter(jitter):
+                    loss = -mll(output, train_y)
+            except Exception as e:
+                # Increase jitter if numerical issues occur
+                jitter = min(jitter * 10, 1e-2)
+                current_lr = optimizer_obj.param_groups[0]['lr']
+                pbar.set_postfix_str(
+                    f'lr={current_lr:.1e} jitter={jitter:.1e} | Numerical issue - increasing jitter'
+                )
+                continue
+
+            # Check for NaN loss
+            if torch.isnan(loss) or torch.isinf(loss):
+                current_lr = optimizer_obj.param_groups[0]['lr']
+                pbar.set_postfix_str(
+                    f'lr={current_lr:.1e} jitter={jitter:.1e} | NaN/Inf loss detected - skipping step'
+                )
+                continue
+
             loss.backward()
-            pbar.set_postfix(loss=loss.item())
-            optimizer.step()
+
+            # Get current learning rate before gradient noise injection
+            current_lr = optimizer_obj.param_groups[0]['lr']
+
+            # Gradient noise injection (if enabled)
+            if gradient_noise:
+                gradient_noise_scale = 0.1
+                adaptive_noise = gradient_noise_scale * current_lr
+                for param in self.model.parameters():
+                    if param.grad is not None:
+                        noise = torch.normal(mean=0.0, std=adaptive_noise, size=param.grad.shape, device=param.grad.device)
+                        param.grad.add_(noise)
+
+            # Gradient clipping for stability
+            torch.nn.utils.clip_grad_norm_(self.model.parameters(), max_norm=1.0)
+
+            # Check for NaN gradients
+            has_nan_grad = False
+            for param in self.model.parameters():
+                if param.grad is not None and torch.isnan(param.grad).any():
+                    has_nan_grad = True
+                    break
+
+            if has_nan_grad:
+                # Don't update scheduler on NaN gradients - this prevents rapid LR decay.
+                # The scheduler should only respond to actual optimization progress.
+                current_lr = optimizer_obj.param_groups[0]['lr']
+
+                # Update best loss tracking (loss is still valid, just gradients are NaN)
+                if loss.item() < best_loss:
+                    best_loss = loss.item()
+                    patience_counter = 0
+                else:
+                    patience_counter += 1
+
+                # Display comprehensive info even with NaN gradients, skip normal progress update
+                pbar.set_postfix_str(
+                    f'loss={loss.item():.4f} lr={current_lr:.1e} jitter={jitter:.1e} best={best_loss:.4f} | NaN gradients - skipping step'
+                )
+                continue
+
+            optimizer_obj.step()
+
+            # Update learning rate scheduler for Adam/AdamW
+            scheduler.step(loss.item())
+            current_lr = optimizer_obj.param_groups[0]['lr']
+
+            # Early stopping check (more aggressive)
+            if loss.item() < best_loss:
+                best_loss = loss.item()
+                patience_counter = 0
+            else:
+                patience_counter += 1
+
+            # Only update progress bar if not skipped above
+            if not has_nan_grad:
+                progress_info = f'loss={loss.item():.4f} lr={current_lr:.1e} jitter={jitter:.1e} best={best_loss:.4f}'
+                if early_stopping:
+                    progress_info += f' patience={patience_counter}/{patience}'
+                pbar.set_postfix_str(progress_info)
+
+            if early_stopping and patience_counter >= patience and current_lr <= min_lr_for_early_stop:
+                print(f"\nEarly stopping triggered after {i+1} iterations")
+                print(f"Best loss: {best_loss:.6f}")
+                break
 
     @is_fitted
     def predict(self,
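
One consequence of these constants is worth spelling out: early stopping requires both `patience` non-improving iterations and a learning rate at or below `min_lr_for_early_stop = 2e-5`, which the plateau scheduler only reaches after many halvings. A quick back-of-the-envelope check (plain Python working through the constants in the diff; nothing here is part of the package):

```python
# Starting from lr=0.1, ReduceLROnPlateau halves the LR (factor=0.5, min_lr=1e-5)
# each time the loss plateaus for max(2, patience) iterations.
lr, halvings = 0.1, 0
while lr > 2e-5:            # early-stopping gate: min_lr_for_early_stop = 2e-5
    lr = max(lr * 0.5, 1e-5)
    halvings += 1
print(halvings, lr)         # 13 halvings, lr ~= 1.2e-5
```

With the default `patience=60`, each halving needs roughly 60 plateau iterations, so in practice early stopping cannot fire until well past the default 100 iterations.
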
discontinuum-1.0.3.dist-info/METADATA → discontinuum-1.0.5.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: discontinuum
-Version: 1.0.3
+Version: 1.0.5
 Summary: Estimate discontinuous timeseries from continuous covariates.
 Maintainer-email: Timothy Hodson <thodson@usgs.gov>
 License: License
@@ -124,11 +124,12 @@ However, LOADEST has several serious limitations
 the more flexible Weighted Regression on Time Discharge and Season (WRTDS),
 which allows the relation between target and covariate to vary through time.
 `loadest-gp` takes the WRTDS idea and reimplements it as a GP.
-Try it out in the [loadest-gp demo](https://code.usgs.gov/wma/uncertainty/discontinuum/-/blob/main/docs/source/notebooks/loadest-gp-demo.ipynb).
+github/thodson-usgs/discontinuum/blob/main/docs/source/notebooks/loadest-gp-demo.ipynb
+Try it out in the [loadest-gp demo](https://github.com/thodson-usgs/discontinuum/blob/main/docs/source/notebooks/loadest-gp-demo.ipynb).
 
 ### rating-gp
 `rating-gp` is a Gaussian-process model for estimating river flow from stage time series.
-Try it out in the [rating-gp demo](https://code.usgs.gov/wma/uncertainty/discontinuum/-/blob/main/docs/source/notebooks/rating-gp-demo.ipynb).
+Try it out in the [rating-gp demo](https://github.com/thodson-usgs/discontinuum/blob/main/docs/source/notebooks/rating-gp-demo.ipynb).
 
 ## Engines
 Currently, the only supported engines are the marginal likelihood implementation in `pymc` and `gpytorch`.
discontinuum-1.0.3.dist-info/RECORD → discontinuum-1.0.5.dist-info/RECORD CHANGED
@@ -1,17 +1,17 @@
 discontinuum/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-discontinuum/_version.py,sha256=jWOT7x0QtPxvOokiMtAg9lJ_WkkL0NXT4lwG0yqXne0,511
+discontinuum/_version.py,sha256=cafGA7j4exK_daS29O0wW2JM2p5b8FwYDLqAYJR0jWo,511
 discontinuum/data_manager.py,sha256=LiZoPR0nnu7YAUfh5L1ZDRfaS3dgfVIELXIHkzUKyBg,4416
 discontinuum/pipeline.py,sha256=1avuZnFai-b3HmihcpZ8M3WFNQ8lXAFSNTrnfl2NrY0,10074
 discontinuum/plot.py,sha256=eZQS6-Ydq8FFcEukPtNuDVB-weV6lHyWMyJ1hqTkVrU,2969
 discontinuum/utils.py,sha256=07hIHQk_oDlkjz7tasgBjqqPOC6D0iNcy0eu-88aNbM,1540
 discontinuum/engines/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 discontinuum/engines/base.py,sha256=OlHd4ssIQoWvYHKoVqk5fKAVBcKsIIkR4ul9iNBvaYg,2396
-discontinuum/engines/gpytorch.py,sha256=oJMNvNAwKwxQyt3j-QyRE-pjkYDv4i-qqhQfimNQ2HQ,8654
+discontinuum/engines/gpytorch.py,sha256=kRyAgCfxjKZbAJhJGViaDU_y8NO8sW4rSWRyEQlomHo,14383
 discontinuum/engines/pymc.py,sha256=phbtE-3UCSVcP1MhbXwAHIWDZWDr56wK9U7aRt-w-2o,5961
 discontinuum/providers/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 discontinuum/providers/base.py,sha256=Yn2EHS1b4fYl09-m2MYuf2P9VRUXAP-WDpSoZrCbRvY,720
 discontinuum/tests/test_pipeline.py,sha256=_FhkGxbFIxNb35lGaIdZk7Zjgs6CkxEF3gFUX3PE8EU,918
-discontinuum-1.0.3.dist-info/licenses/LICENSE.md,sha256=XElVHHnS2uQ15M_Z2giPH1vmeWMzdpGQ48ItkuZurVA,1650
+discontinuum-1.0.5.dist-info/licenses/LICENSE.md,sha256=XElVHHnS2uQ15M_Z2giPH1vmeWMzdpGQ48ItkuZurVA,1650
 loadest_gp/__init__.py,sha256=YISfvbc7Zy2y0BOxS1A2KzqxyoNJTz0EnLMnRW6iVT8,740
 loadest_gp/plot.py,sha256=x2PK7vBCc44dX9lu5YV-rvw1u4pvXSLdcrTSvYLiHMA,2595
 loadest_gp/utils.py,sha256=m5QaqR_0JiuRXPfryH8nI5lODp8PqvQla5C05WDN3LY,2772
@@ -25,11 +25,11 @@ rating_gp/pipeline.py,sha256=1HgxN6DD3ZL5lhUb3DK2in2IXiml7W4Ja272GBMTc08,1884
 rating_gp/plot.py,sha256=CJphwqWWAfIY22j5Oz5DRwj7TcQCRyIQvM79_3KEdlc,9635
 rating_gp/models/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 rating_gp/models/base.py,sha256=e2Kq644I88YLHWPNA0qyRgitF5wimdLW4618vKX-o_s,1474
-rating_gp/models/gpytorch.py,sha256=OUMKIdBgPjAxInYttKzGN2ou2zlc9V7BTUf4SuTVvwY,6043
+rating_gp/models/gpytorch.py,sha256=eFHwtnW44GZ1zz0fLx5REbzIWwnb_x_uq-cGjDcHyWs,6907
 rating_gp/models/kernels.py,sha256=3xg2mhY3aEgjI3r5vyAll9MA4c3M5UKqRi3FApNhJJQ,11579
 rating_gp/providers/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 rating_gp/providers/usgs.py,sha256=KmKYN3c8Mi-ly2l6X80WT3taEhqCPXeEcRNi9HvbJmY,8134
-discontinuum-1.0.3.dist-info/METADATA,sha256=RukIH_49qEGFmjgaqaN2wTUwR4SPpD9zUWeHNhPosZI,6231
-discontinuum-1.0.3.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-discontinuum-1.0.3.dist-info/top_level.txt,sha256=mwU_PSFrZYSJrBgqIuTJTo7Pp9ODDv6XdDed7kAagXM,34
-discontinuum-1.0.3.dist-info/RECORD,,
+discontinuum-1.0.5.dist-info/METADATA,sha256=GDh-fscmNYYMmXXoUE5Xd1QafuKEOPjgZjlssmjqGVg,6302
+discontinuum-1.0.5.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+discontinuum-1.0.5.dist-info/top_level.txt,sha256=mwU_PSFrZYSJrBgqIuTJTo7Pp9ODDv6XdDed7kAagXM,34
+discontinuum-1.0.5.dist-info/RECORD,,
rating_gp/models/gpytorch.py CHANGED
@@ -71,8 +71,9 @@ class RatingGPMarginalGPyTorch(
         # noise, *and* you did not specify noise. This is treated as a no-op."
         self.likelihood = gpytorch.likelihoods.FixedNoiseGaussianLikelihood(
             noise=noise,
+            #learn_additional_noise=False,
             learn_additional_noise=True,
-            noise_prior=gpytorch.priors.HalfNormalPrior(scale=0.01),
+            noise_prior=gpytorch.priors.HalfNormalPrior(scale=0.005),
         )
 
         model = ExactGPModel(X, y, self.likelihood)
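
For reference, the new likelihood configuration looks like this in isolation. This is a sketch: the `noise` tensor below is synthetic, whereas in the package it comes from the observed measurement uncertainty.

```python
import torch
import gpytorch

noise = torch.full((100,), 0.01)  # per-observation noise variance (illustrative)
likelihood = gpytorch.likelihoods.FixedNoiseGaussianLikelihood(
    noise=noise,
    learn_additional_noise=True,
    # Halving the prior scale (0.01 -> 0.005) pulls the learned *extra* noise
    # harder toward zero, i.e. the model leans more on the supplied noise.
    noise_prior=gpytorch.priors.HalfNormalPrior(scale=0.005),
)
```
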
@@ -109,17 +110,20 @@ class ExactGPModel(gpytorch.models.ExactGP):
         # + stage * time kernel only at low stage with smaller time length.
         # Note that stage gets transformed to q, so the kernel is actually
         # q * time
+        b_min = np.quantile(train_y, 0.10)
+        b_max = np.quantile(train_y, 0.90)
         self.covar_module = (
-            (self.cov_stage()
+            (self.cov_stage(ls_prior=GammaPrior(concentration=1, rate=1))
              * self.cov_time(ls_prior=GammaPrior(concentration=1, rate=1)))
-            + (self.cov_stage(ls_prior=GammaPrior(concentration=1, rate=2))
+            + (self.cov_stage(ls_prior=GammaPrior(concentration=3, rate=1))
                * self.cov_time(ls_prior=GammaPrior(concentration=2, rate=5))
                * SigmoidKernel(
                    active_dims=self.stage_dim,
                    # a_prior=NormalPrior(loc=20, scale=1),
+                   # b_prior=NormalPrior(loc=0.5, scale=0.2),
                    b_constraint=gpytorch.constraints.Interval(
-                       train_y.min(),
-                       train_y.max(),
+                       b_min,
+                       b_max,
                    ),
                )
            )
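
Constraining the sigmoid switch point `b` to the 10th–90th percentile of `train_y` (rather than its min/max) keeps the low-stage/high-stage transition inside the bulk of the data. A quick illustration with synthetic values (nothing here comes from the package):

```python
import numpy as np

rng = np.random.default_rng(0)
train_y = rng.normal(loc=2.0, scale=0.5, size=500)  # synthetic targets

old_bounds = (train_y.min(), train_y.max())                            # 1.0.3 constraint
new_bounds = (np.quantile(train_y, 0.10), np.quantile(train_y, 0.90))  # 1.0.5 constraint
print(old_bounds)  # roughly (0.5, 3.5): stretched by the most extreme observations
print(new_bounds)  # roughly (1.4, 2.6): ignores the tails
```
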
@@ -141,11 +145,12 @@ class ExactGPModel(gpytorch.models.ExactGP):
 
     def cov_stage(self, ls_prior=None):
         eta = HalfNormalPrior(scale=1)
-
+
         return ScaleKernel(
             MaternKernel(
                 active_dims=self.stage_dim,
                 lengthscale_prior=ls_prior,
+                nu=2.5,  # Smoother kernel (was nu=1.5)
             ),
             outputscale_prior=eta,
         )
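
The `nu` parameter controls how rough Matérn sample paths are (`nu=1.5` is once mean-square differentiable, `nu=2.5` twice), so raising it smooths the stage response. A small sketch for comparing the two, independent of the package:

```python
import torch
from gpytorch.kernels import MaternKernel

x = torch.linspace(0, 1, 200).unsqueeze(-1)
for nu in (1.5, 2.5):
    kernel = MaternKernel(nu=nu)
    kernel.lengthscale = 0.2
    # .to_dense() on gpytorch >= 1.9; older releases use .evaluate()
    K = kernel(x).to_dense() + 1e-6 * torch.eye(200)  # jitter for a valid Cholesky
    sample = torch.distributions.MultivariateNormal(torch.zeros(200), K).sample()
    # Mean step-to-step change is typically smaller for nu=2.5 at equal lengthscale.
    print(nu, float(sample.diff().abs().mean()))
```
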
@@ -153,14 +158,31 @@ class ExactGPModel(gpytorch.models.ExactGP):
     def cov_time(self, ls_prior=None):
         eta = HalfNormalPrior(scale=1)
 
-        return ScaleKernel(
+        # Base Matern kernel for long-term trends
+        base_kernel = ScaleKernel(
             MaternKernel(
                 active_dims=self.time_dim,
                 lengthscale_prior=ls_prior,
+                nu=1.5,  # was 2.5
             ),
             outputscale_prior=eta,
         )
 
+        # Periodic performs better than a locally periodic kernel
+        periodic_kernel = ScaleKernel(
+            gpytorch.kernels.PeriodicKernel(
+                active_dims=self.time_dim,
+                period_length_prior=NormalPrior(loc=1.0, scale=0.1),  # ~1 year
+                lengthscale_prior=GammaPrior(concentration=2, rate=4),
+            ),
+            outputscale_prior=HalfNormalPrior(scale=0.5),
+        )
+
+        return base_kernel + periodic_kernel
+
+
+
+
 
     def cov_stagetime(self):
         eta = HalfNormalPrior(scale=1)