python-gls 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
python_gls/model.py ADDED
@@ -0,0 +1,511 @@
1
+ """GLS model class -- main entry point for the library.
2
+
3
+ Implements Generalized Least Squares with learned correlation and variance
4
+ structures, equivalent to R's nlme::gls().
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ import warnings
10
+
11
+ import numpy as np
12
+ import pandas as pd
13
+ from numpy.typing import NDArray
14
+ from scipy.optimize import minimize
15
+
16
+ from python_gls.correlation.base import CorStruct
17
+ from python_gls.variance.base import VarFunc
18
+ from python_gls.likelihood import (
19
+ profile_loglik_ml,
20
+ profile_loglik_reml,
21
+ compute_gls_estimates,
22
+ )
23
+ from python_gls.results import GLSResults
24
+
25
+
26
+ def _validate_array(arr: NDArray, name: str) -> None:
27
+ """Check an array for NaN and Inf values."""
28
+ if np.any(np.isnan(arr)):
29
+ n_nan = int(np.sum(np.isnan(arr)))
30
+ raise ValueError(
31
+ f"{name} contains {n_nan} NaN value(s). "
32
+ f"Remove or impute missing values before fitting."
33
+ )
34
+ if np.any(np.isinf(arr)):
35
+ n_inf = int(np.sum(np.isinf(arr)))
36
+ raise ValueError(
37
+ f"{name} contains {n_inf} infinite value(s). "
38
+ f"Check for overflow or division by zero in your data."
39
+ )
40
+
41
+
42
+ class GLS:
43
+ """Generalized Least Squares with learned correlation and variance structures.
44
+
45
+ Equivalent to R's ``nlme::gls()``. Estimates fixed effects along with
46
+ correlation and variance parameters via maximum likelihood (ML) or
47
+ restricted maximum likelihood (REML).
48
+
49
+ Parameters
50
+ ----------
51
+ formula : str or None
52
+ R-style formula (e.g., ``"y ~ x1 + x2"``). Use ``from_formula()``
53
+ for formula-based construction.
54
+ data : DataFrame or None
55
+ Data for formula-based construction.
56
+ endog : array-like or None
57
+ Response variable (if not using formula).
58
+ exog : array-like or None
59
+ Design matrix (if not using formula). Should include intercept column.
60
+ correlation : CorStruct or None
61
+ Correlation structure. If None, assumes independence.
62
+ variance : VarFunc or None
63
+ Variance function. If None, assumes homoscedasticity.
64
+ groups : str or array-like or None
65
+ Grouping variable name (str) or array of group labels.
66
+ Required if correlation is specified.
67
+ method : str
68
+ Estimation method: ``'REML'`` (default) or ``'ML'``.
69
+
70
+ Examples
71
+ --------
72
+ >>> from python_gls import GLS
73
+ >>> from python_gls.correlation import CorSymm
74
+ >>> from python_gls.variance import VarIdent
75
+ >>>
76
+ >>> result = GLS.from_formula(
77
+ ... "y ~ x1 + x2",
78
+ ... data=df,
79
+ ... correlation=CorSymm(),
80
+ ... variance=VarIdent("group"),
81
+ ... groups="subject",
82
+ ... ).fit()
83
+ >>> print(result.summary())
84
+ """
85
+
86
+ def __init__(
87
+ self,
88
+ endog: NDArray | None = None,
89
+ exog: NDArray | None = None,
90
+ correlation: CorStruct | None = None,
91
+ variance: VarFunc | None = None,
92
+ groups: NDArray | str | None = None,
93
+ data: pd.DataFrame | None = None,
94
+ method: str = "REML",
95
+ ) -> None:
96
+ if correlation is not None and not isinstance(correlation, CorStruct):
97
+ raise TypeError(
98
+ f"correlation must be a CorStruct instance, got {type(correlation).__name__}. "
99
+ f"Use one of: CorAR1(), CorCompSymm(), CorSymm(), etc."
100
+ )
101
+ if variance is not None and not isinstance(variance, VarFunc):
102
+ raise TypeError(
103
+ f"variance must be a VarFunc instance, got {type(variance).__name__}. "
104
+ f"Use one of: VarIdent(), VarPower(), VarExp(), etc."
105
+ )
106
+ if not isinstance(method, str):
107
+ raise TypeError(f"method must be a string, got {type(method).__name__}")
108
+
109
+ self.correlation = correlation
110
+ self.variance = variance
111
+ self.method = method.upper()
112
+ if self.method not in ("ML", "REML"):
113
+ raise ValueError(
114
+ f"method must be 'ML' or 'REML', got '{method}'"
115
+ )
116
+
117
+ self._X: NDArray | None = None
118
+ self._y: NDArray | None = None
119
+ self._groups: NDArray | None = None
120
+ self._data: dict | None = None
121
+ self._feature_names: list[str] = []
122
+ self._formula: str | None = None
123
+
124
+ if endog is not None and exog is not None:
125
+ self._y = np.asarray(endog, dtype=float).ravel()
126
+ self._X = np.asarray(exog, dtype=float)
127
+ if self._X.ndim == 1:
128
+ self._X = self._X[:, None]
129
+
130
+ _validate_array(self._y, "endog")
131
+ _validate_array(self._X, "exog")
132
+
133
+ if len(self._y) != self._X.shape[0]:
134
+ raise ValueError(
135
+ f"endog and exog have incompatible shapes: "
136
+ f"endog has {len(self._y)} observations but exog has "
137
+ f"{self._X.shape[0]} rows"
138
+ )
139
+ if len(self._y) == 0:
140
+ raise ValueError("endog and exog must not be empty")
141
+ if self._X.shape[1] > self._X.shape[0]:
142
+ warnings.warn(
143
+ f"More predictors ({self._X.shape[1]}) than observations "
144
+ f"({self._X.shape[0]}). Model may be unidentifiable.",
145
+ stacklevel=2,
146
+ )
147
+
148
+ self._feature_names = [f"x{i}" for i in range(self._X.shape[1])]
149
+ elif (endog is None) != (exog is None):
150
+ raise ValueError(
151
+ "Both endog and exog must be provided together, or neither. "
152
+ "Got endog={} and exog={}".format(
153
+ "provided" if endog is not None else "None",
154
+ "provided" if exog is not None else "None",
155
+ )
156
+ )
157
+
158
+ if data is not None:
159
+ if not isinstance(data, pd.DataFrame):
160
+ raise TypeError(
161
+ f"data must be a pandas DataFrame, got {type(data).__name__}"
162
+ )
163
+ self._data = {col: np.asarray(data[col]) for col in data.columns}
164
+
165
+ if groups is not None:
166
+ if isinstance(groups, str):
167
+ if data is not None:
168
+ if groups not in data.columns:
169
+ raise ValueError(
170
+ f"groups column '{groups}' not found in data. "
171
+ f"Available columns: {list(data.columns)}"
172
+ )
173
+ self._groups = np.asarray(data[groups])
174
+ elif self._data is not None:
175
+ if groups not in self._data:
176
+ raise ValueError(
177
+ f"groups column '{groups}' not found in data. "
178
+ f"Available columns: {list(self._data.keys())}"
179
+ )
180
+ self._groups = np.asarray(self._data[groups])
181
+ else:
182
+ raise ValueError(
183
+ f"groups='{groups}' is a column name but no data was provided. "
184
+ f"Pass data= or provide groups as an array."
185
+ )
186
+ else:
187
+ self._groups = np.asarray(groups)
188
+ if self._y is not None and len(self._groups) != len(self._y):
189
+ raise ValueError(
190
+ f"groups array length ({len(self._groups)}) does not match "
191
+ f"number of observations ({len(self._y)})"
192
+ )
193
+
194
+ @classmethod
195
+ def from_formula(
196
+ cls,
197
+ formula: str,
198
+ data: pd.DataFrame,
199
+ correlation: CorStruct | None = None,
200
+ variance: VarFunc | None = None,
201
+ groups: str | None = None,
202
+ method: str = "REML",
203
+ ) -> GLS:
204
+ """Construct a GLS model from an R-style formula.
205
+
206
+ Parameters
207
+ ----------
208
+ formula : str
209
+ R-style formula, e.g., ``"y ~ x1 + x2"`` or ``"y ~ C(treatment) * time"``.
210
+ data : DataFrame
211
+ Data containing the variables referenced in the formula.
212
+ correlation : CorStruct, optional
213
+ Correlation structure.
214
+ variance : VarFunc, optional
215
+ Variance function.
216
+ groups : str, optional
217
+ Name of the grouping variable in ``data``.
218
+ method : str
219
+ ``'REML'`` or ``'ML'``.
220
+
221
+ Returns
222
+ -------
223
+ GLS
224
+ Model instance ready for ``.fit()``.
225
+ """
226
+ if not isinstance(formula, str):
227
+ raise TypeError(
228
+ f"formula must be a string, got {type(formula).__name__}"
229
+ )
230
+ if "~" not in formula:
231
+ raise ValueError(
232
+ f"formula must contain '~' separating response and predictors, "
233
+ f"e.g. 'y ~ x1 + x2'. Got: '{formula}'"
234
+ )
235
+ if not isinstance(data, pd.DataFrame):
236
+ raise TypeError(
237
+ f"data must be a pandas DataFrame, got {type(data).__name__}"
238
+ )
239
+ if len(data) == 0:
240
+ raise ValueError("data must not be empty")
241
+ if groups is not None and not isinstance(groups, str):
242
+ raise TypeError(
243
+ f"groups must be a column name (string), got {type(groups).__name__}"
244
+ )
245
+
246
+ import formulaic
247
+
248
+ model_spec = formulaic.model_matrix(formula, data)
249
+
250
+ # formulaic returns ModelMatrices with .lhs and .rhs attributes
251
+ if hasattr(model_spec, "lhs"):
252
+ y = np.asarray(model_spec.lhs).ravel()
253
+ X_mm = model_spec.rhs
254
+ X = np.asarray(X_mm, dtype=float)
255
+ feature_names = list(X_mm.columns)
256
+ elif isinstance(model_spec, tuple) and len(model_spec) == 2:
257
+ y = np.asarray(model_spec[0]).ravel()
258
+ X_mm = model_spec[1]
259
+ X = np.asarray(X_mm, dtype=float)
260
+ feature_names = list(X_mm.columns)
261
+ else:
262
+ X_mm = model_spec
263
+ X = np.asarray(X_mm, dtype=float)
264
+ feature_names = list(X_mm.columns)
265
+ lhs = formula.split("~")[0].strip()
266
+ y = np.asarray(data[lhs], dtype=float).ravel()
267
+
268
+ obj = cls(
269
+ endog=y,
270
+ exog=X,
271
+ correlation=correlation,
272
+ variance=variance,
273
+ groups=groups,
274
+ data=data,
275
+ method=method,
276
+ )
277
+ obj._feature_names = feature_names
278
+ obj._formula = formula
279
+ return obj
280
+
281
+ def _split_by_groups(self) -> tuple[list[NDArray], list[NDArray], list[NDArray]]:
282
+ """Split X, y, and indices by group.
283
+
284
+ Returns
285
+ -------
286
+ X_groups : list of (m_g, k) arrays
287
+ y_groups : list of (m_g,) arrays
288
+ idx_groups : list of index arrays (row indices into original data)
289
+ """
290
+ if self._groups is None:
291
+ return [self._X], [self._y], [np.arange(len(self._y))]
292
+
293
+ unique_groups = np.unique(self._groups)
294
+ X_groups = []
295
+ y_groups = []
296
+ idx_groups = []
297
+ for g in unique_groups:
298
+ mask = self._groups == g
299
+ idx = np.where(mask)[0]
300
+ X_groups.append(self._X[idx])
301
+ y_groups.append(self._y[idx])
302
+ idx_groups.append(idx)
303
+ return X_groups, y_groups, idx_groups
304
+
305
+ def _get_corr_matrices(
306
+ self, group_sizes: list[int]
307
+ ) -> list[NDArray]:
308
+ """Get correlation matrices for each group."""
309
+ if self.correlation is None:
310
+ return [np.eye(s) for s in group_sizes]
311
+ return [
312
+ self.correlation.get_correlation_matrix(s, group_id=i)
313
+ for i, s in enumerate(group_sizes)
314
+ ]
315
+
316
+ def _get_var_weights(
317
+ self, idx_groups: list[NDArray]
318
+ ) -> list[NDArray]:
319
+ """Get variance weights for each group."""
320
+ if self.variance is None:
321
+ return [np.ones(len(idx)) for idx in idx_groups]
322
+ return [
323
+ self.variance.get_weights(self._data, idx)
324
+ for idx in idx_groups
325
+ ]
326
+
327
    def fit(
        self,
        maxiter: int = 200,
        tol: float = 1e-8,
        verbose: bool = False,
    ) -> GLSResults:
        """Fit the GLS model.

        Strategy: OLS residuals seed the correlation/variance parameters,
        which are then estimated by minimizing the negative profile
        log-likelihood (ML or REML) over their unconstrained transforms
        with L-BFGS-B; fixed effects are recovered in closed form at the
        optimum via ``compute_gls_estimates``.

        Parameters
        ----------
        maxiter : int
            Maximum number of optimization iterations.
        tol : float
            Convergence tolerance (passed to L-BFGS-B as ``ftol``).
        verbose : bool
            If True, print optimization progress.

        Returns
        -------
        GLSResults
            Fitted model results.

        Raises
        ------
        ValueError
            If no data was provided, or a variance function was given
            without an accompanying data dictionary.
        """
        if self._X is None or self._y is None:
            raise ValueError(
                "No data provided. Use GLS.from_formula('y ~ x', data=df) "
                "or pass endog= and exog= to the constructor."
            )

        N = len(self._y)       # number of observations
        k = self._X.shape[1]   # number of fixed-effect coefficients

        if N <= k:
            warnings.warn(
                f"Number of observations ({N}) is not greater than the number "
                f"of predictors ({k}). Estimates may be unreliable.",
                stacklevel=2,
            )

        # Variance functions evaluate covariates by column name, so they
        # need the data dictionary built in __init__ / from_formula.
        if self.variance is not None and self._data is None:
            raise ValueError(
                "A variance function was specified but no data dictionary is available. "
                "Use GLS.from_formula() or pass data= to the constructor."
            )

        # Step 1: OLS initial fit -- residuals seed the covariance params.
        beta_ols = np.linalg.lstsq(self._X, self._y, rcond=None)[0]
        residuals_ols = self._y - self._X @ beta_ols

        X_groups, y_groups, idx_groups = self._split_by_groups()
        group_sizes = [len(yg) for yg in y_groups]

        # Fast path: no correlation or variance structure means there is
        # nothing to optimize -- compute the closed-form (O)GLS fit with
        # identity correlation and unit weights and return immediately.
        if self.correlation is None and self.variance is None:
            corr_matrices = [np.eye(s) for s in group_sizes]
            var_weights = [np.ones(s) for s in group_sizes]

            beta_hat, cov_beta, sigma2_hat, loglik = compute_gls_estimates(
                X_groups, y_groups, corr_matrices, var_weights, N, self.method
            )

            return GLSResults(
                model=self,
                params=beta_hat,
                cov_params=cov_beta,
                sigma2=sigma2_hat,
                loglik=loglik,
                method=self.method,
                nobs=N,
                df_model=k - 1,  # intercept excluded from model df
                df_resid=N - k,
                feature_names=self._feature_names,
                n_iter=0,
                converged=True,
            )

        # Step 2: Initialize correlation and variance params from OLS residuals.
        residuals_by_group = [residuals_ols[idx] for idx in idx_groups]

        if self.correlation is not None:
            self.correlation.initialize(residuals_by_group)

        if self.variance is not None:
            # Defensive re-check; already validated above.
            if self._data is None:
                raise ValueError("Data dictionary required for variance functions.")
            self.variance.initialize(residuals_ols, self._data)

        # Step 3: Optimize the profile log-likelihood over the concatenated
        # unconstrained parameter vector [corr params | variance params].
        def _pack_params() -> NDArray:
            # Collect unconstrained params in a fixed order: correlation first.
            parts = []
            if self.correlation is not None and self.correlation.n_params > 0:
                parts.append(self.correlation.get_unconstrained_params())
            if self.variance is not None and self.variance.n_params > 0:
                parts.append(self.variance.get_unconstrained_params())
            return np.concatenate(parts) if parts else np.array([])

        def _unpack_params(theta: NDArray) -> None:
            # Inverse of _pack_params: write slices back into the structures
            # (mutates self.correlation / self.variance in place).
            idx = 0
            if self.correlation is not None and self.correlation.n_params > 0:
                n_corr = self.correlation.n_params
                self.correlation.set_unconstrained_params(theta[idx : idx + n_corr])
                idx += n_corr
            if self.variance is not None and self.variance.n_params > 0:
                n_var = self.variance.n_params
                self.variance.set_unconstrained_params(theta[idx : idx + n_var])
                idx += n_var

        loglik_func = (
            profile_loglik_reml if self.method == "REML" else profile_loglik_ml
        )

        # Mutable cell so the closure can count successful evaluations.
        n_eval = [0]

        def neg_loglik(theta: NDArray) -> float:
            _unpack_params(theta)
            try:
                corr_matrices = self._get_corr_matrices(group_sizes)
                var_weights = self._get_var_weights(idx_groups)
                ll = loglik_func(X_groups, y_groups, corr_matrices, var_weights, N)
            except (np.linalg.LinAlgError, ValueError, FloatingPointError):
                # Numerically infeasible point: return a large penalty so
                # the optimizer steers away instead of crashing.
                return 1e15
            # NOTE: only successful evaluations are counted.
            n_eval[0] += 1
            if verbose and n_eval[0] % 10 == 0:
                print(f" Iteration {n_eval[0]}: loglik = {ll:.4f}")
            if np.isnan(ll) or np.isinf(ll):
                return 1e15
            return -ll

        theta0 = _pack_params()

        if len(theta0) > 0:
            result = minimize(
                neg_loglik,
                theta0,
                method="L-BFGS-B",
                options={"maxiter": maxiter, "ftol": tol, "disp": verbose},
            )
            # Leave the structures at the optimizer's final parameters.
            _unpack_params(result.x)
            converged = result.success
            n_iter = result.nit
            if not converged:
                warnings.warn(
                    f"Optimization did not converge after {n_iter} iterations: "
                    f"{result.message}. Results may be unreliable.",
                    stacklevel=2,
                )
        else:
            # Structures present but parameter-free: nothing to optimize.
            converged = True
            n_iter = 0

        # Step 4: Compute final estimates at converged parameters.
        corr_matrices = self._get_corr_matrices(group_sizes)
        var_weights = self._get_var_weights(idx_groups)

        beta_hat, cov_beta, sigma2_hat, loglik = compute_gls_estimates(
            X_groups, y_groups, corr_matrices, var_weights, N, self.method
        )

        # Collect estimated parameters on their natural (constrained) scale.
        corr_params = (
            self.correlation.get_params()
            if self.correlation is not None and self.correlation.n_params > 0
            else None
        )
        var_params = (
            self.variance.get_params()
            if self.variance is not None and self.variance.n_params > 0
            else None
        )

        return GLSResults(
            model=self,
            params=beta_hat,
            cov_params=cov_beta,
            sigma2=sigma2_hat,
            loglik=loglik,
            method=self.method,
            nobs=N,
            df_model=k - 1,  # intercept excluded from model df
            df_resid=N - k,
            feature_names=self._feature_names,
            correlation_params=corr_params,
            variance_params=var_params,
            n_iter=n_iter,
            converged=converged,
        )