rustystats 0.1.5__cp313-cp313-manylinux_2_34_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
rustystats/formula.py ADDED
@@ -0,0 +1,1074 @@
1
+ """
2
+ Formula-based API for RustyStats GLM.
3
+
4
+ This module provides R-style formula support for fitting GLMs with DataFrames.
5
+ It uses the `formulaic` library for formula parsing and supports Polars DataFrames.
6
+
7
+ Example
8
+ -------
9
+ >>> import rustystats as rs
10
+ >>> import polars as pl
11
+ >>>
12
+ >>> data = pl.read_parquet("insurance_data.parquet")
13
+ >>> model = rs.glm(
14
+ ... formula="ClaimNb ~ VehPower + VehAge + C(VehBrand)",
15
+ ... data=data,
16
+ ... family="poisson",
17
+ ... offset="Exposure"
18
+ ... )
19
+ >>> result = model.fit()
20
+ >>> print(rs.summary(result))
21
+ """
22
+
23
+ from __future__ import annotations
24
+
25
+ from typing import Optional, Union, List, TYPE_CHECKING
26
+ import numpy as np
27
+
28
+ # Lazy imports for optional dependencies
29
+ if TYPE_CHECKING:
30
+ import polars as pl
31
+
32
+
33
+ def _get_column(data: "pl.DataFrame", column: str) -> np.ndarray:
34
+ """Extract a column as numpy array from Polars DataFrame."""
35
+ return data[column].to_numpy()
36
+
37
+
38
+ # Import from interactions module (the canonical implementation)
39
+ from rustystats.interactions import build_design_matrix, InteractionBuilder
40
+
41
+
42
class FormulaGLM:
    """
    GLM model with formula-based specification.

    This class provides an R-like interface for fitting GLMs using
    formulas and DataFrames. The design matrix is built when the object
    is constructed; call :meth:`fit` to estimate the coefficients.

    Parameters
    ----------
    formula : str
        R-style formula specifying the model.
        Examples:
        - "y ~ x1 + x2": Linear model with intercept
        - "y ~ x1 + C(cat)": Include categorical variable
        - "y ~ 0 + x1 + x2": No intercept

    data : pl.DataFrame
        Polars DataFrame containing the data.

    family : str, default="gaussian"
        Distribution family, e.g. "gaussian", "poisson", "binomial",
        "gamma". The name is lower-cased before it is stored.

    link : str, optional
        Link function. If None, uses canonical link for family.

    var_power : float, default=1.5
        Variance power forwarded to the fitting backend (used by the
        Tweedie family).

    theta : float, optional
        Negative Binomial dispersion. ``None`` means "estimate it" when
        the family is Negative Binomial; for other families the backend
        receives 1.0.

    offset : str or array-like, optional
        Offset term. Can be:
        - Column name (str): Will extract from data
        - Array: Use directly
        For Poisson family, typically log(exposure).

    weights : str or array-like, optional
        Prior weights. Can be column name or array.

    Attributes
    ----------
    formula : str
        The formula used
    data : DataFrame
        Original data
    family : str
        Distribution family (lower-cased)
    y, X : array-like
        Response and design matrix returned by the design-matrix builder
    feature_names : list[str]
        Names of features in the design matrix
    n_obs, n_params : int
        ``len(y)`` and ``X.shape[1]``
    offset, weights : np.ndarray or None
        Processed offset / prior weights as float64 arrays

    Examples
    --------
    >>> import rustystats as rs
    >>> import polars as pl
    >>>
    >>> data = pl.DataFrame({
    ...     "claims": [0, 1, 2, 0, 1],
    ...     "age": [25, 35, 45, 55, 65],
    ...     "exposure": [1.0, 0.5, 1.0, 0.8, 1.0]
    ... })
    >>>
    >>> model = rs.glm(
    ...     formula="claims ~ age",
    ...     data=data,
    ...     family="poisson",
    ...     offset="exposure"  # Will auto-apply log()
    ... )
    >>> result = model.fit()
    """

    # Every spelling that ``fit`` treats as the Negative Binomial family.
    _NEGBINOMIAL_ALIASES = (
        "negbinomial",
        "negativebinomial",
        "negative_binomial",
        "neg-binomial",
        "nb",
    )

    # Families for which a string offset is interpreted as an exposure
    # column (log-transformed under a log link). The Negative Binomial
    # aliases are included so that all accepted spellings behave alike.
    _EXPOSURE_FAMILIES = ("poisson", "quasipoisson", "gamma") + _NEGBINOMIAL_ALIASES

    def __init__(
        self,
        formula: str,
        data: "pl.DataFrame",
        family: str = "gaussian",
        link: Optional[str] = None,
        var_power: float = 1.5,
        theta: Optional[float] = None,
        offset: Optional[Union[str, np.ndarray]] = None,
        weights: Optional[Union[str, np.ndarray]] = None,
    ):
        self.formula = formula
        self.data = data
        self.family = family.lower()
        self.link = link
        self.var_power = var_power
        self.theta = theta  # None means auto-estimate for negbinomial
        self._offset_spec = offset
        self._weights_spec = weights

        # Extract raw exposure for target encoding BEFORE building design matrix.
        # For frequency models with log link, offset is typically log(exposure)
        # but target encoding needs raw exposure to compute claim rates.
        raw_exposure = self._get_raw_exposure(offset)

        # Build design matrix; the raw exposure is handed to the builder so
        # target encoding can use rate (y/exposure) instead of raw y.
        self._builder = InteractionBuilder(data)
        self.y, self.X, self.feature_names = self._builder.build_design_matrix(
            formula, exposure=raw_exposure
        )
        self.n_obs = len(self.y)
        self.n_params = self.X.shape[1]

        # Validation results are only computed when validate() is called.
        self._validation_results = None

        # Process offset (may apply log for exposure-style families)
        self.offset = self._process_offset(offset)

        # Process weights
        self.weights = self._process_weights(weights)

    def _uses_exposure_offset(self) -> bool:
        """Return True when a string offset should be treated as exposure.

        That is the case for the families in ``_EXPOSURE_FAMILIES`` when the
        link is left unspecified or is explicitly ``"log"``.
        """
        return self.family in self._EXPOSURE_FAMILIES and self.link in (None, "log")

    def _process_offset(
        self,
        offset: Optional[Union[str, np.ndarray]]
    ) -> Optional[np.ndarray]:
        """
        Turn the offset specification into a float64 array.

        Parameters
        ----------
        offset : str, array-like or None
            Column name in ``self.data``, an explicit array, or None.

        Returns
        -------
        np.ndarray or None
            ``None`` when no offset was given. Arrays are converted to
            float64 and used as-is. Columns are log-transformed only when
            the model uses an exposure offset (see
            ``_uses_exposure_offset``) *and* the values look like raw
            exposure.
        """
        if offset is None:
            return None

        if not isinstance(offset, str):
            # Explicit arrays are trusted: the caller handles any transform.
            return np.asarray(offset, dtype=np.float64)

        offset_values = _get_column(self.data, offset)

        if self._uses_exposure_offset():
            # Heuristic guard: only take logs when every value is positive
            # and the mean exceeds 0.01, i.e. the column does not look like
            # it has already been logged.
            if np.all(offset_values > 0) and np.mean(offset_values) > 0.01:
                offset_values = np.log(offset_values)

        return offset_values.astype(np.float64)

    def _process_weights(
        self,
        weights: Optional[Union[str, np.ndarray]]
    ) -> Optional[np.ndarray]:
        """
        Turn the weights specification into a float64 array.

        A string is looked up as a column of ``self.data``; anything else
        is converted with ``np.asarray``. ``None`` is passed through.
        """
        if weights is None:
            return None

        if isinstance(weights, str):
            return _get_column(self.data, weights).astype(np.float64)
        return np.asarray(weights, dtype=np.float64)

    def _get_raw_exposure(
        self,
        offset: Optional[Union[str, np.ndarray]]
    ) -> Optional[np.ndarray]:
        """
        Get raw exposure values for target encoding.

        For frequency models (Poisson, NegBinomial, etc.), the offset is
        typically log(exposure). However, target encoding needs the raw
        exposure values to compute claim rates (claims/exposure) instead of
        raw claim counts, so this returns the values BEFORE any log
        transformation.

        Returns
        -------
        np.ndarray or None
            The column (or array) as float64, or None when no offset was
            given. An explicit array is assumed to be raw exposure; if the
            caller passed log(exposure) the encoding works on the log scale.
        """
        if offset is None:
            return None

        if isinstance(offset, str):
            return _get_column(self.data, offset).astype(np.float64)
        return np.asarray(offset, dtype=np.float64)

    @property
    def df_model(self) -> int:
        """Degrees of freedom for model (number of parameters - 1)."""
        return self.n_params - 1

    @property
    def df_resid(self) -> int:
        """Degrees of freedom for residuals (n - p)."""
        return self.n_obs - self.n_params

    def validate(self, verbose: bool = True) -> dict:
        """
        Validate the design matrix before fitting.

        Checks for common issues that cause fitting failures:
        - Rank deficiency (linearly dependent columns)
        - High multicollinearity
        - Zero variance columns
        - NaN/Inf values

        Parameters
        ----------
        verbose : bool, default=True
            Print diagnostic messages with fix suggestions.

        Returns
        -------
        dict
            Validation results including 'valid' (bool) and 'suggestions' (list).
            The result is also cached on the instance.

        Examples
        --------
        >>> model = rs.glm("y ~ ns(x, df=4) + C(cat)", data, family="poisson")
        >>> results = model.validate()
        >>> if not results['valid']:
        ...     print("Issues found:", results['suggestions'])
        """
        self._validation_results = self._builder.validate_design_matrix(
            self.X, self.feature_names, verbose=verbose
        )
        return self._validation_results

    def explore(
        self,
        categorical_factors: Optional[List[str]] = None,
        continuous_factors: Optional[List[str]] = None,
        n_bins: int = 10,
        rare_threshold_pct: float = 1.0,
        max_categorical_levels: int = 20,
        detect_interactions: bool = True,
        max_interaction_factors: int = 10,
    ):
        """
        Explore data before fitting the model.

        This provides pre-fit analysis including factor statistics and
        interaction detection based on the response variable.

        Parameters
        ----------
        categorical_factors : list of str, optional
            Names of categorical factors to analyze.
        continuous_factors : list of str, optional
            Names of continuous factors to analyze.
        n_bins : int, default=10
            Number of bins for continuous factors.
        rare_threshold_pct : float, default=1.0
            Threshold (%) below which categorical levels are grouped.
        max_categorical_levels : int, default=20
            Maximum categorical levels to show.
        detect_interactions : bool, default=True
            Whether to detect potential interactions.
        max_interaction_factors : int, default=10
            Maximum factors for interaction detection.

        Returns
        -------
        DataExploration
            Pre-fit exploration results with to_json() method.

        Examples
        --------
        >>> model = rs.glm("ClaimNb ~ Age + C(Region)", data, family="poisson")
        >>>
        >>> # Explore before fitting
        >>> exploration = model.explore(
        ...     categorical_factors=["Region", "VehBrand"],
        ...     continuous_factors=["Age", "VehPower"],
        ... )
        >>> print(exploration.to_json())
        >>>
        >>> # Then fit
        >>> result = model.fit()
        """
        from rustystats.diagnostics import explore_data

        # The response is whatever precedes the first "~" in the formula.
        response = self.formula.split("~")[0].strip()

        # Only a column-name offset can be forwarded as an exposure column.
        exposure_col = self._offset_spec if isinstance(self._offset_spec, str) else None

        return explore_data(
            data=self.data,
            response=response,
            categorical_factors=categorical_factors,
            continuous_factors=continuous_factors,
            exposure=exposure_col,
            family=self.family,
            n_bins=n_bins,
            rare_threshold_pct=rare_threshold_pct,
            max_categorical_levels=max_categorical_levels,
            detect_interactions=detect_interactions,
            max_interaction_factors=max_interaction_factors,
        )

    def fit(
        self,
        alpha: float = 0.0,
        l1_ratio: float = 0.0,
        max_iter: int = 25,
        tol: float = 1e-8,
    ):
        """
        Fit the GLM model, optionally with regularization.

        Parameters
        ----------
        alpha : float, default=0.0
            Regularization strength. Higher values = more shrinkage.
            - alpha=0: No regularization (standard GLM)
            - alpha>0: Regularized GLM

        l1_ratio : float, default=0.0
            Elastic Net mixing parameter:
            - l1_ratio=0.0: Ridge (L2) penalty
            - l1_ratio=1.0: Lasso (L1) penalty - performs variable selection
            - 0 < l1_ratio < 1: Elastic Net

        max_iter : int, default=25
            Maximum IRLS iterations.
        tol : float, default=1e-8
            Convergence tolerance.

        Returns
        -------
        FormulaGLMResults
            Fitted model results with feature names attached.

        Raises
        ------
        ValueError
            If the backend reports a singular / multicollinear design
            matrix (diagnostics are printed first), or any other
            ``ValueError`` raised by the backend, which is re-raised
            unchanged.

        Warns
        -----
        UserWarning
            If ``alpha`` is non-zero while theta is being auto-estimated
            for a Negative Binomial model: that code path does not take
            ``alpha`` / ``l1_ratio``, so no penalty is applied.

        Examples
        --------
        >>> # Standard GLM
        >>> result = model.fit()

        >>> # Ridge regularization
        >>> result = model.fit(alpha=0.1, l1_ratio=0.0)

        >>> # Lasso for variable selection
        >>> result = model.fit(alpha=0.1, l1_ratio=1.0)
        """
        from rustystats._rustystats import fit_glm_py as _fit_glm_rust, fit_negbinomial_py as _fit_negbinomial_rust

        # Theta is auto-estimated only for Negative Binomial without a fixed theta.
        is_negbinomial = self.family in self._NEGBINOMIAL_ALIASES
        auto_theta = is_negbinomial and self.theta is None

        if auto_theta and alpha:
            # The auto-theta call below has no penalty arguments; make the
            # dropped regularization visible instead of ignoring it silently.
            import warnings

            warnings.warn(
                "alpha and l1_ratio are ignored when theta is auto-estimated "
                "for the negative binomial family; pass theta explicitly to "
                "fit a regularized model.",
                UserWarning,
                stacklevel=2,
            )

        try:
            if auto_theta:
                # Use profile likelihood to auto-estimate theta
                result = _fit_negbinomial_rust(
                    self.y,
                    self.X,
                    self.link,
                    None,   # init_theta (use method-of-moments)
                    1e-5,   # theta_tol
                    10,     # max_theta_iter
                    self.offset,
                    self.weights,
                    max_iter,
                    tol,
                )
                result_family = result.family  # Contains estimated theta
            else:
                # Use fixed theta (default 1.0 if none was supplied)
                theta = self.theta if self.theta is not None else 1.0
                result = _fit_glm_rust(
                    self.y,
                    self.X,
                    self.family,
                    self.link,
                    self.var_power,
                    theta,
                    self.offset,
                    self.weights,
                    alpha,
                    l1_ratio,
                    max_iter,
                    tol,
                )
                result_family = self.family
        except ValueError as e:
            message = str(e).lower()
            if "singular" not in message and "multicollinearity" not in message:
                raise

            # Run validation to provide helpful diagnostics before failing.
            print("\n" + "=" * 60)
            print("MODEL FITTING FAILED - Running diagnostics...")
            print("=" * 60)
            self.validate(verbose=True)
            raise ValueError(
                "GLM fitting failed due to design matrix issues. "
                "See diagnostics above for specific problems and fixes.\n"
                "You can also run model.validate() before fit() to check for issues.\n"
                f"Original error: {e}"
            ) from None

        # Wrap result with formula metadata
        return FormulaGLMResults(
            result=result,
            feature_names=self.feature_names,
            formula=self.formula,
            family=result_family,
            link=self.link,
            builder=self._builder,
            offset_spec=self._offset_spec,
            offset_is_exposure=self._uses_exposure_offset(),
        )
438
+
439
+
440
class FormulaGLMResults:
    """
    Results from a formula-based GLM fit.

    This wraps the base GLMResults and adds formula-specific functionality
    like named coefficients and automatic summary formatting. Most members
    simply delegate to the wrapped result object.

    Parameters
    ----------
    result
        Fitted result object returned by the fitting backend.
    feature_names : list[str]
        Names corresponding to each coefficient.
    formula : str
        The formula used to fit the model.
    family : str
        Family name (or the backend's family description).
    link : str or None
        Link name; when None the default link for ``family`` is recorded.
    builder : InteractionBuilder, optional
        Design-matrix builder holding the encoding state; required by
        :meth:`predict`.
    offset_spec : str or array-like, optional
        Offset specification used at fit time.
    offset_is_exposure : bool, default=False
        Whether a column-name offset is treated as exposure, i.e.
        log-transformed before being added to the linear predictor.

    Attributes
    ----------
    params : np.ndarray
        Fitted coefficients
    feature_names : list[str]
        Names corresponding to each coefficient
    formula : str
        The formula used to fit the model
    """

    # Link recorded when the caller did not specify one. Includes the
    # quasi-binomial family and the alternative Negative Binomial spellings
    # so that they do not fall through to "identity".
    _DEFAULT_LINKS = {
        "gaussian": "identity",
        "poisson": "log",
        "quasipoisson": "log",
        "negbinomial": "log",
        "negative_binomial": "log",
        "neg-binomial": "log",
        "nb": "log",
        "binomial": "logit",
        "quasibinomial": "logit",
        "gamma": "log",
        "inversegaussian": "inverse",
        "tweedie": "log",
    }

    def __init__(
        self,
        result,
        feature_names: List[str],
        formula: str,
        family: str,
        link: Optional[str],
        builder: Optional["InteractionBuilder"] = None,
        offset_spec: Optional[Union[str, np.ndarray]] = None,
        offset_is_exposure: bool = False,
    ):
        self._result = result
        self.feature_names = feature_names
        self.formula = formula
        self.family = family
        self.link = link or self._default_link(family)
        self._builder = builder
        self._offset_spec = offset_spec
        self._offset_is_exposure = offset_is_exposure

    @staticmethod
    def _default_link(family: str) -> str:
        """Get default link for family (``"identity"`` if the family is unknown)."""
        family_lower = family.lower()
        # Handle NegativeBinomial(theta=...) format
        if family_lower.startswith("negativebinomial"):
            return "log"
        return FormulaGLMResults._DEFAULT_LINKS.get(family_lower, "identity")

    # Delegate to underlying result
    @property
    def params(self) -> np.ndarray:
        """Fitted coefficients."""
        return self._result.params

    @property
    def fittedvalues(self) -> np.ndarray:
        """Fitted values (predicted means)."""
        return self._result.fittedvalues

    @property
    def linear_predictor(self) -> np.ndarray:
        """Linear predictor (eta = X @ beta)."""
        return self._result.linear_predictor

    @property
    def deviance(self) -> float:
        """Model deviance."""
        return self._result.deviance

    @property
    def converged(self) -> bool:
        """Whether IRLS converged."""
        return self._result.converged

    @property
    def iterations(self) -> int:
        """Number of IRLS iterations."""
        return self._result.iterations

    def bse(self) -> np.ndarray:
        """Standard errors of coefficients."""
        return self._result.bse()

    def tvalues(self) -> np.ndarray:
        """z/t statistics."""
        return self._result.tvalues()

    def pvalues(self) -> np.ndarray:
        """P-values for coefficients."""
        return self._result.pvalues()

    def conf_int(self, alpha: float = 0.05) -> np.ndarray:
        """Confidence intervals."""
        return self._result.conf_int(alpha)

    def significance_codes(self) -> List[str]:
        """Significance codes."""
        return self._result.significance_codes()

    # Robust standard errors (sandwich estimators)
    def bse_robust(self, cov_type: str = "HC1") -> np.ndarray:
        """Robust standard errors of coefficients (HC/sandwich estimator).

        Unlike model-based standard errors that assume correct variance
        specification, robust standard errors are valid under heteroscedasticity.

        Parameters
        ----------
        cov_type : str, optional
            Type of robust covariance. Options:
            - "HC0": No small-sample correction
            - "HC1": Degrees of freedom correction (default, recommended)
            - "HC2": Leverage-adjusted
            - "HC3": Jackknife-like (most conservative)

        Returns
        -------
        numpy.ndarray
            Array of robust standard errors, one for each coefficient.
        """
        return self._result.bse_robust(cov_type)

    def tvalues_robust(self, cov_type: str = "HC1") -> np.ndarray:
        """z/t statistics using robust standard errors.

        Parameters
        ----------
        cov_type : str, optional
            Type of robust covariance. Default "HC1".

        Returns
        -------
        numpy.ndarray
            Array of t/z statistics (coefficient / robust SE).
        """
        return self._result.tvalues_robust(cov_type)

    def pvalues_robust(self, cov_type: str = "HC1") -> np.ndarray:
        """P-values using robust standard errors.

        Parameters
        ----------
        cov_type : str, optional
            Type of robust covariance. Default "HC1".

        Returns
        -------
        numpy.ndarray
            Array of p-values.
        """
        return self._result.pvalues_robust(cov_type)

    def conf_int_robust(self, alpha: float = 0.05, cov_type: str = "HC1") -> np.ndarray:
        """Confidence intervals using robust standard errors.

        Parameters
        ----------
        alpha : float, optional
            Significance level. Default 0.05 gives 95% CI.
        cov_type : str, optional
            Type of robust covariance. Default "HC1".

        Returns
        -------
        numpy.ndarray
            2D array of shape (n_params, 2) with [lower, upper] bounds.
        """
        return self._result.conf_int_robust(alpha, cov_type)

    def cov_robust(self, cov_type: str = "HC1") -> np.ndarray:
        """Robust covariance matrix (HC/sandwich estimator).

        Parameters
        ----------
        cov_type : str, optional
            Type of robust covariance. Default "HC1".

        Returns
        -------
        numpy.ndarray
            Robust covariance matrix (p × p).
        """
        return self._result.cov_robust(cov_type)

    # Diagnostic methods (statsmodels-compatible)
    def resid_response(self) -> np.ndarray:
        """Response residuals: y - μ."""
        return self._result.resid_response()

    def resid_pearson(self) -> np.ndarray:
        """Pearson residuals: (y - μ) / √V(μ)."""
        return self._result.resid_pearson()

    def resid_deviance(self) -> np.ndarray:
        """Deviance residuals: sign(y - μ) × √d_i."""
        return self._result.resid_deviance()

    def resid_working(self) -> np.ndarray:
        """Working residuals: (y - μ) × g'(μ)."""
        return self._result.resid_working()

    def llf(self) -> float:
        """Log-likelihood of the fitted model."""
        return self._result.llf()

    def aic(self) -> float:
        """Akaike Information Criterion."""
        return self._result.aic()

    def bic(self) -> float:
        """Bayesian Information Criterion."""
        return self._result.bic()

    def null_deviance(self) -> float:
        """Deviance of intercept-only model."""
        return self._result.null_deviance()

    def pearson_chi2(self) -> float:
        """Pearson chi-squared statistic."""
        return self._result.pearson_chi2()

    def scale(self) -> float:
        """Estimated dispersion parameter (deviance-based)."""
        return self._result.scale()

    def scale_pearson(self) -> float:
        """Estimated dispersion parameter (Pearson-based)."""
        return self._result.scale_pearson()

    # Regularization properties
    @property
    def alpha(self) -> float:
        """Regularization strength (lambda)."""
        return self._result.alpha

    @property
    def l1_ratio(self):
        """L1 ratio for Elastic Net (1.0=Lasso, 0.0=Ridge)."""
        return self._result.l1_ratio

    @property
    def is_regularized(self) -> bool:
        """Whether this is a regularized model."""
        return self._result.is_regularized

    @property
    def penalty_type(self) -> str:
        """Type of penalty: 'none', 'ridge', 'lasso', or 'elasticnet'."""
        return self._result.penalty_type

    def n_nonzero(self) -> int:
        """Number of non-zero coefficients (excluding intercept)."""
        return self._result.n_nonzero()

    def selected_features(self) -> List[str]:
        """
        Get names of features with non-zero coefficients.

        Useful for Lasso/Elastic Net to see which variables were selected.
        The wrapped result supplies coefficient indices, which are mapped
        to ``feature_names`` here.
        """
        indices = self._result.selected_features()
        return [self.feature_names[i] for i in indices]

    @property
    def nobs(self) -> int:
        """Number of observations."""
        return self._result.nobs

    @property
    def df_resid(self) -> int:
        """Residual degrees of freedom."""
        return self._result.df_resid

    @property
    def df_model(self) -> int:
        """Model degrees of freedom."""
        return self._result.df_model

    def coef_table(self) -> "pl.DataFrame":
        """
        Return coefficients as a DataFrame with names.

        Returns
        -------
        pl.DataFrame
            DataFrame with columns: Feature, Estimate, Std.Error, z, Pr(>|z|), Signif
        """
        import polars as pl

        return pl.DataFrame({
            "Feature": self.feature_names,
            "Estimate": self.params,
            "Std.Error": self.bse(),
            "z": self.tvalues(),
            "Pr(>|z|)": self.pvalues(),
            "Signif": self.significance_codes(),
        })

    def relativities(self) -> "pl.DataFrame":
        """
        Return relativities (exp(coef)) for log-link models.

        Returns
        -------
        pl.DataFrame
            DataFrame with Feature, Relativity and confidence interval columns

        Raises
        ------
        ValueError
            If the model's link is not ``"log"``.
        """
        import polars as pl

        if self.link != "log":
            raise ValueError(
                f"Relativities only meaningful for log link, not '{self.link}'"
            )

        ci = self.conf_int()

        return pl.DataFrame({
            "Feature": self.feature_names,
            "Relativity": np.exp(self.params),
            "CI_Lower": np.exp(ci[:, 0]),
            "CI_Upper": np.exp(ci[:, 1]),
        })

    def summary(self) -> str:
        """
        Generate a formatted summary string.

        Returns
        -------
        str
            Formatted summary table
        """
        from rustystats.glm import summary
        return summary(self._result, feature_names=self.feature_names)

    def diagnostics(
        self,
        data: "pl.DataFrame",
        categorical_factors: Optional[List[str]] = None,
        continuous_factors: Optional[List[str]] = None,
        n_calibration_bins: int = 10,
        n_factor_bins: int = 10,
        rare_threshold_pct: float = 1.0,
        max_categorical_levels: int = 20,
        detect_interactions: bool = True,
        max_interaction_factors: int = 10,
    ):
        """
        Compute comprehensive model diagnostics.

        Parameters
        ----------
        data : pl.DataFrame
            Original data used for fitting.
        categorical_factors : list of str, optional
            Names of categorical factors to analyze (both fitted and unfitted).
        continuous_factors : list of str, optional
            Names of continuous factors to analyze (both fitted and unfitted).
        n_calibration_bins : int, default=10
            Number of bins for calibration curve.
        n_factor_bins : int, default=10
            Number of quantile bins for continuous factors.
        rare_threshold_pct : float, default=1.0
            Threshold (%) below which categorical levels are grouped into "Other".
        max_categorical_levels : int, default=20
            Maximum number of categorical levels to show.
        detect_interactions : bool, default=True
            Whether to detect potential interactions.
        max_interaction_factors : int, default=10
            Maximum factors to consider for interaction detection.

        Returns
        -------
        ModelDiagnostics
            Complete diagnostics object with to_json() method.

        Examples
        --------
        >>> result = rs.glm("ClaimNb ~ Age + C(Region)", data, family="poisson").fit()
        >>> diagnostics = result.diagnostics(
        ...     data=data,
        ...     categorical_factors=["Region", "VehBrand"],
        ...     continuous_factors=["Age", "VehPower"]
        ... )
        >>> print(diagnostics.to_json())
        """
        from rustystats.diagnostics import compute_diagnostics

        return compute_diagnostics(
            result=self,
            data=data,
            categorical_factors=categorical_factors,
            continuous_factors=continuous_factors,
            n_calibration_bins=n_calibration_bins,
            n_factor_bins=n_factor_bins,
            rare_threshold_pct=rare_threshold_pct,
            max_categorical_levels=max_categorical_levels,
            detect_interactions=detect_interactions,
            max_interaction_factors=max_interaction_factors,
        )

    def diagnostics_json(
        self,
        data: "pl.DataFrame",
        categorical_factors: Optional[List[str]] = None,
        continuous_factors: Optional[List[str]] = None,
        n_calibration_bins: int = 10,
        n_factor_bins: int = 10,
        rare_threshold_pct: float = 1.0,
        max_categorical_levels: int = 20,
        detect_interactions: bool = True,
        max_interaction_factors: int = 10,
        indent: Optional[int] = None,
    ) -> str:
        """
        Compute diagnostics and return as JSON string.

        This is a convenience method that calls diagnostics() and converts
        the result to JSON. The output is optimized for LLM consumption.

        Parameters
        ----------
        data : pl.DataFrame
            Original data used for fitting.
        categorical_factors : list of str, optional
            Names of categorical factors to analyze.
        continuous_factors : list of str, optional
            Names of continuous factors to analyze.
        n_calibration_bins, n_factor_bins, rare_threshold_pct,
        max_categorical_levels, detect_interactions, max_interaction_factors
            Forwarded unchanged to :meth:`diagnostics`.
        indent : int, optional
            JSON indentation. None for compact output.

        Returns
        -------
        str
            JSON string containing all diagnostics.
        """
        diag = self.diagnostics(
            data=data,
            categorical_factors=categorical_factors,
            continuous_factors=continuous_factors,
            n_calibration_bins=n_calibration_bins,
            n_factor_bins=n_factor_bins,
            rare_threshold_pct=rare_threshold_pct,
            max_categorical_levels=max_categorical_levels,
            detect_interactions=detect_interactions,
            max_interaction_factors=max_interaction_factors,
        )
        return diag.to_json(indent=indent)

    def predict(
        self,
        new_data: "pl.DataFrame",
        offset: Optional[Union[str, np.ndarray]] = None,
    ) -> np.ndarray:
        """
        Predict on new data using the fitted model.

        Parameters
        ----------
        new_data : pl.DataFrame
            New data to predict on. Must have the same columns as training data.
        offset : str or array-like, optional
            Offset for new data.

            - ``None`` (default): no offset is added, even if the model was
              fitted with one. Pass the column name again to include it.
            - str: the column is read from ``new_data``; ``log()`` is
              applied when the model was fitted with an exposure-style
              offset (``offset_is_exposure``), mirroring the rule used at
              fit time.
            - array-like: added to the linear predictor as-is; the caller
              is responsible for any transformation.

        Returns
        -------
        np.ndarray
            Predicted values (on the response scale, i.e., μ = E[Y]).

        Raises
        ------
        ValueError
            If the results object has no design-matrix builder.

        Examples
        --------
        >>> model = rs.glm("ClaimNb ~ Age + C(Region)", data, family="poisson", offset="Exposure")
        >>> result = model.fit()
        >>>
        >>> # Predict without any offset
        >>> rates = result.predict(new_data)
        >>>
        >>> # Predict using the exposure column of new_data
        >>> counts = result.predict(new_data, offset="Exposure")
        >>>
        >>> # Predict with custom offset
        >>> predictions = result.predict(new_data, offset=np.log(new_exposures))
        """
        if self._builder is None:
            raise ValueError(
                "Cannot predict: model was not fitted with formula API. "
                "Use fittedvalues for training data predictions."
            )

        # Build design matrix for new data using stored encoding state
        X_new = self._builder.transform_new_data(new_data)

        # Compute linear predictor: η = X @ β
        linear_pred = X_new @ self.params

        if offset is not None:
            if isinstance(offset, str):
                offset_values = new_data[offset].to_numpy().astype(np.float64)
                # Use the flag recorded at fit time rather than the link
                # name, so a column is logged here only when fitting treats
                # it as exposure (e.g. not for a Tweedie log-link model).
                if self._offset_is_exposure:
                    offset_values = np.log(offset_values)
            else:
                offset_values = np.asarray(offset, dtype=np.float64)
            linear_pred = linear_pred + offset_values

        # Apply inverse link function to get predictions on response scale
        return self._apply_inverse_link(linear_pred)

    def _apply_inverse_link(self, eta: np.ndarray) -> np.ndarray:
        """
        Apply inverse link function to linear predictor.

        Handles "identity", "log", "logit" and "inverse". Any other link
        name falls back to the identity, with a ``UserWarning`` so the
        caller knows the values are still on the linear-predictor scale.
        """
        link = self.link
        if link == "identity":
            return eta
        if link == "log":
            return np.exp(eta)
        if link == "logit":
            return 1.0 / (1.0 + np.exp(-eta))
        if link == "inverse":
            return 1.0 / eta

        import warnings

        warnings.warn(
            f"No inverse link implemented for link '{link}'; returning the "
            "linear predictor unchanged.",
            UserWarning,
            stacklevel=3,
        )
        return eta

    def __repr__(self) -> str:
        return (
            f"<FormulaGLMResults: {self.family} family, "
            f"{len(self.params)} parameters, "
            f"deviance={self.deviance:.2f}>"
        )
979
+
980
+
981
def glm(
    formula: str,
    data: "pl.DataFrame",
    family: str = "gaussian",
    link: Optional[str] = None,
    var_power: float = 1.5,
    theta: Optional[float] = None,
    offset: Optional[Union[str, np.ndarray]] = None,
    weights: Optional[Union[str, np.ndarray]] = None,
) -> FormulaGLM:
    """
    Build a :class:`FormulaGLM` from a formula and a DataFrame.

    This factory is the main entry point of the formula-based API; it
    forwards every argument unchanged to ``FormulaGLM``.

    Parameters
    ----------
    formula : str
        R-style formula specifying the model.

        Supported syntax:
        - Main effects: ``x1``, ``x2``, ``C(cat)`` (categorical)
        - Two-way interactions: ``x1:x2`` (interaction only), ``x1*x2`` (main effects + interaction)
        - Categorical interactions: ``C(cat1)*C(cat2)``, ``C(cat):x``
        - Higher-order: ``x1:x2:x3``
        - Splines: ``bs(x, df=5)``, ``ns(x, df=4)``
        - Intercept: included by default, use ``0 +`` or ``- 1`` to remove

    data : pl.DataFrame
        Polars DataFrame containing the variables.

    family : str, default="gaussian"
        Distribution family: "gaussian", "poisson", "binomial", "gamma", "tweedie",
        "quasipoisson", "quasibinomial", or "negbinomial"

    link : str, optional
        Link function. If None, uses canonical link.

    var_power : float, default=1.5
        Variance power for Tweedie family (ignored for others).

    theta : float, optional
        Dispersion parameter for Negative Binomial family (ignored for others).
        If None (default), theta is automatically estimated using profile likelihood.

    offset : str or array-like, optional
        Offset term. If string, treated as column name.
        For Poisson, log() is auto-applied to exposure columns.

    weights : str or array-like, optional
        Prior weights. If string, treated as column name.

    Returns
    -------
    FormulaGLM
        Unfitted model object. Call ``.fit()`` on it to estimate the model.

    Examples
    --------
    >>> import rustystats as rs
    >>> import polars as pl
    >>>
    >>> # Load data
    >>> data = pl.read_parquet("insurance.parquet")
    >>>
    >>> # Fit Poisson model for claim frequency
    >>> model = rs.glm(
    ...     formula="ClaimNb ~ VehPower + VehAge + C(VehBrand) + C(Area)",
    ...     data=data,
    ...     family="poisson",
    ...     offset="Exposure"
    ... )
    >>> result = model.fit()
    >>>
    >>> # Model with interactions
    >>> model = rs.glm(
    ...     formula="ClaimNb ~ VehPower*VehAge + C(Area):DrivAge",
    ...     data=data,
    ...     family="poisson",
    ...     offset="Exposure"
    ... )
    >>> result = model.fit()
    >>> print(result.summary())
    """
    # Collect the optional settings once, then hand everything to the model.
    model_options = {
        "family": family,
        "link": link,
        "var_power": var_power,
        "theta": theta,
        "offset": offset,
        "weights": weights,
    }
    return FormulaGLM(formula, data, **model_options)