rustystats-0.1.5-cp313-cp313-manylinux_2_34_x86_64.whl

rustystats/families.py ADDED
@@ -0,0 +1,423 @@
+ """
+ Distribution Families for GLMs
+ ==============================
+
+ This module provides distribution families that specify:
+
+ 1. What type of data you're modeling (counts, binary, continuous, etc.)
+ 2. How variance relates to the mean (the variance function)
+ 3. The default link function to use
+
+ Choosing the Right Family
+ -------------------------
+
+ +-------------------+-----------------+----------+--------------+
+ | Data Type         | Example         | Family   | Typical Link |
+ +===================+=================+==========+==============+
+ | Continuous        | Temperature     | Gaussian | Identity     |
+ +-------------------+-----------------+----------+--------------+
+ | Strictly positive | Claim amounts   | Gamma    | Log          |
+ +-------------------+-----------------+----------+--------------+
+ | Counts (0,1,2,..) | Claim frequency | Poisson  | Log          |
+ +-------------------+-----------------+----------+--------------+
+ | Binary (0 or 1)   | Did they claim? | Binomial | Logit        |
+ +-------------------+-----------------+----------+--------------+
+ | Proportions       | % who claimed   | Binomial | Logit        |
+ +-------------------+-----------------+----------+--------------+
+
+ Understanding Variance Functions
+ --------------------------------
+
+ The variance function V(μ) describes how the variance of Y relates to its mean:
+
+     Var(Y) = φ × V(μ)
+
+ where φ is the dispersion parameter (φ = 1 for Poisson and Binomial).
+
+ - **Gaussian**: V(μ) = 1
+   Variance is constant. A $100 claim varies the same as a $10,000 claim.
+   This is usually unrealistic for monetary amounts.
+
+ - **Poisson**: V(μ) = μ
+   Variance equals the mean. If average claims = 0.1, variance = 0.1.
+   Good for counts, but real data is often overdispersed.
+
+ - **Gamma**: V(μ) = μ²
+   Variance is proportional to the mean squared, so the coefficient of
+   variation (CV) is constant. A $1,000 claim varies proportionally the
+   same as a $100,000 claim. Very appropriate for insurance claim amounts.
+
+ - **Binomial**: V(μ) = μ(1-μ)
+   Maximum variance at μ = 0.5, zero variance at μ = 0 or μ = 1.
+   Makes sense: if something always (or never) happens, there's no variation.
+
+ Examples
+ --------
+ >>> import rustystats as rs
+ >>> import numpy as np
+ >>>
+ >>> # Check variance function values
+ >>> poisson = rs.families.Poisson()
+ >>> mu = np.array([1.0, 2.0, 5.0])
+ >>> print(poisson.variance(mu))  # [1.0, 2.0, 5.0] - same as mu!
+ >>>
+ >>> gamma = rs.families.Gamma()
+ >>> print(gamma.variance(mu))  # [1.0, 4.0, 25.0] - mu squared!
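+ >>>
+ >>> # Binomial variance peaks at μ = 0.5 and vanishes at the endpoints
+ >>> binomial = rs.families.Binomial()
+ >>> print(binomial.variance(np.array([0.1, 0.5, 0.9])))  # [0.09, 0.25, 0.09]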
+ """
+
+ # Import the Rust implementations
+ from rustystats._rustystats import (
+     GaussianFamily as _GaussianFamily,
+     PoissonFamily as _PoissonFamily,
+     BinomialFamily as _BinomialFamily,
+     GammaFamily as _GammaFamily,
+     QuasiPoissonFamily as _QuasiPoissonFamily,
+     QuasiBinomialFamily as _QuasiBinomialFamily,
+     NegativeBinomialFamily as _NegativeBinomialFamily,
+ )
+
+
+ def Gaussian():
+     """
+     Gaussian (Normal) family for continuous response data.
+
+     Use this for standard linear regression where the response can be
+     any real value (positive, negative, or zero).
+
+     Properties
+     ----------
+     - Variance function: V(μ) = 1 (constant variance)
+     - Default link: Identity (η = μ)
+     - Dispersion: σ² (estimated from residuals)
+
+     When to Use
+     -----------
+     - Continuous data with approximately constant variance
+     - When you'd normally use ordinary least squares
+     - When residuals are roughly normally distributed
+
+     When NOT to Use
+     ---------------
+     - For strictly positive data (use Gamma instead)
+     - For count data (use Poisson instead)
+     - For binary outcomes (use Binomial instead)
+
+     Example
+     -------
+     >>> family = rs.families.Gaussian()
+     >>> print(family.name())  # "Gaussian"
+     >>> print(family.variance(np.array([1.0, 100.0])))  # [1.0, 1.0]
+     """
+     return _GaussianFamily()
+
+
+ def Poisson():
+     """
+     Poisson family for count data (0, 1, 2, 3, ...).
+
+     This is the standard family for claim FREQUENCY modeling.
+
+     Properties
+     ----------
+     - Variance function: V(μ) = μ (variance equals the mean)
+     - Default link: Log (η = log(μ))
+     - Dispersion: φ = 1 (fixed)
+
+     Key Assumption: Equidispersion
+     ------------------------------
+     Poisson assumes variance = mean. This is often violated in practice
+     ("overdispersion"). Check by looking at:
+
+         Pearson χ² / degrees of freedom
+
+     If this is much greater than 1, you have overdispersion. Options
+     (a quick check follows this list):
+
+     - Use QuasiPoisson (same point estimates, adjusted standard errors)
+     - Use the NegativeBinomial family (defined later in this module)
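+
+     A quick check after fitting, using the result methods shown in the
+     QuasiPoisson docstring below (y and X are your response and design matrix):
+
+     >>> result = rs.fit_glm(y, X, family="poisson")
+     >>> print(result.pearson_chi2() / result.df_resid)  # >> 1 means overdispersed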
+
+     When to Use
+     -----------
+     - Claim counts per policy
+     - Number of accidents
+     - Event counts in a fixed period
+
+     Exposure Adjustment
+     -------------------
+     Often used with an "exposure" offset. If modeling annual claim counts
+     but some policies are only observed for 6 months:
+
+         E(claims) = exposure × exp(Xβ)
+         log(E(claims)) = log(exposure) + Xβ
+
+     The log(exposure) term is an "offset" whose coefficient is fixed at 1.
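+
+     A minimal sketch of the offset computation (hypothetical: whether
+     ``fit_glm`` accepts an ``offset`` keyword is an assumption, so check
+     the API before relying on it):
+
+     >>> offset = np.log(exposure)  # exposure in years, must be > 0
+     >>> result = rs.fit_glm(claims, X, family="poisson", offset=offset)  # offset kwarg assumed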
+
+     Example
+     -------
+     >>> family = rs.families.Poisson()
+     >>> mu = np.array([0.5, 1.0, 2.0])
+     >>> print(family.variance(mu))  # [0.5, 1.0, 2.0] - same as mu!
+     """
+     return _PoissonFamily()
+
+
+ def Binomial():
+     """
+     Binomial family for binary or proportion data.
+
+     This is the foundation of LOGISTIC REGRESSION.
+
+     Properties
+     ----------
+     - Variance function: V(μ) = μ(1-μ)
+     - Default link: Logit (η = log(μ/(1-μ)))
+     - Dispersion: φ = 1 (fixed)
+
+     Understanding the Variance Function
+     -----------------------------------
+     V(μ) = μ(1-μ) means variance is:
+
+     - Maximum at μ = 0.5 (most uncertainty)
+     - Zero at μ = 0 or μ = 1 (certain outcomes)
+
+     This makes intuitive sense: if something almost always (or never)
+     happens, there's not much variation in outcomes.
+
+     Interpreting Coefficients
+     -------------------------
+     With the logit link, coefficients are on the log-odds scale
+     (a numeric check follows this list):
+
+     - If β = 0.5 for variable X, then exp(0.5) ≈ 1.65
+     - This means: "1.65 times the odds for each 1-unit increase in X"
+     - Or equivalently: "65% higher odds"
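+
+     A quick check with numpy:
+
+     >>> print(np.exp(0.5))  # ≈ 1.6487, i.e. about 65% higher odds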
+
+     When to Use
+     -----------
+     - Binary outcomes (claim/no claim, lapse/retain)
+     - Conversion rates
+     - Any yes/no question
+
+     Example
+     -------
+     >>> family = rs.families.Binomial()
+     >>> mu = np.array([0.2, 0.5, 0.8])
+     >>> print(family.variance(mu))  # [0.16, 0.25, 0.16]
+     >>> # Note: max variance at μ = 0.5
+     """
+     return _BinomialFamily()
+
+
+ def Gamma():
+     """
+     Gamma family for positive continuous data.
+
+     This is the standard family for claim SEVERITY (amount) modeling.
+
+     Properties
+     ----------
+     - Variance function: V(μ) = μ² (variance proportional to mean squared)
+     - Default link: Log (η = log(μ)); the canonical link is inverse,
+       but log is standard in practice
+     - Dispersion: φ = 1/shape (estimated from residuals)
+
+     Key Insight: Constant Coefficient of Variation
+     ----------------------------------------------
+     Since V(μ) = μ², the standard deviation is proportional to the mean:
+
+         SD(Y) = √(φ × μ²) = √φ × μ
+         CV = SD/mean = √φ (constant!)
+
+     This is very realistic for monetary amounts:
+
+     - A $1,000 claim might vary by ±$500 (CV = 50%)
+     - A $100,000 claim might vary by ±$50,000 (same CV = 50%)
+
+     Why Gamma for Claim Amounts?
+     ----------------------------
+     - Gaussian assumes constant variance (unrealistic for money)
+     - Gamma's constant CV matches the observed behavior of claim amounts
+     - The log link ensures predictions are always positive
+     - Coefficients have a multiplicative interpretation
+
+     Combining with Poisson (Pure Premium)
+     -------------------------------------
+     Pure premium = Frequency × Severity
+
+     If you model:
+
+     - Frequency: Poisson with log link
+     - Severity: Gamma with log link
+
+     then the pure premium coefficients are the SUM of the two models'
+     coefficients, because log(Freq × Sev) = log(Freq) + log(Sev).
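+
+     A quick numeric illustration with numpy (coefficients are made up):
+
+     >>> beta_freq, beta_sev = 0.10, 0.25
+     >>> print(np.exp(beta_freq + beta_sev))          # ≈ 1.419
+     >>> print(np.exp(beta_freq) * np.exp(beta_sev))  # same factor, ≈ 1.419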
+
+     Example
+     -------
+     >>> family = rs.families.Gamma()
+     >>> mu = np.array([100.0, 1000.0, 10000.0])
+     >>> print(family.variance(mu))  # [10000, 1000000, 100000000]
+     >>> # Variance grows with the square of the mean
+     """
+     return _GammaFamily()
+
+
+ def QuasiPoisson():
+     """
+     QuasiPoisson family for overdispersed count data.
+
+     Uses the same variance function as Poisson (V(μ) = μ) but estimates
+     the dispersion parameter φ from the data instead of fixing it at 1.
+
+     Properties
+     ----------
+     - Variance function: V(μ) = μ (same as Poisson)
+     - Full variance: Var(Y) = φ × μ, where φ is estimated
+     - Default link: Log (η = log(μ))
+     - Dispersion: φ = Pearson χ² / (n - p), estimated from data
+
+     When to Use
+     -----------
+     - Count data with overdispersion (Pearson χ²/df >> 1)
+     - When you want Poisson-like point estimates but valid standard errors
+     - Insurance claim frequency with extra-Poisson variation
+
+     How It Works
+     ------------
+     Point estimates (coefficients) are IDENTICAL to Poisson. The only
+     difference is how standard errors are computed:
+
+     - Poisson: SE = sqrt(diag((X'WX)⁻¹))
+     - QuasiPoisson: SE = sqrt(φ × diag((X'WX)⁻¹))
+
+     The inflation factor √φ makes confidence intervals wider and p-values
+     more conservative, correctly accounting for overdispersion.
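+
+     A tiny numpy illustration of the inflation (numbers are made up):
+
+     >>> phi = 2.5  # estimated dispersion
+     >>> se_poisson = np.array([0.05, 0.10])
+     >>> print(np.sqrt(phi) * se_poisson)  # ≈ [0.079, 0.158]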
+
+     Detecting Overdispersion
+     ------------------------
+     After fitting a Poisson model, check:
+
+         dispersion = result.pearson_chi2() / result.df_resid
+
+     If dispersion >> 1 (e.g., > 1.5), overdispersion is present.
+
+     Alternatives
+     ------------
+     - Robust standard errors (result.bse_robust("HC1"))
+     - The NegativeBinomial family (defined later in this module)
+
+     Example
+     -------
+     >>> import rustystats as rs
+     >>> # Fit QuasiPoisson when overdispersion is detected
+     >>> result = rs.fit_glm(y, X, family="quasipoisson")
+     >>> print(f"Estimated dispersion: {result.scale():.3f}")
+     >>> print(f"SE (model-based): {result.bse()}")  # Inflated by √φ
+     """
+     return _QuasiPoissonFamily()
+
+
+ def QuasiBinomial():
+     """
+     QuasiBinomial family for overdispersed binary/proportion data.
+
+     Uses the same variance function as Binomial (V(μ) = μ(1-μ)) but
+     estimates the dispersion parameter φ from the data instead of
+     fixing it at 1.
+
+     Properties
+     ----------
+     - Variance function: V(μ) = μ(1-μ) (same as Binomial)
+     - Full variance: Var(Y) = φ × μ(1-μ), where φ is estimated
+     - Default link: Logit (η = log(μ/(1-μ)))
+     - Dispersion: φ = Pearson χ² / (n - p), estimated from data
+
+     When to Use
+     -----------
+     - Binary outcomes with overdispersion
+     - Clustered binary data (observations within clusters are correlated)
+     - When unobserved heterogeneity inflates variance beyond Binomial
+
+     How It Works
+     ------------
+     Point estimates (coefficients, odds ratios) are IDENTICAL to Binomial.
+     The only difference is how standard errors are computed:
+
+     - Binomial: SE = sqrt(diag((X'WX)⁻¹))
+     - QuasiBinomial: SE = sqrt(φ × diag((X'WX)⁻¹))
+
+     The inflation factor √φ makes confidence intervals wider and p-values
+     more conservative, correctly accounting for overdispersion.
+
+     Common Causes of Overdispersion
+     -------------------------------
+     - Clustered/correlated observations
+     - Omitted predictors that affect variance
+     - Non-constant success probability within groups
+
+     Alternatives
+     ------------
+     - Robust standard errors (result.bse_robust("HC1"))
+     - Mixed effects models (not yet implemented)
+
+     Example
+     -------
+     >>> import rustystats as rs
+     >>> # Fit QuasiBinomial when overdispersion is detected
+     >>> result = rs.fit_glm(y, X, family="quasibinomial")
+     >>> print(f"Estimated dispersion: {result.scale():.3f}")
+     >>> print(f"Odds ratios: {np.exp(result.params)}")
+     """
+     return _QuasiBinomialFamily()
+
+
+ def NegativeBinomial(theta=1.0):
+     """
+     Negative Binomial family for overdispersed count data.
+
+     Uses the NB2 parameterization, where variance is quadratic in the mean:
+
+         Var(Y) = μ + μ²/θ
+
+     This is an alternative to QuasiPoisson that models overdispersion
+     explicitly with a proper probability distribution, enabling valid
+     likelihood-based inference.
+
+     Parameters
+     ----------
+     theta : float, optional
+         Dispersion parameter (default: 1.0). Larger θ means less
+         overdispersion (a numeric check follows this list):
+
+         - θ = 0.5: Strong overdispersion (variance = μ + 2μ²)
+         - θ = 1.0: Moderate overdispersion (variance = μ + μ²)
+         - θ = 10: Mild overdispersion (close to Poisson)
+         - θ → ∞: Approaches Poisson
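+
+         A quick numpy check that large θ recovers the Poisson variance:
+
+         >>> mu = np.array([1.0, 2.0, 5.0])
+         >>> print(mu + mu**2 / 1e6)  # θ = 1e6: essentially V(μ) = μ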
379
+
380
+ Properties
381
+ ----------
382
+ - Variance function: V(μ) = μ + μ²/θ (NB2 parameterization)
383
+ - Default link: Log (η = log(μ))
384
+ - True probability distribution with valid likelihood
385
+
386
+ Comparison to QuasiPoisson
387
+ --------------------------
388
+ | Aspect | QuasiPoisson | Negative Binomial |
389
+ |------------------|---------------------|----------------------|
390
+ | Variance | φ × μ | μ + μ²/θ |
391
+ | True distribution| No (quasi) | Yes |
392
+ | Likelihood-based | No | Yes |
393
+ | AIC/BIC valid | Questionable | Yes |
394
+ | Predictions | Point only | Proper intervals |
395
+
396
+ When to Use
397
+ -----------
398
+ - Count data with overdispersion (variance > mean)
399
+ - When you need valid likelihood-based inference (AIC, BIC)
400
+ - When you want proper prediction intervals
401
+ - Claim frequency with extra-Poisson variation
402
+
403
+ Example
404
+ -------
405
+ >>> import rustystats as rs
406
+ >>> # Fit Negative Binomial with θ=1.0
407
+ >>> result = rs.fit_glm(y, X, family="negbinomial", theta=1.0)
408
+ >>>
409
+ >>> # Check the variance function
410
+ >>> family = rs.families.NegativeBinomial(theta=2.0)
411
+ >>> mu = np.array([1.0, 2.0, 4.0])
412
+ >>> print(family.variance(mu)) # [1.5, 4.0, 12.0]
413
+ >>>
414
+ >>> # Variance = μ + μ²/θ = μ + μ²/2
415
+ >>> # V(1) = 1 + 0.5 = 1.5
416
+ >>> # V(2) = 2 + 2 = 4.0
417
+ >>> # V(4) = 4 + 8 = 12.0
418
+ """
419
+ return _NegativeBinomialFamily(theta)
+
+
+ # For backwards compatibility and convenience
+ __all__ = ["Gaussian", "Poisson", "Binomial", "Gamma", "QuasiPoisson", "QuasiBinomial", "NegativeBinomial"]