rustystats 0.1.5__cp313-cp313-manylinux_2_34_x86_64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- rustystats/__init__.py +151 -0
- rustystats/_rustystats.cpython-313-x86_64-linux-gnu.so +0 -0
- rustystats/diagnostics.py +2471 -0
- rustystats/families.py +423 -0
- rustystats/formula.py +1074 -0
- rustystats/glm.py +249 -0
- rustystats/interactions.py +1246 -0
- rustystats/links.py +221 -0
- rustystats/splines.py +367 -0
- rustystats/target_encoding.py +375 -0
- rustystats-0.1.5.dist-info/METADATA +476 -0
- rustystats-0.1.5.dist-info/RECORD +14 -0
- rustystats-0.1.5.dist-info/WHEEL +4 -0
- rustystats-0.1.5.dist-info/licenses/LICENSE +21 -0
rustystats/glm.py
ADDED
|
@@ -0,0 +1,249 @@
|
|
|
1
|
+
"""
|
|
2
|
+
GLM Summary Functions
|
|
3
|
+
=====================
|
|
4
|
+
|
|
5
|
+
This module provides summary formatting functions for GLM results.
|
|
6
|
+
These are used internally by the formula API.
|
|
7
|
+
|
|
8
|
+
Note: The array-based API (fit_glm, GLM class) has been removed.
|
|
9
|
+
Use the formula-based API instead:
|
|
10
|
+
|
|
11
|
+
>>> import rustystats as rs
|
|
12
|
+
>>> result = rs.glm("y ~ x1 + x2 + C(cat)", data, family="poisson").fit()
|
|
13
|
+
>>> print(result.summary())
|
|
14
|
+
"""
|
|
15
|
+
|
|
16
|
+
import numpy as np
|
|
17
|
+
from typing import Optional, List
|
|
18
|
+
|
|
19
|
+
from rustystats._rustystats import GLMResults
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def summary(
    result: GLMResults,
    feature_names: Optional[List[str]] = None,
    title: str = "GLM Results",
    alpha: float = 0.05,
) -> str:
    """
    Generate a summary table for GLM results (statsmodels-style).

    Parameters
    ----------
    result : GLMResults
        Fitted GLM results object.

    feature_names : list of str, optional
        Names for each coefficient. If None, uses x0, x1, x2, ...

    title : str, optional
        Title for the summary table.

    alpha : float, optional
        Significance level for confidence intervals. Default 0.05 (95% CI).

    Returns
    -------
    str
        Formatted summary table.

    Raises
    ------
    ValueError
        If ``feature_names`` is given and its length differs from the number
        of parameters in ``result.params``.
    """
    n_params = len(result.params)

    # Generate feature names if not provided
    if feature_names is None:
        feature_names = [f"x{i}" for i in range(n_params)]
    elif len(feature_names) != n_params:
        raise ValueError(
            f"feature_names has {len(feature_names)} elements but model has {n_params} parameters"
        )

    # Get statistics
    coefs = result.params
    std_errs = result.bse()
    z_vals = result.tvalues()
    p_vals = result.pvalues()
    conf_ints = result.conf_int(alpha)
    sig_codes = result.significance_codes()

    # Get diagnostics. This is deliberately best-effort: if any accessor
    # fails, every diagnostic falls back to a placeholder so the coefficient
    # table can still be rendered.
    try:
        llf = result.llf()
        aic_val = result.aic()
        bic_val = result.bic()
        pearson_chi2 = result.pearson_chi2()
        null_dev = result.null_deviance()
        family_name = result.family
        scale = result.scale()
    except Exception:
        # Fallback if diagnostics not available
        llf = aic_val = bic_val = pearson_chi2 = null_dev = float('nan')
        family_name = "Unknown"
        scale = 1.0

    # Build the table
    lines = []
    lines.append("=" * 78)
    lines.append(title.center(78))
    lines.append("=" * 78)
    lines.append("")

    # Model info - statsmodels style
    lines.append(f"{'Family:':<20} {family_name:<15} {'No. Observations:':<20} {result.nobs:>10}")
    lines.append(f"{'Link Function:':<20} {'(default)':<15} {'Df Residuals:':<20} {result.df_resid:>10}")

    # Show regularization info if applicable. Results objects that do not
    # expose the regularization attributes are treated as unregularized.
    try:
        is_reg = result.is_regularized
        penalty_type = result.penalty_type if is_reg else "none"
    except Exception:
        is_reg = False
        penalty_type = "none"

    if is_reg:
        method = f"IRLS + {penalty_type.title()}"
        lines.append(f"{'Method:':<20} {method:<15} {'Df Model:':<20} {result.df_model:>10}")
        lines.append(f"{'Scale:':<20} {scale:<15.4f} {'Alpha (λ):':<20} {result.alpha:>10.4f}")
        l1_val = result.l1_ratio if result.l1_ratio is not None else 0.0
        lines.append(f"{'L1 Ratio:':<20} {l1_val:<15.2f} {'Iterations:':<20} {result.iterations:>10}")
        # n_nonzero should always be available for regularized models
        n_nonzero = result.n_nonzero()
        lines.append(f"{'Non-zero coefs:':<20} {n_nonzero:<15}")
    else:
        lines.append(f"{'Method:':<20} {'IRLS':<15} {'Df Model:':<20} {result.df_model:>10}")
        lines.append(f"{'Scale:':<20} {scale:<15.4f} {'Iterations:':<20} {result.iterations:>10}")
    lines.append("")

    # Goodness of fit
    lines.append(f"{'Log-Likelihood:':<20} {llf:>15.4f} {'Deviance:':<20} {result.deviance:>15.4f}")
    lines.append(f"{'AIC:':<20} {aic_val:>15.4f} {'Null Deviance:':<20} {null_dev:>15.4f}")
    lines.append(f"{'BIC:':<20} {bic_val:>15.4f} {'Pearson chi2:':<20} {pearson_chi2:>15.2f}")
    lines.append(f"{'Converged:':<20} {str(result.converged):<15}")
    lines.append("")
    lines.append("-" * 78)

    # Calculate dynamic column width for variable names:
    # at least 16 chars, at most 30. ``default=0`` keeps a model with no
    # parameters from raising on an empty sequence.
    max_name_len = max((len(name) for name in feature_names), default=0)
    name_width = min(max(16, max_name_len), 30)

    # Coefficient table header. The ``g`` format avoids truncating the level
    # (alpha=0.025 is labelled "97.5% CI", not "97% CI") and absorbs
    # floating-point noise such as 94.99999999999999.
    ci_label = f"{(1 - alpha) * 100:g}% CI"
    header = f"{'Variable':<{name_width}} {'Coef':>10} {'Std.Err':>10} {'z':>8} {'P>|z|':>8} {ci_label:>22} {'':>4}"
    lines.append(header)
    lines.append("-" * 78)

    # Coefficient rows
    for i, full_name in enumerate(feature_names):
        name = full_name[:name_width]  # Truncate only if exceeds max
        ci_low, ci_high = conf_ints[i]
        p = p_vals[i]

        # Very small p-values are shown as a bound rather than as 0.0000
        p_str = "<0.0001" if p < 0.0001 else f"{p:.4f}"

        ci_str = f"[{ci_low:>8.4f}, {ci_high:>8.4f}]"
        row = (
            f"{name:<{name_width}} {coefs[i]:>10.4f} {std_errs[i]:>10.4f} "
            f"{z_vals[i]:>8.3f} {p_str:>8} {ci_str:>22} {sig_codes[i]:>4}"
        )
        lines.append(row)

    lines.append("-" * 78)
    lines.append("Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1")
    lines.append("=" * 78)

    return "\n".join(lines)
|
|
160
|
+
|
|
161
|
+
|
|
162
|
+
def summary_relativities(
    result: GLMResults,
    feature_names: Optional[List[str]] = None,
    title: str = "GLM Relativities (Log Link)",
    alpha: float = 0.05,
) -> str:
    """
    Generate a summary table showing relativities (exp of coefficients).

    This is appropriate for models with a log link (Poisson, Gamma).
    Relativities show the multiplicative effect of each variable.

    Parameters
    ----------
    result : GLMResults
        Fitted GLM results object (should use log link).

    feature_names : list of str, optional
        Names for each coefficient. If None, uses x0, x1, x2, ...

    title : str, optional
        Title for the summary table.

    alpha : float, optional
        Significance level for confidence intervals.

    Returns
    -------
    str
        Formatted summary table with relativities.

    Raises
    ------
    ValueError
        If ``feature_names`` is given and its length differs from the number
        of parameters in ``result.params``.

    Interpretation
    --------------
    A relativity of 1.15 for "Age 25-35" means that group has 15% higher
    claim frequency than the base level, all else being equal.
    """
    n_params = len(result.params)

    if feature_names is None:
        feature_names = [f"x{i}" for i in range(n_params)]
    elif len(feature_names) != n_params:
        raise ValueError(
            f"feature_names has {len(feature_names)} elements but model has {n_params} parameters"
        )

    coefs = result.params
    conf_ints = result.conf_int(alpha)
    p_vals = result.pvalues()
    sig_codes = result.significance_codes()

    # Exponentiate once up front instead of once per row.
    relativities = np.exp(np.asarray(coefs, dtype=float))
    ci_relativities = np.exp(np.asarray(conf_ints, dtype=float))

    # Build the table
    lines = []
    lines.append("=" * 70)
    lines.append(title.center(70))
    lines.append("=" * 70)
    lines.append("")
    lines.append(f"No. Observations: {result.nobs:>10}     Deviance: {result.deviance:>10.4f}")
    lines.append("")
    lines.append("-" * 70)

    # Dynamic name column: at least 15 chars (the historical fixed width, so
    # short names render exactly as before), at most 30. A hard cut at 15
    # made long terms that differ only in their suffix print identically.
    max_name_len = max((len(name) for name in feature_names), default=0)
    name_width = min(max(15, max_name_len), 30)

    # The ``g`` format avoids truncating the level (alpha=0.025 is labelled
    # "97.5% CI", not "97% CI") and absorbs floating-point noise.
    ci_label = f"{(1 - alpha) * 100:g}% CI"
    header = f"{'Variable':<{name_width}} {'Coef':>10} {'Relativity':>12} {ci_label + ' (Rel)':>24} {'P>|z|':>8}"
    lines.append(header)
    lines.append("-" * 70)

    for i, full_name in enumerate(feature_names):
        name = full_name[:name_width]  # Truncate only if exceeds max
        ci_low_rel, ci_high_rel = ci_relativities[i]
        p = p_vals[i]

        # Very small p-values are shown as a bound rather than as 0.0000
        p_str = "<0.0001" if p < 0.0001 else f"{p:.4f}"

        ci_str = f"[{ci_low_rel:>8.4f}, {ci_high_rel:>8.4f}]"
        row = (
            f"{name:<{name_width}} {coefs[i]:>10.4f} {relativities[i]:>12.4f} "
            f"{ci_str:>24} {p_str:>8} {sig_codes[i]}"
        )
        lines.append(row)

    lines.append("-" * 70)
    lines.append("Relativity = exp(Coef). Values > 1 increase the response.")
    lines.append("=" * 70)

    return "\n".join(lines)
|