rustystats 0.1.5__cp313-cp313-manylinux_2_34_x86_64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- rustystats/__init__.py +151 -0
- rustystats/_rustystats.cpython-313-x86_64-linux-gnu.so +0 -0
- rustystats/diagnostics.py +2471 -0
- rustystats/families.py +423 -0
- rustystats/formula.py +1074 -0
- rustystats/glm.py +249 -0
- rustystats/interactions.py +1246 -0
- rustystats/links.py +221 -0
- rustystats/splines.py +367 -0
- rustystats/target_encoding.py +375 -0
- rustystats-0.1.5.dist-info/METADATA +476 -0
- rustystats-0.1.5.dist-info/RECORD +14 -0
- rustystats-0.1.5.dist-info/WHEEL +4 -0
- rustystats-0.1.5.dist-info/licenses/LICENSE +21 -0
|
@@ -0,0 +1,1246 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Optimized interaction term support for RustyStats.
|
|
3
|
+
|
|
4
|
+
This module provides high-performance interaction term handling for GLMs.
|
|
5
|
+
All heavy computation is done in Rust for maximum speed:
|
|
6
|
+
- Categorical encoding (Rust parallel construction)
|
|
7
|
+
- Interaction terms (Rust parallel for large data)
|
|
8
|
+
- Spline basis functions (Rust with Rayon)
|
|
9
|
+
|
|
10
|
+
The Python layer handles only:
|
|
11
|
+
- Formula parsing (string manipulation)
|
|
12
|
+
- DataFrame column extraction
|
|
13
|
+
- Orchestration of Rust calls
|
|
14
|
+
|
|
15
|
+
Example
|
|
16
|
+
-------
|
|
17
|
+
>>> from rustystats.interactions import InteractionBuilder
|
|
18
|
+
>>>
|
|
19
|
+
>>> builder = InteractionBuilder(data)
|
|
20
|
+
>>> y, X, names = builder.build_design_matrix('y ~ x1*x2 + C(cat) + bs(age, df=5)')
|
|
21
|
+
"""
|
|
22
|
+
|
|
23
|
+
from __future__ import annotations
|
|
24
|
+
|
|
25
|
+
from dataclasses import dataclass, field
|
|
26
|
+
from typing import List, Optional, Tuple, Union, Dict, Set, TYPE_CHECKING
|
|
27
|
+
|
|
28
|
+
import numpy as np
|
|
29
|
+
|
|
30
|
+
# Import Rust implementations for heavy computation
|
|
31
|
+
from rustystats._rustystats import (
|
|
32
|
+
encode_categorical_py as _encode_categorical_rust,
|
|
33
|
+
build_cat_cat_interaction_py as _build_cat_cat_rust,
|
|
34
|
+
build_cat_cont_interaction_py as _build_cat_cont_rust,
|
|
35
|
+
build_cont_cont_interaction_py as _build_cont_cont_rust,
|
|
36
|
+
multiply_matrix_by_continuous_py as _multiply_matrix_cont_rust,
|
|
37
|
+
parse_formula_py as _parse_formula_rust,
|
|
38
|
+
target_encode_py as _target_encode_rust,
|
|
39
|
+
)
|
|
40
|
+
|
|
41
|
+
if TYPE_CHECKING:
|
|
42
|
+
import polars as pl
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
@dataclass
class InteractionTerm:
    """One interaction term of a formula, e.g. ``x1:x2`` or ``C(cat1):x2``.

    Attributes
    ----------
    factors : list of str
        Variables involved in the term, e.g. ``['x1', 'x2']`` or
        ``['cat1', 'x2']``.
    categorical_flags : list of bool
        Marks which of the factors are categorical.
    """

    factors: List[str]
    categorical_flags: List[bool]

    @property
    def order(self) -> int:
        """Number of factors (2 for pairwise, 3 for three-way, ...)."""
        n_factors = len(self.factors)
        return n_factors

    @property
    def is_pure_continuous(self) -> bool:
        """Whether no factor is flagged as categorical."""
        return all(not flag for flag in self.categorical_flags)

    @property
    def is_pure_categorical(self) -> bool:
        """Whether every factor is flagged as categorical."""
        return not any(not flag for flag in self.categorical_flags)

    @property
    def is_mixed(self) -> bool:
        """Whether the term has both categorical and continuous factors."""
        # De Morgan form of "some flag set, but not all of them".
        return not (self.is_pure_continuous or self.is_pure_categorical)
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
# Import SplineTerm from splines module (canonical implementation)
|
|
74
|
+
from rustystats.splines import SplineTerm
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
@dataclass
class CategoricalEncoding:
    """Bundle of everything cached about one encoded categorical variable.

    Attributes
    ----------
    encoding : np.ndarray
        Dummy matrix; documented as having shape (n, k-1).
    names : list of str
        Column names of the dummy matrix, e.g. ``['var[T.B]', 'var[T.C]']``.
    indices : np.ndarray
        Per-row level indices; documented as shape (n,), int32.
    levels : list of str
        All levels of the categorical variable.
    """

    encoding: np.ndarray
    names: List[str]
    indices: np.ndarray
    levels: List[str]
|
|
84
|
+
|
|
85
|
+
|
|
86
|
+
@dataclass
class TargetEncodingTermSpec:
    """Specification of one target-encoding term parsed from a formula.

    Attributes
    ----------
    var_name : str
        Name of the variable to be target-encoded.
    prior_weight : float, default=1.0
        Prior weight forwarded to the target-encoding routine.
    n_permutations : int, default=4
        Number of permutations forwarded to the target-encoding routine.
    """

    var_name: str
    prior_weight: float = 1.0
    n_permutations: int = 4
|
|
92
|
+
|
|
93
|
+
|
|
94
|
+
@dataclass
class IdentityTermSpec:
    """Specification of one ``I()`` identity term parsed from a formula.

    Attributes
    ----------
    expression : str
        Raw text found inside ``I(...)``, e.g. ``"x ** 2"`` or ``"x + y"``.
    """

    expression: str
|
|
98
|
+
|
|
99
|
+
|
|
100
|
+
@dataclass
class ParsedFormula:
    """Structured result of parsing a model formula.

    Attributes
    ----------
    response : str
        Name of the response variable.
    main_effects : list of str
        Main-effect variables.
    interactions : list of InteractionTerm
        Interaction terms.
    categorical_vars : set of str
        Variables marked as categorical with ``C()``.
    spline_terms : list of SplineTerm
        Spline terms; defaults to an empty list.
    target_encoding_terms : list of TargetEncodingTermSpec
        ``TE()`` terms; defaults to an empty list.
    identity_terms : list of IdentityTermSpec
        ``I()`` terms; defaults to an empty list.
    has_intercept : bool, default=True
        Whether the formula keeps the intercept.
    """

    response: str
    main_effects: List[str]
    interactions: List[InteractionTerm]
    categorical_vars: Set[str]
    # Each instance gets its own list: default_factory avoids shared mutable defaults.
    spline_terms: List[SplineTerm] = field(default_factory=list)
    target_encoding_terms: List[TargetEncodingTermSpec] = field(default_factory=list)
    identity_terms: List[IdentityTermSpec] = field(default_factory=list)
    has_intercept: bool = True
|
|
112
|
+
|
|
113
|
+
|
|
114
|
+
def parse_formula_interactions(formula: str) -> ParsedFormula:
    """
    Parse a formula string into a ``ParsedFormula``.

    The string is handed to the Rust parser (``_parse_formula_rust``) and the
    mapping it returns is converted into the Python dataclasses of this
    module. The parser is documented as handling:
    - Main effects: x1, x2, C(cat)
    - Two-way interactions: x1:x2, x1*x2, C(cat):x
    - Higher-order: x1:x2:x3
    - Intercept removal: 0 + ... or -1
    - Spline terms: bs(x, df=5), ns(x, df=4)

    Parameters
    ----------
    formula : str
        R-style formula like "y ~ x1*x2 + C(cat) + bs(age, df=5)"

    Returns
    -------
    ParsedFormula
        Parsed structure with all terms identified
    """
    raw = _parse_formula_rust(formula)

    interactions = []
    for item in raw['interactions']:
        interactions.append(
            InteractionTerm(
                factors=item['factors'],
                categorical_flags=item['categorical_flags'],
            )
        )

    spline_terms = []
    for item in raw['spline_terms']:
        spline_terms.append(
            SplineTerm(
                var_name=item['var_name'],
                spline_type=item['spline_type'],
                df=item['df'],
                degree=item['degree'],
            )
        )

    # The two keys below are read with .get(), so they may be absent.
    target_encoding_terms = []
    for item in raw.get('target_encoding_terms', []):
        target_encoding_terms.append(
            TargetEncodingTermSpec(
                var_name=item['var_name'],
                prior_weight=item['prior_weight'],
                n_permutations=item['n_permutations'],
            )
        )

    identity_terms = []
    for item in raw.get('identity_terms', []):
        identity_terms.append(IdentityTermSpec(expression=item['expression']))

    # A literal "1" is only an explicit intercept marker, not a variable.
    main_effects = []
    for effect in raw['main_effects']:
        if effect != '1':
            main_effects.append(effect)

    return ParsedFormula(
        response=raw['response'],
        main_effects=main_effects,
        interactions=interactions,
        categorical_vars=set(raw['categorical_vars']),
        spline_terms=spline_terms,
        target_encoding_terms=target_encoding_terms,
        identity_terms=identity_terms,
        has_intercept=raw['has_intercept'],
    )
|
|
186
|
+
|
|
187
|
+
|
|
188
|
+
class InteractionBuilder:
|
|
189
|
+
"""
|
|
190
|
+
Efficiently builds design matrices with interaction terms.
|
|
191
|
+
|
|
192
|
+
Optimizations:
|
|
193
|
+
1. Continuous × Continuous: Single vectorized multiplication
|
|
194
|
+
2. Categorical × Continuous: Sparse-aware dummy encoding
|
|
195
|
+
3. Categorical × Categorical: Direct index-based construction
|
|
196
|
+
|
|
197
|
+
Parameters
|
|
198
|
+
----------
|
|
199
|
+
data : pl.DataFrame
|
|
200
|
+
Polars DataFrame
|
|
201
|
+
dtype : numpy dtype, default=np.float64
|
|
202
|
+
Data type for output arrays
|
|
203
|
+
|
|
204
|
+
Example
|
|
205
|
+
-------
|
|
206
|
+
>>> builder = InteractionBuilder(df)
|
|
207
|
+
>>> y, X, names = builder.build_design_matrix('y ~ x1*x2 + C(area):age')
|
|
208
|
+
"""
|
|
209
|
+
|
|
210
|
+
def __init__(
|
|
211
|
+
self,
|
|
212
|
+
data: "pl.DataFrame",
|
|
213
|
+
dtype: np.dtype = np.float64,
|
|
214
|
+
):
|
|
215
|
+
self.data = data
|
|
216
|
+
self.dtype = dtype
|
|
217
|
+
self._n = len(data)
|
|
218
|
+
|
|
219
|
+
# Consolidated cache for categorical encodings (keyed by "varname_dropfirst")
|
|
220
|
+
self._cat_encoding_cache: Dict[str, CategoricalEncoding] = {}
|
|
221
|
+
# Store spline terms with fitted knots for prediction
|
|
222
|
+
self._fitted_splines: Dict[str, SplineTerm] = {}
|
|
223
|
+
# Store parsed formula for prediction
|
|
224
|
+
self._parsed_formula: Optional[ParsedFormula] = None
|
|
225
|
+
|
|
226
|
+
def _parse_spline_factor(self, factor: str) -> Optional[SplineTerm]:
    """
    Parse a spline factor such as 'bs(VehAge, df=4)' or 'ns(age, df=3)'.

    Surrounding whitespace is ignored and the ``bs``/``ns`` prefix is matched
    case-insensitively. The first comma-separated argument is the variable
    name; ``df=`` and ``degree=`` keyword arguments override the defaults
    (df=4, degree=3). Other arguments are ignored.

    Parameters
    ----------
    factor : str
        Factor text taken from an interaction term.

    Returns
    -------
    SplineTerm or None
        The parsed spline term, or ``None`` when ``factor`` does not start
        with ``bs(`` or ``ns(``.

    Raises
    ------
    ValueError
        If a ``df`` or ``degree`` value is not an integer literal.
    """
    # Slice the *stripped* text: the prefix test ignores padding, so the
    # offsets below must be computed on the same string.
    text = factor.strip()
    lowered = text.lower()
    if not lowered.startswith(('bs(', 'ns(')):
        return None

    spline_type = lowered[:2]
    inner = text[3:-1] if text.endswith(')') else text[3:]
    var_name, *options = [piece.strip() for piece in inner.split(',')]

    df = 4  # default
    degree = 3  # default for B-splines
    for option in options:
        if '=' not in option:
            continue
        key, val = option.split('=', 1)
        key = key.strip().lower()
        val = val.strip()
        if key == 'df':
            df = int(val)
        elif key == 'degree':
            degree = int(val)

    return SplineTerm(var_name=var_name, spline_type=spline_type, df=df, degree=degree)
|
|
248
|
+
|
|
249
|
+
def _get_column(self, name: str) -> np.ndarray:
|
|
250
|
+
"""Extract column as numpy array."""
|
|
251
|
+
return self.data[name].to_numpy().astype(self.dtype)
|
|
252
|
+
|
|
253
|
+
def _get_categorical_indices(self, name: str) -> Tuple[np.ndarray, List[str]]:
|
|
254
|
+
"""Get cached categorical indices and levels for a variable."""
|
|
255
|
+
cache_key = f"{name}_True" # Always use drop_first=True for indices
|
|
256
|
+
if cache_key not in self._cat_encoding_cache:
|
|
257
|
+
self._get_categorical_encoding(name) # Populate cache
|
|
258
|
+
cached = self._cat_encoding_cache[cache_key]
|
|
259
|
+
return cached.indices, cached.levels
|
|
260
|
+
|
|
261
|
+
def _get_categorical_levels(self, name: str) -> List[str]:
|
|
262
|
+
"""Get cached categorical levels for a variable."""
|
|
263
|
+
cache_key = f"{name}_True"
|
|
264
|
+
if cache_key not in self._cat_encoding_cache:
|
|
265
|
+
raise ValueError(f"Categorical variable '{name}' was not seen during training.")
|
|
266
|
+
return self._cat_encoding_cache[cache_key].levels
|
|
267
|
+
|
|
268
|
+
def _get_categorical_encoding(
|
|
269
|
+
self,
|
|
270
|
+
name: str,
|
|
271
|
+
drop_first: bool = True
|
|
272
|
+
) -> Tuple[np.ndarray, List[str]]:
|
|
273
|
+
"""
|
|
274
|
+
Get dummy encoding for a categorical variable.
|
|
275
|
+
|
|
276
|
+
Uses Rust for factorization and parallel matrix construction.
|
|
277
|
+
Pure Rust implementation.
|
|
278
|
+
|
|
279
|
+
Returns
|
|
280
|
+
-------
|
|
281
|
+
encoding : np.ndarray
|
|
282
|
+
(n, k-1) dummy matrix where k is number of levels
|
|
283
|
+
names : list[str]
|
|
284
|
+
Column names like ['var[T.B]', 'var[T.C]', ...]
|
|
285
|
+
"""
|
|
286
|
+
cache_key = f"{name}_{drop_first}"
|
|
287
|
+
if cache_key in self._cat_encoding_cache:
|
|
288
|
+
cached = self._cat_encoding_cache[cache_key]
|
|
289
|
+
return cached.encoding, cached.names
|
|
290
|
+
|
|
291
|
+
col = self.data[name].to_numpy()
|
|
292
|
+
|
|
293
|
+
# Convert to string list for Rust factorization
|
|
294
|
+
values = [str(v) for v in col]
|
|
295
|
+
|
|
296
|
+
# Use Rust for factorization + matrix construction
|
|
297
|
+
encoding, names, indices, levels = _encode_categorical_rust(values, name, drop_first)
|
|
298
|
+
|
|
299
|
+
# Cache all encoding data in a single consolidated object
|
|
300
|
+
self._cat_encoding_cache[cache_key] = CategoricalEncoding(
|
|
301
|
+
encoding=encoding,
|
|
302
|
+
names=names,
|
|
303
|
+
indices=np.array(indices, dtype=np.int32),
|
|
304
|
+
levels=levels,
|
|
305
|
+
)
|
|
306
|
+
|
|
307
|
+
return encoding, names
|
|
308
|
+
|
|
309
|
+
def build_interaction_columns(
|
|
310
|
+
self,
|
|
311
|
+
interaction: InteractionTerm,
|
|
312
|
+
) -> Tuple[np.ndarray, List[str]]:
|
|
313
|
+
"""
|
|
314
|
+
Build columns for a single interaction term.
|
|
315
|
+
|
|
316
|
+
Optimized for different interaction types:
|
|
317
|
+
- Pure continuous: Single O(n) element-wise multiply
|
|
318
|
+
- Mixed: Broadcast multiply continuous with each dummy column
|
|
319
|
+
- Pure categorical: Sparse index-based construction
|
|
320
|
+
|
|
321
|
+
Returns
|
|
322
|
+
-------
|
|
323
|
+
columns : np.ndarray
|
|
324
|
+
(n, k) interaction columns
|
|
325
|
+
names : list[str]
|
|
326
|
+
Column names
|
|
327
|
+
"""
|
|
328
|
+
if interaction.is_pure_continuous:
|
|
329
|
+
return self._build_continuous_interaction(interaction)
|
|
330
|
+
elif interaction.is_pure_categorical:
|
|
331
|
+
return self._build_categorical_interaction(interaction)
|
|
332
|
+
else:
|
|
333
|
+
return self._build_mixed_interaction(interaction)
|
|
334
|
+
|
|
335
|
+
def _build_continuous_interaction(
    self,
    interaction: InteractionTerm
) -> Tuple[np.ndarray, List[str]]:
    """
    Build a continuous x continuous interaction with the Rust helper.

    Starting from the first factor's column, each further factor is folded in
    by one ``_build_cont_cont_rust`` call, which returns the running result
    and its running name. A two-factor term is therefore a single call.

    Returns
    -------
    columns : np.ndarray
        The final result reshaped to a single column, (n, 1).
    names : list[str]
        One-element list holding the final name.
    """
    factors = interaction.factors

    running = self._get_column(factors[0])
    label = factors[0]
    for next_factor in factors[1:]:
        next_column = self._get_column(next_factor)
        running, label = _build_cont_cont_rust(running, next_column, label, next_factor)

    return running.reshape(-1, 1), [label]
|
|
358
|
+
|
|
359
|
+
def _build_categorical_interaction(
|
|
360
|
+
self,
|
|
361
|
+
interaction: InteractionTerm
|
|
362
|
+
) -> Tuple[np.ndarray, List[str]]:
|
|
363
|
+
"""
|
|
364
|
+
Build categorical × categorical interaction efficiently.
|
|
365
|
+
|
|
366
|
+
Uses index-based construction instead of materializing full matrices.
|
|
367
|
+
"""
|
|
368
|
+
# Get encodings for each categorical factor
|
|
369
|
+
encodings = []
|
|
370
|
+
all_names = []
|
|
371
|
+
|
|
372
|
+
for factor in interaction.factors:
|
|
373
|
+
enc, names = self._get_categorical_encoding(factor)
|
|
374
|
+
encodings.append(enc)
|
|
375
|
+
all_names.append(names)
|
|
376
|
+
|
|
377
|
+
if len(interaction.factors) == 2:
|
|
378
|
+
# Optimized 2-way interaction
|
|
379
|
+
return self._build_2way_categorical(encodings, all_names, interaction.factors)
|
|
380
|
+
else:
|
|
381
|
+
# General n-way interaction (slower)
|
|
382
|
+
return self._build_nway_categorical(encodings, all_names, interaction.factors)
|
|
383
|
+
|
|
384
|
+
def _build_2way_categorical(
|
|
385
|
+
self,
|
|
386
|
+
encodings: List[np.ndarray],
|
|
387
|
+
all_names: List[List[str]],
|
|
388
|
+
factors: List[str],
|
|
389
|
+
) -> Tuple[np.ndarray, List[str]]:
|
|
390
|
+
"""
|
|
391
|
+
Optimized 2-way categorical interaction using index-based construction.
|
|
392
|
+
|
|
393
|
+
Instead of multiplying dense matrices, we use the fact that for any row,
|
|
394
|
+
at most one column in each encoding is 1. So the interaction column
|
|
395
|
+
corresponding to (level_i, level_j) is 1 only if both encodings are 1.
|
|
396
|
+
"""
|
|
397
|
+
# Get original indices (from cache or compute via encoding)
|
|
398
|
+
cat1, cat2 = factors
|
|
399
|
+
|
|
400
|
+
# Get indices and levels using consolidated cache
|
|
401
|
+
idx1, levels1 = self._get_categorical_indices(cat1)
|
|
402
|
+
idx2, levels2 = self._get_categorical_indices(cat2)
|
|
403
|
+
|
|
404
|
+
# Number of non-reference levels
|
|
405
|
+
n1 = len(levels1) - 1
|
|
406
|
+
n2 = len(levels2) - 1
|
|
407
|
+
|
|
408
|
+
if n1 * n2 == 0:
|
|
409
|
+
return np.zeros((self._n, 0), dtype=self.dtype), []
|
|
410
|
+
|
|
411
|
+
# Use Rust for fast parallel construction
|
|
412
|
+
names1, names2 = all_names
|
|
413
|
+
result, col_names = _build_cat_cat_rust(
|
|
414
|
+
idx1.astype(np.int32), n1,
|
|
415
|
+
idx2.astype(np.int32), n2,
|
|
416
|
+
list(names1), list(names2)
|
|
417
|
+
)
|
|
418
|
+
|
|
419
|
+
return result, col_names
|
|
420
|
+
|
|
421
|
+
def _build_nway_categorical(
|
|
422
|
+
self,
|
|
423
|
+
encodings: List[np.ndarray],
|
|
424
|
+
all_names: List[List[str]],
|
|
425
|
+
factors: List[str],
|
|
426
|
+
) -> Tuple[np.ndarray, List[str]]:
|
|
427
|
+
"""
|
|
428
|
+
General n-way categorical interaction using recursive 2-way Rust calls.
|
|
429
|
+
|
|
430
|
+
For 3+ way interactions, we recursively combine pairs using the
|
|
431
|
+
optimized 2-way Rust implementation.
|
|
432
|
+
"""
|
|
433
|
+
if len(factors) == 2:
|
|
434
|
+
# Base case - use optimized 2-way
|
|
435
|
+
return self._build_2way_categorical(encodings, all_names, factors)
|
|
436
|
+
|
|
437
|
+
# Recursive case: combine first two factors, then combine with rest
|
|
438
|
+
# Build first two factors' interaction
|
|
439
|
+
first_two_enc = encodings[:2]
|
|
440
|
+
first_two_names = all_names[:2]
|
|
441
|
+
first_two_factors = factors[:2]
|
|
442
|
+
|
|
443
|
+
combined, combined_names = self._build_2way_categorical(
|
|
444
|
+
first_two_enc, first_two_names, first_two_factors
|
|
445
|
+
)
|
|
446
|
+
|
|
447
|
+
# Recursively combine with remaining factors
|
|
448
|
+
remaining_enc = [combined] + encodings[2:]
|
|
449
|
+
remaining_names = [combined_names] + all_names[2:]
|
|
450
|
+
remaining_factors = [f"{first_two_factors[0]}:{first_two_factors[1]}"] + factors[2:]
|
|
451
|
+
|
|
452
|
+
return self._build_nway_categorical(remaining_enc, remaining_names, remaining_factors)
|
|
453
|
+
|
|
454
|
+
def _build_mixed_interaction(
|
|
455
|
+
self,
|
|
456
|
+
interaction: InteractionTerm
|
|
457
|
+
) -> Tuple[np.ndarray, List[str]]:
|
|
458
|
+
"""Build categorical × continuous interaction using Rust."""
|
|
459
|
+
# Separate categorical and continuous factors
|
|
460
|
+
cat_factors = []
|
|
461
|
+
cont_factors = []
|
|
462
|
+
spline_factors = [] # Spline terms need special handling
|
|
463
|
+
|
|
464
|
+
for factor, is_cat in zip(interaction.factors, interaction.categorical_flags):
|
|
465
|
+
if is_cat:
|
|
466
|
+
cat_factors.append(factor)
|
|
467
|
+
else:
|
|
468
|
+
# Check if this is a spline term
|
|
469
|
+
spline = self._parse_spline_factor(factor)
|
|
470
|
+
if spline is not None:
|
|
471
|
+
spline_factors.append((factor, spline))
|
|
472
|
+
else:
|
|
473
|
+
cont_factors.append(factor)
|
|
474
|
+
|
|
475
|
+
# Build categorical encoding first
|
|
476
|
+
if len(cat_factors) == 1:
|
|
477
|
+
cat_name = cat_factors[0]
|
|
478
|
+
cat_encoding, cat_names = self._get_categorical_encoding(cat_name)
|
|
479
|
+
else:
|
|
480
|
+
cat_interaction = InteractionTerm(
|
|
481
|
+
factors=cat_factors,
|
|
482
|
+
categorical_flags=[True] * len(cat_factors)
|
|
483
|
+
)
|
|
484
|
+
cat_encoding, cat_names = self._build_categorical_interaction(cat_interaction)
|
|
485
|
+
|
|
486
|
+
if cat_encoding.shape[1] == 0:
|
|
487
|
+
return np.zeros((self._n, 0), dtype=self.dtype), []
|
|
488
|
+
|
|
489
|
+
# Handle spline × categorical interactions
|
|
490
|
+
if spline_factors:
|
|
491
|
+
# Build spline basis for each spline factor
|
|
492
|
+
all_columns = []
|
|
493
|
+
all_names = []
|
|
494
|
+
|
|
495
|
+
for spline_str, spline in spline_factors:
|
|
496
|
+
x = self._get_column(spline.var_name)
|
|
497
|
+
spline_basis, spline_names = spline.transform(x)
|
|
498
|
+
# Store fitted spline for prediction
|
|
499
|
+
self._fitted_splines[spline.var_name] = spline
|
|
500
|
+
|
|
501
|
+
# Multiply each spline column by each categorical column
|
|
502
|
+
for j, spl_name in enumerate(spline_names):
|
|
503
|
+
for i, cat_name in enumerate(cat_names):
|
|
504
|
+
col = cat_encoding[:, i] * spline_basis[:, j]
|
|
505
|
+
all_columns.append(col)
|
|
506
|
+
all_names.append(f"{cat_name}:{spl_name}")
|
|
507
|
+
|
|
508
|
+
# Also include any regular continuous factors
|
|
509
|
+
if cont_factors:
|
|
510
|
+
cont_product = self._get_column(cont_factors[0])
|
|
511
|
+
for factor in cont_factors[1:]:
|
|
512
|
+
cont_product = cont_product * self._get_column(factor)
|
|
513
|
+
cont_name = ':'.join(cont_factors)
|
|
514
|
+
|
|
515
|
+
# Multiply by continuous
|
|
516
|
+
final_columns = []
|
|
517
|
+
final_names = []
|
|
518
|
+
for col, name in zip(all_columns, all_names):
|
|
519
|
+
final_columns.append(col * cont_product)
|
|
520
|
+
final_names.append(f"{name}:{cont_name}")
|
|
521
|
+
all_columns = final_columns
|
|
522
|
+
all_names = final_names
|
|
523
|
+
|
|
524
|
+
if all_columns:
|
|
525
|
+
return np.column_stack(all_columns), all_names
|
|
526
|
+
return np.zeros((self._n, 0), dtype=self.dtype), []
|
|
527
|
+
|
|
528
|
+
# Standard continuous × categorical (no splines)
|
|
529
|
+
cont_product = self._get_column(cont_factors[0])
|
|
530
|
+
for factor in cont_factors[1:]:
|
|
531
|
+
cont_product = cont_product * self._get_column(factor)
|
|
532
|
+
cont_name = ':'.join(cont_factors)
|
|
533
|
+
|
|
534
|
+
# Build categorical part and use Rust for interaction
|
|
535
|
+
if len(cat_factors) == 1:
|
|
536
|
+
# Single categorical - use Rust directly
|
|
537
|
+
cat_name = cat_factors[0]
|
|
538
|
+
|
|
539
|
+
# Get indices and levels using consolidated cache
|
|
540
|
+
cat_indices, levels = self._get_categorical_indices(cat_name)
|
|
541
|
+
n_levels = len(levels) - 1 # Excluding reference
|
|
542
|
+
|
|
543
|
+
if n_levels == 0:
|
|
544
|
+
return np.zeros((self._n, 0), dtype=self.dtype), []
|
|
545
|
+
|
|
546
|
+
# Get category names from encoding
|
|
547
|
+
_, cat_names = self._get_categorical_encoding(cat_name)
|
|
548
|
+
|
|
549
|
+
# Use Rust for fast parallel construction
|
|
550
|
+
result, col_names = _build_cat_cont_rust(
|
|
551
|
+
cat_indices.astype(np.int32),
|
|
552
|
+
n_levels,
|
|
553
|
+
cont_product.astype(np.float64),
|
|
554
|
+
list(cat_names),
|
|
555
|
+
cont_name
|
|
556
|
+
)
|
|
557
|
+
return result, col_names
|
|
558
|
+
else:
|
|
559
|
+
# Multiple categorical - build their interaction first, then multiply using Rust
|
|
560
|
+
cat_interaction = InteractionTerm(
|
|
561
|
+
factors=cat_factors,
|
|
562
|
+
categorical_flags=[True] * len(cat_factors)
|
|
563
|
+
)
|
|
564
|
+
cat_encoding, cat_names = self._build_categorical_interaction(cat_interaction)
|
|
565
|
+
|
|
566
|
+
# Use Rust to multiply categorical matrix by continuous
|
|
567
|
+
result, col_names = _multiply_matrix_cont_rust(
|
|
568
|
+
cat_encoding.astype(np.float64),
|
|
569
|
+
cont_product.astype(np.float64),
|
|
570
|
+
list(cat_names),
|
|
571
|
+
cont_name
|
|
572
|
+
)
|
|
573
|
+
return result, col_names
|
|
574
|
+
|
|
575
|
+
def _build_spline_columns(
|
|
576
|
+
self,
|
|
577
|
+
spline: SplineTerm,
|
|
578
|
+
) -> Tuple[np.ndarray, List[str]]:
|
|
579
|
+
"""
|
|
580
|
+
Build columns for a spline term.
|
|
581
|
+
|
|
582
|
+
Uses SplineTerm.transform() which calls the fast Rust implementation.
|
|
583
|
+
"""
|
|
584
|
+
x = self._get_column(spline.var_name)
|
|
585
|
+
return spline.transform(x)
|
|
586
|
+
|
|
587
|
+
def _build_target_encoding_columns(
    self,
    te_term: TargetEncodingTermSpec,
    target: np.ndarray,
    seed: Optional[int] = None,
    exposure: Optional[np.ndarray] = None,
) -> Tuple[np.ndarray, str, dict]:
    """
    Build the target-encoded column for one categorical variable.

    The category values are converted to strings and encoded by the Rust
    routine ``_target_encode_rust``. When ``exposure`` is given, the routine
    receives the rate ``target / exposure`` instead of the raw target; the
    exposure is floored at 1e-10 to avoid division by zero. The original
    documentation states that ordered target statistics are used to prevent
    target leakage; that logic lives in the Rust routine and is not visible
    here.

    Parameters
    ----------
    te_term : TargetEncodingTermSpec
        Target encoding term specification
    target : np.ndarray
        Target variable values (e.g., ClaimCount)
    seed : int, optional
        Random seed forwarded to the Rust routine
    exposure : np.ndarray, optional
        Exposure values; switches the encoding target to a rate.

    Returns
    -------
    encoded : np.ndarray
        Target-encoded values as returned by the Rust routine
    name : str
        Column name like "TE(brand)"
    stats : dict
        Keys 'prior', 'stats', 'prior_weight' and 'used_rate_encoding',
        kept for prediction on new data
    """
    raw_categories = self.data[te_term.var_name].to_numpy()
    categories = list(map(str, raw_categories))

    use_rate = exposure is not None
    if use_rate:
        # Rate instead of raw counts; the floor guards against zero exposure.
        encoding_target = (target / np.maximum(exposure, 1e-10)).astype(np.float64)
    else:
        encoding_target = target.astype(np.float64)

    encoded, name, prior, stats = _target_encode_rust(
        categories,
        encoding_target,
        te_term.var_name,
        te_term.prior_weight,
        te_term.n_permutations,
        seed,
    )

    # Record whether rate encoding was used so prediction can mirror it.
    info = {
        'prior': prior,
        'stats': stats,
        'prior_weight': te_term.prior_weight,
        'used_rate_encoding': use_rate,
    }
    return encoded, name, info
|
|
650
|
+
|
|
651
|
+
def _build_identity_columns(
    self,
    identity: IdentityTermSpec,
    data: "pl.DataFrame",
) -> Tuple[np.ndarray, str]:
    """
    Build the column for an identity term (``I()`` expression).

    The expression is converted with ``_convert_expression_to_polars``,
    evaluated against ``data`` and cast to ``self.dtype``.

    Parameters
    ----------
    identity : IdentityTermSpec
        Identity term specification with the expression
    data : pl.DataFrame
        DataFrame containing the columns referenced in the expression

    Returns
    -------
    values : np.ndarray
        Evaluated expression values
    name : str
        Column name like "I(x ** 2)"

    Raises
    ------
    ValueError
        If converting or evaluating the expression fails for any reason; the
        original exception is chained as the cause.
    """
    # Imported outside the try block, so a missing polars installation
    # surfaces as ImportError rather than the ValueError below.
    import polars as pl  # noqa: F401

    expr = identity.expression

    try:
        polars_expr = self._convert_expression_to_polars(expr)
        frame = data.select(polars_expr.alias("__result__"))
        values = frame["__result__"].to_numpy().astype(self.dtype)
    except Exception as e:
        raise ValueError(
            f"Failed to evaluate I() expression '{expr}': {e}\n"
            f"Supported operations: +, -, *, /, ** (power)\n"
            f"Example: I(x ** 2), I(x + y), I(x * y)"
        ) from e

    return values, f"I({expr})"
|
|
694
|
+
|
|
695
|
+
def _convert_expression_to_polars(self, expr: str) -> "pl.Expr":
    """
    Convert a simple Python-style expression to a Polars expression.

    Exactly one operator between two operands is supported, or a single bare
    column name:
    - x ** 2  -> col("x").pow(2.0)
    - x + y   -> col("x") + col("y")
    - x * y   -> col("x") * col("y")
    - x / y   -> col("x") / col("y")
    - x - y   -> col("x") - col("y")
    - 2 * x   -> lit(2.0) * col("x")
    - x       -> col("x")

    An operand is a numeric literal only when it starts with a digit and
    parses as a float; every other operand is a column name. Previously a
    number on the left (``2 * x``, ``1 / x``) was looked up as a column, and
    column names such as ``inf`` or ``nan`` on the right became literals.

    Parameters
    ----------
    expr : str
        Expression text taken from inside ``I(...)``.

    Returns
    -------
    pl.Expr
        The equivalent Polars expression.

    Raises
    ------
    ValueError
        If the expression has an unsupported shape or references no column.
    """
    import operator
    import re

    import polars as pl

    def as_number(token: str) -> Optional[float]:
        """Return the float value of a digit-led numeric token, else None."""
        if token[:1].isdigit():
            try:
                return float(token)
            except ValueError:
                return None
        return None

    def operand(token: str) -> "pl.Expr":
        """Literal for numeric tokens, column reference for everything else."""
        value = as_number(token)
        return pl.col(token) if value is None else pl.lit(value)

    def require_column(left_token: str, right_token: str) -> None:
        """Reject constant-only expressions, which would yield a single row."""
        if as_number(left_token) is not None and as_number(right_token) is not None:
            raise ValueError(
                f"Expression '{expr}' must reference at least one column"
            )

    expr = expr.strip()
    # Decimal literal first, so "2.5" is one token rather than "2" + ".5".
    token = r'\d+(?:\.\d+)?|\w+'

    # Power operator: operand ** operand
    power_match = re.match(rf'^({token})\s*\*\*\s*({token})$', expr)
    if power_match:
        base_token, exponent_token = power_match.groups()
        require_column(base_token, exponent_token)
        exponent = as_number(exponent_token)
        if exponent is None:
            return operand(base_token).pow(pl.col(exponent_token))
        return operand(base_token).pow(exponent)

    # Binary arithmetic: operand (+|-|*|/) operand
    binary_ops = {
        '+': operator.add,
        '-': operator.sub,
        '*': operator.mul,
        '/': operator.truediv,
    }
    binary_match = re.match(rf'^({token})\s*([-+*/])\s*({token})$', expr)
    if binary_match:
        left_token, symbol, right_token = binary_match.groups()
        require_column(left_token, right_token)
        return binary_ops[symbol](operand(left_token), operand(right_token))

    # Bare column reference, e.g. I(x)
    if re.match(r'^\w+$', expr):
        return pl.col(expr)

    raise ValueError(
        f"Cannot parse expression '{expr}'. "
        f"Supported formats: 'x ** 2', 'x + y', 'x * y', 'x / y', 'x - y'"
    )
|
|
754
|
+
|
|
755
|
+
def build_design_matrix(
    self,
    formula: str,
    exposure: Optional[np.ndarray] = None,
) -> Tuple[np.ndarray, np.ndarray, List[str]]:
    """
    Assemble the response vector and design matrix described by a formula.

    Column blocks are appended in a fixed order: intercept, main effects,
    spline terms, interactions, target-encoded terms and finally I()
    identity terms. ``transform_new_data`` walks the terms in the same
    order, so the two methods must be kept in sync.

    Side effects: updates ``self._fitted_splines``, ``self._parsed_formula``,
    ``self._te_stats``, ``self._last_X`` and ``self._last_names``.

    Parameters
    ----------
    formula : str
        R-style formula like "y ~ x1*x2 + C(cat) + bs(age, df=5)"
    exposure : np.ndarray, optional
        Exposure values, forwarded to the target-encoding builder.

    Returns
    -------
    y : np.ndarray
        Response variable
    X : np.ndarray
        Design matrix
    names : list[str]
        Column names
    """
    spec = parse_formula_interactions(formula)

    blocks = []
    labels = []

    # Intercept is kept 1-D here; it is promoted to a column when stacking.
    if spec.has_intercept:
        blocks.append(np.ones(self._n, dtype=self.dtype))
        labels.append('Intercept')

    # Main effects: continuous variables contribute one column,
    # categorical variables contribute their encoded block.
    for effect in spec.main_effects:
        if effect not in spec.categorical_vars:
            blocks.append(self._get_column(effect).reshape(-1, 1))
            labels.append(effect)
            continue
        dummies, dummy_labels = self._get_categorical_encoding(effect)
        blocks.append(dummies)
        labels.extend(dummy_labels)

    # Spline terms; the term object is remembered per variable so that
    # transform_new_data() can call its transform() on new data.
    for spline_term in spec.spline_terms:
        basis, basis_labels = self._build_spline_columns(spline_term)
        blocks.append(basis)
        labels.extend(basis_labels)
        self._fitted_splines[spline_term.var_name] = spline_term

    # Interaction terms (a 1-D result is treated as a single column).
    for term in spec.interactions:
        product, product_labels = self.build_interaction_columns(term)
        blocks.append(product.reshape(-1, 1) if product.ndim == 1 else product)
        labels.extend(product_labels)

    # Remember the parsed formula so new data can be transformed later.
    self._parsed_formula = spec

    # The response is needed by the target-encoding step below.
    y = self._get_column(spec.response)

    # Target-encoded terms; per-variable statistics are kept for prediction.
    self._te_stats = {}
    for te_term in spec.target_encoding_terms:
        encoded, encoded_label, level_stats = self._build_target_encoding_columns(
            te_term, y, exposure=exposure
        )
        blocks.append(encoded.reshape(-1, 1))
        labels.append(encoded_label)
        self._te_stats[te_term.var_name] = level_stats

    # Identity terms (I() expressions like I(x ** 2)).
    for identity_term in spec.identity_terms:
        values, values_label = self._build_identity_columns(identity_term, self.data)
        blocks.append(values.reshape(-1, 1))
        labels.append(values_label)

    # With no terms at all, fall back to an intercept-only matrix.
    if not blocks:
        X = np.ones((self._n, 1), dtype=self.dtype)
        labels = ['Intercept']
    else:
        X = np.hstack([b if b.ndim == 2 else b.reshape(-1, 1) for b in blocks])

    # Kept so validate_design_matrix() can be called without arguments.
    self._last_X = X
    self._last_names = labels

    return y, X, labels
|
|
853
|
+
|
|
854
|
+
def validate_design_matrix(
    self,
    X: Optional[np.ndarray] = None,
    names: Optional[List[str]] = None,
    corr_threshold: float = 0.999,
    verbose: bool = True,
) -> dict:
    """
    Validate design matrix for common issues that cause fitting failures.

    Checks, in order: NaN values, Inf values, zero-variance columns
    (ignoring a column named 'Intercept'), matrix rank, condition number
    and pairwise correlation between non-constant columns.

    Parameters
    ----------
    X : np.ndarray, optional
        Design matrix to validate. If None, the last built matrix and its
        names are used.
    names : list of str, optional
        Feature names. If None, the last built names are used when their
        count matches the number of columns; otherwise (or for columns
        beyond the end of ``names``) generic labels ``x0, x1, ...`` are
        generated so that every check can still report by name.
    corr_threshold : float, default=0.999
        Absolute correlation above which a column pair is flagged.
    verbose : bool, default=True
        Print diagnostic messages.

    Returns
    -------
    dict
        Validation results with keys:
        - 'valid': bool, True if matrix is suitable for fitting
        - 'rank': int, matrix rank (None if it could not be computed)
        - 'expected_rank': int, number of columns
        - 'condition_number': float, condition number (large = ill-conditioned)
        - 'problematic_columns': list of tuples (col1, col2, correlation)
        - 'zero_variance_columns': list of column names with zero variance
        - 'suggestions': list of actionable fix suggestions

    Raises
    ------
    ValueError
        If no matrix is given and none has been built yet.
    """
    if X is None:
        X = getattr(self, '_last_X', None)
        names = getattr(self, '_last_names', None)
        if X is None:
            raise ValueError("No design matrix to validate. Call build_design_matrix() first.")

    X = np.asarray(X)
    n_rows, n_cols = X.shape

    # Resolve one label per column up front. Previously a caller passing X
    # without names hit TypeError on names[i]; inside the correlation check
    # that error was swallowed, so correlated columns went unreported.
    if names is None:
        remembered = getattr(self, '_last_names', None)
        if remembered is not None and len(remembered) == n_cols:
            names = remembered
    known = list(names) if names is not None else []
    labels = [known[i] if i < len(known) else f"x{i}" for i in range(n_cols)]

    results = {
        'valid': True,
        'rank': None,
        'expected_rank': n_cols,
        'condition_number': None,
        'problematic_columns': [],
        'zero_variance_columns': [],
        'suggestions': [],
    }

    # Check for NaN/Inf
    nan_mask = np.isnan(X).any(axis=0)
    if nan_mask.any():
        results['valid'] = False
        nan_cols = [labels[i] for i in np.flatnonzero(nan_mask)]
        results['suggestions'].append(f"Columns contain NaN values: {nan_cols}")

    inf_mask = np.isinf(X).any(axis=0)
    if inf_mask.any():
        results['valid'] = False
        inf_cols = [labels[i] for i in np.flatnonzero(inf_mask)]
        results['suggestions'].append(f"Columns contain Inf values: {inf_cols}")

    # Check for zero variance columns (exclude Intercept which is supposed to be constant)
    variances = np.var(X, axis=0)
    zero_var_cols = [
        labels[i] for i in np.flatnonzero(variances < 1e-10)
        if labels[i] != 'Intercept'
    ]
    if zero_var_cols:
        results['zero_variance_columns'] = zero_var_cols
        results['valid'] = False
        results['suggestions'].append(
            f"Columns have zero/near-zero variance: {zero_var_cols}. "
            "This often happens with splines on highly skewed data where most values are identical."
        )

    # Check matrix rank. Deliberately best-effort: a failure here is
    # reported as a warning entry instead of aborting the other checks.
    try:
        results['rank'] = int(np.linalg.matrix_rank(X))
        if results['rank'] < n_cols:
            results['valid'] = False
            results['suggestions'].append(
                f"Matrix is rank-deficient: rank={results['rank']}, expected={n_cols}. "
                f"{n_cols - results['rank']} columns are linearly dependent."
            )
    except Exception as e:
        results['suggestions'].append(f"Warning: Could not compute matrix rank: {e}")

    # Check condition number (same best-effort policy as the rank check).
    try:
        results['condition_number'] = float(np.linalg.cond(X))
        if results['condition_number'] > 1e10:
            results['valid'] = False
            results['suggestions'].append(
                f"Matrix is ill-conditioned (condition number={results['condition_number']:.2e}). "
                "This indicates near-linear dependence between columns."
            )
    except Exception as e:
        results['suggestions'].append(f"Warning: Could not compute condition number: {e}")

    # Check for highly correlated columns. Constant columns (including the
    # intercept) are left out because their correlation is undefined.
    try:
        varying = np.flatnonzero(variances > 1e-10)
        if len(varying) > 1:
            corr_abs = np.abs(np.corrcoef(X[:, varying].T))
            # The strict upper triangle visits each pair once, in the same
            # row-major order as a nested i < j loop.
            flagged = np.argwhere(np.triu(corr_abs > corr_threshold, k=1))
            for a, b in flagged:
                results['problematic_columns'].append(
                    (labels[varying[a]], labels[varying[b]], float(corr_abs[a, b]))
                )

            if results['problematic_columns']:
                results['valid'] = False
                pairs = [f"'{c1}' <-> '{c2}' (r={r:.4f})" for c1, c2, r in results['problematic_columns']]
                results['suggestions'].append(
                    f"Highly correlated column pairs detected:\n  " + "\n  ".join(pairs) + "\n"
                    "This often happens with natural splines (ns) on skewed data. Fixes:\n"
                    "  1. Use B-splines instead: bs(VarName, df=4) - more robust to skewed data\n"
                    "  2. Use log transform: ns(log_VarName, df=4) for skewed variables\n"
                    "  3. Reduce degrees of freedom: ns(VarName, df=2)\n"
                    "  4. Use linear term instead: just 'VarName' without spline"
                )
    except Exception as e:
        results['suggestions'].append(f"Warning: Could not compute column correlations: {e}")

    if verbose:
        print("=" * 60)
        print("DESIGN MATRIX VALIDATION")
        print("=" * 60)
        print(f"Shape: {n_rows} rows × {n_cols} columns")
        print(f"Rank: {results['rank']} / {n_cols}")
        if results['condition_number'] is not None:
            print(f"Condition number: {results['condition_number']:.2e}")
        print(f"Status: {'✓ VALID' if results['valid'] else '✗ INVALID'}")

        if not results['valid']:
            print("\nPROBLEMS DETECTED:")
            for i, suggestion in enumerate(results['suggestions'], 1):
                print(f"\n{i}. {suggestion}")
        print("=" * 60)

    return results
|
|
999
|
+
|
|
1000
|
+
def transform_new_data(
    self,
    new_data: "pl.DataFrame",
) -> np.ndarray:
    """
    Build a design matrix for new data from the state of the last fit.

    The formula parsed by build_design_matrix() is replayed term by term,
    in the same block order (intercept, main effects, splines,
    interactions, target encodings, identity terms), against ``new_data``.

    Categorical levels that were not seen during training do not raise
    here: the encoding helpers map them to the reference level (all-zero
    dummies), and target encoding falls back to the stored prior.

    Parameters
    ----------
    new_data : pl.DataFrame
        New data to transform. Must provide every column the formula uses.

    Returns
    -------
    X : np.ndarray
        Design matrix for new data

    Raises
    ------
    ValueError
        If build_design_matrix() was not called first.
    """
    spec = self._parsed_formula
    if spec is None:
        raise ValueError(
            "Must call build_design_matrix() before transform_new_data(). "
            "No formula has been fitted yet."
        )

    n_rows = len(new_data)
    blocks = []

    # Intercept (kept 1-D; promoted to a column when stacking).
    if spec.has_intercept:
        blocks.append(np.ones(n_rows, dtype=self.dtype))

    # Main effects.
    for effect in spec.main_effects:
        if effect not in spec.categorical_vars:
            values = new_data[effect].to_numpy().astype(self.dtype)
            blocks.append(values.reshape(-1, 1))
            continue
        blocks.append(self._encode_categorical_new(new_data, effect))

    # Spline terms: prefer the term object stored at fit time, falling
    # back to the parsed term when none was stored for this variable.
    for spline_term in spec.spline_terms:
        values = new_data[spline_term.var_name].to_numpy().astype(self.dtype)
        stored = self._fitted_splines.get(spline_term.var_name, spline_term)
        basis, _ = stored.transform(values)
        blocks.append(basis)

    # Interactions (a 1-D result is treated as a single column).
    for term in spec.interactions:
        product = self._build_interaction_new(new_data, term, n_rows)
        blocks.append(product.reshape(-1, 1) if product.ndim == 1 else product)

    # Target-encoded terms, using the statistics stored at fit time.
    for te_term in spec.target_encoding_terms:
        blocks.append(self._encode_target_new(new_data, te_term).reshape(-1, 1))

    # Identity terms (I() expressions) are re-evaluated on the new data.
    for identity_term in spec.identity_terms:
        values, _ = self._build_identity_columns(identity_term, new_data)
        blocks.append(values.reshape(-1, 1))

    # With no terms at all, fall back to an intercept-only matrix.
    if not blocks:
        return np.ones((n_rows, 1), dtype=self.dtype)

    return np.hstack([b if b.ndim == 2 else b.reshape(-1, 1) for b in blocks])
|
|
1081
|
+
|
|
1082
|
+
def _encode_categorical_new(
|
|
1083
|
+
self,
|
|
1084
|
+
new_data: "pl.DataFrame",
|
|
1085
|
+
var_name: str,
|
|
1086
|
+
) -> np.ndarray:
|
|
1087
|
+
"""Encode categorical variable using levels from training."""
|
|
1088
|
+
levels = self._get_categorical_levels(var_name)
|
|
1089
|
+
col = new_data[var_name].to_numpy()
|
|
1090
|
+
n = len(col)
|
|
1091
|
+
|
|
1092
|
+
# Create level to index mapping (reference level is index 0)
|
|
1093
|
+
level_to_idx = {level: i for i, level in enumerate(levels)}
|
|
1094
|
+
|
|
1095
|
+
# Number of dummy columns (excluding reference level)
|
|
1096
|
+
n_dummies = len(levels) - 1
|
|
1097
|
+
encoding = np.zeros((n, n_dummies), dtype=self.dtype)
|
|
1098
|
+
|
|
1099
|
+
for i, val in enumerate(col):
|
|
1100
|
+
val_str = str(val)
|
|
1101
|
+
if val_str in level_to_idx:
|
|
1102
|
+
idx = level_to_idx[val_str]
|
|
1103
|
+
if idx > 0: # Skip reference level
|
|
1104
|
+
encoding[i, idx - 1] = 1.0
|
|
1105
|
+
# Unknown levels get all zeros (mapped to reference)
|
|
1106
|
+
|
|
1107
|
+
return encoding
|
|
1108
|
+
|
|
1109
|
+
def _build_interaction_new(
|
|
1110
|
+
self,
|
|
1111
|
+
new_data: "pl.DataFrame",
|
|
1112
|
+
interaction: InteractionTerm,
|
|
1113
|
+
n: int,
|
|
1114
|
+
) -> np.ndarray:
|
|
1115
|
+
"""Build interaction columns for new data."""
|
|
1116
|
+
if interaction.is_pure_continuous:
|
|
1117
|
+
# Continuous × continuous
|
|
1118
|
+
result = new_data[interaction.factors[0]].to_numpy().astype(self.dtype)
|
|
1119
|
+
for factor in interaction.factors[1:]:
|
|
1120
|
+
result = result * new_data[factor].to_numpy().astype(self.dtype)
|
|
1121
|
+
return result.reshape(-1, 1)
|
|
1122
|
+
|
|
1123
|
+
elif interaction.is_pure_categorical:
|
|
1124
|
+
# Categorical × categorical
|
|
1125
|
+
encodings = []
|
|
1126
|
+
for factor in interaction.factors:
|
|
1127
|
+
enc = self._encode_categorical_new(new_data, factor)
|
|
1128
|
+
encodings.append(enc)
|
|
1129
|
+
|
|
1130
|
+
# Build interaction by taking outer product
|
|
1131
|
+
result = encodings[0]
|
|
1132
|
+
for enc in encodings[1:]:
|
|
1133
|
+
# Kronecker-style expansion
|
|
1134
|
+
n_cols1, n_cols2 = result.shape[1], enc.shape[1]
|
|
1135
|
+
new_result = np.zeros((n, n_cols1 * n_cols2), dtype=self.dtype)
|
|
1136
|
+
for i in range(n_cols1):
|
|
1137
|
+
for j in range(n_cols2):
|
|
1138
|
+
new_result[:, i * n_cols2 + j] = result[:, i] * enc[:, j]
|
|
1139
|
+
result = new_result
|
|
1140
|
+
return result
|
|
1141
|
+
|
|
1142
|
+
else:
|
|
1143
|
+
# Mixed: categorical × continuous
|
|
1144
|
+
cat_factors = []
|
|
1145
|
+
cont_factors = []
|
|
1146
|
+
for factor, is_cat in zip(interaction.factors, interaction.categorical_flags):
|
|
1147
|
+
if is_cat:
|
|
1148
|
+
cat_factors.append(factor)
|
|
1149
|
+
else:
|
|
1150
|
+
cont_factors.append(factor)
|
|
1151
|
+
|
|
1152
|
+
# Build continuous product
|
|
1153
|
+
cont_product = new_data[cont_factors[0]].to_numpy().astype(self.dtype)
|
|
1154
|
+
for factor in cont_factors[1:]:
|
|
1155
|
+
cont_product = cont_product * new_data[factor].to_numpy().astype(self.dtype)
|
|
1156
|
+
|
|
1157
|
+
# Build categorical encoding
|
|
1158
|
+
if len(cat_factors) == 1:
|
|
1159
|
+
cat_enc = self._encode_categorical_new(new_data, cat_factors[0])
|
|
1160
|
+
else:
|
|
1161
|
+
# Multiple categorical - build their interaction
|
|
1162
|
+
cat_enc = self._encode_categorical_new(new_data, cat_factors[0])
|
|
1163
|
+
for factor in cat_factors[1:]:
|
|
1164
|
+
enc = self._encode_categorical_new(new_data, factor)
|
|
1165
|
+
n_cols1, n_cols2 = cat_enc.shape[1], enc.shape[1]
|
|
1166
|
+
new_enc = np.zeros((n, n_cols1 * n_cols2), dtype=self.dtype)
|
|
1167
|
+
for i in range(n_cols1):
|
|
1168
|
+
for j in range(n_cols2):
|
|
1169
|
+
new_enc[:, i * n_cols2 + j] = cat_enc[:, i] * enc[:, j]
|
|
1170
|
+
cat_enc = new_enc
|
|
1171
|
+
|
|
1172
|
+
# Multiply categorical dummies by continuous
|
|
1173
|
+
result = cat_enc * cont_product.reshape(-1, 1)
|
|
1174
|
+
return result
|
|
1175
|
+
|
|
1176
|
+
def _encode_target_new(
|
|
1177
|
+
self,
|
|
1178
|
+
new_data: "pl.DataFrame",
|
|
1179
|
+
te_term: TargetEncodingTermSpec,
|
|
1180
|
+
) -> np.ndarray:
|
|
1181
|
+
"""Encode using target statistics from training."""
|
|
1182
|
+
if te_term.var_name not in self._te_stats:
|
|
1183
|
+
raise ValueError(
|
|
1184
|
+
f"Target encoding for '{te_term.var_name}' was not fitted during training."
|
|
1185
|
+
)
|
|
1186
|
+
|
|
1187
|
+
stats = self._te_stats[te_term.var_name]
|
|
1188
|
+
prior = stats['prior']
|
|
1189
|
+
level_stats = stats['stats'] # Dict[str, (sum, count)]
|
|
1190
|
+
prior_weight = stats['prior_weight']
|
|
1191
|
+
|
|
1192
|
+
col = new_data[te_term.var_name].to_numpy()
|
|
1193
|
+
n = len(col)
|
|
1194
|
+
encoded = np.zeros(n, dtype=self.dtype)
|
|
1195
|
+
|
|
1196
|
+
for i, val in enumerate(col):
|
|
1197
|
+
val_str = str(val)
|
|
1198
|
+
if val_str in level_stats:
|
|
1199
|
+
level_sum, level_count = level_stats[val_str]
|
|
1200
|
+
# Use full training statistics for prediction
|
|
1201
|
+
encoded[i] = (level_sum + prior * prior_weight) / (level_count + prior_weight)
|
|
1202
|
+
else:
|
|
1203
|
+
# Unknown level - use global prior
|
|
1204
|
+
encoded[i] = prior
|
|
1205
|
+
|
|
1206
|
+
return encoded
|
|
1207
|
+
|
|
1208
|
+
|
|
1209
|
+
def build_design_matrix(
    formula: str,
    data: "pl.DataFrame",
    exposure: Optional[np.ndarray] = None,
) -> Tuple[np.ndarray, np.ndarray, List[str]]:
    """
    Build a design matrix from a formula and a DataFrame in one call.

    Convenience wrapper: constructs an ``InteractionBuilder`` over ``data``
    and delegates to its ``build_design_matrix`` method. The builder is
    discarded afterwards, so use ``InteractionBuilder`` directly when the
    fitted state is needed later (e.g. for ``transform_new_data``).

    Parameters
    ----------
    formula : str
        R-style formula
    data : pl.DataFrame
        Polars DataFrame
    exposure : np.ndarray, optional
        Exposure values, forwarded unchanged to
        ``InteractionBuilder.build_design_matrix`` where they are used for
        target encoding. Defaults to None, which matches the previous
        behaviour of this wrapper.

    Returns
    -------
    y : np.ndarray
        Response variable
    X : np.ndarray
        Design matrix
    feature_names : list[str]
        Column names

    Example
    -------
    >>> y, X, names = build_design_matrix(
    ...     "claims ~ age*C(region) + C(brand)*C(fuel)",
    ...     data
    ... )
    """
    builder = InteractionBuilder(data)
    # Pass exposure through so this wrapper exposes the same option as the
    # method it delegates to.
    return builder.build_design_matrix(formula, exposure=exposure)
|