rustystats-0.1.5-cp313-cp313-manylinux_2_34_x86_64.whl

@@ -0,0 +1,1246 @@
1
+ """
2
+ Optimized interaction term support for RustyStats.
3
+
4
+ This module provides high-performance interaction term handling for GLMs.
5
+ All heavy computation is done in Rust for maximum speed:
6
+ - Categorical encoding (Rust parallel construction)
7
+ - Interaction terms (Rust parallel for large data)
8
+ - Spline basis functions (Rust with Rayon)
9
+
10
+ The Python layer handles only:
11
+ - Formula parsing (string manipulation)
12
+ - DataFrame column extraction
13
+ - Orchestration of Rust calls
14
+
15
+ Example
16
+ -------
17
+ >>> from rustystats.interactions import InteractionBuilder
18
+ >>>
19
+ >>> builder = InteractionBuilder(data)
20
+ >>> y, X, names = builder.build_design_matrix('y ~ x1*x2 + C(cat) + bs(age, df=5)')
21
+ """
22
+
23
+ from __future__ import annotations
24
+
25
+ from dataclasses import dataclass, field
26
+ from typing import List, Optional, Tuple, Union, Dict, Set, TYPE_CHECKING
27
+
28
+ import numpy as np
29
+
30
+ # Import Rust implementations for heavy computation
31
+ from rustystats._rustystats import (
32
+ encode_categorical_py as _encode_categorical_rust,
33
+ build_cat_cat_interaction_py as _build_cat_cat_rust,
34
+ build_cat_cont_interaction_py as _build_cat_cont_rust,
35
+ build_cont_cont_interaction_py as _build_cont_cont_rust,
36
+ multiply_matrix_by_continuous_py as _multiply_matrix_cont_rust,
37
+ parse_formula_py as _parse_formula_rust,
38
+ target_encode_py as _target_encode_rust,
39
+ )
40
+
41
+ if TYPE_CHECKING:
42
+ import polars as pl
43
+
44
+
45
+ @dataclass
46
+ class InteractionTerm:
47
+ """Represents a single interaction term like x1:x2 or C(cat1):x2."""
48
+
49
+ factors: List[str] # Variables involved (e.g., ['x1', 'x2'] or ['cat1', 'x2'])
50
+ categorical_flags: List[bool] # Which factors are categorical
51
+
52
+ @property
53
+ def order(self) -> int:
54
+ """Order of interaction (2 for pairwise, 3 for three-way, etc.)."""
55
+ return len(self.factors)
56
+
57
+ @property
58
+ def is_pure_continuous(self) -> bool:
59
+ """True if all factors are continuous."""
60
+ return not any(self.categorical_flags)
61
+
62
+ @property
63
+ def is_pure_categorical(self) -> bool:
64
+ """True if all factors are categorical."""
65
+ return all(self.categorical_flags)
66
+
67
+ @property
68
+ def is_mixed(self) -> bool:
69
+ """True if mixture of categorical and continuous."""
70
+ return any(self.categorical_flags) and not all(self.categorical_flags)
71
+
72
+
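+ # Example (sketch): a categorical-by-continuous term such as C(region):age
+ # would be represented as:
+ #
+ #     term = InteractionTerm(factors=["region", "age"],
+ #                            categorical_flags=[True, False])
+ #     term.order      # 2
+ #     term.is_mixed   # True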
73
+ # Import SplineTerm from splines module (canonical implementation)
74
+ from rustystats.splines import SplineTerm
75
+
76
+
77
+ @dataclass
78
+ class CategoricalEncoding:
79
+ """Cached categorical encoding data for a variable."""
80
+ encoding: np.ndarray # (n, k-1) dummy matrix
81
+ names: List[str] # Column names like ['var[T.B]', 'var[T.C]']
82
+ indices: np.ndarray # (n,) level indices (int32)
83
+ levels: List[str] # All categorical levels
84
+
85
+
86
+ @dataclass
87
+ class TargetEncodingTermSpec:
88
+ """Parsed target encoding term specification from formula."""
89
+ var_name: str
90
+ prior_weight: float = 1.0
91
+ n_permutations: int = 4
92
+
93
+
94
+ @dataclass
95
+ class IdentityTermSpec:
96
+ """Parsed identity term specification from formula (I() expressions)."""
97
+ expression: str # The raw expression inside I(), e.g., "x ** 2" or "x + y"
98
+
99
+
100
+ @dataclass
101
+ class ParsedFormula:
102
+ """Parsed formula with identified terms."""
103
+
104
+ response: str
105
+ main_effects: List[str] # Main effect variables
106
+ interactions: List[InteractionTerm] # Interaction terms
107
+ categorical_vars: Set[str] # Variables marked as categorical with C()
108
+ spline_terms: List[SplineTerm] = field(default_factory=list) # Spline terms
109
+ target_encoding_terms: List[TargetEncodingTermSpec] = field(default_factory=list) # TE() terms
110
+ identity_terms: List[IdentityTermSpec] = field(default_factory=list) # I() terms
111
+ has_intercept: bool = True
112
+
113
+
114
+ def parse_formula_interactions(formula: str) -> ParsedFormula:
115
+ """
116
+ Parse a formula string and extract interaction terms.
117
+
118
+ Uses Rust for fast parsing of:
119
+ - Main effects: x1, x2, C(cat)
120
+ - Two-way interactions: x1:x2, x1*x2, C(cat):x
121
+ - Higher-order: x1:x2:x3
122
+ - Intercept removal: 0 + ... or -1
123
+ - Spline terms: bs(x, df=5), ns(x, df=4)
124
+
125
+ Parameters
126
+ ----------
127
+ formula : str
128
+ R-style formula like "y ~ x1*x2 + C(cat) + bs(age, df=5)"
129
+
130
+ Returns
131
+ -------
132
+ ParsedFormula
133
+ Parsed structure with all terms identified
134
+ """
135
+ # Use Rust parser
136
+ parsed = _parse_formula_rust(formula)
137
+
138
+ # Convert to Python dataclasses
139
+ interactions = [
140
+ InteractionTerm(
141
+ factors=i['factors'],
142
+ categorical_flags=i['categorical_flags']
143
+ )
144
+ for i in parsed['interactions']
145
+ ]
146
+
147
+ spline_terms = [
148
+ SplineTerm(
149
+ var_name=s['var_name'],
150
+ spline_type=s['spline_type'],
151
+ df=s['df'],
152
+ degree=s['degree']
153
+ )
154
+ for s in parsed['spline_terms']
155
+ ]
156
+
157
+ # Parse target encoding terms
158
+ target_encoding_terms = [
159
+ TargetEncodingTermSpec(
160
+ var_name=t['var_name'],
161
+ prior_weight=t['prior_weight'],
162
+ n_permutations=t['n_permutations']
163
+ )
164
+ for t in parsed.get('target_encoding_terms', [])
165
+ ]
166
+
167
+ # Parse identity terms (I() expressions)
168
+ identity_terms = [
169
+ IdentityTermSpec(expression=i['expression'])
170
+ for i in parsed.get('identity_terms', [])
171
+ ]
172
+
173
+ # Filter out "1" from main effects (it's just an explicit intercept indicator)
174
+ main_effects = [m for m in parsed['main_effects'] if m != '1']
175
+
176
+ return ParsedFormula(
177
+ response=parsed['response'],
178
+ main_effects=main_effects,
179
+ interactions=interactions,
180
+ categorical_vars=set(parsed['categorical_vars']),
181
+ spline_terms=spline_terms,
182
+ target_encoding_terms=target_encoding_terms,
183
+ identity_terms=identity_terms,
184
+ has_intercept=parsed['has_intercept'],
185
+ )
186
+
187
+
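+ # Usage sketch for the parser (illustrative only -- the exact expansion of
+ # "*" into main effects plus an interaction is performed by the Rust parser):
+ #
+ #     pf = parse_formula_interactions("y ~ x1*x2 + C(cat)")
+ #     pf.response                            # "y"
+ #     pf.has_intercept                       # True
+ #     [t.factors for t in pf.interactions]   # e.g. [["x1", "x2"]]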
188
+ class InteractionBuilder:
189
+ """
190
+ Efficiently builds design matrices with interaction terms.
191
+
192
+ Optimizations:
193
+ 1. Continuous × Continuous: Single vectorized multiplication
194
+ 2. Categorical × Continuous: Sparse-aware dummy encoding
195
+ 3. Categorical × Categorical: Direct index-based construction
196
+
197
+ Parameters
198
+ ----------
199
+ data : pl.DataFrame
200
+ Polars DataFrame
201
+ dtype : numpy dtype, default=np.float64
202
+ Data type for output arrays
203
+
204
+ Example
205
+ -------
206
+ >>> builder = InteractionBuilder(df)
207
+     >>> y, X, names = builder.build_design_matrix('y ~ x1*x2 + C(area):age')
208
+ """
209
+
210
+ def __init__(
211
+ self,
212
+ data: "pl.DataFrame",
213
+ dtype: np.dtype = np.float64,
214
+ ):
215
+ self.data = data
216
+ self.dtype = dtype
217
+ self._n = len(data)
218
+
219
+ # Consolidated cache for categorical encodings (keyed by "varname_dropfirst")
220
+ self._cat_encoding_cache: Dict[str, CategoricalEncoding] = {}
221
+ # Store spline terms with fitted knots for prediction
222
+ self._fitted_splines: Dict[str, SplineTerm] = {}
223
+ # Store parsed formula for prediction
224
+ self._parsed_formula: Optional[ParsedFormula] = None
225
+
226
+ def _parse_spline_factor(self, factor: str) -> Optional[SplineTerm]:
227
+ """Parse a spline term from a factor name like 'bs(VehAge, df=4)' or 'ns(age, df=3)'."""
228
+ factor_lower = factor.strip().lower()
229
+ if factor_lower.startswith('bs(') or factor_lower.startswith('ns('):
230
+ spline_type = 'bs' if factor_lower.startswith('bs(') else 'ns'
231
+ # Extract content inside parentheses
232
+ content = factor[3:-1] if factor.endswith(')') else factor[3:]
233
+ parts = [p.strip() for p in content.split(',')]
234
+ var_name = parts[0]
235
+ df = 4 # default
236
+ degree = 3 # default for B-splines
237
+ for part in parts[1:]:
238
+ if '=' in part:
239
+ key, val = part.split('=', 1)
240
+ key = key.strip().lower()
241
+ val = val.strip()
242
+ if key == 'df':
243
+ df = int(val)
244
+ elif key == 'degree':
245
+ degree = int(val)
246
+ return SplineTerm(var_name=var_name, spline_type=spline_type, df=df, degree=degree)
247
+ return None
248
+
249
+ def _get_column(self, name: str) -> np.ndarray:
250
+ """Extract column as numpy array."""
251
+ return self.data[name].to_numpy().astype(self.dtype)
252
+
253
+ def _get_categorical_indices(self, name: str) -> Tuple[np.ndarray, List[str]]:
254
+ """Get cached categorical indices and levels for a variable."""
255
+ cache_key = f"{name}_True" # Always use drop_first=True for indices
256
+ if cache_key not in self._cat_encoding_cache:
257
+ self._get_categorical_encoding(name) # Populate cache
258
+ cached = self._cat_encoding_cache[cache_key]
259
+ return cached.indices, cached.levels
260
+
261
+ def _get_categorical_levels(self, name: str) -> List[str]:
262
+ """Get cached categorical levels for a variable."""
263
+ cache_key = f"{name}_True"
264
+ if cache_key not in self._cat_encoding_cache:
265
+ raise ValueError(f"Categorical variable '{name}' was not seen during training.")
266
+ return self._cat_encoding_cache[cache_key].levels
267
+
268
+ def _get_categorical_encoding(
269
+ self,
270
+ name: str,
271
+ drop_first: bool = True
272
+ ) -> Tuple[np.ndarray, List[str]]:
273
+ """
274
+ Get dummy encoding for a categorical variable.
275
+
276
+ Uses Rust for factorization and parallel matrix construction.
277
+ Pure Rust implementation.
278
+
279
+ Returns
280
+ -------
281
+ encoding : np.ndarray
282
+ (n, k-1) dummy matrix where k is number of levels
283
+ names : list[str]
284
+ Column names like ['var[T.B]', 'var[T.C]', ...]
285
+ """
286
+ cache_key = f"{name}_{drop_first}"
287
+ if cache_key in self._cat_encoding_cache:
288
+ cached = self._cat_encoding_cache[cache_key]
289
+ return cached.encoding, cached.names
290
+
291
+ col = self.data[name].to_numpy()
292
+
293
+ # Convert to string list for Rust factorization
294
+ values = [str(v) for v in col]
295
+
296
+ # Use Rust for factorization + matrix construction
297
+ encoding, names, indices, levels = _encode_categorical_rust(values, name, drop_first)
298
+
299
+ # Cache all encoding data in a single consolidated object
300
+ self._cat_encoding_cache[cache_key] = CategoricalEncoding(
301
+ encoding=encoding,
302
+ names=names,
303
+ indices=np.array(indices, dtype=np.int32),
304
+ levels=levels,
305
+ )
306
+
307
+ return encoding, names
308
+
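+     # Worked example (conceptual): for a column ['A', 'B', 'C', 'B'] with
+     # drop_first=True, 'A' becomes the reference level and the Rust encoder
+     # returns, in effect:
+     #
+     #     encoding = [[0, 0],      names   = ['var[T.B]', 'var[T.C]']
+     #                 [1, 0],      indices = [0, 1, 2, 1]
+     #                 [0, 1],      levels  = ['A', 'B', 'C']
+     #                 [1, 0]]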
309
+ def build_interaction_columns(
310
+ self,
311
+ interaction: InteractionTerm,
312
+ ) -> Tuple[np.ndarray, List[str]]:
313
+ """
314
+ Build columns for a single interaction term.
315
+
316
+ Optimized for different interaction types:
317
+ - Pure continuous: Single O(n) element-wise multiply
318
+ - Mixed: Broadcast multiply continuous with each dummy column
319
+ - Pure categorical: Sparse index-based construction
320
+
321
+ Returns
322
+ -------
323
+ columns : np.ndarray
324
+ (n, k) interaction columns
325
+ names : list[str]
326
+ Column names
327
+ """
328
+ if interaction.is_pure_continuous:
329
+ return self._build_continuous_interaction(interaction)
330
+ elif interaction.is_pure_categorical:
331
+ return self._build_categorical_interaction(interaction)
332
+ else:
333
+ return self._build_mixed_interaction(interaction)
334
+
335
+ def _build_continuous_interaction(
336
+ self,
337
+ interaction: InteractionTerm
338
+ ) -> Tuple[np.ndarray, List[str]]:
339
+ """Build continuous × continuous interaction using Rust for computation."""
340
+ factors = interaction.factors
341
+
342
+ if len(factors) == 2:
343
+ # Optimized 2-way: direct Rust call
344
+ x1 = self._get_column(factors[0])
345
+ x2 = self._get_column(factors[1])
346
+ result, name = _build_cont_cont_rust(x1, x2, factors[0], factors[1])
347
+ return result.reshape(-1, 1), [name]
348
+ else:
349
+ # N-way: chain pairwise Rust calls
350
+ result = self._get_column(factors[0])
351
+ current_name = factors[0]
352
+
353
+ for factor in factors[1:]:
354
+ x2 = self._get_column(factor)
355
+ result, current_name = _build_cont_cont_rust(result, x2, current_name, factor)
356
+
357
+ return result.reshape(-1, 1), [current_name]
358
+
359
+ def _build_categorical_interaction(
360
+ self,
361
+ interaction: InteractionTerm
362
+ ) -> Tuple[np.ndarray, List[str]]:
363
+ """
364
+ Build categorical × categorical interaction efficiently.
365
+
366
+ Uses index-based construction instead of materializing full matrices.
367
+ """
368
+ # Get encodings for each categorical factor
369
+ encodings = []
370
+ all_names = []
371
+
372
+ for factor in interaction.factors:
373
+ enc, names = self._get_categorical_encoding(factor)
374
+ encodings.append(enc)
375
+ all_names.append(names)
376
+
377
+ if len(interaction.factors) == 2:
378
+ # Optimized 2-way interaction
379
+ return self._build_2way_categorical(encodings, all_names, interaction.factors)
380
+ else:
381
+ # General n-way interaction (slower)
382
+ return self._build_nway_categorical(encodings, all_names, interaction.factors)
383
+
384
+ def _build_2way_categorical(
385
+ self,
386
+ encodings: List[np.ndarray],
387
+ all_names: List[List[str]],
388
+ factors: List[str],
389
+ ) -> Tuple[np.ndarray, List[str]]:
390
+ """
391
+ Optimized 2-way categorical interaction using index-based construction.
392
+
393
+ Instead of multiplying dense matrices, we use the fact that for any row,
394
+ at most one column in each encoding is 1. So the interaction column
395
+ corresponding to (level_i, level_j) is 1 only if both encodings are 1.
396
+ """
397
+ # Get original indices (from cache or compute via encoding)
398
+ cat1, cat2 = factors
399
+
400
+ # Get indices and levels using consolidated cache
401
+ idx1, levels1 = self._get_categorical_indices(cat1)
402
+ idx2, levels2 = self._get_categorical_indices(cat2)
403
+
404
+ # Number of non-reference levels
405
+ n1 = len(levels1) - 1
406
+ n2 = len(levels2) - 1
407
+
408
+ if n1 * n2 == 0:
409
+ return np.zeros((self._n, 0), dtype=self.dtype), []
410
+
411
+ # Use Rust for fast parallel construction
412
+ names1, names2 = all_names
413
+ result, col_names = _build_cat_cat_rust(
414
+ idx1.astype(np.int32), n1,
415
+ idx2.astype(np.int32), n2,
416
+ list(names1), list(names2)
417
+ )
418
+
419
+ return result, col_names
420
+
421
+ def _build_nway_categorical(
422
+ self,
423
+ encodings: List[np.ndarray],
424
+ all_names: List[List[str]],
425
+ factors: List[str],
426
+ ) -> Tuple[np.ndarray, List[str]]:
427
+ """
428
+         General n-way categorical interaction built from dummy matrices.
+
+         For 3+ way interactions we combine the already-encoded dummy
+         matrices pairwise with element-wise products. We cannot chain the
+         2-way Rust path here: it looks factors up by DataFrame column
+         name, and an intermediate product such as "a:b" is not a real
+         column.
+         """
+         if len(factors) == 2:
+             # Base case - use optimized 2-way
+             return self._build_2way_categorical(encodings, all_names, factors)
+
+         # Fold the dummy matrices together pairwise. For each pair of
+         # columns, the interaction column is their element-wise product.
+         result = encodings[0]
+         result_names = list(all_names[0])
+         for enc, enc_names in zip(encodings[1:], all_names[1:]):
+             n_cols1, n_cols2 = result.shape[1], enc.shape[1]
+             new_result = np.zeros((self._n, n_cols1 * n_cols2), dtype=self.dtype)
+             new_names = []
+             for i in range(n_cols1):
+                 for j in range(n_cols2):
+                     new_result[:, i * n_cols2 + j] = result[:, i] * enc[:, j]
+                     new_names.append(f"{result_names[i]}:{enc_names[j]}")
+             result, result_names = new_result, new_names
+
+         return result, result_names
453
+
454
+ def _build_mixed_interaction(
455
+ self,
456
+ interaction: InteractionTerm
457
+ ) -> Tuple[np.ndarray, List[str]]:
458
+ """Build categorical × continuous interaction using Rust."""
459
+ # Separate categorical and continuous factors
460
+ cat_factors = []
461
+ cont_factors = []
462
+ spline_factors = [] # Spline terms need special handling
463
+
464
+ for factor, is_cat in zip(interaction.factors, interaction.categorical_flags):
465
+ if is_cat:
466
+ cat_factors.append(factor)
467
+ else:
468
+ # Check if this is a spline term
469
+ spline = self._parse_spline_factor(factor)
470
+ if spline is not None:
471
+ spline_factors.append((factor, spline))
472
+ else:
473
+ cont_factors.append(factor)
474
+
475
+ # Build categorical encoding first
476
+ if len(cat_factors) == 1:
477
+ cat_name = cat_factors[0]
478
+ cat_encoding, cat_names = self._get_categorical_encoding(cat_name)
479
+ else:
480
+ cat_interaction = InteractionTerm(
481
+ factors=cat_factors,
482
+ categorical_flags=[True] * len(cat_factors)
483
+ )
484
+ cat_encoding, cat_names = self._build_categorical_interaction(cat_interaction)
485
+
486
+ if cat_encoding.shape[1] == 0:
487
+ return np.zeros((self._n, 0), dtype=self.dtype), []
488
+
489
+ # Handle spline × categorical interactions
490
+ if spline_factors:
491
+ # Build spline basis for each spline factor
492
+ all_columns = []
493
+ all_names = []
494
+
495
+ for spline_str, spline in spline_factors:
496
+ x = self._get_column(spline.var_name)
497
+ spline_basis, spline_names = spline.transform(x)
498
+ # Store fitted spline for prediction
499
+ self._fitted_splines[spline.var_name] = spline
500
+
501
+ # Multiply each spline column by each categorical column
502
+ for j, spl_name in enumerate(spline_names):
503
+ for i, cat_name in enumerate(cat_names):
504
+ col = cat_encoding[:, i] * spline_basis[:, j]
505
+ all_columns.append(col)
506
+ all_names.append(f"{cat_name}:{spl_name}")
507
+
508
+ # Also include any regular continuous factors
509
+ if cont_factors:
510
+ cont_product = self._get_column(cont_factors[0])
511
+ for factor in cont_factors[1:]:
512
+ cont_product = cont_product * self._get_column(factor)
513
+ cont_name = ':'.join(cont_factors)
514
+
515
+ # Multiply by continuous
516
+ final_columns = []
517
+ final_names = []
518
+ for col, name in zip(all_columns, all_names):
519
+ final_columns.append(col * cont_product)
520
+ final_names.append(f"{name}:{cont_name}")
521
+ all_columns = final_columns
522
+ all_names = final_names
523
+
524
+ if all_columns:
525
+ return np.column_stack(all_columns), all_names
526
+ return np.zeros((self._n, 0), dtype=self.dtype), []
527
+
528
+ # Standard continuous × categorical (no splines)
529
+ cont_product = self._get_column(cont_factors[0])
530
+ for factor in cont_factors[1:]:
531
+ cont_product = cont_product * self._get_column(factor)
532
+ cont_name = ':'.join(cont_factors)
533
+
534
+ # Build categorical part and use Rust for interaction
535
+ if len(cat_factors) == 1:
536
+ # Single categorical - use Rust directly
537
+ cat_name = cat_factors[0]
538
+
539
+ # Get indices and levels using consolidated cache
540
+ cat_indices, levels = self._get_categorical_indices(cat_name)
541
+ n_levels = len(levels) - 1 # Excluding reference
542
+
543
+ if n_levels == 0:
544
+ return np.zeros((self._n, 0), dtype=self.dtype), []
545
+
546
+ # Get category names from encoding
547
+ _, cat_names = self._get_categorical_encoding(cat_name)
548
+
549
+ # Use Rust for fast parallel construction
550
+ result, col_names = _build_cat_cont_rust(
551
+ cat_indices.astype(np.int32),
552
+ n_levels,
553
+ cont_product.astype(np.float64),
554
+ list(cat_names),
555
+ cont_name
556
+ )
557
+ return result, col_names
558
+ else:
559
+             # Multiple categoricals - their interaction encoding was already
+             # built above, so reuse cat_encoding / cat_names directly
565
+
566
+ # Use Rust to multiply categorical matrix by continuous
567
+ result, col_names = _multiply_matrix_cont_rust(
568
+ cat_encoding.astype(np.float64),
569
+ cont_product.astype(np.float64),
570
+ list(cat_names),
571
+ cont_name
572
+ )
573
+ return result, col_names
574
+
575
+ def _build_spline_columns(
576
+ self,
577
+ spline: SplineTerm,
578
+ ) -> Tuple[np.ndarray, List[str]]:
579
+ """
580
+ Build columns for a spline term.
581
+
582
+ Uses SplineTerm.transform() which calls the fast Rust implementation.
583
+ """
584
+ x = self._get_column(spline.var_name)
585
+ return spline.transform(x)
586
+
587
+ def _build_target_encoding_columns(
588
+ self,
589
+ te_term: TargetEncodingTermSpec,
590
+ target: np.ndarray,
591
+ seed: Optional[int] = None,
592
+ exposure: Optional[np.ndarray] = None,
593
+ ) -> Tuple[np.ndarray, str, dict]:
594
+ """
595
+ Build target-encoded column for a categorical variable.
596
+
597
+ Uses ordered target statistics to prevent target leakage.
598
+
599
+ For frequency models with exposure, uses claim rate (target/exposure)
600
+ instead of raw counts to produce more meaningful encoded values.
601
+
602
+ Parameters
603
+ ----------
604
+ te_term : TargetEncodingTermSpec
605
+ Target encoding term specification
606
+ target : np.ndarray
607
+ Target variable values (e.g., ClaimCount)
608
+ seed : int, optional
609
+ Random seed for reproducibility
610
+ exposure : np.ndarray, optional
611
+ Exposure values. If provided, target encoding uses rate (target/exposure)
612
+ instead of raw target values. This prevents collapse to near-constant
613
+ values for low-frequency count data.
614
+
615
+ Returns
616
+ -------
617
+ encoded : np.ndarray
618
+ Target-encoded values (n,)
619
+ name : str
620
+ Column name like "TE(brand)"
621
+ stats : dict
622
+ Level statistics for prediction on new data
623
+ """
624
+ col = self.data[te_term.var_name].to_numpy()
625
+ categories = [str(v) for v in col]
626
+
627
+ # Use rate (target/exposure) for encoding when exposure is available
628
+ # This prevents near-constant encoded values for low-frequency count data
629
+ if exposure is not None:
630
+ encoding_target = (target / np.maximum(exposure, 1e-10)).astype(np.float64)
631
+ else:
632
+ encoding_target = target.astype(np.float64)
633
+
634
+ encoded, name, prior, stats = _target_encode_rust(
635
+ categories,
636
+ encoding_target,
637
+ te_term.var_name,
638
+ te_term.prior_weight,
639
+ te_term.n_permutations,
640
+ seed,
641
+ )
642
+
643
+ # Store whether we used rate encoding for prediction
644
+ return encoded, name, {
645
+ 'prior': prior,
646
+ 'stats': stats,
647
+ 'prior_weight': te_term.prior_weight,
648
+ 'used_rate_encoding': exposure is not None,
649
+ }
650
+
651
+ def _build_identity_columns(
652
+ self,
653
+ identity: IdentityTermSpec,
654
+ data: "pl.DataFrame",
655
+ ) -> Tuple[np.ndarray, str]:
656
+ """
657
+ Build column for an identity term (I() expression).
658
+
659
+ Evaluates expressions like I(x ** 2), I(x + y), I(x * y) against DataFrame columns.
660
+
661
+ Parameters
662
+ ----------
663
+ identity : IdentityTermSpec
664
+ Identity term specification with the expression
665
+ data : pl.DataFrame
666
+ DataFrame containing the columns referenced in the expression
667
+
668
+ Returns
669
+ -------
670
+ values : np.ndarray
671
+ Evaluated expression values (n,)
672
+ name : str
673
+ Column name like "I(x ** 2)"
674
+ """
675
+ import polars as pl
676
+
677
+ expr = identity.expression
678
+ name = f"I({expr})"
679
+
680
+ # Convert Python ** to Polars pow() and evaluate
681
+ # Common patterns: x ** 2, x ** 3, x + y, x * y, x / y
682
+ try:
683
+             # Build a Polars expression (** becomes .pow()) and evaluate it
685
+ polars_expr = self._convert_expression_to_polars(expr)
686
+ result = data.select(polars_expr.alias("__result__"))["__result__"].to_numpy()
687
+ return result.astype(self.dtype), name
688
+ except Exception as e:
689
+ raise ValueError(
690
+ f"Failed to evaluate I() expression '{expr}': {e}\n"
691
+ f"Supported operations: +, -, *, /, ** (power)\n"
692
+ f"Example: I(x ** 2), I(x + y), I(x * y)"
693
+ ) from e
694
+
695
+ def _convert_expression_to_polars(self, expr: str) -> "pl.Expr":
696
+ """
697
+ Convert a Python-style expression to a Polars expression.
698
+
699
+ Handles:
700
+ - x ** 2 -> col("x").pow(2)
701
+ - x + y -> col("x") + col("y")
702
+ - x * y -> col("x") * col("y")
703
+ - x / y -> col("x") / col("y")
704
+ - x - y -> col("x") - col("y")
705
+ """
706
+ import polars as pl
707
+ import re
708
+
709
+ expr = expr.strip()
710
+
711
+ # Handle power operator: var ** num or var ** var
712
+ power_match = re.match(r'^(\w+)\s*\*\*\s*(\d+(?:\.\d+)?|\w+)$', expr)
713
+ if power_match:
714
+ var_name = power_match.group(1)
715
+ power = power_match.group(2)
716
+ try:
717
+ # Try to parse as number
718
+ power_val = float(power)
719
+ return pl.col(var_name).pow(power_val)
720
+ except ValueError:
721
+ # It's a column name
722
+ return pl.col(var_name).pow(pl.col(power))
723
+
724
+ # Handle binary operations: var op var or var op num
725
+ binary_ops = [
726
+ (r'^(\w+)\s*\+\s*(\w+|\d+(?:\.\d+)?)$', lambda a, b: a + b),
727
+ (r'^(\w+)\s*-\s*(\w+|\d+(?:\.\d+)?)$', lambda a, b: a - b),
728
+ (r'^(\w+)\s*\*\s*(\w+|\d+(?:\.\d+)?)$', lambda a, b: a * b),
729
+ (r'^(\w+)\s*/\s*(\w+|\d+(?:\.\d+)?)$', lambda a, b: a / b),
730
+ ]
731
+
732
+ for pattern, op_func in binary_ops:
733
+ match = re.match(pattern, expr)
734
+ if match:
735
+ left = match.group(1)
736
+ right = match.group(2)
737
+ left_expr = pl.col(left)
738
+ try:
739
+ right_val = float(right)
740
+ right_expr = pl.lit(right_val)
741
+ except ValueError:
742
+ right_expr = pl.col(right)
743
+ return op_func(left_expr, right_expr)
744
+
745
+ # If no pattern matched, try direct column reference (simple case)
746
+ # This handles cases like I(x) which is just the column itself
747
+ if re.match(r'^\w+$', expr):
748
+ return pl.col(expr)
749
+
750
+ raise ValueError(
751
+ f"Cannot parse expression '{expr}'. "
752
+ f"Supported formats: 'x ** 2', 'x + y', 'x * y', 'x / y', 'x - y'"
753
+ )
754
+
755
+ def build_design_matrix(
756
+ self,
757
+ formula: str,
758
+ exposure: Optional[np.ndarray] = None,
759
+ ) -> Tuple[np.ndarray, np.ndarray, List[str]]:
760
+ """
761
+ Build complete design matrix from formula.
762
+
763
+ Parameters
764
+ ----------
765
+ formula : str
766
+ R-style formula like "y ~ x1*x2 + C(cat) + bs(age, df=5)"
767
+ exposure : np.ndarray, optional
768
+ Exposure values. If provided, target encoding (TE) will use
769
+ rate (y/exposure) instead of raw y values. This is important
770
+ for frequency models to prevent TE values collapsing to near-constant.
771
+
772
+ Returns
773
+ -------
774
+ y : np.ndarray
775
+ Response variable
776
+ X : np.ndarray
777
+ Design matrix
778
+ names : list[str]
779
+ Column names
780
+ """
781
+ parsed = parse_formula_interactions(formula)
782
+
783
+ columns = []
784
+ names = []
785
+
786
+ # Add intercept
787
+ if parsed.has_intercept:
788
+ columns.append(np.ones(self._n, dtype=self.dtype))
789
+ names.append('Intercept')
790
+
791
+ # Add main effects
792
+ for var in parsed.main_effects:
793
+ if var in parsed.categorical_vars:
794
+ enc, enc_names = self._get_categorical_encoding(var)
795
+ columns.append(enc)
796
+ names.extend(enc_names)
797
+ else:
798
+ columns.append(self._get_column(var).reshape(-1, 1))
799
+ names.append(var)
800
+
801
+ # Add spline terms
802
+ for spline in parsed.spline_terms:
803
+ spline_cols, spline_names = self._build_spline_columns(spline)
804
+ columns.append(spline_cols)
805
+ names.extend(spline_names)
806
+ # Store fitted spline for prediction
807
+ self._fitted_splines[spline.var_name] = spline
808
+
809
+ # Add interactions
810
+ for interaction in parsed.interactions:
811
+ int_cols, int_names = self.build_interaction_columns(interaction)
812
+ if int_cols.ndim == 1:
813
+ int_cols = int_cols.reshape(-1, 1)
814
+ columns.append(int_cols)
815
+ names.extend(int_names)
816
+
817
+ # Store parsed formula for prediction
818
+ self._parsed_formula = parsed
819
+
820
+ # Get response (needed for target encoding)
821
+ y = self._get_column(parsed.response)
822
+
823
+ # Add target encoding terms (CatBoost-style)
824
+ # Store stats for prediction on new data
825
+ # When exposure is provided, use rate (y/exposure) for encoding
826
+ self._te_stats: Dict[str, dict] = {}
827
+ for te_term in parsed.target_encoding_terms:
828
+ te_col, te_name, te_stats = self._build_target_encoding_columns(
829
+ te_term, y, exposure=exposure
830
+ )
831
+ columns.append(te_col.reshape(-1, 1))
832
+ names.append(te_name)
833
+ self._te_stats[te_term.var_name] = te_stats
834
+
835
+ # Add identity terms (I() expressions like I(x ** 2))
836
+ for identity in parsed.identity_terms:
837
+ id_col, id_name = self._build_identity_columns(identity, self.data)
838
+ columns.append(id_col.reshape(-1, 1))
839
+ names.append(id_name)
840
+
841
+ # Stack all columns
842
+ if columns:
843
+ X = np.hstack([c if c.ndim == 2 else c.reshape(-1, 1) for c in columns])
844
+ else:
845
+ X = np.ones((self._n, 1), dtype=self.dtype)
846
+ names = ['Intercept']
847
+
848
+ # Store for validation
849
+ self._last_X = X
850
+ self._last_names = names
851
+
852
+ return y, X, names
853
+
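+     # Usage sketch ("cat" has two levels, so with drop-first coding it
+     # contributes one dummy column; column order assumes main effects are
+     # listed in formula order):
+     #
+     #     import polars as pl
+     #     df = pl.DataFrame({
+     #         "y":   [0.0, 1.0, 0.0, 2.0],
+     #         "x1":  [0.5, 1.5, 2.5, 3.5],
+     #         "x2":  [1.0, 0.0, 1.0, 0.0],
+     #         "cat": ["A", "B", "A", "B"],
+     #     })
+     #     builder = InteractionBuilder(df)
+     #     y, X, names = builder.build_design_matrix("y ~ x1*x2 + C(cat)")
+     #     # names -> ['Intercept', 'x1', 'x2', 'cat[T.B]', 'x1:x2']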
854
+ def validate_design_matrix(
855
+ self,
856
+         X: Optional[np.ndarray] = None,
857
+         names: Optional[List[str]] = None,
858
+ corr_threshold: float = 0.999,
859
+ verbose: bool = True,
860
+ ) -> dict:
861
+ """
862
+ Validate design matrix for common issues that cause fitting failures.
863
+
864
+ Parameters
865
+ ----------
866
+ X : np.ndarray, optional
867
+ Design matrix to validate. If None, uses last built matrix.
868
+ names : list of str, optional
869
+ Feature names. If None, uses last built names.
870
+ corr_threshold : float, default=0.999
871
+ Correlation threshold above which columns are flagged as problematic.
872
+ verbose : bool, default=True
873
+ Print diagnostic messages.
874
+
875
+ Returns
876
+ -------
877
+ dict
878
+ Validation results with keys:
879
+ - 'valid': bool, True if matrix is suitable for fitting
880
+ - 'rank': int, matrix rank
881
+ - 'expected_rank': int, number of columns
882
+ - 'condition_number': float, condition number (large = ill-conditioned)
883
+ - 'problematic_columns': list of tuples (col1, col2, correlation)
884
+ - 'zero_variance_columns': list of column names with zero variance
885
+ - 'suggestions': list of actionable fix suggestions
886
+ """
887
+         if X is None:
+             X = getattr(self, '_last_X', None)
+             names = getattr(self, '_last_names', None)
+             if X is None:
+                 raise ValueError("No design matrix to validate. Call build_design_matrix() first.")
+         if names is None:
+             # Fall back to positional names so the reporting below never fails
+             names = [f"x{i}" for i in range(X.shape[1])]
892
+
893
+ n_rows, n_cols = X.shape
894
+ results = {
895
+ 'valid': True,
896
+ 'rank': None,
897
+ 'expected_rank': n_cols,
898
+ 'condition_number': None,
899
+ 'problematic_columns': [],
900
+ 'zero_variance_columns': [],
901
+ 'suggestions': [],
902
+ }
903
+
904
+ # Check for NaN/Inf
905
+ if np.isnan(X).any():
906
+ results['valid'] = False
907
+ nan_cols = [names[i] for i in range(n_cols) if np.isnan(X[:, i]).any()]
908
+ results['suggestions'].append(f"Columns contain NaN values: {nan_cols}")
909
+
910
+ if np.isinf(X).any():
911
+ results['valid'] = False
912
+ inf_cols = [names[i] for i in range(n_cols) if np.isinf(X[:, i]).any()]
913
+ results['suggestions'].append(f"Columns contain Inf values: {inf_cols}")
914
+
915
+ # Check for zero variance columns (exclude Intercept which is supposed to be constant)
916
+ variances = np.var(X, axis=0)
917
+ zero_var_idx = np.where(variances < 1e-10)[0]
918
+ if len(zero_var_idx) > 0:
919
+ zero_var_cols = [names[i] for i in zero_var_idx if i < len(names) and names[i] != 'Intercept']
920
+ if zero_var_cols:
921
+ results['zero_variance_columns'] = zero_var_cols
922
+ results['valid'] = False
923
+ results['suggestions'].append(
924
+ f"Columns have zero/near-zero variance: {zero_var_cols}. "
925
+ "This often happens with splines on highly skewed data where most values are identical."
926
+ )
927
+
928
+ # Check matrix rank
929
+ try:
930
+ results['rank'] = np.linalg.matrix_rank(X)
931
+ if results['rank'] < n_cols:
932
+ results['valid'] = False
933
+ results['suggestions'].append(
934
+ f"Matrix is rank-deficient: rank={results['rank']}, expected={n_cols}. "
935
+ f"{n_cols - results['rank']} columns are linearly dependent."
936
+ )
937
+ except Exception as e:
938
+ results['suggestions'].append(f"Warning: Could not compute matrix rank: {e}")
939
+
940
+ # Check condition number
941
+ try:
942
+ results['condition_number'] = np.linalg.cond(X)
943
+ if results['condition_number'] > 1e10:
944
+ results['valid'] = False
945
+ results['suggestions'].append(
946
+ f"Matrix is ill-conditioned (condition number={results['condition_number']:.2e}). "
947
+ "This indicates near-linear dependence between columns."
948
+ )
949
+ except Exception as e:
950
+ results['suggestions'].append(f"Warning: Could not compute condition number: {e}")
951
+
952
+ # Check for highly correlated columns (skip intercept)
953
+ try:
954
+ # Compute correlations only for non-constant columns
955
+ non_const_idx = [i for i in range(n_cols) if variances[i] > 1e-10]
956
+ if len(non_const_idx) > 1:
957
+ X_subset = X[:, non_const_idx]
958
+ corr_matrix = np.corrcoef(X_subset.T)
959
+
960
+ for i in range(len(non_const_idx)):
961
+ for j in range(i + 1, len(non_const_idx)):
962
+ corr = abs(corr_matrix[i, j])
963
+ if corr > corr_threshold:
964
+ col1 = names[non_const_idx[i]]
965
+ col2 = names[non_const_idx[j]]
966
+ results['problematic_columns'].append((col1, col2, corr))
967
+
968
+ if results['problematic_columns']:
969
+ results['valid'] = False
970
+ pairs = [f"'{c1}' <-> '{c2}' (r={r:.4f})" for c1, c2, r in results['problematic_columns']]
971
+ results['suggestions'].append(
972
+ f"Highly correlated column pairs detected:\n " + "\n ".join(pairs) + "\n"
973
+ "This often happens with natural splines (ns) on skewed data. Fixes:\n"
974
+ " 1. Use B-splines instead: bs(VarName, df=4) - more robust to skewed data\n"
975
+ " 2. Use log transform: ns(log_VarName, df=4) for skewed variables\n"
976
+ " 3. Reduce degrees of freedom: ns(VarName, df=2)\n"
977
+ " 4. Use linear term instead: just 'VarName' without spline"
978
+ )
979
+ except Exception as e:
980
+ results['suggestions'].append(f"Warning: Could not compute column correlations: {e}")
981
+
982
+ if verbose:
983
+ print("=" * 60)
984
+ print("DESIGN MATRIX VALIDATION")
985
+ print("=" * 60)
986
+ print(f"Shape: {n_rows} rows × {n_cols} columns")
987
+ print(f"Rank: {results['rank']} / {n_cols}")
988
+             if results['condition_number'] is not None:
989
+ print(f"Condition number: {results['condition_number']:.2e}")
990
+ print(f"Status: {'✓ VALID' if results['valid'] else '✗ INVALID'}")
991
+
992
+ if not results['valid']:
993
+ print("\nPROBLEMS DETECTED:")
994
+ for i, suggestion in enumerate(results['suggestions'], 1):
995
+ print(f"\n{i}. {suggestion}")
996
+ print("=" * 60)
997
+
998
+ return results
999
+
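+     # Usage sketch:
+     #
+     #     report = builder.validate_design_matrix(verbose=False)
+     #     if not report["valid"]:
+     #         for tip in report["suggestions"]:
+     #             print(tip)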
1000
+ def transform_new_data(
1001
+ self,
1002
+ new_data: "pl.DataFrame",
1003
+ ) -> np.ndarray:
1004
+ """
1005
+ Transform new data using the encoding state from training.
1006
+
1007
+ This method applies the same transformations learned during
1008
+ build_design_matrix() to new data for prediction.
1009
+
1010
+ Parameters
1011
+ ----------
1012
+ new_data : pl.DataFrame
1013
+ New data to transform. Must have same columns as training data.
1014
+
1015
+ Returns
1016
+ -------
1017
+ X : np.ndarray
1018
+ Design matrix for new data
1019
+
1020
+ Raises
1021
+ ------
1022
+ ValueError
1023
+ If build_design_matrix() was not called first, or if new data
1024
+ contains unseen categorical levels.
1025
+ """
1026
+ if self._parsed_formula is None:
1027
+ raise ValueError(
1028
+ "Must call build_design_matrix() before transform_new_data(). "
1029
+ "No formula has been fitted yet."
1030
+ )
1031
+
1032
+ parsed = self._parsed_formula
1033
+ n_new = len(new_data)
1034
+ columns = []
1035
+
1036
+ # Add intercept
1037
+ if parsed.has_intercept:
1038
+ columns.append(np.ones(n_new, dtype=self.dtype))
1039
+
1040
+ # Add main effects
1041
+ for var in parsed.main_effects:
1042
+ if var in parsed.categorical_vars:
1043
+ enc = self._encode_categorical_new(new_data, var)
1044
+ columns.append(enc)
1045
+ else:
1046
+ col = new_data[var].to_numpy().astype(self.dtype)
1047
+ columns.append(col.reshape(-1, 1))
1048
+
1049
+ # Add spline terms using fitted knots
1050
+ for spline in parsed.spline_terms:
1051
+ x = new_data[spline.var_name].to_numpy().astype(self.dtype)
1052
+ # Use the fitted spline which has the same knots as training
1053
+ fitted_spline = self._fitted_splines.get(spline.var_name, spline)
1054
+ spline_cols, _ = fitted_spline.transform(x)
1055
+ columns.append(spline_cols)
1056
+
1057
+ # Add interactions
1058
+ for interaction in parsed.interactions:
1059
+ int_cols = self._build_interaction_new(new_data, interaction, n_new)
1060
+ if int_cols.ndim == 1:
1061
+ int_cols = int_cols.reshape(-1, 1)
1062
+ columns.append(int_cols)
1063
+
1064
+ # Add target encoding terms using stored statistics
1065
+ for te_term in parsed.target_encoding_terms:
1066
+ te_col = self._encode_target_new(new_data, te_term)
1067
+ columns.append(te_col.reshape(-1, 1))
1068
+
1069
+ # Add identity terms (I() expressions) - same evaluation on new data
1070
+ for identity in parsed.identity_terms:
1071
+ id_col, _ = self._build_identity_columns(identity, new_data)
1072
+ columns.append(id_col.reshape(-1, 1))
1073
+
1074
+ # Stack all columns
1075
+ if columns:
1076
+ X = np.hstack([c if c.ndim == 2 else c.reshape(-1, 1) for c in columns])
1077
+ else:
1078
+ X = np.ones((n_new, 1), dtype=self.dtype)
1079
+
1080
+ return X
1081
+
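+     # Prediction sketch: reuse the builder that was fitted on training data,
+     # so categorical levels, spline knots and TE statistics carry over
+     # (new_df is any DataFrame with the training columns):
+     #
+     #     y, X_train, names = builder.build_design_matrix(formula)
+     #     X_new = builder.transform_new_data(new_df)
+     #     assert X_new.shape[1] == X_train.shape[1]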
1082
+ def _encode_categorical_new(
1083
+ self,
1084
+ new_data: "pl.DataFrame",
1085
+ var_name: str,
1086
+ ) -> np.ndarray:
1087
+ """Encode categorical variable using levels from training."""
1088
+ levels = self._get_categorical_levels(var_name)
1089
+ col = new_data[var_name].to_numpy()
1090
+ n = len(col)
1091
+
1092
+ # Create level to index mapping (reference level is index 0)
1093
+ level_to_idx = {level: i for i, level in enumerate(levels)}
1094
+
1095
+ # Number of dummy columns (excluding reference level)
1096
+ n_dummies = len(levels) - 1
1097
+ encoding = np.zeros((n, n_dummies), dtype=self.dtype)
1098
+
1099
+ for i, val in enumerate(col):
1100
+ val_str = str(val)
1101
+ if val_str in level_to_idx:
1102
+ idx = level_to_idx[val_str]
1103
+ if idx > 0: # Skip reference level
1104
+ encoding[i, idx - 1] = 1.0
1105
+ # Unknown levels get all zeros (mapped to reference)
1106
+
1107
+ return encoding
1108
+
1109
+ def _build_interaction_new(
1110
+ self,
1111
+ new_data: "pl.DataFrame",
1112
+ interaction: InteractionTerm,
1113
+ n: int,
1114
+ ) -> np.ndarray:
1115
+ """Build interaction columns for new data."""
1116
+ if interaction.is_pure_continuous:
1117
+ # Continuous × continuous
1118
+ result = new_data[interaction.factors[0]].to_numpy().astype(self.dtype)
1119
+ for factor in interaction.factors[1:]:
1120
+ result = result * new_data[factor].to_numpy().astype(self.dtype)
1121
+ return result.reshape(-1, 1)
1122
+
1123
+ elif interaction.is_pure_categorical:
1124
+ # Categorical × categorical
1125
+ encodings = []
1126
+ for factor in interaction.factors:
1127
+ enc = self._encode_categorical_new(new_data, factor)
1128
+ encodings.append(enc)
1129
+
1130
+ # Build interaction by taking outer product
1131
+ result = encodings[0]
1132
+ for enc in encodings[1:]:
1133
+ # Kronecker-style expansion
1134
+ n_cols1, n_cols2 = result.shape[1], enc.shape[1]
1135
+ new_result = np.zeros((n, n_cols1 * n_cols2), dtype=self.dtype)
1136
+ for i in range(n_cols1):
1137
+ for j in range(n_cols2):
1138
+ new_result[:, i * n_cols2 + j] = result[:, i] * enc[:, j]
1139
+ result = new_result
1140
+ return result
1141
+
1142
+         else:
+             # Mixed: categorical × continuous (and possibly spline) factors
+             cat_factors = []
+             cont_factors = []
+             spline_factors = []
+             for factor, is_cat in zip(interaction.factors, interaction.categorical_flags):
+                 if is_cat:
+                     cat_factors.append(factor)
+                 else:
+                     # Spline factors like 'bs(age, df=4)' are not plain columns;
+                     # they must be rebuilt with the knots fitted during training
+                     spline = self._parse_spline_factor(factor)
+                     if spline is not None:
+                         spline_factors.append(spline)
+                     else:
+                         cont_factors.append(factor)
+
+             # Build categorical encoding (single factor or their interaction)
+             cat_enc = self._encode_categorical_new(new_data, cat_factors[0])
+             for factor in cat_factors[1:]:
+                 enc = self._encode_categorical_new(new_data, factor)
+                 n_cols1, n_cols2 = cat_enc.shape[1], enc.shape[1]
+                 new_enc = np.zeros((n, n_cols1 * n_cols2), dtype=self.dtype)
+                 for i in range(n_cols1):
+                     for j in range(n_cols2):
+                         new_enc[:, i * n_cols2 + j] = cat_enc[:, i] * enc[:, j]
+                 cat_enc = new_enc
+
+             # Continuous product (empty when the term is spline × categorical)
+             cont_product = None
+             if cont_factors:
+                 cont_product = new_data[cont_factors[0]].to_numpy().astype(self.dtype)
+                 for factor in cont_factors[1:]:
+                     cont_product = cont_product * new_data[factor].to_numpy().astype(self.dtype)
+
+             if spline_factors:
+                 # Mirror the training column order (spline-major, dummies inner)
+                 all_cols = []
+                 for spline in spline_factors:
+                     x = new_data[spline.var_name].to_numpy().astype(self.dtype)
+                     fitted = self._fitted_splines.get(spline.var_name, spline)
+                     basis, _ = fitted.transform(x)
+                     for j in range(basis.shape[1]):
+                         for i in range(cat_enc.shape[1]):
+                             all_cols.append(cat_enc[:, i] * basis[:, j])
+                 result = np.column_stack(all_cols) if all_cols else np.zeros((n, 0), dtype=self.dtype)
+                 if cont_product is not None:
+                     result = result * cont_product.reshape(-1, 1)
+                 return result
+
+             # Multiply categorical dummies by continuous
+             return cat_enc * cont_product.reshape(-1, 1)
1175
+
1176
+ def _encode_target_new(
1177
+ self,
1178
+ new_data: "pl.DataFrame",
1179
+ te_term: TargetEncodingTermSpec,
1180
+ ) -> np.ndarray:
1181
+ """Encode using target statistics from training."""
1182
+         if te_term.var_name not in getattr(self, '_te_stats', {}):
1183
+ raise ValueError(
1184
+ f"Target encoding for '{te_term.var_name}' was not fitted during training."
1185
+ )
1186
+
1187
+ stats = self._te_stats[te_term.var_name]
1188
+ prior = stats['prior']
1189
+ level_stats = stats['stats'] # Dict[str, (sum, count)]
1190
+ prior_weight = stats['prior_weight']
1191
+
1192
+ col = new_data[te_term.var_name].to_numpy()
1193
+ n = len(col)
1194
+ encoded = np.zeros(n, dtype=self.dtype)
1195
+
1196
+ for i, val in enumerate(col):
1197
+ val_str = str(val)
1198
+ if val_str in level_stats:
1199
+ level_sum, level_count = level_stats[val_str]
1200
+ # Use full training statistics for prediction
1201
+ encoded[i] = (level_sum + prior * prior_weight) / (level_count + prior_weight)
1202
+ else:
1203
+ # Unknown level - use global prior
1204
+ encoded[i] = prior
1205
+
1206
+ return encoded
1207
+
1208
+
1209
+ def build_design_matrix(
1210
+ formula: str,
1211
+ data: "pl.DataFrame",
1212
+ ) -> Tuple[np.ndarray, np.ndarray, List[str]]:
1213
+ """
1214
+ Build design matrix with optimized interaction handling.
1215
+
1216
+ This is a drop-in replacement for formulaic's model_matrix that is
1217
+ optimized for:
1218
+ - Large datasets (uses vectorized operations)
1219
+ - High-cardinality categoricals (sparse intermediate representations)
1220
+ - Many interaction terms
1221
+
1222
+ Parameters
1223
+ ----------
1224
+ formula : str
1225
+ R-style formula
1226
+ data : pl.DataFrame
1227
+ Polars DataFrame
1228
+
1229
+ Returns
1230
+ -------
1231
+ y : np.ndarray
1232
+ Response variable
1233
+ X : np.ndarray
1234
+ Design matrix
1235
+ feature_names : list[str]
1236
+ Column names
1237
+
1238
+ Example
1239
+ -------
1240
+ >>> y, X, names = build_design_matrix(
1241
+ ... "claims ~ age*C(region) + C(brand)*C(fuel)",
1242
+ ... data
1243
+ ... )
1244
+ """
1245
+ builder = InteractionBuilder(data)
1246
+ return builder.build_design_matrix(formula)