rustystats 0.1.5__cp313-cp313-manylinux_2_34_x86_64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- rustystats/__init__.py +151 -0
- rustystats/_rustystats.cpython-313-x86_64-linux-gnu.so +0 -0
- rustystats/diagnostics.py +2471 -0
- rustystats/families.py +423 -0
- rustystats/formula.py +1074 -0
- rustystats/glm.py +249 -0
- rustystats/interactions.py +1246 -0
- rustystats/links.py +221 -0
- rustystats/splines.py +367 -0
- rustystats/target_encoding.py +375 -0
- rustystats-0.1.5.dist-info/METADATA +476 -0
- rustystats-0.1.5.dist-info/RECORD +14 -0
- rustystats-0.1.5.dist-info/WHEEL +4 -0
- rustystats-0.1.5.dist-info/licenses/LICENSE +21 -0
|
@@ -0,0 +1,375 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Target Encoding (CatBoost-style Ordered Target Statistics)
|
|
3
|
+
===========================================================
|
|
4
|
+
|
|
5
|
+
Implements CatBoost's ordered target statistics for categorical encoding.
|
|
6
|
+
This prevents target leakage during training by using only "past" observations
|
|
7
|
+
in the permutation order to compute statistics.
|
|
8
|
+
|
|
9
|
+
Reference: https://arxiv.org/abs/1706.09516 (CatBoost paper)
|
|
10
|
+
|
|
11
|
+
Key Features
|
|
12
|
+
------------
|
|
13
|
+
- **Ordered statistics**: For training, each observation is encoded using only
|
|
14
|
+
observations that appear before it in a random permutation order
|
|
15
|
+
- **Multiple permutations**: Average across several permutations to reduce variance
|
|
16
|
+
- **Regularization**: Prior weight controls smoothing toward global mean
|
|
17
|
+
- **No target leakage**: The observation's own target is never used in its encoding
|
|
18
|
+
|
|
19
|
+
Usage
|
|
20
|
+
-----
|
|
21
|
+
Direct API:
|
|
22
|
+
>>> import rustystats as rs
|
|
23
|
+
>>> encoded, name, prior, stats = rs.target_encode(categories, target, "var")
|
|
24
|
+
>>> # For prediction on new data:
|
|
25
|
+
>>> new_encoded = rs.apply_target_encoding(new_categories, stats, prior)
|
|
26
|
+
|
|
27
|
+
Formula API:
|
|
28
|
+
>>> result = rs.glm("y ~ TE(brand) + age", data, family="poisson").fit()
|
|
29
|
+
"""
|
|
30
|
+
|
|
31
|
+
from typing import Dict, List, Optional, Tuple, Union
|
|
32
|
+
import numpy as np
|
|
33
|
+
|
|
34
|
+
from . import _rustystats
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
def target_encode(
    categories: Union[List[str], np.ndarray],
    target: np.ndarray,
    var_name: str = "x",
    prior_weight: float = 1.0,
    n_permutations: int = 4,
    seed: Optional[int] = None,
) -> Tuple[np.ndarray, str, float, Dict[str, Tuple[float, int]]]:
    """
    Target encode a categorical variable with CatBoost-style ordered target statistics.

    This is a thin wrapper: it normalises the inputs and delegates the actual
    encoding to the native ``_rustystats.target_encode_py`` routine. The
    encoding is intended to prevent target leakage during training by
    computing statistics using only "past" observations in a random
    permutation order.

    Parameters
    ----------
    categories : list[str] or numpy.ndarray
        Categorical values. Every element is converted with ``str()`` before
        being passed on, so non-string levels are accepted.
    target : numpy.ndarray
        Target variable (continuous or binary). Converted to a ``float64``
        array.
    var_name : str, optional
        Variable name for output column (default: "x")
    prior_weight : float, optional
        Regularization strength toward global mean (default: 1.0).
        Higher values = more regularization for rare categories.
    n_permutations : int, optional
        Number of random permutations to average (default: 4).
        More permutations = lower variance but slower.
    seed : int, optional
        Random seed for reproducibility (default: None = random)

    Returns
    -------
    encoded : numpy.ndarray
        Encoded values (shape: n_samples,)
    name : str
        Column name like "TE(var_name)"
    prior : float
        Global prior (mean of target) - needed for prediction
    level_stats : dict
        Mapping of level -> (sum_target, count) for prediction on new data

    Raises
    ------
    ValueError
        If ``target`` is one-dimensional and its length differs from the
        number of ``categories``.

    Examples
    --------
    >>> import rustystats as rs
    >>> import numpy as np
    >>>
    >>> categories = ["A", "B", "A", "B", "A", "B"]
    >>> target = np.array([1.0, 0.0, 1.0, 0.0, 1.0, 0.0])
    >>>
    >>> encoded, name, prior, stats = rs.target_encode(categories, target, "cat")
    >>> print(f"Column: {name}, Prior: {prior:.3f}")
    Column: TE(cat), Prior: 0.500
    >>>
    >>> # For new data ("C" is unseen and gets the prior):
    >>> new_cats = ["A", "B", "C"]
    >>> new_encoded = rs.apply_target_encoding(new_cats, stats, prior)

    Notes
    -----
    The algorithm:
    1. Shuffle data with random permutation
    2. For each observation i in permutation order:
       encoded[i] = (sum_target_before + prior * prior_weight) / (count_before + prior_weight)
    3. Average across multiple permutations to reduce variance

    For prediction on new data, use `apply_target_encoding()` which uses the full
    training statistics (no ordering needed).
    """
    # The same conversion applies to lists and arrays alike, so there is no
    # need to branch on the container type.
    categories = [str(level) for level in categories]
    target = np.asarray(target, dtype=np.float64)

    # Catch mismatched inputs here with a clear message instead of relying on
    # the native layer. Non-1-D targets are left for the native call to
    # accept or reject exactly as before.
    if target.ndim == 1 and target.shape[0] != len(categories):
        raise ValueError(
            "categories and target must have the same length, "
            f"got {len(categories)} and {target.shape[0]}"
        )

    return _rustystats.target_encode_py(
        categories, target, var_name, prior_weight, n_permutations, seed
    )
|
|
118
|
+
|
|
119
|
+
|
|
120
|
+
def apply_target_encoding(
    categories: Union[List[str], np.ndarray],
    level_stats: Dict[str, Tuple[float, int]],
    prior: float,
    prior_weight: float = 1.0,
) -> np.ndarray:
    """
    Apply target encoding to new data using pre-computed statistics.

    Thin wrapper around the native ``_rustystats.apply_target_encoding_py``
    routine. For prediction: uses full training statistics (no ordering
    needed). Unseen categories get the prior (global mean).

    Parameters
    ----------
    categories : list[str] or numpy.ndarray
        Categorical values for new data. Every element is converted with
        ``str()`` before being passed on.
    level_stats : dict
        Mapping of level -> (sum_target, count) from training.
        Returned by `target_encode()`.
    prior : float
        Global prior (mean of training target).
        Returned by `target_encode()`.
    prior_weight : float, optional
        Prior weight (should match training, default: 1.0)

    Returns
    -------
    numpy.ndarray
        Encoded values for new data

    Examples
    --------
    >>> # Train
    >>> encoded, name, prior, stats = rs.target_encode(train_cats, train_y, "brand")
    >>>
    >>> # Predict
    >>> test_encoded = rs.apply_target_encoding(test_cats, stats, prior)
    """
    # Lists and arrays get the same treatment, so one conversion is enough.
    categories = [str(level) for level in categories]

    return _rustystats.apply_target_encoding_py(
        categories, level_stats, prior, prior_weight
    )
|
|
167
|
+
|
|
168
|
+
|
|
169
|
+
class TargetEncoder:
    """
    Scikit-learn style target encoder with CatBoost-style ordered target statistics.

    Fits on training data and transforms both training and test data.

    Parameters
    ----------
    prior_weight : float, optional
        Regularization strength toward global mean (default: 1.0)
    n_permutations : int, optional
        Number of random permutations to average (default: 4)
    seed : int, optional
        Random seed for reproducibility

    Attributes
    ----------
    prior_ : float
        Global prior (mean of training target); ``None`` until fitted
    level_stats_ : dict
        Mapping of level -> (sum_target, count); ``None`` until fitted

    Examples
    --------
    >>> encoder = rs.TargetEncoder(prior_weight=1.0, n_permutations=4)
    >>> train_encoded = encoder.fit_transform(train_categories, train_y)
    >>> test_encoded = encoder.transform(test_categories)
    """

    def __init__(
        self,
        prior_weight: float = 1.0,
        n_permutations: int = 4,
        seed: Optional[int] = None,
    ):
        self.prior_weight = prior_weight
        self.n_permutations = n_permutations
        self.seed = seed
        # Fitted state: both stay None until fit() or fit_transform() runs.
        self.prior_: Optional[float] = None
        self.level_stats_: Optional[Dict[str, Tuple[float, int]]] = None

    def __repr__(self) -> str:
        """Return a constructor-style description of the hyper-parameters."""
        return (
            f"{type(self).__name__}(prior_weight={self.prior_weight!r}, "
            f"n_permutations={self.n_permutations!r}, seed={self.seed!r})"
        )

    def _fit_and_encode(
        self,
        categories: Union[List[str], np.ndarray],
        target: np.ndarray,
    ) -> np.ndarray:
        """Run `target_encode`, store the fitted state, return the encoding."""
        # The column name is irrelevant here, so the placeholder "x" is used
        # and the returned name is discarded.
        encoded, _, self.prior_, self.level_stats_ = target_encode(
            categories, target, "x",
            self.prior_weight, self.n_permutations, self.seed
        )
        return encoded

    def fit(
        self,
        categories: Union[List[str], np.ndarray],
        target: np.ndarray,
    ) -> "TargetEncoder":
        """
        Fit the encoder on training data.

        Parameters
        ----------
        categories : list[str] or numpy.ndarray
            Categorical values
        target : numpy.ndarray
            Target variable

        Returns
        -------
        self
        """
        # Only the stored statistics matter here; the encoded training
        # values are dropped.
        self._fit_and_encode(categories, target)
        return self

    def transform(
        self,
        categories: Union[List[str], np.ndarray],
    ) -> np.ndarray:
        """
        Transform categories using fitted statistics.

        For test/validation data, uses full training statistics.

        Parameters
        ----------
        categories : list[str] or numpy.ndarray
            Categorical values

        Returns
        -------
        numpy.ndarray
            Encoded values

        Raises
        ------
        ValueError
            If the encoder has not been fitted yet.
        """
        if self.level_stats_ is None or self.prior_ is None:
            raise ValueError("Encoder not fitted. Call fit() first.")

        return apply_target_encoding(
            categories, self.level_stats_, self.prior_, self.prior_weight
        )

    def fit_transform(
        self,
        categories: Union[List[str], np.ndarray],
        target: np.ndarray,
    ) -> np.ndarray:
        """
        Fit and transform training data using ordered target statistics.

        Uses CatBoost-style ordering to prevent target leakage. The result is
        whatever `target_encode` returns for the training rows, not
        ``fit(...)`` followed by ``transform(...)``.

        Parameters
        ----------
        categories : list[str] or numpy.ndarray
            Categorical values
        target : numpy.ndarray
            Target variable

        Returns
        -------
        numpy.ndarray
            Encoded values (with ordered statistics for training)
        """
        return self._fit_and_encode(categories, target)
|
|
288
|
+
|
|
289
|
+
|
|
290
|
+
class TargetEncodingTerm:
    """
    Represents a target encoding term in a formula.

    Used internally by the formula parser to handle TE(var) syntax.

    Parameters
    ----------
    var_name : str
        Variable name to encode
    prior_weight : float, optional
        Prior weight for regularization
    n_permutations : int, optional
        Number of permutations
    """

    def __init__(
        self,
        var_name: str,
        prior_weight: float = 1.0,
        n_permutations: int = 4,
    ):
        self.var_name = var_name
        self.prior_weight = prior_weight
        self.n_permutations = n_permutations
        # Created by fit_transform(); None means the term is not fitted yet.
        self.encoder: Optional["TargetEncoder"] = None

    def _extract_categories(self, data) -> List[str]:
        """Pull column ``var_name`` out of ``data`` as a list of strings."""
        column = data[self.var_name]
        if hasattr(data, 'to_numpy'):
            # Frame-level check kept from the original code; current pandas
            # and Polars frames are both expected to land here.
            values = column.to_numpy()
        elif hasattr(column, 'values') and not callable(column.values):
            # Series-like column on a container without to_numpy().
            values = column.values
        else:
            # Plain mapping of lists / arrays: iterate the column directly.
            values = column
        return [str(item) for item in values]

    def fit_transform(
        self,
        data,  # DataFrame (Polars or Pandas)
        target: np.ndarray,
        seed: Optional[int] = None,
    ) -> Tuple[np.ndarray, str]:
        """
        Fit and transform the column from a DataFrame.

        A fresh `TargetEncoder` is created on every call and stored on
        ``self.encoder``, replacing any previously fitted one.

        Parameters
        ----------
        data : DataFrame or mapping
            Container indexable by ``var_name``
        target : numpy.ndarray
            Target variable
        seed : int, optional
            Random seed forwarded to the encoder

        Returns
        -------
        values : numpy.ndarray
            Encoded values
        name : str
            Column name
        """
        categories = self._extract_categories(data)

        self.encoder = TargetEncoder(
            prior_weight=self.prior_weight,
            n_permutations=self.n_permutations,
            seed=seed,
        )
        encoded = self.encoder.fit_transform(categories, target)

        return encoded, f"TE({self.var_name})"

    def transform(self, data) -> Tuple[np.ndarray, str]:
        """
        Transform new data using fitted encoder.

        Parameters
        ----------
        data : DataFrame or mapping
            Container indexable by ``var_name``

        Returns
        -------
        values : numpy.ndarray
            Encoded values
        name : str
            Column name

        Raises
        ------
        ValueError
            If `fit_transform()` has not been called yet.
        """
        if self.encoder is None:
            raise ValueError("Term not fitted. Call fit_transform() first.")

        categories = self._extract_categories(data)
        encoded = self.encoder.transform(categories)

        return encoded, f"TE({self.var_name})"
|