rustystats 0.1.5__cp313-cp313-manylinux_2_34_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,375 @@
1
+ """
2
+ Target Encoding (CatBoost-style Ordered Target Statistics)
3
+ ===========================================================
4
+
5
+ Implements CatBoost's ordered target statistics for categorical encoding.
6
+ This prevents target leakage during training by using only "past" observations
7
+ in the permutation order to compute statistics.
8
+
9
+ Reference: https://arxiv.org/abs/1706.09516 (CatBoost paper)
10
+
11
+ Key Features
12
+ ------------
13
+ - **Ordered statistics**: For training, each observation is encoded using only
14
+ observations that appear before it in a random permutation order
15
+ - **Multiple permutations**: Average across several permutations to reduce variance
16
+ - **Regularization**: Prior weight controls smoothing toward global mean
17
+ - **No target leakage**: The observation's own target is never used in its encoding
18
+
19
+ Usage
20
+ -----
21
+ Direct API:
22
+ >>> import rustystats as rs
23
+ >>> encoded, name, prior, stats = rs.target_encode(categories, target, "var")
24
+ >>> # For prediction on new data:
25
+ >>> new_encoded = rs.apply_target_encoding(new_categories, stats, prior)
26
+
27
+ Formula API:
28
+ >>> result = rs.glm("y ~ TE(brand) + age", data, family="poisson").fit()
29
+ """
30
+
31
+ from typing import Dict, List, Optional, Tuple, Union
32
+ import numpy as np
33
+
34
+ from . import _rustystats
35
+
36
+
37
def target_encode(
    categories: Union[List[str], np.ndarray],
    target: np.ndarray,
    var_name: str = "x",
    prior_weight: float = 1.0,
    n_permutations: int = 4,
    seed: Optional[int] = None,
) -> Tuple[np.ndarray, str, float, Dict[str, Tuple[float, int]]]:
    """
    Target encode a categorical variable using CatBoost-style ordered target statistics.

    The inputs are normalised here (levels to ``str``, target to a float64
    array) and the actual encoding is delegated to the native
    ``_rustystats.target_encode_py`` routine.

    Parameters
    ----------
    categories : list[str] or numpy.ndarray
        Categorical values. Every element is converted with ``str()``, so
        non-string levels (ints, numpy scalars, ...) are accepted.
    target : numpy.ndarray
        Target variable (continuous or binary). Converted to float64.
    var_name : str, optional
        Variable name for output column (default: "x")
    prior_weight : float, optional
        Regularization strength toward global mean (default: 1.0).
        Higher values = more regularization for rare categories.
    n_permutations : int, optional
        Number of random permutations to average (default: 4).
        More permutations = lower variance but slower.
    seed : int, optional
        Random seed for reproducibility (default: None = random)

    Returns
    -------
    encoded : numpy.ndarray
        Encoded values (shape: n_samples,)
    name : str
        Column name like "TE(var_name)"
    prior : float
        Global prior (mean of target) - needed for prediction
    level_stats : dict
        Mapping of level -> (sum_target, count) for prediction on new data

    Raises
    ------
    ValueError
        If ``categories`` and ``target`` do not contain the same number of
        observations.

    Examples
    --------
    >>> import rustystats as rs
    >>> import numpy as np
    >>>
    >>> categories = ["A", "B", "A", "B", "A", "B"]
    >>> target = np.array([1.0, 0.0, 1.0, 0.0, 1.0, 0.0])
    >>>
    >>> encoded, name, prior, stats = rs.target_encode(categories, target, "cat")
    >>> print(f"Column: {name}, Prior: {prior:.3f}")
    Column: TE(cat), Prior: 0.500
    >>>
    >>> # For new data:
    >>> new_cats = ["A", "B", "C"]  # C is unseen
    >>> new_encoded = rs.apply_target_encoding(new_cats, stats, prior)
    >>> print(new_encoded)  # C gets the prior

    Notes
    -----
    The algorithm:
    1. Shuffle data with random permutation
    2. For each observation i in permutation order:
       encoded[i] = (sum_target_before + prior * prior_weight) / (count_before + prior_weight)
    3. Average across multiple permutations to reduce variance

    For prediction on new data, use `apply_target_encoding()` which uses the full
    training statistics (no ordering needed).
    """
    # Lists and arrays need the same treatment: stringify every level before
    # handing the data to the native extension.
    categories = [str(x) for x in categories]
    target = np.asarray(target, dtype=np.float64)

    # Fail early with a readable message instead of forwarding misaligned
    # inputs to the native routine.
    if len(categories) != target.size:
        raise ValueError(
            "categories and target must have the same length "
            f"(got {len(categories)} categories and {target.size} target values)"
        )

    return _rustystats.target_encode_py(
        categories, target, var_name, prior_weight, n_permutations, seed
    )
118
+
119
+
120
def apply_target_encoding(
    categories: Union[List[str], np.ndarray],
    level_stats: Dict[str, Tuple[float, int]],
    prior: float,
    prior_weight: float = 1.0,
) -> np.ndarray:
    """
    Apply target encoding to new data using pre-computed statistics.

    For prediction: uses full training statistics (no ordering needed).
    Unseen categories get the prior (global mean). The levels are converted
    to strings here and the encoding itself is delegated to the native
    ``_rustystats.apply_target_encoding_py`` routine.

    Parameters
    ----------
    categories : list[str] or numpy.ndarray
        Categorical values for new data. Every element is converted with
        ``str()`` before lookup.
    level_stats : dict
        Mapping of level -> (sum_target, count) from training.
        Returned by `target_encode()`.
    prior : float
        Global prior (mean of training target).
        Returned by `target_encode()`.
    prior_weight : float, optional
        Prior weight (should match training, default: 1.0)

    Returns
    -------
    numpy.ndarray
        Encoded values for new data

    Examples
    --------
    >>> # Train
    >>> encoded, name, prior, stats = rs.target_encode(train_cats, train_y, "brand")
    >>>
    >>> # Predict
    >>> test_encoded = rs.apply_target_encoding(test_cats, stats, prior)
    """
    # Lists and arrays need the same treatment: stringify every level so the
    # keys line up with the string keys of ``level_stats``.
    categories = [str(x) for x in categories]

    return _rustystats.apply_target_encoding_py(
        categories, level_stats, prior, prior_weight
    )
167
+
168
+
169
class TargetEncoder:
    """
    Scikit-learn style wrapper around `target_encode` / `apply_target_encoding`.

    The encoder learns a global prior and per-level statistics from training
    data and reuses them to encode any later data set.

    Parameters
    ----------
    prior_weight : float, optional
        Regularization strength toward global mean (default: 1.0)
    n_permutations : int, optional
        Number of random permutations to average (default: 4)
    seed : int, optional
        Random seed for reproducibility

    Attributes
    ----------
    prior_ : float
        Global prior (mean of training target); ``None`` until fitted
    level_stats_ : dict
        Mapping of level -> (sum_target, count); ``None`` until fitted

    Examples
    --------
    >>> encoder = rs.TargetEncoder(prior_weight=1.0, n_permutations=4)
    >>> train_encoded = encoder.fit_transform(train_categories, train_y)
    >>> test_encoded = encoder.transform(test_categories)
    """

    def __init__(
        self,
        prior_weight: float = 1.0,
        n_permutations: int = 4,
        seed: Optional[int] = None,
    ):
        # Hyper-parameters, stored verbatim.
        self.prior_weight = prior_weight
        self.n_permutations = n_permutations
        self.seed = seed
        # Learned state, populated by fit() / fit_transform().
        self.prior_: Optional[float] = None
        self.level_stats_: Optional[Dict[str, Tuple[float, int]]] = None

    def _encode_and_store(
        self,
        categories: Union[List[str], np.ndarray],
        target: np.ndarray,
    ) -> np.ndarray:
        """Run `target_encode`, keep the prior and level statistics, return the encoding."""
        result = target_encode(
            categories,
            target,
            var_name="x",
            prior_weight=self.prior_weight,
            n_permutations=self.n_permutations,
            seed=self.seed,
        )
        ordered_values, _, self.prior_, self.level_stats_ = result
        return ordered_values

    def fit(
        self,
        categories: Union[List[str], np.ndarray],
        target: np.ndarray,
    ) -> "TargetEncoder":
        """
        Learn the prior and per-level statistics from training data.

        Parameters
        ----------
        categories : list[str] or numpy.ndarray
            Categorical values
        target : numpy.ndarray
            Target variable

        Returns
        -------
        self
        """
        # The training-set encoding is computed as a by-product and discarded.
        self._encode_and_store(categories, target)
        return self

    def transform(
        self,
        categories: Union[List[str], np.ndarray],
    ) -> np.ndarray:
        """
        Encode categories with the statistics learned during fitting.

        Intended for test/validation data: the full training statistics are
        used.

        Parameters
        ----------
        categories : list[str] or numpy.ndarray
            Categorical values

        Returns
        -------
        numpy.ndarray
            Encoded values

        Raises
        ------
        ValueError
            If the encoder has not been fitted yet.
        """
        is_fitted = self.level_stats_ is not None and self.prior_ is not None
        if not is_fitted:
            raise ValueError("Encoder not fitted. Call fit() first.")

        return apply_target_encoding(
            categories,
            self.level_stats_,
            self.prior_,
            self.prior_weight,
        )

    def fit_transform(
        self,
        categories: Union[List[str], np.ndarray],
        target: np.ndarray,
    ) -> np.ndarray:
        """
        Fit on training data and return its ordered-statistics encoding.

        Unlike ``fit(...).transform(...)``, the returned values come from
        `target_encode`, i.e. the CatBoost-style ordered encoding meant to
        prevent target leakage on the training set.

        Parameters
        ----------
        categories : list[str] or numpy.ndarray
            Categorical values
        target : numpy.ndarray
            Target variable

        Returns
        -------
        numpy.ndarray
            Encoded values (with ordered statistics for training)
        """
        return self._encode_and_store(categories, target)
288
+
289
+
290
class TargetEncodingTerm:
    """
    Represents a target encoding term in a formula.

    Used internally by the formula parser to handle TE(var) syntax.

    Parameters
    ----------
    var_name : str
        Variable name to encode
    prior_weight : float, optional
        Prior weight for regularization
    n_permutations : int, optional
        Number of permutations

    Attributes
    ----------
    encoder : TargetEncoder or None
        Encoder created by `fit_transform()`; ``None`` until then.
    """

    def __init__(
        self,
        var_name: str,
        prior_weight: float = 1.0,
        n_permutations: int = 4,
    ):
        self.var_name = var_name
        self.prior_weight = prior_weight
        self.n_permutations = n_permutations
        self.encoder: Optional["TargetEncoder"] = None

    def _column_as_strings(self, data) -> List[str]:
        """Return the ``var_name`` column of ``data`` as a list of strings."""
        column = data[self.var_name]
        if hasattr(data, "to_numpy"):
            # Frames exposing ``to_numpy`` (Polars and pandas both do): the
            # column is converted the same way.
            values = column.to_numpy()
        elif hasattr(column, "values"):
            # Frame-like containers whose columns only expose ``.values``.
            values = column.values
        else:
            # Plain mappings of arrays/lists: the column is already iterable.
            values = column
        return [str(x) for x in values]

    def fit_transform(
        self,
        data,  # DataFrame (Polars or Pandas) or mapping of column name -> values
        target: np.ndarray,
        seed: Optional[int] = None,
    ) -> Tuple[np.ndarray, str]:
        """
        Fit a new encoder on one column of ``data`` and return its encoding.

        Parameters
        ----------
        data : DataFrame or mapping
            Container indexable by ``var_name``.
        target : numpy.ndarray
            Target variable aligned with the rows of ``data``.
        seed : int, optional
            Random seed forwarded to the encoder.

        Returns
        -------
        values : numpy.ndarray
            Encoded values
        name : str
            Column name
        """
        categories = self._column_as_strings(data)

        # A fresh encoder is built on every call, so refitting replaces any
        # previously learned statistics.
        self.encoder = TargetEncoder(
            prior_weight=self.prior_weight,
            n_permutations=self.n_permutations,
            seed=seed,
        )
        encoded = self.encoder.fit_transform(categories, target)

        return encoded, f"TE({self.var_name})"

    def transform(self, data) -> Tuple[np.ndarray, str]:
        """
        Transform new data using fitted encoder.

        Parameters
        ----------
        data : DataFrame or mapping
            Container indexable by ``var_name``.

        Returns
        -------
        values : numpy.ndarray
            Encoded values
        name : str
            Column name

        Raises
        ------
        ValueError
            If `fit_transform()` has not been called yet.
        """
        if self.encoder is None:
            raise ValueError("Term not fitted. Call fit_transform() first.")

        categories = self._column_as_strings(data)
        encoded = self.encoder.transform(categories)

        return encoded, f"TE({self.var_name})"