aplr 10.19.2__cp311-cp311-win_amd64.whl → 10.20.0__cp311-cp311-win_amd64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- aplr/aplr.py +103 -289
- {aplr-10.19.2.dist-info → aplr-10.20.0.dist-info}/METADATA +1 -1
- aplr-10.20.0.dist-info/RECORD +8 -0
- aplr_cpp.cp311-win_amd64.pyd +0 -0
- aplr-10.19.2.dist-info/RECORD +0 -8
- {aplr-10.19.2.dist-info → aplr-10.20.0.dist-info}/WHEEL +0 -0
- {aplr-10.19.2.dist-info → aplr-10.20.0.dist-info}/licenses/LICENSE +0 -0
- {aplr-10.19.2.dist-info → aplr-10.20.0.dist-info}/top_level.txt +0 -0
aplr/aplr.py
CHANGED
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
from typing import List, Callable, Optional, Dict, Union
|
|
1
|
+
from typing import List, Callable, Optional, Dict, Union
|
|
2
2
|
import numpy as np
|
|
3
3
|
import pandas as pd
|
|
4
4
|
import aplr_cpp
|
|
@@ -10,241 +10,57 @@ IntVector = np.ndarray
|
|
|
10
10
|
IntMatrix = np.ndarray
|
|
11
11
|
|
|
12
12
|
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
self._fit_preprocessor(X, X_names, sample_weight)
|
|
33
|
-
|
|
34
|
-
X = self._transform_X(X)
|
|
35
|
-
|
|
36
|
-
return X.to_numpy(dtype=np.float64), list(X.columns)
|
|
37
|
-
|
|
38
|
-
def _preprocess_X_predict(self, X: Union[pd.DataFrame, FloatMatrix]) -> FloatMatrix:
|
|
39
|
-
X = self._transform_X(X)
|
|
40
|
-
return X.to_numpy(dtype=np.float64)
|
|
41
|
-
|
|
42
|
-
def _fit_preprocessor(
|
|
43
|
-
self,
|
|
44
|
-
X: Union[pd.DataFrame, FloatMatrix],
|
|
45
|
-
X_names: List[str],
|
|
46
|
-
sample_weight: FloatVector,
|
|
47
|
-
) -> None:
|
|
48
|
-
"""Learns transformations from the training data and sets preprocessor state."""
|
|
49
|
-
X = self._convert_input_to_dataframe_for_fit(X, X_names=X_names)
|
|
50
|
-
self.X_names_ = list(X.columns)
|
|
51
|
-
self.categorical_features_ = list(
|
|
52
|
-
X.select_dtypes(include=["category", "object"]).columns
|
|
53
|
-
)
|
|
54
|
-
|
|
55
|
-
self._fit_one_hot_encoding(X)
|
|
56
|
-
self._fit_missing_indicators(X)
|
|
57
|
-
|
|
58
|
-
# Learn median values for imputation from the original data.
|
|
59
|
-
self.median_values_ = {}
|
|
60
|
-
numeric_cols_for_median = [
|
|
61
|
-
col for col in X.columns if col not in self.categorical_features_
|
|
62
|
-
]
|
|
63
|
-
for col in numeric_cols_for_median:
|
|
64
|
-
missing_mask = X[col].isnull()
|
|
65
|
-
if sample_weight.size > 0:
|
|
66
|
-
valid_indices = ~missing_mask
|
|
67
|
-
col_data = X.loc[valid_indices, col]
|
|
68
|
-
col_weights = sample_weight[valid_indices]
|
|
69
|
-
if col_data.empty:
|
|
70
|
-
median_val = 0
|
|
71
|
-
else:
|
|
72
|
-
col_data_np = col_data.to_numpy()
|
|
73
|
-
sort_indices = np.argsort(col_data_np, kind="stable")
|
|
74
|
-
sorted_data = col_data_np[sort_indices]
|
|
75
|
-
sorted_weights = col_weights[sort_indices]
|
|
76
|
-
cumulative_weights = np.cumsum(sorted_weights)
|
|
77
|
-
total_weight = cumulative_weights[-1]
|
|
78
|
-
median_weight_index = np.searchsorted(
|
|
79
|
-
cumulative_weights, total_weight / 2.0
|
|
80
|
-
)
|
|
81
|
-
if median_weight_index >= len(sorted_data):
|
|
82
|
-
median_weight_index = len(sorted_data) - 1
|
|
83
|
-
median_val = sorted_data[median_weight_index]
|
|
84
|
-
else:
|
|
85
|
-
if X[col].isnull().all():
|
|
86
|
-
median_val = 0
|
|
87
|
-
else:
|
|
88
|
-
median_val = X[col].median()
|
|
89
|
-
|
|
90
|
-
if pd.isna(median_val):
|
|
91
|
-
median_val = 0
|
|
92
|
-
self.median_values_[col] = median_val
|
|
93
|
-
|
|
94
|
-
# Determine the final column names after all transformations.
|
|
95
|
-
final_cols = []
|
|
96
|
-
if self.ohe_columns_:
|
|
97
|
-
final_cols.extend(self.ohe_columns_)
|
|
13
|
+
def _dataframe_to_cpp_dataframe(df: pd.DataFrame) -> aplr_cpp.CppDataFrame:
|
|
14
|
+
"""Converts a pandas DataFrame to a CppDataFrame."""
|
|
15
|
+
cpp_df = aplr_cpp.CppDataFrame()
|
|
16
|
+
for col_name in df.columns:
|
|
17
|
+
col = df[col_name]
|
|
18
|
+
if pd.api.types.is_numeric_dtype(col.dtype):
|
|
19
|
+
# Convert numeric columns to std::vector<double>
|
|
20
|
+
# NaNs are preserved and handled in C++
|
|
21
|
+
cpp_df.add_numeric_column(
|
|
22
|
+
col_name, col.to_numpy(dtype=np.float64, na_value=np.nan)
|
|
23
|
+
)
|
|
24
|
+
elif (
|
|
25
|
+
isinstance(col.dtype, pd.CategoricalDtype)
|
|
26
|
+
or pd.api.types.is_object_dtype(col.dtype)
|
|
27
|
+
or pd.api.types.is_string_dtype(col.dtype)
|
|
28
|
+
):
|
|
29
|
+
# Convert categorical/object/string columns to std::vector<std::string>
|
|
30
|
+
# Missing values (None, np.nan) are converted to empty strings for C++ handling
|
|
31
|
+
cpp_df.add_categorical_column(col_name, col.astype(str).fillna("").tolist())
|
|
98
32
|
else:
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
|
|
124
|
-
# (adding missing indicators or filling NaNs) and a copy hasn't already
|
|
125
|
-
# been made by one-hot encoding.
|
|
126
|
-
if not self.categorical_features_ and X.isnull().to_numpy().any():
|
|
127
|
-
X = X.copy()
|
|
128
|
-
|
|
129
|
-
X = self._transform_missing_indicators(X)
|
|
130
|
-
|
|
131
|
-
for col, val in self.median_values_.items():
|
|
132
|
-
if col in X.columns:
|
|
133
|
-
X[col] = X[col].fillna(val)
|
|
134
|
-
|
|
135
|
-
# Enforce final column order and add missing columns if necessary
|
|
136
|
-
if self.final_training_columns_:
|
|
137
|
-
missing_final_cols = set(self.final_training_columns_) - set(X.columns)
|
|
138
|
-
for c in missing_final_cols:
|
|
139
|
-
X[c] = 0
|
|
140
|
-
if not X.columns.equals(pd.Index(self.final_training_columns_)):
|
|
141
|
-
X = X.reindex(columns=self.final_training_columns_, copy=False)
|
|
142
|
-
|
|
143
|
-
return X
|
|
144
|
-
|
|
145
|
-
def _transform_one_hot_encoding(self, X: pd.DataFrame) -> pd.DataFrame:
|
|
146
|
-
"""Applies one-hot encoding using learned OHE columns during transformation."""
|
|
147
|
-
if not self.categorical_features_:
|
|
148
|
-
return X
|
|
149
|
-
|
|
150
|
-
X = pd.get_dummies(X, columns=self.categorical_features_, dummy_na=False)
|
|
151
|
-
# Handle missing OHE columns (categories not seen in new data)
|
|
152
|
-
missing_cols = set(self.ohe_columns_) - set(X.columns)
|
|
153
|
-
for c in missing_cols:
|
|
154
|
-
X[c] = 0
|
|
155
|
-
# Ensure column order
|
|
156
|
-
if not X.columns.equals(pd.Index(self.ohe_columns_)):
|
|
157
|
-
X = X.reindex(columns=self.ohe_columns_, copy=False)
|
|
158
|
-
return X
|
|
159
|
-
|
|
160
|
-
def _transform_missing_indicators(self, X: pd.DataFrame) -> pd.DataFrame:
|
|
161
|
-
"""Adds _missing indicator columns for features with NaNs during transformation."""
|
|
162
|
-
if not self.na_imputed_cols_:
|
|
163
|
-
return X
|
|
164
|
-
# Only add indicators for columns that were imputed during fit and are currently missing data.
|
|
165
|
-
for col in self.na_imputed_cols_:
|
|
166
|
-
if col in X.columns and X[col].isnull().any():
|
|
167
|
-
X[col + "_missing"] = X[col].isnull().astype(int)
|
|
168
|
-
return X
|
|
169
|
-
|
|
170
|
-
def _convert_input_to_dataframe_for_fit(
|
|
171
|
-
self,
|
|
172
|
-
X: Union[pd.DataFrame, FloatMatrix],
|
|
173
|
-
X_names: Optional[List[str]] = None,
|
|
174
|
-
) -> pd.DataFrame:
|
|
175
|
-
"""Converts input X to a pandas DataFrame for fitting, handling column names."""
|
|
176
|
-
X, was_converted = self._to_dataframe(X)
|
|
177
|
-
if was_converted:
|
|
178
|
-
if X_names:
|
|
179
|
-
X.columns = list(X_names)
|
|
180
|
-
else:
|
|
181
|
-
X.columns = [f"X{i}" for i in range(X.shape[1])]
|
|
182
|
-
return X
|
|
183
|
-
|
|
184
|
-
def _convert_input_to_dataframe_for_transform(
|
|
185
|
-
self, X: Union[pd.DataFrame, FloatMatrix]
|
|
186
|
-
) -> pd.DataFrame:
|
|
187
|
-
"""Converts input X to a pandas DataFrame for transformation, aligning columns."""
|
|
188
|
-
X, was_converted = self._to_dataframe(X)
|
|
189
|
-
if was_converted:
|
|
190
|
-
if self.X_names_ and len(self.X_names_) == X.shape[1]:
|
|
191
|
-
X.columns = self.X_names_ # Use names learned during fit
|
|
192
|
-
else: # If X was already a DataFrame
|
|
193
|
-
if set(X.columns) != set(self.X_names_):
|
|
194
|
-
raise ValueError(
|
|
195
|
-
"Input columns for prediction do not match training columns."
|
|
33
|
+
raise TypeError(
|
|
34
|
+
f"Unsupported column type for column '{col_name}': {col.dtype}"
|
|
35
|
+
)
|
|
36
|
+
return cpp_df
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
def _prepare_input_data(
|
|
40
|
+
X: Union[pd.DataFrame, FloatMatrix], preprocess: bool
|
|
41
|
+
) -> Union[aplr_cpp.CppDataFrame, FloatMatrix]:
|
|
42
|
+
"""
|
|
43
|
+
Prepares the input data for the C++ backend.
|
|
44
|
+
|
|
45
|
+
If X is a pandas DataFrame, it's converted. If preprocess is True, it becomes
|
|
46
|
+
a CppDataFrame. If preprocess is False, it's converted to a NumPy array.
|
|
47
|
+
NumPy arrays are passed through as is.
|
|
48
|
+
"""
|
|
49
|
+
if isinstance(X, pd.DataFrame):
|
|
50
|
+
if preprocess:
|
|
51
|
+
return _dataframe_to_cpp_dataframe(X)
|
|
52
|
+
else:
|
|
53
|
+
# Check if all columns are numeric before converting
|
|
54
|
+
if not all(pd.api.types.is_numeric_dtype(X[col]) for col in X.columns):
|
|
55
|
+
raise RuntimeError(
|
|
56
|
+
"Cannot convert DataFrame to matrix if it contains non-numeric columns. "
|
|
57
|
+
"Please ensure all columns are numeric or set preprocess=True."
|
|
196
58
|
)
|
|
197
|
-
|
|
198
|
-
|
|
199
|
-
return X
|
|
59
|
+
return X.to_numpy(dtype=np.float64)
|
|
60
|
+
return X
|
|
200
61
|
|
|
201
|
-
def _to_dataframe(
|
|
202
|
-
self, X: Union[pd.DataFrame, FloatMatrix]
|
|
203
|
-
) -> Tuple[pd.DataFrame, bool]:
|
|
204
|
-
"""Converts input to a pandas DataFrame if it is not already one."""
|
|
205
|
-
if isinstance(X, pd.DataFrame):
|
|
206
|
-
return X, False # Was already a DataFrame
|
|
207
62
|
|
|
208
|
-
|
|
209
|
-
try:
|
|
210
|
-
# If X is already a numpy array, astype with copy=False is more efficient.
|
|
211
|
-
# It will only copy if the dtype is different from np.float64.
|
|
212
|
-
if isinstance(X, np.ndarray):
|
|
213
|
-
X_numeric = X.astype(np.float64, copy=False)
|
|
214
|
-
else:
|
|
215
|
-
# For other array-likes (e.g., list of lists), create the array.
|
|
216
|
-
X_numeric = np.array(X, dtype=np.float64)
|
|
217
|
-
except (ValueError, TypeError) as e:
|
|
218
|
-
raise TypeError("Input X must be numeric if not a pandas DataFrame.") from e
|
|
219
|
-
return pd.DataFrame(X_numeric, copy=False), True # Was converted
|
|
220
|
-
|
|
221
|
-
def __setstate__(self, state):
|
|
222
|
-
"""Handles unpickling for backward compatibility."""
|
|
223
|
-
self.__dict__.update(state)
|
|
224
|
-
|
|
225
|
-
# For backward compatibility, initialize new attributes if they don't exist,
|
|
226
|
-
# indicating the model was trained before these features were introduced.
|
|
227
|
-
new_attributes = {
|
|
228
|
-
"X_names_": [],
|
|
229
|
-
"categorical_features_": [],
|
|
230
|
-
"ohe_columns_": [],
|
|
231
|
-
"na_imputed_cols_": [],
|
|
232
|
-
"median_values_": {},
|
|
233
|
-
"final_training_columns_": [],
|
|
234
|
-
}
|
|
235
|
-
for attr, default_value in new_attributes.items():
|
|
236
|
-
if not hasattr(self, attr):
|
|
237
|
-
setattr(self, attr, default_value)
|
|
238
|
-
|
|
239
|
-
def _validate_X_fit_rows(self, X):
|
|
240
|
-
"""Checks if X has enough rows to be fitted."""
|
|
241
|
-
if (isinstance(X, np.ndarray) and X.shape[0] < 2) or (
|
|
242
|
-
isinstance(X, pd.DataFrame) and len(X) < 2
|
|
243
|
-
):
|
|
244
|
-
raise ValueError("Input X must have at least 2 rows to be fitted.")
|
|
245
|
-
|
|
246
|
-
|
|
247
|
-
class APLRRegressor(BaseAPLR):
|
|
63
|
+
class APLRRegressor:
|
|
248
64
|
def __init__(
|
|
249
65
|
self,
|
|
250
66
|
m: int = 3000,
|
|
@@ -312,6 +128,7 @@ class APLRRegressor(BaseAPLR):
|
|
|
312
128
|
ridge_penalty: float = 0.0001,
|
|
313
129
|
mean_bias_correction: bool = False,
|
|
314
130
|
faster_convergence: bool = False,
|
|
131
|
+
preprocess: bool = True,
|
|
315
132
|
):
|
|
316
133
|
self.m = m
|
|
317
134
|
self.v = v
|
|
@@ -361,14 +178,7 @@ class APLRRegressor(BaseAPLR):
|
|
|
361
178
|
self.ridge_penalty = ridge_penalty
|
|
362
179
|
self.mean_bias_correction = mean_bias_correction
|
|
363
180
|
self.faster_convergence = faster_convergence
|
|
364
|
-
|
|
365
|
-
# Data transformations
|
|
366
|
-
self.median_values_ = {}
|
|
367
|
-
self.categorical_features_ = []
|
|
368
|
-
self.ohe_columns_ = []
|
|
369
|
-
self.na_imputed_cols_ = []
|
|
370
|
-
self.X_names_ = []
|
|
371
|
-
self.final_training_columns_ = []
|
|
181
|
+
self.preprocess = preprocess
|
|
372
182
|
|
|
373
183
|
# Creating aplr_cpp and setting parameters
|
|
374
184
|
self.APLRRegressor = aplr_cpp.APLRRegressor()
|
|
@@ -432,6 +242,7 @@ class APLRRegressor(BaseAPLR):
|
|
|
432
242
|
self.APLRRegressor.ridge_penalty = self.ridge_penalty
|
|
433
243
|
self.APLRRegressor.mean_bias_correction = self.mean_bias_correction
|
|
434
244
|
self.APLRRegressor.faster_convergence = self.faster_convergence
|
|
245
|
+
self.APLRRegressor.preprocess = self.preprocess
|
|
435
246
|
|
|
436
247
|
def fit(
|
|
437
248
|
self,
|
|
@@ -450,16 +261,14 @@ class APLRRegressor(BaseAPLR):
|
|
|
450
261
|
predictor_penalties_for_interactions: List[float] = [],
|
|
451
262
|
predictor_min_observations_in_split: List[int] = [],
|
|
452
263
|
):
|
|
453
|
-
self._validate_X_fit_rows(X)
|
|
454
264
|
self.__set_params_cpp()
|
|
455
|
-
|
|
456
|
-
|
|
457
|
-
)
|
|
265
|
+
X = _prepare_input_data(X, self.preprocess)
|
|
266
|
+
|
|
458
267
|
self.APLRRegressor.fit(
|
|
459
|
-
|
|
268
|
+
X,
|
|
460
269
|
y,
|
|
461
270
|
sample_weight,
|
|
462
|
-
|
|
271
|
+
X_names,
|
|
463
272
|
cv_observations,
|
|
464
273
|
prioritized_predictors_indexes,
|
|
465
274
|
monotonic_constraints,
|
|
@@ -477,14 +286,13 @@ class APLRRegressor(BaseAPLR):
|
|
|
477
286
|
X: Union[pd.DataFrame, FloatMatrix],
|
|
478
287
|
cap_predictions_to_minmax_in_training: bool = True,
|
|
479
288
|
) -> FloatVector:
|
|
289
|
+
X = _prepare_input_data(X, self.preprocess)
|
|
290
|
+
|
|
480
291
|
if self.link_function == "custom_function":
|
|
481
292
|
self.APLRRegressor.calculate_custom_transform_linear_predictor_to_predictions_function = (
|
|
482
293
|
self.calculate_custom_transform_linear_predictor_to_predictions_function
|
|
483
294
|
)
|
|
484
|
-
|
|
485
|
-
return self.APLRRegressor.predict(
|
|
486
|
-
X_transformed, cap_predictions_to_minmax_in_training
|
|
487
|
-
)
|
|
295
|
+
return self.APLRRegressor.predict(X, cap_predictions_to_minmax_in_training)
|
|
488
296
|
|
|
489
297
|
def set_term_names(self, X_names: List[str]):
|
|
490
298
|
self.APLRRegressor.set_term_names(X_names)
|
|
@@ -494,44 +302,40 @@ class APLRRegressor(BaseAPLR):
|
|
|
494
302
|
X: Union[pd.DataFrame, FloatMatrix],
|
|
495
303
|
sample_weight: FloatVector = np.empty(0),
|
|
496
304
|
) -> FloatVector:
|
|
497
|
-
|
|
498
|
-
return self.APLRRegressor.calculate_feature_importance(
|
|
499
|
-
X_transformed, sample_weight
|
|
500
|
-
)
|
|
305
|
+
X = _prepare_input_data(X, self.preprocess)
|
|
306
|
+
return self.APLRRegressor.calculate_feature_importance(X, sample_weight)
|
|
501
307
|
|
|
502
308
|
def calculate_term_importance(
|
|
503
309
|
self,
|
|
504
310
|
X: Union[pd.DataFrame, FloatMatrix],
|
|
505
311
|
sample_weight: FloatVector = np.empty(0),
|
|
506
312
|
) -> FloatVector:
|
|
507
|
-
|
|
508
|
-
return self.APLRRegressor.calculate_term_importance(
|
|
509
|
-
X_transformed, sample_weight
|
|
510
|
-
)
|
|
313
|
+
X = _prepare_input_data(X, self.preprocess)
|
|
314
|
+
return self.APLRRegressor.calculate_term_importance(X, sample_weight)
|
|
511
315
|
|
|
512
316
|
def calculate_local_feature_contribution(
|
|
513
317
|
self, X: Union[pd.DataFrame, FloatMatrix]
|
|
514
318
|
) -> FloatMatrix:
|
|
515
|
-
|
|
516
|
-
return self.APLRRegressor.calculate_local_feature_contribution(
|
|
319
|
+
X = _prepare_input_data(X, self.preprocess)
|
|
320
|
+
return self.APLRRegressor.calculate_local_feature_contribution(X)
|
|
517
321
|
|
|
518
322
|
def calculate_local_term_contribution(
|
|
519
323
|
self, X: Union[pd.DataFrame, FloatMatrix]
|
|
520
324
|
) -> FloatMatrix:
|
|
521
|
-
|
|
522
|
-
return self.APLRRegressor.calculate_local_term_contribution(
|
|
325
|
+
X = _prepare_input_data(X, self.preprocess)
|
|
326
|
+
return self.APLRRegressor.calculate_local_term_contribution(X)
|
|
523
327
|
|
|
524
328
|
def calculate_local_contribution_from_selected_terms(
|
|
525
329
|
self, X: Union[pd.DataFrame, FloatMatrix], predictor_indexes: List[int]
|
|
526
330
|
) -> FloatVector:
|
|
527
|
-
|
|
331
|
+
X = _prepare_input_data(X, self.preprocess)
|
|
528
332
|
return self.APLRRegressor.calculate_local_contribution_from_selected_terms(
|
|
529
|
-
|
|
333
|
+
X, predictor_indexes
|
|
530
334
|
)
|
|
531
335
|
|
|
532
336
|
def calculate_terms(self, X: Union[pd.DataFrame, FloatMatrix]) -> FloatMatrix:
|
|
533
|
-
|
|
534
|
-
return self.APLRRegressor.calculate_terms(
|
|
337
|
+
X = _prepare_input_data(X, self.preprocess)
|
|
338
|
+
return self.APLRRegressor.calculate_terms(X)
|
|
535
339
|
|
|
536
340
|
def get_term_names(self) -> List[str]:
|
|
537
341
|
return self.APLRRegressor.get_term_names()
|
|
@@ -796,6 +600,7 @@ class APLRRegressor(BaseAPLR):
|
|
|
796
600
|
"ridge_penalty": self.ridge_penalty,
|
|
797
601
|
"mean_bias_correction": self.mean_bias_correction,
|
|
798
602
|
"faster_convergence": self.faster_convergence,
|
|
603
|
+
"preprocess": self.preprocess,
|
|
799
604
|
}
|
|
800
605
|
|
|
801
606
|
# For sklearn
|
|
@@ -805,8 +610,15 @@ class APLRRegressor(BaseAPLR):
|
|
|
805
610
|
self.__set_params_cpp()
|
|
806
611
|
return self
|
|
807
612
|
|
|
613
|
+
def __setstate__(self, state):
|
|
614
|
+
# For backwards compatibility with older pickled models
|
|
615
|
+
if "preprocess" not in state:
|
|
616
|
+
state["preprocess"] = False
|
|
617
|
+
self.__dict__.update(state)
|
|
618
|
+
self.__set_params_cpp()
|
|
619
|
+
|
|
808
620
|
|
|
809
|
-
class APLRClassifier
|
|
621
|
+
class APLRClassifier:
|
|
810
622
|
def __init__(
|
|
811
623
|
self,
|
|
812
624
|
m: int = 3000,
|
|
@@ -829,6 +641,7 @@ class APLRClassifier(BaseAPLR):
|
|
|
829
641
|
penalty_for_interactions: float = 0.0,
|
|
830
642
|
max_terms: int = 0,
|
|
831
643
|
ridge_penalty: float = 0.0001,
|
|
644
|
+
preprocess: bool = True,
|
|
832
645
|
):
|
|
833
646
|
self.m = m
|
|
834
647
|
self.v = v
|
|
@@ -856,14 +669,7 @@ class APLRClassifier(BaseAPLR):
|
|
|
856
669
|
self.penalty_for_interactions = penalty_for_interactions
|
|
857
670
|
self.max_terms = max_terms
|
|
858
671
|
self.ridge_penalty = ridge_penalty
|
|
859
|
-
|
|
860
|
-
# Data transformations
|
|
861
|
-
self.median_values_ = {}
|
|
862
|
-
self.categorical_features_ = []
|
|
863
|
-
self.ohe_columns_ = []
|
|
864
|
-
self.na_imputed_cols_ = []
|
|
865
|
-
self.X_names_ = []
|
|
866
|
-
self.final_training_columns_ = []
|
|
672
|
+
self.preprocess = preprocess
|
|
867
673
|
|
|
868
674
|
# Creating aplr_cpp and setting parameters
|
|
869
675
|
self.APLRClassifier = aplr_cpp.APLRClassifier()
|
|
@@ -899,6 +705,7 @@ class APLRClassifier(BaseAPLR):
|
|
|
899
705
|
self.APLRClassifier.penalty_for_interactions = self.penalty_for_interactions
|
|
900
706
|
self.APLRClassifier.max_terms = self.max_terms
|
|
901
707
|
self.APLRClassifier.ridge_penalty = self.ridge_penalty
|
|
708
|
+
self.APLRClassifier.preprocess = self.preprocess
|
|
902
709
|
|
|
903
710
|
def fit(
|
|
904
711
|
self,
|
|
@@ -915,11 +722,9 @@ class APLRClassifier(BaseAPLR):
|
|
|
915
722
|
predictor_penalties_for_interactions: List[float] = [],
|
|
916
723
|
predictor_min_observations_in_split: List[int] = [],
|
|
917
724
|
):
|
|
918
|
-
self._validate_X_fit_rows(X)
|
|
919
725
|
self.__set_params_cpp()
|
|
920
|
-
|
|
921
|
-
|
|
922
|
-
)
|
|
726
|
+
|
|
727
|
+
X = _prepare_input_data(X, self.preprocess)
|
|
923
728
|
|
|
924
729
|
if isinstance(y, np.ndarray):
|
|
925
730
|
y = y.astype(str).tolist()
|
|
@@ -927,10 +732,10 @@ class APLRClassifier(BaseAPLR):
|
|
|
927
732
|
y = [str(val) for val in y]
|
|
928
733
|
|
|
929
734
|
self.APLRClassifier.fit(
|
|
930
|
-
|
|
735
|
+
X,
|
|
931
736
|
y,
|
|
932
737
|
sample_weight,
|
|
933
|
-
|
|
738
|
+
X_names,
|
|
934
739
|
cv_observations,
|
|
935
740
|
prioritized_predictors_indexes,
|
|
936
741
|
monotonic_constraints,
|
|
@@ -948,9 +753,10 @@ class APLRClassifier(BaseAPLR):
|
|
|
948
753
|
X: Union[pd.DataFrame, FloatMatrix],
|
|
949
754
|
cap_predictions_to_minmax_in_training: bool = False,
|
|
950
755
|
) -> FloatMatrix:
|
|
951
|
-
|
|
756
|
+
X = _prepare_input_data(X, self.preprocess)
|
|
757
|
+
|
|
952
758
|
return self.APLRClassifier.predict_class_probabilities(
|
|
953
|
-
|
|
759
|
+
X, cap_predictions_to_minmax_in_training
|
|
954
760
|
)
|
|
955
761
|
|
|
956
762
|
def predict(
|
|
@@ -958,16 +764,15 @@ class APLRClassifier(BaseAPLR):
|
|
|
958
764
|
X: Union[pd.DataFrame, FloatMatrix],
|
|
959
765
|
cap_predictions_to_minmax_in_training: bool = False,
|
|
960
766
|
) -> List[str]:
|
|
961
|
-
|
|
962
|
-
|
|
963
|
-
|
|
964
|
-
)
|
|
767
|
+
X = _prepare_input_data(X, self.preprocess)
|
|
768
|
+
|
|
769
|
+
return self.APLRClassifier.predict(X, cap_predictions_to_minmax_in_training)
|
|
965
770
|
|
|
966
771
|
def calculate_local_feature_contribution(
|
|
967
772
|
self, X: Union[pd.DataFrame, FloatMatrix]
|
|
968
773
|
) -> FloatMatrix:
|
|
969
|
-
|
|
970
|
-
return self.APLRClassifier.calculate_local_feature_contribution(
|
|
774
|
+
X = _prepare_input_data(X, self.preprocess)
|
|
775
|
+
return self.APLRClassifier.calculate_local_feature_contribution(X)
|
|
971
776
|
|
|
972
777
|
def get_categories(self) -> List[str]:
|
|
973
778
|
return self.APLRClassifier.get_categories()
|
|
@@ -998,6 +803,7 @@ class APLRClassifier(BaseAPLR):
|
|
|
998
803
|
penalty_for_interactions=self.penalty_for_interactions,
|
|
999
804
|
max_terms=self.max_terms,
|
|
1000
805
|
ridge_penalty=self.ridge_penalty,
|
|
806
|
+
preprocess=self.preprocess,
|
|
1001
807
|
)
|
|
1002
808
|
|
|
1003
809
|
logit_model_py.APLRRegressor = logit_model_cpp
|
|
@@ -1048,6 +854,7 @@ class APLRClassifier(BaseAPLR):
|
|
|
1048
854
|
"penalty_for_interactions": self.penalty_for_interactions,
|
|
1049
855
|
"max_terms": self.max_terms,
|
|
1050
856
|
"ridge_penalty": self.ridge_penalty,
|
|
857
|
+
"preprocess": self.preprocess,
|
|
1051
858
|
}
|
|
1052
859
|
|
|
1053
860
|
# For sklearn
|
|
@@ -1061,6 +868,13 @@ class APLRClassifier(BaseAPLR):
|
|
|
1061
868
|
def predict_proba(self, X: FloatMatrix) -> FloatMatrix:
|
|
1062
869
|
return self.predict_class_probabilities(X)
|
|
1063
870
|
|
|
871
|
+
def __setstate__(self, state):
|
|
872
|
+
# For backwards compatibility with older pickled models
|
|
873
|
+
if "preprocess" not in state:
|
|
874
|
+
state["preprocess"] = False
|
|
875
|
+
self.__dict__.update(state)
|
|
876
|
+
self.__set_params_cpp()
|
|
877
|
+
|
|
1064
878
|
|
|
1065
879
|
class APLRTuner:
|
|
1066
880
|
def __init__(
|
|
@@ -0,0 +1,8 @@
|
|
|
1
|
+
aplr_cpp.cp311-win_amd64.pyd,sha256=ecU1pYcAJ3K_B_pcYJS06DiNSMpCCovciVQWGpbWiwA,825856
|
|
2
|
+
aplr/__init__.py,sha256=oDFSgVytP_qQ8ilun6oHxKr-DYEeqjEQp5FciX45lls,21
|
|
3
|
+
aplr/aplr.py,sha256=I_LyS_uH9wmW7wE8uP6nvPhfCeeA1cQXIuLPqwT21OE,40125
|
|
4
|
+
aplr-10.20.0.dist-info/licenses/LICENSE,sha256=YOMo-RaL4P7edMZGD96-NskKpxyMZdP3-WiiMMmihNk,1134
|
|
5
|
+
aplr-10.20.0.dist-info/METADATA,sha256=sRYG3XKunuA5qB-cm-6XBqUd4XRIgK1tVriXV3PX_5Y,1048
|
|
6
|
+
aplr-10.20.0.dist-info/WHEEL,sha256=JLOMsP7F5qtkAkINx5UnzbFguf8CqZeraV8o04b0I8I,101
|
|
7
|
+
aplr-10.20.0.dist-info/top_level.txt,sha256=DXVC0RIFGpzVnPeKWAZTXQdJheOEZL51Wip6Fx7zbR4,14
|
|
8
|
+
aplr-10.20.0.dist-info/RECORD,,
|
aplr_cpp.cp311-win_amd64.pyd
CHANGED
|
Binary file
|
aplr-10.19.2.dist-info/RECORD
DELETED
|
@@ -1,8 +0,0 @@
|
|
|
1
|
-
aplr_cpp.cp311-win_amd64.pyd,sha256=jFpeuXLruiu6gB8tB7mxrTWm3Q931259woyywenX_Is,674816
|
|
2
|
-
aplr/__init__.py,sha256=oDFSgVytP_qQ8ilun6oHxKr-DYEeqjEQp5FciX45lls,21
|
|
3
|
-
aplr/aplr.py,sha256=Pv_6dSaZ7WIbP6vzzB6-R8S0VLcKvlVJyP0RBToDZKw,48320
|
|
4
|
-
aplr-10.19.2.dist-info/licenses/LICENSE,sha256=YOMo-RaL4P7edMZGD96-NskKpxyMZdP3-WiiMMmihNk,1134
|
|
5
|
-
aplr-10.19.2.dist-info/METADATA,sha256=e1-enJ5c5XtbnxCHKe-YNFq8EZAve0uOkxuiA-rKQWY,1048
|
|
6
|
-
aplr-10.19.2.dist-info/WHEEL,sha256=JLOMsP7F5qtkAkINx5UnzbFguf8CqZeraV8o04b0I8I,101
|
|
7
|
-
aplr-10.19.2.dist-info/top_level.txt,sha256=DXVC0RIFGpzVnPeKWAZTXQdJheOEZL51Wip6Fx7zbR4,14
|
|
8
|
-
aplr-10.19.2.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|