aplr-7.0.0-cp311-cp311-win_amd64.whl → aplr-10.19.2-cp311-cp311-win_amd64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- aplr/aplr.py +841 -90
- aplr-10.19.2.dist-info/METADATA +34 -0
- aplr-10.19.2.dist-info/RECORD +8 -0
- {aplr-7.0.0.dist-info → aplr-10.19.2.dist-info}/WHEEL +1 -1
- aplr_cpp.cp311-win_amd64.pyd +0 -0
- aplr-7.0.0.dist-info/METADATA +0 -17
- aplr-7.0.0.dist-info/RECORD +0 -8
- {aplr-7.0.0.dist-info → aplr-10.19.2.dist-info/licenses}/LICENSE +0 -0
- {aplr-7.0.0.dist-info → aplr-10.19.2.dist-info}/top_level.txt +0 -0
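
The headline change in aplr/aplr.py below is a new BaseAPLR preprocessing layer shared by APLRRegressor and APLRClassifier: fit and predict now accept pandas DataFrames directly, one-hot encoding category/object columns, adding _missing indicator columns, and imputing NaNs with (optionally sample-weighted) training medians. A minimal usage sketch of the 10.19.2 API as it appears in this diff — the data, column names, and parameter values are illustrative, not taken from the package:

    import numpy as np
    import pandas as pd
    from aplr import APLRRegressor

    rng = np.random.default_rng(0)
    n = 100
    X = pd.DataFrame(
        {
            "age": rng.normal(40.0, 10.0, n),
            "city": pd.Categorical(rng.choice(["a", "b", "c"], n)),
        }
    )
    X.loc[0, "age"] = np.nan  # triggers median imputation and an age_missing indicator
    y = rng.normal(0.0, 1.0, n)

    model = APLRRegressor(m=100)
    model.fit(X, y)  # "city" is one-hot encoded behind the scenes
    predictions = model.predict(X)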
aplr/aplr.py
CHANGED

@@ -1,49 +1,317 @@
+from typing import List, Callable, Optional, Dict, Union, Tuple
 import numpy as np
-import
-from typing import List, Callable, Optional
+import pandas as pd
 import aplr_cpp
+import itertools

+FloatVector = np.ndarray
+FloatMatrix = np.ndarray
+IntVector = np.ndarray
+IntMatrix = np.ndarray

-
+
+class BaseAPLR:
+    def _preprocess_X_fit(
+        self,
+        X: Union[pd.DataFrame, FloatMatrix],
+        X_names: List[str],
+        sample_weight: FloatVector,
+    ) -> Tuple[FloatMatrix, List[str]]:
+        if sample_weight.size > 0:
+            if sample_weight.ndim != 1:
+                raise ValueError("sample_weight must be a 1D array.")
+            if len(sample_weight) != X.shape[0]:
+                raise ValueError(
+                    "sample_weight must have the same number of rows as X."
+                )
+            if np.any(np.isnan(sample_weight)) or np.any(np.isinf(sample_weight)):
+                raise ValueError("sample_weight cannot contain nan or infinite values.")
+            if np.any(sample_weight < 0):
+                raise ValueError("sample_weight cannot contain negative values.")
+
+        self._fit_preprocessor(X, X_names, sample_weight)
+
+        X = self._transform_X(X)
+
+        return X.to_numpy(dtype=np.float64), list(X.columns)
+
+    def _preprocess_X_predict(self, X: Union[pd.DataFrame, FloatMatrix]) -> FloatMatrix:
+        X = self._transform_X(X)
+        return X.to_numpy(dtype=np.float64)
+
+    def _fit_preprocessor(
+        self,
+        X: Union[pd.DataFrame, FloatMatrix],
+        X_names: List[str],
+        sample_weight: FloatVector,
+    ) -> None:
+        """Learns transformations from the training data and sets preprocessor state."""
+        X = self._convert_input_to_dataframe_for_fit(X, X_names=X_names)
+        self.X_names_ = list(X.columns)
+        self.categorical_features_ = list(
+            X.select_dtypes(include=["category", "object"]).columns
+        )
+
+        self._fit_one_hot_encoding(X)
+        self._fit_missing_indicators(X)
+
+        # Learn median values for imputation from the original data.
+        self.median_values_ = {}
+        numeric_cols_for_median = [
+            col for col in X.columns if col not in self.categorical_features_
+        ]
+        for col in numeric_cols_for_median:
+            missing_mask = X[col].isnull()
+            if sample_weight.size > 0:
+                valid_indices = ~missing_mask
+                col_data = X.loc[valid_indices, col]
+                col_weights = sample_weight[valid_indices]
+                if col_data.empty:
+                    median_val = 0
+                else:
+                    col_data_np = col_data.to_numpy()
+                    sort_indices = np.argsort(col_data_np, kind="stable")
+                    sorted_data = col_data_np[sort_indices]
+                    sorted_weights = col_weights[sort_indices]
+                    cumulative_weights = np.cumsum(sorted_weights)
+                    total_weight = cumulative_weights[-1]
+                    median_weight_index = np.searchsorted(
+                        cumulative_weights, total_weight / 2.0
+                    )
+                    if median_weight_index >= len(sorted_data):
+                        median_weight_index = len(sorted_data) - 1
+                    median_val = sorted_data[median_weight_index]
+            else:
+                if X[col].isnull().all():
+                    median_val = 0
+                else:
+                    median_val = X[col].median()
+
+            if pd.isna(median_val):
+                median_val = 0
+            self.median_values_[col] = median_val
+
+        # Determine the final column names after all transformations.
+        final_cols = []
+        if self.ohe_columns_:
+            final_cols.extend(self.ohe_columns_)
+        else:
+            final_cols.extend(self.X_names_)
+        final_cols.extend([col + "_missing" for col in self.na_imputed_cols_])
+        self.final_training_columns_ = final_cols
+
+    def _fit_one_hot_encoding(self, X: pd.DataFrame) -> None:
+        """Learns the complete set of columns that will exist after one-hot encoding."""
+        if not self.categorical_features_:
+            return
+        self.ohe_columns_ = list(
+            pd.get_dummies(
+                X, columns=self.categorical_features_, dummy_na=False
+            ).columns
+        )
+
+    def _fit_missing_indicators(self, X: pd.DataFrame) -> None:
+        """Learns which columns will have missing indicators added."""
+        self.na_imputed_cols_ = [col for col in X.columns if X[col].isnull().any()]
+
+    def _transform_X(self, X: Union[pd.DataFrame, FloatMatrix]) -> pd.DataFrame:
+        """Transforms data using the fitted preprocessor attributes."""
+        X = self._convert_input_to_dataframe_for_transform(X)
+        X = self._transform_one_hot_encoding(X)
+
+        # Just-in-time copy to avoid modifying user's original data.
+        # A copy is needed if we are about to perform in-place modifications
+        # (adding missing indicators or filling NaNs) and a copy hasn't already
+        # been made by one-hot encoding.
+        if not self.categorical_features_ and X.isnull().to_numpy().any():
+            X = X.copy()
+
+        X = self._transform_missing_indicators(X)
+
+        for col, val in self.median_values_.items():
+            if col in X.columns:
+                X[col] = X[col].fillna(val)
+
+        # Enforce final column order and add missing columns if necessary
+        if self.final_training_columns_:
+            missing_final_cols = set(self.final_training_columns_) - set(X.columns)
+            for c in missing_final_cols:
+                X[c] = 0
+            if not X.columns.equals(pd.Index(self.final_training_columns_)):
+                X = X.reindex(columns=self.final_training_columns_, copy=False)
+
+        return X
+
+    def _transform_one_hot_encoding(self, X: pd.DataFrame) -> pd.DataFrame:
+        """Applies one-hot encoding using learned OHE columns during transformation."""
+        if not self.categorical_features_:
+            return X
+
+        X = pd.get_dummies(X, columns=self.categorical_features_, dummy_na=False)
+        # Handle missing OHE columns (categories not seen in new data)
+        missing_cols = set(self.ohe_columns_) - set(X.columns)
+        for c in missing_cols:
+            X[c] = 0
+        # Ensure column order
+        if not X.columns.equals(pd.Index(self.ohe_columns_)):
+            X = X.reindex(columns=self.ohe_columns_, copy=False)
+        return X
+
+    def _transform_missing_indicators(self, X: pd.DataFrame) -> pd.DataFrame:
+        """Adds _missing indicator columns for features with NaNs during transformation."""
+        if not self.na_imputed_cols_:
+            return X
+        # Only add indicators for columns that were imputed during fit and are currently missing data.
+        for col in self.na_imputed_cols_:
+            if col in X.columns and X[col].isnull().any():
+                X[col + "_missing"] = X[col].isnull().astype(int)
+        return X
+
+    def _convert_input_to_dataframe_for_fit(
+        self,
+        X: Union[pd.DataFrame, FloatMatrix],
+        X_names: Optional[List[str]] = None,
+    ) -> pd.DataFrame:
+        """Converts input X to a pandas DataFrame for fitting, handling column names."""
+        X, was_converted = self._to_dataframe(X)
+        if was_converted:
+            if X_names:
+                X.columns = list(X_names)
+            else:
+                X.columns = [f"X{i}" for i in range(X.shape[1])]
+        return X
+
+    def _convert_input_to_dataframe_for_transform(
+        self, X: Union[pd.DataFrame, FloatMatrix]
+    ) -> pd.DataFrame:
+        """Converts input X to a pandas DataFrame for transformation, aligning columns."""
+        X, was_converted = self._to_dataframe(X)
+        if was_converted:
+            if self.X_names_ and len(self.X_names_) == X.shape[1]:
+                X.columns = self.X_names_  # Use names learned during fit
+        else:  # If X was already a DataFrame
+            if set(X.columns) != set(self.X_names_):
+                raise ValueError(
+                    "Input columns for prediction do not match training columns."
+                )
+            if not X.columns.equals(pd.Index(self.X_names_)):
+                X = X.reindex(columns=self.X_names_, copy=False)
+        return X
+
+    def _to_dataframe(
+        self, X: Union[pd.DataFrame, FloatMatrix]
+    ) -> Tuple[pd.DataFrame, bool]:
+        """Converts input to a pandas DataFrame if it is not already one."""
+        if isinstance(X, pd.DataFrame):
+            return X, False  # Was already a DataFrame
+
+        X_numeric: np.ndarray
+        try:
+            # If X is already a numpy array, astype with copy=False is more efficient.
+            # It will only copy if the dtype is different from np.float64.
+            if isinstance(X, np.ndarray):
+                X_numeric = X.astype(np.float64, copy=False)
+            else:
+                # For other array-likes (e.g., list of lists), create the array.
+                X_numeric = np.array(X, dtype=np.float64)
+        except (ValueError, TypeError) as e:
+            raise TypeError("Input X must be numeric if not a pandas DataFrame.") from e
+        return pd.DataFrame(X_numeric, copy=False), True  # Was converted
+
+    def __setstate__(self, state):
+        """Handles unpickling for backward compatibility."""
+        self.__dict__.update(state)
+
+        # For backward compatibility, initialize new attributes if they don't exist,
+        # indicating the model was trained before these features were introduced.
+        new_attributes = {
+            "X_names_": [],
+            "categorical_features_": [],
+            "ohe_columns_": [],
+            "na_imputed_cols_": [],
+            "median_values_": {},
+            "final_training_columns_": [],
+        }
+        for attr, default_value in new_attributes.items():
+            if not hasattr(self, attr):
+                setattr(self, attr, default_value)
+
+    def _validate_X_fit_rows(self, X):
+        """Checks if X has enough rows to be fitted."""
+        if (isinstance(X, np.ndarray) and X.shape[0] < 2) or (
+            isinstance(X, pd.DataFrame) and len(X) < 2
+        ):
+            raise ValueError("Input X must have at least 2 rows to be fitted.")
+
+
+class APLRRegressor(BaseAPLR):
     def __init__(
         self,
-        m: int =
-        v: float = 0.
+        m: int = 3000,
+        v: float = 0.5,
         random_state: int = 0,
         loss_function: str = "mse",
         link_function: str = "identity",
         n_jobs: int = 0,
-
+        cv_folds: int = 5,
         bins: int = 300,
         max_interaction_level: int = 1,
         max_interactions: int = 100000,
-        min_observations_in_split: int =
-        ineligible_boosting_steps_added: int =
-        max_eligible_terms: int =
+        min_observations_in_split: int = 4,
+        ineligible_boosting_steps_added: int = 15,
+        max_eligible_terms: int = 7,
         verbosity: int = 0,
         dispersion_parameter: float = 1.5,
         validation_tuning_metric: str = "default",
        quantile: float = 0.5,
         calculate_custom_validation_error_function: Optional[
             Callable[
-                [
+                [
+                    FloatVector,
+                    FloatVector,
+                    FloatVector,
+                    FloatVector,
+                    FloatMatrix,
+                ],
+                float,
             ]
         ] = None,
         calculate_custom_loss_function: Optional[
             Callable[
-                [
+                [
+                    FloatVector,
+                    FloatVector,
+                    FloatVector,
+                    FloatVector,
+                    FloatMatrix,
+                ],
+                float,
             ]
         ] = None,
         calculate_custom_negative_gradient_function: Optional[
-            Callable[
+            Callable[
+                [FloatVector, FloatVector, FloatVector, FloatMatrix],
+                FloatVector,
+            ]
         ] = None,
         calculate_custom_transform_linear_predictor_to_predictions_function: Optional[
-            Callable[[
+            Callable[[FloatVector], FloatVector]
        ] = None,
         calculate_custom_differentiate_predictions_wrt_linear_predictor_function: Optional[
-            Callable[[
+            Callable[[FloatVector], FloatVector]
         ] = None,
-
+        boosting_steps_before_interactions_are_allowed: int = 0,
+        monotonic_constraints_ignore_interactions: bool = False,
+        group_mse_by_prediction_bins: int = 10,
+        group_mse_cycle_min_obs_in_bin: int = 30,
+        early_stopping_rounds: int = 200,
+        num_first_steps_with_linear_effects_only: int = 0,
+        penalty_for_non_linearity: float = 0.0,
+        penalty_for_interactions: float = 0.0,
+        max_terms: int = 0,
+        ridge_penalty: float = 0.0001,
+        mean_bias_correction: bool = False,
+        faster_convergence: bool = False,
     ):
         self.m = m
         self.v = v
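
The median imputation learned in _fit_preprocessor above supports sample weights by sorting the non-missing values, accumulating the weights, and taking the first value whose cumulative weight reaches half of the total. A standalone sketch of that weighted-median logic on hypothetical data:

    import numpy as np

    values = np.array([3.0, 1.0, 2.0])
    weights = np.array([1.0, 1.0, 10.0])  # most of the weight sits on the value 2.0

    order = np.argsort(values, kind="stable")
    sorted_values = values[order]
    cumulative = np.cumsum(weights[order])
    # First index where the cumulative weight reaches half of the total weight,
    # clamped to the last element as in _fit_preprocessor.
    index = min(np.searchsorted(cumulative, cumulative[-1] / 2.0), len(sorted_values) - 1)
    print(sorted_values[index])  # 2.0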
@@ -51,7 +319,7 @@ class APLRRegressor:
         self.loss_function = loss_function
         self.link_function = link_function
         self.n_jobs = n_jobs
-        self.
+        self.cv_folds = cv_folds
         self.bins = bins
         self.max_interaction_level = max_interaction_level
         self.max_interactions = max_interactions
@@ -75,9 +343,32 @@ class APLRRegressor:
         self.calculate_custom_differentiate_predictions_wrt_linear_predictor_function = (
             calculate_custom_differentiate_predictions_wrt_linear_predictor_function
         )
-        self.
-
+        self.boosting_steps_before_interactions_are_allowed = (
+            boosting_steps_before_interactions_are_allowed
+        )
+        self.monotonic_constraints_ignore_interactions = (
+            monotonic_constraints_ignore_interactions
         )
+        self.group_mse_by_prediction_bins = group_mse_by_prediction_bins
+        self.group_mse_cycle_min_obs_in_bin = group_mse_cycle_min_obs_in_bin
+        self.early_stopping_rounds = early_stopping_rounds
+        self.num_first_steps_with_linear_effects_only = (
+            num_first_steps_with_linear_effects_only
+        )
+        self.penalty_for_non_linearity = penalty_for_non_linearity
+        self.penalty_for_interactions = penalty_for_interactions
+        self.max_terms = max_terms
+        self.ridge_penalty = ridge_penalty
+        self.mean_bias_correction = mean_bias_correction
+        self.faster_convergence = faster_convergence
+
+        # Data transformations
+        self.median_values_ = {}
+        self.categorical_features_ = []
+        self.ohe_columns_ = []
+        self.na_imputed_cols_ = []
+        self.X_names_ = []
+        self.final_training_columns_ = []

         # Creating aplr_cpp and setting parameters
         self.APLRRegressor = aplr_cpp.APLRRegressor()
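
The constructor above grows a batch of new training controls: early stopping, a linear-effects-only warmup, non-linearity and interaction penalties, a term-count cap, ridge regularization, and optional bias-correction and convergence tweaks. A construction sketch using only parameters visible in this hunk; the overridden values are illustrative, not recommendations:

    from aplr import APLRRegressor

    model = APLRRegressor(
        m=3000,  # default per this diff
        v=0.5,  # default per this diff
        cv_folds=5,
        early_stopping_rounds=200,
        num_first_steps_with_linear_effects_only=50,  # illustrative override of the 0 default
        penalty_for_non_linearity=0.1,  # illustrative override of the 0.0 default
        penalty_for_interactions=0.1,  # illustrative override of the 0.0 default
        max_terms=200,  # illustrative override of the 0 default
        ridge_penalty=0.0001,
        mean_bias_correction=True,  # illustrative override of the False default
        faster_convergence=True,  # illustrative override of the False default
    )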
@@ -91,7 +382,7 @@ class APLRRegressor:
         self.APLRRegressor.loss_function = self.loss_function
         self.APLRRegressor.link_function = self.link_function
         self.APLRRegressor.n_jobs = self.n_jobs
-        self.APLRRegressor.
+        self.APLRRegressor.cv_folds = self.cv_folds
         self.APLRRegressor.bins = self.bins
         self.APLRRegressor.max_interaction_level = self.max_interaction_level
         self.APLRRegressor.max_interactions = self.max_interactions
@@ -119,87 +410,354 @@ class APLRRegressor:
         self.APLRRegressor.calculate_custom_differentiate_predictions_wrt_linear_predictor_function = (
             self.calculate_custom_differentiate_predictions_wrt_linear_predictor_function
         )
-        self.APLRRegressor.
-        self.
+        self.APLRRegressor.boosting_steps_before_interactions_are_allowed = (
+            self.boosting_steps_before_interactions_are_allowed
+        )
+        self.APLRRegressor.monotonic_constraints_ignore_interactions = (
+            self.monotonic_constraints_ignore_interactions
+        )
+        self.APLRRegressor.group_mse_by_prediction_bins = (
+            self.group_mse_by_prediction_bins
+        )
+        self.APLRRegressor.group_mse_cycle_min_obs_in_bin = (
+            self.group_mse_cycle_min_obs_in_bin
+        )
+        self.APLRRegressor.early_stopping_rounds = self.early_stopping_rounds
+        self.APLRRegressor.num_first_steps_with_linear_effects_only = (
+            self.num_first_steps_with_linear_effects_only
         )
+        self.APLRRegressor.penalty_for_non_linearity = self.penalty_for_non_linearity
+        self.APLRRegressor.penalty_for_interactions = self.penalty_for_interactions
+        self.APLRRegressor.max_terms = self.max_terms
+        self.APLRRegressor.ridge_penalty = self.ridge_penalty
+        self.APLRRegressor.mean_bias_correction = self.mean_bias_correction
+        self.APLRRegressor.faster_convergence = self.faster_convergence

     def fit(
         self,
-        X:
-        y:
-        sample_weight:
+        X: Union[pd.DataFrame, FloatMatrix],
+        y: FloatVector,
+        sample_weight: FloatVector = np.empty(0),
         X_names: List[str] = [],
-
+        cv_observations: IntMatrix = np.empty([0, 0]),
         prioritized_predictors_indexes: List[int] = [],
         monotonic_constraints: List[int] = [],
-        group:
+        group: FloatVector = np.empty(0),
         interaction_constraints: List[List[int]] = [],
+        other_data: FloatMatrix = np.empty([0, 0]),
+        predictor_learning_rates: List[float] = [],
+        predictor_penalties_for_non_linearity: List[float] = [],
+        predictor_penalties_for_interactions: List[float] = [],
+        predictor_min_observations_in_split: List[int] = [],
     ):
+        self._validate_X_fit_rows(X)
         self.__set_params_cpp()
+        X_transformed, X_names_transformed = self._preprocess_X_fit(
+            X, X_names, sample_weight
+        )
         self.APLRRegressor.fit(
-
+            X_transformed,
             y,
             sample_weight,
-
-
+            X_names_transformed,
+            cv_observations,
             prioritized_predictors_indexes,
             monotonic_constraints,
             group,
             interaction_constraints,
+            other_data,
+            predictor_learning_rates,
+            predictor_penalties_for_non_linearity,
+            predictor_penalties_for_interactions,
+            predictor_min_observations_in_split,
         )

     def predict(
-        self,
-
+        self,
+        X: Union[pd.DataFrame, FloatMatrix],
+        cap_predictions_to_minmax_in_training: bool = True,
+    ) -> FloatVector:
         if self.link_function == "custom_function":
             self.APLRRegressor.calculate_custom_transform_linear_predictor_to_predictions_function = (
                 self.calculate_custom_transform_linear_predictor_to_predictions_function
             )
-
+        X_transformed = self._preprocess_X_predict(X)
+        return self.APLRRegressor.predict(
+            X_transformed, cap_predictions_to_minmax_in_training
+        )

     def set_term_names(self, X_names: List[str]):
         self.APLRRegressor.set_term_names(X_names)

-    def
-
+    def calculate_feature_importance(
+        self,
+        X: Union[pd.DataFrame, FloatMatrix],
+        sample_weight: FloatVector = np.empty(0),
+    ) -> FloatVector:
+        X_transformed = self._preprocess_X_predict(X)
+        return self.APLRRegressor.calculate_feature_importance(
+            X_transformed, sample_weight
+        )

-    def
-        self,
-
-
+    def calculate_term_importance(
+        self,
+        X: Union[pd.DataFrame, FloatMatrix],
+        sample_weight: FloatVector = np.empty(0),
+    ) -> FloatVector:
+        X_transformed = self._preprocess_X_predict(X)
+        return self.APLRRegressor.calculate_term_importance(
+            X_transformed, sample_weight
+        )

-    def
-
+    def calculate_local_feature_contribution(
+        self, X: Union[pd.DataFrame, FloatMatrix]
+    ) -> FloatMatrix:
+        X_transformed = self._preprocess_X_predict(X)
+        return self.APLRRegressor.calculate_local_feature_contribution(X_transformed)
+
+    def calculate_local_term_contribution(
+        self, X: Union[pd.DataFrame, FloatMatrix]
+    ) -> FloatMatrix:
+        X_transformed = self._preprocess_X_predict(X)
+        return self.APLRRegressor.calculate_local_term_contribution(X_transformed)
+
+    def calculate_local_contribution_from_selected_terms(
+        self, X: Union[pd.DataFrame, FloatMatrix], predictor_indexes: List[int]
+    ) -> FloatVector:
+        X_transformed = self._preprocess_X_predict(X)
+        return self.APLRRegressor.calculate_local_contribution_from_selected_terms(
+            X_transformed, predictor_indexes
+        )
+
+    def calculate_terms(self, X: Union[pd.DataFrame, FloatMatrix]) -> FloatMatrix:
+        X_transformed = self._preprocess_X_predict(X)
+        return self.APLRRegressor.calculate_terms(X_transformed)

     def get_term_names(self) -> List[str]:
         return self.APLRRegressor.get_term_names()

-    def
-        return self.APLRRegressor.
+    def get_term_affiliations(self) -> List[str]:
+        return self.APLRRegressor.get_term_affiliations()

-    def
-        return self.APLRRegressor.
+    def get_unique_term_affiliations(self) -> List[str]:
+        return self.APLRRegressor.get_unique_term_affiliations()

-    def
+    def get_base_predictors_in_each_unique_term_affiliation(self) -> List[List[int]]:
+        return self.APLRRegressor.get_base_predictors_in_each_unique_term_affiliation()
+
+    def get_term_coefficients(self) -> FloatVector:
+        return self.APLRRegressor.get_term_coefficients()
+
+    def get_validation_error_steps(self) -> FloatMatrix:
         return self.APLRRegressor.get_validation_error_steps()

-    def get_feature_importance(self) ->
+    def get_feature_importance(self) -> FloatVector:
         return self.APLRRegressor.get_feature_importance()

+    def get_term_importance(self) -> FloatVector:
+        return self.APLRRegressor.get_term_importance()
+
+    def get_term_main_predictor_indexes(self) -> IntVector:
+        return self.APLRRegressor.get_term_main_predictor_indexes()
+
+    def get_term_interaction_levels(self) -> IntVector:
+        return self.APLRRegressor.get_term_interaction_levels()
+
     def get_intercept(self) -> float:
         return self.APLRRegressor.get_intercept()

-    def get_intercept_steps(self) -> npt.ArrayLike:
-        return self.APLRRegressor.get_intercept_steps()
-
     def get_optimal_m(self) -> int:
         return self.APLRRegressor.get_optimal_m()

     def get_validation_tuning_metric(self) -> str:
         return self.APLRRegressor.get_validation_tuning_metric()

-    def
-        return self.APLRRegressor.
+    def get_main_effect_shape(self, predictor_index: int) -> Dict[float, float]:
+        return self.APLRRegressor.get_main_effect_shape(predictor_index)
+
+    def get_unique_term_affiliation_shape(
+        self,
+        unique_term_affiliation: str,
+        max_rows_before_sampling: int = 500000,
+        additional_points: int = 250,
+    ) -> FloatMatrix:
+        return self.APLRRegressor.get_unique_term_affiliation_shape(
+            unique_term_affiliation, max_rows_before_sampling, additional_points
+        )
+
+    def get_cv_error(self) -> float:
+        return self.APLRRegressor.get_cv_error()
+
+    def get_num_cv_folds(self) -> int:
+        """
+        Gets the number of cross-validation folds used during training.
+
+        :return: The number of folds.
+        """
+        return self.APLRRegressor.get_num_cv_folds()
+
+    def get_cv_validation_predictions(self, fold_index: int) -> FloatVector:
+        """
+        Gets the validation predictions for a specific cross-validation fold.
+
+        Note that these predictions may be conservative, as the final model is an ensemble of the models
+        from all cross-validation folds, which has a variance-reducing effect similar to bagging.
+
+        :param fold_index: The index of the fold.
+        :return: A numpy array containing the validation predictions.
+        """
+        return self.APLRRegressor.get_cv_validation_predictions(fold_index)
+
+    def get_cv_y(self, fold_index: int) -> FloatVector:
+        """
+        Gets the validation response values (y) for a specific cross-validation fold.
+
+        :param fold_index: The index of the fold.
+        :return: A numpy array containing the validation response values.
+        """
+        return self.APLRRegressor.get_cv_y(fold_index)
+
+    def get_cv_sample_weight(self, fold_index: int) -> FloatVector:
+        """
+        Gets the validation sample weights for a specific cross-validation fold.
+
+        :param fold_index: The index of the fold.
+        :return: A numpy array containing the validation sample weights.
+        """
+        return self.APLRRegressor.get_cv_sample_weight(fold_index)
+
+    def get_cv_validation_indexes(self, fold_index: int) -> IntVector:
+        """
+        Gets the original indexes of the validation observations for a specific cross-validation fold.
+
+        :param fold_index: The index of the fold.
+        :return: A numpy array containing the original indexes.
+        """
+        return self.APLRRegressor.get_cv_validation_indexes(fold_index)
+
+    def set_intercept(self, value: float):
+        self.APLRRegressor.set_intercept(value)
+
+    def plot_affiliation_shape(
+        self,
+        affiliation: str,
+        plot: bool = True,
+        save: bool = False,
+        path: str = "",
+    ):
+        """
+        Plots or saves the shape of a given unique term affiliation.
+
+        For main effects, it produces a line plot. For two-way interactions, it produces a heatmap.
+        Plotting for higher-order interactions is not supported.
+
+        :param affiliation: A string specifying which unique_term_affiliation to use.
+        :param plot: If True, displays the plot.
+        :param save: If True, saves the plot to a file.
+        :param path: The file path to save the plot. If empty and save is True, a default path will be used.
+        """
+        try:
+            import matplotlib.pyplot as plt
+        except ImportError:
+            raise ImportError("matplotlib is required for plotting. Please install it.")
+
+        all_affiliations = self.get_unique_term_affiliations()
+        if affiliation not in all_affiliations:
+            raise ValueError(
+                f"Affiliation '{affiliation}' not found in model. "
+                f"Available affiliations are: {all_affiliations}"
+            )
+
+        affiliation_index = all_affiliations.index(affiliation)
+
+        predictors_in_each_affiliation = (
+            self.get_base_predictors_in_each_unique_term_affiliation()
+        )
+        predictor_indexes_used = predictors_in_each_affiliation[affiliation_index]
+
+        shape = self.get_unique_term_affiliation_shape(affiliation)
+        if shape.shape[0] == 0:
+            print(f"No shape data available for affiliation '{affiliation}'.")
+            return
+
+        predictor_names = affiliation.split(" & ")
+
+        is_main_effect: bool = len(predictor_indexes_used) == 1
+        is_two_way_interaction: bool = len(predictor_indexes_used) == 2
+
+        if is_main_effect:
+            fig = plt.figure()
+            # Sort by predictor value for a clean line plot
+            sorted_indices = np.argsort(shape[:, 0])
+            plt.plot(shape[sorted_indices, 0], shape[sorted_indices, 1])
+            plt.xlabel(predictor_names[0])
+            plt.ylabel("Contribution to linear predictor")
+            plt.title(f"Main effect of {predictor_names[0]}")
+            plt.grid(True)
+        elif is_two_way_interaction:
+            fig = plt.figure(figsize=(8, 6))
+
+            # Get unique coordinates and their inverse mapping
+            y_unique, y_inv = np.unique(shape[:, 0], return_inverse=True)
+            x_unique, x_inv = np.unique(shape[:, 1], return_inverse=True)
+
+            # Create grid for sums and counts
+            grid_sums = np.zeros((len(y_unique), len(x_unique)))
+            grid_counts = np.zeros((len(y_unique), len(x_unique)))
+
+            # Populate sums and counts to later calculate the mean
+            np.add.at(grid_sums, (y_inv, x_inv), shape[:, 2])
+            np.add.at(grid_counts, (y_inv, x_inv), 1)
+
+            # Calculate mean, avoiding division by zero
+            with np.errstate(divide="ignore", invalid="ignore"):
+                pivot_table_values = np.true_divide(grid_sums, grid_counts)
+            # Where there's no data, pivot_table_values will be nan, which is fine for imshow.
+
+            plt.imshow(
+                pivot_table_values,
+                aspect="auto",
+                origin="lower",
+                extent=[
+                    x_unique.min(),
+                    x_unique.max(),
+                    y_unique.min(),
+                    y_unique.max(),
+                ],
+                cmap="Blues_r",
+            )
+            plt.colorbar(label="Contribution to the linear predictor")
+            plt.xlabel(predictor_names[1])
+            plt.ylabel(predictor_names[0])
+            plt.title(
+                f"Interaction between {predictor_names[0]} and {predictor_names[1]}"
+            )
+        else:
+            print(
+                f"Plotting for interaction level > 2 is not supported. Affiliation: {affiliation}"
+            )
+            return
+
+        if save:
+            save_path = path or f"shape_of_{affiliation.replace(' & ', '_')}.png"
+            plt.savefig(save_path)
+
+        if plot:
+            plt.show()
+
+        plt.close(fig)
+
+    def remove_provided_custom_functions(self):
+        self.APLRRegressor.remove_provided_custom_functions()
+        self.calculate_custom_validation_error_function = None
+        self.calculate_custom_loss_function = None
+        self.calculate_custom_negative_gradient_function = None
+
+    def clear_cv_results(self):
+        """
+        Clears the stored cross-validation results (predictions, y, etc.) to free up memory.
+        """
+        self.APLRRegressor.clear_cv_results()

     # For sklearn
     def get_params(self, deep=True):
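
The two-way-interaction branch of plot_affiliation_shape above builds its heatmap by scatter-adding contributions into a dense grid with np.add.at and dividing by per-cell counts to obtain cell means. A standalone sketch of that aggregation on hypothetical shape data (columns: y value, x value, contribution):

    import numpy as np

    shape = np.array(
        [
            [0.0, 0.0, 1.0],
            [0.0, 1.0, 3.0],
            [1.0, 0.0, 5.0],
            [0.0, 1.0, 5.0],  # second hit on cell (y=0, x=1): its mean becomes 4.0
        ]
    )

    y_unique, y_inv = np.unique(shape[:, 0], return_inverse=True)
    x_unique, x_inv = np.unique(shape[:, 1], return_inverse=True)

    grid_sums = np.zeros((len(y_unique), len(x_unique)))
    grid_counts = np.zeros((len(y_unique), len(x_unique)))
    np.add.at(grid_sums, (y_inv, x_inv), shape[:, 2])  # unbuffered scatter-add
    np.add.at(grid_counts, (y_inv, x_inv), 1)

    with np.errstate(divide="ignore", invalid="ignore"):
        cell_means = grid_sums / grid_counts  # cells with no data become nan
    print(cell_means)  # [[ 1.  4.] [ 5. nan]]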
@@ -210,7 +768,7 @@ class APLRRegressor:
             "loss_function": self.loss_function,
             "link_function": self.link_function,
             "n_jobs": self.n_jobs,
-            "
+            "cv_folds": self.cv_folds,
             "bins": self.bins,
             "max_interaction_level": self.max_interaction_level,
             "max_interactions": self.max_interactions,
@@ -226,7 +784,18 @@ class APLRRegressor:
             "calculate_custom_negative_gradient_function": self.calculate_custom_negative_gradient_function,
             "calculate_custom_transform_linear_predictor_to_predictions_function": self.calculate_custom_transform_linear_predictor_to_predictions_function,
             "calculate_custom_differentiate_predictions_wrt_linear_predictor_function": self.calculate_custom_differentiate_predictions_wrt_linear_predictor_function,
-            "
+            "boosting_steps_before_interactions_are_allowed": self.boosting_steps_before_interactions_are_allowed,
+            "monotonic_constraints_ignore_interactions": self.monotonic_constraints_ignore_interactions,
+            "group_mse_by_prediction_bins": self.group_mse_by_prediction_bins,
+            "group_mse_cycle_min_obs_in_bin": self.group_mse_cycle_min_obs_in_bin,
+            "early_stopping_rounds": self.early_stopping_rounds,
+            "num_first_steps_with_linear_effects_only": self.num_first_steps_with_linear_effects_only,
+            "penalty_for_non_linearity": self.penalty_for_non_linearity,
+            "penalty_for_interactions": self.penalty_for_interactions,
+            "max_terms": self.max_terms,
+            "ridge_penalty": self.ridge_penalty,
+            "mean_bias_correction": self.mean_bias_correction,
+            "faster_convergence": self.faster_convergence,
         }

     # For sklearn
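
The "# For sklearn" get_params above (together with set_params and fit/predict) follows the scikit-learn estimator protocol, so the model should compose with standard sklearn tooling. A sketch under that assumption — the diff itself only shows the protocol methods, not this end-to-end interop:

    import numpy as np
    from sklearn.model_selection import GridSearchCV
    from aplr import APLRRegressor

    rng = np.random.default_rng(0)
    X = rng.normal(size=(200, 3))
    y = X[:, 0] + rng.normal(scale=0.1, size=200)

    search = GridSearchCV(
        APLRRegressor(m=100),
        param_grid={"max_interaction_level": [0, 1]},
        scoring="neg_mean_squared_error",
        cv=3,
    )
    search.fit(X, y)
    print(search.best_params_)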
@@ -237,28 +806,35 @@ class APLRRegressor:
         return self


-class APLRClassifier:
+class APLRClassifier(BaseAPLR):
     def __init__(
         self,
-        m: int =
-        v: float = 0.
+        m: int = 3000,
+        v: float = 0.5,
         random_state: int = 0,
         n_jobs: int = 0,
-
+        cv_folds: int = 5,
         bins: int = 300,
         verbosity: int = 0,
         max_interaction_level: int = 1,
         max_interactions: int = 100000,
-        min_observations_in_split: int =
-        ineligible_boosting_steps_added: int =
-        max_eligible_terms: int =
-
+        min_observations_in_split: int = 4,
+        ineligible_boosting_steps_added: int = 15,
+        max_eligible_terms: int = 7,
+        boosting_steps_before_interactions_are_allowed: int = 0,
+        monotonic_constraints_ignore_interactions: bool = False,
+        early_stopping_rounds: int = 200,
+        num_first_steps_with_linear_effects_only: int = 0,
+        penalty_for_non_linearity: float = 0.0,
+        penalty_for_interactions: float = 0.0,
+        max_terms: int = 0,
+        ridge_penalty: float = 0.0001,
     ):
         self.m = m
         self.v = v
         self.random_state = random_state
         self.n_jobs = n_jobs
-        self.
+        self.cv_folds = cv_folds
         self.bins = bins
         self.verbosity = verbosity
         self.max_interaction_level = max_interaction_level
@@ -266,9 +842,28 @@ class APLRClassifier:
         self.min_observations_in_split = min_observations_in_split
         self.ineligible_boosting_steps_added = ineligible_boosting_steps_added
         self.max_eligible_terms = max_eligible_terms
-        self.
-
+        self.boosting_steps_before_interactions_are_allowed = (
+            boosting_steps_before_interactions_are_allowed
+        )
+        self.monotonic_constraints_ignore_interactions = (
+            monotonic_constraints_ignore_interactions
+        )
+        self.early_stopping_rounds = early_stopping_rounds
+        self.num_first_steps_with_linear_effects_only = (
+            num_first_steps_with_linear_effects_only
         )
+        self.penalty_for_non_linearity = penalty_for_non_linearity
+        self.penalty_for_interactions = penalty_for_interactions
+        self.max_terms = max_terms
+        self.ridge_penalty = ridge_penalty
+
+        # Data transformations
+        self.median_values_ = {}
+        self.categorical_features_ = []
+        self.ohe_columns_ = []
+        self.na_imputed_cols_ = []
+        self.X_names_ = []
+        self.final_training_columns_ = []

         # Creating aplr_cpp and setting parameters
         self.APLRClassifier = aplr_cpp.APLRClassifier()
@@ -280,7 +875,7 @@ class APLRClassifier:
         self.APLRClassifier.v = self.v
         self.APLRClassifier.random_state = self.random_state
         self.APLRClassifier.n_jobs = self.n_jobs
-        self.APLRClassifier.
+        self.APLRClassifier.cv_folds = self.cv_folds
         self.APLRClassifier.bins = self.bins
         self.APLRClassifier.verbosity = self.verbosity
         self.APLRClassifier.max_interaction_level = self.max_interaction_level
@@ -290,66 +885,146 @@ class APLRClassifier:
             self.ineligible_boosting_steps_added
         )
         self.APLRClassifier.max_eligible_terms = self.max_eligible_terms
-        self.APLRClassifier.
-        self.
+        self.APLRClassifier.boosting_steps_before_interactions_are_allowed = (
+            self.boosting_steps_before_interactions_are_allowed
+        )
+        self.APLRClassifier.monotonic_constraints_ignore_interactions = (
+            self.monotonic_constraints_ignore_interactions
         )
+        self.APLRClassifier.early_stopping_rounds = self.early_stopping_rounds
+        self.APLRClassifier.num_first_steps_with_linear_effects_only = (
+            self.num_first_steps_with_linear_effects_only
+        )
+        self.APLRClassifier.penalty_for_non_linearity = self.penalty_for_non_linearity
+        self.APLRClassifier.penalty_for_interactions = self.penalty_for_interactions
+        self.APLRClassifier.max_terms = self.max_terms
+        self.APLRClassifier.ridge_penalty = self.ridge_penalty

     def fit(
         self,
-        X:
-        y: List[str],
-        sample_weight:
+        X: Union[pd.DataFrame, FloatMatrix],
+        y: Union[FloatVector, List[str]],
+        sample_weight: FloatVector = np.empty(0),
         X_names: List[str] = [],
-
+        cv_observations: IntMatrix = np.empty([0, 0]),
         prioritized_predictors_indexes: List[int] = [],
         monotonic_constraints: List[int] = [],
         interaction_constraints: List[List[int]] = [],
+        predictor_learning_rates: List[float] = [],
+        predictor_penalties_for_non_linearity: List[float] = [],
+        predictor_penalties_for_interactions: List[float] = [],
+        predictor_min_observations_in_split: List[int] = [],
     ):
+        self._validate_X_fit_rows(X)
         self.__set_params_cpp()
+        X_transformed, X_names_transformed = self._preprocess_X_fit(
+            X, X_names, sample_weight
+        )
+
+        if isinstance(y, np.ndarray):
+            y = y.astype(str).tolist()
+        elif isinstance(y, list) and y and not isinstance(y[0], str):
+            y = [str(val) for val in y]
+
         self.APLRClassifier.fit(
-
+            X_transformed,
             y,
             sample_weight,
-
-
+            X_names_transformed,
+            cv_observations,
             prioritized_predictors_indexes,
             monotonic_constraints,
             interaction_constraints,
+            predictor_learning_rates,
+            predictor_penalties_for_non_linearity,
+            predictor_penalties_for_interactions,
+            predictor_min_observations_in_split,
         )
+        # For sklearn
+        self.classes_ = np.arange(len(self.APLRClassifier.get_categories()))

     def predict_class_probabilities(
-        self,
-
+        self,
+        X: Union[pd.DataFrame, FloatMatrix],
+        cap_predictions_to_minmax_in_training: bool = False,
+    ) -> FloatMatrix:
+        X_transformed = self._preprocess_X_predict(X)
         return self.APLRClassifier.predict_class_probabilities(
-
+            X_transformed, cap_predictions_to_minmax_in_training
         )

     def predict(
-        self,
+        self,
+        X: Union[pd.DataFrame, FloatMatrix],
+        cap_predictions_to_minmax_in_training: bool = False,
     ) -> List[str]:
-
+        X_transformed = self._preprocess_X_predict(X)
+        return self.APLRClassifier.predict(
+            X_transformed, cap_predictions_to_minmax_in_training
+        )

-    def
-
+    def calculate_local_feature_contribution(
+        self, X: Union[pd.DataFrame, FloatMatrix]
+    ) -> FloatMatrix:
+        X_transformed = self._preprocess_X_predict(X)
+        return self.APLRClassifier.calculate_local_feature_contribution(X_transformed)

     def get_categories(self) -> List[str]:
         return self.APLRClassifier.get_categories()

     def get_logit_model(self, category: str) -> APLRRegressor:
-
+        logit_model_cpp = self.APLRClassifier.get_logit_model(category)
+
+        logit_model_py = APLRRegressor(
+            m=self.m,
+            v=self.v,
+            random_state=self.random_state,
+            loss_function="binomial",
+            link_function="logit",
+            n_jobs=self.n_jobs,
+            cv_folds=self.cv_folds,
+            bins=self.bins,
+            max_interaction_level=self.max_interaction_level,
+            max_interactions=self.max_interactions,
+            min_observations_in_split=self.min_observations_in_split,
+            ineligible_boosting_steps_added=self.ineligible_boosting_steps_added,
+            max_eligible_terms=self.max_eligible_terms,
+            verbosity=self.verbosity,
+            boosting_steps_before_interactions_are_allowed=self.boosting_steps_before_interactions_are_allowed,
+            monotonic_constraints_ignore_interactions=self.monotonic_constraints_ignore_interactions,
+            early_stopping_rounds=self.early_stopping_rounds,
+            num_first_steps_with_linear_effects_only=self.num_first_steps_with_linear_effects_only,
+            penalty_for_non_linearity=self.penalty_for_non_linearity,
+            penalty_for_interactions=self.penalty_for_interactions,
+            max_terms=self.max_terms,
+            ridge_penalty=self.ridge_penalty,
+        )

-
-        return self.APLRClassifier.get_validation_indexes()
+        logit_model_py.APLRRegressor = logit_model_cpp

-
+        return logit_model_py
+
+    def get_validation_error_steps(self) -> FloatMatrix:
         return self.APLRClassifier.get_validation_error_steps()

-    def
-        return self.APLRClassifier.
+    def get_cv_error(self) -> float:
+        return self.APLRClassifier.get_cv_error()

-    def get_feature_importance(self) ->
+    def get_feature_importance(self) -> FloatVector:
         return self.APLRClassifier.get_feature_importance()

+    def get_unique_term_affiliations(self) -> List[str]:
+        return self.APLRClassifier.get_unique_term_affiliations()
+
+    def get_base_predictors_in_each_unique_term_affiliation(self) -> List[List[int]]:
+        return self.APLRClassifier.get_base_predictors_in_each_unique_term_affiliation()
+
+    def clear_cv_results(self):
+        """
+        Clears the stored cross-validation results from all underlying logit models to free up memory.
+        """
+        self.APLRClassifier.clear_cv_results()
+
     # For sklearn
     def get_params(self, deep=True):
         return {
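
Two classifier conveniences land in the hunk above: fit now coerces numeric y labels to strings before handing them to the C++ backend, and get_logit_model now wraps the underlying C++ logit model in a Python APLRRegressor so the regressor-side introspection methods work on it. A sketch with illustrative data:

    import numpy as np
    from aplr import APLRClassifier

    rng = np.random.default_rng(0)
    X = rng.normal(size=(200, 2))
    y = (X[:, 0] > 0).astype(int)  # numeric labels are now accepted and stringified

    clf = APLRClassifier(m=100)
    clf.fit(X, y)
    print(clf.get_categories())  # e.g. ["0", "1"]
    logit_model = clf.get_logit_model("1")
    print(logit_model.get_term_names())  # regressor-style introspection per class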
@@ -357,7 +1032,7 @@ class APLRClassifier:
             "v": self.v,
             "random_state": self.random_state,
             "n_jobs": self.n_jobs,
-            "
+            "cv_folds": self.cv_folds,
             "bins": self.bins,
             "verbosity": self.verbosity,
             "max_interaction_level": self.max_interaction_level,
@@ -365,7 +1040,14 @@ class APLRClassifier:
             "min_observations_in_split": self.min_observations_in_split,
             "ineligible_boosting_steps_added": self.ineligible_boosting_steps_added,
             "max_eligible_terms": self.max_eligible_terms,
-            "
+            "boosting_steps_before_interactions_are_allowed": self.boosting_steps_before_interactions_are_allowed,
+            "monotonic_constraints_ignore_interactions": self.monotonic_constraints_ignore_interactions,
+            "early_stopping_rounds": self.early_stopping_rounds,
+            "num_first_steps_with_linear_effects_only": self.num_first_steps_with_linear_effects_only,
+            "penalty_for_non_linearity": self.penalty_for_non_linearity,
+            "penalty_for_interactions": self.penalty_for_interactions,
+            "max_terms": self.max_terms,
+            "ridge_penalty": self.ridge_penalty,
         }

     # For sklearn
@@ -374,3 +1056,72 @@ class APLRClassifier:
             setattr(self, parameter, value)
         self.__set_params_cpp()
         return self
+
+    # For sklearn
+    def predict_proba(self, X: FloatMatrix) -> FloatMatrix:
+        return self.predict_class_probabilities(X)
+
+
+class APLRTuner:
+    def __init__(
+        self,
+        parameters: Union[Dict[str, List[float]], List[Dict[str, List[float]]]] = {
+            "max_interaction_level": [0, 1],
+            "min_observations_in_split": [4, 10, 20, 100, 500, 1000],
+        },
+        is_regressor: bool = True,
+    ):
+        self.parameters = parameters
+        self.is_regressor = is_regressor
+        self.parameter_grid = self._create_parameter_grid()
+
+    def _create_parameter_grid(self) -> List[Dict[str, float]]:
+        items = sorted(self.parameters.items())
+        keys, values = zip(*items)
+        combinations = list(itertools.product(*values))
+        grid = [dict(zip(keys, combination)) for combination in combinations]
+        return grid
+
+    def fit(self, X: Union[pd.DataFrame, FloatMatrix], y: FloatVector, **kwargs):
+        self.cv_results: List[Dict[str, float]] = []
+        best_validation_result = np.inf
+        for params in self.parameter_grid:
+            if self.is_regressor:
+                model = APLRRegressor(**params)
+            else:
+                model = APLRClassifier(**params)
+            model.fit(X, y, **kwargs)
+            cv_error_for_this_model = model.get_cv_error()
+            cv_results_for_this_model = model.get_params()
+            cv_results_for_this_model["cv_error"] = cv_error_for_this_model
+            self.cv_results.append(cv_results_for_this_model)
+            if cv_error_for_this_model < best_validation_result:
+                best_validation_result = cv_error_for_this_model
+                self.best_model = model
+        self.cv_results = sorted(self.cv_results, key=lambda x: x["cv_error"])
+
+    def predict(
+        self, X: Union[pd.DataFrame, FloatMatrix], **kwargs
+    ) -> Union[FloatVector, List[str]]:
+        return self.best_model.predict(X, **kwargs)
+
+    def predict_class_probabilities(
+        self, X: Union[pd.DataFrame, FloatMatrix], **kwargs
+    ) -> FloatMatrix:
+        if self.is_regressor == False:
+            return self.best_model.predict_class_probabilities(X, **kwargs)
+        else:
+            raise TypeError(
+                "predict_class_probabilities is only possible when is_regressor is False"
+            )
+
+    def predict_proba(
+        self, X: Union[pd.DataFrame, FloatMatrix], **kwargs
+    ) -> FloatMatrix:
+        return self.predict_class_probabilities(X, **kwargs)
+
+    def get_best_estimator(self) -> Union[APLRClassifier, APLRRegressor]:
+        return self.best_model
+
+    def get_cv_results(self) -> List[Dict[str, float]]:
+        return self.cv_results
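
The new APLRTuner above is a small grid-search helper: it expands the parameters dict into the full Cartesian product with itertools.product, fits one model per combination, and keeps the model with the lowest get_cv_error(). A usage sketch with an illustrative grid:

    import numpy as np
    from aplr import APLRTuner

    rng = np.random.default_rng(0)
    X = rng.normal(size=(200, 3))
    y = X[:, 0] ** 2 + rng.normal(scale=0.1, size=200)

    tuner = APLRTuner(
        parameters={"max_interaction_level": [0, 1], "v": [0.1, 0.5]},
        is_regressor=True,
    )
    tuner.fit(X, y)
    best = tuner.get_best_estimator()
    print(tuner.get_cv_results()[0])  # lowest cv_error combination comes first
    predictions = tuner.predict(X)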
aplr-10.19.2.dist-info/METADATA
ADDED

@@ -0,0 +1,34 @@
+Metadata-Version: 2.4
+Name: aplr
+Version: 10.19.2
+Summary: Automatic Piecewise Linear Regression
+Home-page: https://github.com/ottenbreit-data-science/aplr
+Author: Mathias von Ottenbreit
+Author-email: ottenbreitdatascience@gmail.com
+License: MIT
+Platform: Windows
+Platform: Linux
+Platform: MacOS
+Classifier: License :: OSI Approved :: MIT License
+Requires-Python: >=3.8
+Description-Content-Type: text/markdown
+License-File: LICENSE
+Requires-Dist: numpy>=1.11
+Requires-Dist: pandas>=1.0.0
+Provides-Extra: plots
+Requires-Dist: matplotlib>=3.0; extra == "plots"
+Dynamic: author
+Dynamic: author-email
+Dynamic: classifier
+Dynamic: description
+Dynamic: description-content-type
+Dynamic: home-page
+Dynamic: license
+Dynamic: license-file
+Dynamic: platform
+Dynamic: provides-extra
+Dynamic: requires-dist
+Dynamic: requires-python
+Dynamic: summary
+
+The documentation for Automatic Piecewise Linear Regression is available at [https://github.com/ottenbreit-data-science/aplr](https://github.com/ottenbreit-data-science/aplr).
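
The new metadata above records the dependency changes implied by the code diff: pandas>=1.0.0 becomes a hard requirement (the preprocessing layer imports it unconditionally), the numpy floor drops from 1.20 to 1.11, and matplotlib moves behind the optional plots extra (installable as pip install "aplr[plots]"), matching the lazy matplotlib import inside plot_affiliation_shape.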
aplr-10.19.2.dist-info/RECORD
ADDED

@@ -0,0 +1,8 @@
+aplr_cpp.cp311-win_amd64.pyd,sha256=jFpeuXLruiu6gB8tB7mxrTWm3Q931259woyywenX_Is,674816
+aplr/__init__.py,sha256=oDFSgVytP_qQ8ilun6oHxKr-DYEeqjEQp5FciX45lls,21
+aplr/aplr.py,sha256=Pv_6dSaZ7WIbP6vzzB6-R8S0VLcKvlVJyP0RBToDZKw,48320
+aplr-10.19.2.dist-info/licenses/LICENSE,sha256=YOMo-RaL4P7edMZGD96-NskKpxyMZdP3-WiiMMmihNk,1134
+aplr-10.19.2.dist-info/METADATA,sha256=e1-enJ5c5XtbnxCHKe-YNFq8EZAve0uOkxuiA-rKQWY,1048
+aplr-10.19.2.dist-info/WHEEL,sha256=JLOMsP7F5qtkAkINx5UnzbFguf8CqZeraV8o04b0I8I,101
+aplr-10.19.2.dist-info/top_level.txt,sha256=DXVC0RIFGpzVnPeKWAZTXQdJheOEZL51Wip6Fx7zbR4,14
+aplr-10.19.2.dist-info/RECORD,,
aplr_cpp.cp311-win_amd64.pyd
CHANGED

Binary file
aplr-7.0.0.dist-info/METADATA
DELETED

@@ -1,17 +0,0 @@
-Metadata-Version: 2.1
-Name: aplr
-Version: 7.0.0
-Summary: Automatic Piecewise Linear Regression
-Home-page: https://github.com/ottenbreit-data-science/aplr
-Author: Mathias von Ottenbreit
-Author-email: ottenbreitdatascience@gmail.com
-License: MIT
-Platform: Windows
-Platform: Linux
-Classifier: License :: OSI Approved :: MIT License
-Requires-Python: >=3.8
-Description-Content-Type: text/markdown
-License-File: LICENSE
-Requires-Dist: numpy (>=1.20)
-
-Build predictive and interpretable parametric regression or classification machine learning models in Python based on the Automatic Piecewise Linear Regression methodology developed by Mathias von Ottenbreit.
aplr-7.0.0.dist-info/RECORD
DELETED

@@ -1,8 +0,0 @@
-aplr_cpp.cp311-win_amd64.pyd,sha256=pOvJghR3IOWSQcVzvvtAgQnlKIG9HMCerOMrwhuuACE,471040
-aplr/__init__.py,sha256=oDFSgVytP_qQ8ilun6oHxKr-DYEeqjEQp5FciX45lls,21
-aplr/aplr.py,sha256=0gVeqsL1WyXHZpIDqn6rXoT0tuv76nHZVs6LCzOxNEM,16012
-aplr-7.0.0.dist-info/LICENSE,sha256=YOMo-RaL4P7edMZGD96-NskKpxyMZdP3-WiiMMmihNk,1134
-aplr-7.0.0.dist-info/METADATA,sha256=KwO6mV-2SWzBGp3lXXT-UQBhgpv99T1NVwI6aBeoSoc,671
-aplr-7.0.0.dist-info/WHEEL,sha256=9wvhO-5NhjjD8YmmxAvXTPQXMDOZ50W5vklzeoqFtkM,102
-aplr-7.0.0.dist-info/top_level.txt,sha256=DXVC0RIFGpzVnPeKWAZTXQdJheOEZL51Wip6Fx7zbR4,14
-aplr-7.0.0.dist-info/RECORD,,

{aplr-7.0.0.dist-info → aplr-10.19.2.dist-info/licenses}/LICENSE
File without changes

{aplr-7.0.0.dist-info → aplr-10.19.2.dist-info}/top_level.txt
File without changes