aplr 7.0.0__cp311-cp311-win_amd64.whl → 10.20.0__cp311-cp311-win_amd64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- aplr/aplr.py +645 -80
- aplr-10.20.0.dist-info/METADATA +34 -0
- aplr-10.20.0.dist-info/RECORD +8 -0
- {aplr-7.0.0.dist-info → aplr-10.20.0.dist-info}/WHEEL +1 -1
- aplr_cpp.cp311-win_amd64.pyd +0 -0
- aplr-7.0.0.dist-info/METADATA +0 -17
- aplr-7.0.0.dist-info/RECORD +0 -8
- {aplr-7.0.0.dist-info → aplr-10.20.0.dist-info/licenses}/LICENSE +0 -0
- {aplr-7.0.0.dist-info → aplr-10.20.0.dist-info}/top_level.txt +0 -0
aplr/aplr.py
CHANGED
|
@@ -1,49 +1,134 @@
|
|
|
1
|
+
from typing import List, Callable, Optional, Dict, Union
|
|
1
2
|
import numpy as np
|
|
2
|
-
import
|
|
3
|
-
from typing import List, Callable, Optional
|
|
3
|
+
import pandas as pd
|
|
4
4
|
import aplr_cpp
|
|
5
|
+
import itertools
|
|
6
|
+
|
|
7
|
+
FloatVector = np.ndarray
|
|
8
|
+
FloatMatrix = np.ndarray
|
|
9
|
+
IntVector = np.ndarray
|
|
10
|
+
IntMatrix = np.ndarray
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def _dataframe_to_cpp_dataframe(df: pd.DataFrame) -> aplr_cpp.CppDataFrame:
    """Convert a pandas DataFrame to a CppDataFrame.

    Numeric columns are added as float64 arrays in which missing values are
    NaN. Categorical, object and string columns are added as lists of
    strings in which missing values are represented by the empty string.

    :param df: The DataFrame to convert.
    :return: A CppDataFrame with one column added per column of ``df``.
    :raises TypeError: If a column is neither numeric nor
        categorical/object/string.
    """
    cpp_df = aplr_cpp.CppDataFrame()
    for col_name in df.columns:
        col = df[col_name]
        if pd.api.types.is_numeric_dtype(col.dtype):
            # Convert numeric columns to std::vector<double>
            # NaNs are preserved and handled in C++
            cpp_df.add_numeric_column(
                col_name, col.to_numpy(dtype=np.float64, na_value=np.nan)
            )
        elif (
            isinstance(col.dtype, pd.CategoricalDtype)
            or pd.api.types.is_object_dtype(col.dtype)
            or pd.api.types.is_string_dtype(col.dtype)
        ):
            # Convert categorical/object/string columns to std::vector<std::string>
            # Missing values (None, np.nan, pd.NA) are converted to empty strings.
            # The mask is taken BEFORE stringifying: astype(str) can turn a
            # missing value into the text "nan"/"None"/"<NA>", after which
            # fillna("") no longer finds anything to replace.
            is_missing = col.isna().to_numpy()
            strings = [
                "" if missing else str(value)
                for value, missing in zip(col.tolist(), is_missing)
            ]
            cpp_df.add_categorical_column(col_name, strings)
        else:
            raise TypeError(
                f"Unsupported column type for column '{col_name}': {col.dtype}"
            )
    return cpp_df
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
def _prepare_input_data(
    X: Union[pd.DataFrame, FloatMatrix], preprocess: bool
) -> Union[aplr_cpp.CppDataFrame, FloatMatrix]:
    """
    Prepares the input data for the C++ backend.

    If X is a pandas DataFrame, it's converted. If preprocess is True, it becomes
    a CppDataFrame. If preprocess is False, it's converted to a float64 NumPy
    array. Anything that is not a DataFrame is passed through as is.

    :param X: The input data, either a pandas DataFrame or a NumPy array.
    :param preprocess: Whether a DataFrame should be converted to a CppDataFrame.
    :return: A CppDataFrame, a float64 NumPy array, or X itself.
    :raises RuntimeError: If preprocess is False and the DataFrame contains
        non-numeric columns.
    """
    if not isinstance(X, pd.DataFrame):
        return X

    if preprocess:
        return _dataframe_to_cpp_dataframe(X)

    # Check the dtypes directly instead of X[col]: with duplicated column
    # labels X[col] is a DataFrame, which is never reported as numeric, so an
    # all-numeric frame would be rejected by mistake.
    if not all(pd.api.types.is_numeric_dtype(dtype) for dtype in X.dtypes):
        raise RuntimeError(
            "Cannot convert DataFrame to matrix if it contains non-numeric columns. "
            "Please ensure all columns are numeric or set preprocess=True."
        )
    # na_value=np.nan mirrors the numeric conversion used for CppDataFrame
    # columns, so missing values in nullable numeric dtypes become NaN.
    return X.to_numpy(dtype=np.float64, na_value=np.nan)
|
|
5
61
|
|
|
6
62
|
|
|
7
63
|
class APLRRegressor:
|
|
8
64
|
def __init__(
|
|
9
65
|
self,
|
|
10
|
-
m: int =
|
|
11
|
-
v: float = 0.
|
|
66
|
+
m: int = 3000,
|
|
67
|
+
v: float = 0.5,
|
|
12
68
|
random_state: int = 0,
|
|
13
69
|
loss_function: str = "mse",
|
|
14
70
|
link_function: str = "identity",
|
|
15
71
|
n_jobs: int = 0,
|
|
16
|
-
|
|
72
|
+
cv_folds: int = 5,
|
|
17
73
|
bins: int = 300,
|
|
18
74
|
max_interaction_level: int = 1,
|
|
19
75
|
max_interactions: int = 100000,
|
|
20
|
-
min_observations_in_split: int =
|
|
21
|
-
ineligible_boosting_steps_added: int =
|
|
22
|
-
max_eligible_terms: int =
|
|
76
|
+
min_observations_in_split: int = 4,
|
|
77
|
+
ineligible_boosting_steps_added: int = 15,
|
|
78
|
+
max_eligible_terms: int = 7,
|
|
23
79
|
verbosity: int = 0,
|
|
24
80
|
dispersion_parameter: float = 1.5,
|
|
25
81
|
validation_tuning_metric: str = "default",
|
|
26
82
|
quantile: float = 0.5,
|
|
27
83
|
calculate_custom_validation_error_function: Optional[
|
|
28
84
|
Callable[
|
|
29
|
-
[
|
|
85
|
+
[
|
|
86
|
+
FloatVector,
|
|
87
|
+
FloatVector,
|
|
88
|
+
FloatVector,
|
|
89
|
+
FloatVector,
|
|
90
|
+
FloatMatrix,
|
|
91
|
+
],
|
|
92
|
+
float,
|
|
30
93
|
]
|
|
31
94
|
] = None,
|
|
32
95
|
calculate_custom_loss_function: Optional[
|
|
33
96
|
Callable[
|
|
34
|
-
[
|
|
97
|
+
[
|
|
98
|
+
FloatVector,
|
|
99
|
+
FloatVector,
|
|
100
|
+
FloatVector,
|
|
101
|
+
FloatVector,
|
|
102
|
+
FloatMatrix,
|
|
103
|
+
],
|
|
104
|
+
float,
|
|
35
105
|
]
|
|
36
106
|
] = None,
|
|
37
107
|
calculate_custom_negative_gradient_function: Optional[
|
|
38
|
-
Callable[
|
|
108
|
+
Callable[
|
|
109
|
+
[FloatVector, FloatVector, FloatVector, FloatMatrix],
|
|
110
|
+
FloatVector,
|
|
111
|
+
]
|
|
39
112
|
] = None,
|
|
40
113
|
calculate_custom_transform_linear_predictor_to_predictions_function: Optional[
|
|
41
|
-
Callable[[
|
|
114
|
+
Callable[[FloatVector], FloatVector]
|
|
42
115
|
] = None,
|
|
43
116
|
calculate_custom_differentiate_predictions_wrt_linear_predictor_function: Optional[
|
|
44
|
-
Callable[[
|
|
117
|
+
Callable[[FloatVector], FloatVector]
|
|
45
118
|
] = None,
|
|
46
|
-
|
|
119
|
+
boosting_steps_before_interactions_are_allowed: int = 0,
|
|
120
|
+
monotonic_constraints_ignore_interactions: bool = False,
|
|
121
|
+
group_mse_by_prediction_bins: int = 10,
|
|
122
|
+
group_mse_cycle_min_obs_in_bin: int = 30,
|
|
123
|
+
early_stopping_rounds: int = 200,
|
|
124
|
+
num_first_steps_with_linear_effects_only: int = 0,
|
|
125
|
+
penalty_for_non_linearity: float = 0.0,
|
|
126
|
+
penalty_for_interactions: float = 0.0,
|
|
127
|
+
max_terms: int = 0,
|
|
128
|
+
ridge_penalty: float = 0.0001,
|
|
129
|
+
mean_bias_correction: bool = False,
|
|
130
|
+
faster_convergence: bool = False,
|
|
131
|
+
preprocess: bool = True,
|
|
47
132
|
):
|
|
48
133
|
self.m = m
|
|
49
134
|
self.v = v
|
|
@@ -51,7 +136,7 @@ class APLRRegressor:
|
|
|
51
136
|
self.loss_function = loss_function
|
|
52
137
|
self.link_function = link_function
|
|
53
138
|
self.n_jobs = n_jobs
|
|
54
|
-
self.
|
|
139
|
+
self.cv_folds = cv_folds
|
|
55
140
|
self.bins = bins
|
|
56
141
|
self.max_interaction_level = max_interaction_level
|
|
57
142
|
self.max_interactions = max_interactions
|
|
@@ -75,9 +160,25 @@ class APLRRegressor:
|
|
|
75
160
|
self.calculate_custom_differentiate_predictions_wrt_linear_predictor_function = (
|
|
76
161
|
calculate_custom_differentiate_predictions_wrt_linear_predictor_function
|
|
77
162
|
)
|
|
78
|
-
self.
|
|
79
|
-
|
|
163
|
+
self.boosting_steps_before_interactions_are_allowed = (
|
|
164
|
+
boosting_steps_before_interactions_are_allowed
|
|
165
|
+
)
|
|
166
|
+
self.monotonic_constraints_ignore_interactions = (
|
|
167
|
+
monotonic_constraints_ignore_interactions
|
|
168
|
+
)
|
|
169
|
+
self.group_mse_by_prediction_bins = group_mse_by_prediction_bins
|
|
170
|
+
self.group_mse_cycle_min_obs_in_bin = group_mse_cycle_min_obs_in_bin
|
|
171
|
+
self.early_stopping_rounds = early_stopping_rounds
|
|
172
|
+
self.num_first_steps_with_linear_effects_only = (
|
|
173
|
+
num_first_steps_with_linear_effects_only
|
|
80
174
|
)
|
|
175
|
+
self.penalty_for_non_linearity = penalty_for_non_linearity
|
|
176
|
+
self.penalty_for_interactions = penalty_for_interactions
|
|
177
|
+
self.max_terms = max_terms
|
|
178
|
+
self.ridge_penalty = ridge_penalty
|
|
179
|
+
self.mean_bias_correction = mean_bias_correction
|
|
180
|
+
self.faster_convergence = faster_convergence
|
|
181
|
+
self.preprocess = preprocess
|
|
81
182
|
|
|
82
183
|
# Creating aplr_cpp and setting parameters
|
|
83
184
|
self.APLRRegressor = aplr_cpp.APLRRegressor()
|
|
@@ -91,7 +192,7 @@ class APLRRegressor:
|
|
|
91
192
|
self.APLRRegressor.loss_function = self.loss_function
|
|
92
193
|
self.APLRRegressor.link_function = self.link_function
|
|
93
194
|
self.APLRRegressor.n_jobs = self.n_jobs
|
|
94
|
-
self.APLRRegressor.
|
|
195
|
+
self.APLRRegressor.cv_folds = self.cv_folds
|
|
95
196
|
self.APLRRegressor.bins = self.bins
|
|
96
197
|
self.APLRRegressor.max_interaction_level = self.max_interaction_level
|
|
97
198
|
self.APLRRegressor.max_interactions = self.max_interactions
|
|
@@ -119,38 +220,74 @@ class APLRRegressor:
|
|
|
119
220
|
self.APLRRegressor.calculate_custom_differentiate_predictions_wrt_linear_predictor_function = (
|
|
120
221
|
self.calculate_custom_differentiate_predictions_wrt_linear_predictor_function
|
|
121
222
|
)
|
|
122
|
-
self.APLRRegressor.
|
|
123
|
-
self.
|
|
223
|
+
self.APLRRegressor.boosting_steps_before_interactions_are_allowed = (
|
|
224
|
+
self.boosting_steps_before_interactions_are_allowed
|
|
124
225
|
)
|
|
226
|
+
self.APLRRegressor.monotonic_constraints_ignore_interactions = (
|
|
227
|
+
self.monotonic_constraints_ignore_interactions
|
|
228
|
+
)
|
|
229
|
+
self.APLRRegressor.group_mse_by_prediction_bins = (
|
|
230
|
+
self.group_mse_by_prediction_bins
|
|
231
|
+
)
|
|
232
|
+
self.APLRRegressor.group_mse_cycle_min_obs_in_bin = (
|
|
233
|
+
self.group_mse_cycle_min_obs_in_bin
|
|
234
|
+
)
|
|
235
|
+
self.APLRRegressor.early_stopping_rounds = self.early_stopping_rounds
|
|
236
|
+
self.APLRRegressor.num_first_steps_with_linear_effects_only = (
|
|
237
|
+
self.num_first_steps_with_linear_effects_only
|
|
238
|
+
)
|
|
239
|
+
self.APLRRegressor.penalty_for_non_linearity = self.penalty_for_non_linearity
|
|
240
|
+
self.APLRRegressor.penalty_for_interactions = self.penalty_for_interactions
|
|
241
|
+
self.APLRRegressor.max_terms = self.max_terms
|
|
242
|
+
self.APLRRegressor.ridge_penalty = self.ridge_penalty
|
|
243
|
+
self.APLRRegressor.mean_bias_correction = self.mean_bias_correction
|
|
244
|
+
self.APLRRegressor.faster_convergence = self.faster_convergence
|
|
245
|
+
self.APLRRegressor.preprocess = self.preprocess
|
|
125
246
|
|
|
126
247
|
def fit(
|
|
127
248
|
self,
|
|
128
|
-
X:
|
|
129
|
-
y:
|
|
130
|
-
sample_weight:
|
|
249
|
+
X: Union[pd.DataFrame, FloatMatrix],
|
|
250
|
+
y: FloatVector,
|
|
251
|
+
sample_weight: FloatVector = np.empty(0),
|
|
131
252
|
X_names: List[str] = [],
|
|
132
|
-
|
|
253
|
+
cv_observations: IntMatrix = np.empty([0, 0]),
|
|
133
254
|
prioritized_predictors_indexes: List[int] = [],
|
|
134
255
|
monotonic_constraints: List[int] = [],
|
|
135
|
-
group:
|
|
256
|
+
group: FloatVector = np.empty(0),
|
|
136
257
|
interaction_constraints: List[List[int]] = [],
|
|
258
|
+
other_data: FloatMatrix = np.empty([0, 0]),
|
|
259
|
+
predictor_learning_rates: List[float] = [],
|
|
260
|
+
predictor_penalties_for_non_linearity: List[float] = [],
|
|
261
|
+
predictor_penalties_for_interactions: List[float] = [],
|
|
262
|
+
predictor_min_observations_in_split: List[int] = [],
|
|
137
263
|
):
|
|
138
264
|
self.__set_params_cpp()
|
|
265
|
+
X = _prepare_input_data(X, self.preprocess)
|
|
266
|
+
|
|
139
267
|
self.APLRRegressor.fit(
|
|
140
268
|
X,
|
|
141
269
|
y,
|
|
142
270
|
sample_weight,
|
|
143
271
|
X_names,
|
|
144
|
-
|
|
272
|
+
cv_observations,
|
|
145
273
|
prioritized_predictors_indexes,
|
|
146
274
|
monotonic_constraints,
|
|
147
275
|
group,
|
|
148
276
|
interaction_constraints,
|
|
277
|
+
other_data,
|
|
278
|
+
predictor_learning_rates,
|
|
279
|
+
predictor_penalties_for_non_linearity,
|
|
280
|
+
predictor_penalties_for_interactions,
|
|
281
|
+
predictor_min_observations_in_split,
|
|
149
282
|
)
|
|
150
283
|
|
|
151
284
|
def predict(
|
|
152
|
-
self,
|
|
153
|
-
|
|
285
|
+
self,
|
|
286
|
+
X: Union[pd.DataFrame, FloatMatrix],
|
|
287
|
+
cap_predictions_to_minmax_in_training: bool = True,
|
|
288
|
+
) -> FloatVector:
|
|
289
|
+
X = _prepare_input_data(X, self.preprocess)
|
|
290
|
+
|
|
154
291
|
if self.link_function == "custom_function":
|
|
155
292
|
self.APLRRegressor.calculate_custom_transform_linear_predictor_to_predictions_function = (
|
|
156
293
|
self.calculate_custom_transform_linear_predictor_to_predictions_function
|
|
@@ -160,46 +297,271 @@ class APLRRegressor:
|
|
|
160
297
|
def set_term_names(self, X_names: List[str]):
|
|
161
298
|
self.APLRRegressor.set_term_names(X_names)
|
|
162
299
|
|
|
163
|
-
def
|
|
164
|
-
|
|
300
|
+
def calculate_feature_importance(
|
|
301
|
+
self,
|
|
302
|
+
X: Union[pd.DataFrame, FloatMatrix],
|
|
303
|
+
sample_weight: FloatVector = np.empty(0),
|
|
304
|
+
) -> FloatVector:
|
|
305
|
+
X = _prepare_input_data(X, self.preprocess)
|
|
306
|
+
return self.APLRRegressor.calculate_feature_importance(X, sample_weight)
|
|
165
307
|
|
|
166
|
-
def
|
|
167
|
-
self,
|
|
168
|
-
|
|
169
|
-
|
|
308
|
+
def calculate_term_importance(
|
|
309
|
+
self,
|
|
310
|
+
X: Union[pd.DataFrame, FloatMatrix],
|
|
311
|
+
sample_weight: FloatVector = np.empty(0),
|
|
312
|
+
) -> FloatVector:
|
|
313
|
+
X = _prepare_input_data(X, self.preprocess)
|
|
314
|
+
return self.APLRRegressor.calculate_term_importance(X, sample_weight)
|
|
315
|
+
|
|
316
|
+
def calculate_local_feature_contribution(
|
|
317
|
+
self, X: Union[pd.DataFrame, FloatMatrix]
|
|
318
|
+
) -> FloatMatrix:
|
|
319
|
+
X = _prepare_input_data(X, self.preprocess)
|
|
320
|
+
return self.APLRRegressor.calculate_local_feature_contribution(X)
|
|
321
|
+
|
|
322
|
+
def calculate_local_term_contribution(
|
|
323
|
+
self, X: Union[pd.DataFrame, FloatMatrix]
|
|
324
|
+
) -> FloatMatrix:
|
|
325
|
+
X = _prepare_input_data(X, self.preprocess)
|
|
326
|
+
return self.APLRRegressor.calculate_local_term_contribution(X)
|
|
327
|
+
|
|
328
|
+
def calculate_local_contribution_from_selected_terms(
|
|
329
|
+
self, X: Union[pd.DataFrame, FloatMatrix], predictor_indexes: List[int]
|
|
330
|
+
) -> FloatVector:
|
|
331
|
+
X = _prepare_input_data(X, self.preprocess)
|
|
332
|
+
return self.APLRRegressor.calculate_local_contribution_from_selected_terms(
|
|
333
|
+
X, predictor_indexes
|
|
334
|
+
)
|
|
170
335
|
|
|
171
|
-
def calculate_terms(self, X:
|
|
336
|
+
def calculate_terms(self, X: Union[pd.DataFrame, FloatMatrix]) -> FloatMatrix:
|
|
337
|
+
X = _prepare_input_data(X, self.preprocess)
|
|
172
338
|
return self.APLRRegressor.calculate_terms(X)
|
|
173
339
|
|
|
174
340
|
def get_term_names(self) -> List[str]:
|
|
175
341
|
return self.APLRRegressor.get_term_names()
|
|
176
342
|
|
|
177
|
-
def
|
|
178
|
-
return self.APLRRegressor.
|
|
343
|
+
def get_term_affiliations(self) -> List[str]:
|
|
344
|
+
return self.APLRRegressor.get_term_affiliations()
|
|
345
|
+
|
|
346
|
+
def get_unique_term_affiliations(self) -> List[str]:
|
|
347
|
+
return self.APLRRegressor.get_unique_term_affiliations()
|
|
179
348
|
|
|
180
|
-
def
|
|
181
|
-
return self.APLRRegressor.
|
|
349
|
+
def get_base_predictors_in_each_unique_term_affiliation(self) -> List[List[int]]:
|
|
350
|
+
return self.APLRRegressor.get_base_predictors_in_each_unique_term_affiliation()
|
|
182
351
|
|
|
183
|
-
def
|
|
352
|
+
def get_term_coefficients(self) -> FloatVector:
|
|
353
|
+
return self.APLRRegressor.get_term_coefficients()
|
|
354
|
+
|
|
355
|
+
def get_validation_error_steps(self) -> FloatMatrix:
|
|
184
356
|
return self.APLRRegressor.get_validation_error_steps()
|
|
185
357
|
|
|
186
|
-
def get_feature_importance(self) ->
|
|
358
|
+
def get_feature_importance(self) -> FloatVector:
|
|
187
359
|
return self.APLRRegressor.get_feature_importance()
|
|
188
360
|
|
|
361
|
+
def get_term_importance(self) -> FloatVector:
|
|
362
|
+
return self.APLRRegressor.get_term_importance()
|
|
363
|
+
|
|
364
|
+
def get_term_main_predictor_indexes(self) -> IntVector:
|
|
365
|
+
return self.APLRRegressor.get_term_main_predictor_indexes()
|
|
366
|
+
|
|
367
|
+
def get_term_interaction_levels(self) -> IntVector:
|
|
368
|
+
return self.APLRRegressor.get_term_interaction_levels()
|
|
369
|
+
|
|
189
370
|
def get_intercept(self) -> float:
|
|
190
371
|
return self.APLRRegressor.get_intercept()
|
|
191
372
|
|
|
192
|
-
def get_intercept_steps(self) -> npt.ArrayLike:
|
|
193
|
-
return self.APLRRegressor.get_intercept_steps()
|
|
194
|
-
|
|
195
373
|
def get_optimal_m(self) -> int:
|
|
196
374
|
return self.APLRRegressor.get_optimal_m()
|
|
197
375
|
|
|
198
376
|
def get_validation_tuning_metric(self) -> str:
|
|
199
377
|
return self.APLRRegressor.get_validation_tuning_metric()
|
|
200
378
|
|
|
201
|
-
def
|
|
202
|
-
return self.APLRRegressor.
|
|
379
|
+
def get_main_effect_shape(self, predictor_index: int) -> Dict[float, float]:
|
|
380
|
+
return self.APLRRegressor.get_main_effect_shape(predictor_index)
|
|
381
|
+
|
|
382
|
+
def get_unique_term_affiliation_shape(
|
|
383
|
+
self,
|
|
384
|
+
unique_term_affiliation: str,
|
|
385
|
+
max_rows_before_sampling: int = 500000,
|
|
386
|
+
additional_points: int = 250,
|
|
387
|
+
) -> FloatMatrix:
|
|
388
|
+
return self.APLRRegressor.get_unique_term_affiliation_shape(
|
|
389
|
+
unique_term_affiliation, max_rows_before_sampling, additional_points
|
|
390
|
+
)
|
|
391
|
+
|
|
392
|
+
def get_cv_error(self) -> float:
|
|
393
|
+
return self.APLRRegressor.get_cv_error()
|
|
394
|
+
|
|
395
|
+
def get_num_cv_folds(self) -> int:
|
|
396
|
+
"""
|
|
397
|
+
Gets the number of cross-validation folds used during training.
|
|
398
|
+
|
|
399
|
+
:return: The number of folds.
|
|
400
|
+
"""
|
|
401
|
+
return self.APLRRegressor.get_num_cv_folds()
|
|
402
|
+
|
|
403
|
+
def get_cv_validation_predictions(self, fold_index: int) -> FloatVector:
|
|
404
|
+
"""
|
|
405
|
+
Gets the validation predictions for a specific cross-validation fold.
|
|
406
|
+
|
|
407
|
+
Note that these predictions may be conservative, as the final model is an ensemble of the models
|
|
408
|
+
from all cross-validation folds, which has a variance-reducing effect similar to bagging.
|
|
409
|
+
|
|
410
|
+
:param fold_index: The index of the fold.
|
|
411
|
+
:return: A numpy array containing the validation predictions.
|
|
412
|
+
"""
|
|
413
|
+
return self.APLRRegressor.get_cv_validation_predictions(fold_index)
|
|
414
|
+
|
|
415
|
+
def get_cv_y(self, fold_index: int) -> FloatVector:
|
|
416
|
+
"""
|
|
417
|
+
Gets the validation response values (y) for a specific cross-validation fold.
|
|
418
|
+
|
|
419
|
+
:param fold_index: The index of the fold.
|
|
420
|
+
:return: A numpy array containing the validation response values.
|
|
421
|
+
"""
|
|
422
|
+
return self.APLRRegressor.get_cv_y(fold_index)
|
|
423
|
+
|
|
424
|
+
def get_cv_sample_weight(self, fold_index: int) -> FloatVector:
|
|
425
|
+
"""
|
|
426
|
+
Gets the validation sample weights for a specific cross-validation fold.
|
|
427
|
+
|
|
428
|
+
:param fold_index: The index of the fold.
|
|
429
|
+
:return: A numpy array containing the validation sample weights.
|
|
430
|
+
"""
|
|
431
|
+
return self.APLRRegressor.get_cv_sample_weight(fold_index)
|
|
432
|
+
|
|
433
|
+
def get_cv_validation_indexes(self, fold_index: int) -> IntVector:
|
|
434
|
+
"""
|
|
435
|
+
Gets the original indexes of the validation observations for a specific cross-validation fold.
|
|
436
|
+
|
|
437
|
+
:param fold_index: The index of the fold.
|
|
438
|
+
:return: A numpy array containing the original indexes.
|
|
439
|
+
"""
|
|
440
|
+
return self.APLRRegressor.get_cv_validation_indexes(fold_index)
|
|
441
|
+
|
|
442
|
+
def set_intercept(self, value: float):
|
|
443
|
+
self.APLRRegressor.set_intercept(value)
|
|
444
|
+
|
|
445
|
+
def plot_affiliation_shape(
    self,
    affiliation: str,
    plot: bool = True,
    save: bool = False,
    path: str = "",
):
    """
    Plots or saves the shape of a given unique term affiliation.

    For main effects, it produces a line plot. For two-way interactions, it produces a heatmap.
    Plotting for higher-order interactions is not supported.

    :param affiliation: A string specifying which unique_term_affiliation to use.
    :param plot: If True, displays the plot.
    :param save: If True, saves the plot to a file.
    :param path: The file path to save the plot. If empty and save is True, a default path will be used.
    :raises ImportError: If matplotlib is not installed.
    :raises ValueError: If affiliation is not one of the model's unique term affiliations.
    """
    try:
        import matplotlib.pyplot as plt
    except ImportError as err:
        # Chain the original error so the real import failure stays visible.
        raise ImportError(
            "matplotlib is required for plotting. Please install it."
        ) from err

    all_affiliations = self.get_unique_term_affiliations()
    if affiliation not in all_affiliations:
        raise ValueError(
            f"Affiliation '{affiliation}' not found in model. "
            f"Available affiliations are: {all_affiliations}"
        )

    affiliation_index = all_affiliations.index(affiliation)
    predictors_in_each_affiliation = (
        self.get_base_predictors_in_each_unique_term_affiliation()
    )
    predictor_indexes_used = predictors_in_each_affiliation[affiliation_index]

    shape = self.get_unique_term_affiliation_shape(affiliation)
    if shape.shape[0] == 0:
        print(f"No shape data available for affiliation '{affiliation}'.")
        return

    is_main_effect: bool = len(predictor_indexes_used) == 1
    is_two_way_interaction: bool = len(predictor_indexes_used) == 2
    if not (is_main_effect or is_two_way_interaction):
        # Bail out before any figure is created.
        print(
            f"Plotting for interaction level > 2 is not supported. Affiliation: {affiliation}"
        )
        return

    predictor_names = affiliation.split(" & ")

    fig = plt.figure() if is_main_effect else plt.figure(figsize=(8, 6))
    # try/finally guarantees the figure is released even when plotting,
    # saving or showing raises; otherwise it would stay registered in pyplot.
    try:
        if is_main_effect:
            # Sort by predictor value for a clean line plot
            sorted_indices = np.argsort(shape[:, 0])
            plt.plot(shape[sorted_indices, 0], shape[sorted_indices, 1])
            plt.xlabel(predictor_names[0])
            plt.ylabel("Contribution to linear predictor")
            plt.title(f"Main effect of {predictor_names[0]}")
            plt.grid(True)
        else:
            # Get unique coordinates and their inverse mapping
            y_unique, y_inv = np.unique(shape[:, 0], return_inverse=True)
            x_unique, x_inv = np.unique(shape[:, 1], return_inverse=True)

            # Create grid for sums and counts
            grid_sums = np.zeros((len(y_unique), len(x_unique)))
            grid_counts = np.zeros((len(y_unique), len(x_unique)))

            # Populate sums and counts to later calculate the mean
            np.add.at(grid_sums, (y_inv, x_inv), shape[:, 2])
            np.add.at(grid_counts, (y_inv, x_inv), 1)

            # Calculate mean, avoiding division by zero
            with np.errstate(divide="ignore", invalid="ignore"):
                pivot_table_values = np.true_divide(grid_sums, grid_counts)
            # Where there's no data, pivot_table_values will be nan, which is fine for imshow.

            plt.imshow(
                pivot_table_values,
                aspect="auto",
                origin="lower",
                extent=[
                    x_unique.min(),
                    x_unique.max(),
                    y_unique.min(),
                    y_unique.max(),
                ],
                cmap="Blues_r",
            )
            plt.colorbar(label="Contribution to the linear predictor")
            plt.xlabel(predictor_names[1])
            plt.ylabel(predictor_names[0])
            plt.title(
                f"Interaction between {predictor_names[0]} and {predictor_names[1]}"
            )

        if save:
            save_path = path or f"shape_of_{affiliation.replace(' & ', '_')}.png"
            plt.savefig(save_path)

        if plot:
            plt.show()
    finally:
        plt.close(fig)
|
|
553
|
+
|
|
554
|
+
def remove_provided_custom_functions(self):
|
|
555
|
+
self.APLRRegressor.remove_provided_custom_functions()
|
|
556
|
+
self.calculate_custom_validation_error_function = None
|
|
557
|
+
self.calculate_custom_loss_function = None
|
|
558
|
+
self.calculate_custom_negative_gradient_function = None
|
|
559
|
+
|
|
560
|
+
def clear_cv_results(self):
|
|
561
|
+
"""
|
|
562
|
+
Clears the stored cross-validation results (predictions, y, etc.) to free up memory.
|
|
563
|
+
"""
|
|
564
|
+
self.APLRRegressor.clear_cv_results()
|
|
203
565
|
|
|
204
566
|
# For sklearn
|
|
205
567
|
def get_params(self, deep=True):
|
|
@@ -210,7 +572,7 @@ class APLRRegressor:
|
|
|
210
572
|
"loss_function": self.loss_function,
|
|
211
573
|
"link_function": self.link_function,
|
|
212
574
|
"n_jobs": self.n_jobs,
|
|
213
|
-
"
|
|
575
|
+
"cv_folds": self.cv_folds,
|
|
214
576
|
"bins": self.bins,
|
|
215
577
|
"max_interaction_level": self.max_interaction_level,
|
|
216
578
|
"max_interactions": self.max_interactions,
|
|
@@ -226,7 +588,19 @@ class APLRRegressor:
|
|
|
226
588
|
"calculate_custom_negative_gradient_function": self.calculate_custom_negative_gradient_function,
|
|
227
589
|
"calculate_custom_transform_linear_predictor_to_predictions_function": self.calculate_custom_transform_linear_predictor_to_predictions_function,
|
|
228
590
|
"calculate_custom_differentiate_predictions_wrt_linear_predictor_function": self.calculate_custom_differentiate_predictions_wrt_linear_predictor_function,
|
|
229
|
-
"
|
|
591
|
+
"boosting_steps_before_interactions_are_allowed": self.boosting_steps_before_interactions_are_allowed,
|
|
592
|
+
"monotonic_constraints_ignore_interactions": self.monotonic_constraints_ignore_interactions,
|
|
593
|
+
"group_mse_by_prediction_bins": self.group_mse_by_prediction_bins,
|
|
594
|
+
"group_mse_cycle_min_obs_in_bin": self.group_mse_cycle_min_obs_in_bin,
|
|
595
|
+
"early_stopping_rounds": self.early_stopping_rounds,
|
|
596
|
+
"num_first_steps_with_linear_effects_only": self.num_first_steps_with_linear_effects_only,
|
|
597
|
+
"penalty_for_non_linearity": self.penalty_for_non_linearity,
|
|
598
|
+
"penalty_for_interactions": self.penalty_for_interactions,
|
|
599
|
+
"max_terms": self.max_terms,
|
|
600
|
+
"ridge_penalty": self.ridge_penalty,
|
|
601
|
+
"mean_bias_correction": self.mean_bias_correction,
|
|
602
|
+
"faster_convergence": self.faster_convergence,
|
|
603
|
+
"preprocess": self.preprocess,
|
|
230
604
|
}
|
|
231
605
|
|
|
232
606
|
# For sklearn
|
|
@@ -236,29 +610,44 @@ class APLRRegressor:
|
|
|
236
610
|
self.__set_params_cpp()
|
|
237
611
|
return self
|
|
238
612
|
|
|
613
|
+
def __setstate__(self, state):
    """Restore the instance attributes from a pickled state dictionary.

    States that do not contain a "preprocess" entry get it set to False
    before the attributes are restored. The parameters are then pushed to
    the underlying C++ object again.

    :param state: The attribute dictionary produced when the object was pickled.
    """
    # For backwards compatibility with older pickled models
    state.setdefault("preprocess", False)
    vars(self).update(state)
    self.__set_params_cpp()
|
|
619
|
+
|
|
239
620
|
|
|
240
621
|
class APLRClassifier:
|
|
241
622
|
def __init__(
|
|
242
623
|
self,
|
|
243
|
-
m: int =
|
|
244
|
-
v: float = 0.
|
|
624
|
+
m: int = 3000,
|
|
625
|
+
v: float = 0.5,
|
|
245
626
|
random_state: int = 0,
|
|
246
627
|
n_jobs: int = 0,
|
|
247
|
-
|
|
628
|
+
cv_folds: int = 5,
|
|
248
629
|
bins: int = 300,
|
|
249
630
|
verbosity: int = 0,
|
|
250
631
|
max_interaction_level: int = 1,
|
|
251
632
|
max_interactions: int = 100000,
|
|
252
|
-
min_observations_in_split: int =
|
|
253
|
-
ineligible_boosting_steps_added: int =
|
|
254
|
-
max_eligible_terms: int =
|
|
255
|
-
|
|
633
|
+
min_observations_in_split: int = 4,
|
|
634
|
+
ineligible_boosting_steps_added: int = 15,
|
|
635
|
+
max_eligible_terms: int = 7,
|
|
636
|
+
boosting_steps_before_interactions_are_allowed: int = 0,
|
|
637
|
+
monotonic_constraints_ignore_interactions: bool = False,
|
|
638
|
+
early_stopping_rounds: int = 200,
|
|
639
|
+
num_first_steps_with_linear_effects_only: int = 0,
|
|
640
|
+
penalty_for_non_linearity: float = 0.0,
|
|
641
|
+
penalty_for_interactions: float = 0.0,
|
|
642
|
+
max_terms: int = 0,
|
|
643
|
+
ridge_penalty: float = 0.0001,
|
|
644
|
+
preprocess: bool = True,
|
|
256
645
|
):
|
|
257
646
|
self.m = m
|
|
258
647
|
self.v = v
|
|
259
648
|
self.random_state = random_state
|
|
260
649
|
self.n_jobs = n_jobs
|
|
261
|
-
self.
|
|
650
|
+
self.cv_folds = cv_folds
|
|
262
651
|
self.bins = bins
|
|
263
652
|
self.verbosity = verbosity
|
|
264
653
|
self.max_interaction_level = max_interaction_level
|
|
@@ -266,9 +655,21 @@ class APLRClassifier:
|
|
|
266
655
|
self.min_observations_in_split = min_observations_in_split
|
|
267
656
|
self.ineligible_boosting_steps_added = ineligible_boosting_steps_added
|
|
268
657
|
self.max_eligible_terms = max_eligible_terms
|
|
269
|
-
self.
|
|
270
|
-
|
|
658
|
+
self.boosting_steps_before_interactions_are_allowed = (
|
|
659
|
+
boosting_steps_before_interactions_are_allowed
|
|
660
|
+
)
|
|
661
|
+
self.monotonic_constraints_ignore_interactions = (
|
|
662
|
+
monotonic_constraints_ignore_interactions
|
|
663
|
+
)
|
|
664
|
+
self.early_stopping_rounds = early_stopping_rounds
|
|
665
|
+
self.num_first_steps_with_linear_effects_only = (
|
|
666
|
+
num_first_steps_with_linear_effects_only
|
|
271
667
|
)
|
|
668
|
+
self.penalty_for_non_linearity = penalty_for_non_linearity
|
|
669
|
+
self.penalty_for_interactions = penalty_for_interactions
|
|
670
|
+
self.max_terms = max_terms
|
|
671
|
+
self.ridge_penalty = ridge_penalty
|
|
672
|
+
self.preprocess = preprocess
|
|
272
673
|
|
|
273
674
|
# Creating aplr_cpp and setting parameters
|
|
274
675
|
self.APLRClassifier = aplr_cpp.APLRClassifier()
|
|
@@ -280,7 +681,7 @@ class APLRClassifier:
|
|
|
280
681
|
self.APLRClassifier.v = self.v
|
|
281
682
|
self.APLRClassifier.random_state = self.random_state
|
|
282
683
|
self.APLRClassifier.n_jobs = self.n_jobs
|
|
283
|
-
self.APLRClassifier.
|
|
684
|
+
self.APLRClassifier.cv_folds = self.cv_folds
|
|
284
685
|
self.APLRClassifier.bins = self.bins
|
|
285
686
|
self.APLRClassifier.verbosity = self.verbosity
|
|
286
687
|
self.APLRClassifier.max_interaction_level = self.max_interaction_level
|
|
@@ -290,66 +691,146 @@ class APLRClassifier:
|
|
|
290
691
|
self.ineligible_boosting_steps_added
|
|
291
692
|
)
|
|
292
693
|
self.APLRClassifier.max_eligible_terms = self.max_eligible_terms
|
|
293
|
-
self.APLRClassifier.
|
|
294
|
-
self.
|
|
694
|
+
self.APLRClassifier.boosting_steps_before_interactions_are_allowed = (
|
|
695
|
+
self.boosting_steps_before_interactions_are_allowed
|
|
295
696
|
)
|
|
697
|
+
self.APLRClassifier.monotonic_constraints_ignore_interactions = (
|
|
698
|
+
self.monotonic_constraints_ignore_interactions
|
|
699
|
+
)
|
|
700
|
+
self.APLRClassifier.early_stopping_rounds = self.early_stopping_rounds
|
|
701
|
+
self.APLRClassifier.num_first_steps_with_linear_effects_only = (
|
|
702
|
+
self.num_first_steps_with_linear_effects_only
|
|
703
|
+
)
|
|
704
|
+
self.APLRClassifier.penalty_for_non_linearity = self.penalty_for_non_linearity
|
|
705
|
+
self.APLRClassifier.penalty_for_interactions = self.penalty_for_interactions
|
|
706
|
+
self.APLRClassifier.max_terms = self.max_terms
|
|
707
|
+
self.APLRClassifier.ridge_penalty = self.ridge_penalty
|
|
708
|
+
self.APLRClassifier.preprocess = self.preprocess
|
|
296
709
|
|
|
297
710
|
def fit(
|
|
298
711
|
self,
|
|
299
|
-
X:
|
|
300
|
-
y: List[str],
|
|
301
|
-
sample_weight:
|
|
712
|
+
X: Union[pd.DataFrame, FloatMatrix],
|
|
713
|
+
y: Union[FloatVector, List[str]],
|
|
714
|
+
sample_weight: FloatVector = np.empty(0),
|
|
302
715
|
X_names: List[str] = [],
|
|
303
|
-
|
|
716
|
+
cv_observations: IntMatrix = np.empty([0, 0]),
|
|
304
717
|
prioritized_predictors_indexes: List[int] = [],
|
|
305
718
|
monotonic_constraints: List[int] = [],
|
|
306
719
|
interaction_constraints: List[List[int]] = [],
|
|
720
|
+
predictor_learning_rates: List[float] = [],
|
|
721
|
+
predictor_penalties_for_non_linearity: List[float] = [],
|
|
722
|
+
predictor_penalties_for_interactions: List[float] = [],
|
|
723
|
+
predictor_min_observations_in_split: List[int] = [],
|
|
307
724
|
):
|
|
308
725
|
self.__set_params_cpp()
|
|
726
|
+
|
|
727
|
+
X = _prepare_input_data(X, self.preprocess)
|
|
728
|
+
|
|
729
|
+
if isinstance(y, np.ndarray):
|
|
730
|
+
y = y.astype(str).tolist()
|
|
731
|
+
elif isinstance(y, list) and y and not isinstance(y[0], str):
|
|
732
|
+
y = [str(val) for val in y]
|
|
733
|
+
|
|
309
734
|
self.APLRClassifier.fit(
|
|
310
735
|
X,
|
|
311
736
|
y,
|
|
312
737
|
sample_weight,
|
|
313
738
|
X_names,
|
|
314
|
-
|
|
739
|
+
cv_observations,
|
|
315
740
|
prioritized_predictors_indexes,
|
|
316
741
|
monotonic_constraints,
|
|
317
742
|
interaction_constraints,
|
|
743
|
+
predictor_learning_rates,
|
|
744
|
+
predictor_penalties_for_non_linearity,
|
|
745
|
+
predictor_penalties_for_interactions,
|
|
746
|
+
predictor_min_observations_in_split,
|
|
318
747
|
)
|
|
748
|
+
# For sklearn
|
|
749
|
+
self.classes_ = np.arange(len(self.APLRClassifier.get_categories()))
|
|
319
750
|
|
|
320
751
|
def predict_class_probabilities(
|
|
321
|
-
self,
|
|
322
|
-
|
|
752
|
+
self,
|
|
753
|
+
X: Union[pd.DataFrame, FloatMatrix],
|
|
754
|
+
cap_predictions_to_minmax_in_training: bool = False,
|
|
755
|
+
) -> FloatMatrix:
|
|
756
|
+
X = _prepare_input_data(X, self.preprocess)
|
|
757
|
+
|
|
323
758
|
return self.APLRClassifier.predict_class_probabilities(
|
|
324
759
|
X, cap_predictions_to_minmax_in_training
|
|
325
760
|
)
|
|
326
761
|
|
|
327
762
|
def predict(
|
|
328
|
-
self,
|
|
763
|
+
self,
|
|
764
|
+
X: Union[pd.DataFrame, FloatMatrix],
|
|
765
|
+
cap_predictions_to_minmax_in_training: bool = False,
|
|
329
766
|
) -> List[str]:
|
|
767
|
+
X = _prepare_input_data(X, self.preprocess)
|
|
768
|
+
|
|
330
769
|
return self.APLRClassifier.predict(X, cap_predictions_to_minmax_in_training)
|
|
331
770
|
|
|
332
|
-
def
|
|
333
|
-
|
|
771
|
+
def calculate_local_feature_contribution(
|
|
772
|
+
self, X: Union[pd.DataFrame, FloatMatrix]
|
|
773
|
+
) -> FloatMatrix:
|
|
774
|
+
X = _prepare_input_data(X, self.preprocess)
|
|
775
|
+
return self.APLRClassifier.calculate_local_feature_contribution(X)
|
|
334
776
|
|
|
335
777
|
def get_categories(self) -> List[str]:
|
|
336
778
|
return self.APLRClassifier.get_categories()
|
|
337
779
|
|
|
338
780
|
def get_logit_model(self, category: str) -> APLRRegressor:
|
|
339
|
-
|
|
781
|
+
logit_model_cpp = self.APLRClassifier.get_logit_model(category)
|
|
782
|
+
|
|
783
|
+
logit_model_py = APLRRegressor(
|
|
784
|
+
m=self.m,
|
|
785
|
+
v=self.v,
|
|
786
|
+
random_state=self.random_state,
|
|
787
|
+
loss_function="binomial",
|
|
788
|
+
link_function="logit",
|
|
789
|
+
n_jobs=self.n_jobs,
|
|
790
|
+
cv_folds=self.cv_folds,
|
|
791
|
+
bins=self.bins,
|
|
792
|
+
max_interaction_level=self.max_interaction_level,
|
|
793
|
+
max_interactions=self.max_interactions,
|
|
794
|
+
min_observations_in_split=self.min_observations_in_split,
|
|
795
|
+
ineligible_boosting_steps_added=self.ineligible_boosting_steps_added,
|
|
796
|
+
max_eligible_terms=self.max_eligible_terms,
|
|
797
|
+
verbosity=self.verbosity,
|
|
798
|
+
boosting_steps_before_interactions_are_allowed=self.boosting_steps_before_interactions_are_allowed,
|
|
799
|
+
monotonic_constraints_ignore_interactions=self.monotonic_constraints_ignore_interactions,
|
|
800
|
+
early_stopping_rounds=self.early_stopping_rounds,
|
|
801
|
+
num_first_steps_with_linear_effects_only=self.num_first_steps_with_linear_effects_only,
|
|
802
|
+
penalty_for_non_linearity=self.penalty_for_non_linearity,
|
|
803
|
+
penalty_for_interactions=self.penalty_for_interactions,
|
|
804
|
+
max_terms=self.max_terms,
|
|
805
|
+
ridge_penalty=self.ridge_penalty,
|
|
806
|
+
preprocess=self.preprocess,
|
|
807
|
+
)
|
|
808
|
+
|
|
809
|
+
logit_model_py.APLRRegressor = logit_model_cpp
|
|
340
810
|
|
|
341
|
-
|
|
342
|
-
return self.APLRClassifier.get_validation_indexes()
|
|
811
|
+
return logit_model_py
|
|
343
812
|
|
|
344
|
-
def get_validation_error_steps(self) ->
|
|
813
|
+
def get_validation_error_steps(self) -> FloatMatrix:
|
|
345
814
|
return self.APLRClassifier.get_validation_error_steps()
|
|
346
815
|
|
|
347
|
-
def
|
|
348
|
-
return self.APLRClassifier.
|
|
816
|
+
def get_cv_error(self) -> float:
|
|
817
|
+
return self.APLRClassifier.get_cv_error()
|
|
349
818
|
|
|
350
|
-
def get_feature_importance(self) ->
|
|
819
|
+
def get_feature_importance(self) -> FloatVector:
|
|
351
820
|
return self.APLRClassifier.get_feature_importance()
|
|
352
821
|
|
|
822
|
+
def get_unique_term_affiliations(self) -> List[str]:
|
|
823
|
+
return self.APLRClassifier.get_unique_term_affiliations()
|
|
824
|
+
|
|
825
|
+
def get_base_predictors_in_each_unique_term_affiliation(self) -> List[List[int]]:
|
|
826
|
+
return self.APLRClassifier.get_base_predictors_in_each_unique_term_affiliation()
|
|
827
|
+
|
|
828
|
+
def clear_cv_results(self):
|
|
829
|
+
"""
|
|
830
|
+
Clears the stored cross-validation results from all underlying logit models to free up memory.
|
|
831
|
+
"""
|
|
832
|
+
self.APLRClassifier.clear_cv_results()
|
|
833
|
+
|
|
353
834
|
# For sklearn
|
|
354
835
|
def get_params(self, deep=True):
|
|
355
836
|
return {
|
|
@@ -357,7 +838,7 @@ class APLRClassifier:
|
|
|
357
838
|
"v": self.v,
|
|
358
839
|
"random_state": self.random_state,
|
|
359
840
|
"n_jobs": self.n_jobs,
|
|
360
|
-
"
|
|
841
|
+
"cv_folds": self.cv_folds,
|
|
361
842
|
"bins": self.bins,
|
|
362
843
|
"verbosity": self.verbosity,
|
|
363
844
|
"max_interaction_level": self.max_interaction_level,
|
|
@@ -365,7 +846,15 @@ class APLRClassifier:
|
|
|
365
846
|
"min_observations_in_split": self.min_observations_in_split,
|
|
366
847
|
"ineligible_boosting_steps_added": self.ineligible_boosting_steps_added,
|
|
367
848
|
"max_eligible_terms": self.max_eligible_terms,
|
|
368
|
-
"
|
|
849
|
+
"boosting_steps_before_interactions_are_allowed": self.boosting_steps_before_interactions_are_allowed,
|
|
850
|
+
"monotonic_constraints_ignore_interactions": self.monotonic_constraints_ignore_interactions,
|
|
851
|
+
"early_stopping_rounds": self.early_stopping_rounds,
|
|
852
|
+
"num_first_steps_with_linear_effects_only": self.num_first_steps_with_linear_effects_only,
|
|
853
|
+
"penalty_for_non_linearity": self.penalty_for_non_linearity,
|
|
854
|
+
"penalty_for_interactions": self.penalty_for_interactions,
|
|
855
|
+
"max_terms": self.max_terms,
|
|
856
|
+
"ridge_penalty": self.ridge_penalty,
|
|
857
|
+
"preprocess": self.preprocess,
|
|
369
858
|
}
|
|
370
859
|
|
|
371
860
|
# For sklearn
|
|
@@ -374,3 +863,79 @@ class APLRClassifier:
|
|
|
374
863
|
setattr(self, parameter, value)
|
|
375
864
|
self.__set_params_cpp()
|
|
376
865
|
return self
|
|
866
|
+
|
|
867
|
+
# For sklearn
|
|
868
|
+
def predict_proba(self, X: FloatMatrix) -> FloatMatrix:
|
|
869
|
+
return self.predict_class_probabilities(X)
|
|
870
|
+
|
|
871
|
+
def __setstate__(self, state):
|
|
872
|
+
# For backwards compatibility with older pickled models
|
|
873
|
+
if "preprocess" not in state:
|
|
874
|
+
state["preprocess"] = False
|
|
875
|
+
self.__dict__.update(state)
|
|
876
|
+
self.__set_params_cpp()
|
|
877
|
+
|
|
878
|
+
|
|
879
|
+
class APLRTuner:
|
|
880
|
+
def __init__(
|
|
881
|
+
self,
|
|
882
|
+
parameters: Union[Dict[str, List[float]], List[Dict[str, List[float]]]] = {
|
|
883
|
+
"max_interaction_level": [0, 1],
|
|
884
|
+
"min_observations_in_split": [4, 10, 20, 100, 500, 1000],
|
|
885
|
+
},
|
|
886
|
+
is_regressor: bool = True,
|
|
887
|
+
):
|
|
888
|
+
self.parameters = parameters
|
|
889
|
+
self.is_regressor = is_regressor
|
|
890
|
+
self.parameter_grid = self._create_parameter_grid()
|
|
891
|
+
|
|
892
|
+
def _create_parameter_grid(self) -> List[Dict[str, float]]:
|
|
893
|
+
items = sorted(self.parameters.items())
|
|
894
|
+
keys, values = zip(*items)
|
|
895
|
+
combinations = list(itertools.product(*values))
|
|
896
|
+
grid = [dict(zip(keys, combination)) for combination in combinations]
|
|
897
|
+
return grid
|
|
898
|
+
|
|
899
|
+
def fit(self, X: Union[pd.DataFrame, FloatMatrix], y: FloatVector, **kwargs):
|
|
900
|
+
self.cv_results: List[Dict[str, float]] = []
|
|
901
|
+
best_validation_result = np.inf
|
|
902
|
+
for params in self.parameter_grid:
|
|
903
|
+
if self.is_regressor:
|
|
904
|
+
model = APLRRegressor(**params)
|
|
905
|
+
else:
|
|
906
|
+
model = APLRClassifier(**params)
|
|
907
|
+
model.fit(X, y, **kwargs)
|
|
908
|
+
cv_error_for_this_model = model.get_cv_error()
|
|
909
|
+
cv_results_for_this_model = model.get_params()
|
|
910
|
+
cv_results_for_this_model["cv_error"] = cv_error_for_this_model
|
|
911
|
+
self.cv_results.append(cv_results_for_this_model)
|
|
912
|
+
if cv_error_for_this_model < best_validation_result:
|
|
913
|
+
best_validation_result = cv_error_for_this_model
|
|
914
|
+
self.best_model = model
|
|
915
|
+
self.cv_results = sorted(self.cv_results, key=lambda x: x["cv_error"])
|
|
916
|
+
|
|
917
|
+
def predict(
|
|
918
|
+
self, X: Union[pd.DataFrame, FloatMatrix], **kwargs
|
|
919
|
+
) -> Union[FloatVector, List[str]]:
|
|
920
|
+
return self.best_model.predict(X, **kwargs)
|
|
921
|
+
|
|
922
|
+
def predict_class_probabilities(
|
|
923
|
+
self, X: Union[pd.DataFrame, FloatMatrix], **kwargs
|
|
924
|
+
) -> FloatMatrix:
|
|
925
|
+
if self.is_regressor == False:
|
|
926
|
+
return self.best_model.predict_class_probabilities(X, **kwargs)
|
|
927
|
+
else:
|
|
928
|
+
raise TypeError(
|
|
929
|
+
"predict_class_probabilities is only possible when is_regressor is False"
|
|
930
|
+
)
|
|
931
|
+
|
|
932
|
+
def predict_proba(
|
|
933
|
+
self, X: Union[pd.DataFrame, FloatMatrix], **kwargs
|
|
934
|
+
) -> FloatMatrix:
|
|
935
|
+
return self.predict_class_probabilities(X, **kwargs)
|
|
936
|
+
|
|
937
|
+
def get_best_estimator(self) -> Union[APLRClassifier, APLRRegressor]:
|
|
938
|
+
return self.best_model
|
|
939
|
+
|
|
940
|
+
def get_cv_results(self) -> List[Dict[str, float]]:
|
|
941
|
+
return self.cv_results
|
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: aplr
|
|
3
|
+
Version: 10.20.0
|
|
4
|
+
Summary: Automatic Piecewise Linear Regression
|
|
5
|
+
Home-page: https://github.com/ottenbreit-data-science/aplr
|
|
6
|
+
Author: Mathias von Ottenbreit
|
|
7
|
+
Author-email: ottenbreitdatascience@gmail.com
|
|
8
|
+
License: MIT
|
|
9
|
+
Platform: Windows
|
|
10
|
+
Platform: Linux
|
|
11
|
+
Platform: MacOS
|
|
12
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
13
|
+
Requires-Python: >=3.8
|
|
14
|
+
Description-Content-Type: text/markdown
|
|
15
|
+
License-File: LICENSE
|
|
16
|
+
Requires-Dist: numpy>=1.11
|
|
17
|
+
Requires-Dist: pandas>=1.0.0
|
|
18
|
+
Provides-Extra: plots
|
|
19
|
+
Requires-Dist: matplotlib>=3.0; extra == "plots"
|
|
20
|
+
Dynamic: author
|
|
21
|
+
Dynamic: author-email
|
|
22
|
+
Dynamic: classifier
|
|
23
|
+
Dynamic: description
|
|
24
|
+
Dynamic: description-content-type
|
|
25
|
+
Dynamic: home-page
|
|
26
|
+
Dynamic: license
|
|
27
|
+
Dynamic: license-file
|
|
28
|
+
Dynamic: platform
|
|
29
|
+
Dynamic: provides-extra
|
|
30
|
+
Dynamic: requires-dist
|
|
31
|
+
Dynamic: requires-python
|
|
32
|
+
Dynamic: summary
|
|
33
|
+
|
|
34
|
+
The documentation for Automatic Piecewise Linear Regression is available at [https://github.com/ottenbreit-data-science/aplr](https://github.com/ottenbreit-data-science/aplr).
|
|
@@ -0,0 +1,8 @@
|
|
|
1
|
+
aplr_cpp.cp311-win_amd64.pyd,sha256=ecU1pYcAJ3K_B_pcYJS06DiNSMpCCovciVQWGpbWiwA,825856
|
|
2
|
+
aplr/__init__.py,sha256=oDFSgVytP_qQ8ilun6oHxKr-DYEeqjEQp5FciX45lls,21
|
|
3
|
+
aplr/aplr.py,sha256=I_LyS_uH9wmW7wE8uP6nvPhfCeeA1cQXIuLPqwT21OE,40125
|
|
4
|
+
aplr-10.20.0.dist-info/licenses/LICENSE,sha256=YOMo-RaL4P7edMZGD96-NskKpxyMZdP3-WiiMMmihNk,1134
|
|
5
|
+
aplr-10.20.0.dist-info/METADATA,sha256=sRYG3XKunuA5qB-cm-6XBqUd4XRIgK1tVriXV3PX_5Y,1048
|
|
6
|
+
aplr-10.20.0.dist-info/WHEEL,sha256=JLOMsP7F5qtkAkINx5UnzbFguf8CqZeraV8o04b0I8I,101
|
|
7
|
+
aplr-10.20.0.dist-info/top_level.txt,sha256=DXVC0RIFGpzVnPeKWAZTXQdJheOEZL51Wip6Fx7zbR4,14
|
|
8
|
+
aplr-10.20.0.dist-info/RECORD,,
|
aplr_cpp.cp311-win_amd64.pyd
CHANGED
|
Binary file
|
aplr-7.0.0.dist-info/METADATA
DELETED
|
@@ -1,17 +0,0 @@
|
|
|
1
|
-
Metadata-Version: 2.1
|
|
2
|
-
Name: aplr
|
|
3
|
-
Version: 7.0.0
|
|
4
|
-
Summary: Automatic Piecewise Linear Regression
|
|
5
|
-
Home-page: https://github.com/ottenbreit-data-science/aplr
|
|
6
|
-
Author: Mathias von Ottenbreit
|
|
7
|
-
Author-email: ottenbreitdatascience@gmail.com
|
|
8
|
-
License: MIT
|
|
9
|
-
Platform: Windows
|
|
10
|
-
Platform: Linux
|
|
11
|
-
Classifier: License :: OSI Approved :: MIT License
|
|
12
|
-
Requires-Python: >=3.8
|
|
13
|
-
Description-Content-Type: text/markdown
|
|
14
|
-
License-File: LICENSE
|
|
15
|
-
Requires-Dist: numpy (>=1.20)
|
|
16
|
-
|
|
17
|
-
Build predictive and interpretable parametric regression or classification machine learning models in Python based on the Automatic Piecewise Linear Regression methodology developed by Mathias von Ottenbreit.
|
aplr-7.0.0.dist-info/RECORD
DELETED
|
@@ -1,8 +0,0 @@
|
|
|
1
|
-
aplr_cpp.cp311-win_amd64.pyd,sha256=pOvJghR3IOWSQcVzvvtAgQnlKIG9HMCerOMrwhuuACE,471040
|
|
2
|
-
aplr/__init__.py,sha256=oDFSgVytP_qQ8ilun6oHxKr-DYEeqjEQp5FciX45lls,21
|
|
3
|
-
aplr/aplr.py,sha256=0gVeqsL1WyXHZpIDqn6rXoT0tuv76nHZVs6LCzOxNEM,16012
|
|
4
|
-
aplr-7.0.0.dist-info/LICENSE,sha256=YOMo-RaL4P7edMZGD96-NskKpxyMZdP3-WiiMMmihNk,1134
|
|
5
|
-
aplr-7.0.0.dist-info/METADATA,sha256=KwO6mV-2SWzBGp3lXXT-UQBhgpv99T1NVwI6aBeoSoc,671
|
|
6
|
-
aplr-7.0.0.dist-info/WHEEL,sha256=9wvhO-5NhjjD8YmmxAvXTPQXMDOZ50W5vklzeoqFtkM,102
|
|
7
|
-
aplr-7.0.0.dist-info/top_level.txt,sha256=DXVC0RIFGpzVnPeKWAZTXQdJheOEZL51Wip6Fx7zbR4,14
|
|
8
|
-
aplr-7.0.0.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|