aplr 10.18.0__cp38-cp38-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of aplr might be problematic. Click here for more details.

aplr/__init__.py ADDED
@@ -0,0 +1 @@
1
+ from .aplr import *
aplr/aplr.py ADDED
@@ -0,0 +1,959 @@
1
+ from typing import List, Callable, Optional, Dict, Union
2
+ import numpy as np
3
+ import pandas as pd
4
+ import aplr_cpp
5
+ import itertools
6
+
7
# Readability aliases used in annotations; all four names are bound to the
# same numpy array type.
FloatVector = FloatMatrix = IntVector = IntMatrix = np.ndarray
11
+
12
+
13
class BaseAPLR:
    """Input preprocessing shared by the APLR estimators.

    Fitting records the column layout, one-hot encodes ``category``/``object``
    columns, adds a ``<col>_missing`` indicator for every column that contains
    missing values and imputes missing values with (optionally weighted)
    medians. Prediction replays the transformation stored during fitting.
    """

    def _validate_X_fit_rows(self, X):
        """Raise ValueError if an ndarray/DataFrame X has fewer than 2 rows.

        Other input types are not checked here.
        """
        if (isinstance(X, np.ndarray) and X.shape[0] < 2) or (
            isinstance(X, pd.DataFrame) and len(X) < 2
        ):
            raise ValueError("Input X must have at least 2 rows to be fitted.")

    def _common_X_preprocessing(self, X, is_fitting: bool, X_names=None):
        """Apply the preprocessing steps shared by fit and predict.

        :param X: A pandas DataFrame, or anything ``np.array`` can convert to
            float64.
        :param is_fitting: If True, the column names, categorical columns,
            one-hot columns and columns with missing values are (re)recorded
            on ``self``; otherwise the recorded ones are applied.
        :param X_names: Optional column names, used only when fitting on
            non-DataFrame input.
        :return: A DataFrame. Missing values are filled only when predicting;
            when fitting the caller computes the medians and imputes.
        :raises TypeError: If non-DataFrame input cannot be converted to floats.
        :raises ValueError: If the columns at prediction time do not match the
            columns seen during fitting.
        """
        if not isinstance(X, pd.DataFrame):
            try:
                X_numeric = np.array(X, dtype=np.float64)
            except (ValueError, TypeError) as e:
                raise TypeError(
                    "Input X must be numeric if not a pandas DataFrame."
                ) from e
            X = pd.DataFrame(X_numeric)
            if is_fitting:
                # Explicit length check: truth-testing a numpy array of names
                # would raise "truth value of an array is ambiguous".
                if X_names is not None and len(X_names) > 0:
                    X.columns = list(X_names)
                else:
                    X.columns = [f"X{i}" for i in range(X.shape[1])]
            elif hasattr(self, "X_names_") and len(self.X_names_) > 0:
                if len(self.X_names_) != X.shape[1]:
                    # Same error type as the DataFrame branch below, instead
                    # of an opaque KeyError further down.
                    raise ValueError(
                        "Input columns for prediction do not match training columns."
                    )
                X.columns = self.X_names_
        else:
            X = X.copy()  # Always copy to avoid modifying the caller's frame
            if not is_fitting and hasattr(self, "X_names_"):
                # Compare against the training columns (before one-hot encoding)
                if set(X.columns) != set(self.X_names_):
                    raise ValueError(
                        "Input columns for prediction do not match training columns."
                    )
                X = X[self.X_names_]  # Restore the training column order

        if is_fitting:
            self.X_names_ = list(X.columns)
            self.categorical_features_ = list(
                X.select_dtypes(include=["category", "object"]).columns
            )

        if self.categorical_features_:
            X = pd.get_dummies(X, columns=self.categorical_features_, dummy_na=False)
            if is_fitting:
                self.ohe_columns_ = list(X.columns)
            else:
                # Add dummy columns absent from this batch as zeros, drop
                # unseen categories and enforce the training column order.
                X = X.reindex(columns=self.ohe_columns_, fill_value=0)

        if is_fitting:
            self.na_imputed_cols_ = [col for col in X.columns if X[col].isnull().any()]

        for col in self.na_imputed_cols_:
            X[col + "_missing"] = X[col].isnull().astype(int)

        if not is_fitting:
            for col, median_val in self.median_values_.items():
                if col in X.columns:
                    X[col] = X[col].fillna(median_val)

        return X

    @staticmethod
    def _weighted_median(values, weights):
        """Return the first sorted value whose cumulative weight reaches half the total."""
        order = np.argsort(values, kind="stable")
        sorted_values = values[order]
        cumulative_weights = np.cumsum(weights[order])
        index = np.searchsorted(cumulative_weights, cumulative_weights[-1] / 2.0)
        # Guard against an index one past the end.
        index = min(index, len(sorted_values) - 1)
        return sorted_values[index]

    def _preprocess_X_fit(self, X, X_names, sample_weight):
        """Validate sample_weight, preprocess X for fitting and impute medians.

        :param X: Training data (DataFrame or numeric array-like).
        :param X_names: Column names for non-DataFrame input (may be empty).
        :param sample_weight: 1D weights, or an empty array for no weighting.
        :return: Tuple of (float64 ndarray, list of final column names).
        :raises ValueError: If sample_weight is not 1D, has the wrong length,
            or contains nan, infinite or negative values.
        """
        # Work on a plain ndarray so that masking and reordering below are
        # positional even if a pandas object was passed.
        sample_weight = np.asarray(sample_weight)
        if sample_weight.size > 0:
            if sample_weight.ndim != 1:
                raise ValueError("sample_weight must be a 1D array.")
            if len(sample_weight) != X.shape[0]:
                raise ValueError(
                    "sample_weight must have the same number of rows as X."
                )
            if np.any(np.isnan(sample_weight)) or np.any(np.isinf(sample_weight)):
                raise ValueError("sample_weight cannot contain nan or infinite values.")
            if np.any(sample_weight < 0):
                raise ValueError("sample_weight cannot contain negative values.")

        X = self._common_X_preprocessing(X, is_fitting=True, X_names=X_names)
        self.median_values_ = {}

        # Skip only the indicator columns generated above. A substring test on
        # "_missing" would also skip user columns whose name merely contains
        # it, leaving their NaNs un-imputed.
        indicator_cols = {col + "_missing" for col in self.na_imputed_cols_}
        value_cols = [col for col in X.columns if col not in indicator_cols]

        for col in value_cols:
            if sample_weight.size > 0:
                present = X[col].notna().to_numpy()
                values = X[col].to_numpy()[present]
                if values.size == 0:
                    median_val = 0
                else:
                    median_val = self._weighted_median(values, sample_weight[present])
            else:
                median_val = X[col].median()

            if pd.isna(median_val):
                median_val = 0

            self.median_values_[col] = median_val
            X[col] = X[col].fillna(median_val)

        self.final_training_columns_ = list(X.columns)
        return X.values.astype(np.float64), list(X.columns)

    def _preprocess_X_predict(self, X):
        """Preprocess X for prediction and return it as a float64 ndarray.

        If a fit has recorded ``final_training_columns_``, the result is
        restricted to, and ordered by, those columns.
        """
        X = self._common_X_preprocessing(X, is_fitting=False)

        if hasattr(self, "final_training_columns_"):
            X = X[self.final_training_columns_]

        return X.values.astype(np.float64)
138
+
139
+
140
class APLRRegressor(BaseAPLR):
    """Python wrapper around ``aplr_cpp.APLRRegressor``.

    Hyperparameters are stored as plain attributes and copied onto the wrapped
    C++ object before fitting. Inputs are preprocessed by ``BaseAPLR`` before
    being handed to the C++ object.
    """

    # Hyperparameters copied onto the C++ object and reported by get_params().
    # Kept in one place so the two cannot drift apart.
    _PARAM_NAMES = (
        "m",
        "v",
        "random_state",
        "loss_function",
        "link_function",
        "n_jobs",
        "cv_folds",
        "bins",
        "max_interaction_level",
        "max_interactions",
        "min_observations_in_split",
        "ineligible_boosting_steps_added",
        "max_eligible_terms",
        "verbosity",
        "dispersion_parameter",
        "validation_tuning_metric",
        "quantile",
        "calculate_custom_validation_error_function",
        "calculate_custom_loss_function",
        "calculate_custom_negative_gradient_function",
        "calculate_custom_transform_linear_predictor_to_predictions_function",
        "calculate_custom_differentiate_predictions_wrt_linear_predictor_function",
        "boosting_steps_before_interactions_are_allowed",
        "monotonic_constraints_ignore_interactions",
        "group_mse_by_prediction_bins",
        "group_mse_cycle_min_obs_in_bin",
        "early_stopping_rounds",
        "num_first_steps_with_linear_effects_only",
        "penalty_for_non_linearity",
        "penalty_for_interactions",
        "max_terms",
        "ridge_penalty",
        "mean_bias_correction",
        "faster_convergence",
    )

    def __init__(
        self,
        m: int = 3000,
        v: float = 0.5,
        random_state: int = 0,
        loss_function: str = "mse",
        link_function: str = "identity",
        n_jobs: int = 0,
        cv_folds: int = 5,
        bins: int = 300,
        max_interaction_level: int = 1,
        max_interactions: int = 100000,
        min_observations_in_split: int = 4,
        ineligible_boosting_steps_added: int = 15,
        max_eligible_terms: int = 7,
        verbosity: int = 0,
        dispersion_parameter: float = 1.5,
        validation_tuning_metric: str = "default",
        quantile: float = 0.5,
        calculate_custom_validation_error_function: Optional[
            Callable[
                [
                    FloatVector,
                    FloatVector,
                    FloatVector,
                    FloatVector,
                    FloatMatrix,
                ],
                float,
            ]
        ] = None,
        calculate_custom_loss_function: Optional[
            Callable[
                [
                    FloatVector,
                    FloatVector,
                    FloatVector,
                    FloatVector,
                    FloatMatrix,
                ],
                float,
            ]
        ] = None,
        calculate_custom_negative_gradient_function: Optional[
            Callable[
                [FloatVector, FloatVector, FloatVector, FloatMatrix],
                FloatVector,
            ]
        ] = None,
        calculate_custom_transform_linear_predictor_to_predictions_function: Optional[
            Callable[[FloatVector], FloatVector]
        ] = None,
        calculate_custom_differentiate_predictions_wrt_linear_predictor_function: Optional[
            Callable[[FloatVector], FloatVector]
        ] = None,
        boosting_steps_before_interactions_are_allowed: int = 0,
        monotonic_constraints_ignore_interactions: bool = False,
        group_mse_by_prediction_bins: int = 10,
        group_mse_cycle_min_obs_in_bin: int = 30,
        early_stopping_rounds: int = 200,
        num_first_steps_with_linear_effects_only: int = 0,
        penalty_for_non_linearity: float = 0.0,
        penalty_for_interactions: float = 0.0,
        max_terms: int = 0,
        ridge_penalty: float = 0.0001,
        mean_bias_correction: bool = False,
        faster_convergence: bool = False,
    ):
        """Store the hyperparameters and create the wrapped C++ regressor."""
        self.m = m
        self.v = v
        self.random_state = random_state
        self.loss_function = loss_function
        self.link_function = link_function
        self.n_jobs = n_jobs
        self.cv_folds = cv_folds
        self.bins = bins
        self.max_interaction_level = max_interaction_level
        self.max_interactions = max_interactions
        self.min_observations_in_split = min_observations_in_split
        self.ineligible_boosting_steps_added = ineligible_boosting_steps_added
        self.max_eligible_terms = max_eligible_terms
        self.verbosity = verbosity
        self.dispersion_parameter = dispersion_parameter
        self.validation_tuning_metric = validation_tuning_metric
        self.quantile = quantile
        self.calculate_custom_validation_error_function = (
            calculate_custom_validation_error_function
        )
        self.calculate_custom_loss_function = calculate_custom_loss_function
        self.calculate_custom_negative_gradient_function = (
            calculate_custom_negative_gradient_function
        )
        self.calculate_custom_transform_linear_predictor_to_predictions_function = (
            calculate_custom_transform_linear_predictor_to_predictions_function
        )
        self.calculate_custom_differentiate_predictions_wrt_linear_predictor_function = (
            calculate_custom_differentiate_predictions_wrt_linear_predictor_function
        )
        self.boosting_steps_before_interactions_are_allowed = (
            boosting_steps_before_interactions_are_allowed
        )
        self.monotonic_constraints_ignore_interactions = (
            monotonic_constraints_ignore_interactions
        )
        self.group_mse_by_prediction_bins = group_mse_by_prediction_bins
        self.group_mse_cycle_min_obs_in_bin = group_mse_cycle_min_obs_in_bin
        self.early_stopping_rounds = early_stopping_rounds
        self.num_first_steps_with_linear_effects_only = (
            num_first_steps_with_linear_effects_only
        )
        self.penalty_for_non_linearity = penalty_for_non_linearity
        self.penalty_for_interactions = penalty_for_interactions
        self.max_terms = max_terms
        self.ridge_penalty = ridge_penalty
        self.mean_bias_correction = mean_bias_correction
        self.faster_convergence = faster_convergence

        # Data transformations
        self.median_values_ = {}
        self.categorical_features_ = []
        self.ohe_columns_ = []
        self.na_imputed_cols_ = []
        self.X_names_ = []

        # Creating aplr_cpp and setting parameters
        self.APLRRegressor = aplr_cpp.APLRRegressor()
        self.__set_params_cpp()

    def __set_params_cpp(self):
        """Copy every hyperparameter attribute onto the wrapped C++ object."""
        for name in self._PARAM_NAMES:
            setattr(self.APLRRegressor, name, getattr(self, name))

    def fit(
        self,
        X: Union[pd.DataFrame, FloatMatrix],
        y: FloatVector,
        sample_weight: FloatVector = np.empty(0),
        X_names: List[str] = [],
        cv_observations: IntMatrix = np.empty([0, 0]),
        prioritized_predictors_indexes: List[int] = [],
        monotonic_constraints: List[int] = [],
        group: FloatVector = np.empty(0),
        interaction_constraints: List[List[int]] = [],
        other_data: FloatMatrix = np.empty([0, 0]),
        predictor_learning_rates: List[float] = [],
        predictor_penalties_for_non_linearity: List[float] = [],
        predictor_penalties_for_interactions: List[float] = [],
        predictor_min_observations_in_split: List[int] = [],
    ):
        """Preprocess X and fit the wrapped C++ regressor.

        The current hyperparameter attributes are copied to the C++ object
        first, so attribute changes made after construction take effect.
        The default arguments are never mutated here; they are passed through
        to the C++ ``fit``.

        :return: ``self``, so that calls can be chained as scikit-learn expects.
        :raises ValueError: If X has fewer than 2 rows or sample_weight is invalid.
        """
        self._validate_X_fit_rows(X)
        self.__set_params_cpp()
        X_transformed, X_names_transformed = self._preprocess_X_fit(
            X, X_names, sample_weight
        )
        self.APLRRegressor.fit(
            X_transformed,
            y,
            sample_weight,
            X_names_transformed,
            cv_observations,
            prioritized_predictors_indexes,
            monotonic_constraints,
            group,
            interaction_constraints,
            other_data,
            predictor_learning_rates,
            predictor_penalties_for_non_linearity,
            predictor_penalties_for_interactions,
            predictor_min_observations_in_split,
        )
        return self

    def predict(
        self,
        X: Union[pd.DataFrame, FloatMatrix],
        cap_predictions_to_minmax_in_training: bool = True,
    ) -> FloatVector:
        """Preprocess X and return the C++ object's predictions."""
        if self.link_function == "custom_function":
            # Re-attach the Python transform function to the C++ object
            # before predicting with a custom link function.
            self.APLRRegressor.calculate_custom_transform_linear_predictor_to_predictions_function = (
                self.calculate_custom_transform_linear_predictor_to_predictions_function
            )
        X_transformed = self._preprocess_X_predict(X)
        return self.APLRRegressor.predict(
            X_transformed, cap_predictions_to_minmax_in_training
        )

    def set_term_names(self, X_names: List[str]):
        """Forward X_names to the C++ object's ``set_term_names``."""
        self.APLRRegressor.set_term_names(X_names)

    def calculate_feature_importance(
        self,
        X: Union[pd.DataFrame, FloatMatrix],
        sample_weight: FloatVector = np.empty(0),
    ) -> FloatVector:
        """Preprocess X and delegate to the C++ ``calculate_feature_importance``."""
        X_transformed = self._preprocess_X_predict(X)
        return self.APLRRegressor.calculate_feature_importance(
            X_transformed, sample_weight
        )

    def calculate_term_importance(
        self,
        X: Union[pd.DataFrame, FloatMatrix],
        sample_weight: FloatVector = np.empty(0),
    ) -> FloatVector:
        """Preprocess X and delegate to the C++ ``calculate_term_importance``."""
        X_transformed = self._preprocess_X_predict(X)
        return self.APLRRegressor.calculate_term_importance(
            X_transformed, sample_weight
        )

    def calculate_local_feature_contribution(
        self, X: Union[pd.DataFrame, FloatMatrix]
    ) -> FloatMatrix:
        """Preprocess X and delegate to the C++ ``calculate_local_feature_contribution``."""
        X_transformed = self._preprocess_X_predict(X)
        return self.APLRRegressor.calculate_local_feature_contribution(X_transformed)

    def calculate_local_term_contribution(
        self, X: Union[pd.DataFrame, FloatMatrix]
    ) -> FloatMatrix:
        """Preprocess X and delegate to the C++ ``calculate_local_term_contribution``."""
        X_transformed = self._preprocess_X_predict(X)
        return self.APLRRegressor.calculate_local_term_contribution(X_transformed)

    def calculate_local_contribution_from_selected_terms(
        self, X: Union[pd.DataFrame, FloatMatrix], predictor_indexes: List[int]
    ) -> FloatVector:
        """Preprocess X and delegate, together with predictor_indexes, to the C++ object."""
        X_transformed = self._preprocess_X_predict(X)
        return self.APLRRegressor.calculate_local_contribution_from_selected_terms(
            X_transformed, predictor_indexes
        )

    def calculate_terms(self, X: Union[pd.DataFrame, FloatMatrix]) -> FloatMatrix:
        """Preprocess X and delegate to the C++ ``calculate_terms``."""
        X_transformed = self._preprocess_X_predict(X)
        return self.APLRRegressor.calculate_terms(X_transformed)

    def get_term_names(self) -> List[str]:
        """Return ``get_term_names()`` of the C++ object."""
        return self.APLRRegressor.get_term_names()

    def get_term_affiliations(self) -> List[str]:
        """Return ``get_term_affiliations()`` of the C++ object."""
        return self.APLRRegressor.get_term_affiliations()

    def get_unique_term_affiliations(self) -> List[str]:
        """Return ``get_unique_term_affiliations()`` of the C++ object."""
        return self.APLRRegressor.get_unique_term_affiliations()

    def get_base_predictors_in_each_unique_term_affiliation(self) -> List[List[int]]:
        """Return ``get_base_predictors_in_each_unique_term_affiliation()`` of the C++ object."""
        return self.APLRRegressor.get_base_predictors_in_each_unique_term_affiliation()

    def get_term_coefficients(self) -> FloatVector:
        """Return ``get_term_coefficients()`` of the C++ object."""
        return self.APLRRegressor.get_term_coefficients()

    def get_validation_error_steps(self) -> FloatMatrix:
        """Return ``get_validation_error_steps()`` of the C++ object."""
        return self.APLRRegressor.get_validation_error_steps()

    def get_feature_importance(self) -> FloatVector:
        """Return ``get_feature_importance()`` of the C++ object."""
        return self.APLRRegressor.get_feature_importance()

    def get_term_importance(self) -> FloatVector:
        """Return ``get_term_importance()`` of the C++ object."""
        return self.APLRRegressor.get_term_importance()

    def get_term_main_predictor_indexes(self) -> IntVector:
        """Return ``get_term_main_predictor_indexes()`` of the C++ object."""
        return self.APLRRegressor.get_term_main_predictor_indexes()

    def get_term_interaction_levels(self) -> IntVector:
        """Return ``get_term_interaction_levels()`` of the C++ object."""
        return self.APLRRegressor.get_term_interaction_levels()

    def get_intercept(self) -> float:
        """Return ``get_intercept()`` of the C++ object."""
        return self.APLRRegressor.get_intercept()

    def get_optimal_m(self) -> int:
        """Return ``get_optimal_m()`` of the C++ object."""
        return self.APLRRegressor.get_optimal_m()

    def get_validation_tuning_metric(self) -> str:
        """Return ``get_validation_tuning_metric()`` of the C++ object."""
        return self.APLRRegressor.get_validation_tuning_metric()

    def get_main_effect_shape(self, predictor_index: int) -> Dict[float, float]:
        """Return ``get_main_effect_shape(predictor_index)`` of the C++ object."""
        return self.APLRRegressor.get_main_effect_shape(predictor_index)

    def get_unique_term_affiliation_shape(
        self,
        unique_term_affiliation: str,
        max_rows_before_sampling: int = 500000,
        additional_points: int = 250,
    ) -> FloatMatrix:
        """Return ``get_unique_term_affiliation_shape(...)`` of the C++ object."""
        return self.APLRRegressor.get_unique_term_affiliation_shape(
            unique_term_affiliation, max_rows_before_sampling, additional_points
        )

    def get_cv_error(self) -> float:
        """Return ``get_cv_error()`` of the C++ object."""
        return self.APLRRegressor.get_cv_error()

    def set_intercept(self, value: float):
        """Forward value to the C++ object's ``set_intercept``."""
        self.APLRRegressor.set_intercept(value)

    def plot_affiliation_shape(
        self,
        affiliation: str,
        plot: bool = True,
        save: bool = False,
        path: str = "",
    ):
        """
        Plots or saves the shape of a given unique term affiliation.

        For main effects, it produces a line plot. For two-way interactions, it produces a heatmap.
        Plotting for higher-order interactions is not supported.

        :param affiliation: A string specifying which unique_term_affiliation to use.
        :param plot: If True, displays the plot.
        :param save: If True, saves the plot to a file.
        :param path: The file path to save the plot. If empty and save is True, a default path will be used.
        :raises ImportError: If matplotlib is not installed.
        :raises ValueError: If affiliation is not one of the model's unique term affiliations.
        """
        try:
            import matplotlib.pyplot as plt
        except ImportError as exc:
            raise ImportError(
                "matplotlib is required for plotting. Please install it."
            ) from exc

        all_affiliations = self.get_unique_term_affiliations()
        if affiliation not in all_affiliations:
            raise ValueError(
                f"Affiliation '{affiliation}' not found in model. "
                f"Available affiliations are: {all_affiliations}"
            )

        affiliation_index = all_affiliations.index(affiliation)

        predictors_in_each_affiliation = (
            self.get_base_predictors_in_each_unique_term_affiliation()
        )
        predictor_indexes_used = predictors_in_each_affiliation[affiliation_index]

        shape = self.get_unique_term_affiliation_shape(affiliation)
        if shape.shape[0] == 0:
            print(f"No shape data available for affiliation '{affiliation}'.")
            return

        predictor_names = affiliation.split(" & ")

        is_main_effect: bool = len(predictor_indexes_used) == 1
        is_two_way_interaction: bool = len(predictor_indexes_used) == 2

        if is_main_effect:
            fig = plt.figure()
            # Sort by predictor value for a clean line plot
            sorted_indices = np.argsort(shape[:, 0])
            plt.plot(shape[sorted_indices, 0], shape[sorted_indices, 1])
            plt.xlabel(predictor_names[0])
            plt.ylabel("Contribution to linear predictor")
            plt.title(f"Main effect of {predictor_names[0]}")
            plt.grid(True)
        elif is_two_way_interaction:
            fig = plt.figure(figsize=(8, 6))

            # Get unique coordinates and their inverse mapping
            y_unique, y_inv = np.unique(shape[:, 0], return_inverse=True)
            x_unique, x_inv = np.unique(shape[:, 1], return_inverse=True)

            # Create grid for sums and counts
            grid_sums = np.zeros((len(y_unique), len(x_unique)))
            grid_counts = np.zeros((len(y_unique), len(x_unique)))

            # Populate sums and counts to later calculate the mean
            np.add.at(grid_sums, (y_inv, x_inv), shape[:, 2])
            np.add.at(grid_counts, (y_inv, x_inv), 1)

            # Calculate mean, avoiding division by zero
            with np.errstate(divide="ignore", invalid="ignore"):
                pivot_table_values = np.true_divide(grid_sums, grid_counts)
                # Where there's no data, pivot_table_values will be nan, which is fine for imshow.

            plt.imshow(
                pivot_table_values,
                aspect="auto",
                origin="lower",
                extent=[
                    x_unique.min(),
                    x_unique.max(),
                    y_unique.min(),
                    y_unique.max(),
                ],
                cmap="Blues_r",
            )
            plt.colorbar(label="Contribution to the linear predictor")
            plt.xlabel(predictor_names[1])
            plt.ylabel(predictor_names[0])
            plt.title(
                f"Interaction between {predictor_names[0]} and {predictor_names[1]}"
            )
        else:
            print(
                f"Plotting for interaction level > 2 is not supported. Affiliation: {affiliation}"
            )
            return

        if save:
            save_path = path or f"shape_of_{affiliation.replace(' & ', '_')}.png"
            plt.savefig(save_path)

        if plot:
            plt.show()

        plt.close(fig)

    def remove_provided_custom_functions(self):
        """Call the C++ ``remove_provided_custom_functions`` and clear three Python-side hooks.

        Only the validation-error, loss and negative-gradient functions are
        reset on the Python object; the two link-function hooks are left as
        they are.
        """
        self.APLRRegressor.remove_provided_custom_functions()
        self.calculate_custom_validation_error_function = None
        self.calculate_custom_loss_function = None
        self.calculate_custom_negative_gradient_function = None

    # For sklearn
    def get_params(self, deep=True):
        """Return the hyperparameters as a dict (``deep`` is accepted but unused)."""
        return {name: getattr(self, name) for name in self._PARAM_NAMES}

    # For sklearn
    def set_params(self, **parameters):
        """Set the given attributes, sync the C++ object and return ``self``."""
        for parameter, value in parameters.items():
            setattr(self, parameter, value)
        self.__set_params_cpp()
        return self
646
+
647
+
648
+ class APLRClassifier(BaseAPLR):
649
def __init__(
    self,
    m: int = 3000,
    v: float = 0.5,
    random_state: int = 0,
    n_jobs: int = 0,
    cv_folds: int = 5,
    bins: int = 300,
    verbosity: int = 0,
    max_interaction_level: int = 1,
    max_interactions: int = 100000,
    min_observations_in_split: int = 4,
    ineligible_boosting_steps_added: int = 15,
    max_eligible_terms: int = 7,
    boosting_steps_before_interactions_are_allowed: int = 0,
    monotonic_constraints_ignore_interactions: bool = False,
    early_stopping_rounds: int = 200,
    num_first_steps_with_linear_effects_only: int = 0,
    penalty_for_non_linearity: float = 0.0,
    penalty_for_interactions: float = 0.0,
    max_terms: int = 0,
    ridge_penalty: float = 0.0001,
):
    """Store the hyperparameters and create the wrapped C++ classifier."""
    # State filled in by the preprocessing done at fit time.
    self.X_names_ = []
    self.categorical_features_ = []
    self.ohe_columns_ = []
    self.na_imputed_cols_ = []
    self.median_values_ = {}

    # General boosting settings.
    self.m, self.v = m, v
    self.random_state = random_state
    self.n_jobs = n_jobs
    self.cv_folds = cv_folds
    self.bins = bins
    self.verbosity = verbosity
    self.early_stopping_rounds = early_stopping_rounds

    # Term and interaction settings.
    self.max_interaction_level = max_interaction_level
    self.max_interactions = max_interactions
    self.min_observations_in_split = min_observations_in_split
    self.ineligible_boosting_steps_added = ineligible_boosting_steps_added
    self.max_eligible_terms = max_eligible_terms
    self.max_terms = max_terms
    self.boosting_steps_before_interactions_are_allowed = (
        boosting_steps_before_interactions_are_allowed
    )
    self.monotonic_constraints_ignore_interactions = (
        monotonic_constraints_ignore_interactions
    )
    self.num_first_steps_with_linear_effects_only = (
        num_first_steps_with_linear_effects_only
    )

    # Penalties.
    self.penalty_for_non_linearity = penalty_for_non_linearity
    self.penalty_for_interactions = penalty_for_interactions
    self.ridge_penalty = ridge_penalty

    # The C++ object is created last and receives the values stored above.
    self.APLRClassifier = aplr_cpp.APLRClassifier()
    self.__set_params_cpp()
709
+
710
+ # Sets parameters for aplr_cpp.APLRClassifier cpp object
711
def __set_params_cpp(self):
    """Copy every hyperparameter attribute onto the wrapped C++ classifier."""
    hyperparameter_names = (
        "m",
        "v",
        "random_state",
        "n_jobs",
        "cv_folds",
        "bins",
        "verbosity",
        "max_interaction_level",
        "max_interactions",
        "min_observations_in_split",
        "ineligible_boosting_steps_added",
        "max_eligible_terms",
        "boosting_steps_before_interactions_are_allowed",
        "monotonic_constraints_ignore_interactions",
        "early_stopping_rounds",
        "num_first_steps_with_linear_effects_only",
        "penalty_for_non_linearity",
        "penalty_for_interactions",
        "max_terms",
        "ridge_penalty",
    )
    for name in hyperparameter_names:
        setattr(self.APLRClassifier, name, getattr(self, name))
740
+
741
def fit(
    self,
    X: Union[pd.DataFrame, FloatMatrix],
    y: Union[FloatVector, List[str]],
    sample_weight: FloatVector = np.empty(0),
    X_names: List[str] = [],
    cv_observations: IntMatrix = np.empty([0, 0]),
    prioritized_predictors_indexes: List[int] = [],
    monotonic_constraints: List[int] = [],
    interaction_constraints: List[List[int]] = [],
    predictor_learning_rates: List[float] = [],
    predictor_penalties_for_non_linearity: List[float] = [],
    predictor_penalties_for_interactions: List[float] = [],
    predictor_min_observations_in_split: List[int] = [],
):
    """Preprocess X, convert y to strings and fit the wrapped C++ classifier.

    The current hyperparameter attributes are copied to the C++ object first.
    A numpy ``y`` is converted element-wise to ``str``; a non-empty list is
    converted when its first element is not already a ``str``. The default
    arguments are never mutated here; they are passed through to the C++
    ``fit``.

    :return: ``self``, so that calls can be chained as scikit-learn expects.
    :raises ValueError: If X has fewer than 2 rows or sample_weight is invalid.
    """
    self._validate_X_fit_rows(X)
    self.__set_params_cpp()
    X_transformed, X_names_transformed = self._preprocess_X_fit(
        X, X_names, sample_weight
    )

    if isinstance(y, np.ndarray):
        y = y.astype(str).tolist()
    elif isinstance(y, list) and y and not isinstance(y[0], str):
        y = [str(val) for val in y]

    self.APLRClassifier.fit(
        X_transformed,
        y,
        sample_weight,
        X_names_transformed,
        cv_observations,
        prioritized_predictors_indexes,
        monotonic_constraints,
        interaction_constraints,
        predictor_learning_rates,
        predictor_penalties_for_non_linearity,
        predictor_penalties_for_interactions,
        predictor_min_observations_in_split,
    )
    # For sklearn: one integer per category reported by the C++ object.
    self.classes_ = np.arange(len(self.APLRClassifier.get_categories()))
    return self
783
+
784
def predict_class_probabilities(
    self,
    X: Union[pd.DataFrame, FloatMatrix],
    cap_predictions_to_minmax_in_training: bool = False,
) -> FloatMatrix:
    """Preprocess X and return the C++ classifier's ``predict_class_probabilities`` result."""
    cpp_model = self.APLRClassifier
    features = self._preprocess_X_predict(X)
    probabilities = cpp_model.predict_class_probabilities(
        features, cap_predictions_to_minmax_in_training
    )
    return probabilities
793
+
794
def predict(
    self,
    X: Union[pd.DataFrame, FloatMatrix],
    cap_predictions_to_minmax_in_training: bool = False,
) -> List[str]:
    """Preprocess X and return the C++ classifier's ``predict`` result."""
    cpp_model = self.APLRClassifier
    features = self._preprocess_X_predict(X)
    predictions = cpp_model.predict(features, cap_predictions_to_minmax_in_training)
    return predictions
803
+
804
def calculate_local_feature_contribution(
    self, X: Union[pd.DataFrame, FloatMatrix]
) -> FloatMatrix:
    """Preprocess X and delegate to the C++ ``calculate_local_feature_contribution``."""
    cpp_model = self.APLRClassifier
    features = self._preprocess_X_predict(X)
    return cpp_model.calculate_local_feature_contribution(features)
809
+
810
def get_categories(self) -> List[str]:
    """Return ``get_categories()`` of the wrapped C++ classifier."""
    cpp_model = self.APLRClassifier
    return cpp_model.get_categories()
812
+
813
def get_logit_model(self, category: str) -> APLRRegressor:
    """Return the logit model for ``category`` wrapped in an ``APLRRegressor``.

    The model object is obtained from ``self.APLRClassifier.get_logit_model``.
    A new ``APLRRegressor`` is created with this classifier's hyperparameters
    (using ``"binomial"`` loss and ``"logit"`` link), and its ``APLRRegressor``
    attribute is then replaced by the obtained model object.
    """
    # Hyperparameters copied verbatim from this classifier to the wrapper.
    shared_hyperparameters = (
        "m",
        "v",
        "random_state",
        "n_jobs",
        "cv_folds",
        "bins",
        "max_interaction_level",
        "max_interactions",
        "min_observations_in_split",
        "ineligible_boosting_steps_added",
        "max_eligible_terms",
        "verbosity",
        "boosting_steps_before_interactions_are_allowed",
        "monotonic_constraints_ignore_interactions",
        "early_stopping_rounds",
        "num_first_steps_with_linear_effects_only",
        "penalty_for_non_linearity",
        "penalty_for_interactions",
        "max_terms",
        "ridge_penalty",
    )

    logit_model_cpp = self.APLRClassifier.get_logit_model(category)

    wrapper_kwargs = {name: getattr(self, name) for name in shared_hyperparameters}
    logit_model_py = APLRRegressor(
        loss_function="binomial",
        link_function="logit",
        **wrapper_kwargs,
    )

    # Swap the wrapper's freshly created backend for the already fitted model.
    logit_model_py.APLRRegressor = logit_model_cpp
    return logit_model_py
844
+
845
def get_validation_error_steps(self) -> FloatMatrix:
    """Return the result of ``self.APLRClassifier.get_validation_error_steps()``."""
    validation_error_steps = self.APLRClassifier.get_validation_error_steps()
    return validation_error_steps
847
+
848
def get_cv_error(self) -> float:
    """Return the cross-validation error reported by ``self.APLRClassifier.get_cv_error()``."""
    cv_error = self.APLRClassifier.get_cv_error()
    return cv_error
850
+
851
def get_feature_importance(self) -> FloatVector:
    """Return the result of ``self.APLRClassifier.get_feature_importance()``."""
    feature_importance = self.APLRClassifier.get_feature_importance()
    return feature_importance
853
+
854
def get_unique_term_affiliations(self) -> List[str]:
    """Return the result of ``self.APLRClassifier.get_unique_term_affiliations()``."""
    affiliations = self.APLRClassifier.get_unique_term_affiliations()
    return affiliations
856
+
857
def get_base_predictors_in_each_unique_term_affiliation(self) -> List[List[int]]:
    """Return the result of the same-named method on ``self.APLRClassifier``."""
    backend = self.APLRClassifier
    return backend.get_base_predictors_in_each_unique_term_affiliation()
859
+
860
+ # For sklearn
861
def get_params(self, deep=True):
    """Return the estimator's hyperparameters as a ``{name: value}`` dict.

    Each value is read from the attribute of the same name on ``self``.
    ``deep`` is accepted for scikit-learn API compatibility; this method does
    not use it.
    """
    # Key order of the returned dict follows this tuple.
    parameter_names = (
        "m",
        "v",
        "random_state",
        "n_jobs",
        "cv_folds",
        "bins",
        "verbosity",
        "max_interaction_level",
        "max_interactions",
        "min_observations_in_split",
        "ineligible_boosting_steps_added",
        "max_eligible_terms",
        "boosting_steps_before_interactions_are_allowed",
        "monotonic_constraints_ignore_interactions",
        "early_stopping_rounds",
        "num_first_steps_with_linear_effects_only",
        "penalty_for_non_linearity",
        "penalty_for_interactions",
        "max_terms",
        "ridge_penalty",
    )
    return {name: getattr(self, name) for name in parameter_names}
884
+
885
+ # For sklearn
886
def set_params(self, **parameters):
    """Set each keyword argument as an attribute on this estimator and return ``self``.

    After the attributes are assigned, ``self.__set_params_cpp()`` is called.
    Its definition is outside this block; presumably it pushes the values to
    the C++ object (confirm against its implementation).
    """
    for name in parameters:
        setattr(self, name, parameters[name])
    # NOTE: the double-underscore name is mangled by the enclosing class, so
    # this method must stay inside that class body to resolve correctly.
    self.__set_params_cpp()
    return self
891
+
892
+ # For sklearn
893
def predict_proba(self, X: Union[pd.DataFrame, FloatMatrix]) -> FloatMatrix:
    """Return class probabilities for X (scikit-learn style alias).

    Delegates to ``self.predict_class_probabilities(X)`` and returns its
    result unchanged. X is annotated with the same types that
    ``predict_class_probabilities`` accepts.
    """
    return self.predict_class_probabilities(X)
895
+
896
+
897
class APLRTuner:
    """Grid search over APLR hyperparameters.

    One model is fitted per parameter combination. The model with the lowest
    value returned by ``get_cv_error()`` is kept as the best model.
    """

    def __init__(
        self,
        parameters: Optional[
            Union[Dict[str, List[float]], List[Dict[str, List[float]]]]
        ] = None,
        is_regressor: bool = True,
    ):
        """Store the search space and build the parameter grid.

        Args:
            parameters: Either one dict mapping a constructor argument name to
                the list of values to try, or a list of such dicts. With a
                list, the grids of the individual dicts are concatenated.
                ``None`` selects the default search space.
            is_regressor: If truthy, ``APLRRegressor`` models are tuned,
                otherwise ``APLRClassifier`` models.
        """
        if parameters is None:
            # Built per instance so the default is never shared between tuners.
            parameters = {
                "max_interaction_level": [0, 1],
                "min_observations_in_split": [4, 10, 20, 100, 500, 1000],
            }
        self.parameters = parameters
        self.is_regressor = is_regressor
        self.parameter_grid = self._create_parameter_grid()

    def _create_parameter_grid(self) -> List[Dict[str, float]]:
        """Expand ``self.parameters`` into a list of ``{name: value}`` combinations.

        Within one dict the names are sorted and the cartesian product of their
        value lists is taken. An empty dict yields a single empty combination,
        i.e. one model built with constructor defaults.
        """
        if isinstance(self.parameters, (list, tuple)):
            parameter_sets = list(self.parameters)
        else:
            parameter_sets = [self.parameters]

        grid: List[Dict[str, float]] = []
        for parameter_set in parameter_sets:
            # Names are unique within a dict, so sorting the items sorts by name.
            items = sorted(parameter_set.items(), key=lambda item: item[0])
            keys = [key for key, _ in items]
            value_lists = [values for _, values in items]
            grid.extend(
                dict(zip(keys, combination))
                for combination in itertools.product(*value_lists)
            )
        return grid

    @staticmethod
    def _ranking_error(cv_error: float) -> float:
        """Map NaN to +inf so that NaN never wins and sorts last."""
        return np.inf if np.isnan(cv_error) else cv_error

    def fit(self, X: Union[pd.DataFrame, FloatMatrix], y: FloatVector, **kwargs):
        """Fit one model per grid entry and remember the one with the lowest CV error.

        ``**kwargs`` are forwarded to each model's ``fit``. Afterwards
        ``self.cv_results`` holds, for every combination, the model's
        ``get_params()`` dict extended with a ``"cv_error"`` entry, sorted by
        ascending CV error (NaN last).
        """
        self.cv_results: List[Dict[str, float]] = []
        best_validation_result = np.inf
        best_model_found = False
        for params in self.parameter_grid:
            if self.is_regressor:
                model = APLRRegressor(**params)
            else:
                model = APLRClassifier(**params)
            model.fit(X, y, **kwargs)
            cv_error_for_this_model = model.get_cv_error()
            cv_results_for_this_model = model.get_params()
            cv_results_for_this_model["cv_error"] = cv_error_for_this_model
            self.cv_results.append(cv_results_for_this_model)
            ranking_error = self._ranking_error(cv_error_for_this_model)
            # The first fitted model is always accepted so that best_model is
            # defined even when every CV error is inf or NaN.
            if not best_model_found or ranking_error < best_validation_result:
                best_validation_result = ranking_error
                self.best_model = model
                best_model_found = True
        self.cv_results = sorted(
            self.cv_results,
            key=lambda result: self._ranking_error(result["cv_error"]),
        )

    def predict(
        self, X: Union[pd.DataFrame, FloatMatrix], **kwargs
    ) -> Union[FloatVector, List[str]]:
        """Return ``self.best_model.predict(X, **kwargs)``."""
        return self.best_model.predict(X, **kwargs)

    def predict_class_probabilities(
        self, X: Union[pd.DataFrame, FloatMatrix], **kwargs
    ) -> FloatMatrix:
        """Return the best model's class probabilities.

        Raises:
            TypeError: If this tuner was configured with a truthy ``is_regressor``.
        """
        # Truthiness test, matching how fit() chooses the model class.
        if not self.is_regressor:
            return self.best_model.predict_class_probabilities(X, **kwargs)
        raise TypeError(
            "predict_class_probabilities is only possible when is_regressor is False"
        )

    def predict_proba(
        self, X: Union[pd.DataFrame, FloatMatrix], **kwargs
    ) -> FloatMatrix:
        """Alias of ``predict_class_probabilities``."""
        return self.predict_class_probabilities(X, **kwargs)

    def get_best_estimator(self) -> Union[APLRClassifier, APLRRegressor]:
        """Return the model stored as ``self.best_model`` by ``fit``."""
        return self.best_model

    def get_cv_results(self) -> List[Dict[str, float]]:
        """Return ``self.cv_results`` as populated by ``fit``."""
        return self.cv_results
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2022 Mathias von Ottenbreit <ottenbreitdatascience@gmail.com>
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,21 @@
1
+ Metadata-Version: 2.1
2
+ Name: aplr
3
+ Version: 10.18.0
4
+ Summary: Automatic Piecewise Linear Regression
5
+ Home-page: https://github.com/ottenbreit-data-science/aplr
6
+ Author: Mathias von Ottenbreit
7
+ Author-email: ottenbreitdatascience@gmail.com
8
+ License: MIT
9
+ Platform: Windows
10
+ Platform: Linux
11
+ Platform: MacOS
12
+ Classifier: License :: OSI Approved :: MIT License
13
+ Requires-Python: >=3.8
14
+ Description-Content-Type: text/markdown
15
+ License-File: LICENSE
16
+ Requires-Dist: numpy>=1.11
17
+ Requires-Dist: pandas>=1.0.0
18
+ Provides-Extra: plots
19
+ Requires-Dist: matplotlib>=3.0; extra == "plots"
20
+
21
+ The documentation for Automatic Piecewise Linear Regression is available at [https://github.com/ottenbreit-data-science/aplr](https://github.com/ottenbreit-data-science/aplr).
@@ -0,0 +1,8 @@
1
+ aplr_cpp.cpython-38-x86_64-linux-gnu.so,sha256=rfci_P0fNhia2ChwvbSzWtqC27Sw2d1kcLAsgNX09a0,38070432
2
+ aplr/__init__.py,sha256=rRfTgNWnYZlFatyA920lWqBcjwmQUI7FcvEPFUTJgzE,20
3
+ aplr/aplr.py,sha256=cEI63m6-5U1VVou-CfKIQ85ys0DL8rqi9ghWFBK3BxY,41090
4
+ aplr-10.18.0.dist-info/LICENSE,sha256=g4qcQtkSVPHtGRi3T93DoFCrssvW6ij_emU-2fj_xfY,1113
5
+ aplr-10.18.0.dist-info/METADATA,sha256=FLw37myL48T6IBuMBl0QE0yGRkxhLuY_lHUYS0MNzw4,736
6
+ aplr-10.18.0.dist-info/WHEEL,sha256=TDhnuTXxsaVxA0eY5gHEFoja9375NYy0e69tu9LyUnk,148
7
+ aplr-10.18.0.dist-info/top_level.txt,sha256=DXVC0RIFGpzVnPeKWAZTXQdJheOEZL51Wip6Fx7zbR4,14
8
+ aplr-10.18.0.dist-info/RECORD,,
@@ -0,0 +1,6 @@
1
+ Wheel-Version: 1.0
2
+ Generator: setuptools (75.3.2)
3
+ Root-Is-Purelib: false
4
+ Tag: cp38-cp38-manylinux_2_27_x86_64
5
+ Tag: cp38-cp38-manylinux_2_28_x86_64
6
+
@@ -0,0 +1,2 @@
1
+ aplr
2
+ aplr_cpp
Binary file