wavetrainer 0.1.13__tar.gz → 0.1.15__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {wavetrainer-0.1.13/wavetrainer.egg-info → wavetrainer-0.1.15}/PKG-INFO +1 -1
- {wavetrainer-0.1.13 → wavetrainer-0.1.15}/setup.py +1 -1
- {wavetrainer-0.1.13 → wavetrainer-0.1.15}/wavetrainer/__init__.py +1 -1
- {wavetrainer-0.1.13 → wavetrainer-0.1.15}/wavetrainer/create.py +2 -0
- {wavetrainer-0.1.13 → wavetrainer-0.1.15}/wavetrainer/reducer/correlation_reducer.py +51 -36
- {wavetrainer-0.1.13 → wavetrainer-0.1.15/wavetrainer.egg-info}/PKG-INFO +1 -1
- {wavetrainer-0.1.13 → wavetrainer-0.1.15}/LICENSE +0 -0
- {wavetrainer-0.1.13 → wavetrainer-0.1.15}/MANIFEST.in +0 -0
- {wavetrainer-0.1.13 → wavetrainer-0.1.15}/README.md +0 -0
- {wavetrainer-0.1.13 → wavetrainer-0.1.15}/requirements.txt +0 -0
- {wavetrainer-0.1.13 → wavetrainer-0.1.15}/setup.cfg +0 -0
- {wavetrainer-0.1.13 → wavetrainer-0.1.15}/tests/__init__.py +0 -0
- {wavetrainer-0.1.13 → wavetrainer-0.1.15}/tests/model/__init__.py +0 -0
- {wavetrainer-0.1.13 → wavetrainer-0.1.15}/tests/model/catboost_kwargs_test.py +0 -0
- {wavetrainer-0.1.13 → wavetrainer-0.1.15}/tests/trainer_test.py +0 -0
- {wavetrainer-0.1.13 → wavetrainer-0.1.15}/wavetrainer/calibrator/__init__.py +0 -0
- {wavetrainer-0.1.13 → wavetrainer-0.1.15}/wavetrainer/calibrator/calibrator.py +0 -0
- {wavetrainer-0.1.13 → wavetrainer-0.1.15}/wavetrainer/calibrator/calibrator_router.py +0 -0
- {wavetrainer-0.1.13 → wavetrainer-0.1.15}/wavetrainer/calibrator/vennabers_calibrator.py +0 -0
- {wavetrainer-0.1.13 → wavetrainer-0.1.15}/wavetrainer/exceptions.py +0 -0
- {wavetrainer-0.1.13 → wavetrainer-0.1.15}/wavetrainer/fit.py +0 -0
- {wavetrainer-0.1.13 → wavetrainer-0.1.15}/wavetrainer/model/__init__.py +0 -0
- {wavetrainer-0.1.13 → wavetrainer-0.1.15}/wavetrainer/model/catboost/__init__.py +0 -0
- {wavetrainer-0.1.13 → wavetrainer-0.1.15}/wavetrainer/model/catboost/catboost_classifier_wrap.py +0 -0
- {wavetrainer-0.1.13 → wavetrainer-0.1.15}/wavetrainer/model/catboost/catboost_kwargs.py +0 -0
- {wavetrainer-0.1.13 → wavetrainer-0.1.15}/wavetrainer/model/catboost/catboost_model.py +0 -0
- {wavetrainer-0.1.13 → wavetrainer-0.1.15}/wavetrainer/model/catboost/catboost_regressor_wrap.py +0 -0
- {wavetrainer-0.1.13 → wavetrainer-0.1.15}/wavetrainer/model/lightgbm/__init__.py +0 -0
- {wavetrainer-0.1.13 → wavetrainer-0.1.15}/wavetrainer/model/lightgbm/lightgbm_model.py +0 -0
- {wavetrainer-0.1.13 → wavetrainer-0.1.15}/wavetrainer/model/model.py +0 -0
- {wavetrainer-0.1.13 → wavetrainer-0.1.15}/wavetrainer/model/model_router.py +0 -0
- {wavetrainer-0.1.13 → wavetrainer-0.1.15}/wavetrainer/model/tabpfn/__init__.py +0 -0
- {wavetrainer-0.1.13 → wavetrainer-0.1.15}/wavetrainer/model/tabpfn/tabpfn_model.py +0 -0
- {wavetrainer-0.1.13 → wavetrainer-0.1.15}/wavetrainer/model/xgboost/__init__.py +0 -0
- {wavetrainer-0.1.13 → wavetrainer-0.1.15}/wavetrainer/model/xgboost/early_stopper.py +0 -0
- {wavetrainer-0.1.13 → wavetrainer-0.1.15}/wavetrainer/model/xgboost/xgboost_logger.py +0 -0
- {wavetrainer-0.1.13 → wavetrainer-0.1.15}/wavetrainer/model/xgboost/xgboost_model.py +0 -0
- {wavetrainer-0.1.13 → wavetrainer-0.1.15}/wavetrainer/model_type.py +0 -0
- {wavetrainer-0.1.13 → wavetrainer-0.1.15}/wavetrainer/params.py +0 -0
- {wavetrainer-0.1.13 → wavetrainer-0.1.15}/wavetrainer/reducer/__init__.py +0 -0
- {wavetrainer-0.1.13 → wavetrainer-0.1.15}/wavetrainer/reducer/base_selector_reducer.py +0 -0
- {wavetrainer-0.1.13 → wavetrainer-0.1.15}/wavetrainer/reducer/combined_reducer.py +0 -0
- {wavetrainer-0.1.13 → wavetrainer-0.1.15}/wavetrainer/reducer/constant_reducer.py +0 -0
- {wavetrainer-0.1.13 → wavetrainer-0.1.15}/wavetrainer/reducer/duplicate_reducer.py +0 -0
- {wavetrainer-0.1.13 → wavetrainer-0.1.15}/wavetrainer/reducer/non_categorical_numeric_columns.py +0 -0
- {wavetrainer-0.1.13 → wavetrainer-0.1.15}/wavetrainer/reducer/nonnumeric_reducer.py +0 -0
- {wavetrainer-0.1.13 → wavetrainer-0.1.15}/wavetrainer/reducer/pca_reducer.py +0 -0
- {wavetrainer-0.1.13 → wavetrainer-0.1.15}/wavetrainer/reducer/reducer.py +0 -0
- {wavetrainer-0.1.13 → wavetrainer-0.1.15}/wavetrainer/reducer/select_by_single_feature_performance_reducer.py +0 -0
- {wavetrainer-0.1.13 → wavetrainer-0.1.15}/wavetrainer/reducer/smart_correlation_reducer.py +0 -0
- {wavetrainer-0.1.13 → wavetrainer-0.1.15}/wavetrainer/reducer/unseen_reducer.py +0 -0
- {wavetrainer-0.1.13 → wavetrainer-0.1.15}/wavetrainer/selector/__init__.py +0 -0
- {wavetrainer-0.1.13 → wavetrainer-0.1.15}/wavetrainer/selector/selector.py +0 -0
- {wavetrainer-0.1.13 → wavetrainer-0.1.15}/wavetrainer/trainer.py +0 -0
- {wavetrainer-0.1.13 → wavetrainer-0.1.15}/wavetrainer/weights/__init__.py +0 -0
- {wavetrainer-0.1.13 → wavetrainer-0.1.15}/wavetrainer/weights/class_weights.py +0 -0
- {wavetrainer-0.1.13 → wavetrainer-0.1.15}/wavetrainer/weights/combined_weights.py +0 -0
- {wavetrainer-0.1.13 → wavetrainer-0.1.15}/wavetrainer/weights/exponential_weights.py +0 -0
- {wavetrainer-0.1.13 → wavetrainer-0.1.15}/wavetrainer/weights/linear_weights.py +0 -0
- {wavetrainer-0.1.13 → wavetrainer-0.1.15}/wavetrainer/weights/noop_weights.py +0 -0
- {wavetrainer-0.1.13 → wavetrainer-0.1.15}/wavetrainer/weights/sigmoid_weights.py +0 -0
- {wavetrainer-0.1.13 → wavetrainer-0.1.15}/wavetrainer/weights/weights.py +0 -0
- {wavetrainer-0.1.13 → wavetrainer-0.1.15}/wavetrainer/weights/weights_router.py +0 -0
- {wavetrainer-0.1.13 → wavetrainer-0.1.15}/wavetrainer/windower/__init__.py +0 -0
- {wavetrainer-0.1.13 → wavetrainer-0.1.15}/wavetrainer/windower/windower.py +0 -0
- {wavetrainer-0.1.13 → wavetrainer-0.1.15}/wavetrainer.egg-info/SOURCES.txt +0 -0
- {wavetrainer-0.1.13 → wavetrainer-0.1.15}/wavetrainer.egg-info/dependency_links.txt +0 -0
- {wavetrainer-0.1.13 → wavetrainer-0.1.15}/wavetrainer.egg-info/not-zip-safe +0 -0
- {wavetrainer-0.1.13 → wavetrainer-0.1.15}/wavetrainer.egg-info/requires.txt +0 -0
- {wavetrainer-0.1.13 → wavetrainer-0.1.15}/wavetrainer.egg-info/top_level.txt +0 -0
@@ -23,7 +23,7 @@ def install_requires() -> typing.List[str]:
|
|
23
23
|
|
24
24
|
setup(
|
25
25
|
name='wavetrainer',
|
26
|
-
version='0.1.13',
|
26
|
+
version='0.1.15',
|
27
27
|
description='A library for automatically finding the optimal model within feature and hyperparameter space.',
|
28
28
|
long_description=long_description,
|
29
29
|
long_description_content_type='text/markdown',
|
@@ -17,6 +17,7 @@ def create(
|
|
17
17
|
cutoff_dt: datetime.datetime | None = None,
|
18
18
|
embedding_cols: list[list[str]] | None = None,
|
19
19
|
allowed_models: set[str] | None = None,
|
20
|
+
max_false_positive_reduction_steps: int | None = None,
|
20
21
|
) -> Trainer:
|
21
22
|
"""Create a trainer."""
|
22
23
|
return Trainer(
|
@@ -29,4 +30,5 @@ def create(
|
|
29
30
|
cutoff_dt=cutoff_dt,
|
30
31
|
embedding_cols=embedding_cols,
|
31
32
|
allowed_models=allowed_models,
|
33
|
+
max_false_positive_reduction_steps=max_false_positive_reduction_steps,
|
32
34
|
)
|
@@ -1,6 +1,6 @@
|
|
1
1
|
"""A reducer that removes correlation features."""
|
2
2
|
|
3
|
-
# pylint: disable=too-many-arguments,too-many-positional-arguments,consider-using-enumerate
|
3
|
+
# pylint: disable=too-many-arguments,too-many-positional-arguments,consider-using-enumerate,too-many-locals
|
4
4
|
import json
|
5
5
|
import os
|
6
6
|
from typing import Self
|
@@ -17,51 +17,64 @@ _CORRELATION_REDUCER_FILENAME = "correlation_reducer.json"
|
|
17
17
|
_CORRELATION_REDUCER_THRESHOLD = "correlation_reducer_threshold"
|
18
18
|
|
19
19
|
|
20
|
-
def
|
21
|
-
df: pd.DataFrame,
|
20
|
+
def _get_correlated_features_to_drop_chunked(
|
21
|
+
df: pd.DataFrame,
|
22
|
+
threshold: float = 0.85,
|
23
|
+
chunk_size: int = 10000,
|
24
|
+
random_seed: int = 42,
|
22
25
|
) -> list[str]:
|
23
26
|
"""
|
24
|
-
|
25
|
-
|
26
|
-
Columns are processed in sorted order to ensure deterministic output.
|
27
|
-
|
28
|
-
Args:
|
29
|
-
df (pd.DataFrame): Input DataFrame.
|
30
|
-
threshold (float): Correlation threshold above which features are considered redundant.
|
31
|
-
random_seed (int): Seed used to generate the fixed junk value.
|
32
|
-
|
33
|
-
Returns:
|
34
|
-
List[str]: List of column names to drop.
|
27
|
+
Chunked correlation feature reducer to control memory usage.
|
28
|
+
Applies correlation pruning within chunks, then across surviving features.
|
35
29
|
"""
|
36
30
|
np.random.seed(random_seed)
|
37
|
-
|
38
|
-
# Select and sort numeric columns
|
39
31
|
sorted_cols = sorted(find_non_categorical_numeric_columns(df))
|
40
32
|
df_numeric = df[sorted_cols].copy()
|
41
|
-
|
42
|
-
# Generate and apply a fixed junk value for NaNs
|
43
33
|
junk_value = np.random.uniform(-1e9, 1e9)
|
44
|
-
df_numeric = df_numeric.fillna(junk_value)
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
34
|
+
df_numeric = df_numeric.fillna(junk_value).astype(np.float32)
|
35
|
+
|
36
|
+
# First pass: intra-chunk correlation pruning
|
37
|
+
survivors = []
|
38
|
+
to_drop_total = set()
|
39
|
+
for i in range(0, len(sorted_cols), chunk_size):
|
40
|
+
chunk_cols = sorted_cols[i : i + chunk_size]
|
41
|
+
chunk_df = df_numeric[chunk_cols]
|
42
|
+
chunk_corr = np.corrcoef(chunk_df.values, rowvar=False)
|
43
|
+
abs_corr = np.abs(chunk_corr)
|
44
|
+
|
45
|
+
to_drop = set()
|
46
|
+
for j in range(len(chunk_cols)):
|
47
|
+
if chunk_cols[j] in to_drop:
|
48
|
+
continue
|
49
|
+
for k in range(j + 1, len(chunk_cols)):
|
50
|
+
if chunk_cols[k] in to_drop:
|
51
|
+
continue
|
52
|
+
if abs_corr[j, k] > threshold:
|
53
|
+
to_drop.add(chunk_cols[k])
|
54
|
+
|
55
|
+
survivors.extend([col for col in chunk_cols if col not in to_drop])
|
56
|
+
to_drop_total.update(to_drop)
|
57
|
+
|
58
|
+
# Second pass: global correlation among survivors
|
59
|
+
if len(survivors) < 2:
|
60
|
+
return sorted(to_drop_total)
|
61
|
+
|
62
|
+
survivors_df = df_numeric[survivors]
|
63
|
+
final_corr = np.corrcoef(survivors_df.values, rowvar=False)
|
64
|
+
abs_corr = np.abs(final_corr)
|
65
|
+
|
66
|
+
final_drop = set()
|
67
|
+
for i in range(len(survivors)):
|
68
|
+
if survivors[i] in final_drop:
|
57
69
|
continue
|
58
|
-
for j in range(i + 1, len(
|
59
|
-
if
|
70
|
+
for j in range(i + 1, len(survivors)):
|
71
|
+
if survivors[j] in final_drop:
|
60
72
|
continue
|
61
73
|
if abs_corr[i, j] > threshold:
|
62
|
-
|
74
|
+
final_drop.add(survivors[j])
|
63
75
|
|
64
|
-
|
76
|
+
to_drop_total.update(final_drop)
|
77
|
+
return sorted(to_drop_total)
|
65
78
|
|
66
79
|
|
67
80
|
class CorrelationReducer(Reducer):
|
@@ -102,7 +115,9 @@ class CorrelationReducer(Reducer):
|
|
102
115
|
eval_x: pd.DataFrame | None = None,
|
103
116
|
eval_y: pd.Series | pd.DataFrame | None = None,
|
104
117
|
) -> Self:
|
105
|
-
drop_features =
|
118
|
+
drop_features = _get_correlated_features_to_drop_chunked(
|
119
|
+
df, threshold=self._threshold
|
120
|
+
)
|
106
121
|
self._correlation_drop_features = {x: True for x in drop_features}
|
107
122
|
return self
|
108
123
|
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
{wavetrainer-0.1.13 → wavetrainer-0.1.15}/wavetrainer/model/catboost/catboost_classifier_wrap.py
RENAMED
File without changes
|
File without changes
|
File without changes
|
{wavetrainer-0.1.13 → wavetrainer-0.1.15}/wavetrainer/model/catboost/catboost_regressor_wrap.py
RENAMED
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
{wavetrainer-0.1.13 → wavetrainer-0.1.15}/wavetrainer/reducer/non_categorical_numeric_columns.py
RENAMED
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|