wavetrainer 0.0.45.tar.gz → 0.0.46.tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {wavetrainer-0.0.45/wavetrainer.egg-info → wavetrainer-0.0.46}/PKG-INFO +1 -1
- {wavetrainer-0.0.45 → wavetrainer-0.0.46}/setup.py +1 -1
- {wavetrainer-0.0.45 → wavetrainer-0.0.46}/wavetrainer/__init__.py +1 -1
- {wavetrainer-0.0.45 → wavetrainer-0.0.46}/wavetrainer/create.py +2 -0
- {wavetrainer-0.0.45 → wavetrainer-0.0.46}/wavetrainer/model/model_router.py +5 -5
- {wavetrainer-0.0.45 → wavetrainer-0.0.46}/wavetrainer/model/xgboost/xgboost_model.py +3 -1
- {wavetrainer-0.0.45 → wavetrainer-0.0.46}/wavetrainer/reducer/combined_reducer.py +7 -1
- wavetrainer-0.0.46/wavetrainer/reducer/pca_reducer.py +78 -0
- {wavetrainer-0.0.45 → wavetrainer-0.0.46}/wavetrainer/selector/selector.py +4 -4
- {wavetrainer-0.0.45 → wavetrainer-0.0.46}/wavetrainer/trainer.py +16 -5
- {wavetrainer-0.0.45 → wavetrainer-0.0.46/wavetrainer.egg-info}/PKG-INFO +1 -1
- {wavetrainer-0.0.45 → wavetrainer-0.0.46}/wavetrainer.egg-info/SOURCES.txt +1 -0
- {wavetrainer-0.0.45 → wavetrainer-0.0.46}/LICENSE +0 -0
- {wavetrainer-0.0.45 → wavetrainer-0.0.46}/MANIFEST.in +0 -0
- {wavetrainer-0.0.45 → wavetrainer-0.0.46}/README.md +0 -0
- {wavetrainer-0.0.45 → wavetrainer-0.0.46}/requirements.txt +0 -0
- {wavetrainer-0.0.45 → wavetrainer-0.0.46}/setup.cfg +0 -0
- {wavetrainer-0.0.45 → wavetrainer-0.0.46}/tests/__init__.py +0 -0
- {wavetrainer-0.0.45 → wavetrainer-0.0.46}/tests/model/__init__.py +0 -0
- {wavetrainer-0.0.45 → wavetrainer-0.0.46}/tests/model/catboost_kwargs_test.py +0 -0
- {wavetrainer-0.0.45 → wavetrainer-0.0.46}/tests/trainer_test.py +0 -0
- {wavetrainer-0.0.45 → wavetrainer-0.0.46}/wavetrainer/calibrator/__init__.py +0 -0
- {wavetrainer-0.0.45 → wavetrainer-0.0.46}/wavetrainer/calibrator/calibrator.py +0 -0
- {wavetrainer-0.0.45 → wavetrainer-0.0.46}/wavetrainer/calibrator/calibrator_router.py +0 -0
- {wavetrainer-0.0.45 → wavetrainer-0.0.46}/wavetrainer/calibrator/mapie_calibrator.py +0 -0
- {wavetrainer-0.0.45 → wavetrainer-0.0.46}/wavetrainer/calibrator/vennabers_calibrator.py +0 -0
- {wavetrainer-0.0.45 → wavetrainer-0.0.46}/wavetrainer/exceptions.py +0 -0
- {wavetrainer-0.0.45 → wavetrainer-0.0.46}/wavetrainer/fit.py +0 -0
- {wavetrainer-0.0.45 → wavetrainer-0.0.46}/wavetrainer/model/__init__.py +0 -0
- {wavetrainer-0.0.45 → wavetrainer-0.0.46}/wavetrainer/model/catboost/__init__.py +0 -0
- {wavetrainer-0.0.45 → wavetrainer-0.0.46}/wavetrainer/model/catboost/catboost_classifier_wrap.py +0 -0
- {wavetrainer-0.0.45 → wavetrainer-0.0.46}/wavetrainer/model/catboost/catboost_kwargs.py +0 -0
- {wavetrainer-0.0.45 → wavetrainer-0.0.46}/wavetrainer/model/catboost/catboost_model.py +0 -0
- {wavetrainer-0.0.45 → wavetrainer-0.0.46}/wavetrainer/model/catboost/catboost_regressor_wrap.py +0 -0
- {wavetrainer-0.0.45 → wavetrainer-0.0.46}/wavetrainer/model/model.py +0 -0
- {wavetrainer-0.0.45 → wavetrainer-0.0.46}/wavetrainer/model/tabpfn/__init__.py +0 -0
- {wavetrainer-0.0.45 → wavetrainer-0.0.46}/wavetrainer/model/tabpfn/tabpfn_model.py +0 -0
- {wavetrainer-0.0.45 → wavetrainer-0.0.46}/wavetrainer/model/xgboost/__init__.py +0 -0
- {wavetrainer-0.0.45 → wavetrainer-0.0.46}/wavetrainer/model/xgboost/early_stopper.py +0 -0
- {wavetrainer-0.0.45 → wavetrainer-0.0.46}/wavetrainer/model/xgboost/xgboost_logger.py +0 -0
- {wavetrainer-0.0.45 → wavetrainer-0.0.46}/wavetrainer/model_type.py +0 -0
- {wavetrainer-0.0.45 → wavetrainer-0.0.46}/wavetrainer/params.py +0 -0
- {wavetrainer-0.0.45 → wavetrainer-0.0.46}/wavetrainer/reducer/__init__.py +0 -0
- {wavetrainer-0.0.45 → wavetrainer-0.0.46}/wavetrainer/reducer/base_selector_reducer.py +0 -0
- {wavetrainer-0.0.45 → wavetrainer-0.0.46}/wavetrainer/reducer/constant_reducer.py +0 -0
- {wavetrainer-0.0.45 → wavetrainer-0.0.46}/wavetrainer/reducer/correlation_reducer.py +0 -0
- {wavetrainer-0.0.45 → wavetrainer-0.0.46}/wavetrainer/reducer/duplicate_reducer.py +0 -0
- {wavetrainer-0.0.45 → wavetrainer-0.0.46}/wavetrainer/reducer/non_categorical_numeric_columns.py +0 -0
- {wavetrainer-0.0.45 → wavetrainer-0.0.46}/wavetrainer/reducer/nonnumeric_reducer.py +0 -0
- {wavetrainer-0.0.45 → wavetrainer-0.0.46}/wavetrainer/reducer/reducer.py +0 -0
- {wavetrainer-0.0.45 → wavetrainer-0.0.46}/wavetrainer/reducer/select_by_single_feature_performance_reducer.py +0 -0
- {wavetrainer-0.0.45 → wavetrainer-0.0.46}/wavetrainer/reducer/smart_correlation_reducer.py +0 -0
- {wavetrainer-0.0.45 → wavetrainer-0.0.46}/wavetrainer/reducer/unseen_reducer.py +0 -0
- {wavetrainer-0.0.45 → wavetrainer-0.0.46}/wavetrainer/selector/__init__.py +0 -0
- {wavetrainer-0.0.45 → wavetrainer-0.0.46}/wavetrainer/weights/__init__.py +0 -0
- {wavetrainer-0.0.45 → wavetrainer-0.0.46}/wavetrainer/weights/class_weights.py +0 -0
- {wavetrainer-0.0.45 → wavetrainer-0.0.46}/wavetrainer/weights/combined_weights.py +0 -0
- {wavetrainer-0.0.45 → wavetrainer-0.0.46}/wavetrainer/weights/exponential_weights.py +0 -0
- {wavetrainer-0.0.45 → wavetrainer-0.0.46}/wavetrainer/weights/linear_weights.py +0 -0
- {wavetrainer-0.0.45 → wavetrainer-0.0.46}/wavetrainer/weights/noop_weights.py +0 -0
- {wavetrainer-0.0.45 → wavetrainer-0.0.46}/wavetrainer/weights/sigmoid_weights.py +0 -0
- {wavetrainer-0.0.45 → wavetrainer-0.0.46}/wavetrainer/weights/weights.py +0 -0
- {wavetrainer-0.0.45 → wavetrainer-0.0.46}/wavetrainer/weights/weights_router.py +0 -0
- {wavetrainer-0.0.45 → wavetrainer-0.0.46}/wavetrainer/windower/__init__.py +0 -0
- {wavetrainer-0.0.45 → wavetrainer-0.0.46}/wavetrainer/windower/windower.py +0 -0
- {wavetrainer-0.0.45 → wavetrainer-0.0.46}/wavetrainer.egg-info/dependency_links.txt +0 -0
- {wavetrainer-0.0.45 → wavetrainer-0.0.46}/wavetrainer.egg-info/not-zip-safe +0 -0
- {wavetrainer-0.0.45 → wavetrainer-0.0.46}/wavetrainer.egg-info/requires.txt +0 -0
- {wavetrainer-0.0.45 → wavetrainer-0.0.46}/wavetrainer.egg-info/top_level.txt +0 -0
setup.py

@@ -23,7 +23,7 @@ def install_requires() -> typing.List[str]:
 
 setup(
     name='wavetrainer',
-    version='0.0.45',
+    version='0.0.46',
     description='A library for automatically finding the optimal model within feature and hyperparameter space.',
     long_description=long_description,
     long_description_content_type='text/markdown',
wavetrainer/create.py

@@ -15,6 +15,7 @@ def create(
     dt_column: str | None = None,
     max_train_timeout: datetime.timedelta | None = None,
     cutoff_dt: datetime.datetime | None = None,
+    embedding_cols: list[list[str]] | None = None,
 ) -> Trainer:
     """Create a trainer."""
     return Trainer(
@@ -25,4 +26,5 @@ def create(
         dt_column=dt_column,
         max_train_timeout=max_train_timeout,
         cutoff_dt=cutoff_dt,
+        embedding_cols=embedding_cols,
     )
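The net effect of the create.py change is that groups of embedding columns can now be passed straight through to the Trainer. A minimal usage sketch, assuming the leading arguments of create() (not shown in this diff) include an output folder, and using purely illustrative column names:

```python
from wavetrainer.create import create

# Hypothetical embedding columns: each inner list is one embedding whose
# dimensions should be compressed together by the new PCAReducer.
embedding_cols = [
    ["title_emb_0", "title_emb_1", "title_emb_2"],
    ["body_emb_0", "body_emb_1", "body_emb_2"],
]

trainer = create(
    "model_output",                 # assumed output-folder argument; earlier parameters are not shown in this diff
    dt_column="date",
    embedding_cols=embedding_cols,
)
```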
wavetrainer/model/model_router.py

@@ -81,11 +81,11 @@ class ModelRouter(Model):
     def set_options(
         self, trial: optuna.Trial | optuna.trial.FrozenTrial, df: pd.DataFrame
     ) -> None:
-        …
-        …
-        …
-        …
-        …]()
+        model_name = trial.suggest_categorical(
+            "model", [k for k, v in _MODELS.items() if v.supports_x(df)]
+        )
+        print(f"Using {model_name} model")
+        model = _MODELS[model_name]()
         model.set_options(trial, df)
         self._model = model
 
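For context, the new code asks Optuna to choose a model only from the candidates whose supports_x() accepts the current frame. A small sketch of the same pattern under hypothetical stand-in classes (not the library's actual registry):

```python
import optuna
import pandas as pd

class Small:
    @classmethod
    def supports_x(cls, df: pd.DataFrame) -> bool:
        return len(df) <= 10_000  # illustrative capability check

class Large:
    @classmethod
    def supports_x(cls, df: pd.DataFrame) -> bool:
        return True

_MODELS = {"small": Small, "large": Large}

def objective(trial: optuna.Trial) -> float:
    df = pd.DataFrame({"x": range(100)})
    # Only offer models that report they can handle this frame.
    name = trial.suggest_categorical(
        "model", [k for k, v in _MODELS.items() if v.supports_x(df)]
    )
    model = _MODELS[name]()
    return 0.0  # placeholder score

optuna.create_study().optimize(objective, n_trials=3)
```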
wavetrainer/model/xgboost/xgboost_model.py

@@ -109,7 +109,9 @@ class XGBoostModel(Model):
     def feature_importances(self) -> dict[str, float]:
         bst = self._provide_xgboost()
         try:
-            …
+            score_dict = bst.get_booster().get_score(importance_type="weight")  # type: ignore
+            total = sum(score_dict.values())  # type: ignore
+            return {k: v / total for k, v in score_dict.items()}  # type: ignore
         except XGBoostError as exc:
             print(str(exc))
             return {}
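The new body normalizes XGBoost's raw "weight" scores (the number of splits that use each feature) so the reported importances sum to 1. A small self-contained sketch of that normalization, using made-up scores instead of a trained booster:

```python
# Raw get_score(importance_type="weight") output is an absolute split count
# per feature; dividing by the total turns it into a share of all splits.
score_dict = {"f0": 30.0, "f1": 10.0, "f2": 60.0}  # illustrative values
total = sum(score_dict.values())
normalized = {k: v / total for k, v in score_dict.items()}
print(normalized)  # {'f0': 0.3, 'f1': 0.1, 'f2': 0.6}
```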
wavetrainer/reducer/combined_reducer.py

@@ -14,6 +14,7 @@ from .constant_reducer import ConstantReducer
 from .correlation_reducer import CorrelationReducer
 from .duplicate_reducer import DuplicateReducer
 from .nonnumeric_reducer import NonNumericReducer
+from .pca_reducer import PCAReducer
 from .reducer import Reducer
 from .select_by_single_feature_performance_reducer import \
     SelectBySingleFeaturePerformanceReducer
@@ -29,12 +30,14 @@ class CombinedReducer(Reducer):
     """A reducer that combines a series of reducers."""
 
     # pylint: disable=too-many-positional-arguments,too-many-arguments
+    _folder: str | None
 
-    def __init__(self):
+    def __init__(self, embedding_cols: list[list[str]] | None):
         super().__init__()
         self._reducers = [
             UnseenReducer(),
             NonNumericReducer(),
+            PCAReducer(embedding_cols),
             ConstantReducer(),
             DuplicateReducer(),
             CorrelationReducer(),
@@ -42,6 +45,7 @@ class CombinedReducer(Reducer):
             # SelectBySingleFeaturePerformanceReducer(),
         ]
         self._folder = None
+        self._embedding_cols = embedding_cols
 
     @classmethod
     def name(cls) -> str:
@@ -74,6 +78,8 @@ class CombinedReducer(Reducer):
             self._reducers.append(SmartCorrelationReducer())
         elif reducer_name == SelectBySingleFeaturePerformanceReducer.name():
             self._reducers.append(SelectBySingleFeaturePerformanceReducer())
+        elif reducer_name == PCAReducer.name():
+            self._reducers.append(PCAReducer(self._embedding_cols))
         for reducer in self._reducers:
             reducer.load(folder)
         self._folder = folder
wavetrainer/reducer/pca_reducer.py (new file)

@@ -0,0 +1,78 @@
+"""A reducer that reduces embeddings using PCA."""
+
+# pylint: disable=too-many-arguments,too-many-positional-arguments
+import hashlib
+import os
+from typing import Self
+
+import joblib  # type: ignore
+import optuna
+import pandas as pd
+from sklearn.decomposition import PCA  # type: ignore
+
+from .reducer import Reducer
+
+_PCA_THRESHOLD = "pca_threshold"
+
+
+class PCAReducer(Reducer):
+    """A class that reduces embeddings using PCA."""
+
+    _pcas: dict[str, PCA]
+
+    @classmethod
+    def name(cls) -> str:
+        return "pca"
+
+    def __init__(self, embedding_cols: list[list[str]] | None):
+        super().__init__()
+        self._embedding_cols = embedding_cols if embedding_cols is not None else []
+        self._pcas = {}
+
+    @property
+    def _embedding_dict(self) -> dict[str, list[str]]:
+        return {
+            hashlib.sha256("|".join(sorted(x)).encode()).hexdigest(): x
+            for x in self._embedding_cols
+        }
+
+    def set_options(
+        self, trial: optuna.Trial | optuna.trial.FrozenTrial, df: pd.DataFrame
+    ) -> None:
+        if self._embedding_cols is None:
+            return
+        threshold = trial.suggest_float(_PCA_THRESHOLD, 0.7, 0.99)
+        self._pcas = {k: PCA(n_components=threshold) for k in self._embedding_dict}
+
+    def load(self, folder: str) -> None:
+        for k in self._embedding_dict:
+            self._pcas[k] = joblib.load(os.path.join(folder, f"{k}_pca_reducer.joblib"))
+
+    def save(self, folder: str, trial: optuna.Trial | optuna.trial.FrozenTrial) -> None:
+        for k, v in self._pcas.items():
+            joblib.dump(v, os.path.join(folder, f"{k}_pca_reducer.joblib"))
+
+    def fit(
+        self,
+        df: pd.DataFrame,
+        y: pd.Series | pd.DataFrame | None = None,
+        w: pd.Series | None = None,
+        eval_x: pd.DataFrame | None = None,
+        eval_y: pd.Series | pd.DataFrame | None = None,
+    ) -> Self:
+        if self._embedding_cols is None:
+            return self
+        for k, v in self._pcas.items():
+            v.fit(df[self._embedding_dict[k]])
+        return self
+
+    def transform(self, df: pd.DataFrame) -> pd.DataFrame:
+        if self._embedding_cols is None:
+            return df
+        for k, v in self._pcas.items():
+            cols = self._embedding_dict[k]
+            compressed_embedding = v.transform(df[cols])
+            embedding_len = compressed_embedding.shape[0]
+            df[cols[:embedding_len]] = compressed_embedding
+            df = df.drop(columns=cols[embedding_len:])
+        return df
wavetrainer/selector/selector.py

@@ -83,18 +83,18 @@ class Selector(Params, Fit):
         current_features = [list(feature_importances.keys())[0]]
         current_features = current_features[:required_features]
         print(
-            f"Current Features:\n{pd.Series(data=…
+            f"Current Features:\n{pd.Series(data=[feature_importances[x] for x in current_features], index=current_features)}\n"
         )
 
         n_features = len(current_features)
         for i in range(self._steps):
-            print(
-                f"Recursive Feature Elimination Step {i}, current features: {len(current_features)}"
-            )
             ratio_diff = 1.0 - self._feature_ratio
             ratio_step = ratio_diff / float(self._steps)
             current_ratio = 1.0 - (ratio_step * i)
             n_features = max(1, int(total_columns * current_ratio))
+            print(
+                f"Recursive Feature Elimination Step {i}, current features: {len(current_features)} required features: {n_features}"
+            )
             if n_features >= len(current_features):
                 continue
 
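The reordering matters because n_features is only computed inside the loop from the per-step ratio, so the old message could not report it; the new message shows both the surviving feature count and the step's target. A short sketch of the schedule those context lines implement, with illustrative values for feature_ratio, steps and total_columns:

```python
# Illustrative parameters: shrink toward 50% of features over 4 elimination steps.
feature_ratio, steps, total_columns = 0.5, 4, 100

ratio_diff = 1.0 - feature_ratio
ratio_step = ratio_diff / float(steps)
for i in range(steps):
    current_ratio = 1.0 - (ratio_step * i)
    n_features = max(1, int(total_columns * current_ratio))
    print(i, n_features)  # prints: 0 100, 1 87, 2 75, 3 62
```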
wavetrainer/trainer.py

@@ -63,6 +63,7 @@ class Trainer(Fit):
         dt_column: str | None = None,
         max_train_timeout: datetime.timedelta | None = None,
         cutoff_dt: datetime.datetime | None = None,
+        embedding_cols: list[list[str]] | None = None,
     ):
         tqdm.tqdm.pandas()
 
@@ -153,6 +154,7 @@ class Trainer(Fit):
         self._dt_column = dt_column
         self._max_train_timeout = max_train_timeout
         self._cutoff_dt = cutoff_dt
+        self._embedding_cols = embedding_cols
 
     def _provide_study(self, column: str) -> optuna.Study:
         storage_name = f"sqlite:///{self._folder}/{column}/{_STUDYDB_FILENAME}"
@@ -247,7 +249,7 @@ class Trainer(Fit):
 
             # Perform common reductions
             start_reducer = time.time()
-            reducer = CombinedReducer()
+            reducer = CombinedReducer(self._embedding_cols)
             reducer.set_options(trial, x)
             x_train = reducer.fit_transform(x_train, y=y_train)
             x_test = reducer.transform(x_test)
@@ -415,7 +417,14 @@ class Trainer(Fit):
                         break
                 if found:
                     last_processed_dt = test_dt
-                    _fit(…
+                    _fit(
+                        study.best_trial,
+                        test_df.copy(),
+                        test_series,
+                        True,
+                        test_idx,
+                        True,
+                    )
                     continue
                 if (
                     last_processed_dt is not None
@@ -431,7 +440,7 @@ class Trainer(Fit):
         def validate_objctive(
             trial: optuna.Trial, idx: datetime.datetime, series: pd.Series
         ) -> float:
-            return _fit(trial, test_df, series, False, idx, False)
+            return _fit(trial, test_df.copy(), series, False, idx, False)
 
         study.optimize(
             functools.partial(
@@ -445,7 +454,9 @@ class Trainer(Fit):
             else:
                 break
 
-        _fit(…
+        _fit(
+            study.best_trial, test_df.copy(), test_series, True, test_idx, True
+        )
         last_processed_dt = test_idx
 
         if isinstance(y, pd.Series):
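Several _fit call sites now pass test_df.copy() rather than test_df. The diff does not state the motivation, but a plausible reading is that the new reducer path writes into and drops columns of the frame it receives, so copying keeps the shared test_df intact between trials. A tiny pandas sketch of the general hazard, with hypothetical names:

```python
import pandas as pd

def transform(df: pd.DataFrame) -> pd.DataFrame:
    # Mimics a reducer transform that writes into the frame it was given.
    df["emb_0"] = 0.0
    return df

shared = pd.DataFrame({"emb_0": [1.0, 2.0]})
transform(shared.copy())          # the copy absorbs the in-place write
print(shared["emb_0"].tolist())   # [1.0, 2.0] — caller's frame untouched
transform(shared)                 # without the copy, the caller's frame is modified
print(shared["emb_0"].tolist())   # [0.0, 0.0]
```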
@@ -503,7 +514,7 @@ class Trainer(Fit):
             date_str = dates[-1].isoformat()
             folder = os.path.join(column_path, date_str)
 
-            reducer = CombinedReducer()
+            reducer = CombinedReducer(self._embedding_cols)
             reducer.load(folder)
 
             model = ModelRouter()
wavetrainer.egg-info/SOURCES.txt

@@ -47,6 +47,7 @@ wavetrainer/reducer/correlation_reducer.py
 wavetrainer/reducer/duplicate_reducer.py
 wavetrainer/reducer/non_categorical_numeric_columns.py
 wavetrainer/reducer/nonnumeric_reducer.py
+wavetrainer/reducer/pca_reducer.py
 wavetrainer/reducer/reducer.py
 wavetrainer/reducer/select_by_single_feature_performance_reducer.py
 wavetrainer/reducer/smart_correlation_reducer.py