wavetrainer 0.0.37__tar.gz → 0.0.39__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {wavetrainer-0.0.37/wavetrainer.egg-info → wavetrainer-0.0.39}/PKG-INFO +1 -1
- {wavetrainer-0.0.37 → wavetrainer-0.0.39}/setup.py +1 -1
- {wavetrainer-0.0.37 → wavetrainer-0.0.39}/wavetrainer/__init__.py +1 -1
- {wavetrainer-0.0.37 → wavetrainer-0.0.39}/wavetrainer/model/catboost_model.py +3 -2
- {wavetrainer-0.0.37 → wavetrainer-0.0.39}/wavetrainer/model/tabpfn_model.py +7 -1
- {wavetrainer-0.0.37 → wavetrainer-0.0.39}/wavetrainer/reducer/base_selector_reducer.py +9 -3
- {wavetrainer-0.0.37 → wavetrainer-0.0.39}/wavetrainer/reducer/combined_reducer.py +11 -1
- wavetrainer-0.0.39/wavetrainer/reducer/select_by_single_feature_performance_reducer.py +60 -0
- {wavetrainer-0.0.37 → wavetrainer-0.0.39}/wavetrainer/reducer/smart_correlation_reducer.py +4 -0
- {wavetrainer-0.0.37 → wavetrainer-0.0.39}/wavetrainer/trainer.py +18 -4
- {wavetrainer-0.0.37 → wavetrainer-0.0.39/wavetrainer.egg-info}/PKG-INFO +1 -1
- {wavetrainer-0.0.37 → wavetrainer-0.0.39}/wavetrainer.egg-info/SOURCES.txt +1 -0
- {wavetrainer-0.0.37 → wavetrainer-0.0.39}/LICENSE +0 -0
- {wavetrainer-0.0.37 → wavetrainer-0.0.39}/MANIFEST.in +0 -0
- {wavetrainer-0.0.37 → wavetrainer-0.0.39}/README.md +0 -0
- {wavetrainer-0.0.37 → wavetrainer-0.0.39}/requirements.txt +0 -0
- {wavetrainer-0.0.37 → wavetrainer-0.0.39}/setup.cfg +0 -0
- {wavetrainer-0.0.37 → wavetrainer-0.0.39}/tests/__init__.py +0 -0
- {wavetrainer-0.0.37 → wavetrainer-0.0.39}/tests/model/__init__.py +0 -0
- {wavetrainer-0.0.37 → wavetrainer-0.0.39}/tests/model/catboost_kwargs_test.py +0 -0
- {wavetrainer-0.0.37 → wavetrainer-0.0.39}/tests/trainer_test.py +0 -0
- {wavetrainer-0.0.37 → wavetrainer-0.0.39}/wavetrainer/calibrator/__init__.py +0 -0
- {wavetrainer-0.0.37 → wavetrainer-0.0.39}/wavetrainer/calibrator/calibrator.py +0 -0
- {wavetrainer-0.0.37 → wavetrainer-0.0.39}/wavetrainer/calibrator/calibrator_router.py +0 -0
- {wavetrainer-0.0.37 → wavetrainer-0.0.39}/wavetrainer/calibrator/mapie_calibrator.py +0 -0
- {wavetrainer-0.0.37 → wavetrainer-0.0.39}/wavetrainer/calibrator/vennabers_calibrator.py +0 -0
- {wavetrainer-0.0.37 → wavetrainer-0.0.39}/wavetrainer/create.py +0 -0
- {wavetrainer-0.0.37 → wavetrainer-0.0.39}/wavetrainer/exceptions.py +0 -0
- {wavetrainer-0.0.37 → wavetrainer-0.0.39}/wavetrainer/fit.py +0 -0
- {wavetrainer-0.0.37 → wavetrainer-0.0.39}/wavetrainer/model/__init__.py +0 -0
- {wavetrainer-0.0.37 → wavetrainer-0.0.39}/wavetrainer/model/catboost_classifier_wrap.py +0 -0
- {wavetrainer-0.0.37 → wavetrainer-0.0.39}/wavetrainer/model/catboost_kwargs.py +0 -0
- {wavetrainer-0.0.37 → wavetrainer-0.0.39}/wavetrainer/model/catboost_regressor_wrap.py +0 -0
- {wavetrainer-0.0.37 → wavetrainer-0.0.39}/wavetrainer/model/model.py +0 -0
- {wavetrainer-0.0.37 → wavetrainer-0.0.39}/wavetrainer/model/model_router.py +0 -0
- {wavetrainer-0.0.37 → wavetrainer-0.0.39}/wavetrainer/model_type.py +0 -0
- {wavetrainer-0.0.37 → wavetrainer-0.0.39}/wavetrainer/params.py +0 -0
- {wavetrainer-0.0.37 → wavetrainer-0.0.39}/wavetrainer/reducer/__init__.py +0 -0
- {wavetrainer-0.0.37 → wavetrainer-0.0.39}/wavetrainer/reducer/constant_reducer.py +0 -0
- {wavetrainer-0.0.37 → wavetrainer-0.0.39}/wavetrainer/reducer/correlation_reducer.py +0 -0
- {wavetrainer-0.0.37 → wavetrainer-0.0.39}/wavetrainer/reducer/duplicate_reducer.py +0 -0
- {wavetrainer-0.0.37 → wavetrainer-0.0.39}/wavetrainer/reducer/non_categorical_numeric_columns.py +0 -0
- {wavetrainer-0.0.37 → wavetrainer-0.0.39}/wavetrainer/reducer/nonnumeric_reducer.py +0 -0
- {wavetrainer-0.0.37 → wavetrainer-0.0.39}/wavetrainer/reducer/reducer.py +0 -0
- {wavetrainer-0.0.37 → wavetrainer-0.0.39}/wavetrainer/reducer/unseen_reducer.py +0 -0
- {wavetrainer-0.0.37 → wavetrainer-0.0.39}/wavetrainer/selector/__init__.py +0 -0
- {wavetrainer-0.0.37 → wavetrainer-0.0.39}/wavetrainer/selector/selector.py +0 -0
- {wavetrainer-0.0.37 → wavetrainer-0.0.39}/wavetrainer/weights/__init__.py +0 -0
- {wavetrainer-0.0.37 → wavetrainer-0.0.39}/wavetrainer/weights/class_weights.py +0 -0
- {wavetrainer-0.0.37 → wavetrainer-0.0.39}/wavetrainer/weights/combined_weights.py +0 -0
- {wavetrainer-0.0.37 → wavetrainer-0.0.39}/wavetrainer/weights/exponential_weights.py +0 -0
- {wavetrainer-0.0.37 → wavetrainer-0.0.39}/wavetrainer/weights/linear_weights.py +0 -0
- {wavetrainer-0.0.37 → wavetrainer-0.0.39}/wavetrainer/weights/noop_weights.py +0 -0
- {wavetrainer-0.0.37 → wavetrainer-0.0.39}/wavetrainer/weights/sigmoid_weights.py +0 -0
- {wavetrainer-0.0.37 → wavetrainer-0.0.39}/wavetrainer/weights/weights.py +0 -0
- {wavetrainer-0.0.37 → wavetrainer-0.0.39}/wavetrainer/weights/weights_router.py +0 -0
- {wavetrainer-0.0.37 → wavetrainer-0.0.39}/wavetrainer/windower/__init__.py +0 -0
- {wavetrainer-0.0.37 → wavetrainer-0.0.39}/wavetrainer/windower/windower.py +0 -0
- {wavetrainer-0.0.37 → wavetrainer-0.0.39}/wavetrainer.egg-info/dependency_links.txt +0 -0
- {wavetrainer-0.0.37 → wavetrainer-0.0.39}/wavetrainer.egg-info/not-zip-safe +0 -0
- {wavetrainer-0.0.37 → wavetrainer-0.0.39}/wavetrainer.egg-info/requires.txt +0 -0
- {wavetrainer-0.0.37 → wavetrainer-0.0.39}/wavetrainer.egg-info/top_level.txt +0 -0
{wavetrainer-0.0.37 → wavetrainer-0.0.39}/setup.py

```diff
@@ -23,7 +23,7 @@ def install_requires() -> typing.List[str]:
 
 setup(
     name='wavetrainer',
-    version='0.0.37',
+    version='0.0.39',
     description='A library for automatically finding the optimal model within feature and hyperparameter space.',
     long_description=long_description,
     long_description_content_type='text/markdown',
```
{wavetrainer-0.0.37 → wavetrainer-0.0.39}/wavetrainer/model/catboost_model.py

```diff
@@ -148,7 +148,7 @@ class CatboostModel(Model):
         )
         catboost = self._provide_catboost()
         catboost.save_model(os.path.join(folder, _MODEL_FILENAME))
-        trial.
+        trial.set_user_attr(_BEST_ITERATION_KEY, self._best_iteration)
 
     def fit(
         self,
@@ -219,9 +219,10 @@ class CatboostModel(Model):
             best_iteration if best_iteration is not None else self._iterations
         )
         logging.info(
-            "Creating catboost model with depth %d, boosting type %s",
+            "Creating catboost model with depth %d, boosting type %s, best iteration %d",
             self._depth,
             self._boosting_type,
+            best_iteration,
         )
         match self._model_type:
             case ModelType.BINARY:
```
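The `trial.set_user_attr` call above stores the trained model's best iteration on the Optuna trial so it can be read back after optimization. A minimal, hypothetical sketch of that round-trip using Optuna's public API (the objective, study, and key name here are illustrative, not taken from wavetrainer):

```python
import optuna

_BEST_ITERATION_KEY = "best_iteration"  # assumed key; the real constant lives in catboost_model.py


def objective(trial: optuna.Trial) -> float:
    # Train a model here; pretend it stopped early at iteration 123.
    best_iteration = 123
    # Persist the value on the trial so it survives alongside the trial record.
    trial.set_user_attr(_BEST_ITERATION_KEY, best_iteration)
    return 0.0  # placeholder objective value


study = optuna.create_study()
study.optimize(objective, n_trials=1)
# User attributes can later be read back from the frozen trial.
print(study.best_trial.user_attrs[_BEST_ITERATION_KEY])
```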
{wavetrainer-0.0.37 → wavetrainer-0.0.39}/wavetrainer/model/tabpfn_model.py

```diff
@@ -2,6 +2,7 @@
 # pylint: disable=duplicate-code,too-many-arguments,too-many-positional-arguments
 
 import json
+import logging
 import os
 import pickle
 from typing import Any, Self
@@ -13,6 +14,7 @@ import torch
 from tabpfn_extensions.post_hoc_ensembles.sklearn_interface import (  # type: ignore
     AutoTabPFNClassifier, AutoTabPFNRegressor)
 
+from ..exceptions import WavetrainException
 from ..model_type import ModelType, determine_model_type
 from .model import PREDICTION_COLUMN, PROBABILITY_COLUMN_PREFIX, Model
 
@@ -104,7 +106,11 @@ class TabPFNModel(Model):
             raise ValueError("y is null.")
         self._model_type = determine_model_type(y)
         tabpfn = self._provide_tabpfn()
-        tabpfn.fit(df, y)
+        try:
+            tabpfn.fit(df, y)
+        except ValueError as exc:
+            logging.warning(str(exc))
+            raise WavetrainException() from exc
         return self
 
     def transform(self, df: pd.DataFrame) -> pd.DataFrame:
```
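The pattern here translates a third-party `ValueError` into the package's own `WavetrainException`, so callers can treat "this configuration cannot be fit" as an expected, uniformly handled case. A self-contained sketch of that idea (the model class and helper below are stand-ins, not wavetrainer code):

```python
import logging


class WavetrainException(Exception):
    """Stand-in for wavetrainer's exception type (the real one lives in wavetrainer/exceptions.py)."""


class FlakyModel:
    def fit(self, df, y):
        raise ValueError("too many samples for this model")


def fit_safely(model, df, y):
    # Translate the library's ValueError into the package-level exception,
    # mirroring the try/except added to TabPFNModel.fit above.
    try:
        model.fit(df, y)
    except ValueError as exc:
        logging.warning(str(exc))
        raise WavetrainException() from exc


try:
    fit_safely(FlakyModel(), df=None, y=None)
except WavetrainException:
    print("skipping this configuration")
```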
{wavetrainer-0.0.37 → wavetrainer-0.0.39}/wavetrainer/reducer/base_selector_reducer.py

```diff
@@ -56,8 +56,8 @@ class BaseSelectorReducer(Reducer):
         if len(df.columns) <= 1:
             return self
         try:
-            self._base_selector.fit(df)  # type: ignore
-        except ValueError as exc:
+            self._base_selector.fit(df, y=y)  # type: ignore
+        except (ValueError, AttributeError) as exc:
             logging.warning(str(exc))
             if self.should_raise():
                 raise WavetrainException() from exc
@@ -66,4 +66,10 @@ class BaseSelectorReducer(Reducer):
     def transform(self, df: pd.DataFrame) -> pd.DataFrame:
         if len(df.columns) <= 1:
             return df
-        return self._base_selector.transform(df)
+        try:
+            return self._base_selector.transform(df)
+        except (ValueError, AttributeError) as exc:
+            logging.warning(str(exc))
+            if self.should_raise():
+                raise WavetrainException() from exc
+        return df
```
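The `fit` change forwards the target to the wrapped selector. Assuming the wrapped selectors are feature_engine transformers (as the new reducer further down in this diff suggests), that is safe across the board: unsupervised selectors follow the sklearn convention of accepting and ignoring `y`, while supervised ones require it. A small illustrative sketch, with example data and selectors that are not wavetrainer's own:

```python
import pandas as pd
from feature_engine.selection import DropConstantFeatures, DropDuplicateFeatures

X = pd.DataFrame({"a": [1, 2, 3, 4], "b": [7, 7, 7, 7], "dup": [1, 2, 3, 4]})
y = pd.Series([0, 1, 0, 1])

# Unsupervised selectors accept y and ignore it, so forwarding y=y is harmless for them...
print(DropConstantFeatures().fit_transform(X, y).columns.tolist())   # drops the constant "b"
print(DropDuplicateFeatures().fit_transform(X, y).columns.tolist())  # drops the duplicate "dup"
# ...while supervised selectors (e.g. SelectBySingleFeaturePerformance) need y at fit time.
```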
{wavetrainer-0.0.37 → wavetrainer-0.0.39}/wavetrainer/reducer/combined_reducer.py

```diff
@@ -3,6 +3,7 @@
 import json
 import logging
 import os
+import time
 from typing import Self
 
 import optuna
@@ -13,6 +14,8 @@ from .correlation_reducer import CorrelationReducer
 from .duplicate_reducer import DuplicateReducer
 from .nonnumeric_reducer import NonNumericReducer
 from .reducer import Reducer
+from .select_by_single_feature_performance_reducer import \
+    SelectBySingleFeaturePerformanceReducer
 from .smart_correlation_reducer import SmartCorrelationReducer
 from .unseen_reducer import UnseenReducer
 
@@ -35,6 +38,7 @@ class CombinedReducer(Reducer):
             DuplicateReducer(),
             CorrelationReducer(),
             SmartCorrelationReducer(),
+            # SelectBySingleFeaturePerformanceReducer(),
         ]
         self._folder = None
 
@@ -67,6 +71,8 @@ class CombinedReducer(Reducer):
                 self._reducers.append(UnseenReducer())
             elif reducer_name == SmartCorrelationReducer.name():
                 self._reducers.append(SmartCorrelationReducer())
+            elif reducer_name == SelectBySingleFeaturePerformanceReducer.name():
+                self._reducers.append(SelectBySingleFeaturePerformanceReducer())
         for reducer in self._reducers:
             reducer.load(folder)
         self._folder = folder
@@ -94,12 +100,16 @@ class CombinedReducer(Reducer):
     ) -> Self:
         removed_columns_dict = {}
         for reducer in self._reducers:
+            start_reducer = time.time()
             before_columns = set(df.columns.values)
-            df = reducer.fit_transform(df)
+            df = reducer.fit_transform(df, y=y)
             after_columns = set(df.columns.values)
             removed_columns = before_columns.difference(after_columns)
             if removed_columns:
                 removed_columns_dict[reducer.name()] = list(removed_columns)
+            logging.info(
+                "%s reducer took %f", reducer.name(), time.time() - start_reducer
+            )
         if self._folder is not None:
             with open(
                 os.path.join(self._folder, _REMOVED_COLUMNS_FILE), encoding="utf8"
```
wavetrainer-0.0.39/wavetrainer/reducer/select_by_single_feature_performance_reducer.py (new file)

```diff
@@ -0,0 +1,60 @@
+"""A reducer that removes features by their single performance via further heuristics."""
+
+# pylint: disable=too-many-arguments,too-many-positional-arguments
+from typing import Self
+
+import optuna
+import pandas as pd
+from feature_engine.selection import SelectBySingleFeaturePerformance
+from sklearn.ensemble import RandomForestClassifier  # type: ignore
+
+from ..model_type import ModelType, determine_model_type
+from .base_selector_reducer import BaseSelectorReducer
+
+_SINGLE_FEATURE_PERFORMANCE_REDUCER_FILENAME = (
+    "single_feature_performance_reducer.joblib"
+)
+_SINGLE_FEATURE_PERFORMANCE_REDUCER_THRESHOLD = (
+    "single_feature_performance_reducer_threshold"
+)
+
+
+class SelectBySingleFeaturePerformanceReducer(BaseSelectorReducer):
+    """A class that removes smart correlated values from a dataset."""
+
+    def __init__(self) -> None:
+        self._singlefeatureperformance_selector = SelectBySingleFeaturePerformance(
+            RandomForestClassifier(random_state=42, n_jobs=-1), scoring="accuracy", cv=1
+        )
+        super().__init__(
+            self._singlefeatureperformance_selector,
+            _SINGLE_FEATURE_PERFORMANCE_REDUCER_FILENAME,
+        )
+
+    @classmethod
+    def name(cls) -> str:
+        return "single_feature_performance"
+
+    @classmethod
+    def should_raise(cls) -> bool:
+        return False
+
+    def set_options(
+        self, trial: optuna.Trial | optuna.trial.FrozenTrial, df: pd.DataFrame
+    ) -> None:
+        self._singlefeatureperformance_selector.threshold = trial.suggest_float(
+            _SINGLE_FEATURE_PERFORMANCE_REDUCER_THRESHOLD, 0.1, 0.9
+        )
+
+    def fit(
+        self,
+        df: pd.DataFrame,
+        y: pd.Series | pd.DataFrame | None = None,
+        w: pd.Series | None = None,
+        eval_x: pd.DataFrame | None = None,
+        eval_y: pd.Series | pd.DataFrame | None = None,
+    ) -> Self:
+        self._singlefeatureperformance_selector.scoring = (
+            "r2" if determine_model_type(y) == ModelType.REGRESSION else "accuracy"  # type: ignore
+        )
+        return super().fit(df, y=y, w=w, eval_x=eval_x, eval_y=eval_y)
```
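For context, the new reducer is a thin wrapper around feature_engine's `SelectBySingleFeaturePerformance`, which scores each feature with a single-feature model and drops the features scoring below a threshold. A standalone sketch of the underlying selector; the data, threshold, and cross-validation settings here are illustrative, not the values wavetrainer uses:

```python
import numpy as np
import pandas as pd
from feature_engine.selection import SelectBySingleFeaturePerformance
from sklearn.ensemble import RandomForestClassifier

rng = np.random.default_rng(0)
X = pd.DataFrame(
    {
        "signal": rng.normal(size=200),
        "noise_1": rng.normal(size=200),
        "noise_2": rng.normal(size=200),
    }
)
y = (X["signal"] > 0).astype(int)  # target driven by a single feature

selector = SelectBySingleFeaturePerformance(
    RandomForestClassifier(n_estimators=50, random_state=42, n_jobs=-1),
    scoring="accuracy",
    cv=3,
    threshold=0.7,  # features scoring below this single-feature accuracy get dropped
)
selector.fit(X, y)
print(selector.feature_performance_)  # per-feature cross-validated scores
print(selector.features_to_drop_)     # likely the two noise columns
X_reduced = selector.transform(X)
```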
{wavetrainer-0.0.37 → wavetrainer-0.0.39}/wavetrainer/reducer/smart_correlation_reducer.py

```diff
@@ -29,6 +29,10 @@ class SmartCorrelationReducer(BaseSelectorReducer):
     def name(cls) -> str:
         return "smart_correlation"
 
+    @classmethod
+    def should_raise(cls) -> bool:
+        return False
+
     def set_options(
         self, trial: optuna.Trial | optuna.trial.FrozenTrial, df: pd.DataFrame
     ) -> None:
```
{wavetrainer-0.0.37 → wavetrainer-0.0.39}/wavetrainer/trainer.py

```diff
@@ -6,6 +6,7 @@ import json
 import logging
 import os
 import pickle
+import time
 from typing import Self
 
 import optuna
@@ -231,6 +232,7 @@ class Trainer(Fit):
 
         try:
             # Window the data
+            start_windower = time.time()
             windower = Windower(self._dt_column)
             windower.set_options(trial, x)
             x_train = windower.fit_transform(x_train)
@@ -240,25 +242,31 @@ class Trainer(Fit):
                 os.removedirs(folder)
                 logging.warning("Y train only contains 1 unique datapoint.")
                 return _BAD_OUTPUT
+            logging.info("Windowing took %f", time.time() - start_windower)
 
             # Perform common reductions
+            start_reducer = time.time()
             reducer = CombinedReducer()
             reducer.set_options(trial, x)
-            x_train = reducer.fit_transform(x_train)
+            x_train = reducer.fit_transform(x_train, y=y_train)
             x_test = reducer.transform(x_test)
+            logging.info("Reducing took %f", time.time() - start_reducer)
 
             # Calculate the row weights
+            start_row_weights = time.time()
             weights = CombinedWeights()
             weights.set_options(trial, x)
             w = weights.fit(x_train, y=y_train).transform(y_train.to_frame())[
                 WEIGHTS_COLUMN
             ]
+            logging.info("Row weights took %f", time.time() - start_row_weights)
 
             # Create model
             model = ModelRouter()
             model.set_options(trial, x)
 
             # Train
+            start_train = time.time()
             selector = Selector(model)
             selector.set_options(trial, x)
             selector.fit(x_train, y=y_train, w=w, eval_x=x_test, eval_y=y_test)
@@ -267,11 +275,14 @@ class Trainer(Fit):
             x_pred = model.fit_transform(
                 x_train, y=y_train, w=w, eval_x=x_test, eval_y=y_test
             )
+            logging.info("Training took %f", time.time() - start_train)
 
             # Calibrate
+            start_calibrate = time.time()
             calibrator = CalibratorRouter(model)
             calibrator.set_options(trial, x)
             calibrator.fit(x_pred, y=y_train)
+            logging.info("Calibrating took %f", time.time() - start_calibrate)
 
             # Output
             y_pred = model.transform(x_test)
@@ -521,8 +532,11 @@ class Trainer(Fit):
             date_path = os.path.join(column_path, date_str)
             if not os.path.isdir(date_path):
                 continue
-            model = ModelRouter()
-            model.load(date_path)
-            feature_importances[date_str] = model.feature_importances
+            try:
+                model = ModelRouter()
+                model.load(date_path)
+                feature_importances[date_str] = model.feature_importances
+            except FileNotFoundError as exc:
+                logging.warning(str(exc))
 
         return feature_importances
```
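The trainer changes bracket each pipeline stage with `time.time()` calls and log the elapsed seconds. A generic alternative, shown only as a sketch and not as what wavetrainer does, is a small context manager that centralizes the start/stop bookkeeping:

```python
import logging
import time
from contextlib import contextmanager


@contextmanager
def log_duration(stage: str):
    """Log how long the wrapped block took, mirroring the diff's timing logs."""
    start = time.time()
    try:
        yield
    finally:
        logging.info("%s took %f", stage, time.time() - start)


# Usage, e.g. around a reduction step:
with log_duration("Reducing"):
    time.sleep(0.1)  # stand-in for reducer.fit_transform(...)
```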
{wavetrainer-0.0.37 → wavetrainer-0.0.39}/wavetrainer.egg-info/SOURCES.txt

```diff
@@ -42,6 +42,7 @@ wavetrainer/reducer/duplicate_reducer.py
 wavetrainer/reducer/non_categorical_numeric_columns.py
 wavetrainer/reducer/nonnumeric_reducer.py
 wavetrainer/reducer/reducer.py
+wavetrainer/reducer/select_by_single_feature_performance_reducer.py
 wavetrainer/reducer/smart_correlation_reducer.py
 wavetrainer/reducer/unseen_reducer.py
 wavetrainer/selector/__init__.py
```