wavetrainer 0.0.37__tar.gz → 0.0.39__tar.gz

This diff compares the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (62)
  1. {wavetrainer-0.0.37/wavetrainer.egg-info → wavetrainer-0.0.39}/PKG-INFO +1 -1
  2. {wavetrainer-0.0.37 → wavetrainer-0.0.39}/setup.py +1 -1
  3. {wavetrainer-0.0.37 → wavetrainer-0.0.39}/wavetrainer/__init__.py +1 -1
  4. {wavetrainer-0.0.37 → wavetrainer-0.0.39}/wavetrainer/model/catboost_model.py +3 -2
  5. {wavetrainer-0.0.37 → wavetrainer-0.0.39}/wavetrainer/model/tabpfn_model.py +7 -1
  6. {wavetrainer-0.0.37 → wavetrainer-0.0.39}/wavetrainer/reducer/base_selector_reducer.py +9 -3
  7. {wavetrainer-0.0.37 → wavetrainer-0.0.39}/wavetrainer/reducer/combined_reducer.py +11 -1
  8. wavetrainer-0.0.39/wavetrainer/reducer/select_by_single_feature_performance_reducer.py +60 -0
  9. {wavetrainer-0.0.37 → wavetrainer-0.0.39}/wavetrainer/reducer/smart_correlation_reducer.py +4 -0
  10. {wavetrainer-0.0.37 → wavetrainer-0.0.39}/wavetrainer/trainer.py +18 -4
  11. {wavetrainer-0.0.37 → wavetrainer-0.0.39/wavetrainer.egg-info}/PKG-INFO +1 -1
  12. {wavetrainer-0.0.37 → wavetrainer-0.0.39}/wavetrainer.egg-info/SOURCES.txt +1 -0
  13. {wavetrainer-0.0.37 → wavetrainer-0.0.39}/LICENSE +0 -0
  14. {wavetrainer-0.0.37 → wavetrainer-0.0.39}/MANIFEST.in +0 -0
  15. {wavetrainer-0.0.37 → wavetrainer-0.0.39}/README.md +0 -0
  16. {wavetrainer-0.0.37 → wavetrainer-0.0.39}/requirements.txt +0 -0
  17. {wavetrainer-0.0.37 → wavetrainer-0.0.39}/setup.cfg +0 -0
  18. {wavetrainer-0.0.37 → wavetrainer-0.0.39}/tests/__init__.py +0 -0
  19. {wavetrainer-0.0.37 → wavetrainer-0.0.39}/tests/model/__init__.py +0 -0
  20. {wavetrainer-0.0.37 → wavetrainer-0.0.39}/tests/model/catboost_kwargs_test.py +0 -0
  21. {wavetrainer-0.0.37 → wavetrainer-0.0.39}/tests/trainer_test.py +0 -0
  22. {wavetrainer-0.0.37 → wavetrainer-0.0.39}/wavetrainer/calibrator/__init__.py +0 -0
  23. {wavetrainer-0.0.37 → wavetrainer-0.0.39}/wavetrainer/calibrator/calibrator.py +0 -0
  24. {wavetrainer-0.0.37 → wavetrainer-0.0.39}/wavetrainer/calibrator/calibrator_router.py +0 -0
  25. {wavetrainer-0.0.37 → wavetrainer-0.0.39}/wavetrainer/calibrator/mapie_calibrator.py +0 -0
  26. {wavetrainer-0.0.37 → wavetrainer-0.0.39}/wavetrainer/calibrator/vennabers_calibrator.py +0 -0
  27. {wavetrainer-0.0.37 → wavetrainer-0.0.39}/wavetrainer/create.py +0 -0
  28. {wavetrainer-0.0.37 → wavetrainer-0.0.39}/wavetrainer/exceptions.py +0 -0
  29. {wavetrainer-0.0.37 → wavetrainer-0.0.39}/wavetrainer/fit.py +0 -0
  30. {wavetrainer-0.0.37 → wavetrainer-0.0.39}/wavetrainer/model/__init__.py +0 -0
  31. {wavetrainer-0.0.37 → wavetrainer-0.0.39}/wavetrainer/model/catboost_classifier_wrap.py +0 -0
  32. {wavetrainer-0.0.37 → wavetrainer-0.0.39}/wavetrainer/model/catboost_kwargs.py +0 -0
  33. {wavetrainer-0.0.37 → wavetrainer-0.0.39}/wavetrainer/model/catboost_regressor_wrap.py +0 -0
  34. {wavetrainer-0.0.37 → wavetrainer-0.0.39}/wavetrainer/model/model.py +0 -0
  35. {wavetrainer-0.0.37 → wavetrainer-0.0.39}/wavetrainer/model/model_router.py +0 -0
  36. {wavetrainer-0.0.37 → wavetrainer-0.0.39}/wavetrainer/model_type.py +0 -0
  37. {wavetrainer-0.0.37 → wavetrainer-0.0.39}/wavetrainer/params.py +0 -0
  38. {wavetrainer-0.0.37 → wavetrainer-0.0.39}/wavetrainer/reducer/__init__.py +0 -0
  39. {wavetrainer-0.0.37 → wavetrainer-0.0.39}/wavetrainer/reducer/constant_reducer.py +0 -0
  40. {wavetrainer-0.0.37 → wavetrainer-0.0.39}/wavetrainer/reducer/correlation_reducer.py +0 -0
  41. {wavetrainer-0.0.37 → wavetrainer-0.0.39}/wavetrainer/reducer/duplicate_reducer.py +0 -0
  42. {wavetrainer-0.0.37 → wavetrainer-0.0.39}/wavetrainer/reducer/non_categorical_numeric_columns.py +0 -0
  43. {wavetrainer-0.0.37 → wavetrainer-0.0.39}/wavetrainer/reducer/nonnumeric_reducer.py +0 -0
  44. {wavetrainer-0.0.37 → wavetrainer-0.0.39}/wavetrainer/reducer/reducer.py +0 -0
  45. {wavetrainer-0.0.37 → wavetrainer-0.0.39}/wavetrainer/reducer/unseen_reducer.py +0 -0
  46. {wavetrainer-0.0.37 → wavetrainer-0.0.39}/wavetrainer/selector/__init__.py +0 -0
  47. {wavetrainer-0.0.37 → wavetrainer-0.0.39}/wavetrainer/selector/selector.py +0 -0
  48. {wavetrainer-0.0.37 → wavetrainer-0.0.39}/wavetrainer/weights/__init__.py +0 -0
  49. {wavetrainer-0.0.37 → wavetrainer-0.0.39}/wavetrainer/weights/class_weights.py +0 -0
  50. {wavetrainer-0.0.37 → wavetrainer-0.0.39}/wavetrainer/weights/combined_weights.py +0 -0
  51. {wavetrainer-0.0.37 → wavetrainer-0.0.39}/wavetrainer/weights/exponential_weights.py +0 -0
  52. {wavetrainer-0.0.37 → wavetrainer-0.0.39}/wavetrainer/weights/linear_weights.py +0 -0
  53. {wavetrainer-0.0.37 → wavetrainer-0.0.39}/wavetrainer/weights/noop_weights.py +0 -0
  54. {wavetrainer-0.0.37 → wavetrainer-0.0.39}/wavetrainer/weights/sigmoid_weights.py +0 -0
  55. {wavetrainer-0.0.37 → wavetrainer-0.0.39}/wavetrainer/weights/weights.py +0 -0
  56. {wavetrainer-0.0.37 → wavetrainer-0.0.39}/wavetrainer/weights/weights_router.py +0 -0
  57. {wavetrainer-0.0.37 → wavetrainer-0.0.39}/wavetrainer/windower/__init__.py +0 -0
  58. {wavetrainer-0.0.37 → wavetrainer-0.0.39}/wavetrainer/windower/windower.py +0 -0
  59. {wavetrainer-0.0.37 → wavetrainer-0.0.39}/wavetrainer.egg-info/dependency_links.txt +0 -0
  60. {wavetrainer-0.0.37 → wavetrainer-0.0.39}/wavetrainer.egg-info/not-zip-safe +0 -0
  61. {wavetrainer-0.0.37 → wavetrainer-0.0.39}/wavetrainer.egg-info/requires.txt +0 -0
  62. {wavetrainer-0.0.37 → wavetrainer-0.0.39}/wavetrainer.egg-info/top_level.txt +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: wavetrainer
3
- Version: 0.0.37
3
+ Version: 0.0.39
4
4
  Summary: A library for automatically finding the optimal model within feature and hyperparameter space.
5
5
  Home-page: https://github.com/8W9aG/wavetrainer
6
6
  Author: Will Sackfield
@@ -23,7 +23,7 @@ def install_requires() -> typing.List[str]:
23
23
 
24
24
  setup(
25
25
  name='wavetrainer',
26
- version='0.0.37',
26
+ version='0.0.39',
27
27
  description='A library for automatically finding the optimal model within feature and hyperparameter space.',
28
28
  long_description=long_description,
29
29
  long_description_content_type='text/markdown',
@@ -2,5 +2,5 @@
2
2
 
3
3
  from .create import create
4
4
 
5
- __VERSION__ = "0.0.37"
5
+ __VERSION__ = "0.0.39"
6
6
  __all__ = ("create",)
@@ -148,7 +148,7 @@ class CatboostModel(Model):
148
148
  )
149
149
  catboost = self._provide_catboost()
150
150
  catboost.save_model(os.path.join(folder, _MODEL_FILENAME))
151
- trial.user_attrs[_BEST_ITERATION_KEY] = self._best_iteration
151
+ trial.set_user_attr(_BEST_ITERATION_KEY, self._best_iteration)
152
152
 
153
153
  def fit(
154
154
  self,
@@ -219,9 +219,10 @@ class CatboostModel(Model):
219
219
  best_iteration if best_iteration is not None else self._iterations
220
220
  )
221
221
  logging.info(
222
- "Creating catboost model with depth %d, boosting type %s",
222
+ "Creating catboost model with depth %d, boosting type %s, best iteration %d",
223
223
  self._depth,
224
224
  self._boosting_type,
225
+ best_iteration,
225
226
  )
226
227
  match self._model_type:
227
228
  case ModelType.BINARY:
@@ -2,6 +2,7 @@
2
2
  # pylint: disable=duplicate-code,too-many-arguments,too-many-positional-arguments
3
3
 
4
4
  import json
5
+ import logging
5
6
  import os
6
7
  import pickle
7
8
  from typing import Any, Self
@@ -13,6 +14,7 @@ import torch
13
14
  from tabpfn_extensions.post_hoc_ensembles.sklearn_interface import ( # type: ignore
14
15
  AutoTabPFNClassifier, AutoTabPFNRegressor)
15
16
 
17
+ from ..exceptions import WavetrainException
16
18
  from ..model_type import ModelType, determine_model_type
17
19
  from .model import PREDICTION_COLUMN, PROBABILITY_COLUMN_PREFIX, Model
18
20
 
@@ -104,7 +106,11 @@ class TabPFNModel(Model):
104
106
  raise ValueError("y is null.")
105
107
  self._model_type = determine_model_type(y)
106
108
  tabpfn = self._provide_tabpfn()
107
- tabpfn.fit(df, y)
109
+ try:
110
+ tabpfn.fit(df, y)
111
+ except ValueError as exc:
112
+ logging.warning(str(exc))
113
+ raise WavetrainException() from exc
108
114
  return self
109
115
 
110
116
  def transform(self, df: pd.DataFrame) -> pd.DataFrame:
@@ -56,8 +56,8 @@ class BaseSelectorReducer(Reducer):
56
56
  if len(df.columns) <= 1:
57
57
  return self
58
58
  try:
59
- self._base_selector.fit(df) # type: ignore
60
- except ValueError as exc:
59
+ self._base_selector.fit(df, y=y) # type: ignore
60
+ except (ValueError, AttributeError) as exc:
61
61
  logging.warning(str(exc))
62
62
  if self.should_raise():
63
63
  raise WavetrainException() from exc
@@ -66,4 +66,10 @@ class BaseSelectorReducer(Reducer):
66
66
  def transform(self, df: pd.DataFrame) -> pd.DataFrame:
67
67
  if len(df.columns) <= 1:
68
68
  return df
69
- return self._base_selector.transform(df)
69
+ try:
70
+ return self._base_selector.transform(df)
71
+ except (ValueError, AttributeError) as exc:
72
+ logging.warning(str(exc))
73
+ if self.should_raise():
74
+ raise WavetrainException() from exc
75
+ return df
@@ -3,6 +3,7 @@
3
3
  import json
4
4
  import logging
5
5
  import os
6
+ import time
6
7
  from typing import Self
7
8
 
8
9
  import optuna
@@ -13,6 +14,8 @@ from .correlation_reducer import CorrelationReducer
13
14
  from .duplicate_reducer import DuplicateReducer
14
15
  from .nonnumeric_reducer import NonNumericReducer
15
16
  from .reducer import Reducer
17
+ from .select_by_single_feature_performance_reducer import \
18
+ SelectBySingleFeaturePerformanceReducer
16
19
  from .smart_correlation_reducer import SmartCorrelationReducer
17
20
  from .unseen_reducer import UnseenReducer
18
21
 
@@ -35,6 +38,7 @@ class CombinedReducer(Reducer):
35
38
  DuplicateReducer(),
36
39
  CorrelationReducer(),
37
40
  SmartCorrelationReducer(),
41
+ # SelectBySingleFeaturePerformanceReducer(),
38
42
  ]
39
43
  self._folder = None
40
44
 
@@ -67,6 +71,8 @@ class CombinedReducer(Reducer):
67
71
  self._reducers.append(UnseenReducer())
68
72
  elif reducer_name == SmartCorrelationReducer.name():
69
73
  self._reducers.append(SmartCorrelationReducer())
74
+ elif reducer_name == SelectBySingleFeaturePerformanceReducer.name():
75
+ self._reducers.append(SelectBySingleFeaturePerformanceReducer())
70
76
  for reducer in self._reducers:
71
77
  reducer.load(folder)
72
78
  self._folder = folder
@@ -94,12 +100,16 @@ class CombinedReducer(Reducer):
94
100
  ) -> Self:
95
101
  removed_columns_dict = {}
96
102
  for reducer in self._reducers:
103
+ start_reducer = time.time()
97
104
  before_columns = set(df.columns.values)
98
- df = reducer.fit_transform(df)
105
+ df = reducer.fit_transform(df, y=y)
99
106
  after_columns = set(df.columns.values)
100
107
  removed_columns = before_columns.difference(after_columns)
101
108
  if removed_columns:
102
109
  removed_columns_dict[reducer.name()] = list(removed_columns)
110
+ logging.info(
111
+ "%s reducer took %f", reducer.name(), time.time() - start_reducer
112
+ )
103
113
  if self._folder is not None:
104
114
  with open(
105
115
  os.path.join(self._folder, _REMOVED_COLUMNS_FILE), encoding="utf8"
@@ -0,0 +1,60 @@
1
+ """A reducer that removes features by their single performance via further heuristics."""
2
+
3
+ # pylint: disable=too-many-arguments,too-many-positional-arguments
4
+ from typing import Self
5
+
6
+ import optuna
7
+ import pandas as pd
8
+ from feature_engine.selection import SelectBySingleFeaturePerformance
9
+ from sklearn.ensemble import RandomForestClassifier # type: ignore
10
+
11
+ from ..model_type import ModelType, determine_model_type
12
+ from .base_selector_reducer import BaseSelectorReducer
13
+
14
+ _SINGLE_FEATURE_PERFORMANCE_REDUCER_FILENAME = (
15
+ "single_feature_performance_reducer.joblib"
16
+ )
17
+ _SINGLE_FEATURE_PERFORMANCE_REDUCER_THRESHOLD = (
18
+ "single_feature_performance_reducer_threshold"
19
+ )
20
+
21
+
22
+ class SelectBySingleFeaturePerformanceReducer(BaseSelectorReducer):
23
+ """A class that removes smart correlated values from a dataset."""
24
+
25
+ def __init__(self) -> None:
26
+ self._singlefeatureperformance_selector = SelectBySingleFeaturePerformance(
27
+ RandomForestClassifier(random_state=42, n_jobs=-1), scoring="accuracy", cv=1
28
+ )
29
+ super().__init__(
30
+ self._singlefeatureperformance_selector,
31
+ _SINGLE_FEATURE_PERFORMANCE_REDUCER_FILENAME,
32
+ )
33
+
34
+ @classmethod
35
+ def name(cls) -> str:
36
+ return "single_feature_performance"
37
+
38
+ @classmethod
39
+ def should_raise(cls) -> bool:
40
+ return False
41
+
42
+ def set_options(
43
+ self, trial: optuna.Trial | optuna.trial.FrozenTrial, df: pd.DataFrame
44
+ ) -> None:
45
+ self._singlefeatureperformance_selector.threshold = trial.suggest_float(
46
+ _SINGLE_FEATURE_PERFORMANCE_REDUCER_THRESHOLD, 0.1, 0.9
47
+ )
48
+
49
+ def fit(
50
+ self,
51
+ df: pd.DataFrame,
52
+ y: pd.Series | pd.DataFrame | None = None,
53
+ w: pd.Series | None = None,
54
+ eval_x: pd.DataFrame | None = None,
55
+ eval_y: pd.Series | pd.DataFrame | None = None,
56
+ ) -> Self:
57
+ self._singlefeatureperformance_selector.scoring = (
58
+ "r2" if determine_model_type(y) == ModelType.REGRESSION else "accuracy" # type: ignore
59
+ )
60
+ return super().fit(df, y=y, w=w, eval_x=eval_x, eval_y=eval_y)
@@ -29,6 +29,10 @@ class SmartCorrelationReducer(BaseSelectorReducer):
29
29
  def name(cls) -> str:
30
30
  return "smart_correlation"
31
31
 
32
+ @classmethod
33
+ def should_raise(cls) -> bool:
34
+ return False
35
+
32
36
  def set_options(
33
37
  self, trial: optuna.Trial | optuna.trial.FrozenTrial, df: pd.DataFrame
34
38
  ) -> None:
@@ -6,6 +6,7 @@ import json
6
6
  import logging
7
7
  import os
8
8
  import pickle
9
+ import time
9
10
  from typing import Self
10
11
 
11
12
  import optuna
@@ -231,6 +232,7 @@ class Trainer(Fit):
231
232
 
232
233
  try:
233
234
  # Window the data
235
+ start_windower = time.time()
234
236
  windower = Windower(self._dt_column)
235
237
  windower.set_options(trial, x)
236
238
  x_train = windower.fit_transform(x_train)
@@ -240,25 +242,31 @@ class Trainer(Fit):
240
242
  os.removedirs(folder)
241
243
  logging.warning("Y train only contains 1 unique datapoint.")
242
244
  return _BAD_OUTPUT
245
+ logging.info("Windowing took %f", time.time() - start_windower)
243
246
 
244
247
  # Perform common reductions
248
+ start_reducer = time.time()
245
249
  reducer = CombinedReducer()
246
250
  reducer.set_options(trial, x)
247
- x_train = reducer.fit_transform(x_train)
251
+ x_train = reducer.fit_transform(x_train, y=y_train)
248
252
  x_test = reducer.transform(x_test)
253
+ logging.info("Reducing took %f", time.time() - start_reducer)
249
254
 
250
255
  # Calculate the row weights
256
+ start_row_weights = time.time()
251
257
  weights = CombinedWeights()
252
258
  weights.set_options(trial, x)
253
259
  w = weights.fit(x_train, y=y_train).transform(y_train.to_frame())[
254
260
  WEIGHTS_COLUMN
255
261
  ]
262
+ logging.info("Row weights took %f", time.time() - start_row_weights)
256
263
 
257
264
  # Create model
258
265
  model = ModelRouter()
259
266
  model.set_options(trial, x)
260
267
 
261
268
  # Train
269
+ start_train = time.time()
262
270
  selector = Selector(model)
263
271
  selector.set_options(trial, x)
264
272
  selector.fit(x_train, y=y_train, w=w, eval_x=x_test, eval_y=y_test)
@@ -267,11 +275,14 @@ class Trainer(Fit):
267
275
  x_pred = model.fit_transform(
268
276
  x_train, y=y_train, w=w, eval_x=x_test, eval_y=y_test
269
277
  )
278
+ logging.info("Training took %f", time.time() - start_train)
270
279
 
271
280
  # Calibrate
281
+ start_calibrate = time.time()
272
282
  calibrator = CalibratorRouter(model)
273
283
  calibrator.set_options(trial, x)
274
284
  calibrator.fit(x_pred, y=y_train)
285
+ logging.info("Calibrating took %f", time.time() - start_calibrate)
275
286
 
276
287
  # Output
277
288
  y_pred = model.transform(x_test)
@@ -521,8 +532,11 @@ class Trainer(Fit):
521
532
  date_path = os.path.join(column_path, date_str)
522
533
  if not os.path.isdir(date_path):
523
534
  continue
524
- model = ModelRouter()
525
- model.load(date_path)
526
- feature_importances[date_str] = model.feature_importances
535
+ try:
536
+ model = ModelRouter()
537
+ model.load(date_path)
538
+ feature_importances[date_str] = model.feature_importances
539
+ except FileNotFoundError as exc:
540
+ logging.warning(str(exc))
527
541
 
528
542
  return feature_importances
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: wavetrainer
3
- Version: 0.0.37
3
+ Version: 0.0.39
4
4
  Summary: A library for automatically finding the optimal model within feature and hyperparameter space.
5
5
  Home-page: https://github.com/8W9aG/wavetrainer
6
6
  Author: Will Sackfield
@@ -42,6 +42,7 @@ wavetrainer/reducer/duplicate_reducer.py
42
42
  wavetrainer/reducer/non_categorical_numeric_columns.py
43
43
  wavetrainer/reducer/nonnumeric_reducer.py
44
44
  wavetrainer/reducer/reducer.py
45
+ wavetrainer/reducer/select_by_single_feature_performance_reducer.py
45
46
  wavetrainer/reducer/smart_correlation_reducer.py
46
47
  wavetrainer/reducer/unseen_reducer.py
47
48
  wavetrainer/selector/__init__.py
File without changes
File without changes
File without changes
File without changes