wavetrainer 0.0.45__tar.gz → 0.0.46__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (69) hide show
  1. {wavetrainer-0.0.45/wavetrainer.egg-info → wavetrainer-0.0.46}/PKG-INFO +1 -1
  2. {wavetrainer-0.0.45 → wavetrainer-0.0.46}/setup.py +1 -1
  3. {wavetrainer-0.0.45 → wavetrainer-0.0.46}/wavetrainer/__init__.py +1 -1
  4. {wavetrainer-0.0.45 → wavetrainer-0.0.46}/wavetrainer/create.py +2 -0
  5. {wavetrainer-0.0.45 → wavetrainer-0.0.46}/wavetrainer/model/model_router.py +5 -5
  6. {wavetrainer-0.0.45 → wavetrainer-0.0.46}/wavetrainer/model/xgboost/xgboost_model.py +3 -1
  7. {wavetrainer-0.0.45 → wavetrainer-0.0.46}/wavetrainer/reducer/combined_reducer.py +7 -1
  8. wavetrainer-0.0.46/wavetrainer/reducer/pca_reducer.py +78 -0
  9. {wavetrainer-0.0.45 → wavetrainer-0.0.46}/wavetrainer/selector/selector.py +4 -4
  10. {wavetrainer-0.0.45 → wavetrainer-0.0.46}/wavetrainer/trainer.py +16 -5
  11. {wavetrainer-0.0.45 → wavetrainer-0.0.46/wavetrainer.egg-info}/PKG-INFO +1 -1
  12. {wavetrainer-0.0.45 → wavetrainer-0.0.46}/wavetrainer.egg-info/SOURCES.txt +1 -0
  13. {wavetrainer-0.0.45 → wavetrainer-0.0.46}/LICENSE +0 -0
  14. {wavetrainer-0.0.45 → wavetrainer-0.0.46}/MANIFEST.in +0 -0
  15. {wavetrainer-0.0.45 → wavetrainer-0.0.46}/README.md +0 -0
  16. {wavetrainer-0.0.45 → wavetrainer-0.0.46}/requirements.txt +0 -0
  17. {wavetrainer-0.0.45 → wavetrainer-0.0.46}/setup.cfg +0 -0
  18. {wavetrainer-0.0.45 → wavetrainer-0.0.46}/tests/__init__.py +0 -0
  19. {wavetrainer-0.0.45 → wavetrainer-0.0.46}/tests/model/__init__.py +0 -0
  20. {wavetrainer-0.0.45 → wavetrainer-0.0.46}/tests/model/catboost_kwargs_test.py +0 -0
  21. {wavetrainer-0.0.45 → wavetrainer-0.0.46}/tests/trainer_test.py +0 -0
  22. {wavetrainer-0.0.45 → wavetrainer-0.0.46}/wavetrainer/calibrator/__init__.py +0 -0
  23. {wavetrainer-0.0.45 → wavetrainer-0.0.46}/wavetrainer/calibrator/calibrator.py +0 -0
  24. {wavetrainer-0.0.45 → wavetrainer-0.0.46}/wavetrainer/calibrator/calibrator_router.py +0 -0
  25. {wavetrainer-0.0.45 → wavetrainer-0.0.46}/wavetrainer/calibrator/mapie_calibrator.py +0 -0
  26. {wavetrainer-0.0.45 → wavetrainer-0.0.46}/wavetrainer/calibrator/vennabers_calibrator.py +0 -0
  27. {wavetrainer-0.0.45 → wavetrainer-0.0.46}/wavetrainer/exceptions.py +0 -0
  28. {wavetrainer-0.0.45 → wavetrainer-0.0.46}/wavetrainer/fit.py +0 -0
  29. {wavetrainer-0.0.45 → wavetrainer-0.0.46}/wavetrainer/model/__init__.py +0 -0
  30. {wavetrainer-0.0.45 → wavetrainer-0.0.46}/wavetrainer/model/catboost/__init__.py +0 -0
  31. {wavetrainer-0.0.45 → wavetrainer-0.0.46}/wavetrainer/model/catboost/catboost_classifier_wrap.py +0 -0
  32. {wavetrainer-0.0.45 → wavetrainer-0.0.46}/wavetrainer/model/catboost/catboost_kwargs.py +0 -0
  33. {wavetrainer-0.0.45 → wavetrainer-0.0.46}/wavetrainer/model/catboost/catboost_model.py +0 -0
  34. {wavetrainer-0.0.45 → wavetrainer-0.0.46}/wavetrainer/model/catboost/catboost_regressor_wrap.py +0 -0
  35. {wavetrainer-0.0.45 → wavetrainer-0.0.46}/wavetrainer/model/model.py +0 -0
  36. {wavetrainer-0.0.45 → wavetrainer-0.0.46}/wavetrainer/model/tabpfn/__init__.py +0 -0
  37. {wavetrainer-0.0.45 → wavetrainer-0.0.46}/wavetrainer/model/tabpfn/tabpfn_model.py +0 -0
  38. {wavetrainer-0.0.45 → wavetrainer-0.0.46}/wavetrainer/model/xgboost/__init__.py +0 -0
  39. {wavetrainer-0.0.45 → wavetrainer-0.0.46}/wavetrainer/model/xgboost/early_stopper.py +0 -0
  40. {wavetrainer-0.0.45 → wavetrainer-0.0.46}/wavetrainer/model/xgboost/xgboost_logger.py +0 -0
  41. {wavetrainer-0.0.45 → wavetrainer-0.0.46}/wavetrainer/model_type.py +0 -0
  42. {wavetrainer-0.0.45 → wavetrainer-0.0.46}/wavetrainer/params.py +0 -0
  43. {wavetrainer-0.0.45 → wavetrainer-0.0.46}/wavetrainer/reducer/__init__.py +0 -0
  44. {wavetrainer-0.0.45 → wavetrainer-0.0.46}/wavetrainer/reducer/base_selector_reducer.py +0 -0
  45. {wavetrainer-0.0.45 → wavetrainer-0.0.46}/wavetrainer/reducer/constant_reducer.py +0 -0
  46. {wavetrainer-0.0.45 → wavetrainer-0.0.46}/wavetrainer/reducer/correlation_reducer.py +0 -0
  47. {wavetrainer-0.0.45 → wavetrainer-0.0.46}/wavetrainer/reducer/duplicate_reducer.py +0 -0
  48. {wavetrainer-0.0.45 → wavetrainer-0.0.46}/wavetrainer/reducer/non_categorical_numeric_columns.py +0 -0
  49. {wavetrainer-0.0.45 → wavetrainer-0.0.46}/wavetrainer/reducer/nonnumeric_reducer.py +0 -0
  50. {wavetrainer-0.0.45 → wavetrainer-0.0.46}/wavetrainer/reducer/reducer.py +0 -0
  51. {wavetrainer-0.0.45 → wavetrainer-0.0.46}/wavetrainer/reducer/select_by_single_feature_performance_reducer.py +0 -0
  52. {wavetrainer-0.0.45 → wavetrainer-0.0.46}/wavetrainer/reducer/smart_correlation_reducer.py +0 -0
  53. {wavetrainer-0.0.45 → wavetrainer-0.0.46}/wavetrainer/reducer/unseen_reducer.py +0 -0
  54. {wavetrainer-0.0.45 → wavetrainer-0.0.46}/wavetrainer/selector/__init__.py +0 -0
  55. {wavetrainer-0.0.45 → wavetrainer-0.0.46}/wavetrainer/weights/__init__.py +0 -0
  56. {wavetrainer-0.0.45 → wavetrainer-0.0.46}/wavetrainer/weights/class_weights.py +0 -0
  57. {wavetrainer-0.0.45 → wavetrainer-0.0.46}/wavetrainer/weights/combined_weights.py +0 -0
  58. {wavetrainer-0.0.45 → wavetrainer-0.0.46}/wavetrainer/weights/exponential_weights.py +0 -0
  59. {wavetrainer-0.0.45 → wavetrainer-0.0.46}/wavetrainer/weights/linear_weights.py +0 -0
  60. {wavetrainer-0.0.45 → wavetrainer-0.0.46}/wavetrainer/weights/noop_weights.py +0 -0
  61. {wavetrainer-0.0.45 → wavetrainer-0.0.46}/wavetrainer/weights/sigmoid_weights.py +0 -0
  62. {wavetrainer-0.0.45 → wavetrainer-0.0.46}/wavetrainer/weights/weights.py +0 -0
  63. {wavetrainer-0.0.45 → wavetrainer-0.0.46}/wavetrainer/weights/weights_router.py +0 -0
  64. {wavetrainer-0.0.45 → wavetrainer-0.0.46}/wavetrainer/windower/__init__.py +0 -0
  65. {wavetrainer-0.0.45 → wavetrainer-0.0.46}/wavetrainer/windower/windower.py +0 -0
  66. {wavetrainer-0.0.45 → wavetrainer-0.0.46}/wavetrainer.egg-info/dependency_links.txt +0 -0
  67. {wavetrainer-0.0.45 → wavetrainer-0.0.46}/wavetrainer.egg-info/not-zip-safe +0 -0
  68. {wavetrainer-0.0.45 → wavetrainer-0.0.46}/wavetrainer.egg-info/requires.txt +0 -0
  69. {wavetrainer-0.0.45 → wavetrainer-0.0.46}/wavetrainer.egg-info/top_level.txt +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: wavetrainer
3
- Version: 0.0.45
3
+ Version: 0.0.46
4
4
  Summary: A library for automatically finding the optimal model within feature and hyperparameter space.
5
5
  Home-page: https://github.com/8W9aG/wavetrainer
6
6
  Author: Will Sackfield
@@ -23,7 +23,7 @@ def install_requires() -> typing.List[str]:
23
23
 
24
24
  setup(
25
25
  name='wavetrainer',
26
- version='0.0.45',
26
+ version='0.0.46',
27
27
  description='A library for automatically finding the optimal model within feature and hyperparameter space.',
28
28
  long_description=long_description,
29
29
  long_description_content_type='text/markdown',
@@ -2,5 +2,5 @@
2
2
 
3
3
  from .create import create
4
4
 
5
- __VERSION__ = "0.0.45"
5
+ __VERSION__ = "0.0.46"
6
6
  __all__ = ("create",)
@@ -15,6 +15,7 @@ def create(
15
15
  dt_column: str | None = None,
16
16
  max_train_timeout: datetime.timedelta | None = None,
17
17
  cutoff_dt: datetime.datetime | None = None,
18
+ embedding_cols: list[list[str]] | None = None,
18
19
  ) -> Trainer:
19
20
  """Create a trainer."""
20
21
  return Trainer(
@@ -25,4 +26,5 @@ def create(
25
26
  dt_column=dt_column,
26
27
  max_train_timeout=max_train_timeout,
27
28
  cutoff_dt=cutoff_dt,
29
+ embedding_cols=embedding_cols,
28
30
  )
@@ -81,11 +81,11 @@ class ModelRouter(Model):
81
81
  def set_options(
82
82
  self, trial: optuna.Trial | optuna.trial.FrozenTrial, df: pd.DataFrame
83
83
  ) -> None:
84
- model = _MODELS[
85
- trial.suggest_categorical(
86
- "model", [k for k, v in _MODELS.items() if v.supports_x(df)]
87
- )
88
- ]()
84
+ model_name = trial.suggest_categorical(
85
+ "model", [k for k, v in _MODELS.items() if v.supports_x(df)]
86
+ )
87
+ print(f"Using {model_name} model")
88
+ model = _MODELS[model_name]()
89
89
  model.set_options(trial, df)
90
90
  self._model = model
91
91
 
@@ -109,7 +109,9 @@ class XGBoostModel(Model):
109
109
  def feature_importances(self) -> dict[str, float]:
110
110
  bst = self._provide_xgboost()
111
111
  try:
112
- return bst.get_booster().get_score(importance_type="weight") # type: ignore
112
+ score_dict = bst.get_booster().get_score(importance_type="weight") # type: ignore
113
+ total = sum(score_dict.values()) # type: ignore
114
+ return {k: v / total for k, v in score_dict.items()} # type: ignore
113
115
  except XGBoostError as exc:
114
116
  print(str(exc))
115
117
  return {}
@@ -14,6 +14,7 @@ from .constant_reducer import ConstantReducer
14
14
  from .correlation_reducer import CorrelationReducer
15
15
  from .duplicate_reducer import DuplicateReducer
16
16
  from .nonnumeric_reducer import NonNumericReducer
17
+ from .pca_reducer import PCAReducer
17
18
  from .reducer import Reducer
18
19
  from .select_by_single_feature_performance_reducer import \
19
20
  SelectBySingleFeaturePerformanceReducer
@@ -29,12 +30,14 @@ class CombinedReducer(Reducer):
29
30
  """A reducer that combines a series of reducers."""
30
31
 
31
32
  # pylint: disable=too-many-positional-arguments,too-many-arguments
33
+ _folder: str | None
32
34
 
33
- def __init__(self):
35
+ def __init__(self, embedding_cols: list[list[str]] | None):
34
36
  super().__init__()
35
37
  self._reducers = [
36
38
  UnseenReducer(),
37
39
  NonNumericReducer(),
40
+ PCAReducer(embedding_cols),
38
41
  ConstantReducer(),
39
42
  DuplicateReducer(),
40
43
  CorrelationReducer(),
@@ -42,6 +45,7 @@ class CombinedReducer(Reducer):
42
45
  # SelectBySingleFeaturePerformanceReducer(),
43
46
  ]
44
47
  self._folder = None
48
+ self._embedding_cols = embedding_cols
45
49
 
46
50
  @classmethod
47
51
  def name(cls) -> str:
@@ -74,6 +78,8 @@ class CombinedReducer(Reducer):
74
78
  self._reducers.append(SmartCorrelationReducer())
75
79
  elif reducer_name == SelectBySingleFeaturePerformanceReducer.name():
76
80
  self._reducers.append(SelectBySingleFeaturePerformanceReducer())
81
+ elif reducer_name == PCAReducer.name():
82
+ self._reducers.append(PCAReducer(self._embedding_cols))
77
83
  for reducer in self._reducers:
78
84
  reducer.load(folder)
79
85
  self._folder = folder
@@ -0,0 +1,78 @@
1
+ """A reducer that reduces embeddings using PCA."""
2
+
3
+ # pylint: disable=too-many-arguments,too-many-positional-arguments
4
+ import hashlib
5
+ import os
6
+ from typing import Self
7
+
8
+ import joblib # type: ignore
9
+ import optuna
10
+ import pandas as pd
11
+ from sklearn.decomposition import PCA # type: ignore
12
+
13
+ from .reducer import Reducer
14
+
15
+ _PCA_THRESHOLD = "pca_threshold"
16
+
17
+
18
class PCAReducer(Reducer):
    """Compress groups of embedding columns with one PCA per group.

    Each entry of ``embedding_cols`` is a list of column names that together
    hold one embedding. A separate PCA is created, fitted, saved and loaded for
    every group. ``transform`` writes the principal components into the
    leading columns of the group and drops the remaining columns of the group.
    """

    # PCA instance per embedding group, keyed by the group's hash
    # (see ``_embedding_dict``).
    _pcas: dict[str, PCA]

    @classmethod
    def name(cls) -> str:
        """Return the name of this reducer."""
        return "pca"

    def __init__(self, embedding_cols: list[list[str]] | None):
        """Store the embedding groups; ``None`` is treated as "no embeddings"."""
        super().__init__()
        self._embedding_cols = embedding_cols if embedding_cols is not None else []
        self._pcas = {}

    @property
    def _embedding_dict(self) -> dict[str, list[str]]:
        """Map a stable key for every embedding group to its column list.

        The key is the SHA-256 hex digest of the sorted column names joined
        with ``|``, so it does not depend on the order of the columns inside a
        group. It is also used as the file name prefix in ``save``/``load``.
        """
        return {
            hashlib.sha256("|".join(sorted(x)).encode()).hexdigest(): x
            for x in self._embedding_cols
        }

    def set_options(
        self, trial: optuna.Trial | optuna.trial.FrozenTrial, df: pd.DataFrame
    ) -> None:
        """Sample the PCA threshold from the trial and create one PCA per group.

        Nothing is sampled when there are no embedding groups, so the trial
        does not gain a hyperparameter that has no effect.
        """
        # __init__ normalises None to [], so test for emptiness rather than None.
        if not self._embedding_cols:
            return
        # The sampled float in [0.7, 0.99] is passed to PCA as n_components.
        threshold = trial.suggest_float(_PCA_THRESHOLD, 0.7, 0.99)
        self._pcas = {k: PCA(n_components=threshold) for k in self._embedding_dict}

    def load(self, folder: str) -> None:
        """Load the PCA of every embedding group from ``folder``."""
        for k in self._embedding_dict:
            self._pcas[k] = joblib.load(os.path.join(folder, f"{k}_pca_reducer.joblib"))

    def save(self, folder: str, trial: optuna.Trial | optuna.trial.FrozenTrial) -> None:
        """Dump every PCA into ``folder``; ``trial`` is not used here."""
        for k, v in self._pcas.items():
            joblib.dump(v, os.path.join(folder, f"{k}_pca_reducer.joblib"))

    def fit(
        self,
        df: pd.DataFrame,
        y: pd.Series | pd.DataFrame | None = None,
        w: pd.Series | None = None,
        eval_x: pd.DataFrame | None = None,
        eval_y: pd.Series | pd.DataFrame | None = None,
    ) -> Self:
        """Fit each PCA on the columns of its embedding group.

        Only ``df`` is read; the remaining parameters are accepted but not
        used by this reducer. Returns ``self``.
        """
        if not self._pcas:
            return self
        # Build the key -> columns mapping once instead of re-hashing per group.
        groups = self._embedding_dict
        for key, pca in self._pcas.items():
            pca.fit(df[groups[key]])
        return self

    def transform(self, df: pd.DataFrame) -> pd.DataFrame:
        """Replace each embedding group with its principal components.

        The components are stored in the first ``n_components`` columns of the
        group (keeping those column names) and the remaining columns of the
        group are dropped. The input frame is not modified; when there is
        nothing to transform the input frame itself is returned.
        """
        if not self._pcas:
            return df
        groups = self._embedding_dict
        # Work on a copy so the column assignment below does not write into
        # the caller's frame.
        df = df.copy()
        for key, pca in self._pcas.items():
            cols = groups[key]
            compressed_embedding = pca.transform(df[cols])
            # Axis 1 of the transformed array is the number of components;
            # axis 0 is the number of rows and must not be used to slice columns.
            n_components = compressed_embedding.shape[1]
            df[cols[:n_components]] = compressed_embedding
            df = df.drop(columns=cols[n_components:])
        return df
@@ -83,18 +83,18 @@ class Selector(Params, Fit):
83
83
  current_features = [list(feature_importances.keys())[0]]
84
84
  current_features = current_features[:required_features]
85
85
  print(
86
- f"Current Features:\n{pd.Series(data=list(feature_importances.values()), index=list(feature_importances.keys()))}\n"
86
+ f"Current Features:\n{pd.Series(data=[feature_importances[x] for x in current_features], index=current_features)}\n"
87
87
  )
88
88
 
89
89
  n_features = len(current_features)
90
90
  for i in range(self._steps):
91
- print(
92
- f"Recursive Feature Elimination Step {i}, current features: {len(current_features)}"
93
- )
94
91
  ratio_diff = 1.0 - self._feature_ratio
95
92
  ratio_step = ratio_diff / float(self._steps)
96
93
  current_ratio = 1.0 - (ratio_step * i)
97
94
  n_features = max(1, int(total_columns * current_ratio))
95
+ print(
96
+ f"Recursive Feature Elimination Step {i}, current features: {len(current_features)} required features: {n_features}"
97
+ )
98
98
  if n_features >= len(current_features):
99
99
  continue
100
100
 
@@ -63,6 +63,7 @@ class Trainer(Fit):
63
63
  dt_column: str | None = None,
64
64
  max_train_timeout: datetime.timedelta | None = None,
65
65
  cutoff_dt: datetime.datetime | None = None,
66
+ embedding_cols: list[list[str]] | None = None,
66
67
  ):
67
68
  tqdm.tqdm.pandas()
68
69
 
@@ -153,6 +154,7 @@ class Trainer(Fit):
153
154
  self._dt_column = dt_column
154
155
  self._max_train_timeout = max_train_timeout
155
156
  self._cutoff_dt = cutoff_dt
157
+ self._embedding_cols = embedding_cols
156
158
 
157
159
  def _provide_study(self, column: str) -> optuna.Study:
158
160
  storage_name = f"sqlite:///{self._folder}/{column}/{_STUDYDB_FILENAME}"
@@ -247,7 +249,7 @@ class Trainer(Fit):
247
249
 
248
250
  # Perform common reductions
249
251
  start_reducer = time.time()
250
- reducer = CombinedReducer()
252
+ reducer = CombinedReducer(self._embedding_cols)
251
253
  reducer.set_options(trial, x)
252
254
  x_train = reducer.fit_transform(x_train, y=y_train)
253
255
  x_test = reducer.transform(x_test)
@@ -415,7 +417,14 @@ class Trainer(Fit):
415
417
  break
416
418
  if found:
417
419
  last_processed_dt = test_dt
418
- _fit(study.best_trial, test_df, test_series, True, test_idx, True)
420
+ _fit(
421
+ study.best_trial,
422
+ test_df.copy(),
423
+ test_series,
424
+ True,
425
+ test_idx,
426
+ True,
427
+ )
419
428
  continue
420
429
  if (
421
430
  last_processed_dt is not None
@@ -431,7 +440,7 @@ class Trainer(Fit):
431
440
  def validate_objctive(
432
441
  trial: optuna.Trial, idx: datetime.datetime, series: pd.Series
433
442
  ) -> float:
434
- return _fit(trial, test_df, series, False, idx, False)
443
+ return _fit(trial, test_df.copy(), series, False, idx, False)
435
444
 
436
445
  study.optimize(
437
446
  functools.partial(
@@ -445,7 +454,9 @@ class Trainer(Fit):
445
454
  else:
446
455
  break
447
456
 
448
- _fit(study.best_trial, test_df, test_series, True, test_idx, True)
457
+ _fit(
458
+ study.best_trial, test_df.copy(), test_series, True, test_idx, True
459
+ )
449
460
  last_processed_dt = test_idx
450
461
 
451
462
  if isinstance(y, pd.Series):
@@ -503,7 +514,7 @@ class Trainer(Fit):
503
514
  date_str = dates[-1].isoformat()
504
515
  folder = os.path.join(column_path, date_str)
505
516
 
506
- reducer = CombinedReducer()
517
+ reducer = CombinedReducer(self._embedding_cols)
507
518
  reducer.load(folder)
508
519
 
509
520
  model = ModelRouter()
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: wavetrainer
3
- Version: 0.0.45
3
+ Version: 0.0.46
4
4
  Summary: A library for automatically finding the optimal model within feature and hyperparameter space.
5
5
  Home-page: https://github.com/8W9aG/wavetrainer
6
6
  Author: Will Sackfield
@@ -47,6 +47,7 @@ wavetrainer/reducer/correlation_reducer.py
47
47
  wavetrainer/reducer/duplicate_reducer.py
48
48
  wavetrainer/reducer/non_categorical_numeric_columns.py
49
49
  wavetrainer/reducer/nonnumeric_reducer.py
50
+ wavetrainer/reducer/pca_reducer.py
50
51
  wavetrainer/reducer/reducer.py
51
52
  wavetrainer/reducer/select_by_single_feature_performance_reducer.py
52
53
  wavetrainer/reducer/smart_correlation_reducer.py
File without changes
File without changes
File without changes
File without changes