wavetrainer 0.0.44__tar.gz → 0.0.46__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (69)
  1. {wavetrainer-0.0.44/wavetrainer.egg-info → wavetrainer-0.0.46}/PKG-INFO +1 -1
  2. {wavetrainer-0.0.44 → wavetrainer-0.0.46}/setup.py +1 -1
  3. {wavetrainer-0.0.44 → wavetrainer-0.0.46}/wavetrainer/__init__.py +1 -1
  4. {wavetrainer-0.0.44 → wavetrainer-0.0.46}/wavetrainer/create.py +2 -0
  5. {wavetrainer-0.0.44 → wavetrainer-0.0.46}/wavetrainer/model/model_router.py +5 -5
  6. {wavetrainer-0.0.44 → wavetrainer-0.0.46}/wavetrainer/model/xgboost/xgboost_model.py +3 -1
  7. {wavetrainer-0.0.44 → wavetrainer-0.0.46}/wavetrainer/reducer/combined_reducer.py +7 -1
  8. wavetrainer-0.0.46/wavetrainer/reducer/pca_reducer.py +78 -0
  9. {wavetrainer-0.0.44 → wavetrainer-0.0.46}/wavetrainer/selector/selector.py +4 -4
  10. {wavetrainer-0.0.44 → wavetrainer-0.0.46}/wavetrainer/trainer.py +18 -6
  11. {wavetrainer-0.0.44 → wavetrainer-0.0.46/wavetrainer.egg-info}/PKG-INFO +1 -1
  12. {wavetrainer-0.0.44 → wavetrainer-0.0.46}/wavetrainer.egg-info/SOURCES.txt +1 -0
  13. {wavetrainer-0.0.44 → wavetrainer-0.0.46}/LICENSE +0 -0
  14. {wavetrainer-0.0.44 → wavetrainer-0.0.46}/MANIFEST.in +0 -0
  15. {wavetrainer-0.0.44 → wavetrainer-0.0.46}/README.md +0 -0
  16. {wavetrainer-0.0.44 → wavetrainer-0.0.46}/requirements.txt +0 -0
  17. {wavetrainer-0.0.44 → wavetrainer-0.0.46}/setup.cfg +0 -0
  18. {wavetrainer-0.0.44 → wavetrainer-0.0.46}/tests/__init__.py +0 -0
  19. {wavetrainer-0.0.44 → wavetrainer-0.0.46}/tests/model/__init__.py +0 -0
  20. {wavetrainer-0.0.44 → wavetrainer-0.0.46}/tests/model/catboost_kwargs_test.py +0 -0
  21. {wavetrainer-0.0.44 → wavetrainer-0.0.46}/tests/trainer_test.py +0 -0
  22. {wavetrainer-0.0.44 → wavetrainer-0.0.46}/wavetrainer/calibrator/__init__.py +0 -0
  23. {wavetrainer-0.0.44 → wavetrainer-0.0.46}/wavetrainer/calibrator/calibrator.py +0 -0
  24. {wavetrainer-0.0.44 → wavetrainer-0.0.46}/wavetrainer/calibrator/calibrator_router.py +0 -0
  25. {wavetrainer-0.0.44 → wavetrainer-0.0.46}/wavetrainer/calibrator/mapie_calibrator.py +0 -0
  26. {wavetrainer-0.0.44 → wavetrainer-0.0.46}/wavetrainer/calibrator/vennabers_calibrator.py +0 -0
  27. {wavetrainer-0.0.44 → wavetrainer-0.0.46}/wavetrainer/exceptions.py +0 -0
  28. {wavetrainer-0.0.44 → wavetrainer-0.0.46}/wavetrainer/fit.py +0 -0
  29. {wavetrainer-0.0.44 → wavetrainer-0.0.46}/wavetrainer/model/__init__.py +0 -0
  30. {wavetrainer-0.0.44 → wavetrainer-0.0.46}/wavetrainer/model/catboost/__init__.py +0 -0
  31. {wavetrainer-0.0.44 → wavetrainer-0.0.46}/wavetrainer/model/catboost/catboost_classifier_wrap.py +0 -0
  32. {wavetrainer-0.0.44 → wavetrainer-0.0.46}/wavetrainer/model/catboost/catboost_kwargs.py +0 -0
  33. {wavetrainer-0.0.44 → wavetrainer-0.0.46}/wavetrainer/model/catboost/catboost_model.py +0 -0
  34. {wavetrainer-0.0.44 → wavetrainer-0.0.46}/wavetrainer/model/catboost/catboost_regressor_wrap.py +0 -0
  35. {wavetrainer-0.0.44 → wavetrainer-0.0.46}/wavetrainer/model/model.py +0 -0
  36. {wavetrainer-0.0.44 → wavetrainer-0.0.46}/wavetrainer/model/tabpfn/__init__.py +0 -0
  37. {wavetrainer-0.0.44 → wavetrainer-0.0.46}/wavetrainer/model/tabpfn/tabpfn_model.py +0 -0
  38. {wavetrainer-0.0.44 → wavetrainer-0.0.46}/wavetrainer/model/xgboost/__init__.py +0 -0
  39. {wavetrainer-0.0.44 → wavetrainer-0.0.46}/wavetrainer/model/xgboost/early_stopper.py +0 -0
  40. {wavetrainer-0.0.44 → wavetrainer-0.0.46}/wavetrainer/model/xgboost/xgboost_logger.py +0 -0
  41. {wavetrainer-0.0.44 → wavetrainer-0.0.46}/wavetrainer/model_type.py +0 -0
  42. {wavetrainer-0.0.44 → wavetrainer-0.0.46}/wavetrainer/params.py +0 -0
  43. {wavetrainer-0.0.44 → wavetrainer-0.0.46}/wavetrainer/reducer/__init__.py +0 -0
  44. {wavetrainer-0.0.44 → wavetrainer-0.0.46}/wavetrainer/reducer/base_selector_reducer.py +0 -0
  45. {wavetrainer-0.0.44 → wavetrainer-0.0.46}/wavetrainer/reducer/constant_reducer.py +0 -0
  46. {wavetrainer-0.0.44 → wavetrainer-0.0.46}/wavetrainer/reducer/correlation_reducer.py +0 -0
  47. {wavetrainer-0.0.44 → wavetrainer-0.0.46}/wavetrainer/reducer/duplicate_reducer.py +0 -0
  48. {wavetrainer-0.0.44 → wavetrainer-0.0.46}/wavetrainer/reducer/non_categorical_numeric_columns.py +0 -0
  49. {wavetrainer-0.0.44 → wavetrainer-0.0.46}/wavetrainer/reducer/nonnumeric_reducer.py +0 -0
  50. {wavetrainer-0.0.44 → wavetrainer-0.0.46}/wavetrainer/reducer/reducer.py +0 -0
  51. {wavetrainer-0.0.44 → wavetrainer-0.0.46}/wavetrainer/reducer/select_by_single_feature_performance_reducer.py +0 -0
  52. {wavetrainer-0.0.44 → wavetrainer-0.0.46}/wavetrainer/reducer/smart_correlation_reducer.py +0 -0
  53. {wavetrainer-0.0.44 → wavetrainer-0.0.46}/wavetrainer/reducer/unseen_reducer.py +0 -0
  54. {wavetrainer-0.0.44 → wavetrainer-0.0.46}/wavetrainer/selector/__init__.py +0 -0
  55. {wavetrainer-0.0.44 → wavetrainer-0.0.46}/wavetrainer/weights/__init__.py +0 -0
  56. {wavetrainer-0.0.44 → wavetrainer-0.0.46}/wavetrainer/weights/class_weights.py +0 -0
  57. {wavetrainer-0.0.44 → wavetrainer-0.0.46}/wavetrainer/weights/combined_weights.py +0 -0
  58. {wavetrainer-0.0.44 → wavetrainer-0.0.46}/wavetrainer/weights/exponential_weights.py +0 -0
  59. {wavetrainer-0.0.44 → wavetrainer-0.0.46}/wavetrainer/weights/linear_weights.py +0 -0
  60. {wavetrainer-0.0.44 → wavetrainer-0.0.46}/wavetrainer/weights/noop_weights.py +0 -0
  61. {wavetrainer-0.0.44 → wavetrainer-0.0.46}/wavetrainer/weights/sigmoid_weights.py +0 -0
  62. {wavetrainer-0.0.44 → wavetrainer-0.0.46}/wavetrainer/weights/weights.py +0 -0
  63. {wavetrainer-0.0.44 → wavetrainer-0.0.46}/wavetrainer/weights/weights_router.py +0 -0
  64. {wavetrainer-0.0.44 → wavetrainer-0.0.46}/wavetrainer/windower/__init__.py +0 -0
  65. {wavetrainer-0.0.44 → wavetrainer-0.0.46}/wavetrainer/windower/windower.py +0 -0
  66. {wavetrainer-0.0.44 → wavetrainer-0.0.46}/wavetrainer.egg-info/dependency_links.txt +0 -0
  67. {wavetrainer-0.0.44 → wavetrainer-0.0.46}/wavetrainer.egg-info/not-zip-safe +0 -0
  68. {wavetrainer-0.0.44 → wavetrainer-0.0.46}/wavetrainer.egg-info/requires.txt +0 -0
  69. {wavetrainer-0.0.44 → wavetrainer-0.0.46}/wavetrainer.egg-info/top_level.txt +0 -0
{wavetrainer-0.0.44/wavetrainer.egg-info → wavetrainer-0.0.46}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: wavetrainer
-Version: 0.0.44
+Version: 0.0.46
 Summary: A library for automatically finding the optimal model within feature and hyperparameter space.
 Home-page: https://github.com/8W9aG/wavetrainer
 Author: Will Sackfield
{wavetrainer-0.0.44 → wavetrainer-0.0.46}/setup.py

@@ -23,7 +23,7 @@ def install_requires() -> typing.List[str]:
 
 setup(
     name='wavetrainer',
-    version='0.0.44',
+    version='0.0.46',
     description='A library for automatically finding the optimal model within feature and hyperparameter space.',
     long_description=long_description,
     long_description_content_type='text/markdown',
{wavetrainer-0.0.44 → wavetrainer-0.0.46}/wavetrainer/__init__.py

@@ -2,5 +2,5 @@
 
 from .create import create
 
-__VERSION__ = "0.0.44"
+__VERSION__ = "0.0.46"
 __all__ = ("create",)
{wavetrainer-0.0.44 → wavetrainer-0.0.46}/wavetrainer/create.py

@@ -15,6 +15,7 @@ def create(
     dt_column: str | None = None,
     max_train_timeout: datetime.timedelta | None = None,
     cutoff_dt: datetime.datetime | None = None,
+    embedding_cols: list[list[str]] | None = None,
 ) -> Trainer:
     """Create a trainer."""
     return Trainer(
@@ -25,4 +26,5 @@ def create(
         dt_column=dt_column,
         max_train_timeout=max_train_timeout,
         cutoff_dt=cutoff_dt,
+        embedding_cols=embedding_cols,
     )
{wavetrainer-0.0.44 → wavetrainer-0.0.46}/wavetrainer/model/model_router.py

@@ -81,11 +81,11 @@ class ModelRouter(Model):
     def set_options(
         self, trial: optuna.Trial | optuna.trial.FrozenTrial, df: pd.DataFrame
     ) -> None:
-        model = _MODELS[
-            trial.suggest_categorical(
-                "model", [k for k, v in _MODELS.items() if v.supports_x(df)]
-            )
-        ]()
+        model_name = trial.suggest_categorical(
+            "model", [k for k, v in _MODELS.items() if v.supports_x(df)]
+        )
+        print(f"Using {model_name} model")
+        model = _MODELS[model_name]()
         model.set_options(trial, df)
         self._model = model
 
{wavetrainer-0.0.44 → wavetrainer-0.0.46}/wavetrainer/model/xgboost/xgboost_model.py

@@ -109,7 +109,9 @@ class XGBoostModel(Model):
     def feature_importances(self) -> dict[str, float]:
         bst = self._provide_xgboost()
         try:
-            return bst.get_booster().get_score(importance_type="weight")  # type: ignore
+            score_dict = bst.get_booster().get_score(importance_type="weight")  # type: ignore
+            total = sum(score_dict.values())  # type: ignore
+            return {k: v / total for k, v in score_dict.items()}  # type: ignore
         except XGBoostError as exc:
             print(str(exc))
             return {}
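
This change normalizes XGBoost's raw "weight" scores so the returned importances sum to 1.0, giving downstream consumers a distribution rather than raw split counts. A minimal sketch of the same arithmetic with hypothetical scores (it assumes at least one nonzero score; an empty score dict would divide by zero):

    # Hypothetical output of Booster.get_score(importance_type="weight")
    score_dict = {"f0": 12.0, "f1": 6.0, "f2": 2.0}
    total = sum(score_dict.values())  # 20.0
    normalized = {k: v / total for k, v in score_dict.items()}
    print(normalized)  # {'f0': 0.6, 'f1': 0.3, 'f2': 0.1}
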
{wavetrainer-0.0.44 → wavetrainer-0.0.46}/wavetrainer/reducer/combined_reducer.py

@@ -14,6 +14,7 @@ from .constant_reducer import ConstantReducer
 from .correlation_reducer import CorrelationReducer
 from .duplicate_reducer import DuplicateReducer
 from .nonnumeric_reducer import NonNumericReducer
+from .pca_reducer import PCAReducer
 from .reducer import Reducer
 from .select_by_single_feature_performance_reducer import \
     SelectBySingleFeaturePerformanceReducer
@@ -29,12 +30,14 @@ class CombinedReducer(Reducer):
     """A reducer that combines a series of reducers."""
 
     # pylint: disable=too-many-positional-arguments,too-many-arguments
+    _folder: str | None
 
-    def __init__(self):
+    def __init__(self, embedding_cols: list[list[str]] | None):
         super().__init__()
         self._reducers = [
             UnseenReducer(),
             NonNumericReducer(),
+            PCAReducer(embedding_cols),
             ConstantReducer(),
             DuplicateReducer(),
             CorrelationReducer(),
@@ -42,6 +45,7 @@ class CombinedReducer(Reducer):
             # SelectBySingleFeaturePerformanceReducer(),
         ]
         self._folder = None
+        self._embedding_cols = embedding_cols
 
     @classmethod
     def name(cls) -> str:
@@ -74,6 +78,8 @@ class CombinedReducer(Reducer):
             self._reducers.append(SmartCorrelationReducer())
         elif reducer_name == SelectBySingleFeaturePerformanceReducer.name():
             self._reducers.append(SelectBySingleFeaturePerformanceReducer())
+        elif reducer_name == PCAReducer.name():
+            self._reducers.append(PCAReducer(self._embedding_cols))
         for reducer in self._reducers:
             reducer.load(folder)
         self._folder = folder
wavetrainer-0.0.46/wavetrainer/reducer/pca_reducer.py (new file)

@@ -0,0 +1,78 @@
+"""A reducer that reduces embeddings using PCA."""
+
+# pylint: disable=too-many-arguments,too-many-positional-arguments
+import hashlib
+import os
+from typing import Self
+
+import joblib  # type: ignore
+import optuna
+import pandas as pd
+from sklearn.decomposition import PCA  # type: ignore
+
+from .reducer import Reducer
+
+_PCA_THRESHOLD = "pca_threshold"
+
+
+class PCAReducer(Reducer):
+    """A class that reduces embeddings using PCA."""
+
+    _pcas: dict[str, PCA]
+
+    @classmethod
+    def name(cls) -> str:
+        return "pca"
+
+    def __init__(self, embedding_cols: list[list[str]] | None):
+        super().__init__()
+        self._embedding_cols = embedding_cols if embedding_cols is not None else []
+        self._pcas = {}
+
+    @property
+    def _embedding_dict(self) -> dict[str, list[str]]:
+        return {
+            hashlib.sha256("|".join(sorted(x)).encode()).hexdigest(): x
+            for x in self._embedding_cols
+        }
+
+    def set_options(
+        self, trial: optuna.Trial | optuna.trial.FrozenTrial, df: pd.DataFrame
+    ) -> None:
+        if self._embedding_cols is None:
+            return
+        threshold = trial.suggest_float(_PCA_THRESHOLD, 0.7, 0.99)
+        self._pcas = {k: PCA(n_components=threshold) for k in self._embedding_dict}
+
+    def load(self, folder: str) -> None:
+        for k in self._embedding_dict:
+            self._pcas[k] = joblib.load(os.path.join(folder, f"{k}_pca_reducer.joblib"))
+
+    def save(self, folder: str, trial: optuna.Trial | optuna.trial.FrozenTrial) -> None:
+        for k, v in self._pcas.items():
+            joblib.dump(v, os.path.join(folder, f"{k}_pca_reducer.joblib"))
+
+    def fit(
+        self,
+        df: pd.DataFrame,
+        y: pd.Series | pd.DataFrame | None = None,
+        w: pd.Series | None = None,
+        eval_x: pd.DataFrame | None = None,
+        eval_y: pd.Series | pd.DataFrame | None = None,
+    ) -> Self:
+        if self._embedding_cols is None:
+            return self
+        for k, v in self._pcas.items():
+            v.fit(df[self._embedding_dict[k]])
+        return self
+
+    def transform(self, df: pd.DataFrame) -> pd.DataFrame:
+        if self._embedding_cols is None:
+            return df
+        for k, v in self._pcas.items():
+            cols = self._embedding_dict[k]
+            compressed_embedding = v.transform(df[cols])
+            embedding_len = compressed_embedding.shape[0]
+            df[cols[:embedding_len]] = compressed_embedding
+            df = df.drop(columns=cols[embedding_len:])
+        return df
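
Together with the `create()` and `Trainer` changes, this new reducer is the headline feature of the release: each inner list in `embedding_cols` names the columns of one embedding, the reducer keys each group by a SHA-256 hash of its sorted column names, and a separate PCA is fit per group with its variance threshold tuned between 0.7 and 0.99. A minimal sketch of how the new parameter might be wired up; the folder argument, column names, and data are hypothetical, and it assumes `create`'s first positional argument is the model folder:

    import pandas as pd
    import wavetrainer as wt

    # Hypothetical frame: one 4-dimensional embedding plus an ordinary feature.
    df = pd.DataFrame({
        "emb_0": [0.1, 0.2, 0.3, 0.4],
        "emb_1": [0.4, 0.3, 0.2, 0.1],
        "emb_2": [0.5, 0.6, 0.7, 0.8],
        "emb_3": [0.9, 0.8, 0.7, 0.6],
        "price": [10.0, 11.0, 12.0, 13.0],
    })
    y = pd.Series([0, 1, 0, 1])

    # Each inner list groups the columns of one embedding; one PCA is fit per group.
    trainer = wt.create(
        "my_model_folder",
        embedding_cols=[["emb_0", "emb_1", "emb_2", "emb_3"]],
    )
    trainer.fit(df, y=y)
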
{wavetrainer-0.0.44 → wavetrainer-0.0.46}/wavetrainer/selector/selector.py

@@ -83,18 +83,18 @@ class Selector(Params, Fit):
         current_features = [list(feature_importances.keys())[0]]
         current_features = current_features[:required_features]
         print(
-            f"Current Features:\n{pd.Series(data=list(feature_importances.values()), index=list(feature_importances.keys()))}\n"
+            f"Current Features:\n{pd.Series(data=[feature_importances[x] for x in current_features], index=current_features)}\n"
         )
 
         n_features = len(current_features)
         for i in range(self._steps):
-            print(
-                f"Recursive Feature Elimination Step {i}, current features: {len(current_features)}"
-            )
             ratio_diff = 1.0 - self._feature_ratio
             ratio_step = ratio_diff / float(self._steps)
             current_ratio = 1.0 - (ratio_step * i)
             n_features = max(1, int(total_columns * current_ratio))
+            print(
+                f"Recursive Feature Elimination Step {i}, current features: {len(current_features)} required features: {n_features}"
+            )
             if n_features >= len(current_features):
                 continue
 
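
Moving the log line below the ratio arithmetic lets it report the step's target count, which is only known once `n_features` has been computed. Worked through with illustrative values (none of these numbers come from the package):

    total_columns = 100  # illustrative inputs, not from the source
    feature_ratio = 0.5
    steps = 5

    ratio_diff = 1.0 - feature_ratio        # 0.5
    ratio_step = ratio_diff / float(steps)  # 0.1
    for i in range(steps):
        current_ratio = 1.0 - (ratio_step * i)
        n_features = max(1, int(total_columns * current_ratio))
        print(i, n_features)  # 0 -> 100, 1 -> 90, 2 -> 80, 3 -> 70, 4 -> 60
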
{wavetrainer-0.0.44 → wavetrainer-0.0.46}/wavetrainer/trainer.py

@@ -63,6 +63,7 @@ class Trainer(Fit):
         dt_column: str | None = None,
         max_train_timeout: datetime.timedelta | None = None,
         cutoff_dt: datetime.datetime | None = None,
+        embedding_cols: list[list[str]] | None = None,
     ):
         tqdm.tqdm.pandas()
 
@@ -153,6 +154,7 @@ class Trainer(Fit):
         self._dt_column = dt_column
         self._max_train_timeout = max_train_timeout
         self._cutoff_dt = cutoff_dt
+        self._embedding_cols = embedding_cols
 
     def _provide_study(self, column: str) -> optuna.Study:
         storage_name = f"sqlite:///{self._folder}/{column}/{_STUDYDB_FILENAME}"
@@ -247,7 +249,7 @@ class Trainer(Fit):
 
         # Perform common reductions
         start_reducer = time.time()
-        reducer = CombinedReducer()
+        reducer = CombinedReducer(self._embedding_cols)
         reducer.set_options(trial, x)
         x_train = reducer.fit_transform(x_train, y=y_train)
         x_test = reducer.transform(x_test)
@@ -403,6 +405,8 @@ class Trainer(Fit):
             enumerate(dt_index[dt_index >= start_test_index])
         ):
             test_dt = test_idx.to_pydatetime()
+            test_df = df.iloc[: train_len + count + test_len]
+            test_series = y_series.iloc[: train_len + count + test_len]
             found = False
             for trial in study.trials:
                 dt_idx = datetime.datetime.fromisoformat(
@@ -413,6 +417,14 @@ class Trainer(Fit):
                     break
             if found:
                 last_processed_dt = test_dt
+                _fit(
+                    study.best_trial,
+                    test_df.copy(),
+                    test_series,
+                    True,
+                    test_idx,
+                    True,
+                )
                 continue
             if (
                 last_processed_dt is not None
@@ -420,8 +432,6 @@ class Trainer(Fit):
             ):
                 continue
 
-            test_df = df.iloc[: train_len + count + test_len]
-            test_series = y_series.iloc[: train_len + count + test_len]
             if len(test_df) <= 3:
                 continue
 
@@ -430,7 +440,7 @@ class Trainer(Fit):
             def validate_objctive(
                 trial: optuna.Trial, idx: datetime.datetime, series: pd.Series
             ) -> float:
-                return _fit(trial, test_df, series, False, idx, False)
+                return _fit(trial, test_df.copy(), series, False, idx, False)
 
             study.optimize(
                 functools.partial(
@@ -444,7 +454,9 @@ class Trainer(Fit):
             else:
                 break
 
-            _fit(study.best_trial, test_df, test_series, True, test_idx, True)
+            _fit(
+                study.best_trial, test_df.copy(), test_series, True, test_idx, True
+            )
             last_processed_dt = test_idx
 
         if isinstance(y, pd.Series):
@@ -502,7 +514,7 @@ class Trainer(Fit):
             date_str = dates[-1].isoformat()
             folder = os.path.join(column_path, date_str)
 
-            reducer = CombinedReducer()
+            reducer = CombinedReducer(self._embedding_cols)
             reducer.load(folder)
 
             model = ModelRouter()
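
The slices `df.iloc[: train_len + count + test_len]` grow by one row per walk-forward step, so each test timestamp trains on all history up to it. Computing them before the study lookup means a timestamp whose trial already exists is still refit on the best trial, and passing `test_df.copy()` into `_fit` shields the shared frame from in-place edits (the new PCA reducer's `transform`, for example, assigns into the frame it receives). A tiny illustration of the expanding window with made-up sizes:

    # Made-up sizes; only the slicing pattern is from the diff.
    train_len, test_len = 5, 2
    for count in range(3):
        upper = train_len + count + test_len
        print(f"step {count}: rows [0, {upper})")
    # step 0: rows [0, 7)
    # step 1: rows [0, 8)
    # step 2: rows [0, 9)
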
{wavetrainer-0.0.44 → wavetrainer-0.0.46/wavetrainer.egg-info}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: wavetrainer
-Version: 0.0.44
+Version: 0.0.46
 Summary: A library for automatically finding the optimal model within feature and hyperparameter space.
 Home-page: https://github.com/8W9aG/wavetrainer
 Author: Will Sackfield
{wavetrainer-0.0.44 → wavetrainer-0.0.46}/wavetrainer.egg-info/SOURCES.txt

@@ -47,6 +47,7 @@ wavetrainer/reducer/correlation_reducer.py
 wavetrainer/reducer/duplicate_reducer.py
 wavetrainer/reducer/non_categorical_numeric_columns.py
 wavetrainer/reducer/nonnumeric_reducer.py
+wavetrainer/reducer/pca_reducer.py
 wavetrainer/reducer/reducer.py
 wavetrainer/reducer/select_by_single_feature_performance_reducer.py
 wavetrainer/reducer/smart_correlation_reducer.py