wavetrainer 0.1.14.tar.gz → 0.1.16.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (70)
  1. {wavetrainer-0.1.14/wavetrainer.egg-info → wavetrainer-0.1.16}/PKG-INFO +1 -1
  2. {wavetrainer-0.1.14 → wavetrainer-0.1.16}/setup.py +1 -1
  3. {wavetrainer-0.1.14 → wavetrainer-0.1.16}/wavetrainer/__init__.py +1 -1
  4. {wavetrainer-0.1.14 → wavetrainer-0.1.16}/wavetrainer/create.py +2 -0
  5. {wavetrainer-0.1.14 → wavetrainer-0.1.16}/wavetrainer/reducer/combined_reducer.py +10 -3
  6. {wavetrainer-0.1.14 → wavetrainer-0.1.16}/wavetrainer/reducer/correlation_reducer.py +55 -37
  7. {wavetrainer-0.1.14 → wavetrainer-0.1.16}/wavetrainer/trainer.py +15 -4
  8. {wavetrainer-0.1.14 → wavetrainer-0.1.16/wavetrainer.egg-info}/PKG-INFO +1 -1
  9. {wavetrainer-0.1.14 → wavetrainer-0.1.16}/LICENSE +0 -0
  10. {wavetrainer-0.1.14 → wavetrainer-0.1.16}/MANIFEST.in +0 -0
  11. {wavetrainer-0.1.14 → wavetrainer-0.1.16}/README.md +0 -0
  12. {wavetrainer-0.1.14 → wavetrainer-0.1.16}/requirements.txt +0 -0
  13. {wavetrainer-0.1.14 → wavetrainer-0.1.16}/setup.cfg +0 -0
  14. {wavetrainer-0.1.14 → wavetrainer-0.1.16}/tests/__init__.py +0 -0
  15. {wavetrainer-0.1.14 → wavetrainer-0.1.16}/tests/model/__init__.py +0 -0
  16. {wavetrainer-0.1.14 → wavetrainer-0.1.16}/tests/model/catboost_kwargs_test.py +0 -0
  17. {wavetrainer-0.1.14 → wavetrainer-0.1.16}/tests/trainer_test.py +0 -0
  18. {wavetrainer-0.1.14 → wavetrainer-0.1.16}/wavetrainer/calibrator/__init__.py +0 -0
  19. {wavetrainer-0.1.14 → wavetrainer-0.1.16}/wavetrainer/calibrator/calibrator.py +0 -0
  20. {wavetrainer-0.1.14 → wavetrainer-0.1.16}/wavetrainer/calibrator/calibrator_router.py +0 -0
  21. {wavetrainer-0.1.14 → wavetrainer-0.1.16}/wavetrainer/calibrator/vennabers_calibrator.py +0 -0
  22. {wavetrainer-0.1.14 → wavetrainer-0.1.16}/wavetrainer/exceptions.py +0 -0
  23. {wavetrainer-0.1.14 → wavetrainer-0.1.16}/wavetrainer/fit.py +0 -0
  24. {wavetrainer-0.1.14 → wavetrainer-0.1.16}/wavetrainer/model/__init__.py +0 -0
  25. {wavetrainer-0.1.14 → wavetrainer-0.1.16}/wavetrainer/model/catboost/__init__.py +0 -0
  26. {wavetrainer-0.1.14 → wavetrainer-0.1.16}/wavetrainer/model/catboost/catboost_classifier_wrap.py +0 -0
  27. {wavetrainer-0.1.14 → wavetrainer-0.1.16}/wavetrainer/model/catboost/catboost_kwargs.py +0 -0
  28. {wavetrainer-0.1.14 → wavetrainer-0.1.16}/wavetrainer/model/catboost/catboost_model.py +0 -0
  29. {wavetrainer-0.1.14 → wavetrainer-0.1.16}/wavetrainer/model/catboost/catboost_regressor_wrap.py +0 -0
  30. {wavetrainer-0.1.14 → wavetrainer-0.1.16}/wavetrainer/model/lightgbm/__init__.py +0 -0
  31. {wavetrainer-0.1.14 → wavetrainer-0.1.16}/wavetrainer/model/lightgbm/lightgbm_model.py +0 -0
  32. {wavetrainer-0.1.14 → wavetrainer-0.1.16}/wavetrainer/model/model.py +0 -0
  33. {wavetrainer-0.1.14 → wavetrainer-0.1.16}/wavetrainer/model/model_router.py +0 -0
  34. {wavetrainer-0.1.14 → wavetrainer-0.1.16}/wavetrainer/model/tabpfn/__init__.py +0 -0
  35. {wavetrainer-0.1.14 → wavetrainer-0.1.16}/wavetrainer/model/tabpfn/tabpfn_model.py +0 -0
  36. {wavetrainer-0.1.14 → wavetrainer-0.1.16}/wavetrainer/model/xgboost/__init__.py +0 -0
  37. {wavetrainer-0.1.14 → wavetrainer-0.1.16}/wavetrainer/model/xgboost/early_stopper.py +0 -0
  38. {wavetrainer-0.1.14 → wavetrainer-0.1.16}/wavetrainer/model/xgboost/xgboost_logger.py +0 -0
  39. {wavetrainer-0.1.14 → wavetrainer-0.1.16}/wavetrainer/model/xgboost/xgboost_model.py +0 -0
  40. {wavetrainer-0.1.14 → wavetrainer-0.1.16}/wavetrainer/model_type.py +0 -0
  41. {wavetrainer-0.1.14 → wavetrainer-0.1.16}/wavetrainer/params.py +0 -0
  42. {wavetrainer-0.1.14 → wavetrainer-0.1.16}/wavetrainer/reducer/__init__.py +0 -0
  43. {wavetrainer-0.1.14 → wavetrainer-0.1.16}/wavetrainer/reducer/base_selector_reducer.py +0 -0
  44. {wavetrainer-0.1.14 → wavetrainer-0.1.16}/wavetrainer/reducer/constant_reducer.py +0 -0
  45. {wavetrainer-0.1.14 → wavetrainer-0.1.16}/wavetrainer/reducer/duplicate_reducer.py +0 -0
  46. {wavetrainer-0.1.14 → wavetrainer-0.1.16}/wavetrainer/reducer/non_categorical_numeric_columns.py +0 -0
  47. {wavetrainer-0.1.14 → wavetrainer-0.1.16}/wavetrainer/reducer/nonnumeric_reducer.py +0 -0
  48. {wavetrainer-0.1.14 → wavetrainer-0.1.16}/wavetrainer/reducer/pca_reducer.py +0 -0
  49. {wavetrainer-0.1.14 → wavetrainer-0.1.16}/wavetrainer/reducer/reducer.py +0 -0
  50. {wavetrainer-0.1.14 → wavetrainer-0.1.16}/wavetrainer/reducer/select_by_single_feature_performance_reducer.py +0 -0
  51. {wavetrainer-0.1.14 → wavetrainer-0.1.16}/wavetrainer/reducer/smart_correlation_reducer.py +0 -0
  52. {wavetrainer-0.1.14 → wavetrainer-0.1.16}/wavetrainer/reducer/unseen_reducer.py +0 -0
  53. {wavetrainer-0.1.14 → wavetrainer-0.1.16}/wavetrainer/selector/__init__.py +0 -0
  54. {wavetrainer-0.1.14 → wavetrainer-0.1.16}/wavetrainer/selector/selector.py +0 -0
  55. {wavetrainer-0.1.14 → wavetrainer-0.1.16}/wavetrainer/weights/__init__.py +0 -0
  56. {wavetrainer-0.1.14 → wavetrainer-0.1.16}/wavetrainer/weights/class_weights.py +0 -0
  57. {wavetrainer-0.1.14 → wavetrainer-0.1.16}/wavetrainer/weights/combined_weights.py +0 -0
  58. {wavetrainer-0.1.14 → wavetrainer-0.1.16}/wavetrainer/weights/exponential_weights.py +0 -0
  59. {wavetrainer-0.1.14 → wavetrainer-0.1.16}/wavetrainer/weights/linear_weights.py +0 -0
  60. {wavetrainer-0.1.14 → wavetrainer-0.1.16}/wavetrainer/weights/noop_weights.py +0 -0
  61. {wavetrainer-0.1.14 → wavetrainer-0.1.16}/wavetrainer/weights/sigmoid_weights.py +0 -0
  62. {wavetrainer-0.1.14 → wavetrainer-0.1.16}/wavetrainer/weights/weights.py +0 -0
  63. {wavetrainer-0.1.14 → wavetrainer-0.1.16}/wavetrainer/weights/weights_router.py +0 -0
  64. {wavetrainer-0.1.14 → wavetrainer-0.1.16}/wavetrainer/windower/__init__.py +0 -0
  65. {wavetrainer-0.1.14 → wavetrainer-0.1.16}/wavetrainer/windower/windower.py +0 -0
  66. {wavetrainer-0.1.14 → wavetrainer-0.1.16}/wavetrainer.egg-info/SOURCES.txt +0 -0
  67. {wavetrainer-0.1.14 → wavetrainer-0.1.16}/wavetrainer.egg-info/dependency_links.txt +0 -0
  68. {wavetrainer-0.1.14 → wavetrainer-0.1.16}/wavetrainer.egg-info/not-zip-safe +0 -0
  69. {wavetrainer-0.1.14 → wavetrainer-0.1.16}/wavetrainer.egg-info/requires.txt +0 -0
  70. {wavetrainer-0.1.14 → wavetrainer-0.1.16}/wavetrainer.egg-info/top_level.txt +0 -0
PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: wavetrainer
-Version: 0.1.14
+Version: 0.1.16
 Summary: A library for automatically finding the optimal model within feature and hyperparameter space.
 Home-page: https://github.com/8W9aG/wavetrainer
 Author: Will Sackfield
setup.py
@@ -23,7 +23,7 @@ def install_requires() -> typing.List[str]:
 
 setup(
     name='wavetrainer',
-    version='0.1.14',
+    version='0.1.16',
     description='A library for automatically finding the optimal model within feature and hyperparameter space.',
     long_description=long_description,
     long_description_content_type='text/markdown',
wavetrainer/__init__.py
@@ -2,5 +2,5 @@
 
 from .create import create
 
-__VERSION__ = "0.1.14"
+__VERSION__ = "0.1.15"
 __all__ = ("create",)
wavetrainer/create.py
@@ -18,6 +18,7 @@ def create(
     embedding_cols: list[list[str]] | None = None,
     allowed_models: set[str] | None = None,
     max_false_positive_reduction_steps: int | None = None,
+    correlation_chunk_size: int | None = None,
 ) -> Trainer:
     """Create a trainer."""
     return Trainer(
@@ -31,4 +32,5 @@ def create(
         embedding_cols=embedding_cols,
         allowed_models=allowed_models,
         max_false_positive_reduction_steps=max_false_positive_reduction_steps,
+        correlation_chunk_size=correlation_chunk_size,
     )
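
Callers opt in through `create`. A minimal sketch of the new keyword in use; `create`'s remaining arguments are not shown in this hunk, so they stay elided here, and the value 200 is purely illustrative:

```python
import wavetrainer as wt

# Only the new keyword is shown; pass the other create() arguments as
# before. Leaving it as None falls back to chunks of 500 columns.
trainer = wt.create(
    # ... existing arguments unchanged ...
    correlation_chunk_size=200,  # correlate at most 200 columns at a time
)
```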
wavetrainer/reducer/combined_reducer.py
@@ -31,20 +31,25 @@ class CombinedReducer(Reducer):
     # pylint: disable=too-many-positional-arguments,too-many-arguments
     _folder: str | None
 
-    def __init__(self, embedding_cols: list[list[str]] | None):
+    def __init__(
+        self, embedding_cols: list[list[str]] | None, correlation_chunk_size: int | None
+    ):
         super().__init__()
+        if correlation_chunk_size is None:
+            correlation_chunk_size = 500
         self._reducers = [
             UnseenReducer(),
             NonNumericReducer(),
             PCAReducer(embedding_cols),
             ConstantReducer(),
             DuplicateReducer(),
-            CorrelationReducer(),
+            CorrelationReducer(correlation_chunk_size=correlation_chunk_size),
             SmartCorrelationReducer(),
             # SelectBySingleFeaturePerformanceReducer(),
         ]
         self._folder = None
         self._embedding_cols = embedding_cols
+        self._correlation_chunk_size = correlation_chunk_size
 
     @classmethod
     def name(cls) -> str:
@@ -68,7 +73,9 @@ class CombinedReducer(Reducer):
             elif reducer_name == DuplicateReducer.name():
                 self._reducers.append(DuplicateReducer())
             elif reducer_name == CorrelationReducer.name():
-                self._reducers.append(CorrelationReducer())
+                self._reducers.append(
+                    CorrelationReducer(self._correlation_chunk_size)
+                )
             elif reducer_name == NonNumericReducer.name():
                 self._reducers.append(NonNumericReducer())
             elif reducer_name == UnseenReducer.name():
wavetrainer/reducer/correlation_reducer.py
@@ -1,6 +1,6 @@
 """A reducer that removes correlation features."""
 
-# pylint: disable=too-many-arguments,too-many-positional-arguments,consider-using-enumerate
+# pylint: disable=too-many-arguments,too-many-positional-arguments,consider-using-enumerate,too-many-locals
 import json
 import os
 from typing import Self
@@ -17,51 +17,64 @@ _CORRELATION_REDUCER_FILENAME = "correlation_reducer.json"
 _CORRELATION_REDUCER_THRESHOLD = "correlation_reducer_threshold"
 
 
-def _get_correlated_features_to_drop(
-    df: pd.DataFrame, threshold: float = 0.85, random_seed: int = 42
+def _get_correlated_features_to_drop_chunked(
+    df: pd.DataFrame,
+    threshold: float = 0.85,
+    chunk_size: int = 10000,
+    random_seed: int = 42,
 ) -> list[str]:
     """
-    Identify highly correlated features to drop, keeping one per group.
-    NaNs are replaced with a single fixed junk value to allow correlation computation.
-    Columns are processed in sorted order to ensure deterministic output.
-
-    Args:
-        df (pd.DataFrame): Input DataFrame.
-        threshold (float): Correlation threshold above which features are considered redundant.
-        random_seed (int): Seed used to generate the fixed junk value.
-
-    Returns:
-        List[str]: List of column names to drop.
+    Chunked correlation feature reducer to control memory usage.
+    Applies correlation pruning within chunks, then across surviving features.
     """
     np.random.seed(random_seed)
-
-    # Select and sort numeric columns
     sorted_cols = sorted(find_non_categorical_numeric_columns(df))
     df_numeric = df[sorted_cols].copy()
-
-    # Generate and apply a fixed junk value for NaNs
     junk_value = np.random.uniform(-1e9, 1e9)
-    df_numeric = df_numeric.fillna(junk_value)
-
-    if df_numeric.shape[1] < 2:
-        return []
-
-    # Compute absolute correlation matrix
-    corr_matrix = np.corrcoef(df_numeric.values, rowvar=False)
-    abs_corr = np.abs(corr_matrix)
-
-    # Greedy feature drop based on sorted order
-    to_drop = set()
-    for i in range(len(sorted_cols)):
-        if sorted_cols[i] in to_drop:
+    df_numeric = df_numeric.fillna(junk_value).astype(np.float32)
+
+    # First pass: intra-chunk correlation pruning
+    survivors = []
+    to_drop_total = set()
+    for i in range(0, len(sorted_cols), chunk_size):
+        chunk_cols = sorted_cols[i : i + chunk_size]
+        chunk_df = df_numeric[chunk_cols]
+        chunk_corr = np.corrcoef(chunk_df.values, rowvar=False)
+        abs_corr = np.abs(chunk_corr)
+
+        to_drop = set()
+        for j in range(len(chunk_cols)):
+            if chunk_cols[j] in to_drop:
+                continue
+            for k in range(j + 1, len(chunk_cols)):
+                if chunk_cols[k] in to_drop:
+                    continue
+                if abs_corr[j, k] > threshold:
+                    to_drop.add(chunk_cols[k])
+
+        survivors.extend([col for col in chunk_cols if col not in to_drop])
+        to_drop_total.update(to_drop)
+
+    # Second pass: global correlation among survivors
+    if len(survivors) < 2:
+        return sorted(to_drop_total)
+
+    survivors_df = df_numeric[survivors]
+    final_corr = np.corrcoef(survivors_df.values, rowvar=False)
+    abs_corr = np.abs(final_corr)
+
+    final_drop = set()
+    for i in range(len(survivors)):
+        if survivors[i] in final_drop:
             continue
-        for j in range(i + 1, len(sorted_cols)):
-            if sorted_cols[j] in to_drop:
+        for j in range(i + 1, len(survivors)):
+            if survivors[j] in final_drop:
                 continue
             if abs_corr[i, j] > threshold:
-                to_drop.add(sorted_cols[j])
+                final_drop.add(survivors[j])
 
-    return sorted(to_drop)
+    to_drop_total.update(final_drop)
+    return sorted(to_drop_total)
 
 
 class CorrelationReducer(Reducer):
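
The rewritten reducer is easier to follow outside the diff. Below is a small, self-contained sketch of the same two-pass idea on toy data with a chunk size of 2; `select_dtypes` stands in for the package's `find_non_categorical_numeric_columns` helper, and the function name here is illustrative, not the library's API:

```python
import numpy as np
import pandas as pd

def chunked_corr_drop(df: pd.DataFrame, threshold: float = 0.85,
                      chunk_size: int = 2, seed: int = 42) -> list[str]:
    """Two-pass chunked pruning: within chunks, then across survivors."""
    rng = np.random.default_rng(seed)
    # select_dtypes stands in for find_non_categorical_numeric_columns.
    cols = sorted(df.select_dtypes(include=np.number).columns)
    data = df[cols].fillna(rng.uniform(-1e9, 1e9)).astype(np.float32)

    def prune(names: list[str]) -> set[str]:
        # Greedy drop: keep the first column of each correlated group.
        corr = np.abs(np.corrcoef(data[names].values, rowvar=False))
        drop: set[str] = set()
        for i in range(len(names)):
            if names[i] in drop:
                continue
            for j in range(i + 1, len(names)):
                if names[j] not in drop and corr[i, j] > threshold:
                    drop.add(names[j])
        return drop

    dropped: set[str] = set()
    survivors: list[str] = []
    for start in range(0, len(cols), chunk_size):  # pass 1: inside chunks
        chunk = cols[start : start + chunk_size]
        fell = prune(chunk) if len(chunk) > 1 else set()
        dropped |= fell
        survivors += [c for c in chunk if c not in fell]
    if len(survivors) >= 2:  # pass 2: across chunk survivors
        dropped |= prune(survivors)
    return sorted(dropped)

toy = pd.DataFrame({"a": [1, 2, 3, 4], "b": [2, 4, 6, 8], "c": [4, 1, 3, 2]})
print(chunked_corr_drop(toy))  # ['b'] -- b is 2*a, so it falls in pass 1
```

The payoff is that pass 1 never materializes more than a chunk_size × chunk_size correlation matrix; only the survivors meet in a single global pass at the end.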
@@ -69,9 +82,10 @@ class CorrelationReducer(Reducer):
 
     _correlation_drop_features: dict[str, bool]
 
-    def __init__(self) -> None:
+    def __init__(self, correlation_chunk_size: int) -> None:
         self._threshold = 0.0
         self._correlation_drop_features = {}
+        self._correlation_chunk_size = correlation_chunk_size
 
     @classmethod
     def name(cls) -> str:
@@ -102,7 +116,11 @@
         eval_x: pd.DataFrame | None = None,
         eval_y: pd.Series | pd.DataFrame | None = None,
     ) -> Self:
-        drop_features = _get_correlated_features_to_drop(df, threshold=self._threshold)
+        drop_features = _get_correlated_features_to_drop_chunked(
+            df.copy(),
+            threshold=self._threshold,
+            chunk_size=self._correlation_chunk_size,
+        )
         self._correlation_drop_features = {x: True for x in drop_features}
         return self
 
wavetrainer/trainer.py
@@ -42,6 +42,7 @@ _VALIDATION_SIZE_KEY = "validation_size"
 _IDX_USR_ATTR_KEY = "idx"
 _DT_COLUMN_KEY = "dt_column"
 _MAX_FALSE_POSITIVE_REDUCTION_STEPS_KEY = "max_false_positive_reduction_steps"
+_CORRELATION_CHUNK_SIZE_KEY = "correlation_chunk_size"
 _BAD_OUTPUT = -1000.0
 
 
@@ -75,6 +76,7 @@ class Trainer(Fit):
         embedding_cols: list[list[str]] | None = None,
         allowed_models: set[str] | None = None,
         max_false_positive_reduction_steps: int | None = None,
+        correlation_chunk_size: int | None = None,
     ):
         tqdm.tqdm.pandas()
 
@@ -129,6 +131,8 @@ class Trainer(Fit):
                 max_false_positive_reduction_steps = params.get(
                     _MAX_FALSE_POSITIVE_REDUCTION_STEPS_KEY
                 )
+                if correlation_chunk_size is None:
+                    correlation_chunk_size = params.get(_CORRELATION_CHUNK_SIZE_KEY)
         else:
             with open(params_file, "w", encoding="utf8") as handle:
                 validation_size_value = None
@@ -160,6 +164,7 @@ class Trainer(Fit):
                         _VALIDATION_SIZE_KEY: validation_size_value,
                         _DT_COLUMN_KEY: dt_column,
                         _MAX_FALSE_POSITIVE_REDUCTION_STEPS_KEY: max_false_positive_reduction_steps,
+                        _CORRELATION_CHUNK_SIZE_KEY: correlation_chunk_size,
                     },
                     handle,
                 )
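
With the key persisted, a trainer reopened from its folder can recover the setting whenever the constructor argument is None (per the `params.get(_CORRELATION_CHUNK_SIZE_KEY)` fallback above). A sketch of what the params file plausibly looks like after this change; only the key names come from the constants in the diff, the values are invented:

```python
import json

# Illustrative params file contents; other persisted settings elided.
params = {
    "validation_size": 0.2,
    "dt_column": "date",
    "max_false_positive_reduction_steps": None,
    "correlation_chunk_size": 200,
}
print(json.dumps(params, indent=2))
```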
@@ -173,6 +178,7 @@ class Trainer(Fit):
         self.embedding_cols = embedding_cols
         self._allowed_models = allowed_models
         self._max_false_positive_reduction_steps = max_false_positive_reduction_steps
+        self._correlation_chunk_size = correlation_chunk_size
 
     def _provide_study(self, column: str) -> optuna.Study:
         storage_name = f"sqlite:///{self._folder}/{column}/{_STUDYDB_FILENAME}"
@@ -246,7 +252,8 @@ class Trainer(Fit):
                        "Found trial %d previously executed, skipping...",
                        trial.number,
                    )
-                   return trial_info["output"]
+                   return tuple(trial_info["output"])
+               print("Retraining for different trial number.")
 
            train_dt_index = dt_index[: len(x)]
            x_train = x[train_dt_index < split_idx]  # type: ignore
@@ -270,7 +277,9 @@
 
            # Perform common reductions
            start_reducer = time.time()
-           reducer = CombinedReducer(self.embedding_cols)
+           reducer = CombinedReducer(
+               self.embedding_cols, self._correlation_chunk_size
+           )
            reducer.set_options(trial, x)
            x_train = reducer.fit_transform(x_train, y=y_train)
            x_test = reducer.transform(x_test)
@@ -367,7 +376,7 @@ class Trainer(Fit):
            json.dump(
                {
                    "number": trial.number,
-                   "output": output,
+                   "output": [output, loss],
                },
                handle,
            )
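
This pairs with the `tuple(trial_info["output"])` change earlier in the file: `json.dump` writes the `[output, loss]` pair out as a JSON array, and reloading it yields a Python list, so the cached value has to be re-tupled before it is returned in place of rerunning the trial. A quick illustration:

```python
import json

# JSON has no tuple type: a tuple survives the round trip as a list.
cached = json.loads(json.dumps({"number": 7, "output": (0.91, 0.42)}))
print(cached["output"])         # [0.91, 0.42]
print(tuple(cached["output"]))  # (0.91, 0.42) -- restored before reuse
```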
@@ -583,7 +592,9 @@ class Trainer(Fit):
            date_str = dates[-1].isoformat()
            folder = os.path.join(column_path, date_str)
 
-           reducer = CombinedReducer(self.embedding_cols)
+           reducer = CombinedReducer(
+               self.embedding_cols, self._correlation_chunk_size
+           )
            reducer.load(folder)
 
            model = ModelRouter(None, None)
wavetrainer.egg-info/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: wavetrainer
-Version: 0.1.14
+Version: 0.1.16
 Summary: A library for automatically finding the optimal model within feature and hyperparameter space.
 Home-page: https://github.com/8W9aG/wavetrainer
 Author: Will Sackfield