wavetrainer 0.1.13.tar.gz → 0.1.15.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (70)
  1. {wavetrainer-0.1.13/wavetrainer.egg-info → wavetrainer-0.1.15}/PKG-INFO +1 -1
  2. {wavetrainer-0.1.13 → wavetrainer-0.1.15}/setup.py +1 -1
  3. {wavetrainer-0.1.13 → wavetrainer-0.1.15}/wavetrainer/__init__.py +1 -1
  4. {wavetrainer-0.1.13 → wavetrainer-0.1.15}/wavetrainer/create.py +2 -0
  5. {wavetrainer-0.1.13 → wavetrainer-0.1.15}/wavetrainer/reducer/correlation_reducer.py +51 -36
  6. {wavetrainer-0.1.13 → wavetrainer-0.1.15/wavetrainer.egg-info}/PKG-INFO +1 -1
  7. {wavetrainer-0.1.13 → wavetrainer-0.1.15}/LICENSE +0 -0
  8. {wavetrainer-0.1.13 → wavetrainer-0.1.15}/MANIFEST.in +0 -0
  9. {wavetrainer-0.1.13 → wavetrainer-0.1.15}/README.md +0 -0
  10. {wavetrainer-0.1.13 → wavetrainer-0.1.15}/requirements.txt +0 -0
  11. {wavetrainer-0.1.13 → wavetrainer-0.1.15}/setup.cfg +0 -0
  12. {wavetrainer-0.1.13 → wavetrainer-0.1.15}/tests/__init__.py +0 -0
  13. {wavetrainer-0.1.13 → wavetrainer-0.1.15}/tests/model/__init__.py +0 -0
  14. {wavetrainer-0.1.13 → wavetrainer-0.1.15}/tests/model/catboost_kwargs_test.py +0 -0
  15. {wavetrainer-0.1.13 → wavetrainer-0.1.15}/tests/trainer_test.py +0 -0
  16. {wavetrainer-0.1.13 → wavetrainer-0.1.15}/wavetrainer/calibrator/__init__.py +0 -0
  17. {wavetrainer-0.1.13 → wavetrainer-0.1.15}/wavetrainer/calibrator/calibrator.py +0 -0
  18. {wavetrainer-0.1.13 → wavetrainer-0.1.15}/wavetrainer/calibrator/calibrator_router.py +0 -0
  19. {wavetrainer-0.1.13 → wavetrainer-0.1.15}/wavetrainer/calibrator/vennabers_calibrator.py +0 -0
  20. {wavetrainer-0.1.13 → wavetrainer-0.1.15}/wavetrainer/exceptions.py +0 -0
  21. {wavetrainer-0.1.13 → wavetrainer-0.1.15}/wavetrainer/fit.py +0 -0
  22. {wavetrainer-0.1.13 → wavetrainer-0.1.15}/wavetrainer/model/__init__.py +0 -0
  23. {wavetrainer-0.1.13 → wavetrainer-0.1.15}/wavetrainer/model/catboost/__init__.py +0 -0
  24. {wavetrainer-0.1.13 → wavetrainer-0.1.15}/wavetrainer/model/catboost/catboost_classifier_wrap.py +0 -0
  25. {wavetrainer-0.1.13 → wavetrainer-0.1.15}/wavetrainer/model/catboost/catboost_kwargs.py +0 -0
  26. {wavetrainer-0.1.13 → wavetrainer-0.1.15}/wavetrainer/model/catboost/catboost_model.py +0 -0
  27. {wavetrainer-0.1.13 → wavetrainer-0.1.15}/wavetrainer/model/catboost/catboost_regressor_wrap.py +0 -0
  28. {wavetrainer-0.1.13 → wavetrainer-0.1.15}/wavetrainer/model/lightgbm/__init__.py +0 -0
  29. {wavetrainer-0.1.13 → wavetrainer-0.1.15}/wavetrainer/model/lightgbm/lightgbm_model.py +0 -0
  30. {wavetrainer-0.1.13 → wavetrainer-0.1.15}/wavetrainer/model/model.py +0 -0
  31. {wavetrainer-0.1.13 → wavetrainer-0.1.15}/wavetrainer/model/model_router.py +0 -0
  32. {wavetrainer-0.1.13 → wavetrainer-0.1.15}/wavetrainer/model/tabpfn/__init__.py +0 -0
  33. {wavetrainer-0.1.13 → wavetrainer-0.1.15}/wavetrainer/model/tabpfn/tabpfn_model.py +0 -0
  34. {wavetrainer-0.1.13 → wavetrainer-0.1.15}/wavetrainer/model/xgboost/__init__.py +0 -0
  35. {wavetrainer-0.1.13 → wavetrainer-0.1.15}/wavetrainer/model/xgboost/early_stopper.py +0 -0
  36. {wavetrainer-0.1.13 → wavetrainer-0.1.15}/wavetrainer/model/xgboost/xgboost_logger.py +0 -0
  37. {wavetrainer-0.1.13 → wavetrainer-0.1.15}/wavetrainer/model/xgboost/xgboost_model.py +0 -0
  38. {wavetrainer-0.1.13 → wavetrainer-0.1.15}/wavetrainer/model_type.py +0 -0
  39. {wavetrainer-0.1.13 → wavetrainer-0.1.15}/wavetrainer/params.py +0 -0
  40. {wavetrainer-0.1.13 → wavetrainer-0.1.15}/wavetrainer/reducer/__init__.py +0 -0
  41. {wavetrainer-0.1.13 → wavetrainer-0.1.15}/wavetrainer/reducer/base_selector_reducer.py +0 -0
  42. {wavetrainer-0.1.13 → wavetrainer-0.1.15}/wavetrainer/reducer/combined_reducer.py +0 -0
  43. {wavetrainer-0.1.13 → wavetrainer-0.1.15}/wavetrainer/reducer/constant_reducer.py +0 -0
  44. {wavetrainer-0.1.13 → wavetrainer-0.1.15}/wavetrainer/reducer/duplicate_reducer.py +0 -0
  45. {wavetrainer-0.1.13 → wavetrainer-0.1.15}/wavetrainer/reducer/non_categorical_numeric_columns.py +0 -0
  46. {wavetrainer-0.1.13 → wavetrainer-0.1.15}/wavetrainer/reducer/nonnumeric_reducer.py +0 -0
  47. {wavetrainer-0.1.13 → wavetrainer-0.1.15}/wavetrainer/reducer/pca_reducer.py +0 -0
  48. {wavetrainer-0.1.13 → wavetrainer-0.1.15}/wavetrainer/reducer/reducer.py +0 -0
  49. {wavetrainer-0.1.13 → wavetrainer-0.1.15}/wavetrainer/reducer/select_by_single_feature_performance_reducer.py +0 -0
  50. {wavetrainer-0.1.13 → wavetrainer-0.1.15}/wavetrainer/reducer/smart_correlation_reducer.py +0 -0
  51. {wavetrainer-0.1.13 → wavetrainer-0.1.15}/wavetrainer/reducer/unseen_reducer.py +0 -0
  52. {wavetrainer-0.1.13 → wavetrainer-0.1.15}/wavetrainer/selector/__init__.py +0 -0
  53. {wavetrainer-0.1.13 → wavetrainer-0.1.15}/wavetrainer/selector/selector.py +0 -0
  54. {wavetrainer-0.1.13 → wavetrainer-0.1.15}/wavetrainer/trainer.py +0 -0
  55. {wavetrainer-0.1.13 → wavetrainer-0.1.15}/wavetrainer/weights/__init__.py +0 -0
  56. {wavetrainer-0.1.13 → wavetrainer-0.1.15}/wavetrainer/weights/class_weights.py +0 -0
  57. {wavetrainer-0.1.13 → wavetrainer-0.1.15}/wavetrainer/weights/combined_weights.py +0 -0
  58. {wavetrainer-0.1.13 → wavetrainer-0.1.15}/wavetrainer/weights/exponential_weights.py +0 -0
  59. {wavetrainer-0.1.13 → wavetrainer-0.1.15}/wavetrainer/weights/linear_weights.py +0 -0
  60. {wavetrainer-0.1.13 → wavetrainer-0.1.15}/wavetrainer/weights/noop_weights.py +0 -0
  61. {wavetrainer-0.1.13 → wavetrainer-0.1.15}/wavetrainer/weights/sigmoid_weights.py +0 -0
  62. {wavetrainer-0.1.13 → wavetrainer-0.1.15}/wavetrainer/weights/weights.py +0 -0
  63. {wavetrainer-0.1.13 → wavetrainer-0.1.15}/wavetrainer/weights/weights_router.py +0 -0
  64. {wavetrainer-0.1.13 → wavetrainer-0.1.15}/wavetrainer/windower/__init__.py +0 -0
  65. {wavetrainer-0.1.13 → wavetrainer-0.1.15}/wavetrainer/windower/windower.py +0 -0
  66. {wavetrainer-0.1.13 → wavetrainer-0.1.15}/wavetrainer.egg-info/SOURCES.txt +0 -0
  67. {wavetrainer-0.1.13 → wavetrainer-0.1.15}/wavetrainer.egg-info/dependency_links.txt +0 -0
  68. {wavetrainer-0.1.13 → wavetrainer-0.1.15}/wavetrainer.egg-info/not-zip-safe +0 -0
  69. {wavetrainer-0.1.13 → wavetrainer-0.1.15}/wavetrainer.egg-info/requires.txt +0 -0
  70. {wavetrainer-0.1.13 → wavetrainer-0.1.15}/wavetrainer.egg-info/top_level.txt +0 -0
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: wavetrainer
-Version: 0.1.13
+Version: 0.1.15
 Summary: A library for automatically finding the optimal model within feature and hyperparameter space.
 Home-page: https://github.com/8W9aG/wavetrainer
 Author: Will Sackfield
@@ -23,7 +23,7 @@ def install_requires() -> typing.List[str]:
 
 setup(
     name='wavetrainer',
-    version='0.1.13',
+    version='0.1.15',
     description='A library for automatically finding the optimal model within feature and hyperparameter space.',
     long_description=long_description,
     long_description_content_type='text/markdown',
@@ -2,5 +2,5 @@
 
 from .create import create
 
-__VERSION__ = "0.1.13"
+__VERSION__ = "0.1.15"
 __all__ = ("create",)
@@ -17,6 +17,7 @@ def create(
     cutoff_dt: datetime.datetime | None = None,
     embedding_cols: list[list[str]] | None = None,
     allowed_models: set[str] | None = None,
+    max_false_positive_reduction_steps: int | None = None,
 ) -> Trainer:
     """Create a trainer."""
     return Trainer(
@@ -29,4 +30,5 @@ def create(
         cutoff_dt=cutoff_dt,
         embedding_cols=embedding_cols,
         allowed_models=allowed_models,
+        max_false_positive_reduction_steps=max_false_positive_reduction_steps,
     )
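The new max_false_positive_reduction_steps argument is passed straight through create() into Trainer. A minimal usage sketch follows, assuming a leading folder-style positional argument and illustrative keyword values; only the tail of the signature is visible in this hunk, so treat everything except the two named keywords as an assumption.

    # Sketch only: the leading argument, the model-name strings and the value 3
    # are illustrative assumptions, not taken from this diff.
    import wavetrainer as wt

    trainer = wt.create(
        "./wavetrainer_run",                     # assumed leading argument; not shown in this hunk
        allowed_models={"catboost", "xgboost"},  # keyword shown in the signature above
        max_false_positive_reduction_steps=3,    # new in 0.1.15; exact semantics not documented here
    )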
@@ -1,6 +1,6 @@
 """A reducer that removes correlation features."""
 
-# pylint: disable=too-many-arguments,too-many-positional-arguments,consider-using-enumerate
+# pylint: disable=too-many-arguments,too-many-positional-arguments,consider-using-enumerate,too-many-locals
 import json
 import os
 from typing import Self
@@ -17,51 +17,64 @@ _CORRELATION_REDUCER_FILENAME = "correlation_reducer.json"
 _CORRELATION_REDUCER_THRESHOLD = "correlation_reducer_threshold"
 
 
-def _get_correlated_features_to_drop(
-    df: pd.DataFrame, threshold: float = 0.85, random_seed: int = 42
+def _get_correlated_features_to_drop_chunked(
+    df: pd.DataFrame,
+    threshold: float = 0.85,
+    chunk_size: int = 10000,
+    random_seed: int = 42,
 ) -> list[str]:
     """
-    Identify highly correlated features to drop, keeping one per group.
-    NaNs are replaced with a single fixed junk value to allow correlation computation.
-    Columns are processed in sorted order to ensure deterministic output.
-
-    Args:
-        df (pd.DataFrame): Input DataFrame.
-        threshold (float): Correlation threshold above which features are considered redundant.
-        random_seed (int): Seed used to generate the fixed junk value.
-
-    Returns:
-        List[str]: List of column names to drop.
+    Chunked correlation feature reducer to control memory usage.
+    Applies correlation pruning within chunks, then across surviving features.
     """
     np.random.seed(random_seed)
-
-    # Select and sort numeric columns
     sorted_cols = sorted(find_non_categorical_numeric_columns(df))
     df_numeric = df[sorted_cols].copy()
-
-    # Generate and apply a fixed junk value for NaNs
     junk_value = np.random.uniform(-1e9, 1e9)
-    df_numeric = df_numeric.fillna(junk_value)
-
-    if df_numeric.shape[1] < 2:
-        return []
-
-    # Compute absolute correlation matrix
-    corr_matrix = np.corrcoef(df_numeric.values, rowvar=False)
-    abs_corr = np.abs(corr_matrix)
-
-    # Greedy feature drop based on sorted order
-    to_drop = set()
-    for i in range(len(sorted_cols)):
-        if sorted_cols[i] in to_drop:
+    df_numeric = df_numeric.fillna(junk_value).astype(np.float32)
+
+    # First pass: intra-chunk correlation pruning
+    survivors = []
+    to_drop_total = set()
+    for i in range(0, len(sorted_cols), chunk_size):
+        chunk_cols = sorted_cols[i : i + chunk_size]
+        chunk_df = df_numeric[chunk_cols]
+        chunk_corr = np.corrcoef(chunk_df.values, rowvar=False)
+        abs_corr = np.abs(chunk_corr)
+
+        to_drop = set()
+        for j in range(len(chunk_cols)):
+            if chunk_cols[j] in to_drop:
+                continue
+            for k in range(j + 1, len(chunk_cols)):
+                if chunk_cols[k] in to_drop:
+                    continue
+                if abs_corr[j, k] > threshold:
+                    to_drop.add(chunk_cols[k])
+
+        survivors.extend([col for col in chunk_cols if col not in to_drop])
+        to_drop_total.update(to_drop)
+
+    # Second pass: global correlation among survivors
+    if len(survivors) < 2:
+        return sorted(to_drop_total)
+
+    survivors_df = df_numeric[survivors]
+    final_corr = np.corrcoef(survivors_df.values, rowvar=False)
+    abs_corr = np.abs(final_corr)
+
+    final_drop = set()
+    for i in range(len(survivors)):
+        if survivors[i] in final_drop:
             continue
-        for j in range(i + 1, len(sorted_cols)):
-            if sorted_cols[j] in to_drop:
+        for j in range(i + 1, len(survivors)):
+            if survivors[j] in final_drop:
                 continue
             if abs_corr[i, j] > threshold:
-                to_drop.add(sorted_cols[j])
+                final_drop.add(survivors[j])
 
-    return sorted(to_drop)
+    to_drop_total.update(final_drop)
+    return sorted(to_drop_total)
 
 
 class CorrelationReducer(Reducer):
@@ -102,7 +115,9 @@ class CorrelationReducer(Reducer):
         eval_x: pd.DataFrame | None = None,
         eval_y: pd.Series | pd.DataFrame | None = None,
     ) -> Self:
-        drop_features = _get_correlated_features_to_drop(df, threshold=self._threshold)
+        drop_features = _get_correlated_features_to_drop_chunked(
+            df, threshold=self._threshold
+        )
         self._correlation_drop_features = {x: True for x in drop_features}
         return self
 
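A minimal sketch of the new chunked helper on a toy frame, assuming the private function is importable from the module touched above and that all three columns pass the non-categorical numeric filter; with the defaults (threshold=0.85, chunk_size=10000) everything fits in a single chunk and the near-duplicate column "b" is the only one reported for dropping.

    # Illustration only: imports a private helper; column names and data are made up.
    import numpy as np
    import pandas as pd

    from wavetrainer.reducer.correlation_reducer import (
        _get_correlated_features_to_drop_chunked,
    )

    rng = np.random.default_rng(0)
    base = rng.normal(size=1_000)
    df = pd.DataFrame({
        "a": base,
        "b": base + rng.normal(scale=0.01, size=1_000),  # near-duplicate of "a"
        "c": rng.normal(size=1_000),                      # independent noise
    })

    # Expected to print ["b"]: "a" survives as the first of the correlated pair,
    # and the second pass finds no further correlations among the survivors.
    print(_get_correlated_features_to_drop_chunked(df, threshold=0.85))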
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: wavetrainer
-Version: 0.1.13
+Version: 0.1.15
 Summary: A library for automatically finding the optimal model within feature and hyperparameter space.
 Home-page: https://github.com/8W9aG/wavetrainer
 Author: Will Sackfield