wavetrainer 0.1.14.tar.gz → 0.1.16.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (70)
  1. {wavetrainer-0.1.14/wavetrainer.egg-info → wavetrainer-0.1.16}/PKG-INFO +1 -1
  2. {wavetrainer-0.1.14 → wavetrainer-0.1.16}/setup.py +1 -1
  3. {wavetrainer-0.1.14 → wavetrainer-0.1.16}/wavetrainer/__init__.py +1 -1
  4. {wavetrainer-0.1.14 → wavetrainer-0.1.16}/wavetrainer/create.py +2 -0
  5. {wavetrainer-0.1.14 → wavetrainer-0.1.16}/wavetrainer/reducer/combined_reducer.py +10 -3
  6. {wavetrainer-0.1.14 → wavetrainer-0.1.16}/wavetrainer/reducer/correlation_reducer.py +55 -37
  7. {wavetrainer-0.1.14 → wavetrainer-0.1.16}/wavetrainer/trainer.py +15 -4
  8. {wavetrainer-0.1.14 → wavetrainer-0.1.16/wavetrainer.egg-info}/PKG-INFO +1 -1
  9. {wavetrainer-0.1.14 → wavetrainer-0.1.16}/LICENSE +0 -0
  10. {wavetrainer-0.1.14 → wavetrainer-0.1.16}/MANIFEST.in +0 -0
  11. {wavetrainer-0.1.14 → wavetrainer-0.1.16}/README.md +0 -0
  12. {wavetrainer-0.1.14 → wavetrainer-0.1.16}/requirements.txt +0 -0
  13. {wavetrainer-0.1.14 → wavetrainer-0.1.16}/setup.cfg +0 -0
  14. {wavetrainer-0.1.14 → wavetrainer-0.1.16}/tests/__init__.py +0 -0
  15. {wavetrainer-0.1.14 → wavetrainer-0.1.16}/tests/model/__init__.py +0 -0
  16. {wavetrainer-0.1.14 → wavetrainer-0.1.16}/tests/model/catboost_kwargs_test.py +0 -0
  17. {wavetrainer-0.1.14 → wavetrainer-0.1.16}/tests/trainer_test.py +0 -0
  18. {wavetrainer-0.1.14 → wavetrainer-0.1.16}/wavetrainer/calibrator/__init__.py +0 -0
  19. {wavetrainer-0.1.14 → wavetrainer-0.1.16}/wavetrainer/calibrator/calibrator.py +0 -0
  20. {wavetrainer-0.1.14 → wavetrainer-0.1.16}/wavetrainer/calibrator/calibrator_router.py +0 -0
  21. {wavetrainer-0.1.14 → wavetrainer-0.1.16}/wavetrainer/calibrator/vennabers_calibrator.py +0 -0
  22. {wavetrainer-0.1.14 → wavetrainer-0.1.16}/wavetrainer/exceptions.py +0 -0
  23. {wavetrainer-0.1.14 → wavetrainer-0.1.16}/wavetrainer/fit.py +0 -0
  24. {wavetrainer-0.1.14 → wavetrainer-0.1.16}/wavetrainer/model/__init__.py +0 -0
  25. {wavetrainer-0.1.14 → wavetrainer-0.1.16}/wavetrainer/model/catboost/__init__.py +0 -0
  26. {wavetrainer-0.1.14 → wavetrainer-0.1.16}/wavetrainer/model/catboost/catboost_classifier_wrap.py +0 -0
  27. {wavetrainer-0.1.14 → wavetrainer-0.1.16}/wavetrainer/model/catboost/catboost_kwargs.py +0 -0
  28. {wavetrainer-0.1.14 → wavetrainer-0.1.16}/wavetrainer/model/catboost/catboost_model.py +0 -0
  29. {wavetrainer-0.1.14 → wavetrainer-0.1.16}/wavetrainer/model/catboost/catboost_regressor_wrap.py +0 -0
  30. {wavetrainer-0.1.14 → wavetrainer-0.1.16}/wavetrainer/model/lightgbm/__init__.py +0 -0
  31. {wavetrainer-0.1.14 → wavetrainer-0.1.16}/wavetrainer/model/lightgbm/lightgbm_model.py +0 -0
  32. {wavetrainer-0.1.14 → wavetrainer-0.1.16}/wavetrainer/model/model.py +0 -0
  33. {wavetrainer-0.1.14 → wavetrainer-0.1.16}/wavetrainer/model/model_router.py +0 -0
  34. {wavetrainer-0.1.14 → wavetrainer-0.1.16}/wavetrainer/model/tabpfn/__init__.py +0 -0
  35. {wavetrainer-0.1.14 → wavetrainer-0.1.16}/wavetrainer/model/tabpfn/tabpfn_model.py +0 -0
  36. {wavetrainer-0.1.14 → wavetrainer-0.1.16}/wavetrainer/model/xgboost/__init__.py +0 -0
  37. {wavetrainer-0.1.14 → wavetrainer-0.1.16}/wavetrainer/model/xgboost/early_stopper.py +0 -0
  38. {wavetrainer-0.1.14 → wavetrainer-0.1.16}/wavetrainer/model/xgboost/xgboost_logger.py +0 -0
  39. {wavetrainer-0.1.14 → wavetrainer-0.1.16}/wavetrainer/model/xgboost/xgboost_model.py +0 -0
  40. {wavetrainer-0.1.14 → wavetrainer-0.1.16}/wavetrainer/model_type.py +0 -0
  41. {wavetrainer-0.1.14 → wavetrainer-0.1.16}/wavetrainer/params.py +0 -0
  42. {wavetrainer-0.1.14 → wavetrainer-0.1.16}/wavetrainer/reducer/__init__.py +0 -0
  43. {wavetrainer-0.1.14 → wavetrainer-0.1.16}/wavetrainer/reducer/base_selector_reducer.py +0 -0
  44. {wavetrainer-0.1.14 → wavetrainer-0.1.16}/wavetrainer/reducer/constant_reducer.py +0 -0
  45. {wavetrainer-0.1.14 → wavetrainer-0.1.16}/wavetrainer/reducer/duplicate_reducer.py +0 -0
  46. {wavetrainer-0.1.14 → wavetrainer-0.1.16}/wavetrainer/reducer/non_categorical_numeric_columns.py +0 -0
  47. {wavetrainer-0.1.14 → wavetrainer-0.1.16}/wavetrainer/reducer/nonnumeric_reducer.py +0 -0
  48. {wavetrainer-0.1.14 → wavetrainer-0.1.16}/wavetrainer/reducer/pca_reducer.py +0 -0
  49. {wavetrainer-0.1.14 → wavetrainer-0.1.16}/wavetrainer/reducer/reducer.py +0 -0
  50. {wavetrainer-0.1.14 → wavetrainer-0.1.16}/wavetrainer/reducer/select_by_single_feature_performance_reducer.py +0 -0
  51. {wavetrainer-0.1.14 → wavetrainer-0.1.16}/wavetrainer/reducer/smart_correlation_reducer.py +0 -0
  52. {wavetrainer-0.1.14 → wavetrainer-0.1.16}/wavetrainer/reducer/unseen_reducer.py +0 -0
  53. {wavetrainer-0.1.14 → wavetrainer-0.1.16}/wavetrainer/selector/__init__.py +0 -0
  54. {wavetrainer-0.1.14 → wavetrainer-0.1.16}/wavetrainer/selector/selector.py +0 -0
  55. {wavetrainer-0.1.14 → wavetrainer-0.1.16}/wavetrainer/weights/__init__.py +0 -0
  56. {wavetrainer-0.1.14 → wavetrainer-0.1.16}/wavetrainer/weights/class_weights.py +0 -0
  57. {wavetrainer-0.1.14 → wavetrainer-0.1.16}/wavetrainer/weights/combined_weights.py +0 -0
  58. {wavetrainer-0.1.14 → wavetrainer-0.1.16}/wavetrainer/weights/exponential_weights.py +0 -0
  59. {wavetrainer-0.1.14 → wavetrainer-0.1.16}/wavetrainer/weights/linear_weights.py +0 -0
  60. {wavetrainer-0.1.14 → wavetrainer-0.1.16}/wavetrainer/weights/noop_weights.py +0 -0
  61. {wavetrainer-0.1.14 → wavetrainer-0.1.16}/wavetrainer/weights/sigmoid_weights.py +0 -0
  62. {wavetrainer-0.1.14 → wavetrainer-0.1.16}/wavetrainer/weights/weights.py +0 -0
  63. {wavetrainer-0.1.14 → wavetrainer-0.1.16}/wavetrainer/weights/weights_router.py +0 -0
  64. {wavetrainer-0.1.14 → wavetrainer-0.1.16}/wavetrainer/windower/__init__.py +0 -0
  65. {wavetrainer-0.1.14 → wavetrainer-0.1.16}/wavetrainer/windower/windower.py +0 -0
  66. {wavetrainer-0.1.14 → wavetrainer-0.1.16}/wavetrainer.egg-info/SOURCES.txt +0 -0
  67. {wavetrainer-0.1.14 → wavetrainer-0.1.16}/wavetrainer.egg-info/dependency_links.txt +0 -0
  68. {wavetrainer-0.1.14 → wavetrainer-0.1.16}/wavetrainer.egg-info/not-zip-safe +0 -0
  69. {wavetrainer-0.1.14 → wavetrainer-0.1.16}/wavetrainer.egg-info/requires.txt +0 -0
  70. {wavetrainer-0.1.14 → wavetrainer-0.1.16}/wavetrainer.egg-info/top_level.txt +0 -0
PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: wavetrainer
-Version: 0.1.14
+Version: 0.1.16
 Summary: A library for automatically finding the optimal model within feature and hyperparameter space.
 Home-page: https://github.com/8W9aG/wavetrainer
 Author: Will Sackfield
setup.py
@@ -23,7 +23,7 @@ def install_requires() -> typing.List[str]:
 
 setup(
     name='wavetrainer',
-    version='0.1.14',
+    version='0.1.16',
     description='A library for automatically finding the optimal model within feature and hyperparameter space.',
     long_description=long_description,
     long_description_content_type='text/markdown',
wavetrainer/__init__.py
@@ -2,5 +2,5 @@
 
 from .create import create
 
-__VERSION__ = "0.1.14"
+__VERSION__ = "0.1.15"
 __all__ = ("create",)
wavetrainer/create.py
@@ -18,6 +18,7 @@ def create(
     embedding_cols: list[list[str]] | None = None,
     allowed_models: set[str] | None = None,
     max_false_positive_reduction_steps: int | None = None,
+    correlation_chunk_size: int | None = None,
 ) -> Trainer:
     """Create a trainer."""
     return Trainer(
@@ -31,4 +32,5 @@ def create(
         embedding_cols=embedding_cols,
         allowed_models=allowed_models,
         max_false_positive_reduction_steps=max_false_positive_reduction_steps,
+        correlation_chunk_size=correlation_chunk_size,
     )
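
Callers opt in through `create`. A minimal sketch of the new keyword in use; `create`'s remaining arguments are not shown in this hunk, so they stay elided here, and the value 200 is purely illustrative:

```python
import wavetrainer as wt

# Only the new keyword is shown; pass the other create() arguments as
# before. Leaving it as None falls back to chunks of 500 columns.
trainer = wt.create(
    # ... existing arguments unchanged ...
    correlation_chunk_size=200,  # correlate at most 200 columns at a time
)
```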
wavetrainer/reducer/combined_reducer.py
@@ -31,20 +31,25 @@ class CombinedReducer(Reducer):
     # pylint: disable=too-many-positional-arguments,too-many-arguments
     _folder: str | None
 
-    def __init__(self, embedding_cols: list[list[str]] | None):
+    def __init__(
+        self, embedding_cols: list[list[str]] | None, correlation_chunk_size: int | None
+    ):
         super().__init__()
+        if correlation_chunk_size is None:
+            correlation_chunk_size = 500
         self._reducers = [
             UnseenReducer(),
             NonNumericReducer(),
             PCAReducer(embedding_cols),
             ConstantReducer(),
             DuplicateReducer(),
-            CorrelationReducer(),
+            CorrelationReducer(correlation_chunk_size=correlation_chunk_size),
             SmartCorrelationReducer(),
             # SelectBySingleFeaturePerformanceReducer(),
         ]
         self._folder = None
         self._embedding_cols = embedding_cols
+        self._correlation_chunk_size = correlation_chunk_size
 
     @classmethod
     def name(cls) -> str:
@@ -68,7 +73,9 @@ class CombinedReducer(Reducer):
             elif reducer_name == DuplicateReducer.name():
                 self._reducers.append(DuplicateReducer())
             elif reducer_name == CorrelationReducer.name():
-                self._reducers.append(CorrelationReducer())
+                self._reducers.append(
+                    CorrelationReducer(self._correlation_chunk_size)
+                )
             elif reducer_name == NonNumericReducer.name():
                 self._reducers.append(NonNumericReducer())
             elif reducer_name == UnseenReducer.name():
wavetrainer/reducer/correlation_reducer.py
@@ -1,6 +1,6 @@
 """A reducer that removes correlation features."""
 
-# pylint: disable=too-many-arguments,too-many-positional-arguments,consider-using-enumerate
+# pylint: disable=too-many-arguments,too-many-positional-arguments,consider-using-enumerate,too-many-locals
 import json
 import os
 from typing import Self
@@ -17,51 +17,64 @@ _CORRELATION_REDUCER_FILENAME = "correlation_reducer.json"
 _CORRELATION_REDUCER_THRESHOLD = "correlation_reducer_threshold"
 
 
-def _get_correlated_features_to_drop(
-    df: pd.DataFrame, threshold: float = 0.85, random_seed: int = 42
+def _get_correlated_features_to_drop_chunked(
+    df: pd.DataFrame,
+    threshold: float = 0.85,
+    chunk_size: int = 10000,
+    random_seed: int = 42,
 ) -> list[str]:
     """
-    Identify highly correlated features to drop, keeping one per group.
-    NaNs are replaced with a single fixed junk value to allow correlation computation.
-    Columns are processed in sorted order to ensure deterministic output.
-
-    Args:
-        df (pd.DataFrame): Input DataFrame.
-        threshold (float): Correlation threshold above which features are considered redundant.
-        random_seed (int): Seed used to generate the fixed junk value.
-
-    Returns:
-        List[str]: List of column names to drop.
+    Chunked correlation feature reducer to control memory usage.
+    Applies correlation pruning within chunks, then across surviving features.
     """
     np.random.seed(random_seed)
-
-    # Select and sort numeric columns
     sorted_cols = sorted(find_non_categorical_numeric_columns(df))
     df_numeric = df[sorted_cols].copy()
-
-    # Generate and apply a fixed junk value for NaNs
     junk_value = np.random.uniform(-1e9, 1e9)
-    df_numeric = df_numeric.fillna(junk_value)
-
-    if df_numeric.shape[1] < 2:
-        return []
-
-    # Compute absolute correlation matrix
-    corr_matrix = np.corrcoef(df_numeric.values, rowvar=False)
-    abs_corr = np.abs(corr_matrix)
-
-    # Greedy feature drop based on sorted order
-    to_drop = set()
-    for i in range(len(sorted_cols)):
-        if sorted_cols[i] in to_drop:
+    df_numeric = df_numeric.fillna(junk_value).astype(np.float32)
+
+    # First pass: intra-chunk correlation pruning
+    survivors = []
+    to_drop_total = set()
+    for i in range(0, len(sorted_cols), chunk_size):
+        chunk_cols = sorted_cols[i : i + chunk_size]
+        chunk_df = df_numeric[chunk_cols]
+        chunk_corr = np.corrcoef(chunk_df.values, rowvar=False)
+        abs_corr = np.abs(chunk_corr)
+
+        to_drop = set()
+        for j in range(len(chunk_cols)):
+            if chunk_cols[j] in to_drop:
+                continue
+            for k in range(j + 1, len(chunk_cols)):
+                if chunk_cols[k] in to_drop:
+                    continue
+                if abs_corr[j, k] > threshold:
+                    to_drop.add(chunk_cols[k])
+
+        survivors.extend([col for col in chunk_cols if col not in to_drop])
+        to_drop_total.update(to_drop)
+
+    # Second pass: global correlation among survivors
+    if len(survivors) < 2:
+        return sorted(to_drop_total)
+
+    survivors_df = df_numeric[survivors]
+    final_corr = np.corrcoef(survivors_df.values, rowvar=False)
+    abs_corr = np.abs(final_corr)
+
+    final_drop = set()
+    for i in range(len(survivors)):
+        if survivors[i] in final_drop:
             continue
-        for j in range(i + 1, len(sorted_cols)):
-            if sorted_cols[j] in to_drop:
+        for j in range(i + 1, len(survivors)):
+            if survivors[j] in final_drop:
                 continue
             if abs_corr[i, j] > threshold:
-                to_drop.add(sorted_cols[j])
+                final_drop.add(survivors[j])
 
-    return sorted(to_drop)
+    to_drop_total.update(final_drop)
+    return sorted(to_drop_total)
 
 
 class CorrelationReducer(Reducer):
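
The rewritten reducer is easier to follow outside the diff. Below is a small, self-contained sketch of the same two-pass idea on toy data with a chunk size of 2; `select_dtypes` stands in for the package's `find_non_categorical_numeric_columns` helper, and the function name here is illustrative, not the library's API:

```python
import numpy as np
import pandas as pd

def chunked_corr_drop(df: pd.DataFrame, threshold: float = 0.85,
                      chunk_size: int = 2, seed: int = 42) -> list[str]:
    """Two-pass chunked pruning: within chunks, then across survivors."""
    rng = np.random.default_rng(seed)
    # select_dtypes stands in for find_non_categorical_numeric_columns.
    cols = sorted(df.select_dtypes(include=np.number).columns)
    data = df[cols].fillna(rng.uniform(-1e9, 1e9)).astype(np.float32)

    def prune(names: list[str]) -> set[str]:
        # Greedy drop: keep the first column of each correlated group.
        corr = np.abs(np.corrcoef(data[names].values, rowvar=False))
        drop: set[str] = set()
        for i in range(len(names)):
            if names[i] in drop:
                continue
            for j in range(i + 1, len(names)):
                if names[j] not in drop and corr[i, j] > threshold:
                    drop.add(names[j])
        return drop

    dropped: set[str] = set()
    survivors: list[str] = []
    for start in range(0, len(cols), chunk_size):  # pass 1: inside chunks
        chunk = cols[start : start + chunk_size]
        fell = prune(chunk) if len(chunk) > 1 else set()
        dropped |= fell
        survivors += [c for c in chunk if c not in fell]
    if len(survivors) >= 2:  # pass 2: across chunk survivors
        dropped |= prune(survivors)
    return sorted(dropped)

toy = pd.DataFrame({"a": [1, 2, 3, 4], "b": [2, 4, 6, 8], "c": [4, 1, 3, 2]})
print(chunked_corr_drop(toy))  # ['b'] -- b is 2*a, so it falls in pass 1
```

The payoff is that pass 1 never materializes more than a chunk_size × chunk_size correlation matrix; only the survivors meet in a single global pass at the end.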
@@ -69,9 +82,10 @@ class CorrelationReducer(Reducer):
 
     _correlation_drop_features: dict[str, bool]
 
-    def __init__(self) -> None:
+    def __init__(self, correlation_chunk_size: int) -> None:
         self._threshold = 0.0
         self._correlation_drop_features = {}
+        self._correlation_chunk_size = correlation_chunk_size
 
     @classmethod
     def name(cls) -> str:
@@ -102,7 +116,11 @@
         eval_x: pd.DataFrame | None = None,
         eval_y: pd.Series | pd.DataFrame | None = None,
     ) -> Self:
-        drop_features = _get_correlated_features_to_drop(df, threshold=self._threshold)
+        drop_features = _get_correlated_features_to_drop_chunked(
+            df.copy(),
+            threshold=self._threshold,
+            chunk_size=self._correlation_chunk_size,
+        )
         self._correlation_drop_features = {x: True for x in drop_features}
         return self
 
wavetrainer/trainer.py
@@ -42,6 +42,7 @@ _VALIDATION_SIZE_KEY = "validation_size"
 _IDX_USR_ATTR_KEY = "idx"
 _DT_COLUMN_KEY = "dt_column"
 _MAX_FALSE_POSITIVE_REDUCTION_STEPS_KEY = "max_false_positive_reduction_steps"
+_CORRELATION_CHUNK_SIZE_KEY = "correlation_chunk_size"
 _BAD_OUTPUT = -1000.0
 
 
@@ -75,6 +76,7 @@ class Trainer(Fit):
         embedding_cols: list[list[str]] | None = None,
         allowed_models: set[str] | None = None,
         max_false_positive_reduction_steps: int | None = None,
+        correlation_chunk_size: int | None = None,
     ):
         tqdm.tqdm.pandas()
 
@@ -129,6 +131,8 @@ class Trainer(Fit):
                 max_false_positive_reduction_steps = params.get(
                     _MAX_FALSE_POSITIVE_REDUCTION_STEPS_KEY
                 )
+                if correlation_chunk_size is None:
+                    correlation_chunk_size = params.get(_CORRELATION_CHUNK_SIZE_KEY)
         else:
             with open(params_file, "w", encoding="utf8") as handle:
                 validation_size_value = None
@@ -160,6 +164,7 @@ class Trainer(Fit):
                         _VALIDATION_SIZE_KEY: validation_size_value,
                         _DT_COLUMN_KEY: dt_column,
                         _MAX_FALSE_POSITIVE_REDUCTION_STEPS_KEY: max_false_positive_reduction_steps,
+                        _CORRELATION_CHUNK_SIZE_KEY: correlation_chunk_size,
                     },
                     handle,
                 )
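
With the key persisted, a trainer reopened from its folder can recover the setting whenever the constructor argument is None (per the `params.get(_CORRELATION_CHUNK_SIZE_KEY)` fallback above). A sketch of what the params file plausibly looks like after this change; only the key names come from the constants in the diff, the values are invented:

```python
import json

# Illustrative params file contents; other persisted settings elided.
params = {
    "validation_size": 0.2,
    "dt_column": "date",
    "max_false_positive_reduction_steps": None,
    "correlation_chunk_size": 200,
}
print(json.dumps(params, indent=2))
```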
@@ -173,6 +178,7 @@ class Trainer(Fit):
         self.embedding_cols = embedding_cols
         self._allowed_models = allowed_models
         self._max_false_positive_reduction_steps = max_false_positive_reduction_steps
+        self._correlation_chunk_size = correlation_chunk_size
 
     def _provide_study(self, column: str) -> optuna.Study:
         storage_name = f"sqlite:///{self._folder}/{column}/{_STUDYDB_FILENAME}"
@@ -246,7 +252,8 @@ class Trainer(Fit):
                        "Found trial %d previously executed, skipping...",
                        trial.number,
                    )
-                   return trial_info["output"]
+                   return tuple(trial_info["output"])
+               print("Retraining for different trial number.")
 
            train_dt_index = dt_index[: len(x)]
            x_train = x[train_dt_index < split_idx]  # type: ignore
@@ -270,7 +277,9 @@
 
            # Perform common reductions
            start_reducer = time.time()
-           reducer = CombinedReducer(self.embedding_cols)
+           reducer = CombinedReducer(
+               self.embedding_cols, self._correlation_chunk_size
+           )
            reducer.set_options(trial, x)
            x_train = reducer.fit_transform(x_train, y=y_train)
            x_test = reducer.transform(x_test)
@@ -367,7 +376,7 @@ class Trainer(Fit):
            json.dump(
                {
                    "number": trial.number,
-                   "output": output,
+                   "output": [output, loss],
                },
                handle,
            )
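
This pairs with the `tuple(trial_info["output"])` change earlier in the file: `json.dump` writes the `[output, loss]` pair out as a JSON array, and reloading it yields a Python list, so the cached value has to be re-tupled before it is returned in place of rerunning the trial. A quick illustration:

```python
import json

# JSON has no tuple type: a tuple survives the round trip as a list.
cached = json.loads(json.dumps({"number": 7, "output": (0.91, 0.42)}))
print(cached["output"])         # [0.91, 0.42]
print(tuple(cached["output"]))  # (0.91, 0.42) -- restored before reuse
```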
@@ -583,7 +592,9 @@ class Trainer(Fit):
            date_str = dates[-1].isoformat()
            folder = os.path.join(column_path, date_str)
 
-           reducer = CombinedReducer(self.embedding_cols)
+           reducer = CombinedReducer(
+               self.embedding_cols, self._correlation_chunk_size
+           )
            reducer.load(folder)
 
            model = ModelRouter(None, None)
wavetrainer.egg-info/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: wavetrainer
-Version: 0.1.14
+Version: 0.1.16
 Summary: A library for automatically finding the optimal model within feature and hyperparameter space.
 Home-page: https://github.com/8W9aG/wavetrainer
 Author: Will Sackfield