PBstats 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (40) hide show
  1. pbstats-0.1.0/LICENSE +21 -0
  2. pbstats-0.1.0/PBstats/__init__.py +5 -0
  3. pbstats-0.1.0/PBstats/core/__init__.py +0 -0
  4. pbstats-0.1.0/PBstats/core/base.py +71 -0
  5. pbstats-0.1.0/PBstats/core/data.py +68 -0
  6. pbstats-0.1.0/PBstats/core/pipeline.py +303 -0
  7. pbstats-0.1.0/PBstats/preprocessing/__init__.py +10 -0
  8. pbstats-0.1.0/PBstats/preprocessing/cleaning.py +68 -0
  9. pbstats-0.1.0/PBstats/preprocessing/imputation.py +60 -0
  10. pbstats-0.1.0/PBstats/preprocessing/normalization.py +77 -0
  11. pbstats-0.1.0/PBstats/preprocessing/outliers.py +84 -0
  12. pbstats-0.1.0/PBstats/statistics/__init__.py +9 -0
  13. pbstats-0.1.0/PBstats/statistics/base.py +95 -0
  14. pbstats-0.1.0/PBstats/statistics/correlation.py +90 -0
  15. pbstats-0.1.0/PBstats/statistics/descriptive.py +71 -0
  16. pbstats-0.1.0/PBstats/statistics/hypothesis.py +167 -0
  17. pbstats-0.1.0/PBstats/statistics/regression.py +72 -0
  18. pbstats-0.1.0/PBstats/transforms/__init__.py +10 -0
  19. pbstats-0.1.0/PBstats/transforms/base.py +71 -0
  20. pbstats-0.1.0/PBstats/transforms/fft.py +69 -0
  21. pbstats-0.1.0/PBstats/transforms/hilbert.py +44 -0
  22. pbstats-0.1.0/PBstats/transforms/stft.py +53 -0
  23. pbstats-0.1.0/PBstats/transforms/wavelet.py +64 -0
  24. pbstats-0.1.0/PBstats/visualization/__init__.py +13 -0
  25. pbstats-0.1.0/PBstats/visualization/base.py +50 -0
  26. pbstats-0.1.0/PBstats/visualization/plots.py +409 -0
  27. pbstats-0.1.0/PBstats/visualization/spectrum.py +225 -0
  28. pbstats-0.1.0/PBstats.egg-info/PKG-INFO +83 -0
  29. pbstats-0.1.0/PBstats.egg-info/SOURCES.txt +38 -0
  30. pbstats-0.1.0/PBstats.egg-info/dependency_links.txt +1 -0
  31. pbstats-0.1.0/PBstats.egg-info/requires.txt +13 -0
  32. pbstats-0.1.0/PBstats.egg-info/top_level.txt +5 -0
  33. pbstats-0.1.0/PKG-INFO +83 -0
  34. pbstats-0.1.0/README.md +61 -0
  35. pbstats-0.1.0/examples/demo.py +0 -0
  36. pbstats-0.1.0/pyproject.toml +26 -0
  37. pbstats-0.1.0/setup.cfg +4 -0
  38. pbstats-0.1.0/tests/test_data.py +72 -0
  39. pbstats-0.1.0/tests/test_pipeline.py +86 -0
  40. pbstats-0.1.0/tests/test_transform.py +63 -0
pbstats-0.1.0/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Pallab Biswas
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,5 @@
1
+ from PBstats.core.data import Data
2
+ from PBstats.core.pipeline import Pipeline, FunctionalPipeline
3
+
4
+ __version__ = "0.1.0"
5
+ __all__ = ["Data", "Pipeline", "FunctionalPipeline"]
File without changes
@@ -0,0 +1,71 @@
1
+ from __future__ import annotations
2
+ from dataclasses import dataclass, field
3
+ import numpy as np
4
+
5
@dataclass
class FFTResult:
    """Spectrum produced by an FFT, with a few convenience queries.

    The four arrays are indexed in parallel: element ``i`` of ``magnitude``,
    ``phase`` and ``power`` describes the frequency ``freqs[i]``.
    """
    freqs: np.ndarray      # frequency axis in Hz
    magnitude: np.ndarray  # magnitude spectrum
    phase: np.ndarray      # phase spectrum in radians
    power: np.ndarray      # power spectrum (magnitude²)
    fs: float              # sampling frequency used
    label: str = ""

    def peak_frequency(self) -> float:
        """Return the frequency with highest magnitude.

        Raises
        ------
        ValueError
            If the spectrum is empty.
        """
        if np.size(self.magnitude) == 0:
            raise ValueError("Cannot locate a peak in an empty spectrum")
        return float(self.freqs[np.argmax(self.magnitude)])

    def band_power(self, low: float, high: float) -> float:
        """Total power within a frequency band [low, high] Hz.

        Both edges are inclusive. A band that contains no frequency bin
        (including ``low > high``) yields 0.0.
        """
        mask = (self.freqs >= low) & (self.freqs <= high)
        return float(self.power[mask].sum())

    def dominant_bands(self, n: int = 3) -> list[tuple[float, float]]:
        """Return the ``n`` strongest spectral components.

        Parameters
        ----------
        n : maximum number of components to return. Values <= 0 give an
            empty list; values larger than the spectrum return every bin.

        Returns
        -------
        List of ``(frequency, magnitude)`` pairs ordered from strongest to
        weakest. Ranking uses ``magnitude``.
        """
        # A negative n would otherwise be interpreted as a "drop the last
        # |n| entries" slice and return nearly the whole spectrum.
        if n <= 0:
            return []
        strongest_first = np.argsort(self.magnitude)[::-1][:n]
        return [
            (float(self.freqs[i]), float(self.magnitude[i]))
            for i in strongest_first
        ]
27
+
28
+
29
@dataclass
class HilbertResult:
    """Output of a Hilbert transform together with simple envelope summaries."""
    # Complex analytic signal.
    analytic: np.ndarray
    # Amplitude envelope (instantaneous amplitude).
    envelope: np.ndarray
    # Instantaneous phase, in radians.
    phase: np.ndarray
    # Instantaneous frequency, in Hz.
    frequency: np.ndarray
    fs: float
    label: str = ""

    def mean_envelope(self) -> float:
        """Return the arithmetic mean of the envelope as a Python float."""
        average = np.mean(self.envelope)
        return float(average)

    def peak_envelope(self) -> float:
        """Return the largest envelope value as a Python float."""
        largest = np.max(self.envelope)
        return float(largest)
43
+
44
+
45
@dataclass
class WaveletResult:
    """Coefficients of a wavelet transform plus the axes that index them."""
    # 2D array laid out as (scales, time).
    coefficients: np.ndarray
    # Frequency axis.
    freqs: np.ndarray
    # Time axis.
    times: np.ndarray
    scales: np.ndarray
    wavelet: str
    label: str = ""

    def scalogram(self) -> np.ndarray:
        """Return the power scalogram, i.e. the squared modulus of each coefficient."""
        modulus = np.abs(self.coefficients)
        return modulus ** 2
57
+
58
+
59
@dataclass
class STFTResult:
    """Short-time Fourier transform output with its frequency and time axes."""
    # Frequency axis.
    freqs: np.ndarray
    # Time axis.
    times: np.ndarray
    # Complex STFT matrix.
    Zxx: np.ndarray
    # |Zxx|
    magnitude: np.ndarray
    fs: float
    label: str = ""

    def spectrogram(self) -> np.ndarray:
        """Return the power spectrogram in dB, computed from ``Zxx``.

        A small constant is added to the power before the logarithm so
        that bins with zero energy map to a finite value (-120 dB).
        """
        floor = 1e-12
        energy = np.abs(self.Zxx) ** 2
        return 10 * np.log10(energy + floor)
@@ -0,0 +1,68 @@
1
+ from __future__ import annotations
2
+ import numpy as np
3
+ import pandas as pd
4
+ from typing import Union, Optional
5
+
6
+ from PBstats.preprocessing.cleaning import CleaningMixin
7
+ from PBstats.preprocessing.normalization import NormalizationMixin
8
+ from PBstats.preprocessing.outliers import OutlierMixin
9
+ from PBstats.preprocessing.imputation import ImputationMixin
10
+ from PBstats.transforms.fft import FFTMixin
11
+ from PBstats.transforms.hilbert import HilbertMixin
12
+ from PBstats.transforms.wavelet import WaveletMixin
13
+ from PBstats.transforms.stft import STFTMixin
14
+ from PBstats.statistics.descriptive import DescriptiveMixin
15
+ from PBstats.statistics.hypothesis import HypothesisMixin
16
+ from PBstats.statistics.regression import RegressionMixin
17
+ from PBstats.statistics.correlation import CorrelationMixin
18
+ from PBstats.visualization.spectrum import VisualizationMixin
19
+
20
# Input types accepted by Data; all are converted to a NumPy array on construction.
ArrayLike = Union[np.ndarray, pd.Series, pd.DataFrame, list]


class Data(
    CleaningMixin, NormalizationMixin, OutlierMixin, ImputationMixin,
    FFTMixin, HilbertMixin, WaveletMixin, STFTMixin,
    DescriptiveMixin, HypothesisMixin, RegressionMixin, CorrelationMixin,
    VisualizationMixin,
):
    """Array container that the preprocessing/transform/statistics mixins operate on.

    The object keeps two arrays: ``_raw`` (a private snapshot of the input,
    used by ``reset()``) and ``data`` (the working copy that processing
    steps replace). Steps report themselves through ``_record()`` so that
    ``history()`` lists what has been applied.

    Parameters
    ----------
    data  : array-like input (NumPy array, pandas Series/DataFrame, or list)
    fs    : sampling frequency, or None when not applicable
    label : free-form name; used as the Series name in ``to_series()``
    """

    def __init__(
        self,
        data: ArrayLike,
        fs: Optional[float] = None,
        label: str = "",
    ):
        self._raw = self._coerce(data)
        self.data = self._raw.copy()
        self.fs = fs
        self.label = label
        self._log: list[str] = []

    def _coerce(self, data: ArrayLike) -> np.ndarray:
        """Convert the input to a NumPy array that this object owns.

        pandas objects keep their native dtype; anything else is converted
        to float. The result is always a fresh copy: without it, ``_raw``
        could share memory with the caller's array, and a later in-place
        edit by the caller would silently change what ``reset()`` restores.
        """
        if isinstance(data, (pd.DataFrame, pd.Series)):
            return data.to_numpy(copy=True)
        return np.array(data, dtype=float)

    def _record(self, step: str) -> None:
        """Append a description of an applied step to the history log."""
        self._log.append(step)

    def reset(self) -> "Data":
        """Restore the working array from the original input and clear the history."""
        self.data = self._raw.copy()
        self._log.clear()
        return self

    def history(self) -> list[str]:
        """Return a copy of the recorded step descriptions, oldest first."""
        return self._log.copy()

    def to_numpy(self) -> np.ndarray:
        """Return a copy of the working array."""
        return self.data.copy()

    def to_series(self) -> pd.Series:
        """Return the working array flattened into a pandas Series named after ``label``."""
        return pd.Series(self.data.flatten(), name=self.label)

    def __repr__(self) -> str:
        return (
            f"Data(shape={self.data.shape}, fs={self.fs}, "
            f"steps={len(self._log)})"
        )
@@ -0,0 +1,303 @@
1
+ from __future__ import annotations
2
+ import json
3
+ import time
4
+ import numpy as np
5
+ from typing import Callable, Any
6
+ from dataclasses import dataclass, field
7
+
8
+
9
@dataclass
class PipelineStep:
    """A single pipeline stage: which method to call and with which keyword arguments."""
    # Name of the method to look up on the data object, e.g. "remove_dc".
    name: str
    # Keyword arguments forwarded to that method; each step gets its own dict.
    kwargs: dict = field(default_factory=dict)
    # Duration of the most recent execution in milliseconds (set by the pipeline).
    duration_ms: float = 0.0
    # Intended to be filled from Data._log; no code in this module writes it.
    records: str = ""
16
+
17
+
18
class Pipeline:
    """
    Define a preprocessing + transform workflow once.
    Apply it to any number of Data objects reproducibly.

    Steps are stored as a method name plus keyword arguments and are
    resolved with getattr() on the object handed to run().

    Usage
    -----
    pipe = (
        Pipeline("ecg_clean")
        .add("remove_missing", strategy="interpolate")
        .add("remove_dc")
        .add("remove_outliers", method="mad", threshold=3.0)
        .add("standardize")
        .add("hilbert")
    )

    result = pipe.run(Data(signal, fs=1000))
    result2 = pipe.run(Data(new_signal, fs=1000))  # same steps, new data

    pipe.save("ecg_pipeline.json")
    pipe2 = Pipeline.load("ecg_pipeline.json")
    """

    # Values accepted by run_batch(on_error=...).
    _ON_ERROR_MODES = ("raise", "skip", "warn")

    def __init__(self, name: str = "pipeline"):
        self.name = name
        self.steps: list[PipelineStep] = []
        self._run_log: list[dict] = []      # history of all .run() calls
        self.batch_errors: list[dict] = []  # failures from the latest run_batch()

    # ── building ────────────────────────────────────────────────────────────

    def add(self, name: str, **kwargs) -> "Pipeline":
        """
        Add a step by method name.
        The name must match a method on the Data class exactly.

        Example
        -------
        pipe.add("remove_outliers", method="mad", threshold=3.5)
        """
        self.steps.append(PipelineStep(name=name, kwargs=kwargs))
        return self

    def __len__(self) -> int:
        return len(self.steps)

    def __repr__(self) -> str:
        step_names = " → ".join(s.name for s in self.steps)
        return f"Pipeline('{self.name}', steps=[{step_names}])"

    # ── execution ───────────────────────────────────────────────────────────

    def run(self, data_obj, verbose: bool = False) -> Any:
        """
        Execute all steps on a Data object in order.

        Parameters
        ----------
        data_obj : a PBstats.Data instance
        verbose  : if True, print each step as it runs

        Returns
        -------
        The same Data object after all transforms applied.

        Raises
        ------
        AttributeError
            If any step does not name a callable attribute of ``data_obj``.
            The check happens before the first step runs, so the object is
            left untouched in that case.
        """
        self._validate_steps(data_obj)

        run_record = {
            "pipeline": self.name,
            "label": data_obj.label,
            "steps": [],
            "total_ms": 0.0,
        }

        for step in self.steps:
            method = getattr(data_obj, step.name)

            t_start = time.perf_counter()
            method(**step.kwargs)
            step.duration_ms = (time.perf_counter() - t_start) * 1000

            if verbose:
                print(f" [{step.duration_ms:6.2f} ms] {step.name}({step.kwargs})")

            run_record["steps"].append({
                "step": step.name,
                # Snapshot the kwargs so later edits to the step do not
                # rewrite the history of runs that already happened.
                "kwargs": dict(step.kwargs),
                "duration_ms": round(step.duration_ms, 3),
            })

        run_record["total_ms"] = sum(
            s["duration_ms"] for s in run_record["steps"]
        )
        self._run_log.append(run_record)

        return data_obj

    def _validate_steps(self, data_obj) -> None:
        """Check all step names resolve to callables on the object before running."""
        # A plain attribute (e.g. "data" or "fs") passes hasattr() but would
        # blow up with a TypeError halfway through the run, so require callables.
        missing = [
            s.name for s in self.steps
            if not callable(getattr(data_obj, s.name, None))
        ]
        if missing:
            raise AttributeError(
                f"Pipeline '{self.name}' has steps not found on Data: {missing}\n"
                f"Check spelling — available methods: "
                f"{[m for m in dir(data_obj) if not m.startswith('_')]}"
            )

    # ── batch processing ─────────────────────────────────────────────────────

    def run_batch(
        self,
        data_list: list,
        verbose: bool = False,
        on_error: str = "raise",
    ) -> list:
        """
        Run the pipeline on a list of Data objects.

        Parameters
        ----------
        data_list : list of Data instances
        verbose   : print progress
        on_error  : 'raise' — stop on first error (default)
                    'skip'  — record error, continue with next sample
                    'warn'  — record error, print warning, continue

        Returns
        -------
        List of processed Data objects (failed ones excluded if on_error != 'raise').
        Recorded failures are available afterwards as ``self.batch_errors``,
        a list of ``{"label": ..., "error": ...}`` dicts.

        Raises
        ------
        ValueError
            If ``on_error`` is not one of the three documented modes.
        """
        if on_error not in self._ON_ERROR_MODES:
            raise ValueError(
                f"Unknown on_error '{on_error}'. "
                f"Choose: {list(self._ON_ERROR_MODES)}"
            )

        results = []
        errors = []

        for i, d in enumerate(data_list):
            # An unset label is the empty string, not a missing attribute,
            # so fall back to the positional name for falsy labels as well.
            label = getattr(d, "label", "") or f"sample_{i}"
            try:
                if verbose:
                    print(f"\nProcessing [{i+1}/{len(data_list)}]: {label}")
                results.append(self.run(d, verbose=verbose))

            except Exception as e:
                # Deliberately broad: this is the batch boundary, where one
                # bad sample must not prevent processing the remaining ones.
                if on_error == "raise":
                    raise

                msg = f"Error on '{label}': {type(e).__name__}: {e}"
                errors.append({"label": label, "error": msg})

                if on_error == "warn":
                    print(f" WARNING — {msg}")

        self.batch_errors = errors

        if errors:
            print(f"\nBatch complete: {len(results)} succeeded, "
                  f"{len(errors)} failed.")

        return results

    # ── timing report ────────────────────────────────────────────────────────

    def timing_report(self) -> dict:
        """
        Return per-step average timing across all .run() calls.
        Useful for finding bottlenecks in long pipelines.

        Returns
        -------
        ``{step_name: {"mean_ms": float, "max_ms": float, "n_runs": int}}``,
        or an empty dict when nothing has been run yet.
        """
        if not self._run_log:
            return {}

        step_times: dict[str, list[float]] = {}
        for run in self._run_log:
            for s in run["steps"]:
                step_times.setdefault(s["step"], []).append(s["duration_ms"])

        return {
            step: {
                # Cast to built-in floats so the report holds plain Python numbers.
                "mean_ms": round(float(np.mean(times)), 3),
                "max_ms": round(float(np.max(times)), 3),
                "n_runs": len(times),
            }
            for step, times in step_times.items()
        }

    # ── persistence ──────────────────────────────────────────────────────────

    def save(self, path: str) -> None:
        """
        Save pipeline definition to JSON.
        This saves the step names and kwargs — not the data.
        Load it later with Pipeline.load(path).

        The kwargs must be JSON-serialisable; json.dump raises TypeError otherwise.
        """
        payload = {
            "name": self.name,
            "steps": [
                {"name": s.name, "kwargs": s.kwargs}
                for s in self.steps
            ],
        }
        # Explicit encoding keeps the file readable across platforms.
        with open(path, "w", encoding="utf-8") as f:
            json.dump(payload, f, indent=2)
        print(f"Pipeline saved → {path}")

    @classmethod
    def load(cls, path: str) -> "Pipeline":
        """Load a pipeline from a saved JSON file."""
        with open(path, encoding="utf-8") as f:
            payload = json.load(f)

        pipe = cls(name=payload["name"])
        for step in payload["steps"]:
            # Tolerate hand-written files that omit kwargs for argument-less steps.
            pipe.add(step["name"], **step.get("kwargs", {}))

        print(f"Pipeline loaded: {pipe}")
        return pipe

    # ── introspection ────────────────────────────────────────────────────────

    def summary(self) -> None:
        """Print a readable summary of all pipeline steps."""
        print(f"\nPipeline: '{self.name}' ({len(self.steps)} steps)")
        print("─" * 48)
        for i, step in enumerate(self.steps, 1):
            kwargs_str = ", ".join(f"{k}={v!r}" for k, v in step.kwargs.items())
            print(f" {i:2d}. {step.name}({kwargs_str})")
        print("─" * 48)
245
class FunctionalPipeline(Pipeline):
    """
    Extended pipeline that supports arbitrary callables as steps.

    Useful when you need a custom transform that isn't a Data method yet.

    Callable steps live only in memory: save() writes step names and
    kwargs, so a reloaded pipeline has no function attached to those names.

    Example
    -------
    def my_bandpass(data_obj):
        from scipy.signal import butter, filtfilt
        b, a = butter(4, [40, 60], btype='band', fs=data_obj.fs)
        data_obj.data = filtfilt(b, a, data_obj.data)
        data_obj._record("custom_bandpass(40-60Hz)")
        return data_obj

    pipe = (
        FunctionalPipeline("custom")
        .add("remove_dc")
        .add_fn(my_bandpass, name="bandpass_40_60")
        .add("standardize")
    )
    """

    def __init__(self, name: str = "pipeline"):
        super().__init__(name)
        self._fn_registry: dict[str, Callable] = {}

    def add_fn(self, fn: Callable, name: str = None) -> "FunctionalPipeline":
        """
        Add an arbitrary callable as a pipeline step.

        Parameters
        ----------
        fn   : callable invoked as ``fn(data_obj)``; its return value is ignored
        name : step name; defaults to the callable's ``__name__`` (or its
               type name for callables without one, e.g. functools.partial)

        Raises
        ------
        TypeError
            If ``fn`` is not callable.
        """
        if not callable(fn):
            raise TypeError(f"add_fn expects a callable, got {type(fn).__name__}")
        step_name = name or getattr(fn, "__name__", type(fn).__name__)
        self._fn_registry[step_name] = fn
        self.steps.append(PipelineStep(name=step_name, kwargs={}))
        return self

    def run(self, data_obj, verbose: bool = False):
        """
        Execute all steps in order, dispatching registered callables by name
        and everything else as a method call on ``data_obj``.

        Each run is appended to the run log so that timing_report() also
        covers functional pipelines.

        Returns
        -------
        The same object that was passed in.
        """
        self._validate_steps_fn(data_obj)

        run_record = {
            "pipeline": self.name,
            # Objects processed only by callables need not carry a label.
            "label": getattr(data_obj, "label", ""),
            "steps": [],
            "total_ms": 0.0,
        }

        for step in self.steps:
            t_start = time.perf_counter()

            # A registered callable takes precedence over a same-named method.
            if step.name in self._fn_registry:
                self._fn_registry[step.name](data_obj)
            else:
                getattr(data_obj, step.name)(**step.kwargs)

            step.duration_ms = (time.perf_counter() - t_start) * 1000
            if verbose:
                print(f" [{step.duration_ms:6.2f} ms] {step.name}")

            run_record["steps"].append({
                "step": step.name,
                "kwargs": dict(step.kwargs),
                "duration_ms": round(step.duration_ms, 3),
            })

        run_record["total_ms"] = sum(
            s["duration_ms"] for s in run_record["steps"]
        )
        self._run_log.append(run_record)

        return data_obj

    def _validate_steps_fn(self, data_obj) -> None:
        """Ensure every step is a registered callable or a callable attribute of the object."""
        missing = [
            s.name for s in self.steps
            if s.name not in self._fn_registry
            and not callable(getattr(data_obj, s.name, None))
        ]
        if missing:
            raise AttributeError(f"Steps not found: {missing}")
@@ -0,0 +1,10 @@
1
+ from PBstats.preprocessing.cleaning import CleaningMixin
2
+ from PBstats.preprocessing.normalization import NormalizationMixin
3
+ from PBstats.preprocessing.outliers import OutlierMixin
4
+ from PBstats.preprocessing.imputation import ImputationMixin
5
+ __all__ = [
6
+ "CleaningMixin",
7
+ "NormalizationMixin",
8
+ "OutlierMixin",
9
+ "ImputationMixin",
10
+ ]
@@ -0,0 +1,68 @@
1
+ from __future__ import annotations
2
+ import numpy as np
3
+
4
class CleaningMixin:
    """
    Methods: remove_missing, remove_dc, clip
    Each mutates self.data, records the step, returns self.

    The host class must provide ``self.data`` (a NumPy array) and
    ``self._record(str)``.
    """

    def remove_missing(self, strategy: str = "interpolate") -> "CleaningMixin":
        """
        Replace NaN values in the signal.

        Parameters
        ----------
        strategy : 'interpolate' — linear interpolation between neighbours
                                   (1-D signals only; NaNs at either end take
                                   the nearest valid value)
                   'zero'        — replace with 0
                   'mean'        — replace with signal mean
                   'drop'        — remove NaN positions entirely

        Raises
        ------
        ValueError
            For an unknown strategy, or when 'interpolate' is requested on
            data that has NaNs but is all-NaN or not one-dimensional.
        """
        x = self.data.astype(float)

        if strategy == "interpolate":
            nan_mask = np.isnan(x)
            if nan_mask.any():
                # Fail with a clear message instead of an obscure numpy
                # error from indexing or from an empty set of sample points.
                if x.ndim != 1:
                    raise ValueError(
                        "strategy 'interpolate' requires a 1-D signal, "
                        f"got {x.ndim} dimensions"
                    )
                if nan_mask.all():
                    raise ValueError(
                        "Cannot interpolate: signal has no valid (non-NaN) samples"
                    )
                indices = np.arange(len(x))
                x[nan_mask] = np.interp(
                    indices[nan_mask],
                    indices[~nan_mask],
                    x[~nan_mask],
                )

        elif strategy == "zero":
            x = np.nan_to_num(x, nan=0.0)

        elif strategy == "mean":
            mean_val = np.nanmean(x)
            x = np.where(np.isnan(x), mean_val, x)

        elif strategy == "drop":
            x = x[~np.isnan(x)]

        else:
            raise ValueError(f"Unknown strategy '{strategy}'. "
                             f"Choose: interpolate, zero, mean, drop")

        self.data = x
        self._record(f"remove_missing(strategy={strategy})")
        return self

    def remove_dc(self) -> "CleaningMixin":
        """
        Subtract the mean (remove DC offset).
        Essential for spectral analysis — DC shows up as a massive
        spike at 0 Hz in FFT if not removed.

        The mean is taken over the whole array, so any NaN present makes
        the entire result NaN; call remove_missing() first when needed.
        """
        self.data = self.data - np.mean(self.data)
        self._record("remove_dc()")
        return self

    def clip(self, low: float, high: float) -> "CleaningMixin":
        """
        Hard-clip values to [low, high].
        Useful for removing sensor saturation artifacts.

        Raises
        ------
        ValueError
            If both bounds are given and ``low > high``.
        """
        # Inverted bounds would silently flatten the signal to one value.
        if low is not None and high is not None and low > high:
            raise ValueError(
                f"clip bounds are inverted: low={low} is greater than high={high}"
            )
        self.data = np.clip(self.data, low, high)
        self._record(f"clip(low={low}, high={high})")
        return self
@@ -0,0 +1,60 @@
1
+ from __future__ import annotations
2
+ import numpy as np
3
+
4
class ImputationMixin:
    """
    Methods: impute
    More sophisticated missing value handling for 2D (tabular) data.
    For 1D signals, remove_missing() in CleaningMixin is sufficient.

    The host class must provide ``self.data``, ``self._record(str)`` and,
    for 1-D input, ``self.remove_missing()``.
    """

    _IMPUTE_METHODS = ("mean", "median", "ffill", "bfill")

    def impute(self, method: str = "mean") -> "ImputationMixin":
        """
        Fill NaNs in a 2D array column-by-column.

        Parameters
        ----------
        method : 'mean'   — fill with column mean
                 'median' — fill with column median
                 'ffill'  — forward-fill (carry last valid value forward);
                            leading NaNs stay NaN
                 'bfill'  — backward-fill; trailing NaNs stay NaN

        1-D input is delegated to remove_missing(): 'ffill'/'bfill' become
        linear interpolation and other names are passed through as the
        strategy. 'median' is handled here because remove_missing() has no
        such strategy.

        Raises
        ------
        ValueError
            If ``method`` is not recognised.
        """
        if self.data.ndim == 1:
            if method == "median":
                x = self.data.astype(float)
                nan_mask = np.isnan(x)
                if nan_mask.any():
                    x[nan_mask] = np.nanmedian(x)
                self.data = x
                self._record(f"impute(method={method})")
                return self
            # Redirect 1D to the simpler cleaner
            return self.remove_missing(
                strategy="interpolate" if method in ("ffill", "bfill") else method
            )

        # Validate up front so a typo is reported even when no column
        # happens to contain a NaN.
        if method not in self._IMPUTE_METHODS:
            raise ValueError(f"Unknown method '{method}'. "
                             f"Choose: mean, median, ffill, bfill")

        x = self.data.astype(float)

        for col in range(x.shape[1]):
            column = x[:, col]  # view: in-place writes land in x
            nan_mask = np.isnan(column)
            if not nan_mask.any():
                continue

            if method == "mean":
                column[nan_mask] = np.nanmean(column)
            elif method == "median":
                column[nan_mask] = np.nanmedian(column)
            elif method == "ffill":
                # For each row, the index of the latest valid row at or
                # before it (row 0 when there is none yet).
                idx = np.where(nan_mask, 0, np.arange(len(column)))
                np.maximum.accumulate(idx, out=idx)
                column[:] = column[idx]
            else:  # bfill
                # For each row, the index of the earliest valid row at or
                # after it (the last row when there is none left).
                last = len(column) - 1
                idx = np.where(nan_mask, last, np.arange(len(column)))
                idx = np.minimum.accumulate(idx[::-1])[::-1]
                column[:] = column[idx]

        self.data = x
        self._record(f"impute(method={method})")
        return self
@@ -0,0 +1,77 @@
1
+ from __future__ import annotations
2
+ import numpy as np
3
+
4
class NormalizationMixin:
    """
    Methods: normalize, standardize, minmax, robust_scale

    Every method rescales ``self.data`` as a whole (statistics are taken
    over the entire array), records the step and returns self. The host
    class must provide ``self.data`` and ``self._record(str)``.

    When the scaling denominator is zero (e.g. a constant signal) the
    result is an all-zero float array of the same shape.
    """

    def normalize(self, method: str = "minmax") -> "NormalizationMixin":
        """
        Convenience wrapper — calls the right method by name.

        Parameters
        ----------
        method : 'minmax' — scale to [0, 1]
                 'zscore' — zero mean, unit variance
                 'robust' — median and IQR based (outlier-resistant)
                 'l2'     — divide by L2 norm (unit vector)

        Raises
        ------
        ValueError
            If ``method`` is not one of the names above.
        """
        dispatch = {
            "minmax": self.minmax,
            "zscore": self.standardize,
            "robust": self.robust_scale,
            "l2": self._l2_norm,
        }
        if method not in dispatch:
            raise ValueError(f"Unknown method '{method}'. "
                             f"Choose: {list(dispatch)}")
        return dispatch[method]()

    def minmax(self) -> "NormalizationMixin":
        """Scale to [0, 1]."""
        x = self.data
        xmin, xmax = x.min(), x.max()
        if xmax == xmin:
            # dtype=float keeps the result type the same as the regular
            # branch even when the input array holds integers.
            self.data = np.zeros_like(x, dtype=float)
        else:
            self.data = (x - xmin) / (xmax - xmin)
        self._record("minmax()")
        return self

    def standardize(self) -> "NormalizationMixin":
        """Zero mean, unit variance (z-score normalization)."""
        x = self.data
        std = x.std()
        if std == 0:
            self.data = np.zeros_like(x, dtype=float)
        else:
            self.data = (x - x.mean()) / std
        self._record("standardize()")
        return self

    def robust_scale(self) -> "NormalizationMixin":
        """
        Scale using median and IQR instead of mean and std.
        Much better for biomedical data with spikes or outliers.
        """
        x = self.data
        median = np.median(x)
        q75, q25 = np.percentile(x, [75, 25])
        iqr = q75 - q25
        if iqr == 0:
            self.data = np.zeros_like(x, dtype=float)
        else:
            self.data = (x - median) / iqr
        self._record("robust_scale()")
        return self

    def _l2_norm(self) -> "NormalizationMixin":
        """Divide by L2 norm — makes the signal a unit vector."""
        norm = np.linalg.norm(self.data)
        if norm == 0:
            self.data = np.zeros_like(self.data, dtype=float)
        else:
            self.data = self.data / norm
        self._record("l2_norm()")
        return self