PBstats 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- PBstats/__init__.py +5 -0
- PBstats/core/__init__.py +0 -0
- PBstats/core/base.py +71 -0
- PBstats/core/data.py +68 -0
- PBstats/core/pipeline.py +303 -0
- PBstats/preprocessing/__init__.py +10 -0
- PBstats/preprocessing/cleaning.py +68 -0
- PBstats/preprocessing/imputation.py +60 -0
- PBstats/preprocessing/normalization.py +77 -0
- PBstats/preprocessing/outliers.py +84 -0
- PBstats/statistics/__init__.py +9 -0
- PBstats/statistics/base.py +95 -0
- PBstats/statistics/correlation.py +90 -0
- PBstats/statistics/descriptive.py +71 -0
- PBstats/statistics/hypothesis.py +167 -0
- PBstats/statistics/regression.py +72 -0
- PBstats/transforms/__init__.py +10 -0
- PBstats/transforms/base.py +71 -0
- PBstats/transforms/fft.py +69 -0
- PBstats/transforms/hilbert.py +44 -0
- PBstats/transforms/stft.py +53 -0
- PBstats/transforms/wavelet.py +64 -0
- PBstats/visualization/__init__.py +13 -0
- PBstats/visualization/base.py +50 -0
- PBstats/visualization/plots.py +409 -0
- PBstats/visualization/spectrum.py +225 -0
- examples/demo.py +0 -0
- pbstats-0.1.0.dist-info/METADATA +83 -0
- pbstats-0.1.0.dist-info/RECORD +35 -0
- pbstats-0.1.0.dist-info/WHEEL +5 -0
- pbstats-0.1.0.dist-info/licenses/LICENSE +21 -0
- pbstats-0.1.0.dist-info/top_level.txt +3 -0
- tests/test_data.py +72 -0
- tests/test_pipeline.py +86 -0
- tests/test_transform.py +63 -0
PBstats/__init__.py
ADDED
PBstats/core/__init__.py
ADDED
|
File without changes
|
PBstats/core/base.py
ADDED
|
@@ -0,0 +1,71 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
from dataclasses import dataclass, field
|
|
3
|
+
import numpy as np
|
|
4
|
+
|
|
5
|
+
@dataclass
class FFTResult:
    """Container for a one-sided or two-sided Fourier spectrum.

    The helper methods index ``freqs``, ``magnitude`` and ``power`` with the
    same positions, so the three arrays are assumed to be aligned
    element-wise (same length, same ordering).
    """

    freqs: np.ndarray      # frequency axis in Hz
    magnitude: np.ndarray  # magnitude spectrum
    phase: np.ndarray      # phase spectrum in radians
    power: np.ndarray      # power spectrum (magnitude²)
    fs: float              # sampling frequency used
    label: str = ""

    def peak_frequency(self) -> float:
        """Return the frequency whose magnitude is largest.

        Raises
        ------
        ValueError
            If ``magnitude`` is empty (raised by ``np.argmax``).
        """
        return float(self.freqs[np.argmax(self.magnitude)])

    def band_power(self, low: float, high: float) -> float:
        """Return the summed power over the closed band ``[low, high]`` Hz.

        Returns ``0.0`` when no frequency bin falls inside the band.
        """
        mask = (self.freqs >= low) & (self.freqs <= high)
        return float(self.power[mask].sum())

    def dominant_bands(self, n: int = 3) -> list[tuple[float, float]]:
        """Return the ``n`` strongest spectral components.

        Parameters
        ----------
        n : maximum number of components to return. Values ``<= 0`` yield
            an empty list.

        Returns
        -------
        A list of ``(frequency, magnitude)`` pairs ordered from the largest
        magnitude to the smallest. Fewer than ``n`` pairs are returned when
        the spectrum has fewer bins.
        """
        # A negative n would otherwise be interpreted by the slice below as
        # "everything except the last |n| entries", which is never intended.
        if n <= 0:
            return []
        idx = np.argsort(self.magnitude)[::-1][:n]
        return [(float(self.freqs[i]), float(self.magnitude[i])) for i in idx]
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
@dataclass
class HilbertResult:
    """Container for the quantities derived from an analytic signal."""

    # Complex analytic signal.
    analytic: np.ndarray
    # Amplitude envelope (instantaneous amplitude).
    envelope: np.ndarray
    # Instantaneous phase, in radians.
    phase: np.ndarray
    # Instantaneous frequency, in Hz.
    frequency: np.ndarray
    # Sampling frequency stored alongside the result.
    fs: float
    # Optional free-form name for the signal.
    label: str = ""

    def mean_envelope(self) -> float:
        """Return the arithmetic mean of the envelope as a Python float."""
        average = self.envelope.mean()
        return float(average)

    def peak_envelope(self) -> float:
        """Return the largest envelope value as a Python float."""
        largest = self.envelope.max()
        return float(largest)
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
@dataclass
class WaveletResult:
    """Container for the output of a wavelet transform."""

    # 2-D coefficient array laid out as (scales, time).
    coefficients: np.ndarray
    # Frequency axis.
    freqs: np.ndarray
    # Time axis.
    times: np.ndarray
    # Scales associated with the coefficients.
    scales: np.ndarray
    # Name of the wavelet.
    wavelet: str
    # Optional free-form name for the signal.
    label: str = ""

    def scalogram(self) -> np.ndarray:
        """Return the power scalogram, i.e. the squared modulus of the coefficients."""
        modulus = np.abs(self.coefficients)
        return modulus ** 2
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
@dataclass
class STFTResult:
    """Container for the output of a short-time Fourier transform."""

    freqs: np.ndarray      # frequency axis
    times: np.ndarray      # time axis
    Zxx: np.ndarray        # complex STFT matrix
    magnitude: np.ndarray  # |Zxx|
    fs: float
    label: str = ""

    def spectrogram(self, eps: float = 1e-12) -> np.ndarray:
        """Return the power spectrogram in dB.

        Parameters
        ----------
        eps : small positive offset added to the power before taking the
              logarithm so that zero-power bins map to a finite value
              instead of ``-inf``. The default ``1e-12`` puts the floor at
              about -120 dB.

        Returns
        -------
        Array with the same shape as ``Zxx`` holding
        ``10 * log10(|Zxx|**2 + eps)``.
        """
        power = np.abs(self.Zxx) ** 2
        return 10 * np.log10(power + eps)
|
PBstats/core/data.py
ADDED
|
@@ -0,0 +1,68 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
import numpy as np
|
|
3
|
+
import pandas as pd
|
|
4
|
+
from typing import Union, Optional
|
|
5
|
+
|
|
6
|
+
from PBstats.preprocessing.cleaning import CleaningMixin
|
|
7
|
+
from PBstats.preprocessing.normalization import NormalizationMixin
|
|
8
|
+
from PBstats.preprocessing.outliers import OutlierMixin
|
|
9
|
+
from PBstats.preprocessing.imputation import ImputationMixin
|
|
10
|
+
from PBstats.transforms.fft import FFTMixin
|
|
11
|
+
from PBstats.transforms.hilbert import HilbertMixin
|
|
12
|
+
from PBstats.transforms.wavelet import WaveletMixin
|
|
13
|
+
from PBstats.transforms.stft import STFTMixin
|
|
14
|
+
from PBstats.statistics.descriptive import DescriptiveMixin
|
|
15
|
+
from PBstats.statistics.hypothesis import HypothesisMixin
|
|
16
|
+
from PBstats.statistics.regression import RegressionMixin
|
|
17
|
+
from PBstats.statistics.correlation import CorrelationMixin
|
|
18
|
+
from PBstats.visualization.spectrum import VisualizationMixin
|
|
19
|
+
|
|
20
|
+
ArrayLike = Union[np.ndarray, pd.Series, pd.DataFrame, list]
|
|
21
|
+
|
|
22
|
+
class Data(
    CleaningMixin, NormalizationMixin, OutlierMixin, ImputationMixin,
    FFTMixin, HilbertMixin, WaveletMixin, STFTMixin,
    DescriptiveMixin, HypothesisMixin, RegressionMixin, CorrelationMixin,
    VisualizationMixin,
):
    """Array wrapper that combines the package's mixins into one object.

    The instance keeps two arrays: ``_raw`` (a private snapshot of the
    input, never reassigned after construction) and ``data`` (the working
    copy that processing methods replace). ``reset()`` restores ``data``
    from ``_raw``. Processing methods are expected to call ``_record`` so
    that ``history()`` lists the applied steps in order.
    """

    def __init__(
        self,
        data: ArrayLike,
        fs: Optional[float] = None,
        label: str = "",
    ):
        """
        Parameters
        ----------
        data  : array-like input (ndarray, list, pandas Series or DataFrame)
        fs    : sampling frequency, or None when not applicable
        label : free-form name, used as the Series name in ``to_series()``
        """
        self._raw = self._coerce(data)
        self.data = self._raw.copy()
        self.fs = fs
        self.label = label
        self._log: list[str] = []

    def _coerce(self, data: ArrayLike) -> np.ndarray:
        """Convert ``data`` to an ndarray that shares no memory with the input.

        pandas objects keep the dtype pandas chooses for them; every other
        input is converted to ``float``.
        """
        if isinstance(data, (pd.DataFrame, pd.Series)):
            # copy=True: to_numpy() may otherwise hand back a view of the
            # caller's frame, which would let later edits to the frame leak
            # into _raw and therefore into reset().
            return data.to_numpy(copy=True)
        # np.array (unlike np.asarray) always copies, so a float ndarray
        # passed by the caller is not aliased either.
        return np.array(data, dtype=float)

    def _record(self, step: str) -> None:
        """Append a human-readable description of a processing step to the log."""
        self._log.append(step)

    def reset(self) -> "Data":
        """Restore the working array from the original input and clear the log."""
        self.data = self._raw.copy()
        self._log.clear()
        return self

    def history(self) -> list[str]:
        """Return a copy of the recorded step descriptions, oldest first."""
        return self._log.copy()

    def to_numpy(self) -> np.ndarray:
        """Return a copy of the current working array."""
        return self.data.copy()

    def to_series(self) -> pd.Series:
        """Return the working array as a pandas Series named after ``label``.

        Multi-dimensional data is flattened first.
        """
        return pd.Series(self.data.flatten(), name=self.label)

    def __repr__(self) -> str:
        return (
            f"Data(shape={self.data.shape}, fs={self.fs}, "
            f"steps={len(self._log)})"
        )
|
PBstats/core/pipeline.py
ADDED
|
@@ -0,0 +1,303 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
import json
|
|
3
|
+
import time
|
|
4
|
+
import numpy as np
|
|
5
|
+
from typing import Callable, Any
|
|
6
|
+
from dataclasses import dataclass, field
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
@dataclass
class PipelineStep:
    """A single pipeline entry: the name of a method to call and its keyword arguments."""

    # Name of the method to invoke, e.g. "remove_dc".
    name: str
    # Keyword arguments forwarded to the method; every step owns its own dict.
    kwargs: dict = field(default_factory=dict)
    # Duration of the most recent execution in milliseconds (0.0 until run).
    duration_ms: float = 0.0
    # Intended to be filled from Data._log.
    # NOTE(review): nothing in the visible code assigns this field — confirm.
    records: str = ""
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
class Pipeline:
    """
    Define a preprocessing + transform workflow once.
    Apply it to any number of Data objects reproducibly.

    Usage
    -----
    pipe = (
        Pipeline("ecg_clean")
        .add("remove_missing", strategy="interpolate")
        .add("remove_dc")
        .add("remove_outliers", method="mad", threshold=3.0)
        .add("standardize")
        .add("hilbert")
    )

    result = pipe.run(Data(signal, fs=1000))
    result2 = pipe.run(Data(new_signal, fs=1000))  # same steps, new data

    pipe.save("ecg_pipeline.json")
    pipe2 = Pipeline.load("ecg_pipeline.json")
    """

    # Accepted values for run_batch(on_error=...).
    _ON_ERROR_MODES = ("raise", "skip", "warn")

    def __init__(self, name: str = "pipeline"):
        self.name = name
        self.steps: list[PipelineStep] = []
        self._run_log: list[dict] = []           # history of all .run() calls
        self.last_batch_errors: list[dict] = []  # errors of the latest run_batch()

    # ── building ────────────────────────────────────────────────────────────

    def add(self, name: str, **kwargs) -> "Pipeline":
        """
        Add a step by method name.
        The name must match a method on the Data class exactly.

        Example
        -------
        pipe.add("remove_outliers", method="mad", threshold=3.5)
        """
        self.steps.append(PipelineStep(name=name, kwargs=kwargs))
        return self

    def __len__(self) -> int:
        return len(self.steps)

    def __repr__(self) -> str:
        step_names = " → ".join(s.name for s in self.steps)
        return f"Pipeline('{self.name}', steps=[{step_names}])"

    # ── execution ───────────────────────────────────────────────────────────

    def run(self, data_obj, verbose: bool = False) -> Any:
        """
        Execute all steps on a Data object in order.

        Parameters
        ----------
        data_obj : a PBstats Data instance (any object exposing the step
                   names as callable attributes works)
        verbose  : if True, print each step as it runs

        Returns
        -------
        The same Data object after all transforms applied. The value
        returned by each individual step is discarded.

        Raises
        ------
        AttributeError : a step name does not exist on ``data_obj``
        TypeError      : a step name exists but is not callable
        """
        self._validate_steps(data_obj)

        run_record = {
            "pipeline": self.name,
            # Objects without a label are tolerated, as in run_batch().
            "label": getattr(data_obj, "label", ""),
            "steps": [],
            "total_ms": 0.0,
        }

        for step in self.steps:
            method = getattr(data_obj, step.name)

            t_start = time.perf_counter()
            method(**step.kwargs)
            step.duration_ms = (time.perf_counter() - t_start) * 1000

            if verbose:
                print(f" [{step.duration_ms:6.2f} ms] {step.name}({step.kwargs})")

            run_record["steps"].append({
                "step": step.name,
                # Snapshot, so later edits to the step do not rewrite history.
                "kwargs": dict(step.kwargs),
                "duration_ms": round(step.duration_ms, 3),
            })

        run_record["total_ms"] = sum(
            s["duration_ms"] for s in run_record["steps"]
        )
        # Only completed runs are logged; a step that raises aborts the run
        # before this point.
        self._run_log.append(run_record)

        return data_obj

    def _validate_steps(self, data_obj) -> None:
        """Check that every step names a callable attribute before running."""
        missing = [
            s.name for s in self.steps
            if not hasattr(data_obj, s.name)
        ]
        if missing:
            raise AttributeError(
                f"Pipeline '{self.name}' has steps not found on Data: {missing}\n"
                f"Check spelling — available methods: "
                f"{[m for m in dir(data_obj) if not m.startswith('_')]}"
            )

        # A plain attribute (e.g. "fs" or "data") would otherwise only fail
        # in the middle of the run, after earlier steps already mutated the
        # object.
        not_callable = [
            s.name for s in self.steps
            if not callable(getattr(data_obj, s.name))
        ]
        if not_callable:
            raise TypeError(
                f"Pipeline '{self.name}' has steps that are attributes, "
                f"not methods: {not_callable}"
            )

    # ── batch processing ─────────────────────────────────────────────────────

    def run_batch(
        self,
        data_list: list,
        verbose: bool = False,
        on_error: str = "raise",
    ) -> list:
        """
        Run the pipeline on a list of Data objects.

        Parameters
        ----------
        data_list : list of Data instances
        verbose   : print progress
        on_error  : 'raise' — stop on first error (default)
                    'skip'  — record error, continue with next sample
                    'warn'  — record error, print warning, continue

        Returns
        -------
        List of processed Data objects (failed ones excluded if on_error != 'raise').
        The collected errors (dicts with 'label' and 'error') are available
        afterwards in ``self.last_batch_errors``.

        Raises
        ------
        ValueError : ``on_error`` is not one of the documented modes
        """
        # Reject typos up front; otherwise an unknown mode would silently
        # swallow every exception.
        if on_error not in self._ON_ERROR_MODES:
            raise ValueError(
                f"Unknown on_error '{on_error}'. "
                f"Choose: {', '.join(self._ON_ERROR_MODES)}"
            )

        results = []
        errors: list[dict] = []
        self.last_batch_errors = errors

        for i, d in enumerate(data_list):
            # An empty label is as unhelpful as a missing one in messages.
            label = getattr(d, "label", "") or f"sample_{i}"
            try:
                if verbose:
                    print(f"\nProcessing [{i+1}/{len(data_list)}]: {label}")
                results.append(self.run(d, verbose=verbose))

            except Exception as e:
                msg = f"Error on '{label}': {type(e).__name__}: {e}"
                errors.append({"label": label, "error": msg})

                if on_error == "raise":
                    raise
                if on_error == "warn":
                    print(f" WARNING — {msg}")

        if errors and on_error != "raise":
            print(f"\nBatch complete: {len(results)} succeeded, "
                  f"{len(errors)} failed.")

        return results

    # ── timing report ────────────────────────────────────────────────────────

    def timing_report(self) -> dict:
        """
        Return per-step timing statistics across all logged .run() calls.
        Useful for finding bottlenecks in long pipelines.

        Steps are grouped by name, so a step added twice contributes two
        samples per run. Each entry holds 'mean_ms', 'max_ms' and 'n_runs'.
        Returns an empty dict when nothing has been run yet.
        """
        if not self._run_log:
            return {}

        step_times: dict[str, list[float]] = {}
        for run in self._run_log:
            for s in run["steps"]:
                step_times.setdefault(s["step"], []).append(s["duration_ms"])

        return {
            step: {
                # float(): hand back plain Python numbers, not numpy scalars.
                "mean_ms": round(float(np.mean(times)), 3),
                "max_ms": round(float(np.max(times)), 3),
                "n_runs": len(times),
            }
            for step, times in step_times.items()
        }

    # ── persistence ──────────────────────────────────────────────────────────

    def save(self, path: str) -> None:
        """
        Save pipeline definition to JSON.
        This saves the step names and kwargs — not the data.
        Load it later with Pipeline.load(path).

        Raises
        ------
        TypeError : a step's kwargs contain a value JSON cannot encode.
                    Nothing is written in that case.
        """
        payload = {
            "name": self.name,
            "steps": [
                {"name": s.name, "kwargs": s.kwargs}
                for s in self.steps
            ],
        }
        # Serialize before opening the file so a failure cannot leave a
        # truncated, unreadable file behind.
        text = json.dumps(payload, indent=2)
        with open(path, "w", encoding="utf-8") as f:
            f.write(text)
        print(f"Pipeline saved → {path}")

    @classmethod
    def load(cls, path: str) -> "Pipeline":
        """Load a pipeline from a saved JSON file."""
        with open(path, encoding="utf-8") as f:
            payload = json.load(f)

        pipe = cls(name=payload["name"])
        for step in payload["steps"]:
            # Tolerate hand-written files that omit kwargs for a step.
            pipe.add(step["name"], **step.get("kwargs", {}))

        print(f"Pipeline loaded: {pipe}")
        return pipe

    # ── introspection ────────────────────────────────────────────────────────

    def summary(self) -> None:
        """Print a readable summary of all pipeline steps."""
        print(f"\nPipeline: '{self.name}' ({len(self.steps)} steps)")
        print("─" * 48)
        for i, step in enumerate(self.steps, 1):
            kwargs_str = ", ".join(f"{k}={v!r}" for k, v in step.kwargs.items())
            print(f" {i:2d}. {step.name}({kwargs_str})")
        print("─" * 48)
|
|
245
|
+
class FunctionalPipeline(Pipeline):
    """
    Extended pipeline that supports arbitrary callables as steps.

    Useful when you need a custom transform that isn't a Data method yet.

    NOTE: callables live only in memory. The inherited save() writes step
    names and kwargs, so a function step is saved by name only and has to
    be registered again with add_fn() after loading.

    Example
    -------
    def my_bandpass(data_obj):
        from scipy.signal import butter, filtfilt
        b, a = butter(4, [40, 60], btype='band', fs=data_obj.fs)
        data_obj.data = filtfilt(b, a, data_obj.data)
        data_obj._record("custom_bandpass(40-60Hz)")
        return data_obj

    pipe = (
        FunctionalPipeline("custom")
        .add("remove_dc")
        .add_fn(my_bandpass, name="bandpass_40_60")
        .add("standardize")
    )
    """

    def __init__(self, name: str = "pipeline"):
        super().__init__(name)
        # Maps a step name to the callable that implements it.
        self._fn_registry: dict[str, Callable] = {}

    def add_fn(self, fn: Callable, name: "str | None" = None) -> "FunctionalPipeline":
        """
        Add an arbitrary callable as a pipeline step.

        Parameters
        ----------
        fn   : callable invoked as ``fn(data_obj)`` when the step runs
        name : step name; defaults to ``fn.__name__`` (or the type name for
               callables without one, such as ``functools.partial`` objects)

        Raises
        ------
        TypeError : ``fn`` is not callable
        """
        if not callable(fn):
            raise TypeError(f"add_fn expects a callable, got {type(fn).__name__}")
        step_name = name or getattr(fn, "__name__", None) or type(fn).__name__
        self._fn_registry[step_name] = fn
        self.steps.append(PipelineStep(name=step_name, kwargs={}))
        return self

    def run(self, data_obj, verbose: bool = False):
        """
        Execute all steps in order; registered callables take precedence
        over methods of the same name on ``data_obj``.

        Each completed run is appended to the run log in the same format
        the base class uses, so timing_report() covers function steps too.

        Returns
        -------
        The same ``data_obj`` that was passed in.
        """
        self._validate_steps_fn(data_obj)

        run_record = {
            "pipeline": self.name,
            "label": getattr(data_obj, "label", ""),
            "steps": [],
            "total_ms": 0.0,
        }

        for step in self.steps:
            t_start = time.perf_counter()

            if step.name in self._fn_registry:
                self._fn_registry[step.name](data_obj)
            else:
                getattr(data_obj, step.name)(**step.kwargs)

            step.duration_ms = (time.perf_counter() - t_start) * 1000
            if verbose:
                print(f" [{step.duration_ms:6.2f} ms] {step.name}")

            run_record["steps"].append({
                "step": step.name,
                "kwargs": dict(step.kwargs),
                "duration_ms": round(step.duration_ms, 3),
            })

        run_record["total_ms"] = sum(
            s["duration_ms"] for s in run_record["steps"]
        )
        self._run_log.append(run_record)

        return data_obj

    def _validate_steps_fn(self, data_obj) -> None:
        """Raise AttributeError for steps that are neither registered callables nor attributes of ``data_obj``."""
        missing = [
            s.name for s in self.steps
            if not hasattr(data_obj, s.name)
            and s.name not in self._fn_registry
        ]
        if missing:
            raise AttributeError(f"Steps not found: {missing}")
|
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
from PBstats.preprocessing.cleaning import CleaningMixin
|
|
2
|
+
from PBstats.preprocessing.normalization import NormalizationMixin
|
|
3
|
+
from PBstats.preprocessing.outliers import OutlierMixin
|
|
4
|
+
from PBstats.preprocessing.imputation import ImputationMixin
|
|
5
|
+
__all__ = [
|
|
6
|
+
"CleaningMixin",
|
|
7
|
+
"NormalizationMixin",
|
|
8
|
+
"OutlierMixin",
|
|
9
|
+
"ImputationMixin",
|
|
10
|
+
]
|
|
@@ -0,0 +1,68 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
import numpy as np
|
|
3
|
+
|
|
4
|
+
class CleaningMixin:
    """
    Methods: remove_missing, remove_dc, clip
    Each mutates self.data, records the step, returns self.

    The host class must provide ``self.data`` (an ndarray) and
    ``self._record(str)``.
    """

    @staticmethod
    def _interp_nans(values: np.ndarray) -> np.ndarray:
        """Fill NaNs of a 1-D float array in place by linear interpolation."""
        nan_mask = np.isnan(values)
        if not nan_mask.any():
            return values
        if nan_mask.all():
            # np.interp would fail with an unrelated-looking message.
            raise ValueError(
                "Cannot interpolate: the signal contains only NaN values"
            )
        indices = np.arange(len(values))
        # np.interp clamps outside the valid range, so leading/trailing NaNs
        # take the nearest valid value.
        values[nan_mask] = np.interp(
            indices[nan_mask],
            indices[~nan_mask],
            values[~nan_mask],
        )
        return values

    def remove_missing(self, strategy: str = "interpolate") -> "CleaningMixin":
        """
        Replace NaN values in the signal.

        Parameters
        ----------
        strategy : 'interpolate' — linear interpolation between neighbours
                                   (2-D data is interpolated column by column)
                   'zero'        — replace with 0
                   'mean'        — replace with signal mean
                   'drop'        — remove NaN positions entirely
                                   (multi-dimensional data comes back 1-D)

        Raises
        ------
        ValueError : unknown strategy, or 'interpolate' on data (or a
                     column) that contains no valid value at all
        """
        # astype(float) copies, so the in-place fills below never touch the
        # array previously stored on the instance.
        x = self.data.astype(float)

        if strategy == "interpolate":
            if x.ndim == 2:
                for col in range(x.shape[1]):
                    # x[:, col] is a view; the helper writes through it.
                    self._interp_nans(x[:, col])
            else:
                x = self._interp_nans(x)

        elif strategy == "zero":
            x = np.nan_to_num(x, nan=0.0)

        elif strategy == "mean":
            mean_val = np.nanmean(x)
            x = np.where(np.isnan(x), mean_val, x)

        elif strategy == "drop":
            x = x[~np.isnan(x)]

        else:
            raise ValueError(f"Unknown strategy '{strategy}'. "
                             f"Choose: interpolate, zero, mean, drop")

        self.data = x
        self._record(f"remove_missing(strategy={strategy})")
        return self

    def remove_dc(self) -> "CleaningMixin":
        """
        Subtract the mean (remove DC offset).
        Essential for spectral analysis — DC shows up as a massive
        spike at 0 Hz in FFT if not removed.

        The mean is taken over the whole array.
        """
        self.data = self.data - np.mean(self.data)
        self._record("remove_dc()")
        return self

    def clip(self, low: float, high: float) -> "CleaningMixin":
        """
        Hard-clip values to [low, high].
        Useful for removing sensor saturation artifacts.
        """
        self.data = np.clip(self.data, low, high)
        self._record(f"clip(low={low}, high={high})")
        return self
|
|
@@ -0,0 +1,60 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
import numpy as np
|
|
3
|
+
|
|
4
|
+
class ImputationMixin:
    """
    Methods: impute
    More sophisticated missing value handling for 2D (tabular) data.
    For 1D signals, remove_missing() in CleaningMixin is sufficient.

    The host class must provide ``self.data`` (an ndarray),
    ``self._record(str)`` and, for 1D data, ``self.remove_missing``.
    """

    # Methods understood by the column-wise (2D) implementation.
    _IMPUTE_METHODS = ("mean", "median", "ffill", "bfill")

    def impute(self, method: str = "mean") -> "ImputationMixin":
        """
        Fill NaNs in a 2D array column-by-column.

        Parameters
        ----------
        method : 'mean'   — fill with column mean
                 'median' — fill with column median
                 'ffill'  — forward-fill (carry last valid value forward);
                            leading NaNs have nothing to carry and stay NaN
                 'bfill'  — backward-fill; trailing NaNs stay NaN

        1D data: 'median' is filled here; every other value is handed to
        remove_missing(), with 'ffill'/'bfill' mapped to its 'interpolate'
        strategy.

        Raises
        ------
        ValueError : unknown method (for 2D data this is checked before any
                     work is done, whether or not the data contains NaNs)
        """
        if self.data.ndim == 1:
            if method == "median":
                # remove_missing() has no median strategy, so handle it here
                # instead of forwarding a name it would reject.
                x = self.data.astype(float)
                nan_mask = np.isnan(x)
                if nan_mask.any():
                    x[nan_mask] = np.nanmedian(x)
                self.data = x
                self._record(f"impute(method={method})")
                return self
            # Redirect 1D to the simpler cleaner
            return self.remove_missing(
                strategy="interpolate" if method in ("ffill", "bfill") else method
            )

        # Validate before touching anything: otherwise a typo goes unnoticed
        # (and gets logged) whenever the data happens to contain no NaN.
        if method not in self._IMPUTE_METHODS:
            raise ValueError(f"Unknown method '{method}'. "
                             f"Choose: mean, median, ffill, bfill")

        x = self.data.astype(float)  # private float copy

        for col in range(x.shape[1]):
            column = x[:, col]  # view: writes below land in x
            nan_mask = np.isnan(column)
            if not nan_mask.any():
                continue

            n = len(column)
            if method == "mean":
                column[nan_mask] = np.nanmean(column)
            elif method == "median":
                column[nan_mask] = np.nanmedian(column)
            elif method == "ffill":
                # For each position, the index of the latest valid sample at
                # or before it (0 when there is none yet).
                idx = np.where(~nan_mask, np.arange(n), 0)
                np.maximum.accumulate(idx, out=idx)
                column[:] = column[idx]
            else:  # "bfill"
                # Same idea mirrored: run the forward-fill on the reversed
                # axis, then map the indices back.
                idx = np.where(~nan_mask, np.arange(n), n - 1)
                idx = n - 1 - np.maximum.accumulate((n - 1 - idx)[::-1])[::-1]
                column[:] = column[idx]

        self.data = x
        self._record(f"impute(method={method})")
        return self
|
|
@@ -0,0 +1,77 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
import numpy as np
|
|
3
|
+
|
|
4
|
+
class NormalizationMixin:
    """
    Methods: normalize, standardize, minmax, robust_scale

    Every method replaces self.data, records the step and returns self.
    """

    def normalize(self, method: str = "minmax") -> "NormalizationMixin":
        """
        Dispatch to one of the scaling methods by name.

        Parameters
        ----------
        method : 'minmax' — scale to [0, 1]
                 'zscore' — zero mean, unit variance
                 'robust' — median and IQR based (outlier-resistant)
                 'l2'     — divide by L2 norm (unit vector)

        Raises
        ------
        ValueError : ``method`` is not one of the names above
        """
        handlers = {
            "minmax": "minmax",
            "zscore": "standardize",
            "robust": "robust_scale",
            "l2": "_l2_norm",
        }
        target = handlers.get(method)
        if target is None:
            raise ValueError(f"Unknown method '{method}'. "
                             f"Choose: {list(handlers)}")
        return getattr(self, target)()

    def minmax(self) -> "NormalizationMixin":
        """Rescale so the smallest value maps to 0 and the largest to 1.

        Constant input becomes all zeros.
        """
        values = self.data
        lowest = values.min()
        highest = values.max()
        if highest == lowest:
            scaled = np.zeros_like(values)
        else:
            scaled = (values - lowest) / (highest - lowest)
        self.data = scaled
        self._record("minmax()")
        return self

    def standardize(self) -> "NormalizationMixin":
        """Z-score: subtract the mean, divide by the standard deviation.

        Input with zero standard deviation becomes all zeros.
        """
        values = self.data
        spread = values.std()
        if spread == 0:
            scaled = np.zeros_like(values)
        else:
            scaled = (values - values.mean()) / spread
        self.data = scaled
        self._record("standardize()")
        return self

    def robust_scale(self) -> "NormalizationMixin":
        """
        Centre on the median and divide by the interquartile range.
        Less sensitive to spikes and outliers than mean/std scaling.

        Input with a zero interquartile range becomes all zeros.
        """
        values = self.data
        centre = np.median(values)
        upper, lower = np.percentile(values, [75, 25])
        iqr = upper - lower
        if iqr == 0:
            scaled = np.zeros_like(values)
        else:
            scaled = (values - centre) / iqr
        self.data = scaled
        self._record("robust_scale()")
        return self

    def _l2_norm(self) -> "NormalizationMixin":
        """Divide by the L2 norm; an all-zero input stays all zeros."""
        length = np.linalg.norm(self.data)
        if length == 0:
            scaled = np.zeros_like(self.data)
        else:
            scaled = self.data / length
        self.data = scaled
        self._record("l2_norm()")
        return self
|