photo-stack-finder 0.1.7-py3-none-any.whl → 0.1.8-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- orchestrator/__init__.py +2 -2
- orchestrator/app.py +6 -11
- orchestrator/build_pipeline.py +19 -21
- orchestrator/orchestrator_runner.py +11 -8
- orchestrator/pipeline_builder.py +126 -126
- orchestrator/pipeline_orchestrator.py +604 -604
- orchestrator/review_persistence.py +162 -162
- orchestrator/static/orchestrator.css +76 -76
- orchestrator/static/orchestrator.html +11 -5
- orchestrator/static/orchestrator.js +3 -1
- overlap_metrics/__init__.py +1 -1
- overlap_metrics/config.py +135 -135
- overlap_metrics/core.py +284 -284
- overlap_metrics/estimators.py +292 -292
- overlap_metrics/metrics.py +307 -307
- overlap_metrics/registry.py +99 -99
- overlap_metrics/utils.py +104 -104
- photo_compare/__init__.py +1 -1
- photo_compare/base.py +285 -285
- photo_compare/config.py +225 -225
- photo_compare/distance.py +15 -15
- photo_compare/feature_methods.py +173 -173
- photo_compare/file_hash.py +29 -29
- photo_compare/hash_methods.py +99 -99
- photo_compare/histogram_methods.py +118 -118
- photo_compare/pixel_methods.py +58 -58
- photo_compare/structural_methods.py +104 -104
- photo_compare/types.py +28 -28
- {photo_stack_finder-0.1.7.dist-info → photo_stack_finder-0.1.8.dist-info}/METADATA +21 -22
- photo_stack_finder-0.1.8.dist-info/RECORD +75 -0
- scripts/orchestrate.py +12 -10
- utils/__init__.py +4 -3
- utils/base_pipeline_stage.py +171 -171
- utils/base_ports.py +176 -176
- utils/benchmark_utils.py +823 -823
- utils/channel.py +74 -74
- utils/comparison_gates.py +40 -21
- utils/compute_benchmarks.py +355 -355
- utils/compute_identical.py +94 -24
- utils/compute_indices.py +235 -235
- utils/compute_perceptual_hash.py +127 -127
- utils/compute_perceptual_match.py +240 -240
- utils/compute_sha_bins.py +64 -20
- utils/compute_template_similarity.py +1 -1
- utils/compute_versions.py +483 -483
- utils/config.py +8 -5
- utils/data_io.py +83 -83
- utils/graph_context.py +44 -44
- utils/logger.py +2 -2
- utils/models.py +2 -2
- utils/photo_file.py +90 -91
- utils/pipeline_graph.py +334 -334
- utils/pipeline_stage.py +408 -408
- utils/plot_helpers.py +123 -123
- utils/ports.py +136 -136
- utils/progress.py +415 -415
- utils/report_builder.py +139 -139
- utils/review_types.py +55 -55
- utils/review_utils.py +10 -19
- utils/sequence.py +10 -8
- utils/sequence_clustering.py +1 -1
- utils/template.py +57 -57
- utils/template_parsing.py +71 -0
- photo_stack_finder-0.1.7.dist-info/RECORD +0 -74
- {photo_stack_finder-0.1.7.dist-info → photo_stack_finder-0.1.8.dist-info}/WHEEL +0 -0
- {photo_stack_finder-0.1.7.dist-info → photo_stack_finder-0.1.8.dist-info}/entry_points.txt +0 -0
- {photo_stack_finder-0.1.7.dist-info → photo_stack_finder-0.1.8.dist-info}/licenses/LICENSE +0 -0
- {photo_stack_finder-0.1.7.dist-info → photo_stack_finder-0.1.8.dist-info}/top_level.txt +0 -0
overlap_metrics/core.py
CHANGED
@@ -1,284 +1,284 @@

(The hunk spans the entire file; the removed and re-added content captured in this diff are line-for-line identical, so the file is shown once below.)

```python
"""Core data structures and base classes for overlap_metrics library."""

from __future__ import annotations

import time
from abc import ABC, abstractmethod
from collections.abc import Callable, Sequence
from dataclasses import dataclass
from enum import Enum

import numpy as np
import numpy.typing as npt
import pandas as pd

from .config import NUMERICS, VALIDATION
from .utils import make_grid

# Type aliases
ArrayLike1D = npt.NDArray[np.float64] | pd.Series | Sequence[float]
PDF = Callable[[npt.NDArray[np.float64]], npt.NDArray[np.float64]]


class EstimatorName(Enum):
    """Available density estimator types."""

    HIST = "hist"
    BETA = "beta"
    LOGIT_KDE = "logit_kde"
    # Reserved for future implementation
    BETA_MIX = "beta_mix"


class MetricName(Enum):
    """Available separation/overlap metrics."""

    SEPARATION_OVL = "separation_ovl"
    BHATTACHARYYA_DISTANCE = "bhattacharyya_distance"
    JENSEN_SHANNON = "js_divergence"
    HELLINGER = "hellinger_distance"
    TOTAL_VARIATION = "total_variation"
    WASSERSTEIN_1D = "wasserstein_1d"
    KS_STAT = "ks_stat"


@dataclass(frozen=True)
class ScoreSamples:
    """Container for positive and negative score samples."""

    pos: npt.NDArray[np.float64]
    neg: npt.NDArray[np.float64]

    def __post_init__(self) -> None:
        """Validate sample arrays after construction."""
        if len(self.pos) < VALIDATION.MIN_SAMPLES:
            raise ValueError(f"Need at least {VALIDATION.MIN_SAMPLES} positive samples, got {len(self.pos)}")
        if len(self.neg) < VALIDATION.MIN_SAMPLES:
            raise ValueError(f"Need at least {VALIDATION.MIN_SAMPLES} negative samples, got {len(self.neg)}")

        if self.pos.ndim != 1:
            raise ValueError(f"Positive samples must be 1D, got shape {self.pos.shape}")
        if self.neg.ndim != 1:
            raise ValueError(f"Negative samples must be 1D, got shape {self.neg.shape}")

    @staticmethod
    def to_ndarray1d(x: ArrayLike1D, dropna: bool, clip01: bool) -> npt.NDArray[np.float64]:
        """Convert array-like input to 1D float64 numpy array."""
        vals: npt.NDArray[np.float64]
        if isinstance(x, pd.Series):
            vals = x.to_numpy(copy=False, dtype=NUMERICS.DTYPE_FLOAT)
            if dropna:
                vals = vals[~pd.isna(vals)]
        else:
            vals = np.asarray(x, dtype=NUMERICS.DTYPE_FLOAT)

        if clip01:
            vals = np.clip(vals, NUMERICS.SCORE_MIN, NUMERICS.SCORE_MAX)

        if vals.ndim != 1:
            raise ValueError(f"Expected 1D scores, got shape {vals.shape}")

        return vals

    @classmethod
    def from_arrays(
        cls,
        pos: ArrayLike1D,
        neg: ArrayLike1D,
        dropna: bool = True,
        clip01: bool = True,
    ) -> ScoreSamples:
        """Create ScoreSamples from two array-like objects."""
        pos_array: npt.NDArray[np.float64] = cls.to_ndarray1d(pos, dropna=dropna, clip01=clip01)
        neg_array: npt.NDArray[np.float64] = cls.to_ndarray1d(neg, dropna=dropna, clip01=clip01)
        return cls(pos=pos_array, neg=neg_array)

    @classmethod
    def from_frame(
        cls,
        df: pd.DataFrame,
        pos_col: str,
        neg_col: str,
        dropna: bool = True,
        clip01: bool = True,
    ) -> ScoreSamples:
        """Create ScoreSamples from DataFrame columns."""
        return cls.from_arrays(df[pos_col], df[neg_col], dropna=dropna, clip01=clip01)


@dataclass(frozen=True)
class MetricResult:
    """Result of a metric computation with metadata."""

    name: str
    value: float
    lower_is_better: bool
    bounds: tuple[float, float]
    estimator_name: str
    details: dict[str, float]
    meta: dict[str, float]

    def __post_init__(self) -> None:
        """Validate metric result after construction."""
        if not np.isfinite(self.value):
            raise ValueError(f"Metric value must be finite, got {self.value}")

        bound_min: float
        bound_max: float
        bound_min, bound_max = self.bounds
        if not (bound_min <= self.value <= bound_max):
            # Allow slight numerical tolerance for bounds checking
            tolerance: float = 1e-10
            if not (bound_min - tolerance <= self.value <= bound_max + tolerance):
                raise ValueError(f"Metric value {self.value} outside bounds {self.bounds} for {self.name}")


class DensityEstimatorBase(ABC):
    """Abstract base class for density estimators on [0,1]."""

    def __init__(self, name: str):
        self.name: str = name
        self._fitted: bool = False

    @abstractmethod
    def fit(
        self,
        samples: npt.NDArray[np.float64],
        weights: npt.NDArray[np.float64] | None = None,
        random_state: int | None = None,
    ) -> None:
        """Learn parameters from samples in [0,1].

        Args:
            samples: 1D array of samples in [0,1]
            weights: Optional sample weights (non-negative, sum arbitrary)
            random_state: Random seed for stochastic initialization
        """
        pass

    @abstractmethod
    def pdf(self, x: npt.NDArray[np.float64]) -> npt.NDArray[np.float64]:
        """Evaluate fitted PDF at points x in [0,1].

        Args:
            x: Points to evaluate, should be in [0,1]

        Returns:
            PDF values (non-negative)
        """
        pass

    def integral(self, n_grid: int, grid: str) -> float:
        """Compute integral of PDF over [0,1] using trapezoidal rule."""
        if not self._fitted:
            raise RuntimeError(f"Estimator {self.name} must be fitted before computing integral")

        xs: npt.NDArray[np.float64] = make_grid(n_grid=n_grid, mode=grid)
        ys: npt.NDArray[np.float64] = np.maximum(self.pdf(xs), 0.0)
        integral_val: float = float(np.trapezoid(ys, xs))
        return integral_val

    def _mark_fitted(self) -> None:
        """Mark estimator as fitted (call from subclass fit methods)."""
        self._fitted = True

    def _check_fitted(self) -> None:
        """Raise error if estimator not fitted."""
        if not self._fitted:
            raise RuntimeError(f"Estimator {self.name} must be fitted before use")


class MetricBase(ABC):
    """Abstract base class for separation/overlap metrics."""

    def __init__(self, name: str, lower_is_better: bool, bounds: tuple[float, float]):
        self.name: str = name
        self.lower_is_better: bool = lower_is_better
        self.bounds: tuple[float, float] = bounds

    @abstractmethod
    def from_pdfs(
        self,
        p: PDF,
        q: PDF,
        n_grid: int,
        grid: str,
    ) -> MetricResult:
        """Compute metric from two PDF functions by integration on [0,1] grid."""
        pass

    def from_samples(
        self,
        samples: ScoreSamples,
        estimator: DensityEstimatorBase,
        n_grid: int,
        grid: str,
        weights_pos: npt.NDArray[np.float64] | None = None,
        weights_neg: npt.NDArray[np.float64] | None = None,
        random_state: int | None = None,
    ) -> MetricResult:
        """Compute metric from samples using density estimation."""
        start_time: float = time.perf_counter()

        # Create fresh estimator instances
        estimator_type: type[DensityEstimatorBase] = type(estimator)
        estimator_pos: DensityEstimatorBase = estimator_type(estimator.name)
        estimator_neg: DensityEstimatorBase = estimator_type(estimator.name)

        # Fit both estimators
        estimator_pos.fit(samples.pos, weights=weights_pos, random_state=random_state)
        estimator_neg.fit(samples.neg, weights=weights_neg, random_state=random_state)

        # Compute metric from PDFs
        result: MetricResult = self.from_pdfs(estimator_pos.pdf, estimator_neg.pdf, n_grid=n_grid, grid=grid)

        # Add runtime metadata
        runtime_ms: float = (time.perf_counter() - start_time) * 1000.0

        # Create updated result with metadata
        updated_meta: dict[str, float] = {
            **result.meta,
            "n_pos": float(len(samples.pos)),
            "n_neg": float(len(samples.neg)),
            "runtime_ms": runtime_ms,
        }

        return MetricResult(
            name=result.name,
            value=result.value,
            lower_is_better=result.lower_is_better,
            bounds=result.bounds,
            estimator_name=result.estimator_name,
            details=result.details,
            meta=updated_meta,
        )


class SampleBasedMetric(MetricBase):
    """Base class for metrics that operate directly on samples."""

    def from_pdfs(
        self,
        p: PDF,
        q: PDF,
        n_grid: int,
        grid: str,
    ) -> MetricResult:
        """Sample-based metrics cannot be computed from PDFs."""
        raise NotImplementedError(
            f"Metric {self.name} is sample-based and cannot be computed from PDFs. Use from_samples() instead."
        )

    @abstractmethod
    def from_samples(
        self,
        samples: ScoreSamples,
        estimator: DensityEstimatorBase,
        n_grid: int,
        grid: str,
        weights_pos: npt.NDArray[np.float64] | None = None,
        weights_neg: npt.NDArray[np.float64] | None = None,
        random_state: int | None = None,
    ) -> MetricResult:
        """Compute metric directly from samples."""
        pass
```