finval 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- finval/__init__.py +29 -0
- finval/baselines/__init__.py +32 -0
- finval/baselines/gaussian.py +64 -0
- finval/baselines/historical.py +80 -0
- finval/core/__init__.py +11 -0
- finval/core/bootstrap.py +116 -0
- finval/core/result.py +201 -0
- finval/core/thresholds.py +227 -0
- finval/metrics/__init__.py +62 -0
- finval/metrics/calibration.py +314 -0
- finval/metrics/dependence.py +416 -0
- finval/metrics/distribution.py +328 -0
- finval/metrics/paths.py +106 -0
- finval/metrics/temporal.py +354 -0
- finval/validate.py +339 -0
- finval-0.1.0.dist-info/METADATA +174 -0
- finval-0.1.0.dist-info/RECORD +19 -0
- finval-0.1.0.dist-info/WHEEL +4 -0
- finval-0.1.0.dist-info/licenses/LICENSE +21 -0
finval/__init__.py
ADDED
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
"""finval — rigorous validation for synthetic financial time series.
|
|
2
|
+
|
|
3
|
+
Quick start:
|
|
4
|
+
|
|
5
|
+
import finval
|
|
6
|
+
|
|
7
|
+
# synthetic and real are (n_samples, n_features) arrays of returns
|
|
8
|
+
report = finval.validate(synthetic, real)
|
|
9
|
+
print(report.summary())
|
|
10
|
+
report.to_dict()
|
|
11
|
+
|
|
12
|
+
For path-level validation with drawdowns and calibration:
|
|
13
|
+
|
|
14
|
+
# paths are (n_paths, horizon, n_features) arrays
|
|
15
|
+
report = finval.validate_paths(synthetic_paths, real_returns)
|
|
16
|
+
"""
|
|
17
|
+
|
|
18
|
+
from finval.core.result import MetricResult, ValidationReport
|
|
19
|
+
from finval.validate import validate, validate_paths
|
|
20
|
+
|
|
21
|
+
__version__ = "0.1.0"
|
|
22
|
+
|
|
23
|
+
__all__ = [
|
|
24
|
+
"MetricResult",
|
|
25
|
+
"ValidationReport",
|
|
26
|
+
"validate",
|
|
27
|
+
"validate_paths",
|
|
28
|
+
"__version__",
|
|
29
|
+
]
|
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
"""Baseline generators for benchmarking synthetic financial data.
|
|
2
|
+
|
|
3
|
+
A baseline is a simple generator that produces synthetic data from real
|
|
4
|
+
data. Baselines are not meant to be good models — they're meant to be
|
|
5
|
+
reference points so users can answer "is my fancy model actually better
|
|
6
|
+
than X?" where X is a trivially simple generator.
|
|
7
|
+
|
|
8
|
+
Three baselines ship with finval:
|
|
9
|
+
|
|
10
|
+
- `gaussian`: Multivariate Gaussian with the empirical mean and covariance
|
|
11
|
+
of the real data. No temporal structure, no tails, no vol clustering.
|
|
12
|
+
This is the minimum bar any generative model should clear.
|
|
13
|
+
|
|
14
|
+
- `historical_bootstrap`: Random sampling (with replacement) from real
|
|
15
|
+
returns. Reproduces marginals and joint distribution exactly in
|
|
16
|
+
expectation, but destroys all temporal structure (no ACF, no leverage).
|
|
17
|
+
|
|
18
|
+
- `block_bootstrap`: Moving-block bootstrap preserves local dependence.
|
|
19
|
+
This is the strongest simple baseline — it reproduces short-range
|
|
20
|
+
temporal structure and most stylized facts at the cost of exact
|
|
21
|
+
marginal duplication. A generative model that beats block bootstrap
|
|
22
|
+
on all metrics is genuinely doing something new.
|
|
23
|
+
"""
|
|
24
|
+
|
|
25
|
+
from finval.baselines.gaussian import gaussian_baseline
|
|
26
|
+
from finval.baselines.historical import block_bootstrap, historical_bootstrap
|
|
27
|
+
|
|
28
|
+
__all__ = [
|
|
29
|
+
"gaussian_baseline",
|
|
30
|
+
"historical_bootstrap",
|
|
31
|
+
"block_bootstrap",
|
|
32
|
+
]
|
|
@@ -0,0 +1,64 @@
|
|
|
1
|
+
"""Multivariate Gaussian baseline.
|
|
2
|
+
|
|
3
|
+
Fits a multivariate normal to the real returns and samples from it.
|
|
4
|
+
This is the naive "Gaussian i.i.d." benchmark. A model that can't beat
|
|
5
|
+
this on temporal metrics is not capturing any time structure.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
import numpy as np
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def gaussian_baseline(
    real: np.ndarray,
    n_samples: int | None = None,
    n_paths: int | None = None,
    path_length: int | None = None,
    seed: int = 42,
) -> np.ndarray:
    """Generate samples from a multivariate Gaussian fit to real returns.

    Two output modes:

    1. Flat mode (n_samples given): returns shape (n_samples, n_features).

    2. Path mode (n_paths and path_length given): returns shape
       (n_paths, path_length, n_features). Every row is an independent
       draw, so the paths carry no temporal structure.

    If n_samples is given it takes precedence over the path arguments.

    Args:
        real: (n_obs, n_features) real return series used to fit mean+cov.
            The mean ignores NaNs column by column; the covariance is
            estimated only from rows that contain no NaN at all.
        n_samples: Number of flat samples to return.
        n_paths: Number of paths to return (requires path_length).
        path_length: Length of each path.
        seed: RNG seed.

    Returns:
        numpy array of shape (n_samples, n_features) or
        (n_paths, path_length, n_features).

    Raises:
        ValueError: If real is not 2D, if fewer than 2 NaN-free rows are
            available, or if no output mode is specified.
    """
    real = np.asarray(real)
    if real.ndim != 2:
        raise ValueError(f"real must be 2D, got shape {real.shape}")

    mean = np.nanmean(real, axis=0)

    # Covariance needs complete rows, so drop any row containing a NaN.
    clean = real[~np.any(np.isnan(real), axis=1)]
    if len(clean) < 2:
        raise ValueError("need at least 2 clean rows to fit covariance")

    # np.cov collapses a single-feature input to a 0-d array; force it back
    # to (n_features, n_features) so the regularisation and the sampler work
    # for one-column data too.
    cov = np.atleast_2d(np.cov(clean, rowvar=False))

    # Tiny diagonal jitter to keep the matrix positive-definite.
    cov = cov + 1e-10 * np.eye(cov.shape[0])

    rng = np.random.default_rng(seed)

    if n_samples is not None:
        return rng.multivariate_normal(mean, cov, size=n_samples)

    if n_paths is not None and path_length is not None:
        return rng.multivariate_normal(mean, cov, size=(n_paths, path_length))

    raise ValueError("must specify either n_samples or (n_paths and path_length)")
|
|
@@ -0,0 +1,80 @@
|
|
|
1
|
+
"""Historical resampling baselines.
|
|
2
|
+
|
|
3
|
+
- `historical_bootstrap` samples rows i.i.d. from real returns. Preserves
|
|
4
|
+
the empirical marginal and joint distribution perfectly but destroys
|
|
5
|
+
all temporal structure.
|
|
6
|
+
|
|
7
|
+
- `block_bootstrap` samples contiguous blocks of rows. Preserves short-
|
|
8
|
+
range temporal dependence (within a block) at the cost of some joint
|
|
9
|
+
distribution fidelity at block boundaries. This is the strongest simple
|
|
10
|
+
baseline for temporal stylized facts.
|
|
11
|
+
"""
|
|
12
|
+
|
|
13
|
+
from __future__ import annotations
|
|
14
|
+
|
|
15
|
+
import numpy as np
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def historical_bootstrap(
    real: np.ndarray,
    n_samples: int | None = None,
    n_paths: int | None = None,
    path_length: int | None = None,
    seed: int = 42,
) -> np.ndarray:
    """I.i.d. bootstrap: sample rows of real with replacement.

    Rows are drawn independently and uniformly, so each output row is an
    exact copy of some row of ``real`` and consecutive rows are unrelated.

    Args:
        real: (n_obs, n_features) real return series to resample from.
        n_samples: Number of flat rows to draw; takes precedence over the
            path arguments when given.
        n_paths: Number of paths to draw (requires path_length).
        path_length: Length of each path.
        seed: RNG seed.

    Returns:
        Array of shape (n_samples, n_features) or
        (n_paths, path_length, n_features).

    Raises:
        ValueError: If real is not 2D, has no rows, or no output mode is
            specified.
    """
    real = np.asarray(real)
    if real.ndim != 2:
        raise ValueError(f"real must be 2D, got shape {real.shape}")

    n_rows = len(real)
    if n_rows == 0:
        # Without this guard the failure surfaces as an opaque bounds error
        # from rng.integers(0, 0, ...).
        raise ValueError("real must contain at least one row to resample from")

    if n_samples is not None:
        shape = n_samples
    elif n_paths is not None and path_length is not None:
        shape = (n_paths, path_length)
    else:
        raise ValueError("must specify either n_samples or (n_paths and path_length)")

    rng = np.random.default_rng(seed)
    # Fancy indexing with an index array of the target shape yields
    # (*shape, n_features) directly.
    return real[rng.integers(0, n_rows, size=shape)]
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
def block_bootstrap(
    real: np.ndarray,
    n_paths: int,
    path_length: int,
    block_size: int = 20,
    seed: int = 42,
) -> np.ndarray:
    """Moving-block bootstrap: sample contiguous blocks of rows.

    Each path is built by drawing enough random block start positions to
    cover ``path_length`` rows, concatenating the corresponding contiguous
    slices of ``real``, and truncating to ``path_length``. Dependence is
    therefore preserved inside a block and broken at block boundaries.

    Args:
        real: (n_obs, n_features) real return series.
        n_paths: Number of paths to generate.
        path_length: Number of rows per path.
        block_size: Number of consecutive rows per block.
        seed: RNG seed.

    Returns:
        Array of shape (n_paths, path_length, n_features) with the dtype of
        ``real``.

    Raises:
        ValueError: If real is not 2D, block_size < 1, n_paths or
            path_length is negative, or real has fewer rows than block_size.
    """
    real = np.asarray(real)
    if real.ndim != 2:
        raise ValueError(f"real must be 2D, got shape {real.shape}")
    if block_size < 1:
        # block_size == 0 would otherwise hit a ZeroDivisionError below.
        raise ValueError(f"block_size must be >= 1, got {block_size}")
    if n_paths < 0 or path_length < 0:
        raise ValueError(
            f"n_paths and path_length must be non-negative, got {n_paths} and {path_length}"
        )
    if len(real) < block_size:
        raise ValueError(f"real has {len(real)} rows, need >= block_size={block_size}")

    n_rows, n_features = real.shape
    paths = np.empty((n_paths, path_length, n_features), dtype=real.dtype)
    if path_length == 0:
        # Nothing to fill; avoids concatenating an empty list of blocks.
        return paths

    rng = np.random.default_rng(seed)
    n_blocks = (path_length + block_size - 1) // block_size  # ceil division
    within_block = np.arange(block_size)

    # One rng.integers call per path keeps the random stream identical to a
    # path-by-path draw, so seeded outputs stay reproducible.
    for p in range(n_paths):
        starts = rng.integers(0, n_rows - block_size + 1, size=n_blocks)
        row_idx = (starts[:, None] + within_block).ravel()[:path_length]
        paths[p] = real[row_idx]
    return paths
|
finval/core/__init__.py
ADDED
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
"""Core types and utilities for finval."""
|
|
2
|
+
|
|
3
|
+
from finval.core.result import MetricResult, ValidationReport
|
|
4
|
+
from finval.core.thresholds import DEFAULT_THRESHOLDS, quality_from_value
|
|
5
|
+
|
|
6
|
+
__all__ = [
|
|
7
|
+
"MetricResult",
|
|
8
|
+
"ValidationReport",
|
|
9
|
+
"DEFAULT_THRESHOLDS",
|
|
10
|
+
"quality_from_value",
|
|
11
|
+
]
|
finval/core/bootstrap.py
ADDED
|
@@ -0,0 +1,116 @@
|
|
|
1
|
+
"""Bootstrap confidence intervals for metrics.
|
|
2
|
+
|
|
3
|
+
Block bootstrap is appropriate for time series (preserves local dependence).
|
|
4
|
+
Naive i.i.d. bootstrap is appropriate for flattened-sample metrics like
|
|
5
|
+
marginal KS or Pearson correlation.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
from collections.abc import Callable
|
|
11
|
+
|
|
12
|
+
import numpy as np
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def iid_bootstrap_ci(
    metric_fn: Callable[[np.ndarray, np.ndarray], float],
    synthetic: np.ndarray,
    real: np.ndarray,
    n_bootstrap: int = 200,
    confidence: float = 0.95,
    seed: int = 42,
) -> tuple[float, float]:
    """Compute a percentile bootstrap CI for a metric under i.i.d. resampling.

    Each replicate resamples the rows of ``synthetic`` and ``real``
    independently (with replacement, same size as the input) and evaluates
    ``metric_fn`` on the pair. Rows are treated as exchangeable, so this is
    only suitable for metrics that do not depend on row order.

    Args:
        metric_fn: Callable taking (synthetic, real) arrays and returning
            a scalar metric value.
        synthetic: Synthetic data, rows along the first axis. Array-likes
            (e.g. nested lists) are accepted and converted.
        real: Real data, rows along the first axis.
        n_bootstrap: Number of bootstrap replicates.
        confidence: Desired confidence level in [0, 1], e.g. 0.95.
        seed: RNG seed for reproducibility.

    Returns:
        (ci_low, ci_high) for the metric, or (nan, nan) when no replicate
        produced a finite value.

    Raises:
        ValueError: If confidence is outside [0, 1].
    """
    if not 0.0 <= confidence <= 1.0:
        raise ValueError(f"confidence must be in [0, 1], got {confidence}")

    # Lists do not support index-array lookups; without this conversion every
    # replicate would fail inside the try below and the result would silently
    # be (nan, nan).
    synthetic = np.asarray(synthetic)
    real = np.asarray(real)
    n_syn = len(synthetic)
    n_real = len(real)
    if n_syn == 0 or n_real == 0:
        return (float("nan"), float("nan"))

    rng = np.random.default_rng(seed)
    values: list[float] = []
    for _ in range(n_bootstrap):
        idx_syn = rng.integers(0, n_syn, size=n_syn)
        idx_real = rng.integers(0, n_real, size=n_real)
        try:
            v = metric_fn(synthetic[idx_syn], real[idx_real])
            if np.isfinite(v):
                values.append(float(v))
        except Exception:
            # Best effort by design: a metric that fails on one degenerate
            # resample should not abort the whole interval.
            continue

    if not values:
        return (float("nan"), float("nan"))

    alpha = (1 - confidence) / 2
    lo = float(np.percentile(values, alpha * 100))
    hi = float(np.percentile(values, (1 - alpha) * 100))
    return (lo, hi)
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
def block_bootstrap_ci(
    metric_fn: Callable[[np.ndarray, np.ndarray], float],
    synthetic: np.ndarray,
    real: np.ndarray,
    block_size: int = 20,
    n_bootstrap: int = 200,
    confidence: float = 0.95,
    seed: int = 42,
) -> tuple[float, float]:
    """Compute a percentile bootstrap CI using a moving-block bootstrap.

    Each replicate rebuilds both series from randomly chosen contiguous
    blocks of ``block_size`` rows (truncated to the original length) and
    evaluates ``metric_fn`` on the pair, so ordering inside a block is kept.

    Note: if a series has fewer rows than ``block_size`` the only available
    block is the whole series, so every replicate reproduces that series
    unchanged.

    Args:
        metric_fn: Callable taking (synthetic, real) arrays and returning
            a scalar metric value.
        synthetic: Synthetic time series, time along the first axis.
        real: Real time series, time along the first axis.
        block_size: Length of each resampling block (>= 1).
        n_bootstrap: Number of bootstrap replicates.
        confidence: Desired confidence level in [0, 1].
        seed: RNG seed.

    Returns:
        (ci_low, ci_high), or (nan, nan) when no replicate produced a
        finite value.

    Raises:
        ValueError: If block_size < 1 or confidence is outside [0, 1].
    """
    if block_size < 1:
        # Previously a zero block size raised ZeroDivisionError inside the
        # try below and was swallowed, yielding a silent (nan, nan).
        raise ValueError(f"block_size must be >= 1, got {block_size}")
    if not 0.0 <= confidence <= 1.0:
        raise ValueError(f"confidence must be in [0, 1], got {confidence}")

    synthetic = np.asarray(synthetic)
    real = np.asarray(real)
    if len(synthetic) == 0 or len(real) == 0:
        return (float("nan"), float("nan"))

    rng = np.random.default_rng(seed)

    def resample(x: np.ndarray) -> np.ndarray:
        """Rebuild x from random contiguous blocks, truncated to len(x)."""
        n = len(x)
        n_blocks = (n + block_size - 1) // block_size  # ceil division
        starts = rng.integers(0, max(n - block_size + 1, 1), size=n_blocks)
        out = np.concatenate([x[s : s + block_size] for s in starts], axis=0)
        return out[:n]

    values: list[float] = []
    for _ in range(n_bootstrap):
        try:
            v = metric_fn(resample(synthetic), resample(real))
            if np.isfinite(v):
                values.append(float(v))
        except Exception:
            # Best effort by design: skip replicates on which the metric fails.
            continue

    if not values:
        return (float("nan"), float("nan"))

    alpha = (1 - confidence) / 2
    lo = float(np.percentile(values, alpha * 100))
    hi = float(np.percentile(values, (1 - alpha) * 100))
    return (lo, hi)
|
finval/core/result.py
ADDED
|
@@ -0,0 +1,201 @@
|
|
|
1
|
+
"""Result types for validation metrics.
|
|
2
|
+
|
|
3
|
+
A `MetricResult` captures the output of a single metric: value, quality
|
|
4
|
+
grade, thresholds used, and optional bootstrap confidence interval and
|
|
5
|
+
per-feature/per-pair breakdown.
|
|
6
|
+
|
|
7
|
+
A `ValidationReport` aggregates multiple MetricResults into a single object
|
|
8
|
+
with an overall quality grade and weighted pass rate.
|
|
9
|
+
"""
|
|
10
|
+
|
|
11
|
+
from __future__ import annotations
|
|
12
|
+
|
|
13
|
+
from dataclasses import dataclass, field
|
|
14
|
+
from typing import Any
|
|
15
|
+
|
|
16
|
+
import numpy as np
|
|
17
|
+
|
|
18
|
+
# Allowed quality grades, best to worst.
QUALITY_LEVELS = ("excellent", "good", "acceptable", "poor")
# Numeric score assigned to each grade for weighted aggregation.
QUALITY_SCORES = {"excellent": 1.0, "good": 0.75, "acceptable": 0.5, "poor": 0.0}


@dataclass
class MetricResult:
    """Result of a single validation metric.

    Attributes:
        name: Metric identifier, e.g. "marginal_ks".
        value: Scalar metric value (lower is always better by convention).
            A NaN or infinite value is replaced by ``float("inf")`` and the
            result is forced to quality "poor" / passed False.
        quality: One of "excellent", "good", "acceptable", "poor".
        passed: True if quality is not "poor".
        thresholds: The threshold dict used to assign quality.
        category: Metric category, e.g. "distribution", "dependence".
        interpretation: Human-readable one-line summary.
        ci_low: Lower bound of bootstrap confidence interval (optional).
        ci_high: Upper bound of bootstrap confidence interval (optional).
            If either bound is NaN or infinite, both are reset to None.
        per_feature: Per-feature breakdown (optional).
        per_pair: Per-pair breakdown for dependence metrics (optional).
        metadata: Additional metric-specific information.

    Raises:
        ValueError: If quality is not one of QUALITY_LEVELS.
    """

    name: str
    value: float
    quality: str
    passed: bool
    thresholds: dict[str, float] = field(default_factory=dict)
    category: str = "uncategorized"
    interpretation: str = ""
    ci_low: float | None = None
    ci_high: float | None = None
    per_feature: dict[str, Any] = field(default_factory=dict)
    per_pair: dict[str, Any] = field(default_factory=dict)
    metadata: dict[str, Any] = field(default_factory=dict)

    def __post_init__(self) -> None:
        # A non-finite metric value means the computation did not produce a
        # usable number: normalise it to +inf and mark the metric as failed.
        if self.value is not None and not np.isfinite(self.value):
            self.value = float("inf")
            self.quality = "poor"
            self.passed = False

        # A failed bootstrap reports NaN bounds; treat such an interval as
        # absent instead of carrying NaN into summaries and serialised output.
        bounds = (self.ci_low, self.ci_high)
        if any(b is not None and not np.isfinite(b) for b in bounds):
            self.ci_low = None
            self.ci_high = None

        if self.quality not in QUALITY_LEVELS:
            raise ValueError(f"quality must be one of {QUALITY_LEVELS}, got {self.quality!r}")

    @property
    def score(self) -> float:
        """Numeric score in [0, 1] for weighted aggregation."""
        return QUALITY_SCORES[self.quality]

    def to_dict(self) -> dict[str, Any]:
        """Serialize to a plain dict of built-in Python types.

        NumPy scalars in ``value``, ``passed`` and the CI bounds are
        converted to ``float`` / ``bool`` so ``json.dumps`` accepts them.
        The CI, per-feature, per-pair and metadata entries are included
        only when present. Note that a failed metric keeps
        ``value == float("inf")``, which Python's ``json`` module writes as
        the non-standard token ``Infinity``.
        """
        out: dict[str, Any] = {
            "name": self.name,
            "value": None if self.value is None else float(self.value),
            "quality": self.quality,
            "passed": bool(self.passed),
            "score": self.score,
            "thresholds": self.thresholds,
            "category": self.category,
            "interpretation": self.interpretation,
        }
        if self.ci_low is not None:
            out["ci_low"] = float(self.ci_low)
            out["ci_high"] = None if self.ci_high is None else float(self.ci_high)
        if self.per_feature:
            out["per_feature"] = self.per_feature
        if self.per_pair:
            out["per_pair"] = self.per_pair
        if self.metadata:
            out["metadata"] = self.metadata
        return out
|
|
90
|
+
|
|
91
|
+
|
|
92
|
+
def create_error_metric(
    name: str,
    error: str,
    category: str = "uncategorized",
) -> MetricResult:
    """Return a failed MetricResult standing in for a metric that could not be computed.

    Args:
        name: Identifier of the metric that failed.
        error: Description of the failure; it is stored under
            ``metadata["error"]`` and echoed in the interpretation text.
        category: Category the metric belongs to.

    Returns:
        A MetricResult with an infinite value, quality "poor" and
        passed=False.
    """
    failure_fields = {
        "value": float("inf"),
        "quality": "poor",
        "passed": False,
        "interpretation": f"Failed: {error}",
        "metadata": {"error": error},
    }
    return MetricResult(name=name, category=category, **failure_fields)
|
|
107
|
+
|
|
108
|
+
|
|
109
|
+
@dataclass
class ValidationReport:
    """Aggregated report of multiple validation metrics.

    A ValidationReport groups metrics by category, computes a weighted
    overall score, and assigns an overall quality grade.

    Attributes:
        metrics: Dict mapping metric name to MetricResult.
        weights: Dict mapping metric name to weight. The overall score is
            normalised by the sum of all weights, so a weighted metric that
            is absent from ``metrics`` contributes 0.
        category_weights: Dict mapping category to category weight. It is
            carried through to ``to_dict`` but not used in the score here.
        overall_score: Weighted average of metric scores.
        overall_quality: One of "excellent" (>=0.85), "good" (>=0.65),
            "acceptable" (>=0.45), "poor" otherwise.
    """

    metrics: dict[str, MetricResult] = field(default_factory=dict)
    weights: dict[str, float] = field(default_factory=dict)
    category_weights: dict[str, float] = field(default_factory=dict)

    @property
    def overall_score(self) -> float:
        """Weighted average of metric scores.

        Weights are looked up by each metric's ``name``. The denominator is
        the sum of all configured weights, so missing metrics pull the score
        down. Returns 0.0 when there are no metrics, no weights, or the
        weights sum to zero.
        """
        if not self.metrics or not self.weights:
            return 0.0
        total_weight = sum(self.weights.values())
        if total_weight == 0:
            return 0.0
        weighted = sum(
            m.score * self.weights.get(m.name, 0.0) for m in self.metrics.values()
        )
        return weighted / total_weight

    @property
    def overall_quality(self) -> str:
        """Quality grade based on overall_score."""
        s = self.overall_score
        if s >= 0.85:
            return "excellent"
        if s >= 0.65:
            return "good"
        if s >= 0.45:
            return "acceptable"
        return "poor"

    @property
    def pass_rate(self) -> float:
        """Fraction of metrics whose ``passed`` flag is set (0.0 if empty)."""
        if not self.metrics:
            return 0.0
        passed = sum(1 for m in self.metrics.values() if m.passed)
        return passed / len(self.metrics)

    def by_category(self) -> dict[str, list[MetricResult]]:
        """Group metrics by category, preserving insertion order."""
        out: dict[str, list[MetricResult]] = {}
        for m in self.metrics.values():
            out.setdefault(m.category, []).append(m)
        return out

    def summary(self) -> str:
        """Human-readable summary: a header plus one line per metric, by category."""
        n_total = len(self.metrics)
        # Count passes directly. Reconstructing the count as
        # int(pass_rate * n_total) truncates when the float product lands
        # just below an integer (e.g. 1/49 * 49).
        n_passed = sum(1 for m in self.metrics.values() if m.passed)
        lines = [
            f"finval ValidationReport — {self.overall_quality.upper()} ({self.overall_score:.0%})",
            f" metrics: {n_total}, passed: {n_passed}/{n_total}",
            "",
        ]
        for category, mlist in self.by_category().items():
            lines.append(f" [{category}]")
            for m in sorted(mlist, key=lambda x: x.name):
                status = "PASS" if m.passed else "FAIL"
                ci = ""
                # Only render the interval when both bounds exist; formatting
                # a None bound would raise.
                if m.ci_low is not None and m.ci_high is not None:
                    ci = f" ({m.ci_low:.3f}–{m.ci_high:.3f})"
                lines.append(
                    f" {status} {m.name:28s} value={m.value:7.4f}{ci} {m.quality}"
                )
            lines.append("")
        return "\n".join(lines)

    def to_dict(self) -> dict[str, Any]:
        """Serialize the report, including each metric's own ``to_dict`` output."""
        return {
            "overall_score": self.overall_score,
            "overall_quality": self.overall_quality,
            "pass_rate": self.pass_rate,
            "metrics": {name: m.to_dict() for name, m in self.metrics.items()},
            "weights": dict(self.weights),
            "category_weights": dict(self.category_weights),
        }
|