geodispbench3d 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- geodispbench3d/__init__.py +19 -0
- geodispbench3d/_version.py +24 -0
- geodispbench3d/analysis/__init__.py +31 -0
- geodispbench3d/analysis/loader.py +209 -0
- geodispbench3d/analysis/runner.py +205 -0
- geodispbench3d/cli.py +469 -0
- geodispbench3d/conf/schema/analysis.schema.json +45 -0
- geodispbench3d/conf/schema/dataset.schema.json +45 -0
- geodispbench3d/conf/schema/metrics.schema.json +32 -0
- geodispbench3d/conf/schema/suite.schema.json +36 -0
- geodispbench3d/conf/schema/tool.schema.json +76 -0
- geodispbench3d/dashboard/__init__.py +3 -0
- geodispbench3d/dashboard/app.py +177 -0
- geodispbench3d/dataset/__init__.py +31 -0
- geodispbench3d/dataset/ground_truth.py +128 -0
- geodispbench3d/dataset/schema.py +150 -0
- geodispbench3d/diagnostics.py +42 -0
- geodispbench3d/metrics/__init__.py +19 -0
- geodispbench3d/metrics/builtins.py +259 -0
- geodispbench3d/metrics/registry.py +114 -0
- geodispbench3d/results/__init__.py +27 -0
- geodispbench3d/results/predictions_cache.py +219 -0
- geodispbench3d/results/store.py +56 -0
- geodispbench3d/suite/__init__.py +7 -0
- geodispbench3d/suite/loader.py +170 -0
- geodispbench3d/sweep/__init__.py +33 -0
- geodispbench3d/sweep/evaluation.py +215 -0
- geodispbench3d/sweep/parameters.py +223 -0
- geodispbench3d/sweep/rescore.py +475 -0
- geodispbench3d/sweep/runner.py +701 -0
- geodispbench3d/sweep/trial_record.py +369 -0
- geodispbench3d/tool/__init__.py +30 -0
- geodispbench3d/tool/base.py +134 -0
- geodispbench3d/tool/callable_adapter.py +131 -0
- geodispbench3d/tool/cli_adapter.py +533 -0
- geodispbench3d/tool/loader.py +253 -0
- geodispbench3d-0.2.0.dist-info/METADATA +132 -0
- geodispbench3d-0.2.0.dist-info/RECORD +55 -0
- geodispbench3d-0.2.0.dist-info/WHEEL +5 -0
- geodispbench3d-0.2.0.dist-info/entry_points.txt +3 -0
- geodispbench3d-0.2.0.dist-info/licenses/LICENSE +34 -0
- geodispbench3d-0.2.0.dist-info/scm_file_list.json +121 -0
- geodispbench3d-0.2.0.dist-info/scm_version.json +8 -0
- geodispbench3d-0.2.0.dist-info/top_level.txt +3 -0
- geodispbench3d_f2s3/__init__.py +13 -0
- geodispbench3d_f2s3/conf/tool/f2s3.yaml +74 -0
- geodispbench3d_f2s3/output_parser.py +230 -0
- geodispbench3d_iof3d/__init__.py +88 -0
- geodispbench3d_iof3d/_sweep_cli.py +143 -0
- geodispbench3d_iof3d/adapter.py +580 -0
- geodispbench3d_iof3d/cli.py +28 -0
- geodispbench3d_iof3d/conf/config_ax.yaml +14 -0
- geodispbench3d_iof3d/conf/tool/iof3d.yaml +69 -0
- geodispbench3d_iof3d/factory.py +137 -0
- geodispbench3d_iof3d/output_parser.py +187 -0
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
"""geodispbench3d: a generic benchmark framework for 3D displacement / optical-flow tools.
|
|
2
|
+
|
|
3
|
+
The package is tool-agnostic: any tool that can be described by a
|
|
4
|
+
:class:`~geodispbench3d.tool.base.ToolAdapter` can be swept, evaluated against a
|
|
5
|
+
dataset, and scored with configurable metrics.
|
|
6
|
+
|
|
7
|
+
Public surface is intentionally small; see the submodule docstrings for details.
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
from __future__ import annotations
|
|
11
|
+
|
|
12
|
+
__all__ = [
|
|
13
|
+
"tool",
|
|
14
|
+
"dataset",
|
|
15
|
+
"metrics",
|
|
16
|
+
"sweep",
|
|
17
|
+
"suite",
|
|
18
|
+
"results",
|
|
19
|
+
]
|
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
# file generated by vcs-versioning
|
|
2
|
+
# don't change, don't track in version control
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
__all__ = [
|
|
6
|
+
"__version__",
|
|
7
|
+
"__version_tuple__",
|
|
8
|
+
"version",
|
|
9
|
+
"version_tuple",
|
|
10
|
+
"__commit_id__",
|
|
11
|
+
"commit_id",
|
|
12
|
+
]
|
|
13
|
+
|
|
14
|
+
version: str
|
|
15
|
+
__version__: str
|
|
16
|
+
__version_tuple__: tuple[int | str, ...]
|
|
17
|
+
version_tuple: tuple[int | str, ...]
|
|
18
|
+
commit_id: str | None
|
|
19
|
+
__commit_id__: str | None
|
|
20
|
+
|
|
21
|
+
__version__ = version = '0.2.0'
|
|
22
|
+
__version_tuple__ = version_tuple = (0, 2, 0)
|
|
23
|
+
|
|
24
|
+
__commit_id__ = commit_id = 'g22a498d'
|
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
"""Tool-agnostic analysis: re-score cached predictions across tools.
|
|
2
|
+
|
|
3
|
+
Where a suite's ``rescore`` pass is bound to one tool's run directories
|
|
4
|
+
and runs phase 2 against tool-specific outputs, an analysis YAML works
|
|
5
|
+
purely from the predictions cache. Predictions live in the common
|
|
6
|
+
``{per_point: [...]}`` shape, so an analysis can mix iof3D and F2S3
|
|
7
|
+
results in one parquet output, with the metric set as the only knob.
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
from __future__ import annotations
|
|
11
|
+
|
|
12
|
+
from .loader import (
|
|
13
|
+
AnalysisConfig,
|
|
14
|
+
PredictionFilter,
|
|
15
|
+
PredictionRef,
|
|
16
|
+
PredictionsConfig,
|
|
17
|
+
ResultsConfig,
|
|
18
|
+
load_analysis,
|
|
19
|
+
)
|
|
20
|
+
from .runner import AnalysisSummary, analyze
|
|
21
|
+
|
|
22
|
+
__all__ = [
|
|
23
|
+
"AnalysisConfig",
|
|
24
|
+
"AnalysisSummary",
|
|
25
|
+
"PredictionFilter",
|
|
26
|
+
"PredictionRef",
|
|
27
|
+
"PredictionsConfig",
|
|
28
|
+
"ResultsConfig",
|
|
29
|
+
"analyze",
|
|
30
|
+
"load_analysis",
|
|
31
|
+
]
|
|
@@ -0,0 +1,209 @@
|
|
|
1
|
+
"""``analysis.yaml`` schema and loader.
|
|
2
|
+
|
|
3
|
+
An analysis YAML composes a dataset + a metrics file with a set of
|
|
4
|
+
*cached predictions*. There is no tool reference: predictions are tool-
|
|
5
|
+
agnostic by the time they reach the cache, so the analysis verb can mix
|
|
6
|
+
runs from any number of tools in a single parquet output.
|
|
7
|
+
|
|
8
|
+
Three ways to point at predictions, mix-and-match in any combination:
|
|
9
|
+
|
|
10
|
+
predictions:
|
|
11
|
+
- path: <abs/relative.json> # explicit single file
|
|
12
|
+
- glob: <pattern> # any pattern resolved relative
|
|
13
|
+
# to the analysis YAML
|
|
14
|
+
- root: <dir> # walk the cache layout under
|
|
15
|
+
filter: # this root, optionally
|
|
16
|
+
tool_id: iof3d-v2 # filtering by provenance
|
|
17
|
+
dataset_id: mattertal # segment. Each filter is
|
|
18
|
+
case: mattertal-all # optional.
|
|
19
|
+
|
|
20
|
+
Resolution returns a flat list of prediction file paths in the order
|
|
21
|
+
declared (with glob results sorted lexicographically).
|
|
22
|
+
"""
|
|
23
|
+
|
|
24
|
+
from __future__ import annotations
|
|
25
|
+
|
|
26
|
+
import glob as _glob
|
|
27
|
+
from collections.abc import Mapping, Sequence
|
|
28
|
+
from dataclasses import dataclass, field
|
|
29
|
+
from pathlib import Path
|
|
30
|
+
from typing import Any
|
|
31
|
+
|
|
32
|
+
from omegaconf import OmegaConf
|
|
33
|
+
|
|
34
|
+
from geodispbench3d.dataset.schema import DatasetSpec, load_dataset
|
|
35
|
+
from geodispbench3d.metrics.registry import MetricsConfig, load_metrics_config
|
|
36
|
+
from geodispbench3d.results.predictions_cache import find_predictions
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
@dataclass(frozen=True)
|
|
40
|
+
class PredictionFilter:
|
|
41
|
+
"""Provenance filter for a ``root:`` entry in ``predictions:``.
|
|
42
|
+
|
|
43
|
+
Each ``None`` field matches any value in that segment of the cache
|
|
44
|
+
layout (``<root>/<tool_id>/<dataset_id>/<case>/<run_hash>.json``).
|
|
45
|
+
"""
|
|
46
|
+
|
|
47
|
+
tool_id: str | None = None
|
|
48
|
+
dataset_id: str | None = None
|
|
49
|
+
case: str | None = None
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
@dataclass(frozen=True)
class PredictionRef:
    """One source of predictions to consume.

    Exactly one of ``path``, ``glob``, or ``root`` is expected to be
    populated; :meth:`resolve` checks them in that order and uses the first
    one that is set.
    """

    # Explicit single prediction file.
    path: Path | None = None
    # Glob pattern, expanded with ``recursive=True`` (so ``**`` is honoured).
    glob: str | None = None
    # Cache root handed to ``find_predictions`` together with ``filter``.
    root: Path | None = None
    # Provenance filter; only consulted for ``root`` sources.
    filter: PredictionFilter = field(default_factory=PredictionFilter)

    def resolve(self) -> list[Path]:
        """Return the prediction files this reference points at.

        Returns:
            * ``path`` source: a one-element list, or ``[]`` when the path is
              not an existing regular file.
            * ``glob`` source: the matching regular files, sorted.
            * ``root`` source: whatever ``find_predictions`` returns for the
              root and the configured filter.
            * nothing populated: ``[]``.
        """
        if self.path is not None:
            return [self.path] if self.path.is_file() else []
        if self.glob is not None:
            matches = (Path(hit) for hit in _glob.glob(self.glob, recursive=True))
            # A recursive pattern such as ``cache/**`` also matches directories.
            # Keep regular files only, mirroring the ``path`` branch, so callers
            # never receive a directory as a "prediction file".
            return sorted(match for match in matches if match.is_file())
        if self.root is not None:
            return find_predictions(
                self.root,
                tool_id=self.filter.tool_id,
                dataset_id=self.filter.dataset_id,
                case=self.filter.case,
            )
        return []
|
|
79
|
+
|
|
80
|
+
|
|
81
|
+
@dataclass(frozen=True)
class PredictionsConfig:
    """The ``predictions:`` block together with a resolver over all its entries."""

    # Declared prediction sources, in YAML order.
    refs: Sequence[PredictionRef] = ()

    def resolve_all(self) -> list[Path]:
        """Resolve every ref and return the de-duplicated absolute paths.

        Paths are normalised with ``Path.resolve()``. When the same file is
        reachable through several refs only its first occurrence is kept, so
        the result follows declaration order.
        """
        # ``dict.fromkeys`` keeps first-seen insertion order, which gives an
        # ordered de-duplication in a single pass.
        unique = dict.fromkeys(
            candidate.resolve() for ref in self.refs for candidate in ref.resolve()
        )
        return list(unique)
|
|
98
|
+
|
|
99
|
+
|
|
100
|
+
@dataclass(frozen=True)
|
|
101
|
+
class ResultsConfig:
|
|
102
|
+
parquet_path: Path | None = None
|
|
103
|
+
|
|
104
|
+
|
|
105
|
+
@dataclass(frozen=True)
class AnalysisConfig:
    """A fully loaded analysis: the referenced configs are already resolved."""

    # Analysis identifier.
    id: str
    # Dataset specification loaded from the YAML's ``dataset`` reference.
    dataset: DatasetSpec
    # Metrics configuration loaded from the YAML's ``metrics`` reference.
    metrics: MetricsConfig
    # Prediction sources declared under ``predictions:``.
    predictions: PredictionsConfig
    # Output settings; defaults to an empty ``ResultsConfig``.
    results: ResultsConfig = field(default_factory=ResultsConfig)
    # Optional explicit pass identifier.
    pass_id: str | None = None
    # Path of the YAML this configuration was loaded from, when known.
    source_path: Path | None = None
|
|
116
|
+
|
|
117
|
+
|
|
118
|
+
def load_analysis(path: str | Path) -> AnalysisConfig:
    """Load an ``analysis.yaml`` and resolve its references.

    Relative ``dataset``, ``metrics``, ``predictions`` and
    ``results.parquet_path`` entries are resolved against the directory that
    contains the YAML file.

    Args:
        path: Location of the analysis YAML.

    Returns:
        The composed :class:`AnalysisConfig` with dataset and metrics loaded.

    Raises:
        FileNotFoundError: ``path`` is not an existing file.
        ValueError: The document is not a mapping, lacks ``dataset`` or
            ``metrics``, has a non-list ``predictions`` or a non-mapping
            ``results`` block, or declares no prediction source.
    """

    yaml_path = Path(path).resolve()
    if not yaml_path.is_file():
        raise FileNotFoundError(f"Analysis YAML not found: {yaml_path}")

    raw = OmegaConf.to_container(OmegaConf.load(str(yaml_path)), resolve=True)
    if not isinstance(raw, dict):
        raise ValueError(f"Analysis YAML at {yaml_path} must be a mapping")

    base = yaml_path.parent

    dataset_ref = raw.get("dataset")
    metrics_ref = raw.get("metrics")
    if not (dataset_ref and metrics_ref):
        raise ValueError(f"Analysis {yaml_path} must reference 'dataset' and 'metrics'")

    dataset_spec = load_dataset(_resolve_path(dataset_ref, base))
    metrics_cfg = load_metrics_config(_resolve_path(metrics_ref, base))

    predictions_raw = raw.get("predictions") or []
    if not isinstance(predictions_raw, list):
        raise ValueError(f"Analysis {yaml_path}: 'predictions' must be a list")
    refs = tuple(_load_prediction_ref(entry, base) for entry in predictions_raw)
    if not refs:
        raise ValueError(f"Analysis {yaml_path}: at least one prediction source required")

    results_raw = raw.get("results") or {}
    if not isinstance(results_raw, Mapping):
        # Report a malformed block as a config error instead of letting
        # ``.get`` fail with an AttributeError below.
        raise ValueError(f"Analysis {yaml_path}: 'results' must be a mapping")
    results = ResultsConfig(
        parquet_path=_resolve_optional_path(results_raw.get("parquet_path"), base),
    )

    # YAML scalars may parse as numbers (e.g. ``pass_id: 20240101``) while both
    # fields are typed as strings, so normalise them. An absent or null ``id``
    # falls back to the file stem rather than becoming the string "None".
    raw_id = raw.get("id")
    raw_pass_id = raw.get("pass_id")

    return AnalysisConfig(
        id=yaml_path.stem if raw_id is None else str(raw_id),
        dataset=dataset_spec,
        metrics=metrics_cfg,
        predictions=PredictionsConfig(refs=refs),
        results=results,
        pass_id=None if raw_pass_id is None else str(raw_pass_id),
        source_path=yaml_path,
    )
|
|
160
|
+
|
|
161
|
+
|
|
162
|
+
# ---------------------------------------------------------------------------
|
|
163
|
+
# Internals
|
|
164
|
+
# ---------------------------------------------------------------------------
|
|
165
|
+
|
|
166
|
+
|
|
167
|
+
def _load_prediction_ref(entry: Mapping[str, Any], base: Path) -> PredictionRef:
    """Build a :class:`PredictionRef` from one item of the ``predictions:`` list.

    The keys are checked in the order ``path``, ``glob``, ``root`` and the
    first one present decides the kind of reference. Relative locations are
    anchored at ``base`` (the analysis YAML's directory).

    Raises:
        ValueError: ``entry`` is not a mapping, its ``filter`` is not a
            mapping, or it declares none of ``path``/``glob``/``root``.
    """
    if not isinstance(entry, Mapping):
        # A bare string item (``- cache/path.json``) would otherwise hit a
        # substring test in ``"path" in entry`` and fail with a TypeError.
        raise ValueError(
            "predictions entry must be a mapping declaring one of 'path', 'glob', "
            f"or 'root', got {entry!r}"
        )
    if "path" in entry:
        return PredictionRef(path=_resolve_path(entry["path"], base))
    if "glob" in entry:
        # Resolve relative to the analysis YAML's directory.
        pattern = str(entry["glob"])
        if not Path(pattern).is_absolute():
            pattern = str(base / pattern)
        return PredictionRef(glob=pattern)
    if "root" in entry:
        flt = entry.get("filter") or {}
        if not isinstance(flt, Mapping):
            raise ValueError(f"predictions entry 'filter' must be a mapping, got {flt!r}")
        return PredictionRef(
            root=_resolve_path(entry["root"], base),
            filter=PredictionFilter(
                tool_id=flt.get("tool_id"),
                dataset_id=flt.get("dataset_id"),
                case=flt.get("case"),
            ),
        )
    raise ValueError(
        f"predictions entry must declare one of 'path', 'glob', or 'root', got {dict(entry)!r}"
    )
|
|
189
|
+
|
|
190
|
+
|
|
191
|
+
def _resolve_path(value: Any, base: Path) -> Path:
    """Turn ``value`` into a path, anchoring relative values at ``base``.

    Absolute inputs are returned as given (not normalised); relative inputs
    are joined onto ``base`` and passed through ``Path.resolve()``.
    """
    candidate = Path(str(value))
    if candidate.is_absolute():
        return candidate
    return (base / candidate).resolve()
|
|
194
|
+
|
|
195
|
+
|
|
196
|
+
def _resolve_optional_path(value: Any, base: Path) -> Path | None:
|
|
197
|
+
if value is None:
|
|
198
|
+
return None
|
|
199
|
+
return _resolve_path(value, base)
|
|
200
|
+
|
|
201
|
+
|
|
202
|
+
__all__ = [
|
|
203
|
+
"AnalysisConfig",
|
|
204
|
+
"PredictionFilter",
|
|
205
|
+
"PredictionRef",
|
|
206
|
+
"PredictionsConfig",
|
|
207
|
+
"ResultsConfig",
|
|
208
|
+
"load_analysis",
|
|
209
|
+
]
|
|
@@ -0,0 +1,205 @@
|
|
|
1
|
+
"""Analyze runner: score cached predictions against an AnalysisConfig.
|
|
2
|
+
|
|
3
|
+
Loads each prediction JSON file, picks the matching dataset case from
|
|
4
|
+
the analysis config (preferring the case recorded in the prediction's
|
|
5
|
+
provenance, falling back to a single-case dataset), and dispatches the
|
|
6
|
+
metric registry through :func:`evaluate_trial` with the cached
|
|
7
|
+
prediction supplied as ``prediction_override`` so phase 2 is skipped
|
|
8
|
+
entirely.
|
|
9
|
+
|
|
10
|
+
Record rows carry ``mode="analyze"`` plus the prediction's recorded
|
|
11
|
+
``tool_id`` / ``dataset_id`` / ``case`` so a single parquet file can
|
|
12
|
+
mix runs from multiple tools across multiple analyses without
|
|
13
|
+
columns colliding.
|
|
14
|
+
"""
|
|
15
|
+
|
|
16
|
+
from __future__ import annotations
|
|
17
|
+
|
|
18
|
+
import logging
|
|
19
|
+
from collections.abc import Callable, Mapping, Sequence
|
|
20
|
+
from dataclasses import dataclass
|
|
21
|
+
from datetime import UTC, datetime
|
|
22
|
+
from pathlib import Path
|
|
23
|
+
from typing import Any
|
|
24
|
+
|
|
25
|
+
from geodispbench3d.dataset.schema import CaseSpec, DatasetSpec
|
|
26
|
+
from geodispbench3d.diagnostics import PassDiagnostics
|
|
27
|
+
from geodispbench3d.metrics.registry import MetricRegistry
|
|
28
|
+
from geodispbench3d.results.predictions_cache import read_prediction
|
|
29
|
+
from geodispbench3d.sweep.evaluation import evaluate_trial
|
|
30
|
+
from geodispbench3d.tool.base import TrialOutputs, TrialResult
|
|
31
|
+
|
|
32
|
+
from .loader import AnalysisConfig
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
@dataclass
class AnalysisSummary:
    """Counters describing one :func:`analyze` pass.

    ``non_fatal_failures`` is the pass-wide total of swallowed fail-soft
    failures (corrupt prediction reads and per-prediction evaluation skips);
    it feeds the CLI's aggregate "N non-fatal failures" line (F-08).

    ``eval_failures`` (03-01) is the subset recorded under the
    ``"evaluation"`` diagnostic kind, i.e. genuine parser/metric problems:
    both a raising evaluation that skips the prediction and the non-fatal
    failures reported from inside an evaluation. Plan 02 reads it for the
    analyze exit-1 condition. ``skipped_unreadable`` remains a genuine data
    error and ``skipped_no_case`` remains a benign skip.
    """

    # Prediction files visited.
    total: int = 0
    # Predictions that were evaluated without the evaluation raising.
    succeeded: int = 0
    # Predictions whose file could not be read.
    skipped_unreadable: int = 0
    # Predictions that could not be mapped to a dataset case.
    skipped_no_case: int = 0
    # Record rows handed to the ``on_record_rows`` callback.
    rows_emitted: int = 0
    # All fail-soft failures recorded during the pass.
    non_fatal_failures: int = 0
    # Failures recorded under the "evaluation" diagnostic kind.
    eval_failures: int = 0
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
def analyze(
    *,
    config: AnalysisConfig,
    on_record_rows: Callable[[Sequence[Mapping[str, Any]]], None] | None = None,
    logger: logging.Logger | None = None,
) -> AnalysisSummary:
    """Score every prediction referenced by ``config`` and emit record rows.

    Each resolved prediction file is read, mapped to a dataset case and passed
    to :func:`evaluate_trial` with the cached prediction supplied as
    ``prediction_override``. The pass is fail-soft: unreadable files,
    predictions without a matching case and evaluations that raise are
    counted and skipped, never propagated.

    Args:
        config: Loaded analysis definition.
        on_record_rows: Optional sink called once per prediction with the
            list of record rows produced for it (only when there are rows).
        logger: Logger to use; defaults to ``geodispbench3d.analysis``.

    Returns:
        An :class:`AnalysisSummary` holding the counters of the pass.
    """

    log = logger or logging.getLogger("geodispbench3d.analysis")
    summary = AnalysisSummary()
    pass_id = config.pass_id or _utcnow_compact()

    case_index: Mapping[str, CaseSpec] = {c.name: c for c in config.dataset.cases}
    registry = MetricRegistry()

    # One PassDiagnostics for the pass: corrupt prediction reads and per-
    # prediction evaluation skips record here, surfaced on AnalysisSummary (F-08).
    diag = PassDiagnostics()

    paths = config.predictions.resolve_all()
    log.info("analyze: %d prediction file(s) to score (pass_id=%s)", len(paths), pass_id)

    for path in paths:
        summary.total += 1
        payload = read_prediction(path, on_non_fatal=lambda _exc: diag.add("prediction_read"))
        if payload is None:
            log.warning("analyze: cannot read %s, skipping", path)
            summary.skipped_unreadable += 1
            continue

        prediction = payload.get("prediction")
        # A malformed (non-mapping) provenance block is treated as absent so a
        # single odd cache file cannot abort the whole fail-soft pass.
        provenance = payload.get("provenance")
        if not isinstance(provenance, Mapping):
            provenance = {}
        case = _resolve_case(provenance, case_index, config.dataset)
        if case is None:
            log.warning(
                "analyze: cannot map prediction %s to a dataset case (provenance=%s); skipping",
                path,
                provenance.get("dataset"),
            )
            summary.skipped_no_case += 1
            continue

        record_extras = {
            "tool_id": _provenance_id(provenance, "tool"),
            "dataset_id": _provenance_id(provenance, "dataset") or config.dataset.id,
            "case": case.name,
            "trial_index": _provenance_run_hash(provenance, path),
            "mode": "analyze",
            "pass_id": pass_id,
            "prediction_path": str(path),
        }

        # Only a non-empty string is accepted as run_dir (the same rule
        # _provenance_run_hash applies); anything else falls back to the
        # prediction file's directory instead of raising inside Path().
        recorded_run_dir = provenance.get("run_dir")
        if isinstance(recorded_run_dir, str) and recorded_run_dir:
            run_dir = Path(recorded_run_dir)
        else:
            run_dir = Path(path.parent)

        trial_result = TrialResult(
            outputs=TrialOutputs(run_dir=run_dir),
            scalar_metrics={},
            duration_seconds=0.0,
            success=True,
        )

        try:
            evaluation = evaluate_trial(
                trial_result=trial_result,
                parameters={},
                case=case,
                metrics=config.metrics,
                registry=registry,
                output_parser=None,
                output_parser_options=None,
                prediction_override=prediction,
                trial_index=None,
                record_extras=record_extras,
                logger=log,
            )
        except Exception:
            # Plugin/user callable boundary: evaluate_trial runs arbitrary
            # metric code, so a closed exception set is inapplicable. Stay broad
            # so one prediction's failure skips it instead of aborting the whole
            # analyze pass (fail-soft, F-08).
            log.exception("analyze: evaluate_trial raised for %s", path)
            diag.add("evaluation")
            continue

        diag.add("evaluation", evaluation.non_fatal_failures)

        if on_record_rows and evaluation.record_rows:
            on_record_rows(list(evaluation.record_rows))
            summary.rows_emitted += len(evaluation.record_rows)
        summary.succeeded += 1

    summary.non_fatal_failures = diag.non_fatal_failures
    summary.eval_failures = diag.by_kind.get("evaluation", 0)
    log.info(
        "analyze done: succeeded=%d total=%d unreadable=%d no_case=%d rows=%d "
        "non_fatal_failures=%d",
        summary.succeeded,
        summary.total,
        summary.skipped_unreadable,
        summary.skipped_no_case,
        summary.rows_emitted,
        summary.non_fatal_failures,
    )
    return summary
|
|
161
|
+
|
|
162
|
+
|
|
163
|
+
# ---------------------------------------------------------------------------
|
|
164
|
+
# Internals
|
|
165
|
+
# ---------------------------------------------------------------------------
|
|
166
|
+
|
|
167
|
+
|
|
168
|
+
def _resolve_case(
    provenance: Mapping[str, Any],
    case_index: Mapping[str, CaseSpec],
    dataset: DatasetSpec,
) -> CaseSpec | None:
    """Pick the dataset case a prediction should be scored against.

    The case named in ``provenance["dataset"]["case"]`` wins when it exists in
    ``case_index``. Otherwise a dataset with exactly one case falls back to
    that case, and any other dataset yields ``None``.
    """
    recorded = provenance.get("dataset")
    recorded_name = recorded.get("case") if isinstance(recorded, Mapping) else None
    if isinstance(recorded_name, str) and recorded_name in case_index:
        return case_index[recorded_name]
    return dataset.cases[0] if len(dataset.cases) == 1 else None
|
|
181
|
+
|
|
182
|
+
|
|
183
|
+
def _provenance_id(provenance: Mapping[str, Any], key: str) -> str | None:
|
|
184
|
+
block = provenance.get(key)
|
|
185
|
+
if isinstance(block, Mapping):
|
|
186
|
+
value = block.get("id")
|
|
187
|
+
if isinstance(value, str):
|
|
188
|
+
return value
|
|
189
|
+
return None
|
|
190
|
+
|
|
191
|
+
|
|
192
|
+
def _provenance_run_hash(provenance: Mapping[str, Any], path: Path) -> str:
    """Best-effort identifier for the row's `trial_index` column.

    Uses the final component of ``provenance["run_dir"]`` when that value is
    a non-empty string; otherwise falls back to the prediction file's stem.
    """

    recorded = provenance.get("run_dir")
    if not (isinstance(recorded, str) and recorded):
        return path.stem
    return Path(recorded).name
|
|
199
|
+
|
|
200
|
+
|
|
201
|
+
def _utcnow_compact() -> str:
|
|
202
|
+
return datetime.now(UTC).strftime("analyze-%Y%m%dT%H%M%S")
|
|
203
|
+
|
|
204
|
+
|
|
205
|
+
__all__ = ["AnalysisSummary", "analyze"]
|