xrtm-eval 0.1.1__tar.gz → 0.2.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {xrtm_eval-0.1.1/src/xrtm_eval.egg-info → xrtm_eval-0.2.0}/PKG-INFO +37 -2
- {xrtm_eval-0.1.1 → xrtm_eval-0.2.0}/README.md +36 -1
- {xrtm_eval-0.1.1 → xrtm_eval-0.2.0}/pyproject.toml +1 -1
- xrtm_eval-0.2.0/src/xrtm/eval/core/__init__.py +42 -0
- {xrtm_eval-0.1.1 → xrtm_eval-0.2.0}/src/xrtm/eval/core/eval/definitions.py +3 -4
- xrtm_eval-0.2.0/src/xrtm/eval/core/schemas/__init__.py +24 -0
- xrtm_eval-0.2.0/src/xrtm/eval/core/schemas/forecast.py +59 -0
- {xrtm_eval-0.1.1 → xrtm_eval-0.2.0}/src/xrtm/eval/kit/eval/bias.py +13 -4
- {xrtm_eval-0.1.1 → xrtm_eval-0.2.0}/src/xrtm/eval/kit/eval/epistemic_evaluator.py +2 -0
- {xrtm_eval-0.1.1 → xrtm_eval-0.2.0}/src/xrtm/eval/kit/eval/intervention.py +9 -2
- {xrtm_eval-0.1.1 → xrtm_eval-0.2.0}/src/xrtm/eval/kit/eval/metrics.py +15 -12
- {xrtm_eval-0.1.1 → xrtm_eval-0.2.0}/src/xrtm/eval/kit/eval/resilience.py +14 -2
- {xrtm_eval-0.1.1 → xrtm_eval-0.2.0}/src/xrtm/eval/kit/eval/viz.py +10 -1
- xrtm_eval-0.2.0/src/xrtm/eval/providers/__init__.py +24 -0
- xrtm_eval-0.2.0/src/xrtm/eval/version.py +28 -0
- {xrtm_eval-0.1.1 → xrtm_eval-0.2.0/src/xrtm_eval.egg-info}/PKG-INFO +37 -2
- {xrtm_eval-0.1.1 → xrtm_eval-0.2.0}/src/xrtm_eval.egg-info/SOURCES.txt +5 -2
- xrtm_eval-0.2.0/tests/test_ece.py +68 -0
- {xrtm_eval-0.1.1 → xrtm_eval-0.2.0}/tests/test_metrics.py +3 -0
- xrtm_eval-0.1.1/src/xrtm/eval/core/__init__.py +0 -14
- xrtm_eval-0.1.1/src/xrtm/eval/schemas/__init__.py +0 -3
- xrtm_eval-0.1.1/src/xrtm/eval/schemas/forecast.py +0 -21
- {xrtm_eval-0.1.1 → xrtm_eval-0.2.0}/LICENSE +0 -0
- {xrtm_eval-0.1.1 → xrtm_eval-0.2.0}/setup.cfg +0 -0
- {xrtm_eval-0.1.1 → xrtm_eval-0.2.0}/src/xrtm/eval/__init__.py +0 -0
- {xrtm_eval-0.1.1 → xrtm_eval-0.2.0}/src/xrtm/eval/core/epistemics.py +0 -0
- {xrtm_eval-0.1.1 → xrtm_eval-0.2.0}/src/xrtm/eval/core/eval/__init__.py +0 -0
- {xrtm_eval-0.1.1 → xrtm_eval-0.2.0}/src/xrtm/eval/core/eval/aggregation.py +0 -0
- {xrtm_eval-0.1.1 → xrtm_eval-0.2.0}/src/xrtm/eval/core/eval/bayesian.py +0 -0
- {xrtm_eval-0.1.1 → xrtm_eval-0.2.0}/src/xrtm/eval/kit/eval/__init__.py +0 -0
- {xrtm_eval-0.1.1 → xrtm_eval-0.2.0}/src/xrtm/eval/kit/eval/analytics.py +0 -0
- {xrtm_eval-0.1.1 → xrtm_eval-0.2.0}/src/xrtm_eval.egg-info/dependency_links.txt +0 -0
- {xrtm_eval-0.1.1 → xrtm_eval-0.2.0}/src/xrtm_eval.egg-info/requires.txt +0 -0
- {xrtm_eval-0.1.1 → xrtm_eval-0.2.0}/src/xrtm_eval.egg-info/top_level.txt +0 -0
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: xrtm-eval
-Version: 0.1.1
+Version: 0.2.0
 Summary: The Judge/Scoring engine for XRTM.
 Author-email: XRTM Team <moy@xrtm.org>
 License: Apache-2.0

@@ -23,15 +23,27 @@ Dynamic: license-file
 
 [](https://opensource.org/licenses/Apache-2.0)
 [](https://www.python.org/downloads/)
+[](https://pypi.org/project/xrtm-eval/)
 
 **The Judge for XRTM.**
 
 `xrtm-eval` is the rigorous scoring engine used to grade probabilistic forecasts. It operates independently of the inference engine to ensure objective evaluation.
 
+## Part of the XRTM Ecosystem
+
+```
+Layer 4: xrtm-train     → (imports all)
+Layer 3: xrtm-forecast  → (imports eval, data)
+Layer 2: xrtm-eval      → (imports data)  ← YOU ARE HERE
+Layer 1: xrtm-data      → (zero dependencies)
+```
+
+`xrtm-eval` provides scoring metrics AND trust primitives used by the forecast engine.
+
 ## Installation
 
 ```bash
-
+pip install xrtm-eval
 ```
 
 ## Core Primitives

@@ -54,6 +66,29 @@ score = evaluator.score(prediction=0.7, ground_truth=1)
 ### 2. Expected Calibration Error (ECE)
 Use the `ExpectedCalibrationErrorEvaluator` to measure the gap between confidence and accuracy across bin buckets.
 
+### 3. Epistemic Trust Primitives (v0.1.1+)
+`xrtm-eval` now includes trust scoring infrastructure:
+
+```python
+from xrtm.eval.core.epistemics import IntegrityGuardian, SourceTrustRegistry
+
+registry = SourceTrustRegistry()
+guardian = IntegrityGuardian(registry)
+```
+
+## Project Structure
+
+```
+src/xrtm/eval/
+├── core/                  # Interfaces & Schemas
+│   ├── eval/              # Evaluator protocol, EvaluationResult
+│   ├── epistemics.py      # Trust primitives (SourceTrustRegistry)
+│   └── schemas/           # ForecastResolution
+├── kit/                   # Composable evaluator implementations
+│   └── eval/metrics.py    # BrierScoreEvaluator, ECE
+└── providers/             # External evaluation services (future)
+```
+
 ## Development
 
 Prerequisites:
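The PKG-INFO/README text above introduces `BrierScoreEvaluator` and `ExpectedCalibrationErrorEvaluator` separately. As a quick orientation, here is a sketch that combines the call patterns visible elsewhere in this diff (the `score(prediction=0.7, ground_truth=1)` hunk context, `tests/test_metrics.py`, and `tests/test_ece.py`). The `EvaluationResult` fields come from those tests, the `BrierScoreEvaluator` import path follows the Project Structure note (`kit/eval/metrics.py`), and the sample values are illustrative.

```python
from xrtm.eval.core.eval.definitions import EvaluationResult
from xrtm.eval.kit.eval.metrics import BrierScoreEvaluator, ExpectedCalibrationErrorEvaluator

# Brier score for a single probabilistic prediction against a binary outcome.
brier = BrierScoreEvaluator()
print(brier.score(prediction=0.7, ground_truth=1))  # (0.7 - 1.0)^2 = 0.09

# Expected Calibration Error over a batch of results.
results = [
    EvaluationResult(subject_id="q1", score=0, ground_truth=1, prediction=0.9, metadata={}),
    EvaluationResult(subject_id="q2", score=0, ground_truth=0, prediction=0.1, metadata={}),
]
ece, bins = ExpectedCalibrationErrorEvaluator(num_bins=10).compute_calibration_data(results)
print(ece)  # ≈ 0.1, per the worked example in tests/test_ece.py
```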
@@ -2,15 +2,27 @@
 
 [](https://opensource.org/licenses/Apache-2.0)
 [](https://www.python.org/downloads/)
+[](https://pypi.org/project/xrtm-eval/)
 
 **The Judge for XRTM.**
 
 `xrtm-eval` is the rigorous scoring engine used to grade probabilistic forecasts. It operates independently of the inference engine to ensure objective evaluation.
 
+## Part of the XRTM Ecosystem
+
+```
+Layer 4: xrtm-train     → (imports all)
+Layer 3: xrtm-forecast  → (imports eval, data)
+Layer 2: xrtm-eval      → (imports data)  ← YOU ARE HERE
+Layer 1: xrtm-data      → (zero dependencies)
+```
+
+`xrtm-eval` provides scoring metrics AND trust primitives used by the forecast engine.
+
 ## Installation
 
 ```bash
-
+pip install xrtm-eval
 ```
 
 ## Core Primitives

@@ -33,6 +45,29 @@ score = evaluator.score(prediction=0.7, ground_truth=1)
 ### 2. Expected Calibration Error (ECE)
 Use the `ExpectedCalibrationErrorEvaluator` to measure the gap between confidence and accuracy across bin buckets.
 
+### 3. Epistemic Trust Primitives (v0.1.1+)
+`xrtm-eval` now includes trust scoring infrastructure:
+
+```python
+from xrtm.eval.core.epistemics import IntegrityGuardian, SourceTrustRegistry
+
+registry = SourceTrustRegistry()
+guardian = IntegrityGuardian(registry)
+```
+
+## Project Structure
+
+```
+src/xrtm/eval/
+├── core/                  # Interfaces & Schemas
+│   ├── eval/              # Evaluator protocol, EvaluationResult
+│   ├── epistemics.py      # Trust primitives (SourceTrustRegistry)
+│   └── schemas/           # ForecastResolution
+├── kit/                   # Composable evaluator implementations
+│   └── eval/metrics.py    # BrierScoreEvaluator, ECE
+└── providers/             # External evaluation services (future)
+```
+
 ## Development
 
 Prerequisites:
@@ -0,0 +1,42 @@
+# coding=utf-8
+# Copyright 2026 XRTM Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+r"""
+Core interfaces and domain-agnostic logic for xrtm-eval.
+
+This module exports evaluator protocols, epistemics utilities, and
+core schemas. MUST NOT import from kit/ or providers/.
+"""
+
+from xrtm.eval.core.epistemics import (
+    IntegrityGuardian,
+    SourceTrustEntry,
+    SourceTrustRegistry,
+)
+from xrtm.eval.core.eval import EvaluationReport, EvaluationResult, Evaluator
+from xrtm.eval.core.schemas import ForecastResolution
+
+__all__ = [
+    # Evaluator protocol
+    "Evaluator",
+    "EvaluationResult",
+    "EvaluationReport",
+    # Epistemics
+    "IntegrityGuardian",
+    "SourceTrustRegistry",
+    "SourceTrustEntry",
+    # Schemas
+    "ForecastResolution",
+]
@@ -30,11 +30,9 @@ class BrierDecomposition(BaseModel):
 
 
 class Evaluator(Protocol):
-    def score(self, prediction: Any, ground_truth: Any) -> float:
-        ...
+    def score(self, prediction: Any, ground_truth: Any) -> float: ...
 
-    def evaluate(self, prediction: Any, ground_truth: Any, subject_id: str) -> EvaluationResult:
-        ...
+    def evaluate(self, prediction: Any, ground_truth: Any, subject_id: str) -> EvaluationResult: ...
 
 
 class EvaluationReport(BaseModel):

@@ -55,6 +53,7 @@ class EvaluationReport(BaseModel):
     def to_pandas(self) -> Any:
         try:
             import pandas as pd
+
             return pd.DataFrame([r.model_dump() for r in self.results])
         except ImportError:
             raise ImportError("Pandas is required for to_pandas(). Install it with `pip install pandas`.")
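The `Evaluator` protocol above only declares `score()` and `evaluate()`. A hypothetical third-party evaluator that satisfies it could look like the sketch below; `AbsoluteErrorEvaluator` is not part of the package, and the `EvaluationResult` constructor fields are inferred from `tests/test_ece.py` later in this diff.

```python
from typing import Any

from xrtm.eval.core.eval.definitions import EvaluationResult, Evaluator


class AbsoluteErrorEvaluator:
    """Illustrative only; not part of xrtm-eval."""

    def score(self, prediction: Any, ground_truth: Any) -> float:
        # Absolute gap between a probability and a 0/1 outcome.
        return abs(float(prediction) - float(ground_truth))

    def evaluate(self, prediction: Any, ground_truth: Any, subject_id: str) -> EvaluationResult:
        # Field names mirror the EvaluationResult usage in tests/test_ece.py.
        return EvaluationResult(
            subject_id=subject_id,
            score=self.score(prediction, ground_truth),
            ground_truth=ground_truth,
            prediction=prediction,
            metadata={"type": "absolute_error"},
        )


evaluator: Evaluator = AbsoluteErrorEvaluator()  # satisfies the Protocol structurally
print(evaluator.score(prediction=0.7, ground_truth=1))  # ≈ 0.3
```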
@@ -0,0 +1,24 @@
+# coding=utf-8
+# Copyright 2026 XRTM Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+r"""
+Core schemas for xrtm-eval.
+
+This module exports evaluation-related Pydantic models.
+"""
+
+from xrtm.eval.core.schemas.forecast import ForecastResolution
+
+__all__ = ["ForecastResolution"]
@@ -0,0 +1,59 @@
+# coding=utf-8
+# Copyright 2026 XRTM Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+r"""
+Forecast resolution schema for evaluation.
+
+This module defines the ground-truth outcome schema used to evaluate
+forecast accuracy.
+
+Example:
+    >>> from xrtm.eval.core.schemas import ForecastResolution
+    >>> resolution = ForecastResolution(
+    ...     question_id="q1",
+    ...     outcome="yes",
+    ... )
+"""
+
+from datetime import datetime, timezone
+from typing import Any, Dict
+
+from pydantic import BaseModel, Field
+
+
+class ForecastResolution(BaseModel):
+    r"""
+    The ground-truth outcome used to evaluate forecast accuracy.
+
+    Attributes:
+        question_id: Reference to the forecasted question.
+        outcome: The final winning outcome or value.
+        resolved_at: When the outcome was determined.
+        metadata: Source info, verification method, etc.
+
+    Example:
+        >>> resolution = ForecastResolution(question_id="q1", outcome="yes")
+    """
+
+    question_id: str = Field(..., description="Reference to the forecasted question")
+    outcome: str = Field(..., description="The final winning outcome or value")
+    resolved_at: datetime = Field(
+        default_factory=lambda: datetime.now(timezone.utc),
+        description="When the outcome was determined",
+    )
+    metadata: Dict[str, Any] = Field(default_factory=dict, description="Source info, verification method")
+
+
+__all__ = ["ForecastResolution"]
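A minimal usage sketch for the new `ForecastResolution` schema above; the metadata keys are illustrative, and `model_dump()` is the Pydantic v2 serializer already used by `EvaluationReport.to_pandas()` in this diff.

```python
from xrtm.eval.core.schemas import ForecastResolution

resolution = ForecastResolution(
    question_id="q1",
    outcome="yes",
    metadata={"source": "official-results", "verified": True},  # illustrative metadata keys
)
# resolved_at defaults to the current UTC time via the default_factory.
print(resolution.model_dump())
```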
@@ -8,10 +8,18 @@ from xrtm.eval.core.eval.definitions import EvaluationResult, Evaluator
 
 class BiasInterceptor(Evaluator):
     COGNITIVE_BIASES = [
-        "Base-Rate Neglect",
-        "
-        "
-        "
+        "Base-Rate Neglect",
+        "Overconfidence",
+        "Availability Heuristic",
+        "Confirmation Bias",
+        "Anchoring Bias",
+        "Sunk Cost Fallacy",
+        "Hindsight Bias",
+        "Optimism Bias",
+        "Pessimism Bias",
+        "Status Quo Bias",
+        "Framing Effect",
+        "Recency Bias",
     ]
 
     def __init__(self, model: Any):

@@ -46,4 +54,5 @@ class BiasInterceptor(Evaluator):
             metadata={"type": "bias_audit"},
         )
 
+
 __all__ = ["BiasInterceptor"]
@@ -12,6 +12,7 @@ from xrtm.eval.core.epistemics import IntegrityGuardian, SourceTrustRegistry
 
 logger = logging.getLogger(__name__)
 
+
 class EpistemicEvaluator:
     def __init__(self, registry: Optional[SourceTrustRegistry] = None):
         self.registry = registry or SourceTrustRegistry()

@@ -28,4 +29,5 @@ class EpistemicEvaluator:
            "integrity_level": "HIGH" if avg_trust > 0.8 else "MEDIUM" if avg_trust >= 0.5 else "LOW",
        }
 
+
 __all__ = ["EpistemicEvaluator"]
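For clarity, the `integrity_level` thresholds visible in the `EpistemicEvaluator` hunk above amount to the following standalone rule (a restatement only; the rest of the evaluator's report format is not shown in this diff).

```python
# Restates the thresholding seen in the hunk: HIGH above 0.8, MEDIUM from 0.5 to 0.8, else LOW.
def integrity_level(avg_trust: float) -> str:
    if avg_trust > 0.8:
        return "HIGH"
    if avg_trust >= 0.5:
        return "MEDIUM"
    return "LOW"


assert integrity_level(0.9) == "HIGH"
assert integrity_level(0.5) == "MEDIUM"
assert integrity_level(0.2) == "LOW"
```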
@@ -8,6 +8,7 @@ from xrtm.data.schemas.forecast import ForecastOutput
 
 logger = logging.getLogger(__name__)
 
+
 class InterventionEngine:
     @staticmethod
     def apply_intervention(output: ForecastOutput, node_id: str, new_probability: float) -> ForecastOutput:

@@ -29,12 +30,18 @@ class InterventionEngine:
             weight = data.get("weight", 1.0)
             target_node = next(n for n in new_output.logical_trace if n.node_id == target_id)
             old_target_prob = target_node.probability or 0.5
-            normalized_delta = (
+            normalized_delta = (
+                current_node.probability - (dg.nodes[current_id].get("probability") or 0.5)
+            ) * weight
             target_node.probability = max(0.0, min(1.0, old_target_prob + normalized_delta))
         leaf_nodes = [n for n in dg.nodes() if dg.out_degree(n) == 0]
         if leaf_nodes:
-            avg_leaf_prob = sum(
+            avg_leaf_prob = sum(
+                next(n.probability for n in new_output.logical_trace if n.node_id == leaf_id) or 0.0
+                for leaf_id in leaf_nodes
+            ) / len(leaf_nodes)
             new_output.confidence = avg_leaf_prob
         return new_output
 
+
 __all__ = ["InterventionEngine"]
@@ -73,15 +73,22 @@ class ExpectedCalibrationErrorEvaluator(Evaluator):
 
     def compute_calibration_data(self, results: List[EvaluationResult]) -> Tuple[float, List[ReliabilityBin]]:
         bin_size = 1.0 / self.num_bins
-        bins: List[List[
+        bins: List[List[Tuple[float, float]]] = [[] for _ in range(self.num_bins)]
 
         for res in results:
             try:
-
+                raw_conf = float(res.prediction)
+                conf = min(max(raw_conf, 0.0), 1.0)
                 idx = int(conf / bin_size)
                 if idx == self.num_bins:
                     idx -= 1
-
+
+                gt = res.ground_truth
+                normalized_gt = (
+                    1.0 if (gt.lower() in ["yes", "1", "true", "won", "pass"] if isinstance(gt, str) else gt) else 0.0
+                )
+
+                bins[idx].append((raw_conf, normalized_gt))
             except (ValueError, TypeError):
                 continue
 

@@ -94,17 +101,13 @@ class ExpectedCalibrationErrorEvaluator(Evaluator):
             bin_center = (i + 0.5) * bin_size
 
             if n_b > 0:
-                mean_conf = sum(
-
-                for x in bin_items:
-                    gt = x.ground_truth
-                    normalized_gt = 1.0 if (gt.lower() in ["yes", "1", "true", "won", "pass"] if isinstance(gt, str) else gt) else 0.0
-                    accuracies.append(normalized_gt)
-
-                mean_acc = sum(accuracies) / n_b
+                mean_conf = sum(x[0] for x in bin_items) / n_b
+                mean_acc = sum(x[1] for x in bin_items) / n_b
                 ece += (n_b / total_count) * abs(mean_acc - mean_conf)
                 reliability_data.append(
-                    ReliabilityBin(
+                    ReliabilityBin(
+                        bin_center=bin_center, mean_prediction=mean_conf, mean_ground_truth=mean_acc, count=n_b
+                    )
                 )
             else:
                 reliability_data.append(
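The refactored `compute_calibration_data` above bins each (prediction, normalized ground truth) pair by its clamped confidence and weights the per-bin confidence/accuracy gap by bin population. A standalone restatement of that computation, assuming equal-width bins as in the hunk, is sketched below.

```python
# Standalone restatement of the binning logic shown above:
# ECE = sum over bins of (n_b / N) * |mean_accuracy_b - mean_confidence_b|.
from typing import List, Tuple


def simple_ece(pairs: List[Tuple[float, float]], num_bins: int = 10) -> float:
    if not pairs:
        return 0.0
    bins: List[List[Tuple[float, float]]] = [[] for _ in range(num_bins)]
    for conf, outcome in pairs:
        # Clamp the confidence into [0, 1], then pick an equal-width bin.
        idx = min(int(min(max(conf, 0.0), 1.0) * num_bins), num_bins - 1)
        bins[idx].append((conf, outcome))
    total = sum(len(b) for b in bins)
    ece = 0.0
    for b in bins:
        if b:
            mean_conf = sum(c for c, _ in b) / len(b)
            mean_acc = sum(o for _, o in b) / len(b)
            ece += (len(b) / total) * abs(mean_acc - mean_conf)
    return ece


# Matches the worked example in tests/test_ece.py later in this diff.
print(simple_ece([(0.9, 1.0), (0.1, 0.0)]))  # ≈ 0.1
```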
@@ -11,22 +11,34 @@ class FakeNewsItem(BaseModel):
     trust_score: float = 0.1
     intended_bias: str = Field(..., description="e.g. 'Bearish', 'Bullish'")
 
+
 class GullibilityReport(BaseModel):
     initial_confidence: float
     post_injection_confidence: float
     delta: float
     resilience_score: float
 
+
 class AdversarialInjector:
     def __init__(self, intensity: float = 0.5):
         self.intensity = intensity
+
     def generate_attack(self, subject: str, direction: str) -> FakeNewsItem:
         if direction.lower() == "bearish":
-            return FakeNewsItem(
+            return FakeNewsItem(
+                headline=f"BREAKING: {subject} CEO Under Investigation", content="...", intended_bias="Bearish"
+            )
         return FakeNewsItem(headline=f"{subject} Secures Massive Contract", content="...", intended_bias="Bullish")
+
     def measure_resilience(self, initial_confidence: float, post_injection_confidence: float) -> GullibilityReport:
         delta = post_injection_confidence - initial_confidence
         score = max(0.0, 1.0 - abs(delta))
-        return GullibilityReport(
+        return GullibilityReport(
+            initial_confidence=initial_confidence,
+            post_injection_confidence=post_injection_confidence,
+            delta=delta,
+            resilience_score=score,
+        )
+
 
 __all__ = ["FakeNewsItem", "GullibilityReport", "AdversarialInjector"]
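Usage sketch for the reformatted `AdversarialInjector` above, using only the constructor and methods shown in the hunk; the subject string and confidence values are illustrative.

```python
from xrtm.eval.kit.eval.resilience import AdversarialInjector

injector = AdversarialInjector(intensity=0.5)
attack = injector.generate_attack(subject="ACME Corp", direction="bearish")  # illustrative subject
report = injector.measure_resilience(initial_confidence=0.70, post_injection_confidence=0.55)
print(report.delta)             # ≈ -0.15
print(report.resilience_score)  # 1 - |delta| ≈ 0.85
```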
@@ -9,12 +9,14 @@ import numpy as np
 
 logger = logging.getLogger(__name__)
 
+
 @dataclass
 class ReliabilityCurveData:
     prob_pred: np.ndarray
     prob_true: np.ndarray
     ece: float
 
+
 def compute_calibration_curve(y_true: List[int], y_prob: List[float], n_bins: int = 10) -> ReliabilityCurveData:
     y_true_arr = np.array(y_true)
     y_prob_arr = np.array(y_prob)

@@ -39,7 +41,10 @@ def compute_calibration_curve(y_true: List[int], y_prob: List[float], n_bins: in
         ece += (count / total_samples) * np.abs(fraction_true - mean_prob)
     return ReliabilityCurveData(prob_pred=np.array(bin_pred), prob_true=np.array(bin_true), ece=ece)
 
-
+
+def plot_reliability_diagram(
+    data: ReliabilityCurveData, title: str = "Reliability Diagram", save_path: Optional[str] = None
+) -> Any:
     try:
         import matplotlib.pyplot as plt
         import seaborn as sns

@@ -61,13 +66,17 @@ def plot_reliability_diagram(data: ReliabilityCurveData, title: str = "Reliabili
         plt.savefig(save_path)
     return fig
 
+
 class ReliabilityDiagram:
     def __init__(self, n_bins: int = 10):
         self.n_bins = n_bins
+
     def compute(self, y_true: List[int], y_prob: List[float]) -> ReliabilityCurveData:
         return compute_calibration_curve(y_true, y_prob, self.n_bins)
+
     def plot(self, y_true: List[int], y_prob: List[float], save_path: Optional[str] = None) -> Any:
         data = self.compute(y_true, y_prob)
         return plot_reliability_diagram(data, save_path=save_path)
 
+
 __all__ = ["ReliabilityCurveData", "compute_calibration_curve", "plot_reliability_diagram", "ReliabilityDiagram"]
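Usage sketch for the calibration-curve helpers reorganized above; the input arrays are illustrative, and `plot()` depends on the optional matplotlib/seaborn imports guarded by the try block in `plot_reliability_diagram`.

```python
from xrtm.eval.kit.eval.viz import ReliabilityDiagram, compute_calibration_curve

y_true = [1, 0, 1, 1, 0, 0, 1, 0]                   # binary outcomes (illustrative data)
y_prob = [0.9, 0.2, 0.8, 0.7, 0.3, 0.1, 0.6, 0.4]   # predicted probabilities

curve = compute_calibration_curve(y_true, y_prob, n_bins=5)
print(curve.ece, curve.prob_pred, curve.prob_true)

diagram = ReliabilityDiagram(n_bins=5)
# plot() requires the optional matplotlib/seaborn dependencies seen in the try-import above.
fig = diagram.plot(y_true, y_prob, save_path="reliability.png")
```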
@@ -0,0 +1,24 @@
+# coding=utf-8
+# Copyright 2026 XRTM Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+r"""
+External providers for xrtm-eval.
+
+This module provides adapters for external evaluation services.
+Currently empty - will be populated with remote judges, LLM-as-judge
+integrations, etc.
+"""
+
+__all__: list[str] = []
@@ -0,0 +1,28 @@
+# coding=utf-8
+# Copyright 2026 XRTM Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+r"""
+Version information for xrtm-eval.
+
+This module provides the single source of truth for the package version.
+"""
+
+__all__ = ["__version__", "__author__", "__contact__", "__license__", "__copyright__"]
+
+__version__ = "0.2.0"
+__author__ = "XRTM Team"
+__contact__ = "moy@xrtm.org"
+__license__ = "Apache-2.0"
+__copyright__ = "Copyright 2026 XRTM Team"
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: xrtm-eval
-Version: 0.1.1
+Version: 0.2.0
 Summary: The Judge/Scoring engine for XRTM.
 Author-email: XRTM Team <moy@xrtm.org>
 License: Apache-2.0

@@ -23,15 +23,27 @@ Dynamic: license-file
 
 [](https://opensource.org/licenses/Apache-2.0)
 [](https://www.python.org/downloads/)
+[](https://pypi.org/project/xrtm-eval/)
 
 **The Judge for XRTM.**
 
 `xrtm-eval` is the rigorous scoring engine used to grade probabilistic forecasts. It operates independently of the inference engine to ensure objective evaluation.
 
+## Part of the XRTM Ecosystem
+
+```
+Layer 4: xrtm-train     → (imports all)
+Layer 3: xrtm-forecast  → (imports eval, data)
+Layer 2: xrtm-eval      → (imports data)  ← YOU ARE HERE
+Layer 1: xrtm-data      → (zero dependencies)
+```
+
+`xrtm-eval` provides scoring metrics AND trust primitives used by the forecast engine.
+
 ## Installation
 
 ```bash
-
+pip install xrtm-eval
 ```
 
 ## Core Primitives

@@ -54,6 +66,29 @@ score = evaluator.score(prediction=0.7, ground_truth=1)
 ### 2. Expected Calibration Error (ECE)
 Use the `ExpectedCalibrationErrorEvaluator` to measure the gap between confidence and accuracy across bin buckets.
 
+### 3. Epistemic Trust Primitives (v0.1.1+)
+`xrtm-eval` now includes trust scoring infrastructure:
+
+```python
+from xrtm.eval.core.epistemics import IntegrityGuardian, SourceTrustRegistry
+
+registry = SourceTrustRegistry()
+guardian = IntegrityGuardian(registry)
+```
+
+## Project Structure
+
+```
+src/xrtm/eval/
+├── core/                  # Interfaces & Schemas
+│   ├── eval/              # Evaluator protocol, EvaluationResult
+│   ├── epistemics.py      # Trust primitives (SourceTrustRegistry)
+│   └── schemas/           # ForecastResolution
+├── kit/                   # Composable evaluator implementations
+│   └── eval/metrics.py    # BrierScoreEvaluator, ECE
+└── providers/             # External evaluation services (future)
+```
+
 ## Development
 
 Prerequisites:
@@ -2,12 +2,15 @@ LICENSE
 README.md
 pyproject.toml
 src/xrtm/eval/__init__.py
+src/xrtm/eval/version.py
 src/xrtm/eval/core/__init__.py
 src/xrtm/eval/core/epistemics.py
 src/xrtm/eval/core/eval/__init__.py
 src/xrtm/eval/core/eval/aggregation.py
 src/xrtm/eval/core/eval/bayesian.py
 src/xrtm/eval/core/eval/definitions.py
+src/xrtm/eval/core/schemas/__init__.py
+src/xrtm/eval/core/schemas/forecast.py
 src/xrtm/eval/kit/eval/__init__.py
 src/xrtm/eval/kit/eval/analytics.py
 src/xrtm/eval/kit/eval/bias.py

@@ -16,11 +19,11 @@ src/xrtm/eval/kit/eval/intervention.py
 src/xrtm/eval/kit/eval/metrics.py
 src/xrtm/eval/kit/eval/resilience.py
 src/xrtm/eval/kit/eval/viz.py
-src/xrtm/eval/
-src/xrtm/eval/schemas/forecast.py
+src/xrtm/eval/providers/__init__.py
 src/xrtm_eval.egg-info/PKG-INFO
 src/xrtm_eval.egg-info/SOURCES.txt
 src/xrtm_eval.egg-info/dependency_links.txt
 src/xrtm_eval.egg-info/requires.txt
 src/xrtm_eval.egg-info/top_level.txt
+tests/test_ece.py
 tests/test_metrics.py
@@ -0,0 +1,68 @@
+from xrtm.eval.core.eval.definitions import EvaluationResult
+from xrtm.eval.kit.eval.metrics import ExpectedCalibrationErrorEvaluator
+
+
+def test_ece_basic():
+    evaluator = ExpectedCalibrationErrorEvaluator(num_bins=10)
+    results = [
+        EvaluationResult(subject_id="1", score=0, ground_truth=1, prediction=0.9, metadata={}),  # Bin 9
+        EvaluationResult(subject_id="2", score=0, ground_truth=0, prediction=0.1, metadata={}),  # Bin 1
+    ]
+    ece, bins = evaluator.compute_calibration_data(results)
+    # Bin 9: 1 item, pred 0.9, gt 1. acc 1. mean_conf 0.9. abs(1 - 0.9) = 0.1
+    # Bin 1: 1 item, pred 0.1, gt 0. acc 0. mean_conf 0.1. abs(0 - 0.1) = 0.1
+    # ECE = (1/2)*0.1 + (1/2)*0.1 = 0.1
+    assert abs(ece - 0.1) < 1e-6
+
+
+def test_ece_mixed_types():
+    evaluator = ExpectedCalibrationErrorEvaluator(num_bins=2)
+    results = [
+        EvaluationResult(subject_id="1", score=0, ground_truth="yes", prediction=0.8, metadata={}),
+        EvaluationResult(subject_id="2", score=0, ground_truth="no", prediction="0.2", metadata={}),
+        EvaluationResult(subject_id="3", score=0, ground_truth=True, prediction=0.9, metadata={}),
+        EvaluationResult(subject_id="4", score=0, ground_truth=False, prediction=0.1, metadata={}),
+    ]
+    # Bin 0 (0-0.5): Items 2 (0.2), 4 (0.1).
+    # Item 2: gt "no" -> 0.0. pred 0.2.
+    # Item 4: gt False -> 0.0. pred 0.1.
+    # Bin 0 mean_conf = (0.2 + 0.1)/2 = 0.15. mean_acc = 0.
+    # Bin 1 (0.5-1.0): Items 1 (0.8), 3 (0.9).
+    # Item 1: gt "yes" -> 1.0. pred 0.8.
+    # Item 3: gt True -> 1.0. pred 0.9.
+    # Bin 1 mean_conf = (0.8 + 0.9)/2 = 0.85. mean_acc = 1.0.
+
+    # ECE = (2/4)*abs(0 - 0.15) + (2/4)*abs(1 - 0.85) = 0.5 * 0.15 + 0.5 * 0.15 = 0.075 + 0.075 = 0.15
+    ece, bins = evaluator.compute_calibration_data(results)
+    assert abs(ece - 0.15) < 1e-6
+
+
+def test_ece_out_of_bounds():
+    evaluator = ExpectedCalibrationErrorEvaluator(num_bins=10)
+    results = [
+        EvaluationResult(subject_id="1", score=0, ground_truth=1, prediction=1.5, metadata={}),
+        EvaluationResult(subject_id="2", score=0, ground_truth=0, prediction=-0.5, metadata={}),
+    ]
+    # Prediction 1.5 -> Clamped to 1.0 -> Bin 9 (last bin)
+    # Prediction -0.5 -> Clamped to 0.0 -> Bin 0
+
+    # Bin 9: 1 item. pred 1.5. gt 1. mean_conf 1.5. mean_acc 1. abs(1 - 1.5) = 0.5
+    # Bin 0: 1 item. pred -0.5. gt 0. mean_conf -0.5. mean_acc 0. abs(0 - -0.5) = 0.5
+
+    # ECE = 0.5 * 0.5 + 0.5 * 0.5 = 0.5
+
+    ece, bins = evaluator.compute_calibration_data(results)
+    assert abs(ece - 0.5) < 1e-6
+
+    # Check stored bins for correct values
+    # The last bin should have mean_prediction 1.5
+    assert abs(bins[9].mean_prediction - 1.5) < 1e-6
+    # The first bin should have mean_prediction -0.5
+    assert abs(bins[0].mean_prediction + 0.5) < 1e-6
+
+
+if __name__ == "__main__":
+    test_ece_basic()
+    test_ece_mixed_types()
+    test_ece_out_of_bounds()
+    print("All tests passed!")
@@ -25,6 +25,7 @@ def test_brier_score_perfect_accurate():
     score = evaluator.score(prediction=0.0, ground_truth=0)
     assert score == 0.0
 
+
 def test_brier_score_worst_case():
     """Verify Brier score is 1.0 for completely wrong prediction."""
     evaluator = BrierScoreEvaluator()

@@ -34,12 +35,14 @@ def test_brier_score_worst_case():
     score = evaluator.score(prediction=0.0, ground_truth=1)
     assert score == 1.0
 
+
 def test_brier_score_uncertainty():
     """Verify Brier score for 0.5 prediction."""
     evaluator = BrierScoreEvaluator()
     score = evaluator.score(prediction=0.5, ground_truth=1)
     assert score == 0.25  # (0.5 - 1.0)^2 = 0.25
 
+
 def test_string_ground_truth_handling():
     """Verify string handling (Resolution logic)."""
     evaluator = BrierScoreEvaluator()
@@ -1,14 +0,0 @@
-# coding=utf-8
-# Copyright 2026 XRTM Team. All rights reserved.
-
-from .epistemics import IntegrityGuardian, SourceTrustEntry, SourceTrustRegistry
-from .eval import EvaluationReport, EvaluationResult, Evaluator
-
-__all__ = [
-    "Evaluator",
-    "EvaluationResult",
-    "EvaluationReport",
-    "IntegrityGuardian",
-    "SourceTrustRegistry",
-    "SourceTrustEntry",
-]
@@ -1,21 +0,0 @@
-# coding=utf-8
-# Copyright 2026 XRTM Team. All rights reserved.
-
-from datetime import datetime, timezone
-from typing import Any, Dict
-
-from pydantic import BaseModel, Field
-
-
-class ForecastResolution(BaseModel):
-    r"""
-    The ground-truth outcome used to evaluate forecast accuracy.
-    """
-
-    question_id: str
-    outcome: str = Field(..., description="The final winning outcome or value")
-    resolved_at: datetime = Field(default_factory=lambda: datetime.now(timezone.utc))
-    metadata: Dict[str, Any] = Field(default_factory=dict, description="Source info, verification method")
-
-
-__all__ = ["ForecastResolution"]