xrtm-eval 0.1.1__tar.gz → 0.2.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (34)
  1. {xrtm_eval-0.1.1/src/xrtm_eval.egg-info → xrtm_eval-0.2.0}/PKG-INFO +37 -2
  2. {xrtm_eval-0.1.1 → xrtm_eval-0.2.0}/README.md +36 -1
  3. {xrtm_eval-0.1.1 → xrtm_eval-0.2.0}/pyproject.toml +1 -1
  4. xrtm_eval-0.2.0/src/xrtm/eval/core/__init__.py +42 -0
  5. {xrtm_eval-0.1.1 → xrtm_eval-0.2.0}/src/xrtm/eval/core/eval/definitions.py +3 -4
  6. xrtm_eval-0.2.0/src/xrtm/eval/core/schemas/__init__.py +24 -0
  7. xrtm_eval-0.2.0/src/xrtm/eval/core/schemas/forecast.py +59 -0
  8. {xrtm_eval-0.1.1 → xrtm_eval-0.2.0}/src/xrtm/eval/kit/eval/bias.py +13 -4
  9. {xrtm_eval-0.1.1 → xrtm_eval-0.2.0}/src/xrtm/eval/kit/eval/epistemic_evaluator.py +2 -0
  10. {xrtm_eval-0.1.1 → xrtm_eval-0.2.0}/src/xrtm/eval/kit/eval/intervention.py +9 -2
  11. {xrtm_eval-0.1.1 → xrtm_eval-0.2.0}/src/xrtm/eval/kit/eval/metrics.py +15 -12
  12. {xrtm_eval-0.1.1 → xrtm_eval-0.2.0}/src/xrtm/eval/kit/eval/resilience.py +14 -2
  13. {xrtm_eval-0.1.1 → xrtm_eval-0.2.0}/src/xrtm/eval/kit/eval/viz.py +10 -1
  14. xrtm_eval-0.2.0/src/xrtm/eval/providers/__init__.py +24 -0
  15. xrtm_eval-0.2.0/src/xrtm/eval/version.py +28 -0
  16. {xrtm_eval-0.1.1 → xrtm_eval-0.2.0/src/xrtm_eval.egg-info}/PKG-INFO +37 -2
  17. {xrtm_eval-0.1.1 → xrtm_eval-0.2.0}/src/xrtm_eval.egg-info/SOURCES.txt +5 -2
  18. xrtm_eval-0.2.0/tests/test_ece.py +68 -0
  19. {xrtm_eval-0.1.1 → xrtm_eval-0.2.0}/tests/test_metrics.py +3 -0
  20. xrtm_eval-0.1.1/src/xrtm/eval/core/__init__.py +0 -14
  21. xrtm_eval-0.1.1/src/xrtm/eval/schemas/__init__.py +0 -3
  22. xrtm_eval-0.1.1/src/xrtm/eval/schemas/forecast.py +0 -21
  23. {xrtm_eval-0.1.1 → xrtm_eval-0.2.0}/LICENSE +0 -0
  24. {xrtm_eval-0.1.1 → xrtm_eval-0.2.0}/setup.cfg +0 -0
  25. {xrtm_eval-0.1.1 → xrtm_eval-0.2.0}/src/xrtm/eval/__init__.py +0 -0
  26. {xrtm_eval-0.1.1 → xrtm_eval-0.2.0}/src/xrtm/eval/core/epistemics.py +0 -0
  27. {xrtm_eval-0.1.1 → xrtm_eval-0.2.0}/src/xrtm/eval/core/eval/__init__.py +0 -0
  28. {xrtm_eval-0.1.1 → xrtm_eval-0.2.0}/src/xrtm/eval/core/eval/aggregation.py +0 -0
  29. {xrtm_eval-0.1.1 → xrtm_eval-0.2.0}/src/xrtm/eval/core/eval/bayesian.py +0 -0
  30. {xrtm_eval-0.1.1 → xrtm_eval-0.2.0}/src/xrtm/eval/kit/eval/__init__.py +0 -0
  31. {xrtm_eval-0.1.1 → xrtm_eval-0.2.0}/src/xrtm/eval/kit/eval/analytics.py +0 -0
  32. {xrtm_eval-0.1.1 → xrtm_eval-0.2.0}/src/xrtm_eval.egg-info/dependency_links.txt +0 -0
  33. {xrtm_eval-0.1.1 → xrtm_eval-0.2.0}/src/xrtm_eval.egg-info/requires.txt +0 -0
  34. {xrtm_eval-0.1.1 → xrtm_eval-0.2.0}/src/xrtm_eval.egg-info/top_level.txt +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: xrtm-eval
3
- Version: 0.1.1
3
+ Version: 0.2.0
4
4
  Summary: The Judge/Scoring engine for XRTM.
5
5
  Author-email: XRTM Team <moy@xrtm.org>
6
6
  License: Apache-2.0
@@ -23,15 +23,27 @@ Dynamic: license-file
23
23
 
24
24
  [![License](https://img.shields.io/badge/License-Apache%202.0-blue.svg)](https://opensource.org/licenses/Apache-2.0)
25
25
  [![Python](https://img.shields.io/badge/python-3.11+-blue.svg)](https://www.python.org/downloads/)
26
+ [![PyPI](https://img.shields.io/pypi/v/xrtm-eval.svg)](https://pypi.org/project/xrtm-eval/)
26
27
 
27
28
  **The Judge for XRTM.**
28
29
 
29
30
  `xrtm-eval` is the rigorous scoring engine used to grade probabilistic forecasts. It operates independently of the inference engine to ensure objective evaluation.
30
31
 
32
+ ## Part of the XRTM Ecosystem
33
+
34
+ ```
35
+ Layer 4: xrtm-train → (imports all)
36
+ Layer 3: xrtm-forecast → (imports eval, data)
37
+ Layer 2: xrtm-eval → (imports data) ← YOU ARE HERE
38
+ Layer 1: xrtm-data → (zero dependencies)
39
+ ```
40
+
41
+ `xrtm-eval` provides scoring metrics AND trust primitives used by the forecast engine.
42
+
31
43
  ## Installation
32
44
 
33
45
  ```bash
34
- uv pip install xrtm-eval
46
+ pip install xrtm-eval
35
47
  ```
36
48
 
37
49
  ## Core Primitives
@@ -54,6 +66,29 @@ score = evaluator.score(prediction=0.7, ground_truth=1)
54
66
  ### 2. Expected Calibration Error (ECE)
55
67
  Use the `ExpectedCalibrationErrorEvaluator` to measure the gap between confidence and accuracy across bin buckets.
56
68
 
69
+ ### 3. Epistemic Trust Primitives (v0.1.1+)
70
+ `xrtm-eval` now includes trust scoring infrastructure:
71
+
72
+ ```python
73
+ from xrtm.eval.core.epistemics import IntegrityGuardian, SourceTrustRegistry
74
+
75
+ registry = SourceTrustRegistry()
76
+ guardian = IntegrityGuardian(registry)
77
+ ```
78
+
79
+ ## Project Structure
80
+
81
+ ```
82
+ src/xrtm/eval/
83
+ ├── core/ # Interfaces & Schemas
84
+ │ ├── eval/ # Evaluator protocol, EvaluationResult
85
+ │ ├── epistemics.py # Trust primitives (SourceTrustRegistry)
86
+ │ └── schemas/ # ForecastResolution
87
+ ├── kit/ # Composable evaluator implementations
88
+ │ └── eval/metrics.py # BrierScoreEvaluator, ECE
89
+ └── providers/ # External evaluation services (future)
90
+ ```
91
+
57
92
  ## Development
58
93
 
59
94
  Prerequisites:
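Before the README.md and egg-info copies of the same diff below, here is a minimal sketch of the ECE workflow the README describes, assuming only the `EvaluationResult` constructor and the `compute_calibration_data` signature that appear later in this diff (tests/test_ece.py and src/xrtm/eval/kit/eval/metrics.py):

```python
# Minimal ECE sketch based on the usage shown in tests/test_ece.py in this
# release; field names mirror the EvaluationResult constructor used there.
from xrtm.eval.core.eval.definitions import EvaluationResult
from xrtm.eval.kit.eval.metrics import ExpectedCalibrationErrorEvaluator

results = [
    EvaluationResult(subject_id="q1", score=0, ground_truth=1, prediction=0.9, metadata={}),
    EvaluationResult(subject_id="q2", score=0, ground_truth=0, prediction=0.2, metadata={}),
]

evaluator = ExpectedCalibrationErrorEvaluator(num_bins=10)
ece, reliability_bins = evaluator.compute_calibration_data(results)
print(f"ECE = {ece:.3f} across {len(reliability_bins)} bins")
```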
@@ -2,15 +2,27 @@
2
2
 
3
3
  [![License](https://img.shields.io/badge/License-Apache%202.0-blue.svg)](https://opensource.org/licenses/Apache-2.0)
4
4
  [![Python](https://img.shields.io/badge/python-3.11+-blue.svg)](https://www.python.org/downloads/)
5
+ [![PyPI](https://img.shields.io/pypi/v/xrtm-eval.svg)](https://pypi.org/project/xrtm-eval/)
5
6
 
6
7
  **The Judge for XRTM.**
7
8
 
8
9
  `xrtm-eval` is the rigorous scoring engine used to grade probabilistic forecasts. It operates independently of the inference engine to ensure objective evaluation.
9
10
 
11
+ ## Part of the XRTM Ecosystem
12
+
13
+ ```
14
+ Layer 4: xrtm-train → (imports all)
15
+ Layer 3: xrtm-forecast → (imports eval, data)
16
+ Layer 2: xrtm-eval → (imports data) ← YOU ARE HERE
17
+ Layer 1: xrtm-data → (zero dependencies)
18
+ ```
19
+
20
+ `xrtm-eval` provides scoring metrics AND trust primitives used by the forecast engine.
21
+
10
22
  ## Installation
11
23
 
12
24
  ```bash
13
- uv pip install xrtm-eval
25
+ pip install xrtm-eval
14
26
  ```
15
27
 
16
28
  ## Core Primitives
@@ -33,6 +45,29 @@ score = evaluator.score(prediction=0.7, ground_truth=1)
33
45
  ### 2. Expected Calibration Error (ECE)
34
46
  Use the `ExpectedCalibrationErrorEvaluator` to measure the gap between confidence and accuracy across bin buckets.
35
47
 
48
+ ### 3. Epistemic Trust Primitives (v0.1.1+)
49
+ `xrtm-eval` now includes trust scoring infrastructure:
50
+
51
+ ```python
52
+ from xrtm.eval.core.epistemics import IntegrityGuardian, SourceTrustRegistry
53
+
54
+ registry = SourceTrustRegistry()
55
+ guardian = IntegrityGuardian(registry)
56
+ ```
57
+
58
+ ## Project Structure
59
+
60
+ ```
61
+ src/xrtm/eval/
62
+ ├── core/ # Interfaces & Schemas
63
+ │ ├── eval/ # Evaluator protocol, EvaluationResult
64
+ │ ├── epistemics.py # Trust primitives (SourceTrustRegistry)
65
+ │ └── schemas/ # ForecastResolution
66
+ ├── kit/ # Composable evaluator implementations
67
+ │ └── eval/metrics.py # BrierScoreEvaluator, ECE
68
+ └── providers/ # External evaluation services (future)
69
+ ```
70
+
36
71
  ## Development
37
72
 
38
73
  Prerequisites:
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "xrtm-eval"
7
- version = "0.1.1"
7
+ version = "0.2.0"
8
8
  description = "The Judge/Scoring engine for XRTM."
9
9
  readme = "README.md"
10
10
  requires-python = ">=3.11"
@@ -0,0 +1,42 @@
1
+ # coding=utf-8
2
+ # Copyright 2026 XRTM Team. All rights reserved.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ r"""
17
+ Core interfaces and domain-agnostic logic for xrtm-eval.
18
+
19
+ This module exports evaluator protocols, epistemics utilities, and
20
+ core schemas. MUST NOT import from kit/ or providers/.
21
+ """
22
+
23
+ from xrtm.eval.core.epistemics import (
24
+ IntegrityGuardian,
25
+ SourceTrustEntry,
26
+ SourceTrustRegistry,
27
+ )
28
+ from xrtm.eval.core.eval import EvaluationReport, EvaluationResult, Evaluator
29
+ from xrtm.eval.core.schemas import ForecastResolution
30
+
31
+ __all__ = [
32
+ # Evaluator protocol
33
+ "Evaluator",
34
+ "EvaluationResult",
35
+ "EvaluationReport",
36
+ # Epistemics
37
+ "IntegrityGuardian",
38
+ "SourceTrustRegistry",
39
+ "SourceTrustEntry",
40
+ # Schemas
41
+ "ForecastResolution",
42
+ ]
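The new `core/__init__.py` above flattens the package's public interface. A small sketch of what that enables, using only names listed in `__all__` plus the `IntegrityGuardian(registry)` construction shown in the README:

```python
# Sketch: importing the epistemics primitives and the resolution schema from
# the single core entry point added above.
from xrtm.eval.core import ForecastResolution, IntegrityGuardian, SourceTrustRegistry

registry = SourceTrustRegistry()
guardian = IntegrityGuardian(registry)
resolution = ForecastResolution(question_id="q1", outcome="yes")
# Evaluator, EvaluationResult, and EvaluationReport are exported from the same
# module for implementing and collecting custom evaluators.
```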
@@ -30,11 +30,9 @@ class BrierDecomposition(BaseModel):
30
30
 
31
31
 
32
32
  class Evaluator(Protocol):
33
- def score(self, prediction: Any, ground_truth: Any) -> float:
34
- ...
33
+ def score(self, prediction: Any, ground_truth: Any) -> float: ...
35
34
 
36
- def evaluate(self, prediction: Any, ground_truth: Any, subject_id: str) -> EvaluationResult:
37
- ...
35
+ def evaluate(self, prediction: Any, ground_truth: Any, subject_id: str) -> EvaluationResult: ...
38
36
 
39
37
 
40
38
  class EvaluationReport(BaseModel):
@@ -55,6 +53,7 @@ class EvaluationReport(BaseModel):
55
53
  def to_pandas(self) -> Any:
56
54
  try:
57
55
  import pandas as pd
56
+
58
57
  return pd.DataFrame([r.model_dump() for r in self.results])
59
58
  except ImportError:
60
59
  raise ImportError("Pandas is required for to_pandas(). Install it with `pip install pandas`.")
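Since `Evaluator` is a `typing.Protocol`, any class with matching `score` and `evaluate` methods satisfies it structurally, without inheritance. A minimal illustrative evaluator follows; the `EvaluationResult` fields are taken from the constructor used in tests/test_ece.py, and this is not the shipped `BrierScoreEvaluator`:

```python
# Illustrative evaluator conforming to the Evaluator protocol shown above:
# score() returns a Brier-style squared error, evaluate() wraps it in an
# EvaluationResult.
from typing import Any

from xrtm.eval.core.eval.definitions import EvaluationResult


class SquaredErrorEvaluator:
    def score(self, prediction: Any, ground_truth: Any) -> float:
        # Squared gap between a probability and a 0/1 outcome.
        return (float(prediction) - float(ground_truth)) ** 2

    def evaluate(self, prediction: Any, ground_truth: Any, subject_id: str) -> EvaluationResult:
        return EvaluationResult(
            subject_id=subject_id,
            score=self.score(prediction, ground_truth),
            ground_truth=ground_truth,
            prediction=prediction,
            metadata={"type": "squared_error"},
        )
```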
@@ -0,0 +1,24 @@
1
+ # coding=utf-8
2
+ # Copyright 2026 XRTM Team. All rights reserved.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ r"""
17
+ Core schemas for xrtm-eval.
18
+
19
+ This module exports evaluation-related Pydantic models.
20
+ """
21
+
22
+ from xrtm.eval.core.schemas.forecast import ForecastResolution
23
+
24
+ __all__ = ["ForecastResolution"]
@@ -0,0 +1,59 @@
1
+ # coding=utf-8
2
+ # Copyright 2026 XRTM Team. All rights reserved.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ r"""
17
+ Forecast resolution schema for evaluation.
18
+
19
+ This module defines the ground-truth outcome schema used to evaluate
20
+ forecast accuracy.
21
+
22
+ Example:
23
+ >>> from xrtm.eval.core.schemas import ForecastResolution
24
+ >>> resolution = ForecastResolution(
25
+ ... question_id="q1",
26
+ ... outcome="yes",
27
+ ... )
28
+ """
29
+
30
+ from datetime import datetime, timezone
31
+ from typing import Any, Dict
32
+
33
+ from pydantic import BaseModel, Field
34
+
35
+
36
+ class ForecastResolution(BaseModel):
37
+ r"""
38
+ The ground-truth outcome used to evaluate forecast accuracy.
39
+
40
+ Attributes:
41
+ question_id: Reference to the forecasted question.
42
+ outcome: The final winning outcome or value.
43
+ resolved_at: When the outcome was determined.
44
+ metadata: Source info, verification method, etc.
45
+
46
+ Example:
47
+ >>> resolution = ForecastResolution(question_id="q1", outcome="yes")
48
+ """
49
+
50
+ question_id: str = Field(..., description="Reference to the forecasted question")
51
+ outcome: str = Field(..., description="The final winning outcome or value")
52
+ resolved_at: datetime = Field(
53
+ default_factory=lambda: datetime.now(timezone.utc),
54
+ description="When the outcome was determined",
55
+ )
56
+ metadata: Dict[str, Any] = Field(default_factory=dict, description="Source info, verification method")
57
+
58
+
59
+ __all__ = ["ForecastResolution"]
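The evaluators in kit/eval/metrics.py normalize string ground truths in {"yes", "1", "true", "won", "pass"} (case-insensitively) to 1.0. A hedged helper sketch mapping a `ForecastResolution` outcome to that binary form; `resolution_to_binary` and `POSITIVE_OUTCOMES` are illustrative names, not part of the package:

```python
# Illustrative bridge from the resolution schema above to the binary ground
# truth the scoring metrics expect; the positive-outcome set mirrors the
# normalization in kit/eval/metrics.py.
from xrtm.eval.core.schemas import ForecastResolution

POSITIVE_OUTCOMES = {"yes", "1", "true", "won", "pass"}


def resolution_to_binary(resolution: ForecastResolution) -> float:
    return 1.0 if resolution.outcome.lower() in POSITIVE_OUTCOMES else 0.0


resolution = ForecastResolution(question_id="q1", outcome="yes")
assert resolution_to_binary(resolution) == 1.0
```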
@@ -8,10 +8,18 @@ from xrtm.eval.core.eval.definitions import EvaluationResult, Evaluator
8
8
 
9
9
  class BiasInterceptor(Evaluator):
10
10
  COGNITIVE_BIASES = [
11
- "Base-Rate Neglect", "Overconfidence", "Availability Heuristic",
12
- "Confirmation Bias", "Anchoring Bias", "Sunk Cost Fallacy",
13
- "Hindsight Bias", "Optimism Bias", "Pessimism Bias",
14
- "Status Quo Bias", "Framing Effect", "Recency Bias",
11
+ "Base-Rate Neglect",
12
+ "Overconfidence",
13
+ "Availability Heuristic",
14
+ "Confirmation Bias",
15
+ "Anchoring Bias",
16
+ "Sunk Cost Fallacy",
17
+ "Hindsight Bias",
18
+ "Optimism Bias",
19
+ "Pessimism Bias",
20
+ "Status Quo Bias",
21
+ "Framing Effect",
22
+ "Recency Bias",
15
23
  ]
16
24
 
17
25
  def __init__(self, model: Any):
@@ -46,4 +54,5 @@ class BiasInterceptor(Evaluator):
46
54
  metadata={"type": "bias_audit"},
47
55
  )
48
56
 
57
+
49
58
  __all__ = ["BiasInterceptor"]
@@ -12,6 +12,7 @@ from xrtm.eval.core.epistemics import IntegrityGuardian, SourceTrustRegistry
12
12
 
13
13
  logger = logging.getLogger(__name__)
14
14
 
15
+
15
16
  class EpistemicEvaluator:
16
17
  def __init__(self, registry: Optional[SourceTrustRegistry] = None):
17
18
  self.registry = registry or SourceTrustRegistry()
@@ -28,4 +29,5 @@ class EpistemicEvaluator:
28
29
  "integrity_level": "HIGH" if avg_trust > 0.8 else "MEDIUM" if avg_trust >= 0.5 else "LOW",
29
30
  }
30
31
 
32
+
31
33
  __all__ = ["EpistemicEvaluator"]
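For reference, the integrity thresholds in the hunk above, restated as a standalone function (illustrative only); note the HIGH cutoff is strictly greater than 0.8:

```python
# Standalone restatement of the integrity_level mapping used above.
def integrity_level(avg_trust: float) -> str:
    return "HIGH" if avg_trust > 0.8 else "MEDIUM" if avg_trust >= 0.5 else "LOW"


assert integrity_level(0.81) == "HIGH"
assert integrity_level(0.80) == "MEDIUM"  # boundary: 0.8 itself is MEDIUM
assert integrity_level(0.49) == "LOW"
```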
@@ -8,6 +8,7 @@ from xrtm.data.schemas.forecast import ForecastOutput
8
8
 
9
9
  logger = logging.getLogger(__name__)
10
10
 
11
+
11
12
  class InterventionEngine:
12
13
  @staticmethod
13
14
  def apply_intervention(output: ForecastOutput, node_id: str, new_probability: float) -> ForecastOutput:
@@ -29,12 +30,18 @@ class InterventionEngine:
29
30
  weight = data.get("weight", 1.0)
30
31
  target_node = next(n for n in new_output.logical_trace if n.node_id == target_id)
31
32
  old_target_prob = target_node.probability or 0.5
32
- normalized_delta = (current_node.probability - (dg.nodes[current_id].get("probability") or 0.5)) * weight
33
+ normalized_delta = (
34
+ current_node.probability - (dg.nodes[current_id].get("probability") or 0.5)
35
+ ) * weight
33
36
  target_node.probability = max(0.0, min(1.0, old_target_prob + normalized_delta))
34
37
  leaf_nodes = [n for n in dg.nodes() if dg.out_degree(n) == 0]
35
38
  if leaf_nodes:
36
- avg_leaf_prob = sum(next(n.probability for n in new_output.logical_trace if n.node_id == leaf_id) or 0.0 for leaf_id in leaf_nodes) / len(leaf_nodes)
39
+ avg_leaf_prob = sum(
40
+ next(n.probability for n in new_output.logical_trace if n.node_id == leaf_id) or 0.0
41
+ for leaf_id in leaf_nodes
42
+ ) / len(leaf_nodes)
37
43
  new_output.confidence = avg_leaf_prob
38
44
  return new_output
39
45
 
46
+
40
47
  __all__ = ["InterventionEngine"]
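The propagation rule reformatted above scales a node's probability change by the edge weight, adds it to the downstream node, and clamps to [0, 1]; the forecast's confidence then becomes the mean of the leaf-node probabilities. A standalone numeric illustration of the delta step (all values hypothetical):

```python
# Hypothetical numbers walking through the normalized_delta step above.
old_probability = 0.5         # node value before apply_intervention
new_probability = 0.9         # value injected by the intervention
edge_weight = 0.6
old_target_probability = 0.55

normalized_delta = (new_probability - old_probability) * edge_weight  # 0.24
new_target = max(0.0, min(1.0, old_target_probability + normalized_delta))
assert abs(new_target - 0.79) < 1e-9
```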
@@ -73,15 +73,22 @@ class ExpectedCalibrationErrorEvaluator(Evaluator):
73
73
 
74
74
  def compute_calibration_data(self, results: List[EvaluationResult]) -> Tuple[float, List[ReliabilityBin]]:
75
75
  bin_size = 1.0 / self.num_bins
76
- bins: List[List[EvaluationResult]] = [[] for _ in range(self.num_bins)]
76
+ bins: List[List[Tuple[float, float]]] = [[] for _ in range(self.num_bins)]
77
77
 
78
78
  for res in results:
79
79
  try:
80
- conf = min(max(float(res.prediction), 0.0), 1.0)
80
+ raw_conf = float(res.prediction)
81
+ conf = min(max(raw_conf, 0.0), 1.0)
81
82
  idx = int(conf / bin_size)
82
83
  if idx == self.num_bins:
83
84
  idx -= 1
84
- bins[idx].append(res)
85
+
86
+ gt = res.ground_truth
87
+ normalized_gt = (
88
+ 1.0 if (gt.lower() in ["yes", "1", "true", "won", "pass"] if isinstance(gt, str) else gt) else 0.0
89
+ )
90
+
91
+ bins[idx].append((raw_conf, normalized_gt))
85
92
  except (ValueError, TypeError):
86
93
  continue
87
94
 
@@ -94,17 +101,13 @@ class ExpectedCalibrationErrorEvaluator(Evaluator):
94
101
  bin_center = (i + 0.5) * bin_size
95
102
 
96
103
  if n_b > 0:
97
- mean_conf = sum(float(x.prediction) for x in bin_items) / n_b
98
- accuracies = []
99
- for x in bin_items:
100
- gt = x.ground_truth
101
- normalized_gt = 1.0 if (gt.lower() in ["yes", "1", "true", "won", "pass"] if isinstance(gt, str) else gt) else 0.0
102
- accuracies.append(normalized_gt)
103
-
104
- mean_acc = sum(accuracies) / n_b
104
+ mean_conf = sum(x[0] for x in bin_items) / n_b
105
+ mean_acc = sum(x[1] for x in bin_items) / n_b
105
106
  ece += (n_b / total_count) * abs(mean_acc - mean_conf)
106
107
  reliability_data.append(
107
- ReliabilityBin(bin_center=bin_center, mean_prediction=mean_conf, mean_ground_truth=mean_acc, count=n_b)
108
+ ReliabilityBin(
109
+ bin_center=bin_center, mean_prediction=mean_conf, mean_ground_truth=mean_acc, count=n_b
110
+ )
108
111
  )
109
112
  else:
110
113
  reliability_data.append(
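A behavior worth noting in the change above: predictions are clamped only to select a bin, while the raw values feed the bin means, which is why the out-of-bounds test later in this diff expects a mean prediction of 1.5. A standalone sketch of the same binning rule, independent of the package:

```python
# Standalone ECE sketch: clamp only to pick the bin, average raw confidences,
# and weight each bin's |accuracy - confidence| gap by its sample share.
from typing import List, Tuple


def expected_calibration_error(pairs: List[Tuple[float, float]], num_bins: int = 10) -> float:
    bin_size = 1.0 / num_bins
    bins: List[List[Tuple[float, float]]] = [[] for _ in range(num_bins)]
    for raw_conf, outcome in pairs:
        clamped = min(max(raw_conf, 0.0), 1.0)
        idx = min(int(clamped / bin_size), num_bins - 1)
        bins[idx].append((raw_conf, outcome))

    total = sum(len(b) for b in bins)
    if total == 0:
        return 0.0
    ece = 0.0
    for items in bins:
        if not items:
            continue
        mean_conf = sum(conf for conf, _ in items) / len(items)
        mean_acc = sum(outcome for _, outcome in items) / len(items)
        ece += (len(items) / total) * abs(mean_acc - mean_conf)
    return ece


# Reproduces the value asserted in test_ece_basic: two samples, each off by 0.1.
assert abs(expected_calibration_error([(0.9, 1.0), (0.1, 0.0)]) - 0.1) < 1e-6
```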
@@ -11,22 +11,34 @@ class FakeNewsItem(BaseModel):
11
11
  trust_score: float = 0.1
12
12
  intended_bias: str = Field(..., description="e.g. 'Bearish', 'Bullish'")
13
13
 
14
+
14
15
  class GullibilityReport(BaseModel):
15
16
  initial_confidence: float
16
17
  post_injection_confidence: float
17
18
  delta: float
18
19
  resilience_score: float
19
20
 
21
+
20
22
  class AdversarialInjector:
21
23
  def __init__(self, intensity: float = 0.5):
22
24
  self.intensity = intensity
25
+
23
26
  def generate_attack(self, subject: str, direction: str) -> FakeNewsItem:
24
27
  if direction.lower() == "bearish":
25
- return FakeNewsItem(headline=f"BREAKING: {subject} CEO Under Investigation", content="...", intended_bias="Bearish")
28
+ return FakeNewsItem(
29
+ headline=f"BREAKING: {subject} CEO Under Investigation", content="...", intended_bias="Bearish"
30
+ )
26
31
  return FakeNewsItem(headline=f"{subject} Secures Massive Contract", content="...", intended_bias="Bullish")
32
+
27
33
  def measure_resilience(self, initial_confidence: float, post_injection_confidence: float) -> GullibilityReport:
28
34
  delta = post_injection_confidence - initial_confidence
29
35
  score = max(0.0, 1.0 - abs(delta))
30
- return GullibilityReport(initial_confidence=initial_confidence, post_injection_confidence=post_injection_confidence, delta=delta, resilience_score=score)
36
+ return GullibilityReport(
37
+ initial_confidence=initial_confidence,
38
+ post_injection_confidence=post_injection_confidence,
39
+ delta=delta,
40
+ resilience_score=score,
41
+ )
42
+
31
43
 
32
44
  __all__ = ["FakeNewsItem", "GullibilityReport", "AdversarialInjector"]
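A short usage sketch of the reformatted resilience API, assuming only the signatures visible in this diff; the subject name and confidence values are made up:

```python
# A forecast that drifts from 0.70 to 0.55 after a fake bearish headline loses
# 0.15 of confidence, so resilience_score = max(0, 1 - 0.15) = 0.85.
from xrtm.eval.kit.eval.resilience import AdversarialInjector

injector = AdversarialInjector(intensity=0.5)
attack = injector.generate_attack(subject="ACME Corp", direction="bearish")
print(attack.headline, attack.intended_bias)

report = injector.measure_resilience(initial_confidence=0.70, post_injection_confidence=0.55)
assert abs(report.delta + 0.15) < 1e-9
assert abs(report.resilience_score - 0.85) < 1e-9
```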
@@ -9,12 +9,14 @@ import numpy as np
9
9
 
10
10
  logger = logging.getLogger(__name__)
11
11
 
12
+
12
13
  @dataclass
13
14
  class ReliabilityCurveData:
14
15
  prob_pred: np.ndarray
15
16
  prob_true: np.ndarray
16
17
  ece: float
17
18
 
19
+
18
20
  def compute_calibration_curve(y_true: List[int], y_prob: List[float], n_bins: int = 10) -> ReliabilityCurveData:
19
21
  y_true_arr = np.array(y_true)
20
22
  y_prob_arr = np.array(y_prob)
@@ -39,7 +41,10 @@ def compute_calibration_curve(y_true: List[int], y_prob: List[float], n_bins: in
39
41
  ece += (count / total_samples) * np.abs(fraction_true - mean_prob)
40
42
  return ReliabilityCurveData(prob_pred=np.array(bin_pred), prob_true=np.array(bin_true), ece=ece)
41
43
 
42
- def plot_reliability_diagram(data: ReliabilityCurveData, title: str = "Reliability Diagram", save_path: Optional[str] = None) -> Any:
44
+
45
+ def plot_reliability_diagram(
46
+ data: ReliabilityCurveData, title: str = "Reliability Diagram", save_path: Optional[str] = None
47
+ ) -> Any:
43
48
  try:
44
49
  import matplotlib.pyplot as plt
45
50
  import seaborn as sns
@@ -61,13 +66,17 @@ def plot_reliability_diagram(data: ReliabilityCurveData, title: str = "Reliabili
61
66
  plt.savefig(save_path)
62
67
  return fig
63
68
 
69
+
64
70
  class ReliabilityDiagram:
65
71
  def __init__(self, n_bins: int = 10):
66
72
  self.n_bins = n_bins
73
+
67
74
  def compute(self, y_true: List[int], y_prob: List[float]) -> ReliabilityCurveData:
68
75
  return compute_calibration_curve(y_true, y_prob, self.n_bins)
76
+
69
77
  def plot(self, y_true: List[int], y_prob: List[float], save_path: Optional[str] = None) -> Any:
70
78
  data = self.compute(y_true, y_prob)
71
79
  return plot_reliability_diagram(data, save_path=save_path)
72
80
 
81
+
73
82
  __all__ = ["ReliabilityCurveData", "compute_calibration_curve", "plot_reliability_diagram", "ReliabilityDiagram"]
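A usage sketch for the viz module, computing a reliability curve without plotting (the `plot` path requires matplotlib and seaborn); signatures and field names follow the diff above, while the sample data is made up:

```python
# Compute calibration data only; plotting is optional and import-guarded.
from xrtm.eval.kit.eval.viz import ReliabilityDiagram

y_true = [1, 0, 1, 0, 1, 1, 0, 0]
y_prob = [0.9, 0.2, 0.8, 0.4, 0.7, 0.6, 0.3, 0.1]

diagram = ReliabilityDiagram(n_bins=4)
curve = diagram.compute(y_true, y_prob)
print(curve.ece, curve.prob_pred, curve.prob_true)
```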
@@ -0,0 +1,24 @@
1
+ # coding=utf-8
2
+ # Copyright 2026 XRTM Team. All rights reserved.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ r"""
17
+ External providers for xrtm-eval.
18
+
19
+ This module provides adapters for external evaluation services.
20
+ Currently empty - will be populated with remote judges, LLM-as-judge
21
+ integrations, etc.
22
+ """
23
+
24
+ __all__: list[str] = []
@@ -0,0 +1,28 @@
1
+ # coding=utf-8
2
+ # Copyright 2026 XRTM Team. All rights reserved.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ r"""
17
+ Version information for xrtm-eval.
18
+
19
+ This module provides the single source of truth for the package version.
20
+ """
21
+
22
+ __all__ = ["__version__", "__author__", "__contact__", "__license__", "__copyright__"]
23
+
24
+ __version__ = "0.2.0"
25
+ __author__ = "XRTM Team"
26
+ __contact__ = "moy@xrtm.org"
27
+ __license__ = "Apache-2.0"
28
+ __copyright__ = "Copyright 2026 XRTM Team"
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: xrtm-eval
3
- Version: 0.1.1
3
+ Version: 0.2.0
4
4
  Summary: The Judge/Scoring engine for XRTM.
5
5
  Author-email: XRTM Team <moy@xrtm.org>
6
6
  License: Apache-2.0
@@ -23,15 +23,27 @@ Dynamic: license-file
23
23
 
24
24
  [![License](https://img.shields.io/badge/License-Apache%202.0-blue.svg)](https://opensource.org/licenses/Apache-2.0)
25
25
  [![Python](https://img.shields.io/badge/python-3.11+-blue.svg)](https://www.python.org/downloads/)
26
+ [![PyPI](https://img.shields.io/pypi/v/xrtm-eval.svg)](https://pypi.org/project/xrtm-eval/)
26
27
 
27
28
  **The Judge for XRTM.**
28
29
 
29
30
  `xrtm-eval` is the rigorous scoring engine used to grade probabilistic forecasts. It operates independently of the inference engine to ensure objective evaluation.
30
31
 
32
+ ## Part of the XRTM Ecosystem
33
+
34
+ ```
35
+ Layer 4: xrtm-train → (imports all)
36
+ Layer 3: xrtm-forecast → (imports eval, data)
37
+ Layer 2: xrtm-eval → (imports data) ← YOU ARE HERE
38
+ Layer 1: xrtm-data → (zero dependencies)
39
+ ```
40
+
41
+ `xrtm-eval` provides scoring metrics AND trust primitives used by the forecast engine.
42
+
31
43
  ## Installation
32
44
 
33
45
  ```bash
34
- uv pip install xrtm-eval
46
+ pip install xrtm-eval
35
47
  ```
36
48
 
37
49
  ## Core Primitives
@@ -54,6 +66,29 @@ score = evaluator.score(prediction=0.7, ground_truth=1)
54
66
  ### 2. Expected Calibration Error (ECE)
55
67
  Use the `ExpectedCalibrationErrorEvaluator` to measure the gap between confidence and accuracy across bin buckets.
56
68
 
69
+ ### 3. Epistemic Trust Primitives (v0.1.1+)
70
+ `xrtm-eval` now includes trust scoring infrastructure:
71
+
72
+ ```python
73
+ from xrtm.eval.core.epistemics import IntegrityGuardian, SourceTrustRegistry
74
+
75
+ registry = SourceTrustRegistry()
76
+ guardian = IntegrityGuardian(registry)
77
+ ```
78
+
79
+ ## Project Structure
80
+
81
+ ```
82
+ src/xrtm/eval/
83
+ ├── core/ # Interfaces & Schemas
84
+ │ ├── eval/ # Evaluator protocol, EvaluationResult
85
+ │ ├── epistemics.py # Trust primitives (SourceTrustRegistry)
86
+ │ └── schemas/ # ForecastResolution
87
+ ├── kit/ # Composable evaluator implementations
88
+ │ └── eval/metrics.py # BrierScoreEvaluator, ECE
89
+ └── providers/ # External evaluation services (future)
90
+ ```
91
+
57
92
  ## Development
58
93
 
59
94
  Prerequisites:
@@ -2,12 +2,15 @@ LICENSE
2
2
  README.md
3
3
  pyproject.toml
4
4
  src/xrtm/eval/__init__.py
5
+ src/xrtm/eval/version.py
5
6
  src/xrtm/eval/core/__init__.py
6
7
  src/xrtm/eval/core/epistemics.py
7
8
  src/xrtm/eval/core/eval/__init__.py
8
9
  src/xrtm/eval/core/eval/aggregation.py
9
10
  src/xrtm/eval/core/eval/bayesian.py
10
11
  src/xrtm/eval/core/eval/definitions.py
12
+ src/xrtm/eval/core/schemas/__init__.py
13
+ src/xrtm/eval/core/schemas/forecast.py
11
14
  src/xrtm/eval/kit/eval/__init__.py
12
15
  src/xrtm/eval/kit/eval/analytics.py
13
16
  src/xrtm/eval/kit/eval/bias.py
@@ -16,11 +19,11 @@ src/xrtm/eval/kit/eval/intervention.py
16
19
  src/xrtm/eval/kit/eval/metrics.py
17
20
  src/xrtm/eval/kit/eval/resilience.py
18
21
  src/xrtm/eval/kit/eval/viz.py
19
- src/xrtm/eval/schemas/__init__.py
20
- src/xrtm/eval/schemas/forecast.py
22
+ src/xrtm/eval/providers/__init__.py
21
23
  src/xrtm_eval.egg-info/PKG-INFO
22
24
  src/xrtm_eval.egg-info/SOURCES.txt
23
25
  src/xrtm_eval.egg-info/dependency_links.txt
24
26
  src/xrtm_eval.egg-info/requires.txt
25
27
  src/xrtm_eval.egg-info/top_level.txt
28
+ tests/test_ece.py
26
29
  tests/test_metrics.py
@@ -0,0 +1,68 @@
1
+ from xrtm.eval.core.eval.definitions import EvaluationResult
2
+ from xrtm.eval.kit.eval.metrics import ExpectedCalibrationErrorEvaluator
3
+
4
+
5
+ def test_ece_basic():
6
+ evaluator = ExpectedCalibrationErrorEvaluator(num_bins=10)
7
+ results = [
8
+ EvaluationResult(subject_id="1", score=0, ground_truth=1, prediction=0.9, metadata={}), # Bin 9
9
+ EvaluationResult(subject_id="2", score=0, ground_truth=0, prediction=0.1, metadata={}), # Bin 1
10
+ ]
11
+ ece, bins = evaluator.compute_calibration_data(results)
12
+ # Bin 9: 1 item, pred 0.9, gt 1. acc 1. mean_conf 0.9. abs(1 - 0.9) = 0.1
13
+ # Bin 1: 1 item, pred 0.1, gt 0. acc 0. mean_conf 0.1. abs(0 - 0.1) = 0.1
14
+ # ECE = (1/2)*0.1 + (1/2)*0.1 = 0.1
15
+ assert abs(ece - 0.1) < 1e-6
16
+
17
+
18
+ def test_ece_mixed_types():
19
+ evaluator = ExpectedCalibrationErrorEvaluator(num_bins=2)
20
+ results = [
21
+ EvaluationResult(subject_id="1", score=0, ground_truth="yes", prediction=0.8, metadata={}),
22
+ EvaluationResult(subject_id="2", score=0, ground_truth="no", prediction="0.2", metadata={}),
23
+ EvaluationResult(subject_id="3", score=0, ground_truth=True, prediction=0.9, metadata={}),
24
+ EvaluationResult(subject_id="4", score=0, ground_truth=False, prediction=0.1, metadata={}),
25
+ ]
26
+ # Bin 0 (0-0.5): Items 2 (0.2), 4 (0.1).
27
+ # Item 2: gt "no" -> 0.0. pred 0.2.
28
+ # Item 4: gt False -> 0.0. pred 0.1.
29
+ # Bin 0 mean_conf = (0.2 + 0.1)/2 = 0.15. mean_acc = 0.
30
+ # Bin 1 (0.5-1.0): Items 1 (0.8), 3 (0.9).
31
+ # Item 1: gt "yes" -> 1.0. pred 0.8.
32
+ # Item 3: gt True -> 1.0. pred 0.9.
33
+ # Bin 1 mean_conf = (0.8 + 0.9)/2 = 0.85. mean_acc = 1.0.
34
+
35
+ # ECE = (2/4)*abs(0 - 0.15) + (2/4)*abs(1 - 0.85) = 0.5 * 0.15 + 0.5 * 0.15 = 0.075 + 0.075 = 0.15
36
+ ece, bins = evaluator.compute_calibration_data(results)
37
+ assert abs(ece - 0.15) < 1e-6
38
+
39
+
40
+ def test_ece_out_of_bounds():
41
+ evaluator = ExpectedCalibrationErrorEvaluator(num_bins=10)
42
+ results = [
43
+ EvaluationResult(subject_id="1", score=0, ground_truth=1, prediction=1.5, metadata={}),
44
+ EvaluationResult(subject_id="2", score=0, ground_truth=0, prediction=-0.5, metadata={}),
45
+ ]
46
+ # Prediction 1.5 -> Clamped to 1.0 -> Bin 9 (last bin)
47
+ # Prediction -0.5 -> Clamped to 0.0 -> Bin 0
48
+
49
+ # Bin 9: 1 item. pred 1.5. gt 1. mean_conf 1.5. mean_acc 1. abs(1 - 1.5) = 0.5
50
+ # Bin 0: 1 item. pred -0.5. gt 0. mean_conf -0.5. mean_acc 0. abs(0 - -0.5) = 0.5
51
+
52
+ # ECE = 0.5 * 0.5 + 0.5 * 0.5 = 0.5
53
+
54
+ ece, bins = evaluator.compute_calibration_data(results)
55
+ assert abs(ece - 0.5) < 1e-6
56
+
57
+ # Check stored bins for correct values
58
+ # The last bin should have mean_prediction 1.5
59
+ assert abs(bins[9].mean_prediction - 1.5) < 1e-6
60
+ # The first bin should have mean_prediction -0.5
61
+ assert abs(bins[0].mean_prediction + 0.5) < 1e-6
62
+
63
+
64
+ if __name__ == "__main__":
65
+ test_ece_basic()
66
+ test_ece_mixed_types()
67
+ test_ece_out_of_bounds()
68
+ print("All tests passed!")
@@ -25,6 +25,7 @@ def test_brier_score_perfect_accurate():
25
25
  score = evaluator.score(prediction=0.0, ground_truth=0)
26
26
  assert score == 0.0
27
27
 
28
+
28
29
  def test_brier_score_worst_case():
29
30
  """Verify Brier score is 1.0 for completely wrong prediction."""
30
31
  evaluator = BrierScoreEvaluator()
@@ -34,12 +35,14 @@ def test_brier_score_worst_case():
34
35
  score = evaluator.score(prediction=0.0, ground_truth=1)
35
36
  assert score == 1.0
36
37
 
38
+
37
39
  def test_brier_score_uncertainty():
38
40
  """Verify Brier score for 0.5 prediction."""
39
41
  evaluator = BrierScoreEvaluator()
40
42
  score = evaluator.score(prediction=0.5, ground_truth=1)
41
43
  assert score == 0.25 # (0.5 - 1.0)^2 = 0.25
42
44
 
45
+
43
46
  def test_string_ground_truth_handling():
44
47
  """Verify string handling (Resolution logic)."""
45
48
  evaluator = BrierScoreEvaluator()
@@ -1,14 +0,0 @@
1
- # coding=utf-8
2
- # Copyright 2026 XRTM Team. All rights reserved.
3
-
4
- from .epistemics import IntegrityGuardian, SourceTrustEntry, SourceTrustRegistry
5
- from .eval import EvaluationReport, EvaluationResult, Evaluator
6
-
7
- __all__ = [
8
- "Evaluator",
9
- "EvaluationResult",
10
- "EvaluationReport",
11
- "IntegrityGuardian",
12
- "SourceTrustRegistry",
13
- "SourceTrustEntry",
14
- ]
@@ -1,3 +0,0 @@
1
- from .forecast import ForecastResolution
2
-
3
- __all__ = ["ForecastResolution"]
@@ -1,21 +0,0 @@
1
- # coding=utf-8
2
- # Copyright 2026 XRTM Team. All rights reserved.
3
-
4
- from datetime import datetime, timezone
5
- from typing import Any, Dict
6
-
7
- from pydantic import BaseModel, Field
8
-
9
-
10
- class ForecastResolution(BaseModel):
11
- r"""
12
- The ground-truth outcome used to evaluate forecast accuracy.
13
- """
14
-
15
- question_id: str
16
- outcome: str = Field(..., description="The final winning outcome or value")
17
- resolved_at: datetime = Field(default_factory=lambda: datetime.now(timezone.utc))
18
- metadata: Dict[str, Any] = Field(default_factory=dict, description="Source info, verification method")
19
-
20
-
21
- __all__ = ["ForecastResolution"]