rag-forge-evaluator 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (54) hide show
  1. rag_forge_evaluator-0.1.0/.gitignore +62 -0
  2. rag_forge_evaluator-0.1.0/PKG-INFO +66 -0
  3. rag_forge_evaluator-0.1.0/README.md +35 -0
  4. rag_forge_evaluator-0.1.0/pyproject.toml +42 -0
  5. rag_forge_evaluator-0.1.0/src/rag_forge_evaluator/__init__.py +3 -0
  6. rag_forge_evaluator-0.1.0/src/rag_forge_evaluator/assess.py +215 -0
  7. rag_forge_evaluator-0.1.0/src/rag_forge_evaluator/audit.py +153 -0
  8. rag_forge_evaluator-0.1.0/src/rag_forge_evaluator/cli.py +265 -0
  9. rag_forge_evaluator-0.1.0/src/rag_forge_evaluator/cost.py +124 -0
  10. rag_forge_evaluator-0.1.0/src/rag_forge_evaluator/engine.py +60 -0
  11. rag_forge_evaluator-0.1.0/src/rag_forge_evaluator/engines/__init__.py +22 -0
  12. rag_forge_evaluator-0.1.0/src/rag_forge_evaluator/engines/deepeval_evaluator.py +59 -0
  13. rag_forge_evaluator-0.1.0/src/rag_forge_evaluator/engines/ragas_evaluator.py +54 -0
  14. rag_forge_evaluator-0.1.0/src/rag_forge_evaluator/golden_set.py +180 -0
  15. rag_forge_evaluator-0.1.0/src/rag_forge_evaluator/history.py +57 -0
  16. rag_forge_evaluator-0.1.0/src/rag_forge_evaluator/input_loader.py +74 -0
  17. rag_forge_evaluator-0.1.0/src/rag_forge_evaluator/judge/__init__.py +6 -0
  18. rag_forge_evaluator-0.1.0/src/rag_forge_evaluator/judge/base.py +16 -0
  19. rag_forge_evaluator-0.1.0/src/rag_forge_evaluator/judge/claude_judge.py +36 -0
  20. rag_forge_evaluator-0.1.0/src/rag_forge_evaluator/judge/mock_judge.py +33 -0
  21. rag_forge_evaluator-0.1.0/src/rag_forge_evaluator/judge/openai_judge.py +37 -0
  22. rag_forge_evaluator-0.1.0/src/rag_forge_evaluator/maturity.py +116 -0
  23. rag_forge_evaluator-0.1.0/src/rag_forge_evaluator/metrics/__init__.py +4 -0
  24. rag_forge_evaluator-0.1.0/src/rag_forge_evaluator/metrics/answer_relevance.py +52 -0
  25. rag_forge_evaluator-0.1.0/src/rag_forge_evaluator/metrics/base.py +16 -0
  26. rag_forge_evaluator-0.1.0/src/rag_forge_evaluator/metrics/context_relevance.py +52 -0
  27. rag_forge_evaluator-0.1.0/src/rag_forge_evaluator/metrics/faithfulness.py +54 -0
  28. rag_forge_evaluator-0.1.0/src/rag_forge_evaluator/metrics/hallucination.py +58 -0
  29. rag_forge_evaluator-0.1.0/src/rag_forge_evaluator/metrics/llm_judge.py +123 -0
  30. rag_forge_evaluator-0.1.0/src/rag_forge_evaluator/report/__init__.py +1 -0
  31. rag_forge_evaluator-0.1.0/src/rag_forge_evaluator/report/generator.py +129 -0
  32. rag_forge_evaluator-0.1.0/src/rag_forge_evaluator/report/health.py +177 -0
  33. rag_forge_evaluator-0.1.0/src/rag_forge_evaluator/report/pdf.py +38 -0
  34. rag_forge_evaluator-0.1.0/src/rag_forge_evaluator/report/radar.py +74 -0
  35. rag_forge_evaluator-0.1.0/src/rag_forge_evaluator/report/templates/audit_report.html.j2 +179 -0
  36. rag_forge_evaluator-0.1.0/tests/conftest.py +1 -0
  37. rag_forge_evaluator-0.1.0/tests/test_assess.py +77 -0
  38. rag_forge_evaluator-0.1.0/tests/test_audit.py +49 -0
  39. rag_forge_evaluator-0.1.0/tests/test_audit_enhanced_integration.py +92 -0
  40. rag_forge_evaluator-0.1.0/tests/test_cost.py +60 -0
  41. rag_forge_evaluator-0.1.0/tests/test_cost_cli.py +27 -0
  42. rag_forge_evaluator-0.1.0/tests/test_enhanced_report.py +62 -0
  43. rag_forge_evaluator-0.1.0/tests/test_evaluator_factory.py +31 -0
  44. rag_forge_evaluator-0.1.0/tests/test_golden_set_full.py +138 -0
  45. rag_forge_evaluator-0.1.0/tests/test_health_report.py +89 -0
  46. rag_forge_evaluator-0.1.0/tests/test_history.py +88 -0
  47. rag_forge_evaluator-0.1.0/tests/test_input_loader.py +69 -0
  48. rag_forge_evaluator-0.1.0/tests/test_instrumented_audit.py +60 -0
  49. rag_forge_evaluator-0.1.0/tests/test_json_report.py +56 -0
  50. rag_forge_evaluator-0.1.0/tests/test_maturity.py +46 -0
  51. rag_forge_evaluator-0.1.0/tests/test_metrics.py +129 -0
  52. rag_forge_evaluator-0.1.0/tests/test_pdf_generator.py +28 -0
  53. rag_forge_evaluator-0.1.0/tests/test_radar_chart.py +46 -0
  54. rag_forge_evaluator-0.1.0/tests/test_report.py +53 -0
@@ -0,0 +1,62 @@
1
+ # Dependencies
2
+ node_modules/
3
+ .pnpm-store/
4
+
5
+ # Build outputs
6
+ dist/
7
+ build/
8
+ *.tsbuildinfo
9
+
10
+ # Turborepo
11
+ .turbo/
12
+
13
+ # Python
14
+ __pycache__/
15
+ *.py[cod]
16
+ *$py.class
17
+ *.egg-info/
18
+ *.egg
19
+ .venv/
20
+ .python-version-local
21
+
22
+ # Python tools
23
+ .mypy_cache/
24
+ .ruff_cache/
25
+ .pytest_cache/
26
+ htmlcov/
27
+ .coverage
28
+ .coverage.*
29
+
30
+ # Environment variables
31
+ .env
32
+ .env.local
33
+ .env.*.local
34
+
35
+ # IDE
36
+ .vscode/
37
+ .idea/
38
+ *.swp
39
+ *.swo
40
+ *~
41
+
42
+ # OS
43
+ .DS_Store
44
+ Thumbs.db
45
+ desktop.ini
46
+
47
+ # Test & coverage
48
+ coverage/
49
+ *.lcov
50
+
51
+ # Logs
52
+ *.log
53
+ npm-debug.log*
54
+ pnpm-debug.log*
55
+
56
+ .claude/
57
+
58
+ # Next.js
59
+ apps/*/.next
60
+ apps/*/out
61
+ apps/*/next-env.d.ts
62
+ .vercel
@@ -0,0 +1,66 @@
1
+ Metadata-Version: 2.4
2
+ Name: rag-forge-evaluator
3
+ Version: 0.1.0
4
+ Summary: Evaluation engine: RAGAS, DeepEval, LLM-as-Judge, and audit report generation
5
+ Project-URL: Homepage, https://github.com/hallengray/rag-forge
6
+ Project-URL: Repository, https://github.com/hallengray/rag-forge
7
+ Project-URL: Issues, https://github.com/hallengray/rag-forge/issues
8
+ Project-URL: Documentation, https://github.com/hallengray/rag-forge#readme
9
+ Author: Femi Adedayo
10
+ License-Expression: MIT
11
+ Keywords: deepeval,evaluation,llm-as-judge,rag,ragas,rmm
12
+ Classifier: Development Status :: 3 - Alpha
13
+ Classifier: Intended Audience :: Developers
14
+ Classifier: License :: OSI Approved :: MIT License
15
+ Classifier: Programming Language :: Python :: 3
16
+ Classifier: Programming Language :: Python :: 3.11
17
+ Classifier: Programming Language :: Python :: 3.12
18
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
19
+ Requires-Python: >=3.11
20
+ Requires-Dist: anthropic>=0.30
21
+ Requires-Dist: jinja2>=3.1
22
+ Requires-Dist: openai>=1.30
23
+ Requires-Dist: pydantic>=2.0
24
+ Provides-Extra: deepeval
25
+ Requires-Dist: deepeval>=1.0; extra == 'deepeval'
26
+ Provides-Extra: pdf
27
+ Requires-Dist: playwright>=1.40; extra == 'pdf'
28
+ Provides-Extra: ragas
29
+ Requires-Dist: ragas>=0.2; extra == 'ragas'
30
+ Description-Content-Type: text/markdown
31
+
32
+ # rag-forge-evaluator
33
+
34
+ RAG pipeline evaluation engine for the RAG-Forge toolkit: RAGAS, DeepEval, LLM-as-Judge, and the RAG Maturity Model.
35
+
36
+ ## Installation
37
+
38
+ ```bash
39
+ pip install rag-forge-evaluator
40
+ ```
41
+
42
+ ## Usage
43
+
44
+ ```python
45
+ from rag_forge_evaluator.assess import RMMAssessor
46
+
47
+ assessor = RMMAssessor()
48
+ result = assessor.assess(config={
49
+ "retrieval_strategy": "hybrid",
50
+ "input_guard_configured": True,
51
+ "output_guard_configured": True,
52
+ })
53
+ print(result.badge) # e.g., "RMM-3 Better Trust"
54
+ ```
55
+
56
+ ## Features
57
+
58
+ - RMM (RAG Maturity Model) scoring (levels 0-5)
59
+ - RAGAS, DeepEval, and LLM-as-Judge evaluators
60
+ - Golden set management with traffic sampling
61
+ - Cost estimation
62
+ - HTML and PDF report generation
63
+
64
+ ## License
65
+
66
+ MIT
@@ -0,0 +1,35 @@
1
+ # rag-forge-evaluator
2
+
3
+ RAG pipeline evaluation engine for the RAG-Forge toolkit: RAGAS, DeepEval, LLM-as-Judge, and the RAG Maturity Model.
4
+
5
+ ## Installation
6
+
7
+ ```bash
8
+ pip install rag-forge-evaluator
9
+ ```
10
+
11
+ ## Usage
12
+
13
+ ```python
14
+ from rag_forge_evaluator.assess import RMMAssessor
15
+
16
+ assessor = RMMAssessor()
17
+ result = assessor.assess(config={
18
+ "retrieval_strategy": "hybrid",
19
+ "input_guard_configured": True,
20
+ "output_guard_configured": True,
21
+ })
22
+ print(result.badge) # e.g., "RMM-3 Better Trust"
23
+ ```
24
+
25
+ ## Features
26
+
27
+ - RMM (RAG Maturity Model) scoring (levels 0-5)
28
+ - RAGAS, DeepEval, and LLM-as-Judge evaluators
29
+ - Golden set management with traffic sampling
30
+ - Cost estimation
31
+ - HTML and PDF report generation
32
+
33
+ ## License
34
+
35
+ MIT
@@ -0,0 +1,42 @@
1
+ [project]
2
+ name = "rag-forge-evaluator"
3
+ version = "0.1.0"
4
+ description = "Evaluation engine: RAGAS, DeepEval, LLM-as-Judge, and audit report generation"
5
+ requires-python = ">=3.11"
6
+ license = "MIT"
7
+ authors = [{ name = "Femi Adedayo" }]
8
+ keywords = ["rag", "evaluation", "ragas", "deepeval", "llm-as-judge", "rmm"]
9
+ classifiers = [
10
+ "Development Status :: 3 - Alpha",
11
+ "Intended Audience :: Developers",
12
+ "License :: OSI Approved :: MIT License",
13
+ "Programming Language :: Python :: 3",
14
+ "Programming Language :: Python :: 3.11",
15
+ "Programming Language :: Python :: 3.12",
16
+ "Topic :: Scientific/Engineering :: Artificial Intelligence",
17
+ ]
18
+ readme = "README.md"
19
+ dependencies = [
20
+ "pydantic>=2.0",
21
+ "jinja2>=3.1",
22
+ "anthropic>=0.30",
23
+ "openai>=1.30",
24
+ ]
25
+
26
+ [project.urls]
27
+ Homepage = "https://github.com/hallengray/rag-forge"
28
+ Repository = "https://github.com/hallengray/rag-forge"
29
+ Issues = "https://github.com/hallengray/rag-forge/issues"
30
+ Documentation = "https://github.com/hallengray/rag-forge#readme"
31
+
32
+ [project.optional-dependencies]
33
+ ragas = ["ragas>=0.2"]
34
+ deepeval = ["deepeval>=1.0"]
35
+ pdf = ["playwright>=1.40"]
36
+
37
+ [build-system]
38
+ requires = ["hatchling"]
39
+ build-backend = "hatchling.build"
40
+
41
+ [tool.hatch.build.targets.wheel]
42
+ packages = ["src/rag_forge_evaluator"]
@@ -0,0 +1,3 @@
1
+ """RAG-Forge Evaluator: Evaluation engine with RAGAS, DeepEval, and LLM-as-Judge."""
2
+
3
+ __version__ = "0.1.0"
@@ -0,0 +1,215 @@
1
+ """RMM assessment: score a pipeline against the RAG Maturity Model.
2
+
3
+ Inspects configuration and optional audit data to determine the current
4
+ RMM level (0-5) without running a full evaluation.
5
+ """
6
+
7
+ import json
8
+ from dataclasses import dataclass
9
+ from pathlib import Path
10
+ from typing import Any
11
+
12
+
13
@dataclass
class AssessmentCheck:
    """A single check within an RMM level."""

    description: str  # human-readable description shown in the criteria breakdown
    passed: bool  # whether this individual check succeeded
    source: str  # "config", "audit", or "unknown"
20
+
21
+
22
@dataclass
class AssessmentResult:
    """Result of an RMM assessment."""

    rmm_level: int  # highest level reached (0-5); levels are cumulative
    rmm_name: str  # display name for rmm_level (e.g. "Better Trust")
    criteria: list[dict[str, Any]]  # per-level breakdown: level, name, passed, checks
    badge: str  # display badge, e.g. "RMM-3 Better Trust"
30
+
31
+
32
# Display names for each RAG Maturity Model level, keyed by level number.
# Used both in the per-level criteria entries and in the final badge string.
_LEVEL_NAMES: dict[int, str] = {
    0: "Naive RAG",
    1: "Better Recall",
    2: "Better Precision",
    3: "Better Trust",
    4: "Better Workflow",
    5: "Enterprise",
}
40
+
41
+
42
+ class RMMAssessor:
43
+ """Assesses a pipeline's RMM level from config and optional audit data."""
44
+
45
+ def load_audit_metrics(self, report_path: str) -> dict[str, float]:
46
+ """Load metric scores from an audit JSON report."""
47
+ path = Path(report_path)
48
+ with path.open() as f:
49
+ data = json.load(f)
50
+ metrics: dict[str, float] = {}
51
+ for m in data.get("metrics", []):
52
+ try:
53
+ metrics[str(m["name"])] = float(m["score"])
54
+ except (KeyError, ValueError, TypeError):
55
+ continue
56
+ return metrics
57
+
58
+ def assess(
59
+ self,
60
+ config: dict[str, Any],
61
+ audit_metrics: dict[str, float] | None = None,
62
+ ) -> AssessmentResult:
63
+ """Determine RMM level from pipeline configuration and audit data."""
64
+ metrics = audit_metrics or {}
65
+ all_criteria: list[dict[str, Any]] = []
66
+ current_level = 0
67
+
68
+ def _to_dict(checks: list[AssessmentCheck]) -> list[dict[str, Any]]:
69
+ return [
70
+ {"description": c.description, "passed": c.passed, "source": c.source}
71
+ for c in checks
72
+ ]
73
+
74
+ # RMM-0: Naive RAG — always passes
75
+ checks_0 = [AssessmentCheck("Pipeline exists", True, "config")]
76
+ all_criteria.append(
77
+ {"level": 0, "name": "Naive RAG", "passed": True, "checks": _to_dict(checks_0)}
78
+ )
79
+
80
+ # RMM-1: Better Recall
81
+ hybrid = config.get("retrieval_strategy") == "hybrid"
82
+ sparse = bool(config.get("sparse_index_configured", False))
83
+ recall_ok = metrics.get("recall_at_k", 0.0) >= 0.70
84
+ checks_1 = [
85
+ AssessmentCheck("Hybrid search configured", hybrid, "config"),
86
+ AssessmentCheck("Sparse index configured", sparse, "config"),
87
+ AssessmentCheck(
88
+ "Recall@5 >= 70%",
89
+ recall_ok,
90
+ "audit" if "recall_at_k" in metrics else "unknown",
91
+ ),
92
+ ]
93
+ level1_passed = hybrid and sparse and recall_ok
94
+ if level1_passed:
95
+ current_level = 1
96
+ all_criteria.append(
97
+ {
98
+ "level": 1,
99
+ "name": "Better Recall",
100
+ "passed": level1_passed,
101
+ "checks": _to_dict(checks_1),
102
+ }
103
+ )
104
+
105
+ # RMM-2: Better Precision
106
+ reranker = bool(config.get("reranker_configured", False))
107
+ ndcg_ok = metrics.get("ndcg_improvement", 0.0) >= 0.10
108
+ checks_2 = [
109
+ AssessmentCheck("Reranker active", reranker, "config"),
110
+ AssessmentCheck(
111
+ "nDCG@10 improvement >= 10%",
112
+ ndcg_ok,
113
+ "audit" if "ndcg_improvement" in metrics else "unknown",
114
+ ),
115
+ ]
116
+ level2_passed = level1_passed and reranker and ndcg_ok
117
+ if level2_passed:
118
+ current_level = 2
119
+ all_criteria.append(
120
+ {
121
+ "level": 2,
122
+ "name": "Better Precision",
123
+ "passed": level2_passed,
124
+ "checks": _to_dict(checks_2),
125
+ }
126
+ )
127
+
128
+ # RMM-3: Better Trust
129
+ input_guard = bool(config.get("input_guard_configured", False))
130
+ output_guard = bool(config.get("output_guard_configured", False))
131
+ faith_ok = metrics.get("faithfulness", 0.0) >= 0.85
132
+ ctx_ok = metrics.get("context_relevance", 0.0) >= 0.80
133
+ checks_3 = [
134
+ AssessmentCheck("InputGuard active", input_guard, "config"),
135
+ AssessmentCheck("OutputGuard active", output_guard, "config"),
136
+ AssessmentCheck(
137
+ "Faithfulness >= 85%",
138
+ faith_ok,
139
+ "audit" if "faithfulness" in metrics else "unknown",
140
+ ),
141
+ AssessmentCheck(
142
+ "Context relevance >= 80%",
143
+ ctx_ok,
144
+ "audit" if "context_relevance" in metrics else "unknown",
145
+ ),
146
+ ]
147
+ level3_passed = (
148
+ current_level >= 2 and input_guard and output_guard and faith_ok and ctx_ok
149
+ )
150
+ if level3_passed:
151
+ current_level = 3
152
+ all_criteria.append(
153
+ {
154
+ "level": 3,
155
+ "name": "Better Trust",
156
+ "passed": level3_passed,
157
+ "checks": _to_dict(checks_3),
158
+ }
159
+ )
160
+
161
+ # RMM-4: Better Workflow
162
+ caching = bool(config.get("caching_configured", False))
163
+ latency_ok = metrics.get("latency_p95", 99999.0) <= 4000
164
+ cost_tracked = bool(config.get("cost_tracking_configured", False))
165
+ checks_4 = [
166
+ AssessmentCheck("Semantic caching active", caching, "config"),
167
+ AssessmentCheck(
168
+ "P95 latency < 4s",
169
+ latency_ok,
170
+ "audit" if "latency_p95" in metrics else "unknown",
171
+ ),
172
+ AssessmentCheck("Cost per query tracked", cost_tracked, "config"),
173
+ ]
174
+ level4_passed = current_level >= 3 and caching and latency_ok and cost_tracked
175
+ if level4_passed:
176
+ current_level = 4
177
+ all_criteria.append(
178
+ {
179
+ "level": 4,
180
+ "name": "Better Workflow",
181
+ "passed": level4_passed,
182
+ "checks": _to_dict(checks_4),
183
+ }
184
+ )
185
+
186
+ # RMM-5: Enterprise
187
+ drift = bool(config.get("drift_detection_configured", False))
188
+ ci_gates = bool(config.get("ci_cd_gates_configured", False))
189
+ adversarial_ok = bool(config.get("adversarial_tests_passing", False))
190
+ checks_5 = [
191
+ AssessmentCheck("Drift detection live", drift, "config"),
192
+ AssessmentCheck("CI/CD evaluation gates configured", ci_gates, "config"),
193
+ AssessmentCheck("Adversarial tests green", adversarial_ok, "config"),
194
+ ]
195
+ level5_passed = level4_passed and drift and ci_gates and adversarial_ok
196
+ if level5_passed:
197
+ current_level = 5
198
+ all_criteria.append(
199
+ {
200
+ "level": 5,
201
+ "name": "Enterprise",
202
+ "passed": level5_passed,
203
+ "checks": _to_dict(checks_5),
204
+ }
205
+ )
206
+
207
+ name = _LEVEL_NAMES.get(current_level, "Unknown")
208
+ badge = f"RMM-{current_level} {name}"
209
+
210
+ return AssessmentResult(
211
+ rmm_level=current_level,
212
+ rmm_name=name,
213
+ criteria=all_criteria,
214
+ badge=badge,
215
+ )
@@ -0,0 +1,153 @@
1
+ """Audit orchestrator: coordinates evaluation, history, and report generation."""
2
+
3
+ from contextlib import nullcontext
4
+ from dataclasses import dataclass
5
+ from datetime import UTC, datetime
6
+ from pathlib import Path
7
+ from typing import Any
8
+
9
+ from rag_forge_evaluator.engine import EvaluationResult
10
+ from rag_forge_evaluator.engines import create_evaluator
11
+ from rag_forge_evaluator.history import AuditHistory, AuditHistoryEntry
12
+ from rag_forge_evaluator.input_loader import InputLoader
13
+ from rag_forge_evaluator.judge.base import JudgeProvider
14
+ from rag_forge_evaluator.judge.mock_judge import MockJudge
15
+ from rag_forge_evaluator.maturity import RMMLevel, RMMScorer
16
+ from rag_forge_evaluator.report.generator import ReportGenerator
17
+
18
+
19
@dataclass
class AuditConfig:
    """Configuration for an audit run."""

    input_path: Path | None = None  # JSONL file of samples; takes precedence
    golden_set_path: Path | None = None  # golden-set file, used if input_path unset
    judge_model: str | None = None  # judge selector; None/"mock"/unknown -> MockJudge
    output_dir: Path = Path("./reports")  # destination for reports and audit history
    generate_pdf: bool = False  # also render the HTML report to PDF
    thresholds: dict[str, float] | None = None  # pass thresholds forwarded to the evaluator
    evaluator_engine: str = "llm-judge"  # engine name passed to create_evaluator()
    tracer: Any = None  # opentelemetry.trace.Tracer or None
31
+
32
+
33
@dataclass
class AuditReport:
    """Complete audit report with evaluation results and RMM scoring."""

    evaluation: EvaluationResult  # raw evaluator output (metrics, sample results)
    rmm_level: RMMLevel  # maturity level assessed from the metric scores
    report_path: Path  # path of the generated HTML report
    json_report_path: Path  # path of the generated JSON report
    samples_evaluated: int  # number of samples the evaluator processed
    pdf_report_path: Path | None = None  # set only when generate_pdf was requested
43
+
44
+
45
def _create_judge(model: str | None) -> JudgeProvider:
    """Resolve a judge-model name to a judge provider.

    ``None``, ``"mock"``, and any unrecognized name all yield a MockJudge;
    real providers are imported lazily so their SDKs are only required
    when actually selected.
    """
    if model in ("claude", "claude-sonnet"):
        # Deferred import: only pulled in when Claude is chosen.
        from rag_forge_evaluator.judge.claude_judge import ClaudeJudge
        return ClaudeJudge()
    if model in ("openai", "gpt-4o"):
        # Deferred import: only pulled in when OpenAI is chosen.
        from rag_forge_evaluator.judge.openai_judge import OpenAIJudge
        return OpenAIJudge()
    # None, "mock", or anything unrecognized falls back to the mock judge.
    return MockJudge()
56
+
57
+
58
class AuditOrchestrator:
    """Orchestrates the full audit pipeline.

    Sequence: load samples -> build evaluator -> evaluate -> score RMM ->
    compute trends vs. history -> generate HTML/JSON (and optional PDF)
    reports -> append this run to the audit history.
    """

    def __init__(self, config: AuditConfig) -> None:
        self.config = config
        self._tracer = config.tracer

    def _span(self, name: str) -> Any:
        """Return an active span context manager, or a no-op if no tracer is configured."""
        if self._tracer is not None:
            return self._tracer.start_as_current_span(name)
        # nullcontext() yields None, so callers must guard with
        # `if span is not None` before setting span attributes.
        return nullcontext()

    def run(self) -> AuditReport:
        """Execute the full audit pipeline.

        Returns:
            An AuditReport bundling the evaluation result, RMM level, and
            the paths of every generated report.

        Raises:
            ValueError: If neither ``input_path`` nor ``golden_set_path``
                is set on the config.
        """
        with self._span("rag-forge.audit"):
            # 1. Load input — a JSONL file takes precedence over a golden set.
            with self._span("rag-forge.load_input") as span:
                if self.config.input_path:
                    samples = InputLoader.load_jsonl(self.config.input_path)
                    source_type = "jsonl"
                elif self.config.golden_set_path:
                    samples = InputLoader.load_golden_set(self.config.golden_set_path)
                    source_type = "golden_set"
                else:
                    msg = "Either input_path or golden_set_path must be provided"
                    raise ValueError(msg)
                if span is not None:
                    span.set_attribute("sample_count", len(samples))
                    span.set_attribute("source_type", source_type)

            # 2. Create evaluator via factory (engine chosen by config).
            judge = _create_judge(self.config.judge_model)
            evaluator = create_evaluator(
                self.config.evaluator_engine,
                judge=judge,
                thresholds=self.config.thresholds,
            )

            # 3. Run evaluation over all loaded samples.
            with self._span("rag-forge.evaluate") as span:
                evaluation = evaluator.evaluate(samples)
                if span is not None:
                    span.set_attribute("engine", self.config.evaluator_engine)
                    span.set_attribute("sample_count", evaluation.samples_evaluated)

            # 4. Score against RMM using the aggregated metric scores.
            metric_map = {m.name: m.score for m in evaluation.metrics}
            with self._span("rag-forge.score_rmm") as span:
                rmm_level = RMMScorer().assess(metric_map)
                if span is not None:
                    span.set_attribute("rmm_level", int(rmm_level))

            # 5. Load history and compute trends against the previous run.
            history = AuditHistory(self.config.output_dir / "audit-history.json")
            previous = history.get_previous()
            trends = history.compute_trends(metric_map, previous)

            # 6. Generate HTML and JSON reports.
            generator = ReportGenerator(output_dir=self.config.output_dir)
            with self._span("rag-forge.generate_report") as span:
                report_path = generator.generate_html(
                    evaluation, rmm_level,
                    trends=trends,
                    sample_results=evaluation.sample_results,
                )
                json_report_path = generator.generate_json(
                    evaluation, rmm_level,
                    sample_results=evaluation.sample_results,
                )
                if span is not None:
                    span.set_attribute("report_path", str(report_path))

            # 7. Optionally render the HTML report to PDF (import deferred
            #    so the PDF dependency is only needed when requested).
            pdf_report_path: Path | None = None
            if self.config.generate_pdf:
                from rag_forge_evaluator.report.pdf import PDFGenerator
                pdf_report_path = PDFGenerator().generate(report_path)

            # 8. Append to history only after all reports succeed, so a
            #    failed run never pollutes the trend baseline.
            history.append(AuditHistoryEntry(
                timestamp=datetime.now(tz=UTC).strftime("%Y-%m-%dT%H:%M:%SZ"),
                metrics=metric_map,
                rmm_level=int(rmm_level),
                overall_score=evaluation.overall_score,
                passed=evaluation.passed,
            ))

            return AuditReport(
                evaluation=evaluation,
                rmm_level=rmm_level,
                report_path=report_path,
                json_report_path=json_report_path,
                samples_evaluated=evaluation.samples_evaluated,
                pdf_report_path=pdf_report_path,
            )