rag-forge-evaluator 0.1.0 (tar.gz)
This diff represents the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- rag_forge_evaluator-0.1.0/.gitignore +62 -0
- rag_forge_evaluator-0.1.0/PKG-INFO +66 -0
- rag_forge_evaluator-0.1.0/README.md +35 -0
- rag_forge_evaluator-0.1.0/pyproject.toml +42 -0
- rag_forge_evaluator-0.1.0/src/rag_forge_evaluator/__init__.py +3 -0
- rag_forge_evaluator-0.1.0/src/rag_forge_evaluator/assess.py +215 -0
- rag_forge_evaluator-0.1.0/src/rag_forge_evaluator/audit.py +153 -0
- rag_forge_evaluator-0.1.0/src/rag_forge_evaluator/cli.py +265 -0
- rag_forge_evaluator-0.1.0/src/rag_forge_evaluator/cost.py +124 -0
- rag_forge_evaluator-0.1.0/src/rag_forge_evaluator/engine.py +60 -0
- rag_forge_evaluator-0.1.0/src/rag_forge_evaluator/engines/__init__.py +22 -0
- rag_forge_evaluator-0.1.0/src/rag_forge_evaluator/engines/deepeval_evaluator.py +59 -0
- rag_forge_evaluator-0.1.0/src/rag_forge_evaluator/engines/ragas_evaluator.py +54 -0
- rag_forge_evaluator-0.1.0/src/rag_forge_evaluator/golden_set.py +180 -0
- rag_forge_evaluator-0.1.0/src/rag_forge_evaluator/history.py +57 -0
- rag_forge_evaluator-0.1.0/src/rag_forge_evaluator/input_loader.py +74 -0
- rag_forge_evaluator-0.1.0/src/rag_forge_evaluator/judge/__init__.py +6 -0
- rag_forge_evaluator-0.1.0/src/rag_forge_evaluator/judge/base.py +16 -0
- rag_forge_evaluator-0.1.0/src/rag_forge_evaluator/judge/claude_judge.py +36 -0
- rag_forge_evaluator-0.1.0/src/rag_forge_evaluator/judge/mock_judge.py +33 -0
- rag_forge_evaluator-0.1.0/src/rag_forge_evaluator/judge/openai_judge.py +37 -0
- rag_forge_evaluator-0.1.0/src/rag_forge_evaluator/maturity.py +116 -0
- rag_forge_evaluator-0.1.0/src/rag_forge_evaluator/metrics/__init__.py +4 -0
- rag_forge_evaluator-0.1.0/src/rag_forge_evaluator/metrics/answer_relevance.py +52 -0
- rag_forge_evaluator-0.1.0/src/rag_forge_evaluator/metrics/base.py +16 -0
- rag_forge_evaluator-0.1.0/src/rag_forge_evaluator/metrics/context_relevance.py +52 -0
- rag_forge_evaluator-0.1.0/src/rag_forge_evaluator/metrics/faithfulness.py +54 -0
- rag_forge_evaluator-0.1.0/src/rag_forge_evaluator/metrics/hallucination.py +58 -0
- rag_forge_evaluator-0.1.0/src/rag_forge_evaluator/metrics/llm_judge.py +123 -0
- rag_forge_evaluator-0.1.0/src/rag_forge_evaluator/report/__init__.py +1 -0
- rag_forge_evaluator-0.1.0/src/rag_forge_evaluator/report/generator.py +129 -0
- rag_forge_evaluator-0.1.0/src/rag_forge_evaluator/report/health.py +177 -0
- rag_forge_evaluator-0.1.0/src/rag_forge_evaluator/report/pdf.py +38 -0
- rag_forge_evaluator-0.1.0/src/rag_forge_evaluator/report/radar.py +74 -0
- rag_forge_evaluator-0.1.0/src/rag_forge_evaluator/report/templates/audit_report.html.j2 +179 -0
- rag_forge_evaluator-0.1.0/tests/conftest.py +1 -0
- rag_forge_evaluator-0.1.0/tests/test_assess.py +77 -0
- rag_forge_evaluator-0.1.0/tests/test_audit.py +49 -0
- rag_forge_evaluator-0.1.0/tests/test_audit_enhanced_integration.py +92 -0
- rag_forge_evaluator-0.1.0/tests/test_cost.py +60 -0
- rag_forge_evaluator-0.1.0/tests/test_cost_cli.py +27 -0
- rag_forge_evaluator-0.1.0/tests/test_enhanced_report.py +62 -0
- rag_forge_evaluator-0.1.0/tests/test_evaluator_factory.py +31 -0
- rag_forge_evaluator-0.1.0/tests/test_golden_set_full.py +138 -0
- rag_forge_evaluator-0.1.0/tests/test_health_report.py +89 -0
- rag_forge_evaluator-0.1.0/tests/test_history.py +88 -0
- rag_forge_evaluator-0.1.0/tests/test_input_loader.py +69 -0
- rag_forge_evaluator-0.1.0/tests/test_instrumented_audit.py +60 -0
- rag_forge_evaluator-0.1.0/tests/test_json_report.py +56 -0
- rag_forge_evaluator-0.1.0/tests/test_maturity.py +46 -0
- rag_forge_evaluator-0.1.0/tests/test_metrics.py +129 -0
- rag_forge_evaluator-0.1.0/tests/test_pdf_generator.py +28 -0
- rag_forge_evaluator-0.1.0/tests/test_radar_chart.py +46 -0
- rag_forge_evaluator-0.1.0/tests/test_report.py +53 -0

+++ rag_forge_evaluator-0.1.0/.gitignore
@@ -0,0 +1,62 @@
+# Dependencies
+node_modules/
+.pnpm-store/
+
+# Build outputs
+dist/
+build/
+*.tsbuildinfo
+
+# Turborepo
+.turbo/
+
+# Python
+__pycache__/
+*.py[cod]
+*$py.class
+*.egg-info/
+*.egg
+.venv/
+.python-version-local
+
+# Python tools
+.mypy_cache/
+.ruff_cache/
+.pytest_cache/
+htmlcov/
+.coverage
+.coverage.*
+
+# Environment variables
+.env
+.env.local
+.env.*.local
+
+# IDE
+.vscode/
+.idea/
+*.swp
+*.swo
+*~
+
+# OS
+.DS_Store
+Thumbs.db
+desktop.ini
+
+# Test & coverage
+coverage/
+*.lcov
+
+# Logs
+*.log
+npm-debug.log*
+pnpm-debug.log*
+
+.claude/
+
+# Next.js
+apps/*/.next
+apps/*/out
+apps/*/next-env.d.ts
+.vercel

+++ rag_forge_evaluator-0.1.0/PKG-INFO
@@ -0,0 +1,66 @@
+Metadata-Version: 2.4
+Name: rag-forge-evaluator
+Version: 0.1.0
+Summary: Evaluation engine: RAGAS, DeepEval, LLM-as-Judge, and audit report generation
+Project-URL: Homepage, https://github.com/hallengray/rag-forge
+Project-URL: Repository, https://github.com/hallengray/rag-forge
+Project-URL: Issues, https://github.com/hallengray/rag-forge/issues
+Project-URL: Documentation, https://github.com/hallengray/rag-forge#readme
+Author: Femi Adedayo
+License-Expression: MIT
+Keywords: deepeval,evaluation,llm-as-judge,rag,ragas,rmm
+Classifier: Development Status :: 3 - Alpha
+Classifier: Intended Audience :: Developers
+Classifier: License :: OSI Approved :: MIT License
+Classifier: Programming Language :: Python :: 3
+Classifier: Programming Language :: Python :: 3.11
+Classifier: Programming Language :: Python :: 3.12
+Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
+Requires-Python: >=3.11
+Requires-Dist: anthropic>=0.30
+Requires-Dist: jinja2>=3.1
+Requires-Dist: openai>=1.30
+Requires-Dist: pydantic>=2.0
+Provides-Extra: deepeval
+Requires-Dist: deepeval>=1.0; extra == 'deepeval'
+Provides-Extra: pdf
+Requires-Dist: playwright>=1.40; extra == 'pdf'
+Provides-Extra: ragas
+Requires-Dist: ragas>=0.2; extra == 'ragas'
+Description-Content-Type: text/markdown
+
+# rag-forge-evaluator
+
+RAG pipeline evaluation engine for the RAG-Forge toolkit: RAGAS, DeepEval, LLM-as-Judge, and the RAG Maturity Model.
+
+## Installation
+
+```bash
+pip install rag-forge-evaluator
+```
+
+## Usage
+
+```python
+from rag_forge_evaluator.assess import RMMAssessor
+
+assessor = RMMAssessor()
+result = assessor.assess(config={
+    "retrieval_strategy": "hybrid",
+    "input_guard_configured": True,
+    "output_guard_configured": True,
+})
+print(result.badge)  # e.g., "RMM-3 Better Trust"
+```
+
+## Features
+
+- RMM (RAG Maturity Model) scoring (levels 0-5)
+- RAGAS, DeepEval, and LLM-as-Judge evaluators
+- Golden set management with traffic sampling
+- Cost estimation
+- HTML and PDF report generation
+
+## License
+
+MIT

+++ rag_forge_evaluator-0.1.0/README.md
@@ -0,0 +1,35 @@
+# rag-forge-evaluator
+
+RAG pipeline evaluation engine for the RAG-Forge toolkit: RAGAS, DeepEval, LLM-as-Judge, and the RAG Maturity Model.
+
+## Installation
+
+```bash
+pip install rag-forge-evaluator
+```
+
+## Usage
+
+```python
+from rag_forge_evaluator.assess import RMMAssessor
+
+assessor = RMMAssessor()
+result = assessor.assess(config={
+    "retrieval_strategy": "hybrid",
+    "input_guard_configured": True,
+    "output_guard_configured": True,
+})
+print(result.badge)  # e.g., "RMM-3 Better Trust"
+```
+
+## Features
+
+- RMM (RAG Maturity Model) scoring (levels 0-5)
+- RAGAS, DeepEval, and LLM-as-Judge evaluators
+- Golden set management with traffic sampling
+- Cost estimation
+- HTML and PDF report generation
+
+## License
+
+MIT
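
The README example gates on configuration flags alone; the assessor can also verify the audit-sourced checks when fed metrics from a prior run. A minimal sketch, assuming a previous audit wrote `reports/audit-report.json` with the top-level `metrics` list that `load_audit_metrics` (shown in assess.py below) expects:

```python
from rag_forge_evaluator.assess import RMMAssessor

assessor = RMMAssessor()
# Hypothetical report path; any audit JSON with a "metrics" list works.
metrics = assessor.load_audit_metrics("reports/audit-report.json")
result = assessor.assess(
    config={"retrieval_strategy": "hybrid", "sparse_index_configured": True},
    audit_metrics=metrics,
)
for criterion in result.criteria:
    print(criterion["level"], criterion["name"], criterion["passed"])
```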

+++ rag_forge_evaluator-0.1.0/pyproject.toml
@@ -0,0 +1,42 @@
+[project]
+name = "rag-forge-evaluator"
+version = "0.1.0"
+description = "Evaluation engine: RAGAS, DeepEval, LLM-as-Judge, and audit report generation"
+requires-python = ">=3.11"
+license = "MIT"
+authors = [{ name = "Femi Adedayo" }]
+keywords = ["rag", "evaluation", "ragas", "deepeval", "llm-as-judge", "rmm"]
+classifiers = [
+    "Development Status :: 3 - Alpha",
+    "Intended Audience :: Developers",
+    "License :: OSI Approved :: MIT License",
+    "Programming Language :: Python :: 3",
+    "Programming Language :: Python :: 3.11",
+    "Programming Language :: Python :: 3.12",
+    "Topic :: Scientific/Engineering :: Artificial Intelligence",
+]
+readme = "README.md"
+dependencies = [
+    "pydantic>=2.0",
+    "jinja2>=3.1",
+    "anthropic>=0.30",
+    "openai>=1.30",
+]
+
+[project.urls]
+Homepage = "https://github.com/hallengray/rag-forge"
+Repository = "https://github.com/hallengray/rag-forge"
+Issues = "https://github.com/hallengray/rag-forge/issues"
+Documentation = "https://github.com/hallengray/rag-forge#readme"
+
+[project.optional-dependencies]
+ragas = ["ragas>=0.2"]
+deepeval = ["deepeval>=1.0"]
+pdf = ["playwright>=1.40"]
+
+[build-system]
+requires = ["hatchling"]
+build-backend = "hatchling.build"
+
+[tool.hatch.build.targets.wheel]
+packages = ["src/rag_forge_evaluator"]
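
Because the RAGAS, DeepEval, and PDF engines are optional extras, code that depends on them has to tolerate their absence. A hedged sketch of probing for the extras declared above; the extra-to-module mapping is an assumption that mirrors the names in `[project.optional-dependencies]`:

```python
import importlib.util

# Assumed mapping from extra name to its importable top-level module.
EXTRA_MODULES = {"ragas": "ragas", "deepeval": "deepeval", "pdf": "playwright"}

# find_spec returns None when the module is not installed.
available = {
    extra: importlib.util.find_spec(module) is not None
    for extra, module in EXTRA_MODULES.items()
}
print(available)  # e.g. {"ragas": True, "deepeval": False, "pdf": False}
```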

+++ rag_forge_evaluator-0.1.0/src/rag_forge_evaluator/assess.py
@@ -0,0 +1,215 @@
+"""RMM assessment: score a pipeline against the RAG Maturity Model.
+
+Inspects configuration and optional audit data to determine the current
+RMM level (0-5) without running a full evaluation.
+"""
+
+import json
+from dataclasses import dataclass
+from pathlib import Path
+from typing import Any
+
+
+@dataclass
+class AssessmentCheck:
+    """A single check within an RMM level."""
+
+    description: str
+    passed: bool
+    source: str  # "config", "audit", or "unknown"
+
+
+@dataclass
+class AssessmentResult:
+    """Result of an RMM assessment."""
+
+    rmm_level: int
+    rmm_name: str
+    criteria: list[dict[str, Any]]
+    badge: str
+
+
+_LEVEL_NAMES = {
+    0: "Naive RAG",
+    1: "Better Recall",
+    2: "Better Precision",
+    3: "Better Trust",
+    4: "Better Workflow",
+    5: "Enterprise",
+}
+
+
+class RMMAssessor:
+    """Assesses a pipeline's RMM level from config and optional audit data."""
+
+    def load_audit_metrics(self, report_path: str) -> dict[str, float]:
+        """Load metric scores from an audit JSON report."""
+        path = Path(report_path)
+        with path.open() as f:
+            data = json.load(f)
+        metrics: dict[str, float] = {}
+        for m in data.get("metrics", []):
+            try:
+                metrics[str(m["name"])] = float(m["score"])
+            except (KeyError, ValueError, TypeError):
+                continue
+        return metrics
+
+    def assess(
+        self,
+        config: dict[str, Any],
+        audit_metrics: dict[str, float] | None = None,
+    ) -> AssessmentResult:
+        """Determine RMM level from pipeline configuration and audit data."""
+        metrics = audit_metrics or {}
+        all_criteria: list[dict[str, Any]] = []
+        current_level = 0
+
+        def _to_dict(checks: list[AssessmentCheck]) -> list[dict[str, Any]]:
+            return [
+                {"description": c.description, "passed": c.passed, "source": c.source}
+                for c in checks
+            ]
+
+        # RMM-0: Naive RAG — always passes
+        checks_0 = [AssessmentCheck("Pipeline exists", True, "config")]
+        all_criteria.append(
+            {"level": 0, "name": "Naive RAG", "passed": True, "checks": _to_dict(checks_0)}
+        )
+
+        # RMM-1: Better Recall
+        hybrid = config.get("retrieval_strategy") == "hybrid"
+        sparse = bool(config.get("sparse_index_configured", False))
+        recall_ok = metrics.get("recall_at_k", 0.0) >= 0.70
+        checks_1 = [
+            AssessmentCheck("Hybrid search configured", hybrid, "config"),
+            AssessmentCheck("Sparse index configured", sparse, "config"),
+            AssessmentCheck(
+                "Recall@5 >= 70%",
+                recall_ok,
+                "audit" if "recall_at_k" in metrics else "unknown",
+            ),
+        ]
+        level1_passed = hybrid and sparse and recall_ok
+        if level1_passed:
+            current_level = 1
+        all_criteria.append(
+            {
+                "level": 1,
+                "name": "Better Recall",
+                "passed": level1_passed,
+                "checks": _to_dict(checks_1),
+            }
+        )
+
+        # RMM-2: Better Precision
+        reranker = bool(config.get("reranker_configured", False))
+        ndcg_ok = metrics.get("ndcg_improvement", 0.0) >= 0.10
+        checks_2 = [
+            AssessmentCheck("Reranker active", reranker, "config"),
+            AssessmentCheck(
+                "nDCG@10 improvement >= 10%",
+                ndcg_ok,
+                "audit" if "ndcg_improvement" in metrics else "unknown",
+            ),
+        ]
+        level2_passed = level1_passed and reranker and ndcg_ok
+        if level2_passed:
+            current_level = 2
+        all_criteria.append(
+            {
+                "level": 2,
+                "name": "Better Precision",
+                "passed": level2_passed,
+                "checks": _to_dict(checks_2),
+            }
+        )
+
+        # RMM-3: Better Trust
+        input_guard = bool(config.get("input_guard_configured", False))
+        output_guard = bool(config.get("output_guard_configured", False))
+        faith_ok = metrics.get("faithfulness", 0.0) >= 0.85
+        ctx_ok = metrics.get("context_relevance", 0.0) >= 0.80
+        checks_3 = [
+            AssessmentCheck("InputGuard active", input_guard, "config"),
+            AssessmentCheck("OutputGuard active", output_guard, "config"),
+            AssessmentCheck(
+                "Faithfulness >= 85%",
+                faith_ok,
+                "audit" if "faithfulness" in metrics else "unknown",
+            ),
+            AssessmentCheck(
+                "Context relevance >= 80%",
+                ctx_ok,
+                "audit" if "context_relevance" in metrics else "unknown",
+            ),
+        ]
+        level3_passed = (
+            current_level >= 2 and input_guard and output_guard and faith_ok and ctx_ok
+        )
+        if level3_passed:
+            current_level = 3
+        all_criteria.append(
+            {
+                "level": 3,
+                "name": "Better Trust",
+                "passed": level3_passed,
+                "checks": _to_dict(checks_3),
+            }
+        )
+
+        # RMM-4: Better Workflow
+        caching = bool(config.get("caching_configured", False))
+        latency_ok = metrics.get("latency_p95", 99999.0) <= 4000
+        cost_tracked = bool(config.get("cost_tracking_configured", False))
+        checks_4 = [
+            AssessmentCheck("Semantic caching active", caching, "config"),
+            AssessmentCheck(
+                "P95 latency < 4s",
+                latency_ok,
+                "audit" if "latency_p95" in metrics else "unknown",
+            ),
+            AssessmentCheck("Cost per query tracked", cost_tracked, "config"),
+        ]
+        level4_passed = current_level >= 3 and caching and latency_ok and cost_tracked
+        if level4_passed:
+            current_level = 4
+        all_criteria.append(
+            {
+                "level": 4,
+                "name": "Better Workflow",
+                "passed": level4_passed,
+                "checks": _to_dict(checks_4),
+            }
+        )
+
+        # RMM-5: Enterprise
+        drift = bool(config.get("drift_detection_configured", False))
+        ci_gates = bool(config.get("ci_cd_gates_configured", False))
+        adversarial_ok = bool(config.get("adversarial_tests_passing", False))
+        checks_5 = [
+            AssessmentCheck("Drift detection live", drift, "config"),
+            AssessmentCheck("CI/CD evaluation gates configured", ci_gates, "config"),
+            AssessmentCheck("Adversarial tests green", adversarial_ok, "config"),
+        ]
+        level5_passed = level4_passed and drift and ci_gates and adversarial_ok
+        if level5_passed:
+            current_level = 5
+        all_criteria.append(
+            {
+                "level": 5,
+                "name": "Enterprise",
+                "passed": level5_passed,
+                "checks": _to_dict(checks_5),
+            }
+        )
+
+        name = _LEVEL_NAMES.get(current_level, "Unknown")
+        badge = f"RMM-{current_level} {name}"
+
+        return AssessmentResult(
+            rmm_level=current_level,
+            rmm_name=name,
+            criteria=all_criteria,
+            badge=badge,
+        )
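
Note that the levels are strictly gated: `level2_passed` requires `level1_passed`, and each later level requires `current_level` to have reached the one below it, so strong RMM-3 signals cannot compensate for a missing RMM-1 foundation. A small sketch of that behavior, using only the module above:

```python
from rag_forge_evaluator.assess import RMMAssessor

# Guards configured and trust metrics passing, but no hybrid search or
# sparse index, so RMM-1 fails and the pipeline stays at level 0.
result = RMMAssessor().assess(
    config={"input_guard_configured": True, "output_guard_configured": True},
    audit_metrics={"faithfulness": 0.92, "context_relevance": 0.88},
)
print(result.badge)  # "RMM-0 Naive RAG"
```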

+++ rag_forge_evaluator-0.1.0/src/rag_forge_evaluator/audit.py
@@ -0,0 +1,153 @@
+"""Audit orchestrator: coordinates evaluation, history, and report generation."""
+
+from contextlib import nullcontext
+from dataclasses import dataclass
+from datetime import UTC, datetime
+from pathlib import Path
+from typing import Any
+
+from rag_forge_evaluator.engine import EvaluationResult
+from rag_forge_evaluator.engines import create_evaluator
+from rag_forge_evaluator.history import AuditHistory, AuditHistoryEntry
+from rag_forge_evaluator.input_loader import InputLoader
+from rag_forge_evaluator.judge.base import JudgeProvider
+from rag_forge_evaluator.judge.mock_judge import MockJudge
+from rag_forge_evaluator.maturity import RMMLevel, RMMScorer
+from rag_forge_evaluator.report.generator import ReportGenerator
+
+
+@dataclass
+class AuditConfig:
+    """Configuration for an audit run."""
+
+    input_path: Path | None = None
+    golden_set_path: Path | None = None
+    judge_model: str | None = None
+    output_dir: Path = Path("./reports")
+    generate_pdf: bool = False
+    thresholds: dict[str, float] | None = None
+    evaluator_engine: str = "llm-judge"
+    tracer: Any = None  # opentelemetry.trace.Tracer or None
+
+
+@dataclass
+class AuditReport:
+    """Complete audit report with evaluation results and RMM scoring."""
+
+    evaluation: EvaluationResult
+    rmm_level: RMMLevel
+    report_path: Path
+    json_report_path: Path
+    samples_evaluated: int
+    pdf_report_path: Path | None = None
+
+
+def _create_judge(model: str | None) -> JudgeProvider:
+    """Create a judge provider based on model name."""
+    if model == "mock" or model is None:
+        return MockJudge()
+    if model in ("claude", "claude-sonnet"):
+        from rag_forge_evaluator.judge.claude_judge import ClaudeJudge
+        return ClaudeJudge()
+    if model in ("openai", "gpt-4o"):
+        from rag_forge_evaluator.judge.openai_judge import OpenAIJudge
+        return OpenAIJudge()
+    return MockJudge()
+
+
+class AuditOrchestrator:
+    """Orchestrates the full audit pipeline."""
+
+    def __init__(self, config: AuditConfig) -> None:
+        self.config = config
+        self._tracer = config.tracer
+
+    def _span(self, name: str) -> Any:
+        """Return an active span context manager, or a no-op if no tracer is configured."""
+        if self._tracer is not None:
+            return self._tracer.start_as_current_span(name)
+        return nullcontext()
+
+    def run(self) -> AuditReport:
+        """Execute the full audit pipeline."""
+        with self._span("rag-forge.audit"):
+            # 1. Load input
+            with self._span("rag-forge.load_input") as span:
+                if self.config.input_path:
+                    samples = InputLoader.load_jsonl(self.config.input_path)
+                    source_type = "jsonl"
+                elif self.config.golden_set_path:
+                    samples = InputLoader.load_golden_set(self.config.golden_set_path)
+                    source_type = "golden_set"
+                else:
+                    msg = "Either input_path or golden_set_path must be provided"
+                    raise ValueError(msg)
+                if span is not None:
+                    span.set_attribute("sample_count", len(samples))
+                    span.set_attribute("source_type", source_type)
+
+            # 2. Create evaluator via factory
+            judge = _create_judge(self.config.judge_model)
+            evaluator = create_evaluator(
+                self.config.evaluator_engine,
+                judge=judge,
+                thresholds=self.config.thresholds,
+            )
+
+            # 3. Run evaluation
+            with self._span("rag-forge.evaluate") as span:
+                evaluation = evaluator.evaluate(samples)
+                if span is not None:
+                    span.set_attribute("engine", self.config.evaluator_engine)
+                    span.set_attribute("sample_count", evaluation.samples_evaluated)
+
+            # 4. Score against RMM
+            metric_map = {m.name: m.score for m in evaluation.metrics}
+            with self._span("rag-forge.score_rmm") as span:
+                rmm_level = RMMScorer().assess(metric_map)
+                if span is not None:
+                    span.set_attribute("rmm_level", int(rmm_level))
+
+            # 5. Load history and compute trends
+            history = AuditHistory(self.config.output_dir / "audit-history.json")
+            previous = history.get_previous()
+            trends = history.compute_trends(metric_map, previous)
+
+            # 6. Generate reports
+            generator = ReportGenerator(output_dir=self.config.output_dir)
+            with self._span("rag-forge.generate_report") as span:
+                report_path = generator.generate_html(
+                    evaluation, rmm_level,
+                    trends=trends,
+                    sample_results=evaluation.sample_results,
+                )
+                json_report_path = generator.generate_json(
+                    evaluation, rmm_level,
+                    sample_results=evaluation.sample_results,
+                )
+                if span is not None:
+                    span.set_attribute("report_path", str(report_path))
+
+            # 7. Generate PDF (optional)
+            pdf_report_path: Path | None = None
+            if self.config.generate_pdf:
+                from rag_forge_evaluator.report.pdf import PDFGenerator
+                pdf_report_path = PDFGenerator().generate(report_path)
+
+            # 8. Append to history (after all reports succeed)
+            history.append(AuditHistoryEntry(
+                timestamp=datetime.now(tz=UTC).strftime("%Y-%m-%dT%H:%M:%SZ"),
+                metrics=metric_map,
+                rmm_level=int(rmm_level),
+                overall_score=evaluation.overall_score,
+                passed=evaluation.passed,
+            ))
+
+            return AuditReport(
+                evaluation=evaluation,
+                rmm_level=rmm_level,
+                report_path=report_path,
+                json_report_path=json_report_path,
+                samples_evaluated=evaluation.samples_evaluated,
+                pdf_report_path=pdf_report_path,
+            )
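
The orchestrator runs fully offline because `_create_judge` falls back to `MockJudge` when `judge_model` is "mock", `None`, or unrecognized. A minimal end-to-end sketch, assuming an evaluation set `eval.jsonl` (a hypothetical file in whatever record format `InputLoader.load_jsonl` expects):

```python
from pathlib import Path

from rag_forge_evaluator.audit import AuditConfig, AuditOrchestrator

report = AuditOrchestrator(
    AuditConfig(
        input_path=Path("eval.jsonl"),  # hypothetical input file
        judge_model="mock",             # no API keys required
        output_dir=Path("./reports"),
    )
).run()
# HTML and JSON reports land in ./reports; history accrues in audit-history.json.
print(int(report.rmm_level), report.report_path, report.json_report_path)
```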