ragverdict 0.2.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ragverdict/__init__.py +30 -0
- ragverdict/adapters/__init__.py +23 -0
- ragverdict/adapters/base.py +92 -0
- ragverdict/adapters/http.py +65 -0
- ragverdict/adapters/loader.py +53 -0
- ragverdict/cli.py +58 -0
- ragverdict/config.py +117 -0
- ragverdict/evaluators/__init__.py +27 -0
- ragverdict/evaluators/base.py +47 -0
- ragverdict/evaluators/citation_audit.py +207 -0
- ragverdict/evaluators/edge_cases.py +460 -0
- ragverdict/evaluators/rag_quality.py +318 -0
- ragverdict/evaluators/tool_coverage.py +130 -0
- ragverdict/judges/__init__.py +1 -0
- ragverdict/judges/llm_judge.py +324 -0
- ragverdict/py.typed +0 -0
- ragverdict/report.py +140 -0
- ragverdict/runner.py +107 -0
- ragverdict-0.2.1.dist-info/METADATA +303 -0
- ragverdict-0.2.1.dist-info/RECORD +23 -0
- ragverdict-0.2.1.dist-info/WHEEL +4 -0
- ragverdict-0.2.1.dist-info/entry_points.txt +2 -0
- ragverdict-0.2.1.dist-info/licenses/LICENSE +21 -0
ragverdict/__init__.py
ADDED
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
"""ragverdict — pytest for RAG agents."""
|
|
2
|
+
|
|
3
|
+
__version__ = "0.2.1"
|
|
4
|
+
|
|
5
|
+
from ragverdict.adapters.base import (
|
|
6
|
+
Citation,
|
|
7
|
+
ContextDoc,
|
|
8
|
+
Message,
|
|
9
|
+
RagAdapter,
|
|
10
|
+
RagResponse,
|
|
11
|
+
SourceDoc,
|
|
12
|
+
ToolCall,
|
|
13
|
+
ToolSpec,
|
|
14
|
+
)
|
|
15
|
+
from ragverdict.evaluators.base import Evaluator, TestResult, Verdict
|
|
16
|
+
|
|
17
|
+
__all__ = [
|
|
18
|
+
"Citation",
|
|
19
|
+
"ContextDoc",
|
|
20
|
+
"Evaluator",
|
|
21
|
+
"Message",
|
|
22
|
+
"RagAdapter",
|
|
23
|
+
"RagResponse",
|
|
24
|
+
"SourceDoc",
|
|
25
|
+
"TestResult",
|
|
26
|
+
"ToolCall",
|
|
27
|
+
"ToolSpec",
|
|
28
|
+
"Verdict",
|
|
29
|
+
"__version__",
|
|
30
|
+
]
|
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
"""Adapters connect ragverdict to user-defined RAG systems."""
|
|
2
|
+
|
|
3
|
+
from ragverdict.adapters.base import (
|
|
4
|
+
Citation,
|
|
5
|
+
ContextDoc,
|
|
6
|
+
Message,
|
|
7
|
+
RagAdapter,
|
|
8
|
+
RagResponse,
|
|
9
|
+
SourceDoc,
|
|
10
|
+
ToolCall,
|
|
11
|
+
ToolSpec,
|
|
12
|
+
)
|
|
13
|
+
|
|
14
|
+
__all__ = [
|
|
15
|
+
"Citation",
|
|
16
|
+
"ContextDoc",
|
|
17
|
+
"Message",
|
|
18
|
+
"RagAdapter",
|
|
19
|
+
"RagResponse",
|
|
20
|
+
"SourceDoc",
|
|
21
|
+
"ToolCall",
|
|
22
|
+
"ToolSpec",
|
|
23
|
+
]
|
|
@@ -0,0 +1,92 @@
|
|
|
1
|
+
"""Core protocol types every RAG adapter speaks."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from abc import ABC, abstractmethod
|
|
6
|
+
from collections.abc import Iterable
|
|
7
|
+
from dataclasses import dataclass, field
|
|
8
|
+
from typing import Any, Literal
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
@dataclass
class Message:
    """One conversation turn: who spoke and what was said."""

    role: Literal["user", "assistant", "system"]  # speaker of this turn
    content: str  # text of the turn
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
@dataclass
class Citation:
    """Pointer from a response back to the source document it relies on."""

    id: str  # identifier of this citation
    source_id: str  # identifier of the document being cited
    span: str = ""  # optional span string; empty when not provided
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
@dataclass
class ToolCall:
    """Record of one tool invocation the agent performed while answering."""

    name: str  # tool that was invoked
    args: dict[str, Any] = field(default_factory=dict)  # fresh dict per instance
    result: Any = None  # whatever the tool returned; None by default
    latency_ms: int = 0  # defaults to 0 when not reported
    error: str | None = None  # None by default; presumably set when the call failed
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
@dataclass
class ContextDoc:
    """One retrieved chunk that the agent had in view while answering."""

    source_id: str  # identifier of the document the chunk came from
    chunk: str  # the retrieved text
    score: float = 0.0  # defaults to 0.0 when the adapter reports no score
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
@dataclass
class ToolSpec:
    """Declaration of a tool an adapter exposes, used for tool_coverage testing."""

    name: str  # tool name
    description: str = ""  # optional free-text description
    trigger_prompt: str | None = None  # optional; presumably a prompt expected to invoke the tool
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
@dataclass
class SourceDoc:
    """One document of the adapter's source corpus, used by citation_audit."""

    source_id: str  # identifier of the document
    content: str  # document text
    title: str = ""  # optional title; empty when not provided
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
@dataclass
class RagResponse:
    """The value an adapter's `query` hands back to ragverdict."""

    text: str  # the answer text
    # Every collection below gets its own fresh container per instance.
    citations: list[Citation] = field(default_factory=list)
    tool_calls: list[ToolCall] = field(default_factory=list)
    retrieved_context: list[ContextDoc] = field(default_factory=list)
    raw: dict[str, Any] = field(default_factory=dict)  # unparsed payload, if the adapter keeps one
|
|
73
|
+
|
|
74
|
+
|
|
75
|
+
class RagAdapter(ABC):
    """Base class for adapters: override `query` to plug a RAG system into ragverdict."""

    @abstractmethod
    def query(self, prompt: str, *, conversation: list[Message] | None = None) -> RagResponse:
        """Answer `prompt`, optionally given the prior `conversation` turns."""

    def available_tools(self) -> list[ToolSpec]:
        """Return the tools this adapter exposes (used by tool_coverage); none by default."""
        no_tools: list[ToolSpec] = []
        return no_tools

    def corpus(self) -> Iterable[SourceDoc] | None:
        """Return the corpus citations resolve against, or None when unavailable (the default)."""
        return None
|
|
@@ -0,0 +1,65 @@
|
|
|
1
|
+
"""HTTP-endpoint adapter — posts {prompt, conversation} and parses the response."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from typing import Any
|
|
6
|
+
|
|
7
|
+
import httpx
|
|
8
|
+
|
|
9
|
+
from ragverdict.adapters.base import (
|
|
10
|
+
Citation,
|
|
11
|
+
ContextDoc,
|
|
12
|
+
Message,
|
|
13
|
+
RagAdapter,
|
|
14
|
+
RagResponse,
|
|
15
|
+
ToolCall,
|
|
16
|
+
)
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
class HttpAdapter(RagAdapter):
    """Generic adapter for a RAG agent exposed over HTTP JSON.

    Expects POST {endpoint} with:
        {"prompt": str, "conversation": [{"role": str, "content": str}, ...]}

    Returns:
        {
          "text": str,
          "citations": [{"id": str, "source_id": str, "span": str}, ...],
          "tool_calls": [{"name": str, "args": {}, "result": ..., "latency_ms": int, "error": null}, ...],
          "retrieved_context": [{"source_id": str, "chunk": str, "score": float}, ...]
        }

    Keys in the list items that the corresponding dataclass does not declare
    are ignored (the full payload is still available on ``RagResponse.raw``).
    Top-level keys that are absent or JSON ``null`` are treated as empty.
    """

    def __init__(
        self,
        *,
        endpoint: str,
        headers: dict[str, str] | None = None,
        timeout_s: float = 30.0,
    ) -> None:
        """Store the request settings.

        Args:
            endpoint: URL that receives the POST request.
            headers: Extra HTTP headers; defaults to none (matches HttpAdapterConfig).
            timeout_s: Request timeout in seconds; defaults to 30.0 (matches HttpAdapterConfig).
        """
        self.endpoint = endpoint
        self.headers = headers if headers is not None else {}
        self.timeout_s = timeout_s

    @staticmethod
    def _build(model: type[Any], items: Any) -> list[Any]:
        """Instantiate the dataclass ``model`` from each JSON object in ``items``.

        ``items`` may be ``None`` (treated as empty). Keys the dataclass does not
        declare are dropped so that servers may add fields without breaking the
        adapter; missing required keys still raise ``TypeError``.
        """
        from dataclasses import fields  # local import keeps this block self-contained

        allowed = {f.name for f in fields(model)}
        built: list[Any] = []
        for item in items or []:
            if not isinstance(item, dict):
                raise TypeError(
                    f"expected a JSON object for {model.__name__}, got {type(item).__name__}"
                )
            built.append(model(**{k: v for k, v in item.items() if k in allowed}))
        return built

    def query(
        self,
        prompt: str,
        *,
        conversation: list[Message] | None = None,
    ) -> RagResponse:
        """POST the prompt (and any conversation) to the endpoint and parse the reply.

        Raises:
            httpx.HTTPStatusError: on a non-2xx response (via ``raise_for_status``).
            ValueError: if the response body is not a JSON object.
            TypeError: if a list item is not an object or lacks a required key.
        """
        payload: dict[str, Any] = {
            "prompt": prompt,
            "conversation": [
                {"role": m.role, "content": m.content} for m in (conversation or [])
            ],
        }
        resp = httpx.post(
            self.endpoint,
            json=payload,
            headers=self.headers,
            timeout=self.timeout_s,
        )
        resp.raise_for_status()
        data: Any = resp.json()
        if not isinstance(data, dict):
            # Fail with a clear message instead of an AttributeError on `.get`.
            raise ValueError(
                f"expected a JSON object from {self.endpoint}, got {type(data).__name__}"
            )
        return RagResponse(
            # `or` also covers an explicit JSON null, which `.get(key, default)` would not.
            text=data.get("text") or "",
            citations=self._build(Citation, data.get("citations")),
            tool_calls=self._build(ToolCall, data.get("tool_calls")),
            retrieved_context=self._build(ContextDoc, data.get("retrieved_context")),
            raw=data,
        )
|
|
@@ -0,0 +1,53 @@
|
|
|
1
|
+
"""Instantiate the user's RagAdapter from config."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import importlib
|
|
6
|
+
import os
|
|
7
|
+
import sys
|
|
8
|
+
from typing import Any
|
|
9
|
+
|
|
10
|
+
from ragverdict.adapters.base import RagAdapter
|
|
11
|
+
from ragverdict.adapters.http import HttpAdapter
|
|
12
|
+
from ragverdict.config import AdapterConfig, HttpAdapterConfig, PythonAdapterConfig
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
class AdapterLoadError(Exception):
    """Signals that an adapter could not be loaded or instantiated."""
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def load_adapter(config: AdapterConfig) -> RagAdapter:
|
|
20
|
+
if isinstance(config, PythonAdapterConfig):
|
|
21
|
+
return _load_python(config.module, config.cls)
|
|
22
|
+
if isinstance(config, HttpAdapterConfig):
|
|
23
|
+
return HttpAdapter(
|
|
24
|
+
endpoint=config.endpoint,
|
|
25
|
+
headers=config.headers,
|
|
26
|
+
timeout_s=config.timeout_s,
|
|
27
|
+
)
|
|
28
|
+
raise AdapterLoadError(f"unknown adapter type: {type(config).__name__}")
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def _load_python(module_name: str, class_name: str) -> RagAdapter:
    """Import ``module_name``, call ``class_name`` with no arguments, and return the result.

    Side effect: the current working directory is prepended to ``sys.path``
    if it is not already on it.

    Raises:
        AdapterLoadError: if the module cannot be imported (for any reason,
            including errors raised by the module's own top-level code), the
            attribute is missing, construction fails, or the constructed
            object is not a ``RagAdapter``.
    """
    # User adapters live in their project tree, not in this package — make cwd
    # importable so `module: examples.demo_rag.adapter` resolves when the user
    # runs `ragverdict` from their project root.
    cwd = os.getcwd()
    if cwd not in sys.path:
        sys.path.insert(0, cwd)
    try:
        module = importlib.import_module(module_name)
    except Exception as exc:
        # Deliberately broader than ImportError: importing runs the user's
        # top-level code, which can fail with SyntaxError, NameError, etc.
        # All of those mean "adapter cannot be loaded" and should reach the
        # caller as AdapterLoadError rather than as a raw traceback.
        raise AdapterLoadError(f"cannot import module {module_name!r}: {exc}") from exc
    cls: Any = getattr(module, class_name, None)
    if cls is None:
        raise AdapterLoadError(f"module {module_name!r} has no attribute {class_name!r}")
    try:
        instance = cls()
    except Exception as exc:
        raise AdapterLoadError(f"failed to instantiate {module_name}.{class_name}: {exc}") from exc
    if not isinstance(instance, RagAdapter):
        raise AdapterLoadError(
            f"{module_name}.{class_name} is not a RagAdapter subclass"
        )
    return instance
|
ragverdict/cli.py
ADDED
|
@@ -0,0 +1,58 @@
|
|
|
1
|
+
"""Click CLI entry point: `ragverdict run <config.yaml>`."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import sys
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
|
|
8
|
+
import click
|
|
9
|
+
|
|
10
|
+
from ragverdict import __version__
|
|
11
|
+
from ragverdict.adapters.loader import AdapterLoadError
|
|
12
|
+
from ragverdict.config import ConfigError
|
|
13
|
+
from ragverdict.runner import Runner, RunnerError
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
@click.group(name="ragverdict")
@click.version_option(__version__, prog_name="ragverdict")
def cli() -> None:
    """ragverdict — pytest for RAG agents."""
    # Root command group: performs no work itself; subcommands attach via @cli.command().
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
@cli.command()
@click.argument("config_path", type=click.Path(exists=True, path_type=Path))
@click.option(
    "--out-dir",
    type=click.Path(path_type=Path),
    default=Path("./report"),
    show_default=True,
    help="Directory to write report.json and report.md.",
)
@click.option(
    "--no-judge",
    is_flag=True,
    default=False,
    help="Skip LLM-as-judge entirely. Hard assertions still run; WEAK verdicts and "
    "citation support scoring are disabled.",
)
def run(config_path: Path, out_dir: Path, no_judge: bool) -> None:
    """Run the evaluation defined in CONFIG_PATH."""
    # The judge is switched off entirely with --no-judge, otherwise left on "auto".
    judge_mode = None if no_judge else "auto"
    try:
        evaluation = Runner.from_config_path(
            config_path,
            out_dir=out_dir,
            judge=judge_mode,
        )
        _, exit_code = evaluation.execute()
    except (ConfigError, AdapterLoadError, RunnerError) as exc:
        # Known setup/run failures: one-line message on stderr, exit status 2.
        click.echo(f"error: {exc}", err=True)
        sys.exit(2)
    else:
        # Otherwise the process exit status is whatever the runner reported.
        sys.exit(exit_code)
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
def main() -> None:
    """Invoke the click command group."""
    cli()
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
if __name__ == "__main__":
|
|
58
|
+
main()
|
ragverdict/config.py
ADDED
|
@@ -0,0 +1,117 @@
|
|
|
1
|
+
"""Pydantic schemas + YAML loader for ragverdict configuration."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
from typing import Any, Literal
|
|
7
|
+
|
|
8
|
+
import yaml
|
|
9
|
+
from pydantic import BaseModel, ConfigDict, Field, ValidationError
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
class ConfigError(Exception):
    """Signals a config file that is missing, malformed, or fails validation."""
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
class PythonAdapterConfig(BaseModel):
    """Adapter config that points at a user-supplied Python class.

    The class name is written as ``class:`` in config files; because ``class``
    is a Python keyword, the field is named ``cls`` and aliased. With
    ``populate_by_name`` the model can also be built in code via
    ``PythonAdapterConfig(module=..., cls=...)``, which an alias-only field
    would reject. Unknown keys are still rejected.
    """

    model_config = ConfigDict(extra="forbid", populate_by_name=True)

    type: Literal["python"] = "python"
    module: str  # dotted module path to import
    cls: str = Field(alias="class")  # name of the attribute to instantiate in that module
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
class HttpAdapterConfig(BaseModel):
    """Adapter config for a RAG system reached over HTTP; unknown keys are rejected."""

    # NOTE(review): no field here declares an alias, so populate_by_name
    # currently has nothing to act on.
    model_config = ConfigDict(extra="forbid", populate_by_name=True)

    type: Literal["http"] = "http"
    endpoint: str  # URL of the endpoint
    headers: dict[str, str] = Field(default_factory=dict)  # extra request headers, none by default
    timeout_s: float = 30.0  # request timeout in seconds
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
AdapterConfig = PythonAdapterConfig | HttpAdapterConfig
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
class JudgeConfig(BaseModel):
    """Settings for the LLM judge; unknown keys are rejected."""

    model_config = ConfigDict(extra="forbid")

    provider: Literal["anthropic"] = "anthropic"  # the only accepted provider value
    model: str = "claude-sonnet-4-6"  # default judge model name
    max_concurrency: int = 4
    fixtures_path: Path | None = None  # optional; not set by default
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
class Thresholds(BaseModel):
    """Per-metric score cut-offs; unknown keys are rejected.

    Each metric has a ``*_pass`` and a lower ``*_weak`` default. How the two
    are applied is decided by the evaluators, which are not defined here.
    """

    model_config = ConfigDict(extra="forbid")

    # Faithfulness
    faithfulness_pass: float = 0.85
    faithfulness_weak: float = 0.7
    # Relevance
    relevance_pass: float = 0.85
    relevance_weak: float = 0.7
    # Citation support
    citation_support_pass: float = 0.95
    citation_support_weak: float = 0.8
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
class RagQualityCase(BaseModel):
    """Schema of a single query case with its expectations; unknown keys are rejected.

    NOTE(review): presumably consumed by the rag_quality evaluator — confirm there.
    """

    model_config = ConfigDict(extra="forbid")

    query: str  # the prompt for this case
    # Expectation switches; all are off / empty unless the case sets them.
    expects_citations: bool = False
    must_mention: list[str] = Field(default_factory=list)
    must_not_cite: bool = False
    must_refuse: bool = False
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
class ToolCoverageSpec(BaseModel):
    """Options for a tool-coverage test; unknown keys are rejected.

    NOTE(review): presumably consumed by the tool_coverage evaluator — confirm there.
    """

    model_config = ConfigDict(extra="forbid")

    require_all_tools: bool = True
    # String-to-string mapping, empty by default; presumably tool name -> prompt.
    trigger_prompts: dict[str, str] = Field(default_factory=dict)
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
class CitationAuditSpec(BaseModel):
    """Options for a citation-audit test; unknown keys are rejected.

    NOTE(review): presumably consumed by the citation_audit evaluator — confirm there.
    """

    model_config = ConfigDict(extra="forbid")

    sample_queries: list[str] = Field(default_factory=list)  # empty by default
    sample_size: int = 10
|
|
78
|
+
|
|
79
|
+
|
|
80
|
+
class TestSpec(BaseModel):
    """A single item of the top-level `tests:` list.

    Only `name` and `evaluator` are declared here. `evaluator` picks the
    evaluator to run; every other key is evaluator-specific, accepted as-is
    (``extra="allow"``) and validated by that evaluator at run time, so
    evaluators can change their case shapes without touching Config.
    """

    # The class name starts with "Test"; opt out of pytest collection.
    __test__ = False
    model_config = ConfigDict(extra="allow")

    name: str
    evaluator: str
|
|
93
|
+
|
|
94
|
+
|
|
95
|
+
class Config(BaseModel):
    """Root schema of a ragverdict config file; unknown top-level keys are rejected."""

    model_config = ConfigDict(extra="forbid")

    adapter: AdapterConfig  # required: python or http adapter settings
    # The remaining sections fall back to their defaults when omitted.
    judge: JudgeConfig = Field(default_factory=JudgeConfig)
    thresholds: Thresholds = Field(default_factory=Thresholds)
    tests: list[TestSpec] = Field(default_factory=list)
|
|
102
|
+
|
|
103
|
+
|
|
104
|
+
def load_config(path: Path) -> Config:
    """Read a YAML config file and validate it.

    Args:
        path: Location of the YAML file.

    Returns:
        The validated ``Config``.

    Raises:
        ConfigError: if the file is missing or unreadable, is not valid YAML,
            does not have a mapping at its root, or fails schema validation.
    """
    if not path.exists():
        raise ConfigError(f"config file not found: {path}")
    try:
        # Read bytes rather than text so the result does not depend on the
        # platform's default encoding; the YAML loader detects the encoding
        # itself and reports undecodable input as a YAMLError.
        payload = path.read_bytes()
    except OSError as exc:
        # e.g. the path is a directory, or permission is denied.
        raise ConfigError(f"cannot read config file {path}: {exc}") from exc
    try:
        raw: Any = yaml.safe_load(payload)
    except yaml.YAMLError as exc:
        raise ConfigError(f"invalid YAML in {path}: {exc}") from exc
    if not isinstance(raw, dict):
        raise ConfigError(f"config root must be a mapping, got {type(raw).__name__}")
    try:
        return Config.model_validate(raw)
    except ValidationError as exc:
        raise ConfigError(f"config validation failed:\n{exc}") from exc
|
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
"""Evaluators run behavioral tests against a RagAdapter."""
|
|
2
|
+
|
|
3
|
+
from ragverdict.evaluators.base import Evaluator, TestResult, Verdict
|
|
4
|
+
|
|
5
|
+
EVALUATORS: dict[str, type[Evaluator]] = {}
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
def register(cls: type[Evaluator]) -> type[Evaluator]:
    """Add an Evaluator subclass to EVALUATORS under its `name` class attribute.

    Returns the class unchanged, so it can be used as a class decorator. An
    existing entry with the same name is replaced.
    """
    registry_key = cls.name
    EVALUATORS[registry_key] = cls
    return cls
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def _autoload() -> None:
    """Import the bundled evaluator modules so they can register themselves."""
    # The imports live in a function so they can run after the registry
    # helpers in this module exist (presumably the evaluator modules import
    # them back from this package — which is what would otherwise be circular).
    from ragverdict.evaluators import citation_audit  # noqa: F401
    from ragverdict.evaluators import edge_cases  # noqa: F401
    from ragverdict.evaluators import rag_quality  # noqa: F401
    from ragverdict.evaluators import tool_coverage  # noqa: F401
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
_autoload()
|
|
26
|
+
|
|
27
|
+
__all__ = ["EVALUATORS", "Evaluator", "TestResult", "Verdict", "register"]
|
|
@@ -0,0 +1,47 @@
|
|
|
1
|
+
"""Evaluator ABC, verdict enum, and the TestResult dataclass."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from abc import ABC, abstractmethod
|
|
6
|
+
from dataclasses import dataclass, field
|
|
7
|
+
from enum import Enum
|
|
8
|
+
from typing import TYPE_CHECKING, Any, ClassVar
|
|
9
|
+
|
|
10
|
+
if TYPE_CHECKING:
|
|
11
|
+
from ragverdict.adapters.base import RagAdapter
|
|
12
|
+
from ragverdict.config import TestSpec, Thresholds
|
|
13
|
+
from ragverdict.judges.llm_judge import LLMJudge
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
class Verdict(str, Enum):
    """Outcome of a test; each member is also a plain string equal to its name."""

    PASS = "PASS"
    WEAK = "WEAK"
    FAIL = "FAIL"
    ERROR = "ERROR"
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
@dataclass
class TestResult:
    """Outcome of running one test through an evaluator."""

    # The class name starts with "Test", so pytest would try to collect it
    # whenever it is imported into a test module. Opt out, as TestSpec does.
    # Unannotated, so it is a plain class attribute and not a dataclass field.
    __test__ = False

    name: str  # name of the test
    evaluator: str  # name of the evaluator
    verdict: Verdict
    detail: str = ""  # free-text explanation, empty by default
    metrics: dict[str, float] = field(default_factory=dict)  # fresh dict per instance
    duration_ms: int = 0
    artifacts: dict[str, Any] = field(default_factory=dict)  # fresh dict per instance
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
class Evaluator(ABC):
    """Base class for evaluators: implement `run` and register with `@register`.

    `@register` lives in evaluators/__init__.py and keys the registry on the
    `name` class attribute, so concrete subclasses must set it.
    """

    name: ClassVar[str]

    @abstractmethod
    def run(
        self,
        adapter: RagAdapter,
        spec: TestSpec,
        *,
        judge: LLMJudge | None,
        thresholds: Thresholds,
    ) -> TestResult:
        """Run the test described by `spec` against `adapter` and return its result.

        `judge` may be None; `thresholds` carries the configured score cut-offs.
        """
|