evalseed 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- evalseed/__init__.py +31 -0
- evalseed/cli.py +75 -0
- evalseed/corpus.py +101 -0
- evalseed/dataset.py +79 -0
- evalseed/exceptions.py +23 -0
- evalseed/filters/__init__.py +17 -0
- evalseed/filters/answerability.py +79 -0
- evalseed/filters/base.py +42 -0
- evalseed/filters/difficulty.py +99 -0
- evalseed/filters/faithfulness.py +71 -0
- evalseed/filters/prefilters.py +85 -0
- evalseed/filters/triviality.py +107 -0
- evalseed/generator.py +196 -0
- evalseed/judges.py +136 -0
- evalseed/pipeline.py +272 -0
- evalseed/py.typed +0 -0
- evalseed/schemas.py +52 -0
- evalseed-0.1.0.dist-info/METADATA +480 -0
- evalseed-0.1.0.dist-info/RECORD +22 -0
- evalseed-0.1.0.dist-info/WHEEL +4 -0
- evalseed-0.1.0.dist-info/entry_points.txt +2 -0
- evalseed-0.1.0.dist-info/licenses/LICENSE +21 -0
evalseed/__init__.py
ADDED
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
"""evalseed — quality-filtered synthetic Q&A datasets for RAG evaluation."""
|
|
2
|
+
|
|
3
|
+
from evalseed.dataset import Dataset
|
|
4
|
+
from evalseed.exceptions import (
|
|
5
|
+
EvalseedError,
|
|
6
|
+
FilterError,
|
|
7
|
+
GenerationError,
|
|
8
|
+
JudgeAuthError,
|
|
9
|
+
JudgeError,
|
|
10
|
+
)
|
|
11
|
+
from evalseed.judges import Judge, OpenAIJudge
|
|
12
|
+
from evalseed.pipeline import Pipeline
|
|
13
|
+
from evalseed.schemas import FilterResult, QAPair, QAType
|
|
14
|
+
|
|
15
|
+
__version__ = "0.1.0"
|
|
16
|
+
|
|
17
|
+
__all__ = [
|
|
18
|
+
"Dataset",
|
|
19
|
+
"EvalseedError",
|
|
20
|
+
"FilterError",
|
|
21
|
+
"FilterResult",
|
|
22
|
+
"GenerationError",
|
|
23
|
+
"Judge",
|
|
24
|
+
"JudgeAuthError",
|
|
25
|
+
"JudgeError",
|
|
26
|
+
"OpenAIJudge",
|
|
27
|
+
"Pipeline",
|
|
28
|
+
"QAPair",
|
|
29
|
+
"QAType",
|
|
30
|
+
"__version__",
|
|
31
|
+
]
|
evalseed/cli.py
ADDED
|
@@ -0,0 +1,75 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import argparse
|
|
4
|
+
import sys
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
|
|
7
|
+
from evalseed import JudgeAuthError, OpenAIJudge, Pipeline
|
|
8
|
+
from evalseed.schemas import QAType
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
def _parse_types(value: str) -> list[QAType]:
    """Parse a comma-separated QA-type list (argparse ``type=`` callable).

    Empty items (e.g. trailing commas) are skipped. Raises
    ``argparse.ArgumentTypeError`` — which argparse renders as a clean
    usage error — when an unknown type name is given or the list is empty.
    Previously an unknown name raised a bare ValueError from ``QAType``,
    producing argparse's generic "invalid _parse_types value" message that
    leaks the helper's name and hides the valid choices.
    """
    out: list[QAType] = []
    for raw in value.split(","):
        raw = raw.strip()
        if not raw:
            continue
        try:
            out.append(QAType(raw))
        except ValueError:
            valid = ", ".join(t.value for t in QAType)
            raise argparse.ArgumentTypeError(
                f"unknown QA type {raw!r}; valid types: {valid}"
            ) from None
    if not out:
        raise argparse.ArgumentTypeError("at least one QA type required")
    return out
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
def main(argv: list[str] | None = None) -> int:
    """CLI entry point: generate a dataset from a corpus and write JSONL.

    Exit codes: 0 on success, 1 on an expected pipeline failure (e.g. a
    missing corpus path or no supported files), 2 when the judge cannot
    authenticate.
    """
    # Local import: the module-level imports only pull from the public
    # `evalseed` namespace; keep that style for the exception base class.
    from evalseed.exceptions import EvalseedError

    parser = argparse.ArgumentParser(
        prog="evalseed",
        description="Generate quality-filtered synthetic Q&A datasets for RAG evaluation.",
    )
    parser.add_argument("corpus", help="Path to a corpus file or directory.")
    parser.add_argument(
        "-o",
        "--out",
        default=None,
        help="Output JSONL path. Defaults to <corpus>/output/eval.jsonl "
        "(or <corpus_parent>/output/eval.jsonl if corpus is a file).",
    )
    parser.add_argument("-n", "--n-pairs", type=int, default=50, help="Target pair count.")
    parser.add_argument(
        "--types",
        type=_parse_types,
        default=[QAType.SINGLE_HOP, QAType.MULTI_HOP],
        help="Comma-separated QA types: single_hop,multi_hop,distractor",
    )
    parser.add_argument("--model", default="gpt-4o-mini", help="OpenAI model id.")
    parser.add_argument("--seed", type=int, default=None)
    parser.add_argument("--all", action="store_true", help="Save passed AND rejected pairs.")

    args = parser.parse_args(argv)
    out_path = _resolve_out_path(args.corpus, args.out)
    try:
        judge = OpenAIJudge(model=args.model)
        pipeline = Pipeline(
            judge=judge,
            n_pairs=args.n_pairs,
            types=args.types,
            seed=args.seed,
        )
        dataset = pipeline.generate_from_corpus(args.corpus)
    except JudgeAuthError as exc:
        # Auth failures are unrecoverable; surface them with a distinct code.
        print(f"error: {exc}", file=sys.stderr)
        return 2
    except EvalseedError as exc:
        # Bug fix: expected failures such as GenerationError from a missing
        # corpus previously escaped as raw tracebacks from the CLI.
        print(f"error: {exc}", file=sys.stderr)
        return 1
    dataset.save(out_path, only_passed=not args.all)
    print(f"wrote {out_path}")
    return 0
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
def _resolve_out_path(corpus: str, out: str | None) -> Path:
|
|
67
|
+
if out is not None:
|
|
68
|
+
return Path(out)
|
|
69
|
+
corpus_path = Path(corpus)
|
|
70
|
+
base = corpus_path if corpus_path.is_dir() else corpus_path.parent
|
|
71
|
+
return base / "output" / "eval.jsonl"
|
|
72
|
+
|
|
73
|
+
|
|
74
|
+
# Allow direct script execution; the CLI's integer return value becomes
# the process exit status via sys.exit.
if __name__ == "__main__":
    sys.exit(main())
|
evalseed/corpus.py
ADDED
|
@@ -0,0 +1,101 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import re
|
|
4
|
+
from dataclasses import dataclass
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
|
|
7
|
+
from evalseed.exceptions import GenerationError
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
@dataclass(frozen=True)
class Chunk:
    """A contiguous slice of a source document."""

    # Raw chunk text: one or more paragraphs joined by blank lines,
    # possibly starting with an overlap tail carried over from the
    # previous chunk (see chunk_text).
    text: str
    # Identifier of the originating document — whatever the caller passes
    # to chunk_text; presumably a file name or corpus-relative path as
    # produced by load_corpus (confirm against the pipeline caller).
    source: str
    # Zero-based position of this chunk within its source document.
    chunk_index: int
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
_SUPPORTED_SUFFIXES = {".txt", ".md", ".markdown", ".rst"}
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def load_corpus(path: str | Path) -> list[tuple[str, str]]:
    """Load (source_name, text) pairs from a file or directory.

    Supports plain-text formats only in v0.1 (.txt, .md, .markdown, .rst).
    PDF/HTML loaders are intentionally deferred — users can preprocess.

    Raises GenerationError when the path does not exist or when a
    directory contains no supported files.
    """
    root = Path(path)
    if not root.exists():
        raise GenerationError(f"corpus path does not exist: {root}")

    if root.is_file():
        # An explicitly named file is loaded regardless of its suffix.
        return [(root.name, _read_text(root))]

    docs = [
        (str(candidate.relative_to(root)), _read_text(candidate))
        for candidate in sorted(root.rglob("*"))
        if candidate.is_file() and candidate.suffix.lower() in _SUPPORTED_SUFFIXES
    ]
    if not docs:
        raise GenerationError(
            f"no supported files (.txt/.md/.rst) found under {root}"
        )
    return docs
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
def _read_text(path: Path) -> str:
|
|
47
|
+
try:
|
|
48
|
+
return path.read_text(encoding="utf-8")
|
|
49
|
+
except UnicodeDecodeError:
|
|
50
|
+
return path.read_text(encoding="utf-8", errors="replace")
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
def chunk_text(
    text: str,
    source: str,
    target_chars: int = 1500,
    overlap_chars: int = 150,
) -> list[Chunk]:
    """Paragraph-aware character-window chunker.

    Paragraphs (blank-line-separated) are packed greedily into windows of
    roughly ``target_chars`` characters; consecutive windows share an
    ``overlap_chars``-character tail. Good enough for v0.1 — smarter
    chunking is a v0.2 concern.

    Raises ValueError for a non-positive target or an overlap outside
    ``[0, target_chars)``.
    """
    if target_chars <= 0:
        raise ValueError("target_chars must be positive")
    if overlap_chars < 0 or overlap_chars >= target_chars:
        raise ValueError("overlap_chars must be in [0, target_chars)")

    paras = [part.strip() for part in re.split(r"\n\s*\n", text) if part.strip()]
    if not paras:
        return []

    out: list[Chunk] = []
    window: list[str] = []
    window_len = 0

    def emit() -> None:
        """Flush the current window into a Chunk and seed the overlap tail."""
        nonlocal window, window_len
        if not window:
            return
        body = "\n\n".join(window)
        out.append(Chunk(text=body, source=source, chunk_index=len(out)))
        if overlap_chars and len(body) > overlap_chars:
            carried = body[-overlap_chars:]
            window, window_len = [carried], len(carried)
        else:
            window, window_len = [], 0

    for para in paras:
        # +2 accounts for the "\n\n" joiner that will separate paragraphs.
        if window and window_len + len(para) + 2 > target_chars:
            emit()
        window.append(para)
        window_len += len(para) + 2

    emit()
    return out
|
evalseed/dataset.py
ADDED
|
@@ -0,0 +1,79 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import json
|
|
4
|
+
from collections import Counter
|
|
5
|
+
from collections.abc import Iterator
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
from typing import overload
|
|
8
|
+
|
|
9
|
+
from evalseed.schemas import QAPair
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
class Dataset:
    """A collection of QA pairs with serialization and stats helpers.

    Iterating a Dataset yields every pair, passed and rejected alike; use
    the ``passed`` / ``rejected`` views to narrow, and ``save`` to persist
    (passed-only by default).
    """

    def __init__(self, pairs: list[QAPair]) -> None:
        # Defensive copy: later mutation of the caller's list is invisible.
        self._pairs: list[QAPair] = list(pairs)

    def __len__(self) -> int:
        return len(self._pairs)

    def __iter__(self) -> Iterator[QAPair]:
        return iter(self._pairs)

    @overload
    def __getitem__(self, key: int) -> QAPair: ...
    @overload
    def __getitem__(self, key: slice) -> Dataset: ...
    def __getitem__(self, key: int | slice) -> QAPair | Dataset:
        # A slice yields a new Dataset; an index yields the bare pair.
        picked = self._pairs[key]
        return Dataset(picked) if isinstance(picked, list) else picked

    @property
    def passed(self) -> Dataset:
        """Pairs that cleared every filter."""
        keep = [pair for pair in self._pairs if pair.passed]
        return Dataset(keep)

    @property
    def rejected(self) -> Dataset:
        """Pairs rejected by at least one filter."""
        drop = [pair for pair in self._pairs if not pair.passed]
        return Dataset(drop)

    def stats(self) -> dict[str, object]:
        """Summary counts: totals, pass rate, per-filter rejection tally."""
        total = len(self._pairs)
        n_passed = sum(1 for pair in self._pairs if pair.passed)
        by_filter: Counter[str] = Counter(
            res.filter_name
            for pair in self._pairs
            for res in pair.filter_results
            if not res.passed
        )
        return {
            "total": total,
            "passed": n_passed,
            "rejected": total - n_passed,
            "pass_rate": n_passed / total if total else 0.0,
            "rejections_by_filter": dict(by_filter),
        }

    def save(self, path: str | Path, only_passed: bool = True) -> None:
        """Write pairs to a JSONL file. By default writes only passed pairs."""
        target = Path(path)
        target.parent.mkdir(parents=True, exist_ok=True)
        selected = self.passed if only_passed else self
        with target.open("w", encoding="utf-8") as fh:
            fh.writelines(pair.model_dump_json() + "\n" for pair in selected)

    @classmethod
    def load(cls, path: str | Path) -> Dataset:
        """Read a Dataset back from a JSONL file written by ``save``."""
        with Path(path).open(encoding="utf-8") as fh:
            records = [line.strip() for line in fh]
        return cls(
            [QAPair.model_validate(json.loads(rec)) for rec in records if rec]
        )
|
evalseed/exceptions.py
ADDED
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
class EvalseedError(Exception):
    """Base class for all evalseed errors.

    Catch this to handle any failure raised by the package.
    """
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
class GenerationError(EvalseedError):
    """Raised when QA pair generation fails.

    Also raised by corpus loading for a missing path or an empty corpus
    (see corpus.load_corpus).
    """
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
class FilterError(EvalseedError):
    """Raised when a filter cannot evaluate a pair.

    NOTE(review): not raised by any code visible in this package slice —
    presumably reserved for filter implementations; confirm before removal.
    """
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
class JudgeError(EvalseedError):
    """Raised when the underlying judge LLM call fails.

    Filters catch this and convert it into a failing FilterResult rather
    than aborting the whole run.
    """
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
class JudgeAuthError(JudgeError):
    """Raised when the judge cannot authenticate (missing or invalid API key).

    Distinct from JudgeError so callers that normally swallow transient
    judge failures can let auth errors surface — retrying a bad key is
    pointless and would silently produce empty results. The built-in
    filters follow this pattern: they re-raise JudgeAuthError immediately
    while mapping other JudgeErrors to a failing FilterResult.
    """
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
from evalseed.filters.answerability import AnswerabilityFilter
|
|
2
|
+
from evalseed.filters.base import Filter, PreFilter
|
|
3
|
+
from evalseed.filters.difficulty import DifficultyFilter
|
|
4
|
+
from evalseed.filters.faithfulness import FaithfulnessFilter
|
|
5
|
+
from evalseed.filters.prefilters import LengthPreFilter, RegexPreFilter
|
|
6
|
+
from evalseed.filters.triviality import TrivialityFilter
|
|
7
|
+
|
|
8
|
+
__all__ = [
|
|
9
|
+
"AnswerabilityFilter",
|
|
10
|
+
"DifficultyFilter",
|
|
11
|
+
"FaithfulnessFilter",
|
|
12
|
+
"Filter",
|
|
13
|
+
"LengthPreFilter",
|
|
14
|
+
"PreFilter",
|
|
15
|
+
"RegexPreFilter",
|
|
16
|
+
"TrivialityFilter",
|
|
17
|
+
]
|
|
@@ -0,0 +1,79 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from evalseed.exceptions import JudgeAuthError, JudgeError
|
|
4
|
+
from evalseed.filters.base import Filter
|
|
5
|
+
from evalseed.schemas import FilterResult, QAPair
|
|
6
|
+
|
|
7
|
+
_SYSTEM = (
|
|
8
|
+
"You evaluate whether a question is well-posed and answerable from a "
|
|
9
|
+
"specific context, with a single defensible answer. Respond ONLY with JSON."
|
|
10
|
+
)
|
|
11
|
+
|
|
12
|
+
_USER_TEMPLATE = """Evaluate the QUESTION against the CONTEXT.
|
|
13
|
+
|
|
14
|
+
CONTEXT:
|
|
15
|
+
\"\"\"
|
|
16
|
+
{context}
|
|
17
|
+
\"\"\"
|
|
18
|
+
|
|
19
|
+
QUESTION: {question}
|
|
20
|
+
|
|
21
|
+
Decide:
|
|
22
|
+
1. Is the question unambiguous (a careful reader would not produce multiple equally valid answers)?
|
|
23
|
+
2. Is the question answerable from the context alone (no external knowledge required)?
|
|
24
|
+
3. Is the question self-contained (does not rely on pronouns or "the above" referring outside itself)?
|
|
25
|
+
|
|
26
|
+
Return JSON:
|
|
27
|
+
{{
|
|
28
|
+
"unambiguous": <true|false>,
|
|
29
|
+
"answerable": <true|false>,
|
|
30
|
+
"self_contained": <true|false>,
|
|
31
|
+
"reason": "<one short sentence if any check fails, else empty string>"
|
|
32
|
+
}}
|
|
33
|
+
"""
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
class AnswerabilityFilter(Filter):
    """Rejects ambiguous, externally-dependent, or non-self-contained questions."""

    name = "answerability"

    def evaluate(self, pair: QAPair) -> FilterResult:
        prompt = _USER_TEMPLATE.format(context=pair.context, question=pair.question)
        try:
            verdict = self.judge.judge(_SYSTEM, prompt)
        except JudgeAuthError:
            # Auth problems must abort the whole run, not fail one pair.
            raise
        except JudgeError as exc:
            return self._result(passed=False, reason=f"judge error: {exc}")

        # The three boolean checks the prompt asks the judge to return;
        # missing keys count as failures.
        checks = {
            "unambiguous": bool(verdict.get("unambiguous", False)),
            "answerable": bool(verdict.get("answerable", False)),
            "self_contained": bool(verdict.get("self_contained", False)),
        }
        if all(checks.values()):
            return self._result(passed=True, **checks)

        judge_reason = str(verdict.get("reason", "")).strip() or None
        failed_names = ", ".join(check for check, ok in checks.items() if not ok)
        return self._result(
            passed=False,
            reason=judge_reason or f"failed: {failed_names}",
            **checks,
        )
|
evalseed/filters/base.py
ADDED
|
@@ -0,0 +1,42 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from abc import ABC, abstractmethod
|
|
4
|
+
|
|
5
|
+
from evalseed.judges import Judge
|
|
6
|
+
from evalseed.schemas import FilterResult, QAPair
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
class PreFilter(ABC):
    """Cheap, judge-free filter — runs before any LLM calls."""

    # Unique identifier for this filter; subclasses must set it as a
    # class attribute.
    name: str

    @abstractmethod
    def evaluate(self, pair: QAPair) -> FilterResult: ...
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
class Filter(ABC):
    """LLM-judge-backed filter.

    Subclasses implement evaluate() and typically build their return value
    through the _result() helper so filter_name is filled in consistently.
    """

    # Unique identifier; _result() copies it into FilterResult.filter_name.
    name: str

    def __init__(self, judge: Judge) -> None:
        self.judge = judge

    @abstractmethod
    def evaluate(self, pair: QAPair) -> FilterResult: ...

    def _result(
        self,
        passed: bool,
        score: float | None = None,
        reason: str | None = None,
        **metadata: object,
    ) -> FilterResult:
        """Build a FilterResult tagged with this filter's name.

        Any extra keyword arguments are collected into the result's
        metadata dict.
        """
        return FilterResult(
            filter_name=self.name,
            passed=passed,
            score=score,
            reason=reason,
            metadata=dict(metadata),
        )
|
|
@@ -0,0 +1,99 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from evalseed.exceptions import JudgeAuthError, JudgeError
|
|
4
|
+
from evalseed.filters.base import Filter
|
|
5
|
+
from evalseed.judges import Judge
|
|
6
|
+
from evalseed.schemas import Difficulty, FilterResult, QAPair
|
|
7
|
+
|
|
8
|
+
_SYSTEM = (
|
|
9
|
+
"You assess the reasoning effort required by a question, given its "
|
|
10
|
+
"context. You output one of: 'easy', 'medium', 'hard'. Respond ONLY with JSON."
|
|
11
|
+
)
|
|
12
|
+
|
|
13
|
+
_USER_TEMPLATE = """Assess the difficulty of answering QUESTION from CONTEXT.
|
|
14
|
+
|
|
15
|
+
CONTEXT:
|
|
16
|
+
\"\"\"
|
|
17
|
+
{context}
|
|
18
|
+
\"\"\"
|
|
19
|
+
|
|
20
|
+
QUESTION: {question}
|
|
21
|
+
|
|
22
|
+
Difficulty rubric:
|
|
23
|
+
- easy: single-span lookup, no reasoning required.
|
|
24
|
+
- medium: requires combining 2-3 facts, light synthesis, or simple inference.
|
|
25
|
+
- hard: requires multi-step reasoning, comparison across distant spans, or non-trivial inference.
|
|
26
|
+
|
|
27
|
+
Return JSON:
|
|
28
|
+
{{
|
|
29
|
+
"predicted": "<easy|medium|hard>",
|
|
30
|
+
"reason": "<one short sentence>"
|
|
31
|
+
}}
|
|
32
|
+
"""
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
_LEVELS = {Difficulty.EASY: 0, Difficulty.MEDIUM: 1, Difficulty.HARD: 2}
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
class DifficultyFilter(Filter):
    """Compares judge-predicted difficulty with the pair's labeled value.

    By default this is a label-enrichment pass — the prediction (and, when
    a label exists, the agreement gap) is stored in metadata and the pair
    always passes. A 3-level easy/medium/hard rubric is too coarse for a
    one-step disagreement to be a quality signal, so disagreement-based
    rejection is opt-in.

    Modes:
        strict=False (default): always pass; record prediction + agreement.
        strict=True: reject only on a TWO-step gap (easy vs hard).
    """

    name = "difficulty"

    def __init__(self, judge: Judge, strict: bool = False) -> None:
        super().__init__(judge)
        self.strict = strict

    def evaluate(self, pair: QAPair) -> FilterResult:
        prompt = _USER_TEMPLATE.format(context=pair.context, question=pair.question)
        try:
            verdict = self.judge.judge(_SYSTEM, prompt)
        except JudgeAuthError:
            # Bad credentials should abort the run, not fail one pair.
            raise
        except JudgeError as exc:
            return self._result(passed=False, reason=f"judge error: {exc}")

        raw = str(verdict.get("predicted", "")).strip().lower()
        try:
            predicted = Difficulty(raw)
        except ValueError:
            # An unparseable prediction is recorded but never rejects.
            return self._result(
                passed=True,
                reason=f"unparseable difficulty: {raw!r}",
                predicted=raw,
            )

        if pair.difficulty is None:
            # No label to compare against — pure enrichment.
            return self._result(passed=True, predicted=predicted.value)

        gap = abs(_LEVELS[pair.difficulty] - _LEVELS[predicted])
        meta = {
            "labeled": pair.difficulty.value,
            "predicted": predicted.value,
            "gap": gap,
        }
        if self.strict and gap >= 2:
            return self._result(
                passed=False,
                reason=(
                    f"labeled difficulty {pair.difficulty.value!r} is two steps "
                    f"away from judge prediction {predicted.value!r}"
                ),
                **meta,
            )
        return self._result(passed=True, **meta)
|
|
@@ -0,0 +1,71 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from evalseed.exceptions import JudgeAuthError, JudgeError
|
|
4
|
+
from evalseed.filters.base import Filter
|
|
5
|
+
from evalseed.judges import Judge
|
|
6
|
+
from evalseed.schemas import FilterResult, QAPair
|
|
7
|
+
|
|
8
|
+
_SYSTEM = (
|
|
9
|
+
"You are a strict evaluator of factual grounding. You decide whether an "
|
|
10
|
+
"answer is fully supported by the provided context, with no information "
|
|
11
|
+
"added from outside the context. Respond ONLY with JSON."
|
|
12
|
+
)
|
|
13
|
+
|
|
14
|
+
_USER_TEMPLATE = """Evaluate whether the ANSWER is fully entailed by the CONTEXT for the QUESTION.
|
|
15
|
+
|
|
16
|
+
CONTEXT:
|
|
17
|
+
\"\"\"
|
|
18
|
+
{context}
|
|
19
|
+
\"\"\"
|
|
20
|
+
|
|
21
|
+
QUESTION: {question}
|
|
22
|
+
|
|
23
|
+
ANSWER: {answer}
|
|
24
|
+
|
|
25
|
+
Return JSON with this exact schema:
|
|
26
|
+
{{
|
|
27
|
+
"faithful": <true|false>,
|
|
28
|
+
"score": <float between 0 and 1>,
|
|
29
|
+
"reason": "<one short sentence>"
|
|
30
|
+
}}
|
|
31
|
+
|
|
32
|
+
Rules:
|
|
33
|
+
- "faithful" must be false if any factual claim in the answer is not supported by the context.
|
|
34
|
+
- "faithful" must be false if the answer adds quantitative details, dates, or names not in the context.
|
|
35
|
+
- Paraphrase is OK as long as every claim is supported.
|
|
36
|
+
"""
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
class FaithfulnessFilter(Filter):
    """Rejects pairs whose answer cannot be entailed from their context.

    A pair passes only when the judge marks it faithful AND the judge's
    score meets ``threshold``.
    """

    name = "faithfulness"

    def __init__(self, judge: Judge, threshold: float = 0.7) -> None:
        super().__init__(judge)
        self.threshold = threshold

    def evaluate(self, pair: QAPair) -> FilterResult:
        prompt = _USER_TEMPLATE.format(
            context=pair.context,
            question=pair.question,
            answer=pair.answer,
        )
        try:
            verdict = self.judge.judge(_SYSTEM, prompt)
        except JudgeAuthError:
            # Auth failures must surface; retrying a bad key is pointless.
            raise
        except JudgeError as exc:
            return self._result(passed=False, reason=f"judge error: {exc}")

        is_faithful = bool(verdict.get("faithful", False))
        try:
            score = float(verdict.get("score", 0.0))
        except (TypeError, ValueError):
            # Non-numeric score from the judge counts as zero confidence.
            score = 0.0
        passed = is_faithful and score >= self.threshold
        reason = None if passed else (str(verdict.get("reason", "")).strip() or None)
        return self._result(passed=passed, score=score, reason=reason)
|