evalseed-0.1.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
evalseed/__init__.py ADDED
@@ -0,0 +1,31 @@
+ """evalseed — quality-filtered synthetic Q&A datasets for RAG evaluation."""
+
+ from evalseed.dataset import Dataset
+ from evalseed.exceptions import (
+     EvalseedError,
+     FilterError,
+     GenerationError,
+     JudgeAuthError,
+     JudgeError,
+ )
+ from evalseed.judges import Judge, OpenAIJudge
+ from evalseed.pipeline import Pipeline
+ from evalseed.schemas import FilterResult, QAPair, QAType
+
+ __version__ = "0.1.0"
+
+ __all__ = [
+     "Dataset",
+     "EvalseedError",
+     "FilterError",
+     "FilterResult",
+     "GenerationError",
+     "Judge",
+     "JudgeAuthError",
+     "JudgeError",
+     "OpenAIJudge",
+     "Pipeline",
+     "QAPair",
+     "QAType",
+     "__version__",
+ ]
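For orientation, a minimal usage sketch of the public API re-exported here. It assumes an OpenAI API key is configured for the judge; the constructor arguments mirror the ones the CLI below passes, and the corpus path is a placeholder.

    from evalseed import OpenAIJudge, Pipeline, QAType

    judge = OpenAIJudge(model="gpt-4o-mini")
    pipeline = Pipeline(
        judge=judge,
        n_pairs=50,
        types=[QAType.SINGLE_HOP, QAType.MULTI_HOP],
        seed=7,
    )
    dataset = pipeline.generate_from_corpus("docs/")  # file or directory of .txt/.md/.rst
    dataset.save("eval.jsonl")                        # passed pairs only, by default
    print(dataset.stats())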
evalseed/cli.py ADDED
@@ -0,0 +1,75 @@
+ from __future__ import annotations
+
+ import argparse
+ import sys
+ from pathlib import Path
+
+ from evalseed import JudgeAuthError, OpenAIJudge, Pipeline
+ from evalseed.schemas import QAType
+
+
+ def _parse_types(value: str) -> list[QAType]:
+     out: list[QAType] = []
+     for raw in value.split(","):
+         raw = raw.strip()
+         if not raw:
+             continue
+         out.append(QAType(raw))
+     if not out:
+         raise argparse.ArgumentTypeError("at least one QA type required")
+     return out
+
+
+ def main(argv: list[str] | None = None) -> int:
+     parser = argparse.ArgumentParser(
+         prog="evalseed",
+         description="Generate quality-filtered synthetic Q&A datasets for RAG evaluation.",
+     )
+     parser.add_argument("corpus", help="Path to a corpus file or directory.")
+     parser.add_argument(
+         "-o",
+         "--out",
+         default=None,
+         help="Output JSONL path. Defaults to <corpus>/output/eval.jsonl "
+         "(or <corpus_parent>/output/eval.jsonl if corpus is a file).",
+     )
+     parser.add_argument("-n", "--n-pairs", type=int, default=50, help="Target pair count.")
+     parser.add_argument(
+         "--types",
+         type=_parse_types,
+         default=[QAType.SINGLE_HOP, QAType.MULTI_HOP],
+         help="Comma-separated QA types: single_hop,multi_hop,distractor",
+     )
+     parser.add_argument("--model", default="gpt-4o-mini", help="OpenAI model id.")
+     parser.add_argument("--seed", type=int, default=None)
+     parser.add_argument("--all", action="store_true", help="Save passed AND rejected pairs.")
+
+     args = parser.parse_args(argv)
+     out_path = _resolve_out_path(args.corpus, args.out)
+     try:
+         judge = OpenAIJudge(model=args.model)
+         pipeline = Pipeline(
+             judge=judge,
+             n_pairs=args.n_pairs,
+             types=args.types,
+             seed=args.seed,
+         )
+         dataset = pipeline.generate_from_corpus(args.corpus)
+     except JudgeAuthError as exc:
+         print(f"error: {exc}", file=sys.stderr)
+         return 2
+     dataset.save(out_path, only_passed=not args.all)
+     print(f"wrote {out_path}")
+     return 0
+
+
+ def _resolve_out_path(corpus: str, out: str | None) -> Path:
+     if out is not None:
+         return Path(out)
+     corpus_path = Path(corpus)
+     base = corpus_path if corpus_path.is_dir() else corpus_path.parent
+     return base / "output" / "eval.jsonl"
+
+
+ if __name__ == "__main__":
+     sys.exit(main())
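The same entry point can also be driven programmatically. A hedged sketch (the corpus path is made up) equivalent to running the console script with these flags:

    from evalseed.cli import main

    # Equivalent to: evalseed ./docs -n 25 --types single_hop --model gpt-4o-mini
    exit_code = main(["./docs", "-n", "25", "--types", "single_hop", "--model", "gpt-4o-mini"])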
evalseed/corpus.py ADDED
@@ -0,0 +1,101 @@
+ from __future__ import annotations
+
+ import re
+ from dataclasses import dataclass
+ from pathlib import Path
+
+ from evalseed.exceptions import GenerationError
+
+
+ @dataclass(frozen=True)
+ class Chunk:
+     """A contiguous slice of a source document."""
+
+     text: str
+     source: str
+     chunk_index: int
+
+
+ _SUPPORTED_SUFFIXES = {".txt", ".md", ".markdown", ".rst"}
+
+
+ def load_corpus(path: str | Path) -> list[tuple[str, str]]:
+     """Load (source_name, text) pairs from a file or directory.
+
+     Supports plain-text formats only in v0.1 (.txt, .md, .markdown, .rst).
+     PDF/HTML loaders are intentionally deferred — users can preprocess.
+     """
+     p = Path(path)
+     if not p.exists():
+         raise GenerationError(f"corpus path does not exist: {p}")
+
+     if p.is_file():
+         return [(p.name, _read_text(p))]
+
+     docs: list[tuple[str, str]] = []
+     for sub in sorted(p.rglob("*")):
+         if sub.is_file() and sub.suffix.lower() in _SUPPORTED_SUFFIXES:
+             docs.append((str(sub.relative_to(p)), _read_text(sub)))
+     if not docs:
+         raise GenerationError(
+             f"no supported files (.txt/.md/.rst) found under {p}"
+         )
+     return docs
+
+
+ def _read_text(path: Path) -> str:
+     try:
+         return path.read_text(encoding="utf-8")
+     except UnicodeDecodeError:
+         return path.read_text(encoding="utf-8", errors="replace")
+
+
+ def chunk_text(
+     text: str,
+     source: str,
+     target_chars: int = 1500,
+     overlap_chars: int = 150,
+ ) -> list[Chunk]:
+     """Paragraph-aware char-window chunker.
+
+     Greedily packs paragraphs into windows of roughly ``target_chars``,
+     with character overlap between consecutive windows. Good enough for
+     v0.1; more sophisticated chunking is a v0.2 concern.
+     """
+     if target_chars <= 0:
+         raise ValueError("target_chars must be positive")
+     if overlap_chars < 0 or overlap_chars >= target_chars:
+         raise ValueError("overlap_chars must be in [0, target_chars)")
+
+     paragraphs = [p.strip() for p in re.split(r"\n\s*\n", text) if p.strip()]
+     if not paragraphs:
+         return []
+
+     chunks: list[Chunk] = []
+     buf: list[str] = []
+     buf_len = 0
+     idx = 0
+
+     def flush() -> None:
+         nonlocal buf, buf_len, idx
+         if not buf:
+             return
+         chunk_text_value = "\n\n".join(buf)
+         chunks.append(Chunk(text=chunk_text_value, source=source, chunk_index=idx))
+         idx += 1
+         if overlap_chars and len(chunk_text_value) > overlap_chars:
+             tail = chunk_text_value[-overlap_chars:]
+             buf = [tail]
+             buf_len = len(tail)
+         else:
+             buf = []
+             buf_len = 0
+
+     for para in paragraphs:
+         if buf_len + len(para) + 2 > target_chars and buf:
+             flush()
+         buf.append(para)
+         buf_len += len(para) + 2
+
+     flush()
+     return chunks
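A small illustration of the chunker's behavior (paragraph packing plus trailing-character overlap); the sample document is invented:

    from evalseed.corpus import chunk_text

    doc = "\n\n".join(f"Paragraph {i}. " + "lorem " * 80 for i in range(6))
    chunks = chunk_text(doc, source="sample.md", target_chars=1500, overlap_chars=150)
    for c in chunks:
        # consecutive chunks share roughly overlap_chars trailing/leading characters
        print(c.chunk_index, c.source, len(c.text))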
evalseed/dataset.py ADDED
@@ -0,0 +1,79 @@
+ from __future__ import annotations
+
+ import json
+ from collections import Counter
+ from collections.abc import Iterator
+ from pathlib import Path
+ from typing import overload
+
+ from evalseed.schemas import QAPair
+
+
+ class Dataset:
+     """A collection of QA pairs with serialization and stats helpers.
+
+     Iteration yields all pairs (passed and rejected). Use ``passed`` /
+     ``rejected`` to filter, or ``save`` to write only passed pairs to disk.
+     """
+
+     def __init__(self, pairs: list[QAPair]) -> None:
+         self._pairs: list[QAPair] = list(pairs)
+
+     def __len__(self) -> int:
+         return len(self._pairs)
+
+     def __iter__(self) -> Iterator[QAPair]:
+         return iter(self._pairs)
+
+     @overload
+     def __getitem__(self, key: int) -> QAPair: ...
+     @overload
+     def __getitem__(self, key: slice) -> Dataset: ...
+     def __getitem__(self, key: int | slice) -> QAPair | Dataset:
+         if isinstance(key, slice):
+             return Dataset(self._pairs[key])
+         return self._pairs[key]
+
+     @property
+     def passed(self) -> Dataset:
+         return Dataset([p for p in self._pairs if p.passed])
+
+     @property
+     def rejected(self) -> Dataset:
+         return Dataset([p for p in self._pairs if not p.passed])
+
+     def stats(self) -> dict[str, object]:
+         total = len(self._pairs)
+         passed = sum(1 for p in self._pairs if p.passed)
+         rejection_counts: Counter[str] = Counter()
+         for pair in self._pairs:
+             for r in pair.filter_results:
+                 if not r.passed:
+                     rejection_counts[r.filter_name] += 1
+         return {
+             "total": total,
+             "passed": passed,
+             "rejected": total - passed,
+             "pass_rate": passed / total if total else 0.0,
+             "rejections_by_filter": dict(rejection_counts),
+         }
+
+     def save(self, path: str | Path, only_passed: bool = True) -> None:
+         """Write pairs to a JSONL file. By default writes only passed pairs."""
+         target = Path(path)
+         target.parent.mkdir(parents=True, exist_ok=True)
+         pairs = self.passed if only_passed else self
+         with target.open("w", encoding="utf-8") as f:
+             for pair in pairs:
+                 f.write(pair.model_dump_json() + "\n")
+
+     @classmethod
+     def load(cls, path: str | Path) -> Dataset:
+         pairs: list[QAPair] = []
+         with Path(path).open(encoding="utf-8") as f:
+             for line in f:
+                 line = line.strip()
+                 if not line:
+                     continue
+                 pairs.append(QAPair.model_validate(json.loads(line)))
+         return cls(pairs)
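A short round-trip sketch for these helpers, assuming an eval.jsonl already produced by the pipeline (the paths are placeholders):

    from evalseed import Dataset

    ds = Dataset.load("output/eval.jsonl")
    print(ds.stats())               # total / passed / rejected / pass_rate / rejections_by_filter
    ds.save("eval_passed.jsonl")    # only_passed=True is the default
    ds.rejected.save("eval_rejected.jsonl", only_passed=False)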
evalseed/exceptions.py ADDED
@@ -0,0 +1,23 @@
+ class EvalseedError(Exception):
+     """Base class for all evalseed errors."""
+
+
+ class GenerationError(EvalseedError):
+     """Raised when QA pair generation fails."""
+
+
+ class FilterError(EvalseedError):
+     """Raised when a filter cannot evaluate a pair."""
+
+
+ class JudgeError(EvalseedError):
+     """Raised when the underlying judge LLM call fails."""
+
+
+ class JudgeAuthError(JudgeError):
+     """Raised when the judge cannot authenticate (missing or invalid API key).
+
+     Distinct from JudgeError so callers that normally swallow transient
+     judge failures can let auth errors surface — retrying a bad key is
+     pointless and would silently produce empty results.
+     """
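A minimal sketch of the handling pattern this split is designed for, mirroring what the bundled filters do: transient judge failures degrade into a rejection reason, while auth errors propagate. The two-argument judge call has the same shape the filters use; the model id and prompts are placeholders.

    from evalseed import JudgeAuthError, JudgeError, OpenAIJudge

    judge = OpenAIJudge(model="gpt-4o-mini")
    try:
        verdict = judge.judge("Respond ONLY with JSON.", 'Return {"ok": true}')
    except JudgeAuthError:
        raise  # missing or invalid API key: surface immediately, retrying cannot help
    except JudgeError as exc:
        verdict = {"ok": False, "reason": f"judge error: {exc}"}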
evalseed/filters/__init__.py ADDED
@@ -0,0 +1,17 @@
+ from evalseed.filters.answerability import AnswerabilityFilter
+ from evalseed.filters.base import Filter, PreFilter
+ from evalseed.filters.difficulty import DifficultyFilter
+ from evalseed.filters.faithfulness import FaithfulnessFilter
+ from evalseed.filters.prefilters import LengthPreFilter, RegexPreFilter
+ from evalseed.filters.triviality import TrivialityFilter
+
+ __all__ = [
+     "AnswerabilityFilter",
+     "DifficultyFilter",
+     "FaithfulnessFilter",
+     "Filter",
+     "LengthPreFilter",
+     "PreFilter",
+     "RegexPreFilter",
+     "TrivialityFilter",
+ ]
evalseed/filters/answerability.py ADDED
@@ -0,0 +1,79 @@
+ from __future__ import annotations
+
+ from evalseed.exceptions import JudgeAuthError, JudgeError
+ from evalseed.filters.base import Filter
+ from evalseed.schemas import FilterResult, QAPair
+
+ _SYSTEM = (
+     "You evaluate whether a question is well-posed and answerable from a "
+     "specific context, with a single defensible answer. Respond ONLY with JSON."
+ )
+
+ _USER_TEMPLATE = """Evaluate the QUESTION against the CONTEXT.
+
+ CONTEXT:
+ \"\"\"
+ {context}
+ \"\"\"
+
+ QUESTION: {question}
+
+ Decide:
+ 1. Is the question unambiguous (a careful reader would not produce multiple equally valid answers)?
+ 2. Is the question answerable from the context alone (no external knowledge required)?
+ 3. Is the question self-contained (does not rely on pronouns or "the above" referring outside itself)?
+
+ Return JSON:
+ {{
+ "unambiguous": <true|false>,
+ "answerable": <true|false>,
+ "self_contained": <true|false>,
+ "reason": "<one short sentence if any check fails, else empty string>"
+ }}
+ """
+
+
+ class AnswerabilityFilter(Filter):
+     """Rejects ambiguous, externally-dependent, or non-self-contained questions."""
+
+     name = "answerability"
+
+     def evaluate(self, pair: QAPair) -> FilterResult:
+         try:
+             result = self.judge.judge(
+                 _SYSTEM,
+                 _USER_TEMPLATE.format(context=pair.context, question=pair.question),
+             )
+         except JudgeAuthError:
+             raise
+         except JudgeError as exc:
+             return self._result(passed=False, reason=f"judge error: {exc}")
+
+         unambiguous = bool(result.get("unambiguous", False))
+         answerable = bool(result.get("answerable", False))
+         self_contained = bool(result.get("self_contained", False))
+         passed = unambiguous and answerable and self_contained
+         reason = str(result.get("reason", "")).strip() or None
+         if passed:
+             return self._result(
+                 passed=True,
+                 unambiguous=unambiguous,
+                 answerable=answerable,
+                 self_contained=self_contained,
+             )
+         failed = [
+             name
+             for name, ok in (
+                 ("unambiguous", unambiguous),
+                 ("answerable", answerable),
+                 ("self_contained", self_contained),
+             )
+             if not ok
+         ]
+         return self._result(
+             passed=False,
+             reason=reason or f"failed: {', '.join(failed)}",
+             unambiguous=unambiguous,
+             answerable=answerable,
+             self_contained=self_contained,
+         )
evalseed/filters/base.py ADDED
@@ -0,0 +1,42 @@
+ from __future__ import annotations
+
+ from abc import ABC, abstractmethod
+
+ from evalseed.judges import Judge
+ from evalseed.schemas import FilterResult, QAPair
+
+
+ class PreFilter(ABC):
+     """Cheap, judge-free filter — runs before any LLM calls."""
+
+     name: str
+
+     @abstractmethod
+     def evaluate(self, pair: QAPair) -> FilterResult: ...
+
+
+ class Filter(ABC):
+     """LLM-judge-backed filter."""
+
+     name: str
+
+     def __init__(self, judge: Judge) -> None:
+         self.judge = judge
+
+     @abstractmethod
+     def evaluate(self, pair: QAPair) -> FilterResult: ...
+
+     def _result(
+         self,
+         passed: bool,
+         score: float | None = None,
+         reason: str | None = None,
+         **metadata: object,
+     ) -> FilterResult:
+         return FilterResult(
+             filter_name=self.name,
+             passed=passed,
+             score=score,
+             reason=reason,
+             metadata=dict(metadata),
+         )
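For illustration, a hypothetical judge-free pre-filter built on these base classes; the class name and rule are invented, and the FilterResult fields match the ones _result populates above.

    from evalseed.filters.base import PreFilter
    from evalseed.schemas import FilterResult, QAPair


    class QuestionMarkPreFilter(PreFilter):
        """Hypothetical example: reject questions that do not end with '?'."""

        name = "question_mark"

        def evaluate(self, pair: QAPair) -> FilterResult:
            ok = pair.question.strip().endswith("?")
            return FilterResult(
                filter_name=self.name,
                passed=ok,
                score=None,
                reason=None if ok else "question does not end with '?'",
                metadata={},
            )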
evalseed/filters/difficulty.py ADDED
@@ -0,0 +1,99 @@
+ from __future__ import annotations
+
+ from evalseed.exceptions import JudgeAuthError, JudgeError
+ from evalseed.filters.base import Filter
+ from evalseed.judges import Judge
+ from evalseed.schemas import Difficulty, FilterResult, QAPair
+
+ _SYSTEM = (
+     "You assess the reasoning effort required by a question, given its "
+     "context. You output one of: 'easy', 'medium', 'hard'. Respond ONLY with JSON."
+ )
+
+ _USER_TEMPLATE = """Assess the difficulty of answering QUESTION from CONTEXT.
+
+ CONTEXT:
+ \"\"\"
+ {context}
+ \"\"\"
+
+ QUESTION: {question}
+
+ Difficulty rubric:
+ - easy: single-span lookup, no reasoning required.
+ - medium: requires combining 2-3 facts, light synthesis, or simple inference.
+ - hard: requires multi-step reasoning, comparison across distant spans, or non-trivial inference.
+
+ Return JSON:
+ {{
+ "predicted": "<easy|medium|hard>",
+ "reason": "<one short sentence>"
+ }}
+ """
+
+
+ _LEVELS = {Difficulty.EASY: 0, Difficulty.MEDIUM: 1, Difficulty.HARD: 2}
+
+
+ class DifficultyFilter(Filter):
+     """Records judge-predicted difficulty alongside the labeled value.
+
+     By default this is a label-enrichment pass — the prediction is stored in
+     metadata and the pair always passes. A 3-level rubric like easy/medium/hard
+     is too coarse for one-step disagreements ("easy" vs "medium") to be a
+     quality signal, so disagreement-based rejection is opt-in.
+
+     Modes:
+         strict=False (default): always pass; record prediction + agreement.
+         strict=True: reject only on a TWO-step gap (easy↔hard).
+     """
+
+     name = "difficulty"
+
+     def __init__(self, judge: Judge, strict: bool = False) -> None:
+         super().__init__(judge)
+         self.strict = strict
+
+     def evaluate(self, pair: QAPair) -> FilterResult:
+         try:
+             result = self.judge.judge(
+                 _SYSTEM,
+                 _USER_TEMPLATE.format(context=pair.context, question=pair.question),
+             )
+         except JudgeAuthError:
+             raise
+         except JudgeError as exc:
+             return self._result(passed=False, reason=f"judge error: {exc}")
+
+         predicted_raw = str(result.get("predicted", "")).strip().lower()
+         try:
+             predicted = Difficulty(predicted_raw)
+         except ValueError:
+             return self._result(
+                 passed=True,
+                 reason=f"unparseable difficulty: {predicted_raw!r}",
+                 predicted=predicted_raw,
+             )
+
+         if pair.difficulty is None:
+             return self._result(passed=True, predicted=predicted.value)
+
+         gap = abs(_LEVELS[pair.difficulty] - _LEVELS[predicted])
+         if not self.strict or gap < 2:
+             return self._result(
+                 passed=True,
+                 labeled=pair.difficulty.value,
+                 predicted=predicted.value,
+                 gap=gap,
+             )
+
+         return self._result(
+             passed=False,
+             reason=(
+                 f"labeled difficulty {pair.difficulty.value!r} is two steps "
+                 f"away from judge prediction {predicted.value!r}"
+             ),
+             labeled=pair.difficulty.value,
+             predicted=predicted.value,
+             gap=gap,
+         )
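Usage note: strict mode is opt-in at construction time. A brief sketch (the model id is a placeholder):

    from evalseed import OpenAIJudge
    from evalseed.filters import DifficultyFilter

    judge = OpenAIJudge(model="gpt-4o-mini")
    enrich_only = DifficultyFilter(judge)          # default: record prediction, always pass
    strict = DifficultyFilter(judge, strict=True)  # reject only on an easy vs hard gap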
evalseed/filters/faithfulness.py ADDED
@@ -0,0 +1,71 @@
+ from __future__ import annotations
+
+ from evalseed.exceptions import JudgeAuthError, JudgeError
+ from evalseed.filters.base import Filter
+ from evalseed.judges import Judge
+ from evalseed.schemas import FilterResult, QAPair
+
+ _SYSTEM = (
+     "You are a strict evaluator of factual grounding. You decide whether an "
+     "answer is fully supported by the provided context, with no information "
+     "added from outside the context. Respond ONLY with JSON."
+ )
+
+ _USER_TEMPLATE = """Evaluate whether the ANSWER is fully entailed by the CONTEXT for the QUESTION.
+
+ CONTEXT:
+ \"\"\"
+ {context}
+ \"\"\"
+
+ QUESTION: {question}
+
+ ANSWER: {answer}
+
+ Return JSON with this exact schema:
+ {{
+ "faithful": <true|false>,
+ "score": <float between 0 and 1>,
+ "reason": "<one short sentence>"
+ }}
+
+ Rules:
+ - "faithful" must be false if any factual claim in the answer is not supported by the context.
+ - "faithful" must be false if the answer adds quantitative details, dates, or names not in the context.
+ - Paraphrase is OK as long as every claim is supported.
+ """
+
+
+ class FaithfulnessFilter(Filter):
+     """Rejects pairs where the answer cannot be entailed from the context."""
+
+     name = "faithfulness"
+
+     def __init__(self, judge: Judge, threshold: float = 0.7) -> None:
+         super().__init__(judge)
+         self.threshold = threshold
+
+     def evaluate(self, pair: QAPair) -> FilterResult:
+         try:
+             result = self.judge.judge(
+                 _SYSTEM,
+                 _USER_TEMPLATE.format(
+                     context=pair.context,
+                     question=pair.question,
+                     answer=pair.answer,
+                 ),
+             )
+         except JudgeAuthError:
+             raise
+         except JudgeError as exc:
+             return self._result(passed=False, reason=f"judge error: {exc}")
+
+         faithful = bool(result.get("faithful", False))
+         score_raw = result.get("score", 0.0)
+         try:
+             score = float(score_raw)
+         except (TypeError, ValueError):
+             score = 0.0
+         reason = str(result.get("reason", "")).strip() or None
+         passed = faithful and score >= self.threshold
+         return self._result(passed=passed, score=score, reason=reason if not passed else None)
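Usage note: the pass condition is the conjunction of the judge's boolean verdict and the score threshold, so tightening grounding is a one-argument change. A sketch (the model id is a placeholder):

    from evalseed import OpenAIJudge
    from evalseed.filters import FaithfulnessFilter

    judge = OpenAIJudge(model="gpt-4o-mini")
    default_filter = FaithfulnessFilter(judge)                # threshold=0.7
    strict_filter = FaithfulnessFilter(judge, threshold=0.9)  # stricter grounding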