raglab 0.2.2__tar.gz → 0.2.3__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: raglab
3
- Version: 0.2.2
3
+ Version: 0.2.3
4
4
  Summary: A medley of tools to make RAG-based applications.
5
5
  Project-URL: Homepage, https://github.com/thorwhalen/raglab
6
6
  Project-URL: Repository, https://github.com/thorwhalen/raglab
@@ -9,7 +9,7 @@ Author: thorwhalen
9
9
  License: mit
10
10
  License-File: LICENSE
11
11
  Requires-Python: >=3.10
12
- Requires-Dist: ir>=0.1.12
12
+ Requires-Dist: ir>=0.1.14
13
13
  Provides-Extra: dev
14
14
  Requires-Dist: pytest-cov>=4.0; extra == 'dev'
15
15
  Requires-Dist: pytest>=7.0; extra == 'dev'
@@ -6,7 +6,7 @@ build-backend = "hatchling.build"
6
6
 
7
7
  [project]
8
8
  name = "raglab"
9
- version = "0.2.2"
9
+ version = "0.2.3"
10
10
  description = "A medley of tools to make RAG-based applications."
11
11
  readme = "README.md"
12
12
  requires-python = ">=3.10"
@@ -18,7 +18,7 @@ authors = [
18
18
  # strategies (Planner/Formulator/Evaluator) use `oa` lazily — the `llm` extra
19
19
  # below — so `import raglab` stays offline by default.
20
20
  dependencies = [
21
- "ir>=0.1.12",
21
+ "ir>=0.1.14",
22
22
  ]
23
23
 
24
24
  [project.license]
@@ -47,6 +47,7 @@ from .agent import (
47
47
  score_reranker,
48
48
  single_subtask_planner,
49
49
  )
50
+ from .llm import EVALUATION_PROMPT, make_llm_evaluator, make_llm_formulator
50
51
 
51
52
  __all__ = [
52
53
  "Query",
@@ -69,4 +70,7 @@ __all__ = [
69
70
  "passthrough_evaluator",
70
71
  "score_reranker",
71
72
  "identity_citer",
73
+ "make_llm_formulator",
74
+ "make_llm_evaluator",
75
+ "EVALUATION_PROMPT",
72
76
  ]
@@ -30,6 +30,7 @@ from typing import Any, Protocol, runtime_checkable
30
30
  # ir owns the retrieval substrate: the Result type and the Retriever leaf
31
31
  # contract live there (one-way dependency, ir is the SSOT).
32
32
  from ir import Retriever, SearchHit
33
+ from ir.base import best_per_artifact
33
34
 
34
35
  #: A retrieved item — ir's :class:`~ir.base.SearchHit` (ir_09's ``Result``):
35
36
  #: a *pointer + snippet* (``text``) with a ``score`` and ``metadata``.
@@ -188,13 +189,19 @@ def passthrough_evaluator(task: SubTask, results: Sequence[Result]) -> Judgement
188
189
 
189
190
 
190
191
  def score_reranker(results: Sequence[Result]) -> Sequence[Result]:
191
- """Final ordering by descending ``score`` (the cross-source merge, v1).
192
+ """Cross-source merge (v1): one surface per artifact, ordered by descending score.
193
+
194
+ Delegates to :func:`ir.base.best_per_artifact` (ir is the SSOT for hit
195
+ operations): an artifact retrieved by several queries / sources / rounds —
196
+ common once the back-edge re-queries — survives once, at its highest score, so
197
+ the merged list carries no duplicate ``artifact_id``. Also the evaluator's
198
+ pre-selection rank, so :func:`ir.select` never sees duplicates either.
192
199
 
193
200
  Note: a plain score sort assumes comparable score scales across sources
194
201
  (true when they share an embedder + mode). A rank-based (RRF) cross-source
195
202
  merge for heterogeneous backends is a documented follow-up.
196
203
  """
197
- return sorted(results, key=lambda r: float(getattr(r, "score", 0.0)), reverse=True)
204
+ return best_per_artifact(results)
198
205
 
199
206
 
200
207
  def identity_citer(results: Sequence[Result]) -> Sequence[Result]:
@@ -0,0 +1,257 @@
1
+ """LLM-backed roles — the Formulator and Evaluator (ir_09 §8 steps 1 & 3).
2
+
3
+ `raglab`'s no-LLM thin slice (:mod:`raglab.agent`) runs offline: an identity
4
+ formulator and a pass-through evaluator. This module supplies the two LLM roles
5
+ that turn that slice into a real agent — query understanding and the back-edge:
6
+
7
+ - :func:`make_llm_formulator` — query rewrite / expand / HyDE. A thin adapter
8
+ over ir's :func:`ir.make_llm_formulator` (the ``formulate=`` seam, ir_09 §3):
9
+ ir owns the lazy-:mod:`oa` rewriter and the identity fallback; raglab only
10
+ adapts the shape ``str -> str | [str]`` to the role contract
11
+ ``(SubTask, source) -> [LowLevelQuery]``.
12
+ - :func:`make_llm_evaluator` — sufficiency + refinement. ``ir.select`` owns the
13
+ *relevance* decision (the calibrated committed subset, ir_09 §3); the LLM owns
14
+ *sufficiency* — informed by ir's model-free :attr:`ir.Selection.sufficient`
15
+ hint, it judges whether the committed subset actually satisfies the goal and,
16
+ when it does not, emits a ``refinement`` SubTask. That refinement is the
17
+ **back-edge** that makes the loop an agent rather than a DAG.
18
+
19
+ Two load-bearing boundaries (ir ↔ raglab), guarded here:
20
+
21
+ 1. A Formulator returns **queries, never SubTasks** — decomposition is the
22
+ Planner's job. :func:`make_llm_formulator` only ever yields
23
+ :class:`~raglab.agent.LowLevelQuery`\\ s.
24
+ 2. The **back-edge lives in raglab**, never in ir: ir derives a ``sufficient``
25
+ *signal* from its own selection; raglab's Evaluator is what reads it, decides,
26
+ and re-queries.
27
+
28
+ Both builders mirror :func:`ir.select.make_llm_selector`: an injectable callable
29
+ (a test double, or your own router), built lazily on :mod:`oa` only when omitted
30
+ (so ``import raglab`` stays offline), with a **safe fallback** — a formulator must
31
+ never make retrieval worse than the raw query, and an evaluator must never
32
+ fabricate an endless loop (on any failure it returns no refinement, which is the
33
+ loop's break condition).
34
+ """
35
+
36
+ from __future__ import annotations
37
+
38
+ from collections.abc import Mapping, Sequence
39
+ from typing import Any, Callable
40
+
41
+ import ir
42
+
43
+ from .agent import (
44
+ Evaluator,
45
+ Formulator,
46
+ Judgement,
47
+ LowLevelQuery,
48
+ Result,
49
+ SubTask,
50
+ score_reranker,
51
+ )
52
+
53
+ __all__ = ["make_llm_formulator", "make_llm_evaluator", "EVALUATION_PROMPT"]
54
+
55
+ #: An ir-style query formulator: a query string -> one query or several.
56
+ QueryFormulator = Callable[[str], "str | Sequence[str]"]
57
+
58
+ #: An evaluator judge: ``(goal, results) -> (sufficient, refinement)``.
59
+ #: ``sufficient`` ends the loop; a non-empty ``refinement`` query becomes the
60
+ #: next sub-goal (the back-edge). A raw text reply is also accepted and parsed.
61
+ Judge = Callable[..., "tuple[bool, str | None] | str"]
62
+
63
+ #: Truncate each rendered result's text to this many chars in the judge prompt.
64
+ DFLT_MAX_RESULT_CHARS = 500
65
+
66
+
67
+ # --------------------------------------------------------------------------- #
68
+ # LLM Formulator — adapt ir's query-level formulator to the role contract
69
+ # --------------------------------------------------------------------------- #
70
+
71
+
72
def make_llm_formulator(
    *,
    formulate: QueryFormulator | None = None,
    params: Mapping[str, Any] | None = None,
    **make_kwargs: Any,
) -> Formulator:
    """Build an LLM-backed raglab :class:`~raglab.agent.Formulator`.

    Adapts an ir-style query formulator (``str -> str | [str]``) to the role
    contract ``(SubTask, source) -> [LowLevelQuery]``: the sub-goal text is
    rewritten / expanded into one or more search queries, each wrapped as a
    :class:`~raglab.agent.LowLevelQuery` against ``source``.

    Args:
        formulate: an injectable ir-style formulator (a test double, or one you
            built). When omitted it is built once via
            :func:`ir.make_llm_formulator` (``**make_kwargs`` are forwarded —
            e.g. ``n``, ``prompt``, ``rewriter``).
        params: per-call retriever overrides (e.g. ``{"mode": "hybrid", "k": 5}``)
            attached to every emitted :class:`~raglab.agent.LowLevelQuery`
            (each query gets its own copy of the mapping).
        **make_kwargs: forwarded to :func:`ir.make_llm_formulator`; ignored
            when ``formulate`` is given.

    Returns:
        A ``(task, source) -> list[LowLevelQuery]`` callable. The list is never
        empty: if the formulator raises, returns nothing usable (``None``, an
        empty or blank result, a non-iterable, non-string items), the raw
        ``task.goal`` is used as the single query — a formulator must never
        make retrieval worse than the raw sub-goal.

    The boundary: this returns **queries, never SubTasks**.
    """
    extra = dict(params or {})
    fn: QueryFormulator = (
        formulate if formulate is not None else ir.make_llm_formulator(**make_kwargs)
    )

    def formulator(task: SubTask, source: str) -> list[LowLevelQuery]:
        try:
            out = fn(task.goal)
            # Normalise *inside* the ``try``: a malformed reply (a non-iterable,
            # or an object whose truthiness is ambiguous) must trigger the same
            # fallback as a raising formulator rather than escape the role.
            candidates = [out] if isinstance(out, str) else list(out or [])
        except Exception:
            candidates = []  # fall back to the raw goal below
        queries = [q for q in candidates if isinstance(q, str) and q.strip()]
        if not queries:
            queries = [task.goal]
        # ``dict(extra)`` per query so callers mutating one query's params
        # cannot affect its siblings.
        return [
            LowLevelQuery(source=source, query=q, params=dict(extra)) for q in queries
        ]

    return formulator
118
+
119
+
120
+ # --------------------------------------------------------------------------- #
121
+ # LLM Evaluator — ir.select owns relevance; the LLM owns sufficiency + the
122
+ # back-edge (ir_09 §3/§4)
123
+ # --------------------------------------------------------------------------- #
124
+
125
+ #: Default prompt for :func:`make_llm_evaluator` — judge sufficiency, else emit a
126
+ #: single improved query (the refinement that drives the back-edge).
127
+ EVALUATION_PROMPT = """\
128
+ A search agent is pursuing this goal:
129
+
130
+ {goal}
131
+
132
+ A calibrated selector reviewed the retrieved candidates and committed to the
133
+ results below (it abstained if this list is empty):
134
+
135
+ {results}
136
+
137
+ Decide whether these results are SUFFICIENT to satisfy the goal.
138
+ - If they are, reply with exactly: SUFFICIENT
139
+ - If they are not, reply with: INSUFFICIENT
140
+ then, on the next line, a single improved search query that would retrieve what
141
+ is still missing. Keep it a terse search phrase — no prose, no numbering.
142
+ """
143
+
144
+
145
def make_llm_evaluator(
    *,
    judge: Judge | None = None,
    select_strategy: str = "conservative",
    select_kwargs: Mapping[str, Any] | None = None,
    prompt: str = EVALUATION_PROMPT,
    max_result_chars: int = DFLT_MAX_RESULT_CHARS,
    **prompt_function_kwargs: Any,
) -> Evaluator:
    """Build an LLM-backed raglab :class:`~raglab.agent.Evaluator` (the back-edge).

    Relevance is ir's: each round the accumulated results are ranked, passed
    through :func:`ir.select`, and the committed subset becomes
    ``Judgement.relevant``. Sufficiency is the judge's: it is shown the
    committed subset (or an abstention marker when the subset is empty) and
    decides whether the goal is satisfied. When it is not, the judge's improved
    query becomes a ``refinement`` :class:`~raglab.agent.SubTask` over the same
    sources — the **back-edge**.

    Args:
        judge: an injectable callable invoked as ``judge(goal=..., results=...)``
            returning ``(sufficient, refinement)`` (a test double, or your own
            router); a raw text reply is also accepted and parsed. When omitted,
            the default judge is built on :mod:`oa` the first time the evaluator
            runs and reused for every later round.
        select_strategy: ir selection strategy for the relevance decision
            (default ``"conservative"``).
        select_kwargs: extra args forwarded to :func:`ir.select`
            (e.g. ``max_k``, ``rel``, ``min_score``).
        prompt: prompt template for the default judge; unused when ``judge`` is
            given.
        max_result_chars: per-result text truncation in the judge prompt.
        **prompt_function_kwargs: forwarded to the default judge's builder;
            unused when ``judge`` is given.

    Returns:
        A ``(task, results) -> Judgement`` callable.

    Safe fallback: any judge error (including failure to build the default
    judge) returns ``refinement=None`` — the control loop's break condition —
    with ``sufficient`` taken from ir's own selection signal, so a failing judge
    can never fabricate an endless loop. An *insufficient* verdict that carries
    no refinement query is likewise treated as a stop and reported as
    ``sufficient=True``: there is nothing better to try.
    """
    sel_kw = dict(select_kwargs or {})
    # One-slot cache for the default judge: built lazily (so constructing the
    # evaluator never imports ``oa``) but only once, instead of on every round.
    # A failed build leaves the slot empty, so the next round simply retries.
    default_judge: list[Judge] = []

    def _resolve_judge() -> Judge:
        if judge is not None:
            return judge
        if not default_judge:
            default_judge.append(_default_llm_judge(prompt, **prompt_function_kwargs))
        return default_judge[0]

    def _ask_judge(goal: str, rendered: str) -> tuple[bool, str | None]:
        return _normalize_verdict(_resolve_judge()(goal=goal, results=rendered))

    def evaluator(task: SubTask, results: Sequence[Result]) -> Judgement:
        # ``ir.select`` documents a best-first precondition; the loop accumulates
        # hits across rounds/sources in arbitrary order, so rank them first (the
        # same ordering the final reranker uses).
        ranked = score_reranker(results)
        selection = ir.select(list(ranked), strategy=select_strategy, **sel_kw)
        relevant = list(selection.selected)
        rendered = (
            _render_results(relevant, max_result_chars)
            if relevant
            else "(none — the selector abstained)"
        )
        try:
            sufficient, refinement = _ask_judge(task.goal, rendered)
        except Exception:
            # Trust ir's model-free signal for the report, but never re-query on a
            # judge failure: refinement=None is the loop's break condition.
            return Judgement(
                relevant=relevant, sufficient=selection.sufficient, refinement=None
            )
        if sufficient or not refinement:
            # Either satisfied, or insufficient with nothing better to try: stop.
            return Judgement(relevant=relevant, sufficient=True, refinement=None)
        return Judgement(
            relevant=relevant,
            sufficient=False,
            refinement=SubTask(goal=refinement, sources=task.sources),
        )

    return evaluator
218
+
219
+
220
+ def _render_results(results: Sequence[Result], max_chars: int) -> str:
221
+ """Render committed hits as ``- id: text`` lines for the judge prompt."""
222
+ return "\n".join(f"- {h.artifact_id}: {str(h.text)[:max_chars]}" for h in results)
223
+
224
+
225
+ def _normalize_verdict(out: "tuple[bool, str | None] | str") -> tuple[bool, str | None]:
226
+ """Coerce a judge reply (a ``(sufficient, refinement)`` tuple, or raw text)."""
227
+ if isinstance(out, tuple):
228
+ sufficient, refinement = out
229
+ refinement = str(refinement).strip() if refinement else None
230
+ return bool(sufficient), (refinement or None)
231
+ return _parse_verdict(str(out or ""))
232
+
233
+
234
+ def _parse_verdict(text: str) -> tuple[bool, str | None]:
235
+ """Parse a SUFFICIENT / INSUFFICIENT + refinement-query text reply."""
236
+ lines = [line.strip() for line in text.splitlines() if line.strip()]
237
+ if not lines or lines[0].upper().startswith("SUFFICIENT"):
238
+ return True, None
239
+ refinement = lines[1] if len(lines) > 1 else None
240
+ return False, (refinement or None)
241
+
242
+
243
def _default_llm_judge(prompt: str, **prompt_function_kwargs: Any) -> Judge:
    """Create the default sufficiency judge from a prompt template.

    :mod:`oa` is imported here, at call time, so importing this module does not
    require it. The prompt is turned into a callable via
    ``oa.prompt_function`` with :func:`_parse_verdict` as its ``egress``; any
    extra keyword arguments are forwarded to that builder.

    Returns:
        A keyword-only ``judge(goal=..., results=...)`` callable that delegates
        to the built prompt function.
    """
    import oa

    ask = oa.prompt_function(
        prompt,
        egress=_parse_verdict,
        name="evaluate_sufficiency",
        **prompt_function_kwargs,
    )

    def judge(*, goal: str, results: str) -> tuple[bool, str | None]:
        verdict = ask(goal=goal, results=results)
        return verdict

    return judge
@@ -66,6 +66,28 @@ def test_cross_source_merge_reranks_by_score():
66
66
  assert [r.artifact_id for r in results] == ["b", "a"] # by score desc
67
67
 
68
68
 
69
+ def test_final_results_have_no_duplicate_artifacts():
70
+ # An artifact re-retrieved across back-edge rounds collapses to its best score.
71
+ retr = _fake_retriever(_hits(("a", 0.9), ("b", 0.4)))
72
+ rounds = {"n": 0}
73
+
74
+ def refining(task, results):
75
+ rounds["n"] += 1
76
+ if rounds["n"] < 2:
77
+ return Judgement(
78
+ list(results),
79
+ sufficient=False,
80
+ refinement=SubTask(task.goal, task.sources),
81
+ )
82
+ return Judgement(list(results), sufficient=True)
83
+
84
+ ids = [
85
+ r.artifact_id for r in make_search_agent({"s": retr}, evaluator=refining)("q")
86
+ ]
87
+ assert ids == ["a", "b"] # best score per artifact, descending
88
+ assert len(ids) == len(set(ids)) # the re-query did not duplicate artifacts
89
+
90
+
69
91
  def test_passthrough_evaluator_does_not_loop():
70
92
  retr = _fake_retriever(_hits(("a", 0.9)))
71
93
  make_search_agent({"s": retr})("q")
@@ -133,6 +155,26 @@ def test_budget_bounds_a_never_sufficient_loop():
133
155
  assert len(retr.calls) == 3 # exactly max_rounds — the safety net holds
134
156
 
135
157
 
158
+ def test_budget_caps_results_per_task_seen_by_evaluator():
159
+ retr = _fake_retriever(_hits(*[(f"a{i}", 1.0 - i / 100) for i in range(20)]))
160
+ seen = {}
161
+
162
+ def evaluator(task, results):
163
+ seen["n"] = len(results)
164
+ return Judgement(list(results), sufficient=True)
165
+
166
+ make_search_agent(
167
+ {"s": retr}, evaluator=evaluator, budget=Budget(max_results_per_task=5)
168
+ )("q")
169
+ assert seen["n"] == 5 # the evaluator sees at most max_results_per_task
170
+
171
+
172
+ def test_budget_caps_sources_per_task():
173
+ retrs = {f"s{i}": _fake_retriever(_hits((f"a{i}", 0.5))) for i in range(6)}
174
+ make_search_agent(retrs, budget=Budget(max_sources_per_task=2))("q")
175
+ assert sum(1 for r in retrs.values() if r.calls) == 2 # only first 2 sources hit
176
+
177
+
136
178
  # ----- end-to-end over a REAL ir corpus (hermetic: light embedder) ---------- #
137
179
 
138
180
 
@@ -0,0 +1,250 @@
1
+ """Tests for raglab's LLM-backed roles (the Formulator and Evaluator).
2
+
3
+ Hermetic and deterministic: every "LLM" is an injected test double (no model, no
4
+ network). The Formulator adapts an ir-style ``str -> [str]`` rewriter; the
5
+ Evaluator delegates relevance to ``ir.select`` and sufficiency to the injected
6
+ judge. One end-to-end demo wires a real ``ir`` corpus (the light, numpy-only
7
+ embedder, in-memory store) and shows the **back-edge** recovering a gold document
8
+ that single-shot retrieval misses.
9
+ """
10
+
11
+ import ir
12
+ from ir import SearchHit
13
+ from ir.store import CorpusStore
14
+
15
+ from raglab import (
16
+ Budget,
17
+ LowLevelQuery,
18
+ SubTask,
19
+ make_llm_evaluator,
20
+ make_llm_formulator,
21
+ make_search_agent,
22
+ )
23
+
24
+
25
+ def _hits(*specs):
26
+ """``(artifact_id, score)`` pairs -> ir.SearchHits (no corpus needed)."""
27
+ return [SearchHit(aid, "k", score, f"text {aid}", {}) for aid, score in specs]
28
+
29
+
30
+ def _fake_retriever(hits):
31
+ """A Retriever that records its calls and returns canned hits."""
32
+ calls = []
33
+
34
+ def retrieve(query, **kw):
35
+ calls.append((query, kw))
36
+ return list(hits)
37
+
38
+ retrieve.calls = calls
39
+ return retrieve
40
+
41
+
42
+ # ----- LLM Formulator: adapt str -> [str] to (SubTask, source) -> [LLQ] ------ #
43
+
44
+
45
+ def test_llm_formulator_fans_out_one_llq_per_query():
46
+ formulator = make_llm_formulator(formulate=lambda q: [q, q + " alt"])
47
+ llqs = formulator(SubTask(goal="deploy", sources=("s",)), "s")
48
+ assert [q.query for q in llqs] == ["deploy", "deploy alt"]
49
+ assert all(isinstance(q, LowLevelQuery) and q.source == "s" for q in llqs)
50
+
51
+
52
+ def test_llm_formulator_accepts_a_bare_string():
53
+ formulator = make_llm_formulator(formulate=lambda q: q + "!")
54
+ llqs = formulator(SubTask(goal="x", sources=("s",)), "s")
55
+ assert [q.query for q in llqs] == ["x!"]
56
+
57
+
58
+ def test_llm_formulator_attaches_params_to_every_query():
59
+ formulator = make_llm_formulator(
60
+ formulate=lambda q: [q, q + " b"], params={"mode": "hybrid", "k": 5}
61
+ )
62
+ llqs = formulator(SubTask(goal="g", sources=("s",)), "s")
63
+ assert all(q.params == {"mode": "hybrid", "k": 5} for q in llqs)
64
+
65
+
66
+ def test_llm_formulator_empty_output_falls_back_to_the_goal():
67
+ # A formulator must never make retrieval worse than the raw sub-goal.
68
+ formulator = make_llm_formulator(formulate=lambda q: [])
69
+ llqs = formulator(SubTask(goal="the goal", sources=("s",)), "s")
70
+ assert [q.query for q in llqs] == ["the goal"]
71
+
72
+
73
+ def test_llm_formulator_swallows_a_raising_callable():
74
+ # A failing custom formulator must fall back to the goal, never propagate.
75
+ def boom(_q):
76
+ raise RuntimeError("rewriter down")
77
+
78
+ formulator = make_llm_formulator(formulate=boom)
79
+ llqs = formulator(SubTask(goal="the goal", sources=("s",)), "s")
80
+ assert [q.query for q in llqs] == ["the goal"]
81
+
82
+
83
+ def test_llm_formulator_drives_the_agent_loop():
84
+ retr = _fake_retriever(_hits(("a", 0.9)))
85
+ formulator = make_llm_formulator(formulate=lambda q: [q, q + " expanded"])
86
+ make_search_agent({"s": retr}, formulator=formulator)("q")
87
+ assert [c[0] for c in retr.calls] == ["q", "q expanded"]
88
+
89
+
90
+ # ----- LLM Evaluator: ir.select owns relevance, the LLM owns sufficiency ----- #
91
+
92
+
93
+ def test_evaluator_relevance_comes_from_ir_select():
94
+ # Conservative selection keeps only the near-top hit; "b" is a distractor.
95
+ evaluator = make_llm_evaluator(judge=lambda **kw: (True, None))
96
+ judged = evaluator(SubTask("g", ("s",)), _hits(("a", 0.9), ("b", 0.1)))
97
+ assert [h.artifact_id for h in judged.relevant] == ["a"]
98
+ assert judged.sufficient and judged.refinement is None
99
+
100
+
101
+ def test_evaluator_emits_a_refinement_when_insufficient():
102
+ evaluator = make_llm_evaluator(judge=lambda **kw: (False, "better query"))
103
+ judged = evaluator(SubTask("g", ("s1", "s2")), _hits(("a", 0.9)))
104
+ assert judged.sufficient is False
105
+ assert judged.refinement == SubTask(goal="better query", sources=("s1", "s2"))
106
+
107
+
108
+ def test_evaluator_insufficient_without_a_query_stops_the_loop():
109
+ # Insufficient but no refinement query -> nothing better to try -> stop.
110
+ evaluator = make_llm_evaluator(judge=lambda **kw: (False, None))
111
+ judged = evaluator(SubTask("g", ("s",)), _hits(("a", 0.9)))
112
+ assert judged.sufficient is True and judged.refinement is None
113
+
114
+
115
+ def test_evaluator_parses_a_raw_text_reply():
116
+ evaluator = make_llm_evaluator(
117
+ judge=lambda **kw: "INSUFFICIENT\nvector database filtering"
118
+ )
119
+ judged = evaluator(SubTask("g", ("s",)), _hits(("a", 0.9)))
120
+ assert judged.refinement.goal == "vector database filtering"
121
+
122
+
123
+ def test_evaluator_judge_error_falls_back_to_signal_no_loop():
124
+ def boom(**kw):
125
+ raise RuntimeError("model down")
126
+
127
+ evaluator = make_llm_evaluator(judge=boom)
128
+ judged = evaluator(SubTask("g", ("s",)), _hits(("a", 0.9)))
129
+ # refinement=None is the loop's break condition: a judge error never spins.
130
+ assert judged.refinement is None
131
+ assert judged.sufficient is True # ir.select committed to "a" -> sufficient
132
+
133
+
134
+ def test_evaluator_renders_abstention_to_the_judge():
135
+ seen = {}
136
+
137
+ def judge(*, goal, results):
138
+ seen["results"] = results
139
+ return (True, None)
140
+
141
+ evaluator = make_llm_evaluator(judge=judge)
142
+ evaluator(SubTask("g", ("s",)), []) # no results -> ir.select abstains
143
+ assert "abstained" in seen["results"]
144
+
145
+
146
+ def test_evaluator_forwards_select_kwargs_to_ir_select():
147
+ # Two near-tied hits: conservative keeps only "a" by default, but a loose
148
+ # rel threshold (forwarded via select_kwargs) admits "b" too.
149
+ hits = _hits(("a", 0.9), ("b", 0.6))
150
+ strict = make_llm_evaluator(judge=lambda **kw: (True, None))
151
+ loose = make_llm_evaluator(
152
+ judge=lambda **kw: (True, None), select_kwargs={"rel": 0.5}
153
+ )
154
+ assert [h.artifact_id for h in strict(SubTask("g", ("s",)), hits).relevant] == ["a"]
155
+ assert [h.artifact_id for h in loose(SubTask("g", ("s",)), hits).relevant] == [
156
+ "a",
157
+ "b",
158
+ ]
159
+
160
+
161
+ def test_evaluator_ranks_heterogeneous_results_before_selecting():
162
+ # Accumulated cross-source results arrive unordered; the evaluator must rank
163
+ # best-first before ir.select (which trusts input order).
164
+ captured = {}
165
+
166
+ def judge(*, goal, results):
167
+ captured["results"] = results
168
+ return (True, None)
169
+
170
+ evaluator = make_llm_evaluator(judge=judge, select_kwargs={"rel": 0.0})
171
+ unordered = _hits(("lo", 0.1), ("hi", 0.9), ("mid", 0.5))
172
+ judged = evaluator(SubTask("g", ("s",)), unordered)
173
+ assert [h.artifact_id for h in judged.relevant] == ["hi", "mid", "lo"]
174
+
175
+
176
+ # ----- the back-edge end-to-end, wired through the agent loop ---------------- #
177
+
178
+
179
+ def test_evaluator_back_edge_loops_until_sufficient():
180
+ retr = _fake_retriever(_hits(("a", 0.9)))
181
+ rounds = {"n": 0}
182
+
183
+ def judge(*, goal, results):
184
+ rounds["n"] += 1
185
+ if rounds["n"] < 2:
186
+ return (False, goal + " more")
187
+ return (True, None)
188
+
189
+ make_search_agent({"s": retr}, evaluator=make_llm_evaluator(judge=judge))("q")
190
+ assert rounds["n"] == 2 # looped once via the back-edge
191
+ assert [c[0] for c in retr.calls] == ["q", "q more"] # refinement re-queried
192
+
193
+
194
+ def test_evaluator_back_edge_is_bounded_by_budget():
195
+ retr = _fake_retriever(_hits(("a", 0.9)))
196
+ evaluator = make_llm_evaluator(judge=lambda **kw: (False, "again"))
197
+ make_search_agent({"s": retr}, evaluator=evaluator, budget=Budget(max_rounds=3))(
198
+ "q"
199
+ )
200
+ assert len(retr.calls) == 3 # the safety net holds even if never sufficient
201
+
202
+
203
+ # ----- end-to-end over a REAL ir corpus (hermetic: light embedder) ---------- #
204
+
205
+
206
+ def _light_corpus():
207
+ docs = {
208
+ "embed": "embed and cache model vectors",
209
+ "systemd": "configure systemd units and restart services",
210
+ "filtering": "narrow similarity search using metadata filters",
211
+ }
212
+ return ir.build(
213
+ ir.CorpusSource.from_mapping(docs, name="t", strategy=ir.WholeText()),
214
+ store=CorpusStore.memory(),
215
+ embedder="light",
216
+ )
217
+
218
+
219
+ def test_back_edge_recovers_a_doc_single_shot_misses():
220
+ """A query that overlaps a distractor misses the gold; the refinement recovers it.
221
+
222
+ Deterministic with the light embedder: the round-1 query shares vocabulary
223
+ with ``embed`` (a positive-score distractor) but none with the gold
224
+ ``filtering``, so single-shot ranks ``embed`` first; the injected judge
225
+ declares it insufficient and reformulates to the gold doc's own vocabulary, so
226
+ round 2 retrieves ``filtering`` to the top via the back-edge.
227
+ """
228
+ corpus = _light_corpus()
229
+ sources = {"t": ir.as_retriever(corpus, k=3)}
230
+ vague = "cache model results" # overlaps the `embed` distractor, not the gold
231
+ gold_query = "narrow similarity search using metadata filters"
232
+
233
+ # Baseline: single-shot (no LLM evaluator) surfaces the distractor, not the gold.
234
+ baseline = make_search_agent(sources)(vague)
235
+ assert baseline[0].artifact_id == "embed"
236
+
237
+ # With the back-edge: reformulate to the gold's vocabulary, then it wins.
238
+ rounds = {"n": 0}
239
+
240
+ def judge(*, goal, results):
241
+ rounds["n"] += 1
242
+ if rounds["n"] < 2:
243
+ return (False, gold_query)
244
+ return (True, None)
245
+
246
+ agent = make_search_agent(sources, evaluator=make_llm_evaluator(judge=judge))
247
+ results = agent(vague)
248
+ assert rounds["n"] == 2 # the back-edge fired
249
+ assert results[0].artifact_id == "filtering" # gold recovered
250
+ assert isinstance(results[0], SearchHit)
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes