ragbits-evaluate 0.5.0__py3-none-any.whl → 1.4.0.dev202602030301__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ragbits/evaluate/agent_simulation/__init__.py +87 -0
- ragbits/evaluate/agent_simulation/context.py +118 -0
- ragbits/evaluate/agent_simulation/conversation.py +333 -0
- ragbits/evaluate/agent_simulation/deepeval_evaluator.py +92 -0
- ragbits/evaluate/agent_simulation/logger.py +165 -0
- ragbits/evaluate/agent_simulation/metrics/__init__.py +19 -0
- ragbits/evaluate/agent_simulation/metrics/builtin.py +221 -0
- ragbits/evaluate/agent_simulation/metrics/collectors.py +142 -0
- ragbits/evaluate/agent_simulation/models.py +37 -0
- ragbits/evaluate/agent_simulation/results.py +200 -0
- ragbits/evaluate/agent_simulation/scenarios.py +129 -0
- ragbits/evaluate/agent_simulation/simulation.py +243 -0
- ragbits/evaluate/cli.py +150 -0
- ragbits/evaluate/config.py +11 -0
- ragbits/evaluate/dataloaders/__init__.py +3 -0
- ragbits/evaluate/dataloaders/base.py +95 -0
- ragbits/evaluate/dataloaders/document_search.py +61 -0
- ragbits/evaluate/dataloaders/exceptions.py +25 -0
- ragbits/evaluate/dataloaders/gaia.py +78 -0
- ragbits/evaluate/dataloaders/hotpot_qa.py +95 -0
- ragbits/evaluate/dataloaders/human_eval.py +70 -0
- ragbits/evaluate/dataloaders/question_answer.py +56 -0
- ragbits/evaluate/dataset_generator/pipeline.py +4 -4
- ragbits/evaluate/dataset_generator/prompts/qa.py +2 -4
- ragbits/evaluate/dataset_generator/tasks/corpus_generation.py +2 -4
- ragbits/evaluate/dataset_generator/tasks/text_generation/base.py +3 -5
- ragbits/evaluate/dataset_generator/tasks/text_generation/qa.py +3 -3
- ragbits/evaluate/evaluator.py +178 -50
- ragbits/evaluate/factories/__init__.py +42 -0
- ragbits/evaluate/metrics/__init__.py +2 -23
- ragbits/evaluate/metrics/base.py +40 -17
- ragbits/evaluate/metrics/document_search.py +40 -23
- ragbits/evaluate/metrics/gaia.py +84 -0
- ragbits/evaluate/metrics/hotpot_qa.py +51 -0
- ragbits/evaluate/metrics/human_eval.py +105 -0
- ragbits/evaluate/metrics/question_answer.py +222 -0
- ragbits/evaluate/optimizer.py +138 -86
- ragbits/evaluate/pipelines/__init__.py +37 -0
- ragbits/evaluate/pipelines/base.py +34 -10
- ragbits/evaluate/pipelines/document_search.py +72 -67
- ragbits/evaluate/pipelines/gaia.py +249 -0
- ragbits/evaluate/pipelines/hotpot_qa.py +342 -0
- ragbits/evaluate/pipelines/human_eval.py +323 -0
- ragbits/evaluate/pipelines/question_answer.py +96 -0
- ragbits/evaluate/utils.py +86 -59
- {ragbits_evaluate-0.5.0.dist-info → ragbits_evaluate-1.4.0.dev202602030301.dist-info}/METADATA +33 -9
- ragbits_evaluate-1.4.0.dev202602030301.dist-info/RECORD +59 -0
- {ragbits_evaluate-0.5.0.dist-info → ragbits_evaluate-1.4.0.dev202602030301.dist-info}/WHEEL +1 -1
- ragbits/evaluate/callbacks/base.py +0 -22
- ragbits/evaluate/callbacks/neptune.py +0 -26
- ragbits/evaluate/loaders/__init__.py +0 -21
- ragbits/evaluate/loaders/base.py +0 -24
- ragbits/evaluate/loaders/hf.py +0 -25
- ragbits_evaluate-0.5.0.dist-info/RECORD +0 -33
- /ragbits/evaluate/{callbacks/__init__.py → py.typed} +0 -0
ragbits/evaluate/pipelines/human_eval.py
ADDED
@@ -0,0 +1,323 @@
+import asyncio
+import contextlib
+import io
+import json
+import logging
+import multiprocessing
+import textwrap
+import time
+from collections.abc import Callable, Coroutine, Iterable
+from dataclasses import dataclass
+from multiprocessing.connection import Connection
+from pathlib import Path
+from typing import Any
+
+from typing_extensions import Self
+
+from ragbits.agents import Agent
+from ragbits.core.llms.base import LLM, LLMClientOptionsT
+from ragbits.evaluate.pipelines.base import EvaluationData, EvaluationPipeline, EvaluationResult
+
+
+class HumanEvalData(EvaluationData):
+    """
+    Represents a single HumanEval task.
+    """
+
+    task_id: str
+    prompt: str
+    entry_point: str
+    test: str
+    canonical_solution: str | None = None
+
+
+@dataclass
+class HumanEvalResult(EvaluationResult):
+    """
+    Represents the result of evaluating a single HumanEval task.
+    """
+
+    task_id: str
+    entry_point: str
+    samples: list[str]
+    passed_mask: list[bool]
+    exec_durations_sec: list[float]
+    compile_ok_mask: list[bool]
+    errors: list[str | None]
+
+
+def _execute_in_subprocess(
+    source: str, entry_point: str, test_code: str, timeout_sec: int = 10, memory_limit_mb: int | None = 512
+) -> tuple[bool, float, str | None]:
+    """Run candidate against HumanEval test in a subprocess with timeout."""
+
+    def _runner(pipe: Connection) -> None:
+        captured_out = io.StringIO()
+        start = time.perf_counter()
+
+        try:
+            with contextlib.redirect_stdout(captured_out), contextlib.redirect_stderr(captured_out):
+                # Apply soft resource limits -> NOT A SANDBOX
+                with contextlib.suppress(Exception):
+                    import os  # type: ignore
+                    import resource  # type: ignore
+                    import tempfile  # type: ignore
+
+                    cpu_secs = max(1, timeout_sec)
+                    resource.setrlimit(resource.RLIMIT_CPU, (cpu_secs, cpu_secs))
+
+                    if memory_limit_mb is not None:
+                        mem_bytes = int(memory_limit_mb) * 1024 * 1024
+                        resource.setrlimit(resource.RLIMIT_AS, (mem_bytes, mem_bytes))
+
+                    # Minimal extra security
+                    for rlim, val in (
+                        (getattr(resource, "RLIMIT_NOFILE", None), 256),
+                        (getattr(resource, "RLIMIT_NPROC", None), 64),
+                        (getattr(resource, "RLIMIT_FSIZE", None), 10 * 1024 * 1024),
+                    ):
+                        if rlim is not None:
+                            with contextlib.suppress(Exception):
+                                resource.setrlimit(rlim, (val, val))
+
+                    # Temporary working directory for solution
+                    tmp = tempfile.TemporaryDirectory()
+                    with contextlib.suppress(Exception):
+                        os.chdir(tmp.name)
+
+                globals_dict: dict[str, Any] = {"__name__": "__main__"}
+                exec(compile(source, filename="candidate.py", mode="exec"), globals_dict)
+
+                if entry_point not in globals_dict:
+                    raise NameError(f"Entry point '{entry_point}' not defined")
+
+                harness = textwrap.dedent(f"candidate = {entry_point}\n").lstrip()
+                test_code_clean = textwrap.dedent(test_code).lstrip()
+                compiled_test = compile(
+                    harness + "\n" + test_code_clean + "\ncheck(candidate)", filename="test.py", mode="exec"
+                )
+                exec(compiled_test, globals_dict)
+
+            duration = time.perf_counter() - start
+            pipe.send((True, duration, None))
+
+        except Exception as e:
+            duration = time.perf_counter() - start
+            pipe.send((False, duration, f"{e.__class__.__name__}: {e}"))
+
+    parent_conn, child_conn = multiprocessing.Pipe()
+    proc = multiprocessing.Process(target=_runner, args=(child_conn,))
+    proc.start()
+    proc.join(timeout=timeout_sec)
+
+    if proc.is_alive():
+        proc.terminate()
+        proc.join()
+        return False, float(timeout_sec), "TimeoutError: execution exceeded time limit"
+
+    passed, duration, err = parent_conn.recv()
+    return bool(passed), float(duration), (str(err) if err is not None else None)
+
+
+class HumanEvalPipeline(
+    EvaluationPipeline[Agent[LLMClientOptionsT, None, str] | LLM[LLMClientOptionsT], HumanEvalData, HumanEvalResult]
+):
+    """HumanEval evaluation pipeline for code generation models/agents."""
+
+    def __init__(
+        self,
+        evaluation_target: Agent[LLMClientOptionsT, None, str] | LLM[LLMClientOptionsT],
+        *,
+        n_samples: int = 1,
+        timeout_sec: int = 10,
+        memory_limit_mb: int | None = 512,
+        per_example_log_file: Path | None = None,
+        extended_logs: bool = False,
+        code_sanitize_fn: Callable[[str], str] | None = None,
+    ) -> None:
+        super().__init__(evaluation_target=evaluation_target)
+        self.n_samples = n_samples
+        self.timeout_sec = timeout_sec
+        self.memory_limit_mb = memory_limit_mb
+        self.per_example_log_file = per_example_log_file
+        self.extended_logs = extended_logs
+        self.code_sanitize_fn = code_sanitize_fn
+        self._init_log_file()
+
+    @classmethod
+    def from_config(cls, config: dict) -> Self:
+        """Create pipeline from config.
+        Attempts Agent first, falls back to raw LLM construction.
+        """
+        if "evaluation_target" not in config:
+            try:
+                config["evaluation_target"] = Agent.from_config(config)
+            except Exception:
+                config["evaluation_target"] = LLM.from_config(config)
+        return super().from_config(config)
+
+    def _process_generation(
+        self, raw: BaseException | tuple[str, dict | None] | str, debug_traces: list[dict | None] | None
+    ) -> tuple[str, dict | None]:
+        """Process a single generation result."""
+        if isinstance(raw, BaseException):
+            err_msg = f"GenerationError: {raw.__class__.__name__}: {raw}"
+            if self.extended_logs and debug_traces is not None:
+                debug_traces.append({"error": err_msg})
+            raise raw
+
+        if self.extended_logs and isinstance(raw, tuple):
+            content, dbg = raw
+            code = self._sanitize(content)
+            if debug_traces is not None:
+                debug_traces.append(dbg)
+            return code, dbg
+
+        if isinstance(raw, str):
+            code = self._sanitize(raw)
+            return code, None
+
+        raise TypeError(f"Unexpected type for raw: {type(raw)}")
+
+    def _evaluate_code_sample(self, code: str, row: HumanEvalData) -> tuple[bool, bool, float, str | None]:
+        """Evaluate a single code sample."""
+        # Compile check
+        try:
+            compile(code, filename="candidate.py", mode="exec")
+            compile_ok = True
+        except Exception as e:
+            return False, False, 0.0, f"SyntaxError: {e}"
+
+        ok, dur, err = _execute_in_subprocess(
+            code,
+            row.entry_point,
+            row.test,
+            timeout_sec=self.timeout_sec,
+            memory_limit_mb=self.memory_limit_mb,
+        )
+        return compile_ok, ok, dur, err
+
+    async def __call__(self, data: Iterable[HumanEvalData]) -> Iterable[HumanEvalResult]:
+        """Generate code completions per task and evaluate them.
+        Returns list of `HumanEvalResult`, one per input task.
+        """
+        results: list[HumanEvalResult] = []
+
+        for row in data:
+            prompt_input = row.prompt
+            samples: list[str] = []
+            compile_ok: list[bool] = []
+            pass_mask: list[bool] = []
+            durations: list[float] = []
+            errors: list[str | None] = []
+
+            # Produce n samples
+            gen_tasks: list[Coroutine[Any, Any, tuple[str, dict | None] | str]] = []
+            for _ in range(self.n_samples):
+                if self.extended_logs:
+                    gen_tasks.append(self._generate_with_debug(prompt_input))
+                else:
+                    gen_tasks.append(self._generate_code(prompt_input))
+            generations = await asyncio.gather(*gen_tasks, return_exceptions=True)
+
+            debug_traces: list[dict | None] | None = [] if self.extended_logs else None
+
+            for raw in generations:
+                try:
+                    code, _ = self._process_generation(raw, debug_traces)
+                    samples.append(code)
+                except BaseException as e:
+                    samples.append("")
+                    compile_ok.append(False)
+                    pass_mask.append(False)
+                    durations.append(0.0)
+                    err_msg = f"GenerationError: {e.__class__.__name__}: {e}"
+                    errors.append(err_msg)
+                    continue
+
+                compile_result, passed, duration, error = self._evaluate_code_sample(code, row)
+                compile_ok.append(compile_result)
+                pass_mask.append(passed)
+                durations.append(duration)
+                errors.append(error)
+
+            result = HumanEvalResult(
+                task_id=row.task_id,
+                entry_point=row.entry_point,
+                samples=samples,
+                passed_mask=pass_mask,
+                exec_durations_sec=durations,
+                compile_ok_mask=compile_ok,
+                errors=errors,
+            )
+            results.append(result)
+            ext_log_str = (
+                json.dumps(debug_traces, ensure_ascii=False, default=str)
+                if (self.extended_logs and debug_traces is not None)
+                else None
+            )
+            self._log_example(row, result, ext_log_str)
+        return results
+
+    def _sanitize(self, text: str) -> str:
+        """Optionally sanitize code from text using the provided function.
+        If no parser is provided, returns the original text.
+        """
+        if self.code_sanitize_fn is None:
+            return text
+        try:
+            return self.code_sanitize_fn(text)
+        except Exception as exc:
+            logging.getLogger(__name__).debug("Code sanitize error: %s", exc)
+            return text
+
+    def _init_log_file(self) -> None:
+        """Ensure the per-example log file exists if logging is enabled."""
+        if self.per_example_log_file is None:
+            return
+        self.per_example_log_file.parent.mkdir(parents=True, exist_ok=True)
+        with open(self.per_example_log_file, "w", encoding="utf-8") as _:
+            pass
+
+    def _log_example(self, row: HumanEvalData, result: HumanEvalResult, extended_log: str | None = None) -> None:
+        """Append a single NDJSON record for debugging if enabled."""
+        if self.per_example_log_file is None:
+            return
+        record: dict[str, object] = {
+            "task_id": row.task_id,
+            "entry_point": row.entry_point,
+            "n_samples": len(result.samples),
+            "samples": result.samples,
+            "compile_ok_mask": result.compile_ok_mask,
+            "passed_mask": result.passed_mask,
+            "exec_durations_sec": result.exec_durations_sec,
+            "errors": result.errors,
+        }
+        record["extended_debug_logging"] = extended_log or "[]"
+        with open(self.per_example_log_file, "a", encoding="utf-8") as f:
+            f.write(json.dumps(record, ensure_ascii=False) + "\n")
+
+    async def _generate_code(self, prompt: str) -> str:
+        """Generate final answer code from Agent or raw LLM."""
+        target = self.evaluation_target
+        if isinstance(target, Agent):
+            res = await target.run(prompt)
+            return str(res.content)
+
+        resp = await target.generate(prompt)
+        return str(resp)
+
+    async def _generate_with_debug(self, prompt: str) -> tuple[str, dict | None]:
+        """Generate code and capture tool/history/usage for logging (as raw content)."""
+        target = self.evaluation_target
+        if isinstance(target, Agent):
+            res = await target.run(prompt)
+            dbg = {
+                "history": res.history,
+                "tool_calls": res.tool_calls,
+                "usage": res.usage,
+                "metadata": res.metadata,
+            }
+            return str(res.content), dbg
+        resp = await target.generate(prompt)
+        return str(resp), None
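
For orientation, here is a minimal usage sketch of the `HumanEvalPipeline` added above. It is not part of the diff: the `LiteLLM` import path, constructor, and model name are assumed placeholders from ragbits-core, and the task is hand-written rather than loaded through the new HumanEval dataloader.

```python
# Hypothetical sketch: running the new HumanEvalPipeline against a single task.
# The LiteLLM wrapper and model name are assumptions; any Agent or LLM instance works.
import asyncio

from ragbits.core.llms.litellm import LiteLLM  # assumed ragbits-core import path
from ragbits.evaluate.pipelines.human_eval import HumanEvalData, HumanEvalPipeline


async def main() -> None:
    pipeline = HumanEvalPipeline(
        evaluation_target=LiteLLM(model_name="gpt-4o-mini"),  # placeholder model
        n_samples=2,     # two completions per task (pass@k style)
        timeout_sec=10,  # per-sample subprocess time limit
    )
    task = HumanEvalData(
        task_id="Demo/0",
        prompt='def add(a: int, b: int) -> int:\n    """Return the sum of a and b."""\n',
        entry_point="add",
        test="def check(candidate):\n    assert candidate(1, 2) == 3\n",
    )
    for result in await pipeline([task]):
        print(result.task_id, result.passed_mask, result.errors)


if __name__ == "__main__":  # the pipeline spawns subprocesses, so keep a main guard
    asyncio.run(main())
```

In practice a `code_sanitize_fn` that strips markdown fences from model output is usually needed before the compile check, since raw completions are compiled as-is.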
ragbits/evaluate/pipelines/question_answer.py
ADDED
@@ -0,0 +1,96 @@
+import asyncio
+from collections.abc import Iterable
+from dataclasses import dataclass
+from typing import Any, Generic
+
+from typing_extensions import Self
+
+from ragbits.agents._main import AgentResult
+from ragbits.agents.types import (
+    QuestionAnswerAgent,
+    QuestionAnswerPromptInput,
+    QuestionAnswerPromptOutputT,
+)
+from ragbits.core.llms.base import LLMClientOptionsT
+from ragbits.evaluate.pipelines.base import EvaluationData, EvaluationPipeline, EvaluationResult
+
+
+class QuestionAnswerData(EvaluationData):
+    """
+    Represents the evaluation data for question answer.
+    """
+
+    question: str
+    reference_answer: str
+    reference_context: Any | None = None
+
+
+@dataclass
+class QuestionAnswerResult(EvaluationResult, Generic[QuestionAnswerPromptOutputT]):
+    """
+    Represents the result of a single evaluation.
+    """
+
+    question: str
+    predicted_result: AgentResult[QuestionAnswerPromptOutputT]
+    reference_answer: str
+    reference_context: Any | None = None
+
+
+class QuestionAnswerPipeline(
+    EvaluationPipeline[
+        QuestionAnswerAgent[LLMClientOptionsT, QuestionAnswerPromptInput, QuestionAnswerPromptOutputT],
+        QuestionAnswerData,
+        QuestionAnswerResult,
+    ]
+):
+    """
+    Question answer evaluation pipeline.
+    """
+
+    @classmethod
+    def from_config(cls, config: dict) -> Self:
+        """
+        Create an instance of `QuestionAnswerPipeline` from a configuration dictionary.
+
+        Args:
+            config: A dictionary containing configuration settings for the pipeline.
+
+        Returns:
+            An instance of the pipeline class initialized with the provided configuration.
+        """
+        config["evaluation_target"] = QuestionAnswerAgent.from_config(config)
+        return super().from_config(config)
+
+    async def __call__(
+        self, data: Iterable[QuestionAnswerData]
+    ) -> Iterable[QuestionAnswerResult[QuestionAnswerPromptOutputT]]:
+        """
+        Run the question answer evaluation pipeline.
+
+        Args:
+            data: The evaluation data batch.
+
+        Returns:
+            The evaluation result batch.
+        """
+        results = await asyncio.gather(
+            *[
+                self.evaluation_target.run(
+                    QuestionAnswerPromptInput(
+                        question=row.question,
+                        context=row.reference_context,
+                    )
+                )
+                for row in data
+            ]
+        )
+        return [
+            QuestionAnswerResult(
+                question=row.question,
+                predicted_result=result,
+                reference_answer=row.reference_answer,
+                reference_context=row.reference_context,
+            )
+            for row, result in zip(data, results, strict=False)
+        ]
ragbits/evaluate/utils.py
CHANGED
@@ -1,5 +1,7 @@
 import json
 import sys
+import traceback
+from dataclasses import asdict
 from datetime import datetime
 from pathlib import Path
 from typing import Any
@@ -7,34 +9,20 @@ from typing import Any
 from datasets import Dataset
 from hydra.core.hydra_config import HydraConfig
 from neptune import Run
+from neptune.types import File
 from neptune.utils import stringify_unsupported
-from
+from neptune_optuna import NeptuneCallback
+from omegaconf import DictConfig

+from ragbits.evaluate.evaluator import EvaluatorResult

-def _save(file_path: Path, **data: Any) -> None:  # noqa: ANN401
-    """
-    Save the data to a file. Add the current timestamp and Python version to the data.
-
-    Args:
-        file_path: The path to the file.
-        data: The data to be saved.
-    """
-    current_time = datetime.now()
-
-    data["_timestamp"] = current_time.isoformat()
-    data["_python_version"] = sys.version
-    data["_interpreter_path"] = sys.executable

-
-        json.dump(data, file, indent=4)
-
-
-def log_to_file(results: dict[str, Any], output_dir: Path | None = None) -> Path:
+def log_evaluation_to_file(result: EvaluatorResult, output_dir: Path | None = None) -> Path:
     """
-    Log the evaluation
+    Log the evaluation result locally.

     Args:
-
+        result: The evaluation result.
         output_dir: The output directory.

     Returns:
@@ -43,13 +31,59 @@ def log_to_file(results: dict[str, Any], output_dir: Path | None = None) -> Path
     output_dir = output_dir or Path(HydraConfig.get().runtime.output_dir)
     metrics_file = output_dir / "metrics.json"
     results_file = output_dir / "results.json"
-
-
-
+    errors_file = output_dir / "errors.json"
+
+    _save_json(metrics_file, metrics=result.metrics, time_perf=asdict(result.time_perf))
+    _save_json(results_file, results=[asdict(entry) for entry in result.results])
+    _save_json(
+        errors_file,
+        errors=[
+            {
+                "type": exc.__class__.__name__,
+                "message": str(exc),
+                "stacktrace": "".join(traceback.format_exception(type(exc), exc, exc.__traceback__)),
+            }
+            for exc in result.errors
+        ],
+    )

     return output_dir


+def log_evaluation_to_neptune(result: EvaluatorResult, config: DictConfig, tags: str | list[str] | None = None) -> None:
+    """
+    Log the evaluation result to Neptune.
+
+    Args:
+        result: The evaluation result.
+        config: The evaluation configuration.
+        tags: The experiment tags.
+    """
+    run = Run(tags=tags)
+    run["config"] = stringify_unsupported(config)
+    run["evaluation/metrics"] = stringify_unsupported(result.metrics)
+    run["evaluation/time_perf"] = stringify_unsupported(asdict(result.time_perf))
+    run["evaluation/results"].upload(
+        File.from_content(json.dumps([asdict(entry) for entry in result.results], indent=4), extension="json")
+    )
+    run["evaluation/errors"].upload(
+        File.from_content(
+            json.dumps(
+                [
+                    {
+                        "type": exc.__class__.__name__,
+                        "message": str(exc),
+                        "stacktrace": "".join(traceback.format_exception(type(exc), exc, exc.__traceback__)),
+                    }
+                    for exc in result.errors
+                ],
+                indent=4,
+            ),
+            extension="json",
+        )
+    )
+
+
 def log_dataset_to_file(dataset: Dataset, output_dir: Path | None = None) -> Path:
     """
     Log the evaluation results locally.
@@ -68,7 +102,7 @@ def log_dataset_to_file(dataset: Dataset, output_dir: Path | None = None) -> Path


 def log_optimization_to_file(
-    results: list[tuple[
+    results: list[tuple[dict, float, dict[str, float]]], output_dir: Path | None = None
 ) -> Path:
     """
     Log the evaluation results locally.
@@ -81,53 +115,46 @@ def log_optimization_to_file(
         The output directory.
     """
     output_dir = output_dir or Path(HydraConfig.get().runtime.output_dir)
+
     scores = {}
-    for
-        trial_name = f"
-        OmegaConf.save(cfg, output_dir / f"{trial_name}.yaml")
+    for i, (config, score, all_metrics) in enumerate(results):
+        trial_name = f"trial-{i}"
         scores[trial_name] = {"score": score, "all_metrics": all_metrics}
+        trial_config_file = output_dir / f"{trial_name}.json"
+        _save_json(trial_config_file, config=config)
+
     scores_file = output_dir / "scores.json"
-
+    _save_json(scores_file, scores=scores)
+
     return output_dir


-def
+def _save_json(file_path: Path, **data: Any) -> None:  # noqa: ANN401
     """
-
+    Save the data to a file. Add the current timestamp and Python version to the data.

     Args:
-
-
-    Returns:
-        The Neptune run.
+        file_path: The path to the file.
+        data: The data to be saved.
     """
-
-
-
-
-
-
-
-
-
-
-    return run
-    return None
+    current_time = datetime.now()
+
+    data["_timestamp"] = current_time.isoformat()
+    data["_python_version"] = sys.version
+    data["_interpreter_path"] = sys.executable
+
+    file_path.parent.mkdir(parents=True, exist_ok=True)
+
+    with open(file_path, "w", encoding="utf-8") as file:
+        json.dump(data, file, indent=4)


-def
+def setup_optuna_neptune_callback(tags: str | list[str] | None = None) -> NeptuneCallback:
     """
-    Log the
+    Log the optimization process to Neptune.

     Args:
-
-        results: The evaluation results.
-        output_dir: The output directory.
+        tags: Experiment tags.
     """
-
-
-    run["evaluation/metrics"] = stringify_unsupported(results["metrics"])
-    run["evaluation/time_perf"] = stringify_unsupported(results["time_perf"])
-    run["evaluation/results"] = stringify_unsupported(results["results"])
-    run["evaluation/metrics.json"].upload((output_dir / "metrics.json").as_posix())
-    run["evaluation/results.json"].upload((output_dir / "results.json").as_posix())
+    run = Run(tags=tags)
+    return NeptuneCallback(run)
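
The reworked logging helpers are driven by an `EvaluatorResult`, but the one function whose inputs are fully visible in this hunk is `log_optimization_to_file`; below is a small sketch with made-up trial data (the directory name is arbitrary).

```python
# Sketch of the new log_optimization_to_file signature; the trial tuples
# (config dict, score, per-metric scores) are invented for illustration.
from pathlib import Path

from ragbits.evaluate.utils import log_optimization_to_file

trials = [
    ({"retriever": {"k": 3}}, 0.71, {"recall": 0.71, "precision": 0.64}),
    ({"retriever": {"k": 5}}, 0.78, {"recall": 0.78, "precision": 0.61}),
]

# Writes trial-0.json, trial-1.json and scores.json into the given directory;
# each file is stamped with a timestamp and Python version by _save_json.
output_dir = log_optimization_to_file(trials, output_dir=Path("./optimization-logs"))
print(output_dir)
```

Passing `output_dir` explicitly avoids the Hydra-managed default, which only resolves inside a Hydra run.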
{ragbits_evaluate-0.5.0.dist-info → ragbits_evaluate-1.4.0.dev202602030301.dist-info}/METADATA
RENAMED
@@ -1,13 +1,13 @@
-Metadata-Version: 2.
+Metadata-Version: 2.4
 Name: ragbits-evaluate
-Version:
+Version: 1.4.0.dev202602030301
 Summary: Evaluation module for Ragbits components
 Project-URL: Homepage, https://github.com/deepsense-ai/ragbits
 Project-URL: Bug Reports, https://github.com/deepsense-ai/ragbits/issues
 Project-URL: Documentation, https://ragbits.deepsense.ai/
 Project-URL: Source, https://github.com/deepsense-ai/ragbits
 Author-email: "deepsense.ai" <ragbits@deepsense.ai>
-License: MIT
+License-Expression: MIT
 Keywords: Evaluation,GenAI,Generative AI,LLMs,Large Language Models,RAG,Retrieval Augmented Generation
 Classifier: Development Status :: 4 - Beta
 Classifier: Environment :: Console
@@ -22,13 +22,37 @@ Classifier: Programming Language :: Python :: 3.13
 Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
 Classifier: Topic :: Software Development :: Libraries :: Python Modules
 Requires-Python: >=3.10
-Requires-Dist:
-Requires-Dist:
-Requires-Dist:
-Requires-Dist:
-Requires-Dist:
+Requires-Dist: datasets<4.0.0,>=3.0.1
+Requires-Dist: deepeval<3.0.0,>=2.0.0
+Requires-Dist: distilabel<2.0.0,>=1.5.0
+Requires-Dist: hydra-core<2.0.0,>=1.3.2
+Requires-Dist: neptune[optuna]<2.0.0,>=1.12.0
+Requires-Dist: optuna<5.0.0,>=4.0.0
+Requires-Dist: ragbits-core==1.4.0.dev202602030301
 Provides-Extra: relari
-Requires-Dist: continuous-eval
+Requires-Dist: continuous-eval<1.0.0,>=0.3.12; extra == 'relari'
 Description-Content-Type: text/markdown

 # Ragbits Evaluate
+
+Ragbits Evaluate is a package that contains tools for evaluating the performance of AI pipelines defined with Ragbits components. It also helps with automatically finding the best hyperparameter configurations for them.
+
+## Installation
+
+To install the Ragbits Evaluate package, run:
+
+```sh
+pip install ragbits-evaluate
+```
+
+<!--
+TODO: Add a minimalistic example inspired by the Quickstart chapter on Ragbits Evaluate once it is ready.
+-->
+
+## Documentation
+<!--
+TODO:
+* Add link to the Quickstart chapter on Ragbits Evaluate once it is ready.
+* Add link to API Reference once classes from the Evaluate package are added to the API Reference.
+-->
+* [How-To Guides - Evaluate](https://ragbits.deepsense.ai/how-to/evaluate/optimize/)