ragbits-evaluate 1.4.0.dev202509220615__py3-none-any.whl → 1.4.0.dev202511160236__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
ragbits/evaluate/pipelines/human_eval.py (new file)
@@ -0,0 +1,323 @@
+ import asyncio
+ import contextlib
+ import io
+ import json
+ import logging
+ import multiprocessing
+ import textwrap
+ import time
+ from collections.abc import Callable, Coroutine, Iterable
+ from dataclasses import dataclass
+ from multiprocessing.connection import Connection
+ from pathlib import Path
+ from typing import Any
+
+ from typing_extensions import Self
+
+ from ragbits.agents import Agent
+ from ragbits.core.llms.base import LLM, LLMClientOptionsT
+ from ragbits.evaluate.pipelines.base import EvaluationData, EvaluationPipeline, EvaluationResult
+
+
+ class HumanEvalData(EvaluationData):
+     """
+     Represents a single HumanEval task.
+     """
+
+     task_id: str
+     prompt: str
+     entry_point: str
+     test: str
+     canonical_solution: str | None = None
+
+
+ @dataclass
+ class HumanEvalResult(EvaluationResult):
+     """
+     Represents the result of evaluating a single HumanEval task.
+     """
+
+     task_id: str
+     entry_point: str
+     samples: list[str]
+     passed_mask: list[bool]
+     exec_durations_sec: list[float]
+     compile_ok_mask: list[bool]
+     errors: list[str | None]
+
+
+ def _execute_in_subprocess(
+     source: str, entry_point: str, test_code: str, timeout_sec: int = 10, memory_limit_mb: int | None = 512
+ ) -> tuple[bool, float, str | None]:
+     """Run candidate against HumanEval test in a subprocess with timeout."""
+
+     def _runner(pipe: Connection) -> None:
+         captured_out = io.StringIO()
+         start = time.perf_counter()
+
+         try:
+             with contextlib.redirect_stdout(captured_out), contextlib.redirect_stderr(captured_out):
+                 # Apply soft resource limits -> NOT A SANDBOX
+                 with contextlib.suppress(Exception):
+                     import os  # type: ignore
+                     import resource  # type: ignore
+                     import tempfile  # type: ignore
+
+                     cpu_secs = max(1, timeout_sec)
+                     resource.setrlimit(resource.RLIMIT_CPU, (cpu_secs, cpu_secs))
+
+                     if memory_limit_mb is not None:
+                         mem_bytes = int(memory_limit_mb) * 1024 * 1024
+                         resource.setrlimit(resource.RLIMIT_AS, (mem_bytes, mem_bytes))
+
+                     # Minimal extra security
+                     for rlim, val in (
+                         (getattr(resource, "RLIMIT_NOFILE", None), 256),
+                         (getattr(resource, "RLIMIT_NPROC", None), 64),
+                         (getattr(resource, "RLIMIT_FSIZE", None), 10 * 1024 * 1024),
+                     ):
+                         if rlim is not None:
+                             with contextlib.suppress(Exception):
+                                 resource.setrlimit(rlim, (val, val))
+
+                     # Temporary working directory for solution
+                     tmp = tempfile.TemporaryDirectory()
+                     with contextlib.suppress(Exception):
+                         os.chdir(tmp.name)
+
+                 globals_dict: dict[str, Any] = {"__name__": "__main__"}
+                 exec(compile(source, filename="candidate.py", mode="exec"), globals_dict)
+
+                 if entry_point not in globals_dict:
+                     raise NameError(f"Entry point '{entry_point}' not defined")
+
+                 harness = textwrap.dedent(f"candidate = {entry_point}\n").lstrip()
+                 test_code_clean = textwrap.dedent(test_code).lstrip()
+                 compiled_test = compile(
+                     harness + "\n" + test_code_clean + "\ncheck(candidate)", filename="test.py", mode="exec"
+                 )
+                 exec(compiled_test, globals_dict)
+
+             duration = time.perf_counter() - start
+             pipe.send((True, duration, None))
+
+         except Exception as e:
+             duration = time.perf_counter() - start
+             pipe.send((False, duration, f"{e.__class__.__name__}: {e}"))
+
+     parent_conn, child_conn = multiprocessing.Pipe()
+     proc = multiprocessing.Process(target=_runner, args=(child_conn,))
+     proc.start()
+     proc.join(timeout=timeout_sec)
+
+     if proc.is_alive():
+         proc.terminate()
+         proc.join()
+         return False, float(timeout_sec), "TimeoutError: execution exceeded time limit"
+
+     passed, duration, err = parent_conn.recv()
+     return bool(passed), float(duration), (str(err) if err is not None else None)
+
+
+ class HumanEvalPipeline(
+     EvaluationPipeline[Agent[LLMClientOptionsT, None, str] | LLM[LLMClientOptionsT], HumanEvalData, HumanEvalResult]
+ ):
+     """HumanEval evaluation pipeline for code generation models/agents."""
+
+     def __init__(
+         self,
+         evaluation_target: Agent[LLMClientOptionsT, None, str] | LLM[LLMClientOptionsT],
+         *,
+         n_samples: int = 1,
+         timeout_sec: int = 10,
+         memory_limit_mb: int | None = 512,
+         per_example_log_file: Path | None = None,
+         extended_logs: bool = False,
+         code_sanitize_fn: Callable[[str], str] | None = None,
+     ) -> None:
+         super().__init__(evaluation_target=evaluation_target)
+         self.n_samples = n_samples
+         self.timeout_sec = timeout_sec
+         self.memory_limit_mb = memory_limit_mb
+         self.per_example_log_file = per_example_log_file
+         self.extended_logs = extended_logs
+         self.code_sanitize_fn = code_sanitize_fn
+         self._init_log_file()
+
+     @classmethod
+     def from_config(cls, config: dict) -> Self:
+         """Create pipeline from config.
+         Attempts Agent first, falls back to raw LLM construction.
+         """
+         if "evaluation_target" not in config:
+             try:
+                 config["evaluation_target"] = Agent.from_config(config)
+             except Exception:
+                 config["evaluation_target"] = LLM.from_config(config)
+         return super().from_config(config)
+
+     def _process_generation(
+         self, raw: BaseException | tuple[str, dict | None] | str, debug_traces: list[dict | None] | None
+     ) -> tuple[str, dict | None]:
+         """Process a single generation result."""
+         if isinstance(raw, BaseException):
+             err_msg = f"GenerationError: {raw.__class__.__name__}: {raw}"
+             if self.extended_logs and debug_traces is not None:
+                 debug_traces.append({"error": err_msg})
+             raise raw
+
+         if self.extended_logs and isinstance(raw, tuple):
+             content, dbg = raw
+             code = self._sanitize(content)
+             if debug_traces is not None:
+                 debug_traces.append(dbg)
+             return code, dbg
+
+         if isinstance(raw, str):
+             code = self._sanitize(raw)
+             return code, None
+
+         raise TypeError(f"Unexpected type for raw: {type(raw)}")
+
+     def _evaluate_code_sample(self, code: str, row: HumanEvalData) -> tuple[bool, bool, float, str | None]:
+         """Evaluate a single code sample."""
+         # Compile check
+         try:
+             compile(code, filename="candidate.py", mode="exec")
+             compile_ok = True
+         except Exception as e:
+             return False, False, 0.0, f"SyntaxError: {e}"
+
+         ok, dur, err = _execute_in_subprocess(
+             code,
+             row.entry_point,
+             row.test,
+             timeout_sec=self.timeout_sec,
+             memory_limit_mb=self.memory_limit_mb,
+         )
+         return compile_ok, ok, dur, err
+
+     async def __call__(self, data: Iterable[HumanEvalData]) -> Iterable[HumanEvalResult]:
+         """Generate code completions per task and evaluate them.
+         Returns list of `HumanEvalResult`, one per input task.
+         """
+         results: list[HumanEvalResult] = []
+
+         for row in data:
+             prompt_input = row.prompt
+             samples: list[str] = []
+             compile_ok: list[bool] = []
+             pass_mask: list[bool] = []
+             durations: list[float] = []
+             errors: list[str | None] = []
+
+             # Produce n samples
+             gen_tasks: list[Coroutine[Any, Any, tuple[str, dict | None] | str]] = []
+             for _ in range(self.n_samples):
+                 if self.extended_logs:
+                     gen_tasks.append(self._generate_with_debug(prompt_input))
+                 else:
+                     gen_tasks.append(self._generate_code(prompt_input))
+             generations = await asyncio.gather(*gen_tasks, return_exceptions=True)
+
+             debug_traces: list[dict | None] | None = [] if self.extended_logs else None
+
+             for raw in generations:
+                 try:
+                     code, _ = self._process_generation(raw, debug_traces)
+                     samples.append(code)
+                 except BaseException as e:
+                     samples.append("")
+                     compile_ok.append(False)
+                     pass_mask.append(False)
+                     durations.append(0.0)
+                     err_msg = f"GenerationError: {e.__class__.__name__}: {e}"
+                     errors.append(err_msg)
+                     continue
+
+                 compile_result, passed, duration, error = self._evaluate_code_sample(code, row)
+                 compile_ok.append(compile_result)
+                 pass_mask.append(passed)
+                 durations.append(duration)
+                 errors.append(error)
+
+             result = HumanEvalResult(
+                 task_id=row.task_id,
+                 entry_point=row.entry_point,
+                 samples=samples,
+                 passed_mask=pass_mask,
+                 exec_durations_sec=durations,
+                 compile_ok_mask=compile_ok,
+                 errors=errors,
+             )
+             results.append(result)
+             ext_log_str = (
+                 json.dumps(debug_traces, ensure_ascii=False, default=str)
+                 if (self.extended_logs and debug_traces is not None)
+                 else None
+             )
+             self._log_example(row, result, ext_log_str)
+         return results
+
+     def _sanitize(self, text: str) -> str:
+         """Optionally sanitize code from the text using the provided function.
+         If no sanitize function is provided, the original text is returned.
+         """
+         if self.code_sanitize_fn is None:
+             return text
+         try:
+             return self.code_sanitize_fn(text)
+         except Exception as exc:
+             logging.getLogger(__name__).debug("Code sanitize error: %s", exc)
+             return text
+
+     def _init_log_file(self) -> None:
+         """Ensure the per-example log file exists if logging is enabled."""
+         if self.per_example_log_file is None:
+             return
+         self.per_example_log_file.parent.mkdir(parents=True, exist_ok=True)
+         with open(self.per_example_log_file, "w", encoding="utf-8") as _:
+             pass
+
+     def _log_example(self, row: HumanEvalData, result: HumanEvalResult, extended_log: str | None = None) -> None:
+         """Append a single NDJSON record for debugging if enabled."""
+         if self.per_example_log_file is None:
+             return
+         record: dict[str, object] = {
+             "task_id": row.task_id,
+             "entry_point": row.entry_point,
+             "n_samples": len(result.samples),
+             "samples": result.samples,
+             "compile_ok_mask": result.compile_ok_mask,
+             "passed_mask": result.passed_mask,
+             "exec_durations_sec": result.exec_durations_sec,
+             "errors": result.errors,
+         }
+         record["extended_debug_logging"] = extended_log or "[]"
+         with open(self.per_example_log_file, "a", encoding="utf-8") as f:
+             f.write(json.dumps(record, ensure_ascii=False) + "\n")
+
+     async def _generate_code(self, prompt: str) -> str:
+         """Generate final answer code from Agent or raw LLM."""
+         target = self.evaluation_target
+         if isinstance(target, Agent):
+             res = await target.run(prompt)
+             return str(res.content)
+
+         resp = await target.generate(prompt)
+         return str(resp)
+
+     async def _generate_with_debug(self, prompt: str) -> tuple[str, dict | None]:
+         """Generate code and capture tool/history/usage for logging (as raw content)."""
+         target = self.evaluation_target
+         if isinstance(target, Agent):
+             res = await target.run(prompt)
+             dbg = {
+                 "history": res.history,
+                 "tool_calls": res.tool_calls,
+                 "usage": res.usage,
+                 "metadata": res.metadata,
+             }
+             return str(res.content), dbg
+         resp = await target.generate(prompt)
+         return str(resp), None
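
For orientation: the new HumanEvalPipeline generates n_samples completions per task, executes each sample against the task's check() function in a resource-limited subprocess, and records per-sample pass, compile, duration, and error information. The sketch below is a minimal, illustrative driver and is not part of the package: `llm` stands for any already-constructed ragbits LLM or Agent accepted as evaluation_target, the sample task is made up rather than taken from the real dataset, and pass_at_k is the standard unbiased estimator from the HumanEval paper, not necessarily what ragbits/evaluate/metrics/human_eval.py (also added in this release) computes.

from math import comb

from ragbits.evaluate.pipelines.human_eval import HumanEvalData, HumanEvalPipeline


def pass_at_k(n: int, c: int, k: int) -> float:
    # Unbiased pass@k estimator: 1 - C(n - c, k) / C(n, k), given n samples of which c passed.
    if n - c < k:
        return 1.0
    return 1.0 - comb(n - c, k) / comb(n, k)


async def run_eval(llm) -> None:  # `llm`: an already-built ragbits LLM or Agent (assumed, not shown)
    pipeline = HumanEvalPipeline(
        evaluation_target=llm,
        n_samples=5,            # five completions per task
        timeout_sec=10,         # per-sample execution budget inside the subprocess
        code_sanitize_fn=None,  # e.g. a callable that strips markdown fences from model output
    )
    tasks = [
        HumanEvalData(
            task_id="Example/0",  # illustrative task, not from the real HumanEval dataset
            prompt='def add(a: int, b: int) -> int:\n    """Return a + b."""\n',
            entry_point="add",
            test="def check(candidate):\n    assert candidate(1, 2) == 3\n",
        )
    ]
    results = await pipeline(tasks)
    for res in results:
        n, c = len(res.passed_mask), sum(res.passed_mask)
        print(res.task_id, "pass@1:", pass_at_k(n, c, 1), "pass@5:", pass_at_k(n, c, 5))

# Run with: asyncio.run(run_eval(llm))

Note that the pipeline executes only the (optionally sanitized) model output, so the model is expected to return a complete definition of the entry-point function, or code_sanitize_fn must reconstruct one from the raw response.
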
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: ragbits-evaluate
- Version: 1.4.0.dev202509220615
+ Version: 1.4.0.dev202511160236
  Summary: Evaluation module for Ragbits components
  Project-URL: Homepage, https://github.com/deepsense-ai/ragbits
  Project-URL: Bug Reports, https://github.com/deepsense-ai/ragbits/issues
@@ -27,7 +27,7 @@ Requires-Dist: distilabel<2.0.0,>=1.5.0
  Requires-Dist: hydra-core<2.0.0,>=1.3.2
  Requires-Dist: neptune[optuna]<2.0.0,>=1.12.0
  Requires-Dist: optuna<5.0.0,>=4.0.0
- Requires-Dist: ragbits-core==1.4.0.dev202509220615
+ Requires-Dist: ragbits-core==1.4.0.dev202511160236
  Provides-Extra: relari
  Requires-Dist: continuous-eval<1.0.0,>=0.3.12; extra == 'relari'
  Description-Content-Type: text/markdown
@@ -9,6 +9,9 @@ ragbits/evaluate/dataloaders/__init__.py,sha256=UFJFjmvi3GUQFsx6A5sYD01HH2f7TXcH
  ragbits/evaluate/dataloaders/base.py,sha256=x8rEl5utNOziF_9urL0grkqoXwMgaDWYSM5akw3Kt9Y,3213
  ragbits/evaluate/dataloaders/document_search.py,sha256=c9Bc4ZtFEKAiG9B70JFiBZlZDkBSGNWFRKabF7PMTU0,2495
  ragbits/evaluate/dataloaders/exceptions.py,sha256=xUOBLj1JuCkcqzRVnu0A0I_i1THxbDt2MEDVdDGjDyY,735
+ ragbits/evaluate/dataloaders/gaia.py,sha256=B0XnQ-K2ZW8oNOOwTeUYSok7Yj7JNP4ZERKqEZZE4zQ,2918
+ ragbits/evaluate/dataloaders/hotpot_qa.py,sha256=UJmgK9XgUpSt3-iq6FKmLvPQx9aUt-vCDB5qAcGAqfo,3698
+ ragbits/evaluate/dataloaders/human_eval.py,sha256=eJM-l1Xkf73_gj1mUv8XTz8LiWvQu2UmkJyYPK_bRY8,2626
  ragbits/evaluate/dataloaders/question_answer.py,sha256=PvG2n9zSy5bH4NJKgSxgxqHjNozLHPJijuBvryiCq_o,1964
  ragbits/evaluate/dataset_generator/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  ragbits/evaluate/dataset_generator/pipeline.py,sha256=dgnV-Qm0Z7S1Y6ga9-9RscXxxr3krOKsIj7E9WS4ANk,4940
@@ -28,11 +31,17 @@ ragbits/evaluate/factories/__init__.py,sha256=7nh0J80EfqMWRGtHx4hkfHNMztfC6FMhH8
  ragbits/evaluate/metrics/__init__.py,sha256=Mr83ytGyvdXtBlr7Bbo0-5auE0530xsd3wffKSIf8cE,95
  ragbits/evaluate/metrics/base.py,sha256=bOscQ_nJXLGWmP2ls9jncrUoeghNBnKDJsab71pFEjo,2519
  ragbits/evaluate/metrics/document_search.py,sha256=MfvMwEPenqiJdKYuW6WLvmtMch9ZVYb0T6ibpOF3vGI,3189
+ ragbits/evaluate/metrics/gaia.py,sha256=Q1oZPVAxRsQkyctJLE95fsGNewmDhzTJ5vjNoXvu10E,3086
+ ragbits/evaluate/metrics/hotpot_qa.py,sha256=Tw4gKDbua60fbE7BbxKV08-yp0PbQKTHlnk87GULNe8,1776
+ ragbits/evaluate/metrics/human_eval.py,sha256=ud4G-xaMi0f1tkzYb1V2uSgYDF-ymnKxiN6CdOWGZqU,4285
  ragbits/evaluate/metrics/question_answer.py,sha256=369lOoY76KY-wUxBKl0lSQlJSF0JhmPpehNQYeiWNHg,7072
- ragbits/evaluate/pipelines/__init__.py,sha256=Bqp_L7aRq12Ua19ELZDsdYvra6-GlLrQ9cIG2IWArko,1294
+ ragbits/evaluate/pipelines/__init__.py,sha256=PZ2477OqOV622QMC-3iwW5ThC-nYRS9KBe_nlyas3Zs,1573
  ragbits/evaluate/pipelines/base.py,sha256=QV3fjPnbJjeCgcbt8yV1Ho3BamEUc3wSca3MAzaBlV0,1739
  ragbits/evaluate/pipelines/document_search.py,sha256=tgk-I21eshdBbWVsuNa1zWK_fWuDNXhhMCn1_Fdu_Ko,3840
+ ragbits/evaluate/pipelines/gaia.py,sha256=DkVAlNI-a9chQGPyOFtjrXWGoPSWogrSfSxxgxanAb8,9686
+ ragbits/evaluate/pipelines/hotpot_qa.py,sha256=eHDQ7e_Pa1YRWkc-7oxarYHnZKKvEv_Q8gCBBvo_iss,13629
+ ragbits/evaluate/pipelines/human_eval.py,sha256=o2q3O3-OcdRBESwRFUKVU2dQt0TIMDXqxFitdKEw_fw,12406
  ragbits/evaluate/pipelines/question_answer.py,sha256=3CYVHDLnOy4z7kgYPMluiJ8POulHo-w3PEiqvqsF4Dc,2797
- ragbits_evaluate-1.4.0.dev202509220615.dist-info/METADATA,sha256=ko7BNdi_uusZ0U7s_4eNDxkAoeQajOqU80fyieC1Uug,2330
- ragbits_evaluate-1.4.0.dev202509220615.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
- ragbits_evaluate-1.4.0.dev202509220615.dist-info/RECORD,,
+ ragbits_evaluate-1.4.0.dev202511160236.dist-info/METADATA,sha256=QWqkZGqQ2lvYrzWZYyYnaZpM4yxm21mqx6mn5DuL8s8,2330
+ ragbits_evaluate-1.4.0.dev202511160236.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+ ragbits_evaluate-1.4.0.dev202511160236.dist-info/RECORD,,