ragbits-evaluate 0.5.0__py3-none-any.whl → 1.4.0.dev202602030301__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (55)
  1. ragbits/evaluate/agent_simulation/__init__.py +87 -0
  2. ragbits/evaluate/agent_simulation/context.py +118 -0
  3. ragbits/evaluate/agent_simulation/conversation.py +333 -0
  4. ragbits/evaluate/agent_simulation/deepeval_evaluator.py +92 -0
  5. ragbits/evaluate/agent_simulation/logger.py +165 -0
  6. ragbits/evaluate/agent_simulation/metrics/__init__.py +19 -0
  7. ragbits/evaluate/agent_simulation/metrics/builtin.py +221 -0
  8. ragbits/evaluate/agent_simulation/metrics/collectors.py +142 -0
  9. ragbits/evaluate/agent_simulation/models.py +37 -0
  10. ragbits/evaluate/agent_simulation/results.py +200 -0
  11. ragbits/evaluate/agent_simulation/scenarios.py +129 -0
  12. ragbits/evaluate/agent_simulation/simulation.py +243 -0
  13. ragbits/evaluate/cli.py +150 -0
  14. ragbits/evaluate/config.py +11 -0
  15. ragbits/evaluate/dataloaders/__init__.py +3 -0
  16. ragbits/evaluate/dataloaders/base.py +95 -0
  17. ragbits/evaluate/dataloaders/document_search.py +61 -0
  18. ragbits/evaluate/dataloaders/exceptions.py +25 -0
  19. ragbits/evaluate/dataloaders/gaia.py +78 -0
  20. ragbits/evaluate/dataloaders/hotpot_qa.py +95 -0
  21. ragbits/evaluate/dataloaders/human_eval.py +70 -0
  22. ragbits/evaluate/dataloaders/question_answer.py +56 -0
  23. ragbits/evaluate/dataset_generator/pipeline.py +4 -4
  24. ragbits/evaluate/dataset_generator/prompts/qa.py +2 -4
  25. ragbits/evaluate/dataset_generator/tasks/corpus_generation.py +2 -4
  26. ragbits/evaluate/dataset_generator/tasks/text_generation/base.py +3 -5
  27. ragbits/evaluate/dataset_generator/tasks/text_generation/qa.py +3 -3
  28. ragbits/evaluate/evaluator.py +178 -50
  29. ragbits/evaluate/factories/__init__.py +42 -0
  30. ragbits/evaluate/metrics/__init__.py +2 -23
  31. ragbits/evaluate/metrics/base.py +40 -17
  32. ragbits/evaluate/metrics/document_search.py +40 -23
  33. ragbits/evaluate/metrics/gaia.py +84 -0
  34. ragbits/evaluate/metrics/hotpot_qa.py +51 -0
  35. ragbits/evaluate/metrics/human_eval.py +105 -0
  36. ragbits/evaluate/metrics/question_answer.py +222 -0
  37. ragbits/evaluate/optimizer.py +138 -86
  38. ragbits/evaluate/pipelines/__init__.py +37 -0
  39. ragbits/evaluate/pipelines/base.py +34 -10
  40. ragbits/evaluate/pipelines/document_search.py +72 -67
  41. ragbits/evaluate/pipelines/gaia.py +249 -0
  42. ragbits/evaluate/pipelines/hotpot_qa.py +342 -0
  43. ragbits/evaluate/pipelines/human_eval.py +323 -0
  44. ragbits/evaluate/pipelines/question_answer.py +96 -0
  45. ragbits/evaluate/utils.py +86 -59
  46. {ragbits_evaluate-0.5.0.dist-info → ragbits_evaluate-1.4.0.dev202602030301.dist-info}/METADATA +33 -9
  47. ragbits_evaluate-1.4.0.dev202602030301.dist-info/RECORD +59 -0
  48. {ragbits_evaluate-0.5.0.dist-info → ragbits_evaluate-1.4.0.dev202602030301.dist-info}/WHEEL +1 -1
  49. ragbits/evaluate/callbacks/base.py +0 -22
  50. ragbits/evaluate/callbacks/neptune.py +0 -26
  51. ragbits/evaluate/loaders/__init__.py +0 -21
  52. ragbits/evaluate/loaders/base.py +0 -24
  53. ragbits/evaluate/loaders/hf.py +0 -25
  54. ragbits_evaluate-0.5.0.dist-info/RECORD +0 -33
  55. /ragbits/evaluate/{callbacks/__init__.py → py.typed} +0 -0
ragbits/evaluate/pipelines/human_eval.py ADDED
@@ -0,0 +1,323 @@
+ import asyncio
+ import contextlib
+ import io
+ import json
+ import logging
+ import multiprocessing
+ import textwrap
+ import time
+ from collections.abc import Callable, Coroutine, Iterable
+ from dataclasses import dataclass
+ from multiprocessing.connection import Connection
+ from pathlib import Path
+ from typing import Any
+
+ from typing_extensions import Self
+
+ from ragbits.agents import Agent
+ from ragbits.core.llms.base import LLM, LLMClientOptionsT
+ from ragbits.evaluate.pipelines.base import EvaluationData, EvaluationPipeline, EvaluationResult
+
+
+ class HumanEvalData(EvaluationData):
+     """
+     Represents a single HumanEval task.
+     """
+
+     task_id: str
+     prompt: str
+     entry_point: str
+     test: str
+     canonical_solution: str | None = None
+
+
+ @dataclass
+ class HumanEvalResult(EvaluationResult):
+     """
+     Represents the result of evaluating a single HumanEval task.
+     """
+
+     task_id: str
+     entry_point: str
+     samples: list[str]
+     passed_mask: list[bool]
+     exec_durations_sec: list[float]
+     compile_ok_mask: list[bool]
+     errors: list[str | None]
+
+
+ def _execute_in_subprocess(
+     source: str, entry_point: str, test_code: str, timeout_sec: int = 10, memory_limit_mb: int | None = 512
+ ) -> tuple[bool, float, str | None]:
+     """Run candidate against HumanEval test in a subprocess with timeout."""
+
+     def _runner(pipe: Connection) -> None:
+         captured_out = io.StringIO()
+         start = time.perf_counter()
+
+         try:
+             with contextlib.redirect_stdout(captured_out), contextlib.redirect_stderr(captured_out):
+                 # Apply soft resource limits -> NOT A SANDBOX
+                 with contextlib.suppress(Exception):
+                     import os  # type: ignore
+                     import resource  # type: ignore
+                     import tempfile  # type: ignore
+
+                     cpu_secs = max(1, timeout_sec)
+                     resource.setrlimit(resource.RLIMIT_CPU, (cpu_secs, cpu_secs))
+
+                     if memory_limit_mb is not None:
+                         mem_bytes = int(memory_limit_mb) * 1024 * 1024
+                         resource.setrlimit(resource.RLIMIT_AS, (mem_bytes, mem_bytes))
+
+                     # Minimal extra security
+                     for rlim, val in (
+                         (getattr(resource, "RLIMIT_NOFILE", None), 256),
+                         (getattr(resource, "RLIMIT_NPROC", None), 64),
+                         (getattr(resource, "RLIMIT_FSIZE", None), 10 * 1024 * 1024),
+                     ):
+                         if rlim is not None:
+                             with contextlib.suppress(Exception):
+                                 resource.setrlimit(rlim, (val, val))
+
+                     # Temporary working directory for solution
+                     tmp = tempfile.TemporaryDirectory()
+                     with contextlib.suppress(Exception):
+                         os.chdir(tmp.name)
+
+                 globals_dict: dict[str, Any] = {"__name__": "__main__"}
+                 exec(compile(source, filename="candidate.py", mode="exec"), globals_dict)
+
+                 if entry_point not in globals_dict:
+                     raise NameError(f"Entry point '{entry_point}' not defined")
+
+                 harness = textwrap.dedent(f"candidate = {entry_point}\n").lstrip()
+                 test_code_clean = textwrap.dedent(test_code).lstrip()
+                 compiled_test = compile(
+                     harness + "\n" + test_code_clean + "\ncheck(candidate)", filename="test.py", mode="exec"
+                 )
+                 exec(compiled_test, globals_dict)
+
+             duration = time.perf_counter() - start
+             pipe.send((True, duration, None))
+
+         except Exception as e:
+             duration = time.perf_counter() - start
+             pipe.send((False, duration, f"{e.__class__.__name__}: {e}"))
+
+     parent_conn, child_conn = multiprocessing.Pipe()
+     proc = multiprocessing.Process(target=_runner, args=(child_conn,))
+     proc.start()
+     proc.join(timeout=timeout_sec)
+
+     if proc.is_alive():
+         proc.terminate()
+         proc.join()
+         return False, float(timeout_sec), "TimeoutError: execution exceeded time limit"
+
+     passed, duration, err = parent_conn.recv()
+     return bool(passed), float(duration), (str(err) if err is not None else None)
+
+
+ class HumanEvalPipeline(
+     EvaluationPipeline[Agent[LLMClientOptionsT, None, str] | LLM[LLMClientOptionsT], HumanEvalData, HumanEvalResult]
+ ):
+     """HumanEval evaluation pipeline for code generation models/agents."""
+
+     def __init__(
+         self,
+         evaluation_target: Agent[LLMClientOptionsT, None, str] | LLM[LLMClientOptionsT],
+         *,
+         n_samples: int = 1,
+         timeout_sec: int = 10,
+         memory_limit_mb: int | None = 512,
+         per_example_log_file: Path | None = None,
+         extended_logs: bool = False,
+         code_sanitize_fn: Callable[[str], str] | None = None,
+     ) -> None:
+         super().__init__(evaluation_target=evaluation_target)
+         self.n_samples = n_samples
+         self.timeout_sec = timeout_sec
+         self.memory_limit_mb = memory_limit_mb
+         self.per_example_log_file = per_example_log_file
+         self.extended_logs = extended_logs
+         self.code_sanitize_fn = code_sanitize_fn
+         self._init_log_file()
+
+     @classmethod
+     def from_config(cls, config: dict) -> Self:
+         """Create pipeline from config.
+         Attempts Agent first, falls back to raw LLM construction.
+         """
+         if "evaluation_target" not in config:
+             try:
+                 config["evaluation_target"] = Agent.from_config(config)
+             except Exception:
+                 config["evaluation_target"] = LLM.from_config(config)
+         return super().from_config(config)
+
+     def _process_generation(
+         self, raw: BaseException | tuple[str, dict | None] | str, debug_traces: list[dict | None] | None
+     ) -> tuple[str, dict | None]:
+         """Process a single generation result."""
+         if isinstance(raw, BaseException):
+             err_msg = f"GenerationError: {raw.__class__.__name__}: {raw}"
+             if self.extended_logs and debug_traces is not None:
+                 debug_traces.append({"error": err_msg})
+             raise raw
+
+         if self.extended_logs and isinstance(raw, tuple):
+             content, dbg = raw
+             code = self._sanitize(content)
+             if debug_traces is not None:
+                 debug_traces.append(dbg)
+             return code, dbg
+
+         if isinstance(raw, str):
+             code = self._sanitize(raw)
+             return code, None
+
+         raise TypeError(f"Unexpected type for raw: {type(raw)}")
+
+     def _evaluate_code_sample(self, code: str, row: HumanEvalData) -> tuple[bool, bool, float, str | None]:
+         """Evaluate a single code sample."""
+         # Compile check
+         try:
+             compile(code, filename="candidate.py", mode="exec")
+             compile_ok = True
+         except Exception as e:
+             return False, False, 0.0, f"SyntaxError: {e}"
+
+         ok, dur, err = _execute_in_subprocess(
+             code,
+             row.entry_point,
+             row.test,
+             timeout_sec=self.timeout_sec,
+             memory_limit_mb=self.memory_limit_mb,
+         )
+         return compile_ok, ok, dur, err
+
+     async def __call__(self, data: Iterable[HumanEvalData]) -> Iterable[HumanEvalResult]:
+         """Generate code completions per task and evaluate them.
+         Returns list of `HumanEvalResult`, one per input task.
+         """
+         results: list[HumanEvalResult] = []
+
+         for row in data:
+             prompt_input = row.prompt
+             samples: list[str] = []
+             compile_ok: list[bool] = []
+             pass_mask: list[bool] = []
+             durations: list[float] = []
+             errors: list[str | None] = []
+
+             # Produce n samples
+             gen_tasks: list[Coroutine[Any, Any, tuple[str, dict | None] | str]] = []
+             for _ in range(self.n_samples):
+                 if self.extended_logs:
+                     gen_tasks.append(self._generate_with_debug(prompt_input))
+                 else:
+                     gen_tasks.append(self._generate_code(prompt_input))
+             generations = await asyncio.gather(*gen_tasks, return_exceptions=True)
+
+             debug_traces: list[dict | None] | None = [] if self.extended_logs else None
+
+             for raw in generations:
+                 try:
+                     code, _ = self._process_generation(raw, debug_traces)
+                     samples.append(code)
+                 except BaseException as e:
+                     samples.append("")
+                     compile_ok.append(False)
+                     pass_mask.append(False)
+                     durations.append(0.0)
+                     err_msg = f"GenerationError: {e.__class__.__name__}: {e}"
+                     errors.append(err_msg)
+                     continue
+
+                 compile_result, passed, duration, error = self._evaluate_code_sample(code, row)
+                 compile_ok.append(compile_result)
+                 pass_mask.append(passed)
+                 durations.append(duration)
+                 errors.append(error)
+
+             result = HumanEvalResult(
+                 task_id=row.task_id,
+                 entry_point=row.entry_point,
+                 samples=samples,
+                 passed_mask=pass_mask,
+                 exec_durations_sec=durations,
+                 compile_ok_mask=compile_ok,
+                 errors=errors,
+             )
+             results.append(result)
+             ext_log_str = (
+                 json.dumps(debug_traces, ensure_ascii=False, default=str)
+                 if (self.extended_logs and debug_traces is not None)
+                 else None
+             )
+             self._log_example(row, result, ext_log_str)
+         return results
+
+     def _sanitize(self, text: str) -> str:
+         """Optionally sanitize code from text using the provided function.
+         If no sanitize function is provided, the original text is returned.
+         """
+         if self.code_sanitize_fn is None:
+             return text
+         try:
+             return self.code_sanitize_fn(text)
+         except Exception as exc:
+             logging.getLogger(__name__).debug("Code sanitize error: %s", exc)
+             return text
+
+     def _init_log_file(self) -> None:
+         """Ensure the per-example log file exists if logging is enabled."""
+         if self.per_example_log_file is None:
+             return
+         self.per_example_log_file.parent.mkdir(parents=True, exist_ok=True)
+         with open(self.per_example_log_file, "w", encoding="utf-8") as _:
+             pass
+
+     def _log_example(self, row: HumanEvalData, result: HumanEvalResult, extended_log: str | None = None) -> None:
+         """Append a single NDJSON record for debugging if enabled."""
+         if self.per_example_log_file is None:
+             return
+         record: dict[str, object] = {
+             "task_id": row.task_id,
+             "entry_point": row.entry_point,
+             "n_samples": len(result.samples),
+             "samples": result.samples,
+             "compile_ok_mask": result.compile_ok_mask,
+             "passed_mask": result.passed_mask,
+             "exec_durations_sec": result.exec_durations_sec,
+             "errors": result.errors,
+         }
+         record["extended_debug_logging"] = extended_log or "[]"
+         with open(self.per_example_log_file, "a", encoding="utf-8") as f:
+             f.write(json.dumps(record, ensure_ascii=False) + "\n")
+
+     async def _generate_code(self, prompt: str) -> str:
+         """Generate final answer code from Agent or raw LLM."""
+         target = self.evaluation_target
+         if isinstance(target, Agent):
+             res = await target.run(prompt)
+             return str(res.content)
+
+         resp = await target.generate(prompt)
+         return str(resp)
+
+     async def _generate_with_debug(self, prompt: str) -> tuple[str, dict | None]:
+         """Generate code and capture tool/history/usage for logging (as raw content)."""
+         target = self.evaluation_target
+         if isinstance(target, Agent):
+             res = await target.run(prompt)
+             dbg = {
+                 "history": res.history,
+                 "tool_calls": res.tool_calls,
+                 "usage": res.usage,
+                 "metadata": res.metadata,
+             }
+             return str(res.content), dbg
+         resp = await target.generate(prompt)
+         return str(resp), None
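
The new `HumanEvalPipeline` accepts either an `Agent` or a raw `LLM` as its `evaluation_target`, generates `n_samples` completions per task, and runs each candidate against the task's `check(candidate)` test in a resource-limited subprocess. A minimal usage sketch based on the signatures above; the `LiteLLM` wrapper, the model name, the fence-stripping sanitizer, and the toy task are illustrative assumptions rather than part of this diff:

```python
import asyncio

from ragbits.core.llms import LiteLLM  # assumed import path; any Agent/LLM accepted by the constructor works

from ragbits.evaluate.pipelines.human_eval import HumanEvalData, HumanEvalPipeline


def strip_markdown_fences(text: str) -> str:
    """Hypothetical sanitizer: drop Markdown code-fence lines if the model wraps its answer in them."""
    return "\n".join(line for line in text.splitlines() if not line.strip().startswith("```"))


async def main() -> None:
    pipeline = HumanEvalPipeline(
        evaluation_target=LiteLLM(model_name="gpt-4o-mini"),  # model name is illustrative
        n_samples=2,
        timeout_sec=10,
        code_sanitize_fn=strip_markdown_fences,
    )
    # A toy task in the HumanEval format: a prompt stub plus a test defining check(candidate).
    task = HumanEvalData(
        task_id="Toy/0",
        prompt='def add(a: int, b: int) -> int:\n    """Return the sum of a and b."""\n',
        entry_point="add",
        test="def check(candidate):\n    assert candidate(2, 3) == 5\n",
    )
    for result in await pipeline([task]):
        print(result.task_id, result.passed_mask, result.errors)


if __name__ == "__main__":
    asyncio.run(main())
```

Note that `_execute_in_subprocess` only applies soft `resource` limits and, as its own comment states, is not a sandbox, so untrusted generated code should still be executed in an isolated environment.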
ragbits/evaluate/pipelines/question_answer.py ADDED
@@ -0,0 +1,96 @@
+ import asyncio
+ from collections.abc import Iterable
+ from dataclasses import dataclass
+ from typing import Any, Generic
+
+ from typing_extensions import Self
+
+ from ragbits.agents._main import AgentResult
+ from ragbits.agents.types import (
+     QuestionAnswerAgent,
+     QuestionAnswerPromptInput,
+     QuestionAnswerPromptOutputT,
+ )
+ from ragbits.core.llms.base import LLMClientOptionsT
+ from ragbits.evaluate.pipelines.base import EvaluationData, EvaluationPipeline, EvaluationResult
+
+
+ class QuestionAnswerData(EvaluationData):
+     """
+     Represents the evaluation data for question answer.
+     """
+
+     question: str
+     reference_answer: str
+     reference_context: Any | None = None
+
+
+ @dataclass
+ class QuestionAnswerResult(EvaluationResult, Generic[QuestionAnswerPromptOutputT]):
+     """
+     Represents the result of a single evaluation.
+     """
+
+     question: str
+     predicted_result: AgentResult[QuestionAnswerPromptOutputT]
+     reference_answer: str
+     reference_context: Any | None = None
+
+
+ class QuestionAnswerPipeline(
+     EvaluationPipeline[
+         QuestionAnswerAgent[LLMClientOptionsT, QuestionAnswerPromptInput, QuestionAnswerPromptOutputT],
+         QuestionAnswerData,
+         QuestionAnswerResult,
+     ]
+ ):
+     """
+     Question answer evaluation pipeline.
+     """
+
+     @classmethod
+     def from_config(cls, config: dict) -> Self:
+         """
+         Create an instance of `QuestionAnswerPipeline` from a configuration dictionary.
+
+         Args:
+             config: A dictionary containing configuration settings for the pipeline.
+
+         Returns:
+             An instance of the pipeline class initialized with the provided configuration.
+         """
+         config["evaluation_target"] = QuestionAnswerAgent.from_config(config)
+         return super().from_config(config)
+
+     async def __call__(
+         self, data: Iterable[QuestionAnswerData]
+     ) -> Iterable[QuestionAnswerResult[QuestionAnswerPromptOutputT]]:
+         """
+         Run the question answer evaluation pipeline.
+
+         Args:
+             data: The evaluation data batch.
+
+         Returns:
+             The evaluation result batch.
+         """
+         results = await asyncio.gather(
+             *[
+                 self.evaluation_target.run(
+                     QuestionAnswerPromptInput(
+                         question=row.question,
+                         context=row.reference_context,
+                     )
+                 )
+                 for row in data
+             ]
+         )
+         return [
+             QuestionAnswerResult(
+                 question=row.question,
+                 predicted_result=result,
+                 reference_answer=row.reference_answer,
+                 reference_context=row.reference_context,
+             )
+             for row, result in zip(data, results, strict=False)
+         ]
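
`QuestionAnswerPipeline` wraps a `QuestionAnswerAgent` and evaluates all rows concurrently via `asyncio.gather`. A hedged sketch of direct usage, assuming an already-constructed agent, that the base `EvaluationPipeline` constructor accepts `evaluation_target` (as it does in the HumanEval pipeline above), and that `AgentResult` exposes a `content` attribute; the sample row is illustrative:

```python
import asyncio

from ragbits.agents.types import QuestionAnswerAgent
from ragbits.evaluate.pipelines.question_answer import QuestionAnswerData, QuestionAnswerPipeline


async def evaluate_qa(qa_agent: QuestionAnswerAgent) -> None:
    # qa_agent is assumed to be built elsewhere (e.g. via QuestionAnswerAgent.from_config);
    # its construction is outside the scope of this diff.
    pipeline = QuestionAnswerPipeline(evaluation_target=qa_agent)
    data = [
        QuestionAnswerData(
            question="What is the capital of France?",
            reference_answer="Paris",
            reference_context="France is a country in Europe; its capital is Paris.",
        ),
    ]
    for result in await pipeline(data):
        print(result.question, "->", result.predicted_result.content)


# asyncio.run(evaluate_qa(my_agent))  # `my_agent` is a placeholder for a configured QuestionAnswerAgent
```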
ragbits/evaluate/utils.py CHANGED
@@ -1,5 +1,7 @@
  import json
  import sys
+ import traceback
+ from dataclasses import asdict
  from datetime import datetime
  from pathlib import Path
  from typing import Any
@@ -7,34 +9,20 @@ from typing import Any
  from datasets import Dataset
  from hydra.core.hydra_config import HydraConfig
  from neptune import Run
+ from neptune.types import File
  from neptune.utils import stringify_unsupported
- from omegaconf import DictConfig, OmegaConf
+ from neptune_optuna import NeptuneCallback
+ from omegaconf import DictConfig

+ from ragbits.evaluate.evaluator import EvaluatorResult

- def _save(file_path: Path, **data: Any) -> None:  # noqa: ANN401
-     """
-     Save the data to a file. Add the current timestamp and Python version to the data.
-
-     Args:
-         file_path: The path to the file.
-         data: The data to be saved.
-     """
-     current_time = datetime.now()
-
-     data["_timestamp"] = current_time.isoformat()
-     data["_python_version"] = sys.version
-     data["_interpreter_path"] = sys.executable

-     with open(file_path, "w", encoding="utf-8") as file:
-         json.dump(data, file, indent=4)
-
-
- def log_to_file(results: dict[str, Any], output_dir: Path | None = None) -> Path:
+ def log_evaluation_to_file(result: EvaluatorResult, output_dir: Path | None = None) -> Path:
      """
-     Log the evaluation results locally.
+     Log the evaluation result locally.

      Args:
-         results: The evaluation results.
+         result: The evaluation result.
          output_dir: The output directory.

      Returns:
@@ -43,13 +31,59 @@ def log_to_file(results: dict[str, Any], output_dir: Path | None = None) -> Path
      output_dir = output_dir or Path(HydraConfig.get().runtime.output_dir)
      metrics_file = output_dir / "metrics.json"
      results_file = output_dir / "results.json"
-
-     _save(metrics_file, metrics=results["metrics"], time_perf=results["time_perf"])
-     _save(results_file, results=results["results"])
+     errors_file = output_dir / "errors.json"
+
+     _save_json(metrics_file, metrics=result.metrics, time_perf=asdict(result.time_perf))
+     _save_json(results_file, results=[asdict(entry) for entry in result.results])
+     _save_json(
+         errors_file,
+         errors=[
+             {
+                 "type": exc.__class__.__name__,
+                 "message": str(exc),
+                 "stacktrace": "".join(traceback.format_exception(type(exc), exc, exc.__traceback__)),
+             }
+             for exc in result.errors
+         ],
+     )

      return output_dir


+ def log_evaluation_to_neptune(result: EvaluatorResult, config: DictConfig, tags: str | list[str] | None = None) -> None:
+     """
+     Log the evaluation result to Neptune.
+
+     Args:
+         result: The evaluation result.
+         config: The evaluation configuration.
+         tags: The experiment tags.
+     """
+     run = Run(tags=tags)
+     run["config"] = stringify_unsupported(config)
+     run["evaluation/metrics"] = stringify_unsupported(result.metrics)
+     run["evaluation/time_perf"] = stringify_unsupported(asdict(result.time_perf))
+     run["evaluation/results"].upload(
+         File.from_content(json.dumps([asdict(entry) for entry in result.results], indent=4), extension="json")
+     )
+     run["evaluation/errors"].upload(
+         File.from_content(
+             json.dumps(
+                 [
+                     {
+                         "type": exc.__class__.__name__,
+                         "message": str(exc),
+                         "stacktrace": "".join(traceback.format_exception(type(exc), exc, exc.__traceback__)),
+                     }
+                     for exc in result.errors
+                 ],
+                 indent=4,
+             ),
+             extension="json",
+         )
+     )
+
+
  def log_dataset_to_file(dataset: Dataset, output_dir: Path | None = None) -> Path:
      """
      Log the evaluation results locally.
@@ -68,7 +102,7 @@ def log_dataset_to_file(dataset: Dataset, output_dir: Path | None = None) -> Pat


  def log_optimization_to_file(
-     results: list[tuple[DictConfig, float, dict[str, float]]], output_dir: Path | None = None
+     results: list[tuple[dict, float, dict[str, float]]], output_dir: Path | None = None
  ) -> Path:
      """
      Log the evaluation results locally.
@@ -81,53 +115,46 @@ def log_optimization_to_file(
          The output directory.
      """
      output_dir = output_dir or Path(HydraConfig.get().runtime.output_dir)
+
      scores = {}
-     for idx, (cfg, score, all_metrics) in enumerate(results):
-         trial_name = f"trial_{idx}"
-         OmegaConf.save(cfg, output_dir / f"{trial_name}.yaml")
+     for i, (config, score, all_metrics) in enumerate(results):
+         trial_name = f"trial-{i}"
          scores[trial_name] = {"score": score, "all_metrics": all_metrics}
+         trial_config_file = output_dir / f"{trial_name}.json"
+         _save_json(trial_config_file, config=config)
+
      scores_file = output_dir / "scores.json"
-     _save(scores_file, scores=scores)
+     _save_json(scores_file, scores=scores)
+
      return output_dir


- def setup_neptune(config: DictConfig) -> Run | None:
+ def _save_json(file_path: Path, **data: Any) -> None:  # noqa: ANN401
      """
-     Setup the Neptune run.
+     Save the data to a file. Add the current timestamp and Python version to the data.

      Args:
-         config: The Hydra configuration.
-
-     Returns:
-         The Neptune run.
+         file_path: The path to the file.
+         data: The data to be saved.
      """
-     if config.neptune.run:
-         run = Run(
-             project=config.neptune.project,
-             tags=[
-                 config.task.type,
-                 config.task.name,
-                 config.data.name,
-             ],
-         )
-         run["config"] = stringify_unsupported(config)
-         return run
-     return None
+     current_time = datetime.now()
+
+     data["_timestamp"] = current_time.isoformat()
+     data["_python_version"] = sys.version
+     data["_interpreter_path"] = sys.executable
+
+     file_path.parent.mkdir(parents=True, exist_ok=True)
+
+     with open(file_path, "w", encoding="utf-8") as file:
+         json.dump(data, file, indent=4)


- def log_to_neptune(run: Run, results: dict[str, Any], output_dir: Path | None = None) -> None:
+ def setup_optuna_neptune_callback(tags: str | list[str] | None = None) -> NeptuneCallback:
      """
-     Log the evaluation results to Neptune.
+     Log the optimization process to Neptune.

      Args:
-         run: The Neptune run.
-         results: The evaluation results.
-         output_dir: The output directory.
+         tags: Experiment tags.
      """
-     output_dir = output_dir or Path(HydraConfig.get().runtime.output_dir)
-
-     run["evaluation/metrics"] = stringify_unsupported(results["metrics"])
-     run["evaluation/time_perf"] = stringify_unsupported(results["time_perf"])
-     run["evaluation/results"] = stringify_unsupported(results["results"])
-     run["evaluation/metrics.json"].upload((output_dir / "metrics.json").as_posix())
-     run["evaluation/results.json"].upload((output_dir / "results.json").as_posix())
+     run = Run(tags=tags)
+     return NeptuneCallback(run)
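
The logging helpers now operate on an `EvaluatorResult` instead of a raw dict, and the old `setup_neptune`/`log_to_neptune` pair is replaced by `log_evaluation_to_neptune` plus an Optuna-oriented `setup_optuna_neptune_callback`. A sketch of how the new helpers might be wired in; it assumes Neptune credentials are already configured in the environment, and the output path and toy Optuna objective are placeholders:

```python
from pathlib import Path

import optuna

from ragbits.evaluate.evaluator import EvaluatorResult
from ragbits.evaluate.utils import log_evaluation_to_file, setup_optuna_neptune_callback


def persist_run(result: EvaluatorResult) -> None:
    """Write metrics.json, results.json and errors.json for a finished evaluation run."""
    output_dir = log_evaluation_to_file(result, output_dir=Path("eval_runs/latest"))
    print(f"Evaluation artifacts written to {output_dir}")


def run_toy_optimization() -> None:
    """Attach the Neptune-Optuna callback to a plain Optuna study (the objective is a stand-in)."""
    callback = setup_optuna_neptune_callback(tags=["ragbits-evaluate", "demo"])
    study = optuna.create_study(direction="maximize")
    study.optimize(lambda trial: trial.suggest_float("x", 0.0, 1.0), n_trials=5, callbacks=[callback])
```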
{ragbits_evaluate-0.5.0.dist-info → ragbits_evaluate-1.4.0.dev202602030301.dist-info}/METADATA CHANGED
@@ -1,13 +1,13 @@
- Metadata-Version: 2.3
+ Metadata-Version: 2.4
  Name: ragbits-evaluate
- Version: 0.5.0
+ Version: 1.4.0.dev202602030301
  Summary: Evaluation module for Ragbits components
  Project-URL: Homepage, https://github.com/deepsense-ai/ragbits
  Project-URL: Bug Reports, https://github.com/deepsense-ai/ragbits/issues
  Project-URL: Documentation, https://ragbits.deepsense.ai/
  Project-URL: Source, https://github.com/deepsense-ai/ragbits
  Author-email: "deepsense.ai" <ragbits@deepsense.ai>
- License: MIT
+ License-Expression: MIT
  Keywords: Evaluation,GenAI,Generative AI,LLMs,Large Language Models,RAG,Retrieval Augmented Generation
  Classifier: Development Status :: 4 - Beta
  Classifier: Environment :: Console
@@ -22,13 +22,37 @@ Classifier: Programming Language :: Python :: 3.13
  Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
  Classifier: Topic :: Software Development :: Libraries :: Python Modules
  Requires-Python: >=3.10
- Requires-Dist: distilabel==1.4.1
- Requires-Dist: hydra-core~=1.3.2
- Requires-Dist: neptune~=1.12.0
- Requires-Dist: optuna==4.0.0
- Requires-Dist: ragbits-core==0.5.0
+ Requires-Dist: datasets<4.0.0,>=3.0.1
+ Requires-Dist: deepeval<3.0.0,>=2.0.0
+ Requires-Dist: distilabel<2.0.0,>=1.5.0
+ Requires-Dist: hydra-core<2.0.0,>=1.3.2
+ Requires-Dist: neptune[optuna]<2.0.0,>=1.12.0
+ Requires-Dist: optuna<5.0.0,>=4.0.0
+ Requires-Dist: ragbits-core==1.4.0.dev202602030301
  Provides-Extra: relari
- Requires-Dist: continuous-eval~=0.3.12; extra == 'relari'
+ Requires-Dist: continuous-eval<1.0.0,>=0.3.12; extra == 'relari'
  Description-Content-Type: text/markdown

  # Ragbits Evaluate
+
+ Ragbits Evaluate is a package that contains tools for evaluating the performance of AI pipelines defined with Ragbits components. It also helps with automatically finding the best hyperparameter configurations for them.
+
+ ## Installation
+
+ To install the Ragbits Evaluate package, run:
+
+ ```sh
+ pip install ragbits-evaluate
+ ```
+
+ <!--
+ TODO: Add a minimalistic example inspired by the Quickstart chapter on Ragbits Evaluate once it is ready.
+ -->
+
+ ## Documentation
+ <!--
+ TODO:
+ * Add link to the Quickstart chapter on Ragbits Evaluate once it is ready.
+ * Add link to API Reference once classes from the Evaluate package are added to the API Reference.
+ -->
+ * [How-To Guides - Evaluate](https://ragbits.deepsense.ai/how-to/evaluate/optimize/)