ragbits-evaluate 1.4.0.dev202509220615.tar.gz → 1.4.0.dev202511160236.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (53)
  1. {ragbits_evaluate-1.4.0.dev202509220615 → ragbits_evaluate-1.4.0.dev202511160236}/CHANGELOG.md +2 -0
  2. {ragbits_evaluate-1.4.0.dev202509220615 → ragbits_evaluate-1.4.0.dev202511160236}/PKG-INFO +2 -2
  3. {ragbits_evaluate-1.4.0.dev202509220615 → ragbits_evaluate-1.4.0.dev202511160236}/pyproject.toml +2 -2
  4. ragbits_evaluate-1.4.0.dev202511160236/src/ragbits/evaluate/dataloaders/gaia.py +78 -0
  5. ragbits_evaluate-1.4.0.dev202511160236/src/ragbits/evaluate/dataloaders/hotpot_qa.py +95 -0
  6. ragbits_evaluate-1.4.0.dev202511160236/src/ragbits/evaluate/dataloaders/human_eval.py +70 -0
  7. ragbits_evaluate-1.4.0.dev202511160236/src/ragbits/evaluate/metrics/gaia.py +84 -0
  8. ragbits_evaluate-1.4.0.dev202511160236/src/ragbits/evaluate/metrics/hotpot_qa.py +51 -0
  9. ragbits_evaluate-1.4.0.dev202511160236/src/ragbits/evaluate/metrics/human_eval.py +105 -0
  10. {ragbits_evaluate-1.4.0.dev202509220615 → ragbits_evaluate-1.4.0.dev202511160236}/src/ragbits/evaluate/pipelines/__init__.py +12 -1
  11. ragbits_evaluate-1.4.0.dev202511160236/src/ragbits/evaluate/pipelines/gaia.py +249 -0
  12. ragbits_evaluate-1.4.0.dev202511160236/src/ragbits/evaluate/pipelines/hotpot_qa.py +342 -0
  13. ragbits_evaluate-1.4.0.dev202511160236/src/ragbits/evaluate/pipelines/human_eval.py +323 -0
  14. {ragbits_evaluate-1.4.0.dev202509220615 → ragbits_evaluate-1.4.0.dev202511160236}/.gitignore +0 -0
  15. {ragbits_evaluate-1.4.0.dev202509220615 → ragbits_evaluate-1.4.0.dev202511160236}/README.md +0 -0
  16. {ragbits_evaluate-1.4.0.dev202509220615 → ragbits_evaluate-1.4.0.dev202511160236}/src/ragbits/evaluate/__init__.py +0 -0
  17. {ragbits_evaluate-1.4.0.dev202509220615 → ragbits_evaluate-1.4.0.dev202511160236}/src/ragbits/evaluate/cli.py +0 -0
  18. {ragbits_evaluate-1.4.0.dev202509220615 → ragbits_evaluate-1.4.0.dev202511160236}/src/ragbits/evaluate/config.py +0 -0
  19. {ragbits_evaluate-1.4.0.dev202509220615 → ragbits_evaluate-1.4.0.dev202511160236}/src/ragbits/evaluate/dataloaders/__init__.py +0 -0
  20. {ragbits_evaluate-1.4.0.dev202509220615 → ragbits_evaluate-1.4.0.dev202511160236}/src/ragbits/evaluate/dataloaders/base.py +0 -0
  21. {ragbits_evaluate-1.4.0.dev202509220615 → ragbits_evaluate-1.4.0.dev202511160236}/src/ragbits/evaluate/dataloaders/document_search.py +0 -0
  22. {ragbits_evaluate-1.4.0.dev202509220615 → ragbits_evaluate-1.4.0.dev202511160236}/src/ragbits/evaluate/dataloaders/exceptions.py +0 -0
  23. {ragbits_evaluate-1.4.0.dev202509220615 → ragbits_evaluate-1.4.0.dev202511160236}/src/ragbits/evaluate/dataloaders/question_answer.py +0 -0
  24. {ragbits_evaluate-1.4.0.dev202509220615 → ragbits_evaluate-1.4.0.dev202511160236}/src/ragbits/evaluate/dataset_generator/__init__.py +0 -0
  25. {ragbits_evaluate-1.4.0.dev202509220615 → ragbits_evaluate-1.4.0.dev202511160236}/src/ragbits/evaluate/dataset_generator/pipeline.py +0 -0
  26. {ragbits_evaluate-1.4.0.dev202509220615 → ragbits_evaluate-1.4.0.dev202511160236}/src/ragbits/evaluate/dataset_generator/prompts/__init__.py +0 -0
  27. {ragbits_evaluate-1.4.0.dev202509220615 → ragbits_evaluate-1.4.0.dev202511160236}/src/ragbits/evaluate/dataset_generator/prompts/corpus_generation.py +0 -0
  28. {ragbits_evaluate-1.4.0.dev202509220615 → ragbits_evaluate-1.4.0.dev202511160236}/src/ragbits/evaluate/dataset_generator/prompts/qa.py +0 -0
  29. {ragbits_evaluate-1.4.0.dev202509220615 → ragbits_evaluate-1.4.0.dev202511160236}/src/ragbits/evaluate/dataset_generator/tasks/__init__.py +0 -0
  30. {ragbits_evaluate-1.4.0.dev202509220615 → ragbits_evaluate-1.4.0.dev202511160236}/src/ragbits/evaluate/dataset_generator/tasks/corpus_generation.py +0 -0
  31. {ragbits_evaluate-1.4.0.dev202509220615 → ragbits_evaluate-1.4.0.dev202511160236}/src/ragbits/evaluate/dataset_generator/tasks/filter/__init__.py +0 -0
  32. {ragbits_evaluate-1.4.0.dev202509220615 → ragbits_evaluate-1.4.0.dev202511160236}/src/ragbits/evaluate/dataset_generator/tasks/filter/base.py +0 -0
  33. {ragbits_evaluate-1.4.0.dev202509220615 → ragbits_evaluate-1.4.0.dev202511160236}/src/ragbits/evaluate/dataset_generator/tasks/filter/dont_know.py +0 -0
  34. {ragbits_evaluate-1.4.0.dev202509220615 → ragbits_evaluate-1.4.0.dev202511160236}/src/ragbits/evaluate/dataset_generator/tasks/text_generation/__init__.py +0 -0
  35. {ragbits_evaluate-1.4.0.dev202509220615 → ragbits_evaluate-1.4.0.dev202511160236}/src/ragbits/evaluate/dataset_generator/tasks/text_generation/base.py +0 -0
  36. {ragbits_evaluate-1.4.0.dev202509220615 → ragbits_evaluate-1.4.0.dev202511160236}/src/ragbits/evaluate/dataset_generator/tasks/text_generation/qa.py +0 -0
  37. {ragbits_evaluate-1.4.0.dev202509220615 → ragbits_evaluate-1.4.0.dev202511160236}/src/ragbits/evaluate/dataset_generator/utils.py +0 -0
  38. {ragbits_evaluate-1.4.0.dev202509220615 → ragbits_evaluate-1.4.0.dev202511160236}/src/ragbits/evaluate/evaluator.py +0 -0
  39. {ragbits_evaluate-1.4.0.dev202509220615 → ragbits_evaluate-1.4.0.dev202511160236}/src/ragbits/evaluate/factories/__init__.py +0 -0
  40. {ragbits_evaluate-1.4.0.dev202509220615 → ragbits_evaluate-1.4.0.dev202511160236}/src/ragbits/evaluate/metrics/__init__.py +0 -0
  41. {ragbits_evaluate-1.4.0.dev202509220615 → ragbits_evaluate-1.4.0.dev202511160236}/src/ragbits/evaluate/metrics/base.py +0 -0
  42. {ragbits_evaluate-1.4.0.dev202509220615 → ragbits_evaluate-1.4.0.dev202511160236}/src/ragbits/evaluate/metrics/document_search.py +0 -0
  43. {ragbits_evaluate-1.4.0.dev202509220615 → ragbits_evaluate-1.4.0.dev202511160236}/src/ragbits/evaluate/metrics/question_answer.py +0 -0
  44. {ragbits_evaluate-1.4.0.dev202509220615 → ragbits_evaluate-1.4.0.dev202511160236}/src/ragbits/evaluate/optimizer.py +0 -0
  45. {ragbits_evaluate-1.4.0.dev202509220615 → ragbits_evaluate-1.4.0.dev202511160236}/src/ragbits/evaluate/pipelines/base.py +0 -0
  46. {ragbits_evaluate-1.4.0.dev202509220615 → ragbits_evaluate-1.4.0.dev202511160236}/src/ragbits/evaluate/pipelines/document_search.py +0 -0
  47. {ragbits_evaluate-1.4.0.dev202509220615 → ragbits_evaluate-1.4.0.dev202511160236}/src/ragbits/evaluate/pipelines/question_answer.py +0 -0
  48. {ragbits_evaluate-1.4.0.dev202509220615 → ragbits_evaluate-1.4.0.dev202511160236}/src/ragbits/evaluate/py.typed +0 -0
  49. {ragbits_evaluate-1.4.0.dev202509220615 → ragbits_evaluate-1.4.0.dev202511160236}/src/ragbits/evaluate/utils.py +0 -0
  50. {ragbits_evaluate-1.4.0.dev202509220615 → ragbits_evaluate-1.4.0.dev202511160236}/tests/cli/test_run_evaluation.py +0 -0
  51. {ragbits_evaluate-1.4.0.dev202509220615 → ragbits_evaluate-1.4.0.dev202511160236}/tests/unit/test_evaluator.py +0 -0
  52. {ragbits_evaluate-1.4.0.dev202509220615 → ragbits_evaluate-1.4.0.dev202511160236}/tests/unit/test_metrics.py +0 -0
  53. {ragbits_evaluate-1.4.0.dev202509220615 → ragbits_evaluate-1.4.0.dev202511160236}/tests/unit/test_optimizer.py +0 -0
--- ragbits_evaluate-1.4.0.dev202509220615/CHANGELOG.md
+++ ragbits_evaluate-1.4.0.dev202511160236/CHANGELOG.md
@@ -2,6 +2,8 @@
  
  ## Unreleased
  
+ - Feat: introduce agent evaluation pipelines and metrics (HotpotQA, HumanEval, GAIA) (#829)
+
  ## 1.3.0 (2025-09-11)
  
  ### Changed
--- ragbits_evaluate-1.4.0.dev202509220615/PKG-INFO
+++ ragbits_evaluate-1.4.0.dev202511160236/PKG-INFO
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: ragbits-evaluate
- Version: 1.4.0.dev202509220615
+ Version: 1.4.0.dev202511160236
  Summary: Evaluation module for Ragbits components
  Project-URL: Homepage, https://github.com/deepsense-ai/ragbits
  Project-URL: Bug Reports, https://github.com/deepsense-ai/ragbits/issues
@@ -27,7 +27,7 @@ Requires-Dist: distilabel<2.0.0,>=1.5.0
  Requires-Dist: hydra-core<2.0.0,>=1.3.2
  Requires-Dist: neptune[optuna]<2.0.0,>=1.12.0
  Requires-Dist: optuna<5.0.0,>=4.0.0
- Requires-Dist: ragbits-core==1.4.0.dev202509220615
+ Requires-Dist: ragbits-core==1.4.0.dev202511160236
  Provides-Extra: relari
  Requires-Dist: continuous-eval<1.0.0,>=0.3.12; extra == 'relari'
  Description-Content-Type: text/markdown
--- ragbits_evaluate-1.4.0.dev202509220615/pyproject.toml
+++ ragbits_evaluate-1.4.0.dev202511160236/pyproject.toml
@@ -1,6 +1,6 @@
  [project]
  name = "ragbits-evaluate"
- version = "1.4.0.dev202509220615"
+ version = "1.4.0.dev202511160236"
  description = "Evaluation module for Ragbits components"
  readme = "README.md"
  requires-python = ">=3.10"
@@ -32,7 +32,7 @@ classifiers = [
      "Topic :: Scientific/Engineering :: Artificial Intelligence",
      "Topic :: Software Development :: Libraries :: Python Modules",
  ]
- dependencies = ["hydra-core>=1.3.2,<2.0.0", "neptune[optuna]>=1.12.0,<2.0.0", "optuna>=4.0.0,<5.0.0", "distilabel>=1.5.0,<2.0.0", "datasets>=3.0.1,<4.0.0", "ragbits-core==1.4.0.dev202509220615"]
+ dependencies = ["hydra-core>=1.3.2,<2.0.0", "neptune[optuna]>=1.12.0,<2.0.0", "optuna>=4.0.0,<5.0.0", "distilabel>=1.5.0,<2.0.0", "datasets>=3.0.1,<4.0.0", "ragbits-core==1.4.0.dev202511160236"]
  
  [project.urls]
  "Homepage" = "https://github.com/deepsense-ai/ragbits"
--- /dev/null
+++ ragbits_evaluate-1.4.0.dev202511160236/src/ragbits/evaluate/dataloaders/gaia.py
@@ -0,0 +1,78 @@
+ from collections.abc import Iterable
+
+ from ragbits.core.sources.base import Source
+ from ragbits.evaluate.dataloaders.base import DataLoader
+ from ragbits.evaluate.pipelines.gaia import GaiaData
+
+
+ class GaiaDataLoader(DataLoader[GaiaData]):
+     """
+     GAIA benchmark evaluation data loader.
+
+     The source should point to a local/remote JSON or JSONL file exported from the
+     Hugging Face dataset `gaia-benchmark/GAIA`. Rows are expected to contain at least:
+     - "task_id" (str)
+     - "Question" (str)
+     - "Level" (int)
+     - "Final answer" (str)
+     """
+
+     def __init__(
+         self,
+         source: Source,
+         *,
+         split: str = "data",
+         task_id_key: str = "task_id",
+         question_key: str = "Question",
+         level_key: str = "Level",
+         final_answer_key: str = "Final answer",
+         file_name_key: str = "file_name",
+         skip_file_attachments: bool = False,
+     ) -> None:
+         """
+         Initialize the GAIA data loader.
+
+         Args:
+             source: The source to load the data from.
+             split: The split to load the data from (file name generated by the source helper).
+             task_id_key: Column name for GAIA task identifier.
+             question_key: Column name for the natural language question.
+             level_key: Column name for numeric difficulty level (1, 2, 3).
+             final_answer_key: Column name for the final ground-truth answer.
+             file_name_key: Column name with optional associated file name (may be empty).
+             skip_file_attachments: If True, skip rows that have a non-empty file attachment.
+         """
+         required = {task_id_key, question_key, level_key, final_answer_key}
+         super().__init__(source=source, split=split, required_keys=required)
+         self.task_id_key = task_id_key
+         self.question_key = question_key
+         self.level_key = level_key
+         self.final_answer_key = final_answer_key
+         self.file_name_key = file_name_key
+         self.skip_file_attachments = skip_file_attachments
+
+     async def map(self, dataset: Iterable[dict]) -> Iterable[GaiaData]:
+         """
+         Map the dataset to the GAIA evaluation data schema.
+
+         Args:
+             dataset: The dataset to map.
+
+         Returns:
+             The GAIA evaluation data rows.
+         """
+         return [
+             GaiaData(
+                 task_id=str(row.get(self.task_id_key, "")),
+                 question=str(row.get(self.question_key, "")),
+                 level=int(row.get(self.level_key, 1)),
+                 reference_answer=str(row.get(self.final_answer_key, "")),
+                 file_name=(row.get(self.file_name_key) or None),
+             )
+             for row in dataset
+             if (
+                 not self.skip_file_attachments
+                 or not row.get(self.file_name_key)
+                 or str(row.get(self.file_name_key)).strip() == ""
+             )
+         ]
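
Editorial note (not part of the diff): a minimal usage sketch of the new loader, assuming `source` is some ragbits `Source` pointing at a GAIA export and that `GaiaData` exposes its fields as attributes; `map` is called directly on an in-memory row only to show the expected input and output shape.

import asyncio

from ragbits.evaluate.dataloaders.gaia import GaiaDataLoader

async def demo(source):  # `source` is assumed to be any ragbits Source pointing at a GAIA export
    loader = GaiaDataLoader(source, skip_file_attachments=True)
    rows = await loader.map([
        # Illustrative row only; field names follow the GAIA column names documented above.
        {"task_id": "t-001", "Question": "What year was X founded?", "Level": 1,
         "Final answer": "1999", "file_name": ""},
    ])
    for item in rows:
        print(item.task_id, item.level, item.reference_answer)  # -> t-001 1 1999

# asyncio.run(demo(my_source))  # `my_source` must be built from a ragbits-core Source implementation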
--- /dev/null
+++ ragbits_evaluate-1.4.0.dev202511160236/src/ragbits/evaluate/dataloaders/hotpot_qa.py
@@ -0,0 +1,95 @@
+ from collections.abc import Iterable
+ from typing import Any
+
+ from ragbits.core.sources.base import Source
+ from ragbits.evaluate.dataloaders.base import DataLoader
+ from ragbits.evaluate.pipelines.hotpot_qa import HotpotQAData
+
+
+ class HotpotQADataLoader(DataLoader[HotpotQAData]):
+     """
+     HotpotQA evaluation data loader.
+
+     The source should point to a local/remote JSON file exported from Hugging Face, where each example includes at
+     least the following keys:
+     - "id" (str)
+     - "question" (str)
+     - "answer" (str)
+     - "type" ("bridge" | "comparison")
+     - "level" ("easy" | "medium" | "hard")
+     - "context" (object with keys: "title": list[str], "sentences": list[list[str]])
+     """
+
+     def __init__(
+         self,
+         source: Source,
+         *,
+         split: str = "data",
+         id_key: str = "id",
+         question_key: str = "question",
+         answer_key: str = "answer",
+         type_key: str = "type",
+         level_key: str = "level",
+         context_key: str = "context",
+         # filter
+         level_filter: str | None = None,  # one of: easy|medium|hard
+     ) -> None:
+         """
+         Initialize the HotpotQA data loader.
+
+         Args:
+             source: The source to load the data from.
+             split: The split to load the data from.
+             id_key: Column with unique id.
+             question_key: Column with question text.
+             answer_key: Column with ground truth answer.
+             type_key: Column with question type ("bridge" | "comparison").
+             level_key: Column with difficulty ("easy" | "medium" | "hard").
+             context_key: Column with context object containing titles and sentences.
+             level_filter: If provided, return only examples with this level.
+         """
+         required = {id_key, question_key, answer_key, type_key, level_key, context_key}
+         super().__init__(source=source, split=split, required_keys=required)
+         self.id_key = id_key
+         self.question_key = question_key
+         self.answer_key = answer_key
+         self.type_key = type_key
+         self.level_key = level_key
+         self.context_key = context_key
+         self.level_filter = level_filter
+
+     async def map(self, dataset: Iterable[dict]) -> Iterable[HotpotQAData]:
+         """
+         Map the dataset to the HotpotQA evaluation data schema.
+
+         Args:
+             dataset: The dataset to map.
+
+         Returns:
+             The HotpotQA evaluation data rows.
+         """
+
+         def to_context_rows(context: dict[str, Any]) -> list[str]:
+             titles = context.get("title", []) or []
+             sentences = context.get("sentences", []) or []
+             rows: list[str] = []
+             for title, sent_list in zip(titles, sentences, strict=False):
+                 doc_text = "\n".join(sent_list) if isinstance(sent_list, list) else str(sent_list)
+                 rows.append(f"{title}\n{doc_text}")
+             if not rows and isinstance(sentences, list):
+                 flat = "\n".join([" ".join(s) if isinstance(s, list) else str(s) for s in sentences])
+                 rows = [flat]
+             return rows
+
+         return [
+             HotpotQAData(
+                 id=row.get(self.id_key, ""),
+                 question=row.get(self.question_key, ""),
+                 reference_answer=str(row.get(self.answer_key, "")),
+                 qtype=str(row.get(self.type_key, "")),
+                 level=(row.get(self.level_key) or "").lower(),
+                 reference_context=to_context_rows(row.get(self.context_key, {}) or {}),
+             )
+             for row in dataset
+             if not self.level_filter or (row.get(self.level_key, "").lower() == self.level_filter)
+         ]
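
Editorial note (not part of the diff): the `to_context_rows` helper above flattens the HotpotQA context object into one string per supporting document. A small sketch of that transformation on an illustrative context value, mirroring the helper (which is local to `HotpotQADataLoader.map`):

# Illustrative context object in the export format described in the docstring above.
context = {
    "title": ["Title A", "Title B"],
    "sentences": [
        ["First sentence of document A.", "Second sentence of document A."],
        ["Only sentence of document B."],
    ],
}

# Each (title, sentences) pair becomes "<title>\n<sentences joined by newline>":
rows = [f"{title}\n" + "\n".join(sents) for title, sents in zip(context["title"], context["sentences"], strict=False)]
assert rows[0] == "Title A\nFirst sentence of document A.\nSecond sentence of document A."
assert rows[1] == "Title B\nOnly sentence of document B."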
--- /dev/null
+++ ragbits_evaluate-1.4.0.dev202511160236/src/ragbits/evaluate/dataloaders/human_eval.py
@@ -0,0 +1,70 @@
+ from collections.abc import Iterable
+
+ from ragbits.core.sources.base import Source
+ from ragbits.evaluate.dataloaders.base import DataLoader
+ from ragbits.evaluate.pipelines.human_eval import HumanEvalData
+
+
+ class HumanEvalDataLoader(DataLoader[HumanEvalData]):
+     """
+     HumanEval evaluation data loader.
+
+     The source should point to a local/remote JSONL file in HumanEval format, where each line is a JSON object
+     with at least the following keys:
+     - "task_id" (str)
+     - "prompt" (str)
+     - "entry_point" (str)
+     - "test" (str)
+     """
+
+     def __init__(
+         self,
+         source: Source,
+         *,
+         split: str = "data",
+         task_id_key: str = "task_id",
+         prompt_key: str = "prompt",
+         entry_point_key: str = "entry_point",
+         test_key: str = "test",
+         canonical_solution_key: str | None = "canonical_solution",
+     ) -> None:
+         """
+         Initialize the HumanEval data loader.
+
+         Args:
+             source: The source to load the data from.
+             split: The split to load the data from.
+             task_id_key: Dataset column with the HumanEval task identifier.
+             prompt_key: Dataset column with the Python prompt (function signature and docstring).
+             entry_point_key: Dataset column with the function name to evaluate.
+             test_key: Dataset column with the Python test harness defining `check(candidate)`.
+             canonical_solution_key: Optional dataset column with the reference solution (not used for scoring).
+         """
+         required = {task_id_key, prompt_key, entry_point_key, test_key}
+         super().__init__(source=source, split=split, required_keys=required)
+         self.task_id_key = task_id_key
+         self.prompt_key = prompt_key
+         self.entry_point_key = entry_point_key
+         self.test_key = test_key
+         self.canonical_solution_key = canonical_solution_key
+
+     async def map(self, dataset: Iterable[dict]) -> Iterable[HumanEvalData]:
+         """
+         Map the dataset to the HumanEval evaluation data schema.
+
+         Args:
+             dataset: The dataset to map.
+
+         Returns:
+             The HumanEval evaluation data rows.
+         """
+         return [
+             HumanEvalData(
+                 task_id=row.get(self.task_id_key, ""),
+                 prompt=row.get(self.prompt_key, ""),
+                 entry_point=row.get(self.entry_point_key, ""),
+                 test=row.get(self.test_key, ""),
+                 canonical_solution=(row.get(self.canonical_solution_key) if self.canonical_solution_key else None),
+             )
+             for row in dataset
+         ]
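
Editorial note (not part of the diff): the general shape of a HumanEval-format JSONL row that the loader above copies one-to-one into `HumanEvalData`. The values are illustrative, not taken from the real dataset.

row = {
    "task_id": "HumanEval/42",  # illustrative id
    "prompt": 'def add(a: int, b: int) -> int:\n    """Return the sum of a and b."""\n',
    "entry_point": "add",
    "test": "def check(candidate):\n    assert candidate(1, 2) == 3\n",
    "canonical_solution": "    return a + b\n",  # optional; not used for scoring
}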
--- /dev/null
+++ ragbits_evaluate-1.4.0.dev202511160236/src/ragbits/evaluate/metrics/gaia.py
@@ -0,0 +1,84 @@
+ from statistics import mean
+
+ from ragbits.evaluate.metrics.base import Metric
+ from ragbits.evaluate.pipelines.gaia import GaiaResult
+
+
+ class GaiaOutcome(Metric[GaiaResult]):
+     """
+     Computes task success rate over GAIA tasks.
+     Measures the fraction of tasks that were successfully solved.
+     """
+
+     @staticmethod
+     async def compute(results: list[GaiaResult]) -> dict:
+         """Compute task success rate.
+
+         Returns:
+             Dictionary with gaia_task_success_rate: fraction of successfully solved tasks.
+         """
+         success_count = sum(1 for r in results if r.task_success)
+         success_rate = (success_count / len(results)) if results else 0.0
+
+         return {"gaia_task_success_rate": float(success_rate)}
+
+
+ class GaiaTooling(Metric[GaiaResult]):
+     """
+     Tool utilization and performance metrics:
+     - gaia_tool_trigger_rate: fraction of tasks where tools were used
+     - gaia_avg_num_tool_calls: average number of tool calls per task
+     - gaia_avg_tool_error_count: average number of tool errors per task
+     - averaged_freq: average tool usage/calls per task
+     """
+
+     @staticmethod
+     async def compute(results: list[GaiaResult]) -> dict:
+         """Compute tool utilization and performance metrics.
+
+         Returns:
+             Dictionary with tool trigger rate, average tool calls, average errors,
+             and flattened tool frequency usage as numeric metrics.
+         """
+         tool_triggered_count = sum(1 for r in results if r.tool_triggered)
+         tool_trigger_rate = (tool_triggered_count / len(results)) if results else 0.0
+         avg_tool_calls = float(mean(r.num_tool_calls for r in results)) if results else 0.0
+         avg_tool_errors = float(mean(r.tool_error_count for r in results)) if results else 0.0
+
+         # tool frequency as average per task (mean calls per task per tool)
+         total_tasks = len(results) if results else 1
+         aggregated_counts: dict[str, int] = {}
+         for r in results:
+             if r.tool_names:
+                 for name in r.tool_names:
+                     aggregated_counts[name] = aggregated_counts.get(name, 0) + 1
+         averaged_freq: dict[str, float] = {
+             f"gaia_tool_frequency_usage.{name}": (count / total_tasks) for name, count in aggregated_counts.items()
+         }
+
+         return {
+             "gaia_tool_trigger_rate": float(tool_trigger_rate),
+             "gaia_avg_num_tool_calls": avg_tool_calls,
+             "gaia_avg_tool_error_count": avg_tool_errors,
+             **averaged_freq,
+         }
+
+
+ class GaiaEfficiency(Metric[GaiaResult]):
+     """
+     Efficiency and resource usage metrics:
+     - gaia_avg_latency_ms: average response latency in milliseconds
+     """
+
+     @staticmethod
+     async def compute(results: list[GaiaResult]) -> dict:
+         """Compute efficiency and resource usage metrics.
+
+         Returns:
+             Dictionary with average latency.
+         """
+         avg_latency = float(mean(r.total_latency_ms for r in results)) if results else 0.0
+
+         return {
+             "gaia_avg_latency_ms": avg_latency,
+         }
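
Editorial note (not part of the diff): a quick sanity check of the tooling aggregation above. `GaiaTooling.compute` only reads a handful of attributes from each result, so plain stand-in objects are used here instead of real `GaiaResult` instances, whose definition lives in `ragbits.evaluate.pipelines.gaia` and is not shown in this diff.

import asyncio
from types import SimpleNamespace

from ragbits.evaluate.metrics.gaia import GaiaTooling

# Stand-ins for GaiaResult: only the attributes read by GaiaTooling.compute are provided.
results = [
    SimpleNamespace(tool_triggered=True, num_tool_calls=2, tool_error_count=0, tool_names=["search", "search"]),
    SimpleNamespace(tool_triggered=False, num_tool_calls=0, tool_error_count=1, tool_names=[]),
]

print(asyncio.run(GaiaTooling.compute(results)))
# Per the code above: trigger rate 0.5, avg calls 1.0, avg errors 0.5,
# and gaia_tool_frequency_usage.search == 1.0 (two "search" calls over two tasks).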
--- /dev/null
+++ ragbits_evaluate-1.4.0.dev202511160236/src/ragbits/evaluate/metrics/hotpot_qa.py
@@ -0,0 +1,51 @@
+ from collections import defaultdict
+ from collections.abc import Iterable
+
+ from ragbits.evaluate.metrics.base import Metric
+ from ragbits.evaluate.pipelines.hotpot_qa import HotpotQAResult
+
+
+ class HotpotQAExactMatch(Metric[HotpotQAResult]):
+     """Computes EM over HotpotQA by type and overall."""
+
+     @staticmethod
+     async def compute(results: list[HotpotQAResult]) -> dict:
+         """Compute EM. Returns hotpotqa_<type>_em and hotpotqa_overall_em."""
+         buckets: dict[str, list[float]] = defaultdict(list)
+         for r in results:
+             em = r.em_value
+             t = r.qtype or "unknown"
+             buckets[t].append(em)
+             buckets["overall"].append(em)
+
+         def avg(vals: Iterable[float]) -> float:
+             lst = list(vals)
+             return float(sum(lst) / len(lst)) if lst else 0.0
+
+         metrics: dict[str, float] = {}
+         for t, vals in buckets.items():
+             metrics[f"hotpotqa_{t}_em"] = avg(vals)
+         return metrics
+
+
+ class HotpotQAF1(Metric[HotpotQAResult]):
+     """Computes token-level F1 over HotpotQA by type and overall."""
+
+     @staticmethod
+     async def compute(results: list[HotpotQAResult]) -> dict:
+         """Compute F1. Returns hotpotqa_<type>_f1 and hotpotqa_overall_f1."""
+         buckets: dict[str, list[float]] = defaultdict(list)
+         for r in results:
+             f1v = r.f1_value
+             t = r.qtype or "unknown"
+             buckets[t].append(f1v)
+             buckets["overall"].append(f1v)
+
+         def avg(vals: Iterable[float]) -> float:
+             lst = list(vals)
+             return float(sum(lst) / len(lst)) if lst else 0.0
+
+         metrics: dict[str, float] = {}
+         for t, vals in buckets.items():
+             metrics[f"hotpotqa_{t}_f1"] = avg(vals)
+         return metrics
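
Editorial note (not part of the diff): the two HotpotQA metric classes above only bucket precomputed per-result scores by question type (the EM/F1 values themselves come from the pipeline, not shown here). A stand-in based sketch, with `SimpleNamespace` replacing `HotpotQAResult`:

import asyncio
from types import SimpleNamespace

from ragbits.evaluate.metrics.hotpot_qa import HotpotQAExactMatch

# Stand-ins for HotpotQAResult: only em_value and qtype are read by HotpotQAExactMatch.compute.
results = [
    SimpleNamespace(em_value=1.0, qtype="bridge"),
    SimpleNamespace(em_value=0.0, qtype="bridge"),
    SimpleNamespace(em_value=1.0, qtype="comparison"),
]

print(asyncio.run(HotpotQAExactMatch.compute(results)))
# Per the code above: hotpotqa_bridge_em == 0.5, hotpotqa_comparison_em == 1.0, hotpotqa_overall_em == 2/3.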
--- /dev/null
+++ ragbits_evaluate-1.4.0.dev202511160236/src/ragbits/evaluate/metrics/human_eval.py
@@ -0,0 +1,105 @@
+ import math
+ from statistics import mean
+
+ from ragbits.evaluate.metrics.base import Metric
+ from ragbits.evaluate.pipelines.human_eval import HumanEvalResult
+
+
+ class HumanEvalPassAtK(Metric[HumanEvalResult]):
+     """
+     Computes pass@k over HumanEval tasks.
+     Measures the fraction of tasks with at least one passing sample out of k attempts.
+     """
+
+     def __init__(self, k: int = 1) -> None:
+         super().__init__()
+         self.k = k
+
+     async def compute(self, results: list[HumanEvalResult]) -> dict:
+         """Compute pass@k averaged over tasks.
+
+         Returns:
+             Dictionary with humaneval_pass@k: fraction of tasks with at least one passing sample.
+         """
+         values = []
+         for r in results:
+             n = len(r.passed_mask)
+             m = sum(1 for x in r.passed_mask if x)
+             k = min(self.k, n)
+             if n == 0 or k == 0:
+                 values.append(0.0)
+                 continue
+             if m == 0:
+                 values.append(0.0)
+                 continue
+             if m == n:
+                 values.append(1.0)
+                 continue
+             # 1 - C(n-m, k) / C(n, k)
+             denom = math.comb(n, k)
+             numer = math.comb(n - m, k) if n - m >= k else 0
+             values.append(1.0 - (numer / denom))
+         return {f"humaneval_pass@{self.k}": float(mean(values)) if values else 0.0}
+
+
+ class HumanEvalQualityPerf(Metric[HumanEvalResult]):
+     """
+     Code quality and execution performance metrics:
+     - humaneval_compile_rate: fraction of samples that compiled
+     - humaneval_syntax_error_rate: fraction of samples with syntax error (compile failed)
+     - humaneval_assert_fail_rate: fraction of samples that ran but failed assertions
+     - humaneval_runtime_error_rate: fraction of samples with other runtime errors
+     - humaneval_timeout_rate: fraction of samples that timed out
+     - humaneval_tasks_solved: fraction of tasks with any passing sample
+     - humaneval_avg_exec_time_sec: average exec time over compilable runs
+     """
+
+     @staticmethod
+     async def compute(results: list[HumanEvalResult]) -> dict:
+         """Compute code quality and execution performance metrics.
+
+         Returns:
+             Dictionary with compile rates, error rates, tasks solved rate, and average execution time.
+         """
+         total_samples = sum(len(r.passed_mask) for r in results)
+         compiled = 0
+         syntax_errors = 0
+         assert_fails = 0
+         runtime_errors = 0
+         timeouts = 0
+         any_pass = sum(1 for r in results if any(r.passed_mask))
+         durations: list[float] = []
+
+         for r in results:
+             for ok, err, dur in zip(r.compile_ok_mask, r.errors, r.exec_durations_sec, strict=False):
+                 if ok:
+                     compiled += 1
+                     durations.append(dur)
+                     if err:
+                         if err.startswith("AssertionError"):
+                             assert_fails += 1
+                         elif err.startswith("TimeoutError"):
+                             timeouts += 1
+                         else:
+                             runtime_errors += 1
+                 else:
+                     # Compile failed: count as syntax error
+                     syntax_errors += 1
+
+         compile_rate = (compiled / total_samples) if total_samples else 0.0
+         syntax_error_rate = (syntax_errors / total_samples) if total_samples else 0.0
+         assert_fail_rate = (assert_fails / total_samples) if total_samples else 0.0
+         runtime_error_rate = (runtime_errors / total_samples) if total_samples else 0.0
+         timeout_rate = (timeouts / total_samples) if total_samples else 0.0
+         tasks_solved = (any_pass / len(results)) if results else 0.0
+         avg_exec_time = float(mean(durations)) if durations else 0.0
+
+         return {
+             "humaneval_compile_rate": float(compile_rate),
+             "humaneval_syntax_error_rate": float(syntax_error_rate),
+             "humaneval_assert_fail_rate": float(assert_fail_rate),
+             "humaneval_runtime_error_rate": float(runtime_error_rate),
+             "humaneval_timeout_rate": float(timeout_rate),
+             "humaneval_tasks_solved": float(tasks_solved),
+             "humaneval_avg_exec_time_sec": avg_exec_time,
+         }
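
Editorial note (not part of the diff): a worked check of the unbiased pass@k estimator used in `HumanEvalPassAtK.compute` above.

import math

# n = 5 generated samples for a task, m = 2 of them pass, k = 2:
# pass@2 = 1 - C(n - m, k) / C(n, k) = 1 - C(3, 2) / C(5, 2) = 1 - 3/10 = 0.7
n, m, k = 5, 2, 2
print(1.0 - math.comb(n - m, k) / math.comb(n, k))  # 0.7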
--- ragbits_evaluate-1.4.0.dev202509220615/src/ragbits/evaluate/pipelines/__init__.py
+++ ragbits_evaluate-1.4.0.dev202511160236/src/ragbits/evaluate/pipelines/__init__.py
@@ -2,8 +2,19 @@ from ragbits.core.utils.config_handling import WithConstructionConfig
  from ragbits.document_search import DocumentSearch
  from ragbits.evaluate.pipelines.base import EvaluationData, EvaluationPipeline, EvaluationResult
  from ragbits.evaluate.pipelines.document_search import DocumentSearchPipeline
+ from ragbits.evaluate.pipelines.gaia import GaiaPipeline
+ from ragbits.evaluate.pipelines.hotpot_qa import HotpotQAPipeline
+ from ragbits.evaluate.pipelines.human_eval import HumanEvalPipeline
  
- __all__ = ["DocumentSearchPipeline", "EvaluationData", "EvaluationPipeline", "EvaluationResult"]
+ __all__ = [
+     "DocumentSearchPipeline",
+     "EvaluationData",
+     "EvaluationPipeline",
+     "EvaluationResult",
+     "GaiaPipeline",
+     "HotpotQAPipeline",
+     "HumanEvalPipeline",
+ ]
  
  _target_to_evaluation_pipeline: dict[type[WithConstructionConfig], type[EvaluationPipeline]] = {
      DocumentSearch: DocumentSearchPipeline,