crfm-helm 0.5.8__py3-none-any.whl → 0.5.9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (121)
  1. {crfm_helm-0.5.8.dist-info → crfm_helm-0.5.9.dist-info}/METADATA +3 -1
  2. {crfm_helm-0.5.8.dist-info → crfm_helm-0.5.9.dist-info}/RECORD +117 -115
  3. helm/benchmark/adaptation/adapter_spec.py +5 -0
  4. helm/benchmark/metrics/bbq_metrics.py +12 -0
  5. helm/benchmark/metrics/evaluate_reference_metrics.py +12 -0
  6. helm/benchmark/metrics/safety_metrics.py +13 -1
  7. helm/benchmark/metrics/ultra_suite_asr_classification_metrics.py +52 -0
  8. helm/benchmark/presentation/run_display.py +13 -3
  9. helm/benchmark/presentation/run_entry.py +2 -2
  10. helm/benchmark/run.py +1 -1
  11. helm/benchmark/run_specs/arabic_run_specs.py +6 -0
  12. helm/benchmark/run_specs/medhelm_run_specs.py +2 -2
  13. helm/benchmark/run_specs/speech_disorder_audio_run_specs.py +6 -2
  14. helm/benchmark/scenarios/anthropic_red_team_scenario.py +12 -1
  15. helm/benchmark/scenarios/audio_language/ultra_suite_asr_classification_scenario.py +24 -54
  16. helm/benchmark/scenarios/audio_language/ultra_suite_asr_transcription_scenario.py +19 -48
  17. helm/benchmark/scenarios/audio_language/ultra_suite_classification_scenario.py +22 -61
  18. helm/benchmark/scenarios/audio_language/ultra_suite_disorder_breakdown_scenario.py +21 -29
  19. helm/benchmark/scenarios/audio_language/ultra_suite_disorder_symptoms_scenario.py +21 -60
  20. helm/benchmark/scenarios/banking77_scenario.py +21 -0
  21. helm/benchmark/scenarios/bbq_scenario.py +1 -1
  22. helm/benchmark/scenarios/bird_sql_scenario.py +18 -0
  23. helm/benchmark/scenarios/commonsense_scenario.py +7 -1
  24. helm/benchmark/scenarios/czech_bank_qa_scenario.py +18 -0
  25. helm/benchmark/scenarios/fin_qa_scenario.py +20 -0
  26. helm/benchmark/scenarios/financebench_scenario.py +21 -0
  27. helm/benchmark/scenarios/gsm_scenario.py +9 -3
  28. helm/benchmark/scenarios/harm_bench_gcg_transfer_scenario.py +12 -1
  29. helm/benchmark/scenarios/harm_bench_scenario.py +12 -1
  30. helm/benchmark/scenarios/infinite_bench_en_mc_scenario.py +21 -0
  31. helm/benchmark/scenarios/infinite_bench_en_sum_scenario.py +19 -0
  32. helm/benchmark/scenarios/legalbench_scenario.py +6 -7
  33. helm/benchmark/scenarios/math_scenario.py +11 -4
  34. helm/benchmark/scenarios/med_qa_scenario.py +7 -1
  35. helm/benchmark/scenarios/medi_qa_scenario.py +2 -2
  36. helm/benchmark/scenarios/mmlu_scenario.py +8 -2
  37. helm/benchmark/scenarios/narrativeqa_scenario.py +3 -4
  38. helm/benchmark/scenarios/openai_mrcr_scenario.py +15 -0
  39. helm/benchmark/scenarios/ruler_qa_scenarios.py +40 -0
  40. helm/benchmark/scenarios/simple_safety_tests_scenario.py +12 -1
  41. helm/benchmark/scenarios/spider_scenario.py +18 -0
  42. helm/benchmark/scenarios/thai_exam_scenario.py +95 -0
  43. helm/benchmark/scenarios/wmt_14_scenario.py +9 -2
  44. helm/benchmark/static/schema_long_context.yaml +12 -31
  45. helm/benchmark/static_build/assets/audio-table-Dn5NMMeJ.png +0 -0
  46. helm/benchmark/static_build/assets/index-qOFpOyHb.js +10 -0
  47. helm/benchmark/static_build/assets/react-BteFIppM.js +85 -0
  48. helm/benchmark/static_build/assets/recharts-DxuQtTOs.js +97 -0
  49. helm/benchmark/static_build/assets/tremor-DR4fE7ko.js +10 -0
  50. helm/benchmark/static_build/index.html +5 -6
  51. helm/clients/ai21_client.py +2 -0
  52. helm/clients/aleph_alpha_client.py +2 -0
  53. helm/clients/anthropic_client.py +7 -1
  54. helm/clients/audio_language/diva_llama_client.py +2 -0
  55. helm/clients/audio_language/llama_omni_client.py +2 -1
  56. helm/clients/audio_language/qwen2_5_omni_client.py +2 -1
  57. helm/clients/audio_language/qwen2_audiolm_client.py +2 -1
  58. helm/clients/audio_language/qwen_audiolm_client.py +2 -1
  59. helm/clients/bedrock_client.py +2 -0
  60. helm/clients/cohere_client.py +3 -0
  61. helm/clients/google_client.py +2 -0
  62. helm/clients/http_model_client.py +2 -0
  63. helm/clients/huggingface_client.py +2 -1
  64. helm/clients/ibm_client.py +3 -1
  65. helm/clients/image_generation/adobe_vision_client.py +2 -0
  66. helm/clients/image_generation/aleph_alpha_image_generation_client.py +2 -0
  67. helm/clients/image_generation/cogview2_client.py +2 -1
  68. helm/clients/image_generation/dalle2_client.py +2 -0
  69. helm/clients/image_generation/dalle_mini_client.py +2 -1
  70. helm/clients/image_generation/deep_floyd_client.py +2 -0
  71. helm/clients/image_generation/huggingface_diffusers_client.py +2 -1
  72. helm/clients/image_generation/lexica_client.py +2 -0
  73. helm/clients/image_generation/mindalle_client.py +2 -1
  74. helm/clients/image_generation/together_image_generation_client.py +2 -0
  75. helm/clients/megatron_client.py +2 -0
  76. helm/clients/mistral_client.py +2 -0
  77. helm/clients/moderation_api_client.py +2 -0
  78. helm/clients/openai_client.py +5 -1
  79. helm/clients/palmyra_client.py +2 -1
  80. helm/clients/reka_client.py +2 -1
  81. helm/clients/stanfordhealthcare_azure_openai_client.py +2 -2
  82. helm/clients/stanfordhealthcare_http_model_client.py +2 -0
  83. helm/clients/together_client.py +4 -0
  84. helm/clients/vertexai_client.py +4 -0
  85. helm/clients/vision_language/huggingface_vision2seq_client.py +2 -1
  86. helm/clients/vision_language/huggingface_vlm_client.py +2 -0
  87. helm/clients/vision_language/idefics_client.py +2 -1
  88. helm/clients/vision_language/open_flamingo_client.py +2 -1
  89. helm/clients/vision_language/paligemma_client.py +2 -1
  90. helm/clients/vision_language/palmyra_vision_client.py +2 -0
  91. helm/clients/vision_language/qwen2_vlm_client.py +2 -1
  92. helm/clients/vision_language/qwen_vlm_client.py +2 -1
  93. helm/clients/writer_client.py +2 -0
  94. helm/common/hierarchical_logger.py +20 -0
  95. helm/common/optional_dependencies.py +1 -1
  96. helm/common/test_general.py +4 -0
  97. helm/config/model_deployments.yaml +225 -0
  98. helm/config/model_metadata.yaml +232 -7
  99. helm/config/tokenizer_configs.yaml +74 -4
  100. helm/benchmark/static_build/assets/index-671a5e06.js +0 -10
  101. helm/benchmark/static_build/assets/react-f82877fd.js +0 -85
  102. helm/benchmark/static_build/assets/recharts-4037aff0.js +0 -97
  103. helm/benchmark/static_build/assets/tremor-38a10867.js +0 -10
  104. {crfm_helm-0.5.8.dist-info → crfm_helm-0.5.9.dist-info}/WHEEL +0 -0
  105. {crfm_helm-0.5.8.dist-info → crfm_helm-0.5.9.dist-info}/entry_points.txt +0 -0
  106. {crfm_helm-0.5.8.dist-info → crfm_helm-0.5.9.dist-info}/licenses/LICENSE +0 -0
  107. {crfm_helm-0.5.8.dist-info → crfm_helm-0.5.9.dist-info}/top_level.txt +0 -0
  108. /helm/benchmark/static_build/assets/{air-overview-d2e6c49f.png → air-overview-DpBbyagA.png} +0 -0
  109. /helm/benchmark/static_build/assets/{crfm-logo-74391ab8.png → crfm-logo-Du4T1uWZ.png} +0 -0
  110. /helm/benchmark/static_build/assets/{heim-logo-3e5e3aa4.png → heim-logo-BJtQlEbV.png} +0 -0
  111. /helm/benchmark/static_build/assets/{helm-logo-simple-2ed5400b.png → helm-logo-simple-DzOhNN41.png} +0 -0
  112. /helm/benchmark/static_build/assets/{helm-safety-2907a7b6.png → helm-safety-COfndXuS.png} +0 -0
  113. /helm/benchmark/static_build/assets/{helmhero-28e90f4d.png → helmhero-D9TvmJsp.png} +0 -0
  114. /helm/benchmark/static_build/assets/{index-9352595e.css → index-oIeiQW2g.css} +0 -0
  115. /helm/benchmark/static_build/assets/{medhelm-overview-eac29843.png → medhelm-overview-CND0EIsy.png} +0 -0
  116. /helm/benchmark/static_build/assets/{medhelm-v1-overview-3ddfcd65.png → medhelm-v1-overview-Cu2tphBB.png} +0 -0
  117. /helm/benchmark/static_build/assets/{overview-74aea3d8.png → overview-BwypNWnk.png} +0 -0
  118. /helm/benchmark/static_build/assets/{process-flow-bd2eba96.png → process-flow-DWDJC733.png} +0 -0
  119. /helm/benchmark/static_build/assets/{vhelm-aspects-1437d673.png → vhelm-aspects-NiDQofvP.png} +0 -0
  120. /helm/benchmark/static_build/assets/{vhelm-framework-a1ca3f3f.png → vhelm-framework-NxJE4fdA.png} +0 -0
  121. /helm/benchmark/static_build/assets/{vhelm-model-8afb7616.png → vhelm-model-ypCL5Yvq.png} +0 -0
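
A recurring change across the scenario files listed above is a new get_metadata() method on each scenario class, returning a ScenarioMetadata populated with a TaxonomyInfo (both imports appear in the hunks below). The following is a minimal, hedged sketch of that pattern: the class name and all field values are hypothetical and only illustrate the shape of the call, assuming crfm-helm 0.5.9 is installed.

# Sketch of the get_metadata() pattern added to many scenarios in this release.
# "MyToyScenario" and every field value below are hypothetical; the import paths
# and keyword argument names are taken from the hunks in this diff.
from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
from helm.benchmark.scenarios.scenario import ScenarioMetadata


class MyToyScenario:
    """Hypothetical scenario class; the real ones subclass helm's Scenario."""

    def get_metadata(self) -> ScenarioMetadata:
        # Field names mirror the calls added in this diff
        # (name, display_name, description, taxonomy, main_metric, main_split).
        return ScenarioMetadata(
            name="my_toy_scenario",
            display_name="My Toy Scenario",
            description="Hypothetical scenario used only to illustrate the pattern.",
            taxonomy=TaxonomyInfo(
                task="question answering",
                what="toy documents",
                when="2024",
                who="example authors",
                language="English",
            ),
            main_metric="exact_match",
            main_split="test",
        )


if __name__ == "__main__":
    print(MyToyScenario().get_metadata())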
@@ -134,8 +134,14 @@ class MMLUScenario(Scenario):
             short_display_name="MMLU",
             description="The Massive Multitask Language Understanding (MMLU) benchmark for "
             "knowledge-intensive question answering across 57 domains [(Hendrycks et al., "
-            "2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).",
-            taxonomy=TaxonomyInfo(task="question answering", what="?", when="?", who="?", language="English"),
+            "2021)](https://arxiv.org/pdf/2009.03300.pdf).",
+            taxonomy=TaxonomyInfo(
+                task="multiple-choice question answering",
+                what="math, science, history, etc.",
+                when="before 2021",
+                who="various online sources",
+                language="English",
+            ),
             main_metric="exact_match",
             main_split="test",
         )
@@ -162,14 +162,13 @@ class NarrativeQAScenario(Scenario):
         return ScenarioMetadata(
             name="narrative_qa",
             display_name="NarrativeQA",
-            short_display_name=None,
             description="The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský "
             "et al., 2017)](https://aclanthology.org/Q18-1023/).",
             taxonomy=TaxonomyInfo(
-                task="question answering",
+                task="short-answer question answering",
                 what="passages are books and movie scripts, questions are unknown",
-                when="?",
-                who="?",
+                when="2018",
+                who="annotators from summaries",
                 language="English",
             ),
             main_metric="f1_score",
@@ -6,6 +6,7 @@ from typing import List, Optional
 import datasets
 import tiktoken
 
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.benchmark.scenarios.scenario import (
     CORRECT_TAG,
     Output,
@@ -14,6 +15,7 @@ from helm.benchmark.scenarios.scenario import (
     Instance,
     TEST_SPLIT,
     Input,
+    ScenarioMetadata,
 )
 from helm.common.general import ensure_directory_exists
 
@@ -77,3 +79,16 @@ class OpenAIMRCRScenario(Scenario):
             instances.append(instance)
 
         return instances
+
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name="openai_mrcr",
+            display_name="OpenAI MRCR",
+            description="OpenAI MRCR (Multi-round co-reference resolution) is a long context dataset "
+            "for benchmarking an LLM's ability to distinguish between multiple needles "
+            "hidden in context. This eval is inspired by the MRCR eval first introduced by "
+            "[Vodrahalli et al., 2024](https://arxiv.org/pdf/2409.12640v2).",
+            taxonomy=TaxonomyInfo(task="MRCR", what="Synthetic data", when="2025", who="None", language="English"),
+            main_metric="openai_mrcr_accuracy",
+            main_split="test",
+        )
@@ -1,6 +1,7 @@
 import os
 from typing import List
 
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.common.general import ensure_directory_exists, ensure_file_downloaded
 from helm.benchmark.scenarios.ruler_qa_scenario_helper import generate_samples # type: ignore
 from helm.benchmark.scenarios.scenario import (
@@ -11,6 +12,7 @@ from helm.benchmark.scenarios.scenario import (
     CORRECT_TAG,
     Input,
     Output,
+    ScenarioMetadata,
 )
 
 
@@ -78,6 +80,25 @@ class RULERHotpotQAScenario(_RULERQAScenario):
     def __init__(self, max_num_words: int):
         super().__init__("hotpotqa", max_num_words)
 
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name="ruler_hotpotqa",
+            display_name="RULER HotPotQA",
+            description="RULER HotPotQA is an augmented version of HotPotQA ([Yang et al., "
+            "2018](https://arxiv.org/abs/1809.09600)) introduced by [Hsieh et al., "
+            "2024](https://arxiv.org/abs/2404.06654) to simulate a multi-hop question "
+            "answering as a long-context scenario.",
+            taxonomy=TaxonomyInfo(
+                task="question answering with retrieval-augmented generation",
+                what="Wikipedia articles",
+                when="Before 2018",
+                who="Wikipedia authors",
+                language="English",
+            ),
+            main_metric="ruler_string_match_part",
+            main_split="valid",
+        )
+
 
 class RULERSQuADScenario(_RULERQAScenario):
     name = "ruler_squad"
@@ -86,3 +107,22 @@ class RULERSQuADScenario(_RULERQAScenario):
 
     def __init__(self, max_num_words: int):
         super().__init__("squad", max_num_words)
+
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name="ruler_squad",
+            display_name="RULER SQuAD",
+            description="RULER SQuAD is an augmented version of SQuAD ([Rajpurkar et al., "
+            "2018](https://arxiv.org/abs/1806.03822)) introduced by [Hsieh et al., "
+            "2024](https://arxiv.org/abs/2404.06654) to simulate a single-hop question "
+            "answering as a long-context scenario.",
+            taxonomy=TaxonomyInfo(
+                task="question answering",
+                what="Wikipedia articles",
+                when="Before 2018",
+                who="Wikipedia authors and crowdworkers",
+                language="English",
+            ),
+            main_metric="ruler_string_match_part",
+            main_split="valid",
+        )
@@ -1,7 +1,8 @@
 from typing import List
 from datasets import load_dataset
 
-from helm.benchmark.scenarios.scenario import Scenario, Instance, Input, TEST_SPLIT, Reference, Output
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
+from helm.benchmark.scenarios.scenario import Scenario, Instance, Input, TEST_SPLIT, Reference, Output, ScenarioMetadata
 
 
 class SimpleSafetyTestsScenario(Scenario):
@@ -31,3 +32,13 @@ class SimpleSafetyTestsScenario(Scenario):
             instance = Instance(input=input, references=references, split=TEST_SPLIT)
             instances.append(instance)
         return instances
+
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name="simple_safety_tests",
+            display_name="SimpleSafetyTests",
+            description="SimpleSafetyTests",
+            taxonomy=TaxonomyInfo(task="question answering", what="n/a", when="n/a", who="n/a", language="English"),
+            main_metric="safety_score",
+            main_split="test",
+        )
@@ -4,6 +4,7 @@ from typing import Dict, List
 
 from filelock import FileLock
 
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.common.general import ensure_directory_exists, ensure_file_downloaded, shell
 from helm.common.hierarchical_logger import hlog
 from helm.benchmark.scenarios.bird_sql_scenario_helper import ( # type: ignore
@@ -17,6 +18,7 @@ from helm.benchmark.scenarios.scenario import (
     VALID_SPLIT,
     Input,
     Output,
+    ScenarioMetadata,
 )
 
 
@@ -89,3 +91,19 @@ INSERT_YOUR_SQL_QUERY_HERE
             )
             instances.append(instance)
         return instances
+
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name="spider",
+            display_name="Spider 1.0 (Test)",
+            description="Spider 1.0 (Test)",
+            taxonomy=TaxonomyInfo(
+                task="text-to-SQL",
+                what="databases from various domains",
+                when="?",
+                who="expert data scientists",
+                language="English",
+            ),
+            main_metric="execution_accuracy",
+            main_split="valid",
+        )
@@ -2,6 +2,7 @@ import os
 from typing import Dict, List
 import json
 
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.common.general import ensure_file_downloaded
 from helm.common.hierarchical_logger import hlog
 from helm.benchmark.scenarios.scenario import (
@@ -13,6 +14,7 @@ from helm.benchmark.scenarios.scenario import (
     CORRECT_TAG,
     Input,
     Output,
+    ScenarioMetadata,
 )
 
 
@@ -142,3 +144,96 @@ class ThaiExamScenario(Scenario):
             instances.extend(self.process_jsonl(jsonl_path, splits[split]))
 
         return instances
+
+    def get_metadata(self) -> ScenarioMetadata:
+        if self.exam == "onet":
+            return ScenarioMetadata(
+                name="thai_exam_onet",
+                display_name="ONET",
+                description="The Ordinary National Educational Test (ONET) is an examination for students "
+                "in Thailand. We select the grade-12 ONET exam, which comprises 5 subjects and "
+                "each question has 5 choices. These subjects are Thai, English, Mathematics, "
+                "Social Studies, and Science. Amounting to a total of 170 questions and "
+                "options.\n",
+                taxonomy=TaxonomyInfo(
+                    task="question answering",
+                    what="high school / medical school academic knowledge",
+                    when="?",
+                    who="n/a",
+                    language="Thai and English",
+                ),
+                main_metric="exact_match",
+                main_split="test",
+            )
+        elif self.exam == "ic":
+            return ScenarioMetadata(
+                name="thai_exam_ic",
+                display_name="IC",
+                description="The Investment Consultant (IC) examination, a licensing test for investment "
+                "professionals in Thailand. Developed by the Stock Exchange of Thailand (SET), "
+                "features 4 choices per question. We extracted questions for levels 1, 2, and 3 "
+                "resulting in a total of 95 questions and options.\n",
+                taxonomy=TaxonomyInfo(
+                    task="question answering",
+                    what="licensing for investment professionals",
+                    when="?",
+                    who="n/a",
+                    language="Thai",
+                ),
+                main_metric="exact_match",
+                main_split="test",
+            )
+        elif self.exam == "tgat":
+            return ScenarioMetadata(
+                name="thai_exam_tgat",
+                display_name="TGAT",
+                description="The Thai General Aptitude Test (TGAT), a national high school examination in "
+                "Thailand. Focuses on critical and logical thinking skills. We collected a "
+                "total of 90 questions and answers. The TGAT consists of four choices per "
+                "question.\n",
+                taxonomy=TaxonomyInfo(
+                    task="question answering",
+                    what="high school level questions on reasoning",
+                    when="?",
+                    who="n/a",
+                    language="English",
+                ),
+                main_metric="exact_match",
+                main_split="test",
+            )
+        elif self.exam == "tpat1":
+            return ScenarioMetadata(
+                name="thai_exam_tpat1",
+                display_name="TPAT-1",
+                description="TBD",
+                taxonomy=TaxonomyInfo(
+                    task="question answering",
+                    what="high school / medical school academic knowledge",
+                    when="?",
+                    who="n/a",
+                    language="Thai",
+                ),
+                main_metric="exact_match",
+                main_split="test",
+            )
+        elif self.exam == "a_level":
+            return ScenarioMetadata(
+                name="thai_exam_a_level",
+                display_name="A-Level",
+                description="An academic knowledge assessment examination (Applied Knowledge Level) that "
+                "covers general foundational subjects taught in schools. The content assessed "
+                "in this examination aligns with the curriculum guidelines and emphasizes the "
+                "practical application of knowledge in daily life. We collected a total of 175 "
+                "questions and answers.\n",
+                taxonomy=TaxonomyInfo(
+                    task="question answering",
+                    what="high school academic knowledge",
+                    when="?",
+                    who="n/a",
+                    language="Thai and English",
+                ),
+                main_metric="exact_match",
+                main_split="test",
+            )
+        else:
+            raise ValueError(f"Unknown exam: {self.exam}")
@@ -113,8 +113,15 @@ class WMT14Scenario(Scenario):
         return ScenarioMetadata(
             name="wmt_14",
             display_name="WMT 2014",
-            description="WMT 2014 is a collection of machine translation datasets.",
-            taxonomy=TaxonomyInfo(task="machine translation", what="n/a", when="n/a", who="n/a", language="English"),
+            description="WMT 2014 is a collection of machine translation datasets "
+            "[(website)](https://www.statmt.org/wmt14/index.html).",
+            taxonomy=TaxonomyInfo(
+                task="machine translation",
+                what="multilingual sentences",
+                when="before 2014",
+                who="Europarl, news, Common Crawl, etc.",
+                language="English, French, Czech, etc.",
+            ),
             main_metric="bleu_4",
             main_split="test",
         )
@@ -191,16 +191,15 @@ run_groups:
     description: Scenarios for evaluating long context capabilities
     category: All scenarios
    subgroups:
-      - ruler_hotpotqa
       - ruler_squad
-      - infinite_bench_en_sum
-      # - infinite_bench_en_qa
+      - ruler_hotpotqa
       - infinite_bench_en_mc
+      - infinite_bench_en_sum
       - openai_mrcr
 
-  - name: ruler_hotpotqa
-    display_name: RULER HotPotQA
-    description: RULER HotPotQA is an augmented version of HotPotQA ([Yang et al., 2018](https://arxiv.org/abs/1809.09600)) introduced by [Hsieh et al., 2024](https://arxiv.org/abs/2404.06654) to simulate a multi-hop question answering as a long-context scenario.
+  - name: ruler_squad
+    display_name: RULER SQuAD
+    description: RULER SQuAD is an augmented version of SQuAD ([Rajpurkar et al., 2018](https://arxiv.org/abs/1806.03822)) introduced by [Hsieh et al., 2024](https://arxiv.org/abs/2404.06654) to simulate a single-hop question answering as a long-context scenario.
     metric_groups:
       - accuracy
       - general_information
@@ -209,16 +208,15 @@ run_groups:
       main_name: ruler_string_match_part
       main_split: valid
     taxonomy:
-      task: question answering with retrieval-augmented generation
+      task: question answering
       what: Wikipedia articles
-      who: Wikipedia authors
+      who: Wikipedia authors and crowdworkers
       when: Before 2018
       language: English
 
-
-  - name: ruler_squad
-    display_name: RULER SQuAD
-    description: RULER SQuAD is an augmented version of SQuAD ([Rajpurkar et al., 2018](https://arxiv.org/abs/1806.03822)) introduced by [Hsieh et al., 2024](https://arxiv.org/abs/2404.06654) to simulate a single-hop question answering as a long-context scenario.
+  - name: ruler_hotpotqa
+    display_name: RULER HotPotQA
+    description: RULER HotPotQA is an augmented version of HotPotQA ([Yang et al., 2018](https://arxiv.org/abs/1809.09600)) introduced by [Hsieh et al., 2024](https://arxiv.org/abs/2404.06654) to simulate a multi-hop question answering as a long-context scenario.
     metric_groups:
       - accuracy
       - general_information
@@ -227,29 +225,12 @@ run_groups:
       main_name: ruler_string_match_part
       main_split: valid
     taxonomy:
-      task: question answering
+      task: question answering with retrieval-augmented generation
       what: Wikipedia articles
-      who: Wikipedia authors and crowdworkers
+      who: Wikipedia authors
       when: Before 2018
       language: English
 
-  # - name: infinite_bench_en_qa
-  #   display_name: ∞Bench En.QA
-  #   description: ∞Bench En.QA is a open-ended question answering task that requires locating and processing information within a novel, performing reasoning through aggregation or filtering to derive answers. ([Zhang et al., 2024](https://arxiv.org/abs/2402.13718))
-  #   metric_groups:
-  #     - accuracy
-  #     - general_information
-  #     - annotation_metrics
-  #   environment:
-  #     main_name: f1_score
-  #     main_split: test
-  #   taxonomy:
-  #     task: question answering
-  #     what: Novels
-  #     who: Novel authors
-  #     when: Before 2024
-  #     language: English
-
 
   - name: infinite_bench_en_mc
     display_name: ∞Bench En.MC
     description: ∞Bench En.MC is a multiple-choice question answering task that requires locating and processing information within a novel, performing reasoning through aggregation or filtering to derive answers. ([Zhang et al., 2024](https://arxiv.org/abs/2402.13718))