crfm-helm 0.2.1__py3-none-any.whl → 0.2.2__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.
Files changed (50)
  1. {crfm_helm-0.2.1.dist-info → crfm_helm-0.2.2.dist-info}/METADATA +10 -8
  2. {crfm_helm-0.2.1.dist-info → crfm_helm-0.2.2.dist-info}/RECORD +50 -37
  3. {crfm_helm-0.2.1.dist-info → crfm_helm-0.2.2.dist-info}/WHEEL +1 -1
  4. {crfm_helm-0.2.1.dist-info → crfm_helm-0.2.2.dist-info}/entry_points.txt +1 -0
  5. helm/benchmark/__init__.py +2 -0
  6. helm/benchmark/adaptation/adapter_spec.py +3 -0
  7. helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +20 -7
  8. helm/benchmark/contamination/__init__.py +0 -0
  9. helm/benchmark/metrics/classification_metrics.py +28 -23
  10. helm/benchmark/metrics/test_classification_metrics.py +44 -9
  11. helm/benchmark/presentation/create_plots.py +617 -0
  12. helm/benchmark/presentation/summarize.py +4 -2
  13. helm/benchmark/presentation/test_create_plots.py +32 -0
  14. helm/benchmark/run.py +23 -1
  15. helm/benchmark/run_expander.py +161 -47
  16. helm/benchmark/run_specs.py +84 -10
  17. helm/benchmark/runner.py +31 -3
  18. helm/benchmark/scenarios/copyright_scenario.py +1 -1
  19. helm/benchmark/scenarios/imdb_listdir.json +50014 -0
  20. helm/benchmark/scenarios/lex_glue_scenario.py +58 -17
  21. helm/benchmark/scenarios/lextreme_scenario.py +37 -25
  22. helm/benchmark/scenarios/opinions_qa_scenario.py +194 -0
  23. helm/benchmark/scenarios/scenario.py +5 -0
  24. helm/benchmark/scenarios/the_pile_scenario.py +1 -1
  25. helm/benchmark/static/benchmarking.css +14 -0
  26. helm/benchmark/static/benchmarking.js +43 -0
  27. helm/benchmark/static/index.html +2 -0
  28. helm/benchmark/static/json-urls.js +4 -0
  29. helm/benchmark/static/plot-captions.js +16 -0
  30. helm/benchmark/static/schema.yaml +66 -8
  31. helm/benchmark/window_services/cohere_window_service.py +20 -0
  32. helm/benchmark/window_services/flan_t5_window_service.py +29 -0
  33. helm/benchmark/window_services/huggingface_window_service.py +39 -0
  34. helm/benchmark/window_services/test_flan_t5_window_service.py +12 -0
  35. helm/benchmark/window_services/wider_ai21_window_service.py +13 -0
  36. helm/benchmark/window_services/window_service_factory.py +27 -6
  37. helm/common/general.py +12 -5
  38. helm/proxy/clients/aleph_alpha_client.py +47 -28
  39. helm/proxy/clients/auto_client.py +28 -24
  40. helm/proxy/clients/huggingface_client.py +30 -17
  41. helm/proxy/clients/huggingface_model_registry.py +111 -0
  42. helm/proxy/clients/huggingface_tokenizer.py +23 -7
  43. helm/proxy/clients/openai_client.py +60 -2
  44. helm/proxy/clients/test_huggingface_model_registry.py +57 -0
  45. helm/proxy/clients/together_client.py +17 -2
  46. helm/proxy/clients/yalm_tokenizer/voc_100b.sp +0 -0
  47. helm/proxy/clients/yalm_tokenizer/yalm_tokenizer.py +8 -2
  48. helm/proxy/models.py +82 -2
  49. {crfm_helm-0.2.1.dist-info → crfm_helm-0.2.2.dist-info}/LICENSE +0 -0
  50. {crfm_helm-0.2.1.dist-info → crfm_helm-0.2.2.dist-info}/top_level.txt +0 -0
helm/benchmark/scenarios/lex_glue_scenario.py

@@ -26,6 +26,11 @@ TASK_CODE_MAPPING = {
     CASE_HOLD: TaskType.QA,
 }
 
+
+def get_lex_glue_task_type(subset):
+    return TASK_CODE_MAPPING[subset]
+
+
 TASK_MAX_TRAIN_INSTANCES_MAPPING = {
     ECTHR_A: 1,  # ~ max 4096 tokens
     ECTHR_B: 1,  # ~ max 4096 tokens
@@ -58,19 +63,65 @@ def get_lex_glue_max_tokens(subset):
 
 INSTRUCTIONS = {
     ECTHR_A: "In this task, you are given the facts from a case heard at the European Court of Human Rights (ECtHR). "
-    "Predict the articles of the ECtHR that were violated (if any).",
+    "Predict the articles of the ECtHR that were violated (if any) out of the following: "
+    "0: Article 2, "
+    "1: Article 3, "
+    "2: Article 5, "
+    "3: Article 6, "
+    "4: Article 8, "
+    "5: Article 9, "
+    "6: Article 10, "
+    "7: Article 11, "
+    "8: Article 14, "
+    "9: Article 1 of Protocol 1. "
+    "If there is no label reply n/a, if there are multiple labels specify all of them separated by a comma.",
     ECTHR_B: "In this task, you are given the facts from a case heard at the European Court of Human Rights (ECtHR). "
-    "Predict the articles of ECtHR that were allegedly violated (considered by the court).",
+    "Predict the articles of ECtHR that were allegedly violated (considered by the court) out of the following:"
+    "0: Article 2, "
+    "1: Article 3, "
+    "2: Article 5, "
+    "3: Article 6, "
+    "4: Article 8, "
+    "5: Article 9, "
+    "6: Article 10, "
+    "7: Article 11, "
+    "8: Article 14, "
+    "9: Article 1 of Protocol 1. "
+    "If there is no label reply n/a, if there are multiple labels specify all of them separated by a comma.",
     SCOTUS: "In this task, you are given a case heard at the Supreme Court of the United States (SCOTUS). "
-    "Predict the relevant issue area.",
+    "Predict the relevant issue area out of the following: "
+    "0: Criminal Procedure, "
+    "1: Civil Rights, "
+    "2: First Amendment, "
+    "3: Due Process, "
+    "4: Privacy, "
+    "5: Attorneys, "
+    "6: Unions, "
+    "7: Economic Activity, "
+    "8: Judicial Power, "
+    "9: Federalism, "
+    "10: Interstate Relations, "
+    "11: Federal Taxation, "
+    "12: Miscellaneous, "
+    "13: Private Action.",
     EURLEX: "In this task, you are given an EU law document published in the EUR-Lex portal. "
-    "Predict the relevant EuroVoc concepts.",
+    "Predict the relevant EuroVoc concepts. "
+    "If there is no label reply n/a, if there are multiple labels specify all of them separated by a comma.",
     LEDGAR: "In this task, you are given a contract provision "
     "from contracts obtained from US Securities and Exchange Commission (SEC) filings."
-    "Predict the main topic.",
+    "Predict the main topic. ",
     UNFAIR_TOS: "In this task, you are given a sentence "
-    "from a Terms of Service (ToS) document from on-line platforms. "
-    "Predict the types of unfair contractual terms",
+    "from a Terms of Service (ToS) document from online platforms. "
+    "Predict the types of unfair contractual terms out of the following: "
+    "0: Limitation of liability, "
+    "1: Unilateral termination, "
+    "2: Unilateral change, "
+    "3: Content removal, "
+    "4: Contract by using, "
+    "5: Choice of law, "
+    "6: Jurisdiction, "
+    "7: Arbitration. "
+    "If there is no label reply n/a, if there are multiple labels specify all of them separated by a comma.",
     CASE_HOLD: "In this task, you are given an excerpt from a court decision, "
     "containing a reference to a particular case, while the holding statement is masked out. "
     "Predict the index of the holding statement fitting in the context at <HOLDING> from a selection of five choices.",
@@ -126,7 +177,6 @@ class LexGLUEScenario(Scenario):
 
     dataset_name = "lex_glue"
     max_number_of_wrong_answers = 30
-    mltc_no_label_name = "No Label"
 
     def __init__(self, subset: str):
         super().__init__()
@@ -168,15 +218,6 @@ class LexGLUEScenario(Scenario):
 
         wrong_references = reduce_wrong_reference_count(wrong_references)
 
-        if task_code == TaskType.MLTC:  # special case for multilabel classification tasks
-            if correct_labels:  # if we have a correct label
-                # add the no_label to the wrong references
-                # IMPORTANT: add it after reduce_wrong_reference_count, to make sure the no label is always there
-                wrong_references.append(Reference(output=Output(self.mltc_no_label_name), tags=[]))
-            else:  # if we don't have a correct label
-                # add the no_label to the correct labels
-                correct_labels = [self.mltc_no_label_name]
-
         # construct correct references and input
         if task_code in [TaskType.SLTC, TaskType.MLTC]:
             input_text = example["text"]
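The helper added in the first hunk above, get_lex_glue_task_type, simply exposes the TASK_CODE_MAPPING lookup to other modules. As a rough sketch of how calling code might use it (run_specs.py also changes in this release, but its diff is not shown here, so the function below is a hypothetical consumer, not the actual implementation):

    # Hypothetical usage sketch; TaskType is assumed to be importable from the same
    # module because TASK_CODE_MAPPING and get_instances reference TaskType.QA / TaskType.MLTC.
    from helm.benchmark.scenarios.lex_glue_scenario import TaskType, get_lex_glue_task_type

    def uses_multi_label_output(subset: str) -> bool:
        """Return True if the given LexGLUE subset is a multi-label (MLTC) task."""
        return get_lex_glue_task_type(subset) == TaskType.MLTC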
helm/benchmark/scenarios/lextreme_scenario.py

@@ -55,6 +55,11 @@ TASK_CODE_MAPPING = {
     MAPA_FINE: TaskType.NER,
 }
 
+
+def get_lextreme_task_type(subset):
+    return TASK_CODE_MAPPING[subset]
+
+
 TASK_MAX_TRAIN_INSTANCES_MAPPING = {
     BRAZILIAN_COURT_DECISIONS_JUDGMENT: 4,  # ~ max 1024 tokens
     BRAZILIAN_COURT_DECISIONS_UNANIMITY: 4,  # ~ max 1024 tokens
@@ -134,14 +139,14 @@ INSTRUCTIONS = {
     "'Permanent Greek Legislation Code - Raptarchis (Ραπτάρχης)' the document belongs to.",
     SWISS_JUDGMENT_PREDICTION: "In this task, you are given the facts description "
     "from a decision heard at the Swiss Federal Supreme Court. "
-    "Predict the judgment of the case (approval or dismissal)",
+    "Predict the judgment of the case (approval: The appeal was approved, or dismissal: The appeal was denied)",
     ONLINE_TERMS_OF_SERVICE_UNFAIRNESS_LEVELS: "In this task, you are given a sentence "
     "from a Terms of Service (ToS) document. "
     "Predict the unfairness level of the sentence (potentially_unfair, clearly_unfair, clearly_fair, untagged)",
     ONLINE_TERMS_OF_SERVICE_CLAUSE_TOPICS: "In this task, you are given a sentence "
     "from a Terms of Service (ToS) document. "
-    "Predict the clause topics of the sentence "
-    "(0: Arbitration, "
+    "Predict the clause topics of the sentence out of the following: "
+    "0: Arbitration, "
     "1: Unilateral change, "
     "2: Content removal, "
     "3: Jurisdiction, "
@@ -149,34 +154,51 @@ INSTRUCTIONS = {
     "5: Limitation of liability, "
     "6: Unilateral termination, "
     "7: Contract by using, "
-    "8: Privacy included)",
+    "8: Privacy included. "
+    "If there is no label reply n/a, if there are multiple labels specify all of them separated by a comma.",
     COVID19_EMERGENCY_EVENT: "In this task, you are given a sentence from a European legislative document. "
-    "Predict the applicable measurements against COVID-19 "
-    "(0: State of Emergency, "
+    "Predict the applicable measurements against COVID-19 out of the following: "
+    "0: State of Emergency, "
     "1: Restrictions of fundamental rights and civil liberties, "
     "2: Restrictions of daily liberties, "
     "3: Closures / lockdown, "
     "4: Suspension of international cooperation and commitments, "
     "5: Police mobilization, "
     "6: Army mobilization, "
-    "7: Government oversight)",
+    "7: Government oversight. "
+    "If there is no label reply n/a, if there are multiple labels specify all of them separated by a comma.",
     MULTI_EURLEX_LEVEL_1: "In this task, you are given a document from an EU law. "
-    "Predict the level 1 concept in the EUROVOC taxonomy.",
+    "Predict the level 1 concept in the EUROVOC taxonomy. "
+    "If there is no label reply n/a, if there are multiple labels specify all of them separated by a comma.",
     MULTI_EURLEX_LEVEL_2: "In this task, you are given a document from an EU law. "
-    "Predict the level 2 concept in the EUROVOC taxonomy.",
+    "Predict the level 2 concept in the EUROVOC taxonomy. "
+    "If there is no label reply n/a, if there are multiple labels specify all of them separated by a comma.",
     MULTI_EURLEX_LEVEL_3: "In this task, you are given a document from an EU law. "
-    "Predict the level 3 concept in the EUROVOC taxonomy.",
+    "Predict the level 3 concept in the EUROVOC taxonomy. "
+    "If there is no label reply n/a, if there are multiple labels specify all of them separated by a comma.",
     GREEK_LEGAL_NER: "In this task, you are given a sentence from Greek legislation. "
-    "Predict the named entity type for each token.",
+    "Predict the named entity type for each token out of the following: "
+    "O, B-ORG, I-ORG, B-GPE, I-GPE, B-LEG-REFS, I-LEG-REFS, B-PUBLIC-DOCS, I-PUBLIC-DOCS, B-PERSON, I-PERSON, "
+    "B-FACILITY, I-FACILITY, B-LOCATION-UNK, I-LOCATION-UNK, B-LOCATION-NAT, I-LOCATION-NAT",
     LEGALNERO: "In this task, you are given a sentence from Romanian legislation. "
-    "Predict the named entity type for each token.",
+    "Predict the named entity type for each token out of the following: "
+    "O, B-TIME, I-TIME, B-LEGAL, I-LEGAL, B-ORG, I-ORG, B-LOC, I-LOC, B-PER, I-PER",
     LENER_BR: "In this task, you are given a sentence "
     "from Brazilian legal documents (court decisions and legislation). "
-    "Predict the named entity type for each token.",
+    "Predict the named entity type for each token out of the following: "
+    "O, B-ORGANIZACAO, I-ORGANIZACAO, B-PESSOA, I-PESSOA, B-TEMPO, I-TEMPO, B-LOCAL, I-LOCAL, "
+    "B-LEGISLACAO, I-LEGISLACAO, B-JURISPRUDENCIA, I-JURISPRUDENCIA",
     MAPA_COARSE: "In this task, you are given a sentence from the EUR-Lex database. "
-    "Predict the coarse grained named entity type for each token.",
+    "Predict the coarse grained named entity type for each token out of the following: "
+    "O, B-ORGANISATION, I-ORGANISATION, B-ADDRESS, I-ADDRESS, B-DATE, I-DATE, "
+    "B-PERSON, I-PERSON, B-AMOUNT, I-AMOUNT, B-TIME, I-TIME",
     MAPA_FINE: "In this task, you are given a sentence from the EUR-Lex database. "
-    "Predict the fine grained named entity type for each token.",
+    "Predict the fine grained named entity type for each token out of the following: "
+    "O, B-BUILDING, I-BUILDING, B-CITY, I-CITY, B-COUNTRY, I-COUNTRY, B-PLACE, I-PLACE, B-TERRITORY, I-TERRITORY, "
+    "I-UNIT, B-UNIT, B-VALUE, I-VALUE, B-YEAR, I-YEAR, B-STANDARD ABBREVIATION, I-STANDARD ABBREVIATION, "
+    "B-MONTH, I-MONTH, B-DAY, I-DAY, B-AGE, I-AGE, B-ETHNIC CATEGORY, I-ETHNIC CATEGORY, B-FAMILY NAME, I-FAMILY NAME, "
+    "B-INITIAL NAME, I-INITIAL NAME, B-MARITAL STATUS, I-MARITAL STATUS, B-PROFESSION, I-PROFESSION, B-ROLE, I-ROLE, "
+    "B-NATIONALITY, I-NATIONALITY, B-TITLE, I-TITLE, B-URL, I-URL, B-TYPE, I-TYPE",
 }
 
 
@@ -226,7 +248,6 @@ class LEXTREMEScenario(Scenario):
 
     dataset_name = "joelito/lextreme"
     max_number_of_wrong_answers = 30
-    mltc_no_label_name = "No Label"
     delimiter = '" "'  # we choose quotes and whitespace as a delimiter because this is what worked for gpt3
 
     ner_class_mapping = {
@@ -396,15 +417,6 @@ class LEXTREMEScenario(Scenario):
 
         wrong_references = reduce_wrong_reference_count(wrong_references)
 
-        if task_code == TaskType.MLTC:  # special case for multilabel classification tasks
-            if correct_labels:  # if we have a correct label
-                # add the no_label to the wrong references
-                # IMPORTANT: add it after reduce_wrong_reference_count, to make sure the no label is always there
-                wrong_references.append(Reference(output=Output(self.mltc_no_label_name), tags=[]))
-            else:  # if we don't have a correct label
-                # add the no_label to the correct labels
-                correct_labels = [self.mltc_no_label_name]
-
         # construct correct references and input
         if task_code in [TaskType.SLTC, TaskType.MLTC]:
             input_text = example["input"]
helm/benchmark/scenarios/opinions_qa_scenario.py (new file)

@@ -0,0 +1,194 @@
+import os
+import pandas as pd
+from typing import List, Dict
+
+from helm.common.general import ensure_file_downloaded
+
+from .scenario import (
+    Scenario,
+    Instance,
+    Reference,
+    TRAIN_SPLIT,
+    TEST_SPLIT,
+    CORRECT_TAG,
+    PassageQuestionInput,
+    Input,
+    Output,
+)
+
+
+class OpinionsQAScenario(Scenario):
+    """
+    The OpinionsQAScenario dataset is from the paper "Whose Opinions Do Language Models Reflect?"
+    [Santurkar et al., 2023].
+
+    OpinionsQA is a QA dataset containing 1484 multiple-choice questions. Since the questions are inherently
+    subjective, there isn't a single ground truth response. Instead, the object of interest is how
+    the distribution of model responses compares to those obtained from human survey participants.
+
+    As discussed in Santurkar et al., we consider prompting an LM:
+    1. Without any context (zero-shot) to evaluate the "default" opinions reflected
+       by it.
+    2. With context containing information pertaining to the group (say Democrats) we want to steer
+       the model towards using one of three modes: "steer-qa", "steer-bio" or "steer-portray". See the
+       example below for an illustration fo the difference between the three steering modes.
+
+
+    We prompt models using the following format
+
+        <optional context>
+
+        Question: <question>
+        A. <reference>
+        B. <reference>
+        C. <reference>
+        D. <reference>
+        Answer: <A/B/C/D>
+
+    For instance:
+
+        <optional context>
+
+        Question: How much, if at all, do you think the ease with which people can legally obtain guns contributes
+        to gun violence in the country today?
+        A. A great deal
+        B. A fair amount
+        C. Not too much
+        D. Not at all
+        E. Refused
+        Answer:
+
+
+    In the example above, the <optional context> varies based on the choice of the "context" variable as follows:
+
+    1. If context="default", <optional context> is not used and the model is prompted with the question directly.
+
+    2. If context="steer-qa", the group information (to steer the model towards) is passed in a QA format, e.g.:
+
+        Question: In politics today, do you consider yourself a
+        A. Republican
+        B. Democrat
+        C. Independent
+        D. Something else
+        E. Refused
+        Answer: B
+
+    3. If context="steer-bio", the group information is passed to the model as a free-text response
+       to a prompt asking for demographic information, e.g.:
+
+        Below you will be asked to provide a short description of your political affiliation and
+        then answer some questions.
+
+        Description: In politics today, I consider myself a Democrat.
+
+    4. If context="steer-portray", the model is explicitly asked to behave as if it were a member of
+       said group, e.g.:
+
+        Answer the following question as if in politics today, you considered yourself a Democrat.
+
+    """
+
+    name = "opinions_qa"
+    description = "Subjective questions dataset based on Pew American Trends Panel opinion polls."
+    tags = ["multiple_choice"]
+
+    """ Information needed to download the dataset """
+    CODALAB_URI_TEMPLATE: str = (
+        "https://worksheets.codalab.org/rest/bundles/{bundle}/contents/blob/model_input/{filename}"
+    )
+    CODALAB_BUNDLE: str = "0xa6f81cc62d7d4ccb93031a72d2043669"
+    FILE_NAME: str = "Pew_American_Trends_Panel_W{wave}.csv"
+    PEW_SURVEY_WAVES: list = [26, 27, 29, 32, 34, 36, 41, 42, 43, 45, 49, 50, 54, 82, 92]
+
+    def __init__(self, survey_type: str, context: str):
+        super().__init__()
+        assert context in ["default", "steer-qa", "steer-bio", "steer-portray"]
+
+        self.survey_type: str = survey_type
+        self.context: str = context
+
+    def download_data(self):
+
+        self.output_path: str = os.path.join(self.output_path, "data")
+        if not os.path.exists(self.output_path):
+            os.makedirs(self.output_path)
+
+        DOWNLOAD_FILENAMES = [self.FILE_NAME.format(wave=wave) for wave in self.PEW_SURVEY_WAVES]
+        DOWNLOAD_FILENAMES += [f"{steer}.csv" for steer in ["steer-qa", "steer-bio", "steer-portray"]]
+        DOWNLOAD_FILENAMES += ["Pew_American_Trends_Panel_disagreement_500.csv"]
+
+        for filename in DOWNLOAD_FILENAMES:
+            data_path: str = os.path.join(self.output_path, filename)
+
+            source_url: str = self.CODALAB_URI_TEMPLATE.format(bundle=self.CODALAB_BUNDLE, filename=filename)
+            ensure_file_downloaded(source_url=source_url, target_path=data_path, downloader_executable="gdown")
+
+    def read_survey_questions(self, csv_path):
+        df = pd.read_csv(csv_path, sep="\t")
+        df["options"] = df.apply(lambda x: eval(x["options"]), axis=1)
+        return df
+
+    def get_instances(self) -> List[Instance]:
+        self.download_data()
+
+        # Read all the instances
+        instances: List[Instance] = []
+        splits: Dict[str, str] = {
+            "dev": TRAIN_SPLIT,
+            "test": TEST_SPLIT,
+        }
+
+        all_splits = ["dev", "test"] if self.context == "steer-qa" else ["test"]
+        csv_dict = {
+            "dev": os.path.join(self.output_path, f"{self.context}.csv"),
+            "test": os.path.join(self.output_path, f"{self.survey_type}.csv"),
+        }
+
+        bios_df = None
+        if self.context in ["steer-bio", "steer-portray"]:
+            bios_path = os.path.join(self.output_path, f"{self.context}.csv")
+            bios_df = pd.read_csv(bios_path, sep="\t")
+
+        for split in all_splits:
+
+            csv_path: str = csv_dict[split]
+            assert os.path.exists(csv_path)
+
+            question_df = self.read_survey_questions(csv_path)
+
+            for qidx, (question, answers) in enumerate(zip(question_df["question"], question_df["options"])):
+
+                # Opinions QA test questions have no correct answer and thus we set it to be None by default
+                # for all test instances.
+                # In the case where context = steer-qa, we add demographic information in the form of a
+                # in-context question answer pair as shown in the example above.
+
+                correct_answer = None if split == "test" else question_df["correct"][qidx]
+
+                def answer_to_reference(answer: str) -> Reference:
+                    return Reference(
+                        Output(text=answer),
+                        tags=[CORRECT_TAG] if (answer == correct_answer and split != "test") else [],
+                    )
+
+                if bios_df is None:
+                    # context = "default" or "steer-qa"
+                    instance = Instance(
+                        Input(text=question),
+                        references=list(map(answer_to_reference, answers)),
+                        split=splits[split],
+                    )
+                    instances.append(instance)
+                else:
+                    # context = "steer-bio"or "steer-portray"
+                    for bio in bios_df["question"].values:
+
+                        context = PassageQuestionInput(passage=bio, question=question + "\n")
+                        instance = Instance(
+                            context,
+                            references=list(map(answer_to_reference, answers)),
+                            split=splits[split],
+                        )
+                        instances.append(instance)
+
+        return instances
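A minimal sketch of how the new OpinionsQAScenario could be exercised directly, outside the HELM runner. The survey_type value and the manual output_path assignment are assumptions for illustration only (the runner normally supplies the output path, and the downloaded CSVs follow the Pew_American_Trends_Panel_W{wave} naming shown above):

    # Hypothetical usage sketch; the values below are assumptions, not taken from run_specs.py.
    from helm.benchmark.scenarios.opinions_qa_scenario import OpinionsQAScenario

    scenario = OpinionsQAScenario(
        survey_type="Pew_American_Trends_Panel_W26",  # assumed: matches one of the downloaded wave CSVs
        context="steer-qa",  # one of "default", "steer-qa", "steer-bio", "steer-portray"
    )
    scenario.output_path = "benchmark_output/scenarios/opinions_qa"  # assumed: normally set by the runner
    instances = scenario.get_instances()  # downloads the CSVs, then builds dev (steering) and test instances
    print(len(instances), instances[0].split, instances[0].input.text[:60])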
helm/benchmark/scenarios/scenario.py

@@ -147,6 +147,11 @@ class Instance:
                 return reference
         return None
 
+    @property
+    def all_correct_references(self) -> List[Reference]:
+        """Return all correct references."""
+        return [reference for reference in self.references if reference.is_correct]
+
     def render_lines(self) -> List[str]:
         info = [f"input: {format_text(self.input.text)}"]
         if self.sub_split:
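The new all_correct_references property complements the existing single-reference lookup shown in the context lines above; a small illustration of its intended use, with field names taken from the code in this diff:

    # Illustration only: collect the output text of every reference tagged as correct.
    def correct_answer_texts(instance) -> list:
        return [reference.output.text for reference in instance.all_correct_references]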
helm/benchmark/scenarios/the_pile_scenario.py

@@ -78,7 +78,7 @@ class ThePileScenario(Scenario):
         # Download the raw data
         data_path = os.path.join(self.output_path, "data")
         ensure_file_downloaded(
-            source_url="https://mystic.the-eye.eu/public/AI/pile/test.jsonl.zst",
+            source_url="https://the-eye.eu/public/AI/pile/test.jsonl.zst",
             target_path=data_path,
             unpack=True,
         )
helm/benchmark/static/benchmarking.css

@@ -136,3 +136,17 @@ tbody .table-sort-column {
   background-color: #f5f5f5;
   white-space: pre-wrap;
 }
+
+.plot {
+  margin: 15px;
+}
+
+.plot img {
+  margin: 10px;
+}
+
+.plot-caption {
+  color: #555;
+  font-style: italic;
+  margin: 5px;
+}
helm/benchmark/static/benchmarking.js

@@ -124,6 +124,44 @@ $(function () {
     return $table;
   }
 
+  function renderPlots() {
+    const container = $('<div>', {class: "container"});
+    const links = $('<div>');
+    container.append(links);
+    const tableLinks = [];
+
+    function renderPlot(name, title) {
+      const plot = $('<div>', {class: "plot"});
+      const caption = $('<div>', {class: "plot-caption"}).append(plotCaptions[name]);
+
+      plot.append($('<h3>').append($('<a>', {id: title}).append(title)));
+      plot.append(caption);
+      plot.append($('<img>', {src: plotUrl(suite, name), class: "img-fluid"}));
+      container.append(plot);
+      tableLinks.push($('<a>', {href: '#' + title}).append(title));
+    }
+
+    renderPlot("generic_summary", "Metric spread for core scenarios");
+    renderPlot("model_ranking_all", "Head-to-head win rate per each model");
+
+    renderPlot("accuracy_v_x", "Accuracy as a function of other metrics");
+    renderPlot("metric_correlation", "Correlation between metrics");
+
+    renderPlot("accuracy_v_access", "Accuracy as a function of model access");
+    renderPlot("accuracy_over_num_parameters", "Accuracy across model sizes");
+    renderPlot("accuracy_over_release_date", "Accuracy over time");
+    renderPlot("accuracy_over_the_pile_perplexity", "Accuracy as a function of The Pile perplexity");
+
+    renderPlot("targeted_evals", "Targeted evaluations");
+
+    renderPlot("in_context_ablations", "Number of in-context examples ablation");
+    renderPlot("mc_ablations", "Multiple-choice adaptation ablation");
+
+    links.append(renderItems(tableLinks));
+
+    return container;
+  }
+
   function renderRunsOverview(runSpecs) {
     let query = '';
     const $search = $('<input>', {type: 'text', size: 40, placeholder: 'Enter regex query (enter to open all)'});
@@ -1170,6 +1208,11 @@ $(function () {
     $main.empty()
     $main.append(renderHeader('Scenarios', renderScenarios()));
     refreshHashLocation();
+  } else if (urlParams.plots) {
+    // Plots
+    $main.empty()
+    $main.append(renderHeader('Plots', renderPlots()));
+    refreshHashLocation();
   } else if (urlParams.runSpec || urlParams.runSpecs || urlParams.runSpecRegex) {
     // Predictions for a set of run specs (matching a regular expression)
     $main.text('Loading runs...');
helm/benchmark/static/index.html

@@ -22,6 +22,7 @@
         <li class="nav-item"><a class="nav-link active" href="?models=1">Models</a></li>
         <li class="nav-item"><a class="nav-link active" href="?scenarios=1">Scenarios</a></li>
         <li class="nav-item"><a class="nav-link active" href="?groups=1">Results</a></li>
+        <li class="nav-item"><a class="nav-link active" href="?plots=1">Plots</a></li>
         <li class="nav-item"><a class="nav-link active" href="?runs=1">Raw runs</a></li>
       </ul>
     </div>
@@ -48,5 +49,6 @@
     <script src="json-urls-root.js"></script>
     <script src="json-urls.js"></script>
     <script src="benchmarking.js"></script>
+    <script src="plot-captions.js"></script>
   </body>
 </html>
helm/benchmark/static/json-urls.js

@@ -48,3 +48,7 @@ function predictionsJsonUrl(suite, runSpecName) {
 function requestsJsonUrl(suite, runSpecName) {
   return `${BENCHMARK_OUTPUT_BASE_URL}/runs/${suite}/${runSpecName}/display_requests.json`;
 }
+
+function plotUrl(suite, plotName) {
+  return `${BENCHMARK_OUTPUT_BASE_URL}/runs/${suite}/plots/${plotName}.png`;
+}
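The new plotUrl helper assumes that plot images live under a plots/ directory inside each suite's run output. A hedged Python sketch of the corresponding on-disk layout (the path below is an inference from the URL template, not taken from create_plots.py, whose diff is not shown):

    # Hypothetical sketch: where the front end would look for a plot image,
    # mirroring `${BENCHMARK_OUTPUT_BASE_URL}/runs/${suite}/plots/${plotName}.png`.
    import os

    def plot_output_path(output_base: str, suite: str, plot_name: str) -> str:
        return os.path.join(output_base, "runs", suite, "plots", f"{plot_name}.png")

    print(plot_output_path("benchmark_output", "v1", "generic_summary"))
    # benchmark_output/runs/v1/plots/generic_summary.png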
helm/benchmark/static/plot-captions.js (new file)

@@ -0,0 +1,16 @@
+////////////////////////////////////////////////////////////
+// Dictionary of plot captions
+
+const plotCaptions = {
+  "generic_summary": "Metrics for every model on every core scenario as a means for indicating the spread on a per-metric basis.",
+  "model_ranking_all": "The fraction of head-to-head comparisons between the given model and all other models, across all scenarios, where the given model is higher along the metric (e.g. more accurate in the accuracy subfigure). If a model was the highest for the given metric for every scenario, it would receive a score of 1.0; if a model received a score of 0.5, then if a scenario and second model were chosen at random, the outcome of the comparison would be a coin flip. For calibration error, we measure ECE-10; for bias, we measure bias in gender representation.",
+  "accuracy_v_x": "The relationship between accuracy (x-axis) and each of the 6 metrics (calibration, robustness, fairness, social bias, toxicity, efficiency) we study in this work across all core scenarios and for all models. For calibration error, we measure ECE-10; for bias, we measure bias in gender representation; and for efficiency, we measure denoised inference time.",
+  "metric_correlation": "The Pearson correlation between each metric and every other metric (x-axis). The small blue dots denote the correlation on each individual scenario, while the larger orange dots average the correlation across scenarios. Trends are qualitatively similarly for other correlation measures (e.g. Spearman correlation). For calibration error, we measure ECE-10; for bias, we measure bias in gender representation; and for efficiency, we measure denoised inference time.",
+  "accuracy_v_access": "The relationship between access (open vs. limited vs. closed) and model accuracy for each of the core scenarios. Shaded bars indicate the performance of the best model for that scenario, whereas the solid bars indicate the performance of the overall most accurate model across all core scenarios.",
+  "accuracy_over_num_parameters": "Cumulative plot, depicting the accuracy of the most accurate model up to a given size across all core scenarios.",
+  "accuracy_over_release_date": "The relationship between time (x-axis) and the accuracy of models (y-axis) across the core scenarios.",
+  "accuracy_over_the_pile_perplexity": "The relationship between log bits-per-byte (BPB) on The Pile and the accuracy on each core scenario.",
+  "targeted_evals": "Model accuracy on scenario targeting specific performance components (language, knowledge, reasoning).",
+  "in_context_ablations": "For each model, we set the maximum number of in-context examples to [0, 1, 2, 4, 8, 16] and fit as many in-context examples as possible within the context window. We plot performance as a function of the average number of in-context examples actually used.",
+  "mc_ablations": "For each adaptation method (joint, separate, and separate calibrated), we compare models across scenarios."
+};