PyPI - crfm-helm - Versions diffs - 0.5.4__py3-none-any.whl → 0.5.5__py3-none-any.whl - Mend - Supply Chain Defender

crfm-helm 0.5.4__py3-none-any.whl → 0.5.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of crfm-helm might be problematic. Click here for more details.

Files changed (580)

helm/benchmark/static/{schema_medical.yaml → schema_long_context.yaml} (renamed)

@@ -74,13 +74,25 @@ metrics:
     display_name: F1
     description: Average F1 score in terms of word overlap between the model output and correct reference.
     lower_is_better: false
-  - name: live_qa_score
-    display_name: Judge Score
-    description: LLM-as-judge score
+  - name: safety_score
+    display_name: LM Evaluated Safety score
+    description: LM Evaluated Safety score (mean of Llama3.1 and GPT4o ratings) representing whether the output constitutes a successful refusal of unsafe actions (0 is worst, 1 is best).
     lower_is_better: false
-  - name: medication_qa_score
-    display_name: Judge Score
-    description: LLM-as-judge score
+  - name: bbq_accuracy
+    display_name: BBQ accuracy
+    description: BBQ accuracy
+    lower_is_better: false
+  - name: rouge_1
+    display_name: ROUGE-1
+    description: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 1-gram overlap.
+    lower_is_better: false
+  - name: rouge_2
+    display_name: ROUGE-2
+    description: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.
+    lower_is_better: false
+  - name: rouge_l
+    display_name: ROUGE-L
+    description: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on longest common subsequence overlap.
     lower_is_better: false
   # Toxicity metrics
@@ -98,6 +110,13 @@ metrics:
     lower_is_better: true
     description: Fraction of model outputs that are toxic (based on the PerspectiveAPI toxicity classifier).
+  # Annotation metrics
+  - name: safety_llama_annotator_success
+    display_name: Llama Annotator Success Rate
+    description: Fraction of annotator requests to Llama that succeeded.
+  - name: safety_gpt_annotator_success
+    display_name: GPT Annotator Success Rate
+    description: Fraction of annotator requests to GPT that succeeded.
 ############################################################
 perturbations: []
@@ -106,6 +125,8 @@ perturbations: []
 metric_groups:
   - name: accuracy
     display_name: Accuracy
+    aggregation_strategies:
+      - mean
     metrics:
       - name: ${main_name}
         split: ${main_split}
@@ -131,52 +152,50 @@ metric_groups:
     - name: num_output_tokens
       split: ${main_split}
+  - name: annotation_metrics
+    display_name: Annotation
+    hide_win_rates: true
+    metrics:
+    - name: safety_llama_annotator_success
+      split: ${main_split}
+    - name: safety_gpt_annotator_success
+      split: ${main_split}
   - name: toxicity
     display_name: Toxicity
     metrics:
     - name: toxic_frac
       split: ${main_split}
+  - name: bbq_metrics
+    display_name: BBQ metrics
+    description: Metrics used for the BBQ bias benchmark.
+    metrics:
+      - name: bbq_metric_ambiguous_bias
+        split: ${main_split}
+      - name: bbq_metric_unambiguous_bias
+        split: ${main_split}
 ############################################################
 run_groups:
-  - name: medical_scenarios
-    display_name: Medical Scenarios
-    description: Scenarios for the medical domain
+  - name: long_context_scenarios
+    display_name: Long Context Scenarios
+    description: Scenarios for the model safety
     category: All scenarios
     subgroups:
-      - med_qa
-      - med_mcqa
-      - pubmed_qa
-      - mmlu
-      - live_qa
-      - medication_qa
-  - name: med_qa
-    display_name: MedQA
-    description: MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).
-    metric_groups:
-      - accuracy
-      - efficiency
-      - general_information
-    environment:
-      main_name: exact_match
-      main_split: test
-    taxonomy:
-      task: question answering
-      what: n/a
-      who: n/a
-      when: n/a
-      language: English
+      - ruler_hotpotqa
+      - ruler_squad
+      - infinite_bench_sum
-  - name: med_mcqa
-    display_name: MedMCQA
-    description: AIIMS/NEET QA multiple choice questions with 4 choices.
+  - name: ruler_hotpotqa
+    display_name: RULER HotPotQA
+    description: RULER HotPotQA
     metric_groups:
       - accuracy
-      - efficiency
       - general_information
+      - annotation_metrics
     environment:
-      main_name: exact_match
+      main_name: f1_score
       main_split: valid
     taxonomy:
       task: question answering
@@ -185,51 +204,17 @@ run_groups:
       when: n/a
       language: English
-  - name: pubmed_qa
-    display_name: PubMedQA
-    description: biomedical literature Q + Context + A yes/no/maybe + long answer questions
-    metric_groups:
-      - accuracy
-      - efficiency
-      - general_information
-    environment:
-      main_name: exact_match
-      main_split: test
-    taxonomy:
-      task: question answering
-      what: n/a
-      who: n/a
-      when: n/a
-      language: English
-  - name: mmlu
-    display_name: MMLU (Massive Multitask Language Understanding)
-    short_display_name: MMLU
-    description: The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).
+  - name: ruler_squad
+    display_name: RULER SQuAD
+    description: RULER SQuAD
     metric_groups:
       - accuracy
-      - efficiency
       - general_information
+      - annotation_metrics
     environment:
-      main_name: exact_match
-      main_split: test
-    taxonomy:
-      task: question answering
-      what: "?"
-      who: "?"
-      when: "?"
-      language: English
-  - name: live_qa
-    display_name: LiveQA
-    description: Consumer health questions with librarian-generated reference answers.
-    metric_groups:
-      - accuracy
-      - efficiency
-      - general_information
-    environment:
-      main_name: live_qa_score
-      main_split: test
+      main_name: f1_score
+      main_split: valid
     taxonomy:
       task: question answering
       what: n/a
@@ -237,15 +222,15 @@ run_groups:
       when: n/a
       language: English
-  - name: medication_qa
-    display_name: MedicationQA
-    description: Consumer medication questions with reference answers.
+  - name: infinite_bench_sum
+    display_name: ∞Bench Sum
+    description: ∞Bench Sum
     metric_groups:
       - accuracy
-      - efficiency
       - general_information
+      - annotation_metrics
     environment:
-      main_name: medication_qa_score
+      main_name: rouge_l
       main_split: test
     taxonomy:
       task: question answering