crfm-helm 0.5.2__py3-none-any.whl → 0.5.3__py3-none-any.whl

This diff represents the content of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between package versions as they appear in the public registry.

Potentially problematic release: this version of crfm-helm might be problematic.

Files changed (184)
  1. {crfm_helm-0.5.2.dist-info → crfm_helm-0.5.3.dist-info}/METADATA +29 -55
  2. {crfm_helm-0.5.2.dist-info → crfm_helm-0.5.3.dist-info}/RECORD +146 -134
  3. {crfm_helm-0.5.2.dist-info → crfm_helm-0.5.3.dist-info}/WHEEL +1 -1
  4. helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +12 -5
  5. helm/benchmark/adaptation/adapters/test_generation_adapter.py +12 -12
  6. helm/benchmark/adaptation/adapters/test_language_modeling_adapter.py +8 -8
  7. helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py +77 -9
  8. helm/benchmark/adaptation/common_adapter_specs.py +2 -0
  9. helm/benchmark/annotation/anthropic_red_team_annotator.py +70 -0
  10. helm/benchmark/annotation/call_center_annotator.py +247 -0
  11. helm/benchmark/annotation/financebench_annotator.py +79 -0
  12. helm/benchmark/annotation/harm_bench_annotator.py +68 -0
  13. helm/benchmark/annotation/{image2structure → image2struct}/latex_compiler_annotator.py +2 -2 (rename; see the import note after this list)
  14. helm/benchmark/annotation/{image2structure → image2struct}/lilypond_compiler_annotator.py +5 -3
  15. helm/benchmark/annotation/{image2structure → image2struct}/webpage_compiler_annotator.py +5 -5
  16. helm/benchmark/annotation/live_qa_annotator.py +32 -45
  17. helm/benchmark/annotation/medication_qa_annotator.py +31 -44
  18. helm/benchmark/annotation/model_as_judge.py +45 -0
  19. helm/benchmark/annotation/simple_safety_tests_annotator.py +64 -0
  20. helm/benchmark/annotation/xstest_annotator.py +110 -0
  21. helm/benchmark/metrics/annotation_metrics.py +108 -0
  22. helm/benchmark/metrics/bhasa_metrics.py +188 -0
  23. helm/benchmark/metrics/bhasa_metrics_specs.py +10 -0
  24. helm/benchmark/metrics/code_metrics_helper.py +11 -1
  25. helm/benchmark/metrics/safety_metrics.py +57 -0
  26. helm/benchmark/metrics/summac/model_summac.py +3 -3
  27. helm/benchmark/metrics/tokens/test_ai21_token_cost_estimator.py +2 -2
  28. helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py +4 -4
  29. helm/benchmark/metrics/vision_language/image_metrics.py +1 -1
  30. helm/benchmark/metrics/vision_language/image_utils.py +1 -1
  31. helm/benchmark/model_metadata_registry.py +3 -3
  32. helm/benchmark/presentation/test_run_entry.py +1 -0
  33. helm/benchmark/run.py +15 -0
  34. helm/benchmark/run_expander.py +56 -30
  35. helm/benchmark/run_specs/bhasa_run_specs.py +638 -0
  36. helm/benchmark/run_specs/call_center_run_specs.py +152 -0
  37. helm/benchmark/run_specs/decodingtrust_run_specs.py +8 -8
  38. helm/benchmark/run_specs/experimental_run_specs.py +52 -0
  39. helm/benchmark/run_specs/finance_run_specs.py +78 -1
  40. helm/benchmark/run_specs/safety_run_specs.py +154 -0
  41. helm/benchmark/run_specs/vlm_run_specs.py +92 -21
  42. helm/benchmark/scenarios/anthropic_red_team_scenario.py +71 -0
  43. helm/benchmark/scenarios/banking77_scenario.py +51 -0
  44. helm/benchmark/scenarios/bhasa_scenario.py +1798 -0
  45. helm/benchmark/scenarios/call_center_scenario.py +84 -0
  46. helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +2 -1
  47. helm/benchmark/scenarios/ewok_scenario.py +116 -0
  48. helm/benchmark/scenarios/fin_qa_scenario.py +2 -0
  49. helm/benchmark/scenarios/financebench_scenario.py +53 -0
  50. helm/benchmark/scenarios/harm_bench_scenario.py +59 -0
  51. helm/benchmark/scenarios/scenario.py +1 -1
  52. helm/benchmark/scenarios/simple_safety_tests_scenario.py +33 -0
  53. helm/benchmark/scenarios/test_commonsense_scenario.py +21 -0
  54. helm/benchmark/scenarios/test_ewok_scenario.py +25 -0
  55. helm/benchmark/scenarios/test_financebench_scenario.py +26 -0
  56. helm/benchmark/scenarios/test_gsm_scenario.py +31 -0
  57. helm/benchmark/scenarios/test_legalbench_scenario.py +30 -0
  58. helm/benchmark/scenarios/test_math_scenario.py +2 -8
  59. helm/benchmark/scenarios/test_med_qa_scenario.py +30 -0
  60. helm/benchmark/scenarios/test_mmlu_scenario.py +33 -0
  61. helm/benchmark/scenarios/test_narrativeqa_scenario.py +73 -0
  62. helm/benchmark/scenarios/thai_exam_scenario.py +4 -4
  63. helm/benchmark/scenarios/vision_language/a_okvqa_scenario.py +1 -1
  64. helm/benchmark/scenarios/vision_language/bingo_scenario.py +2 -2
  65. helm/benchmark/scenarios/vision_language/crossmodal_3600_scenario.py +2 -1
  66. helm/benchmark/scenarios/vision_language/exams_v_scenario.py +104 -0
  67. helm/benchmark/scenarios/vision_language/fair_face_scenario.py +136 -0
  68. helm/benchmark/scenarios/vision_language/flickr30k_scenario.py +1 -1
  69. helm/benchmark/scenarios/vision_language/gqa_scenario.py +2 -2
  70. helm/benchmark/scenarios/vision_language/hateful_memes_scenario.py +1 -1
  71. helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/chart2csv_scenario.py +1 -1
  72. helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/latex_scenario.py +3 -3
  73. helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/musicsheet_scenario.py +1 -1
  74. helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/utils_latex.py +31 -39
  75. helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/webpage/driver.py +1 -1
  76. helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/webpage/utils.py +1 -1
  77. helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/webpage_scenario.py +41 -12
  78. helm/benchmark/scenarios/vision_language/math_vista_scenario.py +1 -1
  79. helm/benchmark/scenarios/vision_language/mementos_scenario.py +3 -3
  80. helm/benchmark/scenarios/vision_language/mm_safety_bench_scenario.py +2 -2
  81. helm/benchmark/scenarios/vision_language/mme_scenario.py +21 -18
  82. helm/benchmark/scenarios/vision_language/mmmu_scenario.py +1 -1
  83. helm/benchmark/scenarios/vision_language/pairs_scenario.py +1 -1
  84. helm/benchmark/scenarios/vision_language/pope_scenario.py +2 -1
  85. helm/benchmark/scenarios/vision_language/real_world_qa_scenario.py +57 -0
  86. helm/benchmark/scenarios/vision_language/seed_bench_scenario.py +7 -5
  87. helm/benchmark/scenarios/vision_language/unicorn_scenario.py +2 -2
  88. helm/benchmark/scenarios/vision_language/vibe_eval_scenario.py +6 -3
  89. helm/benchmark/scenarios/vision_language/viz_wiz_scenario.py +1 -1
  90. helm/benchmark/scenarios/vision_language/vqa_scenario.py +3 -1
  91. helm/benchmark/scenarios/xstest_scenario.py +35 -0
  92. helm/benchmark/server.py +1 -6
  93. helm/benchmark/static/schema_air_bench.yaml +750 -750
  94. helm/benchmark/static/schema_bhasa.yaml +709 -0
  95. helm/benchmark/static/schema_call_center.yaml +232 -0
  96. helm/benchmark/static/schema_cleva.yaml +768 -0
  97. helm/benchmark/static/schema_decodingtrust.yaml +444 -0
  98. helm/benchmark/static/schema_ewok.yaml +367 -0
  99. helm/benchmark/static/schema_finance.yaml +55 -9
  100. helm/benchmark/static/{schema_image2structure.yaml → schema_image2struct.yaml} +231 -90
  101. helm/benchmark/static/schema_safety.yaml +247 -0
  102. helm/benchmark/static/schema_tables.yaml +124 -7
  103. helm/benchmark/static/schema_thai.yaml +21 -0
  104. helm/benchmark/static/schema_vhelm.yaml +96 -91
  105. helm/benchmark/static_build/assets/accenture-6f97eeda.png +0 -0
  106. helm/benchmark/static_build/assets/aisingapore-6dfc9acf.png +0 -0
  107. helm/benchmark/static_build/assets/cresta-9e22b983.png +0 -0
  108. helm/benchmark/static_build/assets/cuhk-8c5631e9.png +0 -0
  109. helm/benchmark/static_build/assets/index-05c76bb1.css +1 -0
  110. helm/benchmark/static_build/assets/index-58f97dcd.js +10 -0
  111. helm/benchmark/static_build/assets/scb10x-204bd786.png +0 -0
  112. helm/benchmark/static_build/assets/wellsfargo-a86a6c4a.png +0 -0
  113. helm/benchmark/static_build/index.html +2 -2
  114. helm/benchmark/window_services/test_openai_window_service.py +8 -8
  115. helm/clients/ai21_client.py +71 -1
  116. helm/clients/anthropic_client.py +7 -19
  117. helm/clients/huggingface_client.py +38 -37
  118. helm/clients/nvidia_nim_client.py +35 -0
  119. helm/clients/openai_client.py +2 -3
  120. helm/clients/palmyra_client.py +25 -0
  121. helm/clients/perspective_api_client.py +11 -6
  122. helm/clients/test_client.py +4 -6
  123. helm/clients/vision_language/open_flamingo_client.py +1 -2
  124. helm/clients/vision_language/palmyra_vision_client.py +28 -13
  125. helm/common/images_utils.py +6 -0
  126. helm/common/mongo_key_value_store.py +2 -1
  127. helm/common/request.py +16 -0
  128. helm/config/model_deployments.yaml +315 -332
  129. helm/config/model_metadata.yaml +384 -110
  130. helm/config/tokenizer_configs.yaml +116 -11
  131. helm/proxy/example_queries.py +14 -21
  132. helm/proxy/services/server_service.py +1 -2
  133. helm/proxy/token_counters/test_auto_token_counter.py +2 -2
  134. helm/tokenizers/ai21_tokenizer.py +51 -59
  135. helm/tokenizers/cohere_tokenizer.py +0 -75
  136. helm/tokenizers/huggingface_tokenizer.py +0 -1
  137. helm/tokenizers/test_ai21_tokenizer.py +48 -0
  138. helm/benchmark/static/benchmarking.css +0 -156
  139. helm/benchmark/static/benchmarking.js +0 -1705
  140. helm/benchmark/static/config.js +0 -3
  141. helm/benchmark/static/general.js +0 -122
  142. helm/benchmark/static/images/crfm-logo.png +0 -0
  143. helm/benchmark/static/images/helm-logo-simple.png +0 -0
  144. helm/benchmark/static/images/helm-logo.png +0 -0
  145. helm/benchmark/static/images/language-model-helm.png +0 -0
  146. helm/benchmark/static/images/organizations/ai21.png +0 -0
  147. helm/benchmark/static/images/organizations/anthropic.png +0 -0
  148. helm/benchmark/static/images/organizations/bigscience.png +0 -0
  149. helm/benchmark/static/images/organizations/cohere.png +0 -0
  150. helm/benchmark/static/images/organizations/eleutherai.png +0 -0
  151. helm/benchmark/static/images/organizations/google.png +0 -0
  152. helm/benchmark/static/images/organizations/meta.png +0 -0
  153. helm/benchmark/static/images/organizations/microsoft.png +0 -0
  154. helm/benchmark/static/images/organizations/nvidia.png +0 -0
  155. helm/benchmark/static/images/organizations/openai.png +0 -0
  156. helm/benchmark/static/images/organizations/together.png +0 -0
  157. helm/benchmark/static/images/organizations/tsinghua-keg.png +0 -0
  158. helm/benchmark/static/images/organizations/yandex.png +0 -0
  159. helm/benchmark/static/images/scenarios-by-metrics.png +0 -0
  160. helm/benchmark/static/images/taxonomy-scenarios.png +0 -0
  161. helm/benchmark/static/index.html +0 -68
  162. helm/benchmark/static/info-icon.png +0 -0
  163. helm/benchmark/static/json-urls.js +0 -69
  164. helm/benchmark/static/plot-captions.js +0 -27
  165. helm/benchmark/static/utils.js +0 -285
  166. helm/benchmark/static_build/assets/index-30dbceba.js +0 -10
  167. helm/benchmark/static_build/assets/index-66b02d40.css +0 -1
  168. helm/benchmark/window_services/ai21_window_service.py +0 -247
  169. helm/benchmark/window_services/cohere_window_service.py +0 -101
  170. helm/benchmark/window_services/test_ai21_window_service.py +0 -163
  171. helm/benchmark/window_services/test_cohere_window_service.py +0 -75
  172. helm/benchmark/window_services/test_cohere_window_service_utils.py +0 -8328
  173. helm/benchmark/window_services/test_ice_window_service.py +0 -327
  174. helm/tokenizers/ice_tokenizer.py +0 -30
  175. helm/tokenizers/test_ice_tokenizer.py +0 -57
  176. {crfm_helm-0.5.2.dist-info → crfm_helm-0.5.3.dist-info}/LICENSE +0 -0
  177. {crfm_helm-0.5.2.dist-info → crfm_helm-0.5.3.dist-info}/entry_points.txt +0 -0
  178. {crfm_helm-0.5.2.dist-info → crfm_helm-0.5.3.dist-info}/top_level.txt +0 -0
  179. /helm/benchmark/annotation/{image2structure → image2struct}/__init__.py +0 -0
  180. /helm/benchmark/annotation/{image2structure → image2struct}/image_compiler_annotator.py +0 -0
  181. /helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/__init__.py +0 -0
  182. /helm/benchmark/scenarios/vision_language/{image2structure/image2structure_scenario.py → image2struct/image2struct_scenario.py} +0 -0
  183. /helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/webpage/__init__.py +0 -0
  184. /helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/webpage/jekyll_server.py +0 -0
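
Note: the image2structure package is renamed to image2struct in this release (entries 13-15, 71-77, 100, and 179-184 above). Downstream code importing the old paths needs updating; a minimal sketch (the LatexScenario class name is illustrative, inferred from the renamed file name):

    # Before (crfm-helm 0.5.2): old package path.
    # from helm.benchmark.scenarios.vision_language.image2structure.latex_scenario import LatexScenario

    # After (crfm-helm 0.5.3): same module under the renamed package.
    from helm.benchmark.scenarios.vision_language.image2struct.latex_scenario import LatexScenario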
helm/benchmark/static/schema_tables.yaml
@@ -99,47 +99,94 @@ metrics:
     display_name: METEOR
     short_display_name: METEOR
     description: METEOR
+    lower_is_better: false
   - name: f1
-    display_name: F1
-    short_display_name: F1
-    description: F1
+    display_name: BERTScore F1
+    short_display_name: BERTScore F1
+    description: BERTScore F1
+    lower_is_better: false
   - name: precision
     display_name: Precision
     short_display_name: Precision
     description: Precision
+    lower_is_better: false
   - name: recall
     display_name: Recall
     short_display_name: Recall
     description: Recall
+    lower_is_better: false
   - name: rouge1
     display_name: ROUGE-1
     short_display_name: ROUGE-1
     description: ROUGE-1
+    lower_is_better: false
   - name: rouge2
     display_name: ROUGE-2
     short_display_name: ROUGE-2
     description: ROUGE-2
+    lower_is_better: false
   - name: rougeL
     display_name: ROUGE-L
     short_display_name: ROUGE-L
     description: ROUGE-L
+    lower_is_better: false
   - name: rougeLsum
     display_name: ROUGE-Lsum
     short_display_name: ROUGE-Lsum
     description: ROUGE-Lsum
+    lower_is_better: false
   - name: bleu
     display_name: BLEU
     short_display_name: BLEU
     description: BLEU
+    lower_is_better: false
+  - name: accuracy
+    display_name: Accuracy
+    short_display_name: Accuracy
+    description: Accuracy
+    lower_is_better: false
+  - name: f1_macro
+    display_name: Macro F1
+    short_display_name: Macro F1
+    description: Macro F1
+    lower_is_better: false
+  - name: f1_micro
+    display_name: Micro F1
+    short_display_name: Micro F1
+    description: Micro F1
+    lower_is_better: false
+  - name: unsorted_list_exact_match
+    display_name: Unsorted List Exact Match
+    short_display_name: Exact Match
+    description: Unsorted List Exact Match
+    lower_is_better: false
+
+  # FinQA Accuracy
+  - name: program_accuracy
+    display_name: Program Accuracy
+    short_display_name: Program Accuracy
+    description: Program Accuracy
+    lower_is_better: false
+  - name: execution_accuracy
+    display_name: Execution Accuracy
+    short_display_name: Execution Accuracy
+    description: Execution Accuracy
+    lower_is_better: false
 
 perturbations: []
 
 metric_groups:
-  - name: accuracy
-    display_name: Accuracy
+  - name: main_metrics
+    display_name: Main Metrics
+    metrics:
+      - name: ${main_name}
+        split: __all__
+
+  - name: generation_metrics
+    display_name: Other Generation Metrics
     hide_win_rates: true
     metrics:
-      - name: meteor
+      - name: f1
         split: __all__
       - name: rouge1
         split: __all__
@@ -152,6 +199,17 @@ metric_groups:
       - name: bleu
         split: __all__
 
+  - name: classification_metrics
+    display_name: Classification Metrics
+    hide_win_rates: true
+    metrics:
+      - name: accuracy
+        split: __all__
+      - name: f1_macro
+        split: __all__
+      - name: f1_micro
+        split: __all__
+
   - name: efficiency
     display_name: Efficiency
     metrics:
@@ -180,13 +238,17 @@ run_groups:
     category: All Scenarios
     subgroups:
       - unitxt_cards.numeric_nlg
+      - unitxt_cards.tab_fact
+      - unitxt_cards.wikitq
+      - unitxt_cards.fin_qa
 
   - name: unitxt_cards.numeric_nlg
     display_name: NumericNLG
     short_display_name: NumericNLG
     description: "NumericNLG is a dataset for numerical table-to-text generation using pairs of a table and a paragraph of a table description with richer inference from scientific papers."
     metric_groups:
-      - accuracy
+      - main_metrics
+      - generation_metrics
       - efficiency
       - general_information
     environment:
@@ -198,3 +260,58 @@ run_groups:
       who: "?"
       when: "?"
       language: English
+
+  - name: unitxt_cards.tab_fact
+    display_name: TabFact
+    short_display_name: TabFact
+    description: "tab_fact is a large-scale dataset for the task of fact-checking on tables."
+    metric_groups:
+      - main_metrics
+      - classification_metrics
+      - efficiency
+      - general_information
+    environment:
+      main_name: accuracy
+      main_split: test
+    taxonomy:
+      task: "?"
+      what: "?"
+      who: "?"
+      when: "?"
+      language: English
+
+  - name: unitxt_cards.wikitq
+    display_name: WikiTableQuestions
+    short_display_name: WikiTableQuestions
+    description: "This WikiTableQuestions dataset is a large-scale dataset for the task of question answering on semi-structured tables."
+    metric_groups:
+      - main_metrics
+      - classification_metrics
+      - efficiency
+      - general_information
+    environment:
+      main_name: unsorted_list_exact_match
+      main_split: test
+    taxonomy:
+      task: "?"
+      what: "?"
+      who: "?"
+      when: "?"
+      language: English
+
+  - name: unitxt_cards.fin_qa
+    display_name: FinQA
+    description: The FinQA benchmark for numeric reasoning over financial data, with question answering pairs written by financial experts over financial reports [(Chen et al., 2021)](https://arxiv.org/abs/2109.00122/).
+    metric_groups:
+      - main_metrics
+      - efficiency
+      - general_information
+    environment:
+      main_name: program_accuracy
+      main_split: test
+    taxonomy:
+      task: question answering with numeric reasoning
+      what: financial reports
+      who: financial experts
+      when: 1999 to 2019
+      language: English
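
The new main_metrics group above resolves the ${main_name} placeholder from each run group's environment block (e.g. main_name: program_accuracy for unitxt_cards.fin_qa). A rough sketch of that substitution, hand-rolled for illustration rather than HELM's actual summarizer code (the schema path assumes a source checkout):

    # Resolve ${main_name} placeholders in metric groups using each run
    # group's environment block. Illustrative only.
    from string import Template

    import yaml

    with open("helm/benchmark/static/schema_tables.yaml") as f:
        schema = yaml.safe_load(f)

    metric_groups = {g["name"]: g for g in schema["metric_groups"]}

    for run_group in schema["run_groups"]:
        env = run_group.get("environment", {})
        for group_name in run_group.get("metric_groups", []):
            for metric in metric_groups.get(group_name, {}).get("metrics", []):
                # e.g. ${main_name} -> program_accuracy for unitxt_cards.fin_qa
                resolved = Template(metric["name"]).safe_substitute(env)
                print(run_group["name"], group_name, resolved)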
helm/benchmark/static/schema_thai.yaml
@@ -78,6 +78,7 @@ perturbations: []
 metric_groups:
   - name: accuracy
     display_name: Accuracy
+    hide_win_rates: true
     metrics:
       - name: ${main_name}
         split: ${main_split}
@@ -111,12 +112,32 @@ run_groups:
     description: Thai-language scenarios
     category: All scenarios
     subgroups:
+      - thai_exam
       - thai_exam_onet
       - thai_exam_ic
       - thai_exam_tgat
       - thai_exam_tpat1
       - thai_exam_a_level
 
+
+  - name: thai_exam
+    display_name: ThaiExam
+    description: >
+      Macro-averaged accuracy on all ThaiExam examinations.
+    metric_groups:
+      - accuracy
+      - efficiency
+      - general_information
+    environment:
+      main_name: exact_match
+      main_split: test
+    taxonomy:
+      task: question answering
+      what: n/a
+      who: n/a
+      when: "?"
+      language: Thai and English
+
   - name: thai_exam_onet
     display_name: ONET
     description: >
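
The new thai_exam aggregate reports macro-averaged accuracy: each of the five exams contributes equally, regardless of how many questions it contains. A toy illustration (the scores are made up):

    # Macro average: unweighted mean over exams, not over questions.
    exam_accuracy = {
        "onet": 0.61,  # made-up score, as are the rest
        "ic": 0.55,
        "tgat": 0.48,
        "tpat1": 0.52,
        "a_level": 0.44,
    }
    macro_accuracy = sum(exam_accuracy.values()) / len(exam_accuracy)
    print(f"ThaiExam macro-averaged accuracy: {macro_accuracy:.3f}")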