PyPI - crfm-helm - Versions diffs - 0.5.3__py3-none-any.whl → 0.5.5__py3-none-any.whl - Mend - Supply Chain Defender

crfm-helm 0.5.3__py3-none-any.whl → 0.5.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of crfm-helm might be problematic. Click here for more details.

Files changed (606)

helm/benchmark/static/{schema_tables.yaml → schema_torr.yaml} RENAMED Viewed

@@ -160,6 +160,11 @@ metrics:
     short_display_name: Exact Match
     description: Unsorted List Exact Match
     lower_is_better: false
+  - name: f1_strings
+    display_name: F1 Strings
+    short_display_name: F1 Strings
+    description: F1 Strings
+    lower_is_better: false
   # FinQA Accuracy
   - name: program_accuracy
@@ -173,15 +178,45 @@ metrics:
     description: Execution Accuracy
     lower_is_better: false
+  # SciGen Accuracy
+  - name: llama_3_1_70b_instruct_cross_provider_template_table2text_single_turn_with_reference
+    display_name: Rating
+    short_display_name: Rating
+    description: Rating by Llama 3.1 (70B) LLM as judge
+    lower_is_better: false
+  # Robustness
+  # NOTE: This is a "virtual" metric that is not produced directly by the metrics, but will appear as an aggregate table.
+  # Run `helm-summarize` with `--summarizer-class helm.benchmark.presentation.torr_robustness_summarizer.ToRRRobustnessSummarizer`
+  # to compute the values of this metric.
+  - name: robustness
+    display_name: Robustness
+    short_display_name: Robustness
+    description: Robustness
+    lower_is_better: false
 perturbations: []
 metric_groups:
-  - name: main_metrics
-    display_name: Main Metrics
+  - name: performance_metrics
+    display_name: Performance
+    aggregation_strategies:
+      - mean
     metrics:
     - name: ${main_name}
       split: __all__
+  # NOTE: Robustness is a "virtual" metric that is not produced directly by the metrics, but will appear as an aggregate table.
+  # Run `helm-summarize` with `--summarizer-class helm.benchmark.presentation.torr_robustness_summarizer.ToRRRobustnessSummarizer`
+  # to compute the values of this metric.
+  - name: robustness_metrics
+    display_name: Robustness
+    aggregation_strategies:
+      - mean
+    metrics:
+    - name: robustness
+      split: __all__
   - name: generation_metrics
     display_name: Other Generation Metrics
     hide_win_rates: true
@@ -233,26 +268,89 @@ metric_groups:
 run_groups:
   - name: table_scenarios
-    display_name: Table  Scenarios
+    display_name: Table Scenarios
     description: Table Scenarios
     category: All Scenarios
     subgroups:
-      - unitxt_cards.numeric_nlg
-      - unitxt_cards.tab_fact
-      - unitxt_cards.wikitq
-      - unitxt_cards.fin_qa
+      - fin_qa
+      - numeric_nlg
+      - qtsumm
+      - scigen
+      - tab_fact
+      - tablebench_data_analysis
+      - tablebench_fact_checking
+      - tablebench_numerical_reasoning
+      - turl_col_type
+      - wikitq
+  - name: fin_qa
+    display_name: FinQA
+    description: The FinQA benchmark for numeric reasoning over financial data, with question answering pairs written by financial experts over financial reports [(Chen et al., 2021)](https://arxiv.org/abs/2109.00122/).
+    metric_groups:
+      - performance_metrics
+      - robustness_metrics
+      - efficiency
+      - general_information
+    environment:
+      main_name: program_accuracy
+      main_split: test
+    taxonomy:
+      task: question answering with numeric reasoning
+      what: financial reports
+      who: financial experts
+      when: 1999 to 2019
+      language: English
-  - name: unitxt_cards.numeric_nlg
+  - name: numeric_nlg
     display_name: NumericNLG
     short_display_name: NumericNLG
     description: "NumericNLG is a dataset for numerical table-to-text generation using pairs of a table and a paragraph of a table description with richer inference from scientific papers."
     metric_groups:
-      - main_metrics
-      - generation_metrics
+      - performance_metrics
+      - robustness_metrics
+      # - generation_metrics
+      - efficiency
+      - general_information
+    environment:
+      main_name: rougeL
+      main_split: test
+    taxonomy:
+      task: "?"
+      what: "?"
+      who: "?"
+      when: "?"
+      language: English
+  - name: qtsumm
+    display_name: QTSumm
+    short_display_name: QTSumm
+    description: QTFumm
+    metric_groups:
+      - performance_metrics
+      - robustness_metrics
+      # - generation_metrics
+      - efficiency
+      - general_information
+    environment:
+      main_name: rougeL
+      main_split: test
+    taxonomy:
+      task: "?"
+      what: "?"
+      who: "?"
+      when: "?"
+      language: English
+  - name: scigen
+    display_name: SciGen
+    description: SciGen
+    metric_groups:
+      - performance_metrics
+      - robustness_metrics
       - efficiency
       - general_information
     environment:
-      main_name: meteor
+      main_name: rougeL
       main_split: test
     taxonomy:
       task: "?"
@@ -261,13 +359,13 @@ run_groups:
       when: "?"
       language: English
-  - name: unitxt_cards.tab_fact
+  - name: tab_fact
     display_name: TabFact
     short_display_name: TabFact
     description: "tab_fact is a large-scale dataset for the task of fact-checking on tables."
     metric_groups:
-      - main_metrics
-      - classification_metrics
+      - performance_metrics
+      - robustness_metrics
       - efficiency
       - general_information
     environment:
@@ -280,17 +378,17 @@ run_groups:
       when: "?"
       language: English
-  - name: unitxt_cards.wikitq
-    display_name: WikiTableQuestions
-    short_display_name: WikiTableQuestions
-    description: "This WikiTableQuestions dataset is a large-scale dataset for the task of question answering on semi-structured tables."
+  - name: tablebench_data_analysis
+    display_name: Tablebench Data Analysis
+    short_display_name: Tablebench Data Analysis
+    description: "tab_fact is a large-scale dataset for the task of fact-checking on tables."
     metric_groups:
-      - main_metrics
-      - classification_metrics
+      - performance_metrics
+      - robustness_metrics
       - efficiency
       - general_information
     environment:
-      main_name: unsorted_list_exact_match
+      main_name: rougeL
       main_split: test
     taxonomy:
       task: "?"
@@ -299,19 +397,78 @@ run_groups:
       when: "?"
       language: English
-  - name: unitxt_cards.fin_qa
-    display_name: FinQA
-    description: The FinQA benchmark for numeric reasoning over financial data, with question answering pairs written by financial experts over financial reports [(Chen et al., 2021)](https://arxiv.org/abs/2109.00122/).
+  - name: tablebench_fact_checking
+    display_name: Tablebench Fact Checking
+    short_display_name: Tablebench Fact Checking
+    description: "tab_fact is a large-scale dataset for the task of fact-checking on tables."
     metric_groups:
-      - main_metrics
+      - performance_metrics
+      - robustness_metrics
       - efficiency
       - general_information
     environment:
-      main_name: program_accuracy
+      main_name: rougeL
       main_split: test
     taxonomy:
-      task: question answering with numeric reasoning
-      what: financial reports
-      who: financial experts
-      when: 1999 to 2019
+      task: "?"
+      what: "?"
+      who: "?"
+      when: "?"
+      language: English
+  - name: tablebench_numerical_reasoning
+    display_name: Tablebench Numerical Reasoning
+    short_display_name: Tablebench Numerical Reasoning
+    description: "tab_fact is a large-scale dataset for the task of fact-checking on tables."
+    metric_groups:
+      - performance_metrics
+      - robustness_metrics
+      - efficiency
+      - general_information
+    environment:
+      main_name: rougeL
+      main_split: test
+    taxonomy:
+      task: "?"
+      what: "?"
+      who: "?"
+      when: "?"
+      language: English
+  - name: turl_col_type
+    display_name: Turl Col Type
+    description: Turl Col Type
+    metric_groups:
+      - performance_metrics
+      - robustness_metrics
+      - efficiency
+      - general_information
+    environment:
+      main_name: f1_micro
+      main_split: test
+    taxonomy:
+      task: "?"
+      what: "?"
+      who: "?"
+      when: "?"
+      language: English
+  - name: wikitq
+    display_name: WikiTableQuestions
+    short_display_name: WikiTableQuestions
+    description: "This WikiTableQuestions dataset is a large-scale dataset for the task of question answering on semi-structured tables."
+    metric_groups:
+      - performance_metrics
+      - robustness_metrics
+      # - classification_metrics
+      - efficiency
+      - general_information
+    environment:
+      main_name: f1_strings
+      main_split: test
+    taxonomy:
+      task: "?"
+      what: "?"
+      who: "?"
+      when: "?"
       language: English

helm/benchmark/static/schema_tweetsentbr.yaml ADDED Viewed

@@ -0,0 +1,146 @@
+############################################################
+metrics:
+  # Infrastructure metrics:
+  - name: num_perplexity_tokens
+    display_name: '# tokens'
+    description: Average number of tokens in the predicted output (for language modeling, the input too).
+  - name: num_bytes
+    display_name: '# bytes'
+    description: Average number of bytes in the predicted output (for language modeling, the input too).
+  - name: num_references
+    display_name: '# ref'
+    description: Number of references.
+  - name: num_train_trials
+    display_name: '# trials'
+    description: Number of trials, where in each trial we choose an independent, random set of training instances.
+  - name: estimated_num_tokens_cost
+    display_name: 'cost'
+    description: An estimate of the number of tokens (including prompt and output completions) needed to perform the request.
+  - name: num_prompt_tokens
+    display_name: '# prompt tokens'
+    description: Number of tokens in the prompt.
+  - name: num_prompt_characters
+    display_name: '# prompt chars'
+    description: Number of characters in the prompt.
+  - name: num_completion_tokens
+    display_name: '# completion tokens'
+    description: Actual number of completion tokens (over all completions).
+  - name: num_output_tokens
+    display_name: '# output tokens'
+    description: Actual number of output tokens.
+  - name: max_num_output_tokens
+    display_name: 'Max output tokens'
+    description: Maximum number of output tokens (overestimate since we might stop earlier due to stop sequences).
+  - name: num_requests
+    display_name: '# requests'
+    description: Number of distinct API requests.
+  - name: num_instances
+    display_name: '# eval'
+    description: Number of evaluation instances.
+  - name: num_train_instances
+    display_name: '# train'
+    description: Number of training instances (e.g., in-context examples).
+  - name: prompt_truncated
+    display_name: truncated
+    description: Fraction of instances where the prompt itself was truncated (implies that there were no in-context examples).
+  - name: finish_reason_length
+    display_name: finish b/c length
+    description: Fraction of instances where the the output was terminated because of the max tokens limit.
+  - name: finish_reason_stop
+    display_name: finish b/c stop
+    description: Fraction of instances where the the output was terminated because of the stop sequences.
+  - name: finish_reason_endoftext
+    display_name: finish b/c endoftext
+    description: Fraction of instances where the the output was terminated because the end of text token was generated.
+  - name: finish_reason_unknown
+    display_name: finish b/c unknown
+    description: Fraction of instances where the the output was terminated for unknown reasons.
+  - name: num_completions
+    display_name: '# completions'
+    description: Number of completions.
+  - name: predicted_index
+    display_name: Predicted index
+    description: Integer index of the reference (0, 1, ...) that was predicted by the model (for multiple-choice).
+  # Accuracy metrics:
+  - name: exact_match
+    display_name: Exact match
+    short_display_name: EM
+    description: Fraction of instances that the predicted output matches a correct reference exactly.
+    lower_is_better: false
+  - name: quasi_exact_match
+    display_name: Quasi-exact match
+    short_display_name: EM
+    description: Fraction of instances that the predicted output matches a correct reference up to light processing.
+    lower_is_better: false
+  - name: prefix_exact_match
+    display_name: Prefix exact match
+    short_display_name: PEM
+    description: Fraction of instances that the predicted output matches the prefix of a correct reference exactly.
+    lower_is_better: false
+  - name: quasi_prefix_exact_match
+    # TODO: should call this prefix_quasi_exact_match
+    display_name: Prefix quasi-exact match
+    short_display_name: PEM
+    description: Fraction of instances that the predicted output matches the prefix of a correct reference up to light processing.
+    lower_is_better: false
+############################################################
+perturbations: []
+############################################################
+metric_groups:
+  - name: accuracy
+    display_name: Accuracy
+    metrics:
+      - name: ${main_name}
+        split: ${main_split}
+  - name: efficiency
+    display_name: Efficiency
+    metrics:
+    - name: inference_runtime
+      split: ${main_split}
+  - name: general_information
+    display_name: General information
+    hide_win_rates: true
+    metrics:
+    - name: num_instances
+      split: ${main_split}
+    - name: num_train_instances
+      split: ${main_split}
+    - name: prompt_truncated
+      split: ${main_split}
+    - name: num_prompt_tokens
+      split: ${main_split}
+    - name: num_output_tokens
+      split: ${main_split}
+############################################################
+run_groups:
+  - name: core_scenarios
+    display_name: Core Scenarios
+    description: Core Scenarios
+    category: All scenarios
+    subgroups:
+      - tweetsentbr
+  - name: tweetsentbr
+    display_name: TweetSentBR
+    description: TweetSentBR
+    metric_groups:
+      - accuracy
+      - efficiency
+      - general_information
+    environment:
+      main_name: exact_match
+      main_split: test
+    taxonomy:
+      task: "text classification"
+      what: "tweets with sentiments"
+      who: "?"
+      when: "2018"
+      language: Portuguese