crfm-helm 0.5.2__py3-none-any.whl → 0.5.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.



Files changed (184)
  1. {crfm_helm-0.5.2.dist-info → crfm_helm-0.5.3.dist-info}/METADATA +29 -55
  2. {crfm_helm-0.5.2.dist-info → crfm_helm-0.5.3.dist-info}/RECORD +146 -134
  3. {crfm_helm-0.5.2.dist-info → crfm_helm-0.5.3.dist-info}/WHEEL +1 -1
  4. helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +12 -5
  5. helm/benchmark/adaptation/adapters/test_generation_adapter.py +12 -12
  6. helm/benchmark/adaptation/adapters/test_language_modeling_adapter.py +8 -8
  7. helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py +77 -9
  8. helm/benchmark/adaptation/common_adapter_specs.py +2 -0
  9. helm/benchmark/annotation/anthropic_red_team_annotator.py +70 -0
  10. helm/benchmark/annotation/call_center_annotator.py +247 -0
  11. helm/benchmark/annotation/financebench_annotator.py +79 -0
  12. helm/benchmark/annotation/harm_bench_annotator.py +68 -0
  13. helm/benchmark/annotation/{image2structure → image2struct}/latex_compiler_annotator.py +2 -2
  14. helm/benchmark/annotation/{image2structure → image2struct}/lilypond_compiler_annotator.py +5 -3
  15. helm/benchmark/annotation/{image2structure → image2struct}/webpage_compiler_annotator.py +5 -5
  16. helm/benchmark/annotation/live_qa_annotator.py +32 -45
  17. helm/benchmark/annotation/medication_qa_annotator.py +31 -44
  18. helm/benchmark/annotation/model_as_judge.py +45 -0
  19. helm/benchmark/annotation/simple_safety_tests_annotator.py +64 -0
  20. helm/benchmark/annotation/xstest_annotator.py +110 -0
  21. helm/benchmark/metrics/annotation_metrics.py +108 -0
  22. helm/benchmark/metrics/bhasa_metrics.py +188 -0
  23. helm/benchmark/metrics/bhasa_metrics_specs.py +10 -0
  24. helm/benchmark/metrics/code_metrics_helper.py +11 -1
  25. helm/benchmark/metrics/safety_metrics.py +57 -0
  26. helm/benchmark/metrics/summac/model_summac.py +3 -3
  27. helm/benchmark/metrics/tokens/test_ai21_token_cost_estimator.py +2 -2
  28. helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py +4 -4
  29. helm/benchmark/metrics/vision_language/image_metrics.py +1 -1
  30. helm/benchmark/metrics/vision_language/image_utils.py +1 -1
  31. helm/benchmark/model_metadata_registry.py +3 -3
  32. helm/benchmark/presentation/test_run_entry.py +1 -0
  33. helm/benchmark/run.py +15 -0
  34. helm/benchmark/run_expander.py +56 -30
  35. helm/benchmark/run_specs/bhasa_run_specs.py +638 -0
  36. helm/benchmark/run_specs/call_center_run_specs.py +152 -0
  37. helm/benchmark/run_specs/decodingtrust_run_specs.py +8 -8
  38. helm/benchmark/run_specs/experimental_run_specs.py +52 -0
  39. helm/benchmark/run_specs/finance_run_specs.py +78 -1
  40. helm/benchmark/run_specs/safety_run_specs.py +154 -0
  41. helm/benchmark/run_specs/vlm_run_specs.py +92 -21
  42. helm/benchmark/scenarios/anthropic_red_team_scenario.py +71 -0
  43. helm/benchmark/scenarios/banking77_scenario.py +51 -0
  44. helm/benchmark/scenarios/bhasa_scenario.py +1798 -0
  45. helm/benchmark/scenarios/call_center_scenario.py +84 -0
  46. helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +2 -1
  47. helm/benchmark/scenarios/ewok_scenario.py +116 -0
  48. helm/benchmark/scenarios/fin_qa_scenario.py +2 -0
  49. helm/benchmark/scenarios/financebench_scenario.py +53 -0
  50. helm/benchmark/scenarios/harm_bench_scenario.py +59 -0
  51. helm/benchmark/scenarios/scenario.py +1 -1
  52. helm/benchmark/scenarios/simple_safety_tests_scenario.py +33 -0
  53. helm/benchmark/scenarios/test_commonsense_scenario.py +21 -0
  54. helm/benchmark/scenarios/test_ewok_scenario.py +25 -0
  55. helm/benchmark/scenarios/test_financebench_scenario.py +26 -0
  56. helm/benchmark/scenarios/test_gsm_scenario.py +31 -0
  57. helm/benchmark/scenarios/test_legalbench_scenario.py +30 -0
  58. helm/benchmark/scenarios/test_math_scenario.py +2 -8
  59. helm/benchmark/scenarios/test_med_qa_scenario.py +30 -0
  60. helm/benchmark/scenarios/test_mmlu_scenario.py +33 -0
  61. helm/benchmark/scenarios/test_narrativeqa_scenario.py +73 -0
  62. helm/benchmark/scenarios/thai_exam_scenario.py +4 -4
  63. helm/benchmark/scenarios/vision_language/a_okvqa_scenario.py +1 -1
  64. helm/benchmark/scenarios/vision_language/bingo_scenario.py +2 -2
  65. helm/benchmark/scenarios/vision_language/crossmodal_3600_scenario.py +2 -1
  66. helm/benchmark/scenarios/vision_language/exams_v_scenario.py +104 -0
  67. helm/benchmark/scenarios/vision_language/fair_face_scenario.py +136 -0
  68. helm/benchmark/scenarios/vision_language/flickr30k_scenario.py +1 -1
  69. helm/benchmark/scenarios/vision_language/gqa_scenario.py +2 -2
  70. helm/benchmark/scenarios/vision_language/hateful_memes_scenario.py +1 -1
  71. helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/chart2csv_scenario.py +1 -1
  72. helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/latex_scenario.py +3 -3
  73. helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/musicsheet_scenario.py +1 -1
  74. helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/utils_latex.py +31 -39
  75. helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/webpage/driver.py +1 -1
  76. helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/webpage/utils.py +1 -1
  77. helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/webpage_scenario.py +41 -12
  78. helm/benchmark/scenarios/vision_language/math_vista_scenario.py +1 -1
  79. helm/benchmark/scenarios/vision_language/mementos_scenario.py +3 -3
  80. helm/benchmark/scenarios/vision_language/mm_safety_bench_scenario.py +2 -2
  81. helm/benchmark/scenarios/vision_language/mme_scenario.py +21 -18
  82. helm/benchmark/scenarios/vision_language/mmmu_scenario.py +1 -1
  83. helm/benchmark/scenarios/vision_language/pairs_scenario.py +1 -1
  84. helm/benchmark/scenarios/vision_language/pope_scenario.py +2 -1
  85. helm/benchmark/scenarios/vision_language/real_world_qa_scenario.py +57 -0
  86. helm/benchmark/scenarios/vision_language/seed_bench_scenario.py +7 -5
  87. helm/benchmark/scenarios/vision_language/unicorn_scenario.py +2 -2
  88. helm/benchmark/scenarios/vision_language/vibe_eval_scenario.py +6 -3
  89. helm/benchmark/scenarios/vision_language/viz_wiz_scenario.py +1 -1
  90. helm/benchmark/scenarios/vision_language/vqa_scenario.py +3 -1
  91. helm/benchmark/scenarios/xstest_scenario.py +35 -0
  92. helm/benchmark/server.py +1 -6
  93. helm/benchmark/static/schema_air_bench.yaml +750 -750
  94. helm/benchmark/static/schema_bhasa.yaml +709 -0
  95. helm/benchmark/static/schema_call_center.yaml +232 -0
  96. helm/benchmark/static/schema_cleva.yaml +768 -0
  97. helm/benchmark/static/schema_decodingtrust.yaml +444 -0
  98. helm/benchmark/static/schema_ewok.yaml +367 -0
  99. helm/benchmark/static/schema_finance.yaml +55 -9
  100. helm/benchmark/static/{schema_image2structure.yaml → schema_image2struct.yaml} +231 -90
  101. helm/benchmark/static/schema_safety.yaml +247 -0
  102. helm/benchmark/static/schema_tables.yaml +124 -7
  103. helm/benchmark/static/schema_thai.yaml +21 -0
  104. helm/benchmark/static/schema_vhelm.yaml +96 -91
  105. helm/benchmark/static_build/assets/accenture-6f97eeda.png +0 -0
  106. helm/benchmark/static_build/assets/aisingapore-6dfc9acf.png +0 -0
  107. helm/benchmark/static_build/assets/cresta-9e22b983.png +0 -0
  108. helm/benchmark/static_build/assets/cuhk-8c5631e9.png +0 -0
  109. helm/benchmark/static_build/assets/index-05c76bb1.css +1 -0
  110. helm/benchmark/static_build/assets/index-58f97dcd.js +10 -0
  111. helm/benchmark/static_build/assets/scb10x-204bd786.png +0 -0
  112. helm/benchmark/static_build/assets/wellsfargo-a86a6c4a.png +0 -0
  113. helm/benchmark/static_build/index.html +2 -2
  114. helm/benchmark/window_services/test_openai_window_service.py +8 -8
  115. helm/clients/ai21_client.py +71 -1
  116. helm/clients/anthropic_client.py +7 -19
  117. helm/clients/huggingface_client.py +38 -37
  118. helm/clients/nvidia_nim_client.py +35 -0
  119. helm/clients/openai_client.py +2 -3
  120. helm/clients/palmyra_client.py +25 -0
  121. helm/clients/perspective_api_client.py +11 -6
  122. helm/clients/test_client.py +4 -6
  123. helm/clients/vision_language/open_flamingo_client.py +1 -2
  124. helm/clients/vision_language/palmyra_vision_client.py +28 -13
  125. helm/common/images_utils.py +6 -0
  126. helm/common/mongo_key_value_store.py +2 -1
  127. helm/common/request.py +16 -0
  128. helm/config/model_deployments.yaml +315 -332
  129. helm/config/model_metadata.yaml +384 -110
  130. helm/config/tokenizer_configs.yaml +116 -11
  131. helm/proxy/example_queries.py +14 -21
  132. helm/proxy/services/server_service.py +1 -2
  133. helm/proxy/token_counters/test_auto_token_counter.py +2 -2
  134. helm/tokenizers/ai21_tokenizer.py +51 -59
  135. helm/tokenizers/cohere_tokenizer.py +0 -75
  136. helm/tokenizers/huggingface_tokenizer.py +0 -1
  137. helm/tokenizers/test_ai21_tokenizer.py +48 -0
  138. helm/benchmark/static/benchmarking.css +0 -156
  139. helm/benchmark/static/benchmarking.js +0 -1705
  140. helm/benchmark/static/config.js +0 -3
  141. helm/benchmark/static/general.js +0 -122
  142. helm/benchmark/static/images/crfm-logo.png +0 -0
  143. helm/benchmark/static/images/helm-logo-simple.png +0 -0
  144. helm/benchmark/static/images/helm-logo.png +0 -0
  145. helm/benchmark/static/images/language-model-helm.png +0 -0
  146. helm/benchmark/static/images/organizations/ai21.png +0 -0
  147. helm/benchmark/static/images/organizations/anthropic.png +0 -0
  148. helm/benchmark/static/images/organizations/bigscience.png +0 -0
  149. helm/benchmark/static/images/organizations/cohere.png +0 -0
  150. helm/benchmark/static/images/organizations/eleutherai.png +0 -0
  151. helm/benchmark/static/images/organizations/google.png +0 -0
  152. helm/benchmark/static/images/organizations/meta.png +0 -0
  153. helm/benchmark/static/images/organizations/microsoft.png +0 -0
  154. helm/benchmark/static/images/organizations/nvidia.png +0 -0
  155. helm/benchmark/static/images/organizations/openai.png +0 -0
  156. helm/benchmark/static/images/organizations/together.png +0 -0
  157. helm/benchmark/static/images/organizations/tsinghua-keg.png +0 -0
  158. helm/benchmark/static/images/organizations/yandex.png +0 -0
  159. helm/benchmark/static/images/scenarios-by-metrics.png +0 -0
  160. helm/benchmark/static/images/taxonomy-scenarios.png +0 -0
  161. helm/benchmark/static/index.html +0 -68
  162. helm/benchmark/static/info-icon.png +0 -0
  163. helm/benchmark/static/json-urls.js +0 -69
  164. helm/benchmark/static/plot-captions.js +0 -27
  165. helm/benchmark/static/utils.js +0 -285
  166. helm/benchmark/static_build/assets/index-30dbceba.js +0 -10
  167. helm/benchmark/static_build/assets/index-66b02d40.css +0 -1
  168. helm/benchmark/window_services/ai21_window_service.py +0 -247
  169. helm/benchmark/window_services/cohere_window_service.py +0 -101
  170. helm/benchmark/window_services/test_ai21_window_service.py +0 -163
  171. helm/benchmark/window_services/test_cohere_window_service.py +0 -75
  172. helm/benchmark/window_services/test_cohere_window_service_utils.py +0 -8328
  173. helm/benchmark/window_services/test_ice_window_service.py +0 -327
  174. helm/tokenizers/ice_tokenizer.py +0 -30
  175. helm/tokenizers/test_ice_tokenizer.py +0 -57
  176. {crfm_helm-0.5.2.dist-info → crfm_helm-0.5.3.dist-info}/LICENSE +0 -0
  177. {crfm_helm-0.5.2.dist-info → crfm_helm-0.5.3.dist-info}/entry_points.txt +0 -0
  178. {crfm_helm-0.5.2.dist-info → crfm_helm-0.5.3.dist-info}/top_level.txt +0 -0
  179. /helm/benchmark/annotation/{image2structure → image2struct}/__init__.py +0 -0
  180. /helm/benchmark/annotation/{image2structure → image2struct}/image_compiler_annotator.py +0 -0
  181. /helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/__init__.py +0 -0
  182. /helm/benchmark/scenarios/vision_language/{image2structure/image2structure_scenario.py → image2struct/image2struct_scenario.py} +0 -0
  183. /helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/webpage/__init__.py +0 -0
  184. /helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/webpage/jekyll_server.py +0 -0
helm/benchmark/static/schema_cleva.yaml (new file)
@@ -0,0 +1,768 @@
+---
+############################################################
+metrics:
+  # Infrastructure metrics:
+  - name: num_perplexity_tokens
+    display_name: '# tokens'
+    description: Average number of tokens in the predicted output (for language modeling, the input too).
+  - name: num_bytes
+    display_name: '# bytes'
+    description: Average number of bytes in the predicted output (for language modeling, the input too).
+
+  - name: num_references
+    display_name: '# ref'
+    description: Number of references.
+  - name: num_train_trials
+    display_name: '# trials'
+    description: Number of trials, where in each trial we choose an independent, random set of training instances.
+  - name: estimated_num_tokens_cost
+    display_name: 'cost'
+    description: An estimate of the number of tokens (including prompt and output completions) needed to perform the request.
+  - name: num_prompt_tokens
+    display_name: '# prompt tokens'
+    description: Number of tokens in the prompt.
+  - name: num_prompt_characters
+    display_name: '# prompt chars'
+    description: Number of characters in the prompt.
+  - name: num_completion_tokens
+    display_name: '# completion tokens'
+    description: Actual number of completion tokens (over all completions).
+  - name: num_output_tokens
+    display_name: '# output tokens'
+    description: Actual number of output tokens.
+  - name: max_num_output_tokens
+    display_name: 'Max output tokens'
+    description: Maximum number of output tokens (overestimate since we might stop earlier due to stop sequences).
+  - name: num_requests
+    display_name: '# requests'
+    description: Number of distinct API requests.
+  - name: num_instances
+    display_name: '# eval'
+    description: Number of evaluation instances.
+  - name: num_train_instances
+    display_name: '# train'
+    description: Number of training instances (e.g., in-context examples).
+  - name: prompt_truncated
+    display_name: truncated
+    description: Fraction of instances where the prompt itself was truncated (implies that there were no in-context examples).
+  - name: finish_reason_length
+    display_name: finish b/c length
+    description: Fraction of instances where the output was terminated because of the max tokens limit.
+  - name: finish_reason_stop
+    display_name: finish b/c stop
+    description: Fraction of instances where the output was terminated because of the stop sequences.
+  - name: finish_reason_endoftext
+    display_name: finish b/c endoftext
+    description: Fraction of instances where the output was terminated because the end of text token was generated.
+  - name: finish_reason_unknown
+    display_name: finish b/c unknown
+    description: Fraction of instances where the output was terminated for unknown reasons.
+  - name: num_completions
+    display_name: '# completions'
+    description: Number of completions.
+  - name: predicted_index
+    display_name: Predicted index
+    description: Integer index of the reference (0, 1, ...) that was predicted by the model (for multiple-choice).
+
+  # Accuracy metrics:
+  - name: exact_match
+    display_name: Exact match
+    short_display_name: EM
+    description: Fraction of instances that the predicted output matches a correct reference exactly.
+    lower_is_better: false
+
+  # Calibration metrics:
+  - name: ece_10_bin
+    display_name: 10-bin expected calibration error
+    short_display_name: ECE (10-bin)
+    lower_is_better: true
+    description: The average difference between the model's confidence and accuracy, averaged across 10 bins where each bin contains an equal number of points (only computed for classification tasks). Warning - not reliable for small datasets (e.g., with < 300 examples) because each bin will have very few examples.
+
+  # Classification metrics
+  - name: classification_macro_f1
+    display_name: Macro-F1
+    description: Population-level macro-averaged F1 score.
+    lower_is_better: false
+  - name: classification_micro_f1
+    display_name: Micro-F1
+    description: Population-level micro-averaged F1 score.
+    lower_is_better: false
+
+  # CLEVA (Chinese) metrics:
+  # Accuracy metrics (Chinese)
+  - name: chinese_ibleu
+    display_name: Chinese iBLEU
+    short_display_name: iBLEU (Chinese)
+    description: A special BLEU score [(Sun and Zhou, 2008)](https://aclanthology.org/P12-2008.pdf) that balances the lexical similarity between references and hypotheses as well as the lexical diversity between raw inputs and hypotheses.
+    lower_is_better: false
+  - name: cleva_top1_accuracy
+    display_name: Chinese Top-1 Accuracy
+    short_display_name: Acc@Top-1 (Chinese)
+    description: A special accuracy [(Patel and Pavlick, 2022)](https://openreview.net/pdf?id=gJcEM8sxHK) that gives perfect precision as long as a substring of the answer appears in the most confident model prediction.
+    lower_is_better: false
+  - name: cleva_machine_translation_bleu
+    display_name: BLEU
+    short_display_name: BLEU
+    description: BLEU score based on [Post, (2018)](https://aclanthology.org/W18-6319/).
+    lower_is_better: false
+  - name: chinese_rouge_2
+    display_name: Chinese ROUGE-2 score
+    short_display_name: ROUGE-2 (Chinese)
+    description: ROUGE-2 score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on a Chinese tokenizer that segments Chinese strings by character.
+    lower_is_better: false
+  - name: chinese_bleu_1
+    display_name: Chinese BLEU-1 score
+    short_display_name: BLEU-1 (Chinese)
+    description: BLEU-1 score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on a Chinese tokenizer that segments Chinese strings by character.
+    lower_is_better: false
+  - name: cleva_math_result_match
+    display_name: CLEVA Math Exact Match
+    short_display_name: EM (Math)
+    description: Exact match that considers only the last math expression (numbers and fractions) in the model's prediction.
+    lower_is_better: false
+  # CLEVA Chinese bias, copyright and toxicity metrics share the same name as the original HELM metrics
+
+
+
+############################################################
+perturbations: []
+
+############################################################
+metric_groups:
+  - name: accuracy
+    display_name: Accuracy
+    metrics:
+      - name: ${main_name}
+        split: ${main_split}
+
+  - name: calibration
+    display_name: Calibration
+    metrics:
+      - name: ece_10_bin
+        split: ${main_split}
+
+  - name: efficiency
+    display_name: Efficiency
+    metrics:
+      - name: inference_runtime
+        split: ${main_split}
+
+  - name: general_information
+    display_name: General information
+    hide_win_rates: true
+    metrics:
+      - name: num_instances
+        split: ${main_split}
+      - name: num_train_instances
+        split: ${main_split}
+      - name: prompt_truncated
+        split: ${main_split}
+      - name: num_prompt_tokens
+        split: ${main_split}
+      - name: num_output_tokens
+        split: ${main_split}
+
+  - name: classification_metrics
+    display_name: Classification metrics
+    metrics:
+      - name: classification_macro_f1
+        split: ${main_split}
+      - name: classification_micro_f1
+        split: ${main_split}
+
+############################################################
+
+run_groups:
+  ### Chinese
+  - name: cleva_scenarios
+    display_name: Scenarios
+    description: Scenarios for evaluating Chinese language models
+    category: Targeted evaluations
+    subgroups:
+      # - cleva_bias
+      - cleva_classical_chinese_understanding
+      # - cleva_closed_book_question_answering
+      # - cleva_code_synthesis
+      - cleva_commonsense_reasoning
+      # - cleva_conceptual_generalization
+      # - cleva_copyright
+      - cleva_coreference_resolution
+      - cleva_cultural_knowledge
+      # - cleva_data_to_text_generation
+      # - cleva_deductive_reasoning
+      # - cleva_dialogue_generation
+      # - cleva_fact_checking
+      # - cleva_inductive_reasoning
+      # - cleva_instruction_following
+      # - cleva_intent_understanding
+      # - cleva_language_modeling
+      - cleva_mathematical_calculation
+      - cleva_mathematical_reasoning
+      # - cleva_opinion_mining
+      - cleva_paraphrase_generation
+      - cleva_paraphrase_identification
+      - cleva_pinyin_transliteration
+      - cleva_reading_comprehension
+      # - cleva_reasoning_primitive
+      - cleva_sentiment_analysis
+      # - cleva_subject_knowledge
+      - cleva_summarization
+      - cleva_text_classification
+      - cleva_toxicity_detection
+      - cleva_translation
+
+  ## CLEVA (Chinese) Scenarios
+  # Applications
+  # - name: cleva_closed_book_question_answering
+  #   display_name: Closed book question answering
+  #   description: Closed-book question answering task comprises three subtasks. One is for the medical domain, another for open-domain, and the last measures if a model generates truthful answers.
+  #   metric_groups:
+  #     - accuracy
+  #     - efficiency
+  #     - general_information
+  #   environment:
+  #     main_name: exact_match
+  #     main_split: test
+  #   taxonomy:
+  #     task: question answering
+  #     what: medical, open-domain, or truthful qa
+  #     who: n/a
+  #     when: 2022 or before
+  #     language: Chinese
+
+  - name: cleva_summarization
+    display_name: Summarization
+    description: "Summarize a dialogue between a customer representative and a customer."
+    metric_groups:
+      - accuracy
+      - general_information
+      - efficiency
+    environment:
+      main_name: chinese_rouge_2
+      main_split: test
+    taxonomy:
+      task: summarization
+      what: e-commerce dialogues
+      who: customers and representatives
+      when: 2021 or before
+      language: Chinese
+
+  - name: cleva_text_classification
+    display_name: Text classification
+    description: This scenario has two subtasks. Classify if an utterance is humorous and identify news topic based on its title.
+    metric_groups:
+      - accuracy
+      - calibration
+      - efficiency
+      - general_information
+    environment:
+      main_name: exact_match
+      main_split: test
+    taxonomy:
+      task: text classification
+      what: news or chitchat
+      who: n/a
+      when: 2010s
+      language: Chinese
+
+  - name: cleva_translation
+    display_name: Translation
+    description: Scenario for measuring the translation quality between Chinese and English.
+    metric_groups:
+      - accuracy
+      - general_information
+      - efficiency
+    environment:
+      main_name: cleva_machine_translation_bleu
+      main_split: test
+    taxonomy:
+      task: translation
+      what: news
+      who: n/a
+      when: 2022 or before
+      language: Chinese, English
+
+  # - name: cleva_data_to_text_generation
+  #   display_name: Data to text generation
+  #   description: "Generate a product description based on structured data containing various product properties."
+  #   metric_groups:
+  #     - accuracy
+  #     - general_information
+  #     - efficiency
+  #   environment:
+  #     main_name: chinese_bleu_1
+  #     main_split: test
+  #   taxonomy:
+  #     task: data-to-text generation
+  #     what: product description
+  #     who: n/a
+  #     when: 2010s
+  #     language: Chinese
+
+  # - name: cleva_dialogue_generation
+  #   display_name: Dialogue generation
+  #   description: "Task-oriented dialogue between a user and a system."
+  #   metric_groups:
+  #     - accuracy
+  #     - general_information
+  #     - efficiency
+  #   environment:
+  #     main_name: chinese_bleu_1
+  #     main_split: test
+  #   taxonomy:
+  #     task: dialogue generation
+  #     what: task-oriented dialogue on hotel, restaurant, attraction, metro, and taxi domain
+  #     who: user and assistant
+  #     when: 2020 or before
+  #     language: Chinese
+
+  # - name: cleva_opinion_mining
+  #   display_name: Opinion mining
+  #   description: "Extract the target of an opinion."
+  #   metric_groups:
+  #     - accuracy
+  #     - efficiency
+  #     - general_information
+  #   environment:
+  #     main_name: exact_match
+  #     main_split: test
+  #   taxonomy:
+  #     task: opinion target extraction
+  #     what: n/a
+  #     who: n/a
+  #     when: 2010s
+  #     language: Chinese
+
+  - name: cleva_paraphrase_generation
+    display_name: Paraphrase generation
+    description: Generate a paraphrase of a given sentence.
+    metric_groups:
+      - accuracy
+      - general_information
+      - efficiency
+    environment:
+      main_name: chinese_ibleu
+      main_split: test
+    taxonomy:
+      task: paraphrase generation
+      what: n/a
+      who: n/a
+      when: 2010s
+      language: Chinese
+
+  - name: cleva_paraphrase_identification
+    display_name: Paraphrase identification
+    description: Identify if two sentences, from a dialogue or from the finance domain, share the same meaning.
+    metric_groups:
+      - accuracy
+      - calibration
+      - efficiency
+      - general_information
+    environment:
+      main_name: exact_match
+      main_split: test
+    taxonomy:
+      task: paraphrase identification
+      what: financial questions or chitchat
+      who: n/a
+      when: 2020 or before
+      language: Chinese
+
+  - name: cleva_reading_comprehension
+    display_name: Reading comprehension
+    description: Answer a multiple-choice question based on a given paragraph.
+    metric_groups:
+      - accuracy
+      - calibration
+      - efficiency
+      - general_information
+    environment:
+      main_name: exact_match
+      main_split: test
+    taxonomy:
+      task: question answering
+      what: n/a
+      who: n/a
+      when: 2019 or before
+      language: Chinese
+
+  - name: cleva_sentiment_analysis
+    display_name: Sentiment analysis
+    description: Chinese sentiment analysis for product reviews.
+    metric_groups:
+      - accuracy
+      - calibration
+      - efficiency
+      - general_information
+    environment:
+      main_name: exact_match
+      main_split: test
+    taxonomy:
+      task: sentiment analysis
+      what: product reviews
+      who: customers
+      when: 2021 or before
+      language: Chinese
+
+  # Language
+  # - name: cleva_language_modeling
+  #   display_name: Language modeling
+  #   description: Scenario for measuring language model performance across various domains (wikipedia and news).
+  #   metric_groups:
+  #     - accuracy
+  #     - calibration
+  #     - efficiency
+  #     - general_information
+  #   environment:
+  #     main_name: bits_per_byte
+  #     main_split: test
+  #   taxonomy:
+  #     task: language modeling
+  #     what: Wikipedia and news
+  #     who: n/a
+  #     when: 2010s
+  #     language: Chinese
+
+  - name: cleva_pinyin_transliteration
+    display_name: Pinyin transliteration
+    description: Scenario that asks the model to translate between Chinese and Pinyin.
+    metric_groups:
+      - accuracy
+      - general_information
+      - efficiency
+    environment:
+      main_name: chinese_bleu_1
+      main_split: test
+    taxonomy:
+      task: pinyin transliteration
+      what: n/a
+      who: automatically generated by algorithm
+      when: '2023'
+      language: Chinese, Pinyin
+
+  - name: cleva_classical_chinese_understanding
+    display_name: Classical Chinese understanding
+    description: Scenario for evaluating the understanding of classical Chinese by selecting the appropriate classical Chinese translation for a given modern Chinese sentence.
+    metric_groups:
+      - accuracy
+      - calibration
+      - efficiency
+      - general_information
+    environment:
+      main_name: exact_match
+      main_split: test
+    taxonomy:
+      task: translation
+      what: n/a
+      who: n/a
+      when: 2021 or before
+      language: Classical Chinese
+
+  - name: cleva_coreference_resolution
+    display_name: Coreference resolution
+    description: Scenario for testing models on solving coreference resolution problems (the winograd schema challenge).
+    metric_groups:
+      - accuracy
+      - calibration
+      - efficiency
+      - general_information
+    environment:
+      main_name: exact_match
+      main_split: test
+    taxonomy:
+      task: multiple-choice question answering
+      what: contemporary Chinese literary works
+      who: n/a
+      when: 2020 or before
+      language: Chinese
+
+  # - name: cleva_intent_understanding
+  #   display_name: Intent understanding
+  #   description: Tests whether the model could capture the writing intention of the authors after reading an article.
+  #   metric_groups:
+  #     - accuracy
+  #     - calibration
+  #     - efficiency
+  #     - general_information
+  #   environment:
+  #     main_name: exact_match
+  #     main_split: test
+  #   taxonomy:
+  #     task: multiple-choice question answering
+  #     what: exam
+  #     who: n/a
+  #     when: 1990-2022
+  #     language: Chinese
+
+  # Knowledge
+  # - name: cleva_subject_knowledge
+  #   display_name: Subject knowledge
+  #   description: Scenario inspired by [Petroni et al. (2019)](https://aclanthology.org/D19-1250/) to extensively test factual knowledge in Chinese. It contains 13 subjects and a general domain.
+  #   metric_groups:
+  #     - accuracy
+  #     - calibration
+  #     - efficiency
+  #     - general_information
+  #   environment:
+  #     main_name: exact_match
+  #     main_split: test
+  #   taxonomy:
+  #     task: knowledge base completion
+  #     what: entity-relation-entity triples in natural language form
+  #     who: automatically generated from templates
+  #     when: 2022 or before
+  #     language: structured Chinese
+
+  - name: cleva_cultural_knowledge
+    display_name: Cultural knowledge
+    description: "Scenario for evaluating models' understanding of Chinese culture. It has a Chinese-idiom-focused subtask."
+    metric_groups:
+      - accuracy
+      - calibration
+      - efficiency
+      - general_information
+    environment:
+      main_name: exact_match
+      main_split: test
+    taxonomy:
+      task: multiple-choice question answering
+      what: Various passages containing Chinese idioms
+      who: n/a
+      when: 2010s
+      language: Chinese
+
+  # Reasoning
+  # - name: cleva_reasoning_primitive
+  #   display_name: Reasoning primitive
+  #   description: Scenario focused on primitive reasoning, including dyck language continuation, variable substitution, pattern induction, and pattern matching.
+  #   metric_groups:
+  #     - accuracy
+  #     - calibration
+  #     - efficiency
+  #     - general_information
+  #   environment:
+  #     main_name: exact_match
+  #     main_split: test
+  #   taxonomy:
+  #     task: next-word prediction
+  #     what: n/a
+  #     who: automatically generated from templates
+  #     when: '2023'
+  #     language: synthetic
+
+  # - name: cleva_deductive_reasoning
+  #   display_name: Deductive reasoning
+  #   description: "Scenario that gauges model's ability to reason deductive arguments. It includes a modus tollens subtask."
+  #   metric_groups:
+  #     - accuracy
+  #     - calibration
+  #     - efficiency
+  #     - general_information
+  #   environment:
+  #     main_name: exact_match
+  #     main_split: test
+  #   taxonomy:
+  #     task: multiple-choice question answering
+  #     what: natural language questions
+  #     who: n/a
+  #     when: '2023'
+  #     language: structured Chinese
+
+  # - name: cleva_inductive_reasoning
+  #   display_name: Inductive reasoning
+  #   description: "Scenario that tests models' ability to conclude rules from demonstrations and apply them to unseen test instances."
+  #   metric_groups:
+  #     - accuracy
+  #     - calibration
+  #     - efficiency
+  #     - general_information
+  #   environment:
+  #     main_name: exact_match
+  #     main_split: test
+  #   taxonomy:
+  #     task: next-word prediction
+  #     what: n/a
+  #     who: automatically generated by algorithm
+  #     when: '2023'
+  #     language: synthetic
+
+  # - name: cleva_code_synthesis
+  #   display_name: Code synthesis
+  #   description: Scenario for measuring functional correctness for synthesizing programs from Chinese docstrings.
+  #   metric_groups:
+  #     - accuracy
+  #     - calibration
+  #     - efficiency
+  #     - general_information
+  #   environment:
+  #     main_name: exact_match
+  #     main_split: test
+  #   taxonomy:
+  #     task: next-word prediction
+  #     what: n/a
+  #     who: n/a
+  #     when: '2023'
+  #     language: synthetic
+
+  - name: cleva_commonsense_reasoning
+    display_name: Commonsense reasoning
+    description: "Scenario that tests models' commonsense reasoning ability. There are two subtasks: textual entailment and commonsense question answering."
+    metric_groups:
+      - accuracy
+      - calibration
+      - efficiency
+      - general_information
+    environment:
+      main_name: exact_match
+      main_split: test
+    taxonomy:
+      task: multiple-choice question answering
+      what: n/a
+      who: n/a
+      when: 2023 or before
+      language: Chinese
+
+  - name: cleva_mathematical_reasoning
+    display_name: Mathematical reasoning
+    description: "Scenario that tests models' mathematical reasoning ability with chain-of-thought style reasoning. It contains a math word problem solving subtask."
+    metric_groups:
+      - accuracy
+      - general_information
+      - efficiency
+    environment:
+      main_name: cleva_math_result_match
+      main_split: test
+    taxonomy:
+      task: next-word prediction
+      what: exam
+      who: n/a
+      when: 2010s
+      language: Chinese
+
+  # - name: cleva_conceptual_generalization
+  #   display_name: Conceptual generalization
+  #   description: Scenario that assesses whether models could generalize physical relations to a synthetic grid world.
+  #   metric_groups:
+  #     - calibration
+  #     - efficiency
+  #     - accuracy
+  #     - general_information
+  #   environment:
+  #     main_name: cleva_top1_accuracy
+  #     main_split: test
+  #   taxonomy:
+  #     task: next-word prediction
+  #     what: n/a
+  #     who: automatically generated by algorithm
+  #     when: '2023'
+  #     language: synthetic
+
+  # Harms
+  - name: cleva_toxicity_detection
+    display_name: Toxicity detection
+    description: Ask models about the offensiveness of the given text.
+    metric_groups:
+      - accuracy
+      - calibration
+      - efficiency
+      - general_information
+    environment:
+      main_name: exact_match
+      main_split: test
+    taxonomy:
+      task: toxicity classification
+      what: text from Chinese social media
+      who: web users
+      when: 2022 or before
+      language: Chinese
+
+  # - name: cleva_bias
+  #   display_name: Bias
+  #   description: Scenario that gauges bias of four demographic categories in dialogues, including race, gender, region, and occupation.
+  #   metric_groups:
+  #     - accuracy
+  #     - calibration
+  #     - efficiency
+  #     - general_information
+  #     - classification_metrics
+  #   environment:
+  #     main_name: exact_match
+  #     main_split: test
+  #   taxonomy:
+  #     task: multiple-choice question answering
+  #     what: short Chinese dialogues from social media
+  #     who: web users
+  #     when: 2022 or before
+  #     language: Chinese
+
+  # - name: cleva_copyright
+  #   display_name: Copyright
+  #   description: Scenario that measures copyright and memorization behavior for Chinese books and code, based off of [Carlini et al. (2021)](https://www.usenix.org/biblio-11958).
+  #   metric_groups:
+  #     # - copyright_metrics
+  #     - general_information
+  #     - efficiency
+  #   environment:
+  #     main_split: test
+  #   taxonomy:
+  #     task: next-word prediction
+  #     what: books and code
+  #     who: n/a
+  #     when: 2023 or before
+  #     language: Chinese
+
+  # - name: cleva_fact_checking
+  #   display_name: Fact checking
+  #   description: Scenario that lets models identify whether the given fact is true to test their factuality.
+  #   metric_groups:
+  #     - accuracy
+  #     - calibration
+  #     - efficiency
+  #     - general_information
+  #     - classification_metrics
+  #   environment:
+  #     main_name: exact_match
+  #     main_split: test
+  #   taxonomy:
+  #     task: multiple-choice question answering
+  #     what: factual statements in natural language form
+  #     who: n/a
+  #     when: 2022 or before
+  #     language: Chinese
+
+  # Others
+  # - name: cleva_instruction_following
+  #   display_name: Instruction following
+  #   description: "Scenario that examines whether models could follow human instructions, mainly uncommon ones. It contains two subtasks: 'redefine' and 'pattern_matching_suppression'."
+  #   metric_groups:
+  #     - accuracy
+  #     - calibration
+  #     - efficiency
+  #     - general_information
+  #   environment:
+  #     main_name: exact_match
+  #     main_split: test
+  #   taxonomy:
+  #     task: multiple-choice question answering
+  #     what: natural language questions
+  #     who: automatically generated from templates
+  #     when: '2023'
+  #     language: synthetic
+
+  - name: cleva_mathematical_calculation
+    display_name: Mathematical calculation
+    description: "Scenario that evaluates the calculation ability of models. It has four subtasks: three-digit addition, three-digit subtraction, two-digit multiplication, and significant figures."
+    metric_groups:
+      - accuracy
+      - calibration
+      - efficiency
+      - general_information
+    environment:
+      main_name: exact_match
+      main_split: test
+    taxonomy:
+      task: next-word prediction
+      what: natural language math questions or pure math expressions
+      who: automatically generated from templates
+      when: '2023'
+      language: synthetic
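
Note on reading the schema above: each metric_groups entry uses the ${main_name} and ${main_split} placeholders, which are filled in per run group from that group's environment block (for example, cleva_summarization sets main_name: chinese_rouge_2 and main_split: test). A minimal Python sketch of that substitution, using the standard library's string.Template purely for illustration (this is not HELM's actual frontend code):

from string import Template

# Placeholder metric entry, as written in the `accuracy` metric group above.
metric_entry = {"name": "${main_name}", "split": "${main_split}"}

# `environment` block of the `cleva_summarization` run group above.
environment = {"main_name": "chinese_rouge_2", "main_split": "test"}

# Resolve the placeholders against the run group's environment.
resolved = {key: Template(value).substitute(environment) for key, value in metric_entry.items()}
print(resolved)  # {'name': 'chinese_rouge_2', 'split': 'test'}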