crfm-helm 0.5.2__py3-none-any.whl → 0.5.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.


Files changed (184)
  1. {crfm_helm-0.5.2.dist-info → crfm_helm-0.5.3.dist-info}/METADATA +29 -55
  2. {crfm_helm-0.5.2.dist-info → crfm_helm-0.5.3.dist-info}/RECORD +146 -134
  3. {crfm_helm-0.5.2.dist-info → crfm_helm-0.5.3.dist-info}/WHEEL +1 -1
  4. helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +12 -5
  5. helm/benchmark/adaptation/adapters/test_generation_adapter.py +12 -12
  6. helm/benchmark/adaptation/adapters/test_language_modeling_adapter.py +8 -8
  7. helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py +77 -9
  8. helm/benchmark/adaptation/common_adapter_specs.py +2 -0
  9. helm/benchmark/annotation/anthropic_red_team_annotator.py +70 -0
  10. helm/benchmark/annotation/call_center_annotator.py +247 -0
  11. helm/benchmark/annotation/financebench_annotator.py +79 -0
  12. helm/benchmark/annotation/harm_bench_annotator.py +68 -0
  13. helm/benchmark/annotation/{image2structure → image2struct}/latex_compiler_annotator.py +2 -2
  14. helm/benchmark/annotation/{image2structure → image2struct}/lilypond_compiler_annotator.py +5 -3
  15. helm/benchmark/annotation/{image2structure → image2struct}/webpage_compiler_annotator.py +5 -5
  16. helm/benchmark/annotation/live_qa_annotator.py +32 -45
  17. helm/benchmark/annotation/medication_qa_annotator.py +31 -44
  18. helm/benchmark/annotation/model_as_judge.py +45 -0
  19. helm/benchmark/annotation/simple_safety_tests_annotator.py +64 -0
  20. helm/benchmark/annotation/xstest_annotator.py +110 -0
  21. helm/benchmark/metrics/annotation_metrics.py +108 -0
  22. helm/benchmark/metrics/bhasa_metrics.py +188 -0
  23. helm/benchmark/metrics/bhasa_metrics_specs.py +10 -0
  24. helm/benchmark/metrics/code_metrics_helper.py +11 -1
  25. helm/benchmark/metrics/safety_metrics.py +57 -0
  26. helm/benchmark/metrics/summac/model_summac.py +3 -3
  27. helm/benchmark/metrics/tokens/test_ai21_token_cost_estimator.py +2 -2
  28. helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py +4 -4
  29. helm/benchmark/metrics/vision_language/image_metrics.py +1 -1
  30. helm/benchmark/metrics/vision_language/image_utils.py +1 -1
  31. helm/benchmark/model_metadata_registry.py +3 -3
  32. helm/benchmark/presentation/test_run_entry.py +1 -0
  33. helm/benchmark/run.py +15 -0
  34. helm/benchmark/run_expander.py +56 -30
  35. helm/benchmark/run_specs/bhasa_run_specs.py +638 -0
  36. helm/benchmark/run_specs/call_center_run_specs.py +152 -0
  37. helm/benchmark/run_specs/decodingtrust_run_specs.py +8 -8
  38. helm/benchmark/run_specs/experimental_run_specs.py +52 -0
  39. helm/benchmark/run_specs/finance_run_specs.py +78 -1
  40. helm/benchmark/run_specs/safety_run_specs.py +154 -0
  41. helm/benchmark/run_specs/vlm_run_specs.py +92 -21
  42. helm/benchmark/scenarios/anthropic_red_team_scenario.py +71 -0
  43. helm/benchmark/scenarios/banking77_scenario.py +51 -0
  44. helm/benchmark/scenarios/bhasa_scenario.py +1798 -0
  45. helm/benchmark/scenarios/call_center_scenario.py +84 -0
  46. helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +2 -1
  47. helm/benchmark/scenarios/ewok_scenario.py +116 -0
  48. helm/benchmark/scenarios/fin_qa_scenario.py +2 -0
  49. helm/benchmark/scenarios/financebench_scenario.py +53 -0
  50. helm/benchmark/scenarios/harm_bench_scenario.py +59 -0
  51. helm/benchmark/scenarios/scenario.py +1 -1
  52. helm/benchmark/scenarios/simple_safety_tests_scenario.py +33 -0
  53. helm/benchmark/scenarios/test_commonsense_scenario.py +21 -0
  54. helm/benchmark/scenarios/test_ewok_scenario.py +25 -0
  55. helm/benchmark/scenarios/test_financebench_scenario.py +26 -0
  56. helm/benchmark/scenarios/test_gsm_scenario.py +31 -0
  57. helm/benchmark/scenarios/test_legalbench_scenario.py +30 -0
  58. helm/benchmark/scenarios/test_math_scenario.py +2 -8
  59. helm/benchmark/scenarios/test_med_qa_scenario.py +30 -0
  60. helm/benchmark/scenarios/test_mmlu_scenario.py +33 -0
  61. helm/benchmark/scenarios/test_narrativeqa_scenario.py +73 -0
  62. helm/benchmark/scenarios/thai_exam_scenario.py +4 -4
  63. helm/benchmark/scenarios/vision_language/a_okvqa_scenario.py +1 -1
  64. helm/benchmark/scenarios/vision_language/bingo_scenario.py +2 -2
  65. helm/benchmark/scenarios/vision_language/crossmodal_3600_scenario.py +2 -1
  66. helm/benchmark/scenarios/vision_language/exams_v_scenario.py +104 -0
  67. helm/benchmark/scenarios/vision_language/fair_face_scenario.py +136 -0
  68. helm/benchmark/scenarios/vision_language/flickr30k_scenario.py +1 -1
  69. helm/benchmark/scenarios/vision_language/gqa_scenario.py +2 -2
  70. helm/benchmark/scenarios/vision_language/hateful_memes_scenario.py +1 -1
  71. helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/chart2csv_scenario.py +1 -1
  72. helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/latex_scenario.py +3 -3
  73. helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/musicsheet_scenario.py +1 -1
  74. helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/utils_latex.py +31 -39
  75. helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/webpage/driver.py +1 -1
  76. helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/webpage/utils.py +1 -1
  77. helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/webpage_scenario.py +41 -12
  78. helm/benchmark/scenarios/vision_language/math_vista_scenario.py +1 -1
  79. helm/benchmark/scenarios/vision_language/mementos_scenario.py +3 -3
  80. helm/benchmark/scenarios/vision_language/mm_safety_bench_scenario.py +2 -2
  81. helm/benchmark/scenarios/vision_language/mme_scenario.py +21 -18
  82. helm/benchmark/scenarios/vision_language/mmmu_scenario.py +1 -1
  83. helm/benchmark/scenarios/vision_language/pairs_scenario.py +1 -1
  84. helm/benchmark/scenarios/vision_language/pope_scenario.py +2 -1
  85. helm/benchmark/scenarios/vision_language/real_world_qa_scenario.py +57 -0
  86. helm/benchmark/scenarios/vision_language/seed_bench_scenario.py +7 -5
  87. helm/benchmark/scenarios/vision_language/unicorn_scenario.py +2 -2
  88. helm/benchmark/scenarios/vision_language/vibe_eval_scenario.py +6 -3
  89. helm/benchmark/scenarios/vision_language/viz_wiz_scenario.py +1 -1
  90. helm/benchmark/scenarios/vision_language/vqa_scenario.py +3 -1
  91. helm/benchmark/scenarios/xstest_scenario.py +35 -0
  92. helm/benchmark/server.py +1 -6
  93. helm/benchmark/static/schema_air_bench.yaml +750 -750
  94. helm/benchmark/static/schema_bhasa.yaml +709 -0
  95. helm/benchmark/static/schema_call_center.yaml +232 -0
  96. helm/benchmark/static/schema_cleva.yaml +768 -0
  97. helm/benchmark/static/schema_decodingtrust.yaml +444 -0
  98. helm/benchmark/static/schema_ewok.yaml +367 -0
  99. helm/benchmark/static/schema_finance.yaml +55 -9
  100. helm/benchmark/static/{schema_image2structure.yaml → schema_image2struct.yaml} +231 -90
  101. helm/benchmark/static/schema_safety.yaml +247 -0
  102. helm/benchmark/static/schema_tables.yaml +124 -7
  103. helm/benchmark/static/schema_thai.yaml +21 -0
  104. helm/benchmark/static/schema_vhelm.yaml +96 -91
  105. helm/benchmark/static_build/assets/accenture-6f97eeda.png +0 -0
  106. helm/benchmark/static_build/assets/aisingapore-6dfc9acf.png +0 -0
  107. helm/benchmark/static_build/assets/cresta-9e22b983.png +0 -0
  108. helm/benchmark/static_build/assets/cuhk-8c5631e9.png +0 -0
  109. helm/benchmark/static_build/assets/index-05c76bb1.css +1 -0
  110. helm/benchmark/static_build/assets/index-58f97dcd.js +10 -0
  111. helm/benchmark/static_build/assets/scb10x-204bd786.png +0 -0
  112. helm/benchmark/static_build/assets/wellsfargo-a86a6c4a.png +0 -0
  113. helm/benchmark/static_build/index.html +2 -2
  114. helm/benchmark/window_services/test_openai_window_service.py +8 -8
  115. helm/clients/ai21_client.py +71 -1
  116. helm/clients/anthropic_client.py +7 -19
  117. helm/clients/huggingface_client.py +38 -37
  118. helm/clients/nvidia_nim_client.py +35 -0
  119. helm/clients/openai_client.py +2 -3
  120. helm/clients/palmyra_client.py +25 -0
  121. helm/clients/perspective_api_client.py +11 -6
  122. helm/clients/test_client.py +4 -6
  123. helm/clients/vision_language/open_flamingo_client.py +1 -2
  124. helm/clients/vision_language/palmyra_vision_client.py +28 -13
  125. helm/common/images_utils.py +6 -0
  126. helm/common/mongo_key_value_store.py +2 -1
  127. helm/common/request.py +16 -0
  128. helm/config/model_deployments.yaml +315 -332
  129. helm/config/model_metadata.yaml +384 -110
  130. helm/config/tokenizer_configs.yaml +116 -11
  131. helm/proxy/example_queries.py +14 -21
  132. helm/proxy/services/server_service.py +1 -2
  133. helm/proxy/token_counters/test_auto_token_counter.py +2 -2
  134. helm/tokenizers/ai21_tokenizer.py +51 -59
  135. helm/tokenizers/cohere_tokenizer.py +0 -75
  136. helm/tokenizers/huggingface_tokenizer.py +0 -1
  137. helm/tokenizers/test_ai21_tokenizer.py +48 -0
  138. helm/benchmark/static/benchmarking.css +0 -156
  139. helm/benchmark/static/benchmarking.js +0 -1705
  140. helm/benchmark/static/config.js +0 -3
  141. helm/benchmark/static/general.js +0 -122
  142. helm/benchmark/static/images/crfm-logo.png +0 -0
  143. helm/benchmark/static/images/helm-logo-simple.png +0 -0
  144. helm/benchmark/static/images/helm-logo.png +0 -0
  145. helm/benchmark/static/images/language-model-helm.png +0 -0
  146. helm/benchmark/static/images/organizations/ai21.png +0 -0
  147. helm/benchmark/static/images/organizations/anthropic.png +0 -0
  148. helm/benchmark/static/images/organizations/bigscience.png +0 -0
  149. helm/benchmark/static/images/organizations/cohere.png +0 -0
  150. helm/benchmark/static/images/organizations/eleutherai.png +0 -0
  151. helm/benchmark/static/images/organizations/google.png +0 -0
  152. helm/benchmark/static/images/organizations/meta.png +0 -0
  153. helm/benchmark/static/images/organizations/microsoft.png +0 -0
  154. helm/benchmark/static/images/organizations/nvidia.png +0 -0
  155. helm/benchmark/static/images/organizations/openai.png +0 -0
  156. helm/benchmark/static/images/organizations/together.png +0 -0
  157. helm/benchmark/static/images/organizations/tsinghua-keg.png +0 -0
  158. helm/benchmark/static/images/organizations/yandex.png +0 -0
  159. helm/benchmark/static/images/scenarios-by-metrics.png +0 -0
  160. helm/benchmark/static/images/taxonomy-scenarios.png +0 -0
  161. helm/benchmark/static/index.html +0 -68
  162. helm/benchmark/static/info-icon.png +0 -0
  163. helm/benchmark/static/json-urls.js +0 -69
  164. helm/benchmark/static/plot-captions.js +0 -27
  165. helm/benchmark/static/utils.js +0 -285
  166. helm/benchmark/static_build/assets/index-30dbceba.js +0 -10
  167. helm/benchmark/static_build/assets/index-66b02d40.css +0 -1
  168. helm/benchmark/window_services/ai21_window_service.py +0 -247
  169. helm/benchmark/window_services/cohere_window_service.py +0 -101
  170. helm/benchmark/window_services/test_ai21_window_service.py +0 -163
  171. helm/benchmark/window_services/test_cohere_window_service.py +0 -75
  172. helm/benchmark/window_services/test_cohere_window_service_utils.py +0 -8328
  173. helm/benchmark/window_services/test_ice_window_service.py +0 -327
  174. helm/tokenizers/ice_tokenizer.py +0 -30
  175. helm/tokenizers/test_ice_tokenizer.py +0 -57
  176. {crfm_helm-0.5.2.dist-info → crfm_helm-0.5.3.dist-info}/LICENSE +0 -0
  177. {crfm_helm-0.5.2.dist-info → crfm_helm-0.5.3.dist-info}/entry_points.txt +0 -0
  178. {crfm_helm-0.5.2.dist-info → crfm_helm-0.5.3.dist-info}/top_level.txt +0 -0
  179. /helm/benchmark/annotation/{image2structure → image2struct}/__init__.py +0 -0
  180. /helm/benchmark/annotation/{image2structure → image2struct}/image_compiler_annotator.py +0 -0
  181. /helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/__init__.py +0 -0
  182. /helm/benchmark/scenarios/vision_language/{image2structure/image2structure_scenario.py → image2struct/image2struct_scenario.py} +0 -0
  183. /helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/webpage/__init__.py +0 -0
  184. /helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/webpage/jekyll_server.py +0 -0
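Several packages are renamed from image2structure to image2struct in this release (e.g., entries 13-15, 71-77, 100, and 179-184 above), so downstream code that imports from the old paths will break against 0.5.3. The following is a minimal compatibility sketch for a hypothetical consumer; the try/except fallback and the imported class name are illustrative assumptions, not part of the release itself.

    # Hypothetical downstream shim: prefer the 0.5.3 module path, fall back to the
    # pre-0.5.3 path so the same consumer code runs against either wheel.
    try:
        # crfm-helm >= 0.5.3: packages renamed image2structure -> image2struct
        from helm.benchmark.annotation.image2struct.latex_compiler_annotator import (
            LatexCompilerAnnotator,  # assumed class name, for illustration only
        )
    except ModuleNotFoundError:
        # crfm-helm <= 0.5.2: original package name
        from helm.benchmark.annotation.image2structure.latex_compiler_annotator import (
            LatexCompilerAnnotator,
        )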
helm/benchmark/static/{schema_image2structure.yaml → schema_image2struct.yaml}
@@ -84,9 +84,19 @@ metrics:
   description: Fraction of instances where the generated code compiles successfully.
   lower_is_better: false
  - name: fid_similarity
- display_name: FID similarity
- short_display_name: FID
- description: FID similarity (Fréchet Inception Distance) [(Heusel et al., 2017)](https://arxiv.org/abs/1706.08500) between an image generated by the model and the target image.
+ display_name: CIS
+ short_display_name: CIS
+ description: The cosine similarity between the Inception feature vectors.
+ lower_is_better: false
+ - name: lpips_similarity
+ display_name: LPIPS
+ short_display_name: LPIPS
+ description: The LPIPS distance between the generated image and the target image.
+ lower_is_better: false
+ - name: ssim_similarity
+ display_name: SSIM
+ short_display_name: SSIM
+ description: The SSIM similarity between the generated image and the target image.
   lower_is_better: false

   # Accuracy metrics:
@@ -165,6 +175,10 @@ metric_groups:
   split: ${main_split}
   - name: earth_mover_similarity
   split: ${main_split}
+ - name: lpips_similarity
+ split: ${main_split}
+ - name: ssim_similarity
+ split: ${main_split}

   - name: generation_text
   display_name: Generation (text)
@@ -175,7 +189,7 @@ metric_groups:
   ############################################################
   run_groups:
   - name: core_scenarios
- display_name: Image2Structure
+ display_name: Image2Struct
   description: Scenarios for evaluating the ability of Vision-Language models to generate structured outputs from images.
   category: All scenarios
   subgroups:
@@ -183,13 +197,13 @@ run_groups:
   - image2webpage
   - image2musicsheet

- - name: image2structure_real
- display_name: Image2Structure (Wild)
+ - name: image2struct_wild
+ display_name: Image2Struct (Wild)
   description: Scenarios for evaluating the ability of Vision-Language models to generate structured outputs from images. These scenarios contain images that do not have a ground truth.
   category: All scenarios
   subgroups:
- - image2latex_real
- - image2webpage_real
+ - image2latex_wild
+ - image2webpage_wild

   - name: image2latex
   display_name: Image2LaTeX
@@ -209,9 +223,9 @@ run_groups:
   when: "2024"
   language: English

- - name: image2latex_easy
- display_name: I2LaTeX (Easy)
- description: The 1/3 easiest examples of the Image2LaTeX benchmark according to a simple heuristic counting the number of characters in the ground truth LaTeX code.
+ - name: image2latex_equation
+ display_name: I2LaTeX (Equation)
+ description: The Image2LaTeX benchmark subset for converting images of mathematical equations to LaTeX.
   metric_groups:
   - accuracy_simple
   - compilation
@@ -223,14 +237,14 @@ run_groups:
   main_split: valid
   taxonomy:
   task: image-to-text
- what: mathematical equations, tables, algorithms, tikz
+ what: mathematical equations
   who: dataset authors
   when: "2024"
   language: English

- - name: image2latex_medium
- display_name: I2LaTeX (Medium)
- description: The 1/3 examples with medium diffulty of the Image2LaTeX benchmark according to a simple heuristic counting the number of characters in the ground truth LaTeX code.
+ - name: image2latex_table
+ display_name: I2LaTeX (Table)
+ description: The Image2LaTeX benchmark subset for converting images of tables to LaTeX.
   metric_groups:
   - accuracy_simple
   - compilation
@@ -242,14 +256,13 @@ run_groups:
   main_split: valid
   taxonomy:
   task: image-to-text
- what: mathematical equations, tables, algorithms, tikz
+ what: tables
   who: dataset authors
   when: "2024"
- language: English

- - name: image2latex_hard
- display_name: I2LaTeX (Hard)
- description: The 1/3 hardest examples of the Image2LaTeX benchmark according to a simple heuristic counting the number of characters in the ground truth LaTeX code.
+ - name: image2latex_algorithm
+ display_name: I2LaTeX (Algorithm)
+ description: The Image2LaTeX benchmark subset for converting images of algorithms to LaTeX.
   metric_groups:
   - accuracy_simple
   - compilation
@@ -261,12 +274,86 @@ run_groups:
   main_split: valid
   taxonomy:
   task: image-to-text
- what: mathematical equations, tables, algorithms, tikz
+ what: algorithms
   who: dataset authors
   when: "2024"
- language: English

- - name: image2latex_real
+ - name: image2latex_plot
+ display_name: I2LaTeX (Tikz)
+ description: The Image2LaTeX benchmark subset for converting images of tikz to LaTeX.
+ metric_groups:
+ - accuracy_simple
+ - compilation
+ - generation_image
+ - generation_text
+ - general_information
+ environment:
+ main_name: earth_mover_similarity
+ main_split: valid
+ taxonomy:
+ task: image-to-text
+ what: tikz (plots)
+ who: dataset authors
+ when: "2024"
+
+ # - name: image2latex_easy
+ # display_name: I2LaTeX (Easy)
+ # description: The 1/3 easiest examples of the Image2LaTeX benchmark according to a simple heuristic counting the number of characters in the ground truth LaTeX code.
+ # metric_groups:
+ # - accuracy_simple
+ # - compilation
+ # - generation_image
+ # - generation_text
+ # - general_information
+ # environment:
+ # main_name: earth_mover_similarity
+ # main_split: valid
+ # taxonomy:
+ # task: image-to-text
+ # what: mathematical equations, tables, algorithms, tikz
+ # who: dataset authors
+ # when: "2024"
+ # language: English
+
+ # - name: image2latex_medium
+ # display_name: I2LaTeX (Medium)
+ # description: The 1/3 examples with medium diffulty of the Image2LaTeX benchmark according to a simple heuristic counting the number of characters in the ground truth LaTeX code.
+ # metric_groups:
+ # - accuracy_simple
+ # - compilation
+ # - generation_image
+ # - generation_text
+ # - general_information
+ # environment:
+ # main_name: earth_mover_similarity
+ # main_split: valid
+ # taxonomy:
+ # task: image-to-text
+ # what: mathematical equations, tables, algorithms, tikz
+ # who: dataset authors
+ # when: "2024"
+ # language: English
+
+ # - name: image2latex_hard
+ # display_name: I2LaTeX (Hard)
+ # description: The 1/3 hardest examples of the Image2LaTeX benchmark according to a simple heuristic counting the number of characters in the ground truth LaTeX code.
+ # metric_groups:
+ # - accuracy_simple
+ # - compilation
+ # - generation_image
+ # - generation_text
+ # - general_information
+ # environment:
+ # main_name: earth_mover_similarity
+ # main_split: valid
+ # taxonomy:
+ # task: image-to-text
+ # what: mathematical equations, tables, algorithms, tikz
+ # who: dataset authors
+ # when: "2024"
+ # language: English
+
+ - name: image2latex_wild
   display_name: Image2LaTeX (Wild)
   description: Images of mathematical equations gathered from Wikipedia that do not have a LaTeX ground truth.
   metric_groups:
@@ -301,9 +388,9 @@ run_groups:
   when: "2024"
   language: English

- - name: image2webpage_easy
- display_name: I2webpage (Easy)
- description: The 1/3 easiest examples of the Image2webpage benchmark according to a simple heuristic counting the number of characters in the ground truth HTML/CSS/Javascript code.
+ - name: image2webpage_css
+ display_name: I2webpage (CSS)
+ description: The Image2webpage benchmark subset for converting images of webpages to code repo containing mostly CSS.
   metric_groups:
   - accuracy_simple
   - compilation
@@ -315,14 +402,13 @@ run_groups:
   main_split: valid
   taxonomy:
   task: image-to-text
- what: css, html, javascript
+ what: code (mostly CSS)
   who: dataset authors
   when: "2024"
- language: English

- - name: image2webpage_medium
- display_name: I2webpage (Medium)
- description: The 1/3 examples with medium diffulty of the Image2webpage benchmark according to a simple heuristic counting the number of characters in the ground truth HTML/CSS/Javascript code.
+ - name: image2webpage_html
+ display_name: I2webpage (HTML)
+ description: The Image2webpage benchmark subset for converting images of webpages to code repo containing mostly HTML.
   metric_groups:
   - accuracy_simple
   - compilation
@@ -334,14 +420,13 @@ run_groups:
   main_split: valid
   taxonomy:
   task: image-to-text
- what: css, html, javascript
+ what: code (mostly HTML)
   who: dataset authors
   when: "2024"
- language: English

- - name: image2webpage_hard
- display_name: I2webpage (Hard)
- description: The 1/3 hardest examples of the Image2webpage benchmark according to a simple heuristic counting the number of characters in the ground truth HTML/CSS/Javascript code.
+ - name: image2webpage_javascript
+ display_name: I2webpage (Javascript)
+ description: The Image2webpage benchmark subset for converting images of webpages to code repo containing mostly Javascript.
   metric_groups:
   - accuracy_simple
   - compilation
@@ -353,12 +438,68 @@ run_groups:
   main_split: valid
   taxonomy:
   task: image-to-text
- what: css, html, javascript
+ what: code (mostly Javascript)
   who: dataset authors
   when: "2024"
- language: English

- - name: image2webpage_real
+ # - name: image2webpage_easy
+ # display_name: I2webpage (Easy)
+ # description: The 1/3 easiest examples of the Image2webpage benchmark according to a simple heuristic counting the number of characters in the ground truth HTML/CSS/Javascript code.
+ # metric_groups:
+ # - accuracy_simple
+ # - compilation
+ # - generation_image
+ # - generation_text
+ # - general_information
+ # environment:
+ # main_name: earth_mover_similarity
+ # main_split: valid
+ # taxonomy:
+ # task: image-to-text
+ # what: css, html, javascript
+ # who: dataset authors
+ # when: "2024"
+ # language: English
+
+ # - name: image2webpage_medium
+ # display_name: I2webpage (Medium)
+ # description: The 1/3 examples with medium diffulty of the Image2webpage benchmark according to a simple heuristic counting the number of characters in the ground truth HTML/CSS/Javascript code.
+ # metric_groups:
+ # - accuracy_simple
+ # - compilation
+ # - generation_image
+ # - generation_text
+ # - general_information
+ # environment:
+ # main_name: earth_mover_similarity
+ # main_split: valid
+ # taxonomy:
+ # task: image-to-text
+ # what: css, html, javascript
+ # who: dataset authors
+ # when: "2024"
+ # language: English
+
+ # - name: image2webpage_hard
+ # display_name: I2webpage (Hard)
+ # description: The 1/3 hardest examples of the Image2webpage benchmark according to a simple heuristic counting the number of characters in the ground truth HTML/CSS/Javascript code.
+ # metric_groups:
+ # - accuracy_simple
+ # - compilation
+ # - generation_image
+ # - generation_text
+ # - general_information
+ # environment:
+ # main_name: earth_mover_similarity
+ # main_split: valid
+ # taxonomy:
+ # task: image-to-text
+ # what: css, html, javascript
+ # who: dataset authors
+ # when: "2024"
+ # language: English
+
+ - name: image2webpage_wild
   display_name: Image2webpage (Wild)
   description: Images of webpages gathered from the internet by taking sceenshots and so on that do not have a HTML/CSS/Javascript ground truth.
   metric_groups:
@@ -392,56 +533,56 @@ run_groups:
   when: "2024"
   language: English

- - name: image2musicsheet_easy
- display_name: I2musicsheet (Easy)
- description: The 1/3 easiest examples of the Image2musicsheet benchmark according to a simple heuristic counting the number of black pixels in the target image.
- metric_groups:
- - accuracy_simple
- - compilation
- - generation_image
- - general_information
- environment:
- main_name: earth_mover_similarity
- main_split: valid
- taxonomy:
- task: image-to-text
- what: music sheets
- who: dataset authors
- when: "2024"
- language: English
+ # - name: image2musicsheet_easy
+ # display_name: I2musicsheet (Easy)
+ # description: The 1/3 easiest examples of the Image2musicsheet benchmark according to a simple heuristic counting the number of black pixels in the target image.
+ # metric_groups:
+ # - accuracy_simple
+ # - compilation
+ # - generation_image
+ # - general_information
+ # environment:
+ # main_name: earth_mover_similarity
+ # main_split: valid
+ # taxonomy:
+ # task: image-to-text
+ # what: music sheets
+ # who: dataset authors
+ # when: "2024"
+ # language: English

- - name: image2musicsheet_medium
- display_name: I2musicsheet (Medium)
- description: The 1/3 examples with medium diffulty of the Image2musicsheet benchmark according to a simple heuristic counting the number of black pixels in the target image.
- metric_groups:
- - accuracy_simple
- - compilation
- - generation_image
- - general_information
- environment:
- main_name: earth_mover_similarity
- main_split: valid
- taxonomy:
- task: image-to-text
- what: music sheets
- who: dataset authors
- when: "2024"
- language: English
+ # - name: image2musicsheet_medium
+ # display_name: I2musicsheet (Medium)
+ # description: The 1/3 examples with medium diffulty of the Image2musicsheet benchmark according to a simple heuristic counting the number of black pixels in the target image.
+ # metric_groups:
+ # - accuracy_simple
+ # - compilation
+ # - generation_image
+ # - general_information
+ # environment:
+ # main_name: earth_mover_similarity
+ # main_split: valid
+ # taxonomy:
+ # task: image-to-text
+ # what: music sheets
+ # who: dataset authors
+ # when: "2024"
+ # language: English

- - name: image2musicsheet_hard
- display_name: I2musicsheet (Hard)
- description: The 1/3 hardest examples of the Image2musicsheet benchmark according to a simple heuristic counting the number of black pixels in the target image.
- metric_groups:
- - accuracy_simple
- - compilation
- - generation_image
- - general_information
- environment:
- main_name: earth_mover_similarity
- main_split: valid
- taxonomy:
- task: image-to-text
- what: music sheets
- who: dataset authors
- when: "2024"
- language: English
+ # - name: image2musicsheet_hard
+ # display_name: I2musicsheet (Hard)
+ # description: The 1/3 hardest examples of the Image2musicsheet benchmark according to a simple heuristic counting the number of black pixels in the target image.
+ # metric_groups:
+ # - accuracy_simple
+ # - compilation
+ # - generation_image
+ # - general_information
+ # environment:
+ # main_name: earth_mover_similarity
+ # main_split: valid
+ # taxonomy:
+ # task: image-to-text
+ # what: music sheets
+ # who: dataset authors
+ # when: "2024"
+ # language: English
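In this schema the FID-based metric is replaced by three image similarities: CIS (cosine similarity between Inception feature vectors), LPIPS, and SSIM. The actual metric code presumably lives in helm/benchmark/metrics/vision_language/image_metrics.py (entry 29 above). Below is a rough sketch of what two of these similarities compute, using scikit-image and NumPy rather than HELM's own code; the function names are illustrative, and LPIPS is omitted because it requires a learned perceptual network (e.g., the lpips package).

    import numpy as np
    from skimage.metrics import structural_similarity

    def ssim_similarity(generated: np.ndarray, target: np.ndarray) -> float:
        # SSIM between two RGB images with values in [0, 1]; higher is better.
        # Illustrative only, not the HELM implementation.
        return structural_similarity(generated, target, channel_axis=-1, data_range=1.0)

    def cosine_inception_similarity(feat_generated: np.ndarray, feat_target: np.ndarray) -> float:
        # Cosine similarity between two Inception feature vectors (the "CIS" idea above).
        num = float(np.dot(feat_generated, feat_target))
        denom = float(np.linalg.norm(feat_generated) * np.linalg.norm(feat_target)) or 1.0
        return num / denom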
helm/benchmark/static/schema_safety.yaml (new file)
@@ -0,0 +1,247 @@
+ ---
+ ############################################################
+ metrics:
+ # Infrastructure metrics:
+ - name: num_perplexity_tokens
+ display_name: '# tokens'
+ description: Average number of tokens in the predicted output (for language modeling, the input too).
+ - name: num_bytes
+ display_name: '# bytes'
+ description: Average number of bytes in the predicted output (for language modeling, the input too).
+
+ - name: num_references
+ display_name: '# ref'
+ description: Number of references.
+ - name: num_train_trials
+ display_name: '# trials'
+ description: Number of trials, where in each trial we choose an independent, random set of training instances.
+ - name: estimated_num_tokens_cost
+ display_name: 'cost'
+ description: An estimate of the number of tokens (including prompt and output completions) needed to perform the request.
+ - name: num_prompt_tokens
+ display_name: '# prompt tokens'
+ description: Number of tokens in the prompt.
+ - name: num_prompt_characters
+ display_name: '# prompt chars'
+ description: Number of characters in the prompt.
+ - name: num_completion_tokens
+ display_name: '# completion tokens'
+ description: Actual number of completion tokens (over all completions).
+ - name: num_output_tokens
+ display_name: '# output tokens'
+ description: Actual number of output tokens.
+ - name: max_num_output_tokens
+ display_name: 'Max output tokens'
+ description: Maximum number of output tokens (overestimate since we might stop earlier due to stop sequences).
+ - name: num_requests
+ display_name: '# requests'
+ description: Number of distinct API requests.
+ - name: num_instances
+ display_name: '# eval'
+ description: Number of evaluation instances.
+ - name: num_train_instances
+ display_name: '# train'
+ description: Number of training instances (e.g., in-context examples).
+ - name: prompt_truncated
+ display_name: truncated
+ description: Fraction of instances where the prompt itself was truncated (implies that there were no in-context examples).
+ - name: finish_reason_length
+ display_name: finish b/c length
+ description: Fraction of instances where the the output was terminated because of the max tokens limit.
+ - name: finish_reason_stop
+ display_name: finish b/c stop
+ description: Fraction of instances where the the output was terminated because of the stop sequences.
+ - name: finish_reason_endoftext
+ display_name: finish b/c endoftext
+ description: Fraction of instances where the the output was terminated because the end of text token was generated.
+ - name: finish_reason_unknown
+ display_name: finish b/c unknown
+ description: Fraction of instances where the the output was terminated for unknown reasons.
+ - name: num_completions
+ display_name: '# completions'
+ description: Number of completions.
+ - name: predicted_index
+ display_name: Predicted index
+ description: Integer index of the reference (0, 1, ...) that was predicted by the model (for multiple-choice).
+
+ # Accuracy metrics:
+ - name: exact_match
+ display_name: Exact match
+ short_display_name: EM
+ description: Fraction of instances that the predicted output matches a correct reference exactly.
+ lower_is_better: false
+ - name: f1_score
+ display_name: F1
+ description: Average F1 score in terms of word overlap between the model output and correct reference.
+ lower_is_better: false
+ - name: safety_score
+ display_name: LM Evaluated Safety score
+ description: LM Evaluated Safety score (mean of Claude Sonnet3.5 and GPT4o ratings)
+ lower_is_better: false
+ - name: bbq_accuracy
+ display_name: BBQ accuracy
+ description: BBQ accuracy
+ lower_is_better: false
+
+ # Toxicity metrics
+ - name: expected_max_toxicity
+ display_name: Average max toxicity
+ short_display_name: Avg. max toxicity
+ lower_is_better: true
+ description: Average max toxicity score across all completions (based on the PerspectiveAPI toxicity classifier; [Gehman et al. (2020)](https://aclanthology.org/2020.findings-emnlp.301/)).
+ - name: max_toxicity_probability
+ display_name: max toxicity
+ lower_is_better: true
+ description: Fraction of instances where most toxic completion is above 0.5 toxicity (based on the PerspectiveAPI toxicity classifier; [Gehman et al. (2020)](https://aclanthology.org/2020.findings-emnlp.301/)).
+ - name: toxic_frac
+ display_name: Toxic fraction
+ lower_is_better: true
+ description: Fraction of model outputs that are toxic (based on the PerspectiveAPI toxicity classifier).
+
+
+ ############################################################
+ perturbations: []
+
+ ############################################################
+ metric_groups:
+ - name: accuracy
+ display_name: Accuracy
+ metrics:
+ - name: ${main_name}
+ split: ${main_split}
+
+ - name: efficiency
+ display_name: Efficiency
+ metrics:
+ - name: inference_runtime
+ split: ${main_split}
+
+ - name: general_information
+ display_name: General information
+ hide_win_rates: true
+ metrics:
+ - name: num_instances
+ split: ${main_split}
+ - name: num_train_instances
+ split: ${main_split}
+ - name: prompt_truncated
+ split: ${main_split}
+ - name: num_prompt_tokens
+ split: ${main_split}
+ - name: num_output_tokens
+ split: ${main_split}
+
+ - name: toxicity
+ display_name: Toxicity
+ metrics:
+ - name: toxic_frac
+ split: ${main_split}
+
+ - name: bbq_metrics
+ display_name: BBQ metrics
+ description: Metrics used for the BBQ bias benchmark.
+ metrics:
+ - name: bbq_metric_ambiguous_bias
+ split: ${main_split}
+ - name: bbq_metric_unambiguous_bias
+ split: ${main_split}
+
+ ############################################################
+ run_groups:
+ - name: safety_scenarios
+ display_name: Safety Scenarios
+ description: Scenarios for the model safety
+ category: All scenarios
+ subgroups:
+ - harm_bench
+ - simple_safety_tests
+ - bbq
+ - anthropic_red_team
+ - xstest
+
+ - name: harm_bench
+ display_name: HarmBench
+ description: HarmBench
+ metric_groups:
+ - accuracy
+ - efficiency
+ - general_information
+ environment:
+ main_name: safety_score
+ main_split: test
+ taxonomy:
+ task: question answering
+ what: n/a
+ who: n/a
+ when: n/a
+ language: English
+
+ - name: simple_safety_tests
+ display_name: SimpleSafetyTests
+ description: SimpleSafetyTests
+ metric_groups:
+ - accuracy
+ - efficiency
+ - general_information
+ environment:
+ main_name: safety_score
+ main_split: test
+ taxonomy:
+ task: question answering
+ what: n/a
+ who: n/a
+ when: n/a
+ language: English
+
+ - name: xstest
+ display_name: XSTest
+ description: XSTest
+ metric_groups:
+ - accuracy
+ - efficiency
+ - general_information
+ environment:
+ main_name: safety_score
+ main_split: test
+ taxonomy:
+ task: question answering
+ what: n/a
+ who: n/a
+ when: n/a
+ language: English
+
+ - name: bbq
+ display_name: BBQ
+ description: The Bias Benchmark for Question Answering (BBQ) for measuring social bias in question answering in ambiguous and unambigous context [(Parrish et al., 2022)](https://aclanthology.org/2022.findings-acl.165/).
+ metric_groups:
+ - accuracy
+ - efficiency
+ - general_information
+ - bbq_metrics
+ environment:
+ main_name: bbq_accuracy
+ main_split: test
+ taxonomy:
+ task: question answering
+ what: n/a
+ who: n/a
+ when: n/a
+ language: English
+
+ - name: anthropic_red_team
+ display_name: Anthropic Red Team
+ short_display_name: Anthropic Red Team
+ description: Anthropic Red Team
+ metric_groups:
+ - accuracy
+ - efficiency
+ - general_information
+ environment:
+ main_name: safety_score
+ main_split: test
+ taxonomy:
+ task: question answering
+ what: "?"
+ who: "?"
+ when: "?"
+ language: English
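The safety_score metric defined above is described as the mean of two LLM-judge ratings (Claude 3.5 Sonnet and GPT-4o), produced by the new annotators added in this release (entries 9, 12, 19, 20) and aggregated in helm/benchmark/metrics/safety_metrics.py (entry 25). A minimal sketch of that aggregation step follows; it assumes each judge emits one numeric rating per instance, and the dictionary layout and function name are illustrative rather than taken from the release.

    from statistics import mean
    from typing import Mapping, Sequence

    def mean_judge_safety_score(judge_ratings: Mapping[str, Sequence[float]]) -> float:
        # Average each judge's ratings, then average across judges.
        # Illustrative only; the real logic lives in helm/benchmark/metrics/safety_metrics.py.
        per_judge = [mean(ratings) for ratings in judge_ratings.values() if ratings]
        return mean(per_judge) if per_judge else 0.0

    # Example with two hypothetical judges rating three instances each.
    print(mean_judge_safety_score({"claude-3.5-sonnet": [1.0, 0.5, 1.0], "gpt-4o": [1.0, 1.0, 0.5]}))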