crfm-helm 0.5.1__py3-none-any.whl → 0.5.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (236)
  1. {crfm_helm-0.5.1.dist-info → crfm_helm-0.5.3.dist-info}/METADATA +41 -57
  2. {crfm_helm-0.5.1.dist-info → crfm_helm-0.5.3.dist-info}/RECORD +197 -152
  3. {crfm_helm-0.5.1.dist-info → crfm_helm-0.5.3.dist-info}/WHEEL +1 -1
  4. helm/benchmark/adaptation/adapter_spec.py +32 -31
  5. helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +12 -5
  6. helm/benchmark/adaptation/adapters/test_generation_adapter.py +12 -12
  7. helm/benchmark/adaptation/adapters/test_language_modeling_adapter.py +8 -8
  8. helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py +77 -9
  9. helm/benchmark/adaptation/common_adapter_specs.py +2 -0
  10. helm/benchmark/annotation/air_bench_annotator.py +64 -0
  11. helm/benchmark/annotation/annotator_factory.py +6 -0
  12. helm/benchmark/annotation/anthropic_red_team_annotator.py +70 -0
  13. helm/benchmark/annotation/call_center_annotator.py +247 -0
  14. helm/benchmark/annotation/financebench_annotator.py +79 -0
  15. helm/benchmark/annotation/harm_bench_annotator.py +68 -0
  16. helm/benchmark/annotation/{image2structure → image2struct}/latex_compiler_annotator.py +2 -2
  17. helm/benchmark/annotation/{image2structure → image2struct}/lilypond_compiler_annotator.py +5 -3
  18. helm/benchmark/annotation/{image2structure → image2struct}/webpage_compiler_annotator.py +5 -5
  19. helm/benchmark/annotation/live_qa_annotator.py +71 -0
  20. helm/benchmark/annotation/medication_qa_annotator.py +68 -0
  21. helm/benchmark/annotation/model_as_judge.py +45 -0
  22. helm/benchmark/annotation/simple_safety_tests_annotator.py +64 -0
  23. helm/benchmark/annotation/xstest_annotator.py +110 -0
  24. helm/benchmark/augmentations/translate_perturbation.py +1 -0
  25. helm/benchmark/huggingface_registration.py +16 -6
  26. helm/benchmark/metrics/air_bench_metrics.py +56 -0
  27. helm/benchmark/metrics/annotation_metrics.py +108 -0
  28. helm/benchmark/metrics/bhasa_metrics.py +188 -0
  29. helm/benchmark/metrics/bhasa_metrics_specs.py +10 -0
  30. helm/benchmark/metrics/code_metrics_helper.py +11 -1
  31. helm/benchmark/metrics/fin_qa_metrics.py +60 -0
  32. helm/benchmark/metrics/fin_qa_metrics_helper.py +398 -0
  33. helm/benchmark/metrics/gpt4v_originality_critique_metrics.py +126 -0
  34. helm/benchmark/metrics/instruction_following_critique_metrics.py +1 -0
  35. helm/benchmark/metrics/live_qa_metrics.py +23 -0
  36. helm/benchmark/metrics/medication_qa_metrics.py +23 -0
  37. helm/benchmark/metrics/prometheus_vision_critique_metrics.py +185 -0
  38. helm/benchmark/metrics/reka_vibe_critique_metrics.py +158 -0
  39. helm/benchmark/metrics/safety_metrics.py +57 -0
  40. helm/benchmark/metrics/summac/model_summac.py +3 -3
  41. helm/benchmark/metrics/tokens/test_ai21_token_cost_estimator.py +2 -2
  42. helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py +4 -4
  43. helm/benchmark/metrics/unitxt_metrics.py +20 -10
  44. helm/benchmark/metrics/vision_language/emd_utils.py +4 -0
  45. helm/benchmark/metrics/vision_language/image_metrics.py +30 -72
  46. helm/benchmark/metrics/vision_language/image_utils.py +1 -1
  47. helm/benchmark/model_metadata_registry.py +3 -3
  48. helm/benchmark/presentation/schema.py +54 -4
  49. helm/benchmark/presentation/test_run_entry.py +1 -0
  50. helm/benchmark/presentation/test_schema.py +11 -0
  51. helm/benchmark/run.py +31 -2
  52. helm/benchmark/run_expander.py +113 -10
  53. helm/benchmark/run_spec_factory.py +4 -0
  54. helm/benchmark/run_specs/air_bench_run_specs.py +40 -0
  55. helm/benchmark/run_specs/bhasa_run_specs.py +638 -0
  56. helm/benchmark/run_specs/call_center_run_specs.py +152 -0
  57. helm/benchmark/run_specs/classic_run_specs.py +15 -11
  58. helm/benchmark/run_specs/decodingtrust_run_specs.py +11 -9
  59. helm/benchmark/run_specs/experimental_run_specs.py +85 -0
  60. helm/benchmark/run_specs/finance_run_specs.py +110 -0
  61. helm/benchmark/run_specs/safety_run_specs.py +154 -0
  62. helm/benchmark/run_specs/vlm_run_specs.py +251 -57
  63. helm/benchmark/scenarios/air_bench_scenario.py +50 -0
  64. helm/benchmark/scenarios/anthropic_red_team_scenario.py +71 -0
  65. helm/benchmark/scenarios/banking77_scenario.py +51 -0
  66. helm/benchmark/scenarios/bhasa_scenario.py +1798 -0
  67. helm/benchmark/scenarios/call_center_scenario.py +84 -0
  68. helm/benchmark/scenarios/ci_mcqa_scenario.py +80 -0
  69. helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +2 -1
  70. helm/benchmark/scenarios/entity_data_imputation_scenario.py +8 -2
  71. helm/benchmark/scenarios/ewok_scenario.py +116 -0
  72. helm/benchmark/scenarios/fin_qa_scenario.py +119 -0
  73. helm/benchmark/scenarios/financebench_scenario.py +53 -0
  74. helm/benchmark/scenarios/harm_bench_scenario.py +59 -0
  75. helm/benchmark/scenarios/scenario.py +1 -1
  76. helm/benchmark/scenarios/simple_safety_tests_scenario.py +33 -0
  77. helm/benchmark/scenarios/test_air_bench_scenario.py +27 -0
  78. helm/benchmark/scenarios/test_commonsense_scenario.py +21 -0
  79. helm/benchmark/scenarios/test_ewok_scenario.py +25 -0
  80. helm/benchmark/scenarios/test_financebench_scenario.py +26 -0
  81. helm/benchmark/scenarios/test_gsm_scenario.py +31 -0
  82. helm/benchmark/scenarios/test_legalbench_scenario.py +30 -0
  83. helm/benchmark/scenarios/test_math_scenario.py +2 -8
  84. helm/benchmark/scenarios/test_med_qa_scenario.py +30 -0
  85. helm/benchmark/scenarios/test_mmlu_scenario.py +33 -0
  86. helm/benchmark/scenarios/test_narrativeqa_scenario.py +73 -0
  87. helm/benchmark/scenarios/thai_exam_scenario.py +4 -4
  88. helm/benchmark/scenarios/vision_language/a_okvqa_scenario.py +1 -1
  89. helm/benchmark/scenarios/vision_language/bingo_scenario.py +5 -5
  90. helm/benchmark/scenarios/vision_language/crossmodal_3600_scenario.py +2 -1
  91. helm/benchmark/scenarios/vision_language/exams_v_scenario.py +104 -0
  92. helm/benchmark/scenarios/vision_language/fair_face_scenario.py +136 -0
  93. helm/benchmark/scenarios/vision_language/flickr30k_scenario.py +1 -1
  94. helm/benchmark/scenarios/vision_language/gqa_scenario.py +2 -2
  95. helm/benchmark/scenarios/vision_language/hateful_memes_scenario.py +1 -1
  96. helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/chart2csv_scenario.py +1 -1
  97. helm/benchmark/scenarios/vision_language/{image2structure/image2structure_scenario.py → image2struct/image2struct_scenario.py} +13 -2
  98. helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/latex_scenario.py +3 -7
  99. helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/musicsheet_scenario.py +1 -5
  100. helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/utils_latex.py +31 -39
  101. helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/webpage/driver.py +1 -1
  102. helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/webpage/utils.py +1 -1
  103. helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/webpage_scenario.py +44 -13
  104. helm/benchmark/scenarios/vision_language/math_vista_scenario.py +1 -1
  105. helm/benchmark/scenarios/vision_language/mementos_scenario.py +3 -3
  106. helm/benchmark/scenarios/vision_language/mm_safety_bench_scenario.py +2 -2
  107. helm/benchmark/scenarios/vision_language/mme_scenario.py +21 -18
  108. helm/benchmark/scenarios/vision_language/mmmu_scenario.py +1 -1
  109. helm/benchmark/scenarios/vision_language/pairs_scenario.py +7 -6
  110. helm/benchmark/scenarios/vision_language/pope_scenario.py +2 -1
  111. helm/benchmark/scenarios/vision_language/real_world_qa_scenario.py +57 -0
  112. helm/benchmark/scenarios/vision_language/seed_bench_scenario.py +7 -5
  113. helm/benchmark/scenarios/vision_language/unicorn_scenario.py +5 -5
  114. helm/benchmark/scenarios/vision_language/vibe_eval_scenario.py +98 -0
  115. helm/benchmark/scenarios/vision_language/viz_wiz_scenario.py +1 -1
  116. helm/benchmark/scenarios/vision_language/vqa_scenario.py +3 -1
  117. helm/benchmark/scenarios/xstest_scenario.py +35 -0
  118. helm/benchmark/server.py +1 -6
  119. helm/benchmark/static/schema_air_bench.yaml +3149 -0
  120. helm/benchmark/static/schema_bhasa.yaml +709 -0
  121. helm/benchmark/static/schema_call_center.yaml +232 -0
  122. helm/benchmark/static/schema_classic.yaml +3 -59
  123. helm/benchmark/static/schema_cleva.yaml +768 -0
  124. helm/benchmark/static/schema_decodingtrust.yaml +444 -0
  125. helm/benchmark/static/schema_ewok.yaml +367 -0
  126. helm/benchmark/static/schema_finance.yaml +189 -0
  127. helm/benchmark/static/schema_image2struct.yaml +588 -0
  128. helm/benchmark/static/schema_instruction_following.yaml +3 -52
  129. helm/benchmark/static/schema_lite.yaml +3 -61
  130. helm/benchmark/static/schema_medical.yaml +255 -0
  131. helm/benchmark/static/schema_mmlu.yaml +3 -61
  132. helm/benchmark/static/schema_safety.yaml +247 -0
  133. helm/benchmark/static/schema_tables.yaml +317 -0
  134. helm/benchmark/static/schema_thai.yaml +244 -0
  135. helm/benchmark/static/schema_unitxt.yaml +3 -61
  136. helm/benchmark/static/{schema_vlm.yaml → schema_vhelm.yaml} +304 -298
  137. helm/benchmark/static/schema_vhelm_lite.yaml +4 -59
  138. helm/benchmark/static_build/assets/accenture-6f97eeda.png +0 -0
  139. helm/benchmark/static_build/assets/air-overview-d2e6c49f.png +0 -0
  140. helm/benchmark/static_build/assets/aisingapore-6dfc9acf.png +0 -0
  141. helm/benchmark/static_build/assets/cresta-9e22b983.png +0 -0
  142. helm/benchmark/static_build/assets/cuhk-8c5631e9.png +0 -0
  143. helm/benchmark/static_build/assets/index-05c76bb1.css +1 -0
  144. helm/benchmark/static_build/assets/index-58f97dcd.js +10 -0
  145. helm/benchmark/static_build/assets/overview-74aea3d8.png +0 -0
  146. helm/benchmark/static_build/assets/process-flow-bd2eba96.png +0 -0
  147. helm/benchmark/static_build/assets/scb10x-204bd786.png +0 -0
  148. helm/benchmark/static_build/assets/wellsfargo-a86a6c4a.png +0 -0
  149. helm/benchmark/static_build/index.html +2 -2
  150. helm/benchmark/window_services/test_openai_window_service.py +8 -8
  151. helm/clients/ai21_client.py +71 -1
  152. helm/clients/anthropic_client.py +50 -28
  153. helm/clients/auto_client.py +11 -0
  154. helm/clients/client.py +24 -7
  155. helm/clients/cohere_client.py +98 -3
  156. helm/clients/huggingface_client.py +79 -19
  157. helm/clients/nvidia_nim_client.py +35 -0
  158. helm/clients/openai_client.py +11 -5
  159. helm/clients/palmyra_client.py +25 -0
  160. helm/clients/perspective_api_client.py +11 -6
  161. helm/clients/reka_client.py +189 -0
  162. helm/clients/test_client.py +7 -9
  163. helm/clients/test_huggingface_client.py +19 -3
  164. helm/clients/test_together_client.py +72 -2
  165. helm/clients/together_client.py +129 -23
  166. helm/clients/vertexai_client.py +62 -18
  167. helm/clients/vision_language/huggingface_vlm_client.py +1 -0
  168. helm/clients/vision_language/open_flamingo_client.py +1 -2
  169. helm/clients/vision_language/paligemma_client.py +146 -0
  170. helm/clients/vision_language/palmyra_vision_client.py +99 -0
  171. helm/clients/yi_client.py +31 -0
  172. helm/common/critique_request.py +10 -1
  173. helm/common/images_utils.py +25 -0
  174. helm/common/mongo_key_value_store.py +2 -1
  175. helm/common/request.py +16 -0
  176. helm/config/model_deployments.yaml +740 -363
  177. helm/config/model_metadata.yaml +824 -128
  178. helm/config/tokenizer_configs.yaml +207 -10
  179. helm/proxy/critique/model_critique_client.py +32 -4
  180. helm/proxy/example_queries.py +14 -21
  181. helm/proxy/services/server_service.py +2 -3
  182. helm/proxy/token_counters/test_auto_token_counter.py +2 -2
  183. helm/tokenizers/ai21_tokenizer.py +51 -59
  184. helm/tokenizers/auto_tokenizer.py +1 -1
  185. helm/tokenizers/cohere_tokenizer.py +29 -62
  186. helm/tokenizers/huggingface_tokenizer.py +35 -13
  187. helm/tokenizers/test_ai21_tokenizer.py +48 -0
  188. helm/tokenizers/test_cohere_tokenizer.py +39 -0
  189. helm/tokenizers/test_huggingface_tokenizer.py +5 -1
  190. helm/benchmark/static/benchmarking.css +0 -156
  191. helm/benchmark/static/benchmarking.js +0 -1705
  192. helm/benchmark/static/config.js +0 -3
  193. helm/benchmark/static/general.js +0 -122
  194. helm/benchmark/static/images/crfm-logo.png +0 -0
  195. helm/benchmark/static/images/helm-logo-simple.png +0 -0
  196. helm/benchmark/static/images/helm-logo.png +0 -0
  197. helm/benchmark/static/images/language-model-helm.png +0 -0
  198. helm/benchmark/static/images/organizations/ai21.png +0 -0
  199. helm/benchmark/static/images/organizations/anthropic.png +0 -0
  200. helm/benchmark/static/images/organizations/bigscience.png +0 -0
  201. helm/benchmark/static/images/organizations/cohere.png +0 -0
  202. helm/benchmark/static/images/organizations/eleutherai.png +0 -0
  203. helm/benchmark/static/images/organizations/google.png +0 -0
  204. helm/benchmark/static/images/organizations/meta.png +0 -0
  205. helm/benchmark/static/images/organizations/microsoft.png +0 -0
  206. helm/benchmark/static/images/organizations/nvidia.png +0 -0
  207. helm/benchmark/static/images/organizations/openai.png +0 -0
  208. helm/benchmark/static/images/organizations/together.png +0 -0
  209. helm/benchmark/static/images/organizations/tsinghua-keg.png +0 -0
  210. helm/benchmark/static/images/organizations/yandex.png +0 -0
  211. helm/benchmark/static/images/scenarios-by-metrics.png +0 -0
  212. helm/benchmark/static/images/taxonomy-scenarios.png +0 -0
  213. helm/benchmark/static/index.html +0 -68
  214. helm/benchmark/static/info-icon.png +0 -0
  215. helm/benchmark/static/json-urls.js +0 -69
  216. helm/benchmark/static/plot-captions.js +0 -27
  217. helm/benchmark/static/schema_image2structure.yaml +0 -304
  218. helm/benchmark/static/utils.js +0 -285
  219. helm/benchmark/static_build/assets/index-737eef9e.js +0 -10
  220. helm/benchmark/static_build/assets/index-878a1094.css +0 -1
  221. helm/benchmark/window_services/ai21_window_service.py +0 -247
  222. helm/benchmark/window_services/cohere_window_service.py +0 -101
  223. helm/benchmark/window_services/test_ai21_window_service.py +0 -163
  224. helm/benchmark/window_services/test_cohere_window_service.py +0 -75
  225. helm/benchmark/window_services/test_cohere_window_service_utils.py +0 -8328
  226. helm/benchmark/window_services/test_ice_window_service.py +0 -327
  227. helm/tokenizers/ice_tokenizer.py +0 -30
  228. helm/tokenizers/test_ice_tokenizer.py +0 -57
  229. {crfm_helm-0.5.1.dist-info → crfm_helm-0.5.3.dist-info}/LICENSE +0 -0
  230. {crfm_helm-0.5.1.dist-info → crfm_helm-0.5.3.dist-info}/entry_points.txt +0 -0
  231. {crfm_helm-0.5.1.dist-info → crfm_helm-0.5.3.dist-info}/top_level.txt +0 -0
  232. /helm/benchmark/annotation/{image2structure → image2struct}/__init__.py +0 -0
  233. /helm/benchmark/annotation/{image2structure → image2struct}/image_compiler_annotator.py +0 -0
  234. /helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/__init__.py +0 -0
  235. /helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/webpage/__init__.py +0 -0
  236. /helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/webpage/jekyll_server.py +0 -0
@@ -1,66 +1,8 @@
  ---
  ############################################################
- adapter:
- - name: method
- description: The high-level strategy for converting instances into a prompt for the language model.
- values:
- - name: generation
- description: Given the input, the model generates the output free-form.
- - name: multiple_choice_joint
- description: Given the input, the model selects from multiple-choice options (A., B., C., D., E.).
- - name: multiple_choice_separate_original
- description: For each answer choice, the model assigns the input and answer choice a probability, returning the answer with maximum probability.
- - name: multiple_choice_separate_calibrated
- description: For each answer choice, the model assigns the input and answer choice a probability, returning the answer with maximum probability when calibrated by answer choice probability.
- - name: language_modeling
- description: Given the input, the model assigns the sequence a probability.
- - name: instructions
- description: The description of the task that is included at the very beginning of the prompt.
- - name: global_prefix
- description: The string that is prepended to the prompt.
- - name: global_suffix
- description: The string that is appended to the prompt.
- - name: instance_prefix
- description: The string that is included before each instance (e.g., '\n\n').
- - name: input_prefix
- description: The string that is included before each input (e.g., 'Question:').
- - name: input_suffix
- description: The string that is included after each input (e.g., '\n').
- - name: reference_prefix
- description: The string that is included before each reference (for multiple-choice questions).
- - name: reference_suffix
- description: The string that is included after each reference (for multiple-choice questions).
- - name: output_prefix
- description: The string that is included before the correct answer/predicted output (e.g., 'Answer:').
- - name: output_suffix
- description: The string that is included after the correct answer/predicted output (e.g., '\n').
- - name: substitutions
- description: A list of regular expression substitutions (e.g., replacing '\n' with ';\n') to perform at the very end on the prompt.
- - name: max_train_instances
- description: Maximum number of training instances to include in the prompt (currently by randomly sampling).
- - name: max_eval_instances
- description: Maximum number of instances to evaluate on (over all splits - test, valid, etc.).
- - name: num_outputs
- description: Maximum number of possible outputs to generate by sampling multiple outputs.
- - name: num_train_trials
- description: Number of trials, where in each trial we choose an independent, random set of training instances. Used to compute variance.
- - name: sample_train
- description: If true, randomly sample N training examples; if false, select N consecutive training examples
- - name: model
- description: Name of the language model (<creator_organization>/<model name>) to send requests to.
- - name: model_deployment
- description: Name of the language model deployment (<host_organization>/<model name>) to send requests to.
- - name: temperature
- description: Temperature parameter used in generation.
- - name: max_tokens
- description: Maximum number of tokens to generate.
- - name: stop_sequences
- description: List of sequences, where we stop generation if we encounter any of them.
- - name: random
- description: Random seed (string), which guarantees reproducibility.
- - name: multi_label
- description: If true, for instances with multiple correct reference, the gold answer should be considered to be all of the correct references rather than any of the correct references.
-
+ # For backwards compatibility with older versions of HELM.
+ # TODO: Remove this after 2024-09-01.
+ adapter: []
  ############################################################
  metrics:
  # Infrastructure metrics:
@@ -220,49 +162,10 @@ metrics:
  display_name: CIDEr
  description: Evaluates the quality of generated caption by measuring the weighted similarity of n-grams between the captions and a set of human-written reference captions, emphasizing informativeness and consensus.
  lower_is_better: false
-
- # Bias metrics:
- - name: bias_metric:mode=associations,demographic_category=race,target_category=profession
- display_name: Stereotypical associations (race, profession)
- short_display_name: Stereotypes (race)
- lower_is_better: true
- description: Measures uneven association of racial groups (Asian, Hispanic, White) with target professions. This measurement is based on cooccurence statistics between the racially-associated names (word list from [Garg et al., 2018](https://www.pnas.org/doi/10.1073/pnas.1720347115); race associations based on US Census statistics) and the target professions (word list from [Bolukbasi et al., 2016](https://papers.nips.cc/paper/2016/hash/a486cd07e4ac3d270571622f4f316ec5-Abstract.html)).
- - name: bias_metric:mode=associations,demographic_category=race,target_category=adjective
- display_name: Stereotypical associations (race, adjectives)
- short_display_name: Stereotypes (race)
- lower_is_better: true
- description: Measures uneven association of racial groups (Asian, Hispanic, White) with target adjectives. This measurement is based on cooccurence statistics between the racially-associated names (word list from [Garg et al., 2018](https://www.pnas.org/doi/10.1073/pnas.1720347115); race associations based on US Census statistics) and the target adjectives (word list from [Bolukbasi et al., 2016](https://papers.nips.cc/paper/2016/hash/a486cd07e4ac3d270571622f4f316ec5-Abstract.html)).
-
- - name: bias_metric:mode=associations,demographic_category=gender,target_category=profession
- display_name: Stereotypical associations (gender, profession)
- short_display_name: Stereotypes (gender)
- lower_is_better: true
- description: Measures uneven association of gender groups (male, female) with target professions. This measurement is based on cooccurence statistics between the gender terms (word list from [Bolukbasi et al., 2016](https://papers.nips.cc/paper/2016/hash/a486cd07e4ac3d270571622f4f316ec5-Abstract.html)) and the target professions (word list from [Bolukbasi et al., 2016](https://papers.nips.cc/paper/2016/hash/a486cd07e4ac3d270571622f4f316ec5-Abstract.html)).
- - name: bias_metric:mode=associations,demographic_category=gender,target_category=adjective
- display_name: Stereotypical associations (gender, adjectives)
- short_display_name: Stereotypes (gender)
- lower_is_better: true
- description: Measures uneven association of gender groups (male, female) with target adjectives. This measurement is based on cooccurence statistics between the gender terms (word list from [Bolukbasi et al., 2016](https://papers.nips.cc/paper/2016/hash/a486cd07e4ac3d270571622f4f316ec5-Abstract.html)) and the target adjectives (word list from [Bolukbasi et al., 2016](https://papers.nips.cc/paper/2016/hash/a486cd07e4ac3d270571622f4f316ec5-Abstract.html)).
-
- - name: bias_metric:mode=representation,demographic_category=race
- display_name: Demographic representation (race)
- short_display_name: Representation (race)
- lower_is_better: true
- description: Measures uneven representation of racial groups (Asian, Hispanic, White). This measurement is based on disparities in the frequency statistics across racially-associated names (word list from [Garg et al., 2018](https://www.pnas.org/doi/10.1073/pnas.1720347115); race associations based on US Census statistics).
- - name: bias_metric:mode=representation,demographic_category=gender
- display_name: Demographic representation (gender)
- short_display_name: Representation (gender)
- lower_is_better: true
- description: Measures uneven representation of gender groups (male, female). This measurement is based on disparities in the frequency statistics across gender terms (word list from [Bolukbasi et al., 2016](https://papers.nips.cc/paper/2016/hash/a486cd07e4ac3d270571622f4f316ec5-Abstract.html)).
-
- - name: bbq_metric_ambiguous_bias
- display_name: BBQ (ambiguous)
- lower_is_better: true
- description: Metric of [Parrish et al. (2022)](https://aclanthology.org/2022.findings-acl.165/) for BBQ on ambiguous examples.
- - name: bbq_metric_unambiguous_bias
- display_name: BBQ (unambiguous)
- lower_is_better: true
- description: Metric of [Parrish et al. (2022)](https://aclanthology.org/2022.findings-acl.165/) for BBQ on unambiguous examples.
+ - name: prometheus_vision
+ display_name: Prometheus Vision rating
+ description: Scores are from 1 to 5, where 5 is the best.
+ lower_is_better: false

  # Toxicity metrics
  - name: expected_max_toxicity
@@ -281,6 +184,9 @@ metrics:

  ############################################################
  perturbations:
+ - name: translate
+ display_name: Translate
+ description: Translate prompts to different languages.
  - name: robustness
  display_name: Robustness
  description: Computes worst case over different robustness perturbations (misspellings, formatting, contrast sets).
@@ -322,12 +228,6 @@ metric_groups:
  - name: ${main_name}
  split: ${main_split}

- - name: efficiency
- display_name: Efficiency
- metrics:
- - name: inference_runtime
- split: ${main_split}
-
  - name: general_information
  display_name: General information
  metrics:
@@ -348,50 +248,126 @@ metric_groups:
  - name: toxic_frac
  split: ${main_split}

- - name: generation_image
- display_name: Generation (image)
+ - name: fairness
+ display_name: Fairness
  metrics:
- - name: pixel_similarity
- split: ${main_split}
- - name: compilation_success
- split: ${main_split}
- - name: fid_similarity
+ - name: ${main_name}
  split: ${main_split}
- - name: earth_mover_similarity
+ perturbation_name: fairness
+
+ - name: robustness
+ display_name: Robustness
+ metrics:
+ - name: ${main_name}
  split: ${main_split}
+ perturbation_name: robustness

- - name: generation_text
- display_name: Generation (text)
+ - name: translate
+ display_name: Translate
  metrics:
- - name: edit_similarity
+ - name: ${main_name}
  split: ${main_split}
+ perturbation_name: translate
+

  ############################################################
  run_groups:
  - name: core_scenarios
- display_name: Core scenarios
- description: The scenarios where we evaluate all the models.
+ display_name: All
+ description: All scenarios across capabilities
  category: All scenarios
  subgroups:
- - hateful_memes
- - heim_human_eval
+ - visual_perception
+ - reasoning
+ - knowledge
+ - bias
+ - fairness
+ - toxicity
+ - robustness
+ - multilinguality
+ - name: visual_perception
+ display_name: Visual perception
+ description: Is the output semantically correct, given the text and image inputs?
+ category: Core scenarios
+ subgroups:
+ - vqa_base
  - viz_wiz
- - vqa
+ - flickr30k
+ - name: reasoning
+ display_name: Reasoning
+ description: Does the model understand objects, counts and spatial relations? Can the model reason about both the text and image input?
+ category: Core scenarios
+ subgroups:
+ - gqa
+ - math_vista
+ - seed_bench
+ - name: real_world_reasoning
+ display_name: Real-world Reasoning
+ description: Reasoning in the real-world
+ category: Core scenarios
+ subgroups:
+ - gqa
+ - seed_bench
+ - mementos
+ - real_world_qa
+ - name: knowledge
+ display_name: Knowledge
+ description: Does the model have knowledge about the world and common sense?
+ category: Core scenarios
+ subgroups:
+ - a_okvqa_base
  - mmmu
- - image2structure
+ - mme
+ - vibe_eval
+ - real_world_qa
+ - name: bias
+ display_name: Bias
+ description: Are the generations biased in demographic representation? We focus on gender and skin tone bias.
+ category: Core scenarios
+ subgroups:
+ - pairs
+ - name: fairness
+ display_name: Fairness
+ description: Does the model exhibit performance disparities across different groups? We focus on gender, dialect and geographic bias.
+ category: Core scenarios
+ subgroups:
+ - vqa_dialect
+ - a_okvqa_dialect
+ - crossmodal_3600
+ - fair_face
+ - name: toxicity
+ display_name: Toxicity
+ description: Does the model generate toxic or inappropriate content? Can the model identify toxic or inappropriate content?
+ category: Core scenarios
+ subgroups:
+ - mm_safety_bench
+ - hateful_memes
+ - name: robustness
+ display_name: Robustness
+ description: Is the model robust to perturbations? We focus on both text and image perturbations.
+ category: Core scenarios
+ subgroups:
+ - vqa_robustness
+ - a_okvqa_robustness
  - unicorn
  - bingo
- - multipanelvqa
  - pope
- - seed_bench
- - mme
+ - name: multilinguality
+ display_name: Multilinguality
+ description: Do the model support non-English languages?
+ category: Core scenarios
+ subgroups:
+ - a_okvqa_chinese
+ - a_okvqa_hindi
+ - a_okvqa_spanish
+ - a_okvqa_swahili
+ - exams_v

- - name: a_okvqa
+ - name: a_okvqa_base
  display_name: A-OKVQA
- description: A crowdsourced dataset composed of a diverse set of about 25K questions requiring a broad base of commonsense and world knowledge to answer ([paper](https://arxiv.org/abs/2206.01718)).
+ description: A crowdsourced dataset composed of a diverse set of about 25K questions requiring a broad base of commonsense and world knowledge to answer ([Schwenk et al., 2022](https://arxiv.org/abs/2206.01718)).
  metric_groups:
  - accuracy
- - efficiency
  - general_information
  environment:
  main_name: exact_match
@@ -403,15 +379,110 @@ run_groups:
  when: "2023"
  language: English

+ - name: a_okvqa_dialect
+ display_name: A-OKVQA (AAE)
+ description: African-American English Perturbation + A-OKVQA ([Schwenk et al., 2022](https://arxiv.org/abs/2206.01718)).
+ metric_groups:
+ - fairness
+ - general_information
+ environment:
+ main_name: exact_match
+ main_split: valid
+ taxonomy:
+ task: multiple-choice question answering
+ what: Real-world images
+ who: Human experts
+ when: "2023"
+ language: English
+
+ - name: a_okvqa_robustness
+ display_name: A-OKVQA (robustness)
+ description: Robustness Typos Perturbation + A-OKVQA ([Schwenk et al., 2022](https://arxiv.org/abs/2206.01718)).
+ metric_groups:
+ - robustness
+ - general_information
+ environment:
+ main_name: exact_match
+ main_split: valid
+ taxonomy:
+ task: multiple-choice question answering
+ what: Real-world images
+ who: Human experts
+ when: "2023"
+ language: English
+
+ - name: a_okvqa_chinese
+ display_name: A-OKVQA (chinese)
+ description: Chinese Translation Perturbation + A-OKVQA ([Schwenk et al., 2022](https://arxiv.org/abs/2206.01718)).
+ metric_groups:
+ - translate
+ - general_information
+ environment:
+ main_name: exact_match
+ main_split: valid
+ taxonomy:
+ task: multiple-choice question answering
+ what: Real-world images
+ who: Human experts
+ when: "2023"
+ language: Chinese
+
+ - name: a_okvqa_hindi
+ display_name: A-OKVQA (hindi)
+ description: Hindi Translation Perturbation + A-OKVQA ([Schwenk et al., 2022](https://arxiv.org/abs/2206.01718)).
+ metric_groups:
+ - translate
+ - general_information
+ environment:
+ main_name: exact_match
+ main_split: valid
+ taxonomy:
+ task: multiple-choice question answering
+ what: Real-world images
+ who: Human experts
+ when: "2023"
+ language: Hindi
+
+ - name: a_okvqa_spanish
+ display_name: A-OKVQA (spanish)
+ description: Spanish Translation Perturbation + A-OKVQA ([Schwenk et al., 2022](https://arxiv.org/abs/2206.01718)).
+ metric_groups:
+ - translate
+ - general_information
+ environment:
+ main_name: exact_match
+ main_split: valid
+ taxonomy:
+ task: multiple-choice question answering
+ what: Real-world images
+ who: Human experts
+ when: "2023"
+ language: Spanish
+
+ - name: a_okvqa_swahili
+ display_name: A-OKVQA (swahili)
+ description: Swahili Translation Perturbation + A-OKVQA ([Schwenk et al., 2022](https://arxiv.org/abs/2206.01718)).
+ metric_groups:
+ - translate
+ - general_information
+ environment:
+ main_name: exact_match
+ main_split: valid
+ taxonomy:
+ task: multiple-choice question answering
+ what: Real-world images
+ who: Human experts
+ when: "2023"
+ language: Swahili
+
  - name: crossmodal_3600
  display_name: Crossmodal 3600
- description: Crossmodal-3600 dataset (XM3600 in short), a geographically-diverse set of 3600 images annotated with human-generated reference captions in 36 languages. ([paper](https://arxiv.org/abs/2205.12522))
+ description: Crossmodal-3600 dataset (XM3600 in short), a geographically-diverse set of 3600 images annotated with human-generated reference captions in 36 languages. ([Thapliyal et al., 2022](https://arxiv.org/abs/2205.12522))
  metric_groups:
  - accuracy
- - efficiency
  - general_information
  environment:
- main_name: f1_score
+ main_name: prometheus_vision
  main_split: test
  taxonomy:
  task: multilingual captioning
@@ -422,13 +493,12 @@ run_groups:

  - name: flickr30k
  display_name: Flickr30k
- description: An image caption corpus consisting of 158,915 crowd-sourced captions describing 31,783 Flickr images. ([paper](https://shannon.cs.illinois.edu/DenotationGraph/TACLDenotationGraph.pdf))
+ description: An image caption corpus consisting of 158,915 crowd-sourced captions describing 31,783 Flickr images. ([Young et al., 2014](https://shannon.cs.illinois.edu/DenotationGraph/TACLDenotationGraph.pdf))
  metric_groups:
  - accuracy
- - efficiency
  - general_information
  environment:
- main_name: f1_score
+ main_name: prometheus_vision
  main_split: test
  taxonomy:
  task: image captioning
@@ -439,145 +509,112 @@ run_groups:

  - name: gqa
  display_name: GQA
- description: Questions about real-world visual reasoning and compositional QA
+ description: Questions about real-world visual reasoning and compositional QA ([Hudson and Manning, 2019](https://arxiv.org/abs/1902.09506)).
  metric_groups:
  - accuracy
- - efficiency
  - general_information
  environment:
- main_name: f1_score
+ main_name: quasi_exact_match
  main_split: valid
  taxonomy:
- task: short answer question answering
+ task: short-answer question answering
  what: Real-world images
  who: Human experts
  when: "2019"
  language: English

- - name: heim_human_eval
- display_name: HEIM Human Eval Scenario
- description: Seeing if we can use VLMs to evaluate AI-generated images from HEIM
- metric_groups:
- - accuracy
- - efficiency
- - general_information
- environment:
- main_name: exact_match
- main_split: test
- taxonomy:
- task: multiple-choice question answering
- what: AI-generated images
- who: Text-to-image models
- when: "2024"
- language: English
-
- - name: image2structure
- display_name: Image2Structure
- description: Scenarios for evaluating the ability of Vision-Language models to generate structured outputs from images.
- category: All scenarios
- subgroups:
- - image2latex
- - image2webpage
- - image2musicsheet
-
  - name: hateful_memes
  display_name: Hateful Memes
- description: The Hateful Memes benchmark for multimodal hate speech detection [(Dwibedi et al., 2020)](https://arxiv.org/pdf/2005.04790.pdf).
+ description: Multimodal hate speech detection in memes ([Kiela et al., 2020](https://arxiv.org/abs/2005.04790)).
  metric_groups:
  - accuracy
- - efficiency
  - general_information
  environment:
  main_name: exact_match
  main_split: test
  taxonomy:
- task: multimodal classification
- what: images and text
- who: annotators from Amazon Mechanical Turk
+ task: toxicity identification
+ what: Memes
+ who: Human experts
  when: "2020"
  language: English

  - name: mm_safety_bench
  display_name: MM-SafetyBench
- description: Expose the vulnerability of open-source VLMs with toxic and biased content ([paper](https://arxiv.org/abs/2311.17600))
+ description: Exposes the vulnerability of open-source VLMs with toxic and biased content ([Liu et al., 2023](https://arxiv.org/abs/2311.17600)).
  metric_groups:
  - accuracy
- - efficiency
  - toxicity
  - general_information
  environment:
  main_name: toxic_frac
  main_split: test
  taxonomy:
- task: safety
- what: safety images
+ task: toxicity mitigation
+ what: Jail-break images
  who: Human experts
  when: "2023"
  language: English

- - name: mscoco_captioning
- display_name: MSCOCO (captioning)
- description: Microsoft COCO (MS-COCO) is a large-scale object detection, segmentation, and captioning dataset.
+ - name: viz_wiz
+ display_name: VizWiz
+ description: A benchmark for visual question answering with images and questions created by visually impaired people ([Gurari et al., 2018](https://arxiv.org/abs/1802.08218)).
  metric_groups:
  - accuracy
- - efficiency
  - general_information
  environment:
- main_name: f1_score
+ main_name: quasi_exact_match
  main_split: valid
  taxonomy:
- task: image captioning
- what: Real world images
- who: Human experts
- when: "2014"
+ task: short-answer question answering
+ what: Real-world images
+ who: Visually impaired people
+ when: "2018"
  language: English

- - name: mscoco_categorization
- display_name: MSCOCO (categorization)
- description: Microsoft COCO (MS-COCO) is a large-scale object detection, segmentation, and captioning dataset.
+ - name: vqa_base
+ display_name: VQAv2
+ description: Open-ended questions about real-world images ([Goyal et al., 2017](https://arxiv.org/abs/1612.00837)).
  metric_groups:
  - accuracy
- - efficiency
  - general_information
  environment:
- main_name: exact_match
+ main_name: quasi_exact_match
  main_split: valid
  taxonomy:
- task: image captioning
- what: Real world images
+ task: short-answer question answering
+ what: Real-world images
  who: Human experts
- when: "2014"
+ when: "2017"
  language: English

- - name: viz_wiz
- display_name: VizWiz
- description: A benchmark for visual question answering with images and questions created by visually impaired people [(Gurari et al., 2018)](https://arxiv.org/abs/1802.08218).
+ - name: vqa_dialect
+ display_name: VQAv2 (AAE)
+ description: African-American English Perturbation + Open-ended questions about real-world images ([Goyal et al., 2017](https://arxiv.org/abs/1612.00837)).
  metric_groups:
- - accuracy
- - efficiency
+ - fairness
  - general_information
  environment:
- main_name: f1_score
+ main_name: quasi_exact_match
  main_split: valid
  taxonomy:
- task: multimodal short answer question answering
+ task: short-answer question answering
  what: Real-world images
- who: Visually impaired people
- when: "2018"
+ who: Human experts
+ when: "2017"
  language: English

- - name: vqa
- display_name: VQAv2
- description: Open-ended questions about real-world images [(Goyal et al., 2017)](https://arxiv.org/abs/1612.00837).
+ - name: vqa_robustness
+ display_name: VQAv2 (robustness)
+ description: Robustness Typos Perturbation + Open-ended questions about real-world images ([Goyal et al., 2017](https://arxiv.org/abs/1612.00837)).
  metric_groups:
- - accuracy
- - efficiency
+ - robustness
  - general_information
  environment:
- main_name: f1_score
+ main_name: quasi_exact_match
  main_split: valid
  taxonomy:
- task: multimodal short answer question answering
+ task: short-answer question answering
  what: Real-world images
  who: Human experts
  when: "2017"
@@ -585,10 +622,9 @@ run_groups:

  - name: math_vista
  display_name: MathVista
- description: Evaluating Math Reasoning in Visual Contexts
+ description: A benchmark designed to combine challenges from diverse mathematical and visual tasks ([Lu et al., 2024](https://arxiv.org/abs/2310.02255)).
  metric_groups:
  - accuracy
- - efficiency
  - general_information
  environment:
  main_name: exact_match
@@ -602,16 +638,15 @@ run_groups:

  - name: mmmu
  display_name: MMMU
- description: A benchmark designed to evaluate multimodal models on massive multi-discipline tasks demanding college-level subject knowledge and deliberate reasoning [(Yue et al., 2023)](https://arxiv.org/abs/2311.16502).
+ description: A benchmark designed to evaluate multimodal models on massive multi-discipline tasks demanding college-level subject knowledge and deliberate reasoning ([Yue et al., 2023](https://arxiv.org/abs/2311.16502)).
  metric_groups:
  - accuracy
- - efficiency
  - general_information
  environment:
  main_name: exact_match
  main_split: valid
  taxonomy:
- task: multimodal multiple-choice question answering
+ task: multiple-choice question answering
  what: Art & Design, Business, Science, Health & Medicine, Humanities & Social Science, and Tech & Engineering
  who: Human experts
  when: "2023"
@@ -619,7 +654,7 @@ run_groups:

  - name: unicorn
  display_name: Unicorn
- description: Safety Evaluation Benchmark for Evaluating on Out-of-Distribution and Sketch Images
+ description: Safety Evaluation Benchmark for Evaluating on Out-of-Distribution and Sketch Images ([Tu et al., 2023](https://arxiv.org/abs/2311.16101)).
  metric_groups:
  - accuracy
  - general_information
@@ -627,7 +662,7 @@ run_groups:
  main_name: exact_match
  main_split: test
  taxonomy:
- task: short answer question answering
+ task: short-answer question answering
  what: OOD images and sketch images
  who: Human experts
  when: "2023"
@@ -635,48 +670,31 @@ run_groups:

  - name: bingo
  display_name: Bingo
- description: Open-ended questions about biased images
+ description: Open-ended questions about biased images and hallucinations-inducing images ([Cui et al., 2023](https://arxiv.org/abs/2311.03287)).
  metric_groups:
  - accuracy
+ - general_information
  environment:
- main_name: f1_score
+ main_name: prometheus_vision
  main_split: test
  taxonomy:
- task: short answer question answering
+ task: short-answer question answering
  what: Biased images about Region, OCR, Factual, Text-to-Image and Image-to-Image inference challenges
  who: Human experts
  when: "2023"
  language: English, Chinese, Japanese, etc.
-
- - name: multipanelvqa
- display_name: MultipanelVQA
- description: Question about real-world or synthetic multipanel images for evaluating multi-panel image reasoning ability
- metric_groups:
- - accuracy
- - efficiency
- - general_information
- environment:
- main_name: exact_match
- main_split: test
- taxonomy:
- task: short answer or multiple-choice question answering
- what: Real-world or synthetic multipanel images
- who: Human experts
- when: "2024"
- language: English
-
+
  - name: pope
  display_name: POPE
- description: Open-ended questions about object appearance in real-world images for evaluating hallucination behaviour
+ description: Open-ended questions about object appearance in real-world images for evaluating hallucination behaviour ([Li et al., 2023](https://aclanthology.org/2023.emnlp-main.20)).
  metric_groups:
  - accuracy
- - efficiency
  - general_information
  environment:
  main_name: exact_match
  main_split: test
  taxonomy:
- task: short answer question answering
+ task: short-answer question answering
  what: Real-world images
  who: Human experts
  when: "2023"
@@ -684,11 +702,9 @@ run_groups:

  - name: seed_bench
  display_name: Seed Bench
- description: A massive multiple-choice question-answering benchmark that spans 9 evaluation aspects with the image input
- including the comprehension of both the image and video modality
+ description: A massive multiple-choice question-answering benchmark that spans 9 evaluation aspects with the image input including the comprehension of both the image and video modality ([Li et al., 2023](https://arxiv.org/abs/2307.16125)).
  metric_groups:
  - accuracy
- - efficiency
  - general_information
  environment:
  main_name: exact_match
@@ -702,10 +718,9 @@ run_groups:

  - name: mme
  display_name: MME
- description: A comprehensive MLLM Evaluation benchmark with perception and cognition evaluations on 14 subtasks
+ description: A comprehensive MLLM Evaluation benchmark with perception and cognition evaluations on 14 subtasks ([Fu et al., 2023](https://arxiv.org/abs/2306.13394)).
  metric_groups:
  - accuracy
- - efficiency
  - general_information
  environment:
  main_name: exact_match
@@ -717,107 +732,98 @@ run_groups:
  when: "2023"
  language: English

- - name: mementos
- display_name: Mementos
- description: A Comprehensive Benchmark for Multimodal Large Language Model Reasoning over Image Sequences
+ - name: vibe_eval
+ display_name: Vibe Eval
+ description: A difficult evaluation suite for measuring progress of multimodal language models with day-to-day tasks ([Padlewski et al., 2024](https://arxiv.org/abs/2405.02287)).
  metric_groups:
  - accuracy
+ - general_information
  environment:
- main_name: f1_score
+ main_name: prometheus_vision
  main_split: test
  taxonomy:
- task: short answer question answering
- what: Image sequences of comics, dailylife and robotics
+ task: short-answer question answering
+ what: Knowledge intensive
  who: Human experts
  when: "2024"
  language: English

- - name: image2latex
- display_name: Image2LaTeX
- description: The Image2LaTeX benchmark for converting images of mathematical equations, tables. algorithms and tikz to LaTeX.
+ - name: mementos
+ display_name: Mementos
+ description: A Comprehensive Benchmark for Multimodal Large Language Model Reasoning over Image Sequences ([Wang et al., 2024](https://arxiv.org/abs/2401.10529)).
  metric_groups:
  - accuracy
- - generation_image
- - generation_text
- - efficiency
  - general_information
  environment:
- main_name: earth_mover_similarity
- main_split: valid
+ main_name: prometheus_vision
+ main_split: test
  taxonomy:
- task: image-to-text
- what: mathematical equations, tables, algorithms, tikz
- who: n/a
+ task: short-answer question answering
+ what: Image sequences of comics, daily life and robotics
+ who: Human experts
  when: "2024"
  language: English

- - name: image2webpage
- display_name: Image2webpage
- description: The Image2webpage benchmark for converting images of webpages to HTML/CSS/Javascript.
+ - name: pairs
+ display_name: PAIRS
+ description: Examining gender and racial bias using parallel images ([Fraser et al., 2024](https://arxiv.org/abs/2402.05779)).
  metric_groups:
  - accuracy
- - generation_image
- - generation_text
- - efficiency
  - general_information
  environment:
- main_name: earth_mover_similarity
- main_split: valid
+ main_name: exact_match
+ main_split: test
  taxonomy:
- task: image-to-text
- what: css, html, javascript
- who: n/a
+ task: multiple-choice question answering
+ what: Bias
+ who: Human experts
  when: "2024"
  language: English

- - name: image2musicsheet
- display_name: Image2musicsheet
- description: The Image2musicsheet benchmark for converting images of music sheets to LilyPond.
+ - name: fair_face
+ display_name: FairFace
+ description: Identify the race, gender or age of a photo of a person ([Karkkainen et al., 2019](https://arxiv.org/abs/1908.04913)).
  metric_groups:
  - accuracy
- - generation_image
- - efficiency
  - general_information
  environment:
- main_name: earth_mover_similarity
+ main_name: exact_match
  main_split: valid
  taxonomy:
- task: image-to-text
- what: music sheets
- who: n/a
- when: "2024"
+ task: multiple-choice question answering
+ what: Fairness
+ who: Human experts
+ when: "2019"
  language: English

- - name: chart2csv
- display_name: Chart2CSV
- description: The Chart2CSV benchmark for converting images of charts to CSV.
+ - name: real_world_qa
+ display_name: RealWorldQA
+ description: A benchmark designed to to evaluate real-world spatial understanding capabilities of multimodal models ([xAI, 2024](https://x.ai/blog/grok-1.5v)).
  metric_groups:
  - accuracy
- - efficiency
  - general_information
  environment:
  main_name: exact_match
  main_split: test
  taxonomy:
- task: chart to CSV
- what: plots
- who: n/a
+ task: short-answer question answering
+ what: Real world images
+ who: Human experts
  when: "2024"
  language: English

- - name: pairs
- display_name: PAIRS
- description: Examining Gender and Racial Bias in Large Vision-Language Models Using a Novel Dataset of Parallel Images.
+ - name: exams_v
+ display_name: Exams-V
+ description: A multimodal and multilingual benchmark with knowledge-intensive exam questions covering natural science, social science, and other miscellaneous studies ([Das et al., 2024]( https://arxiv.org/abs/2403.10378)).
  metric_groups:
  - accuracy
- - efficiency
  - general_information
  environment:
  main_name: exact_match
  main_split: test
  taxonomy:
  task: multiple-choice question answering
- what: Bias
+ what: Exam questions
  who: Human experts
  when: "2024"
- language: English
+ language: English, Chinese, Croation, Hungarian, Arabic, Serbian, Bulgarian, English, German, French, Spanish, Polish