crfm-helm 0.5.2__py3-none-any.whl → 0.5.4__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as they appear in their public registry, and is provided for informational purposes only.
Potentially problematic release: this version of crfm-helm has been flagged as possibly problematic (see the registry page for details).
- {crfm_helm-0.5.2.dist-info → crfm_helm-0.5.4.dist-info}/METADATA +81 -112
- {crfm_helm-0.5.2.dist-info → crfm_helm-0.5.4.dist-info}/RECORD +165 -155
- {crfm_helm-0.5.2.dist-info → crfm_helm-0.5.4.dist-info}/WHEEL +1 -1
- helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +12 -5
- helm/benchmark/adaptation/adapters/test_generation_adapter.py +12 -12
- helm/benchmark/adaptation/adapters/test_language_modeling_adapter.py +8 -8
- helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py +77 -9
- helm/benchmark/adaptation/common_adapter_specs.py +2 -0
- helm/benchmark/annotation/anthropic_red_team_annotator.py +57 -0
- helm/benchmark/annotation/call_center_annotator.py +258 -0
- helm/benchmark/annotation/financebench_annotator.py +79 -0
- helm/benchmark/annotation/harm_bench_annotator.py +55 -0
- helm/benchmark/annotation/{image2structure → image2struct}/latex_compiler_annotator.py +2 -2
- helm/benchmark/annotation/{image2structure → image2struct}/lilypond_compiler_annotator.py +5 -3
- helm/benchmark/annotation/{image2structure → image2struct}/webpage_compiler_annotator.py +5 -5
- helm/benchmark/annotation/live_qa_annotator.py +37 -45
- helm/benchmark/annotation/medication_qa_annotator.py +36 -44
- helm/benchmark/annotation/model_as_judge.py +96 -0
- helm/benchmark/annotation/simple_safety_tests_annotator.py +50 -0
- helm/benchmark/annotation/xstest_annotator.py +100 -0
- helm/benchmark/metrics/annotation_metrics.py +108 -0
- helm/benchmark/metrics/bhasa_metrics.py +188 -0
- helm/benchmark/metrics/bhasa_metrics_specs.py +10 -0
- helm/benchmark/metrics/code_metrics_helper.py +11 -1
- helm/benchmark/metrics/safety_metrics.py +79 -0
- helm/benchmark/metrics/summac/model_summac.py +3 -3
- helm/benchmark/metrics/tokens/test_ai21_token_cost_estimator.py +2 -2
- helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py +4 -4
- helm/benchmark/metrics/unitxt_metrics.py +17 -3
- helm/benchmark/metrics/vision_language/image_metrics.py +7 -3
- helm/benchmark/metrics/vision_language/image_utils.py +1 -1
- helm/benchmark/model_metadata_registry.py +3 -3
- helm/benchmark/presentation/create_plots.py +1 -1
- helm/benchmark/presentation/schema.py +3 -0
- helm/benchmark/presentation/summarize.py +106 -256
- helm/benchmark/presentation/test_run_entry.py +1 -0
- helm/benchmark/presentation/test_summarize.py +145 -3
- helm/benchmark/run.py +15 -0
- helm/benchmark/run_expander.py +83 -30
- helm/benchmark/run_specs/bhasa_run_specs.py +652 -0
- helm/benchmark/run_specs/call_center_run_specs.py +152 -0
- helm/benchmark/run_specs/decodingtrust_run_specs.py +8 -8
- helm/benchmark/run_specs/experimental_run_specs.py +52 -0
- helm/benchmark/run_specs/finance_run_specs.py +82 -1
- helm/benchmark/run_specs/safety_run_specs.py +154 -0
- helm/benchmark/run_specs/vlm_run_specs.py +100 -24
- helm/benchmark/scenarios/anthropic_red_team_scenario.py +71 -0
- helm/benchmark/scenarios/banking77_scenario.py +51 -0
- helm/benchmark/scenarios/bhasa_scenario.py +1942 -0
- helm/benchmark/scenarios/call_center_scenario.py +84 -0
- helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +2 -1
- helm/benchmark/scenarios/ewok_scenario.py +116 -0
- helm/benchmark/scenarios/fin_qa_scenario.py +2 -0
- helm/benchmark/scenarios/financebench_scenario.py +53 -0
- helm/benchmark/scenarios/harm_bench_scenario.py +59 -0
- helm/benchmark/scenarios/raft_scenario.py +1 -1
- helm/benchmark/scenarios/scenario.py +1 -1
- helm/benchmark/scenarios/simple_safety_tests_scenario.py +33 -0
- helm/benchmark/scenarios/test_commonsense_scenario.py +21 -0
- helm/benchmark/scenarios/test_ewok_scenario.py +25 -0
- helm/benchmark/scenarios/test_financebench_scenario.py +26 -0
- helm/benchmark/scenarios/test_gsm_scenario.py +31 -0
- helm/benchmark/scenarios/test_legalbench_scenario.py +30 -0
- helm/benchmark/scenarios/test_math_scenario.py +2 -8
- helm/benchmark/scenarios/test_med_qa_scenario.py +30 -0
- helm/benchmark/scenarios/test_mmlu_scenario.py +33 -0
- helm/benchmark/scenarios/test_narrativeqa_scenario.py +73 -0
- helm/benchmark/scenarios/thai_exam_scenario.py +4 -4
- helm/benchmark/scenarios/vision_language/a_okvqa_scenario.py +1 -1
- helm/benchmark/scenarios/vision_language/bingo_scenario.py +2 -2
- helm/benchmark/scenarios/vision_language/crossmodal_3600_scenario.py +2 -1
- helm/benchmark/scenarios/vision_language/exams_v_scenario.py +104 -0
- helm/benchmark/scenarios/vision_language/fair_face_scenario.py +136 -0
- helm/benchmark/scenarios/vision_language/flickr30k_scenario.py +1 -1
- helm/benchmark/scenarios/vision_language/gqa_scenario.py +2 -2
- helm/benchmark/scenarios/vision_language/hateful_memes_scenario.py +1 -1
- helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/chart2csv_scenario.py +1 -1
- helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/latex_scenario.py +3 -3
- helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/musicsheet_scenario.py +1 -1
- helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/utils_latex.py +31 -39
- helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/webpage/driver.py +1 -1
- helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/webpage/utils.py +1 -1
- helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/webpage_scenario.py +41 -12
- helm/benchmark/scenarios/vision_language/math_vista_scenario.py +1 -1
- helm/benchmark/scenarios/vision_language/mementos_scenario.py +3 -3
- helm/benchmark/scenarios/vision_language/mm_safety_bench_scenario.py +2 -2
- helm/benchmark/scenarios/vision_language/mme_scenario.py +21 -18
- helm/benchmark/scenarios/vision_language/mmmu_scenario.py +1 -1
- helm/benchmark/scenarios/vision_language/pairs_scenario.py +1 -1
- helm/benchmark/scenarios/vision_language/pope_scenario.py +2 -1
- helm/benchmark/scenarios/vision_language/real_world_qa_scenario.py +57 -0
- helm/benchmark/scenarios/vision_language/seed_bench_scenario.py +7 -5
- helm/benchmark/scenarios/vision_language/unicorn_scenario.py +2 -2
- helm/benchmark/scenarios/vision_language/vibe_eval_scenario.py +6 -3
- helm/benchmark/scenarios/vision_language/viz_wiz_scenario.py +1 -1
- helm/benchmark/scenarios/vision_language/vqa_scenario.py +3 -1
- helm/benchmark/scenarios/xstest_scenario.py +35 -0
- helm/benchmark/server.py +1 -6
- helm/benchmark/static/schema_air_bench.yaml +750 -750
- helm/benchmark/static/schema_bhasa.yaml +709 -0
- helm/benchmark/static/schema_call_center.yaml +232 -0
- helm/benchmark/static/schema_cleva.yaml +768 -0
- helm/benchmark/static/schema_decodingtrust.yaml +444 -0
- helm/benchmark/static/schema_ewok.yaml +367 -0
- helm/benchmark/static/schema_finance.yaml +55 -9
- helm/benchmark/static/{schema_image2structure.yaml → schema_image2struct.yaml} +231 -90
- helm/benchmark/static/schema_legal.yaml +566 -0
- helm/benchmark/static/schema_safety.yaml +266 -0
- helm/benchmark/static/schema_tables.yaml +149 -8
- helm/benchmark/static/schema_thai.yaml +21 -0
- helm/benchmark/static/schema_vhelm.yaml +137 -101
- helm/benchmark/static_build/assets/accenture-6f97eeda.png +0 -0
- helm/benchmark/static_build/assets/aisingapore-6dfc9acf.png +0 -0
- helm/benchmark/static_build/assets/cresta-9e22b983.png +0 -0
- helm/benchmark/static_build/assets/cuhk-8c5631e9.png +0 -0
- helm/benchmark/static_build/assets/index-05c76bb1.css +1 -0
- helm/benchmark/static_build/assets/index-3ee38b3d.js +10 -0
- helm/benchmark/static_build/assets/scb10x-204bd786.png +0 -0
- helm/benchmark/static_build/assets/vhelm-aspects-1437d673.png +0 -0
- helm/benchmark/static_build/assets/vhelm-framework-a1ca3f3f.png +0 -0
- helm/benchmark/static_build/assets/vhelm-model-8afb7616.png +0 -0
- helm/benchmark/static_build/assets/wellsfargo-a86a6c4a.png +0 -0
- helm/benchmark/static_build/index.html +2 -2
- helm/benchmark/window_services/test_openai_window_service.py +8 -8
- helm/benchmark/window_services/tokenizer_service.py +0 -5
- helm/clients/ai21_client.py +71 -1
- helm/clients/anthropic_client.py +7 -19
- helm/clients/huggingface_client.py +38 -37
- helm/clients/nvidia_nim_client.py +35 -0
- helm/clients/openai_client.py +18 -4
- helm/clients/palmyra_client.py +24 -0
- helm/clients/perspective_api_client.py +11 -6
- helm/clients/test_client.py +4 -6
- helm/clients/together_client.py +22 -0
- helm/clients/vision_language/open_flamingo_client.py +1 -2
- helm/clients/vision_language/palmyra_vision_client.py +28 -13
- helm/common/cache.py +8 -30
- helm/common/images_utils.py +6 -0
- helm/common/key_value_store.py +9 -9
- helm/common/mongo_key_value_store.py +5 -4
- helm/common/request.py +16 -0
- helm/common/test_cache.py +1 -48
- helm/common/tokenization_request.py +0 -9
- helm/config/model_deployments.yaml +444 -329
- helm/config/model_metadata.yaml +513 -111
- helm/config/tokenizer_configs.yaml +140 -11
- helm/proxy/example_queries.py +14 -21
- helm/proxy/server.py +0 -9
- helm/proxy/services/remote_service.py +0 -6
- helm/proxy/services/server_service.py +6 -20
- helm/proxy/services/service.py +0 -6
- helm/proxy/token_counters/test_auto_token_counter.py +2 -2
- helm/tokenizers/ai21_tokenizer.py +51 -59
- helm/tokenizers/cohere_tokenizer.py +0 -75
- helm/tokenizers/huggingface_tokenizer.py +0 -1
- helm/tokenizers/test_ai21_tokenizer.py +48 -0
- helm/benchmark/data_overlap/data_overlap_spec.py +0 -86
- helm/benchmark/data_overlap/export_scenario_text.py +0 -119
- helm/benchmark/data_overlap/light_scenario.py +0 -60
- helm/benchmark/scenarios/vision_language/image2structure/webpage/__init__.py +0 -0
- helm/benchmark/static/benchmarking.css +0 -156
- helm/benchmark/static/benchmarking.js +0 -1705
- helm/benchmark/static/config.js +0 -3
- helm/benchmark/static/general.js +0 -122
- helm/benchmark/static/images/crfm-logo.png +0 -0
- helm/benchmark/static/images/helm-logo-simple.png +0 -0
- helm/benchmark/static/images/helm-logo.png +0 -0
- helm/benchmark/static/images/language-model-helm.png +0 -0
- helm/benchmark/static/images/organizations/ai21.png +0 -0
- helm/benchmark/static/images/organizations/anthropic.png +0 -0
- helm/benchmark/static/images/organizations/bigscience.png +0 -0
- helm/benchmark/static/images/organizations/cohere.png +0 -0
- helm/benchmark/static/images/organizations/eleutherai.png +0 -0
- helm/benchmark/static/images/organizations/google.png +0 -0
- helm/benchmark/static/images/organizations/meta.png +0 -0
- helm/benchmark/static/images/organizations/microsoft.png +0 -0
- helm/benchmark/static/images/organizations/nvidia.png +0 -0
- helm/benchmark/static/images/organizations/openai.png +0 -0
- helm/benchmark/static/images/organizations/together.png +0 -0
- helm/benchmark/static/images/organizations/tsinghua-keg.png +0 -0
- helm/benchmark/static/images/organizations/yandex.png +0 -0
- helm/benchmark/static/images/scenarios-by-metrics.png +0 -0
- helm/benchmark/static/images/taxonomy-scenarios.png +0 -0
- helm/benchmark/static/index.html +0 -68
- helm/benchmark/static/info-icon.png +0 -0
- helm/benchmark/static/json-urls.js +0 -69
- helm/benchmark/static/plot-captions.js +0 -27
- helm/benchmark/static/utils.js +0 -285
- helm/benchmark/static_build/assets/index-30dbceba.js +0 -10
- helm/benchmark/static_build/assets/index-66b02d40.css +0 -1
- helm/benchmark/static_build/assets/vhelm-framework-cde7618a.png +0 -0
- helm/benchmark/static_build/assets/vhelm-model-6d812526.png +0 -0
- helm/benchmark/window_services/ai21_window_service.py +0 -247
- helm/benchmark/window_services/cohere_window_service.py +0 -101
- helm/benchmark/window_services/test_ai21_window_service.py +0 -163
- helm/benchmark/window_services/test_cohere_window_service.py +0 -75
- helm/benchmark/window_services/test_cohere_window_service_utils.py +0 -8328
- helm/benchmark/window_services/test_ice_window_service.py +0 -327
- helm/tokenizers/ice_tokenizer.py +0 -30
- helm/tokenizers/test_ice_tokenizer.py +0 -57
- {crfm_helm-0.5.2.dist-info → crfm_helm-0.5.4.dist-info}/LICENSE +0 -0
- {crfm_helm-0.5.2.dist-info → crfm_helm-0.5.4.dist-info}/entry_points.txt +0 -0
- {crfm_helm-0.5.2.dist-info → crfm_helm-0.5.4.dist-info}/top_level.txt +0 -0
- /helm/benchmark/annotation/{image2structure → image2struct}/__init__.py +0 -0
- /helm/benchmark/annotation/{image2structure → image2struct}/image_compiler_annotator.py +0 -0
- /helm/benchmark/{data_overlap → scenarios/vision_language/image2struct}/__init__.py +0 -0
- /helm/benchmark/scenarios/vision_language/{image2structure/image2structure_scenario.py → image2struct/image2struct_scenario.py} +0 -0
- /helm/benchmark/scenarios/vision_language/{image2structure → image2struct/webpage}/__init__.py +0 -0
- /helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/webpage/jekyll_server.py +0 -0
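A practical note on the image2structure → image2struct renames listed above: any downstream code that imports these modules by their old dotted path will need updating. The sketch below is illustrative only; the module paths are taken from the file list, no specific classes are assumed, and importing them requires the relevant optional dependencies of the installed wheel.

# Old module path (crfm-helm 0.5.2):
# from helm.benchmark.annotation.image2structure import latex_compiler_annotator

# New module paths (crfm-helm 0.5.4), after the package directories were renamed to image2struct:
from helm.benchmark.annotation.image2struct import latex_compiler_annotator  # noqa: F401
from helm.benchmark.scenarios.vision_language.image2struct import latex_scenario  # noqa: F401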
@@ -0,0 +1,266 @@
+---
+############################################################
+metrics:
+  # Infrastructure metrics:
+  - name: num_perplexity_tokens
+    display_name: '# tokens'
+    description: Average number of tokens in the predicted output (for language modeling, the input too).
+  - name: num_bytes
+    display_name: '# bytes'
+    description: Average number of bytes in the predicted output (for language modeling, the input too).
+
+  - name: num_references
+    display_name: '# ref'
+    description: Number of references.
+  - name: num_train_trials
+    display_name: '# trials'
+    description: Number of trials, where in each trial we choose an independent, random set of training instances.
+  - name: estimated_num_tokens_cost
+    display_name: 'cost'
+    description: An estimate of the number of tokens (including prompt and output completions) needed to perform the request.
+  - name: num_prompt_tokens
+    display_name: '# prompt tokens'
+    description: Number of tokens in the prompt.
+  - name: num_prompt_characters
+    display_name: '# prompt chars'
+    description: Number of characters in the prompt.
+  - name: num_completion_tokens
+    display_name: '# completion tokens'
+    description: Actual number of completion tokens (over all completions).
+  - name: num_output_tokens
+    display_name: '# output tokens'
+    description: Actual number of output tokens.
+  - name: max_num_output_tokens
+    display_name: 'Max output tokens'
+    description: Maximum number of output tokens (overestimate since we might stop earlier due to stop sequences).
+  - name: num_requests
+    display_name: '# requests'
+    description: Number of distinct API requests.
+  - name: num_instances
+    display_name: '# eval'
+    description: Number of evaluation instances.
+  - name: num_train_instances
+    display_name: '# train'
+    description: Number of training instances (e.g., in-context examples).
+  - name: prompt_truncated
+    display_name: truncated
+    description: Fraction of instances where the prompt itself was truncated (implies that there were no in-context examples).
+  - name: finish_reason_length
+    display_name: finish b/c length
+    description: Fraction of instances where the the output was terminated because of the max tokens limit.
+  - name: finish_reason_stop
+    display_name: finish b/c stop
+    description: Fraction of instances where the the output was terminated because of the stop sequences.
+  - name: finish_reason_endoftext
+    display_name: finish b/c endoftext
+    description: Fraction of instances where the the output was terminated because the end of text token was generated.
+  - name: finish_reason_unknown
+    display_name: finish b/c unknown
+    description: Fraction of instances where the the output was terminated for unknown reasons.
+  - name: num_completions
+    display_name: '# completions'
+    description: Number of completions.
+  - name: predicted_index
+    display_name: Predicted index
+    description: Integer index of the reference (0, 1, ...) that was predicted by the model (for multiple-choice).
+
+  # Accuracy metrics:
+  - name: exact_match
+    display_name: Exact match
+    short_display_name: EM
+    description: Fraction of instances that the predicted output matches a correct reference exactly.
+    lower_is_better: false
+  - name: f1_score
+    display_name: F1
+    description: Average F1 score in terms of word overlap between the model output and correct reference.
+    lower_is_better: false
+  - name: safety_score
+    display_name: LM Evaluated Safety score
+    description: LM Evaluated Safety score (mean of Llama3.1 and GPT4o ratings) representing whether the output constitutes a successful refusal of unsafe actions (0 is worst, 1 is best).
+    lower_is_better: false
+  - name: bbq_accuracy
+    display_name: BBQ accuracy
+    description: BBQ accuracy
+    lower_is_better: false
+
+  # Toxicity metrics
+  - name: expected_max_toxicity
+    display_name: Average max toxicity
+    short_display_name: Avg. max toxicity
+    lower_is_better: true
+    description: Average max toxicity score across all completions (based on the PerspectiveAPI toxicity classifier; [Gehman et al. (2020)](https://aclanthology.org/2020.findings-emnlp.301/)).
+  - name: max_toxicity_probability
+    display_name: max toxicity
+    lower_is_better: true
+    description: Fraction of instances where most toxic completion is above 0.5 toxicity (based on the PerspectiveAPI toxicity classifier; [Gehman et al. (2020)](https://aclanthology.org/2020.findings-emnlp.301/)).
+  - name: toxic_frac
+    display_name: Toxic fraction
+    lower_is_better: true
+    description: Fraction of model outputs that are toxic (based on the PerspectiveAPI toxicity classifier).
+
+  # Annotation metrics
+  - name: safety_llama_annotator_success
+    display_name: Llama Annotator Success Rate
+    description: Fraction of annotator requests to Llama that succeeded.
+  - name: safety_gpt_annotator_success
+    display_name: GPT Annotator Success Rate
+    description: Fraction of annotator requests to GPT that succeeded.
+
+############################################################
+perturbations: []
+
+############################################################
+metric_groups:
+  - name: accuracy
+    display_name: Accuracy
+    aggregation_strategies:
+      - win_rate
+      - mean
+    metrics:
+      - name: ${main_name}
+        split: ${main_split}
+
+  - name: efficiency
+    display_name: Efficiency
+    metrics:
+      - name: inference_runtime
+        split: ${main_split}
+
+  - name: general_information
+    display_name: General information
+    hide_win_rates: true
+    metrics:
+      - name: num_instances
+        split: ${main_split}
+      - name: num_train_instances
+        split: ${main_split}
+      - name: prompt_truncated
+        split: ${main_split}
+      - name: num_prompt_tokens
+        split: ${main_split}
+      - name: num_output_tokens
+        split: ${main_split}
+
+  - name: annotation_metrics
+    display_name: Annotation
+    hide_win_rates: true
+    metrics:
+      - name: safety_llama_annotator_success
+        split: ${main_split}
+      - name: safety_gpt_annotator_success
+        split: ${main_split}
+
+  - name: toxicity
+    display_name: Toxicity
+    metrics:
+      - name: toxic_frac
+        split: ${main_split}
+
+  - name: bbq_metrics
+    display_name: BBQ metrics
+    description: Metrics used for the BBQ bias benchmark.
+    metrics:
+      - name: bbq_metric_ambiguous_bias
+        split: ${main_split}
+      - name: bbq_metric_unambiguous_bias
+        split: ${main_split}
+
+############################################################
+run_groups:
+  - name: safety_scenarios
+    display_name: Safety Scenarios
+    description: Scenarios for the model safety
+    category: All scenarios
+    subgroups:
+      - harm_bench
+      - simple_safety_tests
+      - bbq
+      - anthropic_red_team
+      - xstest
+
+  - name: harm_bench
+    display_name: HarmBench
+    description: HarmBench
+    metric_groups:
+      - accuracy
+      - general_information
+      - annotation_metrics
+    environment:
+      main_name: safety_score
+      main_split: test
+    taxonomy:
+      task: question answering
+      what: n/a
+      who: n/a
+      when: n/a
+      language: English
+
+  - name: simple_safety_tests
+    display_name: SimpleSafetyTests
+    description: SimpleSafetyTests
+    metric_groups:
+      - accuracy
+      - general_information
+      - annotation_metrics
+    environment:
+      main_name: safety_score
+      main_split: test
+    taxonomy:
+      task: question answering
+      what: n/a
+      who: n/a
+      when: n/a
+      language: English
+
+  - name: xstest
+    display_name: XSTest
+    description: XSTest
+    metric_groups:
+      - accuracy
+      - general_information
+      - annotation_metrics
+    environment:
+      main_name: safety_score
+      main_split: test
+    taxonomy:
+      task: question answering
+      what: n/a
+      who: n/a
+      when: n/a
+      language: English
+
+  - name: bbq
+    display_name: BBQ
+    description: The Bias Benchmark for Question Answering (BBQ) for measuring social bias in question answering in ambiguous and unambigous context [(Parrish et al., 2022)](https://aclanthology.org/2022.findings-acl.165/).
+    metric_groups:
+      - accuracy
+      - general_information
+      - bbq_metrics
+      - annotation_metrics
+    environment:
+      main_name: bbq_accuracy
+      main_split: test
+    taxonomy:
+      task: question answering
+      what: n/a
+      who: n/a
+      when: n/a
+      language: English
+
+  - name: anthropic_red_team
+    display_name: Anthropic Red Team
+    short_display_name: Anthropic Red Team
+    description: Anthropic Red Team
+    metric_groups:
+      - accuracy
+      - general_information
+      - annotation_metrics
+    environment:
+      main_name: safety_score
+      main_split: test
+    taxonomy:
+      task: question answering
+      what: "?"
+      who: "?"
+      when: "?"
+      language: English
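The 266-line new-file hunk above matches the helm/benchmark/static/schema_safety.yaml entry in the file list (+266 -0). As a rough illustration of the structure such a schema defines, here is a minimal sketch that loads a schema file with PyYAML and enumerates its metrics and run groups; the file path and the use of PyYAML are assumptions for illustration and this is not HELM's own loading code.

import yaml  # assumes PyYAML is installed

# Hypothetical path; adjust to wherever the installed package keeps its static schemas.
SCHEMA_PATH = "helm/benchmark/static/schema_safety.yaml"

with open(SCHEMA_PATH) as f:
    schema = yaml.safe_load(f)

# Top-level keys seen in the hunk above: metrics, perturbations, metric_groups, run_groups.
print("metrics:", [m["name"] for m in schema["metrics"]])
print("metric groups:", [g["name"] for g in schema["metric_groups"]])
for group in schema["run_groups"]:
    # Leaf run groups reference metric_groups and set main_name/main_split in `environment`.
    env = group.get("environment", {})
    print(group["name"], "->", env.get("main_name"), "on split", env.get("main_split"))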
@@ -99,47 +99,101 @@ metrics:
     display_name: METEOR
     short_display_name: METEOR
     description: METEOR
+    lower_is_better: false
   - name: f1
-    display_name: F1
-    short_display_name: F1
-    description: F1
+    display_name: BERTScore F1
+    short_display_name: BERTScore F1
+    description: BERTScore F1
+    lower_is_better: false
   - name: precision
     display_name: Precision
     short_display_name: Precision
     description: Precision
+    lower_is_better: false
   - name: recall
     display_name: Recall
     short_display_name: Recall
     description: Recall
+    lower_is_better: false
   - name: rouge1
     display_name: ROUGE-1
     short_display_name: ROUGE-1
     description: ROUGE-1
+    lower_is_better: false
   - name: rouge2
     display_name: ROUGE-2
     short_display_name: ROUGE-2
     description: ROUGE-2
+    lower_is_better: false
   - name: rougeL
     display_name: ROUGE-L
     short_display_name: ROUGE-L
     description: ROUGE-L
+    lower_is_better: false
   - name: rougeLsum
     display_name: ROUGE-Lsum
     short_display_name: ROUGE-Lsum
     description: ROUGE-Lsum
+    lower_is_better: false
   - name: bleu
     display_name: BLEU
     short_display_name: BLEU
     description: BLEU
+    lower_is_better: false
+  - name: accuracy
+    display_name: Accuracy
+    short_display_name: Accuracy
+    description: Accuracy
+    lower_is_better: false
+  - name: f1_macro
+    display_name: Macro F1
+    short_display_name: Macro F1
+    description: Macro F1
+    lower_is_better: false
+  - name: f1_micro
+    display_name: Micro F1
+    short_display_name: Micro F1
+    description: Micro F1
+    lower_is_better: false
+  - name: unsorted_list_exact_match
+    display_name: Unsorted List Exact Match
+    short_display_name: Exact Match
+    description: Unsorted List Exact Match
+    lower_is_better: false
+
+  # FinQA Accuracy
+  - name: program_accuracy
+    display_name: Program Accuracy
+    short_display_name: Program Accuracy
+    description: Program Accuracy
+    lower_is_better: false
+  - name: execution_accuracy
+    display_name: Execution Accuracy
+    short_display_name: Execution Accuracy
+    description: Execution Accuracy
+    lower_is_better: false
+
+  # SciGen Accuracy
+  - name: llama_3_8b_chat_hf_together_ai_template_table2text_single_turn_with_reference
+    display_name: Rating
+    short_display_name: Rating
+    description: Rating by Llama 3 (8B) LLM as judge
+    lower_is_better: false
 
 perturbations: []
 
 metric_groups:
-  - name:
-    display_name:
+  - name: main_metrics
+    display_name: Main Metrics
+    metrics:
+      - name: ${main_name}
+        split: __all__
+
+  - name: generation_metrics
+    display_name: Other Generation Metrics
     hide_win_rates: true
     metrics:
-      - name:
+      - name: f1
         split: __all__
       - name: rouge1
         split: __all__
@@ -152,6 +206,17 @@ metric_groups:
       - name: bleu
         split: __all__
 
+  - name: classification_metrics
+    display_name: Classification Metrics
+    hide_win_rates: true
+    metrics:
+      - name: accuracy
+        split: __all__
+      - name: f1_macro
+        split: __all__
+      - name: f1_micro
+        split: __all__
+
   - name: efficiency
     display_name: Efficiency
     metrics:
@@ -175,18 +240,22 @@ metric_groups:
 
 run_groups:
   - name: table_scenarios
-    display_name: Table
+    display_name: Table Scenarios
     description: Table Scenarios
     category: All Scenarios
     subgroups:
       - unitxt_cards.numeric_nlg
+      - unitxt_cards.tab_fact
+      - unitxt_cards.wikitq
+      - unitxt_cards.scigen
 
   - name: unitxt_cards.numeric_nlg
     display_name: NumericNLG
     short_display_name: NumericNLG
     description: "NumericNLG is a dataset for numerical table-to-text generation using pairs of a table and a paragraph of a table description with richer inference from scientific papers."
     metric_groups:
-      -
+      - main_metrics
+      - generation_metrics
       - efficiency
       - general_information
     environment:
@@ -198,3 +267,75 @@ run_groups:
       who: "?"
       when: "?"
       language: English
+
+  - name: unitxt_cards.tab_fact
+    display_name: TabFact
+    short_display_name: TabFact
+    description: "tab_fact is a large-scale dataset for the task of fact-checking on tables."
+    metric_groups:
+      - main_metrics
+      - classification_metrics
+      - efficiency
+      - general_information
+    environment:
+      main_name: accuracy
+      main_split: test
+    taxonomy:
+      task: "?"
+      what: "?"
+      who: "?"
+      when: "?"
+      language: English
+
+  - name: unitxt_cards.wikitq
+    display_name: WikiTableQuestions
+    short_display_name: WikiTableQuestions
+    description: "This WikiTableQuestions dataset is a large-scale dataset for the task of question answering on semi-structured tables."
+    metric_groups:
+      - main_metrics
+      - classification_metrics
+      - efficiency
+      - general_information
+    environment:
+      main_name: unsorted_list_exact_match
+      main_split: test
+    taxonomy:
+      task: "?"
+      what: "?"
+      who: "?"
+      when: "?"
+      language: English
+
+  - name: unitxt_cards.fin_qa
+    display_name: FinQA
+    description: The FinQA benchmark for numeric reasoning over financial data, with question answering pairs written by financial experts over financial reports [(Chen et al., 2021)](https://arxiv.org/abs/2109.00122/).
+    metric_groups:
+      - main_metrics
+      - efficiency
+      - general_information
+    environment:
+      main_name: program_accuracy
+      main_split: test
+    taxonomy:
+      task: question answering with numeric reasoning
+      what: financial reports
+      who: financial experts
+      when: 1999 to 2019
+      language: English
+
+  - name: unitxt_cards.scigen
+    display_name: SciGen
+    description: SciGen
+    metric_groups:
+      - main_metrics
+      - efficiency
+      - general_information
+    environment:
+      main_name: llama_3_8b_chat_hf_together_ai_template_table2text_single_turn_with_reference
+      main_split: test
+    taxonomy:
+      task: "?"
+      what: "?"
+      who: "?"
+      when: "?"
+      language: English
@@ -78,6 +78,7 @@ perturbations: []
 metric_groups:
   - name: accuracy
     display_name: Accuracy
+    hide_win_rates: true
     metrics:
       - name: ${main_name}
         split: ${main_split}
@@ -111,12 +112,32 @@ run_groups:
     description: Thai-language scenarios
     category: All scenarios
     subgroups:
+      - thai_exam
       - thai_exam_onet
       - thai_exam_ic
       - thai_exam_tgat
       - thai_exam_tpat1
       - thai_exam_a_level
 
+
+  - name: thai_exam
+    display_name: ThaiExam
+    description: >
+      Macro-averaged accuracy on all ThaiExam examinations.
+    metric_groups:
+      - accuracy
+      - efficiency
+      - general_information
+    environment:
+      main_name: exact_match
+      main_split: test
+    taxonomy:
+      task: question answering
+      what: n/a
+      who: n/a
+      when: "?"
+      language: Thai and English
+
   - name: thai_exam_onet
     display_name: ONET
     description: >
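The new thai_exam run group above is described as "Macro-averaged accuracy on all ThaiExam examinations." A small illustrative sketch of that arithmetic follows; the per-exam names come from the subgroups in the hunk above, and the accuracy values are made up for the example.

# Hypothetical per-exam exact_match accuracies; the values are illustrative only.
per_exam_accuracy = {
    "thai_exam_onet": 0.62,
    "thai_exam_ic": 0.55,
    "thai_exam_tgat": 0.70,
    "thai_exam_tpat1": 0.48,
    "thai_exam_a_level": 0.59,
}

# Macro average: an unweighted mean over exams, regardless of how many questions each exam has.
macro_avg = sum(per_exam_accuracy.values()) / len(per_exam_accuracy)
print(f"thai_exam (macro-averaged accuracy): {macro_avg:.3f}")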