crfm-helm 0.5.2__py3-none-any.whl → 0.5.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of crfm-helm might be problematic. Click here for more details.
- {crfm_helm-0.5.2.dist-info → crfm_helm-0.5.3.dist-info}/METADATA +29 -55
- {crfm_helm-0.5.2.dist-info → crfm_helm-0.5.3.dist-info}/RECORD +146 -134
- {crfm_helm-0.5.2.dist-info → crfm_helm-0.5.3.dist-info}/WHEEL +1 -1
- helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +12 -5
- helm/benchmark/adaptation/adapters/test_generation_adapter.py +12 -12
- helm/benchmark/adaptation/adapters/test_language_modeling_adapter.py +8 -8
- helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py +77 -9
- helm/benchmark/adaptation/common_adapter_specs.py +2 -0
- helm/benchmark/annotation/anthropic_red_team_annotator.py +70 -0
- helm/benchmark/annotation/call_center_annotator.py +247 -0
- helm/benchmark/annotation/financebench_annotator.py +79 -0
- helm/benchmark/annotation/harm_bench_annotator.py +68 -0
- helm/benchmark/annotation/{image2structure → image2struct}/latex_compiler_annotator.py +2 -2
- helm/benchmark/annotation/{image2structure → image2struct}/lilypond_compiler_annotator.py +5 -3
- helm/benchmark/annotation/{image2structure → image2struct}/webpage_compiler_annotator.py +5 -5
- helm/benchmark/annotation/live_qa_annotator.py +32 -45
- helm/benchmark/annotation/medication_qa_annotator.py +31 -44
- helm/benchmark/annotation/model_as_judge.py +45 -0
- helm/benchmark/annotation/simple_safety_tests_annotator.py +64 -0
- helm/benchmark/annotation/xstest_annotator.py +110 -0
- helm/benchmark/metrics/annotation_metrics.py +108 -0
- helm/benchmark/metrics/bhasa_metrics.py +188 -0
- helm/benchmark/metrics/bhasa_metrics_specs.py +10 -0
- helm/benchmark/metrics/code_metrics_helper.py +11 -1
- helm/benchmark/metrics/safety_metrics.py +57 -0
- helm/benchmark/metrics/summac/model_summac.py +3 -3
- helm/benchmark/metrics/tokens/test_ai21_token_cost_estimator.py +2 -2
- helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py +4 -4
- helm/benchmark/metrics/vision_language/image_metrics.py +1 -1
- helm/benchmark/metrics/vision_language/image_utils.py +1 -1
- helm/benchmark/model_metadata_registry.py +3 -3
- helm/benchmark/presentation/test_run_entry.py +1 -0
- helm/benchmark/run.py +15 -0
- helm/benchmark/run_expander.py +56 -30
- helm/benchmark/run_specs/bhasa_run_specs.py +638 -0
- helm/benchmark/run_specs/call_center_run_specs.py +152 -0
- helm/benchmark/run_specs/decodingtrust_run_specs.py +8 -8
- helm/benchmark/run_specs/experimental_run_specs.py +52 -0
- helm/benchmark/run_specs/finance_run_specs.py +78 -1
- helm/benchmark/run_specs/safety_run_specs.py +154 -0
- helm/benchmark/run_specs/vlm_run_specs.py +92 -21
- helm/benchmark/scenarios/anthropic_red_team_scenario.py +71 -0
- helm/benchmark/scenarios/banking77_scenario.py +51 -0
- helm/benchmark/scenarios/bhasa_scenario.py +1798 -0
- helm/benchmark/scenarios/call_center_scenario.py +84 -0
- helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +2 -1
- helm/benchmark/scenarios/ewok_scenario.py +116 -0
- helm/benchmark/scenarios/fin_qa_scenario.py +2 -0
- helm/benchmark/scenarios/financebench_scenario.py +53 -0
- helm/benchmark/scenarios/harm_bench_scenario.py +59 -0
- helm/benchmark/scenarios/scenario.py +1 -1
- helm/benchmark/scenarios/simple_safety_tests_scenario.py +33 -0
- helm/benchmark/scenarios/test_commonsense_scenario.py +21 -0
- helm/benchmark/scenarios/test_ewok_scenario.py +25 -0
- helm/benchmark/scenarios/test_financebench_scenario.py +26 -0
- helm/benchmark/scenarios/test_gsm_scenario.py +31 -0
- helm/benchmark/scenarios/test_legalbench_scenario.py +30 -0
- helm/benchmark/scenarios/test_math_scenario.py +2 -8
- helm/benchmark/scenarios/test_med_qa_scenario.py +30 -0
- helm/benchmark/scenarios/test_mmlu_scenario.py +33 -0
- helm/benchmark/scenarios/test_narrativeqa_scenario.py +73 -0
- helm/benchmark/scenarios/thai_exam_scenario.py +4 -4
- helm/benchmark/scenarios/vision_language/a_okvqa_scenario.py +1 -1
- helm/benchmark/scenarios/vision_language/bingo_scenario.py +2 -2
- helm/benchmark/scenarios/vision_language/crossmodal_3600_scenario.py +2 -1
- helm/benchmark/scenarios/vision_language/exams_v_scenario.py +104 -0
- helm/benchmark/scenarios/vision_language/fair_face_scenario.py +136 -0
- helm/benchmark/scenarios/vision_language/flickr30k_scenario.py +1 -1
- helm/benchmark/scenarios/vision_language/gqa_scenario.py +2 -2
- helm/benchmark/scenarios/vision_language/hateful_memes_scenario.py +1 -1
- helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/chart2csv_scenario.py +1 -1
- helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/latex_scenario.py +3 -3
- helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/musicsheet_scenario.py +1 -1
- helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/utils_latex.py +31 -39
- helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/webpage/driver.py +1 -1
- helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/webpage/utils.py +1 -1
- helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/webpage_scenario.py +41 -12
- helm/benchmark/scenarios/vision_language/math_vista_scenario.py +1 -1
- helm/benchmark/scenarios/vision_language/mementos_scenario.py +3 -3
- helm/benchmark/scenarios/vision_language/mm_safety_bench_scenario.py +2 -2
- helm/benchmark/scenarios/vision_language/mme_scenario.py +21 -18
- helm/benchmark/scenarios/vision_language/mmmu_scenario.py +1 -1
- helm/benchmark/scenarios/vision_language/pairs_scenario.py +1 -1
- helm/benchmark/scenarios/vision_language/pope_scenario.py +2 -1
- helm/benchmark/scenarios/vision_language/real_world_qa_scenario.py +57 -0
- helm/benchmark/scenarios/vision_language/seed_bench_scenario.py +7 -5
- helm/benchmark/scenarios/vision_language/unicorn_scenario.py +2 -2
- helm/benchmark/scenarios/vision_language/vibe_eval_scenario.py +6 -3
- helm/benchmark/scenarios/vision_language/viz_wiz_scenario.py +1 -1
- helm/benchmark/scenarios/vision_language/vqa_scenario.py +3 -1
- helm/benchmark/scenarios/xstest_scenario.py +35 -0
- helm/benchmark/server.py +1 -6
- helm/benchmark/static/schema_air_bench.yaml +750 -750
- helm/benchmark/static/schema_bhasa.yaml +709 -0
- helm/benchmark/static/schema_call_center.yaml +232 -0
- helm/benchmark/static/schema_cleva.yaml +768 -0
- helm/benchmark/static/schema_decodingtrust.yaml +444 -0
- helm/benchmark/static/schema_ewok.yaml +367 -0
- helm/benchmark/static/schema_finance.yaml +55 -9
- helm/benchmark/static/{schema_image2structure.yaml → schema_image2struct.yaml} +231 -90
- helm/benchmark/static/schema_safety.yaml +247 -0
- helm/benchmark/static/schema_tables.yaml +124 -7
- helm/benchmark/static/schema_thai.yaml +21 -0
- helm/benchmark/static/schema_vhelm.yaml +96 -91
- helm/benchmark/static_build/assets/accenture-6f97eeda.png +0 -0
- helm/benchmark/static_build/assets/aisingapore-6dfc9acf.png +0 -0
- helm/benchmark/static_build/assets/cresta-9e22b983.png +0 -0
- helm/benchmark/static_build/assets/cuhk-8c5631e9.png +0 -0
- helm/benchmark/static_build/assets/index-05c76bb1.css +1 -0
- helm/benchmark/static_build/assets/index-58f97dcd.js +10 -0
- helm/benchmark/static_build/assets/scb10x-204bd786.png +0 -0
- helm/benchmark/static_build/assets/wellsfargo-a86a6c4a.png +0 -0
- helm/benchmark/static_build/index.html +2 -2
- helm/benchmark/window_services/test_openai_window_service.py +8 -8
- helm/clients/ai21_client.py +71 -1
- helm/clients/anthropic_client.py +7 -19
- helm/clients/huggingface_client.py +38 -37
- helm/clients/nvidia_nim_client.py +35 -0
- helm/clients/openai_client.py +2 -3
- helm/clients/palmyra_client.py +25 -0
- helm/clients/perspective_api_client.py +11 -6
- helm/clients/test_client.py +4 -6
- helm/clients/vision_language/open_flamingo_client.py +1 -2
- helm/clients/vision_language/palmyra_vision_client.py +28 -13
- helm/common/images_utils.py +6 -0
- helm/common/mongo_key_value_store.py +2 -1
- helm/common/request.py +16 -0
- helm/config/model_deployments.yaml +315 -332
- helm/config/model_metadata.yaml +384 -110
- helm/config/tokenizer_configs.yaml +116 -11
- helm/proxy/example_queries.py +14 -21
- helm/proxy/services/server_service.py +1 -2
- helm/proxy/token_counters/test_auto_token_counter.py +2 -2
- helm/tokenizers/ai21_tokenizer.py +51 -59
- helm/tokenizers/cohere_tokenizer.py +0 -75
- helm/tokenizers/huggingface_tokenizer.py +0 -1
- helm/tokenizers/test_ai21_tokenizer.py +48 -0
- helm/benchmark/static/benchmarking.css +0 -156
- helm/benchmark/static/benchmarking.js +0 -1705
- helm/benchmark/static/config.js +0 -3
- helm/benchmark/static/general.js +0 -122
- helm/benchmark/static/images/crfm-logo.png +0 -0
- helm/benchmark/static/images/helm-logo-simple.png +0 -0
- helm/benchmark/static/images/helm-logo.png +0 -0
- helm/benchmark/static/images/language-model-helm.png +0 -0
- helm/benchmark/static/images/organizations/ai21.png +0 -0
- helm/benchmark/static/images/organizations/anthropic.png +0 -0
- helm/benchmark/static/images/organizations/bigscience.png +0 -0
- helm/benchmark/static/images/organizations/cohere.png +0 -0
- helm/benchmark/static/images/organizations/eleutherai.png +0 -0
- helm/benchmark/static/images/organizations/google.png +0 -0
- helm/benchmark/static/images/organizations/meta.png +0 -0
- helm/benchmark/static/images/organizations/microsoft.png +0 -0
- helm/benchmark/static/images/organizations/nvidia.png +0 -0
- helm/benchmark/static/images/organizations/openai.png +0 -0
- helm/benchmark/static/images/organizations/together.png +0 -0
- helm/benchmark/static/images/organizations/tsinghua-keg.png +0 -0
- helm/benchmark/static/images/organizations/yandex.png +0 -0
- helm/benchmark/static/images/scenarios-by-metrics.png +0 -0
- helm/benchmark/static/images/taxonomy-scenarios.png +0 -0
- helm/benchmark/static/index.html +0 -68
- helm/benchmark/static/info-icon.png +0 -0
- helm/benchmark/static/json-urls.js +0 -69
- helm/benchmark/static/plot-captions.js +0 -27
- helm/benchmark/static/utils.js +0 -285
- helm/benchmark/static_build/assets/index-30dbceba.js +0 -10
- helm/benchmark/static_build/assets/index-66b02d40.css +0 -1
- helm/benchmark/window_services/ai21_window_service.py +0 -247
- helm/benchmark/window_services/cohere_window_service.py +0 -101
- helm/benchmark/window_services/test_ai21_window_service.py +0 -163
- helm/benchmark/window_services/test_cohere_window_service.py +0 -75
- helm/benchmark/window_services/test_cohere_window_service_utils.py +0 -8328
- helm/benchmark/window_services/test_ice_window_service.py +0 -327
- helm/tokenizers/ice_tokenizer.py +0 -30
- helm/tokenizers/test_ice_tokenizer.py +0 -57
- {crfm_helm-0.5.2.dist-info → crfm_helm-0.5.3.dist-info}/LICENSE +0 -0
- {crfm_helm-0.5.2.dist-info → crfm_helm-0.5.3.dist-info}/entry_points.txt +0 -0
- {crfm_helm-0.5.2.dist-info → crfm_helm-0.5.3.dist-info}/top_level.txt +0 -0
- /helm/benchmark/annotation/{image2structure → image2struct}/__init__.py +0 -0
- /helm/benchmark/annotation/{image2structure → image2struct}/image_compiler_annotator.py +0 -0
- /helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/__init__.py +0 -0
- /helm/benchmark/scenarios/vision_language/{image2structure/image2structure_scenario.py → image2struct/image2struct_scenario.py} +0 -0
- /helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/webpage/__init__.py +0 -0
- /helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/webpage/jekyll_server.py +0 -0
|
@@ -0,0 +1,367 @@
|
|
|
1
|
+
---
|
|
2
|
+
############################################################
|
|
3
|
+
metrics:
|
|
4
|
+
# Infrastructure metrics:
|
|
5
|
+
- name: num_perplexity_tokens
|
|
6
|
+
display_name: '# tokens'
|
|
7
|
+
description: Average number of tokens in the predicted output (for language modeling, the input too).
|
|
8
|
+
- name: num_bytes
|
|
9
|
+
display_name: '# bytes'
|
|
10
|
+
description: Average number of bytes in the predicted output (for language modeling, the input too).
|
|
11
|
+
|
|
12
|
+
- name: num_references
|
|
13
|
+
display_name: '# ref'
|
|
14
|
+
description: Number of references.
|
|
15
|
+
- name: num_train_trials
|
|
16
|
+
display_name: '# trials'
|
|
17
|
+
description: Number of trials, where in each trial we choose an independent, random set of training instances.
|
|
18
|
+
- name: estimated_num_tokens_cost
|
|
19
|
+
display_name: 'cost'
|
|
20
|
+
description: An estimate of the number of tokens (including prompt and output completions) needed to perform the request.
|
|
21
|
+
- name: num_prompt_tokens
|
|
22
|
+
display_name: '# prompt tokens'
|
|
23
|
+
description: Number of tokens in the prompt.
|
|
24
|
+
- name: num_prompt_characters
|
|
25
|
+
display_name: '# prompt chars'
|
|
26
|
+
description: Number of characters in the prompt.
|
|
27
|
+
- name: num_completion_tokens
|
|
28
|
+
display_name: '# completion tokens'
|
|
29
|
+
description: Actual number of completion tokens (over all completions).
|
|
30
|
+
- name: num_output_tokens
|
|
31
|
+
display_name: '# output tokens'
|
|
32
|
+
description: Actual number of output tokens.
|
|
33
|
+
- name: max_num_output_tokens
|
|
34
|
+
display_name: 'Max output tokens'
|
|
35
|
+
description: Maximum number of output tokens (overestimate since we might stop earlier due to stop sequences).
|
|
36
|
+
- name: num_requests
|
|
37
|
+
display_name: '# requests'
|
|
38
|
+
description: Number of distinct API requests.
|
|
39
|
+
- name: num_instances
|
|
40
|
+
display_name: '# eval'
|
|
41
|
+
description: Number of evaluation instances.
|
|
42
|
+
- name: num_train_instances
|
|
43
|
+
display_name: '# train'
|
|
44
|
+
description: Number of training instances (e.g., in-context examples).
|
|
45
|
+
- name: prompt_truncated
|
|
46
|
+
display_name: truncated
|
|
47
|
+
description: Fraction of instances where the prompt itself was truncated (implies that there were no in-context examples).
|
|
48
|
+
- name: finish_reason_length
|
|
49
|
+
display_name: finish b/c length
|
|
50
|
+
description: Fraction of instances where the output was terminated because of the max tokens limit.
|
|
51
|
+
- name: finish_reason_stop
|
|
52
|
+
display_name: finish b/c stop
|
|
53
|
+
description: Fraction of instances where the output was terminated because of the stop sequences.
|
|
54
|
+
- name: finish_reason_endoftext
|
|
55
|
+
display_name: finish b/c endoftext
|
|
56
|
+
description: Fraction of instances where the output was terminated because the end of text token was generated.
|
|
57
|
+
- name: finish_reason_unknown
|
|
58
|
+
display_name: finish b/c unknown
|
|
59
|
+
description: Fraction of instances where the output was terminated for unknown reasons.
|
|
60
|
+
- name: num_completions
|
|
61
|
+
display_name: '# completions'
|
|
62
|
+
description: Number of completions.
|
|
63
|
+
- name: predicted_index
|
|
64
|
+
display_name: Predicted index
|
|
65
|
+
description: Integer index of the reference (0, 1, ...) that was predicted by the model (for multiple-choice).
|
|
66
|
+
|
|
67
|
+
# Accuracy metrics:
|
|
68
|
+
- name: exact_match
|
|
69
|
+
display_name: Exact match
|
|
70
|
+
short_display_name: EM
|
|
71
|
+
description: Fraction of instances that the predicted output matches a correct reference exactly.
|
|
72
|
+
lower_is_better: false
|
|
73
|
+
- name: quasi_exact_match
|
|
74
|
+
display_name: Quasi-exact match
|
|
75
|
+
short_display_name: EM
|
|
76
|
+
description: Fraction of instances that the predicted output matches a correct reference up to light processing.
|
|
77
|
+
lower_is_better: false
|
|
78
|
+
- name: prefix_exact_match
|
|
79
|
+
display_name: Prefix exact match
|
|
80
|
+
short_display_name: PEM
|
|
81
|
+
description: Fraction of instances that the predicted output matches the prefix of a correct reference exactly.
|
|
82
|
+
lower_is_better: false
|
|
83
|
+
- name: quasi_prefix_exact_match
|
|
84
|
+
# TODO: should call this prefix_quasi_exact_match
|
|
85
|
+
display_name: Prefix quasi-exact match
|
|
86
|
+
short_display_name: PEM
|
|
87
|
+
description: Fraction of instances that the predicted output matches the prefix of a correct reference up to light processing.
|
|
88
|
+
lower_is_better: false
|
|
89
|
+
|
|
90
|
+
- name: exact_match@5
|
|
91
|
+
display_name: Exact match @5
|
|
92
|
+
short_display_name: EM@5
|
|
93
|
+
description: Fraction of instances where at least one predicted output among the top 5 matches a correct reference exactly.
|
|
94
|
+
lower_is_better: false
|
|
95
|
+
- name: quasi_exact_match@5
|
|
96
|
+
display_name: Quasi-exact match @5
|
|
97
|
+
short_display_name: EM@5
|
|
98
|
+
description: Fraction of instances where at least one predicted output among the top 5 matches a correct reference up to light processing.
|
|
99
|
+
lower_is_better: false
|
|
100
|
+
- name: prefix_exact_match@5
|
|
101
|
+
display_name: Prefix exact match @5
|
|
102
|
+
short_display_name: PEM@5
|
|
103
|
+
description: Fraction of instances that the predicted output among the top 5 matches the prefix of a correct reference exactly.
|
|
104
|
+
lower_is_better: false
|
|
105
|
+
- name: quasi_prefix_exact_match@5
|
|
106
|
+
display_name: Prefix quasi-exact match @5
|
|
107
|
+
short_display_name: PEM@5
|
|
108
|
+
description: Fraction of instances that the predicted output among the top 5 matches the prefix of a correct reference up to light processing.
|
|
109
|
+
lower_is_better: false
|
|
110
|
+
|
|
111
|
+
|
|
112
|
+
############################################################
|
|
113
|
+
perturbations: []
|
|
114
|
+
|
|
115
|
+
############################################################
|
|
116
|
+
metric_groups:
|
|
117
|
+
- name: accuracy
|
|
118
|
+
display_name: Accuracy
|
|
119
|
+
hide_win_rates: true
|
|
120
|
+
metrics:
|
|
121
|
+
- name: ${main_name}
|
|
122
|
+
split: ${main_split}
|
|
123
|
+
|
|
124
|
+
- name: efficiency
|
|
125
|
+
display_name: Efficiency
|
|
126
|
+
metrics:
|
|
127
|
+
- name: inference_runtime
|
|
128
|
+
split: ${main_split}
|
|
129
|
+
|
|
130
|
+
- name: general_information
|
|
131
|
+
display_name: General information
|
|
132
|
+
hide_win_rates: true
|
|
133
|
+
metrics:
|
|
134
|
+
- name: num_instances
|
|
135
|
+
split: ${main_split}
|
|
136
|
+
- name: num_train_instances
|
|
137
|
+
split: ${main_split}
|
|
138
|
+
- name: prompt_truncated
|
|
139
|
+
split: ${main_split}
|
|
140
|
+
- name: num_prompt_tokens
|
|
141
|
+
split: ${main_split}
|
|
142
|
+
- name: num_output_tokens
|
|
143
|
+
split: ${main_split}
|
|
144
|
+
|
|
145
|
+
############################################################
|
|
146
|
+
run_groups:
|
|
147
|
+
- name: world_knowledge_scenarios
|
|
148
|
+
display_name: World Knowledge Scenarios
|
|
149
|
+
description: Scenarios that test world knowledge
|
|
150
|
+
category: All scenarios
|
|
151
|
+
subgroups:
|
|
152
|
+
- ewok
|
|
153
|
+
- ewok_agent_properties
|
|
154
|
+
- ewok_material_dynamics
|
|
155
|
+
- ewok_material_properties
|
|
156
|
+
- ewok_physical_dynamics
|
|
157
|
+
- ewok_physical_interactions
|
|
158
|
+
- ewok_physical_relations
|
|
159
|
+
- ewok_quantitative_properties
|
|
160
|
+
- ewok_social_interactions
|
|
161
|
+
- ewok_social_properties
|
|
162
|
+
- ewok_social_relations
|
|
163
|
+
- ewok_spatial_relations
|
|
164
|
+
|
|
165
|
+
- name: ewok
|
|
166
|
+
display_name: EWoK
|
|
167
|
+
description: Elements of World Knowledge (EWoK) is a benchmark for evaluating world modeling by testing models' ability to use knowledge of a concept to match a target text with a plausible/implausible context.
|
|
168
|
+
metric_groups:
|
|
169
|
+
- accuracy
|
|
170
|
+
- efficiency
|
|
171
|
+
- general_information
|
|
172
|
+
environment:
|
|
173
|
+
main_name: exact_match
|
|
174
|
+
main_split: test
|
|
175
|
+
taxonomy:
|
|
176
|
+
task: multiple choice question answering
|
|
177
|
+
what: world knowledge
|
|
178
|
+
who: n/a
|
|
179
|
+
when: n/a
|
|
180
|
+
language: English
|
|
181
|
+
|
|
182
|
+
- name: ewok_agent_properties
|
|
183
|
+
display_name: Agent Properties
|
|
184
|
+
description: The Agent Properties domain from Elements of World Knowledge (EWoK).
|
|
185
|
+
metric_groups:
|
|
186
|
+
- accuracy
|
|
187
|
+
- efficiency
|
|
188
|
+
- general_information
|
|
189
|
+
environment:
|
|
190
|
+
main_name: exact_match
|
|
191
|
+
main_split: test
|
|
192
|
+
taxonomy:
|
|
193
|
+
task: multiple choice question answering
|
|
194
|
+
what: world knowledge
|
|
195
|
+
who: n/a
|
|
196
|
+
when: n/a
|
|
197
|
+
language: English
|
|
198
|
+
|
|
199
|
+
- name: ewok_material_dynamics
|
|
200
|
+
display_name: Material Dynamics
|
|
201
|
+
description: The Material Dynamics domain from Elements of World Knowledge (EWoK).
|
|
202
|
+
metric_groups:
|
|
203
|
+
- accuracy
|
|
204
|
+
- efficiency
|
|
205
|
+
- general_information
|
|
206
|
+
environment:
|
|
207
|
+
main_name: exact_match
|
|
208
|
+
main_split: test
|
|
209
|
+
taxonomy:
|
|
210
|
+
task: multiple choice question answering
|
|
211
|
+
what: world knowledge
|
|
212
|
+
who: n/a
|
|
213
|
+
when: n/a
|
|
214
|
+
language: English
|
|
215
|
+
|
|
216
|
+
- name: ewok_material_properties
|
|
217
|
+
display_name: Material Properties
|
|
218
|
+
description: The Material Properties domain from Elements of World Knowledge (EWoK).
|
|
219
|
+
metric_groups:
|
|
220
|
+
- accuracy
|
|
221
|
+
- efficiency
|
|
222
|
+
- general_information
|
|
223
|
+
environment:
|
|
224
|
+
main_name: exact_match
|
|
225
|
+
main_split: test
|
|
226
|
+
taxonomy:
|
|
227
|
+
task: multiple choice question answering
|
|
228
|
+
what: world knowledge
|
|
229
|
+
who: n/a
|
|
230
|
+
when: n/a
|
|
231
|
+
language: English
|
|
232
|
+
|
|
233
|
+
- name: ewok_physical_dynamics
|
|
234
|
+
display_name: Physical Dynamics
|
|
235
|
+
description: The Physical Dynamics domain from Elements of World Knowledge (EWoK).
|
|
236
|
+
metric_groups:
|
|
237
|
+
- accuracy
|
|
238
|
+
- efficiency
|
|
239
|
+
- general_information
|
|
240
|
+
environment:
|
|
241
|
+
main_name: exact_match
|
|
242
|
+
main_split: test
|
|
243
|
+
taxonomy:
|
|
244
|
+
task: multiple choice question answering
|
|
245
|
+
what: world knowledge
|
|
246
|
+
who: n/a
|
|
247
|
+
when: n/a
|
|
248
|
+
language: English
|
|
249
|
+
|
|
250
|
+
- name: ewok_physical_interactions
|
|
251
|
+
display_name: Physical Interactions
|
|
252
|
+
description: The Physical Interactions domain from Elements of World Knowledge (EWoK).
|
|
253
|
+
metric_groups:
|
|
254
|
+
- accuracy
|
|
255
|
+
- efficiency
|
|
256
|
+
- general_information
|
|
257
|
+
environment:
|
|
258
|
+
main_name: exact_match
|
|
259
|
+
main_split: test
|
|
260
|
+
taxonomy:
|
|
261
|
+
task: multiple choice question answering
|
|
262
|
+
what: world knowledge
|
|
263
|
+
who: n/a
|
|
264
|
+
when: n/a
|
|
265
|
+
language: English
|
|
266
|
+
|
|
267
|
+
- name: ewok_physical_relations
|
|
268
|
+
display_name: Physical Relations
|
|
269
|
+
description: The Physical Relations domain from Elements of World Knowledge (EWoK).
|
|
270
|
+
metric_groups:
|
|
271
|
+
- accuracy
|
|
272
|
+
- efficiency
|
|
273
|
+
- general_information
|
|
274
|
+
environment:
|
|
275
|
+
main_name: exact_match
|
|
276
|
+
main_split: test
|
|
277
|
+
taxonomy:
|
|
278
|
+
task: multiple choice question answering
|
|
279
|
+
what: world knowledge
|
|
280
|
+
who: n/a
|
|
281
|
+
when: n/a
|
|
282
|
+
language: English
|
|
283
|
+
|
|
284
|
+
- name: ewok_quantitative_properties
|
|
285
|
+
display_name: Quantitative Properties
|
|
286
|
+
description: The Quantitative Properties domain from Elements of World Knowledge (EWoK).
|
|
287
|
+
metric_groups:
|
|
288
|
+
- accuracy
|
|
289
|
+
- efficiency
|
|
290
|
+
- general_information
|
|
291
|
+
environment:
|
|
292
|
+
main_name: exact_match
|
|
293
|
+
main_split: test
|
|
294
|
+
taxonomy:
|
|
295
|
+
task: multiple choice question answering
|
|
296
|
+
what: world knowledge
|
|
297
|
+
who: n/a
|
|
298
|
+
when: n/a
|
|
299
|
+
language: English
|
|
300
|
+
|
|
301
|
+
- name: ewok_social_interactions
|
|
302
|
+
display_name: Social Interactions
|
|
303
|
+
description: The Social Interactions domain from Elements of World Knowledge (EWoK).
|
|
304
|
+
metric_groups:
|
|
305
|
+
- accuracy
|
|
306
|
+
- efficiency
|
|
307
|
+
- general_information
|
|
308
|
+
environment:
|
|
309
|
+
main_name: exact_match
|
|
310
|
+
main_split: test
|
|
311
|
+
taxonomy:
|
|
312
|
+
task: multiple choice question answering
|
|
313
|
+
what: world knowledge
|
|
314
|
+
who: n/a
|
|
315
|
+
when: n/a
|
|
316
|
+
language: English
|
|
317
|
+
|
|
318
|
+
- name: ewok_social_properties
|
|
319
|
+
display_name: Social Properties
|
|
320
|
+
description: The Social Properties domain from Elements of World Knowledge (EWoK).
|
|
321
|
+
metric_groups:
|
|
322
|
+
- accuracy
|
|
323
|
+
- efficiency
|
|
324
|
+
- general_information
|
|
325
|
+
environment:
|
|
326
|
+
main_name: exact_match
|
|
327
|
+
main_split: test
|
|
328
|
+
taxonomy:
|
|
329
|
+
task: multiple choice question answering
|
|
330
|
+
what: world knowledge
|
|
331
|
+
who: n/a
|
|
332
|
+
when: n/a
|
|
333
|
+
language: English
|
|
334
|
+
|
|
335
|
+
- name: ewok_social_relations
|
|
336
|
+
display_name: Social Relations
|
|
337
|
+
description: The Social Relations domain from Elements of World Knowledge (EWoK).
|
|
338
|
+
metric_groups:
|
|
339
|
+
- accuracy
|
|
340
|
+
- efficiency
|
|
341
|
+
- general_information
|
|
342
|
+
environment:
|
|
343
|
+
main_name: exact_match
|
|
344
|
+
main_split: test
|
|
345
|
+
taxonomy:
|
|
346
|
+
task: multiple choice question answering
|
|
347
|
+
what: world knowledge
|
|
348
|
+
who: n/a
|
|
349
|
+
when: n/a
|
|
350
|
+
language: English
|
|
351
|
+
|
|
352
|
+
- name: ewok_spatial_relations
|
|
353
|
+
display_name: Spatial Relations
|
|
354
|
+
description: The Spatial Relations domain from Elements of World Knowledge (EWoK).
|
|
355
|
+
metric_groups:
|
|
356
|
+
- accuracy
|
|
357
|
+
- efficiency
|
|
358
|
+
- general_information
|
|
359
|
+
environment:
|
|
360
|
+
main_name: exact_match
|
|
361
|
+
main_split: test
|
|
362
|
+
taxonomy:
|
|
363
|
+
task: multiple choice question answering
|
|
364
|
+
what: world knowledge
|
|
365
|
+
who: n/a
|
|
366
|
+
when: n/a
|
|
367
|
+
language: English
|
|
@@ -73,6 +73,15 @@ metrics:
|
|
|
73
73
|
display_name: Execution Accuracy
|
|
74
74
|
description: Accuracy of the final result of the generated program
|
|
75
75
|
lower_is_better: false
|
|
76
|
+
- name: annotation_financebench_label_correct_answer
|
|
77
|
+
display_name: Correct Answer
|
|
78
|
+
description: Whether the final result was correct, as judged by GPT-4o
|
|
79
|
+
lower_is_better: false
|
|
80
|
+
- name: quasi_exact_match
|
|
81
|
+
display_name: Quasi-exact match
|
|
82
|
+
short_display_name: EM
|
|
83
|
+
description: Fraction of instances that the predicted output matches a correct reference up to light processing.
|
|
84
|
+
lower_is_better: false
|
|
76
85
|
|
|
77
86
|
############################################################
|
|
78
87
|
perturbations: []
|
|
@@ -114,6 +123,8 @@ run_groups:
|
|
|
114
123
|
category: All scenarios
|
|
115
124
|
subgroups:
|
|
116
125
|
- fin_qa
|
|
126
|
+
- financebench
|
|
127
|
+
- banking77
|
|
117
128
|
|
|
118
129
|
- name: fin_qa
|
|
119
130
|
display_name: FinQA
|
|
@@ -132,12 +143,47 @@ run_groups:
|
|
|
132
143
|
when: 1999 to 2019
|
|
133
144
|
language: English
|
|
134
145
|
|
|
135
|
-
- name:
|
|
136
|
-
display_name:
|
|
137
|
-
description:
|
|
138
|
-
|
|
139
|
-
|
|
140
|
-
-
|
|
141
|
-
|
|
142
|
-
|
|
143
|
-
|
|
146
|
+
- name: financebench
|
|
147
|
+
display_name: FinanceBench
|
|
148
|
+
description: FinanceBench is a benchmark for open book financial question answering. It comprises 10,231 questions about publicly traded companies, with corresponding answers and evidence strings.
|
|
149
|
+
metric_groups:
|
|
150
|
+
- accuracy
|
|
151
|
+
- efficiency
|
|
152
|
+
- general_information
|
|
153
|
+
environment:
|
|
154
|
+
main_name: annotation_financebench_label_correct_answer
|
|
155
|
+
main_split: test
|
|
156
|
+
taxonomy:
|
|
157
|
+
task: question answering with numeric reasoning
|
|
158
|
+
what: financial reports
|
|
159
|
+
who: financial experts
|
|
160
|
+
when: 2015 to 2023
|
|
161
|
+
language: English
|
|
162
|
+
|
|
163
|
+
- name: banking77
|
|
164
|
+
display_name: BANKING77
|
|
165
|
+
short_display_name: BANKING77
|
|
166
|
+
description: BANKING77 is a benchmark for intent classification of customer service queries in the banking domain [(Casanueva et al., 2020)](https://aclanthology.org/2020.nlp4convai-1.5/).
|
|
167
|
+
metric_groups:
|
|
168
|
+
- accuracy
|
|
169
|
+
- efficiency
|
|
170
|
+
- general_information
|
|
171
|
+
environment:
|
|
172
|
+
main_name: quasi_exact_match
|
|
173
|
+
main_split: test
|
|
174
|
+
taxonomy:
|
|
175
|
+
task: text classification
|
|
176
|
+
what: customer service queries in the banking domain
|
|
177
|
+
who: banking customers
|
|
178
|
+
when: During or before 2020
|
|
179
|
+
language: English
|
|
180
|
+
|
|
181
|
+
# - name: financial_scenarios_ablations
|
|
182
|
+
# display_name: Financial Scenarios Ablations
|
|
183
|
+
# description: Scenarios for the financial domain with ablations
|
|
184
|
+
# category: All scenarios
|
|
185
|
+
# subgroups:
|
|
186
|
+
# - fin_qa
|
|
187
|
+
# adapter_keys_shown:
|
|
188
|
+
# - model
|
|
189
|
+
# - max_train_instances
|