crfm-helm 0.5.5__py3-none-any.whl → 0.5.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of crfm-helm might be problematic.
Files changed (268)
  1. {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.7.dist-info}/METADATA +74 -53
  2. {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.7.dist-info}/RECORD +262 -182
  3. {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.7.dist-info}/WHEEL +1 -1
  4. helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +3 -3
  5. helm/benchmark/adaptation/adapters/test_adapter.py +4 -4
  6. helm/benchmark/annotation/air_bench_annotator.py +2 -2
  7. helm/benchmark/annotation/bigcodebench_annotator.py +3 -3
  8. helm/benchmark/annotation/bird_sql_annotator.py +2 -2
  9. helm/benchmark/annotation/chw_care_plan_annotator.py +7 -12
  10. helm/benchmark/annotation/ehr_sql_annotator.py +2 -2
  11. helm/benchmark/annotation/helpdesk_call_summarization_annotator.py +7 -7
  12. helm/benchmark/annotation/live_qa_annotator.py +1 -1
  13. helm/benchmark/annotation/mimic_bhc_annotator.py +100 -0
  14. helm/benchmark/annotation/model_as_judge.py +12 -16
  15. helm/benchmark/annotation/omni_math_annotator.py +13 -14
  16. helm/benchmark/annotation/wildbench_annotator.py +9 -9
  17. helm/benchmark/executor.py +11 -12
  18. helm/benchmark/metrics/aci_bench_metrics.py +9 -29
  19. helm/benchmark/metrics/bias_word_lists.py +1 -1
  20. helm/benchmark/metrics/chw_care_plan_metrics.py +10 -30
  21. helm/benchmark/metrics/classification_metrics.py +3 -3
  22. helm/benchmark/metrics/cleva_harms_metrics.py +2 -2
  23. helm/benchmark/metrics/codeinsights_code_efficiency_metrics.py +186 -0
  24. helm/benchmark/metrics/codeinsights_code_evaluation_metrics.py +477 -0
  25. helm/benchmark/metrics/codeinsights_correct_code_metrics.py +366 -0
  26. helm/benchmark/metrics/codeinsights_edge_case_metrics.py +92 -0
  27. helm/benchmark/metrics/codeinsights_metric_specs.py +51 -0
  28. helm/benchmark/metrics/comet_metric.py +1 -1
  29. helm/benchmark/metrics/conv_fin_qa_calc_metrics.py +2 -2
  30. helm/benchmark/metrics/copyright_metrics.py +1 -1
  31. helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +1 -1
  32. helm/benchmark/metrics/dischargeme_metrics.py +9 -29
  33. helm/benchmark/metrics/efficiency_metrics.py +3 -3
  34. helm/benchmark/metrics/evaluate_reference_metrics.py +1 -1
  35. helm/benchmark/metrics/gpt4_audio_refusal_metrics.py +145 -0
  36. helm/benchmark/metrics/ifeval_metrics.py +2 -2
  37. helm/benchmark/metrics/image_generation/clip_score_metrics.py +13 -2
  38. helm/benchmark/metrics/image_generation/fractal_dimension/fractal_dimension_util.py +1 -1
  39. helm/benchmark/metrics/kpi_edgar_metrics.py +121 -0
  40. helm/benchmark/metrics/llm_jury_metrics.py +46 -0
  41. helm/benchmark/metrics/lmkt_metric_specs.py +12 -0
  42. helm/benchmark/metrics/lmkt_metrics.py +47 -0
  43. helm/benchmark/metrics/med_dialog_metrics.py +9 -29
  44. helm/benchmark/metrics/medalign_metrics.py +9 -29
  45. helm/benchmark/metrics/medi_qa_metrics.py +9 -29
  46. helm/benchmark/metrics/medication_qa_metrics.py +10 -30
  47. helm/benchmark/metrics/melt_bias_metric.py +234 -0
  48. helm/benchmark/metrics/melt_bias_word_lists.py +1367 -0
  49. helm/benchmark/metrics/melt_metric_specs.py +43 -0
  50. helm/benchmark/metrics/melt_toxicity_metric.py +107 -0
  51. helm/benchmark/metrics/mental_health_metrics.py +9 -29
  52. helm/benchmark/metrics/metric_service.py +11 -11
  53. helm/benchmark/metrics/mimic_bhc_metrics.py +14 -0
  54. helm/benchmark/metrics/mimic_rrs_metrics.py +9 -29
  55. helm/benchmark/metrics/mtsamples_procedures_metrics.py +9 -29
  56. helm/benchmark/metrics/mtsamples_replicate_metrics.py +9 -29
  57. helm/benchmark/metrics/openai_mrcr_metrics.py +52 -0
  58. helm/benchmark/metrics/ruler_qa_metrics.py +34 -0
  59. helm/benchmark/metrics/starr_patient_instructions_metrics.py +9 -29
  60. helm/benchmark/metrics/summac/model_summac.py +2 -3
  61. helm/benchmark/metrics/summarization_metrics.py +2 -1
  62. helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py +2 -2
  63. helm/benchmark/metrics/toxicity_metrics.py +2 -2
  64. helm/benchmark/metrics/unitxt_metrics.py +3 -4
  65. helm/benchmark/metrics/vision_language/emd_utils.py +4 -2
  66. helm/benchmark/metrics/vision_language/image_utils.py +2 -2
  67. helm/benchmark/model_deployment_registry.py +16 -26
  68. helm/benchmark/presentation/contamination.py +3 -3
  69. helm/benchmark/presentation/create_plots.py +43 -13
  70. helm/benchmark/presentation/run_display.py +13 -0
  71. helm/benchmark/presentation/schema.py +7 -1
  72. helm/benchmark/presentation/summarize.py +84 -61
  73. helm/benchmark/presentation/test_create_plots.py +4 -1
  74. helm/benchmark/reeval_run.py +3 -4
  75. helm/benchmark/reeval_runner.py +3 -3
  76. helm/benchmark/run.py +84 -73
  77. helm/benchmark/run_expander.py +12 -1
  78. helm/benchmark/run_spec_factory.py +7 -6
  79. helm/benchmark/run_specs/arabic_run_specs.py +73 -0
  80. helm/benchmark/run_specs/audio_run_specs.py +52 -8
  81. helm/benchmark/run_specs/bluex_run_specs.py +40 -0
  82. helm/benchmark/run_specs/classic_run_specs.py +0 -53
  83. helm/benchmark/run_specs/codeinsights_run_specs.py +192 -0
  84. helm/benchmark/run_specs/enterprise_run_specs.py +20 -0
  85. helm/benchmark/run_specs/experimental_run_specs.py +31 -1
  86. helm/benchmark/run_specs/healthqa_br_run_specs.py +40 -0
  87. helm/benchmark/run_specs/heim_run_specs.py +3 -1
  88. helm/benchmark/run_specs/lmkt_run_specs.py +144 -0
  89. helm/benchmark/run_specs/long_context_run_specs.py +114 -15
  90. helm/benchmark/run_specs/medhelm_run_specs.py +146 -41
  91. helm/benchmark/run_specs/melt_run_specs.py +783 -0
  92. helm/benchmark/run_specs/multilingual_run_specs.py +50 -0
  93. helm/benchmark/run_specs/speech_disorder_audio_run_specs.py +163 -0
  94. helm/benchmark/run_specs/vlm_run_specs.py +28 -0
  95. helm/benchmark/runner.py +5 -5
  96. helm/benchmark/scenarios/aci_bench_scenario.py +7 -1
  97. helm/benchmark/scenarios/alghafa_scenario.py +126 -0
  98. helm/benchmark/scenarios/arabic_mmlu_scenario.py +78 -0
  99. helm/benchmark/scenarios/aratrust_scenario.py +76 -0
  100. helm/benchmark/scenarios/audio_language/air_bench_chat_scenario.py +3 -1
  101. helm/benchmark/scenarios/audio_language/air_bench_foundation_scenario.py +5 -5
  102. helm/benchmark/scenarios/audio_language/casual_conversations2_scenario.py +1 -1
  103. helm/benchmark/scenarios/audio_language/corebench_scenario.py +77 -0
  104. helm/benchmark/scenarios/audio_language/mustard_scenario.py +1 -1
  105. helm/benchmark/scenarios/audio_language/ultra_suite_asr_classification_scenario.py +104 -0
  106. helm/benchmark/scenarios/audio_language/ultra_suite_asr_transcription_scenario.py +99 -0
  107. helm/benchmark/scenarios/audio_language/ultra_suite_classification_scenario.py +118 -0
  108. helm/benchmark/scenarios/audio_language/ultra_suite_disorder_breakdown_scenario.py +86 -0
  109. helm/benchmark/scenarios/audio_language/ultra_suite_disorder_symptoms_scenario.py +117 -0
  110. helm/benchmark/scenarios/audio_language/vocal_sound_scenario.py +15 -1
  111. helm/benchmark/scenarios/audio_language/voxceleb2_scenario.py +1 -2
  112. helm/benchmark/scenarios/autobencher_capabilities_scenario.py +2 -2
  113. helm/benchmark/scenarios/bluex_scenario.py +66 -0
  114. helm/benchmark/scenarios/chw_care_plan_scenario.py +14 -13
  115. helm/benchmark/scenarios/clear_scenario.py +11 -7
  116. helm/benchmark/scenarios/cleva_scenario.py +1 -1
  117. helm/benchmark/scenarios/codeinsights_code_efficiency_scenario.py +197 -0
  118. helm/benchmark/scenarios/codeinsights_correct_code_scenario.py +78 -0
  119. helm/benchmark/scenarios/codeinsights_edge_case_scenario.py +192 -0
  120. helm/benchmark/scenarios/codeinsights_student_coding_scenario.py +162 -0
  121. helm/benchmark/scenarios/codeinsights_student_mistake_scenario.py +188 -0
  122. helm/benchmark/scenarios/dischargeme_scenario.py +36 -21
  123. helm/benchmark/scenarios/ehr_sql_scenario.py +7 -1
  124. helm/benchmark/scenarios/ehrshot_scenario.py +28 -55
  125. helm/benchmark/scenarios/exams_multilingual_scenario.py +115 -0
  126. helm/benchmark/scenarios/grammar.py +2 -2
  127. helm/benchmark/scenarios/headqa_scenario.py +6 -1
  128. helm/benchmark/scenarios/healthqa_br_scenario.py +80 -0
  129. helm/benchmark/scenarios/infinite_bench_en_mc_scenario.py +90 -0
  130. helm/benchmark/scenarios/infinite_bench_en_qa_scenario.py +85 -0
  131. helm/benchmark/scenarios/{infinite_bench_sum_scenario.py → infinite_bench_en_sum_scenario.py} +10 -13
  132. helm/benchmark/scenarios/kpi_edgar_scenario.py +151 -0
  133. helm/benchmark/scenarios/lmkt_scenarios.py +288 -0
  134. helm/benchmark/scenarios/math_scenario.py +21 -20
  135. helm/benchmark/scenarios/med_dialog_scenario.py +6 -1
  136. helm/benchmark/scenarios/medalign_scenario.py +9 -3
  137. helm/benchmark/scenarios/medalign_scenario_helper.py +27 -130
  138. helm/benchmark/scenarios/medbullets_scenario.py +7 -2
  139. helm/benchmark/scenarios/medcalc_bench_scenario.py +4 -2
  140. helm/benchmark/scenarios/medec_scenario.py +6 -1
  141. helm/benchmark/scenarios/medhallu_scenario.py +7 -1
  142. helm/benchmark/scenarios/medi_qa_scenario.py +10 -4
  143. helm/benchmark/scenarios/medication_qa_scenario.py +7 -1
  144. helm/benchmark/scenarios/melt_ir_scenario.py +171 -0
  145. helm/benchmark/scenarios/melt_knowledge_scenario.py +246 -0
  146. helm/benchmark/scenarios/melt_lm_scenarios.py +252 -0
  147. helm/benchmark/scenarios/melt_scenarios.py +793 -0
  148. helm/benchmark/scenarios/melt_srn_scenario.py +342 -0
  149. helm/benchmark/scenarios/melt_synthetic_reasoning_scenario.py +222 -0
  150. helm/benchmark/scenarios/melt_translation_scenario.py +152 -0
  151. helm/benchmark/scenarios/mental_health_scenario.py +16 -5
  152. helm/benchmark/scenarios/mimic_bhc_scenario.py +13 -8
  153. helm/benchmark/scenarios/mimic_rrs_scenario.py +17 -8
  154. helm/benchmark/scenarios/mimiciv_billing_code_scenario.py +14 -8
  155. helm/benchmark/scenarios/mmlu_pro_scenario.py +1 -1
  156. helm/benchmark/scenarios/mmmlu_scenario.py +85 -0
  157. helm/benchmark/scenarios/mtsamples_procedures_scenario.py +5 -2
  158. helm/benchmark/scenarios/mtsamples_replicate_scenario.py +3 -2
  159. helm/benchmark/scenarios/n2c2_ct_matching_scenario.py +11 -5
  160. helm/benchmark/scenarios/openai_mrcr_scenario.py +79 -0
  161. helm/benchmark/scenarios/pubmed_qa_scenario.py +6 -1
  162. helm/benchmark/scenarios/race_based_med_scenario.py +18 -8
  163. helm/benchmark/scenarios/ruler_qa_scenario_helper.py +2 -2
  164. helm/benchmark/scenarios/ruler_qa_scenarios.py +2 -2
  165. helm/benchmark/scenarios/seahelm_scenario.py +2 -2
  166. helm/benchmark/scenarios/shc_bmt_scenario.py +12 -6
  167. helm/benchmark/scenarios/shc_cdi_scenario.py +11 -6
  168. helm/benchmark/scenarios/shc_conf_scenario.py +12 -6
  169. helm/benchmark/scenarios/shc_ent_scenario.py +11 -6
  170. helm/benchmark/scenarios/shc_gip_scenario.py +13 -5
  171. helm/benchmark/scenarios/shc_privacy_scenario.py +78 -0
  172. helm/benchmark/scenarios/shc_proxy_scenario.py +76 -0
  173. helm/benchmark/scenarios/shc_ptbm_scenario.py +12 -7
  174. helm/benchmark/scenarios/shc_sei_scenario.py +12 -7
  175. helm/benchmark/scenarios/shc_sequoia_scenario.py +13 -5
  176. helm/benchmark/scenarios/starr_patient_instructions_scenario.py +15 -8
  177. helm/benchmark/scenarios/test_alghafa_scenario.py +29 -0
  178. helm/benchmark/scenarios/test_aratrust_scenario.py +21 -0
  179. helm/benchmark/scenarios/test_bluex_scenario.py +59 -0
  180. helm/benchmark/scenarios/test_exams_multilingual_scenario.py +29 -0
  181. helm/benchmark/scenarios/test_healtha_br_scenario.py +57 -0
  182. helm/benchmark/scenarios/test_infinite_bench_en_qa_scenario.py +18 -0
  183. helm/benchmark/scenarios/test_infinite_bench_en_sum_scenario.py +31 -0
  184. helm/benchmark/scenarios/truthful_qa_scenario.py +2 -1
  185. helm/benchmark/scenarios/vision_language/msr_vtt_scenario.py +75 -0
  186. helm/benchmark/server.py +2 -1
  187. helm/benchmark/slurm_jobs.py +1 -2
  188. helm/benchmark/slurm_runner.py +8 -1
  189. helm/benchmark/static/schema_arabic.yaml +228 -0
  190. helm/benchmark/static/schema_audio.yaml +60 -49
  191. helm/benchmark/static/schema_classic.yaml +0 -17
  192. helm/benchmark/static/schema_enterprise.yaml +21 -0
  193. helm/benchmark/static/schema_long_context.yaml +81 -20
  194. helm/benchmark/static/schema_medhelm.yaml +272 -213
  195. helm/benchmark/static/schema_melt.yaml +1257 -0
  196. helm/benchmark/static/schema_slphelm.yaml +162 -0
  197. helm/benchmark/static/schema_vhelm.yaml +26 -26
  198. helm/benchmark/static/schema_video.yaml +219 -0
  199. helm/benchmark/static_build/assets/index-b9779128.css +1 -0
  200. helm/benchmark/static_build/assets/index-e439d5e1.js +10 -0
  201. helm/benchmark/static_build/assets/medhelm-overview-eac29843.png +0 -0
  202. helm/benchmark/static_build/assets/{tremor-9cefc3c5.js → tremor-38a10867.js} +1 -1
  203. helm/benchmark/static_build/index.html +4 -4
  204. helm/benchmark/window_services/encoder_decoder_window_service.py +3 -3
  205. helm/benchmark/window_services/image_generation/clip_window_service.py +1 -3
  206. helm/benchmark/window_services/test_utils.py +3 -4
  207. helm/benchmark/window_services/tokenizer_service.py +7 -8
  208. helm/clients/anthropic_client.py +69 -29
  209. helm/clients/audio_language/diva_llama_client.py +4 -2
  210. helm/clients/audio_language/qwen2_5_omni_client.py +209 -0
  211. helm/clients/audio_language/qwen2_audiolm_client.py +8 -6
  212. helm/clients/audio_language/qwen_audiolm_client.py +4 -2
  213. helm/clients/audio_language/test.py +62 -0
  214. helm/clients/bedrock_client.py +3 -1
  215. helm/clients/client.py +7 -7
  216. helm/clients/grok_client.py +36 -0
  217. helm/clients/huggingface_client.py +42 -3
  218. helm/clients/huggingface_pipeline_client.py +138 -0
  219. helm/clients/image_generation/dalle_mini/model/configuration.py +1 -1
  220. helm/clients/image_generation/dalle_mini/model/modeling.py +1 -1
  221. helm/clients/image_generation/dalle_mini/model/processor.py +1 -1
  222. helm/clients/image_generation/dalle_mini/model/tokenizer.py +1 -1
  223. helm/clients/openai_client.py +102 -55
  224. helm/clients/openai_responses_client.py +176 -0
  225. helm/clients/palmyra_client.py +2 -5
  226. helm/clients/reka_client.py +2 -2
  227. helm/clients/test_huggingface_client.py +3 -3
  228. helm/clients/together_client.py +31 -6
  229. helm/clients/vertexai_client.py +17 -9
  230. helm/clients/vision_language/huggingface_vision2seq_client.py +6 -4
  231. helm/clients/vision_language/huggingface_vlm_client.py +2 -2
  232. helm/clients/vision_language/idefics_client.py +6 -2
  233. helm/clients/vision_language/paligemma_client.py +2 -2
  234. helm/clients/vision_language/qwen2_vlm_client.py +66 -53
  235. helm/clients/vision_language/qwen_vlm_client.py +7 -5
  236. helm/clients/vllm_client.py +43 -7
  237. helm/clients/vllm_granite_thinking_client.py +56 -0
  238. helm/clients/writer_client.py +102 -0
  239. helm/common/context.py +80 -0
  240. helm/common/credentials_utils.py +5 -5
  241. helm/common/critique_request.py +0 -1
  242. helm/common/general.py +9 -2
  243. helm/common/hierarchical_logger.py +104 -12
  244. helm/common/local_context.py +140 -0
  245. helm/common/object_spec.py +23 -8
  246. helm/common/remote_context.py +61 -0
  247. helm/common/request.py +8 -0
  248. helm/common/test_logging.py +94 -0
  249. helm/config/model_deployments.yaml +995 -45
  250. helm/config/model_metadata.yaml +780 -59
  251. helm/config/tokenizer_configs.yaml +224 -3
  252. helm/proxy/cli.py +4 -2
  253. helm/proxy/critique/mechanical_turk_utils.py +1 -1
  254. helm/proxy/retry.py +5 -0
  255. helm/proxy/services/server_service.py +21 -85
  256. helm/tokenizers/grok_tokenizer.py +55 -0
  257. helm/tokenizers/huggingface_tokenizer.py +1 -1
  258. helm/tokenizers/test_grok_tokenizer.py +33 -0
  259. helm/benchmark/metrics/numeracy_metrics.py +0 -72
  260. helm/benchmark/metrics/test_numeracy_metrics.py +0 -95
  261. helm/benchmark/scenarios/numeracy_scenario.py +0 -793
  262. helm/benchmark/scenarios/test_infinite_bench_sum_scenario.py +0 -46
  263. helm/benchmark/static_build/assets/index-262903c1.js +0 -10
  264. helm/benchmark/static_build/assets/index-42060d71.css +0 -1
  265. {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.7.dist-info}/entry_points.txt +0 -0
  266. {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.7.dist-info}/licenses/LICENSE +0 -0
  267. {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.7.dist-info}/top_level.txt +0 -0
  268. /helm/benchmark/static_build/assets/{medhelm-overview-3ddfcd65.png → medhelm-v1-overview-3ddfcd65.png} +0 -0
helm/benchmark/scenarios/starr_patient_instructions_scenario.py CHANGED
@@ -1,8 +1,7 @@
-import os
 import csv
 from typing import List
 
-from helm.common.general import ensure_directory_exists
+from helm.common.general import check_file_exists
 from helm.benchmark.scenarios.scenario import (
     Input,
     Scenario,
@@ -40,19 +39,27 @@ class StarrPatientInstructionsScenario(Scenario):
     """
 
     name = "starr_patient_instructions"
-    description = "A dataset containing case details used to generate customized post-procedure patient instructions."
+    description = (
+        "PatientInstruct is a benchmark designed to evaluate models on generating personalized"
+        "post-procedure instructions for patients. It includes real-world patient History & Physical"
+        "Note (H&P) and operative report, from which models must produce clear, actionable instructions"
+        "appropriate for patients recovering from medical interventions."
+    )
     tags = ["patient_communication", "healthcare", "instruction_generation", "surgery"]
 
-    def get_instances(self, output_path: str) -> List[Instance]:
-        csv_path = "/share/pi/nigam/suhana/medhelm/data/starr-omop-personalized-care-instr/dataset_cases_qc.csv"
-        # Ensure the directory for the CSV file exists.
-        ensure_directory_exists(os.path.dirname(csv_path))
+    def __init__(self, data_path: str):
+        super().__init__()
+        self.data_path = data_path
 
+    def get_instances(self, output_path: str) -> List[Instance]:
+        check_file_exists(
+            self.data_path, msg=f"[StarrPatientInstructiosScenario] Required data file not found: '{self.data_path}'"
+        )
         instances: List[Instance] = []
         # For now, we assign all instances to the test split (zero-shot setting).
         split = TEST_SPLIT
 
-        with open(csv_path, "r", encoding="utf-8") as csvfile:
+        with open(self.data_path, "r", encoding="utf-8") as csvfile:
            reader = csv.DictReader(csvfile)
            for row in reader:
                # Retrieve and strip the relevant fields.
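With this change the CSV location is injected through the constructor instead of being hardcoded to a Stanford cluster path. A minimal usage sketch (the file path below is a placeholder, not part of the package):

    from helm.benchmark.scenarios.starr_patient_instructions_scenario import StarrPatientInstructionsScenario

    # Hypothetical local copy of the dataset; check_file_exists fails fast if it is missing.
    scenario = StarrPatientInstructionsScenario(data_path="/data/starr/dataset_cases_qc.csv")
    instances = scenario.get_instances(output_path="./starr_output")
    print(f"Loaded {len(instances)} instances")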
helm/benchmark/scenarios/test_alghafa_scenario.py ADDED
@@ -0,0 +1,29 @@
+import pytest
+from tempfile import TemporaryDirectory
+
+from helm.benchmark.scenarios.alghafa_scenario import AlGhafaScenario
+from helm.benchmark.scenarios.scenario import CORRECT_TAG, Input
+
+
+@pytest.mark.scenarios
+def test_alghafa_scenario_get_instances():
+    scenario = AlGhafaScenario(subset="mcq_exams_test_ar")
+    with TemporaryDirectory() as tmpdir:
+        actual_instances = scenario.get_instances(tmpdir)
+        assert len(actual_instances) == 562
+        assert actual_instances[0].id == "id0_test"
+        assert actual_instances[0].input == Input(
+            text=(
+                'قال علي بن أبي طالب رضي الله عنه عن عمر بن الخطاب رضي الله عنه " إن كنا لنرى إن في القرآن كلاماً من كلامه ورأياً من رأيه " دلت هذه العبارة على سمة وصفة من صفات عمر بن الخطاب رضي الله عنه هي'  # noqa: E501
+            )
+        )
+        assert len(actual_instances[0].references) == 4
+        assert actual_instances[0].references[0].output.text == "الشجاعة"
+        assert actual_instances[0].references[0].tags == []
+        assert actual_instances[0].references[1].output.text == "نزل القرآن الكريم موافقاً لرأيه في عدة مواضع"
+        assert actual_instances[0].references[1].tags == [CORRECT_TAG]
+        assert actual_instances[0].references[2].output.text == "الشدة في الحق مع اللين والرحمة ."
+        assert actual_instances[0].references[2].tags == []
+        assert actual_instances[0].references[3].output.text == "التواضع"
+        assert actual_instances[0].references[3].tags == []
+        assert actual_instances[0].split == "test"
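The new scenario tests are all gated behind the scenarios pytest marker shown above. A sketch of invoking one of them programmatically, equivalent to running pytest with -m scenarios (assumes the underlying dataset download succeeds in your environment):

    import pytest

    # Select only tests tagged with @pytest.mark.scenarios in the new test module.
    pytest.main(["-m", "scenarios", "helm/benchmark/scenarios/test_alghafa_scenario.py"])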
helm/benchmark/scenarios/test_aratrust_scenario.py ADDED
@@ -0,0 +1,21 @@
+import pytest
+from tempfile import TemporaryDirectory
+
+from helm.benchmark.scenarios.aratrust_scenario import AraTrustScenario
+from helm.benchmark.scenarios.scenario import CORRECT_TAG, Input
+
+
+@pytest.mark.scenarios
+def test_aratrust_get_instances():
+    scenario = AraTrustScenario()
+    with TemporaryDirectory() as tmpdir:
+        actual_instances = scenario.get_instances(tmpdir)
+        assert len(actual_instances) == 522
+        assert actual_instances[0].id == "id0"
+        assert actual_instances[0].input == Input(
+            text=("حيوان لا يبيض ولا يلد ما هو؟\n\nأ) أنثى الحيوان\n ب) ذكر الحيوان\n ج) كل ما سبق\n")
+        )
+        assert len(actual_instances[0].references) == 1
+        assert actual_instances[0].references[0].output.text == "ب"
+        assert actual_instances[0].references[0].tags == [CORRECT_TAG]
+        assert actual_instances[0].split == "test"
helm/benchmark/scenarios/test_bluex_scenario.py ADDED
@@ -0,0 +1,59 @@
+import pytest
+from tempfile import TemporaryDirectory
+
+from helm.benchmark.scenarios.bluex_scenario import BLUEX_Scenario
+from helm.benchmark.scenarios.scenario import TEST_SPLIT, CORRECT_TAG, Output, Reference
+
+
+@pytest.mark.scenarios
+def test_bluex_scenario():
+    scenario = BLUEX_Scenario()
+    with TemporaryDirectory() as tmpdir:
+        instances = scenario.get_instances(tmpdir)
+
+        assert len(instances) > 100
+
+        assert instances[100].split == TEST_SPLIT
+
+        assert instances[0].input.text.startswith("Rubião fitava a enseada, - eram oito horas da manhã Quem o visse")
+
+        assert len(instances[0].input.text) == 1011
+
+        assert instances[0].references == [
+            Reference(
+                output=Output(
+                    text='a contemplação das paisagens naturais, como se lê em "ele admirava aquele pedaço de água quieta".'
+                ),
+                tags=[],
+            ),
+            Reference(
+                output=Output(
+                    text='a presença de um narrador-personagem, como se lê em "em verdade vos digo que pensava em '
+                    'outra coisa".'
+                ),
+                tags=[],
+            ),
+            Reference(
+                output=Output(
+                    text='a sobriedade do protagonista ao avaliar o seu percurso, como se lê em "Cotejava o passado com '
+                    "o presente."
+                ),
+                tags=[],
+            ),
+            Reference(
+                output=Output(
+                    text='o sentido místico e fatalista que rege os destinos, como se lê em "Deus escreve direito por '
+                    'linhas tortas".'
+                ),
+                tags=[],
+            ),
+            Reference(
+                output=Output(
+                    text='a reversibilidade entre o cômico e o trágico, como se lê em "de modo que o que parecia uma '
+                    'desgraça...".'
+                ),
+                tags=[CORRECT_TAG],
+            ),
+        ]
+
+        assert instances[0].references[4].is_correct
helm/benchmark/scenarios/test_exams_multilingual_scenario.py ADDED
@@ -0,0 +1,29 @@
+import pytest
+from tempfile import TemporaryDirectory
+
+from helm.benchmark.scenarios.exams_multilingual_scenario import EXAMSMultilingualScenario
+from helm.benchmark.scenarios.scenario import CORRECT_TAG, TRAIN_SPLIT, Input
+
+
+@pytest.mark.scenarios
+def test_exam_multilingual_scenario_get_instances():
+    scenario = EXAMSMultilingualScenario(language="Bulgarian", subject="Physics")
+    with TemporaryDirectory() as tmpdir:
+        actual_instances = scenario.get_instances(tmpdir)
+        assert len(actual_instances) == 393
+        assert actual_instances[0].id == "4c05bbb8-7729-11ea-9116-54bef70b159e"
+        assert actual_instances[0].input == Input(text="Наелектризирането по индукция се обяснява с: ")
+        assert len(actual_instances[0].references) == 4
+        assert actual_instances[0].references[0].output.text == "преразпределение на положителните йони в тялото"
+        assert actual_instances[0].references[0].tags == []
+        assert (
+            actual_instances[0].references[1].output.text == "предаване на електрони от неутрално на наелектризирано тяло"
+        )
+        assert actual_instances[0].references[1].tags == []
+        assert (
+            actual_instances[0].references[2].output.text == "предаване на електрони от наелектризирано на неутрално тяло"
+        )
+        assert actual_instances[0].references[2].tags == []
+        assert actual_instances[0].references[3].output.text == "преразпределение на свободните електрони в тялото"
+        assert actual_instances[0].references[3].tags == [CORRECT_TAG]
+        assert actual_instances[0].split == TRAIN_SPLIT
helm/benchmark/scenarios/test_healtha_br_scenario.py ADDED
@@ -0,0 +1,57 @@
+import pytest
+from tempfile import TemporaryDirectory
+
+from helm.benchmark.scenarios.healthqa_br_scenario import HEALTHQA_BR_Scenario
+from helm.benchmark.scenarios.scenario import TEST_SPLIT, CORRECT_TAG, Output, Reference
+
+
+@pytest.mark.scenarios
+def test_healthqa_br_instance():
+    scenario = HEALTHQA_BR_Scenario()
+    with TemporaryDirectory() as tmpdir:
+        instances = scenario.get_instances(tmpdir)
+
+        instance = instances[35]
+
+        assert instance.split == TEST_SPLIT
+
+        assert instance.input.text.startswith("Homem de 22 anos de idade procura a Unidade Básica")
+
+        assert instance.references == [
+            Reference(
+                output=Output(
+                    text="administração de relaxante muscular, colocando o paciente em posição de Trendelenburg, com "
+                    "tentativa de redução do volume."
+                ),
+                tags=[],
+            ),
+            Reference(
+                output=Output(
+                    text="encaminhamento do paciente ao Serviço de Urgência do Hospital com o pedido de avaliação "
+                    "imediata do cirurgião."
+                ),
+                tags=[CORRECT_TAG],
+            ),
+            Reference(
+                output=Output(
+                    text="tentativa de redução manual do aumento de volume da região inguinescrotal para a cavidade "
+                    "abdominal."
+                ),
+                tags=[],
+            ),
+            Reference(
+                output=Output(
+                    text="transiluminação do escroto para tentar diferenciar hérnia inguinal de hidrocele comunicante."
+                ),
+                tags=[],
+            ),
+            Reference(
+                output=Output(text="prescrição de antiemético e solicitação de ecografia da região inguinescrotal."),
+                tags=[],
+            ),
+        ]
+
+        correct_refs = [ref for ref in instance.references if CORRECT_TAG in ref.tags]
+        assert len(correct_refs) == 1
+
+        assert instance.references[1].is_correct
helm/benchmark/scenarios/test_infinite_bench_en_qa_scenario.py ADDED
@@ -0,0 +1,18 @@
+import pytest
+from tempfile import TemporaryDirectory
+
+from helm.benchmark.scenarios.infinite_bench_en_qa_scenario import InfiniteBenchEnQAScenario
+from helm.benchmark.scenarios.scenario import CORRECT_TAG
+
+
+@pytest.mark.scenarios
+def test_infinite_bench_en_qa_scenario():
+    with TemporaryDirectory() as tmpdir:
+        scenario = InfiniteBenchEnQAScenario(max_num_words=10000000)
+        instances = scenario.get_instances(tmpdir)
+        assert len(instances) == 351
+        assert instances[0].split == "test"
+        assert len(instances[0].input.text) == 381829
+        assert len(instances[0].references) == 1
+        assert len(instances[0].references[0].output.text) == 8
+        assert instances[0].references[0].tags == [CORRECT_TAG]
helm/benchmark/scenarios/test_infinite_bench_en_sum_scenario.py ADDED
@@ -0,0 +1,31 @@
+import pytest
+import re
+from tempfile import TemporaryDirectory
+from helm.benchmark.scenarios.infinite_bench_en_sum_scenario import InfiniteBenchEnSumScenario
+from helm.benchmark.scenarios.scenario import CORRECT_TAG
+
+
+def count_words(text: str) -> int:
+    return len(re.split(r"\s+", text.strip()))
+
+
+@pytest.mark.scenarios
+def test_infinite_bench_en_sum_scenario():
+    with TemporaryDirectory() as tmpdir:
+        scenario = InfiniteBenchEnSumScenario(max_num_words=10000000)
+        instances = scenario.get_instances(tmpdir)
+        assert len(instances) == 103
+        assert instances[0].split == "test"
+        assert len(instances[0].input.text) == 1745528
+        references = instances[0].references
+        assert len(references[0].output.text) == 2865
+        assert references[0].tags == [CORRECT_TAG]
+
+        scenario = InfiniteBenchEnSumScenario(max_num_words=100000)
+        instances = scenario.get_instances(tmpdir)
+        assert len(instances) == 48
+        assert instances[0].split == "test"
+        assert len(instances[0].input.text) == 381778
+        references = instances[0].references
+        assert len(references[0].output.text) == 4217
+        assert references[0].tags == [CORRECT_TAG]
helm/benchmark/scenarios/truthful_qa_scenario.py CHANGED
@@ -77,7 +77,8 @@ class TruthfulQAScenario(Scenario):
         """Downloads the TruthfulQA dataset."""
         # Download the raw data
         data_dir = os.path.join(output_path, "data")
-        url = "https://raw.githubusercontent.com/sylinrl/TruthfulQA/main/TruthfulQA.csv"
+
+        url = "https://raw.githubusercontent.com/sylinrl/TruthfulQA/d71c110897f5d31c5d7f309e7bc316c152f6f031/data/v1/TruthfulQA.csv"  # noqa: E501
         ensure_directory_exists(data_dir)
         ensure_file_downloaded(source_url=url, target_path=os.path.join(data_dir, self.DATASET_FILE_NAME))
 
helm/benchmark/scenarios/vision_language/msr_vtt_scenario.py ADDED
@@ -0,0 +1,75 @@
+from collections import defaultdict
+from typing import List
+import json
+import os
+
+from helm.benchmark.scenarios.scenario import (
+    CORRECT_TAG,
+    TEST_SPLIT,
+    Instance,
+    Input,
+    Output,
+    Reference,
+    Scenario,
+)
+from helm.common.media_object import MediaObject, MultimediaObject
+from helm.common.general import ensure_file_downloaded
+
+
+class MSRVTTScenario(Scenario):
+    """
+    A large-scale video benchmark for video understanding, especially the emerging task of translating video to text.
+    This is achieved by collecting 257 popular queries from a commercial video search engine, with 118 videos for
+    each query. In its current version, MSR-VTT provides 10K web video clips with 41.2 hours and 200K clip-sentence
+    pairs in total, covering the most comprehensive categories and diverse visual content, and representing the
+    largest dataset in terms of sentence and vocabulary. Each clip is annotated with about 20 natural sentences
+    by 1,327 AMT workers.
+
+    Website link: https://cove.thecvf.com/datasets/839
+
+    Citation:
+    MSR-VTT: A Large Video Description Dataset for Bridging Video and Language Jun Xu, Tao Mei, Ting Yao, Yong Rui
+    CVPR 2016
+    """
+
+    DOWNLOAD_URL: str = "https://www.robots.ox.ac.uk/~maxbain/frozen-in-time/data/MSRVTT.zip"
+
+    name = "msr_vtt"
+    description = "Video captioning dataset with 10K web video clips and 200K clip-sentence pairs."
+    tags = ["vision-language", "video", "captioning"]
+
+    def get_instances(self, output_path: str) -> List[Instance]:
+        # Download the dataset
+        target_path: str = os.path.join(output_path, "data")
+        ensure_file_downloaded(
+            source_url=self.DOWNLOAD_URL,
+            target_path=target_path,
+            unpack=True,
+        )
+
+        annotation_path: str = os.path.join(target_path, "annotation", "MSR_VTT.json")
+        with open(annotation_path, "r") as f:
+            annotations = json.load(f)["annotations"]
+
+        video_path_to_annotations: dict[str, set[str]] = defaultdict(set)
+        for annotation in annotations:
+            video_id: str = annotation["image_id"]
+            video_path: str = os.path.join(target_path, "videos", "all", f"{video_id}.mp4")
+            assert os.path.exists(video_path), f"Video does not exist at path: {video_path}"
+            video_path_to_annotations[video_path].add(annotation["caption"])
+
+        instances: List[Instance] = []
+        for video_path, captions in video_path_to_annotations.items():
+            content: List[MediaObject] = [
+                MediaObject(location=video_path, content_type="video/mp4"),
+            ]
+            references: List[Reference] = [Reference(Output(text=caption), tags=[CORRECT_TAG]) for caption in captions]
+            instances.append(
+                Instance(
+                    Input(multimedia_content=MultimediaObject(content)),
+                    references=references,
+                    split=TEST_SPLIT,
+                )
+            )
+
+        return instances
helm/benchmark/server.py CHANGED
@@ -9,7 +9,7 @@ import json
 from os import path
 import urllib
 
-from bottle import Bottle, static_file, HTTPResponse
+from bottle import Bottle, static_file, HTTPResponse, response
 import yaml
 
 from helm.benchmark.presentation.schema import SCHEMA_CLASSIC_YAML_FILENAME
@@ -21,6 +21,7 @@ app = Bottle()
 
 @app.get("/config.js")
 def serve_config():
+    response.content_type = "application/javascript; charset=UTF-8"
     if app.config["helm.release"]:
         return (
             f'window.BENCHMARK_OUTPUT_BASE_URL = "{app.config["helm.outputurl"]}";\n'
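A quick way to confirm the new header, assuming a locally running helm-server (the address and port below are an assumption, not part of the diff):

    from urllib.request import urlopen

    # /config.js should now report an explicit JavaScript content type instead of the Bottle default.
    with urlopen("http://localhost:8000/config.js") as resp:
        print(resp.headers.get("Content-Type"))  # expected: application/javascript; charset=UTF-8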
helm/benchmark/slurm_jobs.py CHANGED
@@ -13,7 +13,6 @@ except ModuleNotFoundError as e:
 
 
 class SlurmJobState:
-    # TODO: Convert to StrEnum after upgrading to Python 3.11
     # Non-exhaustive list of Slurm job states.
     # See: https://slurm.schedmd.com/squeue.html#SECTION_JOB-STATE-CODES
 
@@ -81,7 +80,7 @@ def get_slurm_job_state(job_id: int) -> str:
     except subprocess.CalledProcessError as e:
         # Default CalledProcessError message doesn't have output, so re-raise here to include the output.
         raise Exception(f"{str(e)} output: {e.output}")
-    search_result = re.search("JobState=(\w+)", scontrol_output.decode())
+    search_result = re.search(r"JobState=(\w+)", scontrol_output.decode())
     if not search_result:
         raise Exception(f"Could not extract JobState from scontrol: {scontrol_output.decode()}")
     return search_result.group(1)
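The raw-string prefix matters because "\w" in a plain string literal is an invalid escape sequence that newer Python versions flag with a warning; the regex itself is unchanged. A small self-contained check:

    import re

    # The raw string passes the backslash through to the regex engine unchanged.
    match = re.search(r"JobState=(\w+)", "JobId=42 JobState=RUNNING Reason=None")
    assert match is not None and match.group(1) == "RUNNING"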
helm/benchmark/slurm_runner.py CHANGED
@@ -26,7 +26,7 @@ from helm.benchmark.slurm_jobs import (
     FAILURE_SLURM_JOB_STATES,
 )
 from helm.common.general import ensure_directory_exists
-from helm.common.hierarchical_logger import hlog, htrack_block
+from helm.common.hierarchical_logger import hlog, htrack_block, setup_default_logging
 
 from helm.benchmark.runner_config_registry import RUNNER_CONFIG
 
@@ -343,7 +343,14 @@ def main():
         help="Path to the RunSpec JSON file",
         required=True,
     )
+    parser.add_argument(
+        "--log-config",
+        type=str,
+        default=None,
+        help="PATH to a YAML file to customize logging",
+    )
     args = parser.parse_args()
+    setup_default_logging(args.log_config)
 
     # Deserialize SlurmRunner and RunSpec from the given files, then run the RunSpec with the SlurmRunner.
     with open(args.slurm_runner_spec_path, "r") as f:
helm/benchmark/static/schema_arabic.yaml ADDED
@@ -0,0 +1,228 @@
+---
+# Schema for Arabic scenarios
+############################################################
+metrics:
+  # Infrastructure metrics:
+  - name: num_perplexity_tokens
+    display_name: '# tokens'
+    description: Average number of tokens in the predicted output (for language modeling, the input too).
+  - name: num_bytes
+    display_name: '# bytes'
+    description: Average number of bytes in the predicted output (for language modeling, the input too).
+
+  - name: num_references
+    display_name: '# ref'
+    description: Number of references.
+  - name: num_train_trials
+    display_name: '# trials'
+    description: Number of trials, where in each trial we choose an independent, random set of training instances.
+  - name: estimated_num_tokens_cost
+    display_name: 'cost'
+    description: An estimate of the number of tokens (including prompt and output completions) needed to perform the request.
+  - name: num_prompt_tokens
+    display_name: '# prompt tokens'
+    description: Number of tokens in the prompt.
+  - name: num_prompt_characters
+    display_name: '# prompt chars'
+    description: Number of characters in the prompt.
+  - name: num_completion_tokens
+    display_name: '# completion tokens'
+    description: Actual number of completion tokens (over all completions).
+  - name: num_output_tokens
+    display_name: '# output tokens'
+    description: Actual number of output tokens.
+  - name: max_num_output_tokens
+    display_name: 'Max output tokens'
+    description: Maximum number of output tokens (overestimate since we might stop earlier due to stop sequences).
+  - name: num_requests
+    display_name: '# requests'
+    description: Number of distinct API requests.
+  - name: num_instances
+    display_name: '# eval'
+    description: Number of evaluation instances.
+  - name: num_train_instances
+    display_name: '# train'
+    description: Number of training instances (e.g., in-context examples).
+  - name: prompt_truncated
+    display_name: truncated
+    description: Fraction of instances where the prompt itself was truncated (implies that there were no in-context examples).
+  - name: finish_reason_length
+    display_name: finish b/c length
+    description: Fraction of instances where the the output was terminated because of the max tokens limit.
+  - name: finish_reason_stop
+    display_name: finish b/c stop
+    description: Fraction of instances where the the output was terminated because of the stop sequences.
+  - name: finish_reason_endoftext
+    display_name: finish b/c endoftext
+    description: Fraction of instances where the the output was terminated because the end of text token was generated.
+  - name: finish_reason_unknown
+    display_name: finish b/c unknown
+    description: Fraction of instances where the the output was terminated for unknown reasons.
+  - name: num_completions
+    display_name: '# completions'
+    description: Number of completions.
+  - name: predicted_index
+    display_name: Predicted index
+    description: Integer index of the reference (0, 1, ...) that was predicted by the model (for multiple-choice).
+  - name: inference_runtime
+    display_name: Observed inference runtime (s)
+    short_display_name: Observed inference time (s)
+    lower_is_better: true
+    description: Average observed time to process a request to the model (via an API, and thus depends on particular deployment).
+
+  # Accuracy metrics:
+  - name: exact_match
+    display_name: Exact match
+    short_display_name: EM
+    description: Fraction of instances that the predicted output matches a correct reference exactly.
+    lower_is_better: false
+  - name: quasi_exact_match
+    display_name: Quasi-exact match
+    short_display_name: EM
+    description: Fraction of instances that the predicted output matches a correct reference up to light processing.
+    lower_is_better: false
+  - name: prefix_exact_match
+    display_name: Prefix exact match
+    short_display_name: PEM
+    description: Fraction of instances that the predicted output matches the prefix of a correct reference exactly.
+    lower_is_better: false
+  - name: quasi_prefix_exact_match
+    # TODO: should call this prefix_quasi_exact_match
+    display_name: Prefix quasi-exact match
+    short_display_name: PEM
+    description: Fraction of instances that the predicted output matches the prefix of a correct reference up to light processing.
+    lower_is_better: false
+
+############################################################
+perturbations: []
+
+############################################################
+metric_groups:
+  - name: accuracy
+    display_name: Accuracy
+    aggregation_strategies:
+      - mean
+    metrics:
+      - name: ${main_name}
+        split: ${main_split}
+
+  - name: efficiency
+    display_name: Efficiency
+    aggregation_strategies:
+      - mean
+    metrics:
+      - name: inference_runtime
+        split: ${main_split}
+
+  - name: general_information
+    display_name: General information
+    hide_win_rates: true
+    metrics:
+      - name: num_instances
+        split: ${main_split}
+      - name: num_train_instances
+        split: ${main_split}
+      - name: prompt_truncated
+        split: ${main_split}
+      - name: num_prompt_tokens
+        split: ${main_split}
+      - name: num_output_tokens
+        split: ${main_split}
+
+############################################################
+run_groups:
+  - name: arabic_scenarios
+    display_name: Arabic Scenarios
+    description: Arabic Scenarios
+    category: All scenarios
+    subgroups:
+      - mmmlu
+      - arabic_mmlu
+      - alghafa
+      - exams_multilingual
+      - aratrust
+
+  - name: mmmlu
+    display_name: Multilingual MMLU (Arabic)
+    description: Multilingual MMLU (Arabic)
+    metric_groups:
+      - accuracy
+      - efficiency
+      - general_information
+    environment:
+      main_name: exact_match
+      main_split: test
+    taxonomy:
+      task: multiple-choice question answering
+      what: math, science, history, etc.
+      who: various online sources
+      when: before 2021
+      language: Arabic
+
+  - name: arabic_mmlu
+    display_name: Arabic MMLU
+    description: Arabic MMLU
+    metric_groups:
+      - accuracy
+      - efficiency
+      - general_information
+    environment:
+      main_name: exact_match
+      main_split: test
+    taxonomy:
+      task: "question answering"
+      what: "academic questions across various disciplines"
+      who: "academic exams writers and takers"
+      when: "before 2024"
+      language: Arabic
+
+  - name: alghafa
+    display_name: AlGhafa
+    description: AlGhafa
+    metric_groups:
+      - accuracy
+      - efficiency
+      - general_information
+    environment:
+      main_name: exact_match
+      main_split: test
+    taxonomy:
+      task: "multiple choice question answering"
+      what: Various
+      who: Various
+      when: "before 2023"
+      language: Arabic
+
+  - name: exams_multilingual
+    display_name: EXAMS (Arabic)
+    description: EXAMS (Arabic)
+    metric_groups:
+      - accuracy
+      - efficiency
+      - general_information
+    environment:
+      main_name: exact_match
+      main_split: test
+    taxonomy:
+      task: "multiple choice question answering"
+      what: High school examinations
+      who: High school examinations writers and test-takers
+      when: before 2020
+      language: Arabic
+
+  - name: aratrust
+    display_name: AraTrust
+    description: AraTrust
+    metric_groups:
+      - accuracy
+      - efficiency
+      - general_information
+    environment:
+      main_name: exact_match
+      main_split: test
+    taxonomy:
+      task: "question answering"
+      what: "academic questions across various disciplines"
+      who: "academic exams writers and takers"
+      when: "before 2024"
+      language: Arabic