crfm-helm 0.5.5__py3-none-any.whl → 0.5.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of crfm-helm might be problematic.
- {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.7.dist-info}/METADATA +74 -53
- {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.7.dist-info}/RECORD +262 -182
- {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.7.dist-info}/WHEEL +1 -1
- helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +3 -3
- helm/benchmark/adaptation/adapters/test_adapter.py +4 -4
- helm/benchmark/annotation/air_bench_annotator.py +2 -2
- helm/benchmark/annotation/bigcodebench_annotator.py +3 -3
- helm/benchmark/annotation/bird_sql_annotator.py +2 -2
- helm/benchmark/annotation/chw_care_plan_annotator.py +7 -12
- helm/benchmark/annotation/ehr_sql_annotator.py +2 -2
- helm/benchmark/annotation/helpdesk_call_summarization_annotator.py +7 -7
- helm/benchmark/annotation/live_qa_annotator.py +1 -1
- helm/benchmark/annotation/mimic_bhc_annotator.py +100 -0
- helm/benchmark/annotation/model_as_judge.py +12 -16
- helm/benchmark/annotation/omni_math_annotator.py +13 -14
- helm/benchmark/annotation/wildbench_annotator.py +9 -9
- helm/benchmark/executor.py +11 -12
- helm/benchmark/metrics/aci_bench_metrics.py +9 -29
- helm/benchmark/metrics/bias_word_lists.py +1 -1
- helm/benchmark/metrics/chw_care_plan_metrics.py +10 -30
- helm/benchmark/metrics/classification_metrics.py +3 -3
- helm/benchmark/metrics/cleva_harms_metrics.py +2 -2
- helm/benchmark/metrics/codeinsights_code_efficiency_metrics.py +186 -0
- helm/benchmark/metrics/codeinsights_code_evaluation_metrics.py +477 -0
- helm/benchmark/metrics/codeinsights_correct_code_metrics.py +366 -0
- helm/benchmark/metrics/codeinsights_edge_case_metrics.py +92 -0
- helm/benchmark/metrics/codeinsights_metric_specs.py +51 -0
- helm/benchmark/metrics/comet_metric.py +1 -1
- helm/benchmark/metrics/conv_fin_qa_calc_metrics.py +2 -2
- helm/benchmark/metrics/copyright_metrics.py +1 -1
- helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +1 -1
- helm/benchmark/metrics/dischargeme_metrics.py +9 -29
- helm/benchmark/metrics/efficiency_metrics.py +3 -3
- helm/benchmark/metrics/evaluate_reference_metrics.py +1 -1
- helm/benchmark/metrics/gpt4_audio_refusal_metrics.py +145 -0
- helm/benchmark/metrics/ifeval_metrics.py +2 -2
- helm/benchmark/metrics/image_generation/clip_score_metrics.py +13 -2
- helm/benchmark/metrics/image_generation/fractal_dimension/fractal_dimension_util.py +1 -1
- helm/benchmark/metrics/kpi_edgar_metrics.py +121 -0
- helm/benchmark/metrics/llm_jury_metrics.py +46 -0
- helm/benchmark/metrics/lmkt_metric_specs.py +12 -0
- helm/benchmark/metrics/lmkt_metrics.py +47 -0
- helm/benchmark/metrics/med_dialog_metrics.py +9 -29
- helm/benchmark/metrics/medalign_metrics.py +9 -29
- helm/benchmark/metrics/medi_qa_metrics.py +9 -29
- helm/benchmark/metrics/medication_qa_metrics.py +10 -30
- helm/benchmark/metrics/melt_bias_metric.py +234 -0
- helm/benchmark/metrics/melt_bias_word_lists.py +1367 -0
- helm/benchmark/metrics/melt_metric_specs.py +43 -0
- helm/benchmark/metrics/melt_toxicity_metric.py +107 -0
- helm/benchmark/metrics/mental_health_metrics.py +9 -29
- helm/benchmark/metrics/metric_service.py +11 -11
- helm/benchmark/metrics/mimic_bhc_metrics.py +14 -0
- helm/benchmark/metrics/mimic_rrs_metrics.py +9 -29
- helm/benchmark/metrics/mtsamples_procedures_metrics.py +9 -29
- helm/benchmark/metrics/mtsamples_replicate_metrics.py +9 -29
- helm/benchmark/metrics/openai_mrcr_metrics.py +52 -0
- helm/benchmark/metrics/ruler_qa_metrics.py +34 -0
- helm/benchmark/metrics/starr_patient_instructions_metrics.py +9 -29
- helm/benchmark/metrics/summac/model_summac.py +2 -3
- helm/benchmark/metrics/summarization_metrics.py +2 -1
- helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py +2 -2
- helm/benchmark/metrics/toxicity_metrics.py +2 -2
- helm/benchmark/metrics/unitxt_metrics.py +3 -4
- helm/benchmark/metrics/vision_language/emd_utils.py +4 -2
- helm/benchmark/metrics/vision_language/image_utils.py +2 -2
- helm/benchmark/model_deployment_registry.py +16 -26
- helm/benchmark/presentation/contamination.py +3 -3
- helm/benchmark/presentation/create_plots.py +43 -13
- helm/benchmark/presentation/run_display.py +13 -0
- helm/benchmark/presentation/schema.py +7 -1
- helm/benchmark/presentation/summarize.py +84 -61
- helm/benchmark/presentation/test_create_plots.py +4 -1
- helm/benchmark/reeval_run.py +3 -4
- helm/benchmark/reeval_runner.py +3 -3
- helm/benchmark/run.py +84 -73
- helm/benchmark/run_expander.py +12 -1
- helm/benchmark/run_spec_factory.py +7 -6
- helm/benchmark/run_specs/arabic_run_specs.py +73 -0
- helm/benchmark/run_specs/audio_run_specs.py +52 -8
- helm/benchmark/run_specs/bluex_run_specs.py +40 -0
- helm/benchmark/run_specs/classic_run_specs.py +0 -53
- helm/benchmark/run_specs/codeinsights_run_specs.py +192 -0
- helm/benchmark/run_specs/enterprise_run_specs.py +20 -0
- helm/benchmark/run_specs/experimental_run_specs.py +31 -1
- helm/benchmark/run_specs/healthqa_br_run_specs.py +40 -0
- helm/benchmark/run_specs/heim_run_specs.py +3 -1
- helm/benchmark/run_specs/lmkt_run_specs.py +144 -0
- helm/benchmark/run_specs/long_context_run_specs.py +114 -15
- helm/benchmark/run_specs/medhelm_run_specs.py +146 -41
- helm/benchmark/run_specs/melt_run_specs.py +783 -0
- helm/benchmark/run_specs/multilingual_run_specs.py +50 -0
- helm/benchmark/run_specs/speech_disorder_audio_run_specs.py +163 -0
- helm/benchmark/run_specs/vlm_run_specs.py +28 -0
- helm/benchmark/runner.py +5 -5
- helm/benchmark/scenarios/aci_bench_scenario.py +7 -1
- helm/benchmark/scenarios/alghafa_scenario.py +126 -0
- helm/benchmark/scenarios/arabic_mmlu_scenario.py +78 -0
- helm/benchmark/scenarios/aratrust_scenario.py +76 -0
- helm/benchmark/scenarios/audio_language/air_bench_chat_scenario.py +3 -1
- helm/benchmark/scenarios/audio_language/air_bench_foundation_scenario.py +5 -5
- helm/benchmark/scenarios/audio_language/casual_conversations2_scenario.py +1 -1
- helm/benchmark/scenarios/audio_language/corebench_scenario.py +77 -0
- helm/benchmark/scenarios/audio_language/mustard_scenario.py +1 -1
- helm/benchmark/scenarios/audio_language/ultra_suite_asr_classification_scenario.py +104 -0
- helm/benchmark/scenarios/audio_language/ultra_suite_asr_transcription_scenario.py +99 -0
- helm/benchmark/scenarios/audio_language/ultra_suite_classification_scenario.py +118 -0
- helm/benchmark/scenarios/audio_language/ultra_suite_disorder_breakdown_scenario.py +86 -0
- helm/benchmark/scenarios/audio_language/ultra_suite_disorder_symptoms_scenario.py +117 -0
- helm/benchmark/scenarios/audio_language/vocal_sound_scenario.py +15 -1
- helm/benchmark/scenarios/audio_language/voxceleb2_scenario.py +1 -2
- helm/benchmark/scenarios/autobencher_capabilities_scenario.py +2 -2
- helm/benchmark/scenarios/bluex_scenario.py +66 -0
- helm/benchmark/scenarios/chw_care_plan_scenario.py +14 -13
- helm/benchmark/scenarios/clear_scenario.py +11 -7
- helm/benchmark/scenarios/cleva_scenario.py +1 -1
- helm/benchmark/scenarios/codeinsights_code_efficiency_scenario.py +197 -0
- helm/benchmark/scenarios/codeinsights_correct_code_scenario.py +78 -0
- helm/benchmark/scenarios/codeinsights_edge_case_scenario.py +192 -0
- helm/benchmark/scenarios/codeinsights_student_coding_scenario.py +162 -0
- helm/benchmark/scenarios/codeinsights_student_mistake_scenario.py +188 -0
- helm/benchmark/scenarios/dischargeme_scenario.py +36 -21
- helm/benchmark/scenarios/ehr_sql_scenario.py +7 -1
- helm/benchmark/scenarios/ehrshot_scenario.py +28 -55
- helm/benchmark/scenarios/exams_multilingual_scenario.py +115 -0
- helm/benchmark/scenarios/grammar.py +2 -2
- helm/benchmark/scenarios/headqa_scenario.py +6 -1
- helm/benchmark/scenarios/healthqa_br_scenario.py +80 -0
- helm/benchmark/scenarios/infinite_bench_en_mc_scenario.py +90 -0
- helm/benchmark/scenarios/infinite_bench_en_qa_scenario.py +85 -0
- helm/benchmark/scenarios/{infinite_bench_sum_scenario.py → infinite_bench_en_sum_scenario.py} +10 -13
- helm/benchmark/scenarios/kpi_edgar_scenario.py +151 -0
- helm/benchmark/scenarios/lmkt_scenarios.py +288 -0
- helm/benchmark/scenarios/math_scenario.py +21 -20
- helm/benchmark/scenarios/med_dialog_scenario.py +6 -1
- helm/benchmark/scenarios/medalign_scenario.py +9 -3
- helm/benchmark/scenarios/medalign_scenario_helper.py +27 -130
- helm/benchmark/scenarios/medbullets_scenario.py +7 -2
- helm/benchmark/scenarios/medcalc_bench_scenario.py +4 -2
- helm/benchmark/scenarios/medec_scenario.py +6 -1
- helm/benchmark/scenarios/medhallu_scenario.py +7 -1
- helm/benchmark/scenarios/medi_qa_scenario.py +10 -4
- helm/benchmark/scenarios/medication_qa_scenario.py +7 -1
- helm/benchmark/scenarios/melt_ir_scenario.py +171 -0
- helm/benchmark/scenarios/melt_knowledge_scenario.py +246 -0
- helm/benchmark/scenarios/melt_lm_scenarios.py +252 -0
- helm/benchmark/scenarios/melt_scenarios.py +793 -0
- helm/benchmark/scenarios/melt_srn_scenario.py +342 -0
- helm/benchmark/scenarios/melt_synthetic_reasoning_scenario.py +222 -0
- helm/benchmark/scenarios/melt_translation_scenario.py +152 -0
- helm/benchmark/scenarios/mental_health_scenario.py +16 -5
- helm/benchmark/scenarios/mimic_bhc_scenario.py +13 -8
- helm/benchmark/scenarios/mimic_rrs_scenario.py +17 -8
- helm/benchmark/scenarios/mimiciv_billing_code_scenario.py +14 -8
- helm/benchmark/scenarios/mmlu_pro_scenario.py +1 -1
- helm/benchmark/scenarios/mmmlu_scenario.py +85 -0
- helm/benchmark/scenarios/mtsamples_procedures_scenario.py +5 -2
- helm/benchmark/scenarios/mtsamples_replicate_scenario.py +3 -2
- helm/benchmark/scenarios/n2c2_ct_matching_scenario.py +11 -5
- helm/benchmark/scenarios/openai_mrcr_scenario.py +79 -0
- helm/benchmark/scenarios/pubmed_qa_scenario.py +6 -1
- helm/benchmark/scenarios/race_based_med_scenario.py +18 -8
- helm/benchmark/scenarios/ruler_qa_scenario_helper.py +2 -2
- helm/benchmark/scenarios/ruler_qa_scenarios.py +2 -2
- helm/benchmark/scenarios/seahelm_scenario.py +2 -2
- helm/benchmark/scenarios/shc_bmt_scenario.py +12 -6
- helm/benchmark/scenarios/shc_cdi_scenario.py +11 -6
- helm/benchmark/scenarios/shc_conf_scenario.py +12 -6
- helm/benchmark/scenarios/shc_ent_scenario.py +11 -6
- helm/benchmark/scenarios/shc_gip_scenario.py +13 -5
- helm/benchmark/scenarios/shc_privacy_scenario.py +78 -0
- helm/benchmark/scenarios/shc_proxy_scenario.py +76 -0
- helm/benchmark/scenarios/shc_ptbm_scenario.py +12 -7
- helm/benchmark/scenarios/shc_sei_scenario.py +12 -7
- helm/benchmark/scenarios/shc_sequoia_scenario.py +13 -5
- helm/benchmark/scenarios/starr_patient_instructions_scenario.py +15 -8
- helm/benchmark/scenarios/test_alghafa_scenario.py +29 -0
- helm/benchmark/scenarios/test_aratrust_scenario.py +21 -0
- helm/benchmark/scenarios/test_bluex_scenario.py +59 -0
- helm/benchmark/scenarios/test_exams_multilingual_scenario.py +29 -0
- helm/benchmark/scenarios/test_healtha_br_scenario.py +57 -0
- helm/benchmark/scenarios/test_infinite_bench_en_qa_scenario.py +18 -0
- helm/benchmark/scenarios/test_infinite_bench_en_sum_scenario.py +31 -0
- helm/benchmark/scenarios/truthful_qa_scenario.py +2 -1
- helm/benchmark/scenarios/vision_language/msr_vtt_scenario.py +75 -0
- helm/benchmark/server.py +2 -1
- helm/benchmark/slurm_jobs.py +1 -2
- helm/benchmark/slurm_runner.py +8 -1
- helm/benchmark/static/schema_arabic.yaml +228 -0
- helm/benchmark/static/schema_audio.yaml +60 -49
- helm/benchmark/static/schema_classic.yaml +0 -17
- helm/benchmark/static/schema_enterprise.yaml +21 -0
- helm/benchmark/static/schema_long_context.yaml +81 -20
- helm/benchmark/static/schema_medhelm.yaml +272 -213
- helm/benchmark/static/schema_melt.yaml +1257 -0
- helm/benchmark/static/schema_slphelm.yaml +162 -0
- helm/benchmark/static/schema_vhelm.yaml +26 -26
- helm/benchmark/static/schema_video.yaml +219 -0
- helm/benchmark/static_build/assets/index-b9779128.css +1 -0
- helm/benchmark/static_build/assets/index-e439d5e1.js +10 -0
- helm/benchmark/static_build/assets/medhelm-overview-eac29843.png +0 -0
- helm/benchmark/static_build/assets/{tremor-9cefc3c5.js → tremor-38a10867.js} +1 -1
- helm/benchmark/static_build/index.html +4 -4
- helm/benchmark/window_services/encoder_decoder_window_service.py +3 -3
- helm/benchmark/window_services/image_generation/clip_window_service.py +1 -3
- helm/benchmark/window_services/test_utils.py +3 -4
- helm/benchmark/window_services/tokenizer_service.py +7 -8
- helm/clients/anthropic_client.py +69 -29
- helm/clients/audio_language/diva_llama_client.py +4 -2
- helm/clients/audio_language/qwen2_5_omni_client.py +209 -0
- helm/clients/audio_language/qwen2_audiolm_client.py +8 -6
- helm/clients/audio_language/qwen_audiolm_client.py +4 -2
- helm/clients/audio_language/test.py +62 -0
- helm/clients/bedrock_client.py +3 -1
- helm/clients/client.py +7 -7
- helm/clients/grok_client.py +36 -0
- helm/clients/huggingface_client.py +42 -3
- helm/clients/huggingface_pipeline_client.py +138 -0
- helm/clients/image_generation/dalle_mini/model/configuration.py +1 -1
- helm/clients/image_generation/dalle_mini/model/modeling.py +1 -1
- helm/clients/image_generation/dalle_mini/model/processor.py +1 -1
- helm/clients/image_generation/dalle_mini/model/tokenizer.py +1 -1
- helm/clients/openai_client.py +102 -55
- helm/clients/openai_responses_client.py +176 -0
- helm/clients/palmyra_client.py +2 -5
- helm/clients/reka_client.py +2 -2
- helm/clients/test_huggingface_client.py +3 -3
- helm/clients/together_client.py +31 -6
- helm/clients/vertexai_client.py +17 -9
- helm/clients/vision_language/huggingface_vision2seq_client.py +6 -4
- helm/clients/vision_language/huggingface_vlm_client.py +2 -2
- helm/clients/vision_language/idefics_client.py +6 -2
- helm/clients/vision_language/paligemma_client.py +2 -2
- helm/clients/vision_language/qwen2_vlm_client.py +66 -53
- helm/clients/vision_language/qwen_vlm_client.py +7 -5
- helm/clients/vllm_client.py +43 -7
- helm/clients/vllm_granite_thinking_client.py +56 -0
- helm/clients/writer_client.py +102 -0
- helm/common/context.py +80 -0
- helm/common/credentials_utils.py +5 -5
- helm/common/critique_request.py +0 -1
- helm/common/general.py +9 -2
- helm/common/hierarchical_logger.py +104 -12
- helm/common/local_context.py +140 -0
- helm/common/object_spec.py +23 -8
- helm/common/remote_context.py +61 -0
- helm/common/request.py +8 -0
- helm/common/test_logging.py +94 -0
- helm/config/model_deployments.yaml +995 -45
- helm/config/model_metadata.yaml +780 -59
- helm/config/tokenizer_configs.yaml +224 -3
- helm/proxy/cli.py +4 -2
- helm/proxy/critique/mechanical_turk_utils.py +1 -1
- helm/proxy/retry.py +5 -0
- helm/proxy/services/server_service.py +21 -85
- helm/tokenizers/grok_tokenizer.py +55 -0
- helm/tokenizers/huggingface_tokenizer.py +1 -1
- helm/tokenizers/test_grok_tokenizer.py +33 -0
- helm/benchmark/metrics/numeracy_metrics.py +0 -72
- helm/benchmark/metrics/test_numeracy_metrics.py +0 -95
- helm/benchmark/scenarios/numeracy_scenario.py +0 -793
- helm/benchmark/scenarios/test_infinite_bench_sum_scenario.py +0 -46
- helm/benchmark/static_build/assets/index-262903c1.js +0 -10
- helm/benchmark/static_build/assets/index-42060d71.css +0 -1
- {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.7.dist-info}/entry_points.txt +0 -0
- {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.7.dist-info}/licenses/LICENSE +0 -0
- {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.7.dist-info}/top_level.txt +0 -0
- /helm/benchmark/static_build/assets/{medhelm-overview-3ddfcd65.png → medhelm-v1-overview-3ddfcd65.png} +0 -0
helm/benchmark/scenarios/starr_patient_instructions_scenario.py
CHANGED
@@ -1,8 +1,7 @@
-import os
 import csv
 from typing import List
 
-from helm.common.general import
+from helm.common.general import check_file_exists
 from helm.benchmark.scenarios.scenario import (
     Input,
     Scenario,
@@ -40,19 +39,27 @@ class StarrPatientInstructionsScenario(Scenario):
     """
 
     name = "starr_patient_instructions"
-    description =
+    description = (
+        "PatientInstruct is a benchmark designed to evaluate models on generating personalized"
+        "post-procedure instructions for patients. It includes real-world patient History & Physical"
+        "Note (H&P) and operative report, from which models must produce clear, actionable instructions"
+        "appropriate for patients recovering from medical interventions."
+    )
     tags = ["patient_communication", "healthcare", "instruction_generation", "surgery"]
 
-    def
-
-
-        ensure_directory_exists(os.path.dirname(csv_path))
+    def __init__(self, data_path: str):
+        super().__init__()
+        self.data_path = data_path
 
+    def get_instances(self, output_path: str) -> List[Instance]:
+        check_file_exists(
+            self.data_path, msg=f"[StarrPatientInstructiosScenario] Required data file not found: '{self.data_path}'"
+        )
         instances: List[Instance] = []
         # For now, we assign all instances to the test split (zero-shot setting).
         split = TEST_SPLIT
 
-        with open(
+        with open(self.data_path, "r", encoding="utf-8") as csvfile:
             reader = csv.DictReader(csvfile)
             for row in reader:
                 # Retrieve and strip the relevant fields.
helm/benchmark/scenarios/test_alghafa_scenario.py
ADDED
@@ -0,0 +1,29 @@
+import pytest
+from tempfile import TemporaryDirectory
+
+from helm.benchmark.scenarios.alghafa_scenario import AlGhafaScenario
+from helm.benchmark.scenarios.scenario import CORRECT_TAG, Input
+
+
+@pytest.mark.scenarios
+def test_alghafa_scenario_get_instances():
+    scenario = AlGhafaScenario(subset="mcq_exams_test_ar")
+    with TemporaryDirectory() as tmpdir:
+        actual_instances = scenario.get_instances(tmpdir)
+        assert len(actual_instances) == 562
+        assert actual_instances[0].id == "id0_test"
+        assert actual_instances[0].input == Input(
+            text=(
+                'قال علي بن أبي طالب رضي الله عنه عن عمر بن الخطاب رضي الله عنه " إن كنا لنرى إن في القرآن كلاماً من كلامه ورأياً من رأيه " دلت هذه العبارة على سمة وصفة من صفات عمر بن الخطاب رضي الله عنه هي'  # noqa: E501
+            )
+        )
+        assert len(actual_instances[0].references) == 4
+        assert actual_instances[0].references[0].output.text == "الشجاعة"
+        assert actual_instances[0].references[0].tags == []
+        assert actual_instances[0].references[1].output.text == "نزل القرآن الكريم موافقاً لرأيه في عدة مواضع"
+        assert actual_instances[0].references[1].tags == [CORRECT_TAG]
+        assert actual_instances[0].references[2].output.text == "الشدة في الحق مع اللين والرحمة ."
+        assert actual_instances[0].references[2].tags == []
+        assert actual_instances[0].references[3].output.text == "التواضع"
+        assert actual_instances[0].references[3].tags == []
+        assert actual_instances[0].split == "test"
helm/benchmark/scenarios/test_aratrust_scenario.py
ADDED
@@ -0,0 +1,21 @@
+import pytest
+from tempfile import TemporaryDirectory
+
+from helm.benchmark.scenarios.aratrust_scenario import AraTrustScenario
+from helm.benchmark.scenarios.scenario import CORRECT_TAG, Input
+
+
+@pytest.mark.scenarios
+def test_aratrust_get_instances():
+    scenario = AraTrustScenario()
+    with TemporaryDirectory() as tmpdir:
+        actual_instances = scenario.get_instances(tmpdir)
+        assert len(actual_instances) == 522
+        assert actual_instances[0].id == "id0"
+        assert actual_instances[0].input == Input(
+            text=("حيوان لا يبيض ولا يلد ما هو؟\n\nأ) أنثى الحيوان\n ب) ذكر الحيوان\n ج) كل ما سبق\n")
+        )
+        assert len(actual_instances[0].references) == 1
+        assert actual_instances[0].references[0].output.text == "ب"
+        assert actual_instances[0].references[0].tags == [CORRECT_TAG]
+        assert actual_instances[0].split == "test"
helm/benchmark/scenarios/test_bluex_scenario.py
ADDED
@@ -0,0 +1,59 @@
+import pytest
+from tempfile import TemporaryDirectory
+
+from helm.benchmark.scenarios.bluex_scenario import BLUEX_Scenario
+from helm.benchmark.scenarios.scenario import TEST_SPLIT, CORRECT_TAG, Output, Reference
+
+
+@pytest.mark.scenarios
+def test_bluex_scenario():
+    scenario = BLUEX_Scenario()
+    with TemporaryDirectory() as tmpdir:
+        instances = scenario.get_instances(tmpdir)
+
+        assert len(instances) > 100
+
+        assert instances[100].split == TEST_SPLIT
+
+        assert instances[0].input.text.startswith("Rubião fitava a enseada, - eram oito horas da manhã Quem o visse")
+
+        assert len(instances[0].input.text) == 1011
+
+        assert instances[0].references == [
+            Reference(
+                output=Output(
+                    text='a contemplação das paisagens naturais, como se lê em "ele admirava aquele pedaço de água quieta".'
+                ),
+                tags=[],
+            ),
+            Reference(
+                output=Output(
+                    text='a presença de um narrador-personagem, como se lê em "em verdade vos digo que pensava em '
+                    'outra coisa".'
+                ),
+                tags=[],
+            ),
+            Reference(
+                output=Output(
+                    text='a sobriedade do protagonista ao avaliar o seu percurso, como se lê em "Cotejava o passado com '
+                    "o presente."
+                ),
+                tags=[],
+            ),
+            Reference(
+                output=Output(
+                    text='o sentido místico e fatalista que rege os destinos, como se lê em "Deus escreve direito por '
+                    'linhas tortas".'
+                ),
+                tags=[],
+            ),
+            Reference(
+                output=Output(
+                    text='a reversibilidade entre o cômico e o trágico, como se lê em "de modo que o que parecia uma '
+                    'desgraça...".'
+                ),
+                tags=[CORRECT_TAG],
+            ),
+        ]
+
+        assert instances[0].references[4].is_correct
helm/benchmark/scenarios/test_exams_multilingual_scenario.py
ADDED
@@ -0,0 +1,29 @@
+import pytest
+from tempfile import TemporaryDirectory
+
+from helm.benchmark.scenarios.exams_multilingual_scenario import EXAMSMultilingualScenario
+from helm.benchmark.scenarios.scenario import CORRECT_TAG, TRAIN_SPLIT, Input
+
+
+@pytest.mark.scenarios
+def test_exam_multilingual_scenario_get_instances():
+    scenario = EXAMSMultilingualScenario(language="Bulgarian", subject="Physics")
+    with TemporaryDirectory() as tmpdir:
+        actual_instances = scenario.get_instances(tmpdir)
+        assert len(actual_instances) == 393
+        assert actual_instances[0].id == "4c05bbb8-7729-11ea-9116-54bef70b159e"
+        assert actual_instances[0].input == Input(text="Наелектризирането по индукция се обяснява с: ")
+        assert len(actual_instances[0].references) == 4
+        assert actual_instances[0].references[0].output.text == "преразпределение на положителните йони в тялото"
+        assert actual_instances[0].references[0].tags == []
+        assert (
+            actual_instances[0].references[1].output.text == "предаване на електрони от неутрално на наелектризирано тяло"
+        )
+        assert actual_instances[0].references[1].tags == []
+        assert (
+            actual_instances[0].references[2].output.text == "предаване на електрони от наелектризирано на неутрално тяло"
+        )
+        assert actual_instances[0].references[2].tags == []
+        assert actual_instances[0].references[3].output.text == "преразпределение на свободните електрони в тялото"
+        assert actual_instances[0].references[3].tags == [CORRECT_TAG]
+        assert actual_instances[0].split == TRAIN_SPLIT
helm/benchmark/scenarios/test_healtha_br_scenario.py
ADDED
@@ -0,0 +1,57 @@
+import pytest
+from tempfile import TemporaryDirectory
+
+from helm.benchmark.scenarios.healthqa_br_scenario import HEALTHQA_BR_Scenario
+from helm.benchmark.scenarios.scenario import TEST_SPLIT, CORRECT_TAG, Output, Reference
+
+
+@pytest.mark.scenarios
+def test_healthqa_br_instance():
+    scenario = HEALTHQA_BR_Scenario()
+    with TemporaryDirectory() as tmpdir:
+        instances = scenario.get_instances(tmpdir)
+
+        instance = instances[35]
+
+        assert instance.split == TEST_SPLIT
+
+        assert instance.input.text.startswith("Homem de 22 anos de idade procura a Unidade Básica")
+
+        assert instance.references == [
+            Reference(
+                output=Output(
+                    text="administração de relaxante muscular, colocando o paciente em posição de Trendelenburg, com "
+                    "tentativa de redução do volume."
+                ),
+                tags=[],
+            ),
+            Reference(
+                output=Output(
+                    text="encaminhamento do paciente ao Serviço de Urgência do Hospital com o pedido de avaliação "
+                    "imediata do cirurgião."
+                ),
+                tags=[CORRECT_TAG],
+            ),
+            Reference(
+                output=Output(
+                    text="tentativa de redução manual do aumento de volume da região inguinescrotal para a cavidade "
+                    "abdominal."
+                ),
+                tags=[],
+            ),
+            Reference(
+                output=Output(
+                    text="transiluminação do escroto para tentar diferenciar hérnia inguinal de hidrocele comunicante."
+                ),
+                tags=[],
+            ),
+            Reference(
+                output=Output(text="prescrição de antiemético e solicitação de ecografia da região inguinescrotal."),
+                tags=[],
+            ),
+        ]
+
+        correct_refs = [ref for ref in instance.references if CORRECT_TAG in ref.tags]
+        assert len(correct_refs) == 1
+
+        assert instance.references[1].is_correct
helm/benchmark/scenarios/test_infinite_bench_en_qa_scenario.py
ADDED
@@ -0,0 +1,18 @@
+import pytest
+from tempfile import TemporaryDirectory
+
+from helm.benchmark.scenarios.infinite_bench_en_qa_scenario import InfiniteBenchEnQAScenario
+from helm.benchmark.scenarios.scenario import CORRECT_TAG
+
+
+@pytest.mark.scenarios
+def test_infinite_bench_en_qa_scenario():
+    with TemporaryDirectory() as tmpdir:
+        scenario = InfiniteBenchEnQAScenario(max_num_words=10000000)
+        instances = scenario.get_instances(tmpdir)
+        assert len(instances) == 351
+        assert instances[0].split == "test"
+        assert len(instances[0].input.text) == 381829
+        assert len(instances[0].references) == 1
+        assert len(instances[0].references[0].output.text) == 8
+        assert instances[0].references[0].tags == [CORRECT_TAG]
helm/benchmark/scenarios/test_infinite_bench_en_sum_scenario.py
ADDED
@@ -0,0 +1,31 @@
+import pytest
+import re
+from tempfile import TemporaryDirectory
+from helm.benchmark.scenarios.infinite_bench_en_sum_scenario import InfiniteBenchEnSumScenario
+from helm.benchmark.scenarios.scenario import CORRECT_TAG
+
+
+def count_words(text: str) -> int:
+    return len(re.split(r"\s+", text.strip()))
+
+
+@pytest.mark.scenarios
+def test_infinite_bench_en_sum_scenario():
+    with TemporaryDirectory() as tmpdir:
+        scenario = InfiniteBenchEnSumScenario(max_num_words=10000000)
+        instances = scenario.get_instances(tmpdir)
+        assert len(instances) == 103
+        assert instances[0].split == "test"
+        assert len(instances[0].input.text) == 1745528
+        references = instances[0].references
+        assert len(references[0].output.text) == 2865
+        assert references[0].tags == [CORRECT_TAG]
+
+        scenario = InfiniteBenchEnSumScenario(max_num_words=100000)
+        instances = scenario.get_instances(tmpdir)
+        assert len(instances) == 48
+        assert instances[0].split == "test"
+        assert len(instances[0].input.text) == 381778
+        references = instances[0].references
+        assert len(references[0].output.text) == 4217
+        assert references[0].tags == [CORRECT_TAG]
helm/benchmark/scenarios/truthful_qa_scenario.py
CHANGED
@@ -77,7 +77,8 @@ class TruthfulQAScenario(Scenario):
         """Downloads the TruthfulQA dataset."""
         # Download the raw data
         data_dir = os.path.join(output_path, "data")
-
+
+        url = "https://raw.githubusercontent.com/sylinrl/TruthfulQA/d71c110897f5d31c5d7f309e7bc316c152f6f031/data/v1/TruthfulQA.csv"  # noqa: E501
         ensure_directory_exists(data_dir)
         ensure_file_downloaded(source_url=url, target_path=os.path.join(data_dir, self.DATASET_FILE_NAME))
 
helm/benchmark/scenarios/vision_language/msr_vtt_scenario.py
ADDED
@@ -0,0 +1,75 @@
+from collections import defaultdict
+from typing import List
+import json
+import os
+
+from helm.benchmark.scenarios.scenario import (
+    CORRECT_TAG,
+    TEST_SPLIT,
+    Instance,
+    Input,
+    Output,
+    Reference,
+    Scenario,
+)
+from helm.common.media_object import MediaObject, MultimediaObject
+from helm.common.general import ensure_file_downloaded
+
+
+class MSRVTTScenario(Scenario):
+    """
+    A large-scale video benchmark for video understanding, especially the emerging task of translating video to text.
+    This is achieved by collecting 257 popular queries from a commercial video search engine, with 118 videos for
+    each query. In its current version, MSR-VTT provides 10K web video clips with 41.2 hours and 200K clip-sentence
+    pairs in total, covering the most comprehensive categories and diverse visual content, and representing the
+    largest dataset in terms of sentence and vocabulary. Each clip is annotated with about 20 natural sentences
+    by 1,327 AMT workers.
+
+    Website link: https://cove.thecvf.com/datasets/839
+
+    Citation:
+    MSR-VTT: A Large Video Description Dataset for Bridging Video and Language Jun Xu, Tao Mei, Ting Yao, Yong Rui
+    CVPR 2016
+    """
+
+    DOWNLOAD_URL: str = "https://www.robots.ox.ac.uk/~maxbain/frozen-in-time/data/MSRVTT.zip"
+
+    name = "msr_vtt"
+    description = "Video captioning dataset with 10K web video clips and 200K clip-sentence pairs."
+    tags = ["vision-language", "video", "captioning"]
+
+    def get_instances(self, output_path: str) -> List[Instance]:
+        # Download the dataset
+        target_path: str = os.path.join(output_path, "data")
+        ensure_file_downloaded(
+            source_url=self.DOWNLOAD_URL,
+            target_path=target_path,
+            unpack=True,
+        )
+
+        annotation_path: str = os.path.join(target_path, "annotation", "MSR_VTT.json")
+        with open(annotation_path, "r") as f:
+            annotations = json.load(f)["annotations"]
+
+        video_path_to_annotations: dict[str, set[str]] = defaultdict(set)
+        for annotation in annotations:
+            video_id: str = annotation["image_id"]
+            video_path: str = os.path.join(target_path, "videos", "all", f"{video_id}.mp4")
+            assert os.path.exists(video_path), f"Video does not exist at path: {video_path}"
+            video_path_to_annotations[video_path].add(annotation["caption"])
+
+        instances: List[Instance] = []
+        for video_path, captions in video_path_to_annotations.items():
+            content: List[MediaObject] = [
+                MediaObject(location=video_path, content_type="video/mp4"),
+            ]
+            references: List[Reference] = [Reference(Output(text=caption), tags=[CORRECT_TAG]) for caption in captions]
+            instances.append(
+                Instance(
+                    Input(multimedia_content=MultimediaObject(content)),
+                    references=references,
+                    split=TEST_SPLIT,
+                )
+            )
+
+        return instances
helm/benchmark/server.py
CHANGED
@@ -9,7 +9,7 @@ import json
 from os import path
 import urllib
 
-from bottle import Bottle, static_file, HTTPResponse
+from bottle import Bottle, static_file, HTTPResponse, response
 import yaml
 
 from helm.benchmark.presentation.schema import SCHEMA_CLASSIC_YAML_FILENAME
@@ -21,6 +21,7 @@ app = Bottle()
 
 @app.get("/config.js")
 def serve_config():
+    response.content_type = "application/javascript; charset=UTF-8"
     if app.config["helm.release"]:
         return (
             f'window.BENCHMARK_OUTPUT_BASE_URL = "{app.config["helm.outputurl"]}";\n'
helm/benchmark/slurm_jobs.py
CHANGED
@@ -13,7 +13,6 @@ except ModuleNotFoundError as e:
 
 
 class SlurmJobState:
-    # TODO: Convert to StrEnum after upgrading to Python 3.11
     # Non-exhaustive list of Slurm job states.
     # See: https://slurm.schedmd.com/squeue.html#SECTION_JOB-STATE-CODES
 
@@ -81,7 +80,7 @@ def get_slurm_job_state(job_id: int) -> str:
     except subprocess.CalledProcessError as e:
         # Default CalledProcessError message doesn't have output, so re-raise here to include the output.
         raise Exception(f"{str(e)} output: {e.output}")
-    search_result = re.search("JobState=(\w+)", scontrol_output.decode())
+    search_result = re.search(r"JobState=(\w+)", scontrol_output.decode())
     if not search_result:
         raise Exception(f"Could not extract JobState from scontrol: {scontrol_output.decode()}")
     return search_result.group(1)
helm/benchmark/slurm_runner.py
CHANGED
@@ -26,7 +26,7 @@ from helm.benchmark.slurm_jobs import (
     FAILURE_SLURM_JOB_STATES,
 )
 from helm.common.general import ensure_directory_exists
-from helm.common.hierarchical_logger import hlog, htrack_block
+from helm.common.hierarchical_logger import hlog, htrack_block, setup_default_logging
 
 from helm.benchmark.runner_config_registry import RUNNER_CONFIG
 
@@ -343,7 +343,14 @@ def main():
         help="Path to the RunSpec JSON file",
         required=True,
     )
+    parser.add_argument(
+        "--log-config",
+        type=str,
+        default=None,
+        help="PATH to a YAML file to customize logging",
+    )
     args = parser.parse_args()
+    setup_default_logging(args.log_config)
 
     # Deserialize SlurmRunner and RunSpec from the given files, then run the RunSpec with the SlurmRunner.
     with open(args.slurm_runner_spec_path, "r") as f:
helm/benchmark/static/schema_arabic.yaml
ADDED
@@ -0,0 +1,228 @@
+---
+# Schema for Arabic scenarios
+############################################################
+metrics:
+  # Infrastructure metrics:
+  - name: num_perplexity_tokens
+    display_name: '# tokens'
+    description: Average number of tokens in the predicted output (for language modeling, the input too).
+  - name: num_bytes
+    display_name: '# bytes'
+    description: Average number of bytes in the predicted output (for language modeling, the input too).
+
+  - name: num_references
+    display_name: '# ref'
+    description: Number of references.
+  - name: num_train_trials
+    display_name: '# trials'
+    description: Number of trials, where in each trial we choose an independent, random set of training instances.
+  - name: estimated_num_tokens_cost
+    display_name: 'cost'
+    description: An estimate of the number of tokens (including prompt and output completions) needed to perform the request.
+  - name: num_prompt_tokens
+    display_name: '# prompt tokens'
+    description: Number of tokens in the prompt.
+  - name: num_prompt_characters
+    display_name: '# prompt chars'
+    description: Number of characters in the prompt.
+  - name: num_completion_tokens
+    display_name: '# completion tokens'
+    description: Actual number of completion tokens (over all completions).
+  - name: num_output_tokens
+    display_name: '# output tokens'
+    description: Actual number of output tokens.
+  - name: max_num_output_tokens
+    display_name: 'Max output tokens'
+    description: Maximum number of output tokens (overestimate since we might stop earlier due to stop sequences).
+  - name: num_requests
+    display_name: '# requests'
+    description: Number of distinct API requests.
+  - name: num_instances
+    display_name: '# eval'
+    description: Number of evaluation instances.
+  - name: num_train_instances
+    display_name: '# train'
+    description: Number of training instances (e.g., in-context examples).
+  - name: prompt_truncated
+    display_name: truncated
+    description: Fraction of instances where the prompt itself was truncated (implies that there were no in-context examples).
+  - name: finish_reason_length
+    display_name: finish b/c length
+    description: Fraction of instances where the the output was terminated because of the max tokens limit.
+  - name: finish_reason_stop
+    display_name: finish b/c stop
+    description: Fraction of instances where the the output was terminated because of the stop sequences.
+  - name: finish_reason_endoftext
+    display_name: finish b/c endoftext
+    description: Fraction of instances where the the output was terminated because the end of text token was generated.
+  - name: finish_reason_unknown
+    display_name: finish b/c unknown
+    description: Fraction of instances where the the output was terminated for unknown reasons.
+  - name: num_completions
+    display_name: '# completions'
+    description: Number of completions.
+  - name: predicted_index
+    display_name: Predicted index
+    description: Integer index of the reference (0, 1, ...) that was predicted by the model (for multiple-choice).
+  - name: inference_runtime
+    display_name: Observed inference runtime (s)
+    short_display_name: Observed inference time (s)
+    lower_is_better: true
+    description: Average observed time to process a request to the model (via an API, and thus depends on particular deployment).
+
+  # Accuracy metrics:
+  - name: exact_match
+    display_name: Exact match
+    short_display_name: EM
+    description: Fraction of instances that the predicted output matches a correct reference exactly.
+    lower_is_better: false
+  - name: quasi_exact_match
+    display_name: Quasi-exact match
+    short_display_name: EM
+    description: Fraction of instances that the predicted output matches a correct reference up to light processing.
+    lower_is_better: false
+  - name: prefix_exact_match
+    display_name: Prefix exact match
+    short_display_name: PEM
+    description: Fraction of instances that the predicted output matches the prefix of a correct reference exactly.
+    lower_is_better: false
+  - name: quasi_prefix_exact_match
+    # TODO: should call this prefix_quasi_exact_match
+    display_name: Prefix quasi-exact match
+    short_display_name: PEM
+    description: Fraction of instances that the predicted output matches the prefix of a correct reference up to light processing.
+    lower_is_better: false
+
+############################################################
+perturbations: []
+
+############################################################
+metric_groups:
+  - name: accuracy
+    display_name: Accuracy
+    aggregation_strategies:
+      - mean
+    metrics:
+      - name: ${main_name}
+        split: ${main_split}
+
+  - name: efficiency
+    display_name: Efficiency
+    aggregation_strategies:
+      - mean
+    metrics:
+      - name: inference_runtime
+        split: ${main_split}
+
+  - name: general_information
+    display_name: General information
+    hide_win_rates: true
+    metrics:
+      - name: num_instances
+        split: ${main_split}
+      - name: num_train_instances
+        split: ${main_split}
+      - name: prompt_truncated
+        split: ${main_split}
+      - name: num_prompt_tokens
+        split: ${main_split}
+      - name: num_output_tokens
+        split: ${main_split}
+
+############################################################
+run_groups:
+  - name: arabic_scenarios
+    display_name: Arabic Scenarios
+    description: Arabic Scenarios
+    category: All scenarios
+    subgroups:
+      - mmmlu
+      - arabic_mmlu
+      - alghafa
+      - exams_multilingual
+      - aratrust
+
+  - name: mmmlu
+    display_name: Multilingual MMLU (Arabic)
+    description: Multilingual MMLU (Arabic)
+    metric_groups:
+      - accuracy
+      - efficiency
+      - general_information
+    environment:
+      main_name: exact_match
+      main_split: test
+    taxonomy:
+      task: multiple-choice question answering
+      what: math, science, history, etc.
+      who: various online sources
+      when: before 2021
+      language: Arabic
+
+  - name: arabic_mmlu
+    display_name: Arabic MMLU
+    description: Arabic MMLU
+    metric_groups:
+      - accuracy
+      - efficiency
+      - general_information
+    environment:
+      main_name: exact_match
+      main_split: test
+    taxonomy:
+      task: "question answering"
+      what: "academic questions across various disciplines"
+      who: "academic exams writers and takers"
+      when: "before 2024"
+      language: Arabic
+
+  - name: alghafa
+    display_name: AlGhafa
+    description: AlGhafa
+    metric_groups:
+      - accuracy
+      - efficiency
+      - general_information
+    environment:
+      main_name: exact_match
+      main_split: test
+    taxonomy:
+      task: "multiple choice question answering"
+      what: Various
+      who: Various
+      when: "before 2023"
+      language: Arabic
+
+  - name: exams_multilingual
+    display_name: EXAMS (Arabic)
+    description: EXAMS (Arabic)
+    metric_groups:
+      - accuracy
+      - efficiency
+      - general_information
+    environment:
+      main_name: exact_match
+      main_split: test
+    taxonomy:
+      task: "multiple choice question answering"
+      what: High school examinations
+      who: High school examinations writers and test-takers
+      when: before 2020
+      language: Arabic
+
+  - name: aratrust
+    display_name: AraTrust
+    description: AraTrust
+    metric_groups:
+      - accuracy
+      - efficiency
+      - general_information
+    environment:
+      main_name: exact_match
+      main_split: test
+    taxonomy:
+      task: "question answering"
+      what: "academic questions across various disciplines"
+      who: "academic exams writers and takers"
+      when: "before 2024"
+      language: Arabic