crfm-helm 0.5.6__py3-none-any.whl → 0.5.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of crfm-helm might be problematic.
- {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.7.dist-info}/METADATA +56 -49
- {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.7.dist-info}/RECORD +99 -66
- helm/benchmark/annotation/air_bench_annotator.py +1 -1
- helm/benchmark/annotation/live_qa_annotator.py +1 -1
- helm/benchmark/metrics/codeinsights_code_efficiency_metrics.py +186 -0
- helm/benchmark/metrics/codeinsights_code_evaluation_metrics.py +477 -0
- helm/benchmark/metrics/codeinsights_correct_code_metrics.py +366 -0
- helm/benchmark/metrics/codeinsights_edge_case_metrics.py +92 -0
- helm/benchmark/metrics/codeinsights_metric_specs.py +51 -0
- helm/benchmark/metrics/comet_metric.py +1 -1
- helm/benchmark/metrics/copyright_metrics.py +1 -1
- helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +1 -1
- helm/benchmark/metrics/evaluate_reference_metrics.py +1 -1
- helm/benchmark/metrics/image_generation/clip_score_metrics.py +13 -2
- helm/benchmark/metrics/image_generation/fractal_dimension/fractal_dimension_util.py +1 -1
- helm/benchmark/metrics/lmkt_metric_specs.py +12 -0
- helm/benchmark/metrics/lmkt_metrics.py +47 -0
- helm/benchmark/metrics/melt_toxicity_metric.py +1 -1
- helm/benchmark/metrics/summac/model_summac.py +1 -1
- helm/benchmark/model_deployment_registry.py +11 -19
- helm/benchmark/presentation/create_plots.py +11 -2
- helm/benchmark/presentation/schema.py +5 -0
- helm/benchmark/presentation/summarize.py +9 -3
- helm/benchmark/presentation/test_create_plots.py +4 -1
- helm/benchmark/run.py +7 -1
- helm/benchmark/run_specs/arabic_run_specs.py +73 -0
- helm/benchmark/run_specs/bluex_run_specs.py +40 -0
- helm/benchmark/run_specs/classic_run_specs.py +0 -53
- helm/benchmark/run_specs/codeinsights_run_specs.py +192 -0
- helm/benchmark/run_specs/healthqa_br_run_specs.py +40 -0
- helm/benchmark/run_specs/heim_run_specs.py +3 -1
- helm/benchmark/run_specs/lmkt_run_specs.py +144 -0
- helm/benchmark/run_specs/long_context_run_specs.py +48 -1
- helm/benchmark/run_specs/multilingual_run_specs.py +50 -0
- helm/benchmark/run_specs/speech_disorder_audio_run_specs.py +5 -11
- helm/benchmark/scenarios/alghafa_scenario.py +126 -0
- helm/benchmark/scenarios/arabic_mmlu_scenario.py +78 -0
- helm/benchmark/scenarios/aratrust_scenario.py +76 -0
- helm/benchmark/scenarios/audio_language/casual_conversations2_scenario.py +1 -1
- helm/benchmark/scenarios/audio_language/mustard_scenario.py +1 -1
- helm/benchmark/scenarios/audio_language/{ultra_suite_asr_classification.py → ultra_suite_asr_classification_scenario.py} +9 -8
- helm/benchmark/scenarios/audio_language/ultra_suite_asr_transcription_scenario.py +99 -0
- helm/benchmark/scenarios/audio_language/ultra_suite_classification_scenario.py +13 -5
- helm/benchmark/scenarios/audio_language/ultra_suite_disorder_breakdown_scenario.py +13 -5
- helm/benchmark/scenarios/audio_language/ultra_suite_disorder_symptoms_scenario.py +13 -5
- helm/benchmark/scenarios/bluex_scenario.py +66 -0
- helm/benchmark/scenarios/cleva_scenario.py +1 -1
- helm/benchmark/scenarios/codeinsights_code_efficiency_scenario.py +197 -0
- helm/benchmark/scenarios/codeinsights_correct_code_scenario.py +78 -0
- helm/benchmark/scenarios/codeinsights_edge_case_scenario.py +192 -0
- helm/benchmark/scenarios/codeinsights_student_coding_scenario.py +162 -0
- helm/benchmark/scenarios/codeinsights_student_mistake_scenario.py +188 -0
- helm/benchmark/scenarios/exams_multilingual_scenario.py +115 -0
- helm/benchmark/scenarios/healthqa_br_scenario.py +80 -0
- helm/benchmark/scenarios/infinite_bench_en_mc_scenario.py +90 -0
- helm/benchmark/scenarios/infinite_bench_en_qa_scenario.py +1 -1
- helm/benchmark/scenarios/lmkt_scenarios.py +288 -0
- helm/benchmark/scenarios/math_scenario.py +21 -20
- helm/benchmark/scenarios/medalign_scenario_helper.py +19 -125
- helm/benchmark/scenarios/melt_scenarios.py +2 -2
- helm/benchmark/scenarios/mimic_bhc_scenario.py +1 -1
- helm/benchmark/scenarios/mmmlu_scenario.py +85 -0
- helm/benchmark/scenarios/seahelm_scenario.py +2 -2
- helm/benchmark/scenarios/test_alghafa_scenario.py +29 -0
- helm/benchmark/scenarios/test_aratrust_scenario.py +21 -0
- helm/benchmark/scenarios/test_bluex_scenario.py +59 -0
- helm/benchmark/scenarios/test_exams_multilingual_scenario.py +29 -0
- helm/benchmark/scenarios/test_healtha_br_scenario.py +57 -0
- helm/benchmark/slurm_jobs.py +1 -2
- helm/benchmark/slurm_runner.py +8 -1
- helm/benchmark/static/schema_arabic.yaml +228 -0
- helm/benchmark/static/schema_classic.yaml +0 -17
- helm/benchmark/static/schema_long_context.yaml +19 -1
- helm/benchmark/static_build/assets/index-e439d5e1.js +10 -0
- helm/benchmark/static_build/index.html +1 -1
- helm/benchmark/window_services/image_generation/clip_window_service.py +1 -3
- helm/clients/audio_language/qwen2_5_omni_client.py +19 -7
- helm/clients/huggingface_client.py +2 -2
- helm/clients/openai_client.py +2 -1
- helm/clients/openai_responses_client.py +6 -4
- helm/clients/test_huggingface_client.py +3 -3
- helm/clients/together_client.py +0 -2
- helm/clients/vertexai_client.py +11 -9
- helm/clients/vllm_client.py +43 -7
- helm/clients/vllm_granite_thinking_client.py +56 -0
- helm/common/critique_request.py +0 -1
- helm/common/hierarchical_logger.py +83 -34
- helm/common/object_spec.py +23 -8
- helm/common/test_logging.py +94 -0
- helm/config/model_deployments.yaml +454 -175
- helm/config/model_metadata.yaml +117 -10
- helm/config/tokenizer_configs.yaml +81 -1
- helm/proxy/cli.py +1 -1
- helm/proxy/retry.py +5 -0
- helm/tokenizers/grok_tokenizer.py +2 -0
- helm/benchmark/metrics/numeracy_metrics.py +0 -72
- helm/benchmark/metrics/test_numeracy_metrics.py +0 -95
- helm/benchmark/scenarios/numeracy_scenario.py +0 -794
- helm/benchmark/static_build/assets/index-94295e78.js +0 -10
- {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.7.dist-info}/WHEEL +0 -0
- {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.7.dist-info}/entry_points.txt +0 -0
- {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.7.dist-info}/licenses/LICENSE +0 -0
- {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.7.dist-info}/top_level.txt +0 -0
helm/benchmark/run_specs/lmkt_run_specs.py

@@ -0,0 +1,144 @@
+"""Run spec functions for Vietnam WVS cultural alignment evaluation."""
+
+from helm.benchmark.adaptation.common_adapter_specs import (
+    get_generation_adapter_spec,
+)
+from helm.benchmark.metrics.common_metric_specs import (
+    get_exact_match_metric_specs,
+    get_f1_metric_specs,
+    get_open_ended_generation_metric_specs,
+)
+from helm.benchmark.metrics.lmkt_metric_specs import get_semantic_similarity_metric_specs
+from helm.benchmark.run_spec import RunSpec, run_spec_function
+from helm.benchmark.scenarios.scenario import ScenarioSpec
+
+INSTRUCTIONS = {
+    "cultural_value_understanding_wvs": {
+        "en": {
+            "instructions": "Please respond as the {country} persona described below.",
+            "input_noun": "Question",
+            "output_noun": "Answer",
+        },
+        "vi": {
+            "instructions": "Vui lòng trả lời như một người {country} được mô tả bên dưới.",
+            "input_noun": "Câu hỏi",
+            "output_noun": "Trả lời",
+        },
+    },
+    "social_norm_application_normad": {
+        "en": {
+            "instructions": "Please respond as the {country} persona described below.",
+            "input_noun": "Situation",
+            "output_noun": "Response",
+        },
+        "vi": {
+            "instructions": "Vui lòng trả lời như một người {country} được mô tả bên dưới.",
+            "input_noun": "Tình huống",
+            "output_noun": "Phản hồi",
+        },
+    },
+    "social_norm_explanation_normad": {
+        "en": {
+            "instructions": "Please respond as the {country} persona described below.",
+            "input_noun": "Situation",
+            "output_noun": "Explanation",
+        },
+        "vi": {
+            "instructions": "Vui lòng trả lời như một người {country} được mô tả bên dưới.",
+            "input_noun": "Tình huống",
+            "output_noun": "Giải thích",
+        },
+    },
+}
+
+COUNTRIES = {
+    "US": "United States",
+    "VN": "Vietnam",
+}
+
+
+@run_spec_function("cultural_value_understanding_wvs")
+def get_cultural_value_understanding_wvs_spec(language: str, country: str) -> RunSpec:
+    scenario_spec = ScenarioSpec(
+        class_name="helm.benchmark.scenarios.lmkt_scenarios.CulturalValueUnderstandingWVSScenario",
+        args={
+            "language": language,
+            "num_personas": 300,
+            "num_question_variants": 4,
+            "include_few_shot_examples": True,
+        },
+    )
+
+    adapter_spec = get_generation_adapter_spec(
+        instructions=INSTRUCTIONS["cultural_value_understanding_wvs"][language]["instructions"].format(
+            country=COUNTRIES[country]
+        ),
+        input_noun=INSTRUCTIONS["cultural_value_understanding_wvs"][language]["input_noun"],
+        output_noun=INSTRUCTIONS["cultural_value_understanding_wvs"][language]["output_noun"],
+        max_tokens=3,
+        stop_sequences=[],
+    )
+
+    return RunSpec(
+        name="cultural_value_understanding_wvs",
+        scenario_spec=scenario_spec,
+        adapter_spec=adapter_spec,
+        metric_specs=get_exact_match_metric_specs() + get_f1_metric_specs(),
+        groups=["lmkt", "cultural_value_understanding_wvs"],
+    )
+
+
+@run_spec_function("social_norm_application_normad")
+def get_social_norm_application_normad_spec(language: str, country: str) -> RunSpec:
+    scenario_spec = ScenarioSpec(
+        class_name="helm.benchmark.scenarios.lmkt_scenarios.SocialNormApplicationNormADScenario",
+        args={
+            "language": language,
+        },
+    )
+
+    adapter_spec = get_generation_adapter_spec(
+        instructions=INSTRUCTIONS["social_norm_application_normad"][language]["instructions"].format(
+            country=COUNTRIES[country]
+        ),
+        input_noun=INSTRUCTIONS["social_norm_application_normad"][language]["input_noun"],
+        output_noun=INSTRUCTIONS["social_norm_application_normad"][language]["output_noun"],
+        max_tokens=5,
+        stop_sequences=[],
+    )
+
+    return RunSpec(
+        name="social_norm_application_normad",
+        scenario_spec=scenario_spec,
+        adapter_spec=adapter_spec,
+        metric_specs=get_exact_match_metric_specs() + get_f1_metric_specs(),
+        groups=["lmkt", "social_norm_application_normad"],
+    )
+
+
+@run_spec_function("social_norm_explanation_normad")
+def get_social_norm_explanation_normad_spec(language: str, country: str) -> RunSpec:
+    scenario_spec = ScenarioSpec(
+        class_name="helm.benchmark.scenarios.lmkt_scenarios.SocialNormExplanationNormADScenario",
+        args={
+            "language": language,
+        },
+    )
+
+    adapter_spec = get_generation_adapter_spec(
+        instructions=INSTRUCTIONS["social_norm_explanation_normad"][language]["instructions"].format(
+            country=COUNTRIES[country]
+        ),
+        input_noun=INSTRUCTIONS["social_norm_explanation_normad"][language]["input_noun"],
+        output_noun=INSTRUCTIONS["social_norm_explanation_normad"][language]["output_noun"],
+        max_tokens=128,
+        stop_sequences=[],
+    )
+
+    return RunSpec(
+        name="social_norm_explanation_normad",
+        scenario_spec=scenario_spec,
+        adapter_spec=adapter_spec,
+        metric_specs=get_open_ended_generation_metric_specs() + get_semantic_similarity_metric_specs(),
+        groups=["lmkt", "social_norm_explanation_normad"],
+    )
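For orientation, a minimal sketch (not part of the diff) that exercises one of the new run spec functions directly and inspects the RunSpec it builds; the language/country arguments are placeholders taken from the tables above, and the commented values reflect what the code above sets.

```python
# Minimal sketch: build the lmkt WVS run spec and inspect it (assumes crfm-helm 0.5.7 is installed).
from helm.benchmark.run_specs.lmkt_run_specs import get_cultural_value_understanding_wvs_spec

run_spec = get_cultural_value_understanding_wvs_spec(language="vi", country="VN")
print(run_spec.name)                      # cultural_value_understanding_wvs
print(run_spec.scenario_spec.class_name)  # ...lmkt_scenarios.CulturalValueUnderstandingWVSScenario
print(run_spec.adapter_spec.max_tokens)   # 3, as set by get_generation_adapter_spec above
```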
helm/benchmark/run_specs/long_context_run_specs.py

@@ -1,4 +1,9 @@
-from helm.benchmark.adaptation.adapter_spec import
+from helm.benchmark.adaptation.adapter_spec import (
+    ADAPT_CHAT,
+    ADAPT_GENERATION,
+    ADAPT_MULTIPLE_CHOICE_JOINT,
+    AdapterSpec,
+)
 from helm.benchmark.metrics.common_metric_specs import (
     get_exact_match_metric_specs,
     get_open_ended_generation_metric_specs,

@@ -29,6 +34,27 @@ def _get_long_context_generation_adapter_spec(max_tokens: int) -> AdapterSpec:
     )


+def _get_long_context_multiple_choice_adapter_spec(max_tokens: int) -> AdapterSpec:
+    return AdapterSpec(
+        method=ADAPT_MULTIPLE_CHOICE_JOINT,
+        global_prefix="",
+        global_suffix="",
+        instructions="Read the passage and answer the following question. Respond with only a single letter corresponding to your choice.",  # noqa: E501
+        input_prefix="",
+        input_suffix="\n",
+        reference_prefix="A. ",
+        reference_suffix="\n",
+        output_prefix="",
+        output_suffix="",
+        instance_prefix="",
+        max_train_instances=0,
+        num_outputs=1,
+        temperature=0.0,
+        max_tokens=max_tokens,
+        stop_sequences=[],
+    )
+
+
 @run_spec_function("ruler_hotpotqa")
 def get_ruler_hotpotqa_spec(max_num_words: int = 131072) -> RunSpec:
     scenario_spec = ScenarioSpec(

@@ -96,6 +122,27 @@ def get_infinite_bench_en_qa_spec(max_num_words: int = 131072) -> RunSpec:
     )


+@run_spec_function("infinite_bench_en_mc")
+def get_infinite_bench_en_mc_spec(max_num_words: int = 131072) -> RunSpec:
+    scenario_spec = ScenarioSpec(
+        class_name="helm.benchmark.scenarios.infinite_bench_en_mc_scenario.InfiniteBenchEnMCScenario",
+        args={
+            "max_num_words": max_num_words,
+        },
+    )
+
+    adapter_spec = _get_long_context_multiple_choice_adapter_spec(max_tokens=40)
+    metric_specs = get_exact_match_metric_specs()
+
+    return RunSpec(
+        name=f"infinite_bench_en_mc:max_num_words={max_num_words}",
+        scenario_spec=scenario_spec,
+        adapter_spec=adapter_spec,
+        metric_specs=metric_specs,
+        groups=["infinite_bench_en_mc"],
+    )
+
+
 @run_spec_function("infinite_bench_en_sum")
 def get_infinite_bench_en_sum_spec(max_num_words: int = 131072) -> RunSpec:

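For orientation, a rough sketch (not part of the diff, and not the adapter's actual rendering code) of the prompt shape the new multiple-choice adapter settings imply; the passage, question, and choices here are invented, and only the prefixes/suffixes are taken from the AdapterSpec above.

```python
# Rough sketch of the zero-shot multiple-choice prompt implied by the adapter spec above.
instructions = (
    "Read the passage and answer the following question. "
    "Respond with only a single letter corresponding to your choice."
)
passage_and_question = "<very long passage> ... Question: Who met Alice at the station?"  # invented
choices = ["Bob", "Carol", "Dave", "Erin"]  # invented

prompt = instructions + "\n" + passage_and_question + "\n"      # input_suffix="\n"
for letter, choice in zip("ABCD", choices):
    prompt += f"{letter}. {choice}\n"                           # reference_prefix="A. ", reference_suffix="\n"
print(prompt)  # no output_prefix is appended, since output_prefix="" in the spec
```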
helm/benchmark/run_specs/multilingual_run_specs.py

@@ -0,0 +1,50 @@
+from helm.benchmark.adaptation.adapter_spec import ADAPT_MULTIPLE_CHOICE_JOINT
+from helm.benchmark.adaptation.common_adapter_specs import get_multiple_choice_adapter_spec
+from helm.benchmark.metrics.common_metric_specs import get_exact_match_metric_specs
+from helm.benchmark.run_spec import RunSpec, run_spec_function
+from helm.benchmark.scenarios.scenario import ScenarioSpec
+
+
+@run_spec_function("mmmlu")
+def get_mmmlu_spec(locale: str, subject: str) -> RunSpec:
+    scenario_spec = ScenarioSpec(
+        class_name="helm.benchmark.scenarios.mmmlu_scenario.MMMLUScenario", args={"locale": locale, "subject": subject}
+    )
+
+    adapter_spec = get_multiple_choice_adapter_spec(
+        method=ADAPT_MULTIPLE_CHOICE_JOINT,
+        instructions=f"The following are multiple choice questions (with answers) about {subject.replace('_', ' ')}. Answer the last question. Respond only with only a single letter corresponding to your choice.",  # noqa: E501
+        input_noun="Question",
+        output_noun="Answer",
+    )
+
+    return RunSpec(
+        name=f"mmmlu:locale={locale},subject={subject}",
+        scenario_spec=scenario_spec,
+        adapter_spec=adapter_spec,
+        metric_specs=get_exact_match_metric_specs(),
+        groups=["mmmlu", f"mmmlu_{locale}_{subject}"],
+    )
+
+
+@run_spec_function("exams_multilingual")
+def get_exams_multilingual_spec(language: str, subject: str) -> RunSpec:
+    scenario_spec = ScenarioSpec(
+        class_name="helm.benchmark.scenarios.exams_multilingual_scenario.EXAMSMultilingualScenario",
+        args={"language": language, "subject": subject},
+    )
+
+    adapter_spec = get_multiple_choice_adapter_spec(
+        method=ADAPT_MULTIPLE_CHOICE_JOINT,
+        instructions=f"The following are multiple choice questions (with answers) about {subject.replace('_', ' ')}. Answer the last question. Respond only with only a single letter corresponding to your choice.",  # noqa: E501
+        input_noun="Question",
+        output_noun="Answer",
+    )
+
+    return RunSpec(
+        name=f"exams_multilingual:locale={language},subject={subject}",
+        scenario_spec=scenario_spec,
+        adapter_spec=adapter_spec,
+        metric_specs=get_exact_match_metric_specs(),
+        groups=["exams_multilingual", f"exams_multilingual_{language}_{subject}"],
+    )
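A similar sketch (not part of the diff) showing the parameterized run names the new functions build; the locale and subject values are placeholders.

```python
# Sketch: inspect the run name and groups produced by the new mmmlu run spec function.
from helm.benchmark.run_specs.multilingual_run_specs import get_mmmlu_spec

spec = get_mmmlu_spec(locale="AR_XY", subject="philosophy")  # placeholder arguments
print(spec.name)    # mmmlu:locale=AR_XY,subject=philosophy
print(spec.groups)  # ['mmmlu', 'mmmlu_AR_XY_philosophy']
```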
helm/benchmark/run_specs/speech_disorder_audio_run_specs.py

@@ -73,9 +73,7 @@ def get_ultra_suite_classification_run_spec() -> RunSpec:
     scenario_spec = ScenarioSpec(
         class_name="helm.benchmark.scenarios.audio_language.ultra_suite_classification_scenario.UltraSuiteClassificationScenario",  # noqa: E501
     )
-    adapter_spec: AdapterSpec = _get_multiple_choice_joint_adapter_spec(
-        input_noun=None, output_noun="Answer", max_train_instances=0
-    )
+    adapter_spec: AdapterSpec = _get_multiple_choice_joint_adapter_spec(input_noun=None, output_noun="Answer")
     metric_specs: List[MetricSpec] = audio_classification_metric_specs()
     run_spec_name: str = "ultra_suite_classification"
     return RunSpec(

@@ -92,9 +90,7 @@ def get_ultra_suite_disorder_breakdown_run_spec() -> RunSpec:
     scenario_spec = ScenarioSpec(
         class_name="helm.benchmark.scenarios.audio_language.ultra_suite_disorder_breakdown_scenario.UltraSuiteDisorderBreakdownScenario",  # noqa: E501
     )
-    adapter_spec: AdapterSpec = _get_multiple_choice_joint_adapter_spec(
-        input_noun=None, output_noun="Answer", max_train_instances=0
-    )
+    adapter_spec: AdapterSpec = _get_multiple_choice_joint_adapter_spec(input_noun=None, output_noun="Answer")
     metric_specs: List[MetricSpec] = audio_classification_metric_specs()
     run_spec_name: str = "ultra_suite_classification_breakdown"
     return RunSpec(

@@ -112,7 +108,7 @@ def get_ultra_suite_disorder_breakdown_run_spec() -> RunSpec:
 @run_spec_function("ultra_suite_asr_classification")
 def get_ultra_suite_asr_classification_run_spec() -> RunSpec:
     scenario_spec = ScenarioSpec(
-        class_name="helm.benchmark.scenarios.audio_language.
+        class_name="helm.benchmark.scenarios.audio_language.ultra_suite_asr_classification_scenario.UltraSuiteASRClassificationScenario",  # noqa: E501
     )
     adapter_spec = _get_generation_adapter_spec(
         instructions="""You are a highly experienced Speech-Language Pathologist (SLP). An audio recording is provided to you, typically consisting of a speech prompt from a pathologist followed by a child's repetition. Based on your expertise transcribe the child's speech into text. Do not make any assumptions about the words the child is expected to say. Only transcribe based on the words that the child actually says. Only respond with the text transcription, no other text or commentary.""",  # noqa: E501

@@ -133,7 +129,7 @@ def get_ultra_suite_asr_classification_run_spec() -> RunSpec:
 @run_spec_function("ultra_suite_asr_transcription")
 def get_ultra_suite_asr_transcription_run_spec() -> RunSpec:
     scenario_spec = ScenarioSpec(
-        class_name="helm.benchmark.scenarios.audio_language.
+        class_name="helm.benchmark.scenarios.audio_language.ultra_suite_asr_transcription_scenario.UltraSuiteASRTranscriptionScenario",  # noqa: E501
     )
     adapter_spec = _get_generation_adapter_spec(
         instructions="""You are a highly experienced Speech-Language Pathologist (SLP). An audio recording will be provided, typically consisting of a speech prompt from a pathologist followed by a child's repetition. Based on your expertise transcribe the child's speech into text. Try to understand what the child is expected to say. And only respond with the transcription of the child's speech. Not the pathologist's prompt or any other commentary. Only respond with the text transcription, no other text, commentary or punctuations.""",  # noqa: E501

@@ -155,9 +151,7 @@ def get_ultra_suite_disorder_symptoms_run_spec() -> RunSpec:
     scenario_spec = ScenarioSpec(
         class_name="helm.benchmark.scenarios.audio_language.ultra_suite_disorder_symptoms_scenario.UltraSuiteDisorderSymptomsScenario",  # noqa: E501
     )
-    adapter_spec: AdapterSpec = _get_multiple_choice_joint_adapter_spec(
-        input_noun=None, output_noun="Answer", max_train_instances=0
-    )
+    adapter_spec: AdapterSpec = _get_multiple_choice_joint_adapter_spec(input_noun=None, output_noun="Answer")
     metric_specs: List[MetricSpec] = audio_classification_metric_specs()
     run_spec_name: str = "ultra_suite_disorder_symptoms"
     return RunSpec(
helm/benchmark/scenarios/alghafa_scenario.py

@@ -0,0 +1,126 @@
+import os
+from typing import Dict, List
+
+import datasets
+
+from helm.common.general import ensure_directory_exists
+from helm.benchmark.scenarios.scenario import (
+    Scenario,
+    Instance,
+    Reference,
+    TEST_SPLIT,
+    TRAIN_SPLIT,
+    CORRECT_TAG,
+    Input,
+    Output,
+)
+
+
+class AlGhafaScenario(Scenario):
+    """AlGhafa Evaluation Benchmark for Arabic Language Models
+
+    EXPERIMENTAL: This scenario may have future reverse incompatible changes.
+
+    Multiple-choice evaluation benchmark for zero- and few-shot evaluation of Arabic LLMs,
+    consisting of
+
+    - https://huggingface.co/datasets/OALL/AlGhafa-Arabic-LLM-Benchmark-Native/
+    - https://aclanthology.org/2023.arabicnlp-1.21/
+
+    Citation:
+
+    ```
+    @inproceedings{almazrouei-etal-2023-alghafa,
+        title = "{A}l{G}hafa Evaluation Benchmark for {A}rabic Language Models",
+        author = "Almazrouei, Ebtesam and
+          Cojocaru, Ruxandra and
+          Baldo, Michele and
+          Malartic, Quentin and
+          Alobeidli, Hamza and
+          Mazzotta, Daniele and
+          Penedo, Guilherme and
+          Campesan, Giulia and
+          Farooq, Mugariya and
+          Alhammadi, Maitha and
+          Launay, Julien and
+          Noune, Badreddine",
+        editor = "Sawaf, Hassan and
+          El-Beltagy, Samhaa and
+          Zaghouani, Wajdi and
+          Magdy, Walid and
+          Abdelali, Ahmed and
+          Tomeh, Nadi and
+          Abu Farha, Ibrahim and
+          Habash, Nizar and
+          Khalifa, Salam and
+          Keleg, Amr and
+          Haddad, Hatem and
+          Zitouni, Imed and
+          Mrini, Khalil and
+          Almatham, Rawan",
+        booktitle = "Proceedings of ArabicNLP 2023",
+        month = dec,
+        year = "2023",
+        address = "Singapore (Hybrid)",
+        publisher = "Association for Computational Linguistics",
+        url = "https://aclanthology.org/2023.arabicnlp-1.21/",
+        doi = "10.18653/v1/2023.arabicnlp-1.21",
+        pages = "244--275",
+        abstract = "Recent advances in the space of Arabic large language models have opened up a wealth of potential practical applications. From optimal training strategies, large scale data acquisition and continuously increasing NLP resources, the Arabic LLM landscape has improved in a very short span of time, despite being plagued by training data scarcity and limited evaluation resources compared to English. In line with contributing towards this ever-growing field, we introduce AlGhafa, a new multiple-choice evaluation benchmark for Arabic LLMs. For showcasing purposes, we train a new suite of models, including a 14 billion parameter model, the largest monolingual Arabic decoder-only model to date. We use a collection of publicly available datasets, as well as a newly introduced HandMade dataset consisting of 8 billion tokens. Finally, we explore the quantitative and qualitative toxicity of several Arabic models, comparing our models to existing public Arabic LLMs."
+    }
+    ```
+    """  # noqa: E501
+
+    name = "alghafa"
+    description = "AlGhafa"
+    tags = ["multiple choice"]
+
+    HF_SPLIT_TO_HELM_SPLIT = {"validation": TRAIN_SPLIT, "test": TEST_SPLIT}
+    REFERENCE_PREFIX = "sol"
+
+    def __init__(self, subset: str):
+        super().__init__()
+        self.subset = subset
+
+    def get_instances(self, output_path: str) -> List[Instance]:
+        cache_dir = os.path.join(output_path, "data")
+        ensure_directory_exists(cache_dir)
+        dataset_splits: Dict[str, datasets.Dataset] = datasets.load_dataset(
+            "OALL/AlGhafa-Arabic-LLM-Benchmark-Native",
+            self.subset,
+            revision="a31ebd34ca311d7e0cfc6ad7f458b3435af280f5",
+            cache_dir=cache_dir,
+        )
+
+        # Read all instances
+        instances: List[Instance] = []
+        for split_name, dataset in dataset_splits.items():
+            assert isinstance(dataset, datasets.Dataset)
+            option_indexes = [
+                int(s.removeprefix(self.REFERENCE_PREFIX))
+                for s in dataset[0].keys()
+                if s.startswith(self.REFERENCE_PREFIX)
+            ]
+            for row_index, row in enumerate(dataset):
+                input = Input(text=row["query"])
+                references: List[Reference] = []
+                # Need to add 1 because label is zero-indexed and has a value from 0 to (N - 1),
+                # but column names are 1 indexed and have values from "sol1" to "solN"
+                correct_option_index = int(row["label"]) + 1
+                for option_index in option_indexes:
+                    column_name = f"{self.REFERENCE_PREFIX}{option_index}"
+                    references.append(
+                        Reference(
+                            output=Output(text=row[column_name]),
+                            tags=[CORRECT_TAG] if option_index == correct_option_index else [],
+                        )
+                    )
+                instance = Instance(
+                    id=f"id{row_index}_{split_name}",
+                    input=input,
+                    references=references,
+                    split=self.HF_SPLIT_TO_HELM_SPLIT[split_name],
+                )
+                instances.append(instance)
+
+        return instances
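A toy illustration (not part of the diff) of the zero-indexed `label` to one-indexed `solN` column mapping used above; the row values are invented.

```python
# Toy example of the AlGhafa label -> answer-column mapping:
# "label" runs 0..N-1, while the answer columns are named "sol1".."solN".
row = {"query": "...", "sol1": "option one", "sol2": "option two", "sol3": "option three", "label": 1}

correct_option_index = int(row["label"]) + 1  # 2 -> the gold answer is in column "sol2"
for column_name in ("sol1", "sol2", "sol3"):
    option_index = int(column_name.removeprefix("sol"))
    marker = "CORRECT" if option_index == correct_option_index else ""
    print(column_name, row[column_name], marker)
```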
helm/benchmark/scenarios/arabic_mmlu_scenario.py

@@ -0,0 +1,78 @@
+import os
+from typing import Dict, List
+
+import datasets
+
+from helm.common.general import ensure_directory_exists
+from helm.benchmark.scenarios.scenario import (
+    Scenario,
+    Instance,
+    Reference,
+    TEST_SPLIT,
+    TRAIN_SPLIT,
+    CORRECT_TAG,
+    Input,
+    Output,
+)
+
+
+class ArabicMMLUScenario(Scenario):
+    """ArabicMMLU
+
+    EXPERIMENTAL: This scenario may have future reverse incompatible changes.
+
+    ArabicMMLU is the first multi-task language understanding benchmark
+    for Arabic language, sourced from school exams across diverse educational
+    levels in different countries spanning North Africa, the Levant, and the
+    Gulf regions. The data comprises 40 tasks and 14,575 multiple-choice questions
+    in Modern Standard Arabic (MSA), and is carefully constructed by collaborating
+    with native speakers in the region.
+
+    - https://huggingface.co/datasets/MBZUAI/ArabicMMLU
+    - https://aclanthology.org/2024.findings-acl.334/
+    """
+
+    name = "arabic_mmlu"
+    description = "Arabic Massive Multitask Language Understanding"
+    tags = ["knowledge", "multiple_choice"]
+
+    OPTIONS = ["A", "B", "C", "D"]
+    HF_SPLIT_TO_HELM_SPLIT = {"dev": TRAIN_SPLIT, "test": TEST_SPLIT}
+
+    def get_instances(self, output_path: str) -> List[Instance]:
+        cache_dir = os.path.join(output_path, "data")
+        ensure_directory_exists(cache_dir)
+        dataset_splits: Dict[str, datasets.Dataset] = datasets.load_dataset(
+            "MBZUAI/ArabicMMLU",
+            "All",
+            revision="7aa530e2893ac420352b3f5c1a1310c010e9758b",
+            cache_dir=cache_dir,
+        )
+
+        # Read all instances
+        instances: List[Instance] = []
+        for split_name, dataset in dataset_splits.items():
+            assert isinstance(dataset, datasets.Dataset)
+            for row_index, row in enumerate(dataset):
+                input = Input(text=row["Question"])
+                references: List[Reference] = []
+                correct_option_index = ord(row["Answer Key"]) - ord("A") + 1
+                for option_index in range(1, 6):
+                    column_name = f"Option {option_index}"
+                    if not row[column_name]:
+                        continue
+                    references.append(
+                        Reference(
+                            output=Output(text=row[column_name]),
+                            tags=[CORRECT_TAG] if option_index == correct_option_index else [],
+                        )
+                    )
+                instance = Instance(
+                    id=f"id{row_index}",
+                    input=input,
+                    references=references,
+                    split=self.HF_SPLIT_TO_HELM_SPLIT[split_name],
+                )
+                instances.append(instance)
+
+        return instances
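A quick check (not part of the diff) of the `Answer Key` letter to `Option N` column mapping used above.

```python
# The answer key is a letter ("A".."E") and the option columns are named "Option 1".."Option 5".
answer_key = "C"  # example value
correct_option_index = ord(answer_key) - ord("A") + 1
print(f"Option {correct_option_index}")  # Option 3 -> the column holding the gold answer
```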
helm/benchmark/scenarios/aratrust_scenario.py

@@ -0,0 +1,76 @@
+import os
+from typing import List
+
+import datasets
+
+from helm.common.general import ensure_directory_exists
+from helm.benchmark.scenarios.scenario import (
+    Scenario,
+    Instance,
+    Reference,
+    TEST_SPLIT,
+    CORRECT_TAG,
+    Input,
+    Output,
+)
+
+
+class AraTrustScenario(Scenario):
+    """AraTrust: An Evaluation of Trustworthiness for LLMs in Arabic
+
+    EXPERIMENTAL: This scenario may have future reverse incompatible changes.
+
+    AraTrust is a comprehensive Trustworthiness benchmark for LLMs in Arabic.
+    AraTrust comprises 522 human-written multiple-choice questions addressing
+    diverse dimensions related to truthfulness, ethics, safety, physical health,
+    mental health, unfairness, illegal activities, privacy, and offensive language.
+
+    - https://huggingface.co/datasets/asas-ai/AraTrust
+    - https://arxiv.org/abs/2403.09017
+
+    Citation:
+
+    ```
+    @misc{alghamdi2024aratrustevaluationtrustworthinessllms,
+        title={AraTrust: An Evaluation of Trustworthiness for LLMs in Arabic},
+        author={Emad A. Alghamdi and Reem I. Masoud and Deema Alnuhait and Afnan Y. Alomairi and Ahmed Ashraf and Mohamed Zaytoon},
+        year={2024},
+        eprint={2403.09017},
+        archivePrefix={arXiv},
+        primaryClass={cs.CL},
+        url={https://arxiv.org/abs/2403.09017},
+    }
+    ```
+    """  # noqa: E501
+
+    name = "aratrust"
+    description = "aratrust"
+    tags = ["trustworthiness"]
+
+    OPTION_KEYS = ["A", "B", "C"]
+
+    def get_instances(self, output_path: str) -> List[Instance]:
+        cache_dir = os.path.join(output_path, "data")
+        ensure_directory_exists(cache_dir)
+        dataset: datasets.Dataset = datasets.load_dataset(
+            "asas-ai/AraTrust",
+            revision="d4dd124ed5b90aeb65a7dda7d88e34fb464a31ec",
+            cache_dir=cache_dir,
+            split="test",
+        )
+        instances: List[Instance] = []
+        for row_index, row in enumerate(dataset):
+            question_text = row["Question"]
+            option_texts = [row[option_key] for option_key in self.OPTION_KEYS if row[option_key]]
+            joined_option_texts = "\n".join(option_texts)
+            input = Input(text=f"{question_text}\n\n{joined_option_texts}\n")
+            references = [Reference(output=Output(text=row["Answer"]), tags=[CORRECT_TAG])]
+            instance = Instance(
+                id=f"id{row_index}",
+                input=input,
+                references=references,
+                split=TEST_SPLIT,
+            )
+            instances.append(instance)
+
+        return instances
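A small sketch (not part of the diff) of how an AraTrust instance input is assembled from the Question plus the non-empty A/B/C option columns; the row values are invented.

```python
# Sketch of the input text built by AraTrustScenario.get_instances for one row.
row = {"Question": "Example question?", "A": "Choice A", "B": "Choice B", "C": "", "Answer": "A"}

option_texts = [row[key] for key in ["A", "B", "C"] if row[key]]  # drop empty option columns
input_text = f"{row['Question']}\n\n" + "\n".join(option_texts) + "\n"
print(input_text)             # question followed by the two non-empty choices
print("gold:", row["Answer"])  # the single correct reference
```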
helm/benchmark/scenarios/audio_language/casual_conversations2_scenario.py

@@ -19,7 +19,7 @@ from helm.common.audio_utils import extract_audio


 class CasualConversations2Scenario(Scenario):
-    """
+    r"""
     Casual Conversation v2 (Porgali et al, 2023) is composed of over 5,567 participants (26,467 videos).
     The videos feature paid individuals who agreed to participate in the project and explicitly provided
     Age, Gender, Language/Dialect, Geo-location, Disability, Physical adornments, Physical attributes labels
helm/benchmark/scenarios/audio_language/mustard_scenario.py

@@ -19,7 +19,7 @@ from helm.common.general import ensure_directory_exists, ensure_file_downloaded


 class MUStARDScenario(Scenario):
-    """
+    r"""
     MUStARD: Multimodal Sarcasm Detection Dataset

     A multimodal video corpus for research in automated sarcasm discovery. The dataset is compiled from popular
helm/benchmark/scenarios/audio_language/ultra_suite_asr_classification_scenario.py (renamed from ultra_suite_asr_classification.py)

@@ -14,7 +14,7 @@ from helm.benchmark.scenarios.scenario import (
     Output,
 )
 from helm.common.media_object import MediaObject, MultimediaObject
-from
+from huggingface_hub import snapshot_download


 def find_audio_json_pairs(directory: str) -> List[Tuple[str, str]]:

@@ -58,7 +58,6 @@ class UltraSuiteASRClassificationScenario(Scenario):
     name = "speech_disorder"
     description = "A scenario for evaluating speech disorders in children"
     tags = ["audio", "classification", "speech_disorder", "asr"]
-    HF_MAPPING_URL = "https://https://huggingface.co/datasets/SAA-Lab/SLPHelmUltraSuite"

     # Classification options
     options: List[str] = ["Healthy", "Unhealthy"]

@@ -70,14 +69,18 @@ class UltraSuiteASRClassificationScenario(Scenario):
         - Audio files (e.g., .mp3)
         - A JSON file with annotations containing 'answer' field
         """
-        print(
-
+        print("Downloading SAA-Lab/SLPHelmManualLabels dataset...")
+        data_path = snapshot_download(
+            repo_id="SAA-Lab/SLPHelmManualLabels",
+            repo_type="dataset",
+            revision="38c2d7dab831acf8ccff0ca6f6463d6a8a0184ed",
+        )

         instances: List[Instance] = []
         split: str = TEST_SPLIT

         # Find all pairs of audio and JSON files
-        pairs = find_audio_json_pairs(
+        pairs = find_audio_json_pairs(data_path)

         for audio_path, json_path in tqdm(pairs):


@@ -88,9 +91,7 @@ class UltraSuiteASRClassificationScenario(Scenario):
             # Get the correct answer and convert to label
             answer = annotation["disorder_class"]
             # Create references for each option
-            references: List[Reference] = []
-            reference = Reference(Output(text=answer), tags=[CORRECT_TAG])
-            references.append(reference)
+            references: List[Reference] = [Reference(Output(text=answer), tags=[CORRECT_TAG])]

             # Create the input with audio and instruction
             content = [