crfm-helm 0.5.6__py3-none-any.whl → 0.5.8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of crfm-helm might be problematic.
- {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.8.dist-info}/METADATA +60 -125
- {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.8.dist-info}/RECORD +293 -229
- helm/benchmark/adaptation/adapter_spec.py +5 -0
- helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +11 -3
- helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +11 -8
- helm/benchmark/annotation/aci_bench_annotator.py +11 -22
- helm/benchmark/annotation/air_bench_annotator.py +1 -1
- helm/benchmark/annotation/alrage_annotator.py +90 -0
- helm/benchmark/annotation/chw_care_plan_annotator.py +10 -21
- helm/benchmark/annotation/dischargeme_annotator.py +11 -22
- helm/benchmark/annotation/live_qa_annotator.py +1 -1
- helm/benchmark/annotation/med_dialog_annotator.py +11 -22
- helm/benchmark/annotation/medalign_annotator.py +11 -22
- helm/benchmark/annotation/medi_qa_annotator.py +11 -22
- helm/benchmark/annotation/medication_qa_annotator.py +11 -22
- helm/benchmark/annotation/mental_health_annotator.py +11 -22
- helm/benchmark/annotation/mimic_bhc_annotator.py +11 -22
- helm/benchmark/annotation/mimic_rrs_annotator.py +11 -22
- helm/benchmark/annotation/model_as_judge.py +23 -18
- helm/benchmark/annotation/mtsamples_procedures_annotator.py +11 -22
- helm/benchmark/annotation/mtsamples_replicate_annotator.py +11 -22
- helm/benchmark/annotation/starr_patient_instructions_annotator.py +11 -22
- helm/benchmark/metrics/air_bench_metrics.py +3157 -1
- helm/benchmark/metrics/alrage_metric.py +35 -0
- helm/benchmark/metrics/basic_metrics.py +267 -2
- helm/benchmark/metrics/classification_metrics.py +19 -1
- helm/benchmark/metrics/codeinsights_code_efficiency_metrics.py +186 -0
- helm/benchmark/metrics/codeinsights_code_evaluation_metrics.py +477 -0
- helm/benchmark/metrics/codeinsights_correct_code_metrics.py +366 -0
- helm/benchmark/metrics/codeinsights_edge_case_metrics.py +92 -0
- helm/benchmark/metrics/codeinsights_metric_specs.py +51 -0
- helm/benchmark/metrics/comet_metric.py +1 -1
- helm/benchmark/metrics/conv_fin_qa_calc_metrics.py +12 -1
- helm/benchmark/metrics/copyright_metrics.py +1 -1
- helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +1 -1
- helm/benchmark/metrics/dry_run_metrics.py +30 -1
- helm/benchmark/metrics/efficiency_metrics.py +74 -0
- helm/benchmark/metrics/ehr_sql_metrics.py +57 -1
- helm/benchmark/metrics/evaluate_reference_metrics.py +300 -1
- helm/benchmark/metrics/gpqa_chain_of_thought_metric.py +13 -1
- helm/benchmark/metrics/helpdesk_call_summarization_metrics.py +13 -1
- helm/benchmark/metrics/ifeval_metrics.py +13 -1
- helm/benchmark/metrics/image_generation/clip_score_metrics.py +13 -2
- helm/benchmark/metrics/image_generation/fractal_dimension/fractal_dimension_util.py +1 -1
- helm/benchmark/metrics/instruction_following_critique_metrics.py +41 -1
- helm/benchmark/metrics/kpi_edgar_metrics.py +21 -0
- helm/benchmark/metrics/language_modeling_metrics.py +13 -1
- helm/benchmark/metrics/live_qa_metrics.py +13 -1
- helm/benchmark/metrics/llm_jury_metrics.py +13 -1
- helm/benchmark/metrics/lmkt_metric_specs.py +12 -0
- helm/benchmark/metrics/lmkt_metrics.py +47 -0
- helm/benchmark/metrics/medcalc_bench_metrics.py +14 -1
- helm/benchmark/metrics/medec_metrics.py +25 -2
- helm/benchmark/metrics/melt_toxicity_metric.py +1 -1
- helm/benchmark/metrics/metric.py +25 -0
- helm/benchmark/metrics/mimiciv_billing_code_metrics.py +32 -1
- helm/benchmark/metrics/omni_math_metrics.py +13 -1
- helm/benchmark/metrics/seahelm_metrics.py +14 -1
- helm/benchmark/metrics/summac/model_summac.py +3 -3
- helm/benchmark/metrics/summarization_metrics.py +129 -1
- helm/benchmark/metrics/toxicity_metrics.py +31 -1
- helm/benchmark/metrics/wildbench_metrics.py +21 -1
- helm/benchmark/model_deployment_registry.py +11 -19
- helm/benchmark/presentation/create_plots.py +11 -2
- helm/benchmark/presentation/schema.py +10 -22
- helm/benchmark/presentation/summarize.py +189 -14
- helm/benchmark/presentation/taxonomy_info.py +20 -0
- helm/benchmark/presentation/test_create_plots.py +4 -1
- helm/benchmark/run.py +7 -1
- helm/benchmark/run_expander.py +4 -0
- helm/benchmark/run_specs/arabic_run_specs.py +191 -0
- helm/benchmark/run_specs/bluex_run_specs.py +40 -0
- helm/benchmark/run_specs/classic_run_specs.py +2 -55
- helm/benchmark/run_specs/codeinsights_run_specs.py +192 -0
- helm/benchmark/run_specs/healthqa_br_run_specs.py +40 -0
- helm/benchmark/run_specs/heim_run_specs.py +3 -1
- helm/benchmark/run_specs/lmkt_run_specs.py +144 -0
- helm/benchmark/run_specs/long_context_run_specs.py +48 -1
- helm/benchmark/run_specs/medhelm/__init__.py +0 -0
- helm/benchmark/run_specs/medhelm/benchmark_config.py +219 -0
- helm/benchmark/run_specs/medhelm_run_specs.py +360 -50
- helm/benchmark/run_specs/multilingual_run_specs.py +50 -0
- helm/benchmark/run_specs/speech_disorder_audio_run_specs.py +5 -11
- helm/benchmark/scenarios/aci_bench_scenario.py +23 -0
- helm/benchmark/scenarios/air_bench_scenario.py +21 -0
- helm/benchmark/scenarios/alghafa_scenario.py +126 -0
- helm/benchmark/scenarios/alrage_scenario.py +54 -0
- helm/benchmark/scenarios/anthropic_hh_rlhf_scenario.py +23 -1
- helm/benchmark/scenarios/arabic_exams_scenario.py +114 -0
- helm/benchmark/scenarios/arabic_mmlu_scenario.py +82 -0
- helm/benchmark/scenarios/aratrust_scenario.py +95 -0
- helm/benchmark/scenarios/audio_language/casual_conversations2_scenario.py +1 -1
- helm/benchmark/scenarios/audio_language/mustard_scenario.py +1 -1
- helm/benchmark/scenarios/audio_language/{ultra_suite_asr_classification.py → ultra_suite_asr_classification_scenario.py} +9 -8
- helm/benchmark/scenarios/audio_language/ultra_suite_asr_transcription_scenario.py +99 -0
- helm/benchmark/scenarios/audio_language/ultra_suite_classification_scenario.py +13 -5
- helm/benchmark/scenarios/audio_language/ultra_suite_disorder_breakdown_scenario.py +13 -5
- helm/benchmark/scenarios/audio_language/ultra_suite_disorder_symptoms_scenario.py +13 -5
- helm/benchmark/scenarios/babi_qa_scenario.py +15 -0
- helm/benchmark/scenarios/bbq_scenario.py +15 -0
- helm/benchmark/scenarios/best_chatgpt_prompts.yaml +473 -0
- helm/benchmark/scenarios/bluex_scenario.py +70 -0
- helm/benchmark/scenarios/bold_scenario.py +15 -0
- helm/benchmark/scenarios/boolq_scenario.py +20 -0
- helm/benchmark/scenarios/chw_care_plan_scenario.py +23 -0
- helm/benchmark/scenarios/civil_comments_scenario.py +13 -0
- helm/benchmark/scenarios/clear_scenario.py +23 -0
- helm/benchmark/scenarios/cleva_scenario.py +480 -1
- helm/benchmark/scenarios/code_scenario.py +28 -0
- helm/benchmark/scenarios/codeinsights_code_efficiency_scenario.py +197 -0
- helm/benchmark/scenarios/codeinsights_correct_code_scenario.py +78 -0
- helm/benchmark/scenarios/codeinsights_edge_case_scenario.py +192 -0
- helm/benchmark/scenarios/codeinsights_student_coding_scenario.py +162 -0
- helm/benchmark/scenarios/codeinsights_student_mistake_scenario.py +188 -0
- helm/benchmark/scenarios/commonsense_scenario.py +26 -0
- helm/benchmark/scenarios/compositional_instructions.yaml +70 -0
- helm/benchmark/scenarios/conv_fin_qa_calc_scenario.py +21 -0
- helm/benchmark/scenarios/copyright_scenario.py +35 -1
- helm/benchmark/scenarios/cti_to_mitre_scenario.py +21 -0
- helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +22 -1
- helm/benchmark/scenarios/decodingtrust_adv_robustness_scenario.py +23 -1
- helm/benchmark/scenarios/decodingtrust_fairness_scenario.py +22 -1
- helm/benchmark/scenarios/decodingtrust_machine_ethics_scenario.py +21 -1
- helm/benchmark/scenarios/decodingtrust_ood_robustness_scenario.py +13 -0
- helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +13 -1
- helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +13 -1
- helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +13 -1
- helm/benchmark/scenarios/dischargeme_scenario.py +24 -0
- helm/benchmark/scenarios/disinformation_scenario.py +22 -0
- helm/benchmark/scenarios/dyck_language_scenario.py +15 -0
- helm/benchmark/scenarios/ehrshot_scenario.py +22 -0
- helm/benchmark/scenarios/enem_challenge_scenario.py +19 -0
- helm/benchmark/scenarios/entity_data_imputation_scenario.py +14 -0
- helm/benchmark/scenarios/entity_matching_scenario.py +14 -0
- helm/benchmark/scenarios/exams_multilingual_scenario.py +115 -0
- helm/benchmark/scenarios/financial_phrasebank_scenario.py +21 -0
- helm/benchmark/scenarios/gold_commodity_news_scenario.py +21 -0
- helm/benchmark/scenarios/gpqa_scenario.py +18 -0
- helm/benchmark/scenarios/grammar_scenario.py +20 -1
- helm/benchmark/scenarios/gsm_scenario.py +15 -0
- helm/benchmark/scenarios/headqa_scenario.py +22 -0
- helm/benchmark/scenarios/healthqa_br_scenario.py +80 -0
- helm/benchmark/scenarios/helpdesk_call_summarization_scenario.py +13 -0
- helm/benchmark/scenarios/ice_scenario.py +21 -1
- helm/benchmark/scenarios/ifeval_scenario.py +18 -0
- helm/benchmark/scenarios/imdb_scenario.py +15 -0
- helm/benchmark/scenarios/infinite_bench_en_mc_scenario.py +90 -0
- helm/benchmark/scenarios/infinite_bench_en_qa_scenario.py +1 -1
- helm/benchmark/scenarios/koala_scenario.py +21 -1
- helm/benchmark/scenarios/kpi_edgar_scenario.py +21 -0
- helm/benchmark/scenarios/legal_contract_summarization_scenario.py +20 -0
- helm/benchmark/scenarios/legal_summarization_scenario.py +50 -0
- helm/benchmark/scenarios/legal_support_scenario.py +13 -0
- helm/benchmark/scenarios/legalbench_scenario.py +20 -0
- helm/benchmark/scenarios/lex_glue_scenario.py +11 -0
- helm/benchmark/scenarios/lextreme_scenario.py +11 -0
- helm/benchmark/scenarios/lmkt_scenarios.py +288 -0
- helm/benchmark/scenarios/lsat_qa_scenario.py +14 -0
- helm/benchmark/scenarios/madinah_qa_scenario.py +73 -0
- helm/benchmark/scenarios/math_scenario.py +47 -20
- helm/benchmark/scenarios/mbzuai_human_translated_arabic_mmlu.py +68 -0
- helm/benchmark/scenarios/med_dialog_scenario.py +32 -1
- helm/benchmark/scenarios/med_mcqa_scenario.py +14 -0
- helm/benchmark/scenarios/med_qa_scenario.py +14 -0
- helm/benchmark/scenarios/medalign_scenario.py +23 -0
- helm/benchmark/scenarios/medalign_scenario_helper.py +19 -125
- helm/benchmark/scenarios/medbullets_scenario.py +22 -0
- helm/benchmark/scenarios/medcalc_bench_scenario.py +22 -0
- helm/benchmark/scenarios/medec_scenario.py +23 -0
- helm/benchmark/scenarios/medhallu_scenario.py +23 -0
- helm/benchmark/scenarios/medhelm/__init__.py +0 -0
- helm/benchmark/scenarios/medhelm/judges.yaml +14 -0
- helm/benchmark/scenarios/medhelm_configurable_scenario.py +101 -0
- helm/benchmark/scenarios/medi_qa_scenario.py +23 -0
- helm/benchmark/scenarios/medication_qa_scenario.py +31 -1
- helm/benchmark/scenarios/melt_scenarios.py +2 -2
- helm/benchmark/scenarios/mental_health_scenario.py +23 -0
- helm/benchmark/scenarios/mimic_bhc_scenario.py +25 -1
- helm/benchmark/scenarios/mimic_rrs_scenario.py +23 -0
- helm/benchmark/scenarios/mimiciv_billing_code_scenario.py +22 -0
- helm/benchmark/scenarios/mmlu_pro_scenario.py +18 -0
- helm/benchmark/scenarios/mmlu_scenario.py +15 -0
- helm/benchmark/scenarios/mmmlu_scenario.py +85 -0
- helm/benchmark/scenarios/msmarco_scenario.py +30 -0
- helm/benchmark/scenarios/mtsamples_procedures_scenario.py +22 -0
- helm/benchmark/scenarios/mtsamples_replicate_scenario.py +22 -0
- helm/benchmark/scenarios/n2c2_ct_matching_scenario.py +20 -0
- helm/benchmark/scenarios/narrativeqa_scenario.py +20 -0
- helm/benchmark/scenarios/natural_qa_scenario.py +32 -0
- helm/benchmark/scenarios/omni_math_scenario.py +18 -0
- helm/benchmark/scenarios/open_assistant_scenario.py +22 -0
- helm/benchmark/scenarios/pubmed_qa_scenario.py +22 -0
- helm/benchmark/scenarios/quac_scenario.py +14 -0
- helm/benchmark/scenarios/race_based_med_scenario.py +23 -0
- helm/benchmark/scenarios/raft_scenario.py +15 -0
- helm/benchmark/scenarios/real_toxicity_prompts_scenario.py +14 -1
- helm/benchmark/scenarios/scenario.py +31 -0
- helm/benchmark/scenarios/seahelm_scenario.py +350 -2
- helm/benchmark/scenarios/self_instruct_scenario.py +29 -1
- helm/benchmark/scenarios/shc_bmt_scenario.py +22 -0
- helm/benchmark/scenarios/shc_cdi_scenario.py +20 -0
- helm/benchmark/scenarios/shc_conf_scenario.py +23 -0
- helm/benchmark/scenarios/shc_ent_scenario.py +21 -0
- helm/benchmark/scenarios/shc_gip_scenario.py +20 -0
- helm/benchmark/scenarios/shc_privacy_scenario.py +22 -0
- helm/benchmark/scenarios/shc_proxy_scenario.py +22 -0
- helm/benchmark/scenarios/shc_ptbm_scenario.py +23 -0
- helm/benchmark/scenarios/shc_sequoia_scenario.py +21 -0
- helm/benchmark/scenarios/situation_prompts.yaml +49 -0
- helm/benchmark/scenarios/starr_patient_instructions_scenario.py +22 -0
- helm/benchmark/scenarios/summarization_scenario.py +37 -0
- helm/benchmark/scenarios/synthetic_efficiency_scenario.py +22 -1
- helm/benchmark/scenarios/synthetic_reasoning_natural_scenario.py +13 -0
- helm/benchmark/scenarios/test_alghafa_scenario.py +29 -0
- helm/benchmark/scenarios/test_alrage_scenario.py +23 -0
- helm/benchmark/scenarios/test_arabic_exams_scenario.py +21 -0
- helm/benchmark/scenarios/test_aratrust_scenario.py +21 -0
- helm/benchmark/scenarios/test_bluex_scenario.py +59 -0
- helm/benchmark/scenarios/test_exams_multilingual_scenario.py +29 -0
- helm/benchmark/scenarios/test_healtha_br_scenario.py +57 -0
- helm/benchmark/scenarios/the_pile_scenario.py +13 -1
- helm/benchmark/scenarios/truthful_qa_scenario.py +14 -0
- helm/benchmark/scenarios/twitter_aae_scenario.py +20 -1
- helm/benchmark/scenarios/vicuna_scenario.py +21 -1
- helm/benchmark/scenarios/wikifact_scenario.py +20 -0
- helm/benchmark/scenarios/wildbench_scenario.py +18 -0
- helm/benchmark/scenarios/wmt_14_scenario.py +12 -0
- helm/benchmark/slurm_jobs.py +1 -2
- helm/benchmark/slurm_runner.py +8 -1
- helm/benchmark/static/schema_arabic.yaml +271 -0
- helm/benchmark/static/schema_classic.yaml +0 -17
- helm/benchmark/static/schema_long_context.yaml +24 -6
- helm/benchmark/static/schema_medhelm.yaml +36 -0
- helm/benchmark/static/schema_slp.yaml +219 -0
- helm/benchmark/static_build/assets/index-671a5e06.js +10 -0
- helm/benchmark/static_build/assets/index-9352595e.css +1 -0
- helm/benchmark/static_build/index.html +2 -2
- helm/benchmark/window_services/image_generation/clip_window_service.py +1 -3
- helm/clients/audio_language/llama_omni/arguments.py +61 -0
- helm/clients/audio_language/llama_omni/constants.py +9 -0
- helm/clients/audio_language/llama_omni/conversation.py +213 -0
- helm/clients/audio_language/llama_omni/model/__init__.py +0 -0
- helm/clients/audio_language/llama_omni/model/builder.py +88 -0
- helm/clients/audio_language/llama_omni/model/language_model/omni_speech2s_llama.py +190 -0
- helm/clients/audio_language/llama_omni/model/language_model/omni_speech_llama.py +118 -0
- helm/clients/audio_language/llama_omni/model/omni_speech_arch.py +249 -0
- helm/clients/audio_language/llama_omni/model/speech_encoder/builder.py +9 -0
- helm/clients/audio_language/llama_omni/model/speech_encoder/speech_encoder.py +27 -0
- helm/clients/audio_language/llama_omni/model/speech_generator/builder.py +9 -0
- helm/clients/audio_language/llama_omni/model/speech_generator/generation.py +622 -0
- helm/clients/audio_language/llama_omni/model/speech_generator/speech_generator.py +104 -0
- helm/clients/audio_language/llama_omni/model/speech_projector/builder.py +9 -0
- helm/clients/audio_language/llama_omni/model/speech_projector/speech_projector.py +27 -0
- helm/clients/audio_language/llama_omni/preprocess.py +295 -0
- helm/clients/audio_language/llama_omni/utils.py +202 -0
- helm/clients/audio_language/qwen2_5_omni_client.py +19 -7
- helm/clients/audio_language/qwen_omni/configuration_qwen2_5_omni.py +519 -0
- helm/clients/audio_language/qwen_omni/modeling_qwen2_5_omni.py +4308 -0
- helm/clients/audio_language/qwen_omni/processing_qwen2_5_omni.py +270 -0
- helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/__init__.py +0 -0
- helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/__init__.py +8 -0
- helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/audio_process.py +56 -0
- helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/vision_process.py +380 -0
- helm/clients/huggingface_client.py +2 -2
- helm/clients/image_generation/cogview2/sr_pipeline/dsr_model.py +1 -1
- helm/clients/image_generation/mindalle/models/stage1/layers.py +2 -2
- helm/clients/openai_client.py +33 -20
- helm/clients/openai_responses_client.py +34 -8
- helm/clients/openrouter_client.py +31 -0
- helm/clients/test_huggingface_client.py +3 -3
- helm/clients/test_openrouter_client.py +69 -0
- helm/clients/together_client.py +48 -13
- helm/clients/vertexai_client.py +19 -11
- helm/clients/vllm_client.py +43 -7
- helm/clients/vllm_granite_thinking_client.py +56 -0
- helm/common/critique_request.py +0 -1
- helm/common/hierarchical_logger.py +83 -34
- helm/common/object_spec.py +23 -8
- helm/common/test_logging.py +94 -0
- helm/config/model_deployments.yaml +525 -172
- helm/config/model_metadata.yaml +185 -10
- helm/config/tokenizer_configs.yaml +100 -2
- helm/proxy/cli.py +1 -1
- helm/proxy/example_queries.py +8 -8
- helm/proxy/retry.py +5 -0
- helm/proxy/server.py +2 -1
- helm/proxy/static/index.css +4 -0
- helm/proxy/static/index.js +7 -1
- helm/tokenizers/grok_tokenizer.py +2 -0
- helm/benchmark/metrics/aci_bench_metrics.py +0 -14
- helm/benchmark/metrics/chw_care_plan_metrics.py +0 -14
- helm/benchmark/metrics/dischargeme_metrics.py +0 -14
- helm/benchmark/metrics/med_dialog_metrics.py +0 -14
- helm/benchmark/metrics/medalign_metrics.py +0 -14
- helm/benchmark/metrics/medi_qa_metrics.py +0 -14
- helm/benchmark/metrics/medication_qa_metrics.py +0 -14
- helm/benchmark/metrics/mental_health_metrics.py +0 -14
- helm/benchmark/metrics/mimic_bhc_metrics.py +0 -14
- helm/benchmark/metrics/mimic_rrs_metrics.py +0 -14
- helm/benchmark/metrics/mtsamples_procedures_metrics.py +0 -14
- helm/benchmark/metrics/mtsamples_replicate_metrics.py +0 -14
- helm/benchmark/metrics/numeracy_metrics.py +0 -72
- helm/benchmark/metrics/starr_patient_instructions_metrics.py +0 -14
- helm/benchmark/metrics/test_numeracy_metrics.py +0 -95
- helm/benchmark/scenarios/numeracy_scenario.py +0 -794
- helm/benchmark/static_build/assets/index-94295e78.js +0 -10
- helm/benchmark/static_build/assets/index-b9779128.css +0 -1
- {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.8.dist-info}/WHEEL +0 -0
- {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.8.dist-info}/entry_points.txt +0 -0
- {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.8.dist-info}/licenses/LICENSE +0 -0
- {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.8.dist-info}/top_level.txt +0 -0

helm/benchmark/scenarios/healthqa_br_scenario.py (new file)
@@ -0,0 +1,80 @@
+from typing import Any, List
+import re
+from pathlib import Path
+from datasets import load_dataset
+from helm.benchmark.scenarios.scenario import (
+    Scenario,
+    Instance,
+    Reference,
+    TEST_SPLIT,
+    CORRECT_TAG,
+    Input,
+    Output,
+)
+
+
+class HEALTHQA_BR_Scenario(Scenario):
+    """
+    HealthQA-BR is a large-scale benchmark designed to evaluate the clinical knowledge of Large Language Models (LLMs)
+    within the Brazilian Unified Health System (SUS) context. It comprises 5,632 multiple-choice questions sourced from
+    nationwide licensing exams and residency tests, reflecting real challenges faced by Brazil's public health sector.
+    Unlike benchmarks focused on the U.S. medical landscape, HealthQA-BR targets the Brazilian healthcare ecosystem,
+    covering a wide range of medical specialties and interdisciplinary professions such as nursing, dentistry,
+    psychology, social work, pharmacy, and physiotherapy. This comprehensive approach enables a detailed assessment
+    of AI models’ ability to collaborate effectively in the team-based patient care typical of SUS.
+    """
+
+    name = "healthqa_br"
+    description = "MQA benchmark with questions from Brazilian entrance exams"
+    tags = ["knowledge", "multiple_choice", "pt-br"]
+
+    def get_instances(self, output_path: str) -> List[Instance]:
+        # Download the raw data and read all the dialogues
+        dataset: Any
+        # Read all the instances
+        instances: List[Instance] = []
+        cache_dir = str(Path(output_path) / "data")
+
+        dataset = load_dataset("Larxel/healthqa-br", cache_dir=cache_dir)
+        for example in dataset["train"]:
+            question_choices = example["question"]
+            answer = example["answer"].strip().upper()
+
+            # Separate the question statement from the alternatives
+            question_text, choices_text = self.split_question_and_choices(question_choices)
+
+            # Extract alternatives from text choices_text
+            pattern = r"'([A-Z])':\s*'([^']+)'"
+            matches = re.findall(pattern, choices_text)
+            answers_dict = {label: text for label, text in matches}
+
+            if answer not in answers_dict:
+                continue
+
+            correct_answer_text = answers_dict[answer]
+
+            def answer_to_reference(answer: str) -> Reference:
+                return Reference(Output(text=answer), tags=[CORRECT_TAG] if correct_answer_text == answer else [])
+
+            instance = Instance(
+                input=Input(text=question_text),
+                split=TEST_SPLIT,
+                references=[answer_to_reference(text) for text in answers_dict.values()],
+            )
+            instances.append(instance)
+        return instances
+
+    def split_question_and_choices(self, full_text: str):
+        # Search for the first occurrence of the alternative pattern
+        match = re.search(r"\n'[A-Z]':\s*'.+", full_text)
+        if match:
+            # Everything before the alternatives
+            question_part = full_text[: match.start()].strip()
+            # All of the alternatives (from match to end)
+            choices_part = full_text[match.start() :].strip()
+        else:
+            # If you don't find a pattern, consider everything as a question, and no alternative.
+            question_part = full_text.strip()
+            choices_part = ""
+
+        return question_part, choices_part
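For readers skimming the new scenario above: the choice-extraction regex assumes each dataset row serializes its alternatives as quoted `'A': '...'` pairs appended after the question text. A minimal sketch of how that pattern behaves on a made-up row (the sample string below is hypothetical, not taken from the Larxel/healthqa-br dataset):

```python
import re

# Hypothetical row text in the format the scenario expects: question statement,
# then one "'<letter>': '<choice>'" pair per line.
sample = "Qual conduta inicial é a mais adequada?\n'A': 'Iniciar antibiótico'\n'B': 'Solicitar tomografia'"

pattern = r"'([A-Z])':\s*'([^']+)'"
answers_dict = dict(re.findall(pattern, sample))
print(answers_dict)
# {'A': 'Iniciar antibiótico', 'B': 'Solicitar tomografia'}
```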
helm/benchmark/scenarios/helpdesk_call_summarization_scenario.py
@@ -2,11 +2,13 @@ import csv
 import os
 from typing import List
 
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.benchmark.scenarios.scenario import (
     Scenario,
     Instance,
     TEST_SPLIT,
     Input,
+    ScenarioMetadata,
 )
 
 
@@ -35,3 +37,14 @@ class HelpdeskCallSummarizationScenario(Scenario):
             instance = Instance(id=instance_id, input=input, references=[], split=TEST_SPLIT)
             instances.append(instance)
         return instances
+
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name="helpdesk_call_summarization",
+            display_name="Helpdesk Call summarization",
+            short_display_name=None,
+            description="Helpdesk Call summarization",
+            taxonomy=TaxonomyInfo(task="summarization", what="n/a", when="?", who="n/a", language="English"),
+            main_metric="unknown",
+            main_split="test",
+        )
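The change repeated across the scenario files in this diff is a new `get_metadata` hook that returns a `ScenarioMetadata` record, optionally carrying a `TaxonomyInfo`. A minimal sketch of that shape, generalized from the hunks in this diff; the `MyScenario` class and every field value below are hypothetical placeholders, not code from the package:

```python
from typing import List

from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
from helm.benchmark.scenarios.scenario import Instance, Scenario, ScenarioMetadata


class MyScenario(Scenario):  # hypothetical scenario, for illustration only
    name = "my_scenario"
    description = "Toy scenario showing the new metadata hook"
    tags = ["example"]

    def get_instances(self, output_path: str) -> List[Instance]:
        return []  # a real scenario builds its instances here

    def get_metadata(self) -> ScenarioMetadata:
        # Static descriptive metadata plus the headline metric and split,
        # mirroring the pattern added to the scenarios in this release.
        return ScenarioMetadata(
            name=self.name,
            display_name="My Scenario",
            description=self.description,
            taxonomy=TaxonomyInfo(task="summarization", what="?", when="?", who="?", language="English"),
            main_metric="rouge_l",
            main_split="test",
        )
```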
helm/benchmark/scenarios/ice_scenario.py
@@ -4,9 +4,10 @@ from typing import List, Union
 from enum import Enum
 import pandas as pd
 
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.common.optional_dependencies import handle_module_not_found_error
 from helm.benchmark.scenarios.ice_scenario_pinned_file_order import listdir_with_pinned_file_order
-from helm.benchmark.scenarios.scenario import Scenario, Instance, TEST_SPLIT, Input
+from helm.benchmark.scenarios.scenario import Scenario, Instance, TEST_SPLIT, Input, ScenarioMetadata
 
 try:
     # pd.read_excel() uses xlrd
@@ -467,3 +468,22 @@ class ICEScenario(Scenario):
             instances.append(Instance(Input(text=t), references=[], split=TEST_SPLIT))
 
         return instances
+
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name="ice",
+            display_name="ICE (International Corpus of English)",
+            short_display_name="ICE",
+            description="The International Corpus of English (ICE) drawn from English speakers from "
+            "various places in the world, initiated by [Greenbaum "
+            "(1991)](https://www.cambridge.org/core/journals/english-today/article/abs/ice-the-international-corpus-of-english/47808205394C538393C3FD8E62E5E701).",
+            taxonomy=TaxonomyInfo(
+                task="language modeling",
+                what="?",
+                when="?",
+                who="?",
+                language="English varieties from different nations",
+            ),
+            main_metric="bits_per_byte",
+            main_split="test",
+        )
helm/benchmark/scenarios/ifeval_scenario.py
@@ -1,8 +1,10 @@
 import datasets
 import os
 from typing import List
+from helm.benchmark.presentation.schema import TaxonomyInfo
 from helm.benchmark.scenarios.scenario import (
     Scenario,
+    ScenarioMetadata,
     Instance,
     Input,
     TEST_SPLIT,
@@ -51,3 +53,19 @@ class IFEvalScenario(Scenario):
             instances.append(instance)
 
         return instances
+
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name=self.name,
+            display_name="IFEval",
+            description=self.description,
+            main_metric="ifeval_strict_accuracy",
+            main_split="test",
+            taxonomy=TaxonomyInfo(
+                task="instruction following",
+                what="verifiable general domain instruction following",
+                who="human annotators",
+                when="2023",
+                language="English",
+            ),
+        )
helm/benchmark/scenarios/imdb_scenario.py
@@ -1,6 +1,7 @@
 import os
 from typing import List, Dict, Optional
 
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.common.general import ensure_file_downloaded
 from helm.benchmark.scenarios.scenario import (
     Scenario,
@@ -11,6 +12,7 @@ from helm.benchmark.scenarios.scenario import (
     VALID_SPLIT,
     Input,
     Output,
+    ScenarioMetadata,
 )
 from helm.benchmark.scenarios.imdb_scenario_pinned_file_order import listdir_with_pinned_file_order
 
@@ -143,3 +145,16 @@ class IMDBScenario(Scenario):
         for split in [TRAIN_SPLIT, VALID_SPLIT]:
             instances.extend(self.get_split_instances(target_path, split, contrast_map))
         return instances
+
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name="imdb",
+            display_name="IMDB",
+            description="The IMDB benchmark for sentiment analysis in movie review [(Maas et al., "
+            "2011)](https://aclanthology.org/P11-1015/).",
+            taxonomy=TaxonomyInfo(
+                task="sentiment analysis", what="movie reviews", when="?", who="?", language="English"
+            ),
+            main_metric="quasi_exact_match",
+            main_split="valid",
+        )
helm/benchmark/scenarios/infinite_bench_en_mc_scenario.py (new file)
@@ -0,0 +1,90 @@
+import os
+import re
+from typing import List
+
+from datasets import load_dataset, Features, Value, Sequence, Dataset
+
+from helm.benchmark.scenarios.scenario import (
+    Scenario,
+    Instance,
+    Input,
+    Reference,
+    Output,
+    CORRECT_TAG,
+    TEST_SPLIT,
+)
+from helm.common.general import ensure_directory_exists
+
+
+class InfiniteBenchEnMCScenario(Scenario):
+    """InfiniteBench En.MC
+
+    InfiniteBench is a benchmark tailored for evaluating the capabilities of language models to process,
+    understand, and reason over long contexts (100k+ tokens). InfiniteBench En.MC is a subset of
+    InfiniteBench that requires models to perform multiple-choice question answering on questions that necessitate
+    long-range dependency and reasoning, beyond simple short passage retrieval.
+    """
+
+    name = "infinite_bench_en_mc"
+    description = "∞Bench En.MC is a multiple-choice question answering task that necessitates long-range dependency and reasoning. ([Zhang et al., 2024](https://arxiv.org/abs/2402.13718))"  # noqa: E501
+    tags = ["question_answering"]
+
+    def __init__(self, max_num_words: int):
+        self.max_num_words = max_num_words
+        super().__init__()
+
+    def get_instances(self, output_path: str) -> List[Instance]:
+        # Get InfiniteBench from HuggingFace
+        cache_dir = os.path.join(output_path, "data")
+        ensure_directory_exists(cache_dir)
+
+        # Define the features schema
+        ft = Features(
+            {
+                "id": Value("int64"),
+                "context": Value("string"),
+                "input": Value("string"),
+                "answer": Sequence(Value("string")),
+                "options": Sequence(Value("string")),
+            }
+        )
+
+        # Load the dataset with the specified features
+        dataset = load_dataset(
+            "xinrongzhang2022/InfiniteBench",
+            split="longbook_choice_eng",
+            features=ft,
+            revision="90f0394333616266d9fe85824ceaf505093cbaa5",
+        )
+
+        assert isinstance(dataset, Dataset)
+
+        def count_words(text: str) -> int:
+            return len(re.split(r"\s+", text.strip()))
+
+        dataset = dataset.filter(
+            lambda example: count_words(example["context"])
+            + count_words(example["input"])
+            + sum(count_words(option) for option in example["options"])
+            <= self.max_num_words
+        )
+
+        # Read all instances
+        instances: List[Instance] = []
+        for row in dataset:
+            assert len(row["answer"]) == 1
+            id = row["id"]
+            input = Input(text=row["context"] + "\n\n" + row["input"])
+            references = [
+                Reference(Output(text=option), tags=[CORRECT_TAG] if option == row["answer"][0] else [])
+                for option in row["options"]
+            ]
+            instance = Instance(
+                id=id,
+                input=input,
+                references=references,
+                split=TEST_SPLIT,
+            )
+            instances.append(instance)
+
+        return instances
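The En.MC scenario above keeps only rows whose combined context, question, and options fit within `max_num_words`, counting words by splitting on whitespace. A small sketch of that filter in isolation; the example row and the 12-word cap are made up for illustration:

```python
import re


def count_words(text: str) -> int:
    # Same whitespace-based word count the scenario's filter uses.
    return len(re.split(r"\s+", text.strip()))


# Hypothetical row; real rows carry a book-length "context" of 100k+ tokens.
example = {"context": "A long book ...", "input": "Who hid the letter?", "options": ["Ann", "Bob"]}
max_num_words = 12

total = (
    count_words(example["context"])
    + count_words(example["input"])
    + sum(count_words(option) for option in example["options"])
)
print(total, total <= max_num_words)  # 10 True -> this row would be kept
```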
helm/benchmark/scenarios/infinite_bench_en_qa_scenario.py
@@ -26,7 +26,7 @@ class InfiniteBenchEnQAScenario(Scenario):
     """
 
     name = "infinite_bench_en_qa"
-    description = "∞Bench En.QA is
+    description = "∞Bench En.QA is an open-ended question answering task that necessitates long-range dependency and reasoning. ([Zhang et al., 2024](https://arxiv.org/abs/2402.13718))"  # noqa: E501
     tags = ["question_answering"]
 
     def __init__(self, max_num_words: int):
helm/benchmark/scenarios/koala_scenario.py
@@ -2,8 +2,9 @@ import json
 import os
 from typing import List
 
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.common.general import ensure_file_downloaded
-from helm.benchmark.scenarios.scenario import Scenario, Instance, Input, TEST_SPLIT
+from helm.benchmark.scenarios.scenario import Scenario, Instance, Input, TEST_SPLIT, ScenarioMetadata
 
 
 class KoalaScenario(Scenario):
@@ -39,3 +40,22 @@ class KoalaScenario(Scenario):
             )
             instances.append(instance)
         return instances
+
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name="koala",
+            display_name="Koala test dataset",
+            short_display_name="Koala test dataset",
+            description="The test dataset from the [Koala "
+            "paper](https://bair.berkeley.edu/blog/2023/04/03/koala/) for evaluating "
+            "instruction-following models.",
+            taxonomy=TaxonomyInfo(
+                task="open-ended instruction following",
+                what="Instructions for LLMs",
+                when="Before 2023",
+                who="Web users",
+                language="English",
+            ),
+            main_metric="Helpfulness",
+            main_split="test",
+        )
helm/benchmark/scenarios/kpi_edgar_scenario.py
@@ -3,6 +3,7 @@ from typing import List, Dict
 import json
 import re
 
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.common.general import ensure_file_downloaded, ensure_directory_exists
 from helm.benchmark.scenarios.scenario import (
     Scenario,
@@ -14,6 +15,7 @@ from helm.benchmark.scenarios.scenario import (
     CORRECT_TAG,
     Input,
     Output,
+    ScenarioMetadata,
 )
 
 
@@ -149,3 +151,22 @@ class KPIEDGARScenario(Scenario):
         with open(target_path, "r") as f:
             raw_dataset = json.load(f)
         return KPIEDGARScenario.sentences_to_instances(KPIEDGARScenario.get_sentences(raw_dataset))
+
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name="kpi_edgar",
+            display_name="KPI-EDGAR Financial Documents (Named Entity Recognition)",
+            short_display_name=None,
+            description="A named entity recognition beenchmark based on the paper KPI-EDGAR - A Novel "
+            "Dataset and Accompanying Metric for Relation Extraction from Financial "
+            "Documents [(Deußer et al., 2022)](https://arxiv.org/pdf/2210.09163.pdf).",
+            taxonomy=TaxonomyInfo(
+                task="named entity recognition",
+                what="financial reports",
+                when="before 2022",
+                who="financial experts",
+                language="English",
+            ),
+            main_metric="adjusted_macro_f1_score",
+            main_split="test",
+        )
helm/benchmark/scenarios/legal_contract_summarization_scenario.py
@@ -4,6 +4,7 @@ import json
 import re
 
 from typing import List
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.common.general import ensure_file_downloaded, ensure_directory_exists
 from helm.benchmark.scenarios.scenario import (
     Input,
@@ -14,6 +15,7 @@ from helm.benchmark.scenarios.scenario import (
     TEST_SPLIT,
     CORRECT_TAG,
     Output,
+    ScenarioMetadata,
 )
 
 
@@ -127,3 +129,21 @@ class LegalContractSummarizationScenario(Scenario):
             instances.append(instance)
 
         return instances
+
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name="legal_contract_summarization",
+            display_name="Legal Contract Summarization",
+            short_display_name=None,
+            description="Plain English Summarization of Contracts [(Manor et al., "
+            "2019)](https://aclanthology.org/W19-2201.pdf).",
+            taxonomy=TaxonomyInfo(
+                task="summarization",
+                what="legal contracts (e.g. terms of service, license agreements)",
+                when="before 2019",
+                who="lawyers",
+                language="English",
+            ),
+            main_metric="rouge_l",
+            main_split="test",
+        )
helm/benchmark/scenarios/legal_summarization_scenario.py
@@ -5,6 +5,7 @@ from typing import List, Optional, Any
 import datasets
 from datasets import load_dataset
 
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.benchmark.scenarios.scenario import (
     Scenario,
     Instance,
@@ -15,6 +16,7 @@ from helm.benchmark.scenarios.scenario import (
     CORRECT_TAG,
     Input,
     Output,
+    ScenarioMetadata,
 )
 
 _ALL_LANGUAGES = {
@@ -205,3 +207,51 @@ class LegalSummarizationScenario(Scenario):
                 )
 
         return instances
+
+    def get_metadata(self) -> ScenarioMetadata:
+        if self.dataset_name == "BillSum":
+            return ScenarioMetadata(
+                name="billsum_legal_summarization",
+                display_name="BillSum",
+                description="The BillSum benchmark for legal text summarization ([Kornilova & Eidelmann, "
+                "2020](https://aclanthology.org/D19-5406/)).",
+                taxonomy=TaxonomyInfo(
+                    task="summarization", what="legal text from US bills", when=None, who="lawyers", language="English"
+                ),
+                main_metric="rouge_2",
+                main_split="test",
+            )
+        elif self.dataset_name == "MultiLexSum":
+            return ScenarioMetadata(
+                name="multilexsum_legal_summarization",
+                display_name="MultiLexSum",
+                description="The MultiLexSum benchmark for legal text summarization ([Shen et al., "
+                "2022](https://arxiv.org/abs/2206.10883)).",
+                taxonomy=TaxonomyInfo(
+                    task="summarization",
+                    what="legal text from US civil rights lawsuits",
+                    when=None,
+                    who="lawyers",
+                    language="English",
+                ),
+                main_metric="rouge_2",
+                main_split="test",
+            )
+        elif self.dataset_name == "EurLexSum":
+            return ScenarioMetadata(
+                name="eurlexsum_legal_summarization",
+                display_name="EurLexSum",
+                description="The EurLexSum benchmark for legal text summarization ([Aumiller et al., "
+                "2022](https://arxiv.org/abs/2210.13448)).",
+                taxonomy=TaxonomyInfo(
+                    task="summarization",
+                    what="legal text from EU legislation",
+                    when="1960 - 2020",
+                    who="lawyers",
+                    language="English",
+                ),
+                main_metric="rouge_2",
+                main_split="test",
+            )
+        else:
+            raise Exception(f"Unknown dataset {self.dataset_name}")
helm/benchmark/scenarios/legal_support_scenario.py
@@ -2,6 +2,7 @@ import json
 import os
 from typing import List
 
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.common.general import ensure_file_downloaded
 from helm.benchmark.scenarios.scenario import (
     Scenario,
@@ -13,6 +14,7 @@ from helm.benchmark.scenarios.scenario import (
     CORRECT_TAG,
     Input,
     Output,
+    ScenarioMetadata,
 )
 
 
@@ -102,3 +104,14 @@ class LegalSupportScenario(Scenario):
             instances.append(instance)
 
         return instances
+
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name="legal_support",
+            display_name="LegalSupport",
+            description="Scenario introduced in this work to measure fine-grained legal reasoning "
+            "through reverse entailment.",
+            taxonomy=TaxonomyInfo(task="?", what="n/a", when="n/a", who="n/a", language="synthetic"),
+            main_metric="quasi_exact_match",
+            main_split="test",
+        )
helm/benchmark/scenarios/legalbench_scenario.py
@@ -5,6 +5,7 @@ import datasets
 from pathlib import Path
 from typing import List, Dict
 
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.common.general import ensure_file_downloaded, ensure_directory_exists
 from helm.benchmark.scenarios.scenario import (
     Scenario,
@@ -15,6 +16,7 @@ from helm.benchmark.scenarios.scenario import (
     TEST_SPLIT,
     Input,
     Output,
+    ScenarioMetadata,
 )
 
 PROMPT_SETTINGS_URL = "https://raw.githubusercontent.com/HazyResearch/legalbench/main/helm_prompt_settings.jsonl"
@@ -144,3 +146,21 @@ class LegalBenchScenario(Scenario):
             instances.append(instance)
 
         return instances
+
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name="legalbench",
+            display_name="LegalBench",
+            description="LegalBench is a large collaboratively constructed benchmark of legal "
+            "reasoning. Five representative tasks are included here. See [(Guha et al, "
+            "2023)[https://arxiv.org/abs/2308.11462] for more details.",
+            taxonomy=TaxonomyInfo(
+                task="text classification",
+                what="fact patterns, questions, and legal documents",
+                when="n/a",
+                who="lawyers",
+                language="English",
+            ),
+            main_metric="quasi_exact_match",
+            main_split="test",
+        )
helm/benchmark/scenarios/lex_glue_scenario.py
@@ -16,6 +16,7 @@ from helm.benchmark.scenarios.scenario import (
     TEST_SPLIT,
     Input,
     Output,
+    ScenarioMetadata,
 )
 
 ECTHR_A = "ecthr_a"
@@ -261,3 +262,13 @@ class LexGLUEScenario(Scenario):
         for subset in self.subsets:
             instances.extend(self.get_instances_for_subset(subset, output_path))
         return instances
+
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name="lex_glue",
+            display_name="LexGLUE",
+            description="A Benchmark Dataset for Legal Language Understanding in English",
+            taxonomy=None,
+            main_metric="classification_macro_f1",
+            main_split="test",
+        )
helm/benchmark/scenarios/lextreme_scenario.py
@@ -16,6 +16,7 @@ from helm.benchmark.scenarios.scenario import (
     TEST_SPLIT,
     Output,
     Input,
+    ScenarioMetadata,
 )
 
 
@@ -466,3 +467,13 @@ class LEXTREMEScenario(Scenario):
         for subset in self.subsets:
             instances.extend(self.get_instances_for_subset(subset, output_path))
         return instances
+
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name="lextreme",
+            display_name="LEXTREME",
+            description="A Multilingual Legal Benchmark for Natural Language Understanding",
+            taxonomy=None,
+            main_metric="classification_macro_f1",
+            main_split="test",
+        )