crfm-helm 0.5.6__py3-none-any.whl → 0.5.8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of crfm-helm might be problematic.
- {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.8.dist-info}/METADATA +60 -125
- {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.8.dist-info}/RECORD +293 -229
- helm/benchmark/adaptation/adapter_spec.py +5 -0
- helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +11 -3
- helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +11 -8
- helm/benchmark/annotation/aci_bench_annotator.py +11 -22
- helm/benchmark/annotation/air_bench_annotator.py +1 -1
- helm/benchmark/annotation/alrage_annotator.py +90 -0
- helm/benchmark/annotation/chw_care_plan_annotator.py +10 -21
- helm/benchmark/annotation/dischargeme_annotator.py +11 -22
- helm/benchmark/annotation/live_qa_annotator.py +1 -1
- helm/benchmark/annotation/med_dialog_annotator.py +11 -22
- helm/benchmark/annotation/medalign_annotator.py +11 -22
- helm/benchmark/annotation/medi_qa_annotator.py +11 -22
- helm/benchmark/annotation/medication_qa_annotator.py +11 -22
- helm/benchmark/annotation/mental_health_annotator.py +11 -22
- helm/benchmark/annotation/mimic_bhc_annotator.py +11 -22
- helm/benchmark/annotation/mimic_rrs_annotator.py +11 -22
- helm/benchmark/annotation/model_as_judge.py +23 -18
- helm/benchmark/annotation/mtsamples_procedures_annotator.py +11 -22
- helm/benchmark/annotation/mtsamples_replicate_annotator.py +11 -22
- helm/benchmark/annotation/starr_patient_instructions_annotator.py +11 -22
- helm/benchmark/metrics/air_bench_metrics.py +3157 -1
- helm/benchmark/metrics/alrage_metric.py +35 -0
- helm/benchmark/metrics/basic_metrics.py +267 -2
- helm/benchmark/metrics/classification_metrics.py +19 -1
- helm/benchmark/metrics/codeinsights_code_efficiency_metrics.py +186 -0
- helm/benchmark/metrics/codeinsights_code_evaluation_metrics.py +477 -0
- helm/benchmark/metrics/codeinsights_correct_code_metrics.py +366 -0
- helm/benchmark/metrics/codeinsights_edge_case_metrics.py +92 -0
- helm/benchmark/metrics/codeinsights_metric_specs.py +51 -0
- helm/benchmark/metrics/comet_metric.py +1 -1
- helm/benchmark/metrics/conv_fin_qa_calc_metrics.py +12 -1
- helm/benchmark/metrics/copyright_metrics.py +1 -1
- helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +1 -1
- helm/benchmark/metrics/dry_run_metrics.py +30 -1
- helm/benchmark/metrics/efficiency_metrics.py +74 -0
- helm/benchmark/metrics/ehr_sql_metrics.py +57 -1
- helm/benchmark/metrics/evaluate_reference_metrics.py +300 -1
- helm/benchmark/metrics/gpqa_chain_of_thought_metric.py +13 -1
- helm/benchmark/metrics/helpdesk_call_summarization_metrics.py +13 -1
- helm/benchmark/metrics/ifeval_metrics.py +13 -1
- helm/benchmark/metrics/image_generation/clip_score_metrics.py +13 -2
- helm/benchmark/metrics/image_generation/fractal_dimension/fractal_dimension_util.py +1 -1
- helm/benchmark/metrics/instruction_following_critique_metrics.py +41 -1
- helm/benchmark/metrics/kpi_edgar_metrics.py +21 -0
- helm/benchmark/metrics/language_modeling_metrics.py +13 -1
- helm/benchmark/metrics/live_qa_metrics.py +13 -1
- helm/benchmark/metrics/llm_jury_metrics.py +13 -1
- helm/benchmark/metrics/lmkt_metric_specs.py +12 -0
- helm/benchmark/metrics/lmkt_metrics.py +47 -0
- helm/benchmark/metrics/medcalc_bench_metrics.py +14 -1
- helm/benchmark/metrics/medec_metrics.py +25 -2
- helm/benchmark/metrics/melt_toxicity_metric.py +1 -1
- helm/benchmark/metrics/metric.py +25 -0
- helm/benchmark/metrics/mimiciv_billing_code_metrics.py +32 -1
- helm/benchmark/metrics/omni_math_metrics.py +13 -1
- helm/benchmark/metrics/seahelm_metrics.py +14 -1
- helm/benchmark/metrics/summac/model_summac.py +3 -3
- helm/benchmark/metrics/summarization_metrics.py +129 -1
- helm/benchmark/metrics/toxicity_metrics.py +31 -1
- helm/benchmark/metrics/wildbench_metrics.py +21 -1
- helm/benchmark/model_deployment_registry.py +11 -19
- helm/benchmark/presentation/create_plots.py +11 -2
- helm/benchmark/presentation/schema.py +10 -22
- helm/benchmark/presentation/summarize.py +189 -14
- helm/benchmark/presentation/taxonomy_info.py +20 -0
- helm/benchmark/presentation/test_create_plots.py +4 -1
- helm/benchmark/run.py +7 -1
- helm/benchmark/run_expander.py +4 -0
- helm/benchmark/run_specs/arabic_run_specs.py +191 -0
- helm/benchmark/run_specs/bluex_run_specs.py +40 -0
- helm/benchmark/run_specs/classic_run_specs.py +2 -55
- helm/benchmark/run_specs/codeinsights_run_specs.py +192 -0
- helm/benchmark/run_specs/healthqa_br_run_specs.py +40 -0
- helm/benchmark/run_specs/heim_run_specs.py +3 -1
- helm/benchmark/run_specs/lmkt_run_specs.py +144 -0
- helm/benchmark/run_specs/long_context_run_specs.py +48 -1
- helm/benchmark/run_specs/medhelm/__init__.py +0 -0
- helm/benchmark/run_specs/medhelm/benchmark_config.py +219 -0
- helm/benchmark/run_specs/medhelm_run_specs.py +360 -50
- helm/benchmark/run_specs/multilingual_run_specs.py +50 -0
- helm/benchmark/run_specs/speech_disorder_audio_run_specs.py +5 -11
- helm/benchmark/scenarios/aci_bench_scenario.py +23 -0
- helm/benchmark/scenarios/air_bench_scenario.py +21 -0
- helm/benchmark/scenarios/alghafa_scenario.py +126 -0
- helm/benchmark/scenarios/alrage_scenario.py +54 -0
- helm/benchmark/scenarios/anthropic_hh_rlhf_scenario.py +23 -1
- helm/benchmark/scenarios/arabic_exams_scenario.py +114 -0
- helm/benchmark/scenarios/arabic_mmlu_scenario.py +82 -0
- helm/benchmark/scenarios/aratrust_scenario.py +95 -0
- helm/benchmark/scenarios/audio_language/casual_conversations2_scenario.py +1 -1
- helm/benchmark/scenarios/audio_language/mustard_scenario.py +1 -1
- helm/benchmark/scenarios/audio_language/{ultra_suite_asr_classification.py → ultra_suite_asr_classification_scenario.py} +9 -8
- helm/benchmark/scenarios/audio_language/ultra_suite_asr_transcription_scenario.py +99 -0
- helm/benchmark/scenarios/audio_language/ultra_suite_classification_scenario.py +13 -5
- helm/benchmark/scenarios/audio_language/ultra_suite_disorder_breakdown_scenario.py +13 -5
- helm/benchmark/scenarios/audio_language/ultra_suite_disorder_symptoms_scenario.py +13 -5
- helm/benchmark/scenarios/babi_qa_scenario.py +15 -0
- helm/benchmark/scenarios/bbq_scenario.py +15 -0
- helm/benchmark/scenarios/best_chatgpt_prompts.yaml +473 -0
- helm/benchmark/scenarios/bluex_scenario.py +70 -0
- helm/benchmark/scenarios/bold_scenario.py +15 -0
- helm/benchmark/scenarios/boolq_scenario.py +20 -0
- helm/benchmark/scenarios/chw_care_plan_scenario.py +23 -0
- helm/benchmark/scenarios/civil_comments_scenario.py +13 -0
- helm/benchmark/scenarios/clear_scenario.py +23 -0
- helm/benchmark/scenarios/cleva_scenario.py +480 -1
- helm/benchmark/scenarios/code_scenario.py +28 -0
- helm/benchmark/scenarios/codeinsights_code_efficiency_scenario.py +197 -0
- helm/benchmark/scenarios/codeinsights_correct_code_scenario.py +78 -0
- helm/benchmark/scenarios/codeinsights_edge_case_scenario.py +192 -0
- helm/benchmark/scenarios/codeinsights_student_coding_scenario.py +162 -0
- helm/benchmark/scenarios/codeinsights_student_mistake_scenario.py +188 -0
- helm/benchmark/scenarios/commonsense_scenario.py +26 -0
- helm/benchmark/scenarios/compositional_instructions.yaml +70 -0
- helm/benchmark/scenarios/conv_fin_qa_calc_scenario.py +21 -0
- helm/benchmark/scenarios/copyright_scenario.py +35 -1
- helm/benchmark/scenarios/cti_to_mitre_scenario.py +21 -0
- helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +22 -1
- helm/benchmark/scenarios/decodingtrust_adv_robustness_scenario.py +23 -1
- helm/benchmark/scenarios/decodingtrust_fairness_scenario.py +22 -1
- helm/benchmark/scenarios/decodingtrust_machine_ethics_scenario.py +21 -1
- helm/benchmark/scenarios/decodingtrust_ood_robustness_scenario.py +13 -0
- helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +13 -1
- helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +13 -1
- helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +13 -1
- helm/benchmark/scenarios/dischargeme_scenario.py +24 -0
- helm/benchmark/scenarios/disinformation_scenario.py +22 -0
- helm/benchmark/scenarios/dyck_language_scenario.py +15 -0
- helm/benchmark/scenarios/ehrshot_scenario.py +22 -0
- helm/benchmark/scenarios/enem_challenge_scenario.py +19 -0
- helm/benchmark/scenarios/entity_data_imputation_scenario.py +14 -0
- helm/benchmark/scenarios/entity_matching_scenario.py +14 -0
- helm/benchmark/scenarios/exams_multilingual_scenario.py +115 -0
- helm/benchmark/scenarios/financial_phrasebank_scenario.py +21 -0
- helm/benchmark/scenarios/gold_commodity_news_scenario.py +21 -0
- helm/benchmark/scenarios/gpqa_scenario.py +18 -0
- helm/benchmark/scenarios/grammar_scenario.py +20 -1
- helm/benchmark/scenarios/gsm_scenario.py +15 -0
- helm/benchmark/scenarios/headqa_scenario.py +22 -0
- helm/benchmark/scenarios/healthqa_br_scenario.py +80 -0
- helm/benchmark/scenarios/helpdesk_call_summarization_scenario.py +13 -0
- helm/benchmark/scenarios/ice_scenario.py +21 -1
- helm/benchmark/scenarios/ifeval_scenario.py +18 -0
- helm/benchmark/scenarios/imdb_scenario.py +15 -0
- helm/benchmark/scenarios/infinite_bench_en_mc_scenario.py +90 -0
- helm/benchmark/scenarios/infinite_bench_en_qa_scenario.py +1 -1
- helm/benchmark/scenarios/koala_scenario.py +21 -1
- helm/benchmark/scenarios/kpi_edgar_scenario.py +21 -0
- helm/benchmark/scenarios/legal_contract_summarization_scenario.py +20 -0
- helm/benchmark/scenarios/legal_summarization_scenario.py +50 -0
- helm/benchmark/scenarios/legal_support_scenario.py +13 -0
- helm/benchmark/scenarios/legalbench_scenario.py +20 -0
- helm/benchmark/scenarios/lex_glue_scenario.py +11 -0
- helm/benchmark/scenarios/lextreme_scenario.py +11 -0
- helm/benchmark/scenarios/lmkt_scenarios.py +288 -0
- helm/benchmark/scenarios/lsat_qa_scenario.py +14 -0
- helm/benchmark/scenarios/madinah_qa_scenario.py +73 -0
- helm/benchmark/scenarios/math_scenario.py +47 -20
- helm/benchmark/scenarios/mbzuai_human_translated_arabic_mmlu.py +68 -0
- helm/benchmark/scenarios/med_dialog_scenario.py +32 -1
- helm/benchmark/scenarios/med_mcqa_scenario.py +14 -0
- helm/benchmark/scenarios/med_qa_scenario.py +14 -0
- helm/benchmark/scenarios/medalign_scenario.py +23 -0
- helm/benchmark/scenarios/medalign_scenario_helper.py +19 -125
- helm/benchmark/scenarios/medbullets_scenario.py +22 -0
- helm/benchmark/scenarios/medcalc_bench_scenario.py +22 -0
- helm/benchmark/scenarios/medec_scenario.py +23 -0
- helm/benchmark/scenarios/medhallu_scenario.py +23 -0
- helm/benchmark/scenarios/medhelm/__init__.py +0 -0
- helm/benchmark/scenarios/medhelm/judges.yaml +14 -0
- helm/benchmark/scenarios/medhelm_configurable_scenario.py +101 -0
- helm/benchmark/scenarios/medi_qa_scenario.py +23 -0
- helm/benchmark/scenarios/medication_qa_scenario.py +31 -1
- helm/benchmark/scenarios/melt_scenarios.py +2 -2
- helm/benchmark/scenarios/mental_health_scenario.py +23 -0
- helm/benchmark/scenarios/mimic_bhc_scenario.py +25 -1
- helm/benchmark/scenarios/mimic_rrs_scenario.py +23 -0
- helm/benchmark/scenarios/mimiciv_billing_code_scenario.py +22 -0
- helm/benchmark/scenarios/mmlu_pro_scenario.py +18 -0
- helm/benchmark/scenarios/mmlu_scenario.py +15 -0
- helm/benchmark/scenarios/mmmlu_scenario.py +85 -0
- helm/benchmark/scenarios/msmarco_scenario.py +30 -0
- helm/benchmark/scenarios/mtsamples_procedures_scenario.py +22 -0
- helm/benchmark/scenarios/mtsamples_replicate_scenario.py +22 -0
- helm/benchmark/scenarios/n2c2_ct_matching_scenario.py +20 -0
- helm/benchmark/scenarios/narrativeqa_scenario.py +20 -0
- helm/benchmark/scenarios/natural_qa_scenario.py +32 -0
- helm/benchmark/scenarios/omni_math_scenario.py +18 -0
- helm/benchmark/scenarios/open_assistant_scenario.py +22 -0
- helm/benchmark/scenarios/pubmed_qa_scenario.py +22 -0
- helm/benchmark/scenarios/quac_scenario.py +14 -0
- helm/benchmark/scenarios/race_based_med_scenario.py +23 -0
- helm/benchmark/scenarios/raft_scenario.py +15 -0
- helm/benchmark/scenarios/real_toxicity_prompts_scenario.py +14 -1
- helm/benchmark/scenarios/scenario.py +31 -0
- helm/benchmark/scenarios/seahelm_scenario.py +350 -2
- helm/benchmark/scenarios/self_instruct_scenario.py +29 -1
- helm/benchmark/scenarios/shc_bmt_scenario.py +22 -0
- helm/benchmark/scenarios/shc_cdi_scenario.py +20 -0
- helm/benchmark/scenarios/shc_conf_scenario.py +23 -0
- helm/benchmark/scenarios/shc_ent_scenario.py +21 -0
- helm/benchmark/scenarios/shc_gip_scenario.py +20 -0
- helm/benchmark/scenarios/shc_privacy_scenario.py +22 -0
- helm/benchmark/scenarios/shc_proxy_scenario.py +22 -0
- helm/benchmark/scenarios/shc_ptbm_scenario.py +23 -0
- helm/benchmark/scenarios/shc_sequoia_scenario.py +21 -0
- helm/benchmark/scenarios/situation_prompts.yaml +49 -0
- helm/benchmark/scenarios/starr_patient_instructions_scenario.py +22 -0
- helm/benchmark/scenarios/summarization_scenario.py +37 -0
- helm/benchmark/scenarios/synthetic_efficiency_scenario.py +22 -1
- helm/benchmark/scenarios/synthetic_reasoning_natural_scenario.py +13 -0
- helm/benchmark/scenarios/test_alghafa_scenario.py +29 -0
- helm/benchmark/scenarios/test_alrage_scenario.py +23 -0
- helm/benchmark/scenarios/test_arabic_exams_scenario.py +21 -0
- helm/benchmark/scenarios/test_aratrust_scenario.py +21 -0
- helm/benchmark/scenarios/test_bluex_scenario.py +59 -0
- helm/benchmark/scenarios/test_exams_multilingual_scenario.py +29 -0
- helm/benchmark/scenarios/test_healtha_br_scenario.py +57 -0
- helm/benchmark/scenarios/the_pile_scenario.py +13 -1
- helm/benchmark/scenarios/truthful_qa_scenario.py +14 -0
- helm/benchmark/scenarios/twitter_aae_scenario.py +20 -1
- helm/benchmark/scenarios/vicuna_scenario.py +21 -1
- helm/benchmark/scenarios/wikifact_scenario.py +20 -0
- helm/benchmark/scenarios/wildbench_scenario.py +18 -0
- helm/benchmark/scenarios/wmt_14_scenario.py +12 -0
- helm/benchmark/slurm_jobs.py +1 -2
- helm/benchmark/slurm_runner.py +8 -1
- helm/benchmark/static/schema_arabic.yaml +271 -0
- helm/benchmark/static/schema_classic.yaml +0 -17
- helm/benchmark/static/schema_long_context.yaml +24 -6
- helm/benchmark/static/schema_medhelm.yaml +36 -0
- helm/benchmark/static/schema_slp.yaml +219 -0
- helm/benchmark/static_build/assets/index-671a5e06.js +10 -0
- helm/benchmark/static_build/assets/index-9352595e.css +1 -0
- helm/benchmark/static_build/index.html +2 -2
- helm/benchmark/window_services/image_generation/clip_window_service.py +1 -3
- helm/clients/audio_language/llama_omni/arguments.py +61 -0
- helm/clients/audio_language/llama_omni/constants.py +9 -0
- helm/clients/audio_language/llama_omni/conversation.py +213 -0
- helm/clients/audio_language/llama_omni/model/__init__.py +0 -0
- helm/clients/audio_language/llama_omni/model/builder.py +88 -0
- helm/clients/audio_language/llama_omni/model/language_model/omni_speech2s_llama.py +190 -0
- helm/clients/audio_language/llama_omni/model/language_model/omni_speech_llama.py +118 -0
- helm/clients/audio_language/llama_omni/model/omni_speech_arch.py +249 -0
- helm/clients/audio_language/llama_omni/model/speech_encoder/builder.py +9 -0
- helm/clients/audio_language/llama_omni/model/speech_encoder/speech_encoder.py +27 -0
- helm/clients/audio_language/llama_omni/model/speech_generator/builder.py +9 -0
- helm/clients/audio_language/llama_omni/model/speech_generator/generation.py +622 -0
- helm/clients/audio_language/llama_omni/model/speech_generator/speech_generator.py +104 -0
- helm/clients/audio_language/llama_omni/model/speech_projector/builder.py +9 -0
- helm/clients/audio_language/llama_omni/model/speech_projector/speech_projector.py +27 -0
- helm/clients/audio_language/llama_omni/preprocess.py +295 -0
- helm/clients/audio_language/llama_omni/utils.py +202 -0
- helm/clients/audio_language/qwen2_5_omni_client.py +19 -7
- helm/clients/audio_language/qwen_omni/configuration_qwen2_5_omni.py +519 -0
- helm/clients/audio_language/qwen_omni/modeling_qwen2_5_omni.py +4308 -0
- helm/clients/audio_language/qwen_omni/processing_qwen2_5_omni.py +270 -0
- helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/__init__.py +0 -0
- helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/__init__.py +8 -0
- helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/audio_process.py +56 -0
- helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/vision_process.py +380 -0
- helm/clients/huggingface_client.py +2 -2
- helm/clients/image_generation/cogview2/sr_pipeline/dsr_model.py +1 -1
- helm/clients/image_generation/mindalle/models/stage1/layers.py +2 -2
- helm/clients/openai_client.py +33 -20
- helm/clients/openai_responses_client.py +34 -8
- helm/clients/openrouter_client.py +31 -0
- helm/clients/test_huggingface_client.py +3 -3
- helm/clients/test_openrouter_client.py +69 -0
- helm/clients/together_client.py +48 -13
- helm/clients/vertexai_client.py +19 -11
- helm/clients/vllm_client.py +43 -7
- helm/clients/vllm_granite_thinking_client.py +56 -0
- helm/common/critique_request.py +0 -1
- helm/common/hierarchical_logger.py +83 -34
- helm/common/object_spec.py +23 -8
- helm/common/test_logging.py +94 -0
- helm/config/model_deployments.yaml +525 -172
- helm/config/model_metadata.yaml +185 -10
- helm/config/tokenizer_configs.yaml +100 -2
- helm/proxy/cli.py +1 -1
- helm/proxy/example_queries.py +8 -8
- helm/proxy/retry.py +5 -0
- helm/proxy/server.py +2 -1
- helm/proxy/static/index.css +4 -0
- helm/proxy/static/index.js +7 -1
- helm/tokenizers/grok_tokenizer.py +2 -0
- helm/benchmark/metrics/aci_bench_metrics.py +0 -14
- helm/benchmark/metrics/chw_care_plan_metrics.py +0 -14
- helm/benchmark/metrics/dischargeme_metrics.py +0 -14
- helm/benchmark/metrics/med_dialog_metrics.py +0 -14
- helm/benchmark/metrics/medalign_metrics.py +0 -14
- helm/benchmark/metrics/medi_qa_metrics.py +0 -14
- helm/benchmark/metrics/medication_qa_metrics.py +0 -14
- helm/benchmark/metrics/mental_health_metrics.py +0 -14
- helm/benchmark/metrics/mimic_bhc_metrics.py +0 -14
- helm/benchmark/metrics/mimic_rrs_metrics.py +0 -14
- helm/benchmark/metrics/mtsamples_procedures_metrics.py +0 -14
- helm/benchmark/metrics/mtsamples_replicate_metrics.py +0 -14
- helm/benchmark/metrics/numeracy_metrics.py +0 -72
- helm/benchmark/metrics/starr_patient_instructions_metrics.py +0 -14
- helm/benchmark/metrics/test_numeracy_metrics.py +0 -95
- helm/benchmark/scenarios/numeracy_scenario.py +0 -794
- helm/benchmark/static_build/assets/index-94295e78.js +0 -10
- helm/benchmark/static_build/assets/index-b9779128.css +0 -1
- {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.8.dist-info}/WHEEL +0 -0
- {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.8.dist-info}/entry_points.txt +0 -0
- {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.8.dist-info}/licenses/LICENSE +0 -0
- {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.8.dist-info}/top_level.txt +0 -0
helm/benchmark/scenarios/codeinsights_student_mistake_scenario.py (new file)
@@ -0,0 +1,188 @@
+from helm.benchmark.scenarios.scenario import Scenario, Instance, Input, Output, Reference, VALID_SPLIT, CORRECT_TAG
+import pandas as pd
+import requests
+
+
+class CodeInsightsStudentMistakeScenario(Scenario):
+    name = "codeinsights_student_mistake"
+    description = "Mimic how students mistake their C++ codes on foundational questions"
+    tags = ["codeinsights", "c++", "student_mistake"]
+
+    def __init__(self, num_testcases: int = 1):
+        super().__init__()
+        self.num_testcases = num_testcases
+
+    def get_instances(self, output_path: str):
+        df = pd.read_csv("https://huggingface.co/datasets/Kazchoko/my_dataset/resolve/main/Scenario3_data.csv")
+        student_topic = pd.read_csv(
+            "https://huggingface.co/datasets/Kazchoko/my_dataset/resolve/main/student_performace_by_topic.csv"
+        )
+
+        instances = []
+        for student_id, student_df in df.groupby("student_id"):
+            student_df = student_df.sort_values(by=["student_id", "question_unittest_id", "timestamp"])
+            if len(student_df) < 4:
+                continue
+            first = student_df.iloc[0]
+            second = student_df.iloc[1]
+            third = student_df.iloc[2]
+            target = student_df.iloc[3]
+
+            # Get test cases for this question
+            question_id = target.get("question_unittest_id", None)
+            question_test_cases = []
+            tc_parsing_success = True
+
+            for testcase_str in target["question_unittests"].split("Unittest")[1:]:
+                testcase_str = testcase_str[testcase_str.find(":") + 1 :]
+                input_idx = testcase_str.find("Input:")
+                std_in_idx = testcase_str.find("STD input:")
+                output_idx = testcase_str.find("Output:")
+                if input_idx == -1 or std_in_idx == -1 or output_idx == -1:
+                    tc_parsing_success = False
+                    break
+
+                testcase = {
+                    "input": testcase_str[input_idx + 6 : std_in_idx].strip(),
+                    "std_in": testcase_str[std_in_idx + 10 : output_idx].strip(),
+                    "output": testcase_str[output_idx + 7 :].strip(),
+                }
+                question_test_cases.append(testcase)
+
+            if not tc_parsing_success:
+                continue
+
+            if len(question_test_cases) < self.num_testcases:
+                # If not enough test cases, skip this question
+                continue
+            if self.num_testcases >= 0:
+                # If more than one test case is requested, only take the first ones
+                question_test_cases = question_test_cases[: self.num_testcases]
+
+            # Get student pass (0 or 1) for the target question
+            student_correctness_pattern = target.get("pass", None)
+            main_part = int(student_correctness_pattern)  # "1111111111"
+            # Convert each character to an int
+            student_correctness_list = [int(ch) for ch in str(main_part)]  # [1,1,1,1,1,1,1,1,1,1]
+
+            # Student specific topic performance in previous attempts
+            student_level_prompt = f"Student {student_id} has the following performance across topics:\n"
+            topic_performance = student_topic[student_topic["student_id"] == student_id]
+            for _, row in topic_performance.iterrows():
+                topic = row["topic"]
+                pass_rate = round(row["pass_rate"], 2)
+                perfect = round(row["perfect"], 2)
+
+                student_level_prompt += (
+                    f"- For topic '{topic}', the unit test pass rate is {pass_rate}, "
+                    f"and the rate of passing all unit tests is {perfect}.\n"
+                )
+
+            prompt = (
+                "=== Student Profile ===\n"
+                f"{student_level_prompt}\n"
+                "When students submit a code to the platform, it will be tested by number of unit tests, where"
+                "- Unit test pass rate = proportion of unit tests passed with the code \n"
+                "- Full pass rate = proportion of code passing all unit tests\n\n"
+                "=== Past Mistake Examples ===\n"
+                "Example 1 (Week {first['week']}, Topic: {first['topic']}):\n"
+                f"Question: {first['question_name']} — {first['question_text']}\n"
+                "Template:\n"
+                f"{first['question_template']}\n"
+                "Student's Response Code with Error:\n"
+                f"{first['response_mistake']}\n\n"
+                "Example 2 (Week {second['week']}, Topic: {second['topic']}):\n"
+                f"Question: {second['question_name']} — {second['question_text']}\n"
+                "Template:\n"
+                f"{second['question_template']}\n"
+                "Student's Response Code with Error:\n"
+                f"{second['response_mistake']}\n\n"
+                "Example 3 (Week {third['week']}, Topic: {third['topic']}):\n"
+                f"Question: {third['question_name']} — {third['question_text']}\n"
+                "Template:\n"
+                f"{third['question_template']}\n"
+                "Student's Response Code with Error:\n"
+                f"{third['response_mistake']}\n\n"
+                "=== New Target Problem ===\n"
+                f"Week: {target['week']}, Topic: {target['topic']}\n"
+                f"Question: {target['question_name']} — {target['question_text']}\n"
+                f"Unit Test Input: {question_test_cases}\n\n"
+                if question_test_cases
+                else ""
+                "Template:\n"
+                f"{target['question_template']}\n\n"
+                "⚠**Instructions:**\n"
+                "1. Mimic your own coding style, naming conventions, indentation, and typical error patterns.\n"
+                "2. Introduce mistake you are likely to make (e.g., off‐by‐one index, wrong initialization, "
+                "missing edge case).\n"
+                "3. Do **not** produce a fully correct solution or add unfamiliar optimizations.\n\n"
+                "Provide ONLY your C++ implementation following the given template, where the answer will replace the {{ STUDENT_ANSWER }} block in the template. "
+                "DO NOT reproduce the template part as the generated code would be inserted to the template, "
+                "and make sure the code is compatible with the Unit Test Input. "
+                "int main() is always declared already so DO NOT produce that initialization on the code. "
+                "Ensure your code is includes any class definition when needed. "
+                "Return the code in C++ code block format, and nothing else."
+            )
+
+            print(f"\n=== DEBUG INFO FOR STUDENT {student_id}, QUESTION {question_id} ===")
+            print(f"Test cases loaded: {len(question_test_cases)}")
+            print(f"Student correctness pattern: {student_correctness_list}")
+            print(f"Original pass field: {target.get('pass', 'MISSING')}")
+            print(f"Question template exists: {'question_template' in target}")
+            print(f"Question name: {target.get('question_name', 'MISSING')}")
+
+            # Also add this validation in your UnitTestAlignmentMetric evaluate_generation method:
+            def evaluate_generation(self, adapter_spec, request_state, metric_service, eval_cache_path):
+                print("\n=== UNIT TEST METRIC DEBUG ===")
+                print(f"Has extra_data: {hasattr(request_state.instance, 'extra_data')}")
+                if hasattr(request_state.instance, "extra_data"):
+                    extra_data = request_state.instance.extra_data
+                    print(f"Extra data keys: {list(extra_data.keys())}")
+                    print(f"Test cases: {len(extra_data.get('test_cases', []))}")
+                    print(f"Student pattern: {extra_data.get('student_correctness_pattern', 'MISSING')}")
+
+            instances.append(
+                Instance(
+                    id=f"{student_id}_{target['question_unittest_id']}",
+                    input=Input(text=prompt),
+                    references=[Reference(output=Output(text=target["response_mistake"]), tags=[CORRECT_TAG])],
+                    extra_data={
+                        "question_template": target["question_template"],
+                        "test_cases": question_test_cases,
+                        "question_id": str(question_id) if question_id else None,
+                        "question_name": target.get("question_name", ""),
+                        "student_id": str(student_id),
+                        "student_correctness_pattern": student_correctness_list,
+                    },
+                    split=VALID_SPLIT,
+                )
+            )
+        return instances
+
+    def _load_test_cases(self):
+        """
+        Load test cases from external source or return None if not available.
+        This method should be implemented based on where your test cases are stored.
+
+        Expected format:
+        {
+            "question_id": [
+                {
+                    "unittest": "test_id",
+                    "input": "test input code",
+                    "output": "expected output"
+                },
+                ...
+            ],
+            ...
+        }
+        """
+        try:
+            response = requests.get(
+                "https://huggingface.co/datasets/Kazchoko/my_dataset/resolve/main/test_cases_by_qid.json"
+            )
+            if response.status_code == 200:
+                return response.json()
+        except Exception as e:
+            print(f"Failed to load test cases from URL: {e}")
+        return {}
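As a rough, hypothetical sketch (not part of the diff): the class added above could be exercised directly. The module path follows the file list at the top of this page, and the constructor and get_instances signature follow the hunk above.

from helm.benchmark.scenarios.codeinsights_student_mistake_scenario import CodeInsightsStudentMistakeScenario

# Build the scenario with one unit test per question (the default in the new code above).
scenario = CodeInsightsStudentMistakeScenario(num_testcases=1)
# get_instances downloads the two CSVs from Hugging Face and assembles one Instance per student/question.
instances = scenario.get_instances(output_path="/tmp/codeinsights")
print(len(instances))
print(instances[0].input.text[:300])  # start of the assembled "student mistake" prompt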
helm/benchmark/scenarios/commonsense_scenario.py
@@ -2,6 +2,7 @@ import json
 import os
 from typing import List
 
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.common.general import ensure_file_downloaded, ensure_directory_exists
 from helm.common.hierarchical_logger import hlog
 from helm.benchmark.scenarios.scenario import (
@@ -14,6 +15,7 @@ from helm.benchmark.scenarios.scenario import (
     CORRECT_TAG,
     Input,
     Output,
+    ScenarioMetadata,
 )
 
 
@@ -72,6 +74,19 @@ class HellaSwagScenario(Scenario):
         assert len(answers) == 4
         return _make_instance(question=question, answers=answers, correct_answer=correct_answer, split=split)
 
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name="hellaswag",
+            display_name="HellaSwag",
+            description="The HellaSwag benchmark for commonsense reasoning in question answering "
+            "[(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).",
+            taxonomy=TaxonomyInfo(
+                task="question answering", what="commonsense reasoning", when="?", who="?", language="English"
+            ),
+            main_metric="exact_match",
+            main_split="valid",
+        )
+
 
 class OpenBookQA(Scenario):
     name = "openbookqa"
@@ -113,6 +128,17 @@ class OpenBookQA(Scenario):
         assert item["question"]["choices"][correct_choice]["label"] == item["answerKey"]
         return _make_instance(question=question, answers=answers, correct_answer=correct_answer, split=split)
 
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name="openbookqa",
+            display_name="OpenbookQA",
+            description="The OpenbookQA benchmark for commonsense-intensive open book question "
+            "answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).",
+            taxonomy=TaxonomyInfo(task="question answering", what="?", when="?", who="?", language="English"),
+            main_metric="exact_match",
+            main_split="test",
+        )
+
 
 class CommonSenseQAScenario(Scenario):
     name = "commonsenseqa"
helm/benchmark/scenarios/compositional_instructions.yaml (new file)
@@ -0,0 +1,70 @@
+rules:
+  - category: Root
+    expansions:
+      - text: ${Task} ${Style}
+
+  - category: Task
+    expansions:
+      - text: Explain ${HowTo}
+      - text: Explain ${Topic}
+      - text: Tell me a joke about ${Topic}
+      - text: Tell me a joke about ${TopicPair}
+      - text: What are the similarities between ${TopicPair}
+      - text: What are the differences between ${TopicPair}
+      - text: Tell me 5 surprising facts about ${Topic}
+      - text: Persuade me to learn about ${Topic}
+
+  - category: HowTo
+    expansions:
+      - text: how airplanes fly
+      - text: how maglev trains work
+      - text: how to grow tomatoes in the wintertime
+
+  - category: Topic
+    expansions:
+      - text: the quicksort algorithm
+      - text: stochastic gradient descent
+      - text: the Great Vowel Shift
+      - text: northern lights
+      - text: the Romantic period
+      - text: the Civil Rights movement
+      - text: the Pacific Northwest
+      - text: El Niño
+
+  - category: TopicPair
+    expansions:
+      - text: north and south
+      - text: gradient descent and gradient ascent
+      - text: vowels and consonants
+      - text: C and C++
+      - text: Google and Microsoft
+      - text: US and Canada
+      - text: cats and dogs
+      - text: the Baroque period and the Romantic period
+
+  - category: Style
+    expansions:
+      - text: as a paragraph.
+      - text: as a haiku.
+      - text: as a limerick.
+      - text: in the style of a Shakespeare sonnet.
+      - text: in the style of a court case.
+      - text: in the style of Snoop Dogg.
+      - text: so that a ${Age}-year old can understand it.
+      - text: in ${Language}.
+      - text: in 3 bullet points.
+      - text: in 8 bullet points.
+
+  - category: Age
+    expansions:
+      - text: "5"
+      - text: "9"
+      - text: "13"
+
+  - category: Language
+    expansions:
+      - text: Italian
+      - text: Greek
+      - text: Indian
+      - text: Chinese
+      - text: Thai
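For illustration only (a simplified stand-in, not HELM's actual grammar expander): a grammar like the one above produces prompts by starting from the Root category and recursively replacing each ${Category} placeholder with a randomly chosen expansion of that category.

import random
import re

# A small subset of the rules above, keyed by category name (illustrative only).
RULES = {
    "Root": ["${Task} ${Style}"],
    "Task": ["Explain ${Topic}", "Tell me a joke about ${TopicPair}"],
    "Topic": ["the quicksort algorithm", "El Niño"],
    "TopicPair": ["cats and dogs", "C and C++"],
    "Style": ["as a haiku.", "in 3 bullet points."],
}

def expand(text: str) -> str:
    # Replace every ${Category} with a random expansion of that category, recursively.
    return re.sub(r"\$\{(\w+)\}", lambda m: expand(random.choice(RULES[m.group(1)])), text)

print(expand("${Root}"))  # e.g. "Explain El Niño in 3 bullet points."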
helm/benchmark/scenarios/conv_fin_qa_calc_scenario.py
@@ -2,6 +2,7 @@ import json
 import os
 from typing import Dict, List, Any
 
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.benchmark.scenarios.scenario import (
     Input,
     Scenario,
@@ -11,6 +12,7 @@ from helm.benchmark.scenarios.scenario import (
     VALID_SPLIT,
     CORRECT_TAG,
     Output,
+    ScenarioMetadata,
 )
 from helm.common.general import ensure_file_downloaded
 
@@ -95,3 +97,22 @@ class ConvFinQACalcScenario(Scenario):
         for raw_instance in raw_instances:
             instances.append(self.convert_to_instance(raw_instance, split))
         return instances
+
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name="conv_fin_qa_calc",
+            display_name="ConvFinQACalc",
+            short_display_name=None,
+            description="A mathematical calculation benchmark based on ConvFinQA: Exploring the Chain "
+            "of Numerical Reasoning in Conversational Finance Question Answering [(Chen ey "
+            "al., 2022)](https://arxiv.org/pdf/2210.03849.pdf).",
+            taxonomy=TaxonomyInfo(
+                task="question answering with numeric reasoning",
+                what="financial reports",
+                when="1999 to 2019",
+                who="financial experts",
+                language="English",
+            ),
+            main_metric="float_equiv",
+            main_split="valid",
+        )
helm/benchmark/scenarios/copyright_scenario.py
@@ -3,8 +3,18 @@ import os
 import tqdm
 from typing import List
 
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.common.general import ensure_file_downloaded
-from helm.benchmark.scenarios.scenario import
+from helm.benchmark.scenarios.scenario import (
+    Scenario,
+    Instance,
+    Reference,
+    CORRECT_TAG,
+    TEST_SPLIT,
+    Input,
+    Output,
+    ScenarioMetadata,
+)
 
 datatag2hash_text = {
     # The "average" book.
@@ -81,3 +91,27 @@ class CopyrightScenario(Scenario):
                 ),
             )
         return instances
+
+    def get_metadata(self) -> ScenarioMetadata:
+        if self.datatag in datatag2hash_code:
+            return ScenarioMetadata(
+                name="copyright_code",
+                display_name="Copyright (code)",
+                description="Scenario introduced in this work to measure copyright and memorization "
+                "behavior for code, based off of [Carlini et al. "
+                "(2021)](https://www.usenix.org/biblio-11958).",
+                taxonomy=TaxonomyInfo(task="?", what="n/a", when="n/a", who="n/a", language="synthetic"),
+                main_metric="unknown",
+                main_split="test",
+            )
+        else:
+            return ScenarioMetadata(
+                name="copyright_text",
+                display_name="Copyright (text)",
+                description="Scenario introduced in this work to measure copyright and memorization "
+                "behavior for books, based off of [Carlini et al. "
+                "(2021)](https://www.usenix.org/biblio-11958).",
+                taxonomy=TaxonomyInfo(task="?", what="n/a", when="n/a", who="n/a", language="synthetic"),
+                main_metric="unknown",
+                main_split="test",
+            )
helm/benchmark/scenarios/cti_to_mitre_scenario.py
@@ -6,6 +6,7 @@ from typing import Any, List, Dict
 import pandas as pd
 from pandas import DataFrame
 
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.common.general import ensure_file_downloaded, ensure_directory_exists
 from helm.benchmark.scenarios.scenario import (
     Scenario,
@@ -16,6 +17,7 @@ from helm.benchmark.scenarios.scenario import (
     CORRECT_TAG,
     Input,
     Output,
+    ScenarioMetadata,
 )
 
 
@@ -238,3 +240,22 @@ class CtiToMitreScenario(Scenario):
         # return all instances
         all_instances = instances_train + instances_test
         return all_instances
+
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name="cti_to_mitre",
+            display_name="CTI-to-MITRE Cyber Threat Intelligence",
+            short_display_name=None,
+            description="A classification benchmark based on Automatic Mapping of Unstructured Cyber "
+            "Threat Intelligence - An Experimental Study [(Orbinato et al., "
+            "2022)](https://arxiv.org/pdf/2208.12144.pdf).",
+            taxonomy=TaxonomyInfo(
+                task="text classification",
+                what="Descriptions of malicious techniques",
+                when="Before 2022",
+                who="Security professionals",
+                language="English",
+            ),
+            main_metric="quasi_exact_match",
+            main_split="test",
+        )
helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py
@@ -1,9 +1,19 @@
 import os
 import json
 from typing import List
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.common.general import ensure_file_downloaded
 from helm.common.general import ensure_directory_exists
-from helm.benchmark.scenarios.scenario import
+from helm.benchmark.scenarios.scenario import (
+    Scenario,
+    Instance,
+    Input,
+    Reference,
+    CORRECT_TAG,
+    Output,
+    VALID_SPLIT,
+    ScenarioMetadata,
+)
 
 TASKS = {
     "counterfactual": ["snli_premise", "snli_hypothesis"]
@@ -167,3 +177,14 @@ class DecodingTrustAdvDemoScenario(Scenario):
             if self.demo_name in ["cf", "zero"]:
                 break
         return instances
+
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name="decodingtrust_adv_demonstration",
+            display_name="DecodingTrust - Adversarial Demonstrations",
+            short_display_name="AdvDemo",
+            description="Robustness analysis of LM generations when facing adversarial demonstrations",
+            taxonomy=TaxonomyInfo(task="text classification", what="?", when="?", who="?", language="English"),
+            main_metric="quasi_exact_match",
+            main_split="test",
+        )
helm/benchmark/scenarios/decodingtrust_adv_robustness_scenario.py
@@ -1,8 +1,18 @@
 import os
 import json
 from typing import List, Dict
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.common.general import ensure_file_downloaded
-from helm.benchmark.scenarios.scenario import
+from helm.benchmark.scenarios.scenario import (
+    Scenario,
+    Instance,
+    Input,
+    VALID_SPLIT,
+    Reference,
+    Output,
+    CORRECT_TAG,
+    ScenarioMetadata,
+)
 
 TASK_DESCRIPTIONS = {
     "sst2": "For the given sentence, label the sentiment of the sentence as positive or negative. The answer should be "
@@ -119,3 +129,15 @@ class DecodingTrustAdvRobustnessScenario(Scenario):
             instances.append(instance)
 
         return instances
+
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name="decodingtrust_adv_robustness",
+            display_name="DecodingTrust - AdvGLUE++",
+            short_display_name="AdvGLUE++",
+            description="Adversarial perturbations of the GLUE dataset generated against open-source "
+            "LLMs including Alpaca, Vicuna, and Stable-Vicuna",
+            taxonomy=TaxonomyInfo(task="text classification", what="?", when="?", who="?", language="English"),
+            main_metric="quasi_exact_match",
+            main_split="test",
+        )
helm/benchmark/scenarios/decodingtrust_fairness_scenario.py
@@ -1,8 +1,18 @@
 import os
 import json
 from typing import List, Dict
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.common.general import ensure_file_downloaded
-from helm.benchmark.scenarios.scenario import
+from helm.benchmark.scenarios.scenario import (
+    Scenario,
+    Instance,
+    Input,
+    VALID_SPLIT,
+    Reference,
+    Output,
+    CORRECT_TAG,
+    ScenarioMetadata,
+)
 
 TASK_DESCRIPTIONS = {
     "predict_annual_income": "For each snippet of text, predict whether the person described has an annual income "
@@ -75,3 +85,14 @@ class DecodingTrustFairnessScenario(Scenario):
             instances.append(instance)
 
         return instances
+
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name="decodingtrust_fairness",
+            display_name="DecodingTrust - Fairness",
+            short_display_name="Fairness",
+            description="Fairness analysis of LLMs",
+            taxonomy=TaxonomyInfo(task="text classification", what="?", when="?", who="?", language="English"),
+            main_metric="quasi_exact_match",
+            main_split="test",
+        )
helm/benchmark/scenarios/decodingtrust_machine_ethics_scenario.py
@@ -4,8 +4,17 @@ import random
 import numpy as np
 import pandas as pd
 from typing import List, Dict, Tuple, Union
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.common.general import ensure_file_downloaded
-from helm.benchmark.scenarios.scenario import
+from helm.benchmark.scenarios.scenario import (
+    Scenario,
+    Instance,
+    Input,
+    Reference,
+    Output,
+    CORRECT_TAG,
+    ScenarioMetadata,
+)
 
 OPTION_DICT: Dict[str, Union[List[int], List[str]]] = {
     "ethics_commonsense_short": ["not wrong", "wrong"],
@@ -322,3 +331,14 @@ class DecodingTrustMachineEthicsScenario(Scenario):
             )
 
         return instances
+
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name="decodingtrust_machine_ethics",
+            display_name="DecodingTrust - Ethics",
+            short_display_name="Ethics",
+            description="Evaluation of the understanding of ethical behaviors of LLMs",
+            taxonomy=TaxonomyInfo(task="text classification", what="?", when="?", who="?", language="English"),
+            main_metric="quasi_exact_match",
+            main_split="test",
+        )
helm/benchmark/scenarios/decodingtrust_ood_robustness_scenario.py
@@ -1,6 +1,7 @@
 import os
 import json
 from typing import List, Dict
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.common.general import ensure_file_downloaded
 from helm.benchmark.scenarios.scenario import (
     Scenario,
@@ -11,6 +12,7 @@ from helm.benchmark.scenarios.scenario import (
     Reference,
     Output,
     CORRECT_TAG,
+    ScenarioMetadata,
 )
 
 TASK = {
@@ -202,3 +204,14 @@ class DecodingTrustOODRobustnessScenario(Scenario):
             instances.append(instance)
 
         return instances
+
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name="decodingtrust_adv_demonstration",
+            display_name="DecodingTrust - Adversarial Demonstrations",
+            short_display_name="AdvDemo",
+            description="Robustness analysis of LM generations when facing adversarial demonstrations",
+            taxonomy=TaxonomyInfo(task="text classification", what="?", when="?", who="?", language="English"),
+            main_metric="quasi_exact_match",
+            main_split="test",
+        )
helm/benchmark/scenarios/decodingtrust_privacy_scenario.py
@@ -4,8 +4,9 @@ import string
 import random
 from tqdm import tqdm
 from typing import List, Dict, Optional
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.common.general import ensure_file_downloaded
-from helm.benchmark.scenarios.scenario import Scenario, Instance, Input, Reference, Output, TEST_SPLIT
+from helm.benchmark.scenarios.scenario import Scenario, Instance, Input, Reference, Output, TEST_SPLIT, ScenarioMetadata
 
 option_keys = ["pii", "privacy_understanding", "enron_email_extraction"]
 
@@ -217,6 +218,17 @@ class DecodingTrustPrivacyScenario(Scenario):
 
         return instances
 
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name="decodingtrust_privacy",
+            display_name="DecodingTrust - Privacy",
+            short_display_name="Privacy",
+            description="Evaluation of the privacy understanding and privacy preserving properties of " "LLMs",
+            taxonomy=TaxonomyInfo(task="text classification", what="?", when="?", who="?", language="English"),
+            main_metric="quasi_exact_match",
+            main_split="test",
+        )
+
 
 def get_local_domain(email):
     return email.split("@")
helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py
@@ -1,7 +1,8 @@
 import json
 import os
 from typing import List, Dict
-from helm.benchmark.
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
+from helm.benchmark.scenarios.scenario import Scenario, Instance, Input, TEST_SPLIT, ScenarioMetadata
 from helm.common.general import ensure_file_downloaded
 from helm.benchmark.scenarios.scenario import Reference, Output
 
@@ -66,3 +67,14 @@ class DecodingTrustStereotypeBiasScenario(Scenario):
             instances.append(instance)
 
         return instances
+
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name="decodingtrust_stereotype_bias",
+            display_name="DecodingTrust - Stereotype Bias",
+            short_display_name="Stereotype",
+            description="Manually crafted stereotype user prompts from DecodingTrust",
+            taxonomy=TaxonomyInfo(task="?", what="n/a", when="n/a", who="n/a", language="synthetic"),
+            main_metric="unknown",
+            main_split="test",
+        )