crfm-helm 0.5.6__py3-none-any.whl → 0.5.8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of crfm-helm might be problematic.
- {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.8.dist-info}/METADATA +60 -125
- {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.8.dist-info}/RECORD +293 -229
- helm/benchmark/adaptation/adapter_spec.py +5 -0
- helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +11 -3
- helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +11 -8
- helm/benchmark/annotation/aci_bench_annotator.py +11 -22
- helm/benchmark/annotation/air_bench_annotator.py +1 -1
- helm/benchmark/annotation/alrage_annotator.py +90 -0
- helm/benchmark/annotation/chw_care_plan_annotator.py +10 -21
- helm/benchmark/annotation/dischargeme_annotator.py +11 -22
- helm/benchmark/annotation/live_qa_annotator.py +1 -1
- helm/benchmark/annotation/med_dialog_annotator.py +11 -22
- helm/benchmark/annotation/medalign_annotator.py +11 -22
- helm/benchmark/annotation/medi_qa_annotator.py +11 -22
- helm/benchmark/annotation/medication_qa_annotator.py +11 -22
- helm/benchmark/annotation/mental_health_annotator.py +11 -22
- helm/benchmark/annotation/mimic_bhc_annotator.py +11 -22
- helm/benchmark/annotation/mimic_rrs_annotator.py +11 -22
- helm/benchmark/annotation/model_as_judge.py +23 -18
- helm/benchmark/annotation/mtsamples_procedures_annotator.py +11 -22
- helm/benchmark/annotation/mtsamples_replicate_annotator.py +11 -22
- helm/benchmark/annotation/starr_patient_instructions_annotator.py +11 -22
- helm/benchmark/metrics/air_bench_metrics.py +3157 -1
- helm/benchmark/metrics/alrage_metric.py +35 -0
- helm/benchmark/metrics/basic_metrics.py +267 -2
- helm/benchmark/metrics/classification_metrics.py +19 -1
- helm/benchmark/metrics/codeinsights_code_efficiency_metrics.py +186 -0
- helm/benchmark/metrics/codeinsights_code_evaluation_metrics.py +477 -0
- helm/benchmark/metrics/codeinsights_correct_code_metrics.py +366 -0
- helm/benchmark/metrics/codeinsights_edge_case_metrics.py +92 -0
- helm/benchmark/metrics/codeinsights_metric_specs.py +51 -0
- helm/benchmark/metrics/comet_metric.py +1 -1
- helm/benchmark/metrics/conv_fin_qa_calc_metrics.py +12 -1
- helm/benchmark/metrics/copyright_metrics.py +1 -1
- helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +1 -1
- helm/benchmark/metrics/dry_run_metrics.py +30 -1
- helm/benchmark/metrics/efficiency_metrics.py +74 -0
- helm/benchmark/metrics/ehr_sql_metrics.py +57 -1
- helm/benchmark/metrics/evaluate_reference_metrics.py +300 -1
- helm/benchmark/metrics/gpqa_chain_of_thought_metric.py +13 -1
- helm/benchmark/metrics/helpdesk_call_summarization_metrics.py +13 -1
- helm/benchmark/metrics/ifeval_metrics.py +13 -1
- helm/benchmark/metrics/image_generation/clip_score_metrics.py +13 -2
- helm/benchmark/metrics/image_generation/fractal_dimension/fractal_dimension_util.py +1 -1
- helm/benchmark/metrics/instruction_following_critique_metrics.py +41 -1
- helm/benchmark/metrics/kpi_edgar_metrics.py +21 -0
- helm/benchmark/metrics/language_modeling_metrics.py +13 -1
- helm/benchmark/metrics/live_qa_metrics.py +13 -1
- helm/benchmark/metrics/llm_jury_metrics.py +13 -1
- helm/benchmark/metrics/lmkt_metric_specs.py +12 -0
- helm/benchmark/metrics/lmkt_metrics.py +47 -0
- helm/benchmark/metrics/medcalc_bench_metrics.py +14 -1
- helm/benchmark/metrics/medec_metrics.py +25 -2
- helm/benchmark/metrics/melt_toxicity_metric.py +1 -1
- helm/benchmark/metrics/metric.py +25 -0
- helm/benchmark/metrics/mimiciv_billing_code_metrics.py +32 -1
- helm/benchmark/metrics/omni_math_metrics.py +13 -1
- helm/benchmark/metrics/seahelm_metrics.py +14 -1
- helm/benchmark/metrics/summac/model_summac.py +3 -3
- helm/benchmark/metrics/summarization_metrics.py +129 -1
- helm/benchmark/metrics/toxicity_metrics.py +31 -1
- helm/benchmark/metrics/wildbench_metrics.py +21 -1
- helm/benchmark/model_deployment_registry.py +11 -19
- helm/benchmark/presentation/create_plots.py +11 -2
- helm/benchmark/presentation/schema.py +10 -22
- helm/benchmark/presentation/summarize.py +189 -14
- helm/benchmark/presentation/taxonomy_info.py +20 -0
- helm/benchmark/presentation/test_create_plots.py +4 -1
- helm/benchmark/run.py +7 -1
- helm/benchmark/run_expander.py +4 -0
- helm/benchmark/run_specs/arabic_run_specs.py +191 -0
- helm/benchmark/run_specs/bluex_run_specs.py +40 -0
- helm/benchmark/run_specs/classic_run_specs.py +2 -55
- helm/benchmark/run_specs/codeinsights_run_specs.py +192 -0
- helm/benchmark/run_specs/healthqa_br_run_specs.py +40 -0
- helm/benchmark/run_specs/heim_run_specs.py +3 -1
- helm/benchmark/run_specs/lmkt_run_specs.py +144 -0
- helm/benchmark/run_specs/long_context_run_specs.py +48 -1
- helm/benchmark/run_specs/medhelm/__init__.py +0 -0
- helm/benchmark/run_specs/medhelm/benchmark_config.py +219 -0
- helm/benchmark/run_specs/medhelm_run_specs.py +360 -50
- helm/benchmark/run_specs/multilingual_run_specs.py +50 -0
- helm/benchmark/run_specs/speech_disorder_audio_run_specs.py +5 -11
- helm/benchmark/scenarios/aci_bench_scenario.py +23 -0
- helm/benchmark/scenarios/air_bench_scenario.py +21 -0
- helm/benchmark/scenarios/alghafa_scenario.py +126 -0
- helm/benchmark/scenarios/alrage_scenario.py +54 -0
- helm/benchmark/scenarios/anthropic_hh_rlhf_scenario.py +23 -1
- helm/benchmark/scenarios/arabic_exams_scenario.py +114 -0
- helm/benchmark/scenarios/arabic_mmlu_scenario.py +82 -0
- helm/benchmark/scenarios/aratrust_scenario.py +95 -0
- helm/benchmark/scenarios/audio_language/casual_conversations2_scenario.py +1 -1
- helm/benchmark/scenarios/audio_language/mustard_scenario.py +1 -1
- helm/benchmark/scenarios/audio_language/{ultra_suite_asr_classification.py → ultra_suite_asr_classification_scenario.py} +9 -8
- helm/benchmark/scenarios/audio_language/ultra_suite_asr_transcription_scenario.py +99 -0
- helm/benchmark/scenarios/audio_language/ultra_suite_classification_scenario.py +13 -5
- helm/benchmark/scenarios/audio_language/ultra_suite_disorder_breakdown_scenario.py +13 -5
- helm/benchmark/scenarios/audio_language/ultra_suite_disorder_symptoms_scenario.py +13 -5
- helm/benchmark/scenarios/babi_qa_scenario.py +15 -0
- helm/benchmark/scenarios/bbq_scenario.py +15 -0
- helm/benchmark/scenarios/best_chatgpt_prompts.yaml +473 -0
- helm/benchmark/scenarios/bluex_scenario.py +70 -0
- helm/benchmark/scenarios/bold_scenario.py +15 -0
- helm/benchmark/scenarios/boolq_scenario.py +20 -0
- helm/benchmark/scenarios/chw_care_plan_scenario.py +23 -0
- helm/benchmark/scenarios/civil_comments_scenario.py +13 -0
- helm/benchmark/scenarios/clear_scenario.py +23 -0
- helm/benchmark/scenarios/cleva_scenario.py +480 -1
- helm/benchmark/scenarios/code_scenario.py +28 -0
- helm/benchmark/scenarios/codeinsights_code_efficiency_scenario.py +197 -0
- helm/benchmark/scenarios/codeinsights_correct_code_scenario.py +78 -0
- helm/benchmark/scenarios/codeinsights_edge_case_scenario.py +192 -0
- helm/benchmark/scenarios/codeinsights_student_coding_scenario.py +162 -0
- helm/benchmark/scenarios/codeinsights_student_mistake_scenario.py +188 -0
- helm/benchmark/scenarios/commonsense_scenario.py +26 -0
- helm/benchmark/scenarios/compositional_instructions.yaml +70 -0
- helm/benchmark/scenarios/conv_fin_qa_calc_scenario.py +21 -0
- helm/benchmark/scenarios/copyright_scenario.py +35 -1
- helm/benchmark/scenarios/cti_to_mitre_scenario.py +21 -0
- helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +22 -1
- helm/benchmark/scenarios/decodingtrust_adv_robustness_scenario.py +23 -1
- helm/benchmark/scenarios/decodingtrust_fairness_scenario.py +22 -1
- helm/benchmark/scenarios/decodingtrust_machine_ethics_scenario.py +21 -1
- helm/benchmark/scenarios/decodingtrust_ood_robustness_scenario.py +13 -0
- helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +13 -1
- helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +13 -1
- helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +13 -1
- helm/benchmark/scenarios/dischargeme_scenario.py +24 -0
- helm/benchmark/scenarios/disinformation_scenario.py +22 -0
- helm/benchmark/scenarios/dyck_language_scenario.py +15 -0
- helm/benchmark/scenarios/ehrshot_scenario.py +22 -0
- helm/benchmark/scenarios/enem_challenge_scenario.py +19 -0
- helm/benchmark/scenarios/entity_data_imputation_scenario.py +14 -0
- helm/benchmark/scenarios/entity_matching_scenario.py +14 -0
- helm/benchmark/scenarios/exams_multilingual_scenario.py +115 -0
- helm/benchmark/scenarios/financial_phrasebank_scenario.py +21 -0
- helm/benchmark/scenarios/gold_commodity_news_scenario.py +21 -0
- helm/benchmark/scenarios/gpqa_scenario.py +18 -0
- helm/benchmark/scenarios/grammar_scenario.py +20 -1
- helm/benchmark/scenarios/gsm_scenario.py +15 -0
- helm/benchmark/scenarios/headqa_scenario.py +22 -0
- helm/benchmark/scenarios/healthqa_br_scenario.py +80 -0
- helm/benchmark/scenarios/helpdesk_call_summarization_scenario.py +13 -0
- helm/benchmark/scenarios/ice_scenario.py +21 -1
- helm/benchmark/scenarios/ifeval_scenario.py +18 -0
- helm/benchmark/scenarios/imdb_scenario.py +15 -0
- helm/benchmark/scenarios/infinite_bench_en_mc_scenario.py +90 -0
- helm/benchmark/scenarios/infinite_bench_en_qa_scenario.py +1 -1
- helm/benchmark/scenarios/koala_scenario.py +21 -1
- helm/benchmark/scenarios/kpi_edgar_scenario.py +21 -0
- helm/benchmark/scenarios/legal_contract_summarization_scenario.py +20 -0
- helm/benchmark/scenarios/legal_summarization_scenario.py +50 -0
- helm/benchmark/scenarios/legal_support_scenario.py +13 -0
- helm/benchmark/scenarios/legalbench_scenario.py +20 -0
- helm/benchmark/scenarios/lex_glue_scenario.py +11 -0
- helm/benchmark/scenarios/lextreme_scenario.py +11 -0
- helm/benchmark/scenarios/lmkt_scenarios.py +288 -0
- helm/benchmark/scenarios/lsat_qa_scenario.py +14 -0
- helm/benchmark/scenarios/madinah_qa_scenario.py +73 -0
- helm/benchmark/scenarios/math_scenario.py +47 -20
- helm/benchmark/scenarios/mbzuai_human_translated_arabic_mmlu.py +68 -0
- helm/benchmark/scenarios/med_dialog_scenario.py +32 -1
- helm/benchmark/scenarios/med_mcqa_scenario.py +14 -0
- helm/benchmark/scenarios/med_qa_scenario.py +14 -0
- helm/benchmark/scenarios/medalign_scenario.py +23 -0
- helm/benchmark/scenarios/medalign_scenario_helper.py +19 -125
- helm/benchmark/scenarios/medbullets_scenario.py +22 -0
- helm/benchmark/scenarios/medcalc_bench_scenario.py +22 -0
- helm/benchmark/scenarios/medec_scenario.py +23 -0
- helm/benchmark/scenarios/medhallu_scenario.py +23 -0
- helm/benchmark/scenarios/medhelm/__init__.py +0 -0
- helm/benchmark/scenarios/medhelm/judges.yaml +14 -0
- helm/benchmark/scenarios/medhelm_configurable_scenario.py +101 -0
- helm/benchmark/scenarios/medi_qa_scenario.py +23 -0
- helm/benchmark/scenarios/medication_qa_scenario.py +31 -1
- helm/benchmark/scenarios/melt_scenarios.py +2 -2
- helm/benchmark/scenarios/mental_health_scenario.py +23 -0
- helm/benchmark/scenarios/mimic_bhc_scenario.py +25 -1
- helm/benchmark/scenarios/mimic_rrs_scenario.py +23 -0
- helm/benchmark/scenarios/mimiciv_billing_code_scenario.py +22 -0
- helm/benchmark/scenarios/mmlu_pro_scenario.py +18 -0
- helm/benchmark/scenarios/mmlu_scenario.py +15 -0
- helm/benchmark/scenarios/mmmlu_scenario.py +85 -0
- helm/benchmark/scenarios/msmarco_scenario.py +30 -0
- helm/benchmark/scenarios/mtsamples_procedures_scenario.py +22 -0
- helm/benchmark/scenarios/mtsamples_replicate_scenario.py +22 -0
- helm/benchmark/scenarios/n2c2_ct_matching_scenario.py +20 -0
- helm/benchmark/scenarios/narrativeqa_scenario.py +20 -0
- helm/benchmark/scenarios/natural_qa_scenario.py +32 -0
- helm/benchmark/scenarios/omni_math_scenario.py +18 -0
- helm/benchmark/scenarios/open_assistant_scenario.py +22 -0
- helm/benchmark/scenarios/pubmed_qa_scenario.py +22 -0
- helm/benchmark/scenarios/quac_scenario.py +14 -0
- helm/benchmark/scenarios/race_based_med_scenario.py +23 -0
- helm/benchmark/scenarios/raft_scenario.py +15 -0
- helm/benchmark/scenarios/real_toxicity_prompts_scenario.py +14 -1
- helm/benchmark/scenarios/scenario.py +31 -0
- helm/benchmark/scenarios/seahelm_scenario.py +350 -2
- helm/benchmark/scenarios/self_instruct_scenario.py +29 -1
- helm/benchmark/scenarios/shc_bmt_scenario.py +22 -0
- helm/benchmark/scenarios/shc_cdi_scenario.py +20 -0
- helm/benchmark/scenarios/shc_conf_scenario.py +23 -0
- helm/benchmark/scenarios/shc_ent_scenario.py +21 -0
- helm/benchmark/scenarios/shc_gip_scenario.py +20 -0
- helm/benchmark/scenarios/shc_privacy_scenario.py +22 -0
- helm/benchmark/scenarios/shc_proxy_scenario.py +22 -0
- helm/benchmark/scenarios/shc_ptbm_scenario.py +23 -0
- helm/benchmark/scenarios/shc_sequoia_scenario.py +21 -0
- helm/benchmark/scenarios/situation_prompts.yaml +49 -0
- helm/benchmark/scenarios/starr_patient_instructions_scenario.py +22 -0
- helm/benchmark/scenarios/summarization_scenario.py +37 -0
- helm/benchmark/scenarios/synthetic_efficiency_scenario.py +22 -1
- helm/benchmark/scenarios/synthetic_reasoning_natural_scenario.py +13 -0
- helm/benchmark/scenarios/test_alghafa_scenario.py +29 -0
- helm/benchmark/scenarios/test_alrage_scenario.py +23 -0
- helm/benchmark/scenarios/test_arabic_exams_scenario.py +21 -0
- helm/benchmark/scenarios/test_aratrust_scenario.py +21 -0
- helm/benchmark/scenarios/test_bluex_scenario.py +59 -0
- helm/benchmark/scenarios/test_exams_multilingual_scenario.py +29 -0
- helm/benchmark/scenarios/test_healtha_br_scenario.py +57 -0
- helm/benchmark/scenarios/the_pile_scenario.py +13 -1
- helm/benchmark/scenarios/truthful_qa_scenario.py +14 -0
- helm/benchmark/scenarios/twitter_aae_scenario.py +20 -1
- helm/benchmark/scenarios/vicuna_scenario.py +21 -1
- helm/benchmark/scenarios/wikifact_scenario.py +20 -0
- helm/benchmark/scenarios/wildbench_scenario.py +18 -0
- helm/benchmark/scenarios/wmt_14_scenario.py +12 -0
- helm/benchmark/slurm_jobs.py +1 -2
- helm/benchmark/slurm_runner.py +8 -1
- helm/benchmark/static/schema_arabic.yaml +271 -0
- helm/benchmark/static/schema_classic.yaml +0 -17
- helm/benchmark/static/schema_long_context.yaml +24 -6
- helm/benchmark/static/schema_medhelm.yaml +36 -0
- helm/benchmark/static/schema_slp.yaml +219 -0
- helm/benchmark/static_build/assets/index-671a5e06.js +10 -0
- helm/benchmark/static_build/assets/index-9352595e.css +1 -0
- helm/benchmark/static_build/index.html +2 -2
- helm/benchmark/window_services/image_generation/clip_window_service.py +1 -3
- helm/clients/audio_language/llama_omni/arguments.py +61 -0
- helm/clients/audio_language/llama_omni/constants.py +9 -0
- helm/clients/audio_language/llama_omni/conversation.py +213 -0
- helm/clients/audio_language/llama_omni/model/__init__.py +0 -0
- helm/clients/audio_language/llama_omni/model/builder.py +88 -0
- helm/clients/audio_language/llama_omni/model/language_model/omni_speech2s_llama.py +190 -0
- helm/clients/audio_language/llama_omni/model/language_model/omni_speech_llama.py +118 -0
- helm/clients/audio_language/llama_omni/model/omni_speech_arch.py +249 -0
- helm/clients/audio_language/llama_omni/model/speech_encoder/builder.py +9 -0
- helm/clients/audio_language/llama_omni/model/speech_encoder/speech_encoder.py +27 -0
- helm/clients/audio_language/llama_omni/model/speech_generator/builder.py +9 -0
- helm/clients/audio_language/llama_omni/model/speech_generator/generation.py +622 -0
- helm/clients/audio_language/llama_omni/model/speech_generator/speech_generator.py +104 -0
- helm/clients/audio_language/llama_omni/model/speech_projector/builder.py +9 -0
- helm/clients/audio_language/llama_omni/model/speech_projector/speech_projector.py +27 -0
- helm/clients/audio_language/llama_omni/preprocess.py +295 -0
- helm/clients/audio_language/llama_omni/utils.py +202 -0
- helm/clients/audio_language/qwen2_5_omni_client.py +19 -7
- helm/clients/audio_language/qwen_omni/configuration_qwen2_5_omni.py +519 -0
- helm/clients/audio_language/qwen_omni/modeling_qwen2_5_omni.py +4308 -0
- helm/clients/audio_language/qwen_omni/processing_qwen2_5_omni.py +270 -0
- helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/__init__.py +0 -0
- helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/__init__.py +8 -0
- helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/audio_process.py +56 -0
- helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/vision_process.py +380 -0
- helm/clients/huggingface_client.py +2 -2
- helm/clients/image_generation/cogview2/sr_pipeline/dsr_model.py +1 -1
- helm/clients/image_generation/mindalle/models/stage1/layers.py +2 -2
- helm/clients/openai_client.py +33 -20
- helm/clients/openai_responses_client.py +34 -8
- helm/clients/openrouter_client.py +31 -0
- helm/clients/test_huggingface_client.py +3 -3
- helm/clients/test_openrouter_client.py +69 -0
- helm/clients/together_client.py +48 -13
- helm/clients/vertexai_client.py +19 -11
- helm/clients/vllm_client.py +43 -7
- helm/clients/vllm_granite_thinking_client.py +56 -0
- helm/common/critique_request.py +0 -1
- helm/common/hierarchical_logger.py +83 -34
- helm/common/object_spec.py +23 -8
- helm/common/test_logging.py +94 -0
- helm/config/model_deployments.yaml +525 -172
- helm/config/model_metadata.yaml +185 -10
- helm/config/tokenizer_configs.yaml +100 -2
- helm/proxy/cli.py +1 -1
- helm/proxy/example_queries.py +8 -8
- helm/proxy/retry.py +5 -0
- helm/proxy/server.py +2 -1
- helm/proxy/static/index.css +4 -0
- helm/proxy/static/index.js +7 -1
- helm/tokenizers/grok_tokenizer.py +2 -0
- helm/benchmark/metrics/aci_bench_metrics.py +0 -14
- helm/benchmark/metrics/chw_care_plan_metrics.py +0 -14
- helm/benchmark/metrics/dischargeme_metrics.py +0 -14
- helm/benchmark/metrics/med_dialog_metrics.py +0 -14
- helm/benchmark/metrics/medalign_metrics.py +0 -14
- helm/benchmark/metrics/medi_qa_metrics.py +0 -14
- helm/benchmark/metrics/medication_qa_metrics.py +0 -14
- helm/benchmark/metrics/mental_health_metrics.py +0 -14
- helm/benchmark/metrics/mimic_bhc_metrics.py +0 -14
- helm/benchmark/metrics/mimic_rrs_metrics.py +0 -14
- helm/benchmark/metrics/mtsamples_procedures_metrics.py +0 -14
- helm/benchmark/metrics/mtsamples_replicate_metrics.py +0 -14
- helm/benchmark/metrics/numeracy_metrics.py +0 -72
- helm/benchmark/metrics/starr_patient_instructions_metrics.py +0 -14
- helm/benchmark/metrics/test_numeracy_metrics.py +0 -95
- helm/benchmark/scenarios/numeracy_scenario.py +0 -794
- helm/benchmark/static_build/assets/index-94295e78.js +0 -10
- helm/benchmark/static_build/assets/index-b9779128.css +0 -1
- {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.8.dist-info}/WHEEL +0 -0
- {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.8.dist-info}/entry_points.txt +0 -0
- {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.8.dist-info}/licenses/LICENSE +0 -0
- {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.8.dist-info}/top_level.txt +0 -0
helm/benchmark/scenarios/code_scenario.py

@@ -55,6 +55,7 @@ import os
 import sys
 from typing import List, Dict, Iterable, Optional, cast
 
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.common.general import ensure_file_downloaded
 from helm.common.hierarchical_logger import hlog
 from helm.benchmark.scenarios.code_scenario_helper import run as run_reindent
@@ -69,6 +70,7 @@ from helm.benchmark.scenarios.scenario import (
     CORRECT_TAG,
     Input,
     Output,
+    ScenarioMetadata,
 )
 
 
@@ -331,3 +333,29 @@ class CodeScenario(Scenario):
             raise ValueError(f"Unknown dataset: {self.dataset}")
 
         return cast(List[Instance], instances)
+
+    def get_metadata(self) -> ScenarioMetadata:
+        if self.dataset == "humaneval":
+            return ScenarioMetadata(
+                name="code_humaneval",
+                display_name="HumanEval (Code)",
+                description="The HumanEval benchmark for measuring functional correctness for synthesizing "
+                "programs from docstrings [(Chen et al., "
+                "2021)](https://arxiv.org/pdf/2107.03374.pdf).",
+                taxonomy=TaxonomyInfo(task="?", what="n/a", when="n/a", who="n/a", language="synthetic"),
+                main_metric="pass",
+                main_split="test",
+            )
+        elif self.dataset == "apps":
+            return ScenarioMetadata(
+                name="code_apps",
+                display_name="APPS (Code)",
+                description="The APPS benchmark for measuring competence on code challenges [(Hendrycks et "
+                "al., "
+                "2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/c24cd76e1ce41366a4bbe8a49b02a028-Abstract-round2.html).",
+                taxonomy=TaxonomyInfo(task="?", what="n/a", when="n/a", who="n/a", language="synthetic"),
+                main_metric="test_avg",
+                main_split="test",
+            )
+        else:
+            raise Exception(f"Unknown dataset {self.dataset}")
helm/benchmark/scenarios/codeinsights_code_efficiency_scenario.py (new file)

@@ -0,0 +1,197 @@
+from helm.benchmark.scenarios.scenario import Scenario, Instance, Input, Output, Reference, VALID_SPLIT, CORRECT_TAG
+import pandas as pd
+import requests
+
+
+class CodeInsightsCodeEfficiencyScenario(Scenario):
+    name = "codeinsights_code_efficiency"
+    description = "Evaluate runtime efficiency alignment between LLM-generated code and student code"
+    tags = ["codeinsights", "c++", "code_efficiency"]
+
+    def __init__(self, num_testcases: int = 1):
+        super().__init__()
+        self.num_testcases = num_testcases
+
+    def get_instances(self, output_path: str):
+        df = pd.read_csv("https://huggingface.co/datasets/Kazchoko/my_dataset/resolve/main/Scenario4_data.csv")
+
+        # Load test cases (unit tests)
+        test_cases = self._load_test_cases()
+
+        # Get available question IDs with test cases
+        available_question_ids = set()
+        if test_cases:
+            available_question_ids = set(test_cases.keys())
+            print(f"Loaded test cases for {len(available_question_ids)} questions")
+        else:
+            print("WARNING: No test cases loaded!")
+            return []
+
+        instances = []
+        skipped_no_tests = 0
+        skipped_insufficient_data = 0
+
+        for student_id, student_df in df.groupby("student_id"):
+            student_df = student_df.sort_values("timestamp")
+            if len(student_df) < 4:
+                skipped_insufficient_data += 1
+                continue
+
+            first = student_df.iloc[0]
+            second = student_df.iloc[1]
+            third = student_df.iloc[2]
+            target = student_df.iloc[3]
+
+            # Check if target question has test cases BEFORE processing
+            target_question_id = target.get("question_unittest_id", None)
+            if not target_question_id or str(target_question_id) not in available_question_ids:
+                skipped_no_tests += 1
+                print(f"SKIPPING Student {student_id}, Question {target_question_id}: No test cases available")
+                continue
+
+            # Get test cases for this question (we know they exist now)
+            question_test_cases = []
+            tc_parsing_success = True
+
+            for testcase_str in target["question_unittests"].split("Unittest")[1:]:
+                testcase_str = testcase_str[testcase_str.find(":") + 1 :]
+                input_idx = testcase_str.find("Input:")
+                std_in_idx = testcase_str.find("STD input:")
+                output_idx = testcase_str.find("Output:")
+                if input_idx == -1 or std_in_idx == -1 or output_idx == -1:
+                    tc_parsing_success = False
+                    break
+
+                testcase = {
+                    "input": testcase_str[input_idx + 6 : std_in_idx].strip(),
+                    "std_in": testcase_str[std_in_idx + 10 : output_idx].strip(),
+                    "output": testcase_str[output_idx + 7 :].strip(),
+                }
+                question_test_cases.append(testcase)
+
+            if not tc_parsing_success:
+                print(f"SKIPPING Student {student_id}, Question {target_question_id}: Empty test cases")
+                continue
+
+            if len(question_test_cases) < self.num_testcases:
+                # If not enough test cases, skip this question
+                continue
+            if self.num_testcases >= 0:
+                # If more than one test case is requested, only take the first ones
+                question_test_cases = question_test_cases[: self.num_testcases]
+
+            # Get student pass pattern for the target question
+            student_correctness_pattern = target.get("pass", None)
+            if student_correctness_pattern is not None:
+                main_part = int(student_correctness_pattern)
+                # Convert each character to an int
+                student_correctness_list = [int(ch) for ch in str(main_part)]
+            else:
+                student_correctness_list = []
+
+            print(f"\n=== ACCEPTED INSTANCE: Student {student_id}, Question {target_question_id} ===")
+            print(f"Test cases loaded: {len(question_test_cases)}")
+            print(f"Student correctness pattern: {student_correctness_list}")
+            print(f"Question name: {target.get('question_name', 'MISSING')}")
+
+            prompt = (
+                f"Week: {target['week']}\n"
+                f"Topic: {target['topic']}\n\n"
+                "Example 1:\n"
+                f"Question: {first['question_name']} — {first['question_text']}\n"
+                "Template:\n"
+                f"{first['question_template']}\n"
+                "Your Code:\n"
+                f"{first['response']}\n\n"
+                "Example 2:\n"
+                f"Question: {second['question_name']} — {second['question_text']}\n"
+                "Template:\n"
+                f"{second['question_template']}\n"
+                "Your Code:\n"
+                f"{second['response']}\n\n"
+                "Example 3:\n"
+                f"Question: {third['question_name']} — {third['question_text']}\n"
+                "Template:\n"
+                f"{third['question_template']}\n"
+                "Your Code:\n"
+                f"{third['response']}\n\n"
+                "Now, using that same student's coding style, attempt this:\n"
+                "Ensure that the code works perfectly, but its efficiency should be based on students' past examples.\n"
+                "If a student has a tendency to write correct but inefficient code, imitate the inefficiency "
+                "but if they write efficiently, write efficiently too.\n"
+                f"Question: {target['question_name']} — {target['question_text']}\n\n"
+                f"Unit Test Input: {question_test_cases}\n\n"
+                if question_test_cases
+                else ""
+                "Template:\n"
+                f"{target['question_template']}\n\n"
+                "Provide ONLY your C++ implementation following the given template, where the answer will replace the {{ STUDENT_ANSWER }} block in the template. "
+                "DO NOT reproduce the template part as the generated code would be inserted to the template, "
+                "and make sure the code is compatible with the Unit Test Input. "
+                "int main() is always declared already so DO NOT produce that initialization on the code. "
+                "Ensure your code is correct, includes any class definition when needed, and handles all edge cases properly. "
+                "Return the code in C++ code block format, and nothing else, and produce only one set of code."
+            )
+
+            instances.append(
+                Instance(
+                    id=f"{student_id}_{target['question_unittest_id']}",
+                    input=Input(text=prompt),
+                    references=[Reference(output=Output(text=target["response"]), tags=[CORRECT_TAG])],
+                    extra_data={
+                        "question_template": target["question_template"],
+                        "test_cases": question_test_cases,
+                        "question_id": str(target_question_id),
+                        "question_name": target.get("question_name", ""),
+                        "student_id": str(student_id),
+                        "student_correctness_pattern": student_correctness_list,
+                    },
+                    split=VALID_SPLIT,
+                )
+            )
+
+        # Print summary statistics
+        print("\n=== INSTANCE CREATION SUMMARY ===")
+        print(f"Total instances created: {len(instances)}")
+        print(f"Skipped (insufficient data): {skipped_insufficient_data}")
+        print(f"Skipped (no test cases): {skipped_no_tests}")
+        print(f"Available test case question IDs: {len(available_question_ids)}")
+
+        if instances:
+            print("Sample created instances:")
+            for i, inst in enumerate(instances[:5]):
+                if inst.extra_data is None:
+                    test_count = 0
+                else:
+                    test_count = len(inst.extra_data.get("test_cases", []))
+                print(f" {inst.id}: {test_count} test cases")
+
+        return instances
+
+    def _load_test_cases(self):
+        """
+        Load test cases from external source or return None if not available.
+        This method should be implemented based on where your test cases are stored.
+
+        Expected format:
+        {
+            "question_id": [
+                {
+                    "unittest": "test_id",
+                    "input": "test input code",
+                    "output": "expected output"
+                },
+                ...
+            ],
+            ...
+        }
+        """
+        try:
+            response = requests.get(
+                "https://huggingface.co/datasets/Kazchoko/my_dataset/resolve/main/test_cases_by_qid.json"
+            )
+            if response.status_code == 200:
+                return response.json()
+        except Exception as e:
+            print(f"Failed to load test cases from URL: {e}")
+        return {}
helm/benchmark/scenarios/codeinsights_correct_code_scenario.py (new file)

@@ -0,0 +1,78 @@
+from helm.benchmark.scenarios.scenario import Scenario, Instance, Input, VALID_SPLIT
+import pandas as pd
+
+
+class CodeInsightsCorrectCodeScenario(Scenario):
+    name = "codeinsights_correct_code"
+    description = "Generate correct response code for C++ programming questions"
+    tags = ["codeinsights", "c++", "correct_code"]
+
+    def __init__(self, num_testcases: int = 1):
+        super().__init__()
+        self.num_testcases = num_testcases
+
+    def get_instances(self, output_path: str):
+        df = pd.read_csv("https://huggingface.co/datasets/Kazchoko/my_dataset/resolve/main/Scenario1_2_data.csv")
+
+        # Load test cases (unit tests)
+        instances = []
+        for question_id, question_df in df.groupby("question_unittest_id"):
+            target = question_df.iloc[0]
+            question_test_cases = []
+            tc_parsing_success = True
+
+            for testcase_str in target["question_unittests"].split("Unittest")[1:]:
+                testcase_str = testcase_str[testcase_str.find(":") + 1 :]
+                input_idx = testcase_str.find("Input:")
+                std_in_idx = testcase_str.find("STD input:")
+                output_idx = testcase_str.find("Output:")
+                if input_idx == -1 or std_in_idx == -1 or output_idx == -1:
+                    tc_parsing_success = False
+                    break
+
+                testcase = {
+                    "input": testcase_str[input_idx + 6 : std_in_idx].strip(),
+                    "std_in": testcase_str[std_in_idx + 10 : output_idx].strip(),
+                    "output": testcase_str[output_idx + 7 :].strip(),
+                }
+                question_test_cases.append(testcase)
+
+            if not tc_parsing_success:
+                continue
+
+            if len(question_test_cases) < self.num_testcases:
+                # If not enough test cases, skip this question
+                continue
+            if self.num_testcases >= 0:
+                # If more than one test case is requested, only take the first ones
+                question_test_cases = question_test_cases[: self.num_testcases]
+
+            prompt = (
+                f"Question: {target['question_name']} — {target['question_text']}\n\n"
+                f"Unit Test Input: {question_test_cases}\n\n"
+                if question_test_cases
+                else ""
+                "Template:\n"
+                f"{target['question_template']}\n\n"
+                "Provide ONLY your C++ implementation following the given template, where the answer will replace the {{ STUDENT_ANSWER }} block in the template. "
+                "DO NOT reproduce the template part as the generated code would be inserted to the template, "
+                "and make sure the code is compatible with the Unit Test Input. "
+                "int main() is always declared already so DO NOT produce that initialization on the code. "
+                "Ensure your code is correct, efficient, includes any class definition when needed, and handles all edge cases properly. "
+                "Return the code in C++ code block format, and nothing else."
+            )
+            instances.append(
+                Instance(
+                    id=f"{question_id}",
+                    input=Input(text=prompt),
+                    references=[],
+                    extra_data={
+                        "question_template": target["question_template"],
+                        "test_cases": question_test_cases,
+                        "question_id": str(question_id) if question_id else None,
+                        "question_name": target.get("question_name", ""),
+                    },
+                    split=VALID_SPLIT,
+                )
+            )
+        return instances
helm/benchmark/scenarios/codeinsights_edge_case_scenario.py (new file)

@@ -0,0 +1,192 @@
+from helm.benchmark.scenarios.scenario import Scenario, Instance, Input, Output, Reference, VALID_SPLIT, CORRECT_TAG
+import pandas as pd
+import requests
+
+
+class CodeInsightsEdgeCaseScenario(Scenario):
+    name = "codeinsights_edge_case"
+    description = "Evaluate alignment in edge case failure between LLM-generated code and student code"
+    tags = ["codeinsights", "c++", "edge_case"]
+
+    def __init__(self, num_testcases: int = 1):
+        super().__init__()
+        self.num_testcases = num_testcases
+
+    def get_instances(self, output_path: str):
+        df = pd.read_csv("https://huggingface.co/datasets/Kazchoko/my_dataset/resolve/main/Scenario5_data.csv")
+
+        student_topic = pd.read_csv(
+            "https://huggingface.co/datasets/Kazchoko/my_dataset/resolve/main/student_performace_by_topic.csv"
+        )
+
+        # Load test cases (unit tests)
+        test_cases = self._load_test_cases()
+
+        # Get available question IDs with test cases
+        available_question_ids = set()
+        if test_cases:
+            available_question_ids = set(test_cases.keys())
+            print(f"Loaded test cases for {len(available_question_ids)} questions")
+        else:
+            print("WARNING: No test cases loaded!")
+            return []
+
+        instances = []
+        skipped_no_tests = 0
+        skipped_insufficient_data = 0
+
+        for student_id, student_df in df.groupby("student_id"):
+            student_df = student_df.sort_values("timestamp")
+            target = student_df.iloc[0]
+
+            # Check if target question has test cases BEFORE processing
+            target_question_id = target.get("question_unittest_id", None)
+            if not target_question_id or str(target_question_id) not in available_question_ids:
+                skipped_no_tests += 1
+                print(f"SKIPPING Student {student_id}, Question {target_question_id}: No test cases available")
+                continue
+
+            # Get test cases for this question (we know they exist now)
+            target_test_cases = []
+            tc_parsing_success = True
+
+            for testcase_str in target["question_unittests"].split("Unittest")[1:]:
+                testcase_str = testcase_str[testcase_str.find(":") + 1 :]
+                input_idx = testcase_str.find("Input:")
+                std_in_idx = testcase_str.find("STD input:")
+                output_idx = testcase_str.find("Output:")
+                if input_idx == -1 or std_in_idx == -1 or output_idx == -1:
+                    tc_parsing_success = False
+                    break
+
+                testcase = {
+                    "input": testcase_str[input_idx + 6 : std_in_idx].strip(),
+                    "std_in": testcase_str[std_in_idx + 10 : output_idx].strip(),
+                    "output": testcase_str[output_idx + 7 :].strip(),
+                }
+                target_test_cases.append(testcase)
+
+            # Verify test cases are not empty
+            if not tc_parsing_success:
+                skipped_no_tests += 1
+                print(f"SKIPPING Student {student_id}, Question {target_question_id}: Empty test cases")
+                continue
+
+            if len(target_test_cases) < self.num_testcases:
+                # If not enough test cases, skip this question
+                continue
+            if self.num_testcases >= 0:
+                # If more than one test case is requested, only take the first ones
+                target_test_cases = target_test_cases[: self.num_testcases]
+
+            # Get student pass pattern for the target question
+            student_correctness_pattern = target.get("pass", None)
+            if student_correctness_pattern is not None:
+                main_part = int(student_correctness_pattern)
+                # Convert each character to an int
+                student_correctness_list = [int(ch) for ch in str(main_part)]
+            else:
+                student_correctness_list = []
+
+            # Student specific topic performance in previous attempts
+            student_level_prompt = f"Student {student_id} has the following performance across topics:\n"
+            topic_performance = student_topic[student_topic["student_id"] == student_id]
+            for _, row in topic_performance.iterrows():
+                topic = row["topic"]
+                pass_rate = round(row["pass_rate"], 2)
+                perfect = round(row["perfect"], 2)
+
+                student_level_prompt += (
+                    f"- For topic '{topic}', the unit test pass rate is {pass_rate}, "
+                    f"and the rate of passing all unit tests is {perfect}.\n"
+                )
+
+            print(f"\n=== ACCEPTED INSTANCE: Student {student_id}, Question {target_question_id} ===")
+            print(f"Test cases loaded: {len(target_test_cases)}")
+            print(f"Student correctness pattern: {student_correctness_list}")
+            print(f"Question name: {target.get('question_name', 'MISSING')}")
+
+            prompt = (
+                "You are analyzing a student’s likely mistakes on an upcoming programming problem.\n"
+                "Your task: **predict exactly ONE unit-test index (0-based) that the student is most likely to fail.**\n"  # noqa: E501
+                "Return *only* that integer. No explanation.\n\n"
+                "=== Student Profile ===\n"
+                f"{student_level_prompt}\n"
+                "For the given programming question, identify which unit test the student would fail considering "
+                "their past performance, as well as consideration of unit test difficulty.\n"
+                f"Week: {target['week']}\n"
+                f"Topic: {target['topic']}\n\n"
+                f"Question: {target['question_name']} — {target['question_text']}\n"
+                f"Unit Tests: {target_test_cases}\n"
+                if target_test_cases
+                else ""
+                "Think silently about:\n"
+                "• Which test seems hardest for the given topic?\n"
+                "• Where has the student historically struggled?\n"
+                "• Any edge-cases in the tests’ inputs/outputs?\n\n"
+                "******** ANSWER FORMAT (STRICT) ********\n"
+                "<integer>\n"
+                "****************************************"
+            )
+
+            instances.append(
+                Instance(
+                    id=f"{student_id}_{target['question_unittest_id']}",
+                    input=Input(text=prompt),
+                    references=[Reference(output=Output(text=target["response"]), tags=[CORRECT_TAG])],
+                    extra_data={
+                        "question_template": target["question_template"],
+                        "test_cases": target_test_cases,
+                        "question_id": str(target_question_id),
+                        "question_name": target.get("question_name", ""),
+                        "student_id": str(student_id),
+                        "student_correctness_pattern": student_correctness_list,
+                    },
+                    split=VALID_SPLIT,
+                )
+            )
+
+        # Print summary statistics
+        print("\n=== INSTANCE CREATION SUMMARY ===")
+        print(f"Skipped (insufficient data): {skipped_insufficient_data}")
+        print(f"Skipped (no test cases): {skipped_no_tests}")
+        print(f"Available test case question IDs: {len(available_question_ids)}")
+
+        if len(instances) >= 5:
+            print("Sample created instances:")
+            for i, inst in enumerate(instances[:5]):
+                if inst.extra_data is None:
+                    test_count = 0
+                else:
+                    test_count = len(inst.extra_data.get("test_cases", []))
+                print(f" {inst.id}: {test_count} test cases")
+
+        return instances
+
+    def _load_test_cases(self):
+        """
+        Load test cases from external source or return None if not available.
+        This method should be implemented based on where your test cases are stored.
+
+        Expected format:
+        {
+            "question_id": [
+                {
+                    "unittest": "test_id",
+                    "input": "test input code",
+                    "output": "expected output"
+                },
+                ...
+            ],
+            ...
+        }
+        """
+        try:
+            response = requests.get(
+                "https://huggingface.co/datasets/Kazchoko/my_dataset/resolve/main/test_cases_by_qid.json"
+            )
+            if response.status_code == 200:
+                return response.json()
+        except Exception as e:
+            print(f"Failed to load test cases from URL: {e}")
+        return {}
helm/benchmark/scenarios/codeinsights_student_coding_scenario.py (new file)

@@ -0,0 +1,162 @@
+from helm.benchmark.scenarios.scenario import Scenario, Instance, Input, Output, Reference, VALID_SPLIT, CORRECT_TAG
+import pandas as pd
+import requests
+
+
+class CodeInsightsStudentCodingScenario(Scenario):
+    name = "codeinsights_student_coding"
+    description = "Mimic student C++ style on foundational questions"
+    tags = ["codeinsights", "c++", "student_coding"]
+
+    def __init__(self, num_testcases: int = 1):
+        super().__init__()
+        self.num_testcases = num_testcases
+
+    def get_instances(self, output_path: str):
+        df = pd.read_csv("https://huggingface.co/datasets/Kazchoko/my_dataset/resolve/main/Scenario1_2_data.csv")
+        student_topic = pd.read_csv(
+            "https://huggingface.co/datasets/Kazchoko/my_dataset/resolve/main/student_performace_by_topic.csv"
+        )
+
+        instances = []
+        for student_id, student_df in df.groupby("student_id"):
+            student_df = student_df.sort_values("timestamp")
+            if len(student_df) < 4:
+                continue
+            first = student_df.iloc[0]
+            second = student_df.iloc[1]
+            third = student_df.iloc[2]
+            target = student_df.iloc[3]
+
+            # Get test cases for this question
+            question_id = target.get("question_unittest_id", None)
+            question_test_cases = []
+            tc_parsing_success = True
+
+            for testcase_str in target["question_unittests"].split("Unittest")[1:]:
+                testcase_str = testcase_str[testcase_str.find(":") + 1 :]
+                input_idx = testcase_str.find("Input:")
+                std_in_idx = testcase_str.find("STD input:")
+                output_idx = testcase_str.find("Output:")
+                if input_idx == -1 or std_in_idx == -1 or output_idx == -1:
+                    tc_parsing_success = False
+                    break
+
+                testcase = {
+                    "input": testcase_str[input_idx + 6 : std_in_idx].strip(),
+                    "std_in": testcase_str[std_in_idx + 10 : output_idx].strip(),
+                    "output": testcase_str[output_idx + 7 :].strip(),
+                }
+                question_test_cases.append(testcase)
+
+            if not tc_parsing_success:
+                continue
+
+            if len(question_test_cases) < self.num_testcases:
+                # If not enough test cases, skip this question
+                continue
+            if self.num_testcases >= 0:
+                # If more than one test case is requested, only take the first ones
+                question_test_cases = question_test_cases[: self.num_testcases]
+
+            # Get student pass (0 or 1) for the target question
+            student_correctness_pattern = target.get("pass", None)
+            main_part = int(student_correctness_pattern)  # "1111111111"
+            # Convert each character to an int
+            student_correctness_list = [int(ch) for ch in str(main_part)]  # [1,1,1,1,1,1,1,1,1,1]
+
+            # Student specific topic performance in previous attempts
+            student_level_prompt = f"Student {student_id} has the following performance across topics:\n"
+            topic_performance = student_topic[student_topic["student_id"] == student_id]
+            for _, row in topic_performance.iterrows():
+                topic = row["topic"]
+                pass_rate = round(row["pass_rate"], 2)
+                perfect = round(row["perfect"], 2)
+
+                student_level_prompt += (
+                    f"- For topic '{topic}', the unit test pass rate is {pass_rate}, "
+                    f"and the rate of passing all unit tests is {perfect}.\n"
+                )
+
+            prompt = (
+                "=== Student Profile ===\n"
+                f"{student_level_prompt}\n"
+                f"Week: {target['week']}\n"
+                f"Topic: {target['topic']}\n\n"
+                "Example 1:\n"
+                f"Question: {first['question_name']} — {first['question_text']}\n"
+                "Template:\n"
+                f"{first['question_template']}\n"
+                "Your Code:\n"
+                f"{first['response']}\n\n"
+                "Example 2:\n"
+                f"Question: {second['question_name']} — {second['question_text']}\n"
+                "Template:\n"
+                f"{second['question_template']}\n"
+                "Your Code:\n"
+                f"{second['response']}\n\n"
+                "Example 3:\n"
+                f"Question: {third['question_name']} — {third['question_text']}\n"
+                "Template:\n"
+                f"{third['question_template']}\n"
+                "Your Code:\n"
+                f"{third['response']}\n\n"
+                "Now, using that same student style, attempt this:\n"
+                f"Question: {target['question_name']} — {target['question_text']}\n"
+                f"Unit Test Input: {question_test_cases}\n\n"
+                if question_test_cases
+                else ""
+                "Template:\n"
+                f"{target['question_template']}\n\n"
+                "Provide ONLY your C++ implementation following the given template, where the answer will replace the {{ STUDENT_ANSWER }} block in the template. "
+                "DO NOT reproduce the template part as the generated code would be inserted to the template, "
+                "and make sure the code is compatible with the Unit Test Input. "
+                "int main() is always declared already so DO NOT produce that initialization on the code. "
+                "Ensure your code includes any class definition when needed. "
+                "Return the code in C++ code block format, and nothing else."
+            )
+            instances.append(
+                Instance(
+                    id=f"{student_id}_{target['question_unittest_id']}",
+                    input=Input(text=prompt),
+                    references=[Reference(output=Output(text=target["response"]), tags=[CORRECT_TAG])],
+                    extra_data={
+                        "question_template": target["question_template"],
+                        "test_cases": question_test_cases,
+                        "question_id": str(question_id) if question_id else None,
+                        "question_name": target.get("question_name", ""),
+                        "student_id": str(student_id),
+                        "student_correctness_pattern": student_correctness_list,
+                    },
+                    split=VALID_SPLIT,
+                )
+            )
+        return instances
+
+    def _load_test_cases(self):
+        """
+        Load test cases from external source or return None if not available.
+        This method should be implemented based on where your test cases are stored.
+
+        Expected format:
+        {
+            "question_id": [
+                {
+                    "unittest": "test_id",
+                    "input": "test input code",
+                    "output": "expected output"
+                },
+                ...
+            ],
+            ...
+        }
+        """
+        try:
+            response = requests.get(
+                "https://huggingface.co/datasets/Kazchoko/my_dataset/resolve/main/test_cases_by_qid.json"
+            )
+            if response.status_code == 200:
+                return response.json()
+        except Exception as e:
+            print(f"Failed to load test cases from URL: {e}")
+        return {}