crfm-helm 0.5.6__py3-none-any.whl → 0.5.8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.8.dist-info}/METADATA +60 -125
- {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.8.dist-info}/RECORD +293 -229
- helm/benchmark/adaptation/adapter_spec.py +5 -0
- helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +11 -3
- helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +11 -8
- helm/benchmark/annotation/aci_bench_annotator.py +11 -22
- helm/benchmark/annotation/air_bench_annotator.py +1 -1
- helm/benchmark/annotation/alrage_annotator.py +90 -0
- helm/benchmark/annotation/chw_care_plan_annotator.py +10 -21
- helm/benchmark/annotation/dischargeme_annotator.py +11 -22
- helm/benchmark/annotation/live_qa_annotator.py +1 -1
- helm/benchmark/annotation/med_dialog_annotator.py +11 -22
- helm/benchmark/annotation/medalign_annotator.py +11 -22
- helm/benchmark/annotation/medi_qa_annotator.py +11 -22
- helm/benchmark/annotation/medication_qa_annotator.py +11 -22
- helm/benchmark/annotation/mental_health_annotator.py +11 -22
- helm/benchmark/annotation/mimic_bhc_annotator.py +11 -22
- helm/benchmark/annotation/mimic_rrs_annotator.py +11 -22
- helm/benchmark/annotation/model_as_judge.py +23 -18
- helm/benchmark/annotation/mtsamples_procedures_annotator.py +11 -22
- helm/benchmark/annotation/mtsamples_replicate_annotator.py +11 -22
- helm/benchmark/annotation/starr_patient_instructions_annotator.py +11 -22
- helm/benchmark/metrics/air_bench_metrics.py +3157 -1
- helm/benchmark/metrics/alrage_metric.py +35 -0
- helm/benchmark/metrics/basic_metrics.py +267 -2
- helm/benchmark/metrics/classification_metrics.py +19 -1
- helm/benchmark/metrics/codeinsights_code_efficiency_metrics.py +186 -0
- helm/benchmark/metrics/codeinsights_code_evaluation_metrics.py +477 -0
- helm/benchmark/metrics/codeinsights_correct_code_metrics.py +366 -0
- helm/benchmark/metrics/codeinsights_edge_case_metrics.py +92 -0
- helm/benchmark/metrics/codeinsights_metric_specs.py +51 -0
- helm/benchmark/metrics/comet_metric.py +1 -1
- helm/benchmark/metrics/conv_fin_qa_calc_metrics.py +12 -1
- helm/benchmark/metrics/copyright_metrics.py +1 -1
- helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +1 -1
- helm/benchmark/metrics/dry_run_metrics.py +30 -1
- helm/benchmark/metrics/efficiency_metrics.py +74 -0
- helm/benchmark/metrics/ehr_sql_metrics.py +57 -1
- helm/benchmark/metrics/evaluate_reference_metrics.py +300 -1
- helm/benchmark/metrics/gpqa_chain_of_thought_metric.py +13 -1
- helm/benchmark/metrics/helpdesk_call_summarization_metrics.py +13 -1
- helm/benchmark/metrics/ifeval_metrics.py +13 -1
- helm/benchmark/metrics/image_generation/clip_score_metrics.py +13 -2
- helm/benchmark/metrics/image_generation/fractal_dimension/fractal_dimension_util.py +1 -1
- helm/benchmark/metrics/instruction_following_critique_metrics.py +41 -1
- helm/benchmark/metrics/kpi_edgar_metrics.py +21 -0
- helm/benchmark/metrics/language_modeling_metrics.py +13 -1
- helm/benchmark/metrics/live_qa_metrics.py +13 -1
- helm/benchmark/metrics/llm_jury_metrics.py +13 -1
- helm/benchmark/metrics/lmkt_metric_specs.py +12 -0
- helm/benchmark/metrics/lmkt_metrics.py +47 -0
- helm/benchmark/metrics/medcalc_bench_metrics.py +14 -1
- helm/benchmark/metrics/medec_metrics.py +25 -2
- helm/benchmark/metrics/melt_toxicity_metric.py +1 -1
- helm/benchmark/metrics/metric.py +25 -0
- helm/benchmark/metrics/mimiciv_billing_code_metrics.py +32 -1
- helm/benchmark/metrics/omni_math_metrics.py +13 -1
- helm/benchmark/metrics/seahelm_metrics.py +14 -1
- helm/benchmark/metrics/summac/model_summac.py +3 -3
- helm/benchmark/metrics/summarization_metrics.py +129 -1
- helm/benchmark/metrics/toxicity_metrics.py +31 -1
- helm/benchmark/metrics/wildbench_metrics.py +21 -1
- helm/benchmark/model_deployment_registry.py +11 -19
- helm/benchmark/presentation/create_plots.py +11 -2
- helm/benchmark/presentation/schema.py +10 -22
- helm/benchmark/presentation/summarize.py +189 -14
- helm/benchmark/presentation/taxonomy_info.py +20 -0
- helm/benchmark/presentation/test_create_plots.py +4 -1
- helm/benchmark/run.py +7 -1
- helm/benchmark/run_expander.py +4 -0
- helm/benchmark/run_specs/arabic_run_specs.py +191 -0
- helm/benchmark/run_specs/bluex_run_specs.py +40 -0
- helm/benchmark/run_specs/classic_run_specs.py +2 -55
- helm/benchmark/run_specs/codeinsights_run_specs.py +192 -0
- helm/benchmark/run_specs/healthqa_br_run_specs.py +40 -0
- helm/benchmark/run_specs/heim_run_specs.py +3 -1
- helm/benchmark/run_specs/lmkt_run_specs.py +144 -0
- helm/benchmark/run_specs/long_context_run_specs.py +48 -1
- helm/benchmark/run_specs/medhelm/__init__.py +0 -0
- helm/benchmark/run_specs/medhelm/benchmark_config.py +219 -0
- helm/benchmark/run_specs/medhelm_run_specs.py +360 -50
- helm/benchmark/run_specs/multilingual_run_specs.py +50 -0
- helm/benchmark/run_specs/speech_disorder_audio_run_specs.py +5 -11
- helm/benchmark/scenarios/aci_bench_scenario.py +23 -0
- helm/benchmark/scenarios/air_bench_scenario.py +21 -0
- helm/benchmark/scenarios/alghafa_scenario.py +126 -0
- helm/benchmark/scenarios/alrage_scenario.py +54 -0
- helm/benchmark/scenarios/anthropic_hh_rlhf_scenario.py +23 -1
- helm/benchmark/scenarios/arabic_exams_scenario.py +114 -0
- helm/benchmark/scenarios/arabic_mmlu_scenario.py +82 -0
- helm/benchmark/scenarios/aratrust_scenario.py +95 -0
- helm/benchmark/scenarios/audio_language/casual_conversations2_scenario.py +1 -1
- helm/benchmark/scenarios/audio_language/mustard_scenario.py +1 -1
- helm/benchmark/scenarios/audio_language/{ultra_suite_asr_classification.py → ultra_suite_asr_classification_scenario.py} +9 -8
- helm/benchmark/scenarios/audio_language/ultra_suite_asr_transcription_scenario.py +99 -0
- helm/benchmark/scenarios/audio_language/ultra_suite_classification_scenario.py +13 -5
- helm/benchmark/scenarios/audio_language/ultra_suite_disorder_breakdown_scenario.py +13 -5
- helm/benchmark/scenarios/audio_language/ultra_suite_disorder_symptoms_scenario.py +13 -5
- helm/benchmark/scenarios/babi_qa_scenario.py +15 -0
- helm/benchmark/scenarios/bbq_scenario.py +15 -0
- helm/benchmark/scenarios/best_chatgpt_prompts.yaml +473 -0
- helm/benchmark/scenarios/bluex_scenario.py +70 -0
- helm/benchmark/scenarios/bold_scenario.py +15 -0
- helm/benchmark/scenarios/boolq_scenario.py +20 -0
- helm/benchmark/scenarios/chw_care_plan_scenario.py +23 -0
- helm/benchmark/scenarios/civil_comments_scenario.py +13 -0
- helm/benchmark/scenarios/clear_scenario.py +23 -0
- helm/benchmark/scenarios/cleva_scenario.py +480 -1
- helm/benchmark/scenarios/code_scenario.py +28 -0
- helm/benchmark/scenarios/codeinsights_code_efficiency_scenario.py +197 -0
- helm/benchmark/scenarios/codeinsights_correct_code_scenario.py +78 -0
- helm/benchmark/scenarios/codeinsights_edge_case_scenario.py +192 -0
- helm/benchmark/scenarios/codeinsights_student_coding_scenario.py +162 -0
- helm/benchmark/scenarios/codeinsights_student_mistake_scenario.py +188 -0
- helm/benchmark/scenarios/commonsense_scenario.py +26 -0
- helm/benchmark/scenarios/compositional_instructions.yaml +70 -0
- helm/benchmark/scenarios/conv_fin_qa_calc_scenario.py +21 -0
- helm/benchmark/scenarios/copyright_scenario.py +35 -1
- helm/benchmark/scenarios/cti_to_mitre_scenario.py +21 -0
- helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +22 -1
- helm/benchmark/scenarios/decodingtrust_adv_robustness_scenario.py +23 -1
- helm/benchmark/scenarios/decodingtrust_fairness_scenario.py +22 -1
- helm/benchmark/scenarios/decodingtrust_machine_ethics_scenario.py +21 -1
- helm/benchmark/scenarios/decodingtrust_ood_robustness_scenario.py +13 -0
- helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +13 -1
- helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +13 -1
- helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +13 -1
- helm/benchmark/scenarios/dischargeme_scenario.py +24 -0
- helm/benchmark/scenarios/disinformation_scenario.py +22 -0
- helm/benchmark/scenarios/dyck_language_scenario.py +15 -0
- helm/benchmark/scenarios/ehrshot_scenario.py +22 -0
- helm/benchmark/scenarios/enem_challenge_scenario.py +19 -0
- helm/benchmark/scenarios/entity_data_imputation_scenario.py +14 -0
- helm/benchmark/scenarios/entity_matching_scenario.py +14 -0
- helm/benchmark/scenarios/exams_multilingual_scenario.py +115 -0
- helm/benchmark/scenarios/financial_phrasebank_scenario.py +21 -0
- helm/benchmark/scenarios/gold_commodity_news_scenario.py +21 -0
- helm/benchmark/scenarios/gpqa_scenario.py +18 -0
- helm/benchmark/scenarios/grammar_scenario.py +20 -1
- helm/benchmark/scenarios/gsm_scenario.py +15 -0
- helm/benchmark/scenarios/headqa_scenario.py +22 -0
- helm/benchmark/scenarios/healthqa_br_scenario.py +80 -0
- helm/benchmark/scenarios/helpdesk_call_summarization_scenario.py +13 -0
- helm/benchmark/scenarios/ice_scenario.py +21 -1
- helm/benchmark/scenarios/ifeval_scenario.py +18 -0
- helm/benchmark/scenarios/imdb_scenario.py +15 -0
- helm/benchmark/scenarios/infinite_bench_en_mc_scenario.py +90 -0
- helm/benchmark/scenarios/infinite_bench_en_qa_scenario.py +1 -1
- helm/benchmark/scenarios/koala_scenario.py +21 -1
- helm/benchmark/scenarios/kpi_edgar_scenario.py +21 -0
- helm/benchmark/scenarios/legal_contract_summarization_scenario.py +20 -0
- helm/benchmark/scenarios/legal_summarization_scenario.py +50 -0
- helm/benchmark/scenarios/legal_support_scenario.py +13 -0
- helm/benchmark/scenarios/legalbench_scenario.py +20 -0
- helm/benchmark/scenarios/lex_glue_scenario.py +11 -0
- helm/benchmark/scenarios/lextreme_scenario.py +11 -0
- helm/benchmark/scenarios/lmkt_scenarios.py +288 -0
- helm/benchmark/scenarios/lsat_qa_scenario.py +14 -0
- helm/benchmark/scenarios/madinah_qa_scenario.py +73 -0
- helm/benchmark/scenarios/math_scenario.py +47 -20
- helm/benchmark/scenarios/mbzuai_human_translated_arabic_mmlu.py +68 -0
- helm/benchmark/scenarios/med_dialog_scenario.py +32 -1
- helm/benchmark/scenarios/med_mcqa_scenario.py +14 -0
- helm/benchmark/scenarios/med_qa_scenario.py +14 -0
- helm/benchmark/scenarios/medalign_scenario.py +23 -0
- helm/benchmark/scenarios/medalign_scenario_helper.py +19 -125
- helm/benchmark/scenarios/medbullets_scenario.py +22 -0
- helm/benchmark/scenarios/medcalc_bench_scenario.py +22 -0
- helm/benchmark/scenarios/medec_scenario.py +23 -0
- helm/benchmark/scenarios/medhallu_scenario.py +23 -0
- helm/benchmark/scenarios/medhelm/__init__.py +0 -0
- helm/benchmark/scenarios/medhelm/judges.yaml +14 -0
- helm/benchmark/scenarios/medhelm_configurable_scenario.py +101 -0
- helm/benchmark/scenarios/medi_qa_scenario.py +23 -0
- helm/benchmark/scenarios/medication_qa_scenario.py +31 -1
- helm/benchmark/scenarios/melt_scenarios.py +2 -2
- helm/benchmark/scenarios/mental_health_scenario.py +23 -0
- helm/benchmark/scenarios/mimic_bhc_scenario.py +25 -1
- helm/benchmark/scenarios/mimic_rrs_scenario.py +23 -0
- helm/benchmark/scenarios/mimiciv_billing_code_scenario.py +22 -0
- helm/benchmark/scenarios/mmlu_pro_scenario.py +18 -0
- helm/benchmark/scenarios/mmlu_scenario.py +15 -0
- helm/benchmark/scenarios/mmmlu_scenario.py +85 -0
- helm/benchmark/scenarios/msmarco_scenario.py +30 -0
- helm/benchmark/scenarios/mtsamples_procedures_scenario.py +22 -0
- helm/benchmark/scenarios/mtsamples_replicate_scenario.py +22 -0
- helm/benchmark/scenarios/n2c2_ct_matching_scenario.py +20 -0
- helm/benchmark/scenarios/narrativeqa_scenario.py +20 -0
- helm/benchmark/scenarios/natural_qa_scenario.py +32 -0
- helm/benchmark/scenarios/omni_math_scenario.py +18 -0
- helm/benchmark/scenarios/open_assistant_scenario.py +22 -0
- helm/benchmark/scenarios/pubmed_qa_scenario.py +22 -0
- helm/benchmark/scenarios/quac_scenario.py +14 -0
- helm/benchmark/scenarios/race_based_med_scenario.py +23 -0
- helm/benchmark/scenarios/raft_scenario.py +15 -0
- helm/benchmark/scenarios/real_toxicity_prompts_scenario.py +14 -1
- helm/benchmark/scenarios/scenario.py +31 -0
- helm/benchmark/scenarios/seahelm_scenario.py +350 -2
- helm/benchmark/scenarios/self_instruct_scenario.py +29 -1
- helm/benchmark/scenarios/shc_bmt_scenario.py +22 -0
- helm/benchmark/scenarios/shc_cdi_scenario.py +20 -0
- helm/benchmark/scenarios/shc_conf_scenario.py +23 -0
- helm/benchmark/scenarios/shc_ent_scenario.py +21 -0
- helm/benchmark/scenarios/shc_gip_scenario.py +20 -0
- helm/benchmark/scenarios/shc_privacy_scenario.py +22 -0
- helm/benchmark/scenarios/shc_proxy_scenario.py +22 -0
- helm/benchmark/scenarios/shc_ptbm_scenario.py +23 -0
- helm/benchmark/scenarios/shc_sequoia_scenario.py +21 -0
- helm/benchmark/scenarios/situation_prompts.yaml +49 -0
- helm/benchmark/scenarios/starr_patient_instructions_scenario.py +22 -0
- helm/benchmark/scenarios/summarization_scenario.py +37 -0
- helm/benchmark/scenarios/synthetic_efficiency_scenario.py +22 -1
- helm/benchmark/scenarios/synthetic_reasoning_natural_scenario.py +13 -0
- helm/benchmark/scenarios/test_alghafa_scenario.py +29 -0
- helm/benchmark/scenarios/test_alrage_scenario.py +23 -0
- helm/benchmark/scenarios/test_arabic_exams_scenario.py +21 -0
- helm/benchmark/scenarios/test_aratrust_scenario.py +21 -0
- helm/benchmark/scenarios/test_bluex_scenario.py +59 -0
- helm/benchmark/scenarios/test_exams_multilingual_scenario.py +29 -0
- helm/benchmark/scenarios/test_healtha_br_scenario.py +57 -0
- helm/benchmark/scenarios/the_pile_scenario.py +13 -1
- helm/benchmark/scenarios/truthful_qa_scenario.py +14 -0
- helm/benchmark/scenarios/twitter_aae_scenario.py +20 -1
- helm/benchmark/scenarios/vicuna_scenario.py +21 -1
- helm/benchmark/scenarios/wikifact_scenario.py +20 -0
- helm/benchmark/scenarios/wildbench_scenario.py +18 -0
- helm/benchmark/scenarios/wmt_14_scenario.py +12 -0
- helm/benchmark/slurm_jobs.py +1 -2
- helm/benchmark/slurm_runner.py +8 -1
- helm/benchmark/static/schema_arabic.yaml +271 -0
- helm/benchmark/static/schema_classic.yaml +0 -17
- helm/benchmark/static/schema_long_context.yaml +24 -6
- helm/benchmark/static/schema_medhelm.yaml +36 -0
- helm/benchmark/static/schema_slp.yaml +219 -0
- helm/benchmark/static_build/assets/index-671a5e06.js +10 -0
- helm/benchmark/static_build/assets/index-9352595e.css +1 -0
- helm/benchmark/static_build/index.html +2 -2
- helm/benchmark/window_services/image_generation/clip_window_service.py +1 -3
- helm/clients/audio_language/llama_omni/arguments.py +61 -0
- helm/clients/audio_language/llama_omni/constants.py +9 -0
- helm/clients/audio_language/llama_omni/conversation.py +213 -0
- helm/clients/audio_language/llama_omni/model/__init__.py +0 -0
- helm/clients/audio_language/llama_omni/model/builder.py +88 -0
- helm/clients/audio_language/llama_omni/model/language_model/omni_speech2s_llama.py +190 -0
- helm/clients/audio_language/llama_omni/model/language_model/omni_speech_llama.py +118 -0
- helm/clients/audio_language/llama_omni/model/omni_speech_arch.py +249 -0
- helm/clients/audio_language/llama_omni/model/speech_encoder/builder.py +9 -0
- helm/clients/audio_language/llama_omni/model/speech_encoder/speech_encoder.py +27 -0
- helm/clients/audio_language/llama_omni/model/speech_generator/builder.py +9 -0
- helm/clients/audio_language/llama_omni/model/speech_generator/generation.py +622 -0
- helm/clients/audio_language/llama_omni/model/speech_generator/speech_generator.py +104 -0
- helm/clients/audio_language/llama_omni/model/speech_projector/builder.py +9 -0
- helm/clients/audio_language/llama_omni/model/speech_projector/speech_projector.py +27 -0
- helm/clients/audio_language/llama_omni/preprocess.py +295 -0
- helm/clients/audio_language/llama_omni/utils.py +202 -0
- helm/clients/audio_language/qwen2_5_omni_client.py +19 -7
- helm/clients/audio_language/qwen_omni/configuration_qwen2_5_omni.py +519 -0
- helm/clients/audio_language/qwen_omni/modeling_qwen2_5_omni.py +4308 -0
- helm/clients/audio_language/qwen_omni/processing_qwen2_5_omni.py +270 -0
- helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/__init__.py +0 -0
- helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/__init__.py +8 -0
- helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/audio_process.py +56 -0
- helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/vision_process.py +380 -0
- helm/clients/huggingface_client.py +2 -2
- helm/clients/image_generation/cogview2/sr_pipeline/dsr_model.py +1 -1
- helm/clients/image_generation/mindalle/models/stage1/layers.py +2 -2
- helm/clients/openai_client.py +33 -20
- helm/clients/openai_responses_client.py +34 -8
- helm/clients/openrouter_client.py +31 -0
- helm/clients/test_huggingface_client.py +3 -3
- helm/clients/test_openrouter_client.py +69 -0
- helm/clients/together_client.py +48 -13
- helm/clients/vertexai_client.py +19 -11
- helm/clients/vllm_client.py +43 -7
- helm/clients/vllm_granite_thinking_client.py +56 -0
- helm/common/critique_request.py +0 -1
- helm/common/hierarchical_logger.py +83 -34
- helm/common/object_spec.py +23 -8
- helm/common/test_logging.py +94 -0
- helm/config/model_deployments.yaml +525 -172
- helm/config/model_metadata.yaml +185 -10
- helm/config/tokenizer_configs.yaml +100 -2
- helm/proxy/cli.py +1 -1
- helm/proxy/example_queries.py +8 -8
- helm/proxy/retry.py +5 -0
- helm/proxy/server.py +2 -1
- helm/proxy/static/index.css +4 -0
- helm/proxy/static/index.js +7 -1
- helm/tokenizers/grok_tokenizer.py +2 -0
- helm/benchmark/metrics/aci_bench_metrics.py +0 -14
- helm/benchmark/metrics/chw_care_plan_metrics.py +0 -14
- helm/benchmark/metrics/dischargeme_metrics.py +0 -14
- helm/benchmark/metrics/med_dialog_metrics.py +0 -14
- helm/benchmark/metrics/medalign_metrics.py +0 -14
- helm/benchmark/metrics/medi_qa_metrics.py +0 -14
- helm/benchmark/metrics/medication_qa_metrics.py +0 -14
- helm/benchmark/metrics/mental_health_metrics.py +0 -14
- helm/benchmark/metrics/mimic_bhc_metrics.py +0 -14
- helm/benchmark/metrics/mimic_rrs_metrics.py +0 -14
- helm/benchmark/metrics/mtsamples_procedures_metrics.py +0 -14
- helm/benchmark/metrics/mtsamples_replicate_metrics.py +0 -14
- helm/benchmark/metrics/numeracy_metrics.py +0 -72
- helm/benchmark/metrics/starr_patient_instructions_metrics.py +0 -14
- helm/benchmark/metrics/test_numeracy_metrics.py +0 -95
- helm/benchmark/scenarios/numeracy_scenario.py +0 -794
- helm/benchmark/static_build/assets/index-94295e78.js +0 -10
- helm/benchmark/static_build/assets/index-b9779128.css +0 -1
- {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.8.dist-info}/WHEEL +0 -0
- {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.8.dist-info}/entry_points.txt +0 -0
- {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.8.dist-info}/licenses/LICENSE +0 -0
- {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.8.dist-info}/top_level.txt +0 -0
helm/benchmark/metrics/codeinsights_correct_code_metrics.py (new file)
@@ -0,0 +1,366 @@
+from typing import List
+import re
+import os
+import subprocess
+import tempfile
+import shutil
+
+from helm.benchmark.adaptation.adapter_spec import AdapterSpec
+from helm.benchmark.adaptation.request_state import RequestState
+from helm.benchmark.metrics.metric import Metric
+from helm.benchmark.metrics.metric_name import MetricName
+from helm.benchmark.metrics.metric_service import MetricService
+from helm.benchmark.metrics.statistic import Stat
+
+
+def compile_code(i, temp_dir, timeout=10):
+    """
+    Compiles the C++ file at temp_dir/tc_{i}.cpp and outputs to temp_dir/tc_{i}.out.
+
+    Args:
+        i (int): Index of the code to compile.
+        temp_dir (str): Temporary directory where the C++ files are located.
+
+    Returns:
+        str or None: Path to the executable if compilation succeeds, else None.
+    """
+    executable = os.path.join(temp_dir, f"tc_{i}.out")
+    cpp_file = os.path.join(temp_dir, f"tc_{i}.cpp")
+
+    try:
+        result = subprocess.run(
+            ["timeout", str(timeout), "g++", "-std=c++11", cpp_file, "-o", executable],
+            stdout=subprocess.PIPE,
+            stderr=subprocess.PIPE,
+            text=True,  # Optional: to get output as string
+            timeout=timeout + 2,  # Optional: timeout for compilation
+        )
+        if result.returncode != 0:
+            # print(f"Compilation failed for {cpp_file}:\n{result.stderr}")
+            return None
+        return executable
+    except Exception as e:
+        print(f"An error occurred while compiling {cpp_file}: {e}")
+        return None
+
+
+def parallel_compile(codes, temp_dir, timeout=10, max_workers=4):
+    """
+    Compiles multiple C++ codes in parallel.
+
+    Args:
+        codes (list): List of code snippets or identifiers.
+        temp_dir (str): Directory containing the C++ files.
+        max_workers (int): Maximum number of worker processes.
+
+    Returns:
+        list: List of paths to the compiled executables or None for failed compilations.
+    """
+    executables = []
+    for i in range(len(codes)):
+        executable = compile_code(i, temp_dir, timeout)
+        executables.append(executable)
+
+    return executables
+
+
+def run_executable(executable, std_in, timeout=10):
+    """
+    Runs an executable with a timeout and captures its output.
+
+    Args:
+        executable (str): Path to the executable to run.
+        timeout (int): Timeout for running the executable in seconds.
+
+    Returns:
+        tuple: (return_code, output) where return_code is 0 if successful, non-zero otherwise,
+            and output is the stdout captured from the execution.
+    """
+    if executable is None:
+        return (0, "")  # Return 0 and empty output for failed compilations
+
+    try:
+        result = subprocess.run(
+            ["timeout", str(timeout), executable],
+            input=std_in,
+            stdout=subprocess.PIPE,
+            stderr=subprocess.PIPE,
+            text=True,  # To decode stdout and stderr as strings
+            timeout=timeout + 2,  # Add a small buffer to the timeout
+        )
+        return (result.returncode, result.stdout)
+    except Exception as e:
+        print(f"An error occurred while running {executable}: {e}")
+        return (1, "")  # Non-zero return code for errors
+
+
+def parallel_run_executables(executables, std_inputs, timeout=10, max_workers=4):
+    """
+    Runs multiple executables in parallel with a timeout.
+
+    Args:
+        executables (list): List of paths to the executables.
+        max_workers (int): Maximum number of worker processes.
+
+    Returns:
+        list: List of results containing the outputs from running each executable.
+    """
+    results = []
+    for std_in, executable in zip(std_inputs, executables):
+        result_code, output = run_executable(executable, std_in, timeout)
+        results.append((result_code, output))
+
+    return results
+
+
+class CPPEvaluator:
+    def __init__(self, template, testcases, timeout=10, max_workers=8):
+        """Initializes the CPPEvaluator class.
+
+        Args:
+            template (str): The template code with placeholders for the student's answer and test cases.
+            testcases (Dict[str]): A list of test cases, each containing the input, output, and optional std_in.
+            max_workers (int, optional): The maximum number of workers to use for parallel processing. Defaults to 8.
+        """
+        self.template = template
+        self.testcases = testcases
+        self.timeout = timeout
+        self.max_workers = max_workers
+        self.formatted_testcases, self.std_inputs = self.format_testcases()
+
+    def format_testcases(self):
+        """Formats the test cases into the required format for the grading engine.
+
+        Returns:
+            Tuple[List[Dict[str]], List[str]]: A tuple containing the formatted test cases and standard inputs.
+        """
+        formatted_testcases = []
+        std_inputs = []
+        for testcase in self.testcases:
+            formatted_testcases.append(
+                {
+                    "extra": "",
+                    "testcode": testcase["input"],
+                    "expected_output": testcase["output"],
+                }
+            )
+            if "std_in" not in testcase:
+                std_inputs.append("")
+            else:
+                std_inputs.append(testcase["std_in"])
+        return formatted_testcases, std_inputs
+
+    def generate_code(self, student_answer):
+        """Generates the C++ code with the student's answer and test cases.
+
+        Args:
+            student_answer (str): The student's answer to be inserted into the template.
+
+        Returns:
+            List[str]: A list of C++ code snippets with the student's answer and test cases inserted.
+        """
+        # Insert the student's answer and test cases into the template
+        code = self.template.replace("{{ STUDENT_ANSWER }}", student_answer)
+
+        # Find the for loop in the template
+        start_index = code.find("{% for TEST in TESTCASES %}")
+        end_index = code.find("{% endfor %}") + len("{% endfor %}")
+
+        list_codes = []
+        for testcase in self.formatted_testcases:
+            # Insert the test case code into the template between the for loop
+            testcode = code[:start_index] + testcase["testcode"] + code[end_index:]
+            list_codes.append(testcode)
+
+        return list_codes
+
+    def write_and_compile_code(self, codes):
+        """Writes and compiles the C++ code.
+
+        Args:
+            codes (List[str]): A list of C++ code snippets.
+
+        Returns:
+            Tuple[List[str], str]: A tuple containing the list of executable paths and the temporary directory.
+        """
+        # Write the C++ code to a temporary file
+        temp_dir = tempfile.mkdtemp()
+        for i, code in enumerate(codes):
+            cpp_file = os.path.join(temp_dir, f"tc_{i}.cpp")
+            with open(cpp_file, "w") as file:
+                file.write(code)
+
+        # Compile the C++ code
+        executables = parallel_compile(codes, temp_dir, timeout=self.timeout, max_workers=self.max_workers)
+
+        return executables, temp_dir
+
+    def evaluate(self, student_answer):
+        """Evaluates the student's answer using the test cases.
+
+        Args:
+            student_answer (str): The student's answer to be evaluated.
+
+        Returns:
+            Dict[str, Union[float, List[int]]]: A dictionary containing the score and test case results.
+        """
+        # Generate the C++ code with the student's answer
+        codes = self.generate_code(student_answer)
+
+        # Write and compile the C++ code
+        executables, temp_dir = self.write_and_compile_code(codes)
+        list_result = []
+
+        executation_results = parallel_run_executables(
+            executables, self.std_inputs, timeout=self.timeout, max_workers=self.max_workers
+        )
+        for i, testcase in enumerate(self.testcases):
+            if executation_results[i][0] != 0:
+                list_result.append(0)
+                continue
+
+            expected_output = testcase["output"]
+            student_output = executation_results[i][1]
+            if expected_output.strip() != student_output.strip():
+                list_result.append(0)
+            else:
+                list_result.append(1)
+
+        # Delete the temporary directory
+        try:
+            shutil.rmtree(temp_dir)
+        except OSError as e:
+            print("Error: %s - %s." % (e.filename, e.strerror))
+
+        if len(list_result) == 0:
+            return {"score": 0, "testcases": list_result}
+
+        return {
+            "score": sum(list_result) / len(list_result),
+            "testcases": list_result,
+        }
+
+
+class CodeInsightsFunctionalCorrectnessMetric(Metric):
+    """
+    Metric for evaluating functional correctness of C++ code generation.
+
+    Measures each model's functional correctness by computing the proportion of problems
+    for which its generated code passes all provided unit tests. For every generated solution,
+    we compile the C++ code (using g++) and execute the full test cases. We record the
+    proportions of the unit test that passes for each problem and then take the mean across
+    all problems. This yields a score between 0 and 1, where 1 indicates the model produced
+    flawless codes, and lower values reveal the fraction of tasks it could not solve all
+    the unit test cases.
+    """
+
+    def __init__(self, timeout: int = 10, max_workers: int = 8):
+        """
+        Initializes the CodeInsightsFunctionalCorrectnessMetric.
+
+        Args:
+            timeout (int): Timeout for each test case execution.
+            max_workers (int): Maximum number of workers for parallel processing.
+        """
+        super().__init__()
+        self.timeout = timeout
+        self.max_workers = max_workers
+
+    def evaluate_generation(
+        self,
+        adapter_spec: AdapterSpec,
+        request_state: RequestState,
+        metric_service: MetricService,
+        eval_cache_path: str,
+    ) -> List[Stat]:
+        """
+        Evaluate LLM-generated code by running unit tests and computing pass rate.
+
+        Returns:
+            List of Stat objects containing the functional correctness score
+        """
+        print("\n=== FUNCTIONAL CORRECTNESS METRIC DEBUG ===")
+        print(f"Instance ID: {getattr(request_state.instance, 'id', 'UNKNOWN')}")
+
+        # Get the generated code from the request state
+        if not request_state.result or not request_state.result.completions:
+            print("ERROR: No output generated")
+            return self._create_failure_stats("No output generated")
+
+        generated_code = request_state.result.completions[0].text.strip()
+        generated_code = self._extract_student_code(generated_code)
+        print(f"Generated code length: {len(generated_code)}")
+        print(f"Generated code preview: {generated_code[:200]}...")
+
+        # Get test cases from instance extra_data
+        if not hasattr(request_state.instance, "extra_data") or not request_state.instance.extra_data:
+            print("ERROR: No extra_data available")
+            print(f"Instance attributes: {dir(request_state.instance)}")
+            return self._create_failure_stats("No test data available")
+
+        extra_data = request_state.instance.extra_data
+        print(f"Extra data keys: {list(extra_data.keys())}")
+
+        test_cases = extra_data.get("test_cases", [])
+        question_template = extra_data.get("question_template", "")
+        question_name = extra_data.get("question_name", "UNKNOWN")
+
+        print(f"Question name: {question_name}")
+        print(f"Number of test cases: {len(test_cases)}")
+        print(f"Template length: {len(question_template)}")
+
+        if not test_cases:
+            print("ERROR: No test cases available")
+            return self._create_failure_stats("No test cases available")
+
+        print(f"First test case preview: {test_cases[0] if test_cases else 'NONE'}")
+
+        # Run unit tests and calculate pass rate
+        evaluator = CPPEvaluator(question_template, test_cases, timeout=self.timeout, max_workers=self.max_workers)
+        pass_rate = evaluator.evaluate(generated_code)["score"]
+
+        print(f"Final pass rate: {pass_rate}")
+        print("=== END DEBUG ===\n")
+
+        return [Stat(MetricName("functional_correctness")).add(pass_rate)]
+
+    def _extract_student_code(self, model_code: str) -> str:
+        """
+        Extracts clean C++ code from model output:
+        - Trims preambles
+        - Removes student's main()
+        """
+        code_blocks = re.findall(r"```(?:c\+\+)?\n(.*?)```", model_code, flags=re.DOTALL)
+        if code_blocks:
+            model_code = code_blocks[0].strip()  # Use the first code block
+            print("[Markdown extraction] Used fenced code blocks.")
+
+        # Post-processing
+        # Comment out as a testing - 7/3/2025
+        lines = model_code.strip().splitlines()
+        start_keywords = ("#include", "using namespace")
+        for i, line in enumerate(lines):
+            if any(line.strip().startswith(k) for k in start_keywords):
+                lines[i] = ""
+        code = "\n".join(lines).strip()
+        if "int main" in code:
+            code = code.split("int main")[0].strip()
+
+        # --- Final touch ---
+        if "print(" in code and "void print()" not in code and "print()" not in code:
+            print("⚠️ WARNING: `print()` is called in test input but not defined.")
+
+        return code
+
+    def _create_failure_stats(self, error_message: str) -> List[Stat]:
+        """
+        Create default statistics for failure cases.
+
+        Args:
+            error_message: Description of the failure
+
+        Returns:
+            List containing a single Stat with 0.0 functional correctness score
+        """
+        print(f"METRIC FAILURE: {error_message}")
+        return [Stat(MetricName("functional_correctness")).add(0.0)]
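For orientation (not part of the package diff): a minimal sketch of how the CPPEvaluator above could be driven directly, assuming the module path shown in the diff and a Linux host with g++ and the coreutils timeout binary on PATH. The template, test cases, and add() answer below are made-up illustrations of the CodeRunner-style placeholders the class expects.

# Hypothetical usage sketch; template and test cases are invented for illustration.
from helm.benchmark.metrics.codeinsights_correct_code_metrics import CPPEvaluator

template = """
#include <iostream>
using namespace std;

{{ STUDENT_ANSWER }}

int main() {
    {% for TEST in TESTCASES %}{% endfor %}
    return 0;
}
"""

testcases = [
    {"input": "cout << add(1, 2) << endl;", "output": "3"},
    {"input": "cout << add(-1, 1) << endl;", "output": "0", "std_in": ""},
]

evaluator = CPPEvaluator(template, testcases, timeout=10, max_workers=1)
result = evaluator.evaluate("int add(int a, int b) { return a + b; }")
print(result)  # e.g. {"score": 1.0, "testcases": [1, 1]}

Each test case is spliced into the template in place of the {% for %}...{% endfor %} block, compiled with g++, run against the optional std_in, and scored by comparing stripped stdout to the expected output.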
helm/benchmark/metrics/codeinsights_edge_case_metrics.py (new file)
@@ -0,0 +1,92 @@
+from typing import List
+import re
+
+from helm.benchmark.adaptation.adapter_spec import AdapterSpec
+from helm.benchmark.adaptation.request_state import RequestState
+from helm.benchmark.metrics.metric import Metric
+from helm.benchmark.metrics.metric_name import MetricName
+from helm.benchmark.metrics.metric_service import MetricService
+from helm.benchmark.metrics.statistic import Stat
+from helm.benchmark.metrics.codeinsights_code_evaluation_metrics import CodeInsightsCodeEvaluationMetric
+
+
+class UnittestAlignmentMetric(Metric):
+    """
+    Compare LLM unit-test results with the student’s correctness pattern.
+
+    Adds:
+        • functional_correctness (pass-rate)
+        • edge_case_slip_match (binary 0/1)
+    """
+
+    # ------------------------------------------------------------------ I#
+    # HELM entry-point                                                    #
+    # ------------------------------------------------------------------ #
+    def evaluate_generation(  # HELM entry-point
+        self,
+        adapter_spec: AdapterSpec,
+        request_state: RequestState,
+        metric_service: MetricService,
+        eval_cache_path: str,
+    ) -> List[Stat]:
+        # ------------------------------------------------------------------
+        # 1. Parse the model’s answer --------------------------------------
+        # ------------------------------------------------------------------
+        default_stat = Stat(MetricName("unittest_alignment")).add(0.0)
+
+        if not request_state.result or not request_state.result.completions:
+            # No output → automatic miss
+            return [default_stat]
+
+        raw_output: str = request_state.result.completions[0].text.strip()
+
+        # Extract the *first* integer we see (robust to whitespace / newlines)
+        match = re.search(r"-?\d+", raw_output)
+        if match is None:
+            # Model didn’t emit an integer → miss
+            return [default_stat]
+
+        try:
+            predicted_index: int = int(match.group())
+        except ValueError:
+            # Shouldn’t happen, but be safe
+            return [default_stat]
+
+        # ------------------------------------------------------------------
+        # 2. Retrieve ground-truth failure index ---------------------------
+        # ------------------------------------------------------------------
+        extra = getattr(request_state.instance, "extra_data", {}) or {}
+        correctness_pattern: List[int] = extra.get("student_correctness_pattern", [])
+
+        # Indices where the student failed (value == 0)
+        failed_indices: List[int] = [i for i, v in enumerate(correctness_pattern) if v == 0]
+
+        # If we don’t have exactly one failing test, treat as miss
+        if len(failed_indices) != 1:
+            return [default_stat]
+
+        actual_index: int = failed_indices[0]
+
+        # ------------------------------------------------------------------
+        # 3. Compare & return ---------------------------------------------
+        # ------------------------------------------------------------------
+        alignment_score = 1.0 if predicted_index == actual_index else 0.0
+        return [Stat(MetricName("unittest_alignment")).add(alignment_score)]
+
+
+class CodeInsightsUnittestAlignmentMetric(CodeInsightsCodeEvaluationMetric):
+    """unit-test alignment (with new metrics)."""
+
+    def __init__(self, use_codebert: bool = True):
+        super().__init__(use_codebert=use_codebert)
+        self.alignment_metric = UnittestAlignmentMetric()
+
+    def evaluate_generation(
+        self,
+        adapter_spec: AdapterSpec,
+        request_state: RequestState,
+        metric_service: MetricService,
+        eval_cache_path: str,
+    ):
+        stats = self.alignment_metric.evaluate_generation(adapter_spec, request_state, metric_service, eval_cache_path)
+        return stats
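To make the alignment scoring above concrete: the metric takes the first integer in the model's completion as the predicted failing-test index and scores 1.0 only when it matches the index of the single test the student failed. A self-contained illustration with made-up values (the real metric reads these from the RequestState and its instance extra_data):

# Illustration only: the index comparison performed by UnittestAlignmentMetric.
import re

raw_output = "The student's code should fail on test 2."
student_correctness_pattern = [1, 1, 0, 1]  # exactly one failing test, at index 2

match = re.search(r"-?\d+", raw_output)                 # first integer in the model output
predicted_index = int(match.group()) if match else None

failed_indices = [i for i, v in enumerate(student_correctness_pattern) if v == 0]
actual_index = failed_indices[0] if len(failed_indices) == 1 else None

alignment_score = 1.0 if predicted_index is not None and predicted_index == actual_index else 0.0
print(alignment_score)  # 1.0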
helm/benchmark/metrics/codeinsights_metric_specs.py (new file)
@@ -0,0 +1,51 @@
+from typing import List
+from helm.benchmark.metrics.metric import MetricSpec
+
+
+def get_functional_correctness_metric_specs() -> List[MetricSpec]:
+    return [
+        MetricSpec(
+            class_name="helm.benchmark.metrics.codeinsights_correct_code_metrics.CodeInsightsFunctionalCorrectnessMetric",  # noqa: E501
+            args={"timeout": 10, "max_workers": 1},
+        )
+    ]
+
+
+def get_comprehensive_code_evaluation_metric_specs(use_codebert: bool = True) -> List[MetricSpec]:
+    return [
+        MetricSpec(
+            class_name="helm.benchmark.metrics.codeinsights_code_evaluation_metrics.CodeInsightsComprehensiveCodeEvaluationMetric",  # noqa: E501
+            args={"use_codebert": use_codebert},
+        )
+    ]
+
+
+def get_code_efficiency_metric_specs(
+    num_runtime_runs: int = 5,
+    timeout_seconds: int = 10,
+    use_codebert: bool = True,  # ➊ add arg if you wish
+):
+    return [
+        MetricSpec(  # existing metric → runtime & correctness
+            class_name="helm.benchmark.metrics.codeinsights_code_efficiency_metrics.CodeInsightsCodeEfficiencyMetric",
+            args={
+                "num_runtime_runs": num_runtime_runs,
+                "timeout_seconds": timeout_seconds,
+            },
+        ),
+        MetricSpec(  # ➋ NEW metric → AST + CodeBERT
+            class_name="helm.benchmark.metrics.codeinsights_code_evaluation_metrics.CodeInsightsCodeEvaluationMetric",
+            args={"use_codebert": use_codebert},
+        ),
+    ]
+
+
+def get_edge_case_metric_specs(
+    use_codebert: bool = True,
+) -> List[MetricSpec]:
+    return [
+        MetricSpec(
+            class_name="helm.benchmark.metrics.codeinsights_edge_case_metrics.CodeInsightsUnittestAlignmentMetric",  # noqa: E501
+            args={"use_codebert": use_codebert},
+        )
+    ]
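These helpers simply bundle MetricSpec objects for the CodeInsights scenarios; the run specs in codeinsights_run_specs.py are the intended consumer. A brief, illustrative sketch of calling them and inspecting the result (the particular combination below is only an example, not taken from the package):

# Illustrative only: inspecting the MetricSpec objects the helpers return.
from helm.benchmark.metrics.codeinsights_metric_specs import (
    get_edge_case_metric_specs,
    get_functional_correctness_metric_specs,
)

metric_specs = get_functional_correctness_metric_specs() + get_edge_case_metric_specs(use_codebert=False)
for spec in metric_specs:
    print(spec.class_name, spec.args)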
helm/benchmark/metrics/comet_metric.py
@@ -16,7 +16,7 @@ from helm.common.request import RequestResult
 
 
 class CometMetric(Metric):
-    """COMET machine translation metric using a regression model.
+    r"""COMET machine translation metric using a regression model.
     The model takes a triplet of source sentence, translation, and reference
     and computes a score in the range [0, 1] reflecting the quality of the predicted
     translation.
helm/benchmark/metrics/conv_fin_qa_calc_metrics.py
@@ -3,7 +3,7 @@ from typing import Any, List
 
 from helm.benchmark.adaptation.adapter_spec import AdapterSpec
 from helm.benchmark.adaptation.request_state import RequestState
-from helm.benchmark.metrics.metric import Metric
+from helm.benchmark.metrics.metric import Metric, MetricMetadata
 from helm.benchmark.metrics.metric_name import MetricName
 from helm.benchmark.metrics.metric_service import MetricService
 from helm.benchmark.metrics.statistic import Stat
@@ -70,3 +70,14 @@ class ConvFinQACalcMetric(Metric):
         return [
             Stat(MetricName("float_equiv")).add(float_equiv(model_answer, gold_answer)),
         ]
+
+    def get_metadata(self) -> List[MetricMetadata]:
+        return [
+            MetricMetadata(
+                name="float_equiv",
+                display_name="Float Equivalence",
+                description="Float Equivalence",
+                lower_is_better=False,
+                group=None,
+            ),
+        ]
helm/benchmark/metrics/copyright_metrics.py
@@ -25,7 +25,7 @@ def _longest_common_prefix_length(s1: np.ndarray, s2: np.ndarray, previous_best:
     min_len = min(len(s1), len(s2))
     s1, s2 = s1[:min_len], s2[:min_len]
     (nonzeros,) = np.cumprod(s1 == s2).nonzero()  # Get indices (inclusive) up to which s1 and s2 are the same.
-    result = np.max(nonzeros) + 1 if len(nonzeros) > 0 else 0
+    result = np.max(nonzeros).item() + 1 if len(nonzeros) > 0 else 0
     return result if previous_best is None else max(previous_best, result)
 
 
helm/benchmark/metrics/dry_run_metrics.py
@@ -8,7 +8,7 @@ from helm.benchmark.adaptation.request_state import RequestState
 from helm.benchmark.metrics.statistic import Stat, merge_stat
 from helm.benchmark.window_services.window_service import WindowService
 from helm.benchmark.window_services.window_service_factory import WindowServiceFactory
-from helm.benchmark.metrics.metric import MetricInterface, MetricResult, PerInstanceStats
+from helm.benchmark.metrics.metric import MetricInterface, MetricMetadata, MetricResult, PerInstanceStats
 from helm.benchmark.metrics.metric_name import MetricName
 from helm.benchmark.metrics.metric_service import MetricService
 from helm.benchmark.metrics.tokens.auto_token_cost_estimator import AutoTokenCostEstimator
@@ -93,3 +93,32 @@ class DryRunMetric(MetricInterface):
         merge_stat(stats, Stat(MetricName("num_requests")).add(len(scenario_state.request_states)))
 
         return MetricResult(list(stats.values()), per_instance_stats)
+
+    def get_metadata(self) -> List[MetricMetadata]:
+        return [
+            MetricMetadata(
+                name="estimated_num_tokens_cost",
+                display_name="cost",
+                short_display_name=None,
+                description="An estimate of the number of tokens (including prompt and output completions) needed to "
+                "perform the request.",
+                lower_is_better=None,
+                group=None,
+            ),
+            MetricMetadata(
+                name="num_completions",
+                display_name="# completions",
+                short_display_name=None,
+                description="Number of completions.",
+                lower_is_better=None,
+                group=None,
+            ),
+            MetricMetadata(
+                name="num_prompt_tokens",
+                display_name="# prompt tokens",
+                short_display_name=None,
+                description="Number of tokens in the prompt.",
+                lower_is_better=None,
+                group="general_information",
+            ),
+        ]