crfm-helm 0.5.6__py3-none-any.whl → 0.5.8__py3-none-any.whl

This diff shows the changes between publicly released versions of the package, as they appear in their respective public registries. It is provided for informational purposes only.

Potentially problematic release: this version of crfm-helm might be problematic.

Files changed (311)
  1. {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.8.dist-info}/METADATA +60 -125
  2. {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.8.dist-info}/RECORD +293 -229
  3. helm/benchmark/adaptation/adapter_spec.py +5 -0
  4. helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +11 -3
  5. helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +11 -8
  6. helm/benchmark/annotation/aci_bench_annotator.py +11 -22
  7. helm/benchmark/annotation/air_bench_annotator.py +1 -1
  8. helm/benchmark/annotation/alrage_annotator.py +90 -0
  9. helm/benchmark/annotation/chw_care_plan_annotator.py +10 -21
  10. helm/benchmark/annotation/dischargeme_annotator.py +11 -22
  11. helm/benchmark/annotation/live_qa_annotator.py +1 -1
  12. helm/benchmark/annotation/med_dialog_annotator.py +11 -22
  13. helm/benchmark/annotation/medalign_annotator.py +11 -22
  14. helm/benchmark/annotation/medi_qa_annotator.py +11 -22
  15. helm/benchmark/annotation/medication_qa_annotator.py +11 -22
  16. helm/benchmark/annotation/mental_health_annotator.py +11 -22
  17. helm/benchmark/annotation/mimic_bhc_annotator.py +11 -22
  18. helm/benchmark/annotation/mimic_rrs_annotator.py +11 -22
  19. helm/benchmark/annotation/model_as_judge.py +23 -18
  20. helm/benchmark/annotation/mtsamples_procedures_annotator.py +11 -22
  21. helm/benchmark/annotation/mtsamples_replicate_annotator.py +11 -22
  22. helm/benchmark/annotation/starr_patient_instructions_annotator.py +11 -22
  23. helm/benchmark/metrics/air_bench_metrics.py +3157 -1
  24. helm/benchmark/metrics/alrage_metric.py +35 -0
  25. helm/benchmark/metrics/basic_metrics.py +267 -2
  26. helm/benchmark/metrics/classification_metrics.py +19 -1
  27. helm/benchmark/metrics/codeinsights_code_efficiency_metrics.py +186 -0
  28. helm/benchmark/metrics/codeinsights_code_evaluation_metrics.py +477 -0
  29. helm/benchmark/metrics/codeinsights_correct_code_metrics.py +366 -0
  30. helm/benchmark/metrics/codeinsights_edge_case_metrics.py +92 -0
  31. helm/benchmark/metrics/codeinsights_metric_specs.py +51 -0
  32. helm/benchmark/metrics/comet_metric.py +1 -1
  33. helm/benchmark/metrics/conv_fin_qa_calc_metrics.py +12 -1
  34. helm/benchmark/metrics/copyright_metrics.py +1 -1
  35. helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +1 -1
  36. helm/benchmark/metrics/dry_run_metrics.py +30 -1
  37. helm/benchmark/metrics/efficiency_metrics.py +74 -0
  38. helm/benchmark/metrics/ehr_sql_metrics.py +57 -1
  39. helm/benchmark/metrics/evaluate_reference_metrics.py +300 -1
  40. helm/benchmark/metrics/gpqa_chain_of_thought_metric.py +13 -1
  41. helm/benchmark/metrics/helpdesk_call_summarization_metrics.py +13 -1
  42. helm/benchmark/metrics/ifeval_metrics.py +13 -1
  43. helm/benchmark/metrics/image_generation/clip_score_metrics.py +13 -2
  44. helm/benchmark/metrics/image_generation/fractal_dimension/fractal_dimension_util.py +1 -1
  45. helm/benchmark/metrics/instruction_following_critique_metrics.py +41 -1
  46. helm/benchmark/metrics/kpi_edgar_metrics.py +21 -0
  47. helm/benchmark/metrics/language_modeling_metrics.py +13 -1
  48. helm/benchmark/metrics/live_qa_metrics.py +13 -1
  49. helm/benchmark/metrics/llm_jury_metrics.py +13 -1
  50. helm/benchmark/metrics/lmkt_metric_specs.py +12 -0
  51. helm/benchmark/metrics/lmkt_metrics.py +47 -0
  52. helm/benchmark/metrics/medcalc_bench_metrics.py +14 -1
  53. helm/benchmark/metrics/medec_metrics.py +25 -2
  54. helm/benchmark/metrics/melt_toxicity_metric.py +1 -1
  55. helm/benchmark/metrics/metric.py +25 -0
  56. helm/benchmark/metrics/mimiciv_billing_code_metrics.py +32 -1
  57. helm/benchmark/metrics/omni_math_metrics.py +13 -1
  58. helm/benchmark/metrics/seahelm_metrics.py +14 -1
  59. helm/benchmark/metrics/summac/model_summac.py +3 -3
  60. helm/benchmark/metrics/summarization_metrics.py +129 -1
  61. helm/benchmark/metrics/toxicity_metrics.py +31 -1
  62. helm/benchmark/metrics/wildbench_metrics.py +21 -1
  63. helm/benchmark/model_deployment_registry.py +11 -19
  64. helm/benchmark/presentation/create_plots.py +11 -2
  65. helm/benchmark/presentation/schema.py +10 -22
  66. helm/benchmark/presentation/summarize.py +189 -14
  67. helm/benchmark/presentation/taxonomy_info.py +20 -0
  68. helm/benchmark/presentation/test_create_plots.py +4 -1
  69. helm/benchmark/run.py +7 -1
  70. helm/benchmark/run_expander.py +4 -0
  71. helm/benchmark/run_specs/arabic_run_specs.py +191 -0
  72. helm/benchmark/run_specs/bluex_run_specs.py +40 -0
  73. helm/benchmark/run_specs/classic_run_specs.py +2 -55
  74. helm/benchmark/run_specs/codeinsights_run_specs.py +192 -0
  75. helm/benchmark/run_specs/healthqa_br_run_specs.py +40 -0
  76. helm/benchmark/run_specs/heim_run_specs.py +3 -1
  77. helm/benchmark/run_specs/lmkt_run_specs.py +144 -0
  78. helm/benchmark/run_specs/long_context_run_specs.py +48 -1
  79. helm/benchmark/run_specs/medhelm/__init__.py +0 -0
  80. helm/benchmark/run_specs/medhelm/benchmark_config.py +219 -0
  81. helm/benchmark/run_specs/medhelm_run_specs.py +360 -50
  82. helm/benchmark/run_specs/multilingual_run_specs.py +50 -0
  83. helm/benchmark/run_specs/speech_disorder_audio_run_specs.py +5 -11
  84. helm/benchmark/scenarios/aci_bench_scenario.py +23 -0
  85. helm/benchmark/scenarios/air_bench_scenario.py +21 -0
  86. helm/benchmark/scenarios/alghafa_scenario.py +126 -0
  87. helm/benchmark/scenarios/alrage_scenario.py +54 -0
  88. helm/benchmark/scenarios/anthropic_hh_rlhf_scenario.py +23 -1
  89. helm/benchmark/scenarios/arabic_exams_scenario.py +114 -0
  90. helm/benchmark/scenarios/arabic_mmlu_scenario.py +82 -0
  91. helm/benchmark/scenarios/aratrust_scenario.py +95 -0
  92. helm/benchmark/scenarios/audio_language/casual_conversations2_scenario.py +1 -1
  93. helm/benchmark/scenarios/audio_language/mustard_scenario.py +1 -1
  94. helm/benchmark/scenarios/audio_language/{ultra_suite_asr_classification.py → ultra_suite_asr_classification_scenario.py} +9 -8
  95. helm/benchmark/scenarios/audio_language/ultra_suite_asr_transcription_scenario.py +99 -0
  96. helm/benchmark/scenarios/audio_language/ultra_suite_classification_scenario.py +13 -5
  97. helm/benchmark/scenarios/audio_language/ultra_suite_disorder_breakdown_scenario.py +13 -5
  98. helm/benchmark/scenarios/audio_language/ultra_suite_disorder_symptoms_scenario.py +13 -5
  99. helm/benchmark/scenarios/babi_qa_scenario.py +15 -0
  100. helm/benchmark/scenarios/bbq_scenario.py +15 -0
  101. helm/benchmark/scenarios/best_chatgpt_prompts.yaml +473 -0
  102. helm/benchmark/scenarios/bluex_scenario.py +70 -0
  103. helm/benchmark/scenarios/bold_scenario.py +15 -0
  104. helm/benchmark/scenarios/boolq_scenario.py +20 -0
  105. helm/benchmark/scenarios/chw_care_plan_scenario.py +23 -0
  106. helm/benchmark/scenarios/civil_comments_scenario.py +13 -0
  107. helm/benchmark/scenarios/clear_scenario.py +23 -0
  108. helm/benchmark/scenarios/cleva_scenario.py +480 -1
  109. helm/benchmark/scenarios/code_scenario.py +28 -0
  110. helm/benchmark/scenarios/codeinsights_code_efficiency_scenario.py +197 -0
  111. helm/benchmark/scenarios/codeinsights_correct_code_scenario.py +78 -0
  112. helm/benchmark/scenarios/codeinsights_edge_case_scenario.py +192 -0
  113. helm/benchmark/scenarios/codeinsights_student_coding_scenario.py +162 -0
  114. helm/benchmark/scenarios/codeinsights_student_mistake_scenario.py +188 -0
  115. helm/benchmark/scenarios/commonsense_scenario.py +26 -0
  116. helm/benchmark/scenarios/compositional_instructions.yaml +70 -0
  117. helm/benchmark/scenarios/conv_fin_qa_calc_scenario.py +21 -0
  118. helm/benchmark/scenarios/copyright_scenario.py +35 -1
  119. helm/benchmark/scenarios/cti_to_mitre_scenario.py +21 -0
  120. helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +22 -1
  121. helm/benchmark/scenarios/decodingtrust_adv_robustness_scenario.py +23 -1
  122. helm/benchmark/scenarios/decodingtrust_fairness_scenario.py +22 -1
  123. helm/benchmark/scenarios/decodingtrust_machine_ethics_scenario.py +21 -1
  124. helm/benchmark/scenarios/decodingtrust_ood_robustness_scenario.py +13 -0
  125. helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +13 -1
  126. helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +13 -1
  127. helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +13 -1
  128. helm/benchmark/scenarios/dischargeme_scenario.py +24 -0
  129. helm/benchmark/scenarios/disinformation_scenario.py +22 -0
  130. helm/benchmark/scenarios/dyck_language_scenario.py +15 -0
  131. helm/benchmark/scenarios/ehrshot_scenario.py +22 -0
  132. helm/benchmark/scenarios/enem_challenge_scenario.py +19 -0
  133. helm/benchmark/scenarios/entity_data_imputation_scenario.py +14 -0
  134. helm/benchmark/scenarios/entity_matching_scenario.py +14 -0
  135. helm/benchmark/scenarios/exams_multilingual_scenario.py +115 -0
  136. helm/benchmark/scenarios/financial_phrasebank_scenario.py +21 -0
  137. helm/benchmark/scenarios/gold_commodity_news_scenario.py +21 -0
  138. helm/benchmark/scenarios/gpqa_scenario.py +18 -0
  139. helm/benchmark/scenarios/grammar_scenario.py +20 -1
  140. helm/benchmark/scenarios/gsm_scenario.py +15 -0
  141. helm/benchmark/scenarios/headqa_scenario.py +22 -0
  142. helm/benchmark/scenarios/healthqa_br_scenario.py +80 -0
  143. helm/benchmark/scenarios/helpdesk_call_summarization_scenario.py +13 -0
  144. helm/benchmark/scenarios/ice_scenario.py +21 -1
  145. helm/benchmark/scenarios/ifeval_scenario.py +18 -0
  146. helm/benchmark/scenarios/imdb_scenario.py +15 -0
  147. helm/benchmark/scenarios/infinite_bench_en_mc_scenario.py +90 -0
  148. helm/benchmark/scenarios/infinite_bench_en_qa_scenario.py +1 -1
  149. helm/benchmark/scenarios/koala_scenario.py +21 -1
  150. helm/benchmark/scenarios/kpi_edgar_scenario.py +21 -0
  151. helm/benchmark/scenarios/legal_contract_summarization_scenario.py +20 -0
  152. helm/benchmark/scenarios/legal_summarization_scenario.py +50 -0
  153. helm/benchmark/scenarios/legal_support_scenario.py +13 -0
  154. helm/benchmark/scenarios/legalbench_scenario.py +20 -0
  155. helm/benchmark/scenarios/lex_glue_scenario.py +11 -0
  156. helm/benchmark/scenarios/lextreme_scenario.py +11 -0
  157. helm/benchmark/scenarios/lmkt_scenarios.py +288 -0
  158. helm/benchmark/scenarios/lsat_qa_scenario.py +14 -0
  159. helm/benchmark/scenarios/madinah_qa_scenario.py +73 -0
  160. helm/benchmark/scenarios/math_scenario.py +47 -20
  161. helm/benchmark/scenarios/mbzuai_human_translated_arabic_mmlu.py +68 -0
  162. helm/benchmark/scenarios/med_dialog_scenario.py +32 -1
  163. helm/benchmark/scenarios/med_mcqa_scenario.py +14 -0
  164. helm/benchmark/scenarios/med_qa_scenario.py +14 -0
  165. helm/benchmark/scenarios/medalign_scenario.py +23 -0
  166. helm/benchmark/scenarios/medalign_scenario_helper.py +19 -125
  167. helm/benchmark/scenarios/medbullets_scenario.py +22 -0
  168. helm/benchmark/scenarios/medcalc_bench_scenario.py +22 -0
  169. helm/benchmark/scenarios/medec_scenario.py +23 -0
  170. helm/benchmark/scenarios/medhallu_scenario.py +23 -0
  171. helm/benchmark/scenarios/medhelm/__init__.py +0 -0
  172. helm/benchmark/scenarios/medhelm/judges.yaml +14 -0
  173. helm/benchmark/scenarios/medhelm_configurable_scenario.py +101 -0
  174. helm/benchmark/scenarios/medi_qa_scenario.py +23 -0
  175. helm/benchmark/scenarios/medication_qa_scenario.py +31 -1
  176. helm/benchmark/scenarios/melt_scenarios.py +2 -2
  177. helm/benchmark/scenarios/mental_health_scenario.py +23 -0
  178. helm/benchmark/scenarios/mimic_bhc_scenario.py +25 -1
  179. helm/benchmark/scenarios/mimic_rrs_scenario.py +23 -0
  180. helm/benchmark/scenarios/mimiciv_billing_code_scenario.py +22 -0
  181. helm/benchmark/scenarios/mmlu_pro_scenario.py +18 -0
  182. helm/benchmark/scenarios/mmlu_scenario.py +15 -0
  183. helm/benchmark/scenarios/mmmlu_scenario.py +85 -0
  184. helm/benchmark/scenarios/msmarco_scenario.py +30 -0
  185. helm/benchmark/scenarios/mtsamples_procedures_scenario.py +22 -0
  186. helm/benchmark/scenarios/mtsamples_replicate_scenario.py +22 -0
  187. helm/benchmark/scenarios/n2c2_ct_matching_scenario.py +20 -0
  188. helm/benchmark/scenarios/narrativeqa_scenario.py +20 -0
  189. helm/benchmark/scenarios/natural_qa_scenario.py +32 -0
  190. helm/benchmark/scenarios/omni_math_scenario.py +18 -0
  191. helm/benchmark/scenarios/open_assistant_scenario.py +22 -0
  192. helm/benchmark/scenarios/pubmed_qa_scenario.py +22 -0
  193. helm/benchmark/scenarios/quac_scenario.py +14 -0
  194. helm/benchmark/scenarios/race_based_med_scenario.py +23 -0
  195. helm/benchmark/scenarios/raft_scenario.py +15 -0
  196. helm/benchmark/scenarios/real_toxicity_prompts_scenario.py +14 -1
  197. helm/benchmark/scenarios/scenario.py +31 -0
  198. helm/benchmark/scenarios/seahelm_scenario.py +350 -2
  199. helm/benchmark/scenarios/self_instruct_scenario.py +29 -1
  200. helm/benchmark/scenarios/shc_bmt_scenario.py +22 -0
  201. helm/benchmark/scenarios/shc_cdi_scenario.py +20 -0
  202. helm/benchmark/scenarios/shc_conf_scenario.py +23 -0
  203. helm/benchmark/scenarios/shc_ent_scenario.py +21 -0
  204. helm/benchmark/scenarios/shc_gip_scenario.py +20 -0
  205. helm/benchmark/scenarios/shc_privacy_scenario.py +22 -0
  206. helm/benchmark/scenarios/shc_proxy_scenario.py +22 -0
  207. helm/benchmark/scenarios/shc_ptbm_scenario.py +23 -0
  208. helm/benchmark/scenarios/shc_sequoia_scenario.py +21 -0
  209. helm/benchmark/scenarios/situation_prompts.yaml +49 -0
  210. helm/benchmark/scenarios/starr_patient_instructions_scenario.py +22 -0
  211. helm/benchmark/scenarios/summarization_scenario.py +37 -0
  212. helm/benchmark/scenarios/synthetic_efficiency_scenario.py +22 -1
  213. helm/benchmark/scenarios/synthetic_reasoning_natural_scenario.py +13 -0
  214. helm/benchmark/scenarios/test_alghafa_scenario.py +29 -0
  215. helm/benchmark/scenarios/test_alrage_scenario.py +23 -0
  216. helm/benchmark/scenarios/test_arabic_exams_scenario.py +21 -0
  217. helm/benchmark/scenarios/test_aratrust_scenario.py +21 -0
  218. helm/benchmark/scenarios/test_bluex_scenario.py +59 -0
  219. helm/benchmark/scenarios/test_exams_multilingual_scenario.py +29 -0
  220. helm/benchmark/scenarios/test_healtha_br_scenario.py +57 -0
  221. helm/benchmark/scenarios/the_pile_scenario.py +13 -1
  222. helm/benchmark/scenarios/truthful_qa_scenario.py +14 -0
  223. helm/benchmark/scenarios/twitter_aae_scenario.py +20 -1
  224. helm/benchmark/scenarios/vicuna_scenario.py +21 -1
  225. helm/benchmark/scenarios/wikifact_scenario.py +20 -0
  226. helm/benchmark/scenarios/wildbench_scenario.py +18 -0
  227. helm/benchmark/scenarios/wmt_14_scenario.py +12 -0
  228. helm/benchmark/slurm_jobs.py +1 -2
  229. helm/benchmark/slurm_runner.py +8 -1
  230. helm/benchmark/static/schema_arabic.yaml +271 -0
  231. helm/benchmark/static/schema_classic.yaml +0 -17
  232. helm/benchmark/static/schema_long_context.yaml +24 -6
  233. helm/benchmark/static/schema_medhelm.yaml +36 -0
  234. helm/benchmark/static/schema_slp.yaml +219 -0
  235. helm/benchmark/static_build/assets/index-671a5e06.js +10 -0
  236. helm/benchmark/static_build/assets/index-9352595e.css +1 -0
  237. helm/benchmark/static_build/index.html +2 -2
  238. helm/benchmark/window_services/image_generation/clip_window_service.py +1 -3
  239. helm/clients/audio_language/llama_omni/arguments.py +61 -0
  240. helm/clients/audio_language/llama_omni/constants.py +9 -0
  241. helm/clients/audio_language/llama_omni/conversation.py +213 -0
  242. helm/clients/audio_language/llama_omni/model/__init__.py +0 -0
  243. helm/clients/audio_language/llama_omni/model/builder.py +88 -0
  244. helm/clients/audio_language/llama_omni/model/language_model/omni_speech2s_llama.py +190 -0
  245. helm/clients/audio_language/llama_omni/model/language_model/omni_speech_llama.py +118 -0
  246. helm/clients/audio_language/llama_omni/model/omni_speech_arch.py +249 -0
  247. helm/clients/audio_language/llama_omni/model/speech_encoder/builder.py +9 -0
  248. helm/clients/audio_language/llama_omni/model/speech_encoder/speech_encoder.py +27 -0
  249. helm/clients/audio_language/llama_omni/model/speech_generator/builder.py +9 -0
  250. helm/clients/audio_language/llama_omni/model/speech_generator/generation.py +622 -0
  251. helm/clients/audio_language/llama_omni/model/speech_generator/speech_generator.py +104 -0
  252. helm/clients/audio_language/llama_omni/model/speech_projector/builder.py +9 -0
  253. helm/clients/audio_language/llama_omni/model/speech_projector/speech_projector.py +27 -0
  254. helm/clients/audio_language/llama_omni/preprocess.py +295 -0
  255. helm/clients/audio_language/llama_omni/utils.py +202 -0
  256. helm/clients/audio_language/qwen2_5_omni_client.py +19 -7
  257. helm/clients/audio_language/qwen_omni/configuration_qwen2_5_omni.py +519 -0
  258. helm/clients/audio_language/qwen_omni/modeling_qwen2_5_omni.py +4308 -0
  259. helm/clients/audio_language/qwen_omni/processing_qwen2_5_omni.py +270 -0
  260. helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/__init__.py +0 -0
  261. helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/__init__.py +8 -0
  262. helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/audio_process.py +56 -0
  263. helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/vision_process.py +380 -0
  264. helm/clients/huggingface_client.py +2 -2
  265. helm/clients/image_generation/cogview2/sr_pipeline/dsr_model.py +1 -1
  266. helm/clients/image_generation/mindalle/models/stage1/layers.py +2 -2
  267. helm/clients/openai_client.py +33 -20
  268. helm/clients/openai_responses_client.py +34 -8
  269. helm/clients/openrouter_client.py +31 -0
  270. helm/clients/test_huggingface_client.py +3 -3
  271. helm/clients/test_openrouter_client.py +69 -0
  272. helm/clients/together_client.py +48 -13
  273. helm/clients/vertexai_client.py +19 -11
  274. helm/clients/vllm_client.py +43 -7
  275. helm/clients/vllm_granite_thinking_client.py +56 -0
  276. helm/common/critique_request.py +0 -1
  277. helm/common/hierarchical_logger.py +83 -34
  278. helm/common/object_spec.py +23 -8
  279. helm/common/test_logging.py +94 -0
  280. helm/config/model_deployments.yaml +525 -172
  281. helm/config/model_metadata.yaml +185 -10
  282. helm/config/tokenizer_configs.yaml +100 -2
  283. helm/proxy/cli.py +1 -1
  284. helm/proxy/example_queries.py +8 -8
  285. helm/proxy/retry.py +5 -0
  286. helm/proxy/server.py +2 -1
  287. helm/proxy/static/index.css +4 -0
  288. helm/proxy/static/index.js +7 -1
  289. helm/tokenizers/grok_tokenizer.py +2 -0
  290. helm/benchmark/metrics/aci_bench_metrics.py +0 -14
  291. helm/benchmark/metrics/chw_care_plan_metrics.py +0 -14
  292. helm/benchmark/metrics/dischargeme_metrics.py +0 -14
  293. helm/benchmark/metrics/med_dialog_metrics.py +0 -14
  294. helm/benchmark/metrics/medalign_metrics.py +0 -14
  295. helm/benchmark/metrics/medi_qa_metrics.py +0 -14
  296. helm/benchmark/metrics/medication_qa_metrics.py +0 -14
  297. helm/benchmark/metrics/mental_health_metrics.py +0 -14
  298. helm/benchmark/metrics/mimic_bhc_metrics.py +0 -14
  299. helm/benchmark/metrics/mimic_rrs_metrics.py +0 -14
  300. helm/benchmark/metrics/mtsamples_procedures_metrics.py +0 -14
  301. helm/benchmark/metrics/mtsamples_replicate_metrics.py +0 -14
  302. helm/benchmark/metrics/numeracy_metrics.py +0 -72
  303. helm/benchmark/metrics/starr_patient_instructions_metrics.py +0 -14
  304. helm/benchmark/metrics/test_numeracy_metrics.py +0 -95
  305. helm/benchmark/scenarios/numeracy_scenario.py +0 -794
  306. helm/benchmark/static_build/assets/index-94295e78.js +0 -10
  307. helm/benchmark/static_build/assets/index-b9779128.css +0 -1
  308. {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.8.dist-info}/WHEEL +0 -0
  309. {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.8.dist-info}/entry_points.txt +0 -0
  310. {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.8.dist-info}/licenses/LICENSE +0 -0
  311. {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.8.dist-info}/top_level.txt +0 -0

helm/benchmark/scenarios/med_qa_scenario.py
@@ -2,6 +2,7 @@ import json
 import os
 from typing import Dict, List
 
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.common.general import ensure_file_downloaded
 from helm.benchmark.scenarios.scenario import (
     Scenario,
@@ -12,6 +13,7 @@ from helm.benchmark.scenarios.scenario import (
     VALID_SPLIT,
     Input,
     Output,
+    ScenarioMetadata,
 )
 
 
@@ -103,3 +105,15 @@ class MedQAScenario(Scenario):
             instances.append(instance)
 
         return instances
+
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name="med_qa",
+            display_name="MedQA",
+            description="MedQA is an open domain question answering dataset composed of questions from "
+            "professional medical board exams ([Jin et al. "
+            "2020](https://arxiv.org/pdf/2009.13081.pdf)).",
+            taxonomy=TaxonomyInfo(task="question answering", what="n/a", when="n/a", who="n/a", language="English"),
+            main_metric="quasi_exact_match",
+            main_split="test",
+        )

helm/benchmark/scenarios/medalign_scenario.py
@@ -1,5 +1,6 @@
 from typing import List
 
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.benchmark.scenarios.scenario import (
     Scenario,
     Instance,
@@ -8,6 +9,7 @@ from helm.benchmark.scenarios.scenario import (
     CORRECT_TAG,
     PassageQuestionInput,
     Output,
+    ScenarioMetadata,
 )
 from helm.benchmark.scenarios.medalign_scenario_helper import return_dataset_dataframe  # type: ignore
 
@@ -92,3 +94,24 @@ class MedalignScenario(Scenario):
     def get_instances(self, output_path: str) -> List[Instance]:
         dataset = return_dataset_dataframe(self.max_length, self.data_path)
         return self.process_tsv(dataset)
+
+    def get_metadata(self):
+        return ScenarioMetadata(
+            name="medalign",
+            display_name="MedAlign",
+            short_display_name="MedAlign",
+            description="MedAlign is a benchmark that evaluates a model's ability to interpret and "
+            "follow instructions grounded in longitudinal electronic health records (EHR). "
+            "Each instance includes an event-stream style patient record and a natural "
+            "language question or task, requiring clinically informed reading comprehension "
+            "and reasoning [(Fleming et al., 2023)](https://arxiv.org/abs/2308.14089).",
+            taxonomy=TaxonomyInfo(
+                task="Text generation",
+                what="Answer questions and follow instructions over longitudinal EHR",
+                when="Any",
+                who="Clinician, Researcher",
+                language="English",
+            ),
+            main_metric="medalign_accuracy",
+            main_split="test",
+        )

helm/benchmark/scenarios/medalign_scenario_helper.py
@@ -2,22 +2,13 @@
 # type: ignore
 # fmt: off
 
-import ast
-import datetime
 import transformers
-import langchain
-import langchain.prompts
-import lxml.etree
 import os
 import pandas as pd
-import re
 import tiktoken
 
-from langchain_community.retrievers import BM25Retriever
 from tqdm import tqdm
-from typing import Any, Dict, Optional, Union, Callable
-from langchain.schema import Document
-import langchain_community
+from typing import Any, Dict, Optional, Callable
 
 from helm.common.general import check_file_exists
 
@@ -167,102 +158,13 @@ def get_tokenizer(tokenizer_name: str) -> Callable:
     return transformers.AutoTokenizer.from_pretrained(tokenizer_name, legacy=False)
 
 
-def retrieve_most_relevant_visits(ehr_visit_strs, query, target_length, tokenizer):
-    """
-    Retrieve and filter relevant EHR visits based on a query and target length.
-
-    This function retrieves electronic health record (EHR) visit strings, sorts them
-    by relevance using the BM25Retriever, and constructs a list of final documents
-    that fit within a specified character length. The final list ensures that the
-    most important visit isn't cut off and is sorted chronologically.
-
-    Parameters:
-        ehr_visit_strs (list of str): List of EHR visit strings.
-        query (str): Query string to retrieve relevant visits.
-        target_length (int): Maximum total token count for the final list of documents.
-        tokenizer (Callable): Tokenizer that converts text to tokens (used for tracking context length)
-
-    Returns:
-        list[str]: List of EHR visit strings sorted chronologically and constrained by the target length.
-    """
-    ehr_visits=re.split(r'(?=</encounter>\n)',ehr_visit_strs)
-    langchain_docs = [
-        langchain.schema.Document(page_content=doc) for doc in ehr_visits  #broken since ehr_visit_strs is one string of all visits
-    ]
-    # `k` is the number of documents to retrieve
-    # We retrieve everything and just use the BM25Retriever to sort the documents
-    retriever = langchain_community.retrievers.BM25Retriever.from_documents(
-        langchain_docs, k=len(langchain_docs)
-    )
-
-    # Invoking the retriever means the most relevant documents are sorted first
-    sorted_docs = retriever.invoke(query)
-
-    # Define the regex pattern to find the start time
-    # pattern = r'start="([\d/]+ [\d:]+)"'
-    pattern = r'start="([\d/]+ [\d:]+ ?[APM]{0,2})"'
-
-    docs = []
-    dts = []
-
-    # Find the startime of the document
-    for doc in sorted_docs:
-        doc_content = doc.page_content
-        start_dt_match = re.search(pattern, doc_content)
-        if start_dt_match:
-            start_dt = start_dt_match.group(1)
-            parsed = False
-            # Try different date formats
-            for fmt in (
-                "%m/%d/%y %I:%M %p",
-                "%m/%d/%Y %I:%M %p",
-                "%m/%d/%y %H:%M",
-                "%m/%d/%Y %H:%M",
-            ):
-                try:
-                    dts.append(datetime.datetime.strptime(start_dt, fmt))
-                    parsed = True
-                    break
-                except ValueError:
-                    continue
-            if not parsed:
-                print(f"Error parsing date: {start_dt}")
-                continue
-        else:
-            print(f"Start time not found., {doc_content}")
-            dts.append(datetime.datetime.min)
-        docs.append(doc_content)
-
-    final_docs = []
-    current_length = 0
-
-    # Add documents until we exceed the allocated context length
-    for i in range(len(docs)):
-        doc_content = docs[i]
-        doc_length = len(tokenizer.encode(doc_content))
-        final_docs.append((dts[i], doc_content))
-        current_length += doc_length
-        if current_length > target_length:
-            break
-
-    # Sort final_docs chronologically
-    final_docs.sort(key=lambda x: x[0])
-
-    # Extract only the document content for the final output
-    final_docs_content = [doc_content for _, doc_content in final_docs]
-
-    return final_docs_content
-
-
-
 def pack_and_trim_prompts(
     instructions: Dict[int, Dict[str, str]],
     ehrs: Dict[int, str],
-    prompt_template: langchain.prompts.PromptTemplate,
+    prompt_string: str,
     context_length: int,
     generation_length: int,
     tokenizer: Any,
-    use_RAG: bool = True,
     verbose: bool = False,
     include_ehr: bool = True,
 ) -> Dict[int, str]:
@@ -276,26 +178,15 @@ def pack_and_trim_prompts(
         patient_id = int(instructions[instruction_id]["patient_id"])
         relevant_ehr = ehrs[patient_id]
 
-        # Calculate how many tokens of EHR we can include in the prompt
         num_tokens_instruction = len(tokenizer.encode(instruction))
-        num_tokens_prompt_template = len(tokenizer.encode(prompt_template.template))
+        num_tokens_prompt_template = len(tokenizer.encode(prompt_string))
         if include_ehr:
             target_ehr_length = context_length - generation_length - num_tokens_prompt_template - num_tokens_instruction
         else:
             target_ehr_length = 0
         if target_ehr_length <= 0:
-            prompt_with_truncated_ehr = prompt_template.format(question=instruction, ehr="")
+            prompt_with_truncated_ehr = prompt_string.format(question=instruction, ehr="")
         else:
-            if use_RAG:
-                # Return a list of the most relevant visit strings
-                most_relevant_visits = retrieve_most_relevant_visits(
-                    ehr_visit_strs=relevant_ehr,
-                    query=instruction,
-                    target_length=target_ehr_length,
-                    tokenizer=tokenizer,
-                )
-                relevant_ehr = "\n".join(most_relevant_visits)
-
             # Do a first pass with a fast tokenizer
             fast_tokenizer = tiktoken.get_encoding("cl100k_base")
             fast_encoded = fast_tokenizer.encode(relevant_ehr)
@@ -307,13 +198,17 @@ def pack_and_trim_prompts(
                 encoded_ehr = tokenizer.encode(fast_truncated_ehr)
                 truncated_encoded_ehr = encoded_ehr[-target_ehr_length:]
                 truncated_ehr = tokenizer.decode(truncated_encoded_ehr)
-                prompt_with_truncated_ehr = prompt_template.format(question=instruction, ehr=truncated_ehr)
+                prompt_with_truncated_ehr = prompt_string.format(question=instruction, ehr=truncated_ehr)
+            else:
+                # If the fast encoding is still too long, just use the full EHR up to allowed length
+                truncated_ehr = fast_tokenizer.decode(fast_encoded[-target_ehr_length:])
+                prompt_with_truncated_ehr = prompt_string.format(question=instruction, ehr=truncated_ehr)
 
-            prompts_map[instruction_id] = prompt_with_truncated_ehr
+        prompts_map[instruction_id] = prompt_with_truncated_ehr
 
-            if verbose:
-                print(prompt_with_truncated_ehr)
-                print("~" * 20)
+        if verbose:
+            print(prompt_with_truncated_ehr)
+            print("~" * 20)
     return prompts_map
 
 
@@ -322,7 +217,6 @@ def preprocess_prompts(
     generation_length,
     path_to_instructions,
     path_to_ehrs,
-    use_RAG,
    include_ehr,
     tokenizer,
     codes_only=False,
@@ -347,16 +241,18 @@
 
     # CONSTRUCT & TRUNCATE PROMPTS #
     print("Constructing prompts using instructions and EHRs...")
-    prompt_string="Instruction: Answer the following question based on the EHR:\n\nEHR: {ehr}\n\nQuestion: {question}\n\nAnswer:"
-    prompt_template = langchain.prompts.PromptTemplate.from_template(prompt_string)
+    prompt_string = (
+        "Instruction: Answer the following question based on the EHR:\n\n"
+        "EHR: {ehr}\n\nQuestion: {question}\n\nAnswer:"
+    )
+
     filled_prompts = pack_and_trim_prompts(
         instructions=instructions,
         ehrs=ehrs,
-        prompt_template=prompt_template,
+        prompt_string=prompt_string,
         context_length=target_context_length,
         generation_length=generation_length,
         tokenizer=tokenizer,
-        use_RAG=use_RAG,
         verbose=False,
         include_ehr=include_ehr,
     )
@@ -415,7 +311,6 @@ def return_dataset_dataframe(max_length: int, data_path: str) -> pd.DataFrame:
     path_to_ehrs = os.path.join(data_path, "medalign_ehr_xml")
     path_to_reference_responses = os.path.join(data_path, "clinician-instruction-responses.tsv")
     check_file_exists(path_to_reference_responses, msg=f"[MedAlignScenario] Required clinician responses file not found: '{path_to_reference_responses}'")
-    use_RAG = False
     include_ehr = True
     tokenizer = "tiktoken"
 
@@ -424,7 +319,6 @@ def return_dataset_dataframe(max_length: int, data_path: str) -> pd.DataFrame:
         generation_length=generation_length,
         path_to_instructions=path_to_instructions,
         path_to_ehrs=path_to_ehrs,
-        use_RAG=use_RAG,
         include_ehr=include_ehr,
         tokenizer=tokenizer,
    )
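
The net effect of the refactor above is that the MedAlign helper now builds prompts from a plain format string and keeps only the most recent EHR tokens that fit the context budget, with no langchain PromptTemplate or BM25 retrieval involved. The standalone sketch below is illustrative only and not part of the package: the function name build_prompt and the default budget numbers are hypothetical, and it assumes the tiktoken package is installed.

# Illustrative sketch only: the truncate-then-format flow used by the new helper.
import tiktoken

PROMPT_STRING = (
    "Instruction: Answer the following question based on the EHR:\n\n"
    "EHR: {ehr}\n\nQuestion: {question}\n\nAnswer:"
)


def build_prompt(question: str, ehr: str, context_length: int = 8192, generation_length: int = 256) -> str:
    tokenizer = tiktoken.get_encoding("cl100k_base")
    # Budget for the EHR = total context minus generation, template, and question tokens.
    budget = (
        context_length
        - generation_length
        - len(tokenizer.encode(PROMPT_STRING))
        - len(tokenizer.encode(question))
    )
    if budget <= 0:
        return PROMPT_STRING.format(question=question, ehr="")
    # Keep the most recent tokens of the EHR, mirroring the helper's tail truncation.
    truncated_ehr = tokenizer.decode(tokenizer.encode(ehr)[-budget:])
    return PROMPT_STRING.format(question=question, ehr=truncated_ehr)


print(build_prompt("What medications is the patient taking?", "<encounter>...</encounter>"))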

helm/benchmark/scenarios/medbullets_scenario.py
@@ -3,6 +3,7 @@ import csv
 import sys
 from typing import List
 
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.benchmark.scenarios.scenario import (
     CORRECT_TAG,
     TEST_SPLIT,
@@ -11,6 +12,7 @@ from helm.benchmark.scenarios.scenario import (
     Output,
     Reference,
     Scenario,
+    ScenarioMetadata,
 )
 from helm.common.general import ensure_file_downloaded
 
@@ -143,3 +145,23 @@ class MedBulletsScenario(Scenario):
             csv_path = self.download_csv(output_path, split_suffix)
             instances.extend(self.process_csv(csv_path, split))
         return instances
+
+    def get_metadata(self):
+        return ScenarioMetadata(
+            name="medbullets",
+            display_name="Medbullets",
+            description="Medbullets is a benchmark of USMLE-style medical questions designed to assess "
+            "a model's ability to understand and apply clinical knowledge. Each question is "
+            "accompanied by a patient scenario and five multiple-choice options, similar to "
+            "those found on Step 2 and Step 3 board exams [(MedBullets, "
+            "2025)](https://step2.medbullets.com).",
+            taxonomy=TaxonomyInfo(
+                task="Question answering",
+                what="Medical knowledge testing",
+                when="Any",
+                who="Medical student, . Researcher",
+                language="English",
+            ),
+            main_metric="exact_match",
+            main_split="test",
+        )

helm/benchmark/scenarios/medcalc_bench_scenario.py
@@ -1,6 +1,7 @@
 from typing import Dict, List
 from datasets import load_dataset
 
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.common.hierarchical_logger import hlog
 from helm.benchmark.scenarios.scenario import (
     Scenario,
@@ -10,6 +11,7 @@ from helm.benchmark.scenarios.scenario import (
     CORRECT_TAG,
     PassageQuestionInput,
     Output,
+    ScenarioMetadata,
 )
 
 
@@ -125,3 +127,23 @@ class MedCalcBenchScenario(Scenario):
             instances.extend(self.process_csv(data, split))
 
         return instances
+
+    def get_metadata(self):
+        return ScenarioMetadata(
+            name="medcalc_bench",
+            display_name="MedCalc-Bench",
+            description="MedCalc-Bench is a benchmark designed to evaluate models on their ability to "
+            "compute clinically relevant values from patient notes. Each instance consists "
+            "of a clinical note describing the patient's condition, a diagnostic question "
+            "targeting a specific medical value, and a ground truth response. [(Khandekar "
+            "et al., 2024)](https://arxiv.org/abs/2406.12036).",
+            taxonomy=TaxonomyInfo(
+                task="Computational reasoning",
+                what="Compute a specific medical value from a patient note",
+                when="Any",
+                who="Clinician, Researcher",
+                language="English",
+            ),
+            main_metric="medcalc_bench_accuracy",
+            main_split="test",
+        )

helm/benchmark/scenarios/medec_scenario.py
@@ -1,6 +1,7 @@
 import csv
 import os
 from typing import List
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.benchmark.scenarios.scenario import (
     Scenario,
     Instance,
@@ -9,6 +10,7 @@ from helm.benchmark.scenarios.scenario import (
     TEST_SPLIT,
     Input,
     Output,
+    ScenarioMetadata,
 )
 from helm.common.general import ensure_file_downloaded
 
@@ -123,3 +125,24 @@ class MedecScenario(Scenario):
         instances.extend(self.process_csv(test_csv, TEST_SPLIT))
 
         return instances
+
+    def get_metadata(self):
+        return ScenarioMetadata(
+            name="medec",
+            display_name="Medec",
+            description="Medec is a benchmark composed of clinical narratives that include either "
+            "correct documentation or medical errors. Each entry includes sentence-level "
+            "identifiers and an associated correction task. The model must review the "
+            "narrative and either identify the erroneous sentence and correct it, or "
+            "confirm that the text is entirely accurate [(Abacha et al., "
+            "2025)](https://arxiv.org/abs/2412.19260).",
+            taxonomy=TaxonomyInfo(
+                task="Classification",
+                what="Detect and correct errors in medical narratives",
+                when="Any",
+                who="Researcher, Clinician",
+                language="English",
+            ),
+            main_metric="medec_error_flag_accuracy",
+            main_split="test",
+        )

helm/benchmark/scenarios/medhallu_scenario.py
@@ -1,6 +1,7 @@
 from typing import List
 from datasets import load_dataset
 
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.benchmark.scenarios.scenario import (
     Scenario,
     Instance,
@@ -9,6 +10,7 @@ from helm.benchmark.scenarios.scenario import (
     CORRECT_TAG,
     Output,
     Input,
+    ScenarioMetadata,
 )
 
 
@@ -70,3 +72,24 @@ Answer: {answer}
             )
             instances.append(hallucinated_instance)
         return instances
+
+    def get_metadata(self):
+        return ScenarioMetadata(
+            name="medhallu",
+            display_name="MedHallu",
+            description="MedHallu is a benchmark focused on evaluating factual correctness in "
+            "biomedical question answering. Each instance contains a PubMed-derived "
+            "knowledge snippet, a biomedical question, and a model-generated answer. The "
+            "task is to classify whether the answer is factually correct or contains "
+            "hallucinated (non-grounded) information. This benchmark is designed to assess "
+            "the factual reliability of medical language models.",
+            taxonomy=TaxonomyInfo(
+                task="Classification",
+                what="Verify whether answers to questions from PubMed articles are " "factual or hallucinated",
+                when="Any",
+                who="Researcher",
+                language="English",
+            ),
+            main_metric="exact_match",
+            main_split="test",
+        )

helm/benchmark/scenarios/medhelm/__init__.py (file without changes)

helm/benchmark/scenarios/medhelm/judges.yaml
@@ -0,0 +1,14 @@
+# The judges to be used for evaluating the note summary scenario.
+# name: The short name for the judge.
+# model: The field value matching the 'model_name' field under model_deployments.yaml
+# model_deployment: The field value matching the 'name' under model_deployments.yaml.
+judges:
+  - name: "gpt"
+    model: "openai/gpt-4o-2024-05-13"
+    model_deployment: "stanfordhealthcare/gpt-4o-2024-05-13"
+  - name: "llama"
+    model: "meta/llama-3.3-70b-instruct"
+    model_deployment: "stanfordhealthcare/llama-3.3-70b-instruct"
+  - name: "claude"
+    model: "anthropic/claude-3-7-sonnet-20250219"
+    model_deployment: "stanfordhealthcare/claude-3-7-sonnet-20250219"

helm/benchmark/scenarios/medhelm_configurable_scenario.py
@@ -0,0 +1,101 @@
+import string
+import json
+import pandas as pd
+from typing import List
+
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
+from helm.benchmark.run_specs.medhelm.benchmark_config import get_benchmark_config_from_path
+from helm.common.general import check_file_exists
+
+from helm.benchmark.scenarios.scenario import (
+    Scenario,
+    Instance,
+    CORRECT_TAG,
+    Reference,
+    Input,
+    Output,
+    TEST_SPLIT,
+    ScenarioMetadata,
+)
+
+
+class MedHELMConfigurableScenario(Scenario):
+    """
+    MedHELM configuratble scenario
+    """
+
+    tags = ["biomedical"]
+
+    def __init__(self, name: str, config_path: str):
+        super().__init__()
+        self.benchmark_config = get_benchmark_config_from_path(config_path)
+        self.name = name
+        self.description = self.benchmark_config.description
+
+    def get_columns_in_template(self, template: str) -> List[str]:
+        """
+        Extract field names from a template string using Python's Formatter.
+        Example: "Name: {name}, Age: {age}" → ["name", "age"]
+        """
+        formatter = string.Formatter()
+        return [fname for _, fname, _, _ in formatter.parse(template) if fname]
+
+    def populate_template(self, template: str, row: pd.Series, fields: List[str]) -> str:
+        """
+        Populate the template with values from the row using format_map.
+        Missing fields default to empty string.
+        """
+        mapping = {field: row.get(field, "") for field in fields}
+        return template.format_map(mapping)
+
+    def get_references(self, row: pd.Series) -> List[Reference]:
+        references: List[Reference] = []
+        if "correct_answer" in row:
+            references.append(Reference(Output(text=row["correct_answer"]), tags=[CORRECT_TAG]))
+        if "incorrect_answers" in row:
+            for incorrect_answer in row["incorrect_answers"]:
+                references.append(Reference(Output(text=incorrect_answer), tags=[]))
+        return references
+
+    def get_instances(self, output_path: str) -> List[Instance]:
+        check_file_exists(self.benchmark_config.prompt_file, msg=f"Prompt file for {self.name} does not exist")
+        check_file_exists(self.benchmark_config.dataset_file, msg=f"Dataset file for {self.name} does not exist")
+        instances: List[Instance] = []
+        df = pd.read_csv(self.benchmark_config.dataset_file)
+        if "correct_answer" not in df.columns:
+            if not self._is_llm_as_judge() or len(self.benchmark_config.metrics) > 1:
+                raise ValueError(
+                    "Dataset must contain 'correct_answer' column unless using jury_score as the only metric."
+                )
+        if "incorrect_answers" in df.columns:
+            df["incorrect_answers"] = df["incorrect_answers"].apply(json.loads)
+        with open(self.benchmark_config.prompt_file, "r") as f:
+            template = f.read()
+        fields = self.get_columns_in_template(template)
+        for _, row in df.iterrows():
+            filled = self.populate_template(template, row, fields)
+            prompt = Input(text=filled)
+            instances.append(Instance(input=prompt, references=self.get_references(row), split=TEST_SPLIT))
+        return instances
+
+    def get_metadata(self):
+        return ScenarioMetadata(
+            name=self.name,
+            display_name=self.name,
+            description=self.description,
+            taxonomy=TaxonomyInfo(
+                task="",
+                what="",
+                when="",
+                who="",
+                language="",
+            ),
+            main_metric=self.benchmark_config.main_metric.name,
+            main_split="test",
+        )
+
+    def _is_llm_as_judge(self) -> bool:
+        for metric in self.benchmark_config.metrics:
+            if metric.name == "jury_score":
+                return True
+        return False
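
The new scenario's prompt handling rests on two standard-library pieces visible above: string.Formatter().parse to discover {placeholders} in the prompt template, and str.format_map to fill them from a dataset row. A minimal standalone sketch of that pattern follows; the template text and row values are made up for illustration and are not taken from any MedHELM dataset.

# Illustrative sketch only: the placeholder-discovery and fill pattern used above.
# Formatter().parse yields (literal_text, field_name, format_spec, conversion) tuples.
import string

template = "Patient note:\n{note}\n\nQuestion: {question}\nAnswer:"

# Collect placeholder names, e.g. ["note", "question"].
fields = [fname for _, fname, _, _ in string.Formatter().parse(template) if fname]

row = {"note": "45-year-old with chest pain.", "question": "What is the next diagnostic step?"}
# Missing columns default to "" rather than raising KeyError.
filled = template.format_map({field: row.get(field, "") for field in fields})

print(fields)
print(filled)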

helm/benchmark/scenarios/medi_qa_scenario.py
@@ -1,6 +1,7 @@
 from typing import Dict, List
 from datasets import load_dataset
 
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.common.hierarchical_logger import hlog
 from helm.benchmark.scenarios.scenario import (
     Scenario,
@@ -10,6 +11,7 @@ from helm.benchmark.scenarios.scenario import (
     CORRECT_TAG,
     PassageQuestionInput,
     Output,
+    ScenarioMetadata,
 )
 
 
@@ -109,3 +111,24 @@ class MediQAScenario(Scenario):
             instances.extend(self.process_csv(data, split))
 
         return instances
+
+    def get_metadata(self):
+        return ScenarioMetadata(
+            name="medi_qa",
+            display_name="MEDIQA",
+            description="MEDIQA is a benchmark designed to evaluate a model's ability to retrieve and "
+            "generate medically accurate answers to patient-generated questions. Each "
+            "instance includes a consumer health question, a set of candidate answers (used "
+            "in ranking tasks), relevance annotations, and optionally, additional context. "
+            "The benchmark focuses on supporting patient understanding and accessibility in "
+            "health communication.",
+            taxonomy=TaxonomyInfo(
+                task="Text generation",
+                what="Retrieve and rank answers based on medical question " "understanding",
+                when="Any",
+                who="Clinician, Medical Student",
+                language="English",
+            ),
+            main_metric="medi_qa_accuracy",
+            main_split="test",
+        )

helm/benchmark/scenarios/medication_qa_scenario.py
@@ -3,9 +3,19 @@ from typing import List
 
 import pandas as pd
 
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.common.general import ensure_file_downloaded
 
-from helm.benchmark.scenarios.scenario import CORRECT_TAG, TEST_SPLIT, Input, Instance, Output, Reference, Scenario
+from helm.benchmark.scenarios.scenario import (
+    CORRECT_TAG,
+    TEST_SPLIT,
+    Input,
+    Instance,
+    Output,
+    Reference,
+    Scenario,
+    ScenarioMetadata,
+)
 
 
 class MedicationQAScenario(Scenario):
@@ -64,3 +74,23 @@ class MedicationQAScenario(Scenario):
         ]
 
         return instances
+
+    def get_metadata(self):
+        return ScenarioMetadata(
+            name="medication_qa",
+            display_name="MedicationQA",
+            description="MedicationQA is a benchmark composed of open-ended consumer health questions "
+            "specifically focused on medications. Each example consists of a free-form "
+            "question and a corresponding medically grounded answer. The benchmark "
+            "evaluates a model's ability to provide accurate, accessible, and informative "
+            "medication-related responses for a lay audience.",
+            taxonomy=TaxonomyInfo(
+                task="Question answering",
+                what="Answer consumer medication-related questions",
+                when="Any",
+                who="Patient, Pharmacist",
+                language="English",
+            ),
+            main_metric="medication_qa_accuracy",
+            main_split="test",
+        )

helm/benchmark/scenarios/melt_scenarios.py
@@ -439,13 +439,13 @@ class MELTMATHScenario(Scenario):
         for split, split_name in zip([TRAIN_SPLIT, TEST_SPLIT], ["train", "test"]):
             if split == TRAIN_SPLIT and self.use_official_examples:
                 train_instances = [
-                    ("Kết quả của $\left(\\frac{7}{8}\\right)^3 \cdot \left(\\frac{7}{8}\\right)^{-3}$ là gì?", "1"),
+                    ("Kết quả của $\\left(\\frac{7}{8}\\right)^3 \\cdot \\left(\\frac{7}{8}\\right)^{-3}$ là gì?", "1"),
                     (
                         "Có bao nhiêu cách chọn 4 quyển sách từ một kệ sách có 6 quyển,"
                         + " nếu thứ tự các cuốn sách được chọn không quan trọng?",
                         "15",
                     ),
-                    ("Tìm khoảng cách giữa các điểm $(2,1,-4)$ và $(5,8,-3).$", "\sqrt{59}"),
+                    ("Tìm khoảng cách giữa các điểm $(2,1,-4)$ và $(5,8,-3).$", "\\sqrt{59}"),
                     (
                         "Các mặt của khối xúc xắc bát diện được dán nhãn bằng các số từ $1$ đến $8$."
                         + " Xác suất tung một cặp xúc xắc bát diện để được tổng số bằng $15$ là bao nhiêu?"