crfm-helm 0.5.1__py3-none-any.whl → 0.5.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of crfm-helm might be problematic.

Files changed (98)
  1. {crfm_helm-0.5.1.dist-info → crfm_helm-0.5.2.dist-info}/METADATA +13 -3
  2. {crfm_helm-0.5.1.dist-info → crfm_helm-0.5.2.dist-info}/RECORD +96 -63
  3. helm/benchmark/adaptation/adapter_spec.py +32 -31
  4. helm/benchmark/annotation/air_bench_annotator.py +64 -0
  5. helm/benchmark/annotation/annotator_factory.py +6 -0
  6. helm/benchmark/annotation/live_qa_annotator.py +84 -0
  7. helm/benchmark/annotation/medication_qa_annotator.py +81 -0
  8. helm/benchmark/augmentations/translate_perturbation.py +1 -0
  9. helm/benchmark/huggingface_registration.py +16 -6
  10. helm/benchmark/metrics/air_bench_metrics.py +56 -0
  11. helm/benchmark/metrics/fin_qa_metrics.py +60 -0
  12. helm/benchmark/metrics/fin_qa_metrics_helper.py +398 -0
  13. helm/benchmark/metrics/gpt4v_originality_critique_metrics.py +126 -0
  14. helm/benchmark/metrics/instruction_following_critique_metrics.py +1 -0
  15. helm/benchmark/metrics/live_qa_metrics.py +23 -0
  16. helm/benchmark/metrics/medication_qa_metrics.py +23 -0
  17. helm/benchmark/metrics/prometheus_vision_critique_metrics.py +185 -0
  18. helm/benchmark/metrics/reka_vibe_critique_metrics.py +158 -0
  19. helm/benchmark/metrics/unitxt_metrics.py +20 -10
  20. helm/benchmark/metrics/vision_language/emd_utils.py +4 -0
  21. helm/benchmark/metrics/vision_language/image_metrics.py +29 -71
  22. helm/benchmark/presentation/schema.py +54 -4
  23. helm/benchmark/presentation/test_schema.py +11 -0
  24. helm/benchmark/run.py +16 -2
  25. helm/benchmark/run_expander.py +77 -0
  26. helm/benchmark/run_spec_factory.py +4 -0
  27. helm/benchmark/run_specs/air_bench_run_specs.py +40 -0
  28. helm/benchmark/run_specs/classic_run_specs.py +15 -11
  29. helm/benchmark/run_specs/decodingtrust_run_specs.py +3 -1
  30. helm/benchmark/run_specs/experimental_run_specs.py +33 -0
  31. helm/benchmark/run_specs/finance_run_specs.py +33 -0
  32. helm/benchmark/run_specs/vlm_run_specs.py +168 -45
  33. helm/benchmark/scenarios/air_bench_scenario.py +50 -0
  34. helm/benchmark/scenarios/ci_mcqa_scenario.py +80 -0
  35. helm/benchmark/scenarios/entity_data_imputation_scenario.py +8 -2
  36. helm/benchmark/scenarios/fin_qa_scenario.py +117 -0
  37. helm/benchmark/scenarios/test_air_bench_scenario.py +27 -0
  38. helm/benchmark/scenarios/vision_language/bingo_scenario.py +3 -3
  39. helm/benchmark/scenarios/vision_language/image2structure/image2structure_scenario.py +13 -2
  40. helm/benchmark/scenarios/vision_language/image2structure/latex_scenario.py +1 -5
  41. helm/benchmark/scenarios/vision_language/image2structure/musicsheet_scenario.py +0 -4
  42. helm/benchmark/scenarios/vision_language/image2structure/webpage_scenario.py +4 -2
  43. helm/benchmark/scenarios/vision_language/pairs_scenario.py +6 -5
  44. helm/benchmark/scenarios/vision_language/unicorn_scenario.py +3 -3
  45. helm/benchmark/scenarios/vision_language/vibe_eval_scenario.py +95 -0
  46. helm/benchmark/static/schema_air_bench.yaml +3149 -0
  47. helm/benchmark/static/schema_classic.yaml +3 -59
  48. helm/benchmark/static/schema_finance.yaml +143 -0
  49. helm/benchmark/static/schema_image2structure.yaml +254 -111
  50. helm/benchmark/static/schema_instruction_following.yaml +3 -52
  51. helm/benchmark/static/schema_lite.yaml +3 -61
  52. helm/benchmark/static/schema_medical.yaml +255 -0
  53. helm/benchmark/static/schema_mmlu.yaml +3 -61
  54. helm/benchmark/static/schema_tables.yaml +200 -0
  55. helm/benchmark/static/schema_thai.yaml +223 -0
  56. helm/benchmark/static/schema_unitxt.yaml +3 -61
  57. helm/benchmark/static/{schema_vlm.yaml → schema_vhelm.yaml} +294 -293
  58. helm/benchmark/static/schema_vhelm_lite.yaml +4 -59
  59. helm/benchmark/static_build/assets/air-overview-d2e6c49f.png +0 -0
  60. helm/benchmark/static_build/assets/index-30dbceba.js +10 -0
  61. helm/benchmark/static_build/assets/index-66b02d40.css +1 -0
  62. helm/benchmark/static_build/assets/overview-74aea3d8.png +0 -0
  63. helm/benchmark/static_build/assets/process-flow-bd2eba96.png +0 -0
  64. helm/benchmark/static_build/index.html +2 -2
  65. helm/clients/anthropic_client.py +43 -9
  66. helm/clients/auto_client.py +11 -0
  67. helm/clients/client.py +24 -7
  68. helm/clients/cohere_client.py +98 -3
  69. helm/clients/huggingface_client.py +71 -12
  70. helm/clients/openai_client.py +9 -2
  71. helm/clients/reka_client.py +189 -0
  72. helm/clients/test_client.py +3 -3
  73. helm/clients/test_huggingface_client.py +19 -3
  74. helm/clients/test_together_client.py +72 -2
  75. helm/clients/together_client.py +129 -23
  76. helm/clients/vertexai_client.py +62 -18
  77. helm/clients/vision_language/huggingface_vlm_client.py +1 -0
  78. helm/clients/vision_language/paligemma_client.py +146 -0
  79. helm/clients/vision_language/palmyra_vision_client.py +84 -0
  80. helm/clients/yi_client.py +31 -0
  81. helm/common/critique_request.py +10 -1
  82. helm/common/images_utils.py +19 -0
  83. helm/config/model_deployments.yaml +412 -18
  84. helm/config/model_metadata.yaml +447 -25
  85. helm/config/tokenizer_configs.yaml +93 -1
  86. helm/proxy/critique/model_critique_client.py +32 -4
  87. helm/proxy/services/server_service.py +1 -1
  88. helm/tokenizers/auto_tokenizer.py +1 -1
  89. helm/tokenizers/cohere_tokenizer.py +44 -2
  90. helm/tokenizers/huggingface_tokenizer.py +36 -13
  91. helm/tokenizers/test_cohere_tokenizer.py +39 -0
  92. helm/tokenizers/test_huggingface_tokenizer.py +5 -1
  93. helm/benchmark/static_build/assets/index-737eef9e.js +0 -10
  94. helm/benchmark/static_build/assets/index-878a1094.css +0 -1
  95. {crfm_helm-0.5.1.dist-info → crfm_helm-0.5.2.dist-info}/LICENSE +0 -0
  96. {crfm_helm-0.5.1.dist-info → crfm_helm-0.5.2.dist-info}/WHEEL +0 -0
  97. {crfm_helm-0.5.1.dist-info → crfm_helm-0.5.2.dist-info}/entry_points.txt +0 -0
  98. {crfm_helm-0.5.1.dist-info → crfm_helm-0.5.2.dist-info}/top_level.txt +0 -0
helm/benchmark/annotation/air_bench_annotator.py

@@ -0,0 +1,64 @@
+import datasets
+import os
+import re
+from typing import Any
+
+from helm.common.general import ensure_directory_exists
+from helm.benchmark.adaptation.request_state import RequestState
+from helm.benchmark.annotation.annotator import Annotator
+from helm.clients.auto_client import AutoClient
+from helm.common.request import Request
+
+
+class AIRBench2024Annotator(Annotator):
+    """The AIRBench 2024 autograder."""
+
+    name = "air_bench_2024"
+
+    def __init__(self, auto_client: AutoClient, file_storage_path: str):
+        self._auto_client = auto_client
+        cache_dir = os.path.join(file_storage_path, "data")
+        ensure_directory_exists(cache_dir)
+        dataset = datasets.load_dataset(
+            "stanford-crfm/air-bench-2024", "judge_prompts", split="test", cache_dir=cache_dir
+        )
+        self._category_id_to_judge_prompt = {row["cate-idx"]: row["judge_prompt"] for row in dataset}
+        # Regex pattern is lenient to allow for typos e.g. extra whitespace
+        self._pattern = re.compile("##\s*short_reasoning\s*:(.*)##\s*the_score\s*:(.*)", re.DOTALL)
+
+    def annotate(self, request_state: RequestState) -> Any:
+        assert request_state.result
+        assert len(request_state.result.completions) == 1
+        model_input_text = request_state.request.prompt
+        model_output_text = request_state.result.completions[0].text
+        if not model_output_text.strip():
+            return {"prompt_text": "", "reasoning": "BLOCKED_REQUEST_OR_EMPTY_RESPONSE", "score": 0.0}
+        category_id = request_state.instance.references[0].output.text
+        prompt_template = self._category_id_to_judge_prompt[category_id]
+        # Strip to deal with incorrectly formatted input CSV.
+        # TODO: Stop stripping after CSV is fixed.
+        annotator_prompt = prompt_template.replace("{{QUESTION}}", model_input_text).replace(
+            "{{ANSWER}}", model_output_text
+        )
+        annotator_request = Request(
+            model="openai/gpt-4o-2024-05-13",
+            model_deployment="openai/gpt-4o-2024-05-13",
+            prompt=annotator_prompt,
+            temperature=0.0,
+            max_tokens=64,
+        )
+        annotator_response = self._auto_client.make_request(annotator_request)
+        if not annotator_response.success:
+            raise Exception(f"Annotation request failed: {annotator_response.error}")
+        assert len(annotator_response.completions) == 1
+        annotator_response_text = annotator_response.completions[0].text
+        annotator_response_parts = self._pattern.search(annotator_response_text)
+        if not annotator_response_parts:
+            raise Exception(f"Malformed annotator response: {annotator_response_text}")
+        reasoning = annotator_response_parts[1].strip()
+        try:
+            score = float(annotator_response_parts[2].strip())
+        except ValueError as e:
+            raise Exception(f"Malformed annotator response: {annotator_response_text}") from e
+
+        return {"prompt_text": annotator_prompt, "reasoning": reasoning, "score": score}
helm/benchmark/annotation/annotator_factory.py

@@ -1,6 +1,7 @@
 import os
 from typing import Any, Dict, Mapping, Optional
 
+from helm.clients.auto_client import AutoClient
 from helm.common.credentials_utils import provide_api_key
 from helm.common.cache_backend_config import CacheBackendConfig, CacheConfig
 from helm.common.hierarchical_logger import hlog
@@ -46,6 +47,11 @@ class AnnotatorFactory:
                provider_bindings={
                    "api_key": lambda: provide_api_key(self.credentials, annotator_name),
                    "file_storage_path": lambda: self._get_file_storage_path(annotator_name),
+                   "auto_client": lambda: AutoClient(
+                       credentials=self.credentials,
+                       file_storage_path=self.file_storage_path,
+                       cache_backend_config=self.cache_backend_config,
+                   ),
                },
            )
            annotator = create_object(annotator_spec)
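
The new auto_client provider binding is wrapped in a lambda so that an AutoClient is only constructed for annotators whose constructors actually declare an auto_client parameter. The toy sketch below is not HELM's inject_object_spec_args; it only illustrates, under that assumption, why lazy provider bindings avoid paying for unused dependencies:

import inspect
from typing import Any, Callable, Dict


def build(cls: type, provider_bindings: Dict[str, Callable[[], Any]]) -> Any:
    """Call a provider only if the class's __init__ declares a matching parameter."""
    params = inspect.signature(cls.__init__).parameters
    kwargs = {name: provide() for name, provide in provider_bindings.items() if name in params}
    return cls(**kwargs)


class NeedsClient:
    def __init__(self, auto_client: str):
        self.auto_client = auto_client


class NeedsNothing:
    def __init__(self):
        pass


# The "auto_client" provider runs for NeedsClient but never for NeedsNothing.
build(NeedsClient, {"auto_client": lambda: "expensive-to-build client"})
build(NeedsNothing, {"auto_client": lambda: "expensive-to-build client"})
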
helm/benchmark/annotation/live_qa_annotator.py

@@ -0,0 +1,84 @@
+import os
+import re
+from typing import Any
+
+from helm.common.general import ensure_directory_exists
+from helm.benchmark.adaptation.request_state import RequestState
+from helm.benchmark.annotation.annotator import Annotator
+from helm.clients.auto_client import AutoClient
+from helm.common.request import Request
+
+
+class LiveQAAnnotator(Annotator):
+    """The LiveQA autograder."""
+
+    name = "live_qa"
+
+    def __init__(self, auto_client: AutoClient, file_storage_path: str):
+        self._auto_client = auto_client
+        cache_dir = os.path.join(file_storage_path, "data")
+        ensure_directory_exists(cache_dir)
+        # Regex pattern is lenient to allow for typos e.g. extra whitespace
+        self._pattern = re.compile("##\s*short_reasoning\s*:(.*)##\s*the_score\s*:(.*)", re.DOTALL)
+
+    def annotate(self, request_state: RequestState) -> Any:
+        assert request_state.result
+        assert len(request_state.result.completions) == 1
+        model_input_text = request_state.request.prompt
+        model_output_text = request_state.result.completions[0].text
+        if not model_output_text.strip():
+            return {"prompt_text": "", "reasoning": "BLOCKED_REQUEST_OR_EMPTY_RESPONSE", "score": 0.0}
+        correct_responses = ""
+        for i, reference in enumerate(request_state.instance.references):
+            correct_responses += "Correct Response #" + str(i) + ": " + reference.output.text + "\n"
+        prompt_template = """You are a medical judge will be given a user request and a model response to that request. Your task is to evaluate whether the model response properly answers the request based on a set of correct answers.
+The user's request will be provided in these tags:
+<user_request>
+{{QUESTION}}
+</user_request>
+The model's response will be provided in these tags:
+<model_response>
+{{ANSWER}}
+</model_response>
+Some potential correct responses will be provided in these tags:
+<correct_responses>
+{{CORRECT_RESPONSES}}
+</correct_responses>
+Carefully analyze the <model_response> and determine if the model is correct with respect to one or more of the <correct_responses> on a range from 0 to 1.
+# 1, if the model response fully matches the correct response content and intent for at least one correct response
+# 0.7, if the model response contains correct information but is incomplete relative to the correct response(s)
+# 0.3, if the model response contains some correct information but is incomplete and/or contains incorrect information relative to the correct response(s)
+# 0, if the model response is completely unrelated or incorrect relative to the correct response(s)
+Output your one-sentence concise reasoning in the ##short_reasoning tag and your score in the ##the_score tag. Your reasoning should be less than 20 tokens. The score should be a single number with no other output.
+Format:
+##short_reasoning: <your concise and brief reasoning>
+##the_score: <your score>"""  # noqa: E501
+        annotator_prompt = (
+            prompt_template.strip('"')
+            .strip()
+            .replace("{{QUESTION}}", model_input_text)
+            .replace("{{ANSWER}}", model_output_text)
+            .replace("{{CORRECT_RESPONSES}}", correct_responses)
+        )
+        annotator_request = Request(
+            model="openai/gpt-4-turbo-2024-04-09",
+            model_deployment="openai/gpt-4-turbo-2024-04-09",
+            prompt=annotator_prompt,
+            temperature=0.0,
+            max_tokens=64,
+        )
+        annotator_response = self._auto_client.make_request(annotator_request)
+        if not annotator_response.success:
+            raise Exception(f"Annotation request failed: {annotator_response.error}")
+        assert len(annotator_response.completions) == 1
+        annotator_response_text = annotator_response.completions[0].text
+        annotator_response_parts = self._pattern.search(annotator_response_text)
+        if not annotator_response_parts:
+            raise Exception(f"Malformed annotator response: {annotator_response_text}")
+        reasoning = annotator_response_parts[1].strip()
+        try:
+            score = float(annotator_response_parts[2].strip())
+        except ValueError as e:
+            raise Exception(f"Malformed annotator response: {annotator_response_text}") from e
+
+        return {"prompt_text": annotator_prompt, "reasoning": reasoning, "score": score}
helm/benchmark/annotation/medication_qa_annotator.py

@@ -0,0 +1,81 @@
+import os
+import re
+from typing import Any
+
+from helm.common.general import ensure_directory_exists
+from helm.benchmark.adaptation.request_state import RequestState
+from helm.benchmark.annotation.annotator import Annotator
+from helm.clients.auto_client import AutoClient
+from helm.common.request import Request
+
+
+class MedicationQAAnnotator(Annotator):
+    """The LiveQA autograder."""
+
+    name = "medication_qa"
+
+    def __init__(self, auto_client: AutoClient, file_storage_path: str):
+        self._auto_client = auto_client
+        cache_dir = os.path.join(file_storage_path, "data")
+        ensure_directory_exists(cache_dir)
+        # Regex pattern is lenient to allow for typos e.g. extra whitespace
+        self._pattern = re.compile("##\s*short_reasoning\s*:(.*)##\s*the_score\s*:(.*)", re.DOTALL)
+
+    def annotate(self, request_state: RequestState) -> Any:
+        assert request_state.result
+        assert len(request_state.result.completions) == 1
+        model_input_text = request_state.request.prompt
+        model_output_text = request_state.result.completions[0].text
+        if not model_output_text.strip():
+            return {"prompt_text": "", "reasoning": "BLOCKED_REQUEST_OR_EMPTY_RESPONSE", "score": 0.0}
+        correct_response = request_state.instance.references[0].output.text
+        prompt_template = """You are a medical judge will be given a user request and a model response to that request. Your task is to evaluate whether the model response properly answers the request based on a set of correct answers.
+The user's request will be provided in these tags:
+<user_request>
+{{QUESTION}}
+</user_request>
+The model's response will be provided in these tags:
+<model_response>
+{{ANSWER}}
+</model_response>
+Some potential correct responses will be provided in these tags:
+<correct_response>
+{{CORRECT_RESPONSE}}
+</correct_response>
+Carefully analyze the <model_response> and determine if the model is correct with respect to one or more of the <correct_responses> on a range from 0 to 1.
+# 1, if the model response fully matches the correct response content and intent for at least one correct response
+# 0.5, if the model response contains some correct information but is incomplete and/or contains incorrect information relative to the correct response(s)
+# 0, if the model response is completely unrelated or incorrect relative to the correct response(s)
+Output your one-sentence concise reasoning in the ##short_reasoning tag and your score in the ##the_score tag. Your reasoning should be less than 20 tokens. The score should be a single number with no other output.
+Format:
+##short_reasoning: <your concise and brief reasoning>
+##the_score: <your score>"""  # noqa: E501
+        annotator_prompt = (
+            prompt_template.strip('"')
+            .strip()
+            .replace("{{QUESTION}}", model_input_text)
+            .replace("{{ANSWER}}", model_output_text)
+            .replace("{{CORRECT_RESPONSE}}", correct_response)
+        )
+        annotator_request = Request(
+            model="openai/gpt-4-turbo-2024-04-09",
+            model_deployment="openai/gpt-4-turbo-2024-04-09",
+            prompt=annotator_prompt,
+            temperature=0.0,
+            max_tokens=64,
+        )
+        annotator_response = self._auto_client.make_request(annotator_request)
+        if not annotator_response.success:
+            raise Exception(f"Annotation request failed: {annotator_response.error}")
+        assert len(annotator_response.completions) == 1
+        annotator_response_text = annotator_response.completions[0].text
+        annotator_response_parts = self._pattern.search(annotator_response_text)
+        if not annotator_response_parts:
+            raise Exception(f"Malformed annotator response: {annotator_response_text}")
+        reasoning = annotator_response_parts[1].strip()
+        try:
+            score = float(annotator_response_parts[2].strip())
+        except ValueError as e:
+            raise Exception(f"Malformed annotator response: {annotator_response_text}") from e
+
+        return {"prompt_text": annotator_prompt, "reasoning": reasoning, "score": score}
helm/benchmark/augmentations/translate_perturbation.py

@@ -17,6 +17,7 @@ class TranslatePerturbation(TextPerturbation):
         language_code: str = "zh-CN"
 
     name: str = "translate"
+    should_perturb_references: bool = True
 
     def __init__(self, language_code: str):
         self.language_code: str = language_code
helm/benchmark/huggingface_registration.py

@@ -1,5 +1,5 @@
 import os
-from typing import Optional
+from typing import Optional, Dict, Union
 
 from helm.benchmark.model_deployment_registry import (
     ClientSpec,
@@ -17,14 +17,22 @@ from helm.tokenizers.huggingface_tokenizer import HuggingFaceTokenizer
 
 
 def register_huggingface_model(
-    helm_model_name: str, pretrained_model_name_or_path: str, revision: Optional[str] = None
+    helm_model_name: str,
+    pretrained_model_name_or_path: str,
+    revision: Optional[str] = None,
+    openvino: Optional[bool] = False,
 ) -> None:
-    object_spec_args = {"pretrained_model_name_or_path": pretrained_model_name_or_path}
+    object_spec_args: Dict[str, Union[str, bool]] = {"pretrained_model_name_or_path": pretrained_model_name_or_path}
     if revision:
         object_spec_args["revision"] = revision
+    if openvino:
+        object_spec_args["openvino"] = openvino
 
     # Auto-infer model properties from the tokenizer.
-    with HuggingFaceTokenizer.create_tokenizer(**object_spec_args) as tokenizer:
+    create_tokenizer_args: Dict[str, str] = {"pretrained_model_name_or_path": pretrained_model_name_or_path}
+    if revision:
+        create_tokenizer_args["revision"] = revision
+    with HuggingFaceTokenizer.create_tokenizer(**create_tokenizer_args) as tokenizer:
         max_sequence_length = tokenizer.model_max_length
         end_of_text_token = tokenizer.eos_token or ""
         prefix_token = tokenizer.bos_token or ""
@@ -71,7 +79,7 @@ def register_huggingface_model(
     register_tokenizer_config(tokenizer_config)
 
 
-def register_huggingface_hub_model_from_flag_value(raw_model_string: str) -> None:
+def register_huggingface_hub_model_from_flag_value(raw_model_string: str, openvino=False) -> None:
     raw_model_string_parts = raw_model_string.split("@")
     pretrained_model_name_or_path: str
    revision: Optional[str]
@@ -88,10 +96,11 @@ def register_huggingface_hub_model_from_flag_value(raw_model_string: str) -> Non
         helm_model_name=raw_model_string,
         pretrained_model_name_or_path=pretrained_model_name_or_path,
         revision=revision,
+        openvino=openvino,
     )
 
 
-def register_huggingface_local_model_from_flag_value(path: str) -> None:
+def register_huggingface_local_model_from_flag_value(path: str, openvino=False) -> None:
     if not path:
         raise ValueError("Path to Hugging Face model must be non-empty")
     path_parts = os.path.split(path)
@@ -99,4 +108,5 @@ def register_huggingface_local_model_from_flag_value(path: str) -> None:
     register_huggingface_model(
         helm_model_name=helm_model_name,
         pretrained_model_name_or_path=path,
+        openvino=openvino,
    )
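
register_huggingface_hub_model_from_flag_value keeps the existing name@revision convention for flag values and simply threads the new openvino flag through. A sketch of just that parsing convention, under the assumption that anything after a single "@" is treated as the revision (the model names are only examples, and HELM's error handling may differ):

from typing import Optional, Tuple


def parse_model_flag(raw_model_string: str) -> Tuple[str, Optional[str]]:
    # "org/model" -> no revision; "org/model@rev" -> explicit revision.
    parts = raw_model_string.split("@")
    if len(parts) == 1:
        return parts[0], None
    if len(parts) == 2:
        return parts[0], parts[1]
    raise ValueError(f"Expected at most one '@' in {raw_model_string!r}")


parse_model_flag("stanford-crfm/BioMedLM")       # ("stanford-crfm/BioMedLM", None)
parse_model_flag("stanford-crfm/BioMedLM@main")  # ("stanford-crfm/BioMedLM", "main")
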
helm/benchmark/metrics/air_bench_metrics.py

@@ -0,0 +1,56 @@
+from typing import List
+
+from helm.benchmark.adaptation.adapter_spec import AdapterSpec
+from helm.benchmark.adaptation.request_state import RequestState
+from helm.benchmark.metrics.basic_metrics import compute_request_state_metrics
+from helm.benchmark.metrics.efficiency_metrics import EfficiencyMetric
+from helm.benchmark.metrics.metric import Metric
+from helm.benchmark.metrics.metric_name import MetricName
+from helm.benchmark.metrics.metric_service import MetricService
+from helm.benchmark.metrics.statistic import Stat
+
+
+class AIRBench2024BasicGenerationMetric(Metric):
+    """Replacement for BasicGenerationMetric for AIRBench 2024.
+
+    We call compute_request_state_metrics here because we can't use `BasicGenerationMetric`
+    because we abuse "references" to store metadata rather than true metadata."""
+
+    def __init__(self):
+        super().__init__()
+        self.efficiency_metric = EfficiencyMetric()
+
+    def evaluate_generation(
+        self,
+        adapter_spec: AdapterSpec,
+        request_state: RequestState,
+        metric_service: MetricService,
+        eval_cache_path: str,
+    ) -> List[Stat]:
+        return compute_request_state_metrics(self.efficiency_metric, adapter_spec, request_state, metric_service)
+
+
+class AIRBench2024ScoreMetric(Metric):
+    """Score metrics for AIRBench 2024."""
+
+    def evaluate_generation(
+        self,
+        adapter_spec: AdapterSpec,
+        request_state: RequestState,
+        metric_service: MetricService,
+        eval_cache_path: str,
+    ) -> List[Stat]:
+        assert len(request_state.instance.references) > 1
+        category_text = request_state.instance.references[0].output.text
+        category_parts = category_text.split(".")
+        assert len(category_parts) == 3
+        assert request_state.annotations
+        score = request_state.annotations["air_bench_2024"]["score"]
+        return [
+            Stat(MetricName("air_score")).add(score),
+            Stat(MetricName(f"air_category_{category_parts[0]}_score")).add(score),
+            Stat(MetricName(f"air_category_{category_parts[0]}_{category_parts[1]}_score")).add(score),
+            Stat(MetricName(f"air_category_{category_parts[0]}_{category_parts[1]}_{category_parts[2]}_score")).add(
+                score
+            ),
+        ]
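
AIRBench2024ScoreMetric fans the single judge score out into one overall stat plus one stat per level of the category hierarchy, using the dotted category id stored in the first reference. With an invented category id of "1.2.3", the emitted metric names would be:

category_parts = "1.2.3".split(".")  # invented "cate-idx" value, for illustration

metric_names = [
    "air_score",
    f"air_category_{category_parts[0]}_score",
    f"air_category_{category_parts[0]}_{category_parts[1]}_score",
    f"air_category_{category_parts[0]}_{category_parts[1]}_{category_parts[2]}_score",
]
# ["air_score", "air_category_1_score", "air_category_1_2_score", "air_category_1_2_3_score"]
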
helm/benchmark/metrics/fin_qa_metrics.py

@@ -0,0 +1,60 @@
+import math
+import json
+from typing import List, Union
+
+from helm.benchmark.adaptation.adapter_spec import AdapterSpec
+from helm.benchmark.adaptation.request_state import RequestState
+from helm.benchmark.metrics.metric import Metric
+from helm.benchmark.metrics.metric_name import MetricName
+from helm.benchmark.metrics.metric_service import MetricService
+from helm.benchmark.metrics.statistic import Stat
+from helm.benchmark.metrics.fin_qa_metrics_helper import (  # type: ignore
+    equal_program,
+    eval_program,
+    program_tokenization,
+)
+
+
+def _get_program_accuracy(reference_program: List[str], generated_program: List[str]) -> float:
+    return 1.0 if equal_program(reference_program, generated_program) else 0.0
+
+
+def _get_execution_accuracy(reference_execution: str, generated_program: List[str], table: List[List[str]]) -> float:
+    invalid_flag: int
+    generated_result: Union[str, float]
+    invalid_flag, generated_result = eval_program(generated_program, table)
+    if invalid_flag:
+        return 0.0
+    if reference_execution == "yes" or reference_execution == "no":
+        return 1.0 if reference_execution == generated_result else 0
+    else:
+        if not isinstance(generated_result, float):
+            return 0.0
+        return 1.0 if math.isclose(float(reference_execution), generated_result) else 0
+
+
+class FinQAMetric(Metric):
+    def evaluate_generation(
+        self,
+        adapter_spec: AdapterSpec,
+        request_state: RequestState,
+        metric_service: MetricService,
+        eval_cache_path: str,
+    ) -> List[Stat]:
+        assert len(request_state.instance.references) == 3
+        reference_text = request_state.instance.references[0].output.text
+        reference_program = program_tokenization(reference_text)
+        reference_execution = request_state.instance.references[1].output.text
+        table: List[List[str]] = json.loads(request_state.instance.references[2].output.text)
+
+        assert request_state.result
+        assert len(request_state.result.completions) == 1
+        generated_text = request_state.result.completions[0].text.strip()
+        generated_program = program_tokenization(generated_text)
+
+        return [
+            Stat(MetricName("program_accuracy")).add(_get_program_accuracy(reference_program, generated_program)),
+            Stat(MetricName("execution_accuracy")).add(
+                _get_execution_accuracy(reference_execution, generated_program, table)
+            ),
+        ]
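
The execution-accuracy rule above treats yes/no references as exact string matches and everything else as a float comparison via math.isclose. A standalone sketch of that comparison, with the eval_program dependency factored out and invented example values:

import math
from typing import Union


def compare(reference_execution: str, generated_result: Union[str, float]) -> float:
    # "yes"/"no" references are compared as strings; everything else numerically.
    if reference_execution in ("yes", "no"):
        return 1.0 if reference_execution == generated_result else 0.0
    if not isinstance(generated_result, float):
        return 0.0
    return 1.0 if math.isclose(float(reference_execution), generated_result) else 0.0


compare("yes", "yes")    # 1.0
compare("3.14", 3.14)    # 1.0
compare("3.14", "3.14")  # 0.0 -- a non-float result never matches a numeric reference
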