crfm-helm 0.5.3__py3-none-any.whl → 0.5.5__py3-none-any.whl
This diff summarizes the content changes between two publicly released versions of the package, as published to one of the supported registries. The information is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.
Potentially problematic release: this version of crfm-helm has been flagged as possibly problematic. See the release's advisory page for more details.
- crfm_helm-0.5.5.dist-info/METADATA +413 -0
- crfm_helm-0.5.5.dist-info/RECORD +894 -0
- {crfm_helm-0.5.3.dist-info → crfm_helm-0.5.5.dist-info}/WHEEL +1 -1
- helm/benchmark/adaptation/adapter_spec.py +13 -1
- helm/benchmark/adaptation/adapters/adapter_factory.py +15 -1
- helm/benchmark/adaptation/adapters/binary_ranking_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/chat_adapter.py +49 -0
- helm/benchmark/adaptation/adapters/ehr_instruction_adapter.py +108 -0
- helm/benchmark/adaptation/adapters/generation_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/language_modeling_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/multimodal/generation_multimodal_adapter.py +4 -2
- helm/benchmark/adaptation/adapters/multimodal/in_context_learning_multimodal_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/multimodal/test_in_context_learning_multimodal_adapter.py +4 -2
- helm/benchmark/adaptation/adapters/multimodal/test_multimodal_prompt.py +1 -1
- helm/benchmark/adaptation/adapters/multiple_choice_calibrated_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +2 -2
- helm/benchmark/adaptation/adapters/multiple_choice_joint_chain_of_thought_adapter.py +87 -0
- helm/benchmark/adaptation/adapters/multiple_choice_separate_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/test_generation_adapter.py +3 -3
- helm/benchmark/adaptation/adapters/test_language_modeling_adapter.py +2 -2
- helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py +2 -2
- helm/benchmark/adaptation/common_adapter_specs.py +69 -4
- helm/benchmark/adaptation/prompt.py +1 -1
- helm/benchmark/annotation/aci_bench_annotator.py +95 -0
- helm/benchmark/annotation/air_bench_annotator.py +20 -5
- helm/benchmark/annotation/annotator.py +5 -0
- helm/benchmark/annotation/annotator_factory.py +3 -20
- helm/benchmark/annotation/anthropic_red_team_annotator.py +11 -24
- helm/benchmark/annotation/autobencher_capabilities_annotator.py +107 -0
- helm/benchmark/annotation/autobencher_safety_annotator.py +98 -0
- helm/benchmark/annotation/bigcodebench_annotator.py +108 -0
- helm/benchmark/annotation/bird_sql_annotator.py +58 -0
- helm/benchmark/annotation/call_center_annotator.py +22 -11
- helm/benchmark/annotation/chw_care_plan_annotator.py +98 -0
- helm/benchmark/annotation/czech_bank_qa_annotator.py +78 -0
- helm/benchmark/annotation/dischargeme_annotator.py +107 -0
- helm/benchmark/annotation/ehr_sql_annotator.py +87 -0
- helm/benchmark/annotation/harm_bench_annotator.py +11 -24
- helm/benchmark/annotation/helpdesk_call_summarization_annotator.py +131 -0
- helm/benchmark/annotation/image2struct/image_compiler_annotator.py +6 -1
- helm/benchmark/annotation/live_qa_annotator.py +10 -5
- helm/benchmark/annotation/med_dialog_annotator.py +99 -0
- helm/benchmark/annotation/medalign_annotator.py +100 -0
- helm/benchmark/annotation/medi_qa_annotator.py +98 -0
- helm/benchmark/annotation/medication_qa_annotator.py +90 -61
- helm/benchmark/annotation/mental_health_annotator.py +98 -0
- helm/benchmark/annotation/mimic_rrs_annotator.py +100 -0
- helm/benchmark/annotation/model_as_judge.py +281 -18
- helm/benchmark/annotation/mtsamples_procedures_annotator.py +98 -0
- helm/benchmark/annotation/mtsamples_replicate_annotator.py +101 -0
- helm/benchmark/annotation/omni_math/gpt_evaluation_template.txt +152 -0
- helm/benchmark/annotation/omni_math/gpt_evaluation_zero_shot_template.txt +36 -0
- helm/benchmark/annotation/omni_math_annotator.py +132 -0
- helm/benchmark/annotation/simple_safety_tests_annotator.py +11 -25
- helm/benchmark/annotation/spider_annotator.py +18 -0
- helm/benchmark/annotation/starr_patient_instructions_annotator.py +98 -0
- helm/benchmark/annotation/wildbench/eval_template.pairwise.v2.md +75 -0
- helm/benchmark/annotation/wildbench/eval_template.score.v2.md +66 -0
- helm/benchmark/annotation/wildbench_annotator.py +119 -0
- helm/benchmark/annotation/xstest_annotator.py +20 -30
- helm/benchmark/annotation_executor.py +35 -15
- helm/benchmark/augmentations/cleva_perturbation.py +9 -8
- helm/benchmark/augmentations/contraction_expansion_perturbation.py +2 -2
- helm/benchmark/augmentations/contrast_sets_perturbation.py +2 -2
- helm/benchmark/augmentations/dialect_perturbation.py +4 -5
- helm/benchmark/augmentations/extra_space_perturbation.py +2 -2
- helm/benchmark/augmentations/filler_words_perturbation.py +2 -2
- helm/benchmark/augmentations/gender_perturbation.py +2 -2
- helm/benchmark/augmentations/lowercase_perturbation.py +2 -2
- helm/benchmark/augmentations/mild_mix_perturbation.py +6 -6
- helm/benchmark/augmentations/misspelling_perturbation.py +2 -2
- helm/benchmark/augmentations/person_name_perturbation.py +4 -5
- helm/benchmark/augmentations/perturbation.py +1 -1
- helm/benchmark/augmentations/space_perturbation.py +2 -2
- helm/benchmark/augmentations/suffix_perturbation.py +2 -2
- helm/benchmark/augmentations/synonym_perturbation.py +4 -3
- helm/benchmark/augmentations/test_perturbation.py +16 -13
- helm/benchmark/augmentations/translate_perturbation.py +2 -2
- helm/benchmark/augmentations/typos_perturbation.py +2 -2
- helm/benchmark/data_preprocessor.py +2 -2
- helm/benchmark/huggingface_registration.py +2 -7
- helm/benchmark/metrics/aci_bench_metrics.py +34 -0
- helm/benchmark/metrics/basic_metrics.py +6 -6
- helm/benchmark/metrics/bbq_metrics.py +2 -2
- helm/benchmark/metrics/bias_metrics.py +12 -3
- helm/benchmark/metrics/bigcodebench_metrics.py +25 -0
- helm/benchmark/metrics/bird_sql_metrics.py +28 -0
- helm/benchmark/metrics/chw_care_plan_metrics.py +34 -0
- helm/benchmark/metrics/classification_metrics.py +76 -12
- helm/benchmark/metrics/cleva_harms_metrics.py +8 -7
- helm/benchmark/metrics/code_metrics.py +5 -5
- helm/benchmark/metrics/comet_metric.py +125 -0
- helm/benchmark/metrics/common_metric_specs.py +9 -2
- helm/benchmark/metrics/conv_fin_qa_calc_metrics.py +72 -0
- helm/benchmark/metrics/copyright_metrics.py +4 -4
- helm/benchmark/metrics/czech_bank_qa_metrics.py +29 -0
- helm/benchmark/metrics/decodingtrust_fairness_metrics.py +2 -2
- helm/benchmark/metrics/decodingtrust_privacy_metrics.py +2 -2
- helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +2 -2
- helm/benchmark/metrics/dischargeme_metrics.py +34 -0
- helm/benchmark/metrics/disinformation_metrics.py +4 -4
- helm/benchmark/metrics/dry_run_metrics.py +5 -5
- helm/benchmark/metrics/efficiency_metrics.py +3 -3
- helm/benchmark/metrics/ehr_sql_metrics.py +103 -0
- helm/benchmark/metrics/evaluate_instances_metric.py +3 -3
- helm/benchmark/metrics/evaluate_reference_metrics.py +144 -16
- helm/benchmark/metrics/gpqa_chain_of_thought_metric.py +103 -0
- helm/benchmark/metrics/gpt4_audio_critique_metrics.py +167 -0
- helm/benchmark/metrics/helpdesk_call_summarization_metrics.py +36 -0
- helm/benchmark/metrics/ifeval/instructions.py +1574 -0
- helm/benchmark/metrics/ifeval/instructions_registry.py +182 -0
- helm/benchmark/metrics/ifeval/instructions_registry.pyi +3 -0
- helm/benchmark/metrics/ifeval/instructions_util.py +153 -0
- helm/benchmark/metrics/ifeval_metrics.py +55 -0
- helm/benchmark/metrics/image_generation/aesthetics_metrics.py +1 -1
- helm/benchmark/metrics/image_generation/detection_metrics.py +1 -1
- helm/benchmark/metrics/image_generation/detectors/vitdet.py +1 -1
- helm/benchmark/metrics/image_generation/fractal_dimension/test_fractal_dimension_util.py +1 -1
- helm/benchmark/metrics/image_generation/fractal_dimension_metric.py +1 -1
- helm/benchmark/metrics/image_generation/nsfw_metrics.py +1 -1
- helm/benchmark/metrics/image_generation/q16/test_q16.py +3 -1
- helm/benchmark/metrics/image_generation/q16_toxicity_metrics.py +1 -1
- helm/benchmark/metrics/image_generation/skin_tone_metrics.py +2 -2
- helm/benchmark/metrics/image_generation/watermark/test_watermark_detector.py +1 -1
- helm/benchmark/metrics/image_generation/watermark_metrics.py +1 -1
- helm/benchmark/metrics/instruction_following_critique_metrics.py +4 -4
- helm/benchmark/metrics/language_modeling_metrics.py +4 -4
- helm/benchmark/metrics/machine_translation_metrics.py +2 -2
- helm/benchmark/metrics/med_dialog_metrics.py +34 -0
- helm/benchmark/metrics/medalign_metrics.py +34 -0
- helm/benchmark/metrics/medcalc_bench_metrics.py +124 -0
- helm/benchmark/metrics/medec_metrics.py +101 -0
- helm/benchmark/metrics/medi_qa_metrics.py +34 -0
- helm/benchmark/metrics/medication_qa_metrics.py +15 -4
- helm/benchmark/metrics/mental_health_metrics.py +34 -0
- helm/benchmark/metrics/metric.py +3 -3
- helm/benchmark/metrics/mimic_rrs_metrics.py +34 -0
- helm/benchmark/metrics/mimiciv_billing_code_metrics.py +96 -0
- helm/benchmark/metrics/mtsamples_procedures_metrics.py +34 -0
- helm/benchmark/metrics/mtsamples_replicate_metrics.py +34 -0
- helm/benchmark/metrics/nltk_helper.py +32 -0
- helm/benchmark/metrics/numeracy_metrics.py +4 -4
- helm/benchmark/metrics/omni_math_metrics.py +32 -0
- helm/benchmark/metrics/output_processing_metric.py +60 -0
- helm/benchmark/metrics/output_processors.py +15 -0
- helm/benchmark/metrics/paraphrase_generation_metrics.py +2 -2
- helm/benchmark/metrics/ranking_metrics.py +3 -3
- helm/benchmark/metrics/reference_metric.py +3 -3
- helm/benchmark/metrics/safety_metrics.py +39 -17
- helm/benchmark/metrics/{bhasa_metrics.py → seahelm_metrics.py} +3 -3
- helm/benchmark/metrics/seahelm_metrics_specs.py +10 -0
- helm/benchmark/metrics/spider_metrics.py +7 -0
- helm/benchmark/metrics/starr_patient_instructions_metrics.py +34 -0
- helm/benchmark/metrics/statistic.py +1 -1
- helm/benchmark/metrics/summac/model_summac.py +1 -1
- helm/benchmark/metrics/summarization_critique_metrics.py +4 -4
- helm/benchmark/metrics/summarization_metrics.py +19 -9
- helm/benchmark/metrics/test_bias_metrics.py +5 -1
- helm/benchmark/metrics/test_classification_metrics.py +140 -68
- helm/benchmark/metrics/test_evaluate_reference_metrics.py +15 -0
- helm/benchmark/metrics/test_metric.py +1 -1
- helm/benchmark/metrics/test_statistic.py +2 -2
- helm/benchmark/metrics/tokens/ai21_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/auto_token_cost_estimator.py +6 -6
- helm/benchmark/metrics/tokens/cohere_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/free_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/gooseai_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/openai_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/test_ai21_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/toxicity_metrics.py +4 -4
- helm/benchmark/metrics/unitxt_metrics.py +21 -4
- helm/benchmark/metrics/vision_language/image_metrics.py +7 -3
- helm/benchmark/metrics/wildbench_metrics.py +34 -0
- helm/benchmark/model_metadata_registry.py +16 -0
- helm/benchmark/presentation/create_plots.py +1 -1
- helm/benchmark/presentation/schema.py +3 -0
- helm/benchmark/presentation/summarize.py +119 -256
- helm/benchmark/presentation/test_summarize.py +145 -3
- helm/benchmark/presentation/torr_robustness_summarizer.py +178 -0
- helm/benchmark/reeval_run.py +203 -0
- helm/benchmark/reeval_runner.py +355 -0
- helm/benchmark/run.py +8 -17
- helm/benchmark/run_expander.py +105 -8
- helm/benchmark/run_spec_factory.py +12 -0
- helm/benchmark/run_specs/air_bench_run_specs.py +21 -3
- helm/benchmark/run_specs/audio_run_specs.py +613 -0
- helm/benchmark/run_specs/call_center_run_specs.py +49 -0
- helm/benchmark/run_specs/capabilities_run_specs.py +308 -0
- helm/benchmark/run_specs/classic_run_specs.py +1 -69
- helm/benchmark/run_specs/enem_challenge_specs.py +31 -0
- helm/benchmark/run_specs/enterprise_run_specs.py +260 -0
- helm/benchmark/run_specs/experimental_run_specs.py +112 -3
- helm/benchmark/run_specs/finance_run_specs.py +6 -2
- helm/benchmark/run_specs/imdb_ptbr_run_specs.py +30 -0
- helm/benchmark/run_specs/lite_run_specs.py +2 -2
- helm/benchmark/run_specs/long_context_run_specs.py +89 -0
- helm/benchmark/run_specs/medhelm_run_specs.py +1155 -0
- helm/benchmark/run_specs/mmlu_clinical_afr_run_specs.py +49 -0
- helm/benchmark/run_specs/oab_exams_specs.py +32 -0
- helm/benchmark/run_specs/safety_run_specs.py +37 -0
- helm/benchmark/run_specs/{bhasa_run_specs.py → seahelm_run_specs.py} +66 -52
- helm/benchmark/run_specs/sql_run_specs.py +54 -0
- helm/benchmark/run_specs/tweetsentbr_run_specs.py +32 -0
- helm/benchmark/run_specs/unitxt_run_specs.py +14 -5
- helm/benchmark/run_specs/vlm_run_specs.py +83 -5
- helm/benchmark/run_specs/winogrande_afr_run_specs.py +47 -0
- helm/benchmark/scenarios/aci_bench_scenario.py +120 -0
- helm/benchmark/scenarios/air_bench_scenario.py +6 -1
- helm/benchmark/scenarios/anthropic_hh_rlhf_scenario.py +5 -3
- helm/benchmark/scenarios/anthropic_red_team_scenario.py +1 -1
- helm/benchmark/scenarios/audio_language/__init__.py +0 -0
- helm/benchmark/scenarios/audio_language/air_bench_chat_scenario.py +128 -0
- helm/benchmark/scenarios/audio_language/air_bench_foundation_scenario.py +154 -0
- helm/benchmark/scenarios/audio_language/ami_scenario.py +96 -0
- helm/benchmark/scenarios/audio_language/audio_mnist_scenario.py +62 -0
- helm/benchmark/scenarios/audio_language/audio_pairs_scenario.py +62 -0
- helm/benchmark/scenarios/audio_language/audiocaps_scenario.py +59 -0
- helm/benchmark/scenarios/audio_language/casual_conversations2_scenario.py +152 -0
- helm/benchmark/scenarios/audio_language/common_voice_15_scenario.py +99 -0
- helm/benchmark/scenarios/audio_language/covost2_scenario.py +163 -0
- helm/benchmark/scenarios/audio_language/fleurs_fairness_scenario.py +83 -0
- helm/benchmark/scenarios/audio_language/fleurs_scenario.py +312 -0
- helm/benchmark/scenarios/audio_language/iemocap_audio_scenario.py +83 -0
- helm/benchmark/scenarios/audio_language/librispeech_fairness_scenario.py +96 -0
- helm/benchmark/scenarios/audio_language/librispeech_scenario.py +80 -0
- helm/benchmark/scenarios/audio_language/meld_audio_scenario.py +113 -0
- helm/benchmark/scenarios/audio_language/multilingual_librispeech_scenario.py +80 -0
- helm/benchmark/scenarios/audio_language/mustard_scenario.py +142 -0
- helm/benchmark/scenarios/audio_language/mutox_scenario.py +254 -0
- helm/benchmark/scenarios/audio_language/parade_scenario.py +97 -0
- helm/benchmark/scenarios/audio_language/speech_robust_bench_scenario.py +124 -0
- helm/benchmark/scenarios/audio_language/vocal_sound_scenario.py +69 -0
- helm/benchmark/scenarios/audio_language/voice_jailbreak_attacks_scenario.py +87 -0
- helm/benchmark/scenarios/audio_language/voxceleb2_scenario.py +106 -0
- helm/benchmark/scenarios/autobencher_capabilities_scenario.py +68 -0
- helm/benchmark/scenarios/autobencher_safety_scenario.py +51 -0
- helm/benchmark/scenarios/babi_qa_scenario.py +1 -1
- helm/benchmark/scenarios/banking77_scenario.py +6 -1
- helm/benchmark/scenarios/bbq_scenario.py +1 -1
- helm/benchmark/scenarios/big_bench_scenario.py +11 -1
- helm/benchmark/scenarios/bigcodebench_scenario.py +58 -0
- helm/benchmark/scenarios/bird_sql_scenario.py +94 -0
- helm/benchmark/scenarios/bird_sql_scenario_helper.py +118 -0
- helm/benchmark/scenarios/blimp_scenario.py +1 -1
- helm/benchmark/scenarios/bold_scenario.py +1 -1
- helm/benchmark/scenarios/boolq_scenario.py +1 -1
- helm/benchmark/scenarios/casehold_scenario.py +79 -0
- helm/benchmark/scenarios/chw_care_plan_scenario.py +105 -0
- helm/benchmark/scenarios/civil_comments_scenario.py +1 -1
- helm/benchmark/scenarios/clear_scenario.py +153 -0
- helm/benchmark/scenarios/cleva_scenario.py +2 -2
- helm/benchmark/scenarios/code_scenario.py +17 -4
- helm/benchmark/scenarios/commonsense_scenario.py +1 -1
- helm/benchmark/scenarios/conv_fin_qa_calc_scenario.py +97 -0
- helm/benchmark/scenarios/copyright_scenario.py +1 -1
- helm/benchmark/scenarios/covid_dialog_scenario.py +10 -1
- helm/benchmark/scenarios/cti_to_mitre_scenario.py +240 -0
- helm/benchmark/scenarios/custom_mcqa_scenario.py +1 -1
- helm/benchmark/scenarios/czech_bank_qa_scenario.py +130 -0
- helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +1 -1
- helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +1 -1
- helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +1 -1
- helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +1 -1
- helm/benchmark/scenarios/dialogue_scenarios.py +13 -2
- helm/benchmark/scenarios/dischargeme_scenario.py +157 -0
- helm/benchmark/scenarios/disinformation_scenario.py +10 -1
- helm/benchmark/scenarios/dyck_language_scenario.py +10 -1
- helm/benchmark/scenarios/echr_judgment_classification_scenario.py +113 -0
- helm/benchmark/scenarios/ehr_sql_scenario.py +131 -0
- helm/benchmark/scenarios/ehrshot_scenario.py +1546 -0
- helm/benchmark/scenarios/enem_challenge_scenario.py +58 -0
- helm/benchmark/scenarios/entity_data_imputation_scenario.py +11 -1
- helm/benchmark/scenarios/entity_matching_scenario.py +12 -2
- helm/benchmark/scenarios/financial_phrasebank_scenario.py +94 -0
- helm/benchmark/scenarios/gold_commodity_news_scenario.py +124 -0
- helm/benchmark/scenarios/gpqa_scenario.py +80 -0
- helm/benchmark/scenarios/grammar_scenario.py +2 -2
- helm/benchmark/scenarios/gsm_scenario.py +10 -1
- helm/benchmark/scenarios/harm_bench_gcg_transfer_scenario.py +50 -0
- helm/benchmark/scenarios/harm_bench_scenario.py +1 -1
- helm/benchmark/scenarios/headqa_scenario.py +131 -0
- helm/benchmark/scenarios/helpdesk_call_summarization_scenario.py +37 -0
- helm/benchmark/scenarios/ice_scenario.py +8 -4
- helm/benchmark/scenarios/ifeval_scenario.py +53 -0
- helm/benchmark/scenarios/imdb_ptbr_scenario.py +60 -0
- helm/benchmark/scenarios/imdb_scenario.py +11 -2
- helm/benchmark/scenarios/infinite_bench_sum_scenario.py +82 -0
- helm/benchmark/scenarios/interactive_qa_mmlu_scenario.py +2 -2
- helm/benchmark/scenarios/koala_scenario.py +1 -1
- helm/benchmark/scenarios/legal_contract_summarization_scenario.py +129 -0
- helm/benchmark/scenarios/legal_opinion_sentiment_classification_scenario.py +77 -0
- helm/benchmark/scenarios/legal_summarization_scenario.py +11 -1
- helm/benchmark/scenarios/legal_support_scenario.py +11 -1
- helm/benchmark/scenarios/legalbench_scenario.py +22 -3
- helm/benchmark/scenarios/lex_glue_scenario.py +12 -2
- helm/benchmark/scenarios/lextreme_scenario.py +11 -1
- helm/benchmark/scenarios/live_qa_scenario.py +1 -1
- helm/benchmark/scenarios/lm_entry_scenario.py +1 -1
- helm/benchmark/scenarios/lsat_qa_scenario.py +1 -1
- helm/benchmark/scenarios/math_scenario.py +9 -1
- helm/benchmark/scenarios/me_q_sum_scenario.py +10 -1
- helm/benchmark/scenarios/med_dialog_scenario.py +22 -24
- helm/benchmark/scenarios/med_mcqa_scenario.py +10 -1
- helm/benchmark/scenarios/med_paragraph_simplification_scenario.py +10 -1
- helm/benchmark/scenarios/med_qa_scenario.py +10 -1
- helm/benchmark/scenarios/medalign_scenario.py +88 -0
- helm/benchmark/scenarios/medalign_scenario_helper.py +429 -0
- helm/benchmark/scenarios/medbullets_scenario.py +140 -0
- helm/benchmark/scenarios/medcalc_bench_scenario.py +125 -0
- helm/benchmark/scenarios/medec_scenario.py +120 -0
- helm/benchmark/scenarios/medhallu_scenario.py +66 -0
- helm/benchmark/scenarios/medi_qa_scenario.py +105 -0
- helm/benchmark/scenarios/medication_qa_scenario.py +2 -2
- helm/benchmark/scenarios/mental_health_scenario.py +112 -0
- helm/benchmark/scenarios/mimic_bhc_scenario.py +98 -0
- helm/benchmark/scenarios/mimic_rrs_scenario.py +89 -0
- helm/benchmark/scenarios/mimiciv_billing_code_scenario.py +71 -0
- helm/benchmark/scenarios/mmlu_clinical_afr_scenario.py +74 -0
- helm/benchmark/scenarios/mmlu_pro_scenario.py +95 -0
- helm/benchmark/scenarios/mmlu_scenario.py +11 -1
- helm/benchmark/scenarios/msmarco_scenario.py +1 -1
- helm/benchmark/scenarios/mtsamples_procedures_scenario.py +141 -0
- helm/benchmark/scenarios/mtsamples_replicate_scenario.py +141 -0
- helm/benchmark/scenarios/n2c2_ct_matching_scenario.py +271 -0
- helm/benchmark/scenarios/narrativeqa_scenario.py +1 -1
- helm/benchmark/scenarios/natural_qa_scenario.py +1 -1
- helm/benchmark/scenarios/newsqa_scenario.py +1 -1
- helm/benchmark/scenarios/numeracy_scenario.py +10 -1
- helm/benchmark/scenarios/oab_exams_scenario.py +57 -0
- helm/benchmark/scenarios/omni_math_scenario.py +53 -0
- helm/benchmark/scenarios/open_assistant_scenario.py +11 -2
- helm/benchmark/scenarios/opinions_qa_scenario.py +1 -1
- helm/benchmark/scenarios/pubmed_qa_scenario.py +54 -43
- helm/benchmark/scenarios/quac_scenario.py +10 -1
- helm/benchmark/scenarios/race_based_med_scenario.py +142 -0
- helm/benchmark/scenarios/raft_scenario.py +18 -3
- helm/benchmark/scenarios/real_toxicity_prompts_scenario.py +1 -1
- helm/benchmark/scenarios/ruler_qa_scenario_helper.py +171 -0
- helm/benchmark/scenarios/ruler_qa_scenarios.py +88 -0
- helm/benchmark/scenarios/scenario.py +9 -1
- helm/benchmark/scenarios/{bhasa_scenario.py → seahelm_scenario.py} +233 -84
- helm/benchmark/scenarios/self_instruct_scenario.py +1 -1
- helm/benchmark/scenarios/shc_bmt_scenario.py +69 -0
- helm/benchmark/scenarios/shc_cdi_scenario.py +70 -0
- helm/benchmark/scenarios/shc_conf_scenario.py +70 -0
- helm/benchmark/scenarios/shc_ent_scenario.py +72 -0
- helm/benchmark/scenarios/shc_gip_scenario.py +66 -0
- helm/benchmark/scenarios/shc_ptbm_scenario.py +76 -0
- helm/benchmark/scenarios/shc_sei_scenario.py +89 -0
- helm/benchmark/scenarios/shc_sequoia_scenario.py +69 -0
- helm/benchmark/scenarios/simple_safety_tests_scenario.py +1 -1
- helm/benchmark/scenarios/spider_scenario.py +91 -0
- helm/benchmark/scenarios/starr_patient_instructions_scenario.py +90 -0
- helm/benchmark/scenarios/summarization_scenario.py +11 -1
- helm/benchmark/scenarios/sumosum_scenario.py +157 -0
- helm/benchmark/scenarios/synthetic_efficiency_scenario.py +1 -1
- helm/benchmark/scenarios/synthetic_reasoning_natural_scenario.py +11 -1
- helm/benchmark/scenarios/synthetic_reasoning_scenario.py +11 -1
- helm/benchmark/scenarios/test_bigcodebench_scenario.py +26 -0
- helm/benchmark/scenarios/test_czech_bank_qa_scenario.py +18 -0
- helm/benchmark/scenarios/test_enem_challenge_scenario.py +53 -0
- helm/benchmark/scenarios/test_ewok_scenario.py +6 -2
- helm/benchmark/scenarios/test_gold_commodity_news_scenario.py +18 -0
- helm/benchmark/scenarios/test_gpqa_scenario.py +44 -0
- helm/benchmark/scenarios/test_ifeval_scenario.py +36 -0
- helm/benchmark/scenarios/test_imdb_ptbr_scenario.py +27 -0
- helm/benchmark/scenarios/test_infinite_bench_sum_scenario.py +46 -0
- helm/benchmark/scenarios/test_math_scenario.py +1 -0
- helm/benchmark/scenarios/test_mmlu_clinical_afr_scenario.py +21 -0
- helm/benchmark/scenarios/test_mmlu_pro_scenario.py +53 -0
- helm/benchmark/scenarios/test_oab_exams_scenario.py +51 -0
- helm/benchmark/scenarios/test_omni_math_scenario.py +27 -0
- helm/benchmark/scenarios/test_tweetsentbr_scenario.py +24 -0
- helm/benchmark/scenarios/test_wildbench_scenario.py +15 -0
- helm/benchmark/scenarios/test_winogrande_afr_scenario.py +19 -0
- helm/benchmark/scenarios/thai_exam_scenario.py +10 -1
- helm/benchmark/scenarios/the_pile_scenario.py +1 -1
- helm/benchmark/scenarios/truthful_qa_scenario.py +10 -1
- helm/benchmark/scenarios/tweetsentbr_scenario.py +66 -0
- helm/benchmark/scenarios/twitter_aae_scenario.py +1 -1
- helm/benchmark/scenarios/unitxt_scenario.py +8 -2
- helm/benchmark/scenarios/verifiability_judgment_scenario.py +1 -1
- helm/benchmark/scenarios/vicuna_scenario.py +1 -1
- helm/benchmark/scenarios/vision_language/blink_scenario.py +140 -0
- helm/benchmark/scenarios/vision_language/mm_star_scenario.py +95 -0
- helm/benchmark/scenarios/vision_language/vqa_rad_scenario.py +88 -0
- helm/benchmark/scenarios/wikifact_scenario.py +11 -1
- helm/benchmark/scenarios/wikitext_103_scenario.py +1 -1
- helm/benchmark/scenarios/wildbench_scenario.py +83 -0
- helm/benchmark/scenarios/winogrande_afr_scenario.py +78 -0
- helm/benchmark/scenarios/wmt_14_scenario.py +14 -2
- helm/benchmark/scenarios/xstest_scenario.py +1 -1
- helm/benchmark/server.py +11 -0
- helm/benchmark/slurm_runner.py +1 -1
- helm/benchmark/static/schema_audio.yaml +752 -0
- helm/benchmark/static/schema_autobencher.yaml +150 -0
- helm/benchmark/static/schema_call_center.yaml +97 -60
- helm/benchmark/static/schema_capabilities.yaml +254 -0
- helm/benchmark/static/schema_czech_bank.yaml +148 -0
- helm/benchmark/static/schema_enem_challenge.yaml +146 -0
- helm/benchmark/static/schema_enterprise.yaml +298 -0
- helm/benchmark/static/schema_finance.yaml +14 -12
- helm/benchmark/static/schema_heim.yaml +1389 -0
- helm/benchmark/static/schema_legal.yaml +566 -0
- helm/benchmark/static/{schema_medical.yaml → schema_long_context.yaml} +67 -82
- helm/benchmark/static/schema_medhelm.yaml +1081 -0
- helm/benchmark/static/schema_mmlu_winogrande_afr.yaml +1045 -0
- helm/benchmark/static/schema_safety.yaml +42 -6
- helm/benchmark/static/{schema_bhasa.yaml → schema_seahelm.yaml} +40 -26
- helm/benchmark/static/schema_social_audio.yaml +224 -0
- helm/benchmark/static/schema_sql.yaml +171 -0
- helm/benchmark/static/{schema_tables.yaml → schema_torr.yaml} +187 -30
- helm/benchmark/static/schema_tweetsentbr.yaml +146 -0
- helm/benchmark/static/schema_vhelm.yaml +151 -47
- helm/benchmark/static_build/assets/helm-safety-2907a7b6.png +0 -0
- helm/benchmark/static_build/assets/index-262903c1.js +10 -0
- helm/benchmark/static_build/assets/index-42060d71.css +1 -0
- helm/benchmark/static_build/assets/medhelm-overview-3ddfcd65.png +0 -0
- helm/benchmark/static_build/assets/{react-d4a0b69b.js → react-f82877fd.js} +1 -1
- helm/benchmark/static_build/assets/{recharts-6d337683.js → recharts-4037aff0.js} +1 -1
- helm/benchmark/static_build/assets/{tremor-54a99cc4.js → tremor-9cefc3c5.js} +1 -1
- helm/benchmark/static_build/assets/vhelm-aspects-1437d673.png +0 -0
- helm/benchmark/static_build/assets/vhelm-framework-a1ca3f3f.png +0 -0
- helm/benchmark/static_build/assets/vhelm-model-8afb7616.png +0 -0
- helm/benchmark/static_build/config.js +1 -1
- helm/benchmark/static_build/index.html +5 -5
- helm/benchmark/window_services/default_window_service.py +1 -1
- helm/benchmark/window_services/encoder_decoder_window_service.py +1 -1
- helm/benchmark/window_services/ice_window_service.py +1 -1
- helm/benchmark/window_services/image_generation/lexica_search_window_service.py +1 -1
- helm/benchmark/window_services/image_generation/openai_dalle_window_service.py +1 -1
- helm/benchmark/window_services/local_window_service.py +2 -2
- helm/benchmark/window_services/test_anthropic_window_service.py +3 -3
- helm/benchmark/window_services/test_bloom_window_service.py +3 -3
- helm/benchmark/window_services/test_gpt2_window_service.py +7 -2
- helm/benchmark/window_services/test_gpt4_window_service.py +8 -3
- helm/benchmark/window_services/test_gptj_window_service.py +8 -3
- helm/benchmark/window_services/test_gptneox_window_service.py +3 -3
- helm/benchmark/window_services/test_openai_window_service.py +8 -3
- helm/benchmark/window_services/test_opt_window_service.py +3 -3
- helm/benchmark/window_services/test_palmyra_window_service.py +3 -3
- helm/benchmark/window_services/test_t0pp_window_service.py +3 -3
- helm/benchmark/window_services/test_t511b_window_service.py +3 -3
- helm/benchmark/window_services/test_ul2_window_service.py +3 -3
- helm/benchmark/window_services/test_utils.py +1 -1
- helm/benchmark/window_services/test_yalm_window_service.py +3 -3
- helm/benchmark/window_services/tokenizer_service.py +0 -5
- helm/benchmark/window_services/yalm_window_service.py +1 -1
- helm/clients/ai21_client.py +3 -3
- helm/clients/aleph_alpha_client.py +1 -1
- helm/clients/audio_language/__init__.py +0 -0
- helm/clients/audio_language/diva_llama_client.py +118 -0
- helm/clients/audio_language/llama_omni_client.py +198 -0
- helm/clients/audio_language/qwen2_audiolm_client.py +188 -0
- helm/clients/audio_language/qwen_audiolm_client.py +150 -0
- helm/clients/auto_client.py +4 -2
- helm/clients/azure_openai_client.py +55 -0
- helm/clients/bedrock_client.py +201 -7
- helm/clients/bedrock_utils.py +33 -0
- helm/clients/clip_scorers/clip_scorer.py +1 -1
- helm/clients/clip_scorers/multilingual_clip_scorer.py +1 -1
- helm/clients/cohere_client.py +3 -3
- helm/clients/google_client.py +1 -1
- helm/clients/http_model_client.py +1 -1
- helm/clients/huggingface_client.py +10 -18
- helm/clients/ibm_client.py +267 -0
- helm/clients/image_generation/adobe_vision_client.py +1 -1
- helm/clients/image_generation/aleph_alpha_image_generation_client.py +1 -1
- helm/clients/image_generation/cogview2/sr_pipeline/__init__.py +3 -3
- helm/clients/image_generation/cogview2/sr_pipeline/direct_sr.py +5 -2
- helm/clients/image_generation/cogview2/sr_pipeline/iterative_sr.py +5 -2
- helm/clients/image_generation/cogview2/sr_pipeline/sr_group.py +2 -2
- helm/clients/image_generation/cogview2_client.py +1 -1
- helm/clients/image_generation/dalle2_client.py +1 -1
- helm/clients/image_generation/dalle3_client.py +2 -2
- helm/clients/image_generation/dalle_mini/__init__.py +1 -1
- helm/clients/image_generation/dalle_mini/data.py +1 -1
- helm/clients/image_generation/dalle_mini/model/__init__.py +5 -5
- helm/clients/image_generation/dalle_mini/model/configuration.py +1 -1
- helm/clients/image_generation/dalle_mini/model/modeling.py +2 -2
- helm/clients/image_generation/dalle_mini/model/processor.py +4 -4
- helm/clients/image_generation/dalle_mini/model/tokenizer.py +1 -1
- helm/clients/image_generation/dalle_mini/vqgan_jax/__init__.py +1 -1
- helm/clients/image_generation/dalle_mini/vqgan_jax/convert_pt_model_to_jax.py +2 -2
- helm/clients/image_generation/dalle_mini/vqgan_jax/modeling_flax_vqgan.py +1 -1
- helm/clients/image_generation/dalle_mini_client.py +1 -1
- helm/clients/image_generation/deep_floyd_client.py +1 -1
- helm/clients/image_generation/huggingface_diffusers_client.py +1 -1
- helm/clients/image_generation/lexica_client.py +1 -1
- helm/clients/image_generation/mindalle/models/__init__.py +6 -6
- helm/clients/image_generation/mindalle/models/stage1/vqgan.py +1 -1
- helm/clients/image_generation/mindalle/models/stage2/transformer.py +1 -1
- helm/clients/image_generation/mindalle/utils/__init__.py +3 -3
- helm/clients/image_generation/mindalle_client.py +1 -1
- helm/clients/image_generation/together_image_generation_client.py +1 -1
- helm/clients/lit_gpt_client.py +2 -2
- helm/clients/mistral_client.py +62 -18
- helm/clients/nvidia_nim_client.py +0 -3
- helm/clients/openai_client.py +255 -21
- helm/clients/palmyra_client.py +2 -6
- helm/clients/reka_client.py +1 -1
- helm/clients/stanfordhealthcare_azure_openai_client.py +58 -0
- helm/clients/stanfordhealthcare_claude_client.py +31 -0
- helm/clients/stanfordhealthcare_google_client.py +43 -0
- helm/clients/stanfordhealthcare_http_model_client.py +93 -0
- helm/clients/stanfordhealthcare_openai_client.py +62 -0
- helm/clients/stanfordhealthcare_shc_openai_client.py +42 -0
- helm/clients/test_client.py +1 -1
- helm/clients/test_together_client.py +6 -1
- helm/clients/together_client.py +69 -7
- helm/clients/upstage_client.py +23 -0
- helm/clients/vertexai_client.py +39 -13
- helm/clients/vision_language/open_flamingo/__init__.py +2 -2
- helm/clients/vision_language/open_flamingo/src/factory.py +3 -3
- helm/clients/vision_language/open_flamingo/src/flamingo.py +2 -2
- helm/clients/vision_language/open_flamingo/src/flamingo_lm.py +2 -2
- helm/clients/vision_language/qwen2_vlm_client.py +175 -0
- helm/clients/vllm_client.py +4 -6
- helm/clients/yi_client.py +0 -3
- helm/common/audio_utils.py +111 -0
- helm/common/cache.py +8 -30
- helm/common/file_caches/local_file_cache.py +1 -1
- helm/common/file_caches/test_local_file_cache.py +1 -1
- helm/common/images_utils.py +2 -2
- helm/common/key_value_store.py +9 -9
- helm/common/media_object.py +2 -2
- helm/common/mongo_key_value_store.py +3 -3
- helm/common/multimodal_request_utils.py +26 -0
- helm/common/reeval_parameters.py +12 -0
- helm/common/request.py +6 -2
- helm/common/response_format.py +18 -0
- helm/common/test_cache.py +1 -48
- helm/common/test_media_object.py +1 -1
- helm/common/tokenization_request.py +0 -9
- helm/config/model_deployments.yaml +1258 -33
- helm/config/model_metadata.yaml +1110 -41
- helm/config/tokenizer_configs.yaml +403 -3
- helm/proxy/cli.py +2 -2
- helm/proxy/example_queries.py +1 -1
- helm/proxy/server.py +11 -13
- helm/proxy/services/remote_service.py +1 -7
- helm/proxy/services/server_service.py +6 -19
- helm/proxy/services/service.py +0 -6
- helm/proxy/services/test_remote_service.py +2 -2
- helm/proxy/services/test_service.py +1 -1
- helm/proxy/static/general.js +122 -0
- helm/proxy/static/help.html +99 -0
- helm/proxy/static/index.css +57 -0
- helm/proxy/static/index.html +40 -0
- helm/proxy/static/index.js +456 -0
- helm/proxy/static/info-icon.png +0 -0
- helm/proxy/test_retry.py +1 -1
- helm/proxy/token_counters/auto_token_counter.py +1 -1
- helm/tokenizers/aleph_alpha_tokenizer.py +1 -1
- helm/tokenizers/caching_tokenizer.py +2 -30
- helm/tokenizers/http_model_tokenizer.py +1 -1
- helm/tokenizers/huggingface_tokenizer.py +2 -2
- helm/tokenizers/lit_gpt_tokenizer.py +1 -1
- helm/tokenizers/test_anthropic_tokenizer.py +6 -2
- helm/tokenizers/test_huggingface_tokenizer.py +1 -1
- helm/tokenizers/test_yalm_tokenizer.py +1 -1
- helm/tokenizers/tiktoken_tokenizer.py +1 -1
- helm/tokenizers/tokenizer.py +3 -1
- helm/tokenizers/yalm_tokenizer.py +3 -3
- helm/tokenizers/yalm_tokenizer_data/test_yalm_tokenizer.py +1 -1
- crfm_helm-0.5.3.dist-info/METADATA +0 -355
- crfm_helm-0.5.3.dist-info/RECORD +0 -699
- helm/benchmark/data_overlap/data_overlap_spec.py +0 -86
- helm/benchmark/data_overlap/export_scenario_text.py +0 -119
- helm/benchmark/data_overlap/light_scenario.py +0 -60
- helm/benchmark/metrics/bhasa_metrics_specs.py +0 -10
- helm/benchmark/static_build/assets/01-694cb9b7.png +0 -0
- helm/benchmark/static_build/assets/accenture-6f97eeda.png +0 -0
- helm/benchmark/static_build/assets/ai21-0eb91ec3.png +0 -0
- helm/benchmark/static_build/assets/aisingapore-6dfc9acf.png +0 -0
- helm/benchmark/static_build/assets/aleph-alpha-7ce10034.png +0 -0
- helm/benchmark/static_build/assets/anthropic-70d8bc39.png +0 -0
- helm/benchmark/static_build/assets/bigscience-7f0400c0.png +0 -0
- helm/benchmark/static_build/assets/cohere-3550c6cb.png +0 -0
- helm/benchmark/static_build/assets/cresta-9e22b983.png +0 -0
- helm/benchmark/static_build/assets/cuhk-8c5631e9.png +0 -0
- helm/benchmark/static_build/assets/eleutherai-b9451114.png +0 -0
- helm/benchmark/static_build/assets/google-06d997ad.png +0 -0
- helm/benchmark/static_build/assets/index-05c76bb1.css +0 -1
- helm/benchmark/static_build/assets/index-58f97dcd.js +0 -10
- helm/benchmark/static_build/assets/meta-5580e9f1.png +0 -0
- helm/benchmark/static_build/assets/microsoft-f5ee5016.png +0 -0
- helm/benchmark/static_build/assets/mistral-18e1be23.png +0 -0
- helm/benchmark/static_build/assets/nvidia-86fa75c1.png +0 -0
- helm/benchmark/static_build/assets/openai-3f8653e4.png +0 -0
- helm/benchmark/static_build/assets/scb10x-204bd786.png +0 -0
- helm/benchmark/static_build/assets/tii-24de195c.png +0 -0
- helm/benchmark/static_build/assets/together-a665a35b.png +0 -0
- helm/benchmark/static_build/assets/tsinghua-keg-97d4b395.png +0 -0
- helm/benchmark/static_build/assets/vhelm-framework-cde7618a.png +0 -0
- helm/benchmark/static_build/assets/vhelm-model-6d812526.png +0 -0
- helm/benchmark/static_build/assets/wellsfargo-a86a6c4a.png +0 -0
- helm/benchmark/static_build/assets/yandex-38e09d70.png +0 -0
- helm/tokenizers/anthropic_tokenizer.py +0 -52
- {crfm_helm-0.5.3.dist-info → crfm_helm-0.5.5.dist-info}/entry_points.txt +0 -0
- {crfm_helm-0.5.3.dist-info → crfm_helm-0.5.5.dist-info/licenses}/LICENSE +0 -0
- {crfm_helm-0.5.3.dist-info → crfm_helm-0.5.5.dist-info}/top_level.txt +0 -0
- /helm/benchmark/{data_overlap → metrics/ifeval}/__init__.py +0 -0
|
@@ -2,16 +2,19 @@ from dataclasses import dataclass, field
|
|
|
2
2
|
from typing import List, Optional
|
|
3
3
|
|
|
4
4
|
from helm.common.image_generation_parameters import ImageGenerationParameters
|
|
5
|
+
from helm.common.reeval_parameters import REEvalParameters
|
|
5
6
|
|
|
6
7
|
|
|
7
8
|
# Adaptation methods
|
|
8
9
|
ADAPT_GENERATION: str = "generation"
|
|
10
|
+
ADAPT_CHAT: str = "chat"
|
|
9
11
|
ADAPT_LANGUAGE_MODELING: str = "language_modeling"
|
|
10
12
|
ADAPT_MULTIPLE_CHOICE_JOINT: str = "multiple_choice_joint"
|
|
13
|
+
ADAPT_MULTIPLE_CHOICE_JOINT_CHAIN_OF_THOUGHT: str = "multiple_choice_joint_chain_of_thought"
|
|
11
14
|
ADAPT_MULTIPLE_CHOICE_SEPARATE_ORIGINAL: str = "multiple_choice_separate_original"
|
|
12
15
|
ADAPT_MULTIPLE_CHOICE_SEPARATE_CALIBRATED: str = "multiple_choice_separate_calibrated"
|
|
13
16
|
ADAPT_RANKING_BINARY: str = "ranking_binary"
|
|
14
|
-
|
|
17
|
+
ADAPT_EHR_INSTRUCTION: str = "ehr_instruction"
|
|
15
18
|
ADAPT_MULTIPLE_CHOICE_SEPARATE_METHODS: List[str] = [
|
|
16
19
|
ADAPT_MULTIPLE_CHOICE_SEPARATE_ORIGINAL,
|
|
17
20
|
ADAPT_MULTIPLE_CHOICE_SEPARATE_CALIBRATED,
|
|
@@ -63,6 +66,12 @@ class AdapterSpec:
|
|
|
63
66
|
reference_suffix: str = "\n"
|
|
64
67
|
"""The string that is included after each reference (for multiple-choice questions)."""
|
|
65
68
|
|
|
69
|
+
chain_of_thought_prefix: str = ""
|
|
70
|
+
"""The string that is included before each chain of thought. (e.g., 'Let\'s think step by step')"""
|
|
71
|
+
|
|
72
|
+
chain_of_thought_suffix: str = "\n"
|
|
73
|
+
"""The string that is included after each chain of thought. (e.g., 'The correct answer is')"""
|
|
74
|
+
|
|
66
75
|
output_prefix: str = "Output: "
|
|
67
76
|
"""The string that is included before the correct answer/predicted output (e.g., 'Answer:')."""
|
|
68
77
|
|
|
@@ -124,6 +133,9 @@ class AdapterSpec:
|
|
|
124
133
|
image_generation_parameters: Optional[ImageGenerationParameters] = None
|
|
125
134
|
"""Parameters for image generation."""
|
|
126
135
|
|
|
136
|
+
reeval_parameters: Optional[REEvalParameters] = None
|
|
137
|
+
"""Parameters for reeval evaluation."""
|
|
138
|
+
|
|
127
139
|
# Set hash=False to make `AdapterSpec` hashable
|
|
128
140
|
eval_splits: Optional[List[str]] = field(default=None, hash=False)
|
|
129
141
|
"""The splits from which evaluation instances will be drawn."""
|
|
@@ -1,8 +1,11 @@
|
|
|
1
1
|
from helm.benchmark.adaptation.adapter_spec import (
|
|
2
|
+
ADAPT_EHR_INSTRUCTION,
|
|
2
3
|
ADAPT_GENERATION,
|
|
4
|
+
ADAPT_CHAT,
|
|
3
5
|
ADAPT_GENERATION_MULTIMODAL,
|
|
4
6
|
ADAPT_LANGUAGE_MODELING,
|
|
5
7
|
ADAPT_MULTIPLE_CHOICE_JOINT,
|
|
8
|
+
ADAPT_MULTIPLE_CHOICE_JOINT_CHAIN_OF_THOUGHT,
|
|
6
9
|
ADAPT_MULTIPLE_CHOICE_JOINT_MULTIMODAL,
|
|
7
10
|
ADAPT_MULTIPLE_CHOICE_SEPARATE_CALIBRATED,
|
|
8
11
|
ADAPT_MULTIPLE_CHOICE_SEPARATE_ORIGINAL,
|
|
@@ -12,6 +15,7 @@ from helm.benchmark.adaptation.adapter_spec import (
|
|
|
12
15
|
from helm.benchmark.adaptation.adapters.adapter import Adapter
|
|
13
16
|
from helm.benchmark.adaptation.adapters.binary_ranking_adapter import BinaryRankingAdapter
|
|
14
17
|
from helm.benchmark.adaptation.adapters.generation_adapter import GenerationAdapter
|
|
18
|
+
from helm.benchmark.adaptation.adapters.chat_adapter import ChatAdapter
|
|
15
19
|
from helm.benchmark.adaptation.adapters.language_modeling_adapter import LanguageModelingAdapter
|
|
16
20
|
from helm.benchmark.adaptation.adapters.multimodal.generation_multimodal_adapter import GenerationMultimodalAdapter
|
|
17
21
|
from helm.benchmark.adaptation.adapters.multimodal.multiple_choice_joint_multimodal_adapter import (
|
|
@@ -19,8 +23,12 @@ from helm.benchmark.adaptation.adapters.multimodal.multiple_choice_joint_multimo
|
|
|
19
23
|
)
|
|
20
24
|
from helm.benchmark.adaptation.adapters.multiple_choice_calibrated_adapter import MultipleChoiceCalibratedAdapter
|
|
21
25
|
from helm.benchmark.adaptation.adapters.multiple_choice_joint_adapter import MultipleChoiceJointAdapter
|
|
26
|
+
from helm.benchmark.adaptation.adapters.multiple_choice_joint_chain_of_thought_adapter import (
|
|
27
|
+
MultipleChoiceJointChainOfThoughtAdapter,
|
|
28
|
+
)
|
|
22
29
|
from helm.benchmark.adaptation.adapters.multiple_choice_separate_adapter import MultipleChoiceSeparateAdapter
|
|
23
30
|
from helm.benchmark.window_services.tokenizer_service import TokenizerService
|
|
31
|
+
from helm.benchmark.adaptation.adapters.ehr_instruction_adapter import EHRInstructionAdapter
|
|
24
32
|
|
|
25
33
|
|
|
26
34
|
class AdapterFactory:
|
|
@@ -32,12 +40,18 @@ class AdapterFactory:
|
|
|
32
40
|
method: str = adapter_spec.method
|
|
33
41
|
adapter: Adapter
|
|
34
42
|
|
|
35
|
-
if method ==
|
|
43
|
+
if method == ADAPT_EHR_INSTRUCTION:
|
|
44
|
+
adapter = EHRInstructionAdapter(adapter_spec, tokenizer_service)
|
|
45
|
+
elif method == ADAPT_GENERATION:
|
|
36
46
|
adapter = GenerationAdapter(adapter_spec, tokenizer_service)
|
|
47
|
+
elif method == ADAPT_CHAT:
|
|
48
|
+
adapter = ChatAdapter(adapter_spec, tokenizer_service)
|
|
37
49
|
elif method == ADAPT_LANGUAGE_MODELING:
|
|
38
50
|
adapter = LanguageModelingAdapter(adapter_spec, tokenizer_service)
|
|
39
51
|
elif method == ADAPT_MULTIPLE_CHOICE_JOINT:
|
|
40
52
|
adapter = MultipleChoiceJointAdapter(adapter_spec, tokenizer_service)
|
|
53
|
+
elif method == ADAPT_MULTIPLE_CHOICE_JOINT_CHAIN_OF_THOUGHT:
|
|
54
|
+
adapter = MultipleChoiceJointChainOfThoughtAdapter(adapter_spec, tokenizer_service)
|
|
41
55
|
elif method == ADAPT_MULTIPLE_CHOICE_SEPARATE_ORIGINAL:
|
|
42
56
|
adapter = MultipleChoiceSeparateAdapter(adapter_spec, tokenizer_service)
|
|
43
57
|
elif method == ADAPT_MULTIPLE_CHOICE_SEPARATE_CALIBRATED:
|
|
@@ -3,7 +3,7 @@ from typing import List, Optional
|
|
|
3
3
|
from helm.benchmark.adaptation.request_state import RequestState
|
|
4
4
|
from helm.benchmark.scenarios.scenario import Instance, Reference, TRAIN_SPLIT, EVAL_SPLITS, CORRECT_TAG
|
|
5
5
|
from helm.common.request import Request
|
|
6
|
-
from .in_context_learning_adapter import InContextLearningAdapter
|
|
6
|
+
from helm.benchmark.adaptation.adapters.in_context_learning_adapter import InContextLearningAdapter
|
|
7
7
|
|
|
8
8
|
|
|
9
9
|
class BinaryRankingAdapter(InContextLearningAdapter):
|
|
@@ -0,0 +1,49 @@
|
|
|
1
|
+
from typing import List
|
|
2
|
+
|
|
3
|
+
from helm.benchmark.adaptation.request_state import RequestState
|
|
4
|
+
from helm.benchmark.scenarios.scenario import Instance
|
|
5
|
+
from helm.common.request import Request
|
|
6
|
+
from helm.benchmark.adaptation.adapters.in_context_learning_adapter import InContextLearningAdapter
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
class ChatAdapter(InContextLearningAdapter):
|
|
10
|
+
"""
|
|
11
|
+
Each `Instance` in a `Scenario` has a history of the format:
|
|
12
|
+
|
|
13
|
+
[
|
|
14
|
+
{"role": "user", "content": <user-content>},
|
|
15
|
+
{"role": "assistant", "content": <assistant-content>},
|
|
16
|
+
{"role": "user", "content": <user-content>},
|
|
17
|
+
...
|
|
18
|
+
]
|
|
19
|
+
|
|
20
|
+
"""
|
|
21
|
+
|
|
22
|
+
def generate_requests(
|
|
23
|
+
self, eval_instance: Instance, train_trial_index: int, training_instances: List[Instance]
|
|
24
|
+
) -> List[RequestState]:
|
|
25
|
+
if eval_instance.input.messages is None:
|
|
26
|
+
raise ValueError("ChatAdapter requires input.messages of instances to be non-empty")
|
|
27
|
+
request = Request(
|
|
28
|
+
model=self.adapter_spec.model,
|
|
29
|
+
model_deployment=self.adapter_spec.model_deployment,
|
|
30
|
+
messages=eval_instance.input.messages,
|
|
31
|
+
num_completions=self.adapter_spec.num_outputs,
|
|
32
|
+
temperature=self.adapter_spec.temperature,
|
|
33
|
+
max_tokens=self.adapter_spec.max_tokens,
|
|
34
|
+
stop_sequences=self.adapter_spec.stop_sequences,
|
|
35
|
+
random=self.adapter_spec.random,
|
|
36
|
+
image_generation_parameters=self.adapter_spec.image_generation_parameters,
|
|
37
|
+
)
|
|
38
|
+
request_state = RequestState(
|
|
39
|
+
instance=eval_instance,
|
|
40
|
+
reference_index=None,
|
|
41
|
+
request_mode=None,
|
|
42
|
+
train_trial_index=train_trial_index,
|
|
43
|
+
output_mapping=None,
|
|
44
|
+
request=request,
|
|
45
|
+
result=None,
|
|
46
|
+
num_train_instances=0,
|
|
47
|
+
prompt_truncated=False,
|
|
48
|
+
)
|
|
49
|
+
return [request_state]
|
|
@@ -0,0 +1,108 @@
|
|
|
1
|
+
from typing import List, Optional
|
|
2
|
+
|
|
3
|
+
from helm.benchmark.adaptation.adapters.generation_adapter import GenerationAdapter
|
|
4
|
+
from helm.benchmark.adaptation.prompt import Prompt
|
|
5
|
+
from helm.benchmark.adaptation.request_state import RequestState
|
|
6
|
+
from helm.benchmark.scenarios.scenario import TRAIN_SPLIT, Instance
|
|
7
|
+
from helm.benchmark.window_services.window_service import EncodeResult
|
|
8
|
+
from helm.common.tokenization_request import TokenizationToken
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
# in the prompt templates for EHR instructions, this is the placeholder for the EHR part
|
|
12
|
+
# which we use to compute accurate tokenized sequence lengths
|
|
13
|
+
PROMPT_TEMPLATE_EHR_PLACEHOLDER = "{ehr}"
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
class EHRInstructionAdapter(GenerationAdapter):
|
|
17
|
+
"""
|
|
18
|
+
Each instance consists of the following:
|
|
19
|
+
|
|
20
|
+
EHRInstructionInput:
|
|
21
|
+
question: the question to answer or instruction to follow
|
|
22
|
+
ehr: the XML-tagged EHR to use as context to answer the question
|
|
23
|
+
prompt_template: a string template for how to combine the question + ehr
|
|
24
|
+
|
|
25
|
+
Reference output:
|
|
26
|
+
text: the 'golden' clinician response to the question
|
|
27
|
+
|
|
28
|
+
This Adapter combines the above into RequestStates with logic to truncate the EHR specifically
|
|
29
|
+
to fit in the context window with enough room for the instruction/question and the specified
|
|
30
|
+
amount of generated tokens.
|
|
31
|
+
"""
|
|
32
|
+
|
|
33
|
+
def adapt(self, instances: List[Instance], parallelism: int) -> List[RequestState]:
|
|
34
|
+
"""
|
|
35
|
+
Main adaptation method which takes all instances and turns them into `RequestState` objects.
|
|
36
|
+
"""
|
|
37
|
+
# sanity check, since for now we assume that there are no training instances at all
|
|
38
|
+
if any(instance.split == TRAIN_SPLIT for instance in instances):
|
|
39
|
+
raise RuntimeError(f"Got train instances for {self.__class__.__name__} - expected only eval instances.")
|
|
40
|
+
|
|
41
|
+
# use superclass implementation here
|
|
42
|
+
return super().adapt(instances, parallelism)
|
|
43
|
+
|
|
44
|
+
def construct_prompt(
|
|
45
|
+
self,
|
|
46
|
+
train_instances: List[Instance], # unused
|
|
47
|
+
eval_instance: Instance,
|
|
48
|
+
include_output: bool, # unused
|
|
49
|
+
reference_index: Optional[int], # unused
|
|
50
|
+
) -> Prompt:
|
|
51
|
+
"""
|
|
52
|
+
Uses the instance to construct a prompt for a given eval instance.
|
|
53
|
+
|
|
54
|
+
Parameters
|
|
55
|
+
----------
|
|
56
|
+
eval_instance: Instance
|
|
57
|
+
the instance we wish to use to construct the prompt
|
|
58
|
+
"""
|
|
59
|
+
# start by simply getting the inputs
|
|
60
|
+
question = eval_instance.input.text
|
|
61
|
+
assert eval_instance.extra_data is not None
|
|
62
|
+
ehr_text: str = eval_instance.extra_data["ehr"]
|
|
63
|
+
prompt_template: str = eval_instance.extra_data["prompt_template"]
|
|
64
|
+
full_prompt_text = prompt_template.format(question=question, ehr=ehr_text)
|
|
65
|
+
|
|
66
|
+
# insert the question and see how many tokens we have so far
|
|
67
|
+
prompt_with_instr_no_ehr_placeholder = prompt_template.format(question=question, ehr="")
|
|
68
|
+
num_tokens_no_ehr = self.window_service.get_num_tokens(prompt_with_instr_no_ehr_placeholder)
|
|
69
|
+
|
|
70
|
+
# number of tokens we can allow the EHR part to be
|
|
71
|
+
target_ehr_num_tokens = (
|
|
72
|
+
self.window_service.max_request_length - self.adapter_spec.max_tokens - num_tokens_no_ehr
|
|
73
|
+
)
|
|
74
|
+
|
|
75
|
+
# round-trip tokenization to get the correct token length we need
|
|
76
|
+
# NOTE: we truncate from the left side so that the most recent pieces of the EHR are included in the context
|
|
77
|
+
# as opposed to the canonical way of truncating from the right. This is done to match the MedAlign method.
|
|
78
|
+
full_ehr_tokens: EncodeResult = self.window_service.encode(ehr_text, max_length=None, truncation=False)
|
|
79
|
+
truncated_ehr_tokens: List[TokenizationToken] = full_ehr_tokens.tokens[-target_ehr_num_tokens:]
|
|
80
|
+
ehr_truncated: str
|
|
81
|
+
ehr_truncated = self.window_service.decode(truncated_ehr_tokens)
|
|
82
|
+
|
|
83
|
+
# create the truncated prompt
|
|
84
|
+
truncated_prompt_text = prompt_template.format(question=question, ehr=ehr_truncated)
|
|
85
|
+
num_truncations = 1
|
|
86
|
+
while (
|
|
87
|
+
num_extra_tokens := self.adapter_spec.max_tokens
|
|
88
|
+
+ self.window_service.get_num_tokens(truncated_prompt_text)
|
|
89
|
+
- self.window_service.max_request_length
|
|
90
|
+
) > 0:
|
|
91
|
+
truncated_ehr_tokens = truncated_ehr_tokens[num_extra_tokens:]
|
|
92
|
+
ehr_truncated = self.window_service.decode(truncated_ehr_tokens)
|
|
93
|
+
truncated_prompt_text = prompt_template.format(question=question, ehr=ehr_truncated)
|
|
94
|
+
num_truncations += 1
|
|
95
|
+
|
|
96
|
+
# naively construct the full non-truncated prompt
|
|
97
|
+
prompt = Prompt(
|
|
98
|
+
global_prefix=self.adapter_spec.global_prefix,
|
|
99
|
+
global_suffix=self.adapter_spec.global_suffix,
|
|
100
|
+
instance_prefix=self.adapter_spec.instance_prefix,
|
|
101
|
+
substitutions=self.adapter_spec.substitutions,
|
|
102
|
+
instructions_block=self.adapter_spec.instructions,
|
|
103
|
+
train_instance_blocks=[],
|
|
104
|
+
eval_instance_block=full_prompt_text,
|
|
105
|
+
truncated_text=truncated_prompt_text,
|
|
106
|
+
)
|
|
107
|
+
|
|
108
|
+
return prompt
|
|
@@ -4,7 +4,7 @@ from helm.benchmark.adaptation.prompt import Prompt
|
|
|
4
4
|
from helm.benchmark.adaptation.request_state import RequestState
|
|
5
5
|
from helm.benchmark.scenarios.scenario import Instance
|
|
6
6
|
from helm.common.request import Request
|
|
7
|
-
from .in_context_learning_adapter import InContextLearningAdapter
|
|
7
|
+
from helm.benchmark.adaptation.adapters.in_context_learning_adapter import InContextLearningAdapter
|
|
8
8
|
|
|
9
9
|
|
|
10
10
|
class GenerationAdapter(InContextLearningAdapter):
|
|
@@ -11,7 +11,7 @@ from helm.benchmark.scenarios.scenario import Instance, TRAIN_SPLIT, EVAL_SPLITS
|
|
|
11
11
|
from helm.common.general import parallel_map
|
|
12
12
|
from helm.common.request import Request
|
|
13
13
|
from helm.common.hierarchical_logger import hlog, htrack, htrack_block
|
|
14
|
-
from .adapter import Adapter
|
|
14
|
+
from helm.benchmark.adaptation.adapters.adapter import Adapter
|
|
15
15
|
|
|
16
16
|
|
|
17
17
|
class InContextLearningAdapter(Adapter, ABC):
|
|
@@ -7,7 +7,7 @@ from helm.common.general import flatten_list, parallel_map
|
|
|
7
7
|
from helm.common.hierarchical_logger import hlog, htrack
|
|
8
8
|
from helm.common.request import Request
|
|
9
9
|
from helm.common.tokenization_request import TokenizationToken
|
|
10
|
-
from .adapter import Adapter
|
|
10
|
+
from helm.benchmark.adaptation.adapters.adapter import Adapter
|
|
11
11
|
|
|
12
12
|
|
|
13
13
|
class LanguageModelingAdapter(Adapter):
|
|
@@ -3,8 +3,10 @@ from typing import List
|
|
|
3
3
|
from helm.benchmark.adaptation.request_state import RequestState
|
|
4
4
|
from helm.benchmark.scenarios.scenario import Instance
|
|
5
5
|
from helm.common.request import Request
|
|
6
|
-
from .in_context_learning_multimodal_adapter import
|
|
7
|
-
|
|
6
|
+
from helm.benchmark.adaptation.adapters.multimodal.in_context_learning_multimodal_adapter import (
|
|
7
|
+
InContextLearningMultimodalAdapter,
|
|
8
|
+
)
|
|
9
|
+
from helm.benchmark.adaptation.adapters.multimodal.multimodal_prompt import MultimodalPrompt
|
|
8
10
|
|
|
9
11
|
|
|
10
12
|
class GenerationMultimodalAdapter(InContextLearningMultimodalAdapter):
|
|
@@ -8,7 +8,7 @@ from helm.common.hierarchical_logger import hlog
|
|
|
8
8
|
from helm.common.media_object import MediaObject, MultimediaObject
|
|
9
9
|
from helm.common.request import Request
|
|
10
10
|
from helm.benchmark.adaptation.adapters.in_context_learning_adapter import InContextLearningAdapter
|
|
11
|
-
from .multimodal_prompt import MultimodalPrompt
|
|
11
|
+
from helm.benchmark.adaptation.adapters.multimodal.multimodal_prompt import MultimodalPrompt
|
|
12
12
|
|
|
13
13
|
|
|
14
14
|
class InContextLearningMultimodalAdapter(InContextLearningAdapter, ABC):
|
|
@@ -8,7 +8,7 @@ from helm.common.request import Request
|
|
|
8
8
|
from helm.benchmark.adaptation.adapters.multimodal.in_context_learning_multimodal_adapter import (
|
|
9
9
|
InContextLearningMultimodalAdapter,
|
|
10
10
|
)
|
|
11
|
-
from .multimodal_prompt import MultimodalPrompt
|
|
11
|
+
from helm.benchmark.adaptation.adapters.multimodal.multimodal_prompt import MultimodalPrompt
|
|
12
12
|
|
|
13
13
|
|
|
14
14
|
class MultipleChoiceJointMultimodalAdapter(InContextLearningMultimodalAdapter, ABC):
|
helm/benchmark/adaptation/adapters/multimodal/test_in_context_learning_multimodal_adapter.py
CHANGED
|
@@ -8,8 +8,10 @@ from helm.benchmark.scenarios.scenario import Instance, Reference, Input, Output
|
|
|
8
8
|
from helm.benchmark.window_services.test_utils import get_tokenizer_service
|
|
9
9
|
from helm.benchmark.adaptation.adapter_spec import AdapterSpec
|
|
10
10
|
from helm.benchmark.adaptation.adapters.adapter_factory import ADAPT_GENERATION_MULTIMODAL, AdapterFactory
|
|
11
|
-
from .in_context_learning_multimodal_adapter import
|
|
12
|
-
|
|
11
|
+
from helm.benchmark.adaptation.adapters.multimodal.in_context_learning_multimodal_adapter import (
|
|
12
|
+
InContextLearningMultimodalAdapter,
|
|
13
|
+
)
|
|
14
|
+
from helm.benchmark.adaptation.adapters.multimodal.multimodal_prompt import MultimodalPrompt
|
|
13
15
|
|
|
14
16
|
|
|
15
17
|
class TestInContextLearningMultimodalAdapter(unittest.TestCase):
|
|
@@ -2,7 +2,7 @@ from typing import List
|
|
|
2
2
|
import unittest
|
|
3
3
|
|
|
4
4
|
from helm.common.media_object import MediaObject, MultimediaObject
|
|
5
|
-
from .multimodal_prompt import MultimodalPrompt
|
|
5
|
+
from helm.benchmark.adaptation.adapters.multimodal.multimodal_prompt import MultimodalPrompt
|
|
6
6
|
|
|
7
7
|
|
|
8
8
|
class TestMultimodalContent(unittest.TestCase):
|
|
@@ -3,7 +3,7 @@ from typing import List
|
|
|
3
3
|
|
|
4
4
|
from helm.benchmark.adaptation.request_state import RequestState
|
|
5
5
|
from helm.benchmark.scenarios.scenario import Instance, Input
|
|
6
|
-
from .multiple_choice_separate_adapter import MultipleChoiceSeparateAdapter
|
|
6
|
+
from helm.benchmark.adaptation.adapters.multiple_choice_separate_adapter import MultipleChoiceSeparateAdapter
|
|
7
7
|
|
|
8
8
|
|
|
9
9
|
class MultipleChoiceCalibratedAdapter(MultipleChoiceSeparateAdapter):
|
|
@@ -3,7 +3,7 @@ from typing import List, Dict, Optional
|
|
|
3
3
|
from helm.benchmark.adaptation.request_state import RequestState
|
|
4
4
|
from helm.benchmark.scenarios.scenario import Instance
|
|
5
5
|
from helm.common.request import Request
|
|
6
|
-
from .in_context_learning_adapter import InContextLearningAdapter
|
|
6
|
+
from helm.benchmark.adaptation.adapters.in_context_learning_adapter import InContextLearningAdapter
|
|
7
7
|
|
|
8
8
|
|
|
9
9
|
class MultipleChoiceJointAdapter(InContextLearningAdapter):
|
|
@@ -40,7 +40,7 @@ class MultipleChoiceJointAdapter(InContextLearningAdapter):
|
|
|
40
40
|
|
|
41
41
|
@staticmethod
|
|
42
42
|
def get_prefix_char(prefix: str) -> str:
|
|
43
|
-
return prefix.
|
|
43
|
+
return [char for char in prefix if char.isalnum()][0]
|
|
44
44
|
|
|
45
45
|
@staticmethod
|
|
46
46
|
def get_reference_prefix(prefix: str, i: int) -> str:
|
|
@@ -0,0 +1,87 @@
|
|
|
1
|
+
from typing import Optional
|
|
2
|
+
|
|
3
|
+
from helm.benchmark.scenarios.scenario import Instance
|
|
4
|
+
from helm.benchmark.adaptation.adapters.multiple_choice_joint_adapter import MultipleChoiceJointAdapter
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
class MultipleChoiceJointChainOfThoughtAdapter(MultipleChoiceJointAdapter):
|
|
8
|
+
"""
|
|
9
|
+
Each `Instance` in a `Scenario` looks like this:
|
|
10
|
+
|
|
11
|
+
<input> -> <reference1>
|
|
12
|
+
<reference2>
|
|
13
|
+
<reference3> [correct]
|
|
14
|
+
<reference4>
|
|
15
|
+
|
|
16
|
+
<instance_chain_of_thought>
|
|
17
|
+
|
|
18
|
+
We can define a label (e.g., letter) for each reference:
|
|
19
|
+
|
|
20
|
+
<global_prefix>
|
|
21
|
+
<instructions>
|
|
22
|
+
<input_prefix>
|
|
23
|
+
<input> # train
|
|
24
|
+
<input_suffix>
|
|
25
|
+
A. <reference1>
|
|
26
|
+
B. <reference2>
|
|
27
|
+
C. <reference3>
|
|
28
|
+
D. <reference4>
|
|
29
|
+
<output_prefix>
|
|
30
|
+
<chain_of_thought_prefix>
|
|
31
|
+
<instance_chain_of_thought>
|
|
32
|
+
<chain_of_thought_suffix>
|
|
33
|
+
<output>
|
|
34
|
+
<output_suffix>
|
|
35
|
+
|
|
36
|
+
<input_prefix>
|
|
37
|
+
<input> # test
|
|
38
|
+
<input_suffix>
|
|
39
|
+
A. <reference1>
|
|
40
|
+
B. <reference2>
|
|
41
|
+
C. <reference3>
|
|
42
|
+
D. <reference4>
|
|
43
|
+
<output_prefix>
|
|
44
|
+
<chain_of_thought_prefix>
|
|
45
|
+
<instance_chain_of_thought>
|
|
46
|
+
<chain_of_thought_suffix>
|
|
47
|
+
<output>
|
|
48
|
+
<output_suffix>
|
|
49
|
+
<global_suffix>
|
|
50
|
+
|
|
51
|
+
In general, each example is:
|
|
52
|
+
|
|
53
|
+
<input_prefix><input><input_suffix><reference_prefixes[index]><reference> \
|
|
54
|
+
<output_prefix><chain_of_thought_prefix><chain_of_thought><chain_of_thought_suffix><output><output_suffix>
|
|
55
|
+
"""
|
|
56
|
+
|
|
57
|
+
def construct_example_prompt(self, instance: Instance, include_output: bool, reference_index: Optional[int]) -> str:
|
|
58
|
+
"""Return a list of lines corresponding to this example (part of the prompt)."""
|
|
59
|
+
# Input
|
|
60
|
+
result: str = self.adapter_spec.input_prefix + instance.input.text + self.adapter_spec.input_suffix
|
|
61
|
+
|
|
62
|
+
# Include the references
|
|
63
|
+
delimiter = ", "
|
|
64
|
+
no_correct_references = "n/a"
|
|
65
|
+
output = no_correct_references
|
|
66
|
+
for reference_index, reference in enumerate(instance.references):
|
|
67
|
+
prefix = self.get_reference_prefix(self.adapter_spec.reference_prefix, reference_index)
|
|
68
|
+
result += prefix + reference.output.text + self.adapter_spec.reference_suffix
|
|
69
|
+
if reference.is_correct:
|
|
70
|
+
if output == no_correct_references:
|
|
71
|
+
output = self.get_reference_prefix(self.adapter_spec.reference_prefix, reference_index)
|
|
72
|
+
elif self.adapter_spec.multi_label:
|
|
73
|
+
output += delimiter
|
|
74
|
+
output += self.get_reference_prefix(self.adapter_spec.reference_prefix, reference_index)
|
|
75
|
+
|
|
76
|
+
if include_output:
|
|
77
|
+
chain_of_thought = instance.extra_data.get("chain_of_thought", "") if instance.extra_data else ""
|
|
78
|
+
chain_of_thought_block = (
|
|
79
|
+
self.adapter_spec.chain_of_thought_prefix + chain_of_thought + self.adapter_spec.chain_of_thought_suffix
|
|
80
|
+
)
|
|
81
|
+
result += (
|
|
82
|
+
self.adapter_spec.output_prefix + chain_of_thought_block + output + self.adapter_spec.output_suffix
|
|
83
|
+
)
|
|
84
|
+
else:
|
|
85
|
+
result += self.adapter_spec.output_prefix.rstrip()
|
|
86
|
+
|
|
87
|
+
return result
|
|
@@ -4,7 +4,7 @@ from helm.benchmark.adaptation.prompt import Prompt
|
|
|
4
4
|
from helm.benchmark.adaptation.request_state import RequestState
|
|
5
5
|
from helm.benchmark.scenarios.scenario import Instance
|
|
6
6
|
from helm.common.request import Request
|
|
7
|
-
from .in_context_learning_adapter import InContextLearningAdapter
|
|
7
|
+
from helm.benchmark.adaptation.adapters.in_context_learning_adapter import InContextLearningAdapter
|
|
8
8
|
|
|
9
9
|
|
|
10
10
|
class MultipleChoiceSeparateAdapter(InContextLearningAdapter):
|
|
@@ -14,9 +14,9 @@ from helm.benchmark.scenarios.scenario import (
|
|
|
14
14
|
from helm.benchmark.run_specs.simple_run_specs import get_simple1_spec
|
|
15
15
|
from helm.benchmark.adaptation.prompt import Prompt
|
|
16
16
|
from helm.benchmark.adaptation.adapter_spec import AdapterSpec
|
|
17
|
-
from .adapter_factory import AdapterFactory, ADAPT_GENERATION
|
|
18
|
-
from .generation_adapter import GenerationAdapter
|
|
19
|
-
from .test_adapter import TestAdapter
|
|
17
|
+
from helm.benchmark.adaptation.adapters.adapter_factory import AdapterFactory, ADAPT_GENERATION
|
|
18
|
+
from helm.benchmark.adaptation.adapters.generation_adapter import GenerationAdapter
|
|
19
|
+
from helm.benchmark.adaptation.adapters.test_adapter import TestAdapter
|
|
20
20
|
|
|
21
21
|
|
|
22
22
|
class TestGenerationAdapter(TestAdapter):
|
|
@@ -5,8 +5,8 @@ from helm.common.tokenization_request import TokenizationToken
|
|
|
5
5
|
from helm.benchmark.adaptation.request_state import RequestState
|
|
6
6
|
from helm.common.request import Request
|
|
7
7
|
from helm.benchmark.adaptation.adapter_spec import AdapterSpec
|
|
8
|
-
from .adapter_factory import AdapterFactory, ADAPT_LANGUAGE_MODELING
|
|
9
|
-
from .test_adapter import TestAdapter
|
|
8
|
+
from helm.benchmark.adaptation.adapters.adapter_factory import AdapterFactory, ADAPT_LANGUAGE_MODELING
|
|
9
|
+
from helm.benchmark.adaptation.adapters.test_adapter import TestAdapter
|
|
10
10
|
from helm.benchmark.scenarios.scenario import TEST_SPLIT, Instance, Input, Reference
|
|
11
11
|
|
|
12
12
|
|
|
@@ -2,8 +2,8 @@
|
|
|
2
2
|
from typing import List, Set
|
|
3
3
|
from helm.benchmark.scenarios.scenario import TEST_SPLIT, TRAIN_SPLIT, Instance, Input, Output, Reference, CORRECT_TAG
|
|
4
4
|
from helm.benchmark.adaptation.adapter_spec import AdapterSpec
|
|
5
|
-
from .adapter_factory import AdapterFactory, ADAPT_MULTIPLE_CHOICE_JOINT
|
|
6
|
-
from .test_adapter import TestAdapter
|
|
5
|
+
from helm.benchmark.adaptation.adapters.adapter_factory import AdapterFactory, ADAPT_MULTIPLE_CHOICE_JOINT
|
|
6
|
+
from helm.benchmark.adaptation.adapters.test_adapter import TestAdapter
|
|
7
7
|
|
|
8
8
|
|
|
9
9
|
def _make_instance(
|
|
@@ -4,6 +4,7 @@ from helm.benchmark.adaptation.adapter_spec import (
|
|
|
4
4
|
ADAPT_GENERATION,
|
|
5
5
|
ADAPT_LANGUAGE_MODELING,
|
|
6
6
|
ADAPT_MULTIPLE_CHOICE_JOINT,
|
|
7
|
+
ADAPT_MULTIPLE_CHOICE_JOINT_CHAIN_OF_THOUGHT,
|
|
7
8
|
ADAPT_MULTIPLE_CHOICE_SEPARATE_CALIBRATED,
|
|
8
9
|
ADAPT_MULTIPLE_CHOICE_SEPARATE_ORIGINAL,
|
|
9
10
|
ADAPT_RANKING_BINARY,
|
|
@@ -43,13 +44,66 @@ def get_multiple_choice_joint_adapter_spec(
|
|
|
43
44
|
[output_noun]:
|
|
44
45
|
"""
|
|
45
46
|
|
|
47
|
+
input_prefix = kwargs.pop("input_prefix", f"{input_noun}: " if input_noun is not None else "")
|
|
48
|
+
input_suffix = kwargs.pop("input_suffix", "\n" if input_noun is not None else "")
|
|
49
|
+
output_prefix = kwargs.pop("output_prefix", f"{output_noun}: ")
|
|
50
|
+
output_suffix = kwargs.pop("output_suffix", "\n")
|
|
51
|
+
|
|
46
52
|
return AdapterSpec(
|
|
47
53
|
method=ADAPT_MULTIPLE_CHOICE_JOINT,
|
|
48
54
|
instructions=format_instructions(instructions),
|
|
49
|
-
input_prefix=
|
|
50
|
-
input_suffix=
|
|
51
|
-
output_prefix=
|
|
52
|
-
output_suffix=
|
|
55
|
+
input_prefix=input_prefix,
|
|
56
|
+
input_suffix=input_suffix,
|
|
57
|
+
output_prefix=output_prefix,
|
|
58
|
+
output_suffix=output_suffix,
|
|
59
|
+
max_train_instances=max_train_instances,
|
|
60
|
+
num_outputs=num_outputs,
|
|
61
|
+
max_tokens=max_tokens,
|
|
62
|
+
temperature=0.0,
|
|
63
|
+
stop_sequences=["\n"],
|
|
64
|
+
sample_train=sample_train,
|
|
65
|
+
**kwargs,
|
|
66
|
+
)
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
def get_multiple_choice_joint_chain_of_thought_adapter_spec(
|
|
70
|
+
instructions: str,
|
|
71
|
+
input_noun: Optional[str],
|
|
72
|
+
output_noun: str,
|
|
73
|
+
num_outputs: int = 5,
|
|
74
|
+
max_train_instances: int = 5,
|
|
75
|
+
max_tokens: int = 5,
|
|
76
|
+
sample_train: bool = True,
|
|
77
|
+
**kwargs,
|
|
78
|
+
) -> AdapterSpec:
|
|
79
|
+
"""
|
|
80
|
+
[instructions]
|
|
81
|
+
|
|
82
|
+
[input_noun]: [input]
|
|
83
|
+
[reference_1]
|
|
84
|
+
...
|
|
85
|
+
[reference_k]
|
|
86
|
+
[output_noun]: [output]
|
|
87
|
+
|
|
88
|
+
[input_noun]: [input]
|
|
89
|
+
[reference_1]
|
|
90
|
+
...
|
|
91
|
+
[reference_k]
|
|
92
|
+
[output_noun]:
|
|
93
|
+
"""
|
|
94
|
+
|
|
95
|
+
input_prefix = kwargs.pop("input_prefix", f"{input_noun}: " if input_noun is not None else "")
|
|
96
|
+
input_suffix = kwargs.pop("input_suffix", "\n" if input_noun is not None else "")
|
|
97
|
+
output_prefix = kwargs.pop("output_prefix", f"{output_noun}: ")
|
|
98
|
+
output_suffix = kwargs.pop("output_suffix", "\n")
|
|
99
|
+
|
|
100
|
+
return AdapterSpec(
|
|
101
|
+
method=ADAPT_MULTIPLE_CHOICE_JOINT_CHAIN_OF_THOUGHT,
|
|
102
|
+
instructions=format_instructions(instructions),
|
|
103
|
+
input_prefix=input_prefix,
|
|
104
|
+
input_suffix=input_suffix,
|
|
105
|
+
output_prefix=output_prefix,
|
|
106
|
+
output_suffix=output_suffix,
|
|
53
107
|
max_train_instances=max_train_instances,
|
|
54
108
|
num_outputs=num_outputs,
|
|
55
109
|
max_tokens=max_tokens,
|
|
@@ -109,6 +163,17 @@ def get_multiple_choice_adapter_spec(
|
|
|
109
163
|
sample_train=sample_train,
|
|
110
164
|
**kwargs,
|
|
111
165
|
)
|
|
166
|
+
elif method == ADAPT_MULTIPLE_CHOICE_JOINT_CHAIN_OF_THOUGHT:
|
|
167
|
+
return get_multiple_choice_joint_chain_of_thought_adapter_spec(
|
|
168
|
+
instructions,
|
|
169
|
+
input_noun,
|
|
170
|
+
output_noun,
|
|
171
|
+
max_train_instances=max_train_instances,
|
|
172
|
+
num_outputs=num_outputs,
|
|
173
|
+
max_tokens=max_tokens,
|
|
174
|
+
sample_train=sample_train,
|
|
175
|
+
**kwargs,
|
|
176
|
+
)
|
|
112
177
|
elif method in {ADAPT_MULTIPLE_CHOICE_SEPARATE_ORIGINAL, ADAPT_MULTIPLE_CHOICE_SEPARATE_CALIBRATED}:
|
|
113
178
|
return get_multiple_choice_separate_adapter_spec(method, empty_input)
|
|
114
179
|
else:
|