PyPI - crfm-helm - Versions diffs - 0.4.0__py3-none-any.whl → 0.5.0__py3-none-any.whl - Mend

crfm-helm 0.4.0__py3-none-any.whl → 0.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (482) hide show

{crfm_helm-0.4.0.dist-info → crfm_helm-0.5.0.dist-info}/WHEEL RENAMED Viewed

@@ -1,5 +1,5 @@
 Wheel-Version: 1.0
-Generator: bdist_wheel (0.42.0)
+Generator: bdist_wheel (0.43.0)
 Root-Is-Purelib: true
 Tag: py3-none-any

helm/benchmark/adaptation/adapter_spec.py CHANGED Viewed

@@ -1,6 +1,26 @@
 from dataclasses import dataclass, field
 from typing import List, Optional
+from helm.common.image_generation_parameters import ImageGenerationParameters
+# Adaptation methods
+ADAPT_GENERATION: str = "generation"
+ADAPT_LANGUAGE_MODELING: str = "language_modeling"
+ADAPT_MULTIPLE_CHOICE_JOINT: str = "multiple_choice_joint"
+ADAPT_MULTIPLE_CHOICE_SEPARATE_ORIGINAL: str = "multiple_choice_separate_original"
+ADAPT_MULTIPLE_CHOICE_SEPARATE_CALIBRATED: str = "multiple_choice_separate_calibrated"
+ADAPT_RANKING_BINARY: str = "ranking_binary"
+ADAPT_MULTIPLE_CHOICE_SEPARATE_METHODS: List[str] = [
+    ADAPT_MULTIPLE_CHOICE_SEPARATE_ORIGINAL,
+    ADAPT_MULTIPLE_CHOICE_SEPARATE_CALIBRATED,
+]
+# Multimodal adaptation methods
+ADAPT_GENERATION_MULTIMODAL: str = "generation_multimodal"
+ADAPT_MULTIPLE_CHOICE_JOINT_MULTIMODAL: str = "multiple_choice_joint_multimodal"
 @dataclass(frozen=True)
 class Substitution:
@@ -71,6 +91,9 @@ class AdapterSpec:
     # set of training instances.  Used to compute error bars.
     num_train_trials: int = 1
+    # Number of trials, where we query the model with the same requests, but different random seeds
+    num_trials: int = 1
     # If true, randomly sample N training examples; if false, select N consecutive training examples
     sample_train: bool = True
@@ -79,8 +102,7 @@ class AdapterSpec:
     # Model deployment to make the request to (need to fill in)
     model_deployment: str = ""
-    # DEPRECATED: old model field, kept for backward compatibility
-    # TODO: Remove this once we do not wish to support backward compatibility anymore.
+    # Model to make the request to
     model: str = ""
     # Temperature to use
@@ -96,5 +118,11 @@ class AdapterSpec:
     random: Optional[str] = None
     # If true, for instances with multiple correct reference, the gold answer should be considered
-    # to be all of the correct references rather than any of the correct references.
+    # to be all the correct references rather than any of the correct references.
     multi_label: bool = False
+    # Parameters for image generation
+    image_generation_parameters: Optional[ImageGenerationParameters] = None
+    # The splits from which evaluation instances will be drawn (set hash=False to make `AdapterSpec` hashable)
+    eval_splits: Optional[List[str]] = field(default=None, hash=False)

helm/benchmark/adaptation/adapters/adapter.py CHANGED Viewed

@@ -2,7 +2,7 @@ from abc import ABC, abstractmethod
 from typing import List
 from helm.benchmark.adaptation.adapter_spec import AdapterSpec
-from helm.benchmark.adaptation.scenario_state import ScenarioState
+from helm.benchmark.adaptation.request_state import RequestState
 from helm.benchmark.scenarios.scenario import Instance
 from helm.benchmark.window_services.tokenizer_service import TokenizerService
 from helm.benchmark.window_services.window_service import WindowService
@@ -22,7 +22,7 @@ class Adapter(ABC):
         )
     @abstractmethod
-    def adapt(self, instances: List[Instance], parallelism: int) -> ScenarioState:
+    def adapt(self, instances: List[Instance], parallelism: int) -> List[RequestState]:
         """
         Takes a a list of `Instance`s and returns a `ScenarioState` with the
         list of corresponding `RequestState`s.

helm/benchmark/adaptation/adapters/adapter_factory.py CHANGED Viewed

@@ -1,31 +1,26 @@
-from typing import List
-from helm.benchmark.adaptation.adapter_spec import AdapterSpec
-from helm.benchmark.window_services.tokenizer_service import TokenizerService
-from .adapter import Adapter
-from .generation_adapter import GenerationAdapter
-from .language_modeling_adapter import LanguageModelingAdapter
-from .multiple_choice_joint_adapter import MultipleChoiceJointAdapter
-from .multiple_choice_separate_adapter import MultipleChoiceSeparateAdapter
-from .multiple_choice_calibrated_adapter import MultipleChoiceCalibratedAdapter
-from .binary_ranking_adapter import BinaryRankingAdapter
-from .multimodal.generation_multimodal_adapter import GenerationMultimodalAdapter
-# Adaptation methods
-ADAPT_GENERATION: str = "generation"
-ADAPT_LANGUAGE_MODELING: str = "language_modeling"
-ADAPT_MULTIPLE_CHOICE_JOINT: str = "multiple_choice_joint"
-ADAPT_MULTIPLE_CHOICE_SEPARATE_ORIGINAL: str = "multiple_choice_separate_original"
-ADAPT_MULTIPLE_CHOICE_SEPARATE_CALIBRATED: str = "multiple_choice_separate_calibrated"
-ADAPT_RANKING_BINARY: str = "ranking_binary"
-ADAPT_MULTIPLE_CHOICE_SEPARATE_METHODS: List[str] = [
-    ADAPT_MULTIPLE_CHOICE_SEPARATE_ORIGINAL,
+from helm.benchmark.adaptation.adapter_spec import (
+    ADAPT_GENERATION,
+    ADAPT_GENERATION_MULTIMODAL,
+    ADAPT_LANGUAGE_MODELING,
+    ADAPT_MULTIPLE_CHOICE_JOINT,
+    ADAPT_MULTIPLE_CHOICE_JOINT_MULTIMODAL,
     ADAPT_MULTIPLE_CHOICE_SEPARATE_CALIBRATED,
-]
-# Multimodal adaptation methods
-ADAPT_GENERATION_MULTIMODAL: str = "generation_multimodal"
+    ADAPT_MULTIPLE_CHOICE_SEPARATE_ORIGINAL,
+    ADAPT_RANKING_BINARY,
+    AdapterSpec,
+)
+from helm.benchmark.adaptation.adapters.adapter import Adapter
+from helm.benchmark.adaptation.adapters.binary_ranking_adapter import BinaryRankingAdapter
+from helm.benchmark.adaptation.adapters.generation_adapter import GenerationAdapter
+from helm.benchmark.adaptation.adapters.language_modeling_adapter import LanguageModelingAdapter
+from helm.benchmark.adaptation.adapters.multimodal.generation_multimodal_adapter import GenerationMultimodalAdapter
+from helm.benchmark.adaptation.adapters.multimodal.multiple_choice_joint_multimodal_adapter import (
+    MultipleChoiceJointMultimodalAdapter,
+)
+from helm.benchmark.adaptation.adapters.multiple_choice_calibrated_adapter import MultipleChoiceCalibratedAdapter
+from helm.benchmark.adaptation.adapters.multiple_choice_joint_adapter import MultipleChoiceJointAdapter
+from helm.benchmark.adaptation.adapters.multiple_choice_separate_adapter import MultipleChoiceSeparateAdapter
+from helm.benchmark.window_services.tokenizer_service import TokenizerService
 class AdapterFactory:
@@ -51,6 +46,8 @@ class AdapterFactory:
             adapter = BinaryRankingAdapter(adapter_spec, tokenizer_service)
         elif method == ADAPT_GENERATION_MULTIMODAL:
             adapter = GenerationMultimodalAdapter(adapter_spec, tokenizer_service)
+        elif method == ADAPT_MULTIPLE_CHOICE_JOINT_MULTIMODAL:
+            adapter = MultipleChoiceJointMultimodalAdapter(adapter_spec, tokenizer_service)
         else:
             raise ValueError(f"Invalid adaptation method: {method}")

helm/benchmark/adaptation/adapters/generation_adapter.py CHANGED Viewed

@@ -46,6 +46,7 @@ class GenerationAdapter(InContextLearningAdapter):
             max_tokens=self.adapter_spec.max_tokens,
             stop_sequences=self.adapter_spec.stop_sequences,
             random=self.adapter_spec.random,
+            image_generation_parameters=self.adapter_spec.image_generation_parameters,
         )
         request_state = RequestState(
             instance=eval_instance,

helm/benchmark/adaptation/adapters/in_context_learning_adapter.py CHANGED Viewed

@@ -7,9 +7,9 @@ from typing import List, Dict, Optional
 from helm.benchmark.adaptation.prompt import Prompt
 from helm.benchmark.adaptation.request_state import RequestState
-from helm.benchmark.adaptation.scenario_state import ScenarioState
 from helm.benchmark.scenarios.scenario import Instance, TRAIN_SPLIT, EVAL_SPLITS, Reference
 from helm.common.general import parallel_map
+from helm.common.request import Request
 from helm.common.hierarchical_logger import hlog, htrack, htrack_block
 from .adapter import Adapter
@@ -30,7 +30,7 @@ class InContextLearningAdapter(Adapter, ABC):
         pass
     @htrack(None)
-    def adapt(self, instances: List[Instance], parallelism: int) -> ScenarioState:
+    def adapt(self, instances: List[Instance], parallelism: int) -> List[RequestState]:
         """
         Takes a list of `Instance`s and builds a list of corresponding `RequestState`s.
         The reason we don't do this per eval instance is that we create a common set of
@@ -64,7 +64,7 @@ class InContextLearningAdapter(Adapter, ABC):
                 )
         hlog(f"{len(all_request_states)} requests")
-        return ScenarioState(self.adapter_spec, all_request_states)
+        return all_request_states
     def _adapt_trial_index(
         self,
@@ -101,7 +101,23 @@ class InContextLearningAdapter(Adapter, ABC):
                             hlog(line)
         # Flatten and return
-        return [request_state for result in results for request_state in result]
+        all_request_states: List[RequestState] = [request_state for result in results for request_state in result]
+        return self._add_trials(all_request_states)
+    def _add_trials(self, request_states: List[RequestState]) -> List[RequestState]:
+        """Expand the request states by adding trials."""
+        if self.adapter_spec.num_trials <= 1:
+            return request_states
+        all_request_states: List[RequestState] = request_states.copy()
+        for i in range(1, self.adapter_spec.num_trials):
+            seed: str = str(i)
+            for request_state in request_states:
+                request: Request = replace(request_state.request, random=seed)
+                all_request_states.append(replace(request_state, request=request))
+        assert len(all_request_states) == len(request_states) * self.adapter_spec.num_trials
+        return all_request_states
     def sample_examples(
         self, all_train_instances: List[Instance], seed: int, sample_train: bool = True

helm/benchmark/adaptation/adapters/language_modeling_adapter.py CHANGED Viewed

@@ -1,7 +1,6 @@
 from typing import List, Tuple, Optional
 from helm.benchmark.adaptation.request_state import RequestState
-from helm.benchmark.adaptation.scenario_state import ScenarioState
 from helm.benchmark.scenarios.scenario import Instance, EVAL_SPLITS
 from helm.benchmark.window_services.window_service import EncodeResult
 from helm.common.general import flatten_list, parallel_map
@@ -26,7 +25,7 @@ class LanguageModelingAdapter(Adapter):
     """
     @htrack(None)
-    def adapt(self, instances: List[Instance], parallelism: int) -> ScenarioState:
+    def adapt(self, instances: List[Instance], parallelism: int) -> List[RequestState]:
         """
         Takes a list of `Instance`s and builds a list of corresponding `RequestState`s.
         Only requires eval instances.
@@ -46,7 +45,7 @@ class LanguageModelingAdapter(Adapter):
         )
         hlog(f"{len(all_request_states)} requests")
-        return ScenarioState(self.adapter_spec, all_request_states)
+        return all_request_states
     def _generate_requests(self, eval_instance: Instance) -> List[RequestState]:
         """

helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py ADDED Viewed

@@ -0,0 +1,104 @@
+from abc import ABC
+from typing import Dict, List, Optional
+from helm.benchmark.adaptation.request_state import RequestState
+from helm.benchmark.scenarios.scenario import Instance
+from helm.common.media_object import MediaObject, MultimediaObject
+from helm.common.request import Request
+from helm.benchmark.adaptation.adapters.multimodal.in_context_learning_multimodal_adapter import (
+    InContextLearningMultimodalAdapter,
+)
+from .multimodal_prompt import MultimodalPrompt
+class MultipleChoiceJointMultimodalAdapter(InContextLearningMultimodalAdapter, ABC):
+    """
+    An `Adapter`, guided by the `AdapterSpec`, takes a `Scenario` and produces
+    a `ScenarioState`. This `Adapter` has additional logic to support in-context
+    learning for multimodal models.
+    """
+    @staticmethod
+    def get_reference_prefix(prefix: str, i: int) -> str:
+        """
+        Example: prefix = "\nA. ", i = 2, return "\nC. "
+        """
+        return prefix.replace("A", chr(ord("A") + i))
+    def generate_requests(
+        self, eval_instance: Instance, train_trial_index: int, training_instances: List[Instance]
+    ) -> List[RequestState]:
+        prompt: MultimodalPrompt = self.construct_prompt(
+            training_instances, eval_instance, include_output=False, reference_index=None
+        )
+        output_mapping: Dict[str, str] = dict(
+            (self.get_reference_prefix("A", reference_index), reference.output.text)
+            for reference_index, reference in enumerate(eval_instance.references)
+        )
+        request = Request(
+            model=self.adapter_spec.model,
+            model_deployment=self.adapter_spec.model_deployment,
+            multimodal_prompt=prompt.multimedia_object,
+            num_completions=self.adapter_spec.num_outputs,
+            temperature=self.adapter_spec.temperature,
+            max_tokens=self.adapter_spec.max_tokens,
+            stop_sequences=[],
+            random=self.adapter_spec.random,
+        )
+        request_state = RequestState(
+            instance=eval_instance,
+            reference_index=None,
+            request_mode=None,
+            train_trial_index=train_trial_index,
+            output_mapping=output_mapping,
+            request=request,
+            result=None,
+            num_train_instances=prompt.num_train_instances,
+            prompt_truncated=False,
+        )
+        return [request_state]
+    def construct_example_multimodal_prompt(
+        self, instance: Instance, include_output: bool, reference_index: Optional[int]
+    ) -> MultimediaObject:
+        """
+        Returns a single example of the prompt. `include_output` controls whether the gold output is included.
+        """
+        # Input
+        assert instance.input.multimedia_content is not None
+        result: MultimediaObject = instance.input.multimedia_content.add_textual_prefix(self.adapter_spec.input_prefix)
+        result = result.add_textual_suffix(self.adapter_spec.input_suffix)
+        # Include the references
+        delimiter: str = ", "
+        no_correct_references: str = "n/a"
+        output: str = no_correct_references
+        for reference_index, reference in enumerate(instance.references):
+            prefix = self.get_reference_prefix(self.adapter_spec.reference_prefix, reference_index)
+            if reference.output.multimedia_content is not None:
+                reference_output_content: MultimediaObject = reference.output.multimedia_content
+                reference_output_content = reference_output_content.add_textual_prefix(prefix)
+                reference_output_content = reference_output_content.add_textual_suffix(
+                    self.adapter_spec.reference_suffix
+                )
+                result = result.combine(reference_output_content)
+            else:
+                result = result.add_textual_suffix(prefix + reference.output.text + self.adapter_spec.reference_suffix)
+            if reference.is_correct:
+                if output == no_correct_references:
+                    output = self.get_reference_prefix("A", reference_index)
+                elif self.adapter_spec.multi_label:
+                    output += delimiter
+                    output += self.get_reference_prefix("A", reference_index)
+        if include_output:
+            output_content: MultimediaObject = MultimediaObject([MediaObject(text=output, content_type="text/plain")])
+            output_content = output_content.add_textual_prefix(self.adapter_spec.output_prefix)
+            output_content = output_content.add_textual_suffix(self.adapter_spec.output_suffix)
+            result = result.combine(output_content)
+        else:
+            result = result.add_textual_suffix(self.adapter_spec.output_prefix.rstrip())
+        return result

helm/benchmark/adaptation/adapters/multimodal/test_in_context_learning_multimodal_adapter.py CHANGED Viewed

@@ -1,6 +1,7 @@
 import shutil
 import tempfile
 import unittest
+from helm.common.cache_backend_config import BlackHoleCacheBackendConfig
 from helm.common.media_object import MediaObject, MultimediaObject
 from helm.benchmark.scenarios.scenario import Instance, Reference, Input, Output, TEST_SPLIT, TRAIN_SPLIT, CORRECT_TAG
@@ -14,7 +15,7 @@ from .multimodal_prompt import MultimodalPrompt
 class TestInContextLearningMultimodalAdapter(unittest.TestCase):
     def setup_method(self, _):
         self._path: str = tempfile.mkdtemp()
-        self._tokenizer_service = get_tokenizer_service(self._path)
+        self._tokenizer_service = get_tokenizer_service(self._path, BlackHoleCacheBackendConfig())
     def teardown_method(self, _):
         shutil.rmtree(self._path)

helm/benchmark/adaptation/adapters/test_adapter.py CHANGED Viewed

@@ -2,6 +2,7 @@ import shutil
 import tempfile
 from helm.common.authentication import Authentication
+from helm.common.cache_backend_config import BlackHoleCacheBackendConfig
 from helm.proxy.services.server_service import ServerService
 from helm.benchmark.window_services.tokenizer_service import TokenizerService
@@ -13,7 +14,7 @@ class TestAdapter:
     def setup_method(self):
         self.path: str = tempfile.mkdtemp()
-        service = ServerService(base_path=self.path, root_mode=True)
+        service = ServerService(base_path=self.path, root_mode=True, cache_backend_config=BlackHoleCacheBackendConfig())
         self.tokenizer_service = TokenizerService(service, Authentication("test"))
     def teardown_method(self, _):

helm/benchmark/adaptation/adapters/test_generation_adapter.py CHANGED Viewed

@@ -11,24 +11,27 @@ from helm.benchmark.scenarios.scenario import (
     Input,
     Output,
 )
-from helm.benchmark.run_specs import get_scenario_spec1, get_adapter_spec1
+from helm.benchmark.run_specs.simple_run_specs import get_simple1_spec
 from helm.benchmark.adaptation.prompt import Prompt
 from helm.benchmark.adaptation.adapter_spec import AdapterSpec
 from .adapter_factory import AdapterFactory, ADAPT_GENERATION
+from .generation_adapter import GenerationAdapter
 from .test_adapter import TestAdapter
 class TestGenerationAdapter(TestAdapter):
     def test_adapt(self):
-        scenario = create_scenario(get_scenario_spec1())
-        adapter_spec = get_adapter_spec1()
+        run_spec = get_simple1_spec()
+        scenario = create_scenario(run_spec.scenario_spec)
+        adapter_spec = run_spec.adapter_spec
         adapter = AdapterFactory.get_adapter(adapter_spec, self.tokenizer_service)
-        scenario_state = adapter.adapt(scenario.get_instances(output_path=""), parallelism=1)
+        instances = scenario.get_instances(output_path="")
+        request_states = adapter.adapt(instances, parallelism=1)
+        non_train_instances = [instance for instance in instances if instance.split != TRAIN_SPLIT]
         # Make sure we generated the right number of request_states:
         # For each trial, instance and reference (+ 1 for free-form generation).
-        num_instances = len(scenario_state.instances)
-        assert num_instances * adapter_spec.num_train_trials == len(scenario_state.request_states)
+        assert len(non_train_instances) * adapter_spec.num_train_trials == len(request_states)
     def test_construct_prompt(self):
         adapter_spec = AdapterSpec(
@@ -194,7 +197,7 @@ class TestGenerationAdapter(TestAdapter):
             ],
             split=TEST_SPLIT,
         )
-        actual_instances = adapter.adapt(train_instances + [eval_instance], parallelism=1).request_states
+        actual_instances = adapter.adapt(train_instances + [eval_instance], parallelism=1)
         assert len(actual_instances) == 1
         assert actual_instances[0].request.prompt == (
             "Input: Second reference is correct\n"
@@ -244,7 +247,7 @@ class TestGenerationAdapter(TestAdapter):
             ],
             split=TEST_SPLIT,
         )
-        actual_instances = adapter.adapt(train_instances + [eval_instance], parallelism=1).request_states
+        actual_instances = adapter.adapt(train_instances + [eval_instance], parallelism=1)
         assert len(actual_instances) == 1
         assert actual_instances[0].request.prompt == (
             "Input: Second reference is correct\n"
@@ -254,3 +257,24 @@ class TestGenerationAdapter(TestAdapter):
             "Input: First reference is correct\n"
             "Output:"
         )
+    def test_construct_prompt_image_generation(self):
+        adapter_spec = AdapterSpec(
+            model_deployment="openai/dall-e-2",
+            method=ADAPT_GENERATION,
+            input_prefix="",
+            input_suffix="",
+            output_prefix="",
+            output_suffix="",
+            max_train_instances=0,
+            num_outputs=1,
+            max_tokens=0,
+        )
+        adapter = AdapterFactory.get_adapter(adapter_spec, self.tokenizer_service)
+        assert isinstance(adapter, GenerationAdapter)
+        eval_instance = Instance(Input(text="a blue dog"), references=[])
+        prompt: Prompt = adapter.construct_prompt([], eval_instance, include_output=False, reference_index=None)
+        assert adapter.window_service.fits_within_context_window(prompt.text)
+        assert prompt.text == "a blue dog"

helm/benchmark/adaptation/adapters/test_language_modeling_adapter.py CHANGED Viewed

@@ -1,6 +1,5 @@
 # mypy: check_untyped_defs = False
 from typing import List
-from helm.benchmark.window_services.gpt2_window_service import GPT2WindowService
 from helm.common.tokenization_request import TokenizationToken
 from helm.benchmark.adaptation.request_state import RequestState
@@ -11,18 +10,6 @@ from .test_adapter import TestAdapter
 from helm.benchmark.scenarios.scenario import TEST_SPLIT, Instance, Input, Reference
-class MockGPT2Window(GPT2WindowService):
-    """Utility for overriding properties of a GPT2WindowService for test purposes."""
-    def __init__(self, service, *, max_sequence_length):
-        super().__init__(service)
-        self._max_sequence_length = max_sequence_length
-    @property
-    def max_sequence_length(self) -> int:
-        return self._max_sequence_length
 class TestLanguageModelingAdapter(TestAdapter):
     def test_construct_language_modeling_prompt(self):
         adapter_spec = AdapterSpec(
@@ -100,7 +87,7 @@ class TestLanguageModelingAdapter(TestAdapter):
             split=TEST_SPLIT,
         )
         # Ensure the adapter returns the correct prompt
-        request_states: List[RequestState] = adapter.adapt([instance], parallelism=1).request_states
+        request_states: List[RequestState] = adapter.adapt([instance], parallelism=1)
         request: Request = request_states[0].request
         # The prompt should be "<|endoftext|>Excuse me, do you have the time?"
         assert request.prompt == "<|endoftext|>Excuse me, do you have the time?"
@@ -112,7 +99,7 @@ class TestLanguageModelingAdapter(TestAdapter):
             references=[reference],
             split=TEST_SPLIT,
         )
-        request_states_long: List[RequestState] = adapter.adapt([instance_long], parallelism=1).request_states
+        request_states_long: List[RequestState] = adapter.adapt([instance_long], parallelism=1)
         request_long: Request = request_states_long[0].request
         # Count the number of tokens of the prompt
         num_tokens = len(adapter.window_service.encode(request_long.prompt).token_values)
@@ -130,7 +117,7 @@ class TestLanguageModelingAdapter(TestAdapter):
         adapter_2 = AdapterFactory.get_adapter(adapter_spec_2_, self.tokenizer_service)
         # Step 2.1. Check that if the prompt is not too long, it is not truncated
-        request_state_2: List[RequestState] = adapter_2.adapt([instance], parallelism=1).request_states
+        request_state_2: List[RequestState] = adapter_2.adapt([instance], parallelism=1)
         request_2: Request = request_state_2[0].request
         # The prompt should be unchanged
         assert request_2.prompt == "<|endoftext|>Excuse me, do you have the time?"
@@ -138,7 +125,7 @@ class TestLanguageModelingAdapter(TestAdapter):
         # Step 2.2. Check that if the prompt + max_tokens is too long, it is truncated
         # but that we keep the same number of tokens as in the previous test
-        request_states_long_2: List[RequestState] = adapter_2.adapt([instance_long], parallelism=1).request_states
+        request_states_long_2: List[RequestState] = adapter_2.adapt([instance_long], parallelism=1)
         request_long_2: Request = request_states_long_2[0].request
         # Count the number of tokens of the prompt
         num_tokens_2 = len(adapter_2.window_service.encode(request_long_2.prompt).token_values)
@@ -159,12 +146,13 @@ class TestLanguageModelingAdapter(TestAdapter):
         )
         adapter = AdapterFactory.get_adapter(adapter_spec, self.tokenizer_service)
         # Monkey patch the window service to have really short max sequences.
-        adapter.window_service = MockGPT2Window(self.tokenizer_service, max_sequence_length=max_sequence_length)
+        adapter.window_service._max_sequence_length = max_sequence_length
+        adapter.window_service._max_request_length = max_sequence_length + 1
         input_text = Input(text=" ".join(str(i) for i in range(input_tokens)))
         instance = Instance(input=input_text, references=[], split=TEST_SPLIT)
         # Generate the requests
-        request_states: List[RequestState] = adapter.adapt([instance], parallelism=1).request_states
+        request_states: List[RequestState] = adapter.adapt([instance], parallelism=1)
         # A smaller window service creates more requests
         assert len(request_states) == 3
         assert request_states[0].request.prompt == "<|endoftext|>0 1 2 3 4 5 6 7 8 9"

helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py CHANGED Viewed

@@ -1,10 +1,23 @@
 # mypy: check_untyped_defs = False
+from typing import List, Set
 from helm.benchmark.scenarios.scenario import TEST_SPLIT, TRAIN_SPLIT, Instance, Input, Output, Reference, CORRECT_TAG
 from helm.benchmark.adaptation.adapter_spec import AdapterSpec
 from .adapter_factory import AdapterFactory, ADAPT_MULTIPLE_CHOICE_JOINT
 from .test_adapter import TestAdapter
+def _make_instance(
+    text: str, reference_texts: List[str], correct_references: Set[int], is_eval: bool = False
+) -> Instance:
+    references = []
+    for i, reference_text in enumerate(reference_texts):
+        tags = [CORRECT_TAG] if i in correct_references else []
+        references.append(Reference(Output(text=reference_text), tags=tags))
+    split = TEST_SPLIT if is_eval else TRAIN_SPLIT
+    return Instance(Input(text=text), references=references, split=split)
 class TestMultipleChoiceJointAdapter(TestAdapter):
     def test_sample_examples(self):
         adapter_spec = AdapterSpec(
@@ -53,6 +66,47 @@ class TestMultipleChoiceJointAdapter(TestAdapter):
         examples = adapter.sample_examples(all_train_instances, seed=0)
         assert len(examples) == 3
+    def test_sample_examples_unique_labels(self):
+        """This is a demonstration of behavior reported in issue #2224."""
+        adapter_spec = AdapterSpec(
+            method=ADAPT_MULTIPLE_CHOICE_JOINT, model="openai/ada", model_deployment="openai/ada", max_train_instances=3
+        )
+        adapter = AdapterFactory.get_adapter(adapter_spec, self.tokenizer_service)
+        all_train_instances = [
+            # Three with 0 being correct.
+            _make_instance("one", ["0", "1"], correct_references={0}),
+            _make_instance("two", ["2", "3"], correct_references={0}),
+            _make_instance("three", ["4", "5"], correct_references={0}),
+            # Two with 1 being correct.
+            _make_instance("four", ["6", "7"], correct_references={1}),
+            _make_instance("five", ["8", "9"], correct_references={1}),
+        ]
+        eval_instance = _make_instance("eval", ["10", "11"], correct_references={1}, is_eval=True)
+        request_states = adapter.adapt(all_train_instances + [eval_instance], parallelism=1)
+        assert len(request_states) == 1
+        # In every case, we are showing that model that Output should be "A".
+        assert request_states[0].request.prompt == (
+            "Input: three\n"
+            "A. 4\n"
+            "B. 5\n"
+            "Output: A\n"
+            "\n"
+            "Input: two\n"
+            "A. 2\n"
+            "B. 3\n"
+            "Output: A\n"
+            "\n"
+            "Input: one\n"
+            "A. 0\n"
+            "B. 1\n"
+            "Output: A\n"
+            "\n"
+            "Input: eval\n"
+            "A. 10\n"
+            "B. 11\n"
+            "Output:"
+        )
     def test_multiple_correct_reference(self):
         adapter_spec = AdapterSpec(
             method=ADAPT_MULTIPLE_CHOICE_JOINT,
@@ -91,9 +145,9 @@ class TestMultipleChoiceJointAdapter(TestAdapter):
             ],
             split=TEST_SPLIT,
         )
-        actual_instances = adapter.adapt(train_instances + [eval_instance], parallelism=1).request_states
-        assert len(actual_instances) == 1
-        assert actual_instances[0].request.prompt == (
+        request_states = adapter.adapt(train_instances + [eval_instance], parallelism=1)
+        assert len(request_states) == 1
+        assert request_states[0].request.prompt == (
             "Input: Second reference is correct\n"
             "A. First\n"
             "B. Second\n"
@@ -150,9 +204,9 @@ class TestMultipleChoiceJointAdapter(TestAdapter):
             ],
             split=TEST_SPLIT,
         )
-        actual_instances = adapter.adapt(train_instances + [eval_instance], parallelism=1).request_states
-        assert len(actual_instances) == 1
-        assert actual_instances[0].request.prompt == (
+        request_states = adapter.adapt(train_instances + [eval_instance], parallelism=1)
+        assert len(request_states) == 1
+        assert request_states[0].request.prompt == (
             "Input: Second reference is correct\n"
             "A. First\n"
             "B. Second\n"

crfm-helm 0.4.0__py3-none-any.whl → 0.5.0__py3-none-any.whl

crfm-helm 0.4.0__py3-none-any.whl → 0.5.0__py3-none-any.whl