crfm-helm 0.2.0__py3-none-any.whl → 0.2.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {crfm_helm-0.2.0.dist-info → crfm_helm-0.2.2.dist-info}/METADATA +11 -8
- {crfm_helm-0.2.0.dist-info → crfm_helm-0.2.2.dist-info}/RECORD +67 -38
- {crfm_helm-0.2.0.dist-info → crfm_helm-0.2.2.dist-info}/WHEEL +1 -1
- {crfm_helm-0.2.0.dist-info → crfm_helm-0.2.2.dist-info}/entry_points.txt +2 -1
- helm/benchmark/__init__.py +13 -0
- helm/benchmark/adaptation/adapter_spec.py +3 -0
- helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +20 -7
- helm/benchmark/augmentations/correct_to_misspelling.json +1 -0
- helm/benchmark/contamination/__init__.py +0 -0
- helm/benchmark/metrics/classification_metrics.py +70 -0
- helm/benchmark/metrics/machine_translation_metrics.py +36 -0
- helm/benchmark/metrics/summarization_metrics.py +7 -8
- helm/benchmark/metrics/test_classification_metrics.py +150 -0
- helm/benchmark/presentation/create_plots.py +617 -0
- helm/benchmark/presentation/run_display.py +7 -48
- helm/benchmark/presentation/summarize.py +4 -2
- helm/benchmark/presentation/test_create_plots.py +32 -0
- helm/benchmark/run.py +144 -48
- helm/benchmark/run_expander.py +164 -47
- helm/benchmark/run_specs.py +346 -39
- helm/benchmark/runner.py +34 -6
- helm/benchmark/scenarios/copyright_scenario.py +1 -1
- helm/benchmark/scenarios/covid_dialog_scenario.py +84 -0
- helm/benchmark/scenarios/imdb_listdir.json +50014 -0
- helm/benchmark/scenarios/lex_glue_scenario.py +253 -0
- helm/benchmark/scenarios/lextreme_scenario.py +458 -0
- helm/benchmark/scenarios/me_q_sum_scenario.py +86 -0
- helm/benchmark/scenarios/med_dialog_scenario.py +132 -0
- helm/benchmark/scenarios/med_mcqa_scenario.py +102 -0
- helm/benchmark/scenarios/med_paragraph_simplification_scenario.py +119 -0
- helm/benchmark/scenarios/med_qa_scenario.py +96 -0
- helm/benchmark/scenarios/opinions_qa_scenario.py +194 -0
- helm/benchmark/scenarios/scenario.py +5 -0
- helm/benchmark/scenarios/the_pile_scenario.py +1 -1
- helm/benchmark/scenarios/wmt_14_scenario.py +96 -0
- helm/benchmark/static/benchmarking.css +14 -0
- helm/benchmark/static/benchmarking.js +43 -0
- helm/benchmark/static/index.html +2 -0
- helm/benchmark/static/json-urls.js +4 -0
- helm/benchmark/static/plot-captions.js +16 -0
- helm/benchmark/static/schema.yaml +154 -1
- helm/benchmark/window_services/cohere_window_service.py +20 -0
- helm/benchmark/window_services/flan_t5_window_service.py +29 -0
- helm/benchmark/window_services/huggingface_window_service.py +39 -0
- helm/benchmark/window_services/santacoder_window_service.py +27 -0
- helm/benchmark/window_services/test_flan_t5_window_service.py +12 -0
- helm/benchmark/window_services/wider_ai21_window_service.py +13 -0
- helm/benchmark/window_services/window_service_factory.py +34 -7
- helm/common/codec.py +123 -0
- helm/common/general.py +12 -5
- helm/common/test_codec.py +144 -0
- helm/proxy/clients/aleph_alpha_client.py +47 -28
- helm/proxy/clients/auto_client.py +32 -24
- helm/proxy/clients/google_client.py +88 -0
- helm/proxy/clients/huggingface_client.py +32 -16
- helm/proxy/clients/huggingface_model_registry.py +111 -0
- helm/proxy/clients/huggingface_tokenizer.py +25 -7
- helm/proxy/clients/openai_client.py +60 -2
- helm/proxy/clients/test_huggingface_model_registry.py +57 -0
- helm/proxy/clients/test_huggingface_tokenizer.py +3 -0
- helm/proxy/clients/together_client.py +17 -2
- helm/proxy/clients/yalm_tokenizer/voc_100b.sp +0 -0
- helm/proxy/clients/yalm_tokenizer/yalm_tokenizer.py +8 -2
- helm/proxy/models.py +115 -7
- helm/proxy/test_models.py +1 -1
- helm/benchmark/presentation/present.py +0 -249
- {crfm_helm-0.2.0.dist-info → crfm_helm-0.2.2.dist-info}/LICENSE +0 -0
- {crfm_helm-0.2.0.dist-info → crfm_helm-0.2.2.dist-info}/top_level.txt +0 -0
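A central internal change in 0.2.2 is the new helm/common/codec.py module (with tests in helm/common/test_codec.py): run_display.py below drops its hand-rolled dacite configuration and json.dumps calls in favor of to_json/from_json. The sketch below is inferred only from the call sites visible in this diff, where to_json(obj) returns a JSON string and from_json(text, T) parses into a typed target such as ScenarioState or List[PerInstanceStats]; whether the codec accepts an arbitrary user-defined dataclass like the toy Example here is an assumption.

from dataclasses import dataclass
from typing import List

from helm.common.codec import from_json, to_json  # new in helm/common/codec.py


@dataclass(frozen=True)
class Example:
    text: str
    score: float


# Serialize a list of dataclasses to JSON, then parse it back with a typed target,
# mirroring how run_display.py now writes display_predictions.json and how
# _read_per_instance_stats() reads List[PerInstanceStats].
blob = to_json([Example(text="hello", score=0.5), Example(text="world", score=1.0)])
examples = from_json(blob, List[Example])
assert examples[0].text == "hello"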
helm/benchmark/presentation/run_display.py
CHANGED

@@ -1,11 +1,8 @@
 from collections import OrderedDict, defaultdict
 from dataclasses import dataclass
 import os
-import json
 from typing import Dict, Iterable, List, Optional, Set, Tuple

-import dacite
-
 from helm.benchmark.adaptation.adapters.adapter_factory import (
     ADAPT_MULTIPLE_CHOICE_SEPARATE_METHODS,
     ADAPT_MULTIPLE_CHOICE_SEPARATE_CALIBRATED,
@@ -13,48 +10,15 @@ from helm.benchmark.adaptation.adapters.adapter_factory import (
 from helm.benchmark.adaptation.adapter_spec import AdapterSpec
 from helm.benchmark.adaptation.request_state import RequestState
 from helm.benchmark.adaptation.scenario_state import ScenarioState
-from helm.benchmark.augmentations.dialect_perturbation import DialectPerturbation
-from helm.benchmark.augmentations.extra_space_perturbation import ExtraSpacePerturbation
-from helm.benchmark.augmentations.filler_words_perturbation import FillerWordsPerturbation
-from helm.benchmark.augmentations.gender_perturbation import GenderPerturbation
-from helm.benchmark.augmentations.misspelling_perturbation import MisspellingPerturbation
-from helm.benchmark.augmentations.person_name_perturbation import PersonNamePerturbation
 from helm.benchmark.augmentations.perturbation_description import PerturbationDescription
-from helm.benchmark.augmentations.space_perturbation import SpacePerturbation
-from helm.benchmark.augmentations.synonym_perturbation import SynonymPerturbation
-from helm.benchmark.augmentations.typos_perturbation import TyposPerturbation
 from helm.benchmark.metrics.metric import PerInstanceStats
 from helm.benchmark.presentation.schema import Schema
 from helm.benchmark.runner import RunSpec
 from helm.benchmark.scenarios.scenario import Instance
-from helm.common.general import
+from helm.common.general import write
 from helm.common.hierarchical_logger import htrack
 from helm.common.request import Request
-
-
-# TODO(#1251): Add proper class registration
-_PERTURBATION_NAME_TO_DESCRIPTION = {
-    DialectPerturbation.name: DialectPerturbation.Description,
-    ExtraSpacePerturbation.name: ExtraSpacePerturbation.Description,
-    FillerWordsPerturbation.name: FillerWordsPerturbation.Description,
-    GenderPerturbation.name: GenderPerturbation.Description,
-    MisspellingPerturbation.name: MisspellingPerturbation.Description,
-    PersonNamePerturbation.name: PersonNamePerturbation.Description,
-    SpacePerturbation.name: SpacePerturbation.Description,
-    SynonymPerturbation.name: SynonymPerturbation.Description,
-    TyposPerturbation.name: TyposPerturbation.Description,
-}
-
-
-def _deserialize_perturbation_description(raw_perturbation_description: Dict) -> PerturbationDescription:
-    """Convert a raw dictionary to a PerturbationDescription.
-    This uses the name field to look up the correct PerturbationDescription subclass to output.
-    """
-    factory = _PERTURBATION_NAME_TO_DESCRIPTION.get(raw_perturbation_description["name"], PerturbationDescription)
-    return factory(**raw_perturbation_description)
-
-
-_DACITE_CONFIG = dacite.Config(type_hooks={PerturbationDescription: _deserialize_perturbation_description})
+from helm.common.codec import from_json, to_json


 @dataclass(frozen=True)
@@ -117,8 +81,7 @@ def _read_scenario_state(run_path: str) -> ScenarioState:
     if not os.path.exists(scenario_state_path):
         raise ValueError(f"Could not load ScenarioState from {scenario_state_path}")
     with open(scenario_state_path) as f:
-        raw_scenario_state = json.load(f)
-        return dacite.from_dict(ScenarioState, raw_scenario_state, config=_DACITE_CONFIG)
+        return from_json(f.read(), ScenarioState)


 def _read_per_instance_stats(run_path: str) -> List[PerInstanceStats]:
@@ -126,8 +89,7 @@ def _read_per_instance_stats(run_path: str) -> List[PerInstanceStats]:
     if not os.path.exists(per_instance_stats_path):
         raise ValueError(f"Could not load PerInstanceStats from {per_instance_stats_path}")
     with open(per_instance_stats_path) as f:
-        raw_per_instance_stats = json.load(f)
-        return [dacite.from_dict(PerInstanceStats, r, config=_DACITE_CONFIG) for r in raw_per_instance_stats]
+        return from_json(f.read(), List[PerInstanceStats])


 def _truncate_predicted_text(
@@ -286,13 +248,10 @@ def write_run_display_json(run_path: str, run_spec: RunSpec, schema: Schema):

     write(
         os.path.join(run_path, "instances.json"),
-
-    )
-    write(
-        os.path.join(run_path, "display_predictions.json"),
-        json.dumps(list(map(asdict_without_nones, predictions)), indent=2),
+        to_json(list(instance_id_to_instance.values())),
     )
+    write(os.path.join(run_path, "display_predictions.json"), to_json(predictions))
     write(
         os.path.join(run_path, "display_requests.json"),
-
+        to_json(requests),
     )
helm/benchmark/presentation/summarize.py
CHANGED

@@ -205,6 +205,9 @@ def compute_aggregate_row_win_rates(table: Table, aggregation: str = "mean") ->
     return aggregate_win_rates


+AGGREGATE_WIN_RATE_COLUMN = 1
+
+
 class Summarizer:
     """Summarize the benchmark results in JSON files to be displayed in the UI."""

@@ -288,7 +291,7 @@ class Summarizer:
         # run_suite_path can contain subdirectories that are not runs (e.g. eval_cache, groups)
         # so filter them out.
         run_dir_names = sorted([p for p in os.listdir(self.run_suite_path) if p != "eval_cache" and p != "groups"])
-        for run_dir_name in tqdm(run_dir_names):
+        for run_dir_name in tqdm(run_dir_names, disable=None):
             run_spec_path: str = os.path.join(self.run_suite_path, run_dir_name, "run_spec.json")
             stats_path: str = os.path.join(self.run_suite_path, run_dir_name, "stats.json")
             if not os.path.exists(run_spec_path) or not os.path.exists(stats_path):
@@ -747,7 +750,6 @@ class Summarizer:
         # add overall win rate as the second column
         WIN_RATE_AGGREGATION = "mean"
         win_rates = compute_aggregate_row_win_rates(table, aggregation=WIN_RATE_AGGREGATION)
-        AGGREGATE_WIN_RATE_COLUMN = 1
         description = "How many models this model outperform on average (over columns)."
         table.header.insert(
             AGGREGATE_WIN_RATE_COLUMN,
helm/benchmark/presentation/test_create_plots.py
ADDED

@@ -0,0 +1,32 @@
+from helm.common.general import asdict_without_nones
+from helm.benchmark.presentation.table import Table, Cell, HeaderCell
+from helm.benchmark.presentation.create_plots import parse_table
+
+
+def test_table_parsing():
+    title = "table"
+    scenarios = ["A", "B", "C", "D"]
+    models = ["X", "Y", "Z"]
+    header = []
+    rows = [[] for m in models]
+
+    header.append(HeaderCell("Models"))
+    header.append(HeaderCell("Mean win rate"))
+    for s in scenarios:
+        header.append(HeaderCell(s, lower_is_better=True, metadata={"run_group": s, "metric": "accuracy"}))
+    for i, model in enumerate(models):
+        rows[i].append(Cell(model))
+        rows[i].append(Cell(0.1 * i))
+        for j, s in enumerate(scenarios):
+            rows[i].append(Cell(i * 10 + j))
+    summarize_table = Table(title, header, rows)
+    table = parse_table(asdict_without_nones(summarize_table))
+    assert table.adapters == models
+    assert list(table.mean_win_rates) == [0.0, 0.1, 0.2]
+    assert len(table.columns) == len(scenarios)
+    for j, c in enumerate(table.columns):
+        assert c.group == scenarios[j]
+        assert c.lower_is_better
+        assert c.metric == "accuracy"
+        for i, v in enumerate(c.values):
+            assert v == i * 10 + j
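For anyone driving the new plotting code directly, the test above pins down the small surface that create_plots.parse_table expects: a summarize-style Table whose column 1 is the mean-win-rate column (matching the AGGREGATE_WIN_RATE_COLUMN = 1 constant that summarize.py now defines at module level). Below is a minimal sketch using only constructors exercised by the test; the scenario name "mmlu" and the model names are placeholders, and the expected outputs are extrapolated from the assertions above rather than taken from create_plots.py itself.

from helm.common.general import asdict_without_nones
from helm.benchmark.presentation.create_plots import parse_table
from helm.benchmark.presentation.table import Table, Cell, HeaderCell

# Column 0: model names, column 1: mean win rate, columns 2+: one per run group.
header = [
    HeaderCell("Models"),
    HeaderCell("Mean win rate"),
    HeaderCell("mmlu", metadata={"run_group": "mmlu", "metric": "accuracy"}),
]
rows = [
    [Cell("model_a"), Cell(0.8), Cell(0.61)],
    [Cell("model_b"), Cell(0.2), Cell(0.47)],
]
table = parse_table(asdict_without_nones(Table("accuracy", header, rows)))
print(table.adapters)              # expected: ["model_a", "model_b"]
print(list(table.mean_win_rates))  # expected: [0.8, 0.2]
print(table.columns[0].metric)     # expected: "accuracy"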
helm/benchmark/run.py
CHANGED
@@ -1,10 +1,13 @@
 import argparse
 from dataclasses import replace
+import os
 from typing import List, Optional

+from helm.benchmark.presentation.run_entry import RunEntry, read_run_entries
 from helm.common.hierarchical_logger import hlog, htrack, htrack_block
 from helm.common.authentication import Authentication
 from helm.common.object_spec import parse_object_spec
+from helm.proxy.clients.huggingface_model_registry import register_huggingface_model_config
 from helm.proxy.services.remote_service import create_authentication, add_service_args

 from helm.benchmark.adaptation.adapter_spec import AdapterSpec
@@ -16,8 +19,52 @@ from .run_specs import construct_run_specs
 LATEST_SYMLINK: str = "latest"


+def run_entries_to_run_specs(
+    run_entries: List[RunEntry],
+    max_eval_instances: Optional[int] = None,
+    num_train_trials: Optional[int] = None,
+    models_to_run: Optional[List[str]] = None,
+    groups_to_run: Optional[List[str]] = None,
+    priority: Optional[int] = None,
+) -> List[RunSpec]:
+    """Runs RunSpecs given a list of RunSpec descriptions."""
+    run_specs: List[RunSpec] = []
+    for entry in run_entries:
+        # Filter by priority
+        if priority is not None and entry.priority > priority:
+            continue
+
+        for run_spec in construct_run_specs(parse_object_spec(entry.description)):
+            # Filter by models
+            if models_to_run and run_spec.adapter_spec.model not in models_to_run:
+                continue
+
+            # Filter by groups
+            if groups_to_run and not any(group in groups_to_run for group in run_spec.groups):
+                continue
+
+            # Modify AdapterSpec
+            adapter_spec: AdapterSpec = run_spec.adapter_spec
+            if max_eval_instances is not None:
+                adapter_spec = replace(adapter_spec, max_eval_instances=max_eval_instances)
+            if num_train_trials is not None or adapter_spec.max_train_instances == 0:
+                adapter_spec = replace(
+                    adapter_spec, num_train_trials=1 if adapter_spec.max_train_instances == 0 else num_train_trials
+                )
+            run_spec = replace(run_spec, adapter_spec=adapter_spec)
+
+            # Append groups
+            if entry.groups is not None:
+                groups_name: str = "" if len(entry.groups) == 0 else f",groups={'-'.join(sorted(entry.groups))}"
+                run_spec = replace(run_spec, name=run_spec.name + groups_name, groups=run_spec.groups + entry.groups)
+
+            run_specs.append(run_spec)
+
+    return run_specs
+
+
 def run_benchmarking(
-
+    run_specs: List[RunSpec],
     auth: Authentication,
     url: str,
     local: bool,
@@ -27,15 +74,11 @@ def run_benchmarking(
     suite: str,
     dry_run: bool,
     skip_instances: bool,
-
-
-    groups: Optional[List[str]] = None,
-    models_to_run: Optional[List[str]] = None,
-    groups_to_run: Optional[List[str]] = None,
+    skip_completed_runs: bool,
+    exit_on_error: bool,
     mongo_uri: str = "",
 ) -> List[RunSpec]:
     """Runs RunSpecs given a list of RunSpec descriptions."""
-
     execution_spec = ExecutionSpec(
         auth=auth,
         url=url,
@@ -45,47 +88,28 @@ def run_benchmarking(
         dry_run=dry_run,
         mongo_uri=mongo_uri,
     )
-
-    def override(run_spec: RunSpec) -> RunSpec:
-        """Override parts of `run_spec`."""
-        # Modify AdapterSpec
-        adapter_spec: AdapterSpec = run_spec.adapter_spec
-        if max_eval_instances is not None:
-            adapter_spec = replace(adapter_spec, max_eval_instances=max_eval_instances)
-        if num_train_trials is not None or adapter_spec.max_train_instances == 0:
-            adapter_spec = replace(
-                adapter_spec, num_train_trials=1 if adapter_spec.max_train_instances == 0 else num_train_trials
-            )
-
-        run_spec = replace(run_spec, adapter_spec=adapter_spec)
-
-        # Append groups
-        if groups is not None:
-            groups_name: str = "" if len(groups) == 0 else f",groups={'-'.join(sorted(groups))}"
-            run_spec = replace(run_spec, name=run_spec.name + groups_name, groups=run_spec.groups + groups)
-
-        return run_spec
-
-    run_specs = [
-        override(run_spec)
-        for description in run_spec_descriptions
-        for run_spec in construct_run_specs(parse_object_spec(description))
-        if (not models_to_run or run_spec.adapter_spec.model in models_to_run)
-        and (not groups_to_run or any(group in groups_to_run for group in run_spec.groups))
-    ]
-
-    if len(run_specs) == 0:
-        return run_specs
-
     with htrack_block("run_specs"):
         for run_spec in run_specs:
-            hlog(run_spec
+            hlog(run_spec)

-    runner = Runner(execution_spec, output_path, suite,
-    runner.run_all()
+    runner = Runner(execution_spec, output_path, suite, skip_instances, skip_completed_runs, exit_on_error)
+    runner.run_all(run_specs)
     return run_specs


+def symlink_latest(output_path: str, suite: str) -> None:
+    # Create a symlink runs/latest -> runs/<name_of_suite>,
+    # so runs/latest always points to the latest run suite.
+    runs_dir: str = os.path.join(output_path, "runs")
+    suite_dir: str = os.path.join(runs_dir, suite)
+    symlink_path: str = os.path.abspath(os.path.join(runs_dir, LATEST_SYMLINK))
+    hlog(f"Symlinking {suite_dir} to {LATEST_SYMLINK}.")
+    if os.path.islink(symlink_path):
+        # Remove the previous symlink if it exists.
+        os.unlink(symlink_path)
+    os.symlink(os.path.abspath(suite_dir), symlink_path)
+
+
 def add_run_args(parser: argparse.ArgumentParser):
     parser.add_argument(
         "-o", "--output-path", type=str, help="Where to save all the output", default="benchmark_output"
@@ -149,19 +173,86 @@ def validate_args(args):

 @htrack(None)
 def main():
-    """
-    Main entry point for running the benchmark.
-    """
     parser = argparse.ArgumentParser()
     add_service_args(parser)
-    parser.add_argument(
+    parser.add_argument(
+        "-c",
+        "--conf-paths",
+        nargs="+",
+        help="Where to read RunSpecs to run from",
+        default=[],
+    )
+    parser.add_argument(
+        "--models-to-run",
+        nargs="+",
+        help="Only RunSpecs with these models specified. If no model is specified, runs with all models.",
+        default=None,
+    )
+    parser.add_argument(
+        "--groups-to-run",
+        nargs="+",
+        help="Only RunSpecs with these (scenario) groups specified. " "If no group is specified, runs with all groups.",
+        default=None,
+    )
+    parser.add_argument(
+        "--exit-on-error",
+        action="store_true",
+        default=None,
+        help="Fail and exit immediately if a particular RunSpec fails.",
+    )
+    parser.add_argument(
+        "--skip-completed-runs",
+        action="store_true",
+        default=None,
+        help="Skip RunSpecs that have completed i.e. output files exists.",
+    )
+    parser.add_argument(
+        "--priority",
+        type=int,
+        default=None,
+        help="Run RunSpecs with priority less than or equal to this number. "
+        "If a value for --priority is not specified, run on everything",
+    )
+    parser.add_argument("-r", "--run-specs", nargs="*", help="Specifies what to run", default=[])
+    parser.add_argument(
+        "--enable-huggingface-models",
+        nargs="+",
+        default=[],
+        help="Experimental: Enable using AutoModelForCausalLM models from Hugging Face Model Hub. "
+        "Format: namespace/model_name[@revision]",
+    )
     add_run_args(parser)
     args = parser.parse_args()
     validate_args(args)

+    for huggingface_model_name in args.enable_huggingface_models:
+        register_huggingface_model_config(huggingface_model_name)
+
+    run_entries: List[RunEntry] = []
+    if args.conf_paths:
+        run_entries.extend(read_run_entries(args.conf_paths).entries)
+    if args.run_specs:
+        run_entries.extend(
+            [RunEntry(description=description, priority=1, groups=None) for description in args.run_specs]
+        )
+
+    run_specs = run_entries_to_run_specs(
+        run_entries=run_entries,
+        max_eval_instances=args.max_eval_instances,
+        num_train_trials=args.num_train_trials,
+        models_to_run=args.models_to_run,
+        groups_to_run=args.groups_to_run,
+        priority=args.priority,
+    )
+    hlog(f"{len(run_entries)} entries produced {len(run_specs)} run specs")
+
+    if len(run_specs) == 0:
+        hlog("There were no RunSpecs or they got filtered out.")
+        return
+
     auth: Authentication = Authentication("") if args.skip_instances or args.local else create_authentication(args)
     run_benchmarking(
-
+        run_specs=run_specs,
         auth=auth,
         url=args.server_url,
         local=args.local,
@@ -171,10 +262,15 @@ def main():
         suite=args.suite,
         dry_run=args.dry_run,
         skip_instances=args.skip_instances,
-
+        skip_completed_runs=args.skip_completed_runs,
+        exit_on_error=args.exit_on_error,
         mongo_uri=args.mongo_uri,
     )

+    symlink_latest(output_path=args.output_path, suite=args.suite)
+
+    hlog("Done.")
+

 if __name__ == "__main__":
     main()
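Taken together, main() now funnels everything through RunEntry objects: conf files passed with -c/--conf-paths and ad-hoc -r/--run-specs descriptions are merged into one list, filtered by --priority, --models-to-run and --groups-to-run, and only then expanded into RunSpecs by run_entries_to_run_specs(). A small programmatic sketch of that same path; the mmlu descriptions are illustrative and assume the corresponding run specs are registered in run_specs.py.

from helm.benchmark.presentation.run_entry import RunEntry
from helm.benchmark.run import run_entries_to_run_specs

entries = [
    RunEntry(description="mmlu:subject=philosophy,model=openai/davinci", priority=1, groups=None),
    RunEntry(description="mmlu:subject=anatomy,model=openai/davinci", priority=3, groups=None),
]
run_specs = run_entries_to_run_specs(
    run_entries=entries,
    max_eval_instances=10,              # overrides AdapterSpec.max_eval_instances on every kept spec
    models_to_run=["openai/davinci"],   # drops specs for any other model
    priority=2,                         # keeps the first entry, filters out the priority-3 one
)
for run_spec in run_specs:
    print(run_spec.name)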
helm/benchmark/run_expander.py
CHANGED
@@ -1,6 +1,6 @@
 from abc import ABC, abstractmethod
 from dataclasses import replace
-from typing import List, Dict, Optional, Tuple
+from typing import List, Dict, Optional, Tuple, Type

 from helm.proxy.models import (
     get_all_code_models,
@@ -302,35 +302,58 @@ class ModelRunExpander(ReplaceValueRunExpander):
     """

     name = "model"
-    values_dict = {
-        "full_functionality_text": get_model_names_with_tag(FULL_FUNCTIONALITY_TEXT_MODEL_TAG),
-        "ai21/j1-jumbo": ["ai21/j1-jumbo"],
-        "openai/curie": ["openai/curie"],
-        "chat_run": ["openai/chat-gpt", "openai/text-davinci-003"],  # Compare ChatGPT to text-davinci-003
-        "all": get_all_models(),
-        "text_code": get_all_text_models() + get_all_code_models(),
-        "text": get_all_text_models(),
-        "code": get_all_code_models(),
-        "limited_functionality_text": get_model_names_with_tag(LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG),
-        "gpt2_tokenizer": get_model_names_with_tag(GPT2_TOKENIZER_TAG),
-        "ai21_tokenizer": get_model_names_with_tag(AI21_TOKENIZER_TAG),
-        "cohere_tokenizer": get_model_names_with_tag(COHERE_TOKENIZER_TAG),
-        "opt_tokenizer": get_model_names_with_tag(OPT_TOKENIZER_TAG),
-        "summarization_zs": ["openai/davinci", "openai/curie", "openai/text-davinci-002", "openai/text-curie-001"],
-        "interactive_qa": ["openai/text-davinci-001", "openai/davinci", "ai21/j1-jumbo", "openai/text-babbage-001"],
-    }

-
-
-
-
-
-    for family_name, models in ablation_values_dict.items():
-        if family_name == "ablation_all":
-            values_dict["ablation"] = models
+    def __init__(self, value):
+        """
+        `value` is either the actual value to use or a lookup into the values dict.
+        """
+        if value in self.values_dict:
+            self.values = self.values_dict[value]
         else:
-
+            self.values = [value]
+
+    @property
+    def values_dict(self):
+        values_dict = {
+            "full_functionality_text": get_model_names_with_tag(FULL_FUNCTIONALITY_TEXT_MODEL_TAG),
+            "ai21/j1-jumbo": ["ai21/j1-jumbo"],
+            "openai/curie": ["openai/curie"],
+            "chat_run": ["openai/chat-gpt", "openai/text-davinci-003"],  # Compare ChatGPT to text-davinci-003
+            "all": get_all_models(),
+            "text_code": get_all_text_models() + get_all_code_models(),
+            "text": get_all_text_models(),
+            "code": get_all_code_models(),
+            "limited_functionality_text": get_model_names_with_tag(LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG),
+            "gpt2_tokenizer": get_model_names_with_tag(GPT2_TOKENIZER_TAG),
+            "ai21_tokenizer": get_model_names_with_tag(AI21_TOKENIZER_TAG),
+            "cohere_tokenizer": get_model_names_with_tag(COHERE_TOKENIZER_TAG),
+            "opt_tokenizer": get_model_names_with_tag(OPT_TOKENIZER_TAG),
+            "summarization_zs": ["openai/davinci", "openai/curie", "openai/text-davinci-002", "openai/text-curie-001"],
+            "biomedical": ["openai/text-davinci-003"],  # TODO: add https://huggingface.co/stanford-crfm/BioMedLM
+            "interactive_qa": ["openai/text-davinci-001", "openai/davinci", "ai21/j1-jumbo", "openai/text-babbage-001"],
+            "opinions_qa_openai": [
+                "openai/ada",
+                "openai/davinci",
+                "openai/text-ada-001",
+                "openai/text-davinci-001",
+                "openai/text-davinci-002",
+                "openai/text-davinci-003",
+            ],
+            "opinions_qa_ai21": ["ai21/j1-grande", "ai21/j1-jumbo", "ai21/j1-grande-v2-beta"],
+        }
+
+        # For each of the keys above (e.g., "text"), create a corresponding ablation (e.g., "ablation_text")
+        # which contains the subset of models with the ablation tag.
+        ablation_models = set(get_model_names_with_tag(ABLATION_MODEL_TAG))
+        ablation_values_dict = {}
+        for family_name, models in values_dict.items():
+            ablation_values_dict["ablation_" + family_name] = list(ablation_models & set(models))
+        for family_name, models in ablation_values_dict.items():
+            if family_name == "ablation_all":
+                values_dict["ablation"] = models
+            else:
+                values_dict[family_name] = models
+        return values_dict


 ############################################################
@@ -739,6 +762,7 @@ class TokenizerRunExpander(ScenarioSpecRunExpander):
         "AlephAlpha/luminous-extended": ["AlephAlpha/luminous-extended"],
         "AlephAlpha/luminous-supreme": ["AlephAlpha/luminous-supreme"],
         "AlephAlpha/luminous-world": ["AlephAlpha/luminous-world"],
+        "huggingface/santacoder": ["bigcode/santacoder"],
     }
     model_tags_and_tokenizers = [
         (GPT2_TOKENIZER_TAG, "huggingface/gpt2"),
@@ -768,7 +792,8 @@ class TokenizerRunExpander(ScenarioSpecRunExpander):
     def expand(self, run_spec: RunSpec) -> List[RunSpec]:
         # Find right tokenizer given model.
         if isinstance(self.all_values, dict):
-
+            model: str = run_spec.adapter_spec.model
+            self.values = self.all_values[model] if model in self.all_values else []
         else:
             self.values = self.all_values
         return super().expand(run_spec)
@@ -818,21 +843,113 @@ class NumOutputTokensRunExpander(RunExpander):
     ]


-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+class ChatMLRunExpander(RunExpander):
+    """
+    Adapt to ChatML: https://github.com/openai/openai-python/blob/main/chatml.md
+    A 1-shot example:
+    <|im_start|>system
+    Translate from English to French
+    <|im_end|>
+    <|im_start|>user
+    How are you?
+    <|im_end|>
+    <|im_start|>user
+    Comment allez-vous?
+    <|im_end|>
+    <|im_start|>user
+    {{user input here}}<|im_end|>
+    """
+
+    name = "chatml"
+
+    def __init__(self):
+        self.name = type(self).name
+
+    def expand(self, run_spec: RunSpec) -> List[RunSpec]:
+        adapter_spec = run_spec.adapter_spec
+        # according to https://github.com/openai/openai-python/blob/main/chatml.md#few-shot-prompting
+        # few-shot examples should do `<|im_start|>system name=example_user`
+        # or `<|im_start|>system name=example_assistant`
+        # but it is also possible to put examples into a user message.
+
+        scenario_name = run_spec.name.split(":")[0]
+
+        if scenario_name in ("msmarco",):
+            # output_prefix:
+            # Does the passage answer the query?
+            # Answer:
+            #
+            # new_output_prefix:
+            # Does the passage answer the query?<|im_end|>
+            # <|im_start|>assistant
+            # Answer:
+
+            new_output_prefix = (
+                adapter_spec.output_prefix.split("\n")[0]
+                + "<|im_end|>\n<|im_start|>assistant\n"
+                + adapter_spec.output_prefix.split("\n")[1]
+            )
+
+        elif scenario_name in ("summarization_cnndm", "summarization_xsum"):
+            # output_prefix:
+            # Summarize the above article in 1 sentence.
+            #
+            # new_output_prefix:
+            # Summarize the above article in 1 sentence.<|im_end|>
+            # <|im_start|>assistant
+            #
+
+            new_output_prefix = adapter_spec.output_prefix + "<|im_end|>\n<|im_start|>assistant\n"
+
+        else:
+            # output_prefix:
+            # {output_prefix}
+            #
+            # new_output_prefix:
+            # <|im_end|>
+            # <|im_start|>assistant
+            # {output_prefix}
+
+            new_output_prefix = "<|im_end|>\n<|im_start|>assistant\n" + adapter_spec.output_prefix
+
+        adapter_spec = replace(
+            adapter_spec,
+            # This is a hack to make sure <|im_start|>user goes before the reference.
+            instructions=(
+                f"<|im_start|>system\n{adapter_spec.instructions}<|im_end|>\n<|im_start|>user\n"
+                if adapter_spec.instructions != ""
+                else "<|im_start|>user\n"
+            ),
+            instance_prefix="",
+            output_prefix=new_output_prefix,
+            output_suffix="<|im_end|>\n<|im_start|>user\n",
+            stop_sequences=adapter_spec.stop_sequences + ["<|im_end|>"],
+        )
+
+        return [
+            replace(
+                run_spec,
+                adapter_spec=adapter_spec,
+            ),
+        ]
+
+
+RUN_EXPANDER_SUBCLASSES: List[Type[RunExpander]] = [
+    InstructionsRunExpander,
+    PromptRunExpander,
+    NewlineRunExpander,
+    StopRunExpander,
+    GlobalPrefixRunExpander,
+    NumTrainTrialsRunExpander,
+    MaxTrainInstancesRunExpander,
+    NumOutputsRunExpander,
+    ModelRunExpander,
+    DataAugmentationRunExpander,
+    TokenizerRunExpander,
+    NumPromptTokensRunExpander,
+    NumOutputTokensRunExpander,
+    ChatMLRunExpander,
+]
+
+
+RUN_EXPANDERS = dict((expander.name, expander) for expander in RUN_EXPANDER_SUBCLASSES)