crfm-helm 0.2.0__py3-none-any.whl → 0.2.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {crfm_helm-0.2.0.dist-info → crfm_helm-0.2.2.dist-info}/METADATA +11 -8
- {crfm_helm-0.2.0.dist-info → crfm_helm-0.2.2.dist-info}/RECORD +67 -38
- {crfm_helm-0.2.0.dist-info → crfm_helm-0.2.2.dist-info}/WHEEL +1 -1
- {crfm_helm-0.2.0.dist-info → crfm_helm-0.2.2.dist-info}/entry_points.txt +2 -1
- helm/benchmark/__init__.py +13 -0
- helm/benchmark/adaptation/adapter_spec.py +3 -0
- helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +20 -7
- helm/benchmark/augmentations/correct_to_misspelling.json +1 -0
- helm/benchmark/contamination/__init__.py +0 -0
- helm/benchmark/metrics/classification_metrics.py +70 -0
- helm/benchmark/metrics/machine_translation_metrics.py +36 -0
- helm/benchmark/metrics/summarization_metrics.py +7 -8
- helm/benchmark/metrics/test_classification_metrics.py +150 -0
- helm/benchmark/presentation/create_plots.py +617 -0
- helm/benchmark/presentation/run_display.py +7 -48
- helm/benchmark/presentation/summarize.py +4 -2
- helm/benchmark/presentation/test_create_plots.py +32 -0
- helm/benchmark/run.py +144 -48
- helm/benchmark/run_expander.py +164 -47
- helm/benchmark/run_specs.py +346 -39
- helm/benchmark/runner.py +34 -6
- helm/benchmark/scenarios/copyright_scenario.py +1 -1
- helm/benchmark/scenarios/covid_dialog_scenario.py +84 -0
- helm/benchmark/scenarios/imdb_listdir.json +50014 -0
- helm/benchmark/scenarios/lex_glue_scenario.py +253 -0
- helm/benchmark/scenarios/lextreme_scenario.py +458 -0
- helm/benchmark/scenarios/me_q_sum_scenario.py +86 -0
- helm/benchmark/scenarios/med_dialog_scenario.py +132 -0
- helm/benchmark/scenarios/med_mcqa_scenario.py +102 -0
- helm/benchmark/scenarios/med_paragraph_simplification_scenario.py +119 -0
- helm/benchmark/scenarios/med_qa_scenario.py +96 -0
- helm/benchmark/scenarios/opinions_qa_scenario.py +194 -0
- helm/benchmark/scenarios/scenario.py +5 -0
- helm/benchmark/scenarios/the_pile_scenario.py +1 -1
- helm/benchmark/scenarios/wmt_14_scenario.py +96 -0
- helm/benchmark/static/benchmarking.css +14 -0
- helm/benchmark/static/benchmarking.js +43 -0
- helm/benchmark/static/index.html +2 -0
- helm/benchmark/static/json-urls.js +4 -0
- helm/benchmark/static/plot-captions.js +16 -0
- helm/benchmark/static/schema.yaml +154 -1
- helm/benchmark/window_services/cohere_window_service.py +20 -0
- helm/benchmark/window_services/flan_t5_window_service.py +29 -0
- helm/benchmark/window_services/huggingface_window_service.py +39 -0
- helm/benchmark/window_services/santacoder_window_service.py +27 -0
- helm/benchmark/window_services/test_flan_t5_window_service.py +12 -0
- helm/benchmark/window_services/wider_ai21_window_service.py +13 -0
- helm/benchmark/window_services/window_service_factory.py +34 -7
- helm/common/codec.py +123 -0
- helm/common/general.py +12 -5
- helm/common/test_codec.py +144 -0
- helm/proxy/clients/aleph_alpha_client.py +47 -28
- helm/proxy/clients/auto_client.py +32 -24
- helm/proxy/clients/google_client.py +88 -0
- helm/proxy/clients/huggingface_client.py +32 -16
- helm/proxy/clients/huggingface_model_registry.py +111 -0
- helm/proxy/clients/huggingface_tokenizer.py +25 -7
- helm/proxy/clients/openai_client.py +60 -2
- helm/proxy/clients/test_huggingface_model_registry.py +57 -0
- helm/proxy/clients/test_huggingface_tokenizer.py +3 -0
- helm/proxy/clients/together_client.py +17 -2
- helm/proxy/clients/yalm_tokenizer/voc_100b.sp +0 -0
- helm/proxy/clients/yalm_tokenizer/yalm_tokenizer.py +8 -2
- helm/proxy/models.py +115 -7
- helm/proxy/test_models.py +1 -1
- helm/benchmark/presentation/present.py +0 -249
- {crfm_helm-0.2.0.dist-info → crfm_helm-0.2.2.dist-info}/LICENSE +0 -0
- {crfm_helm-0.2.0.dist-info → crfm_helm-0.2.2.dist-info}/top_level.txt +0 -0
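To reproduce this comparison locally, here is a minimal sketch (hypothetical usage, not part of the package) that confirms which release is installed before and after upgrading:

```python
# A minimal sketch: check the installed crfm-helm release with the standard
# library so you know which side of this diff you are running.
from importlib.metadata import version

print(version("crfm-helm"))  # e.g. "0.2.2" after upgrading from 0.2.0
```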
helm/proxy/models.py
CHANGED
```diff
@@ -11,8 +11,14 @@ EMBEDDING_MODEL_TAG: str = "embedding"
 FULL_FUNCTIONALITY_TEXT_MODEL_TAG: str = "full_functionality_text"
 LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG: str = "limited_functionality_text"
 
+# ChatML format
+CHATML_MODEL_TAG: str = "chatml"
+
 # For OpenAI models with wider context windows
-WIDER_CONTEXT_WINDOW_TAG: str = "wider_context_window"
+WIDER_CONTEXT_WINDOW_TAG: str = "wider_context_window"  # 4000 tokens
+
+# For AI21 Jurassic-2 models with wider context windows
+AI21_WIDER_CONTEXT_WINDOW_TAG: str = "ai21_wider_context_window"
 
 # To fetch models that use these tokenizers
 GPT2_TOKENIZER_TAG: str = "gpt2_tokenizer"
@@ -122,6 +128,31 @@ ALL_MODELS = [
         description="Jurassic-1 Large (7.5B parameters)",
         tags=[TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG, AI21_TOKENIZER_TAG],
     ),
+    # AI21 Jurassic-2 Models: https://www.ai21.com/blog/introducing-j2
+    Model(
+        group="jurassic",
+        creator_organization="AI21 Labs",
+        name="ai21/j2-jumbo",
+        display_name="Jurassic-2 Jumbo (178B)",
+        description="Jurassic-2 Jumbo (178B parameters)",
+        tags=[TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG, AI21_TOKENIZER_TAG],
+    ),
+    Model(
+        group="jurassic",
+        creator_organization="AI21 Labs",
+        name="ai21/j2-grande",
+        display_name="Jurassic-2 Grande (17B)",
+        description="Jurassic-2 Grande (17B parameters) with a few tweaks to the training process.",
+        tags=[TEXT_MODEL_TAG, AI21_WIDER_CONTEXT_WINDOW_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG, AI21_TOKENIZER_TAG],
+    ),
+    Model(
+        group="jurassic",
+        creator_organization="AI21 Labs",
+        name="ai21/j2-large",
+        display_name="Jurassic-2 Large (7.5B)",
+        description="Jurassic-2 Large (7.5B parameters)",
+        tags=[TEXT_MODEL_TAG, AI21_WIDER_CONTEXT_WINDOW_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG, AI21_TOKENIZER_TAG],
+    ),
     # Aleph Alpha's Luminous models: https://docs.aleph-alpha.com/docs/introduction/luminous
     Model(
         group="luminous",
@@ -250,6 +281,24 @@ ALL_MODELS = [
         description="Cohere small v20220720 (410M parameters)",
         tags=[TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG, COHERE_TOKENIZER_TAG],
     ),
+    Model(
+        group="cohere",
+        creator_organization="Cohere",
+        name="cohere/command-medium-beta",
+        display_name="Cohere Command beta (6.1B)",
+        description="Cohere Command beta (6.1B parameters) is fine-tuned from the medium model "
+        "to respond well with instruction-like prompts",
+        tags=[TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG, COHERE_TOKENIZER_TAG],
+    ),
+    Model(
+        group="cohere",
+        creator_organization="Cohere",
+        name="cohere/command-xlarge-beta",
+        display_name="Cohere Command beta (52.4B)",
+        description="Cohere Command beta (52.4B parameters) is fine-tuned from the XL model "
+        "to respond well with instruction-like prompts",
+        tags=[TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG, COHERE_TOKENIZER_TAG],
+    ),
     # EleutherAI
     Model(
         group="together",
@@ -285,6 +334,14 @@ ALL_MODELS = [
         tags=[TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG, GPTJ_TOKENIZER_TAG],
     ),
     # HuggingFace
+    Model(
+        group="huggingface",
+        creator_organization="OpenAI",
+        name="huggingface/gpt2",
+        display_name="GPT-2 (1.5B)",
+        description="GPT-2 (1.5B parameters)",
+        tags=[TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG, GPT2_TOKENIZER_TAG],
+    ),
     Model(
         group="huggingface",
         creator_organization="EleutherAI",
@@ -293,6 +350,15 @@ ALL_MODELS = [
         description="GPT-J (6B parameters) autoregressive language model trained on The Pile.",
         tags=[TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG, GPTJ_TOKENIZER_TAG],
     ),
+    Model(
+        group="huggingface",
+        creator_organization="BigCode",
+        name="huggingface/santacoder",
+        display_name="SantaCoder (1.1B)",
+        description="SantaCoder (1.1B parameters) model trained on the Python, Java, and "
+        "JavaScript subset of The Stack (v1.1).",
+        tags=[CODE_MODEL_TAG],
+    ),
     # Google
     Model(
         group="together",
@@ -306,6 +372,15 @@ ALL_MODELS = [
         # Does not support echo=True
         tags=[TEXT_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG, ABLATION_MODEL_TAG, NO_NEWLINES_TAG],
     ),
+    Model(
+        group="together",
+        creator_organization="Google",
+        name="together/flan-t5-xxl",
+        display_name="Flan-T5 (11B)",
+        description="Flan-T5 (11B parameters) is T5 fine-tuned on 1.8K tasks.",
+        # Does not support echo=True
+        tags=[TEXT_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG, ABLATION_MODEL_TAG, NO_NEWLINES_TAG],
+    ),
     Model(
         group="together",
         creator_organization="Google",
@@ -323,12 +398,13 @@ ALL_MODELS = [
             NLG_PREFIX_TAG,
         ],
     ),
+    # H3 model
     Model(
-        group="
-        creator_organization="
-        name="
-        display_name="
-        description="
+        group="together",
+        creator_organization="HazyResearch",
+        name="together/h3-2.7b",
+        display_name="H3 (2.7B)",
+        description="H3 (2.7B parameters) is a decoder-only language model based on state space models.",
         tags=[TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG, GPT2_TOKENIZER_TAG],
     ),
     # OPT
@@ -480,7 +556,21 @@ ALL_MODELS = [
         description="Code model that is a stronger, multilingual version of the Codex (12B) model in the paper.",
         tags=[CODE_MODEL_TAG, GPT2_TOKENIZER_TAG],
     ),
-    # ChatGPT
+    # ChatGPT: https://openai.com/blog/chatgpt
+    Model(
+        group="gpt3",
+        creator_organization="OpenAI",
+        name="openai/gpt-3.5-turbo-0301",
+        display_name="gpt-3.5-turbo-0301",
+        # https://platform.openai.com/docs/models/gpt-3-5
+        description="Sibling model of text-davinci-003 is optimized for chat but works well "
+        "for traditional completions tasks as well. Snapshot from 2023-03-01.",
+        # The claimed sequence length is 4096, but as of 2023-03-07, the empirical usable
+        # sequence length is smaller at 4087 with one user input message and one assistant
+        # output message because ChatGPT uses special tokens for message roles and boundaries.
+        # We use a rounded-down sequence length of 4000 to account for these special tokens.
+        tags=[TEXT_MODEL_TAG, WIDER_CONTEXT_WINDOW_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG, GPT2_TOKENIZER_TAG],
+    ),
     Model(
         group="gpt3",
         creator_organization="OpenAI",
@@ -532,6 +622,14 @@ ALL_MODELS = [
         description="GPT-JT (6B parameters) is a fork of GPT-J",
         tags=[TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG, GPTJ_TOKENIZER_TAG],
     ),
+    Model(
+        group="together",
+        creator_organization="Together",
+        name="together/gpt-neoxt-chat-base-20b",
+        display_name="GPT-NeoXT-Chat-Base (20B)",
+        description="GPT-NeoXT-Chat-Base (20B parameters) is a fork of GPT-NeoX",
+        tags=[TEXT_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG, CHATML_MODEL_TAG, GPTNEO_TOKENIZER_TAG],
+    ),
     # Tsinghua
     Model(
         group="together",
@@ -557,6 +655,16 @@ ALL_MODELS = [
         # https://github.com/stanford-crfm/benchmarking/issues/738
         tags=[TEXT_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG, ABLATION_MODEL_TAG],
    ),
+    # PaLM
+    Model(
+        group="google",
+        creator_organization="Google",
+        name="google/palm",
+        display_name="PaLM (540B)",
+        description="Pathways Language Model (540B parameters) is trained using 6144 TPU v4 chips "
+        "([paper](https://arxiv.org/pdf/2204.02311.pdf)).",
+        tags=[TEXT_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG],
+    ),
     # For debugging
     Model(
         group="simple",
```
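The new constants (`CHATML_MODEL_TAG`, `AI21_WIDER_CONTEXT_WINDOW_TAG`) and `Model` entries above are plain registry data, so they can be inspected directly. A minimal sketch (assuming crfm-helm 0.2.2 is installed) that relies only on names visible in the hunks above:

```python
# List the registry entries carrying the tags introduced in this diff.
# ALL_MODELS, the tag constants, and the Model fields (name, display_name,
# tags) all appear in the hunks above.
from helm.proxy.models import ALL_MODELS, AI21_WIDER_CONTEXT_WINDOW_TAG, CHATML_MODEL_TAG

new_tags = {CHATML_MODEL_TAG, AI21_WIDER_CONTEXT_WINDOW_TAG}
for model in ALL_MODELS:
    if new_tags & set(model.tags):
        # Expected hits include ai21/j2-grande, ai21/j2-large,
        # and together/gpt-neoxt-chat-base-20b.
        print(f"{model.name}: {model.display_name} (tags: {model.tags})")
```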
helm/benchmark/presentation/present.py
DELETED
```diff
@@ -1,249 +0,0 @@
-import argparse
-import os
-import traceback
-
-from tqdm import tqdm
-from typing import List, Optional
-
-from helm.common.authentication import Authentication
-from helm.common.general import write_lines
-from helm.common.hierarchical_logger import hlog, htrack
-from helm.benchmark.run import run_benchmarking, add_run_args, validate_args, LATEST_SYMLINK
-from helm.benchmark.runner import RunSpec
-from helm.benchmark.presentation.run_entry import read_run_entries
-from helm.proxy.services.remote_service import add_service_args, create_authentication
-
-"""
-Runs all the RunSpecs in run_specs.conf and outputs JSON files.
-TODO: rename this file to `run_all.py`
-
-Usage:
-
-    venv/bin/helm-run
-
-"""
-
-
-class AllRunner:
-    """Runs all RunSpecs specified in the configuration file."""
-
-    def __init__(
-        self,
-        auth: Authentication,
-        conf_paths: List[str],
-        url: str,
-        local: bool,
-        local_path: str,
-        output_path: str,
-        suite: str,
-        num_threads: int,
-        dry_run: Optional[bool],
-        skip_instances: bool,
-        max_eval_instances: Optional[int],
-        num_train_trials: Optional[int],
-        models_to_run: Optional[List[str]],
-        groups_to_run: Optional[List[str]],
-        exit_on_error: bool,
-        priority: Optional[int],
-        mongo_uri: str,
-    ):
-        self.auth: Authentication = auth
-        self.conf_paths: List[str] = conf_paths
-        self.url: str = url
-        self.local: bool = local
-        self.local_path: str = local_path
-        self.output_path: str = output_path
-        self.suite: str = suite
-        self.num_threads: int = num_threads
-        self.dry_run: Optional[bool] = dry_run
-        self.skip_instances: bool = skip_instances
-        self.max_eval_instances: Optional[int] = max_eval_instances
-        self.num_train_trials: Optional[int] = num_train_trials
-        self.models_to_run: Optional[List[str]] = models_to_run
-        self.groups_to_run: Optional[List[str]] = groups_to_run
-        self.exit_on_error: bool = exit_on_error
-        self.priority: Optional[int] = priority
-        self.mongo_uri = mongo_uri
-
-    @htrack(None)
-    def run(self):
-        run_specs: List[RunSpec] = []
-        runs_dir: str = os.path.join(self.output_path, "runs")
-        suite_dir: str = os.path.join(runs_dir, self.suite)
-
-        run_entries = read_run_entries(self.conf_paths)
-
-        for entry in tqdm(run_entries.entries):
-            # Filter by priority
-            priority: int = entry.priority
-            if self.priority is not None and priority > self.priority:
-                continue
-
-            try:
-                new_run_specs = run_benchmarking(
-                    run_spec_descriptions=[entry.description],
-                    auth=self.auth,
-                    url=self.url,
-                    local=self.local,
-                    local_path=self.local_path,
-                    num_threads=self.num_threads,
-                    output_path=self.output_path,
-                    suite=self.suite,
-                    dry_run=self.dry_run,
-                    skip_instances=self.skip_instances,
-                    max_eval_instances=self.max_eval_instances,
-                    num_train_trials=self.num_train_trials,
-                    groups=entry.groups,
-                    models_to_run=self.models_to_run,
-                    groups_to_run=self.groups_to_run,
-                    mongo_uri=self.mongo_uri,
-                )
-                run_specs.extend(new_run_specs)
-
-            except Exception as e:
-                if self.exit_on_error:
-                    raise e
-                else:
-                    hlog(f"Error when running {entry.description}:\n{traceback.format_exc()}")
-
-        if len(run_specs) == 0:
-            hlog("There were no RunSpecs or they got filtered out.")
-            return
-
-        hlog(f"{len(run_entries.entries)} entries produced into {len(run_specs)} run specs")
-
-        if self.skip_instances:
-            self.write_parallel_commands(suite_dir, run_specs)
-
-        # Create a symlink runs/latest -> runs/<name_of_suite>,
-        # so runs/latest always points to the latest run suite.
-        symlink_path: str = os.path.abspath(os.path.join(runs_dir, LATEST_SYMLINK))
-        if os.path.islink(symlink_path):
-            # Remove the previous symlink if it exists.
-            os.unlink(symlink_path)
-        os.symlink(os.path.abspath(suite_dir), symlink_path)
-
-    def write_parallel_commands(self, suite_dir: str, run_specs: List[RunSpec]):
-        """
-        Print out scripts to run after.
-        """
-        # Print out all the models and groups that we're touching.
-        models = set()
-        groups = set()
-        for run_spec in run_specs:
-            models.add(run_spec.adapter_spec.model)
-            for group in run_spec.groups:
-                groups.add(group)
-        hlog(f"{len(models)} models: {' '.join(models)}")
-        hlog(f"{len(groups)} groups: {' '.join(groups)}")
-
-        # Write wrapper for helm-run that can be used through Slurm
-        lines = [
-            "#!/bin/bash",
-            "",
-            ". venv/bin/activate",
-            'helm-run "$@"',
-        ]
-        write_lines(os.path.join(suite_dir, "helm-run.sh"), lines)
-
-        # Write out bash script for launching the entire benchmark
-        lines = []
-        for model in models:
-            for group in groups:
-                # Try to match the arguments of `run_benchmarking`
-                # Build arguments
-                present_args = []
-                present_args.append(f"--confs {' '.join(self.conf_paths)}")
-                if self.local:
-                    present_args.append("--local")
-                present_args.append(f"--num-threads {self.num_threads}")
-                present_args.append(f"--suite {self.suite}")
-                if self.max_eval_instances is not None:
-                    present_args.append(f"--max-eval-instances {self.max_eval_instances}")
-                present_args.append(f"--models-to-run {model}")
-                present_args.append(f"--scenario-groups-to-run {group}")
-
-                lines.append(
-                    f"sbatch --partition john "
-                    f"--cpus {self.num_threads} "
-                    f"-o benchmark_output/runs/{self.suite}/slurm-%j.out "
-                    f"{suite_dir}/helm-run.sh "
-                    f"{' '.join(present_args)}"
-                )
-        lines.append("echo '# Run these after Slurm jobs terminate'")
-        lines.append(f"echo 'helm-run --local --suite {self.suite} --skip-instances'")
-        lines.append(f"echo 'helm-summarize --suite {self.suite}'")
-        write_lines(os.path.join(suite_dir, "run-all.sh"), lines)
-
-
-def main():
-    parser = argparse.ArgumentParser()
-    add_service_args(parser)
-    parser.add_argument(
-        "-c",
-        "--conf-paths",
-        nargs="+",
-        help="Where to read RunSpecs to run from",
-        default=["src/helm/benchmark/presentation/run_specs.conf"],
-    )
-    parser.add_argument(
-        "--models-to-run",
-        nargs="+",
-        help="Only RunSpecs with these models specified. If no model is specified, runs with all models.",
-        default=None,
-    )
-    parser.add_argument(
-        "--groups-to-run",
-        nargs="+",
-        help="Only RunSpecs with these (scenario) groups specified. " "If no group is specified, runs with all groups.",
-        default=None,
-    )
-    parser.add_argument(
-        "--exit-on-error",
-        action="store_true",
-        default=None,
-        help="Fail and exit immediately if a particular RunSpec fails.",
-    )
-    parser.add_argument(
-        "--priority",
-        type=int,
-        default=None,
-        help="Run RunSpecs with priority less than or equal to this number. "
-        "If a value for --priority is not specified, run on everything",
-    )
-    add_run_args(parser)
-    args = parser.parse_args()
-    validate_args(args)
-
-    runner = AllRunner(
-        # Use a dummy API key when `skip_instances` or `local` is set.
-        # The benchmarking framework will not make any requests to the proxy server when
-        # `skip_instances` is set, so a valid API key is not necessary.
-        # Setting `local` will run and cache everything locally.
-        auth=Authentication("") if args.skip_instances or args.local else create_authentication(args),
-        conf_paths=args.conf_paths,
-        url=args.server_url,
-        local=args.local,
-        local_path=args.local_path,
-        output_path=args.output_path,
-        suite=args.suite,
-        num_threads=args.num_threads,
-        dry_run=args.dry_run,
-        skip_instances=args.skip_instances,
-        max_eval_instances=args.max_eval_instances,
-        num_train_trials=args.num_train_trials,
-        models_to_run=args.models_to_run,
-        groups_to_run=args.groups_to_run,
-        exit_on_error=args.exit_on_error,
-        priority=args.priority,
-        mongo_uri=args.mongo_uri,
-    )
-
-    # Run the benchmark!
-    runner.run()
-
-    hlog("Done.")
-
-
-if __name__ == "__main__":
-    main()
```
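The deleted `AllRunner` wrapper above orchestrated `run_benchmarking` over every entry in `run_specs.conf`; per the `helm/benchmark/run.py +144 -48` and `entry_points.txt +2 -1` entries in the file list, that responsibility moves into `helm-run` itself in 0.2.2. A hedged sketch of the replacement invocation, assuming (not confirmed by this diff alone) that `helm-run` resolves to `helm.benchmark.run:main` and still accepts the `--conf-paths`, `--suite`, and `--max-eval-instances` flags the wrapper used:

```python
# Hedged sketch: drive the consolidated helm-run entry point programmatically.
# The entry-point target and flag names are assumptions carried over from the
# deleted wrapper's argument parser; verify against your installed version.
import sys

from helm.benchmark.run import main

sys.argv = [
    "helm-run",
    "--conf-paths", "src/helm/benchmark/presentation/run_specs.conf",
    "--suite", "v1",
    "--max-eval-instances", "10",
]
main()
```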