crfm-helm 0.2.0__py3-none-any.whl → 0.2.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {crfm_helm-0.2.0.dist-info → crfm_helm-0.2.2.dist-info}/METADATA +11 -8
- {crfm_helm-0.2.0.dist-info → crfm_helm-0.2.2.dist-info}/RECORD +67 -38
- {crfm_helm-0.2.0.dist-info → crfm_helm-0.2.2.dist-info}/WHEEL +1 -1
- {crfm_helm-0.2.0.dist-info → crfm_helm-0.2.2.dist-info}/entry_points.txt +2 -1
- helm/benchmark/__init__.py +13 -0
- helm/benchmark/adaptation/adapter_spec.py +3 -0
- helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +20 -7
- helm/benchmark/augmentations/correct_to_misspelling.json +1 -0
- helm/benchmark/contamination/__init__.py +0 -0
- helm/benchmark/metrics/classification_metrics.py +70 -0
- helm/benchmark/metrics/machine_translation_metrics.py +36 -0
- helm/benchmark/metrics/summarization_metrics.py +7 -8
- helm/benchmark/metrics/test_classification_metrics.py +150 -0
- helm/benchmark/presentation/create_plots.py +617 -0
- helm/benchmark/presentation/run_display.py +7 -48
- helm/benchmark/presentation/summarize.py +4 -2
- helm/benchmark/presentation/test_create_plots.py +32 -0
- helm/benchmark/run.py +144 -48
- helm/benchmark/run_expander.py +164 -47
- helm/benchmark/run_specs.py +346 -39
- helm/benchmark/runner.py +34 -6
- helm/benchmark/scenarios/copyright_scenario.py +1 -1
- helm/benchmark/scenarios/covid_dialog_scenario.py +84 -0
- helm/benchmark/scenarios/imdb_listdir.json +50014 -0
- helm/benchmark/scenarios/lex_glue_scenario.py +253 -0
- helm/benchmark/scenarios/lextreme_scenario.py +458 -0
- helm/benchmark/scenarios/me_q_sum_scenario.py +86 -0
- helm/benchmark/scenarios/med_dialog_scenario.py +132 -0
- helm/benchmark/scenarios/med_mcqa_scenario.py +102 -0
- helm/benchmark/scenarios/med_paragraph_simplification_scenario.py +119 -0
- helm/benchmark/scenarios/med_qa_scenario.py +96 -0
- helm/benchmark/scenarios/opinions_qa_scenario.py +194 -0
- helm/benchmark/scenarios/scenario.py +5 -0
- helm/benchmark/scenarios/the_pile_scenario.py +1 -1
- helm/benchmark/scenarios/wmt_14_scenario.py +96 -0
- helm/benchmark/static/benchmarking.css +14 -0
- helm/benchmark/static/benchmarking.js +43 -0
- helm/benchmark/static/index.html +2 -0
- helm/benchmark/static/json-urls.js +4 -0
- helm/benchmark/static/plot-captions.js +16 -0
- helm/benchmark/static/schema.yaml +154 -1
- helm/benchmark/window_services/cohere_window_service.py +20 -0
- helm/benchmark/window_services/flan_t5_window_service.py +29 -0
- helm/benchmark/window_services/huggingface_window_service.py +39 -0
- helm/benchmark/window_services/santacoder_window_service.py +27 -0
- helm/benchmark/window_services/test_flan_t5_window_service.py +12 -0
- helm/benchmark/window_services/wider_ai21_window_service.py +13 -0
- helm/benchmark/window_services/window_service_factory.py +34 -7
- helm/common/codec.py +123 -0
- helm/common/general.py +12 -5
- helm/common/test_codec.py +144 -0
- helm/proxy/clients/aleph_alpha_client.py +47 -28
- helm/proxy/clients/auto_client.py +32 -24
- helm/proxy/clients/google_client.py +88 -0
- helm/proxy/clients/huggingface_client.py +32 -16
- helm/proxy/clients/huggingface_model_registry.py +111 -0
- helm/proxy/clients/huggingface_tokenizer.py +25 -7
- helm/proxy/clients/openai_client.py +60 -2
- helm/proxy/clients/test_huggingface_model_registry.py +57 -0
- helm/proxy/clients/test_huggingface_tokenizer.py +3 -0
- helm/proxy/clients/together_client.py +17 -2
- helm/proxy/clients/yalm_tokenizer/voc_100b.sp +0 -0
- helm/proxy/clients/yalm_tokenizer/yalm_tokenizer.py +8 -2
- helm/proxy/models.py +115 -7
- helm/proxy/test_models.py +1 -1
- helm/benchmark/presentation/present.py +0 -249
- {crfm_helm-0.2.0.dist-info → crfm_helm-0.2.2.dist-info}/LICENSE +0 -0
- {crfm_helm-0.2.0.dist-info → crfm_helm-0.2.2.dist-info}/top_level.txt +0 -0
helm/benchmark/static/benchmarking.js
CHANGED

@@ -124,6 +124,44 @@ $(function () {
     return $table;
   }
 
+  function renderPlots() {
+    const container = $('<div>', {class: "container"});
+    const links = $('<div>');
+    container.append(links);
+    const tableLinks = [];
+
+    function renderPlot(name, title) {
+      const plot = $('<div>', {class: "plot"});
+      const caption = $('<div>', {class: "plot-caption"}).append(plotCaptions[name]);
+
+      plot.append($('<h3>').append($('<a>', {id: title}).append(title)));
+      plot.append(caption);
+      plot.append($('<img>', {src: plotUrl(suite, name), class: "img-fluid"}));
+      container.append(plot);
+      tableLinks.push($('<a>', {href: '#' + title}).append(title));
+    }
+
+    renderPlot("generic_summary", "Metric spread for core scenarios");
+    renderPlot("model_ranking_all", "Head-to-head win rate per each model");
+
+    renderPlot("accuracy_v_x", "Accuracy as a function of other metrics");
+    renderPlot("metric_correlation", "Correlation between metrics");
+
+    renderPlot("accuracy_v_access", "Accuracy as a function of model access");
+    renderPlot("accuracy_over_num_parameters", "Accuracy across model sizes");
+    renderPlot("accuracy_over_release_date", "Accuracy over time");
+    renderPlot("accuracy_over_the_pile_perplexity", "Accuracy as a function of The Pile perplexity");
+
+    renderPlot("targeted_evals", "Targeted evaluations");
+
+    renderPlot("in_context_ablations", "Number of in-context examples ablation");
+    renderPlot("mc_ablations", "Multiple-choice adaptation ablation");
+
+    links.append(renderItems(tableLinks));
+
+    return container;
+  }
+
   function renderRunsOverview(runSpecs) {
     let query = '';
     const $search = $('<input>', {type: 'text', size: 40, placeholder: 'Enter regex query (enter to open all)'});

@@ -1170,6 +1208,11 @@
     $main.empty()
     $main.append(renderHeader('Scenarios', renderScenarios()));
     refreshHashLocation();
+  } else if (urlParams.plots) {
+    // Plots
+    $main.empty()
+    $main.append(renderHeader('Plots', renderPlots()));
+    refreshHashLocation();
   } else if (urlParams.runSpec || urlParams.runSpecs || urlParams.runSpecRegex) {
     // Predictions for a set of run specs (matching a regular expression)
     $main.text('Loading runs...');
helm/benchmark/static/index.html
CHANGED
@@ -22,6 +22,7 @@
           <li class="nav-item"><a class="nav-link active" href="?models=1">Models</a></li>
           <li class="nav-item"><a class="nav-link active" href="?scenarios=1">Scenarios</a></li>
           <li class="nav-item"><a class="nav-link active" href="?groups=1">Results</a></li>
+          <li class="nav-item"><a class="nav-link active" href="?plots=1">Plots</a></li>
           <li class="nav-item"><a class="nav-link active" href="?runs=1">Raw runs</a></li>
         </ul>
       </div>

@@ -48,5 +49,6 @@
     <script src="json-urls-root.js"></script>
     <script src="json-urls.js"></script>
     <script src="benchmarking.js"></script>
+    <script src="plot-captions.js"></script>
   </body>
 </html>
helm/benchmark/static/json-urls.js
CHANGED

@@ -48,3 +48,7 @@ function predictionsJsonUrl(suite, runSpecName) {
 function requestsJsonUrl(suite, runSpecName) {
   return `${BENCHMARK_OUTPUT_BASE_URL}/runs/${suite}/${runSpecName}/display_requests.json`;
 }
+
+function plotUrl(suite, plotName) {
+  return `${BENCHMARK_OUTPUT_BASE_URL}/runs/${suite}/plots/${plotName}.png`;
+}
helm/benchmark/static/plot-captions.js
ADDED

@@ -0,0 +1,16 @@
+////////////////////////////////////////////////////////////
+// Dictionary of plot captions
+
+const plotCaptions = {
+  "generic_summary": "Metrics for every model on every core scenario as a means for indicating the spread on a per-metric basis.",
+  "model_ranking_all": "The fraction of head-to-head comparisons between the given model and all other models, across all scenarios, where the given model is higher along the metric (e.g. more accurate in the accuracy subfigure). If a model was the highest for the given metric for every scenario, it would receive a score of 1.0; if a model received a score of 0.5, then if a scenario and second model were chosen at random, the outcome of the comparison would be a coin flip. For calibration error, we measure ECE-10; for bias, we measure bias in gender representation.",
+  "accuracy_v_x": "The relationship between accuracy (x-axis) and each of the 6 metrics (calibration, robustness, fairness, social bias, toxicity, efficiency) we study in this work across all core scenarios and for all models. For calibration error, we measure ECE-10; for bias, we measure bias in gender representation; and for efficiency, we measure denoised inference time.",
+  "metric_correlation": "The Pearson correlation between each metric and every other metric (x-axis). The small blue dots denote the correlation on each individual scenario, while the larger orange dots average the correlation across scenarios. Trends are qualitatively similar for other correlation measures (e.g. Spearman correlation). For calibration error, we measure ECE-10; for bias, we measure bias in gender representation; and for efficiency, we measure denoised inference time.",
+  "accuracy_v_access": "The relationship between access (open vs. limited vs. closed) and model accuracy for each of the core scenarios. Shaded bars indicate the performance of the best model for that scenario, whereas the solid bars indicate the performance of the overall most accurate model across all core scenarios.",
+  "accuracy_over_num_parameters": "Cumulative plot, depicting the accuracy of the most accurate model up to a given size across all core scenarios.",
+  "accuracy_over_release_date": "The relationship between time (x-axis) and the accuracy of models (y-axis) across the core scenarios.",
+  "accuracy_over_the_pile_perplexity": "The relationship between log bits-per-byte (BPB) on The Pile and the accuracy on each core scenario.",
+  "targeted_evals": "Model accuracy on scenarios targeting specific performance components (language, knowledge, reasoning).",
+  "in_context_ablations": "For each model, we set the maximum number of in-context examples to [0, 1, 2, 4, 8, 16] and fit as many in-context examples as possible within the context window. We plot performance as a function of the average number of in-context examples actually used.",
+  "mc_ablations": "For each adaptation method (joint, separate, and separate calibrated), we compare models across scenarios."
+};
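The model_ranking_all caption above defines the head-to-head win rate precisely. As a quick illustration of that definition only (not HELM's actual implementation; the toy scores and the tie-handling rule below are assumptions), a minimal Python sketch:

from typing import Dict

# Toy accuracy table (model -> scenario -> score); purely illustrative, not HELM's data structures.
scores: Dict[str, Dict[str, float]] = {
    "model_a": {"scenario_1": 0.80, "scenario_2": 0.60},
    "model_b": {"scenario_1": 0.70, "scenario_2": 0.65},
    "model_c": {"scenario_1": 0.75, "scenario_2": 0.55},
}

def head_to_head_win_rate(model: str) -> float:
    """Fraction of (other model, scenario) comparisons that `model` wins; ties count as half a win."""
    wins = 0.0
    total = 0
    for other in scores:
        if other == model:
            continue
        for scenario, score in scores[model].items():
            total += 1
            if score > scores[other][scenario]:
                wins += 1.0
            elif score == scores[other][scenario]:
                wins += 0.5
    return wins / total

for name in scores:
    print(name, round(head_to_head_win_rate(name), 3))

A model that tops every scenario prints 1.0; one that wins exactly half of its random pairings prints 0.5, matching the coin-flip interpretation in the caption.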
helm/benchmark/static/schema.yaml
CHANGED

@@ -30,6 +30,27 @@ models:
     access: limited
     num_parameters: 17000000000
     release_date: 2022-10-28
+  - name: ai21/j2-jumbo
+    display_name: Jurassic-2 Jumbo (178B)
+    description: Jurassic-2 Jumbo (178B parameters) ([docs](https://www.ai21.com/blog/introducing-j2))
+    creator_organization: AI21 Labs
+    access: limited
+    num_parameters: 178000000000
+    release_date: 2023-03-09
+  - name: ai21/j2-grande
+    display_name: Jurassic-2 Grande (17B)
+    description: Jurassic-2 Grande (17B parameters) ([docs](https://www.ai21.com/blog/introducing-j2))
+    creator_organization: AI21 Labs
+    access: limited
+    num_parameters: 17000000000
+    release_date: 2023-03-09
+  - name: ai21/j2-large
+    display_name: Jurassic-2 Large (7.5B)
+    description: Jurassic-2 Large (7.5B parameters) ([docs](https://www.ai21.com/blog/introducing-j2))
+    creator_organization: AI21 Labs
+    access: limited
+    num_parameters: 7500000000
+    release_date: 2023-03-09
 
   # Aleph Alpha
   # TODO: add Luminous World when it's released
@@ -92,6 +113,13 @@ models:
     num_parameters: 11000000000
     release_date: 2021-10-15
 
+  # BigCode
+  - name: huggingface/santacoder
+    display_name: SantaCoder (1.1B)
+    description: SantaCoder (1.1B parameters) model trained on the Python, Java, and JavaScript subset of The Stack (v1.1) ([model card](https://huggingface.co/bigcode/santacoder)).
+    creator_organization: BigCode
+    access: open
+
   # Cohere
   - name: cohere/xlarge-20220609
     display_name: Cohere xlarge v20220609 (52.4B)
@@ -135,6 +163,22 @@ models:
     access: limited
     num_parameters: 6100000000
     release_date: 2022-11-08
+  - name: cohere/command-medium-beta
+    display_name: Cohere Command beta (6.1B)
+    description: Cohere Command beta (6.1B parameters) is fine-tuned from the medium model to respond well with instruction-like prompts ([details](https://docs.cohere.ai/docs/command-beta)).
+    creator_organization: Cohere
+    access: limited
+    num_parameters: 6100000000
+    release_date: 2022-11-08
+    todo: true
+  - name: cohere/command-xlarge-beta
+    display_name: Cohere Command beta (52.4B)
+    description: Cohere Command beta (52.4B parameters) is fine-tuned from the XL model to respond well with instruction-like prompts ([details](https://docs.cohere.ai/docs/command-beta)).
+    creator_organization: Cohere
+    access: limited
+    num_parameters: 52400000000
+    release_date: 2022-11-08
+    todo: true
 
   # DeepMind
   - name: deepmind/gopher
@@ -188,7 +232,6 @@ models:
     description: Flan-T5 (11B parameters) is T5 fine-tuned on 1.8K tasks ([paper](https://arxiv.org/pdf/2210.11416.pdf)).
     creator_organization: Google
     access: open
-    todo: true
 
   - name: google/palm
     display_name: PaLM (540B)
@@ -197,7 +240,35 @@ models:
     access: closed
     todo: true
 
+  # HazyResearch
+  - name: together/h3-2.7b
+    display_name: H3 (2.7B)
+    description: H3 (2.7B parameters) is a decoder-only language model based on state space models ([paper](https://arxiv.org/abs/2212.14052)).
+    creator_organization: HazyResearch
+    access: open
+    num_parameters: 2700000000
+    release_date: 2023-01-23
+    todo: true
+
   # Meta
+  - name: together/opt-iml-175b
+    display_name: OPT-IML (175B)
+    description: OPT-IML (175B parameters) is a suite of decoder-only transformer LMs that are multi-task fine-tuned on 2000 datasets ([paper](https://arxiv.org/pdf/2212.12017.pdf)).
+    creator_organization: Meta
+    access: open
+    num_parameters: 175000000000
+    release_date: 2022-12-22
+    todo: true
+
+  - name: together/opt-iml-30b
+    display_name: OPT-IML (30B)
+    description: OPT-IML (30B parameters) is a suite of decoder-only transformer LMs that are multi-task fine-tuned on 2000 datasets ([paper](https://arxiv.org/pdf/2212.12017.pdf)).
+    creator_organization: Meta
+    access: open
+    num_parameters: 30000000000
+    release_date: 2022-12-22
+    todo: true
+
   - name: together/opt-175b
     display_name: OPT (175B)
     description: Open Pre-trained Transformers (175B parameters) is a suite of decoder-only pre-trained transformers that are fully and responsibly shared with interested researchers ([paper](https://arxiv.org/pdf/2205.01068.pdf)).
@@ -223,6 +294,15 @@ models:
     release_date: 2022-11-15
     todo: true
 
+  - name: together/galactica-30b
+    display_name: Galactica (30B)
+    description: Galactica (30B parameters) is trained on 48 million papers, textbooks, lecture notes, compounds and proteins, scientific websites, etc. ([paper](https://galactica.org/static/paper.pdf)).
+    creator_organization: Meta
+    access: open
+    num_parameters: 30000000000
+    release_date: 2022-11-15
+    todo: true
+
   # Microsoft/NVIDIA
   - name: microsoft/TNLGv2_530B
     display_name: TNLG v2 (530B)
@@ -327,6 +407,12 @@ models:
     description: Codex-style model that is a stronger, multilingual version of the Codex (12B) model in the [Codex paper](https://arxiv.org/pdf/2107.03374.pdf).
     creator_organization: OpenAI
     access: limited
+  - name: openai/gpt-3.5-turbo-0301
+    display_name: gpt-3.5-turbo-0301
+    description: Sibling model of text-davinci-003 that is optimized for chat but works well for traditional completions tasks as well. Snapshot from 2023-03-01.
+    creator_organization: OpenAI
+    access: limited
+    release_date: 2023-03-01
   - name: openai/chat-gpt
     display_name: ChatGPT
     description: Sibling model to InstructGPT which interacts in a conversational way. See [OpenAI's announcement](https://openai.com/blog/chatgpt/). The size of the model is unknown.
@@ -344,6 +430,24 @@ models:
     num_parameters: 6700000000
     release_date: 2022-11-29
     todo: true
+  - name: together/gpt-neoxt-chat-base-20b
+    display_name: GPT-NeoXT-Chat-Base (20B)
+    description: GPT-NeoXT-Chat-Base (20B) is fine-tuned from GPT-NeoX, serving as a base model for developing open-source chatbots.
+    creator_organization: Together
+    access: open
+    num_parameters: 20000000000
+    release_date: 2023-03-08
+    todo: true
+
+  # Salesforce
+  - name: together/codegen
+    display_name: CodeGen (16B)
+    description: CodeGen (16B parameters) is an open dense code model trained for multi-turn program synthesis ([blog](https://arxiv.org/pdf/2203.13474.pdf)).
+    creator_organization: Tsinghua
+    access: open
+    num_parameters: 16000000000
+    release_date: 2022-03-25
+    todo: true
 
   # Tsinghua
   - name: together/glm
@@ -354,6 +458,15 @@ models:
     num_parameters: 130000000000
     release_date: 2022-08-04
 
+  - name: together/codegeex
+    display_name: CodeGeeX (13B)
+    description: CodeGeeX (13B parameters) is an open dense code model trained on more than 20 programming languages on a corpus of more than 850B tokens ([blog](http://keg.cs.tsinghua.edu.cn/codegeex/)).
+    creator_organization: Tsinghua
+    access: open
+    num_parameters: 13000000000
+    release_date: 2022-09-19
+    todo: true
+
   # Yandex
   - name: together/yalm
     display_name: YaLM (100B)
@@ -563,6 +676,14 @@ metrics:
     display_name: F1
     description: Average F1 score in terms of word overlap between the model output and correct reference.
     lower_is_better: false
+  - name: classification_macro_f1
+    display_name: Macro-F1
+    description: Population-level macro-averaged F1 score.
+    lower_is_better: false
+  - name: classification_micro_f1
+    display_name: Micro-F1
+    description: Population-level micro-averaged F1 score.
+    lower_is_better: false
   - name: absolute_value_difference
     display_name: Absolute difference
     short_display_name: Diff.
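For readers unfamiliar with the distinction these two new metrics draw, a minimal scikit-learn sketch of macro vs. micro averaging (illustrative only; HELM's own computation lives in helm/benchmark/metrics/classification_metrics.py, which is not shown in this diff):

from sklearn.metrics import f1_score

# Toy single-label classification outputs; purely illustrative.
y_true = ["contract", "tort", "contract", "property", "tort", "contract"]
y_pred = ["contract", "contract", "contract", "property", "tort", "tort"]

# Macro-F1: unweighted mean of per-class F1, so rare classes weigh as much as frequent ones.
print(f1_score(y_true, y_pred, average="macro"))
# Micro-F1: F1 over pooled counts; with exactly one label per example this equals accuracy.
print(f1_score(y_true, y_pred, average="micro"))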
@@ -1094,6 +1215,14 @@ metric_groups:
       - name: monte_carlo_entropy
         split: ${main_split}
 
+  - name: classification_metrics
+    display_name: Classification metrics
+    metrics:
+      - name: classification_macro_f1
+        split: ${main_split}
+      - name: classification_micro_f1
+        split: ${main_split}
+
 ############################################################
 run_groups:
   ## Top-level
@@ -2031,6 +2160,30 @@ run_groups:
       when: n/a
       language: synthetic
 
+  - name: lextreme
+    display_name: LEXTREME
+    description: A Multilingual Legal Benchmark for Natural Language Understanding
+    metric_groups:
+      - classification_metrics
+      - calibration
+      - efficiency
+      - general_information
+    environment:
+      main_name: classification_macro_f1
+      main_split: test
+
+  - name: lex_glue
+    display_name: LexGLUE
+    description: A Benchmark Dataset for Legal Language Understanding in English
+    metric_groups:
+      - classification_metrics
+      - calibration
+      - efficiency
+      - general_information
+    environment:
+      main_name: classification_macro_f1
+      main_split: test
+
   - name: entity_data_imputation
     display_name: Data imputation
     description: Scenario from [Mei et al. (2021)](https://ieeexplore.ieee.org/document/9458712/) that tests the ability to impute missing entities in a data table.
helm/benchmark/window_services/cohere_window_service.py
CHANGED

@@ -141,3 +141,23 @@ class CohereWindowService(LocalWindowService):
             result = result[:-1]
 
         return result
+
+
+class CohereCommandWindowService(CohereWindowService):
+    def __init__(self, service: TokenizerService):
+        super().__init__(service)
+
+    @property
+    def max_request_length(self) -> int:
+        """
+        The max request length of the model. For Cohere, this is the same as the `max_sequence_length`.
+        If we exceed the `max_sequence_length`, we get the following error:
+
+        Request failed with too many tokens: total number of tokens (prompt and prediction) cannot
+        exceed 2048 - received 2049. Try using a shorter prompt or a smaller max_tokens value.
+
+        For the Command model, in rare situations, the co.tokenize returns a shorter list of tokens
+        than the co.generate. This causes sequence length errors for rare inputs. Cohere's advice is
+        to reduce the sequence length to 2020 to avoid these issues.
+        """
+        return 2020
helm/benchmark/window_services/flan_t5_window_service.py
ADDED

@@ -0,0 +1,29 @@
+from .encoder_decoder_window_service import EncoderDecoderWindowService
+from .tokenizer_service import TokenizerService
+
+
+class FlanT5WindowService(EncoderDecoderWindowService):
+    def __init__(self, service: TokenizerService):
+        super().__init__(service)
+
+    @property
+    def max_sequence_length(self) -> int:
+        """Return the max sequence length."""
+        # We subtract 1 to account for <extra_id_0> that gets appended to prompts.
+        return 512 - 1
+
+    @property
+    def end_of_text_token(self) -> str:
+        """The end of text token."""
+        return "</s>"
+
+    @property
+    def tokenizer_name(self) -> str:
+        """Name of the tokenizer to use when sending a request."""
+        return "google/flan-t5-xxl"
+
+    @property
+    def prefix_token(self) -> str:
+        """The prefix token is the same as the end of text token."""
+        # echo=True is not supported
+        return ""
helm/benchmark/window_services/huggingface_window_service.py
ADDED

@@ -0,0 +1,39 @@
+from helm.proxy.clients.huggingface_tokenizer import HuggingFaceTokenizers
+from .local_window_service import LocalWindowService
+from .tokenizer_service import TokenizerService
+from helm.proxy.clients.huggingface_client import HuggingFaceModelConfig
+
+
+class HuggingFaceWindowService(LocalWindowService):
+    def __init__(self, service: TokenizerService, model_config: HuggingFaceModelConfig):
+        super().__init__(service)
+        self._tokenizer_name = str(model_config)
+        tokenizer = HuggingFaceTokenizers.get_tokenizer(self._tokenizer_name)
+        self._prefix_token = tokenizer.bos_token
+        self._end_of_text_token = tokenizer.eos_token
+        self._max_request_length = tokenizer.model_max_length
+
+    @property
+    def max_sequence_length(self) -> int:
+        """Return the max sequence length of this tokenizer."""
+        return self._max_request_length
+
+    @property
+    def max_request_length(self) -> int:
+        """Return the max request length of this tokenizer."""
+        return self.max_sequence_length
+
+    @property
+    def end_of_text_token(self) -> str:
+        """The end of text token."""
+        return self._end_of_text_token
+
+    @property
+    def tokenizer_name(self) -> str:
+        """Name of the tokenizer to use when sending a request."""
+        return self._tokenizer_name
+
+    @property
+    def prefix_token(self) -> str:
+        """The prefix token."""
+        return self._prefix_token
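For context, the three values cached in the constructor above come straight off the Hugging Face tokenizer object. A minimal sketch of the same attributes read directly through the transformers library (the gpt2 tokenizer is only an example, not something this diff uses):

from transformers import AutoTokenizer

# Inspect the attributes HuggingFaceWindowService reads: prefix/end-of-text tokens and the window size.
tokenizer = AutoTokenizer.from_pretrained("gpt2")
print(tokenizer.bos_token)         # "<|endoftext|>"
print(tokenizer.eos_token)         # "<|endoftext|>"
print(tokenizer.model_max_length)  # 1024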
helm/benchmark/window_services/santacoder_window_service.py
ADDED

@@ -0,0 +1,27 @@
+from .local_window_service import LocalWindowService
+from .tokenizer_service import TokenizerService
+
+
+class SantaCoderWindowService(LocalWindowService):
+    def __init__(self, service: TokenizerService):
+        super().__init__(service)
+
+    @property
+    def max_sequence_length(self) -> int:
+        return 2048
+
+    @property
+    def max_request_length(self) -> int:
+        return self.max_sequence_length
+
+    @property
+    def end_of_text_token(self) -> str:
+        return "<|endoftext|>"
+
+    @property
+    def tokenizer_name(self) -> str:
+        return "bigcode/santacoder"
+
+    @property
+    def prefix_token(self) -> str:
+        return self.end_of_text_token
helm/benchmark/window_services/test_flan_t5_window_service.py
ADDED

@@ -0,0 +1,12 @@
+import tempfile
+
+from helm.benchmark.window_services.test_t511b_window_service import TestT511bWindowService
+from helm.benchmark.window_services.window_service_factory import TokenizerService, WindowServiceFactory
+from helm.benchmark.window_services.test_utils import get_tokenizer_service
+
+
+class TestFlanT5WindowService(TestT511bWindowService):
+    def setup_method(self):
+        self.path: str = tempfile.mkdtemp()
+        service: TokenizerService = get_tokenizer_service(self.path)
+        self.window_service = WindowServiceFactory.get_window_service("together/flan-t5-xxl", service)
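The test above also doubles as a usage example for the factory. A hedged sketch of what the resolved window service exposes (the printed values mirror the FlanT5WindowService properties shown earlier; this is illustrative, not part of the diff):

import tempfile

from helm.benchmark.window_services.test_utils import get_tokenizer_service
from helm.benchmark.window_services.window_service_factory import WindowServiceFactory

# Resolve a window service by model name, then read the limits the adapters rely on.
service = get_tokenizer_service(tempfile.mkdtemp())
window_service = WindowServiceFactory.get_window_service("together/flan-t5-xxl", service)
print(window_service.tokenizer_name)       # "google/flan-t5-xxl"
print(window_service.max_sequence_length)  # 511 (512 - 1 for the appended <extra_id_0>)
print(window_service.end_of_text_token)    # "</s>"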
helm/benchmark/window_services/wider_ai21_window_service.py
ADDED

@@ -0,0 +1,13 @@
+from .ai21_window_service import AI21WindowService
+
+
+class WiderAI21WindowService(AI21WindowService):
+    @property
+    def max_sequence_length(self) -> int:
+        """
+        Return the max sequence length of the larger AI21 Jurassic-2 models.
+
+        The AI21 server automatically prepends a token to every prompt,
+        so the actual max sequence length is 8192 - 1 = 8191.
+        """
+        return 8191
helm/benchmark/window_services/window_service_factory.py
CHANGED

@@ -1,7 +1,14 @@
-from helm.proxy.models import
+from helm.proxy.models import (
+    get_model,
+    get_model_names_with_tag,
+    Model,
+    AI21_WIDER_CONTEXT_WINDOW_TAG,
+    WIDER_CONTEXT_WINDOW_TAG,
+)
 from .ai21_window_service import AI21WindowService
+from .wider_ai21_window_service import WiderAI21WindowService
 from .anthropic_window_service import AnthropicWindowService
-from .cohere_window_service import CohereWindowService
+from .cohere_window_service import CohereWindowService, CohereCommandWindowService
 from .luminous_window_service import (
     LuminousBaseWindowService,
     LuminousExtendedWindowService,

@@ -12,17 +19,21 @@ from .openai_window_service import OpenAIWindowService
 from .wider_openai_window_service import WiderOpenAIWindowService
 from .mt_nlg_window_service import MTNLGWindowService
 from .bloom_window_service import BloomWindowService
+from .huggingface_window_service import HuggingFaceWindowService
 from .ice_window_service import ICEWindowService
+from .santacoder_window_service import SantaCoderWindowService
 from .gpt2_window_service import GPT2WindowService
 from .gptj_window_service import GPTJWindowService
 from .gptneox_window_service import GPTNeoXWindowService
 from .opt_window_service import OPTWindowService
 from .t0pp_window_service import T0ppWindowService
 from .t511b_window_service import T511bWindowService
+from .flan_t5_window_service import FlanT5WindowService
 from .ul2_window_service import UL2WindowService
 from .yalm_window_service import YaLMWindowService
 from .window_service import WindowService
 from .tokenizer_service import TokenizerService
+from helm.proxy.clients.huggingface_client import get_huggingface_model_config
 
 
 class WindowServiceFactory:

@@ -37,9 +48,13 @@ class WindowServiceFactory:
         engine: str = model.engine
 
         window_service: WindowService
-
+        huggingface_model_config = get_huggingface_model_config(model_name)
+        if huggingface_model_config:
+            window_service = HuggingFaceWindowService(service=service, model_config=huggingface_model_config)
+        elif model_name in get_model_names_with_tag(WIDER_CONTEXT_WINDOW_TAG):
             window_service = WiderOpenAIWindowService(service)
-
+        # For the Google models, we approximate with the OpenAIWindowService
+        elif organization == "openai" or organization == "simple" or organization == "google":
             window_service = OpenAIWindowService(service)
         elif organization == "AlephAlpha":
             if engine == "luminous-base":

@@ -56,6 +71,8 @@ class WindowServiceFactory:
             window_service = MTNLGWindowService(service)
         elif organization == "anthropic":
             window_service = AnthropicWindowService(service)
+        elif engine == "santacoder":
+            window_service = SantaCoderWindowService(service)
         elif model_name == "huggingface/gpt2":
             window_service = GPT2WindowService(service)
         elif model_name == "together/bloom":

@@ -66,22 +83,32 @@
             window_service = ICEWindowService(service)
         elif model_name in ["huggingface/gpt-j-6b", "together/gpt-j-6b", "gooseai/gpt-j-6b"]:
             window_service = GPTJWindowService(service)
-        elif model_name in ["together/gpt-neox-20b", "gooseai/gpt-neo-20b"]:
+        elif model_name in ["together/gpt-neox-20b", "gooseai/gpt-neo-20b", "together/gpt-neoxt-chat-base-20b"]:
             window_service = GPTNeoXWindowService(service)
+        elif model_name == "together/h3-2.7b":
+            window_service = GPT2WindowService(service)
         elif model_name in ["together/opt-66b", "together/opt-175b"]:
            window_service = OPTWindowService(service)
         elif model_name == "together/t0pp":
             window_service = T0ppWindowService(service)
         elif model_name == "together/t5-11b":
             window_service = T511bWindowService(service)
+        elif model_name == "together/flan-t5-xxl":
+            window_service = FlanT5WindowService(service)
         elif model_name == "together/ul2":
             window_service = UL2WindowService(service)
         elif model_name == "together/yalm":
             window_service = YaLMWindowService(service)
         elif organization == "cohere":
-
+            if "command" in engine:
+                window_service = CohereCommandWindowService(service)
+            else:
+                window_service = CohereWindowService(service)
         elif organization == "ai21":
-
+            if model_name in get_model_names_with_tag(AI21_WIDER_CONTEXT_WINDOW_TAG):
+                window_service = WiderAI21WindowService(service=service, gpt2_window_service=GPT2WindowService(service))
+            else:
+                window_service = AI21WindowService(service=service, gpt2_window_service=GPT2WindowService(service))
         else:
             raise ValueError(f"Unhandled model name: {model_name}")
 