PyPI - crfm-helm - Versions diffs - 0.2.0__tar.gz → 0.2.2__tar.gz - Mend

crfm-helm 0.2.0__tar.gz → 0.2.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (318) hide show

{crfm-helm-0.2.0 → crfm-helm-0.2.2}/MANIFEST.in RENAMED Viewed

@@ -1,3 +1,4 @@
-recursive-include src/helm/benchmark/efficiency_data/ *.json
+recursive-include src/helm/proxy/clients/ *.sp
+recursive-include src/helm/benchmark/ *.json
 recursive-include src/helm/benchmark/static/ *.css *.html *.js *.png *.yaml
 include requirements.txt

{crfm-helm-0.2.0/src/crfm_helm.egg-info → crfm-helm-0.2.2}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: crfm-helm
-Version: 0.2.0
+Version: 0.2.2
 Summary: Benchmark for language models
 Home-page: https://github.com/stanford-crfm/helm
 Author: Stanford CRFM

{crfm-helm-0.2.0 → crfm-helm-0.2.2}/README.md RENAMED Viewed

@@ -4,7 +4,7 @@
 [comment]: <> (When using the img tag, which allows us to specify size, src has to be a URL.)
 <img src="https://github.com/stanford-crfm/helm/raw/main/src/helm/benchmark/static/images/helm-logo.png" alt=""  width="800"/>
-Welcome! The **`crfm-helm`** Python package contains code used in the **Holistic Evaluation of Language Models** project ([paper](https://arxiv.org/abs/2211.09110), [website](https://crfm.stanford.edu/helm/v1.0/)) by [Stanford CRFM](https://crfm.stanford.edu/). This package includes the following features:
+Welcome! The **`crfm-helm`** Python package contains code used in the **Holistic Evaluation of Language Models** project ([paper](https://arxiv.org/abs/2211.09110), [website](https://crfm.stanford.edu/helm/latest/)) by [Stanford CRFM](https://crfm.stanford.edu/). This package includes the following features:
 - Collection of datasets in a standard format (e.g., NaturalQuestions)
 - Collection of models accessible via a unified API (e.g., GPT-3, MT-NLG, OPT, BLOOM)

{crfm-helm-0.2.0 → crfm-helm-0.2.2}/requirements.txt RENAMED Viewed

@@ -6,19 +6,13 @@
 #
 #   pip freeze | xargs pip uninstall -y
 #   pip install -r requirements.txt
+#   pip install -r requirements-dev.txt
 #   pip freeze | grep -v en-core-web-sm > requirements-freeze.txt
 #
 # Also update the versions in the manual installation steps in pre-commit.sh.
 #
 # Check that everything works because the versions might be upgraded.
-# Development
-pytest~=7.2.0
-black~=22.10.0
-mypy~=0.982
-pre-commit~=2.20.0
-flake8~=5.0.4
 # Common
 zstandard~=0.18.0
 tqdm~=4.64.1
@@ -26,6 +20,7 @@ pyhocon~=0.3.59
 dacite~=1.6.0
 # Proxy
+aleph-alpha-client~=2.14.0
 bottle~=0.12.23
 gunicorn~=20.1.0
 Mako~=1.2.3
@@ -35,8 +30,9 @@ sqlitedict~=1.7.0
 pymongo~=4.2.0
 retrying~=1.3.3
 websocket-client~=1.3.2 # For Anthropic
-openai~=0.25.0
-transformers~=4.22.2  # For HuggingFace tokenizer
+openai~=0.27.0
+transformers~=4.26.1
+tokenizers~=0.13.2
 icetk~=0.0.4
 protobuf~=3.20.2  # Can't use 4.21.0 due to backward incompatibility
 google-api-python-client~=2.64.0
@@ -49,6 +45,8 @@ jsonlines~=3.1.0  # Not really needed
 sympy~=1.11.1  # For math scenarios
 sentencepiece~=0.1.97
 numba~=0.56.4
+cattrs~=22.2.0
+xlrd~=2.0.1  # Used by pandas.read_excel in ice_scenario
 # Metrics
 importlib-resources~=5.10.0
@@ -67,3 +65,9 @@ summ-eval~=0.892
 # End users should install a CUDA version of PyTorch manually if needed
 torch~=1.12.1  # Summarization metrics
 torchvision~=0.13.1
+# plotting
+colorcet~=3.0.1
+matplotlib~=3.6.0
+numpy~=1.23.3
+seaborn~=0.11.0

{crfm-helm-0.2.0 → crfm-helm-0.2.2}/setup.py RENAMED Viewed

@@ -11,7 +11,7 @@ def get_requirements(path: str):
 setup(
     name="crfm-helm",
-    version="0.2.0",
+    version="0.2.2",
     description="Benchmark for language models",
     long_description="Benchmark for language models",
     url="https://github.com/stanford-crfm/helm",
@@ -31,9 +31,10 @@ setup(
     install_requires=get_requirements("requirements.txt"),
     entry_points={
         "console_scripts": [
-            "helm-run=helm.benchmark.presentation.present:main",
+            "helm-run=helm.benchmark.run:main",
             "helm-summarize=helm.benchmark.presentation.summarize:main",
             "helm-server=helm.benchmark.server:main",
+            "helm-create-plots=helm.benchmark.presentation.create_plots:main",
             "crfm-proxy-server=helm.proxy.server:main",
             "crfm-proxy-cli=helm.proxy.cli:main",
         ]

{crfm-helm-0.2.0 → crfm-helm-0.2.2/src/crfm_helm.egg-info}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: crfm-helm
-Version: 0.2.0
+Version: 0.2.2
 Summary: Benchmark for language models
 Home-page: https://github.com/stanford-crfm/helm
 Author: Stanford CRFM

{crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/crfm_helm.egg-info/SOURCES.txt RENAMED Viewed

@@ -44,6 +44,7 @@ src/helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py
 src/helm/benchmark/augmentations/__init__.py
 src/helm/benchmark/augmentations/contraction_expansion_perturbation.py
 src/helm/benchmark/augmentations/contrast_sets_perturbation.py
+src/helm/benchmark/augmentations/correct_to_misspelling.json
 src/helm/benchmark/augmentations/data_augmenter.py
 src/helm/benchmark/augmentations/dialect_perturbation.py
 src/helm/benchmark/augmentations/extra_space_perturbation.py
@@ -59,6 +60,7 @@ src/helm/benchmark/augmentations/space_perturbation.py
 src/helm/benchmark/augmentations/synonym_perturbation.py
 src/helm/benchmark/augmentations/test_perturbation.py
 src/helm/benchmark/augmentations/typos_perturbation.py
+src/helm/benchmark/contamination/__init__.py
 src/helm/benchmark/efficiency_data/inference_denoised_runtimes.json
 src/helm/benchmark/efficiency_data/inference_idealized_runtimes.json
 src/helm/benchmark/efficiency_data/training_efficiency.json
@@ -67,10 +69,12 @@ src/helm/benchmark/metrics/basic_metrics.py
 src/helm/benchmark/metrics/bbq_metrics.py
 src/helm/benchmark/metrics/bias_metrics.py
 src/helm/benchmark/metrics/bias_word_lists.py
+src/helm/benchmark/metrics/classification_metrics.py
 src/helm/benchmark/metrics/code_metrics.py
 src/helm/benchmark/metrics/code_metrics_helper.py
 src/helm/benchmark/metrics/copyright_metrics.py
 src/helm/benchmark/metrics/disinformation_metrics.py
+src/helm/benchmark/metrics/machine_translation_metrics.py
 src/helm/benchmark/metrics/metric.py
 src/helm/benchmark/metrics/metric_name.py
 src/helm/benchmark/metrics/metric_service.py
@@ -79,6 +83,7 @@ src/helm/benchmark/metrics/ranking_metrics.py
 src/helm/benchmark/metrics/statistic.py
 src/helm/benchmark/metrics/summarization_metrics.py
 src/helm/benchmark/metrics/test_bias_metrics.py
+src/helm/benchmark/metrics/test_classification_metrics.py
 src/helm/benchmark/metrics/test_metric.py
 src/helm/benchmark/metrics/test_numeracy_metrics.py
 src/helm/benchmark/metrics/test_statistic.py
@@ -99,13 +104,14 @@ src/helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py
 src/helm/benchmark/metrics/tokens/token_cost_estimator.py
 src/helm/benchmark/presentation/__init__.py
 src/helm/benchmark/presentation/contamination.py
-src/helm/benchmark/presentation/present.py
+src/helm/benchmark/presentation/create_plots.py
 src/helm/benchmark/presentation/run_display.py
 src/helm/benchmark/presentation/run_entry.py
 src/helm/benchmark/presentation/schema.py
 src/helm/benchmark/presentation/summarize.py
 src/helm/benchmark/presentation/table.py
 src/helm/benchmark/presentation/test_contamination.py
+src/helm/benchmark/presentation/test_create_plots.py
 src/helm/benchmark/presentation/test_run_entry.py
 src/helm/benchmark/scenarios/__init__.py
 src/helm/benchmark/scenarios/babi_qa_scenario.py
@@ -119,6 +125,7 @@ src/helm/benchmark/scenarios/code_scenario.py
 src/helm/benchmark/scenarios/code_scenario_helper.py
 src/helm/benchmark/scenarios/commonsense_scenario.py
 src/helm/benchmark/scenarios/copyright_scenario.py
+src/helm/benchmark/scenarios/covid_dialog_scenario.py
 src/helm/benchmark/scenarios/dialogue_scenarios.py
 src/helm/benchmark/scenarios/disinformation_scenario.py
 src/helm/benchmark/scenarios/dyck_language_scenario.py
@@ -126,17 +133,26 @@ src/helm/benchmark/scenarios/entity_data_imputation_scenario.py
 src/helm/benchmark/scenarios/entity_matching_scenario.py
 src/helm/benchmark/scenarios/gsm_scenario.py
 src/helm/benchmark/scenarios/ice_scenario.py
+src/helm/benchmark/scenarios/imdb_listdir.json
 src/helm/benchmark/scenarios/imdb_scenario.py
 src/helm/benchmark/scenarios/interactive_qa_mmlu_scenario.py
 src/helm/benchmark/scenarios/legal_support_scenario.py
+src/helm/benchmark/scenarios/lex_glue_scenario.py
+src/helm/benchmark/scenarios/lextreme_scenario.py
 src/helm/benchmark/scenarios/lsat_qa_scenario.py
 src/helm/benchmark/scenarios/math_scenario.py
+src/helm/benchmark/scenarios/me_q_sum_scenario.py
+src/helm/benchmark/scenarios/med_dialog_scenario.py
+src/helm/benchmark/scenarios/med_mcqa_scenario.py
+src/helm/benchmark/scenarios/med_paragraph_simplification_scenario.py
+src/helm/benchmark/scenarios/med_qa_scenario.py
 src/helm/benchmark/scenarios/mmlu_scenario.py
 src/helm/benchmark/scenarios/msmarco_scenario.py
 src/helm/benchmark/scenarios/narrativeqa_scenario.py
 src/helm/benchmark/scenarios/natural_qa_scenario.py
 src/helm/benchmark/scenarios/newsqa_scenario.py
 src/helm/benchmark/scenarios/numeracy_scenario.py
+src/helm/benchmark/scenarios/opinions_qa_scenario.py
 src/helm/benchmark/scenarios/pubmed_qa_scenario.py
 src/helm/benchmark/scenarios/quac_scenario.py
 src/helm/benchmark/scenarios/raft_scenario.py
@@ -153,6 +169,7 @@ src/helm/benchmark/scenarios/truthful_qa_scenario.py
 src/helm/benchmark/scenarios/twitter_aae_scenario.py
 src/helm/benchmark/scenarios/wikifact_scenario.py
 src/helm/benchmark/scenarios/wikitext_103_scenario.py
+src/helm/benchmark/scenarios/wmt_14_scenario.py
 src/helm/benchmark/static/benchmarking.css
 src/helm/benchmark/static/benchmarking.js
 src/helm/benchmark/static/contamination.yaml
@@ -161,6 +178,7 @@ src/helm/benchmark/static/index.html
 src/helm/benchmark/static/info-icon.png
 src/helm/benchmark/static/json-urls-root.js
 src/helm/benchmark/static/json-urls.js
+src/helm/benchmark/static/plot-captions.js
 src/helm/benchmark/static/schema.yaml
 src/helm/benchmark/static/utils.js
 src/helm/benchmark/static/images/crfm-logo.png
@@ -188,21 +206,25 @@ src/helm/benchmark/window_services/anthropic_window_service.py
 src/helm/benchmark/window_services/bloom_window_service.py
 src/helm/benchmark/window_services/cohere_window_service.py
 src/helm/benchmark/window_services/encoder_decoder_window_service.py
+src/helm/benchmark/window_services/flan_t5_window_service.py
 src/helm/benchmark/window_services/gpt2_window_service.py
 src/helm/benchmark/window_services/gptj_window_service.py
 src/helm/benchmark/window_services/gptneox_window_service.py
+src/helm/benchmark/window_services/huggingface_window_service.py
 src/helm/benchmark/window_services/ice_window_service.py
 src/helm/benchmark/window_services/local_window_service.py
 src/helm/benchmark/window_services/luminous_window_service.py
 src/helm/benchmark/window_services/mt_nlg_window_service.py
 src/helm/benchmark/window_services/openai_window_service.py
 src/helm/benchmark/window_services/opt_window_service.py
+src/helm/benchmark/window_services/santacoder_window_service.py
 src/helm/benchmark/window_services/t0pp_window_service.py
 src/helm/benchmark/window_services/t511b_window_service.py
 src/helm/benchmark/window_services/test_ai21_window_service.py
 src/helm/benchmark/window_services/test_bloom_window_service.py
 src/helm/benchmark/window_services/test_cohere_window_service.py
 src/helm/benchmark/window_services/test_cohere_window_service_utils.py
+src/helm/benchmark/window_services/test_flan_t5_window_service.py
 src/helm/benchmark/window_services/test_gpt2_window_service.py
 src/helm/benchmark/window_services/test_gptj_window_service.py
 src/helm/benchmark/window_services/test_gptneox_window_service.py
@@ -217,6 +239,7 @@ src/helm/benchmark/window_services/test_utils.py
 src/helm/benchmark/window_services/test_yalm_window_service.py
 src/helm/benchmark/window_services/tokenizer_service.py
 src/helm/benchmark/window_services/ul2_window_service.py
+src/helm/benchmark/window_services/wider_ai21_window_service.py
 src/helm/benchmark/window_services/wider_openai_window_service.py
 src/helm/benchmark/window_services/window_service.py
 src/helm/benchmark/window_services/window_service_factory.py
@@ -224,12 +247,14 @@ src/helm/benchmark/window_services/yalm_window_service.py
 src/helm/common/__init__.py
 src/helm/common/authentication.py
 src/helm/common/cache.py
+src/helm/common/codec.py
 src/helm/common/general.py
 src/helm/common/hierarchical_logger.py
 src/helm/common/object_spec.py
 src/helm/common/perspective_api_request.py
 src/helm/common/request.py
 src/helm/common/test_cache.py
+src/helm/common/test_codec.py
 src/helm/common/test_general.py
 src/helm/common/tokenization_request.py
 src/helm/proxy/__init__.py
@@ -250,8 +275,10 @@ src/helm/proxy/clients/auto_client.py
 src/helm/proxy/clients/chat_gpt_client.py
 src/helm/proxy/clients/client.py
 src/helm/proxy/clients/cohere_client.py
+src/helm/proxy/clients/google_client.py
 src/helm/proxy/clients/goose_ai_client.py
 src/helm/proxy/clients/huggingface_client.py
+src/helm/proxy/clients/huggingface_model_registry.py
 src/helm/proxy/clients/huggingface_tokenizer.py
 src/helm/proxy/clients/ice_tokenizer_client.py
 src/helm/proxy/clients/microsoft_client.py
@@ -260,6 +287,7 @@ src/helm/proxy/clients/perspective_api_client.py
 src/helm/proxy/clients/simple_client.py
 src/helm/proxy/clients/test_client.py
 src/helm/proxy/clients/test_huggingface_client.py
+src/helm/proxy/clients/test_huggingface_model_registry.py
 src/helm/proxy/clients/test_huggingface_tokenizer.py
 src/helm/proxy/clients/test_ice_tokenizer_client.py
 src/helm/proxy/clients/test_yalm_tokenizer_client.py
@@ -267,6 +295,7 @@ src/helm/proxy/clients/together_client.py
 src/helm/proxy/clients/yalm_tokenizer_client.py
 src/helm/proxy/clients/yalm_tokenizer/__init__.py
 src/helm/proxy/clients/yalm_tokenizer/test_yalm_tokenizer.py
+src/helm/proxy/clients/yalm_tokenizer/voc_100b.sp
 src/helm/proxy/clients/yalm_tokenizer/yalm_tokenizer.py
 src/helm/proxy/services/__init__.py
 src/helm/proxy/services/remote_service.py

{crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/crfm_helm.egg-info/entry_points.txt RENAMED Viewed

@@ -1,6 +1,7 @@
 [console_scripts]
 crfm-proxy-cli = helm.proxy.cli:main
 crfm-proxy-server = helm.proxy.server:main
-helm-run = helm.benchmark.presentation.present:main
+helm-create-plots = helm.benchmark.presentation.create_plots:main
+helm-run = helm.benchmark.run:main
 helm-server = helm.benchmark.server:main
 helm-summarize = helm.benchmark.presentation.summarize:main

{crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/crfm_helm.egg-info/requires.txt RENAMED Viewed

@@ -1,12 +1,8 @@
-pytest~=7.2.0
-black~=22.10.0
-mypy~=0.982
-pre-commit~=2.20.0
-flake8~=5.0.4
 zstandard~=0.18.0
 tqdm~=4.64.1
 pyhocon~=0.3.59
 dacite~=1.6.0
+aleph-alpha-client~=2.14.0
 bottle~=0.12.23
 gunicorn~=20.1.0
 Mako~=1.2.3
@@ -14,8 +10,9 @@ sqlitedict~=1.7.0
 pymongo~=4.2.0
 retrying~=1.3.3
 websocket-client~=1.3.2
-openai~=0.25.0
-transformers~=4.22.2
+openai~=0.27.0
+transformers~=4.26.1
+tokenizers~=0.13.2
 icetk~=0.0.4
 protobuf~=3.20.2
 google-api-python-client~=2.64.0
@@ -26,6 +23,8 @@ jsonlines~=3.1.0
 sympy~=1.11.1
 sentencepiece~=0.1.97
 numba~=0.56.4
+cattrs~=22.2.0
+xlrd~=2.0.1
 importlib-resources~=5.10.0
 nltk~=3.7
 scipy~=1.9.1
@@ -39,3 +38,7 @@ spacy~=3.2.4
 summ-eval~=0.892
 torch~=1.12.1
 torchvision~=0.13.1
+colorcet~=3.0.1
+matplotlib~=3.6.0
+numpy~=1.23.3
+seaborn~=0.11.0

{crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/__init__.py RENAMED Viewed

@@ -42,12 +42,24 @@ from .scenarios import legal_support_scenario  # noqa
 from .scenarios import entity_matching_scenario  # noqa
 from .scenarios import entity_data_imputation_scenario  # noqa
 from .scenarios import big_bench_scenario  # noqa
+from .scenarios import opinions_qa_scenario  # noqa
+# Biomedical
+from .scenarios import covid_dialog_scenario  # noqa
+from .scenarios import me_q_sum_scenario  # noqa
+from .scenarios import med_dialog_scenario  # noqa
+from .scenarios import med_mcqa_scenario  # noqa
+from .scenarios import med_paragraph_simplification_scenario  # noqa
+from .scenarios import med_qa_scenario  # noqa
 from .scenarios import pubmed_qa_scenario  # noqa
+from .scenarios import wmt_14_scenario  # noqa
 # Metrics
 from .metrics import basic_metrics  # noqa
 from .metrics import bbq_metrics  # noqa
 from .metrics import bias_metrics  # noqa
+from .metrics import classification_metrics  # noqa
 from .metrics import code_metrics  # noqa
 from .metrics import copyright_metrics  # noqa
 from .metrics import disinformation_metrics  # noqa
@@ -56,6 +68,7 @@ from .metrics import ranking_metrics  # noqa
 from .metrics import summarization_metrics  # noqa
 from .metrics import toxicity_metrics  # noqa
 from .metrics import tokens_metric  # noqa
+from .metrics import machine_translation_metrics  # noqa
 # Perturbations for data augmentation
 from .augmentations.extra_space_perturbation import ExtraSpacePerturbation  # noqa

{crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/adaptation/adapter_spec.py RENAMED Viewed

@@ -68,6 +68,9 @@ class AdapterSpec:
     # set of training instances.  Used to compute error bars.
     num_train_trials: int = 1
+    # If true, randomly sample N training examples; if false, select N consecutive training examples
+    sample_train: bool = True
     # Decoding parameters (inherited by `Request`)
     # Model to make the request to (need to fill in)

{crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/adaptation/adapters/in_context_learning_adapter.py RENAMED Viewed

@@ -23,7 +23,7 @@ class InContextLearningAdapter(Adapter, ABC):
     @htrack(None)
     def adapt(self, instances: List[Instance], parallelism: int) -> ScenarioState:
         """
-        Takes a a list of `Instance`s and builds a list of corresponding `RequestState`s.
+        Takes a list of `Instance`s and builds a list of corresponding `RequestState`s.
         The reason we don't do this per eval instance is that we create a common set of
         training instances which is shared across all eval instances.
         """
@@ -65,7 +65,9 @@ class InContextLearningAdapter(Adapter, ABC):
         parallelism: int,
     ) -> List[RequestState]:
         self.train_trial_index: int = train_trial_index
-        self.train_instances: List[Instance] = self.sample_examples(all_train_instances, seed=train_trial_index)
+        self.train_instances: List[Instance] = self.sample_examples(
+            all_train_instances, seed=train_trial_index, sample_train=self.adapter_spec.sample_train
+        )
         hlog(f"Sampled {len(self.train_instances)} examples for trial #{self.train_trial_index}.")
         # Generate request_states
@@ -93,7 +95,9 @@ class InContextLearningAdapter(Adapter, ABC):
         return [request_state for result in results for request_state in result]
-    def sample_examples(self, all_train_instances: List[Instance], seed: int) -> List[Instance]:
+    def sample_examples(
+        self, all_train_instances: List[Instance], seed: int, sample_train: bool = True
+    ) -> List[Instance]:
         """
         Sample a random set of train instances to use as examples by following the steps below:
         1. Sort the class labels (i.e., correct References) by the number of Instances that belong to the
@@ -121,9 +125,14 @@ class InContextLearningAdapter(Adapter, ABC):
         random.seed(seed)
         num_instances_to_sample: int = min(len(all_train_instances), self.adapter_spec.max_train_instances)
+        examples: List[Instance] = []
+        if not sample_train:
+            # Select sequentially from the train set
+            examples = all_train_instances[num_instances_to_sample * seed : num_instances_to_sample * (seed + 1)]
+            return examples
         unlabeled_instances: List[Instance] = []
         label_to_instances: Dict[str, List[Instance]] = defaultdict(list)
         for instance in all_train_instances:
             if instance.first_correct_reference:
                 label_to_instances[instance.first_correct_reference.output.text].append(instance)
@@ -145,7 +154,6 @@ class InContextLearningAdapter(Adapter, ABC):
             sorted_labels.extend(labels)
         labels_iterable = cycle(sorted_labels)
-        examples: List[Instance] = []
         while num_instances_to_sample > 0:
             next_label: Optional[str] = next(labels_iterable, None)
             if not next_label:
@@ -218,10 +226,15 @@ class InContextLearningAdapter(Adapter, ABC):
         # References (optionally) and output
         output: str
+        delimiter = ","
         if reference_index is None:
             # Put only the correct reference as the output
-            correct_reference: Optional[Reference] = instance.first_correct_reference
-            output = correct_reference.output.text if correct_reference is not None else "n/a"
+            correct_references: List[Reference] = instance.all_correct_references
+            if not correct_references:
+                output = "n/a"
+            else:
+                output = delimiter.join([correct_reference.output.text for correct_reference in correct_references])
         else:
             reference = instance.references[reference_index]
             output = reference.output.text

crfm-helm 0.2.0__tar.gz → 0.2.2__tar.gz

crfm-helm 0.2.0__tar.gz → 0.2.2__tar.gz