PyPI - crfm-helm - Versions diffs - 0.2.1__tar.gz → 0.2.2__tar.gz - Mend

crfm-helm 0.2.1__tar.gz → 0.2.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (316) hide show

{crfm-helm-0.2.1 → crfm-helm-0.2.2}/MANIFEST.in RENAMED Viewed

@@ -1,3 +1,4 @@
+recursive-include src/helm/proxy/clients/ *.sp
 recursive-include src/helm/benchmark/ *.json
 recursive-include src/helm/benchmark/static/ *.css *.html *.js *.png *.yaml
 include requirements.txt

{crfm-helm-0.2.1/src/crfm_helm.egg-info → crfm-helm-0.2.2}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: crfm-helm
-Version: 0.2.1
+Version: 0.2.2
 Summary: Benchmark for language models
 Home-page: https://github.com/stanford-crfm/helm
 Author: Stanford CRFM

{crfm-helm-0.2.1 → crfm-helm-0.2.2}/README.md RENAMED Viewed

@@ -4,7 +4,7 @@
 [comment]: <> (When using the img tag, which allows us to specify size, src has to be a URL.)
 <img src="https://github.com/stanford-crfm/helm/raw/main/src/helm/benchmark/static/images/helm-logo.png" alt=""  width="800"/>
-Welcome! The **`crfm-helm`** Python package contains code used in the **Holistic Evaluation of Language Models** project ([paper](https://arxiv.org/abs/2211.09110), [website](https://crfm.stanford.edu/helm/v1.0/)) by [Stanford CRFM](https://crfm.stanford.edu/). This package includes the following features:
+Welcome! The **`crfm-helm`** Python package contains code used in the **Holistic Evaluation of Language Models** project ([paper](https://arxiv.org/abs/2211.09110), [website](https://crfm.stanford.edu/helm/latest/)) by [Stanford CRFM](https://crfm.stanford.edu/). This package includes the following features:
 - Collection of datasets in a standard format (e.g., NaturalQuestions)
 - Collection of models accessible via a unified API (e.g., GPT-3, MT-NLG, OPT, BLOOM)

{crfm-helm-0.2.1 → crfm-helm-0.2.2}/requirements.txt RENAMED Viewed

@@ -6,19 +6,13 @@
 #
 #   pip freeze | xargs pip uninstall -y
 #   pip install -r requirements.txt
+#   pip install -r requirements-dev.txt
 #   pip freeze | grep -v en-core-web-sm > requirements-freeze.txt
 #
 # Also update the versions in the manual installation steps in pre-commit.sh.
 #
 # Check that everything works because the versions might be upgraded.
-# Development
-pytest~=7.2.0
-black~=22.10.0
-mypy~=0.982
-pre-commit~=2.20.0
-flake8~=5.0.4
 # Common
 zstandard~=0.18.0
 tqdm~=4.64.1
@@ -26,6 +20,7 @@ pyhocon~=0.3.59
 dacite~=1.6.0
 # Proxy
+aleph-alpha-client~=2.14.0
 bottle~=0.12.23
 gunicorn~=20.1.0
 Mako~=1.2.3
@@ -35,8 +30,9 @@ sqlitedict~=1.7.0
 pymongo~=4.2.0
 retrying~=1.3.3
 websocket-client~=1.3.2 # For Anthropic
-openai~=0.25.0
-transformers~=4.22.2  # For HuggingFace tokenizer
+openai~=0.27.0
+transformers~=4.26.1
+tokenizers~=0.13.2
 icetk~=0.0.4
 protobuf~=3.20.2  # Can't use 4.21.0 due to backward incompatibility
 google-api-python-client~=2.64.0
@@ -50,6 +46,7 @@ sympy~=1.11.1  # For math scenarios
 sentencepiece~=0.1.97
 numba~=0.56.4
 cattrs~=22.2.0
+xlrd~=2.0.1  # Used by pandas.read_excel in ice_scenario
 # Metrics
 importlib-resources~=5.10.0
@@ -68,3 +65,9 @@ summ-eval~=0.892
 # End users should install a CUDA version of PyTorch manually if needed
 torch~=1.12.1  # Summarization metrics
 torchvision~=0.13.1
+# plotting
+colorcet~=3.0.1
+matplotlib~=3.6.0
+numpy~=1.23.3
+seaborn~=0.11.0

{crfm-helm-0.2.1 → crfm-helm-0.2.2}/setup.py RENAMED Viewed

@@ -11,7 +11,7 @@ def get_requirements(path: str):
 setup(
     name="crfm-helm",
-    version="0.2.1",
+    version="0.2.2",
     description="Benchmark for language models",
     long_description="Benchmark for language models",
     url="https://github.com/stanford-crfm/helm",
@@ -34,6 +34,7 @@ setup(
             "helm-run=helm.benchmark.run:main",
             "helm-summarize=helm.benchmark.presentation.summarize:main",
             "helm-server=helm.benchmark.server:main",
+            "helm-create-plots=helm.benchmark.presentation.create_plots:main",
             "crfm-proxy-server=helm.proxy.server:main",
             "crfm-proxy-cli=helm.proxy.cli:main",
         ]

{crfm-helm-0.2.1 → crfm-helm-0.2.2/src/crfm_helm.egg-info}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: crfm-helm
-Version: 0.2.1
+Version: 0.2.2
 Summary: Benchmark for language models
 Home-page: https://github.com/stanford-crfm/helm
 Author: Stanford CRFM

{crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/crfm_helm.egg-info/SOURCES.txt RENAMED Viewed

@@ -60,6 +60,7 @@ src/helm/benchmark/augmentations/space_perturbation.py
 src/helm/benchmark/augmentations/synonym_perturbation.py
 src/helm/benchmark/augmentations/test_perturbation.py
 src/helm/benchmark/augmentations/typos_perturbation.py
+src/helm/benchmark/contamination/__init__.py
 src/helm/benchmark/efficiency_data/inference_denoised_runtimes.json
 src/helm/benchmark/efficiency_data/inference_idealized_runtimes.json
 src/helm/benchmark/efficiency_data/training_efficiency.json
@@ -103,12 +104,14 @@ src/helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py
 src/helm/benchmark/metrics/tokens/token_cost_estimator.py
 src/helm/benchmark/presentation/__init__.py
 src/helm/benchmark/presentation/contamination.py
+src/helm/benchmark/presentation/create_plots.py
 src/helm/benchmark/presentation/run_display.py
 src/helm/benchmark/presentation/run_entry.py
 src/helm/benchmark/presentation/schema.py
 src/helm/benchmark/presentation/summarize.py
 src/helm/benchmark/presentation/table.py
 src/helm/benchmark/presentation/test_contamination.py
+src/helm/benchmark/presentation/test_create_plots.py
 src/helm/benchmark/presentation/test_run_entry.py
 src/helm/benchmark/scenarios/__init__.py
 src/helm/benchmark/scenarios/babi_qa_scenario.py
@@ -130,6 +133,7 @@ src/helm/benchmark/scenarios/entity_data_imputation_scenario.py
 src/helm/benchmark/scenarios/entity_matching_scenario.py
 src/helm/benchmark/scenarios/gsm_scenario.py
 src/helm/benchmark/scenarios/ice_scenario.py
+src/helm/benchmark/scenarios/imdb_listdir.json
 src/helm/benchmark/scenarios/imdb_scenario.py
 src/helm/benchmark/scenarios/interactive_qa_mmlu_scenario.py
 src/helm/benchmark/scenarios/legal_support_scenario.py
@@ -148,6 +152,7 @@ src/helm/benchmark/scenarios/narrativeqa_scenario.py
 src/helm/benchmark/scenarios/natural_qa_scenario.py
 src/helm/benchmark/scenarios/newsqa_scenario.py
 src/helm/benchmark/scenarios/numeracy_scenario.py
+src/helm/benchmark/scenarios/opinions_qa_scenario.py
 src/helm/benchmark/scenarios/pubmed_qa_scenario.py
 src/helm/benchmark/scenarios/quac_scenario.py
 src/helm/benchmark/scenarios/raft_scenario.py
@@ -173,6 +178,7 @@ src/helm/benchmark/static/index.html
 src/helm/benchmark/static/info-icon.png
 src/helm/benchmark/static/json-urls-root.js
 src/helm/benchmark/static/json-urls.js
+src/helm/benchmark/static/plot-captions.js
 src/helm/benchmark/static/schema.yaml
 src/helm/benchmark/static/utils.js
 src/helm/benchmark/static/images/crfm-logo.png
@@ -200,9 +206,11 @@ src/helm/benchmark/window_services/anthropic_window_service.py
 src/helm/benchmark/window_services/bloom_window_service.py
 src/helm/benchmark/window_services/cohere_window_service.py
 src/helm/benchmark/window_services/encoder_decoder_window_service.py
+src/helm/benchmark/window_services/flan_t5_window_service.py
 src/helm/benchmark/window_services/gpt2_window_service.py
 src/helm/benchmark/window_services/gptj_window_service.py
 src/helm/benchmark/window_services/gptneox_window_service.py
+src/helm/benchmark/window_services/huggingface_window_service.py
 src/helm/benchmark/window_services/ice_window_service.py
 src/helm/benchmark/window_services/local_window_service.py
 src/helm/benchmark/window_services/luminous_window_service.py
@@ -216,6 +224,7 @@ src/helm/benchmark/window_services/test_ai21_window_service.py
 src/helm/benchmark/window_services/test_bloom_window_service.py
 src/helm/benchmark/window_services/test_cohere_window_service.py
 src/helm/benchmark/window_services/test_cohere_window_service_utils.py
+src/helm/benchmark/window_services/test_flan_t5_window_service.py
 src/helm/benchmark/window_services/test_gpt2_window_service.py
 src/helm/benchmark/window_services/test_gptj_window_service.py
 src/helm/benchmark/window_services/test_gptneox_window_service.py
@@ -230,6 +239,7 @@ src/helm/benchmark/window_services/test_utils.py
 src/helm/benchmark/window_services/test_yalm_window_service.py
 src/helm/benchmark/window_services/tokenizer_service.py
 src/helm/benchmark/window_services/ul2_window_service.py
+src/helm/benchmark/window_services/wider_ai21_window_service.py
 src/helm/benchmark/window_services/wider_openai_window_service.py
 src/helm/benchmark/window_services/window_service.py
 src/helm/benchmark/window_services/window_service_factory.py
@@ -268,6 +278,7 @@ src/helm/proxy/clients/cohere_client.py
 src/helm/proxy/clients/google_client.py
 src/helm/proxy/clients/goose_ai_client.py
 src/helm/proxy/clients/huggingface_client.py
+src/helm/proxy/clients/huggingface_model_registry.py
 src/helm/proxy/clients/huggingface_tokenizer.py
 src/helm/proxy/clients/ice_tokenizer_client.py
 src/helm/proxy/clients/microsoft_client.py
@@ -276,6 +287,7 @@ src/helm/proxy/clients/perspective_api_client.py
 src/helm/proxy/clients/simple_client.py
 src/helm/proxy/clients/test_client.py
 src/helm/proxy/clients/test_huggingface_client.py
+src/helm/proxy/clients/test_huggingface_model_registry.py
 src/helm/proxy/clients/test_huggingface_tokenizer.py
 src/helm/proxy/clients/test_ice_tokenizer_client.py
 src/helm/proxy/clients/test_yalm_tokenizer_client.py
@@ -283,6 +295,7 @@ src/helm/proxy/clients/together_client.py
 src/helm/proxy/clients/yalm_tokenizer_client.py
 src/helm/proxy/clients/yalm_tokenizer/__init__.py
 src/helm/proxy/clients/yalm_tokenizer/test_yalm_tokenizer.py
+src/helm/proxy/clients/yalm_tokenizer/voc_100b.sp
 src/helm/proxy/clients/yalm_tokenizer/yalm_tokenizer.py
 src/helm/proxy/services/__init__.py
 src/helm/proxy/services/remote_service.py

{crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/crfm_helm.egg-info/entry_points.txt RENAMED Viewed

@@ -1,6 +1,7 @@
 [console_scripts]
 crfm-proxy-cli = helm.proxy.cli:main
 crfm-proxy-server = helm.proxy.server:main
+helm-create-plots = helm.benchmark.presentation.create_plots:main
 helm-run = helm.benchmark.run:main
 helm-server = helm.benchmark.server:main
 helm-summarize = helm.benchmark.presentation.summarize:main

{crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/crfm_helm.egg-info/requires.txt RENAMED Viewed

@@ -1,12 +1,8 @@
-pytest~=7.2.0
-black~=22.10.0
-mypy~=0.982
-pre-commit~=2.20.0
-flake8~=5.0.4
 zstandard~=0.18.0
 tqdm~=4.64.1
 pyhocon~=0.3.59
 dacite~=1.6.0
+aleph-alpha-client~=2.14.0
 bottle~=0.12.23
 gunicorn~=20.1.0
 Mako~=1.2.3
@@ -14,8 +10,9 @@ sqlitedict~=1.7.0
 pymongo~=4.2.0
 retrying~=1.3.3
 websocket-client~=1.3.2
-openai~=0.25.0
-transformers~=4.22.2
+openai~=0.27.0
+transformers~=4.26.1
+tokenizers~=0.13.2
 icetk~=0.0.4
 protobuf~=3.20.2
 google-api-python-client~=2.64.0
@@ -27,6 +24,7 @@ sympy~=1.11.1
 sentencepiece~=0.1.97
 numba~=0.56.4
 cattrs~=22.2.0
+xlrd~=2.0.1
 importlib-resources~=5.10.0
 nltk~=3.7
 scipy~=1.9.1
@@ -40,3 +38,7 @@ spacy~=3.2.4
 summ-eval~=0.892
 torch~=1.12.1
 torchvision~=0.13.1
+colorcet~=3.0.1
+matplotlib~=3.6.0
+numpy~=1.23.3
+seaborn~=0.11.0

{crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/__init__.py RENAMED Viewed

@@ -42,6 +42,8 @@ from .scenarios import legal_support_scenario  # noqa
 from .scenarios import entity_matching_scenario  # noqa
 from .scenarios import entity_data_imputation_scenario  # noqa
 from .scenarios import big_bench_scenario  # noqa
+from .scenarios import opinions_qa_scenario  # noqa
 # Biomedical
 from .scenarios import covid_dialog_scenario  # noqa

{crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/adaptation/adapter_spec.py RENAMED Viewed

@@ -68,6 +68,9 @@ class AdapterSpec:
     # set of training instances.  Used to compute error bars.
     num_train_trials: int = 1
+    # If true, randomly sample N training examples; if false, select N consecutive training examples
+    sample_train: bool = True
     # Decoding parameters (inherited by `Request`)
     # Model to make the request to (need to fill in)

{crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/adaptation/adapters/in_context_learning_adapter.py RENAMED Viewed

@@ -23,7 +23,7 @@ class InContextLearningAdapter(Adapter, ABC):
     @htrack(None)
     def adapt(self, instances: List[Instance], parallelism: int) -> ScenarioState:
         """
-        Takes a a list of `Instance`s and builds a list of corresponding `RequestState`s.
+        Takes a list of `Instance`s and builds a list of corresponding `RequestState`s.
         The reason we don't do this per eval instance is that we create a common set of
         training instances which is shared across all eval instances.
         """
@@ -65,7 +65,9 @@ class InContextLearningAdapter(Adapter, ABC):
         parallelism: int,
     ) -> List[RequestState]:
         self.train_trial_index: int = train_trial_index
-        self.train_instances: List[Instance] = self.sample_examples(all_train_instances, seed=train_trial_index)
+        self.train_instances: List[Instance] = self.sample_examples(
+            all_train_instances, seed=train_trial_index, sample_train=self.adapter_spec.sample_train
+        )
         hlog(f"Sampled {len(self.train_instances)} examples for trial #{self.train_trial_index}.")
         # Generate request_states
@@ -93,7 +95,9 @@ class InContextLearningAdapter(Adapter, ABC):
         return [request_state for result in results for request_state in result]
-    def sample_examples(self, all_train_instances: List[Instance], seed: int) -> List[Instance]:
+    def sample_examples(
+        self, all_train_instances: List[Instance], seed: int, sample_train: bool = True
+    ) -> List[Instance]:
         """
         Sample a random set of train instances to use as examples by following the steps below:
         1. Sort the class labels (i.e., correct References) by the number of Instances that belong to the
@@ -121,9 +125,14 @@ class InContextLearningAdapter(Adapter, ABC):
         random.seed(seed)
         num_instances_to_sample: int = min(len(all_train_instances), self.adapter_spec.max_train_instances)
+        examples: List[Instance] = []
+        if not sample_train:
+            # Select sequentially from the train set
+            examples = all_train_instances[num_instances_to_sample * seed : num_instances_to_sample * (seed + 1)]
+            return examples
         unlabeled_instances: List[Instance] = []
         label_to_instances: Dict[str, List[Instance]] = defaultdict(list)
         for instance in all_train_instances:
             if instance.first_correct_reference:
                 label_to_instances[instance.first_correct_reference.output.text].append(instance)
@@ -145,7 +154,6 @@ class InContextLearningAdapter(Adapter, ABC):
             sorted_labels.extend(labels)
         labels_iterable = cycle(sorted_labels)
-        examples: List[Instance] = []
         while num_instances_to_sample > 0:
             next_label: Optional[str] = next(labels_iterable, None)
             if not next_label:
@@ -218,10 +226,15 @@ class InContextLearningAdapter(Adapter, ABC):
         # References (optionally) and output
         output: str
+        delimiter = ","
         if reference_index is None:
             # Put only the correct reference as the output
-            correct_reference: Optional[Reference] = instance.first_correct_reference
-            output = correct_reference.output.text if correct_reference is not None else "n/a"
+            correct_references: List[Reference] = instance.all_correct_references
+            if not correct_references:
+                output = "n/a"
+            else:
+                output = delimiter.join([correct_reference.output.text for correct_reference in correct_references])
         else:
             reference = instance.references[reference_index]
             output = reference.output.text

{crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/metrics/classification_metrics.py RENAMED Viewed

@@ -1,6 +1,7 @@
-from typing import List
+from typing import List, Optional
 from sklearn.metrics import f1_score
+from sklearn.preprocessing import MultiLabelBinarizer
 from helm.benchmark.adaptation.request_state import RequestState
 from helm.benchmark.metrics.basic_metrics import normalize_text
@@ -20,8 +21,7 @@ class ClassificationMetric(Metric):
     Note:
     - The set of classes is derived from the correct references from all the instances.
-      This means that classes may be omitted if they never are never used as a correct
-      reference.
+      This means that classes may be omitted if they are never used as a correct reference.
     - Generations that are not in any of the known classes are counted as a
       negative prediction for every class.
     - Perturbed classes are considered different classes from unperturbed
@@ -29,10 +29,16 @@ class ClassificationMetric(Metric):
     - Currently, multi-label classification is not supported.
     """
+    def __init__(self, delimiter: Optional[str] = None):
+        self.delimiter = delimiter
+    def is_multi_label(self) -> bool:
+        return bool(self.delimiter)
     def evaluate_instances(self, request_states: List[RequestState]) -> List[Stat]:
-        y_pred: List[str] = []
-        y_true: List[str] = []
-        for request_state in request_states:
+        y_pred: List[List[str]] = []
+        y_true: List[List[str]] = []
+        for request_state in request_states:  # one request state per instance
             # Only the generation adapter is supported.
             # TODO: Support multiple_choice_* adapters.
             if request_state.reference_index is not None:
@@ -42,24 +48,23 @@ class ClassificationMetric(Metric):
             assert request_state.result is not None
             if len(request_state.result.completions) != 1:
                 raise ValueError("Result must contain exactly one completion")
-            num_correct = 0
-            for reference in request_state.instance.references:
-                if reference.is_correct:
-                    num_correct += 1
-                    y_true.append(normalize_text(reference.output.text))
-            if num_correct != 1:
-                # TODO: Support multi-label classification.
-                raise ValueError("ClassificationMetric does not support multi-label classification")
             if request_state.output_mapping:
                 raise ValueError("ClassificationMetric does not support multiple choice adapters")
-            y_pred.append(normalize_text(request_state.result.completions[0].text))
-        labels = list(set(y_true))
+            references = request_state.instance.all_correct_references
+            if not self.is_multi_label():
+                assert len(references) == 1
+            correct_ref_texts = [normalize_text(ref.output.text) for ref in references if ref.output.text]
+            y_true.append(correct_ref_texts)
+            input_text = request_state.result.completions[0].text
+            predictions = input_text.split(self.delimiter) if self.is_multi_label() else [input_text]
+            y_pred.append([normalize_text(pred) for pred in predictions if pred])
+        labels: List[str] = list(set(y for ys in y_true for y in ys))
+        mlb = MultiLabelBinarizer().fit([labels])
+        y_true = mlb.transform(y_true)
+        y_pred = mlb.transform(y_pred)
         return [
-            Stat(MetricName("classification_macro_f1")).add(
-                f1_score(y_pred=y_pred, y_true=y_true, labels=list(labels), average="macro")
-            ),
-            Stat(MetricName("classification_micro_f1")).add(
-                f1_score(y_pred=y_pred, y_true=y_true, labels=list(labels), average="micro")
-            ),
+            Stat(MetricName("classification_macro_f1")).add(f1_score(y_pred=y_pred, y_true=y_true, average="macro")),
+            Stat(MetricName("classification_micro_f1")).add(f1_score(y_pred=y_pred, y_true=y_true, average="micro")),
         ]

{crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/metrics/test_classification_metrics.py RENAMED Viewed

@@ -63,7 +63,8 @@ def _expected_stats(all_classes_counts: Dict[str, Dict[str, int]]):
 def test_evaluate_instances_binary_generation():
-    metric = ClassificationMetric()
+    metric = ClassificationMetric(delimiter=None)
     request_states = [
         _request_state("yes", [_Option("yes", True)]),
         _request_state("yes", [_Option("yes", True)]),
@@ -86,20 +87,21 @@ def test_evaluate_instances_binary_generation():
 def test_evaluate_instances_multi_class():
-    metric = ClassificationMetric()
+    # Note: no "a" because it would get filtered out by normalize_text()
+    metric = ClassificationMetric(delimiter=None)
     def _options(correct: str):
-        return [_Option(text, text == correct) for text in ["a", "b", "c"]]
+        return [_Option(text, text == correct) for text in ["d", "b", "c"]]
     request_states = [
-        _request_state("a", _options("a")),
-        _request_state("a", _options("a")),
-        _request_state("a", _options("a")),
-        _request_state("a", _options("b")),
+        _request_state("d", _options("d")),
+        _request_state("d", _options("d")),
+        _request_state("d", _options("d")),
+        _request_state("d", _options("b")),
         _request_state("b", _options("b")),
         _request_state("b", _options("b")),
         _request_state("b", _options("c")),
-        _request_state("c", _options("a")),
+        _request_state("c", _options("d")),
         _request_state("c", _options("c")),
         _request_state("invalid", _options("c")),
     ]
@@ -107,9 +109,42 @@ def test_evaluate_instances_multi_class():
         metric.evaluate_instances(request_states),
         _expected_stats(
             {
-                "a": {"tp": 3, "fp": 1, "tn": 5, "fn": 1},
+                "d": {"tp": 3, "fp": 1, "tn": 5, "fn": 1},
                 "b": {"tp": 2, "fp": 1, "tn": 6, "fn": 1},
                 "c": {"tp": 1, "fp": 1, "tn": 6, "fn": 2},
             }
         ),
     )
+def test_evaluate_instances_multilabel():
+    # Note: no "a" because it would get filtered out by normalize_text()
+    metric = ClassificationMetric(delimiter=",")
+    def _options(correct: List[str]):
+        return [_Option(text, text in correct) for text in ["d", "b", "c"]]
+    request_states = [
+        _request_state("d,b", _options(["d", "b"])),
+        _request_state("d,b", _options(["d", "c"])),
+        _request_state("d", _options(["d"])),
+        _request_state("c", _options(["b"])),
+        _request_state("b", _options(["b", "c"])),
+        _request_state("d,b", _options(["c"])),
+        _request_state("d,c", _options(["d"])),
+        _request_state("d,b,c", _options(["d", "b", "c"])),
+        _request_state("", []),
+        _request_state("n/a", []),
+        _request_state("invalid", _options(["c"])),
+    ]
+    assert_stats_equal(
+        metric.evaluate_instances(request_states),
+        _expected_stats(
+            {
+                "d": {"tp": 5, "fp": 1, "tn": 5, "fn": 0},
+                "b": {"tp": 3, "fp": 2, "tn": 5, "fn": 1},
+                "c": {"tp": 1, "fp": 2, "tn": 4, "fn": 4},
+            }
+        ),
+    )

crfm-helm 0.2.1__tar.gz → 0.2.2__tar.gz

crfm-helm 0.2.1__tar.gz → 0.2.2__tar.gz