crfm-helm 0.2.0__py3-none-any.whl → 0.2.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {crfm_helm-0.2.0.dist-info → crfm_helm-0.2.2.dist-info}/METADATA +11 -8
- {crfm_helm-0.2.0.dist-info → crfm_helm-0.2.2.dist-info}/RECORD +67 -38
- {crfm_helm-0.2.0.dist-info → crfm_helm-0.2.2.dist-info}/WHEEL +1 -1
- {crfm_helm-0.2.0.dist-info → crfm_helm-0.2.2.dist-info}/entry_points.txt +2 -1
- helm/benchmark/__init__.py +13 -0
- helm/benchmark/adaptation/adapter_spec.py +3 -0
- helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +20 -7
- helm/benchmark/augmentations/correct_to_misspelling.json +1 -0
- helm/benchmark/contamination/__init__.py +0 -0
- helm/benchmark/metrics/classification_metrics.py +70 -0
- helm/benchmark/metrics/machine_translation_metrics.py +36 -0
- helm/benchmark/metrics/summarization_metrics.py +7 -8
- helm/benchmark/metrics/test_classification_metrics.py +150 -0
- helm/benchmark/presentation/create_plots.py +617 -0
- helm/benchmark/presentation/run_display.py +7 -48
- helm/benchmark/presentation/summarize.py +4 -2
- helm/benchmark/presentation/test_create_plots.py +32 -0
- helm/benchmark/run.py +144 -48
- helm/benchmark/run_expander.py +164 -47
- helm/benchmark/run_specs.py +346 -39
- helm/benchmark/runner.py +34 -6
- helm/benchmark/scenarios/copyright_scenario.py +1 -1
- helm/benchmark/scenarios/covid_dialog_scenario.py +84 -0
- helm/benchmark/scenarios/imdb_listdir.json +50014 -0
- helm/benchmark/scenarios/lex_glue_scenario.py +253 -0
- helm/benchmark/scenarios/lextreme_scenario.py +458 -0
- helm/benchmark/scenarios/me_q_sum_scenario.py +86 -0
- helm/benchmark/scenarios/med_dialog_scenario.py +132 -0
- helm/benchmark/scenarios/med_mcqa_scenario.py +102 -0
- helm/benchmark/scenarios/med_paragraph_simplification_scenario.py +119 -0
- helm/benchmark/scenarios/med_qa_scenario.py +96 -0
- helm/benchmark/scenarios/opinions_qa_scenario.py +194 -0
- helm/benchmark/scenarios/scenario.py +5 -0
- helm/benchmark/scenarios/the_pile_scenario.py +1 -1
- helm/benchmark/scenarios/wmt_14_scenario.py +96 -0
- helm/benchmark/static/benchmarking.css +14 -0
- helm/benchmark/static/benchmarking.js +43 -0
- helm/benchmark/static/index.html +2 -0
- helm/benchmark/static/json-urls.js +4 -0
- helm/benchmark/static/plot-captions.js +16 -0
- helm/benchmark/static/schema.yaml +154 -1
- helm/benchmark/window_services/cohere_window_service.py +20 -0
- helm/benchmark/window_services/flan_t5_window_service.py +29 -0
- helm/benchmark/window_services/huggingface_window_service.py +39 -0
- helm/benchmark/window_services/santacoder_window_service.py +27 -0
- helm/benchmark/window_services/test_flan_t5_window_service.py +12 -0
- helm/benchmark/window_services/wider_ai21_window_service.py +13 -0
- helm/benchmark/window_services/window_service_factory.py +34 -7
- helm/common/codec.py +123 -0
- helm/common/general.py +12 -5
- helm/common/test_codec.py +144 -0
- helm/proxy/clients/aleph_alpha_client.py +47 -28
- helm/proxy/clients/auto_client.py +32 -24
- helm/proxy/clients/google_client.py +88 -0
- helm/proxy/clients/huggingface_client.py +32 -16
- helm/proxy/clients/huggingface_model_registry.py +111 -0
- helm/proxy/clients/huggingface_tokenizer.py +25 -7
- helm/proxy/clients/openai_client.py +60 -2
- helm/proxy/clients/test_huggingface_model_registry.py +57 -0
- helm/proxy/clients/test_huggingface_tokenizer.py +3 -0
- helm/proxy/clients/together_client.py +17 -2
- helm/proxy/clients/yalm_tokenizer/voc_100b.sp +0 -0
- helm/proxy/clients/yalm_tokenizer/yalm_tokenizer.py +8 -2
- helm/proxy/models.py +115 -7
- helm/proxy/test_models.py +1 -1
- helm/benchmark/presentation/present.py +0 -249
- {crfm_helm-0.2.0.dist-info → crfm_helm-0.2.2.dist-info}/LICENSE +0 -0
- {crfm_helm-0.2.0.dist-info → crfm_helm-0.2.2.dist-info}/top_level.txt +0 -0
helm/benchmark/metrics/classification_metrics.py
@@ -0,0 +1,70 @@
+from typing import List, Optional
+
+from sklearn.metrics import f1_score
+from sklearn.preprocessing import MultiLabelBinarizer
+
+from helm.benchmark.adaptation.request_state import RequestState
+from helm.benchmark.metrics.basic_metrics import normalize_text
+from helm.benchmark.metrics.metric import Metric, MetricName
+from helm.benchmark.metrics.statistic import Stat
+
+
+class ClassificationMetric(Metric):
+    """Defines metrics for multi-class classification using the generation adapter.
+
+    Currently provides `classification_macro_f1` and `classification_micro_f1`.
+    These are population-level F1 measures to measure classification performance where each
+    generation is a predicted class, and are different from the instance-level F1 measures
+    in `BasicMetrics` that are intended to measure word overlap between the correct references
+    and generations. The correct class should be provided by the normalized text of a correct
+    reference. The predicted class for each instance is the normalized text of the generation.
+
+    Note:
+    - The set of classes is derived from the correct references from all the instances.
+      This means that classes may be omitted if they are never used as a correct reference.
+    - Generations that are not in any of the known classes are counted as a
+      negative prediction for every class.
+    - Perturbed classes are considered different classes from unperturbed
+      classes.
+    - Currently, multi-label classification is not supported.
+    """
+
+    def __init__(self, delimiter: Optional[str] = None):
+        self.delimiter = delimiter
+
+    def is_multi_label(self) -> bool:
+        return bool(self.delimiter)
+
+    def evaluate_instances(self, request_states: List[RequestState]) -> List[Stat]:
+        y_pred: List[List[str]] = []
+        y_true: List[List[str]] = []
+        for request_state in request_states:  # one request state per instance
+            # Only the generation adapter is supported.
+            # TODO: Support multiple_choice_* adapters.
+            if request_state.reference_index is not None:
+                raise ValueError("ClassificationMetric does not support multiple choice separate adapters")
+            if request_state.request_mode == "calibration":
+                raise ValueError("ClassificationMetric does not support calibration requests")
+            assert request_state.result is not None
+            if len(request_state.result.completions) != 1:
+                raise ValueError("Result must contain exactly one completion")
+            if request_state.output_mapping:
+                raise ValueError("ClassificationMetric does not support multiple choice adapters")
+
+            references = request_state.instance.all_correct_references
+            if not self.is_multi_label():
+                assert len(references) == 1
+            correct_ref_texts = [normalize_text(ref.output.text) for ref in references if ref.output.text]
+            y_true.append(correct_ref_texts)
+
+            input_text = request_state.result.completions[0].text
+            predictions = input_text.split(self.delimiter) if self.is_multi_label() else [input_text]
+            y_pred.append([normalize_text(pred) for pred in predictions if pred])
+        labels: List[str] = list(set(y for ys in y_true for y in ys))
+        mlb = MultiLabelBinarizer().fit([labels])
+        y_true = mlb.transform(y_true)
+        y_pred = mlb.transform(y_pred)
+        return [
+            Stat(MetricName("classification_macro_f1")).add(f1_score(y_pred=y_pred, y_true=y_true, average="macro")),
+            Stat(MetricName("classification_micro_f1")).add(f1_score(y_pred=y_pred, y_true=y_true, average="micro")),
+        ]
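The docstring above describes population-level F1 computed by binarizing labels with `MultiLabelBinarizer` and scoring with `f1_score`. The snippet below is a standalone sketch of that step (not part of the package; the label values are made up) showing why a generation outside the known classes ends up as a negative prediction for every class.

```python
from sklearn.metrics import f1_score
from sklearn.preprocessing import MultiLabelBinarizer

# Made-up instances: one correct reference and one normalized generation each.
y_true = [["yes"], ["yes"], ["no"]]
y_pred = [["yes"], ["no"], ["invalid"]]

# The label set comes only from the correct references, as in the metric above.
labels = sorted({label for instance_labels in y_true for label in instance_labels})
mlb = MultiLabelBinarizer().fit([labels])

# "invalid" is unknown to the binarizer, so sklearn warns, ignores it, and its
# row binarizes to all zeros: a negative prediction for every class.
y_true_bin = mlb.transform(y_true)
y_pred_bin = mlb.transform(y_pred)

print(f1_score(y_true=y_true_bin, y_pred=y_pred_bin, average="macro"))
print(f1_score(y_true=y_true_bin, y_pred=y_pred_bin, average="micro"))
```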
helm/benchmark/metrics/machine_translation_metrics.py
@@ -0,0 +1,36 @@
+from typing import List
+from sacrebleu import BLEU
+
+from helm.benchmark.adaptation.request_state import RequestState
+from .metric import Metric
+from .metric_name import MetricName
+from .statistic import Stat
+
+
+class MachineTranslationMetric(Metric):
+    """
+    Compute the BLEU score for Machine Translation scenarios. The implementation is based on sacrebleu.
+    """
+
+    def evaluate_instances(self, request_states: List[RequestState]) -> List[Stat]:
+        """
+        Compute the corpus-level metric based on all request_states.
+        """
+
+        bleu = BLEU()
+
+        refs: List[List[str]] = [[]]
+        sys: List = []
+        for request_state in request_states:
+            # Assume there is one reference per instance. TODO: Support multiple references after adding more scenarios.
+            num_references: int = len(request_state.instance.references)
+            if num_references != 1:
+                raise ValueError(f"This instance has {num_references} references, but we currently only support one.")
+            # Usually there is only one completion for each instance.
+            assert request_state.result is not None
+            if len(request_state.result.completions) != 1:
+                raise ValueError("Each request result should have only exactly one completion.")
+            sys.append(request_state.result.completions[0].text)
+            refs[0].append(request_state.instance.references[0].output.text)
+        bleu_score = bleu.corpus_score(sys, refs).score
+        return [Stat(MetricName("bleu")).add(bleu_score)]
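For readers unfamiliar with sacrebleu's corpus API: `corpus_score` takes the list of system outputs and a list of reference streams, where `refs[0][i]` is the reference for the i-th output, which is why the metric above appends every reference into `refs[0]`. A minimal sketch with made-up sentences (not from the package):

```python
from sacrebleu import BLEU

# One reference stream; the inner list is aligned with the system outputs.
sys_outputs = ["the cat sat on the mat", "hello world"]
refs = [["the cat sat on the mat", "hello there world"]]

bleu = BLEU()
print(bleu.corpus_score(sys_outputs, refs).score)  # corpus-level BLEU on a 0-100 scale
```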
helm/benchmark/metrics/summarization_metrics.py
@@ -5,17 +5,9 @@ import os
 import pickle
 
 import spacy
-import subprocess
-import sys
 from typing import List, Dict, Optional
 from collections import defaultdict
 
-# Need to check spacy module is downloaded before importing DataStatsMetric
-if not spacy.util.is_package("en_core_web_sm"):
-    subprocess.check_call([sys.executable, "-m", "spacy", "download", "en_core_web_sm"])
-
-from summ_eval.data_stats_metric import DataStatsMetric
-
 from helm.benchmark.adaptation.scenario_state import ScenarioState
 from helm.benchmark.adaptation.request_state import RequestState
 from helm.benchmark.adaptation.adapter_spec import AdapterSpec
@@ -56,6 +48,13 @@ class SummarizationMetric(Metric):
             "rouge_2": get_rouge_function("rouge2"),
             "rouge_l": get_rouge_function("rougeL"),
         }
+        # Download en_core_web_sm before importing DataStatsMetric to
+        # avoid triggering a bug in DataStatsMetric that raises
+        # `NameError: name 'stderr' is not defined`
+        if not spacy.util.is_package("en_core_web_sm"):
+            spacy.cli.download("en_core_web_sm")  # type: ignore
+        from summ_eval.data_stats_metric import DataStatsMetric
+
         self.data_stats_metric = DataStatsMetric()
         self.task: str = task
         self.qa_fact_eval: Optional[Dict] = None
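Taken together, these two hunks move the `en_core_web_sm` download and the `summ_eval` import from module import time into the metric's constructor, replacing the `subprocess` call with `spacy.cli.download`. A standalone sketch of that lazy-initialization pattern (the helper name is illustrative, not from the package; assumes spaCy and summ_eval are installed):

```python
import spacy


def _build_data_stats_metric():  # hypothetical helper, not part of crfm-helm
    # Download the spaCy model only if it is missing, then defer the
    # summ_eval import until the model is guaranteed to be present.
    if not spacy.util.is_package("en_core_web_sm"):
        spacy.cli.download("en_core_web_sm")
    from summ_eval.data_stats_metric import DataStatsMetric

    return DataStatsMetric()
```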
helm/benchmark/metrics/test_classification_metrics.py
@@ -0,0 +1,150 @@
+from collections import defaultdict
+from typing import Dict, List, NamedTuple
+
+from pytest import approx
+
+from helm.benchmark.adaptation.request_state import RequestState
+from helm.benchmark.metrics.classification_metrics import ClassificationMetric
+from helm.benchmark.metrics.statistic import Stat
+from helm.benchmark.scenarios.scenario import Input, Instance, Output, Reference, CORRECT_TAG
+from helm.common.request import Request, RequestResult, Sequence
+
+
+class _Option(NamedTuple):
+    text: str
+    is_correct: bool
+
+
+def _request_state(prediction: str, options: List[_Option]):
+    references = [
+        Reference(output=Output(text=option.text), tags=[CORRECT_TAG] if option.is_correct else [])
+        for option in options
+    ]
+    return RequestState(
+        instance=Instance(input=Input(text=""), references=references),
+        reference_index=None,
+        request_mode=None,
+        train_trial_index=0,
+        output_mapping=None,
+        request=Request(),
+        result=RequestResult(
+            success=True, embedding=[], completions=[Sequence(text=prediction, logprob=0.0, tokens=[])], cached=False
+        ),
+        num_train_instances=0,
+        prompt_truncated=False,
+    )
+
+
+def assert_stats_equal(actual_stats: List[Stat], expected_values: Dict[str, float]):
+    actual_values = {stat.name.name: stat.mean for stat in actual_stats}
+    assert actual_values == approx(expected_values)
+
+
+def _expected_stats(all_classes_counts: Dict[str, Dict[str, int]]):
+    micro_counts: Dict[str, int] = defaultdict(int)
+    for class_counts in all_classes_counts.values():
+        for key, class_count in class_counts.items():
+            micro_counts[key] += class_count
+    micro_precision = micro_counts["tp"] / (micro_counts["tp"] + micro_counts["fp"])
+    micro_recall = micro_counts["tp"] / (micro_counts["tp"] + micro_counts["fn"])
+    micro_f1 = 2 * (micro_precision * micro_recall) / (micro_precision + micro_recall)
+
+    class_f1: List[float] = []
+    for class_counts in all_classes_counts.values():
+        class_precision = class_counts["tp"] / (class_counts["tp"] + class_counts["fp"])
+        class_recall = class_counts["tp"] / (class_counts["tp"] + class_counts["fn"])
+        class_f1.append(2 * (class_precision * class_recall) / (class_precision + class_recall))
+    macro_f1 = sum(class_f1) / len(class_f1)
+
+    return {
+        "classification_micro_f1": micro_f1,
+        "classification_macro_f1": macro_f1,
+    }
+
+
+def test_evaluate_instances_binary_generation():
+    metric = ClassificationMetric(delimiter=None)
+
+    request_states = [
+        _request_state("yes", [_Option("yes", True)]),
+        _request_state("yes", [_Option("yes", True)]),
+        _request_state("yes", [_Option("yes", True)]),
+        _request_state("yes", [_Option("no", True)]),
+        _request_state("no", [_Option("yes", True)]),
+        _request_state("no", [_Option("no", True)]),
+        _request_state("invalid", [_Option("no", True)]),
+    ]
+
+    assert_stats_equal(
+        metric.evaluate_instances(request_states),
+        _expected_stats(
+            {
+                "yes": {"tp": 3, "fp": 1, "tn": 2, "fn": 1},
+                "no": {"tp": 1, "fp": 1, "tn": 3, "fn": 2},
+            }
+        ),
+    )
+
+
+def test_evaluate_instances_multi_class():
+    # Note: no "a" because it would get filtered out by normalize_text()
+    metric = ClassificationMetric(delimiter=None)
+
+    def _options(correct: str):
+        return [_Option(text, text == correct) for text in ["d", "b", "c"]]
+
+    request_states = [
+        _request_state("d", _options("d")),
+        _request_state("d", _options("d")),
+        _request_state("d", _options("d")),
+        _request_state("d", _options("b")),
+        _request_state("b", _options("b")),
+        _request_state("b", _options("b")),
+        _request_state("b", _options("c")),
+        _request_state("c", _options("d")),
+        _request_state("c", _options("c")),
+        _request_state("invalid", _options("c")),
+    ]
+    assert_stats_equal(
+        metric.evaluate_instances(request_states),
+        _expected_stats(
+            {
+                "d": {"tp": 3, "fp": 1, "tn": 5, "fn": 1},
+                "b": {"tp": 2, "fp": 1, "tn": 6, "fn": 1},
+                "c": {"tp": 1, "fp": 1, "tn": 6, "fn": 2},
+            }
+        ),
+    )
+
+
+def test_evaluate_instances_multilabel():
+    # Note: no "a" because it would get filtered out by normalize_text()
+    metric = ClassificationMetric(delimiter=",")
+
+    def _options(correct: List[str]):
+        return [_Option(text, text in correct) for text in ["d", "b", "c"]]
+
+    request_states = [
+        _request_state("d,b", _options(["d", "b"])),
+        _request_state("d,b", _options(["d", "c"])),
+        _request_state("d", _options(["d"])),
+        _request_state("c", _options(["b"])),
+        _request_state("b", _options(["b", "c"])),
+        _request_state("d,b", _options(["c"])),
+        _request_state("d,c", _options(["d"])),
+        _request_state("d,b,c", _options(["d", "b", "c"])),
+        _request_state("", []),
+        _request_state("n/a", []),
+        _request_state("invalid", _options(["c"])),
+    ]
+
+    assert_stats_equal(
+        metric.evaluate_instances(request_states),
+        _expected_stats(
+            {
+                "d": {"tp": 5, "fp": 1, "tn": 5, "fn": 0},
+                "b": {"tp": 3, "fp": 2, "tn": 5, "fn": 1},
+                "c": {"tp": 1, "fp": 2, "tn": 4, "fn": 4},
+            }
+        ),
+    )
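As a sanity check on `_expected_stats`, the expected values for the binary test can be worked out by hand from the per-class counts passed in above; this arithmetic is illustrative and not part of the test file.

```python
# Per-class counts from test_evaluate_instances_binary_generation above.
yes = {"tp": 3, "fp": 1, "fn": 1}
no = {"tp": 1, "fp": 1, "fn": 2}


def f1(tp: int, fp: int, fn: int) -> float:
    precision = tp / (tp + fp)
    recall = tp / (tp + fn)
    return 2 * precision * recall / (precision + recall)


# Macro F1 averages the per-class F1 scores: (0.75 + 0.4) / 2 = 0.575.
macro_f1 = (f1(**yes) + f1(**no)) / 2
# Micro F1 pools the counts first: tp=4, fp=2, fn=3 gives 8/13 ≈ 0.615.
micro_f1 = f1(yes["tp"] + no["tp"], yes["fp"] + no["fp"], yes["fn"] + no["fn"])
print(macro_f1, micro_f1)
```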