crfm-helm 0.2.0__py3-none-any.whl → 0.2.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (68)
  1. {crfm_helm-0.2.0.dist-info → crfm_helm-0.2.2.dist-info}/METADATA +11 -8
  2. {crfm_helm-0.2.0.dist-info → crfm_helm-0.2.2.dist-info}/RECORD +67 -38
  3. {crfm_helm-0.2.0.dist-info → crfm_helm-0.2.2.dist-info}/WHEEL +1 -1
  4. {crfm_helm-0.2.0.dist-info → crfm_helm-0.2.2.dist-info}/entry_points.txt +2 -1
  5. helm/benchmark/__init__.py +13 -0
  6. helm/benchmark/adaptation/adapter_spec.py +3 -0
  7. helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +20 -7
  8. helm/benchmark/augmentations/correct_to_misspelling.json +1 -0
  9. helm/benchmark/contamination/__init__.py +0 -0
  10. helm/benchmark/metrics/classification_metrics.py +70 -0
  11. helm/benchmark/metrics/machine_translation_metrics.py +36 -0
  12. helm/benchmark/metrics/summarization_metrics.py +7 -8
  13. helm/benchmark/metrics/test_classification_metrics.py +150 -0
  14. helm/benchmark/presentation/create_plots.py +617 -0
  15. helm/benchmark/presentation/run_display.py +7 -48
  16. helm/benchmark/presentation/summarize.py +4 -2
  17. helm/benchmark/presentation/test_create_plots.py +32 -0
  18. helm/benchmark/run.py +144 -48
  19. helm/benchmark/run_expander.py +164 -47
  20. helm/benchmark/run_specs.py +346 -39
  21. helm/benchmark/runner.py +34 -6
  22. helm/benchmark/scenarios/copyright_scenario.py +1 -1
  23. helm/benchmark/scenarios/covid_dialog_scenario.py +84 -0
  24. helm/benchmark/scenarios/imdb_listdir.json +50014 -0
  25. helm/benchmark/scenarios/lex_glue_scenario.py +253 -0
  26. helm/benchmark/scenarios/lextreme_scenario.py +458 -0
  27. helm/benchmark/scenarios/me_q_sum_scenario.py +86 -0
  28. helm/benchmark/scenarios/med_dialog_scenario.py +132 -0
  29. helm/benchmark/scenarios/med_mcqa_scenario.py +102 -0
  30. helm/benchmark/scenarios/med_paragraph_simplification_scenario.py +119 -0
  31. helm/benchmark/scenarios/med_qa_scenario.py +96 -0
  32. helm/benchmark/scenarios/opinions_qa_scenario.py +194 -0
  33. helm/benchmark/scenarios/scenario.py +5 -0
  34. helm/benchmark/scenarios/the_pile_scenario.py +1 -1
  35. helm/benchmark/scenarios/wmt_14_scenario.py +96 -0
  36. helm/benchmark/static/benchmarking.css +14 -0
  37. helm/benchmark/static/benchmarking.js +43 -0
  38. helm/benchmark/static/index.html +2 -0
  39. helm/benchmark/static/json-urls.js +4 -0
  40. helm/benchmark/static/plot-captions.js +16 -0
  41. helm/benchmark/static/schema.yaml +154 -1
  42. helm/benchmark/window_services/cohere_window_service.py +20 -0
  43. helm/benchmark/window_services/flan_t5_window_service.py +29 -0
  44. helm/benchmark/window_services/huggingface_window_service.py +39 -0
  45. helm/benchmark/window_services/santacoder_window_service.py +27 -0
  46. helm/benchmark/window_services/test_flan_t5_window_service.py +12 -0
  47. helm/benchmark/window_services/wider_ai21_window_service.py +13 -0
  48. helm/benchmark/window_services/window_service_factory.py +34 -7
  49. helm/common/codec.py +123 -0
  50. helm/common/general.py +12 -5
  51. helm/common/test_codec.py +144 -0
  52. helm/proxy/clients/aleph_alpha_client.py +47 -28
  53. helm/proxy/clients/auto_client.py +32 -24
  54. helm/proxy/clients/google_client.py +88 -0
  55. helm/proxy/clients/huggingface_client.py +32 -16
  56. helm/proxy/clients/huggingface_model_registry.py +111 -0
  57. helm/proxy/clients/huggingface_tokenizer.py +25 -7
  58. helm/proxy/clients/openai_client.py +60 -2
  59. helm/proxy/clients/test_huggingface_model_registry.py +57 -0
  60. helm/proxy/clients/test_huggingface_tokenizer.py +3 -0
  61. helm/proxy/clients/together_client.py +17 -2
  62. helm/proxy/clients/yalm_tokenizer/voc_100b.sp +0 -0
  63. helm/proxy/clients/yalm_tokenizer/yalm_tokenizer.py +8 -2
  64. helm/proxy/models.py +115 -7
  65. helm/proxy/test_models.py +1 -1
  66. helm/benchmark/presentation/present.py +0 -249
  67. {crfm_helm-0.2.0.dist-info → crfm_helm-0.2.2.dist-info}/LICENSE +0 -0
  68. {crfm_helm-0.2.0.dist-info → crfm_helm-0.2.2.dist-info}/top_level.txt +0 -0
helm/benchmark/metrics/classification_metrics.py
@@ -0,0 +1,70 @@
+ from typing import List, Optional
+
+ from sklearn.metrics import f1_score
+ from sklearn.preprocessing import MultiLabelBinarizer
+
+ from helm.benchmark.adaptation.request_state import RequestState
+ from helm.benchmark.metrics.basic_metrics import normalize_text
+ from helm.benchmark.metrics.metric import Metric, MetricName
+ from helm.benchmark.metrics.statistic import Stat
+
+
+ class ClassificationMetric(Metric):
+     """Defines metrics for multi-class classification using the generation adapter.
+
+     Currently provides `classification_macro_f1` and `classification_micro_f1`.
+     These are population-level F1 measures to measure classification performance where each
+     generation is a predicted class, and are different from the instance-level F1 measures
+     in `BasicMetrics` that are intended to measure word overlap between the correct references
+     and generations. The correct class should be provided by the normalized text of a correct
+     reference. The predicted class for each instance is the normalized text of the generation.
+
+     Note:
+     - The set of classes is derived from the correct references from all the instances.
+       This means that classes may be omitted if they are never used as a correct reference.
+     - Generations that are not in any of the known classes are counted as a
+       negative prediction for every class.
+     - Perturbed classes are considered different classes from unperturbed
+       classes.
+     - Currently, multi-label classification is not supported.
+     """
+
+     def __init__(self, delimiter: Optional[str] = None):
+         self.delimiter = delimiter
+
+     def is_multi_label(self) -> bool:
+         return bool(self.delimiter)
+
+     def evaluate_instances(self, request_states: List[RequestState]) -> List[Stat]:
+         y_pred: List[List[str]] = []
+         y_true: List[List[str]] = []
+         for request_state in request_states:  # one request state per instance
+             # Only the generation adapter is supported.
+             # TODO: Support multiple_choice_* adapters.
+             if request_state.reference_index is not None:
+                 raise ValueError("ClassificationMetric does not support multiple choice separate adapters")
+             if request_state.request_mode == "calibration":
+                 raise ValueError("ClassificationMetric does not support calibration requests")
+             assert request_state.result is not None
+             if len(request_state.result.completions) != 1:
+                 raise ValueError("Result must contain exactly one completion")
+             if request_state.output_mapping:
+                 raise ValueError("ClassificationMetric does not support multiple choice adapters")
+
+             references = request_state.instance.all_correct_references
+             if not self.is_multi_label():
+                 assert len(references) == 1
+             correct_ref_texts = [normalize_text(ref.output.text) for ref in references if ref.output.text]
+             y_true.append(correct_ref_texts)
+
+             input_text = request_state.result.completions[0].text
+             predictions = input_text.split(self.delimiter) if self.is_multi_label() else [input_text]
+             y_pred.append([normalize_text(pred) for pred in predictions if pred])
+         labels: List[str] = list(set(y for ys in y_true for y in ys))
+         mlb = MultiLabelBinarizer().fit([labels])
+         y_true = mlb.transform(y_true)
+         y_pred = mlb.transform(y_pred)
+         return [
+             Stat(MetricName("classification_macro_f1")).add(f1_score(y_pred=y_pred, y_true=y_true, average="macro")),
+             Stat(MetricName("classification_micro_f1")).add(f1_score(y_pred=y_pred, y_true=y_true, average="micro")),
+         ]
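
Not part of the diff, for orientation: a minimal sketch of the F1 computation that ClassificationMetric delegates to scikit-learn once references and generations are normalized. The labels and predictions below are made up; note how a generation outside the known label set binarizes to an all-zero row, i.e. a negative prediction for every class.

from sklearn.metrics import f1_score
from sklearn.preprocessing import MultiLabelBinarizer

y_true = [["yes"], ["no"], ["yes"]]       # correct class(es) per instance
y_pred = [["yes"], ["yes"], ["invalid"]]  # normalized generation per instance

# The label set is taken only from the correct references, as in the metric.
mlb = MultiLabelBinarizer().fit([["yes", "no"]])
y_true_bin = mlb.transform(y_true)
y_pred_bin = mlb.transform(y_pred)  # "invalid" becomes an all-zero row

print(f1_score(y_true=y_true_bin, y_pred=y_pred_bin, average="macro"))
print(f1_score(y_true=y_true_bin, y_pred=y_pred_bin, average="micro"))
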
helm/benchmark/metrics/machine_translation_metrics.py
@@ -0,0 +1,36 @@
+ from typing import List
+ from sacrebleu import BLEU
+
+ from helm.benchmark.adaptation.request_state import RequestState
+ from .metric import Metric
+ from .metric_name import MetricName
+ from .statistic import Stat
+
+
+ class MachineTranslationMetric(Metric):
+     """
+     Compute the BLEU score for Machine Translation scenarios. The implementation is based on sacrebleu.
+     """
+
+     def evaluate_instances(self, request_states: List[RequestState]) -> List[Stat]:
+         """
+         Compute the corpus-level metric based on all request_states.
+         """
+
+         bleu = BLEU()
+
+         refs: List[List[str]] = [[]]
+         sys: List = []
+         for request_state in request_states:
+             # Assume there is one reference per instance. TODO: Support multiple references after adding more scenarios.
+             num_references: int = len(request_state.instance.references)
+             if num_references != 1:
+                 raise ValueError(f"This instance has {num_references} references, but we currently only support one.")
+             # Usually there is only one completion for each instance.
+             assert request_state.result is not None
+             if len(request_state.result.completions) != 1:
+                 raise ValueError("Each request result should have exactly one completion.")
+             sys.append(request_state.result.completions[0].text)
+             refs[0].append(request_state.instance.references[0].output.text)
+         bleu_score = bleu.corpus_score(sys, refs).score
+         return [Stat(MetricName("bleu")).add(bleu_score)]
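
Not part of the diff: a small usage sketch of sacrebleu's corpus-level BLEU, the call MachineTranslationMetric builds up to (one reference stream, with one hypothesis and one reference per instance). The sentences are invented.

from sacrebleu import BLEU

hypotheses = ["the cat sat on the mat", "it is raining today"]
# One reference stream; entry i is the reference for hypothesis i.
references = [["the cat sat on the mat", "it rains today"]]

bleu = BLEU()
print(bleu.corpus_score(hypotheses, references).score)  # corpus BLEU in [0, 100]
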
helm/benchmark/metrics/summarization_metrics.py
@@ -5,17 +5,9 @@ import os
  import pickle
 
  import spacy
- import subprocess
- import sys
  from typing import List, Dict, Optional
  from collections import defaultdict
 
- # Need to check spacy module is downloaded before importing DataStatsMetric
- if not spacy.util.is_package("en_core_web_sm"):
-     subprocess.check_call([sys.executable, "-m", "spacy", "download", "en_core_web_sm"])
-
- from summ_eval.data_stats_metric import DataStatsMetric
-
  from helm.benchmark.adaptation.scenario_state import ScenarioState
  from helm.benchmark.adaptation.request_state import RequestState
  from helm.benchmark.adaptation.adapter_spec import AdapterSpec
@@ -56,6 +48,13 @@ class SummarizationMetric(Metric):
              "rouge_2": get_rouge_function("rouge2"),
              "rouge_l": get_rouge_function("rougeL"),
          }
+         # Download en_core_web_sm before importing DataStatsMetric to
+         # avoid triggering a bug in DataStatsMetric that raises
+         # `NameError: name 'stderr' is not defined`
+         if not spacy.util.is_package("en_core_web_sm"):
+             spacy.cli.download("en_core_web_sm")  # type: ignore
+         from summ_eval.data_stats_metric import DataStatsMetric
+
          self.data_stats_metric = DataStatsMetric()
          self.task: str = task
          self.qa_fact_eval: Optional[Dict] = None
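
Not part of the diff: the same check-then-download pattern in isolation, which can be run ahead of time so that summ_eval's DataStatsMetric imports cleanly on a fresh machine.

import spacy

# Fetch the spaCy model only if it is not already installed.
if not spacy.util.is_package("en_core_web_sm"):
    spacy.cli.download("en_core_web_sm")

nlp = spacy.load("en_core_web_sm")  # confirms the model loads
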
helm/benchmark/metrics/test_classification_metrics.py
@@ -0,0 +1,150 @@
+ from collections import defaultdict
+ from typing import Dict, List, NamedTuple
+
+ from pytest import approx
+
+ from helm.benchmark.adaptation.request_state import RequestState
+ from helm.benchmark.metrics.classification_metrics import ClassificationMetric
+ from helm.benchmark.metrics.statistic import Stat
+ from helm.benchmark.scenarios.scenario import Input, Instance, Output, Reference, CORRECT_TAG
+ from helm.common.request import Request, RequestResult, Sequence
+
+
+ class _Option(NamedTuple):
+     text: str
+     is_correct: bool
+
+
+ def _request_state(prediction: str, options: List[_Option]):
+     references = [
+         Reference(output=Output(text=option.text), tags=[CORRECT_TAG] if option.is_correct else [])
+         for option in options
+     ]
+     return RequestState(
+         instance=Instance(input=Input(text=""), references=references),
+         reference_index=None,
+         request_mode=None,
+         train_trial_index=0,
+         output_mapping=None,
+         request=Request(),
+         result=RequestResult(
+             success=True, embedding=[], completions=[Sequence(text=prediction, logprob=0.0, tokens=[])], cached=False
+         ),
+         num_train_instances=0,
+         prompt_truncated=False,
+     )
+
+
+ def assert_stats_equal(actual_stats: List[Stat], expected_values: Dict[str, float]):
+     actual_values = {stat.name.name: stat.mean for stat in actual_stats}
+     assert actual_values == approx(expected_values)
+
+
+ def _expected_stats(all_classes_counts: Dict[str, Dict[str, int]]):
+     micro_counts: Dict[str, int] = defaultdict(int)
+     for class_counts in all_classes_counts.values():
+         for key, class_count in class_counts.items():
+             micro_counts[key] += class_count
+     micro_precision = micro_counts["tp"] / (micro_counts["tp"] + micro_counts["fp"])
+     micro_recall = micro_counts["tp"] / (micro_counts["tp"] + micro_counts["fn"])
+     micro_f1 = 2 * (micro_precision * micro_recall) / (micro_precision + micro_recall)
+
+     class_f1: List[float] = []
+     for class_counts in all_classes_counts.values():
+         class_precision = class_counts["tp"] / (class_counts["tp"] + class_counts["fp"])
+         class_recall = class_counts["tp"] / (class_counts["tp"] + class_counts["fn"])
+         class_f1.append(2 * (class_precision * class_recall) / (class_precision + class_recall))
+     macro_f1 = sum(class_f1) / len(class_f1)
+
+     return {
+         "classification_micro_f1": micro_f1,
+         "classification_macro_f1": macro_f1,
+     }
+
+
+ def test_evaluate_instances_binary_generation():
+     metric = ClassificationMetric(delimiter=None)
+
+     request_states = [
+         _request_state("yes", [_Option("yes", True)]),
+         _request_state("yes", [_Option("yes", True)]),
+         _request_state("yes", [_Option("yes", True)]),
+         _request_state("yes", [_Option("no", True)]),
+         _request_state("no", [_Option("yes", True)]),
+         _request_state("no", [_Option("no", True)]),
+         _request_state("invalid", [_Option("no", True)]),
+     ]
+
+     assert_stats_equal(
+         metric.evaluate_instances(request_states),
+         _expected_stats(
+             {
+                 "yes": {"tp": 3, "fp": 1, "tn": 2, "fn": 1},
+                 "no": {"tp": 1, "fp": 1, "tn": 3, "fn": 2},
+             }
+         ),
+     )
+
+
+ def test_evaluate_instances_multi_class():
+     # Note: no "a" because it would get filtered out by normalize_text()
+     metric = ClassificationMetric(delimiter=None)
+
+     def _options(correct: str):
+         return [_Option(text, text == correct) for text in ["d", "b", "c"]]
+
+     request_states = [
+         _request_state("d", _options("d")),
+         _request_state("d", _options("d")),
+         _request_state("d", _options("d")),
+         _request_state("d", _options("b")),
+         _request_state("b", _options("b")),
+         _request_state("b", _options("b")),
+         _request_state("b", _options("c")),
+         _request_state("c", _options("d")),
+         _request_state("c", _options("c")),
+         _request_state("invalid", _options("c")),
+     ]
+     assert_stats_equal(
+         metric.evaluate_instances(request_states),
+         _expected_stats(
+             {
+                 "d": {"tp": 3, "fp": 1, "tn": 5, "fn": 1},
+                 "b": {"tp": 2, "fp": 1, "tn": 6, "fn": 1},
+                 "c": {"tp": 1, "fp": 1, "tn": 6, "fn": 2},
+             }
+         ),
+     )
+
+
+ def test_evaluate_instances_multilabel():
+     # Note: no "a" because it would get filtered out by normalize_text()
+     metric = ClassificationMetric(delimiter=",")
+
+     def _options(correct: List[str]):
+         return [_Option(text, text in correct) for text in ["d", "b", "c"]]
+
+     request_states = [
+         _request_state("d,b", _options(["d", "b"])),
+         _request_state("d,b", _options(["d", "c"])),
+         _request_state("d", _options(["d"])),
+         _request_state("c", _options(["b"])),
+         _request_state("b", _options(["b", "c"])),
+         _request_state("d,b", _options(["c"])),
+         _request_state("d,c", _options(["d"])),
+         _request_state("d,b,c", _options(["d", "b", "c"])),
+         _request_state("", []),
+         _request_state("n/a", []),
+         _request_state("invalid", _options(["c"])),
+     ]
+
+     assert_stats_equal(
+         metric.evaluate_instances(request_states),
+         _expected_stats(
+             {
+                 "d": {"tp": 5, "fp": 1, "tn": 5, "fn": 0},
+                 "b": {"tp": 3, "fp": 2, "tn": 5, "fn": 1},
+                 "c": {"tp": 1, "fp": 2, "tn": 4, "fn": 4},
+             }
+         ),
+     )
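
Not part of the diff: a quick arithmetic check of the binary test's expected values, mirroring what _expected_stats computes from the per-class tp/fp/fn counts.

def f1(tp: int, fp: int, fn: int) -> float:
    precision = tp / (tp + fp)
    recall = tp / (tp + fn)
    return 2 * precision * recall / (precision + recall)

macro = (f1(3, 1, 1) + f1(1, 1, 2)) / 2  # per-class F1 for "yes" and "no", averaged
micro = f1(3 + 1, 1 + 1, 1 + 2)          # counts pooled across both classes
print(round(macro, 3), round(micro, 3))  # 0.575 and 0.615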