crfm-helm 0.5.2__py3-none-any.whl → 0.5.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (184)
  1. {crfm_helm-0.5.2.dist-info → crfm_helm-0.5.3.dist-info}/METADATA +29 -55
  2. {crfm_helm-0.5.2.dist-info → crfm_helm-0.5.3.dist-info}/RECORD +146 -134
  3. {crfm_helm-0.5.2.dist-info → crfm_helm-0.5.3.dist-info}/WHEEL +1 -1
  4. helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +12 -5
  5. helm/benchmark/adaptation/adapters/test_generation_adapter.py +12 -12
  6. helm/benchmark/adaptation/adapters/test_language_modeling_adapter.py +8 -8
  7. helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py +77 -9
  8. helm/benchmark/adaptation/common_adapter_specs.py +2 -0
  9. helm/benchmark/annotation/anthropic_red_team_annotator.py +70 -0
  10. helm/benchmark/annotation/call_center_annotator.py +247 -0
  11. helm/benchmark/annotation/financebench_annotator.py +79 -0
  12. helm/benchmark/annotation/harm_bench_annotator.py +68 -0
  13. helm/benchmark/annotation/{image2structure → image2struct}/latex_compiler_annotator.py +2 -2
  14. helm/benchmark/annotation/{image2structure → image2struct}/lilypond_compiler_annotator.py +5 -3
  15. helm/benchmark/annotation/{image2structure → image2struct}/webpage_compiler_annotator.py +5 -5
  16. helm/benchmark/annotation/live_qa_annotator.py +32 -45
  17. helm/benchmark/annotation/medication_qa_annotator.py +31 -44
  18. helm/benchmark/annotation/model_as_judge.py +45 -0
  19. helm/benchmark/annotation/simple_safety_tests_annotator.py +64 -0
  20. helm/benchmark/annotation/xstest_annotator.py +110 -0
  21. helm/benchmark/metrics/annotation_metrics.py +108 -0
  22. helm/benchmark/metrics/bhasa_metrics.py +188 -0
  23. helm/benchmark/metrics/bhasa_metrics_specs.py +10 -0
  24. helm/benchmark/metrics/code_metrics_helper.py +11 -1
  25. helm/benchmark/metrics/safety_metrics.py +57 -0
  26. helm/benchmark/metrics/summac/model_summac.py +3 -3
  27. helm/benchmark/metrics/tokens/test_ai21_token_cost_estimator.py +2 -2
  28. helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py +4 -4
  29. helm/benchmark/metrics/vision_language/image_metrics.py +1 -1
  30. helm/benchmark/metrics/vision_language/image_utils.py +1 -1
  31. helm/benchmark/model_metadata_registry.py +3 -3
  32. helm/benchmark/presentation/test_run_entry.py +1 -0
  33. helm/benchmark/run.py +15 -0
  34. helm/benchmark/run_expander.py +56 -30
  35. helm/benchmark/run_specs/bhasa_run_specs.py +638 -0
  36. helm/benchmark/run_specs/call_center_run_specs.py +152 -0
  37. helm/benchmark/run_specs/decodingtrust_run_specs.py +8 -8
  38. helm/benchmark/run_specs/experimental_run_specs.py +52 -0
  39. helm/benchmark/run_specs/finance_run_specs.py +78 -1
  40. helm/benchmark/run_specs/safety_run_specs.py +154 -0
  41. helm/benchmark/run_specs/vlm_run_specs.py +92 -21
  42. helm/benchmark/scenarios/anthropic_red_team_scenario.py +71 -0
  43. helm/benchmark/scenarios/banking77_scenario.py +51 -0
  44. helm/benchmark/scenarios/bhasa_scenario.py +1798 -0
  45. helm/benchmark/scenarios/call_center_scenario.py +84 -0
  46. helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +2 -1
  47. helm/benchmark/scenarios/ewok_scenario.py +116 -0
  48. helm/benchmark/scenarios/fin_qa_scenario.py +2 -0
  49. helm/benchmark/scenarios/financebench_scenario.py +53 -0
  50. helm/benchmark/scenarios/harm_bench_scenario.py +59 -0
  51. helm/benchmark/scenarios/scenario.py +1 -1
  52. helm/benchmark/scenarios/simple_safety_tests_scenario.py +33 -0
  53. helm/benchmark/scenarios/test_commonsense_scenario.py +21 -0
  54. helm/benchmark/scenarios/test_ewok_scenario.py +25 -0
  55. helm/benchmark/scenarios/test_financebench_scenario.py +26 -0
  56. helm/benchmark/scenarios/test_gsm_scenario.py +31 -0
  57. helm/benchmark/scenarios/test_legalbench_scenario.py +30 -0
  58. helm/benchmark/scenarios/test_math_scenario.py +2 -8
  59. helm/benchmark/scenarios/test_med_qa_scenario.py +30 -0
  60. helm/benchmark/scenarios/test_mmlu_scenario.py +33 -0
  61. helm/benchmark/scenarios/test_narrativeqa_scenario.py +73 -0
  62. helm/benchmark/scenarios/thai_exam_scenario.py +4 -4
  63. helm/benchmark/scenarios/vision_language/a_okvqa_scenario.py +1 -1
  64. helm/benchmark/scenarios/vision_language/bingo_scenario.py +2 -2
  65. helm/benchmark/scenarios/vision_language/crossmodal_3600_scenario.py +2 -1
  66. helm/benchmark/scenarios/vision_language/exams_v_scenario.py +104 -0
  67. helm/benchmark/scenarios/vision_language/fair_face_scenario.py +136 -0
  68. helm/benchmark/scenarios/vision_language/flickr30k_scenario.py +1 -1
  69. helm/benchmark/scenarios/vision_language/gqa_scenario.py +2 -2
  70. helm/benchmark/scenarios/vision_language/hateful_memes_scenario.py +1 -1
  71. helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/chart2csv_scenario.py +1 -1
  72. helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/latex_scenario.py +3 -3
  73. helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/musicsheet_scenario.py +1 -1
  74. helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/utils_latex.py +31 -39
  75. helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/webpage/driver.py +1 -1
  76. helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/webpage/utils.py +1 -1
  77. helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/webpage_scenario.py +41 -12
  78. helm/benchmark/scenarios/vision_language/math_vista_scenario.py +1 -1
  79. helm/benchmark/scenarios/vision_language/mementos_scenario.py +3 -3
  80. helm/benchmark/scenarios/vision_language/mm_safety_bench_scenario.py +2 -2
  81. helm/benchmark/scenarios/vision_language/mme_scenario.py +21 -18
  82. helm/benchmark/scenarios/vision_language/mmmu_scenario.py +1 -1
  83. helm/benchmark/scenarios/vision_language/pairs_scenario.py +1 -1
  84. helm/benchmark/scenarios/vision_language/pope_scenario.py +2 -1
  85. helm/benchmark/scenarios/vision_language/real_world_qa_scenario.py +57 -0
  86. helm/benchmark/scenarios/vision_language/seed_bench_scenario.py +7 -5
  87. helm/benchmark/scenarios/vision_language/unicorn_scenario.py +2 -2
  88. helm/benchmark/scenarios/vision_language/vibe_eval_scenario.py +6 -3
  89. helm/benchmark/scenarios/vision_language/viz_wiz_scenario.py +1 -1
  90. helm/benchmark/scenarios/vision_language/vqa_scenario.py +3 -1
  91. helm/benchmark/scenarios/xstest_scenario.py +35 -0
  92. helm/benchmark/server.py +1 -6
  93. helm/benchmark/static/schema_air_bench.yaml +750 -750
  94. helm/benchmark/static/schema_bhasa.yaml +709 -0
  95. helm/benchmark/static/schema_call_center.yaml +232 -0
  96. helm/benchmark/static/schema_cleva.yaml +768 -0
  97. helm/benchmark/static/schema_decodingtrust.yaml +444 -0
  98. helm/benchmark/static/schema_ewok.yaml +367 -0
  99. helm/benchmark/static/schema_finance.yaml +55 -9
  100. helm/benchmark/static/{schema_image2structure.yaml → schema_image2struct.yaml} +231 -90
  101. helm/benchmark/static/schema_safety.yaml +247 -0
  102. helm/benchmark/static/schema_tables.yaml +124 -7
  103. helm/benchmark/static/schema_thai.yaml +21 -0
  104. helm/benchmark/static/schema_vhelm.yaml +96 -91
  105. helm/benchmark/static_build/assets/accenture-6f97eeda.png +0 -0
  106. helm/benchmark/static_build/assets/aisingapore-6dfc9acf.png +0 -0
  107. helm/benchmark/static_build/assets/cresta-9e22b983.png +0 -0
  108. helm/benchmark/static_build/assets/cuhk-8c5631e9.png +0 -0
  109. helm/benchmark/static_build/assets/index-05c76bb1.css +1 -0
  110. helm/benchmark/static_build/assets/index-58f97dcd.js +10 -0
  111. helm/benchmark/static_build/assets/scb10x-204bd786.png +0 -0
  112. helm/benchmark/static_build/assets/wellsfargo-a86a6c4a.png +0 -0
  113. helm/benchmark/static_build/index.html +2 -2
  114. helm/benchmark/window_services/test_openai_window_service.py +8 -8
  115. helm/clients/ai21_client.py +71 -1
  116. helm/clients/anthropic_client.py +7 -19
  117. helm/clients/huggingface_client.py +38 -37
  118. helm/clients/nvidia_nim_client.py +35 -0
  119. helm/clients/openai_client.py +2 -3
  120. helm/clients/palmyra_client.py +25 -0
  121. helm/clients/perspective_api_client.py +11 -6
  122. helm/clients/test_client.py +4 -6
  123. helm/clients/vision_language/open_flamingo_client.py +1 -2
  124. helm/clients/vision_language/palmyra_vision_client.py +28 -13
  125. helm/common/images_utils.py +6 -0
  126. helm/common/mongo_key_value_store.py +2 -1
  127. helm/common/request.py +16 -0
  128. helm/config/model_deployments.yaml +315 -332
  129. helm/config/model_metadata.yaml +384 -110
  130. helm/config/tokenizer_configs.yaml +116 -11
  131. helm/proxy/example_queries.py +14 -21
  132. helm/proxy/services/server_service.py +1 -2
  133. helm/proxy/token_counters/test_auto_token_counter.py +2 -2
  134. helm/tokenizers/ai21_tokenizer.py +51 -59
  135. helm/tokenizers/cohere_tokenizer.py +0 -75
  136. helm/tokenizers/huggingface_tokenizer.py +0 -1
  137. helm/tokenizers/test_ai21_tokenizer.py +48 -0
  138. helm/benchmark/static/benchmarking.css +0 -156
  139. helm/benchmark/static/benchmarking.js +0 -1705
  140. helm/benchmark/static/config.js +0 -3
  141. helm/benchmark/static/general.js +0 -122
  142. helm/benchmark/static/images/crfm-logo.png +0 -0
  143. helm/benchmark/static/images/helm-logo-simple.png +0 -0
  144. helm/benchmark/static/images/helm-logo.png +0 -0
  145. helm/benchmark/static/images/language-model-helm.png +0 -0
  146. helm/benchmark/static/images/organizations/ai21.png +0 -0
  147. helm/benchmark/static/images/organizations/anthropic.png +0 -0
  148. helm/benchmark/static/images/organizations/bigscience.png +0 -0
  149. helm/benchmark/static/images/organizations/cohere.png +0 -0
  150. helm/benchmark/static/images/organizations/eleutherai.png +0 -0
  151. helm/benchmark/static/images/organizations/google.png +0 -0
  152. helm/benchmark/static/images/organizations/meta.png +0 -0
  153. helm/benchmark/static/images/organizations/microsoft.png +0 -0
  154. helm/benchmark/static/images/organizations/nvidia.png +0 -0
  155. helm/benchmark/static/images/organizations/openai.png +0 -0
  156. helm/benchmark/static/images/organizations/together.png +0 -0
  157. helm/benchmark/static/images/organizations/tsinghua-keg.png +0 -0
  158. helm/benchmark/static/images/organizations/yandex.png +0 -0
  159. helm/benchmark/static/images/scenarios-by-metrics.png +0 -0
  160. helm/benchmark/static/images/taxonomy-scenarios.png +0 -0
  161. helm/benchmark/static/index.html +0 -68
  162. helm/benchmark/static/info-icon.png +0 -0
  163. helm/benchmark/static/json-urls.js +0 -69
  164. helm/benchmark/static/plot-captions.js +0 -27
  165. helm/benchmark/static/utils.js +0 -285
  166. helm/benchmark/static_build/assets/index-30dbceba.js +0 -10
  167. helm/benchmark/static_build/assets/index-66b02d40.css +0 -1
  168. helm/benchmark/window_services/ai21_window_service.py +0 -247
  169. helm/benchmark/window_services/cohere_window_service.py +0 -101
  170. helm/benchmark/window_services/test_ai21_window_service.py +0 -163
  171. helm/benchmark/window_services/test_cohere_window_service.py +0 -75
  172. helm/benchmark/window_services/test_cohere_window_service_utils.py +0 -8328
  173. helm/benchmark/window_services/test_ice_window_service.py +0 -327
  174. helm/tokenizers/ice_tokenizer.py +0 -30
  175. helm/tokenizers/test_ice_tokenizer.py +0 -57
  176. {crfm_helm-0.5.2.dist-info → crfm_helm-0.5.3.dist-info}/LICENSE +0 -0
  177. {crfm_helm-0.5.2.dist-info → crfm_helm-0.5.3.dist-info}/entry_points.txt +0 -0
  178. {crfm_helm-0.5.2.dist-info → crfm_helm-0.5.3.dist-info}/top_level.txt +0 -0
  179. /helm/benchmark/annotation/{image2structure → image2struct}/__init__.py +0 -0
  180. /helm/benchmark/annotation/{image2structure → image2struct}/image_compiler_annotator.py +0 -0
  181. /helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/__init__.py +0 -0
  182. /helm/benchmark/scenarios/vision_language/{image2structure/image2structure_scenario.py → image2struct/image2struct_scenario.py} +0 -0
  183. /helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/webpage/__init__.py +0 -0
  184. /helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/webpage/jekyll_server.py +0 -0
helm/benchmark/metrics/annotation_metrics.py ADDED
@@ -0,0 +1,108 @@
+ from typing import List
+
+ from helm.benchmark.adaptation.adapter_spec import AdapterSpec
+ from helm.benchmark.adaptation.request_state import RequestState
+ from helm.benchmark.metrics.metric import Metric
+ from helm.benchmark.metrics.metric_name import MetricName
+ from helm.benchmark.metrics.metric_service import MetricService
+ from helm.benchmark.metrics.statistic import Stat
+
+
+ class AnnotationLabelMetric(Metric):
+ """Binary metric for labels produced by annotators.
+
+ Expects the annotation with the given annotator name and key to be a string label.
+
+ For each possible label in the list of possible labels, produces a
+ corresponding stat with a value of 1 or 0 indicating if the actual label
+ in the annoation."""
+
+ def __init__(self, annotator_name: str, key: str, labels: List[str]):
+ super().__init__()
+ self.annotator_name = annotator_name
+ self.key = key
+ self.labels = labels
+
+ def evaluate_generation(
+ self,
+ adapter_spec: AdapterSpec,
+ request_state: RequestState,
+ metric_service: MetricService,
+ eval_cache_path: str,
+ ) -> List[Stat]:
+ assert request_state.annotations
+ annotation_label = request_state.annotations[self.annotator_name][self.key]
+ if annotation_label not in self.labels:
+ raise ValueError(
+ f"Unrecognized annotation label '{annotation_label}' "
+ f"(known labels: {self.labels}) "
+ f"in annotation {request_state.annotations[self.annotator_name]} "
+ f"for instance id {request_state.instance.id}"
+ )
+ stats: List[Stat] = []
+ for label in self.labels:
+ stats.append(
+ Stat(MetricName(f"annotation_{self.annotator_name}_{self.key}_{label}")).add(
+ 1 if label == annotation_label else 0
+ )
+ )
+ return stats
+
+
+ class AnnotationNumericMetric(Metric):
+ """Numeric metric for numbers produced by annotators.
+
+ Expects the annotation with the given annotator name and key to be a number."""
+
+ def __init__(self, annotator_name: str, key: str):
+ super().__init__()
+ self.annotator_name = annotator_name
+ self.key = key
+
+ def evaluate_generation(
+ self,
+ adapter_spec: AdapterSpec,
+ request_state: RequestState,
+ metric_service: MetricService,
+ eval_cache_path: str,
+ ) -> List[Stat]:
+ assert request_state.annotations
+ score = request_state.annotations[self.annotator_name][self.key]
+ return [Stat(MetricName(f"annotation_{self.annotator_name}_{self.key}")).add(score)]
+
+
+ class AnnotationLikertScaleMetric(Metric):
+ """Numeric metric for labels produced by annotators.
+
+ Expects the annotation with the given annotator name and key to be a string label.
+
+ For each possible label in the list of possible labels, produces a
+ corresponding stat with a value of 1 or 0 indicating if the actual label
+ in the annoation."""
+
+ def __init__(self, annotator_name: str, key: str, min_score: int, max_score: int):
+ super().__init__()
+ self.annotator_name = annotator_name
+ self.key = key
+ self.min_score = min_score
+ self.max_score = max_score
+
+ def evaluate_generation(
+ self,
+ adapter_spec: AdapterSpec,
+ request_state: RequestState,
+ metric_service: MetricService,
+ eval_cache_path: str,
+ ) -> List[Stat]:
+ assert request_state.annotations
+ likert_score = request_state.annotations[self.annotator_name][self.key]
+ if likert_score < self.min_score or likert_score > self.max_score:
+ raise ValueError(
+ f"Likert score {likert_score} "
+ f"out of bounds {self.min_score} to {self.max_score} "
+ f"under key {self.key} and annotator {self.annotator_name} "
+ f"in annotation {request_state.annotations[self.annotator_name]} "
+ f"for instance id {request_state.instance.id}"
+ )
+ normalized_score = (likert_score - self.min_score) / (self.max_score - self.min_score)
+ return [Stat(MetricName(f"annotation_{self.annotator_name}_{self.key}")).add(normalized_score)]
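The three metric classes above all read a single value out of request_state.annotations[annotator_name][key] and turn it into Stat objects whose names embed the annotator name and key. A minimal standalone sketch of the naming and scoring logic, in plain Python without the HELM objects (the "toxicity_judge" annotator and its values are hypothetical):

    # Hypothetical annotation dict, shaped like request_state.annotations.
    annotations = {"toxicity_judge": {"verdict": "safe", "likert": 4}}

    # AnnotationLabelMetric: one 0/1 stat per possible label.
    labels = ["safe", "unsafe"]
    actual = annotations["toxicity_judge"]["verdict"]
    label_stats = {f"annotation_toxicity_judge_verdict_{label}": int(label == actual) for label in labels}
    # {'annotation_toxicity_judge_verdict_safe': 1, 'annotation_toxicity_judge_verdict_unsafe': 0}

    # AnnotationLikertScaleMetric: rescale a 1-5 score into [0, 1].
    min_score, max_score = 1, 5
    likert = annotations["toxicity_judge"]["likert"]
    normalized = (likert - min_score) / (max_score - min_score)  # 4 -> 0.75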
helm/benchmark/metrics/bhasa_metrics.py ADDED
@@ -0,0 +1,188 @@
+ import re
+ import string
+ from typing import Callable, Dict, List
+ from collections import Counter
+
+ from pythainlp.tokenize import word_tokenize
+ from sacrebleu.metrics import CHRF
+
+ from helm.benchmark.adaptation.adapter_spec import AdapterSpec
+ from helm.benchmark.adaptation.request_state import RequestState
+ from helm.benchmark.metrics.metric import Metric
+ from helm.benchmark.metrics.metric_name import MetricName
+ from helm.benchmark.metrics.metric_service import MetricService
+ from helm.benchmark.metrics.statistic import Stat
+
+
+ class BhasaMachineTranslationMetric(Metric):
+ """Machine Translation Metrics
+
+ This class computes the following standard machine translation metrics
+
+ 1. chr_f_plus_plus (ChrF++)
+
+ @inproceedings{popovic-2015-chrf,
+ title = "chr{F}: character n-gram {F}-score for automatic {MT} evaluation",
+ author = "Popovi{\'c}, Maja",
+ editor = "Bojar, Ond{\v{r}}ej and
+ Chatterjee, Rajan and
+ Federmann, Christian and
+ Haddow, Barry and
+ Hokamp, Chris and
+ Huck, Matthias and
+ Logacheva, Varvara and
+ Pecina, Pavel",
+ booktitle = "Proceedings of the Tenth Workshop on Statistical Machine Translation",
+ month = sep,
+ year = "2015",
+ address = "Lisbon, Portugal",
+ publisher = "Association for Computational Linguistics",
+ url = "https://aclanthology.org/W15-3049",
+ doi = "10.18653/v1/W15-3049",
+ pages = "392--395",
+ github = "https://github.com/mjpost/sacrebleu",
+ }
+ """
+
+ def __init__(self):
+ self.chrf_scorer = CHRF(word_order=2)
+
+ def chr_f_plus_plus(self, refs: List[str], pred: str) -> Dict[str, float]:
+ metrics: Dict[str, float] = {}
+ metrics["chr_f_plus_plus"] = self.chrf_scorer.sentence_score(pred, refs).score
+ return metrics
+
+ def evaluate_generation(
+ self,
+ adapter_spec: AdapterSpec,
+ request_state: RequestState,
+ metric_service: MetricService,
+ eval_cache_path: str,
+ ) -> List[Stat]:
+ refs: List[str] = [ref.output.text for ref in request_state.instance.references]
+
+ assert request_state.result is not None
+ pred: str = request_state.result.completions[0].text.strip()
+
+ result: List[Stat] = []
+
+ # Compute ChrF++ metrics
+ result.extend(
+ [Stat(MetricName(name)).add(float(val)) for name, val in self.chr_f_plus_plus(refs, pred).items()]
+ )
+
+ return result
+
+
+ class BhasaQAMetric(Metric):
+ """Bhasa QA Metrics
+
+ This class computes the following standard SQuAD v1.1 metrics
+
+ 1. squad_exact_match_score (SQuAD exact match score)
+ 2. squad_f1_score (SQuAD macro-averaged F1 score)
+
+ @inproceedings{rajpurkar-etal-2016-squad,
+ title = "{SQ}u{AD}: 100,000+ Questions for Machine Comprehension of Text",
+ author = "Rajpurkar, Pranav and
+ Zhang, Jian and
+ Lopyrev, Konstantin and
+ Liang, Percy",
+ editor = "Su, Jian and
+ Duh, Kevin and
+ Carreras, Xavier",
+ booktitle = "Proceedings of the 2016 Conference on Empirical Methods in Natural Language Processing",
+ month = nov,
+ year = "2016",
+ address = "Austin, Texas",
+ publisher = "Association for Computational Linguistics",
+ url = "https://aclanthology.org/D16-1264",
+ doi = "10.18653/v1/D16-1264",
+ pages = "2383--2392",
+ }
+ """
+
+ def __init__(self, language: str = "en"):
+ self.language: str = language
+ self.metrics: Dict[str, Callable[[str, str], float]] = {
+ "squad_exact_match_score": self.squad_exact_match_score,
+ "squad_f1_score": self.squad_f1_score,
+ }
+
+ def normalize_answer(self, text: str) -> List[str]:
+ """
+ For Thai, this will split the text using PyThaiNLP's tokenizer.
+ For all other languages, this will:
+ - Lower text
+ - Remove punctuation
+ - Remove extra whitespace
+
+ If the language is English, it will
+ - Remove articles "a", "an", and "the"
+
+ Modifies code from [SQuAD v1.1](https://github.com/allenai/bi-att-flow/blob/master/squad/evaluate-v1.1.py).
+ """
+
+ def remove_articles(text: str) -> str:
+ return re.sub(r"\b(a|an|the)\b", " ", text)
+
+ # This function is implemented to match SQuAD v1.1 behavior
+ def white_space_fix(text: str) -> str:
+ return " ".join(text.split())
+
+ def remove_punc(text: str) -> str:
+ exclude = set(string.punctuation)
+ return "".join(ch for ch in text if ch not in exclude)
+
+ def lower(text: str) -> str:
+ return text.lower()
+
+ normalized_text = remove_punc(lower(text))
+ if self.language == "th":
+ return word_tokenize(normalized_text, engine="newmm")
+ elif self.language == "en":
+ return white_space_fix(remove_articles(normalized_text)).split()
+ else:
+ return white_space_fix(normalized_text).split()
+
+ def squad_f1_score(self, gold: str, pred: str) -> float:
+ prediction_tokens = self.normalize_answer(pred)
+ ground_truth_tokens = self.normalize_answer(gold)
+ common = Counter(prediction_tokens) & Counter(ground_truth_tokens)
+ num_same = sum(common.values())
+ if num_same == 0:
+ return 0
+ precision = 1.0 * num_same / len(prediction_tokens)
+ recall = 1.0 * num_same / len(ground_truth_tokens)
+ f1 = (2 * precision * recall) / (precision + recall)
+ return f1
+
+ def squad_exact_match_score(self, gold: str, pred: str) -> float:
+ return self.normalize_answer(pred) == self.normalize_answer(gold)
+
+ def evaluate_generation(
+ self,
+ adapter_spec: AdapterSpec,
+ request_state: RequestState,
+ metric_service: MetricService,
+ eval_cache_path: str,
+ ) -> List[Stat]:
+
+ stats: List[Stat] = []
+ if len(request_state.instance.references) > 0:
+ golds = [reference for reference in request_state.instance.references if reference.is_correct]
+ assert len(golds) > 0
+
+ assert request_state.result is not None
+ sorted_completions = sorted(request_state.result.completions, key=lambda x: -x.logprob)
+ preds = [completion.text.strip() for completion in sorted_completions]
+
+ for name, metric in self.metrics.items():
+ score_1 = max(metric(gold.output.text.strip(), preds[0]) for gold in golds)
+ metrics = [Stat(MetricName(name)).add(score_1)]
+ if adapter_spec.num_outputs != 1:
+ score_k = max(metric(gold.output.text.strip(), pred) for gold in golds for pred in preds)
+ metrics.append(Stat(MetricName(f"{name}@{adapter_spec.num_outputs}")).add(score_k))
+ stats.extend(metrics)
+
+ return stats
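BhasaQAMetric computes its SQuAD-style scores over normalized token lists. A rough standalone sketch of the English normalization and F1 computation (mirroring, not importing, the code above):

    import re
    import string
    from collections import Counter

    def normalize_answer_en(text: str) -> list:
        # Lowercase, strip punctuation, drop English articles, split on whitespace,
        # following the SQuAD v1.1 convention used by BhasaQAMetric.
        text = "".join(ch for ch in text.lower() if ch not in set(string.punctuation))
        text = re.sub(r"\b(a|an|the)\b", " ", text)
        return text.split()

    def squad_f1(gold: str, pred: str) -> float:
        gold_tokens = normalize_answer_en(gold)
        pred_tokens = normalize_answer_en(pred)
        common = Counter(pred_tokens) & Counter(gold_tokens)
        num_same = sum(common.values())
        if num_same == 0:
            return 0.0
        precision = num_same / len(pred_tokens)
        recall = num_same / len(gold_tokens)
        return 2 * precision * recall / (precision + recall)

    print(squad_f1("The Eiffel Tower", "Eiffel Tower, Paris"))  # 0.8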
helm/benchmark/metrics/bhasa_metrics_specs.py ADDED
@@ -0,0 +1,10 @@
+ from typing import Any, Dict, List
+ from helm.benchmark.metrics.metric import MetricSpec
+
+
+ def get_bhasa_machine_translation_metric_specs() -> List[MetricSpec]:
+ return [MetricSpec(class_name="helm.benchmark.metrics.bhasa_metrics.BhasaMachineTranslationMetric")]
+
+
+ def get_bhasa_qa_metric_specs(args: Dict[str, Any]) -> List[MetricSpec]:
+ return [MetricSpec(class_name="helm.benchmark.metrics.bhasa_metrics.BhasaQAMetric", args=args)]
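These factories are presumably what the new bhasa_run_specs.py uses to attach the metrics to a run spec; the QA variant forwards its args straight into BhasaQAMetric.__init__, so a Thai scenario would pass the language there. A hedged usage sketch:

    from helm.benchmark.metrics.bhasa_metrics_specs import get_bhasa_qa_metric_specs

    # args are passed through MetricSpec to BhasaQAMetric(language=...);
    # "th" selects the PyThaiNLP tokenizer in normalize_answer.
    metric_specs = get_bhasa_qa_metric_specs({"language": "th"})
    print(metric_specs[0].class_name)  # helm.benchmark.metrics.bhasa_metrics.BhasaQAMetric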
helm/benchmark/metrics/code_metrics_helper.py CHANGED
@@ -27,14 +27,24 @@ import signal
  import sys
  import tempfile
  from typing import List, Union, Dict, Optional
+ from types import ModuleType
  from unittest.mock import patch, mock_open

  import numpy as np
- from pyext import RuntimeModule

  from helm.common.hierarchical_logger import hlog


+ class RuntimeModule(ModuleType):
+ """crfm-helm's replacement for pyext.RuntimeModule, since pyext is not supported by Python >=3.11"""
+
+ @staticmethod
+ def from_string(module_name: str, module_doc: str, module_contents: str) -> "RuntimeModule":
+ module = RuntimeModule(module_name, module_doc)
+ exec(module_contents, module.__dict__)
+ return module
+
+
  # === APPS evaluation below ===
  class CodeType(Enum):
  call_based = 0
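The new RuntimeModule class is a drop-in stand-in for pyext.RuntimeModule.from_string, which the APPS code evaluation uses to turn a candidate solution's source text into a module object; pyext does not work on Python 3.11+, so the replacement simply execs the source into a fresh ModuleType. A small self-contained sketch of the same idea (the "solution" module and add function are made up for illustration):

    from types import ModuleType

    class RuntimeModule(ModuleType):
        """Build a module object directly from a source string (pyext replacement)."""

        @staticmethod
        def from_string(module_name: str, module_doc: str, module_contents: str) -> "RuntimeModule":
            module = RuntimeModule(module_name, module_doc)
            exec(module_contents, module.__dict__)  # populate the module namespace
            return module

    # Example: evaluate generated code as if it were an imported module.
    solution = RuntimeModule.from_string("solution", "", "def add(a, b):\n    return a + b\n")
    assert solution.add(2, 3) == 5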
helm/benchmark/metrics/safety_metrics.py ADDED
@@ -0,0 +1,57 @@
+ from typing import List
+
+ from helm.benchmark.adaptation.adapter_spec import AdapterSpec
+ from helm.benchmark.adaptation.request_state import RequestState
+ from helm.benchmark.metrics.basic_metrics import compute_request_state_metrics
+ from helm.benchmark.metrics.efficiency_metrics import EfficiencyMetric
+ from helm.benchmark.metrics.metric import Metric
+ from helm.benchmark.metrics.metric_name import MetricName
+ from helm.benchmark.metrics.metric_service import MetricService
+ from helm.benchmark.metrics.statistic import Stat
+
+
+ class SafetyBasicGenerationMetric(Metric):
+ """Replacement for BasicGenerationMetric for HELM Safety.
+ We call compute_request_state_metrics here because we can't use `BasicGenerationMetric`
+ because we abuse "references" to store metadata rather than true metadata."""
+
+ def __init__(self):
+ super().__init__()
+ self.efficiency_metric = EfficiencyMetric()
+
+ def evaluate_generation(
+ self,
+ adapter_spec: AdapterSpec,
+ request_state: RequestState,
+ metric_service: MetricService,
+ eval_cache_path: str,
+ ) -> List[Stat]:
+ return compute_request_state_metrics(self.efficiency_metric, adapter_spec, request_state, metric_service)
+
+
+ class SafetyScoreMetric(Metric):
+ """Score metrics for HELM Safety."""
+
+ def evaluate_generation(
+ self,
+ adapter_spec: AdapterSpec,
+ request_state: RequestState,
+ metric_service: MetricService,
+ eval_cache_path: str,
+ ) -> List[Stat]:
+ assert request_state.annotations
+ safety_scenarios = ["harm_bench", "anthropic_red_team", "simple_safety_tests", "xstest"]
+ curr_scenario = None
+ for annotation_key in request_state.annotations:
+ if annotation_key in safety_scenarios:
+ curr_scenario = annotation_key
+ if curr_scenario is None:
+ raise ValueError("No safety scenario found in annotations")
+ gpt_score = request_state.annotations[curr_scenario]["gpt_score"]
+ llama_score = request_state.annotations[curr_scenario]["llama_score"]
+ score = (gpt_score + llama_score) / 2
+ return [
+ Stat(MetricName("safety_score")).add(score),
+ Stat(MetricName("safety_gpt_score")).add(gpt_score),
+ Stat(MetricName("safety_llama_score")).add(llama_score),
+ ]
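SafetyScoreMetric expects exactly one of the four safety scenarios to appear as an annotation key and averages the two judge scores found there. A standalone sketch of that aggregation (the annotation values are invented for illustration):

    # Hypothetical annotations dict, shaped as SafetyScoreMetric would see it.
    annotations = {"harm_bench": {"gpt_score": 1.0, "llama_score": 0.5}}

    safety_scenarios = ["harm_bench", "anthropic_red_team", "simple_safety_tests", "xstest"]
    curr_scenario = next((key for key in annotations if key in safety_scenarios), None)
    assert curr_scenario is not None, "No safety scenario found in annotations"

    gpt_score = annotations[curr_scenario]["gpt_score"]
    llama_score = annotations[curr_scenario]["llama_score"]
    stats = {
        "safety_score": (gpt_score + llama_score) / 2,  # 0.75
        "safety_gpt_score": gpt_score,
        "safety_llama_score": llama_score,
    }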
helm/benchmark/metrics/summac/model_summac.py CHANGED
@@ -179,9 +179,9 @@ class SummaCImager:
  model_outputs = self.model(**batch_tokens)

  batch_probs = torch.nn.functional.softmax(model_outputs["logits"], dim=-1)
- batch_evids = batch_probs[:, self.entailment_idx].tolist()
- batch_conts = batch_probs[:, self.contradiction_idx].tolist()
- batch_neuts = batch_probs[:, self.neutral_idx].tolist()
+ batch_evids = batch_probs[:, self.entailment_idx].tolist() # type: ignore
+ batch_conts = batch_probs[:, self.contradiction_idx].tolist() # type: ignore
+ batch_neuts = batch_probs[:, self.neutral_idx].tolist() # type: ignore

  for b, evid, cont, neut in zip(batch, batch_evids, batch_conts, batch_neuts):
  image[0, b["doc_i"], b["gen_i"]] = evid
helm/benchmark/metrics/tokens/test_ai21_token_cost_estimator.py CHANGED
@@ -10,8 +10,8 @@ class TestAI21TokenCostEstimator:

  def test_estimate_tokens(self):
  request = Request(
- model="openai/text-davinci-002",
- model_deployment="openai/text-davinci-002",
+ model="ai21/jamba-instruct",
+ model_deployment="ai21/jamba-instruct",
  prompt="The Center for Research on Foundation Models (CRFM) is "
  "an interdisciplinary initiative born out of the Stanford "
  "Institute for Human-Centered Artificial Intelligence (HAI) "
helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py CHANGED
@@ -37,8 +37,8 @@ class TestOpenAITokenCostEstimator:

  def test_estimate_tokens(self):
  request = Request(
- model="openai/text-davinci-002",
- model_deployment="openai/text-davinci-002",
+ model="openai/davinci-002",
+ model_deployment="openai/davinci-002",
  prompt=TestOpenAITokenCostEstimator.TEST_PROMPT,
  num_completions=3,
  max_tokens=100,
@@ -49,8 +49,8 @@ class TestOpenAITokenCostEstimator:

  def test_estimate_tokens_with_echo_prompt(self):
  request = Request(
- model="openai/text-davinci-002",
- model_deployment="openai/text-davinci-002",
+ model="openai/davinci-002",
+ model_deployment="openai/davinci-002",
  prompt=TestOpenAITokenCostEstimator.TEST_PROMPT,
  echo_prompt=True,
  num_completions=1,
helm/benchmark/metrics/vision_language/image_metrics.py CHANGED
@@ -35,7 +35,7 @@ try:
  from PIL import Image
  import imagehash
  except ModuleNotFoundError as e:
- handle_module_not_found_error(e, suggestions=["image2structure"])
+ handle_module_not_found_error(e, suggestions=["image2struct"])


  def pad(small_image: Image.Image, large_image: Image.Image, axis: int) -> Image.Image:
helm/benchmark/metrics/vision_language/image_utils.py CHANGED
@@ -6,7 +6,7 @@ try:
  import cv2
  from PIL.Image import Image
  except ModuleNotFoundError as e:
- handle_module_not_found_error(e, suggestions=["image2structure"])
+ handle_module_not_found_error(e, suggestions=["image2struct"])


  def preprocess_image(image: Image) -> np.ndarray:
helm/benchmark/model_metadata_registry.py CHANGED
@@ -22,9 +22,6 @@ CHATML_MODEL_TAG: str = "CHATML_MODEL_TAG"
  # OpenAI Chat format
  OPENAI_CHATGPT_MODEL_TAG: str = "OPENAI_CHATGPT_MODEL_TAG"

- # Mistral instruction-following format
- MISTRAL_MODEL_TAG: str = "MISTRAL_MODEL_TAG"
-
  # For Anthropic models
  ANTHROPIC_CLAUDE_1_MODEL_TAG: str = "ANTHROPIC_CLAUDE_1_MODEL_TAG"
  ANTHROPIC_CLAUDE_2_MODEL_TAG: str = "ANTHROPIC_CLAUDE_2_MODEL_TAG"
@@ -69,6 +66,9 @@ OPEN_FLAMINGO_MODEL_TAG: str = "OPEN_FLAMINGO_MODEL_TAG"
  LIMITED_FUNCTIONALITY_VLM_TAG: str = "LIMITED_FUNCTIONALITY_VLM_TAG"
  FULL_FUNCTIONALITY_VLM_TAG: str = "FULL_FUNCTIONALITY_VLM_TAG"

+ # Deprecated models that are no longer available.
+ # These are usually closed API models that have been permanently removed
+ DEPRECATED_MODEL_TAG: str = "DEPRECATED_MODEL_TAG"

  # Frozen is set to false as the model_deployment_registry.py file
  # might populate the deployment_names field.
helm/benchmark/presentation/test_run_entry.py CHANGED
@@ -16,6 +16,7 @@ class TestRunEntry:

  @pytest.mark.parametrize("fname", list_fnames())
  def test_read_all_specs(self, fname: str):
+ pytest.skip("Skipping slow tests")
  run_entries = read_run_entries([fname])
  for entry in run_entries.entries:
  construct_run_specs(parse_object_spec(entry.description))
helm/benchmark/run.py CHANGED
@@ -1,9 +1,11 @@
  import argparse
  from dataclasses import replace
  import os
+ import re
  from typing import List, Optional


+ from helm.benchmark import model_metadata_registry
  from helm.benchmark.presentation.run_entry import RunEntry, read_run_entries
  from helm.common.cache_backend_config import MongoCacheBackendConfig, SqliteCacheBackendConfig
  from helm.common.general import ensure_directory_exists
@@ -314,6 +316,19 @@ def main():
  ensure_directory_exists(args.output_path)
  set_benchmark_output_path(args.output_path)

+ # Validate the --models-to-run flag
+ if args.models_to_run:
+ all_models = set(model_metadata_registry.get_all_models())
+ for model_to_run in args.models_to_run:
+ if model_to_run not in all_models:
+ raise Exception(f"Unknown model '{model_to_run}' passed to --models-to-run")
+ else:
+ model_expander_pattern = re.compile(
+ r"\bmodel=(?:all|text_code|text|code|instruction_following|full_functionality_text|limited_functionality_text)\b" # noqa: E501
+ )
+ if any(model_expander_pattern.search(run_entry.description) for run_entry in run_entries):
+ raise Exception("--models-to-run must be set if the `models=` run expander expands to multiple models")
+
  run_specs = run_entries_to_run_specs(
  run_entries=run_entries,
  max_eval_instances=args.max_eval_instances,
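The new validation in run.py rejects unknown names passed to --models-to-run and, when the flag is absent, refuses run entries whose model= run expander would fan out to many models. A quick standalone check of the same regex against a couple of run-entry descriptions (the descriptions are made up):

    import re

    # Same pattern as in run.py: matches the multi-model values of the model= expander.
    model_expander_pattern = re.compile(
        r"\bmodel=(?:all|text_code|text|code|instruction_following|full_functionality_text|limited_functionality_text)\b"
    )

    # Hypothetical run entry descriptions.
    print(bool(model_expander_pattern.search("mmlu:subject=anatomy,model=text")))           # True -> --models-to-run required
    print(bool(model_expander_pattern.search("mmlu:subject=anatomy,model=openai/gpt-4o")))  # False -> a single concrete model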
helm/benchmark/run_expander.py CHANGED
@@ -10,6 +10,7 @@ from helm.benchmark.model_metadata_registry import (
  get_all_text_models,
  get_model_metadata,
  get_model_names_with_tag,
+ DEPRECATED_MODEL_TAG,
  FULL_FUNCTIONALITY_TEXT_MODEL_TAG,
  LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG,
  ABLATION_MODEL_TAG,
@@ -343,16 +344,6 @@ class AnthropicClaude3RunExpander(RunExpander):
  run_spec,
  adapter_spec=replace(run_spec.adapter_spec, stop_sequences=stop_sequences_with_non_whitespace),
  )
- if run_spec.adapter_spec.method == ADAPT_MULTIPLE_CHOICE_JOINT:
- instructions = "Answer with only a single letter."
- if run_spec.adapter_spec.instructions:
- instructions = f"{instructions}\n\n{run_spec.adapter_spec.instructions}"
- return [
- replace(
- run_spec,
- adapter_spec=replace(run_spec.adapter_spec, instructions=instructions),
- ),
- ]
  return [run_spec]


@@ -610,6 +601,12 @@ class ModelRunExpander(ReplaceValueRunExpander):
  values_dict["ablation"] = models
  else:
  values_dict[family_name] = models
+
+ # For each of the keys above, filter out deprecated models.
+ deprecated_models = set(get_model_names_with_tag(DEPRECATED_MODEL_TAG))
+ for family_name in values_dict.keys():
+ values_dict[family_name] = [model for model in values_dict[family_name] if model not in deprecated_models]
+
  return values_dict


@@ -1402,23 +1399,26 @@ class OutputFormatInstructions(RunExpander):

  name = "output_format_instructions"

+ _SUFFIX_SUFFIX = "_suffix"
+
  def __init__(self, scenario: str):
- self.scenario = scenario
+ if scenario.endswith(OutputFormatInstructions._SUFFIX_SUFFIX):
+ self.scenario = scenario[: -len(OutputFormatInstructions._SUFFIX_SUFFIX)]
+ self.suffix = True
+ else:
+ self.scenario = scenario
+ self.suffix = False

  def expand(self, run_spec: RunSpec) -> List[RunSpec]:
  if run_spec.adapter_spec.method == ADAPT_MULTIPLE_CHOICE_JOINT:
  if self.scenario == "mmlu_only_last_question":
  instructions = "Answer only the last question with only a single letter."
+ elif self.scenario == "mmlu":
+ instructions = "Answer with only a single letter."
+ elif self.scenario == "mcqa":
+ instructions = "Answer with only a single letter."
  else:
  instructions = "Answer with only a single letter."
- if run_spec.adapter_spec.instructions:
- instructions = f"{instructions}\n\n{run_spec.adapter_spec.instructions}"
- return [
- replace(
- run_spec,
- adapter_spec=replace(run_spec.adapter_spec, instructions=instructions),
- ),
- ]
  elif run_spec.adapter_spec.method == ADAPT_GENERATION:
  output_noun = run_spec.adapter_spec.output_prefix.split(":")[0]
  if self.scenario == "narrative_qa":
@@ -1433,27 +1433,53 @@ class OutputFormatInstructions(RunExpander):
  instructions = f"Answer with the {output_noun.lower()}."
  else:
  instructions = "Answer yes or no."
+ elif self.scenario == "legalbench_abercrombie":
+ instructions = "Answer with only 'generic', 'descriptive', 'suggestive', 'arbitrary' or 'fanciful'."
+ elif self.scenario == "legalbench_function_of_decision_section":
+ instructions = "Answer with only 'Facts', 'Procedural History', 'Issue', 'Rule', 'Analysis', 'Conclusion' or 'Decree'." # noqa: E501
+ elif self.scenario == "legalbench_yes_or_no":
+ instructions = "Answer with only 'Yes' or 'No'."
  elif self.scenario == "wmt_14":
  instructions = "Answer with the English translation."
- else:
- raise ValueError(f"Unknown scenario {self.scenario}")
-
- if run_spec.adapter_spec.output_prefix:
+ elif self.scenario == "wmt_14_only_last_sentence":
+ instructions = "Answer with only the English translation for the last sentence."
+ elif self.scenario == "math":
+ instructions = "Wrap the final answer with the \\boxed{} command."
+ elif self.scenario == "numeric_nlg":
+ instructions = "Answer with only description of the last table as a single paragraph on a single line."
+ elif self.scenario == "tab_fact":
  instructions = (
- f"{instructions} Do not include '{run_spec.adapter_spec.output_prefix.strip()}' in your answer."
+ "Answer with only the classification of the last statement, either 'refuted' or 'entailed'."
+ )
+ elif self.scenario == "wikitq":
+ instructions = (
+ "Answer only the last question with a short answer. "
+ "Avoid extra, unnecessary information in the answer."
  )
-
- if run_spec.adapter_spec.instructions:
- instructions = f"{instructions}\n\n{run_spec.adapter_spec.instructions}"
  else:
- instructions = f"{instructions}\n"
+ raise ValueError(f"Unknown scenario {self.scenario}")
+
+ if self.suffix:
  return [
  replace(
  run_spec,
- adapter_spec=replace(run_spec.adapter_spec, instructions=instructions),
+ adapter_spec=replace(
+ run_spec.adapter_spec,
+ global_suffix=f"{run_spec.adapter_spec.global_suffix}\n\n{instructions}",
+ ),
  ),
  ]
- raise ValueError(f"Unknown scenario {self.scenario}")
+
+ if run_spec.adapter_spec.instructions:
+ instructions = f"{instructions}\n\n{run_spec.adapter_spec.instructions}"
+ else:
+ instructions = f"{instructions}\n"
+ return [
+ replace(
+ run_spec,
+ adapter_spec=replace(run_spec.adapter_spec, instructions=instructions),
+ ),
+ ]


  RUN_EXPANDER_SUBCLASSES: List[Type[RunExpander]] = [
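With this change, output_format_instructions=<scenario> prepends the format instruction to the adapter's instructions block as before, while a value ending in "_suffix" (e.g. a hypothetical "wikitq_suffix") instead appends it to global_suffix, i.e. after the prompt body. A rough sketch of the branching, using plain strings in place of AdapterSpec:

    def apply_output_format_instructions(scenario: str, instructions_block: str, global_suffix: str, new_instruction: str):
        # Mirrors OutputFormatInstructions: "_suffix" variants append to global_suffix,
        # everything else prepends to the adapter instructions.
        if scenario.endswith("_suffix"):
            return instructions_block, f"{global_suffix}\n\n{new_instruction}"
        if instructions_block:
            return f"{new_instruction}\n\n{instructions_block}", global_suffix
        return f"{new_instruction}\n", global_suffix

    # Hypothetical usage: the same instruction lands in different places.
    print(apply_output_format_instructions("wikitq", "", "", "Answer only the last question with a short answer."))
    print(apply_output_format_instructions("wikitq_suffix", "", "", "Answer only the last question with a short answer."))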