crfm-helm 0.5.2__py3-none-any.whl → 0.5.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (209)
  1. {crfm_helm-0.5.2.dist-info → crfm_helm-0.5.4.dist-info}/METADATA +81 -112
  2. {crfm_helm-0.5.2.dist-info → crfm_helm-0.5.4.dist-info}/RECORD +165 -155
  3. {crfm_helm-0.5.2.dist-info → crfm_helm-0.5.4.dist-info}/WHEEL +1 -1
  4. helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +12 -5
  5. helm/benchmark/adaptation/adapters/test_generation_adapter.py +12 -12
  6. helm/benchmark/adaptation/adapters/test_language_modeling_adapter.py +8 -8
  7. helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py +77 -9
  8. helm/benchmark/adaptation/common_adapter_specs.py +2 -0
  9. helm/benchmark/annotation/anthropic_red_team_annotator.py +57 -0
  10. helm/benchmark/annotation/call_center_annotator.py +258 -0
  11. helm/benchmark/annotation/financebench_annotator.py +79 -0
  12. helm/benchmark/annotation/harm_bench_annotator.py +55 -0
  13. helm/benchmark/annotation/{image2structure → image2struct}/latex_compiler_annotator.py +2 -2
  14. helm/benchmark/annotation/{image2structure → image2struct}/lilypond_compiler_annotator.py +5 -3
  15. helm/benchmark/annotation/{image2structure → image2struct}/webpage_compiler_annotator.py +5 -5
  16. helm/benchmark/annotation/live_qa_annotator.py +37 -45
  17. helm/benchmark/annotation/medication_qa_annotator.py +36 -44
  18. helm/benchmark/annotation/model_as_judge.py +96 -0
  19. helm/benchmark/annotation/simple_safety_tests_annotator.py +50 -0
  20. helm/benchmark/annotation/xstest_annotator.py +100 -0
  21. helm/benchmark/metrics/annotation_metrics.py +108 -0
  22. helm/benchmark/metrics/bhasa_metrics.py +188 -0
  23. helm/benchmark/metrics/bhasa_metrics_specs.py +10 -0
  24. helm/benchmark/metrics/code_metrics_helper.py +11 -1
  25. helm/benchmark/metrics/safety_metrics.py +79 -0
  26. helm/benchmark/metrics/summac/model_summac.py +3 -3
  27. helm/benchmark/metrics/tokens/test_ai21_token_cost_estimator.py +2 -2
  28. helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py +4 -4
  29. helm/benchmark/metrics/unitxt_metrics.py +17 -3
  30. helm/benchmark/metrics/vision_language/image_metrics.py +7 -3
  31. helm/benchmark/metrics/vision_language/image_utils.py +1 -1
  32. helm/benchmark/model_metadata_registry.py +3 -3
  33. helm/benchmark/presentation/create_plots.py +1 -1
  34. helm/benchmark/presentation/schema.py +3 -0
  35. helm/benchmark/presentation/summarize.py +106 -256
  36. helm/benchmark/presentation/test_run_entry.py +1 -0
  37. helm/benchmark/presentation/test_summarize.py +145 -3
  38. helm/benchmark/run.py +15 -0
  39. helm/benchmark/run_expander.py +83 -30
  40. helm/benchmark/run_specs/bhasa_run_specs.py +652 -0
  41. helm/benchmark/run_specs/call_center_run_specs.py +152 -0
  42. helm/benchmark/run_specs/decodingtrust_run_specs.py +8 -8
  43. helm/benchmark/run_specs/experimental_run_specs.py +52 -0
  44. helm/benchmark/run_specs/finance_run_specs.py +82 -1
  45. helm/benchmark/run_specs/safety_run_specs.py +154 -0
  46. helm/benchmark/run_specs/vlm_run_specs.py +100 -24
  47. helm/benchmark/scenarios/anthropic_red_team_scenario.py +71 -0
  48. helm/benchmark/scenarios/banking77_scenario.py +51 -0
  49. helm/benchmark/scenarios/bhasa_scenario.py +1942 -0
  50. helm/benchmark/scenarios/call_center_scenario.py +84 -0
  51. helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +2 -1
  52. helm/benchmark/scenarios/ewok_scenario.py +116 -0
  53. helm/benchmark/scenarios/fin_qa_scenario.py +2 -0
  54. helm/benchmark/scenarios/financebench_scenario.py +53 -0
  55. helm/benchmark/scenarios/harm_bench_scenario.py +59 -0
  56. helm/benchmark/scenarios/raft_scenario.py +1 -1
  57. helm/benchmark/scenarios/scenario.py +1 -1
  58. helm/benchmark/scenarios/simple_safety_tests_scenario.py +33 -0
  59. helm/benchmark/scenarios/test_commonsense_scenario.py +21 -0
  60. helm/benchmark/scenarios/test_ewok_scenario.py +25 -0
  61. helm/benchmark/scenarios/test_financebench_scenario.py +26 -0
  62. helm/benchmark/scenarios/test_gsm_scenario.py +31 -0
  63. helm/benchmark/scenarios/test_legalbench_scenario.py +30 -0
  64. helm/benchmark/scenarios/test_math_scenario.py +2 -8
  65. helm/benchmark/scenarios/test_med_qa_scenario.py +30 -0
  66. helm/benchmark/scenarios/test_mmlu_scenario.py +33 -0
  67. helm/benchmark/scenarios/test_narrativeqa_scenario.py +73 -0
  68. helm/benchmark/scenarios/thai_exam_scenario.py +4 -4
  69. helm/benchmark/scenarios/vision_language/a_okvqa_scenario.py +1 -1
  70. helm/benchmark/scenarios/vision_language/bingo_scenario.py +2 -2
  71. helm/benchmark/scenarios/vision_language/crossmodal_3600_scenario.py +2 -1
  72. helm/benchmark/scenarios/vision_language/exams_v_scenario.py +104 -0
  73. helm/benchmark/scenarios/vision_language/fair_face_scenario.py +136 -0
  74. helm/benchmark/scenarios/vision_language/flickr30k_scenario.py +1 -1
  75. helm/benchmark/scenarios/vision_language/gqa_scenario.py +2 -2
  76. helm/benchmark/scenarios/vision_language/hateful_memes_scenario.py +1 -1
  77. helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/chart2csv_scenario.py +1 -1
  78. helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/latex_scenario.py +3 -3
  79. helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/musicsheet_scenario.py +1 -1
  80. helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/utils_latex.py +31 -39
  81. helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/webpage/driver.py +1 -1
  82. helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/webpage/utils.py +1 -1
  83. helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/webpage_scenario.py +41 -12
  84. helm/benchmark/scenarios/vision_language/math_vista_scenario.py +1 -1
  85. helm/benchmark/scenarios/vision_language/mementos_scenario.py +3 -3
  86. helm/benchmark/scenarios/vision_language/mm_safety_bench_scenario.py +2 -2
  87. helm/benchmark/scenarios/vision_language/mme_scenario.py +21 -18
  88. helm/benchmark/scenarios/vision_language/mmmu_scenario.py +1 -1
  89. helm/benchmark/scenarios/vision_language/pairs_scenario.py +1 -1
  90. helm/benchmark/scenarios/vision_language/pope_scenario.py +2 -1
  91. helm/benchmark/scenarios/vision_language/real_world_qa_scenario.py +57 -0
  92. helm/benchmark/scenarios/vision_language/seed_bench_scenario.py +7 -5
  93. helm/benchmark/scenarios/vision_language/unicorn_scenario.py +2 -2
  94. helm/benchmark/scenarios/vision_language/vibe_eval_scenario.py +6 -3
  95. helm/benchmark/scenarios/vision_language/viz_wiz_scenario.py +1 -1
  96. helm/benchmark/scenarios/vision_language/vqa_scenario.py +3 -1
  97. helm/benchmark/scenarios/xstest_scenario.py +35 -0
  98. helm/benchmark/server.py +1 -6
  99. helm/benchmark/static/schema_air_bench.yaml +750 -750
  100. helm/benchmark/static/schema_bhasa.yaml +709 -0
  101. helm/benchmark/static/schema_call_center.yaml +232 -0
  102. helm/benchmark/static/schema_cleva.yaml +768 -0
  103. helm/benchmark/static/schema_decodingtrust.yaml +444 -0
  104. helm/benchmark/static/schema_ewok.yaml +367 -0
  105. helm/benchmark/static/schema_finance.yaml +55 -9
  106. helm/benchmark/static/{schema_image2structure.yaml → schema_image2struct.yaml} +231 -90
  107. helm/benchmark/static/schema_legal.yaml +566 -0
  108. helm/benchmark/static/schema_safety.yaml +266 -0
  109. helm/benchmark/static/schema_tables.yaml +149 -8
  110. helm/benchmark/static/schema_thai.yaml +21 -0
  111. helm/benchmark/static/schema_vhelm.yaml +137 -101
  112. helm/benchmark/static_build/assets/accenture-6f97eeda.png +0 -0
  113. helm/benchmark/static_build/assets/aisingapore-6dfc9acf.png +0 -0
  114. helm/benchmark/static_build/assets/cresta-9e22b983.png +0 -0
  115. helm/benchmark/static_build/assets/cuhk-8c5631e9.png +0 -0
  116. helm/benchmark/static_build/assets/index-05c76bb1.css +1 -0
  117. helm/benchmark/static_build/assets/index-3ee38b3d.js +10 -0
  118. helm/benchmark/static_build/assets/scb10x-204bd786.png +0 -0
  119. helm/benchmark/static_build/assets/vhelm-aspects-1437d673.png +0 -0
  120. helm/benchmark/static_build/assets/vhelm-framework-a1ca3f3f.png +0 -0
  121. helm/benchmark/static_build/assets/vhelm-model-8afb7616.png +0 -0
  122. helm/benchmark/static_build/assets/wellsfargo-a86a6c4a.png +0 -0
  123. helm/benchmark/static_build/index.html +2 -2
  124. helm/benchmark/window_services/test_openai_window_service.py +8 -8
  125. helm/benchmark/window_services/tokenizer_service.py +0 -5
  126. helm/clients/ai21_client.py +71 -1
  127. helm/clients/anthropic_client.py +7 -19
  128. helm/clients/huggingface_client.py +38 -37
  129. helm/clients/nvidia_nim_client.py +35 -0
  130. helm/clients/openai_client.py +18 -4
  131. helm/clients/palmyra_client.py +24 -0
  132. helm/clients/perspective_api_client.py +11 -6
  133. helm/clients/test_client.py +4 -6
  134. helm/clients/together_client.py +22 -0
  135. helm/clients/vision_language/open_flamingo_client.py +1 -2
  136. helm/clients/vision_language/palmyra_vision_client.py +28 -13
  137. helm/common/cache.py +8 -30
  138. helm/common/images_utils.py +6 -0
  139. helm/common/key_value_store.py +9 -9
  140. helm/common/mongo_key_value_store.py +5 -4
  141. helm/common/request.py +16 -0
  142. helm/common/test_cache.py +1 -48
  143. helm/common/tokenization_request.py +0 -9
  144. helm/config/model_deployments.yaml +444 -329
  145. helm/config/model_metadata.yaml +513 -111
  146. helm/config/tokenizer_configs.yaml +140 -11
  147. helm/proxy/example_queries.py +14 -21
  148. helm/proxy/server.py +0 -9
  149. helm/proxy/services/remote_service.py +0 -6
  150. helm/proxy/services/server_service.py +6 -20
  151. helm/proxy/services/service.py +0 -6
  152. helm/proxy/token_counters/test_auto_token_counter.py +2 -2
  153. helm/tokenizers/ai21_tokenizer.py +51 -59
  154. helm/tokenizers/cohere_tokenizer.py +0 -75
  155. helm/tokenizers/huggingface_tokenizer.py +0 -1
  156. helm/tokenizers/test_ai21_tokenizer.py +48 -0
  157. helm/benchmark/data_overlap/data_overlap_spec.py +0 -86
  158. helm/benchmark/data_overlap/export_scenario_text.py +0 -119
  159. helm/benchmark/data_overlap/light_scenario.py +0 -60
  160. helm/benchmark/scenarios/vision_language/image2structure/webpage/__init__.py +0 -0
  161. helm/benchmark/static/benchmarking.css +0 -156
  162. helm/benchmark/static/benchmarking.js +0 -1705
  163. helm/benchmark/static/config.js +0 -3
  164. helm/benchmark/static/general.js +0 -122
  165. helm/benchmark/static/images/crfm-logo.png +0 -0
  166. helm/benchmark/static/images/helm-logo-simple.png +0 -0
  167. helm/benchmark/static/images/helm-logo.png +0 -0
  168. helm/benchmark/static/images/language-model-helm.png +0 -0
  169. helm/benchmark/static/images/organizations/ai21.png +0 -0
  170. helm/benchmark/static/images/organizations/anthropic.png +0 -0
  171. helm/benchmark/static/images/organizations/bigscience.png +0 -0
  172. helm/benchmark/static/images/organizations/cohere.png +0 -0
  173. helm/benchmark/static/images/organizations/eleutherai.png +0 -0
  174. helm/benchmark/static/images/organizations/google.png +0 -0
  175. helm/benchmark/static/images/organizations/meta.png +0 -0
  176. helm/benchmark/static/images/organizations/microsoft.png +0 -0
  177. helm/benchmark/static/images/organizations/nvidia.png +0 -0
  178. helm/benchmark/static/images/organizations/openai.png +0 -0
  179. helm/benchmark/static/images/organizations/together.png +0 -0
  180. helm/benchmark/static/images/organizations/tsinghua-keg.png +0 -0
  181. helm/benchmark/static/images/organizations/yandex.png +0 -0
  182. helm/benchmark/static/images/scenarios-by-metrics.png +0 -0
  183. helm/benchmark/static/images/taxonomy-scenarios.png +0 -0
  184. helm/benchmark/static/index.html +0 -68
  185. helm/benchmark/static/info-icon.png +0 -0
  186. helm/benchmark/static/json-urls.js +0 -69
  187. helm/benchmark/static/plot-captions.js +0 -27
  188. helm/benchmark/static/utils.js +0 -285
  189. helm/benchmark/static_build/assets/index-30dbceba.js +0 -10
  190. helm/benchmark/static_build/assets/index-66b02d40.css +0 -1
  191. helm/benchmark/static_build/assets/vhelm-framework-cde7618a.png +0 -0
  192. helm/benchmark/static_build/assets/vhelm-model-6d812526.png +0 -0
  193. helm/benchmark/window_services/ai21_window_service.py +0 -247
  194. helm/benchmark/window_services/cohere_window_service.py +0 -101
  195. helm/benchmark/window_services/test_ai21_window_service.py +0 -163
  196. helm/benchmark/window_services/test_cohere_window_service.py +0 -75
  197. helm/benchmark/window_services/test_cohere_window_service_utils.py +0 -8328
  198. helm/benchmark/window_services/test_ice_window_service.py +0 -327
  199. helm/tokenizers/ice_tokenizer.py +0 -30
  200. helm/tokenizers/test_ice_tokenizer.py +0 -57
  201. {crfm_helm-0.5.2.dist-info → crfm_helm-0.5.4.dist-info}/LICENSE +0 -0
  202. {crfm_helm-0.5.2.dist-info → crfm_helm-0.5.4.dist-info}/entry_points.txt +0 -0
  203. {crfm_helm-0.5.2.dist-info → crfm_helm-0.5.4.dist-info}/top_level.txt +0 -0
  204. /helm/benchmark/annotation/{image2structure → image2struct}/__init__.py +0 -0
  205. /helm/benchmark/annotation/{image2structure → image2struct}/image_compiler_annotator.py +0 -0
  206. /helm/benchmark/{data_overlap → scenarios/vision_language/image2struct}/__init__.py +0 -0
  207. /helm/benchmark/scenarios/vision_language/{image2structure/image2structure_scenario.py → image2struct/image2struct_scenario.py} +0 -0
  208. /helm/benchmark/scenarios/vision_language/{image2structure → image2struct/webpage}/__init__.py +0 -0
  209. /helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/webpage/jekyll_server.py +0 -0
helm/benchmark/presentation/summarize.py CHANGED
@@ -9,12 +9,10 @@ Usage:
  """

  import argparse
- import cattrs
  import os
  import datetime
  import urllib.parse
  import json
- import yaml
  from collections import defaultdict
  from dataclasses import dataclass, replace
  from statistics import mean, median
@@ -35,8 +33,6 @@ from helm.common.codec import from_json
  from helm.common.hierarchical_logger import hlog, htrack, htrack_block
  from helm.benchmark.scenarios.scenario import ScenarioSpec
  from helm.benchmark.adaptation.adapter_spec import AdapterSpec
- from helm.benchmark.data_overlap.data_overlap_spec import DataOverlapStats, GroupOverlapStats
- from helm.benchmark.data_overlap.light_scenario import ScenarioSpecInstanceIds
  from helm.benchmark.metrics.metric_name import MetricName
  from helm.benchmark.metrics.metric import get_all_stats_by_name
  from helm.benchmark.metrics.statistic import Stat, merge_stat
@@ -58,9 +54,6 @@ from helm.benchmark.presentation.run_display import write_run_display_json
  from helm.benchmark.model_metadata_registry import ModelMetadata, get_model_metadata, get_all_models


- OVERLAP_N_COUNT = 13
-
-
  @dataclass(frozen=True)
  class ExecutiveSummary:
      """
@@ -226,17 +219,27 @@ def compute_aggregate_row_win_rates(table: Table, aggregation: str = "mean") ->
      """
      assert aggregation in ["mean", "median"]
      win_rates_per_row: List[List[float]] = [[] for _ in table.rows]
-     for i, header_cell in enumerate(table.header):
+     for column_index, header_cell in enumerate(table.header):
          lower_is_better = header_cell.lower_is_better
          if lower_is_better is None:  # column does not have a meaningful ordering
              continue
-
-         values = [(row[i].value, j) for j, row in enumerate(table.rows) if row[i].value is not None]
-         if len(values) < 2:  # don't rank a single model
+         value_to_count: Dict[float, int] = defaultdict(int)
+         for row in table.rows:
+             value = row[column_index].value
+             if value is not None:
+                 value_to_count[value] += 1
+         value_to_wins: Dict[float, float] = {}
+         acc_count = 0
+         for value, value_count in sorted(value_to_count.items(), reverse=lower_is_better):
+             value_to_wins[value] = acc_count + ((value_count - 1) / 2)
+             acc_count += value_count
+         total_count = acc_count
+         if total_count < 2:
              continue
-         for wins, (v, j) in enumerate(sorted(values, reverse=lower_is_better)):
-             win_rate = wins / (len(values) - 1)  # normalize to [0, 1]
-             win_rates_per_row[j].append(win_rate)
+         for row_index, row in enumerate(table.rows):
+             value = row[column_index].value
+             if value is not None:
+                 win_rates_per_row[row_index].append(value_to_wins[row[column_index].value] / (total_count - 1))

      # Note: the logic up to here is somewhat general as it simply computes win rates across columns for each row.
      # Here, we simply average these win rates but we might want some more involved later (e.g., weighted average).
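The rewritten win-rate hunk above ranks distinct column values rather than individual rows, so tied rows share the average of the ranks they would jointly occupy. A minimal standalone sketch of that arithmetic (independent of HELM's Table/Cell classes; the helper name is illustrative, not part of the package):

from collections import defaultdict
from typing import Dict, List, Optional

def tie_aware_win_rates(column: List[Optional[float]], lower_is_better: bool = False) -> List[Optional[float]]:
    """Win rate per row for one column, where tied values share the mean of their ranks."""
    value_to_count: Dict[float, int] = defaultdict(int)
    for value in column:
        if value is not None:
            value_to_count[value] += 1
    # Each distinct value "wins" against everything ranked below it, plus half of its own ties.
    value_to_wins: Dict[float, float] = {}
    acc_count = 0
    for value, count in sorted(value_to_count.items(), reverse=lower_is_better):
        value_to_wins[value] = acc_count + (count - 1) / 2
        acc_count += count
    if acc_count < 2:  # nothing to rank
        return [None for _ in column]
    return [None if v is None else value_to_wins[v] / (acc_count - 1) for v in column]

# Mirrors test_compute_win_rates_ties below: the three rows tied at 1 occupy ranks 0, 1 and 2,
# so each gets (0 + 1 + 2) / 3 = 1 win, i.e. a win rate of 1 / (5 - 1) = 0.25.
assert tie_aware_win_rates([1, 1, 1, 4, 5]) == [0.25, 0.25, 0.25, 0.75, 1.0]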
@@ -251,7 +254,44 @@ def compute_aggregate_row_win_rates(table: Table, aggregation: str = "mean") ->
      return aggregate_win_rates


- AGGREGATE_WIN_RATE_COLUMN = 1
+ def compute_aggregate_row_means(table: Table) -> List[Optional[float]]:
+     """
+     Computes the aggregate mean of each row across columns.
+     Returns a list of means, one per row, with None if a row was never meaningfully comparable (i.e., all
+     non-null values of the row are in columns we skip).
+     """
+
+     row_means: List[Optional[float]] = []
+
+     # check for all header cells where specified, that lower_is_better is consistent
+     orderings = []
+     for elem in table.header:
+         orderings.append(elem.lower_is_better)
+     if len(set(orderings)) != 1:
+         raise Exception("Cannot mean columns with different values for lower_is_better")
+
+     for row in table.rows:
+         total = 0.0
+         count = 0
+         for cell in row:
+             if cell.value is not None:
+                 total += float(cell.value)
+                 count += 1
+         if count == 0:
+             row_means.append(None)
+         else:
+             row_means.append(total / count)
+
+     return row_means
+
+
+ class AggregationStrategy:
+     # TODO: Convert to StrEnum after upgrading to Python 3.11
+     WIN_RATE = "win_rate"
+     MEAN = "mean"
+
+
+ ALL_AGGREGATION_STRATEGIES = [AggregationStrategy.WIN_RATE, AggregationStrategy.MEAN]


  class Summarizer:
@@ -483,137 +523,6 @@ class Summarizer:
          for suite, run_suite_path in zip(self.suites, self.run_suite_paths):
              self.read_runs_for_suite(suite, run_suite_path)

-     def read_overlap_stats(self):
-         """
-         Load the overlap stats in the run suite path.
-         Concretely:
-         - get group -> scenario_spec information from self.runs
-           run_spec data
-         - read the files in the data_overlap directory in run_suite_path
-           which are scenario_spec -> overlap ids
-         - get aggregate stats for group -> overlap ratio
-         """
-
-         def get_group_to_scenario_specs(run_specs: List[RunSpec]) -> Dict[str, List[ScenarioSpec]]:
-             scenario_specs_to_groups: Dict[ScenarioSpec, List[str]] = {}
-             for run_spec in run_specs:
-                 scenario_spec = run_spec.scenario_spec
-                 groups = run_spec.groups
-                 if (
-                     scenario_spec.class_name
-                     != "helm.benchmark.scenarios.synthetic_efficiency_scenario.SyntheticEfficiencyScenario"
-                 ):
-                     scenario_specs_to_groups[scenario_spec] = groups
-
-             group_to_scenario_specs: Dict[str, List[ScenarioSpec]] = {}
-             for scenario_spec, groups in scenario_specs_to_groups.items():
-                 for group in groups:
-                     if group not in group_to_scenario_specs:
-                         group_to_scenario_specs[group] = []
-                     group_to_scenario_specs[group].append(scenario_spec)
-             return group_to_scenario_specs
-
-         def get_stats_file_metadata(data_overlap_dir: str) -> Dict[str, List[str]]:
-             """
-             Takes the data_overlap_dir as input and returns a dictionary
-             of stats_file_path -> List(model_names)
-
-             Sample input:
-             file_models_mapping:
-               - file_name: file1
-                 model_names:
-                   - model1
-                   - model2
-               - file_name: file2
-                 model_names:
-                   - model2
-                   - model3
-
-             """
-             metadata_file_path: str = os.path.join(data_overlap_dir, "metadata.yaml")
-             if not os.path.exists(metadata_file_path):
-                 return {}
-
-             with open(metadata_file_path, "r") as yaml_file:
-                 data = yaml.safe_load(yaml_file)
-
-             file_metadata: Dict[str, List[str]] = {}
-             for entry in data["file_models_mapping"]:
-                 if "file_name" in entry and "model_names" in entry:
-                     file_path: str = os.path.join(data_overlap_dir, entry["file_name"])
-                     file_metadata[file_path] = entry["model_names"]
-
-             return file_metadata
-
-         # TODO: Delete this after @andyzorigin's project is done.
-         self._model_group_overlap_stats: Dict[Tuple[str, str], GroupOverlapStats] = {}
-
-         data_overlap_dir = os.path.join(self.run_release_path, "data_overlap")
-         if not os.path.isdir(data_overlap_dir):
-             hlog(f"Directory {data_overlap_dir} not found; skipped import of overlap results.")
-             return
-
-         group_to_scenario_specs = get_group_to_scenario_specs([run.run_spec for run in self.runs])
-
-         stats_file_metadata = get_stats_file_metadata(data_overlap_dir)
-
-         for file_path, model_names in stats_file_metadata.items():
-             overlap_stats_jsons = open(file_path, "r").readlines()
-
-             data_overlap_stats_list: List[DataOverlapStats] = []
-             for overlap_stats_json in overlap_stats_jsons:
-                 overlap_stats_dict = json.loads(overlap_stats_json)
-                 data_overlap_stats_list.append(cattrs.structure(overlap_stats_dict, DataOverlapStats))
-
-             scenario_spec_overlap_counts: Dict[ScenarioSpec, Tuple[int, int, int]] = {}
-             for data_overlap_stats in data_overlap_stats_list:
-                 data_overlap_stats_key = data_overlap_stats.data_overlap_stats_key
-                 n = data_overlap_stats_key.overlap_protocol_spec.n
-                 if n == OVERLAP_N_COUNT:
-                     light_scenario_key = data_overlap_stats_key.light_scenario_key
-                     scenario_spec = light_scenario_key.scenario_spec
-                     if scenario_spec in self.scenario_spec_instance_id_dict:
-                         # Get statistics based on the subset of instance_ids that HELM uses for a scenario
-                         instance_ids = self.scenario_spec_instance_id_dict[scenario_spec]
-                         num_instances = len(instance_ids)
-                         num_overlapping_inputs = len(
-                             set(data_overlap_stats.instance_ids_with_overlapping_input) & set(instance_ids)
-                         )
-                         num_overlapping_references = len(
-                             set(data_overlap_stats.instance_ids_with_overlapping_reference) & set(instance_ids)
-                         )
-                         scenario_spec_overlap_counts[scenario_spec] = (
-                             num_instances,
-                             num_overlapping_inputs,
-                             num_overlapping_references,
-                         )
-
-             for group, scenario_specs in group_to_scenario_specs.items():
-                 group_num_instances = 0
-                 group_num_overlapping_inputs = 0
-                 group_num_overlapping_references = 0
-                 for scenario_spec in scenario_specs:
-                     if scenario_spec in scenario_spec_overlap_counts:
-                         (
-                             num_instances,
-                             num_overlapping_inputs,
-                             num_overlapping_references,
-                         ) = scenario_spec_overlap_counts[scenario_spec]
-                         group_num_instances += num_instances
-                         group_num_overlapping_inputs += num_overlapping_inputs
-                         group_num_overlapping_references += num_overlapping_references
-                 if group_num_instances != 0:
-                     group_overlap_stats = GroupOverlapStats(
-                         group=group,
-                         num_instances=group_num_instances,
-                         num_overlapping_inputs=group_num_overlapping_inputs,
-                         num_overlapping_references=group_num_overlapping_references,
-                     )
-                     for model_name in model_names:
-                         # Assume model name will only be associated with single group overlap list for now
-                         # can update to join lists if need arises
-                         self._model_group_overlap_stats[(model_name, group)] = group_overlap_stats

      @htrack(None)
      def check_metrics_defined(self):
          """Check that all the metrics that appear in stats are defined."""
@@ -880,7 +789,7 @@ class Summarizer:
          sort_by_model_order: bool = True,
          sub_split: Optional[str] = None,
          bold_columns: bool = True,
-         add_win_rate: bool = False,
+         aggregation_strategies: List[str] = [],
      ) -> Table:
          """
          Create a table for where each row is an adapter (for which we have a set of runs) and columns are pairs of
@@ -1016,16 +925,6 @@ class Summarizer:

              description = ""

-             group_overlap_stats = None
-             if (model_name, group_name) in self._model_group_overlap_stats:
-                 group_overlap_stats = self._model_group_overlap_stats[(model_name, group_name)]
-
-                 description = (
-                     f"Overlapping input ratio: {group_overlap_stats.overlapping_input_ratio:.3f}\n"
-                     f"Overlapping reference ratio: {group_overlap_stats.overlapping_reference_ratio:.3f}\n"
-                     f"{description}"
-                 )
-
              # HACK: we want to hide stats for the following model-metric combinations:
              # 1. Calibration metrics + AI21/Anthropic
              # 2. MSMARCO metrics + AI21/Anthropic
@@ -1063,21 +962,44 @@ class Summarizer:

          table = Table(title=title, header=header, rows=rows, links=links, name=name)

-         if add_win_rate:
-             # add overall win rate as the second column
-             WIN_RATE_AGGREGATION = "mean"
-             win_rates = compute_aggregate_row_win_rates(table, aggregation=WIN_RATE_AGGREGATION)
-             description = "How many models this model outperform on average (over columns)."
-             table.header.insert(
-                 AGGREGATE_WIN_RATE_COLUMN,
-                 HeaderCell(
-                     f"{WIN_RATE_AGGREGATION.capitalize()} win rate",
-                     description=description,
-                     lower_is_better=False,
-                 ),
-             )
-             for row, win_rate in zip(table.rows, win_rates):
-                 row.insert(AGGREGATE_WIN_RATE_COLUMN, Cell(win_rate))
+         aggregate_header_cells: List[HeaderCell] = []
+         aggregate_row_values: List[List[Optional[float]]] = []
+
+         for strategy in aggregation_strategies:
+             if strategy == AggregationStrategy.WIN_RATE:
+                 WIN_RATE_AGGREGATION = "mean"
+                 win_rates = compute_aggregate_row_win_rates(table, aggregation=WIN_RATE_AGGREGATION)
+                 description = "How many models this model outperforms on average (over columns)."
+                 aggregate_header_cells.append(
+                     HeaderCell(
+                         f"{WIN_RATE_AGGREGATION.capitalize()} win rate",
+                         description=description,
+                         lower_is_better=False,
+                     )
+                 )
+                 aggregate_row_values.append(win_rates)
+             elif strategy == AggregationStrategy.MEAN:
+                 means = compute_aggregate_row_means(table)
+                 description = "An average over columns representing the mean performance."
+                 aggregate_header_cells.append(
+                     HeaderCell(
+                         "Mean performance",
+                         description=description,
+                         lower_is_better=table.header[0].lower_is_better,
+                     )
+                 )
+                 aggregate_row_values.append(means)
+             else:
+                 raise Exception(
+                     f"Unknown aggregation strategy found: {strategy}. Please use one of: {ALL_AGGREGATION_STRATEGIES}"
+                 )
+
+         for i in range(len(aggregate_header_cells)):
+             aggregate_header_cell = aggregate_header_cells[i]
+             aggregate_rows = aggregate_row_values[i]
+             table.header.insert(i + 1, aggregate_header_cell)
+             for row, row_val in zip(table.rows, aggregate_rows):
+                 row.insert(i + 1, Cell(row_val))

          if bold_columns:
              for i, header_cell in enumerate(table.header):
@@ -1125,14 +1047,22 @@ class Summarizer:

          if len(adapter_to_runs) > 0:
              for metric_group in all_metric_groups:
-                 display_name = self.schema.name_to_metric_group[metric_group].get_short_display_name()
+                 metric_group_config = self.schema.name_to_metric_group[metric_group]
+                 display_name = metric_group_config.get_short_display_name()
+                 aggregate_strategies: List[str]
+                 if metric_group_config.aggregation_strategies is not None:
+                     aggregate_strategies = metric_group_config.aggregation_strategies
+                 elif metric_group_config.hide_win_rates:
+                     aggregate_strategies = []
+                 else:
+                     aggregate_strategies = [AggregationStrategy.WIN_RATE]
                  table = self.create_group_table(
                      name=metric_group,
                      title=display_name,
                      adapter_to_runs=adapter_to_runs,
                      columns=[(subgroup, metric_group) for subgroup in subgroups],
                      is_scenario_table=False,
-                     add_win_rate=not self.schema.name_to_metric_group[metric_group].hide_win_rates,
+                     aggregation_strategies=aggregate_strategies,
                  )
                  tables.append(table)
          return tables
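The fallback above keeps the old behavior: a metric group gets the win-rate column unless it sets hide_win_rates or lists explicit strategies. As a hedged illustration of how a group in one of the schema_*.yaml files might opt into a mean column instead, assuming the YAML key mirrors the aggregation_strategies attribute read here (the group and metric names are illustrative, not taken from a shipped schema):

metric_groups:
  - name: accuracy
    display_name: Accuracy
    aggregation_strategies:  # one or both of: win_rate, mean
      - mean
    metrics:
      - name: ${main_name}
        split: ${main_split}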
@@ -1262,72 +1192,6 @@ class Summarizer:

          parallel_map(process, self.runs, parallelism=self.num_threads)

-     def read_scenario_spec_instance_ids(self, num_instances) -> None:
-         """
-         This file checks if there exists a file, scenario_spec_instance_ids.json
-         that it can read the instance_ids associated with scenario_specs.
-
-         It will write the num_instances used in the run as part of the file name
-
-         If it doesn't exist, it will go through all the scenario_state files
-         and parse the instance_ids and output it to the file for future uses
-
-         Only when the scenario_specs for the data overlap script change
-         (or num_instances are different), will this need to be rerun.
-
-         In such cases, do not include the file as part of the data_overlap directory.
-         """
-         self.scenario_spec_instance_id_dict: Dict[ScenarioSpec, List[str]] = dict()
-
-         data_overlap_dir = os.path.join(self.run_release_path, "data_overlap")
-         if not os.path.isdir(data_overlap_dir):
-             hlog(f"Directory {data_overlap_dir} not found; skipped producing instance ids file.")
-             return
-
-         scenario_spec_instance_ids_json = os.path.join(
-             data_overlap_dir, f"scenario_spec_instance_ids_{num_instances}.jsonl"
-         )
-         if not os.path.exists(scenario_spec_instance_ids_json):
-             hlog(f"No scenario spec instance ids json, writing to {scenario_spec_instance_ids_json}")
-             self.write_scenario_spec_instance_ids_json(scenario_spec_instance_ids_json)
-         else:
-             hlog(f"Reading scenario spec instance ids json from {scenario_spec_instance_ids_json}")
-             scenario_spec_instance_ids_jsons = open(scenario_spec_instance_ids_json, "r").readlines()
-
-             for scenario_spec_instance_ids_json in scenario_spec_instance_ids_jsons:
-                 scenario_spec_instance_ids_dict = json.loads(scenario_spec_instance_ids_json)
-                 scenario_spec_instance_ids = cattrs.structure(scenario_spec_instance_ids_dict, ScenarioSpecInstanceIds)
-                 self.scenario_spec_instance_id_dict[scenario_spec_instance_ids.scenario_spec] = (
-                     scenario_spec_instance_ids.instance_ids
-                 )
-
-     def write_scenario_spec_instance_ids_json(self, file_path) -> None:
-         for run in self.runs:
-             run_spec = run.run_spec
-             scenario_spec = run_spec.scenario_spec
-             if scenario_spec in self.scenario_spec_instance_id_dict:
-                 continue
-
-             run_path = run.run_path
-             instances_file_path = os.path.join(run_path, "instances.json")
-             with open(instances_file_path, "r") as f:
-                 raw_instances = json.load(f)
-
-             # Optimization: Don't structure to dataclass, since we only need to read `id`
-             instance_ids = [raw_instance["id"] for raw_instance in raw_instances]
-             self.scenario_spec_instance_id_dict[scenario_spec] = instance_ids
-
-         all_scenario_spec_instance_ids = []
-         for scenario_spec, instance_ids in self.scenario_spec_instance_id_dict.items():
-             scenario_spec_instance_ids = ScenarioSpecInstanceIds(scenario_spec=scenario_spec, instance_ids=instance_ids)
-             all_scenario_spec_instance_ids.append(scenario_spec_instance_ids)
-
-         with open(file_path, "w") as f:
-             f.writelines(
-                 f"{json.dumps(asdict_without_nones(scenario_spec_instance_ids))}\n"
-                 for scenario_spec_instance_ids in all_scenario_spec_instance_ids
-             )

      def symlink_latest(self) -> None:
          # Create a symlink runs/latest -> runs/<name_of_suite>,
          # so runs/latest always points to the latest run suite.
@@ -1339,7 +1203,7 @@ class Summarizer:
              os.unlink(symlink_path)
          os.symlink(os.path.basename(self.run_release_path), symlink_path)

-     def run_pipeline(self, skip_completed: bool, num_instances: int) -> None:
+     def run_pipeline(self, skip_completed: bool) -> None:
          """Run the entire summarization pipeline."""
          self.read_runs()
          self.group_runs()
@@ -1347,14 +1211,6 @@ class Summarizer:

          self.write_run_display_json(skip_completed)

-         # Must happen after summarizer.write_run_display_json()
-         # because it uses instances.json files
-         self.read_scenario_spec_instance_ids(num_instances)
-
-         # Must happen after summarizer.read_scenario_spec_instance_ids()
-         # because it uses self.scenario_spec_instance_id_dict
-         self.read_overlap_stats()
-
          # Must happen after self.read_runs()
          # because it uses self.runs
          self.write_schema()
@@ -1404,12 +1260,6 @@ def main():
          action="store_true",
          help="Skip write_run_display_json() for runs which already have all output display JSON files",
      )
-     parser.add_argument(
-         "-num-instances",
-         type=int,
-         help="Number of instance ids we're using; only for annotating scenario spec instance ids file",
-         default=1000,
-     )
      parser.add_argument(
          "--local-path",
          type=str,
@@ -1461,7 +1311,7 @@ def main():
          num_threads=args.num_threads,
          allow_unknown_models=args.allow_unknown_models,
      )
-     summarizer.run_pipeline(skip_completed=args.skip_completed_run_display_json, num_instances=args.num_instances)
+     summarizer.run_pipeline(skip_completed=args.skip_completed_run_display_json)
      hlog("Done.")

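With the data-overlap plumbing removed, helm-summarize no longer accepts the deleted -num-instances flag, so a typical post-change invocation (suite name illustrative) is simply:

helm-summarize --suite my-suite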
helm/benchmark/presentation/test_run_entry.py CHANGED
@@ -16,6 +16,7 @@ class TestRunEntry:

      @pytest.mark.parametrize("fname", list_fnames())
      def test_read_all_specs(self, fname: str):
+         pytest.skip("Skipping slow tests")
          run_entries = read_run_entries([fname])
          for entry in run_entries.entries:
              construct_run_specs(parse_object_spec(entry.description))
helm/benchmark/presentation/test_summarize.py CHANGED
@@ -1,8 +1,9 @@
  import os
  import tempfile

- from helm.benchmark.presentation.summarize import Summarizer
+ from helm.benchmark.presentation.summarize import Summarizer, compute_aggregate_row_win_rates
  from helm.benchmark.presentation.schema import get_default_schema_path
+ from helm.benchmark.presentation.table import Cell, HeaderCell, Table
  from helm.common.general import ensure_directory_exists


@@ -19,7 +20,7 @@ def test_summarize_suite():
              num_threads=4,
              allow_unknown_models=True,
          )
-         summarizer.run_pipeline(skip_completed=True, num_instances=1000)
+         summarizer.run_pipeline(skip_completed=True)
          assert os.path.isfile(os.path.join(output_path, "runs", "test_suite", "groups.json"))

@@ -37,5 +38,146 @@ def test_summarize_release():
              num_threads=4,
              allow_unknown_models=True,
          )
-         summarizer.run_pipeline(skip_completed=True, num_instances=1000)
+         summarizer.run_pipeline(skip_completed=True)
          assert os.path.isfile(os.path.join(output_path, "releases", "test_release", "groups.json"))
+
+
+ def test_compute_win_rates_one_scenario():
+     header = [
+         HeaderCell(value="Model"),
+         HeaderCell(value="Scenario A", lower_is_better=False),
+     ]
+     values = [
+         ["Model A", 1],
+         ["Model B", 2],
+         ["Model C", 3],
+         ["Model D", 4],
+         ["Model E", 5],
+     ]
+     rows = [[Cell(value) for value in row_values] for row_values in values]
+     table = Table(title="Test Table", header=header, rows=rows)
+     assert compute_aggregate_row_win_rates(table) == [0, 0.25, 0.5, 0.75, 1]
+
+
+ def test_compute_win_rates_two_scenarios():
+     header = [
+         HeaderCell(value="Model"),
+         HeaderCell(value="Scenario A", lower_is_better=False),
+         HeaderCell(value="Scenario B", lower_is_better=False),
+     ]
+     values = [
+         ["Model A", 1, 3],
+         ["Model B", 2, 1],
+         ["Model C", 3, 2],
+         ["Model D", 4, 5],
+         ["Model E", 5, 4],
+     ]
+     rows = [[Cell(value) for value in row_values] for row_values in values]
+     table = Table(title="Test Table", header=header, rows=rows)
+     assert compute_aggregate_row_win_rates(table) == [0.25, 0.125, 0.375, 0.875, 0.875]
+
+
+ def test_compute_win_rates_incomplete_values():
+     header = [
+         HeaderCell(value="Model"),
+         HeaderCell(value="Scenario A", lower_is_better=False),
+         HeaderCell(value="Scenario B", lower_is_better=False),
+     ]
+     values = [
+         ["Model A", 1, 3],
+         ["Model B", 2, 1],
+         ["Model C", 3, None],
+         ["Model D", 4, None],
+         ["Model E", 5, None],
+     ]
+     rows = [[Cell(value) for value in row_values] for row_values in values]
+     table = Table(title="Test Table", header=header, rows=rows)
+     assert compute_aggregate_row_win_rates(table) == [0.5, 0.125, 0.5, 0.75, 1]
+
+
+ def test_compute_win_rates_ignore_nones():
+     header = [
+         HeaderCell(value="Model"),
+         HeaderCell(value="Scenario A", lower_is_better=False),
+         HeaderCell(value="Scenario B", lower_is_better=False),
+         HeaderCell(value="Scenario C", lower_is_better=False),
+     ]
+     values = [
+         ["Model A", 1, None, None],
+         ["Model B", 2, None, 1],
+         ["Model C", 3, None, None],
+         ["Model D", 4, None, None],
+         ["Model E", 5, None, None],
+     ]
+     rows = [[Cell(value) for value in row_values] for row_values in values]
+     table = Table(title="Test Table", header=header, rows=rows)
+     assert compute_aggregate_row_win_rates(table) == [0, 0.25, 0.5, 0.75, 1]
+
+
+ def test_compute_win_rates_ignore_unset_lower_is_better():
+     header = [
+         HeaderCell(value="Model"),
+         HeaderCell(value="Scenario A", lower_is_better=False),
+         HeaderCell(value="Scenario B"),
+     ]
+     values = [
+         ["Model A", 1, 3],
+         ["Model B", 2, 1],
+         ["Model C", 3, 2],
+         ["Model D", 4, 5],
+         ["Model E", 5, 4],
+     ]
+     rows = [[Cell(value) for value in row_values] for row_values in values]
+     table = Table(title="Test Table", header=header, rows=rows)
+     assert compute_aggregate_row_win_rates(table) == [0, 0.25, 0.5, 0.75, 1]
+
+
+ def test_compute_win_rates_no_win_rate():
+     header = [
+         HeaderCell(value="Model"),
+         HeaderCell(value="Scenario A", lower_is_better=False),
+     ]
+     values = [
+         ["Model A", None],
+         ["Model B", None],
+         ["Model C", None],
+         ["Model D", None],
+         ["Model E", None],
+     ]
+     rows = [[Cell(value) for value in row_values] for row_values in values]
+     table = Table(title="Test Table", header=header, rows=rows)
+     assert compute_aggregate_row_win_rates(table) == [None, None, None, None, None]
+
+
+ def test_compute_win_rates_ties():
+     header = [
+         HeaderCell(value="Model"),
+         HeaderCell(value="Scenario A", lower_is_better=False),
+     ]
+     values = [
+         ["Model A", 1],
+         ["Model B", 1],
+         ["Model C", 1],
+         ["Model D", 4],
+         ["Model E", 5],
+     ]
+     rows = [[Cell(value) for value in row_values] for row_values in values]
+     table = Table(title="Test Table", header=header, rows=rows)
+     assert compute_aggregate_row_win_rates(table) == [0.25, 0.25, 0.25, 0.75, 1.0]
+
+
+ def test_compute_win_rates_lower_is_better():
+     header = [
+         HeaderCell(value="Model"),
+         HeaderCell(value="Scenario A", lower_is_better=True),
+     ]
+     values = [
+         ["Model A", 1],
+         ["Model B", 2],
+         ["Model C", 3],
+         ["Model D", 4],
+         ["Model E", 5],
+     ]
+     rows = [[Cell(value) for value in row_values] for row_values in values]
+     table = Table(title="Test Table", header=header, rows=rows)
+     assert compute_aggregate_row_win_rates(table) == [1, 0.75, 0.5, 0.25, 0]
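The new tests exercise compute_aggregate_row_win_rates directly and need no run outputs, so from a source checkout with the dev dependencies installed they can be run in isolation, e.g.:

pytest helm/benchmark/presentation/test_summarize.py -k compute_win_rates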
helm/benchmark/run.py CHANGED
@@ -1,9 +1,11 @@
  import argparse
  from dataclasses import replace
  import os
+ import re
  from typing import List, Optional


+ from helm.benchmark import model_metadata_registry
  from helm.benchmark.presentation.run_entry import RunEntry, read_run_entries
  from helm.common.cache_backend_config import MongoCacheBackendConfig, SqliteCacheBackendConfig
  from helm.common.general import ensure_directory_exists
@@ -314,6 +316,19 @@ def main():
      ensure_directory_exists(args.output_path)
      set_benchmark_output_path(args.output_path)

+     # Validate the --models-to-run flag
+     if args.models_to_run:
+         all_models = set(model_metadata_registry.get_all_models())
+         for model_to_run in args.models_to_run:
+             if model_to_run not in all_models:
+                 raise Exception(f"Unknown model '{model_to_run}' passed to --models-to-run")
+     else:
+         model_expander_pattern = re.compile(
+             r"\bmodel=(?:all|text_code|text|code|instruction_following|full_functionality_text|limited_functionality_text)\b"  # noqa: E501
+         )
+         if any(model_expander_pattern.search(run_entry.description) for run_entry in run_entries):
+             raise Exception("--models-to-run must be set if the `models=` run expander expands to multiple models")
+
      run_specs = run_entries_to_run_specs(
          run_entries=run_entries,
          max_eval_instances=args.max_eval_instances,
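In effect, a run entry that uses the model= expander over a model set must now be paired with an explicit --models-to-run list whose entries appear in model_metadata.yaml. A hedged example invocation (the run entry, model name, and suite name are illustrative, not prescribed by this release):

helm-run --run-entries mmlu:subject=anatomy,model=text --models-to-run openai/gpt2 --suite my-suite --max-eval-instances 10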