crfm-helm 0.5.2__py3-none-any.whl → 0.5.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of crfm-helm might be problematic.
- {crfm_helm-0.5.2.dist-info → crfm_helm-0.5.4.dist-info}/METADATA +81 -112
- {crfm_helm-0.5.2.dist-info → crfm_helm-0.5.4.dist-info}/RECORD +165 -155
- {crfm_helm-0.5.2.dist-info → crfm_helm-0.5.4.dist-info}/WHEEL +1 -1
- helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +12 -5
- helm/benchmark/adaptation/adapters/test_generation_adapter.py +12 -12
- helm/benchmark/adaptation/adapters/test_language_modeling_adapter.py +8 -8
- helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py +77 -9
- helm/benchmark/adaptation/common_adapter_specs.py +2 -0
- helm/benchmark/annotation/anthropic_red_team_annotator.py +57 -0
- helm/benchmark/annotation/call_center_annotator.py +258 -0
- helm/benchmark/annotation/financebench_annotator.py +79 -0
- helm/benchmark/annotation/harm_bench_annotator.py +55 -0
- helm/benchmark/annotation/{image2structure → image2struct}/latex_compiler_annotator.py +2 -2
- helm/benchmark/annotation/{image2structure → image2struct}/lilypond_compiler_annotator.py +5 -3
- helm/benchmark/annotation/{image2structure → image2struct}/webpage_compiler_annotator.py +5 -5
- helm/benchmark/annotation/live_qa_annotator.py +37 -45
- helm/benchmark/annotation/medication_qa_annotator.py +36 -44
- helm/benchmark/annotation/model_as_judge.py +96 -0
- helm/benchmark/annotation/simple_safety_tests_annotator.py +50 -0
- helm/benchmark/annotation/xstest_annotator.py +100 -0
- helm/benchmark/metrics/annotation_metrics.py +108 -0
- helm/benchmark/metrics/bhasa_metrics.py +188 -0
- helm/benchmark/metrics/bhasa_metrics_specs.py +10 -0
- helm/benchmark/metrics/code_metrics_helper.py +11 -1
- helm/benchmark/metrics/safety_metrics.py +79 -0
- helm/benchmark/metrics/summac/model_summac.py +3 -3
- helm/benchmark/metrics/tokens/test_ai21_token_cost_estimator.py +2 -2
- helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py +4 -4
- helm/benchmark/metrics/unitxt_metrics.py +17 -3
- helm/benchmark/metrics/vision_language/image_metrics.py +7 -3
- helm/benchmark/metrics/vision_language/image_utils.py +1 -1
- helm/benchmark/model_metadata_registry.py +3 -3
- helm/benchmark/presentation/create_plots.py +1 -1
- helm/benchmark/presentation/schema.py +3 -0
- helm/benchmark/presentation/summarize.py +106 -256
- helm/benchmark/presentation/test_run_entry.py +1 -0
- helm/benchmark/presentation/test_summarize.py +145 -3
- helm/benchmark/run.py +15 -0
- helm/benchmark/run_expander.py +83 -30
- helm/benchmark/run_specs/bhasa_run_specs.py +652 -0
- helm/benchmark/run_specs/call_center_run_specs.py +152 -0
- helm/benchmark/run_specs/decodingtrust_run_specs.py +8 -8
- helm/benchmark/run_specs/experimental_run_specs.py +52 -0
- helm/benchmark/run_specs/finance_run_specs.py +82 -1
- helm/benchmark/run_specs/safety_run_specs.py +154 -0
- helm/benchmark/run_specs/vlm_run_specs.py +100 -24
- helm/benchmark/scenarios/anthropic_red_team_scenario.py +71 -0
- helm/benchmark/scenarios/banking77_scenario.py +51 -0
- helm/benchmark/scenarios/bhasa_scenario.py +1942 -0
- helm/benchmark/scenarios/call_center_scenario.py +84 -0
- helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +2 -1
- helm/benchmark/scenarios/ewok_scenario.py +116 -0
- helm/benchmark/scenarios/fin_qa_scenario.py +2 -0
- helm/benchmark/scenarios/financebench_scenario.py +53 -0
- helm/benchmark/scenarios/harm_bench_scenario.py +59 -0
- helm/benchmark/scenarios/raft_scenario.py +1 -1
- helm/benchmark/scenarios/scenario.py +1 -1
- helm/benchmark/scenarios/simple_safety_tests_scenario.py +33 -0
- helm/benchmark/scenarios/test_commonsense_scenario.py +21 -0
- helm/benchmark/scenarios/test_ewok_scenario.py +25 -0
- helm/benchmark/scenarios/test_financebench_scenario.py +26 -0
- helm/benchmark/scenarios/test_gsm_scenario.py +31 -0
- helm/benchmark/scenarios/test_legalbench_scenario.py +30 -0
- helm/benchmark/scenarios/test_math_scenario.py +2 -8
- helm/benchmark/scenarios/test_med_qa_scenario.py +30 -0
- helm/benchmark/scenarios/test_mmlu_scenario.py +33 -0
- helm/benchmark/scenarios/test_narrativeqa_scenario.py +73 -0
- helm/benchmark/scenarios/thai_exam_scenario.py +4 -4
- helm/benchmark/scenarios/vision_language/a_okvqa_scenario.py +1 -1
- helm/benchmark/scenarios/vision_language/bingo_scenario.py +2 -2
- helm/benchmark/scenarios/vision_language/crossmodal_3600_scenario.py +2 -1
- helm/benchmark/scenarios/vision_language/exams_v_scenario.py +104 -0
- helm/benchmark/scenarios/vision_language/fair_face_scenario.py +136 -0
- helm/benchmark/scenarios/vision_language/flickr30k_scenario.py +1 -1
- helm/benchmark/scenarios/vision_language/gqa_scenario.py +2 -2
- helm/benchmark/scenarios/vision_language/hateful_memes_scenario.py +1 -1
- helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/chart2csv_scenario.py +1 -1
- helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/latex_scenario.py +3 -3
- helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/musicsheet_scenario.py +1 -1
- helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/utils_latex.py +31 -39
- helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/webpage/driver.py +1 -1
- helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/webpage/utils.py +1 -1
- helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/webpage_scenario.py +41 -12
- helm/benchmark/scenarios/vision_language/math_vista_scenario.py +1 -1
- helm/benchmark/scenarios/vision_language/mementos_scenario.py +3 -3
- helm/benchmark/scenarios/vision_language/mm_safety_bench_scenario.py +2 -2
- helm/benchmark/scenarios/vision_language/mme_scenario.py +21 -18
- helm/benchmark/scenarios/vision_language/mmmu_scenario.py +1 -1
- helm/benchmark/scenarios/vision_language/pairs_scenario.py +1 -1
- helm/benchmark/scenarios/vision_language/pope_scenario.py +2 -1
- helm/benchmark/scenarios/vision_language/real_world_qa_scenario.py +57 -0
- helm/benchmark/scenarios/vision_language/seed_bench_scenario.py +7 -5
- helm/benchmark/scenarios/vision_language/unicorn_scenario.py +2 -2
- helm/benchmark/scenarios/vision_language/vibe_eval_scenario.py +6 -3
- helm/benchmark/scenarios/vision_language/viz_wiz_scenario.py +1 -1
- helm/benchmark/scenarios/vision_language/vqa_scenario.py +3 -1
- helm/benchmark/scenarios/xstest_scenario.py +35 -0
- helm/benchmark/server.py +1 -6
- helm/benchmark/static/schema_air_bench.yaml +750 -750
- helm/benchmark/static/schema_bhasa.yaml +709 -0
- helm/benchmark/static/schema_call_center.yaml +232 -0
- helm/benchmark/static/schema_cleva.yaml +768 -0
- helm/benchmark/static/schema_decodingtrust.yaml +444 -0
- helm/benchmark/static/schema_ewok.yaml +367 -0
- helm/benchmark/static/schema_finance.yaml +55 -9
- helm/benchmark/static/{schema_image2structure.yaml → schema_image2struct.yaml} +231 -90
- helm/benchmark/static/schema_legal.yaml +566 -0
- helm/benchmark/static/schema_safety.yaml +266 -0
- helm/benchmark/static/schema_tables.yaml +149 -8
- helm/benchmark/static/schema_thai.yaml +21 -0
- helm/benchmark/static/schema_vhelm.yaml +137 -101
- helm/benchmark/static_build/assets/accenture-6f97eeda.png +0 -0
- helm/benchmark/static_build/assets/aisingapore-6dfc9acf.png +0 -0
- helm/benchmark/static_build/assets/cresta-9e22b983.png +0 -0
- helm/benchmark/static_build/assets/cuhk-8c5631e9.png +0 -0
- helm/benchmark/static_build/assets/index-05c76bb1.css +1 -0
- helm/benchmark/static_build/assets/index-3ee38b3d.js +10 -0
- helm/benchmark/static_build/assets/scb10x-204bd786.png +0 -0
- helm/benchmark/static_build/assets/vhelm-aspects-1437d673.png +0 -0
- helm/benchmark/static_build/assets/vhelm-framework-a1ca3f3f.png +0 -0
- helm/benchmark/static_build/assets/vhelm-model-8afb7616.png +0 -0
- helm/benchmark/static_build/assets/wellsfargo-a86a6c4a.png +0 -0
- helm/benchmark/static_build/index.html +2 -2
- helm/benchmark/window_services/test_openai_window_service.py +8 -8
- helm/benchmark/window_services/tokenizer_service.py +0 -5
- helm/clients/ai21_client.py +71 -1
- helm/clients/anthropic_client.py +7 -19
- helm/clients/huggingface_client.py +38 -37
- helm/clients/nvidia_nim_client.py +35 -0
- helm/clients/openai_client.py +18 -4
- helm/clients/palmyra_client.py +24 -0
- helm/clients/perspective_api_client.py +11 -6
- helm/clients/test_client.py +4 -6
- helm/clients/together_client.py +22 -0
- helm/clients/vision_language/open_flamingo_client.py +1 -2
- helm/clients/vision_language/palmyra_vision_client.py +28 -13
- helm/common/cache.py +8 -30
- helm/common/images_utils.py +6 -0
- helm/common/key_value_store.py +9 -9
- helm/common/mongo_key_value_store.py +5 -4
- helm/common/request.py +16 -0
- helm/common/test_cache.py +1 -48
- helm/common/tokenization_request.py +0 -9
- helm/config/model_deployments.yaml +444 -329
- helm/config/model_metadata.yaml +513 -111
- helm/config/tokenizer_configs.yaml +140 -11
- helm/proxy/example_queries.py +14 -21
- helm/proxy/server.py +0 -9
- helm/proxy/services/remote_service.py +0 -6
- helm/proxy/services/server_service.py +6 -20
- helm/proxy/services/service.py +0 -6
- helm/proxy/token_counters/test_auto_token_counter.py +2 -2
- helm/tokenizers/ai21_tokenizer.py +51 -59
- helm/tokenizers/cohere_tokenizer.py +0 -75
- helm/tokenizers/huggingface_tokenizer.py +0 -1
- helm/tokenizers/test_ai21_tokenizer.py +48 -0
- helm/benchmark/data_overlap/data_overlap_spec.py +0 -86
- helm/benchmark/data_overlap/export_scenario_text.py +0 -119
- helm/benchmark/data_overlap/light_scenario.py +0 -60
- helm/benchmark/scenarios/vision_language/image2structure/webpage/__init__.py +0 -0
- helm/benchmark/static/benchmarking.css +0 -156
- helm/benchmark/static/benchmarking.js +0 -1705
- helm/benchmark/static/config.js +0 -3
- helm/benchmark/static/general.js +0 -122
- helm/benchmark/static/images/crfm-logo.png +0 -0
- helm/benchmark/static/images/helm-logo-simple.png +0 -0
- helm/benchmark/static/images/helm-logo.png +0 -0
- helm/benchmark/static/images/language-model-helm.png +0 -0
- helm/benchmark/static/images/organizations/ai21.png +0 -0
- helm/benchmark/static/images/organizations/anthropic.png +0 -0
- helm/benchmark/static/images/organizations/bigscience.png +0 -0
- helm/benchmark/static/images/organizations/cohere.png +0 -0
- helm/benchmark/static/images/organizations/eleutherai.png +0 -0
- helm/benchmark/static/images/organizations/google.png +0 -0
- helm/benchmark/static/images/organizations/meta.png +0 -0
- helm/benchmark/static/images/organizations/microsoft.png +0 -0
- helm/benchmark/static/images/organizations/nvidia.png +0 -0
- helm/benchmark/static/images/organizations/openai.png +0 -0
- helm/benchmark/static/images/organizations/together.png +0 -0
- helm/benchmark/static/images/organizations/tsinghua-keg.png +0 -0
- helm/benchmark/static/images/organizations/yandex.png +0 -0
- helm/benchmark/static/images/scenarios-by-metrics.png +0 -0
- helm/benchmark/static/images/taxonomy-scenarios.png +0 -0
- helm/benchmark/static/index.html +0 -68
- helm/benchmark/static/info-icon.png +0 -0
- helm/benchmark/static/json-urls.js +0 -69
- helm/benchmark/static/plot-captions.js +0 -27
- helm/benchmark/static/utils.js +0 -285
- helm/benchmark/static_build/assets/index-30dbceba.js +0 -10
- helm/benchmark/static_build/assets/index-66b02d40.css +0 -1
- helm/benchmark/static_build/assets/vhelm-framework-cde7618a.png +0 -0
- helm/benchmark/static_build/assets/vhelm-model-6d812526.png +0 -0
- helm/benchmark/window_services/ai21_window_service.py +0 -247
- helm/benchmark/window_services/cohere_window_service.py +0 -101
- helm/benchmark/window_services/test_ai21_window_service.py +0 -163
- helm/benchmark/window_services/test_cohere_window_service.py +0 -75
- helm/benchmark/window_services/test_cohere_window_service_utils.py +0 -8328
- helm/benchmark/window_services/test_ice_window_service.py +0 -327
- helm/tokenizers/ice_tokenizer.py +0 -30
- helm/tokenizers/test_ice_tokenizer.py +0 -57
- {crfm_helm-0.5.2.dist-info → crfm_helm-0.5.4.dist-info}/LICENSE +0 -0
- {crfm_helm-0.5.2.dist-info → crfm_helm-0.5.4.dist-info}/entry_points.txt +0 -0
- {crfm_helm-0.5.2.dist-info → crfm_helm-0.5.4.dist-info}/top_level.txt +0 -0
- /helm/benchmark/annotation/{image2structure → image2struct}/__init__.py +0 -0
- /helm/benchmark/annotation/{image2structure → image2struct}/image_compiler_annotator.py +0 -0
- /helm/benchmark/{data_overlap → scenarios/vision_language/image2struct}/__init__.py +0 -0
- /helm/benchmark/scenarios/vision_language/{image2structure/image2structure_scenario.py → image2struct/image2struct_scenario.py} +0 -0
- /helm/benchmark/scenarios/vision_language/{image2structure → image2struct/webpage}/__init__.py +0 -0
- /helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/webpage/jekyll_server.py +0 -0
helm/benchmark/presentation/summarize.py CHANGED
@@ -9,12 +9,10 @@ Usage:
 """
 
 import argparse
-import cattrs
 import os
 import datetime
 import urllib.parse
 import json
-import yaml
 from collections import defaultdict
 from dataclasses import dataclass, replace
 from statistics import mean, median
@@ -35,8 +33,6 @@ from helm.common.codec import from_json
 from helm.common.hierarchical_logger import hlog, htrack, htrack_block
 from helm.benchmark.scenarios.scenario import ScenarioSpec
 from helm.benchmark.adaptation.adapter_spec import AdapterSpec
-from helm.benchmark.data_overlap.data_overlap_spec import DataOverlapStats, GroupOverlapStats
-from helm.benchmark.data_overlap.light_scenario import ScenarioSpecInstanceIds
 from helm.benchmark.metrics.metric_name import MetricName
 from helm.benchmark.metrics.metric import get_all_stats_by_name
 from helm.benchmark.metrics.statistic import Stat, merge_stat
@@ -58,9 +54,6 @@ from helm.benchmark.presentation.run_display import write_run_display_json
 from helm.benchmark.model_metadata_registry import ModelMetadata, get_model_metadata, get_all_models
 
 
-OVERLAP_N_COUNT = 13
-
-
 @dataclass(frozen=True)
 class ExecutiveSummary:
     """
@@ -226,17 +219,27 @@ def compute_aggregate_row_win_rates(table: Table, aggregation: str = "mean") ->
     """
     assert aggregation in ["mean", "median"]
     win_rates_per_row: List[List[float]] = [[] for _ in table.rows]
-    for …
+    for column_index, header_cell in enumerate(table.header):
         lower_is_better = header_cell.lower_is_better
         if lower_is_better is None:  # column does not have a meaningful ordering
             continue
- …
- …
- …
+        value_to_count: Dict[float, int] = defaultdict(int)
+        for row in table.rows:
+            value = row[column_index].value
+            if value is not None:
+                value_to_count[value] += 1
+        value_to_wins: Dict[float, float] = {}
+        acc_count = 0
+        for value, value_count in sorted(value_to_count.items(), reverse=lower_is_better):
+            value_to_wins[value] = acc_count + ((value_count - 1) / 2)
+            acc_count += value_count
+        total_count = acc_count
+        if total_count < 2:
             continue
-        for …
- …
- …
+        for row_index, row in enumerate(table.rows):
+            value = row[column_index].value
+            if value is not None:
+                win_rates_per_row[row_index].append(value_to_wins[row[column_index].value] / (total_count - 1))
 
     # Note: the logic up to here is somewhat general as it simply computes win rates across columns for each row.
     # Here, we simply average these win rates but we might want some more involved later (e.g., weighted average).
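The tie handling above gives each value in a tied group half credit against its peers before normalizing by (total - 1). The same per-column formula as a standalone sketch (win_rates_for_column is an illustrative name, not a HELM function):

from collections import defaultdict
from typing import Dict, List, Optional

def win_rates_for_column(values: List[Optional[float]], lower_is_better: bool = False) -> List[Optional[float]]:
    # Count occurrences of each non-null value in the column.
    value_to_count: Dict[float, int] = defaultdict(int)
    for v in values:
        if v is not None:
            value_to_count[v] += 1
    # A value "beats" every strictly worse value and gets half credit for ties.
    value_to_wins: Dict[float, float] = {}
    acc_count = 0
    for v, count in sorted(value_to_count.items(), reverse=lower_is_better):
        value_to_wins[v] = acc_count + (count - 1) / 2
        acc_count += count
    if acc_count < 2:  # a single comparable value yields no ranking
        return [None] * len(values)
    # Normalize by (total - 1) so win rates span [0, 1].
    return [None if v is None else value_to_wins[v] / (acc_count - 1) for v in values]

# Mirrors the new test_compute_win_rates_ties: the three-way tie at value 1
# averages ranks 0, 1, and 2 into one win apiece, i.e. 1 / (5 - 1) = 0.25.
assert win_rates_for_column([1, 1, 1, 4, 5]) == [0.25, 0.25, 0.25, 0.75, 1.0]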
@@ -251,7 +254,44 @@ def compute_aggregate_row_win_rates(table: Table, aggregation: str = "mean") ->
     return aggregate_win_rates
 
 
- …
+def compute_aggregate_row_means(table: Table) -> List[Optional[float]]:
+    """
+    Computes the aggregate mean of each row across columns.
+    Returns a list of means, one per row, with None if a row was never meaningfully comparable (i.e., all
+    non-null values of the row are in columns we skip).
+    """
+
+    row_means: List[Optional[float]] = []
+
+    # check for all header cells where specified, that lower_is_better is consistent
+    orderings = []
+    for elem in table.header:
+        orderings.append(elem.lower_is_better)
+    if len(set(orderings)) != 1:
+        raise Exception("Cannot mean columns with different values for lower_is_better")
+
+    for row in table.rows:
+        total = 0.0
+        count = 0
+        for cell in row:
+            if cell.value is not None:
+                total += float(cell.value)
+                count += 1
+        if count == 0:
+            row_means.append(None)
+        else:
+            row_means.append(total / count)
+
+    return row_means
+
+
+class AggregationStrategy:
+    # TODO: Convert to StrEnum after upgrading to Python 3.11
+    WIN_RATE = "win_rate"
+    MEAN = "mean"
+
+
+ALL_AGGREGATION_STRATEGIES = [AggregationStrategy.WIN_RATE, AggregationStrategy.MEAN]
 
 
 class Summarizer:
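compute_aggregate_row_means is a plain per-row average over non-null cells, guarded by the check that every column agrees on lower_is_better. A self-contained sketch of the same averaging on raw values (row_means is an illustrative stand-in, not the HELM function):

from typing import List, Optional

def row_means(rows: List[List[Optional[float]]]) -> List[Optional[float]]:
    # Average the non-null cells of each row; None when nothing is comparable.
    means: List[Optional[float]] = []
    for row in rows:
        cells = [float(v) for v in row if v is not None]
        means.append(sum(cells) / len(cells) if cells else None)
    return means

# A row with no values stays None rather than defaulting to zero.
assert row_means([[1.0, 3.0], [0.5, None], [None, None]]) == [2.0, 0.5, None]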
@@ -483,137 +523,6 @@ class Summarizer:
         for suite, run_suite_path in zip(self.suites, self.run_suite_paths):
             self.read_runs_for_suite(suite, run_suite_path)
 
-    def read_overlap_stats(self):
-        """
-        Load the overlap stats in the run suite path.
-        Concretely:
-        - get group -> scenario_spec information from self.runs
-          run_spec data
-        - read the files in the data_overlap directory in run_suite_path
-          which are scenario_spec -> overlap ids
-        - get aggregate stats for group -> overlap ratio
-        """
-
-        def get_group_to_scenario_specs(run_specs: List[RunSpec]) -> Dict[str, List[ScenarioSpec]]:
-            scenario_specs_to_groups: Dict[ScenarioSpec, List[str]] = {}
-            for run_spec in run_specs:
-                scenario_spec = run_spec.scenario_spec
-                groups = run_spec.groups
-                if (
-                    scenario_spec.class_name
-                    != "helm.benchmark.scenarios.synthetic_efficiency_scenario.SyntheticEfficiencyScenario"
-                ):
-                    scenario_specs_to_groups[scenario_spec] = groups
-
-            group_to_scenario_specs: Dict[str, List[ScenarioSpec]] = {}
-            for scenario_spec, groups in scenario_specs_to_groups.items():
-                for group in groups:
-                    if group not in group_to_scenario_specs:
-                        group_to_scenario_specs[group] = []
-                    group_to_scenario_specs[group].append(scenario_spec)
-            return group_to_scenario_specs
-
-        def get_stats_file_metadata(data_overlap_dir: str) -> Dict[str, List[str]]:
-            """
-            Takes the data_overlap_dir as input and returns a dictionary
-            of stats_file_path -> List(model_names)
-
-            Sample input:
-            file_models_mapping:
-            - file_name: file1
-              model_names:
-              - model1
-              - model2
-            - file_name: file2
-              model_names:
-              - model2
-              - model3
-
-            """
-            metadata_file_path: str = os.path.join(data_overlap_dir, "metadata.yaml")
-            if not os.path.exists(metadata_file_path):
-                return {}
-
-            with open(metadata_file_path, "r") as yaml_file:
-                data = yaml.safe_load(yaml_file)
-
-            file_metadata: Dict[str, List[str]] = {}
-            for entry in data["file_models_mapping"]:
-                if "file_name" in entry and "model_names" in entry:
-                    file_path: str = os.path.join(data_overlap_dir, entry["file_name"])
-                    file_metadata[file_path] = entry["model_names"]
-
-            return file_metadata
-
-        # TODO: Delete this after @andyzorigin's project is done.
-        self._model_group_overlap_stats: Dict[Tuple[str, str], GroupOverlapStats] = {}
-
-        data_overlap_dir = os.path.join(self.run_release_path, "data_overlap")
-        if not os.path.isdir(data_overlap_dir):
-            hlog(f"Directory {data_overlap_dir} not found; skipped import of overlap results.")
-            return
-
-        group_to_scenario_specs = get_group_to_scenario_specs([run.run_spec for run in self.runs])
-
-        stats_file_metadata = get_stats_file_metadata(data_overlap_dir)
-
-        for file_path, model_names in stats_file_metadata.items():
-            overlap_stats_jsons = open(file_path, "r").readlines()
-
-            data_overlap_stats_list: List[DataOverlapStats] = []
-            for overlap_stats_json in overlap_stats_jsons:
-                overlap_stats_dict = json.loads(overlap_stats_json)
-                data_overlap_stats_list.append(cattrs.structure(overlap_stats_dict, DataOverlapStats))
-
-            scenario_spec_overlap_counts: Dict[ScenarioSpec, Tuple[int, int, int]] = {}
-            for data_overlap_stats in data_overlap_stats_list:
-                data_overlap_stats_key = data_overlap_stats.data_overlap_stats_key
-                n = data_overlap_stats_key.overlap_protocol_spec.n
-                if n == OVERLAP_N_COUNT:
-                    light_scenario_key = data_overlap_stats_key.light_scenario_key
-                    scenario_spec = light_scenario_key.scenario_spec
-                    if scenario_spec in self.scenario_spec_instance_id_dict:
-                        # Get statistics based on the subset of instance_ids that HELM uses for a scenario
-                        instance_ids = self.scenario_spec_instance_id_dict[scenario_spec]
-                        num_instances = len(instance_ids)
-                        num_overlapping_inputs = len(
-                            set(data_overlap_stats.instance_ids_with_overlapping_input) & set(instance_ids)
-                        )
-                        num_overlapping_references = len(
-                            set(data_overlap_stats.instance_ids_with_overlapping_reference) & set(instance_ids)
-                        )
-                        scenario_spec_overlap_counts[scenario_spec] = (
-                            num_instances,
-                            num_overlapping_inputs,
-                            num_overlapping_references,
-                        )
-
-            for group, scenario_specs in group_to_scenario_specs.items():
-                group_num_instances = 0
-                group_num_overlapping_inputs = 0
-                group_num_overlapping_references = 0
-                for scenario_spec in scenario_specs:
-                    if scenario_spec in scenario_spec_overlap_counts:
-                        (
-                            num_instances,
-                            num_overlapping_inputs,
-                            num_overlapping_references,
-                        ) = scenario_spec_overlap_counts[scenario_spec]
-                        group_num_instances += num_instances
-                        group_num_overlapping_inputs += num_overlapping_inputs
-                        group_num_overlapping_references += num_overlapping_references
-                if group_num_instances != 0:
-                    group_overlap_stats = GroupOverlapStats(
-                        group=group,
-                        num_instances=group_num_instances,
-                        num_overlapping_inputs=group_num_overlapping_inputs,
-                        num_overlapping_references=group_num_overlapping_references,
-                    )
-                    for model_name in model_names:
-                        # Assume model name will only be associated with single group overlap list for now
-                        # can update to join lists if need arises
-                        self._model_group_overlap_stats[(model_name, group)] = group_overlap_stats
-
     @htrack(None)
     def check_metrics_defined(self):
         """Check that all the metrics that appear in stats are defined."""
@@ -880,7 +789,7 @@ class Summarizer:
         sort_by_model_order: bool = True,
         sub_split: Optional[str] = None,
         bold_columns: bool = True,
- …
+        aggregation_strategies: List[str] = [],
     ) -> Table:
         """
         Create a table for where each row is an adapter (for which we have a set of runs) and columns are pairs of
@@ -1016,16 +925,6 @@ class Summarizer:
 
             description = ""
 
-            group_overlap_stats = None
-            if (model_name, group_name) in self._model_group_overlap_stats:
-                group_overlap_stats = self._model_group_overlap_stats[(model_name, group_name)]
-
-                description = (
-                    f"Overlapping input ratio: {group_overlap_stats.overlapping_input_ratio:.3f}\n"
-                    f"Overlapping reference ratio: {group_overlap_stats.overlapping_reference_ratio:.3f}\n"
-                    f"{description}"
-                )
-
             # HACK: we want to hide stats for the following model-metric combinations:
             #   1. Calibration metrics + AI21/Anthropic
             #   2. MSMARCO metrics + AI21/Anthropic
@@ -1063,21 +962,44 @@ class Summarizer:
 
         table = Table(title=title, header=header, rows=rows, links=links, name=name)
 
- …
- …
- …
- …
- …
- …
- …
- …
- …
- …
- …
- …
- …
- …
- …
+        aggregate_header_cells: List[HeaderCell] = []
+        aggregate_row_values: List[List[Optional[float]]] = []
+
+        for strategy in aggregation_strategies:
+            if strategy == AggregationStrategy.WIN_RATE:
+                WIN_RATE_AGGREGATION = "mean"
+                win_rates = compute_aggregate_row_win_rates(table, aggregation=WIN_RATE_AGGREGATION)
+                description = "How many models this model outperforms on average (over columns)."
+                aggregate_header_cells.append(
+                    HeaderCell(
+                        f"{WIN_RATE_AGGREGATION.capitalize()} win rate",
+                        description=description,
+                        lower_is_better=False,
+                    )
+                )
+                aggregate_row_values.append(win_rates)
+            elif strategy == AggregationStrategy.MEAN:
+                means = compute_aggregate_row_means(table)
+                description = "An average over columns representing the mean performance."
+                aggregate_header_cells.append(
+                    HeaderCell(
+                        "Mean performance",
+                        description=description,
+                        lower_is_better=table.header[0].lower_is_better,
+                    )
+                )
+                aggregate_row_values.append(means)
+            else:
+                raise Exception(
+                    f"Unknown aggregation strategy found: {strategy}. Please use one of: {ALL_AGGREGATION_STRATEGIES}"
+                )
+
+        for i in range(len(aggregate_header_cells)):
+            aggregate_header_cell = aggregate_header_cells[i]
+            aggregate_rows = aggregate_row_values[i]
+            table.header.insert(i + 1, aggregate_header_cell)
+            for row, row_val in zip(table.rows, aggregate_rows):
+                row.insert(i + 1, Cell(row_val))
 
         if bold_columns:
             for i, header_cell in enumerate(table.header):
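Because each aggregate column i is inserted at index i + 1, the aggregates land immediately after the leading model column, in the order the strategies were listed. A quick illustration on plain lists (made-up data, not HELM's Table type):

header = ["Model", "Scenario A", "Scenario B"]
rows = [["Model A", 0.9, 0.8]]
aggregates = [("Mean win rate", [0.75]), ("Mean performance", [0.85])]
for i, (name, values) in enumerate(aggregates):
    # Splice aggregate column i in right after the "Model" column.
    header.insert(i + 1, name)
    for row, value in zip(rows, values):
        row.insert(i + 1, value)
assert header == ["Model", "Mean win rate", "Mean performance", "Scenario A", "Scenario B"]
assert rows == [["Model A", 0.75, 0.85, 0.9, 0.8]]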
@@ -1125,14 +1047,22 @@ class Summarizer:
 
         if len(adapter_to_runs) > 0:
             for metric_group in all_metric_groups:
- …
+                metric_group_config = self.schema.name_to_metric_group[metric_group]
+                display_name = metric_group_config.get_short_display_name()
+                aggregate_strategies: List[str]
+                if metric_group_config.aggregation_strategies is not None:
+                    aggregate_strategies = metric_group_config.aggregation_strategies
+                elif metric_group_config.hide_win_rates:
+                    aggregate_strategies = []
+                else:
+                    aggregate_strategies = [AggregationStrategy.WIN_RATE]
                 table = self.create_group_table(
                     name=metric_group,
                     title=display_name,
                     adapter_to_runs=adapter_to_runs,
                     columns=[(subgroup, metric_group) for subgroup in subgroups],
                     is_scenario_table=False,
- …
+                    aggregation_strategies=aggregate_strategies,
                 )
                 tables.append(table)
         return tables
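The selection above means the schema now controls aggregation per metric group: an explicit aggregation_strategies list wins, hide_win_rates suppresses the default, and otherwise the legacy single win-rate column is kept. A sketch of that fallback order (resolve_strategies is a hypothetical helper, not HELM code):

from typing import List, Optional

def resolve_strategies(aggregation_strategies: Optional[List[str]], hide_win_rates: bool) -> List[str]:
    if aggregation_strategies is not None:
        return aggregation_strategies  # explicit schema setting wins
    if hide_win_rates:
        return []  # suppress the legacy win-rate column
    return ["win_rate"]  # historical default

assert resolve_strategies(None, False) == ["win_rate"]
assert resolve_strategies(None, True) == []
assert resolve_strategies(["mean", "win_rate"], False) == ["mean", "win_rate"]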
@@ -1262,72 +1192,6 @@ class Summarizer:
 
         parallel_map(process, self.runs, parallelism=self.num_threads)
 
-    def read_scenario_spec_instance_ids(self, num_instances) -> None:
-        """
-        This file checks if there exists a file, scenario_spec_instance_ids.json
-        that it can read the instance_ids associated with scenario_specs.
-
-        It will write the num_instances used in the run as part of the file name
-
-        If it doesn't exist, it will go through all the scenario_state files
-        and parse the instance_ids and output it to the file for future uses
-
-        Only when the scenario_specs for the data overlap script change
-        (or num_instances are different), will this need to be rerun.
-
-        In such cases, do not include the file as part of the data_overlap directory.
-        """
-        self.scenario_spec_instance_id_dict: Dict[ScenarioSpec, List[str]] = dict()
-
-        data_overlap_dir = os.path.join(self.run_release_path, "data_overlap")
-        if not os.path.isdir(data_overlap_dir):
-            hlog(f"Directory {data_overlap_dir} not found; skipped producing instance ids file.")
-            return
-
-        scenario_spec_instance_ids_json = os.path.join(
-            data_overlap_dir, f"scenario_spec_instance_ids_{num_instances}.jsonl"
-        )
-        if not os.path.exists(scenario_spec_instance_ids_json):
-            hlog(f"No scenario spec instance ids json, writing to {scenario_spec_instance_ids_json}")
-            self.write_scenario_spec_instance_ids_json(scenario_spec_instance_ids_json)
-        else:
-            hlog(f"Reading scenario spec instance ids json from {scenario_spec_instance_ids_json}")
-            scenario_spec_instance_ids_jsons = open(scenario_spec_instance_ids_json, "r").readlines()
-
-            for scenario_spec_instance_ids_json in scenario_spec_instance_ids_jsons:
-                scenario_spec_instance_ids_dict = json.loads(scenario_spec_instance_ids_json)
-                scenario_spec_instance_ids = cattrs.structure(scenario_spec_instance_ids_dict, ScenarioSpecInstanceIds)
-                self.scenario_spec_instance_id_dict[scenario_spec_instance_ids.scenario_spec] = (
-                    scenario_spec_instance_ids.instance_ids
-                )
-
-    def write_scenario_spec_instance_ids_json(self, file_path) -> None:
-        for run in self.runs:
-            run_spec = run.run_spec
-            scenario_spec = run_spec.scenario_spec
-            if scenario_spec in self.scenario_spec_instance_id_dict:
-                continue
-
-            run_path = run.run_path
-            instances_file_path = os.path.join(run_path, "instances.json")
-            with open(instances_file_path, "r") as f:
-                raw_instances = json.load(f)
-
-            # Optimization: Don't structure to dataclass, since we only need to read `id`
-            instance_ids = [raw_instance["id"] for raw_instance in raw_instances]
-            self.scenario_spec_instance_id_dict[scenario_spec] = instance_ids
-
-        all_scenario_spec_instance_ids = []
-        for scenario_spec, instance_ids in self.scenario_spec_instance_id_dict.items():
-            scenario_spec_instance_ids = ScenarioSpecInstanceIds(scenario_spec=scenario_spec, instance_ids=instance_ids)
-            all_scenario_spec_instance_ids.append(scenario_spec_instance_ids)
-
-        with open(file_path, "w") as f:
-            f.writelines(
-                f"{json.dumps(asdict_without_nones(scenario_spec_instance_ids))}\n"
-                for scenario_spec_instance_ids in all_scenario_spec_instance_ids
-            )
-
     def symlink_latest(self) -> None:
         # Create a symlink runs/latest -> runs/<name_of_suite>,
         # so runs/latest always points to the latest run suite.
@@ -1339,7 +1203,7 @@ class Summarizer:
             os.unlink(symlink_path)
         os.symlink(os.path.basename(self.run_release_path), symlink_path)
 
-    def run_pipeline(self, skip_completed: bool …
+    def run_pipeline(self, skip_completed: bool) -> None:
         """Run the entire summarization pipeline."""
         self.read_runs()
         self.group_runs()
@@ -1347,14 +1211,6 @@ class Summarizer:
 
         self.write_run_display_json(skip_completed)
 
-        # Must happen after summarizer.write_run_display_json()
-        # because it uses instances.json files
-        self.read_scenario_spec_instance_ids(num_instances)
-
-        # Must happen after summarizer.read_scenario_spec_instance_ids()
-        # because it uses self.scenario_spec_instance_id_dict
-        self.read_overlap_stats()
-
         # Must happen after self.read_runs()
         # because it uses self.runs
         self.write_schema()
@@ -1404,12 +1260,6 @@ def main():
         action="store_true",
         help="Skip write_run_display_json() for runs which already have all output display JSON files",
     )
-    parser.add_argument(
-        "-num-instances",
-        type=int,
-        help="Number of instance ids we're using; only for annotating scenario spec instance ids file",
-        default=1000,
-    )
     parser.add_argument(
         "--local-path",
         type=str,
@@ -1461,7 +1311,7 @@ def main():
         num_threads=args.num_threads,
         allow_unknown_models=args.allow_unknown_models,
     )
-    summarizer.run_pipeline(skip_completed=args.skip_completed_run_display_json …
+    summarizer.run_pipeline(skip_completed=args.skip_completed_run_display_json)
    hlog("Done.")
 
 
helm/benchmark/presentation/test_run_entry.py CHANGED
@@ -16,6 +16,7 @@ class TestRunEntry:
 
     @pytest.mark.parametrize("fname", list_fnames())
     def test_read_all_specs(self, fname: str):
+        pytest.skip("Skipping slow tests")
         run_entries = read_run_entries([fname])
         for entry in run_entries.entries:
             construct_run_specs(parse_object_spec(entry.description))
helm/benchmark/presentation/test_summarize.py CHANGED
@@ -1,8 +1,9 @@
 import os
 import tempfile
 
-from helm.benchmark.presentation.summarize import Summarizer
+from helm.benchmark.presentation.summarize import Summarizer, compute_aggregate_row_win_rates
 from helm.benchmark.presentation.schema import get_default_schema_path
+from helm.benchmark.presentation.table import Cell, HeaderCell, Table
 from helm.common.general import ensure_directory_exists
 
 
@@ -19,7 +20,7 @@ def test_summarize_suite():
         num_threads=4,
         allow_unknown_models=True,
     )
-    summarizer.run_pipeline(skip_completed=True …
+    summarizer.run_pipeline(skip_completed=True)
     assert os.path.isfile(os.path.join(output_path, "runs", "test_suite", "groups.json"))
 
 
@@ -37,5 +38,146 @@ def test_summarize_release():
         num_threads=4,
         allow_unknown_models=True,
     )
-    summarizer.run_pipeline(skip_completed=True …
+    summarizer.run_pipeline(skip_completed=True)
     assert os.path.isfile(os.path.join(output_path, "releases", "test_release", "groups.json"))
+
+
+def test_compute_win_rates_one_scenario():
+    header = [
+        HeaderCell(value="Model"),
+        HeaderCell(value="Scenario A", lower_is_better=False),
+    ]
+    values = [
+        ["Model A", 1],
+        ["Model B", 2],
+        ["Model C", 3],
+        ["Model D", 4],
+        ["Model E", 5],
+    ]
+    rows = [[Cell(value) for value in row_values] for row_values in values]
+    table = Table(title="Test Table", header=header, rows=rows)
+    assert compute_aggregate_row_win_rates(table) == [0, 0.25, 0.5, 0.75, 1]
+
+
+def test_compute_win_rates_two_scenarios():
+    header = [
+        HeaderCell(value="Model"),
+        HeaderCell(value="Scenario A", lower_is_better=False),
+        HeaderCell(value="Scenario B", lower_is_better=False),
+    ]
+    values = [
+        ["Model A", 1, 3],
+        ["Model B", 2, 1],
+        ["Model C", 3, 2],
+        ["Model D", 4, 5],
+        ["Model E", 5, 4],
+    ]
+    rows = [[Cell(value) for value in row_values] for row_values in values]
+    table = Table(title="Test Table", header=header, rows=rows)
+    assert compute_aggregate_row_win_rates(table) == [0.25, 0.125, 0.375, 0.875, 0.875]
+
+
+def test_compute_win_rates_incomplete_values():
+    header = [
+        HeaderCell(value="Model"),
+        HeaderCell(value="Scenario A", lower_is_better=False),
+        HeaderCell(value="Scenario B", lower_is_better=False),
+    ]
+    values = [
+        ["Model A", 1, 3],
+        ["Model B", 2, 1],
+        ["Model C", 3, None],
+        ["Model D", 4, None],
+        ["Model E", 5, None],
+    ]
+    rows = [[Cell(value) for value in row_values] for row_values in values]
+    table = Table(title="Test Table", header=header, rows=rows)
+    assert compute_aggregate_row_win_rates(table) == [0.5, 0.125, 0.5, 0.75, 1]
+
+
+def test_compute_win_rates_ignore_nones():
+    header = [
+        HeaderCell(value="Model"),
+        HeaderCell(value="Scenario A", lower_is_better=False),
+        HeaderCell(value="Scenario B", lower_is_better=False),
+        HeaderCell(value="Scenario C", lower_is_better=False),
+    ]
+    values = [
+        ["Model A", 1, None, None],
+        ["Model B", 2, None, 1],
+        ["Model C", 3, None, None],
+        ["Model D", 4, None, None],
+        ["Model E", 5, None, None],
+    ]
+    rows = [[Cell(value) for value in row_values] for row_values in values]
+    table = Table(title="Test Table", header=header, rows=rows)
+    assert compute_aggregate_row_win_rates(table) == [0, 0.25, 0.5, 0.75, 1]
+
+
+def test_compute_win_rates_ignore_unset_lower_is_better():
+    header = [
+        HeaderCell(value="Model"),
+        HeaderCell(value="Scenario A", lower_is_better=False),
+        HeaderCell(value="Scenario B"),
+    ]
+    values = [
+        ["Model A", 1, 3],
+        ["Model B", 2, 1],
+        ["Model C", 3, 2],
+        ["Model D", 4, 5],
+        ["Model E", 5, 4],
+    ]
+    rows = [[Cell(value) for value in row_values] for row_values in values]
+    table = Table(title="Test Table", header=header, rows=rows)
+    assert compute_aggregate_row_win_rates(table) == [0, 0.25, 0.5, 0.75, 1]
+
+
+def test_compute_win_rates_no_win_rate():
+    header = [
+        HeaderCell(value="Model"),
+        HeaderCell(value="Scenario A", lower_is_better=False),
+    ]
+    values = [
+        ["Model A", None],
+        ["Model B", None],
+        ["Model C", None],
+        ["Model D", None],
+        ["Model E", None],
+    ]
+    rows = [[Cell(value) for value in row_values] for row_values in values]
+    table = Table(title="Test Table", header=header, rows=rows)
+    assert compute_aggregate_row_win_rates(table) == [None, None, None, None, None]
+
+
+def test_compute_win_rates_ties():
+    header = [
+        HeaderCell(value="Model"),
+        HeaderCell(value="Scenario A", lower_is_better=False),
+    ]
+    values = [
+        ["Model A", 1],
+        ["Model B", 1],
+        ["Model C", 1],
+        ["Model D", 4],
+        ["Model E", 5],
+    ]
+    rows = [[Cell(value) for value in row_values] for row_values in values]
+    table = Table(title="Test Table", header=header, rows=rows)
+    assert compute_aggregate_row_win_rates(table) == [0.25, 0.25, 0.25, 0.75, 1.0]
+
+
+def test_compute_win_rates_lower_is_better():
+    header = [
+        HeaderCell(value="Model"),
+        HeaderCell(value="Scenario A", lower_is_better=True),
+    ]
+    values = [
+        ["Model A", 1],
+        ["Model B", 2],
+        ["Model C", 3],
+        ["Model D", 4],
+        ["Model E", 5],
+    ]
+    rows = [[Cell(value) for value in row_values] for row_values in values]
+    table = Table(title="Test Table", header=header, rows=rows)
+    assert compute_aggregate_row_win_rates(table) == [1, 0.75, 0.5, 0.25, 0]
helm/benchmark/run.py CHANGED
@@ -1,9 +1,11 @@
 import argparse
 from dataclasses import replace
 import os
+import re
 from typing import List, Optional
 
 
+from helm.benchmark import model_metadata_registry
 from helm.benchmark.presentation.run_entry import RunEntry, read_run_entries
 from helm.common.cache_backend_config import MongoCacheBackendConfig, SqliteCacheBackendConfig
 from helm.common.general import ensure_directory_exists
@@ -314,6 +316,19 @@ def main():
     ensure_directory_exists(args.output_path)
     set_benchmark_output_path(args.output_path)
 
+    # Validate the --models-to-run flag
+    if args.models_to_run:
+        all_models = set(model_metadata_registry.get_all_models())
+        for model_to_run in args.models_to_run:
+            if model_to_run not in all_models:
+                raise Exception(f"Unknown model '{model_to_run}' passed to --models-to-run")
+    else:
+        model_expander_pattern = re.compile(
+            r"\bmodel=(?:all|text_code|text|code|instruction_following|full_functionality_text|limited_functionality_text)\b"  # noqa: E501
+        )
+        if any(model_expander_pattern.search(run_entry.description) for run_entry in run_entries):
+            raise Exception("--models-to-run must be set if the `models=` run expander expands to multiple models")
+
     run_specs = run_entries_to_run_specs(
         run_entries=run_entries,
         max_eval_instances=args.max_eval_instances,