crfm-helm 0.5.2__py3-none-any.whl → 0.5.4__py3-none-any.whl
This diff compares the contents of two publicly available versions of the package, as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the package versions as they appear in their respective public registries.
Potentially problematic release.
This version of crfm-helm has been flagged as potentially problematic. See the release's details page for more information.
- {crfm_helm-0.5.2.dist-info → crfm_helm-0.5.4.dist-info}/METADATA +81 -112
- {crfm_helm-0.5.2.dist-info → crfm_helm-0.5.4.dist-info}/RECORD +165 -155
- {crfm_helm-0.5.2.dist-info → crfm_helm-0.5.4.dist-info}/WHEEL +1 -1
- helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +12 -5
- helm/benchmark/adaptation/adapters/test_generation_adapter.py +12 -12
- helm/benchmark/adaptation/adapters/test_language_modeling_adapter.py +8 -8
- helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py +77 -9
- helm/benchmark/adaptation/common_adapter_specs.py +2 -0
- helm/benchmark/annotation/anthropic_red_team_annotator.py +57 -0
- helm/benchmark/annotation/call_center_annotator.py +258 -0
- helm/benchmark/annotation/financebench_annotator.py +79 -0
- helm/benchmark/annotation/harm_bench_annotator.py +55 -0
- helm/benchmark/annotation/{image2structure → image2struct}/latex_compiler_annotator.py +2 -2
- helm/benchmark/annotation/{image2structure → image2struct}/lilypond_compiler_annotator.py +5 -3
- helm/benchmark/annotation/{image2structure → image2struct}/webpage_compiler_annotator.py +5 -5
- helm/benchmark/annotation/live_qa_annotator.py +37 -45
- helm/benchmark/annotation/medication_qa_annotator.py +36 -44
- helm/benchmark/annotation/model_as_judge.py +96 -0
- helm/benchmark/annotation/simple_safety_tests_annotator.py +50 -0
- helm/benchmark/annotation/xstest_annotator.py +100 -0
- helm/benchmark/metrics/annotation_metrics.py +108 -0
- helm/benchmark/metrics/bhasa_metrics.py +188 -0
- helm/benchmark/metrics/bhasa_metrics_specs.py +10 -0
- helm/benchmark/metrics/code_metrics_helper.py +11 -1
- helm/benchmark/metrics/safety_metrics.py +79 -0
- helm/benchmark/metrics/summac/model_summac.py +3 -3
- helm/benchmark/metrics/tokens/test_ai21_token_cost_estimator.py +2 -2
- helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py +4 -4
- helm/benchmark/metrics/unitxt_metrics.py +17 -3
- helm/benchmark/metrics/vision_language/image_metrics.py +7 -3
- helm/benchmark/metrics/vision_language/image_utils.py +1 -1
- helm/benchmark/model_metadata_registry.py +3 -3
- helm/benchmark/presentation/create_plots.py +1 -1
- helm/benchmark/presentation/schema.py +3 -0
- helm/benchmark/presentation/summarize.py +106 -256
- helm/benchmark/presentation/test_run_entry.py +1 -0
- helm/benchmark/presentation/test_summarize.py +145 -3
- helm/benchmark/run.py +15 -0
- helm/benchmark/run_expander.py +83 -30
- helm/benchmark/run_specs/bhasa_run_specs.py +652 -0
- helm/benchmark/run_specs/call_center_run_specs.py +152 -0
- helm/benchmark/run_specs/decodingtrust_run_specs.py +8 -8
- helm/benchmark/run_specs/experimental_run_specs.py +52 -0
- helm/benchmark/run_specs/finance_run_specs.py +82 -1
- helm/benchmark/run_specs/safety_run_specs.py +154 -0
- helm/benchmark/run_specs/vlm_run_specs.py +100 -24
- helm/benchmark/scenarios/anthropic_red_team_scenario.py +71 -0
- helm/benchmark/scenarios/banking77_scenario.py +51 -0
- helm/benchmark/scenarios/bhasa_scenario.py +1942 -0
- helm/benchmark/scenarios/call_center_scenario.py +84 -0
- helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +2 -1
- helm/benchmark/scenarios/ewok_scenario.py +116 -0
- helm/benchmark/scenarios/fin_qa_scenario.py +2 -0
- helm/benchmark/scenarios/financebench_scenario.py +53 -0
- helm/benchmark/scenarios/harm_bench_scenario.py +59 -0
- helm/benchmark/scenarios/raft_scenario.py +1 -1
- helm/benchmark/scenarios/scenario.py +1 -1
- helm/benchmark/scenarios/simple_safety_tests_scenario.py +33 -0
- helm/benchmark/scenarios/test_commonsense_scenario.py +21 -0
- helm/benchmark/scenarios/test_ewok_scenario.py +25 -0
- helm/benchmark/scenarios/test_financebench_scenario.py +26 -0
- helm/benchmark/scenarios/test_gsm_scenario.py +31 -0
- helm/benchmark/scenarios/test_legalbench_scenario.py +30 -0
- helm/benchmark/scenarios/test_math_scenario.py +2 -8
- helm/benchmark/scenarios/test_med_qa_scenario.py +30 -0
- helm/benchmark/scenarios/test_mmlu_scenario.py +33 -0
- helm/benchmark/scenarios/test_narrativeqa_scenario.py +73 -0
- helm/benchmark/scenarios/thai_exam_scenario.py +4 -4
- helm/benchmark/scenarios/vision_language/a_okvqa_scenario.py +1 -1
- helm/benchmark/scenarios/vision_language/bingo_scenario.py +2 -2
- helm/benchmark/scenarios/vision_language/crossmodal_3600_scenario.py +2 -1
- helm/benchmark/scenarios/vision_language/exams_v_scenario.py +104 -0
- helm/benchmark/scenarios/vision_language/fair_face_scenario.py +136 -0
- helm/benchmark/scenarios/vision_language/flickr30k_scenario.py +1 -1
- helm/benchmark/scenarios/vision_language/gqa_scenario.py +2 -2
- helm/benchmark/scenarios/vision_language/hateful_memes_scenario.py +1 -1
- helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/chart2csv_scenario.py +1 -1
- helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/latex_scenario.py +3 -3
- helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/musicsheet_scenario.py +1 -1
- helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/utils_latex.py +31 -39
- helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/webpage/driver.py +1 -1
- helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/webpage/utils.py +1 -1
- helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/webpage_scenario.py +41 -12
- helm/benchmark/scenarios/vision_language/math_vista_scenario.py +1 -1
- helm/benchmark/scenarios/vision_language/mementos_scenario.py +3 -3
- helm/benchmark/scenarios/vision_language/mm_safety_bench_scenario.py +2 -2
- helm/benchmark/scenarios/vision_language/mme_scenario.py +21 -18
- helm/benchmark/scenarios/vision_language/mmmu_scenario.py +1 -1
- helm/benchmark/scenarios/vision_language/pairs_scenario.py +1 -1
- helm/benchmark/scenarios/vision_language/pope_scenario.py +2 -1
- helm/benchmark/scenarios/vision_language/real_world_qa_scenario.py +57 -0
- helm/benchmark/scenarios/vision_language/seed_bench_scenario.py +7 -5
- helm/benchmark/scenarios/vision_language/unicorn_scenario.py +2 -2
- helm/benchmark/scenarios/vision_language/vibe_eval_scenario.py +6 -3
- helm/benchmark/scenarios/vision_language/viz_wiz_scenario.py +1 -1
- helm/benchmark/scenarios/vision_language/vqa_scenario.py +3 -1
- helm/benchmark/scenarios/xstest_scenario.py +35 -0
- helm/benchmark/server.py +1 -6
- helm/benchmark/static/schema_air_bench.yaml +750 -750
- helm/benchmark/static/schema_bhasa.yaml +709 -0
- helm/benchmark/static/schema_call_center.yaml +232 -0
- helm/benchmark/static/schema_cleva.yaml +768 -0
- helm/benchmark/static/schema_decodingtrust.yaml +444 -0
- helm/benchmark/static/schema_ewok.yaml +367 -0
- helm/benchmark/static/schema_finance.yaml +55 -9
- helm/benchmark/static/{schema_image2structure.yaml → schema_image2struct.yaml} +231 -90
- helm/benchmark/static/schema_legal.yaml +566 -0
- helm/benchmark/static/schema_safety.yaml +266 -0
- helm/benchmark/static/schema_tables.yaml +149 -8
- helm/benchmark/static/schema_thai.yaml +21 -0
- helm/benchmark/static/schema_vhelm.yaml +137 -101
- helm/benchmark/static_build/assets/accenture-6f97eeda.png +0 -0
- helm/benchmark/static_build/assets/aisingapore-6dfc9acf.png +0 -0
- helm/benchmark/static_build/assets/cresta-9e22b983.png +0 -0
- helm/benchmark/static_build/assets/cuhk-8c5631e9.png +0 -0
- helm/benchmark/static_build/assets/index-05c76bb1.css +1 -0
- helm/benchmark/static_build/assets/index-3ee38b3d.js +10 -0
- helm/benchmark/static_build/assets/scb10x-204bd786.png +0 -0
- helm/benchmark/static_build/assets/vhelm-aspects-1437d673.png +0 -0
- helm/benchmark/static_build/assets/vhelm-framework-a1ca3f3f.png +0 -0
- helm/benchmark/static_build/assets/vhelm-model-8afb7616.png +0 -0
- helm/benchmark/static_build/assets/wellsfargo-a86a6c4a.png +0 -0
- helm/benchmark/static_build/index.html +2 -2
- helm/benchmark/window_services/test_openai_window_service.py +8 -8
- helm/benchmark/window_services/tokenizer_service.py +0 -5
- helm/clients/ai21_client.py +71 -1
- helm/clients/anthropic_client.py +7 -19
- helm/clients/huggingface_client.py +38 -37
- helm/clients/nvidia_nim_client.py +35 -0
- helm/clients/openai_client.py +18 -4
- helm/clients/palmyra_client.py +24 -0
- helm/clients/perspective_api_client.py +11 -6
- helm/clients/test_client.py +4 -6
- helm/clients/together_client.py +22 -0
- helm/clients/vision_language/open_flamingo_client.py +1 -2
- helm/clients/vision_language/palmyra_vision_client.py +28 -13
- helm/common/cache.py +8 -30
- helm/common/images_utils.py +6 -0
- helm/common/key_value_store.py +9 -9
- helm/common/mongo_key_value_store.py +5 -4
- helm/common/request.py +16 -0
- helm/common/test_cache.py +1 -48
- helm/common/tokenization_request.py +0 -9
- helm/config/model_deployments.yaml +444 -329
- helm/config/model_metadata.yaml +513 -111
- helm/config/tokenizer_configs.yaml +140 -11
- helm/proxy/example_queries.py +14 -21
- helm/proxy/server.py +0 -9
- helm/proxy/services/remote_service.py +0 -6
- helm/proxy/services/server_service.py +6 -20
- helm/proxy/services/service.py +0 -6
- helm/proxy/token_counters/test_auto_token_counter.py +2 -2
- helm/tokenizers/ai21_tokenizer.py +51 -59
- helm/tokenizers/cohere_tokenizer.py +0 -75
- helm/tokenizers/huggingface_tokenizer.py +0 -1
- helm/tokenizers/test_ai21_tokenizer.py +48 -0
- helm/benchmark/data_overlap/data_overlap_spec.py +0 -86
- helm/benchmark/data_overlap/export_scenario_text.py +0 -119
- helm/benchmark/data_overlap/light_scenario.py +0 -60
- helm/benchmark/scenarios/vision_language/image2structure/webpage/__init__.py +0 -0
- helm/benchmark/static/benchmarking.css +0 -156
- helm/benchmark/static/benchmarking.js +0 -1705
- helm/benchmark/static/config.js +0 -3
- helm/benchmark/static/general.js +0 -122
- helm/benchmark/static/images/crfm-logo.png +0 -0
- helm/benchmark/static/images/helm-logo-simple.png +0 -0
- helm/benchmark/static/images/helm-logo.png +0 -0
- helm/benchmark/static/images/language-model-helm.png +0 -0
- helm/benchmark/static/images/organizations/ai21.png +0 -0
- helm/benchmark/static/images/organizations/anthropic.png +0 -0
- helm/benchmark/static/images/organizations/bigscience.png +0 -0
- helm/benchmark/static/images/organizations/cohere.png +0 -0
- helm/benchmark/static/images/organizations/eleutherai.png +0 -0
- helm/benchmark/static/images/organizations/google.png +0 -0
- helm/benchmark/static/images/organizations/meta.png +0 -0
- helm/benchmark/static/images/organizations/microsoft.png +0 -0
- helm/benchmark/static/images/organizations/nvidia.png +0 -0
- helm/benchmark/static/images/organizations/openai.png +0 -0
- helm/benchmark/static/images/organizations/together.png +0 -0
- helm/benchmark/static/images/organizations/tsinghua-keg.png +0 -0
- helm/benchmark/static/images/organizations/yandex.png +0 -0
- helm/benchmark/static/images/scenarios-by-metrics.png +0 -0
- helm/benchmark/static/images/taxonomy-scenarios.png +0 -0
- helm/benchmark/static/index.html +0 -68
- helm/benchmark/static/info-icon.png +0 -0
- helm/benchmark/static/json-urls.js +0 -69
- helm/benchmark/static/plot-captions.js +0 -27
- helm/benchmark/static/utils.js +0 -285
- helm/benchmark/static_build/assets/index-30dbceba.js +0 -10
- helm/benchmark/static_build/assets/index-66b02d40.css +0 -1
- helm/benchmark/static_build/assets/vhelm-framework-cde7618a.png +0 -0
- helm/benchmark/static_build/assets/vhelm-model-6d812526.png +0 -0
- helm/benchmark/window_services/ai21_window_service.py +0 -247
- helm/benchmark/window_services/cohere_window_service.py +0 -101
- helm/benchmark/window_services/test_ai21_window_service.py +0 -163
- helm/benchmark/window_services/test_cohere_window_service.py +0 -75
- helm/benchmark/window_services/test_cohere_window_service_utils.py +0 -8328
- helm/benchmark/window_services/test_ice_window_service.py +0 -327
- helm/tokenizers/ice_tokenizer.py +0 -30
- helm/tokenizers/test_ice_tokenizer.py +0 -57
- {crfm_helm-0.5.2.dist-info → crfm_helm-0.5.4.dist-info}/LICENSE +0 -0
- {crfm_helm-0.5.2.dist-info → crfm_helm-0.5.4.dist-info}/entry_points.txt +0 -0
- {crfm_helm-0.5.2.dist-info → crfm_helm-0.5.4.dist-info}/top_level.txt +0 -0
- /helm/benchmark/annotation/{image2structure → image2struct}/__init__.py +0 -0
- /helm/benchmark/annotation/{image2structure → image2struct}/image_compiler_annotator.py +0 -0
- /helm/benchmark/{data_overlap → scenarios/vision_language/image2struct}/__init__.py +0 -0
- /helm/benchmark/scenarios/vision_language/{image2structure/image2structure_scenario.py → image2struct/image2struct_scenario.py} +0 -0
- /helm/benchmark/scenarios/vision_language/{image2structure → image2struct/webpage}/__init__.py +0 -0
- /helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/webpage/jekyll_server.py +0 -0
|
@@ -0,0 +1,48 @@
|
|
|
1
|
+
import pytest
|
|
2
|
+
|
|
3
|
+
from helm.common.cache import BlackHoleCacheConfig
|
|
4
|
+
from helm.common.tokenization_request import (
|
|
5
|
+
DecodeRequest,
|
|
6
|
+
TokenizationRequest,
|
|
7
|
+
TokenizationToken,
|
|
8
|
+
)
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
@pytest.mark.models
|
|
12
|
+
def test_tokenize():
|
|
13
|
+
from helm.tokenizers.ai21_tokenizer import AI21LocalTokenizer
|
|
14
|
+
|
|
15
|
+
tokenizer = AI21LocalTokenizer(cache_config=BlackHoleCacheConfig())
|
|
16
|
+
request = TokenizationRequest(tokenizer="ai21/jamba-instruct-tokenizer", text="otter 🦦")
|
|
17
|
+
result = tokenizer.tokenize(request)
|
|
18
|
+
assert result.success
|
|
19
|
+
assert not result.cached
|
|
20
|
+
assert result.tokens == [
|
|
21
|
+
TokenizationToken(token) for token in ["ot", "ter", "▁", "<0xF0>", "<0x9F>", "<0xA6>", "<0xA6>"]
|
|
22
|
+
]
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
@pytest.mark.models
|
|
26
|
+
def test_encode():
|
|
27
|
+
from helm.tokenizers.ai21_tokenizer import AI21LocalTokenizer
|
|
28
|
+
|
|
29
|
+
tokenizer = AI21LocalTokenizer(cache_config=BlackHoleCacheConfig())
|
|
30
|
+
request = TokenizationRequest(tokenizer="ai21/jamba-instruct-tokenizer", text="otter 🦦", encode=True)
|
|
31
|
+
result = tokenizer.tokenize(request)
|
|
32
|
+
assert result.success
|
|
33
|
+
assert not result.cached
|
|
34
|
+
assert result.tokens == [TokenizationToken(token) for token in [1860, 1901, 62934, 1784, 1703, 1710, 1710]]
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
@pytest.mark.models
|
|
38
|
+
def test_decode():
|
|
39
|
+
from helm.tokenizers.ai21_tokenizer import AI21LocalTokenizer
|
|
40
|
+
|
|
41
|
+
tokenizer = AI21LocalTokenizer(cache_config=BlackHoleCacheConfig())
|
|
42
|
+
request = DecodeRequest(
|
|
43
|
+
tokenizer="ai21/jamba-instruct-tokenizer", tokens=[1860, 1901, 62934, 1784, 1703, 1710, 1710]
|
|
44
|
+
)
|
|
45
|
+
result = tokenizer.decode(request)
|
|
46
|
+
assert result.success
|
|
47
|
+
assert not result.cached
|
|
48
|
+
assert result.text == "otter 🦦"
|
|
@@ -1,86 +0,0 @@
|
|
|
1
|
-
from dataclasses import dataclass
|
|
2
|
-
from typing import List, Tuple
|
|
3
|
-
|
|
4
|
-
try:
|
|
5
|
-
from light_scenario import LightScenarioKey
|
|
6
|
-
except Exception:
|
|
7
|
-
from helm.benchmark.data_overlap.light_scenario import LightScenarioKey
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
@dataclass(frozen=True)
|
|
11
|
-
class GroupOverlapStats:
|
|
12
|
-
"""
|
|
13
|
-
Dataclass that represents group data overlap stats
|
|
14
|
-
e.g.
|
|
15
|
-
{
|
|
16
|
-
"group": "natural_qa_closedbook",
|
|
17
|
-
"num_instances": 2144,
|
|
18
|
-
"num_overlapping_inputs": 1,
|
|
19
|
-
"num_overlapping_references": 100
|
|
20
|
-
}
|
|
21
|
-
"""
|
|
22
|
-
|
|
23
|
-
group: str
|
|
24
|
-
|
|
25
|
-
num_instances: int
|
|
26
|
-
|
|
27
|
-
num_overlapping_inputs: int
|
|
28
|
-
|
|
29
|
-
num_overlapping_references: int
|
|
30
|
-
|
|
31
|
-
@property
|
|
32
|
-
def overlapping_input_ratio(self):
|
|
33
|
-
return self.num_overlapping_inputs / self.num_instances
|
|
34
|
-
|
|
35
|
-
@property
|
|
36
|
-
def overlapping_reference_ratio(self):
|
|
37
|
-
return self.num_overlapping_references / self.num_instances
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
@dataclass(frozen=True)
|
|
41
|
-
class OverlapProtocolSpec:
|
|
42
|
-
"""Specification for how we compute overlap"""
|
|
43
|
-
|
|
44
|
-
# the N of the n_grams we're running
|
|
45
|
-
n: int
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
@dataclass(frozen=True)
|
|
49
|
-
class DataOverlapStatsKey:
|
|
50
|
-
"""Dataclass that represents output data overlap stats"""
|
|
51
|
-
|
|
52
|
-
light_scenario_key: LightScenarioKey
|
|
53
|
-
|
|
54
|
-
overlap_protocol_spec: OverlapProtocolSpec
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
@dataclass(frozen=True)
|
|
58
|
-
class DataOverlapStats:
|
|
59
|
-
"""Dataclass that represents output data overlap stats"""
|
|
60
|
-
|
|
61
|
-
data_overlap_stats_key: DataOverlapStatsKey
|
|
62
|
-
|
|
63
|
-
num_instances: int
|
|
64
|
-
|
|
65
|
-
instance_ids_with_overlapping_input: List[str]
|
|
66
|
-
|
|
67
|
-
instance_ids_with_overlapping_reference: List[str]
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
@dataclass(frozen=True)
|
|
71
|
-
class EntryDataOverlapKey:
|
|
72
|
-
"""Unique key representing either the input or references of a single instance in a scenario."""
|
|
73
|
-
|
|
74
|
-
stats_key: DataOverlapStatsKey
|
|
75
|
-
part: str
|
|
76
|
-
"""Either PART_INPUT or PART_REF"""
|
|
77
|
-
instance_id: str
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
@dataclass(frozen=True)
|
|
81
|
-
class EntryOverlapNgrams:
|
|
82
|
-
"""Dataclass that represents output data overlap stats"""
|
|
83
|
-
|
|
84
|
-
entry_data_overlap_key: EntryDataOverlapKey
|
|
85
|
-
|
|
86
|
-
overlapping_ngram_counts: List[Tuple[str, int]]
|
|
@@ -1,119 +0,0 @@
|
|
|
1
|
-
import json
|
|
2
|
-
import os
|
|
3
|
-
import argparse
|
|
4
|
-
from typing import List, DefaultDict, Set
|
|
5
|
-
from collections import defaultdict
|
|
6
|
-
|
|
7
|
-
from helm.common.general import asdict_without_nones, ensure_directory_exists
|
|
8
|
-
from helm.common.hierarchical_logger import hlog, htrack_block
|
|
9
|
-
|
|
10
|
-
from helm.benchmark.scenarios.scenario import (
|
|
11
|
-
Scenario,
|
|
12
|
-
Instance,
|
|
13
|
-
create_scenario,
|
|
14
|
-
TRAIN_SPLIT,
|
|
15
|
-
VALID_SPLIT,
|
|
16
|
-
TEST_SPLIT,
|
|
17
|
-
ScenarioSpec,
|
|
18
|
-
with_instance_ids,
|
|
19
|
-
)
|
|
20
|
-
from helm.benchmark.presentation.run_entry import read_run_entries
|
|
21
|
-
from helm.benchmark.run import run_entries_to_run_specs
|
|
22
|
-
from helm.benchmark.data_overlap.light_scenario import LightInstance, LightScenario, LightScenarioKey
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
def create_light_instance_from_instance(instance: Instance) -> LightInstance:
|
|
26
|
-
"""Create a LightInstance given an Instance. Only keep the text attributes."""
|
|
27
|
-
input_text: str = instance.input.text
|
|
28
|
-
reference_texts: List[str] = [reference.output.text for reference in instance.references]
|
|
29
|
-
return LightInstance(input=input_text, references=reference_texts, id=instance.id)
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
def get_light_scenarios_from_scenario_spec(
|
|
33
|
-
scenario_spec: ScenarioSpec, scenario_download_path: str = "exported_scenarios"
|
|
34
|
-
) -> List[LightScenario]:
|
|
35
|
-
"""
|
|
36
|
-
Create a list of LightInstances given a ScenarioSpec. Only keep the text of the input and references.
|
|
37
|
-
Note that one LightScenario object is created for each split of the Scenario for simplification.
|
|
38
|
-
"""
|
|
39
|
-
|
|
40
|
-
scenario: Scenario = create_scenario(scenario_spec)
|
|
41
|
-
|
|
42
|
-
ensure_directory_exists(scenario_download_path)
|
|
43
|
-
scenario_output_path = os.path.join(scenario_download_path, scenario.name)
|
|
44
|
-
ensure_directory_exists(scenario_output_path)
|
|
45
|
-
|
|
46
|
-
# Load instances
|
|
47
|
-
instances: List[Instance]
|
|
48
|
-
with htrack_block("scenario.get_instances"):
|
|
49
|
-
instances = scenario.get_instances(scenario_output_path)
|
|
50
|
-
|
|
51
|
-
# Get instance ids
|
|
52
|
-
instances = with_instance_ids(instances)
|
|
53
|
-
|
|
54
|
-
# Classify instances into splits
|
|
55
|
-
splits: List[str] = [TRAIN_SPLIT, VALID_SPLIT, TEST_SPLIT]
|
|
56
|
-
split_mapping: DefaultDict[str, list] = defaultdict(list)
|
|
57
|
-
for instance in instances:
|
|
58
|
-
if instance.split is None or instance.split not in splits:
|
|
59
|
-
raise ValueError(
|
|
60
|
-
f"split should be one of {TRAIN_SPLIT}, {VALID_SPLIT}, or {TEST_SPLIT}, but got {instance.split}"
|
|
61
|
-
)
|
|
62
|
-
split_mapping[instance.split].append(instance)
|
|
63
|
-
|
|
64
|
-
# Convert Scenarios to LightScenarios
|
|
65
|
-
light_scenarios: List[LightScenario] = []
|
|
66
|
-
for split, instances in split_mapping.items():
|
|
67
|
-
light_instances: List[LightInstance] = [create_light_instance_from_instance(instance) for instance in instances]
|
|
68
|
-
light_scenario_key: LightScenarioKey = LightScenarioKey(
|
|
69
|
-
scenario_spec=scenario_spec,
|
|
70
|
-
split=split,
|
|
71
|
-
)
|
|
72
|
-
light_scenario = LightScenario(
|
|
73
|
-
scenario_key=light_scenario_key,
|
|
74
|
-
instances=light_instances,
|
|
75
|
-
)
|
|
76
|
-
light_scenarios.append(light_scenario)
|
|
77
|
-
return light_scenarios
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
def save_scenarios_to_jsonl(light_scenarios: List[LightScenario], filename: str):
|
|
81
|
-
"""
|
|
82
|
-
Save a list of LightInstance to a jsonl file where each line represents a LightScenario object.
|
|
83
|
-
"""
|
|
84
|
-
with open(filename, "a") as f:
|
|
85
|
-
for light_scenario in light_scenarios:
|
|
86
|
-
f.write(json.dumps(asdict_without_nones(light_scenario), ensure_ascii=False) + "\n")
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
if __name__ == "__main__":
|
|
90
|
-
parser = argparse.ArgumentParser()
|
|
91
|
-
parser.add_argument("--run-specs", nargs="+", required=True, help="Specifies what to export")
|
|
92
|
-
parser.add_argument("--output-data", type=str, required=True, help="The path to the output file")
|
|
93
|
-
args = parser.parse_args()
|
|
94
|
-
|
|
95
|
-
hlog("Loading run_specs")
|
|
96
|
-
run_entries = read_run_entries(args.run_specs).entries
|
|
97
|
-
run_specs = run_entries_to_run_specs(
|
|
98
|
-
run_entries=run_entries,
|
|
99
|
-
priority=4,
|
|
100
|
-
)
|
|
101
|
-
|
|
102
|
-
try:
|
|
103
|
-
os.remove(args.output_data)
|
|
104
|
-
except OSError:
|
|
105
|
-
pass
|
|
106
|
-
|
|
107
|
-
scenario_specs: Set = set()
|
|
108
|
-
for run_spec in run_specs:
|
|
109
|
-
scenario_spec = run_spec.scenario_spec
|
|
110
|
-
if (
|
|
111
|
-
scenario_spec.class_name
|
|
112
|
-
!= "helm.benchmark.scenarios.synthetic_efficiency_scenario.SyntheticEfficiencyScenario"
|
|
113
|
-
):
|
|
114
|
-
scenario_specs.add(scenario_spec)
|
|
115
|
-
|
|
116
|
-
hlog("Generating light scenarios from scenarios")
|
|
117
|
-
for scenario_spec in scenario_specs:
|
|
118
|
-
light_scenarios: List[LightScenario] = get_light_scenarios_from_scenario_spec(scenario_spec)
|
|
119
|
-
save_scenarios_to_jsonl(light_scenarios, args.output_data)
|
|
@@ -1,60 +0,0 @@
|
|
|
1
|
-
from dataclasses import dataclass
|
|
2
|
-
from typing import List, Optional
|
|
3
|
-
|
|
4
|
-
try:
|
|
5
|
-
from scenarios.scenario import ScenarioSpec
|
|
6
|
-
except Exception:
|
|
7
|
-
from helm.benchmark.scenarios.scenario import ScenarioSpec
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
@dataclass(frozen=True)
|
|
11
|
-
class LightInstance:
|
|
12
|
-
"""
|
|
13
|
-
A lighter `Instance` with only text fields.
|
|
14
|
-
"""
|
|
15
|
-
|
|
16
|
-
input: str
|
|
17
|
-
"""The input"""
|
|
18
|
-
|
|
19
|
-
references: List[str]
|
|
20
|
-
"""References that help us evaluate"""
|
|
21
|
-
|
|
22
|
-
id: Optional[str] = None
|
|
23
|
-
"""Helm instance id"""
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
@dataclass(frozen=True)
|
|
27
|
-
class LightScenarioKey:
|
|
28
|
-
"""
|
|
29
|
-
Key for LightScenario
|
|
30
|
-
"""
|
|
31
|
-
|
|
32
|
-
scenario_spec: ScenarioSpec
|
|
33
|
-
|
|
34
|
-
split: str
|
|
35
|
-
|
|
36
|
-
def __hash__(self):
|
|
37
|
-
return hash((self.scenario_spec, self.split))
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
@dataclass(frozen=True)
|
|
41
|
-
class LightScenario:
|
|
42
|
-
"""
|
|
43
|
-
A lighter `Scenario`.
|
|
44
|
-
"""
|
|
45
|
-
|
|
46
|
-
scenario_key: LightScenarioKey
|
|
47
|
-
|
|
48
|
-
instances: List[LightInstance]
|
|
49
|
-
"""Instances of this scenario"""
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
@dataclass(frozen=True)
|
|
53
|
-
class ScenarioSpecInstanceIds:
|
|
54
|
-
"""
|
|
55
|
-
Instance ids associated with a scenario
|
|
56
|
-
"""
|
|
57
|
-
|
|
58
|
-
scenario_spec: ScenarioSpec
|
|
59
|
-
|
|
60
|
-
instance_ids: List[str]
|
|
File without changes
|
|
@@ -1,156 +0,0 @@
|
|
|
1
|
-
.correct {
|
|
2
|
-
background-color: #dfffdf;
|
|
3
|
-
}
|
|
4
|
-
|
|
5
|
-
.wrong {
|
|
6
|
-
background-color: #ffdfdf;
|
|
7
|
-
}
|
|
8
|
-
|
|
9
|
-
.scenario-info {
|
|
10
|
-
margin-top: 30px;
|
|
11
|
-
margin-bottom: 30px;
|
|
12
|
-
}
|
|
13
|
-
|
|
14
|
-
td {
|
|
15
|
-
padding-left: 15px;
|
|
16
|
-
padding-right: 15px;
|
|
17
|
-
padding-top: 5px;
|
|
18
|
-
padding-bottom: 5px;
|
|
19
|
-
word-wrap: break-word;
|
|
20
|
-
max-width: 900px;
|
|
21
|
-
vertical-align: top;
|
|
22
|
-
}
|
|
23
|
-
|
|
24
|
-
.results-table {
|
|
25
|
-
}
|
|
26
|
-
|
|
27
|
-
.table-container {
|
|
28
|
-
margin-top: 30px;
|
|
29
|
-
margin-bottom: 30px;
|
|
30
|
-
}
|
|
31
|
-
|
|
32
|
-
tr {
|
|
33
|
-
border: solid;
|
|
34
|
-
border-color: #f0f0f0;
|
|
35
|
-
border-width: 1px 0;
|
|
36
|
-
}
|
|
37
|
-
|
|
38
|
-
.results-table thead tr {
|
|
39
|
-
background-color: #f9f9f9;
|
|
40
|
-
}
|
|
41
|
-
|
|
42
|
-
.logprob {
|
|
43
|
-
font-size: 8pt;
|
|
44
|
-
font-style: italic;
|
|
45
|
-
color: gray;
|
|
46
|
-
}
|
|
47
|
-
|
|
48
|
-
.list-header {
|
|
49
|
-
font-size: 24px;
|
|
50
|
-
font-weight: bold;
|
|
51
|
-
}
|
|
52
|
-
.list-item {
|
|
53
|
-
color: black;
|
|
54
|
-
font-size: 14px;
|
|
55
|
-
white-space: nowrap;
|
|
56
|
-
}
|
|
57
|
-
.list-item-todo {
|
|
58
|
-
color: lightgray;
|
|
59
|
-
}
|
|
60
|
-
.list-item:hover {
|
|
61
|
-
color: black;
|
|
62
|
-
text-decoration: none;
|
|
63
|
-
background-color: lightgray;
|
|
64
|
-
}
|
|
65
|
-
|
|
66
|
-
.main-link {
|
|
67
|
-
color: white;
|
|
68
|
-
background-color: #53A0C0;
|
|
69
|
-
}
|
|
70
|
-
.main-link:hover {
|
|
71
|
-
color: lightgray;
|
|
72
|
-
}
|
|
73
|
-
|
|
74
|
-
.access-open {
|
|
75
|
-
background-color: lightgreen;
|
|
76
|
-
width: 100px;
|
|
77
|
-
}
|
|
78
|
-
.access-limited {
|
|
79
|
-
background-color: yellow;
|
|
80
|
-
width: 100px;
|
|
81
|
-
}
|
|
82
|
-
.access-restricted {
|
|
83
|
-
background-color: orange;
|
|
84
|
-
width: 100px;
|
|
85
|
-
}
|
|
86
|
-
.access-closed {
|
|
87
|
-
background-color: lightgray;
|
|
88
|
-
width: 100px;
|
|
89
|
-
}
|
|
90
|
-
|
|
91
|
-
.technical-details {
|
|
92
|
-
font-size: 10px;
|
|
93
|
-
font-style: italic;
|
|
94
|
-
color: gray;
|
|
95
|
-
}
|
|
96
|
-
|
|
97
|
-
.logo-container {
|
|
98
|
-
display: flex;
|
|
99
|
-
flex-flow: row wrap;
|
|
100
|
-
justify-content: space-between;
|
|
101
|
-
padding: 20px;
|
|
102
|
-
}
|
|
103
|
-
|
|
104
|
-
.logo-item {
|
|
105
|
-
margin: auto;
|
|
106
|
-
padding: 10px;
|
|
107
|
-
}
|
|
108
|
-
|
|
109
|
-
.instance-input {
|
|
110
|
-
font-style: italic;
|
|
111
|
-
background-color: #f5f5f5;
|
|
112
|
-
margin-left: 20px;
|
|
113
|
-
white-space: pre-wrap;
|
|
114
|
-
}
|
|
115
|
-
|
|
116
|
-
.instance-reference {
|
|
117
|
-
font-style: italic;
|
|
118
|
-
background-color: #f5f5f5;
|
|
119
|
-
white-space: pre-wrap;
|
|
120
|
-
}
|
|
121
|
-
|
|
122
|
-
.taxonomy-table {
|
|
123
|
-
margin: 10px;
|
|
124
|
-
}
|
|
125
|
-
|
|
126
|
-
thead .table-sort-column {
|
|
127
|
-
background-color: #ffe599;
|
|
128
|
-
}
|
|
129
|
-
|
|
130
|
-
tbody .table-sort-column {
|
|
131
|
-
background-color: #fff2cc;
|
|
132
|
-
}
|
|
133
|
-
|
|
134
|
-
.prompt {
|
|
135
|
-
font-style: italic;
|
|
136
|
-
background-color: #f5f5f5;
|
|
137
|
-
white-space: pre-wrap;
|
|
138
|
-
}
|
|
139
|
-
|
|
140
|
-
.plot {
|
|
141
|
-
margin: 15px;
|
|
142
|
-
}
|
|
143
|
-
|
|
144
|
-
.plot img {
|
|
145
|
-
margin: 10px;
|
|
146
|
-
}
|
|
147
|
-
|
|
148
|
-
.plot-caption {
|
|
149
|
-
color: #555;
|
|
150
|
-
font-style: italic;
|
|
151
|
-
margin: 5px;
|
|
152
|
-
}
|
|
153
|
-
|
|
154
|
-
.prediction-text {
|
|
155
|
-
white-space: pre-wrap;
|
|
156
|
-
}
|