crfm-helm 0.5.3__py3-none-any.whl → 0.5.4__py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of crfm-helm might be problematic. Click here for more details.

Files changed (60) hide show
  1. {crfm_helm-0.5.3.dist-info → crfm_helm-0.5.4.dist-info}/METADATA +57 -62
  2. {crfm_helm-0.5.3.dist-info → crfm_helm-0.5.4.dist-info}/RECORD +53 -55
  3. {crfm_helm-0.5.3.dist-info → crfm_helm-0.5.4.dist-info}/WHEEL +1 -1
  4. helm/benchmark/annotation/anthropic_red_team_annotator.py +11 -24
  5. helm/benchmark/annotation/call_center_annotator.py +22 -11
  6. helm/benchmark/annotation/harm_bench_annotator.py +11 -24
  7. helm/benchmark/annotation/live_qa_annotator.py +9 -4
  8. helm/benchmark/annotation/medication_qa_annotator.py +9 -4
  9. helm/benchmark/annotation/model_as_judge.py +70 -19
  10. helm/benchmark/annotation/simple_safety_tests_annotator.py +11 -25
  11. helm/benchmark/annotation/xstest_annotator.py +20 -30
  12. helm/benchmark/metrics/safety_metrics.py +39 -17
  13. helm/benchmark/metrics/unitxt_metrics.py +17 -3
  14. helm/benchmark/metrics/vision_language/image_metrics.py +6 -2
  15. helm/benchmark/presentation/create_plots.py +1 -1
  16. helm/benchmark/presentation/schema.py +3 -0
  17. helm/benchmark/presentation/summarize.py +106 -256
  18. helm/benchmark/presentation/test_summarize.py +145 -3
  19. helm/benchmark/run_expander.py +27 -0
  20. helm/benchmark/run_specs/bhasa_run_specs.py +27 -13
  21. helm/benchmark/run_specs/finance_run_specs.py +6 -2
  22. helm/benchmark/run_specs/vlm_run_specs.py +8 -3
  23. helm/benchmark/scenarios/bhasa_scenario.py +226 -82
  24. helm/benchmark/scenarios/raft_scenario.py +1 -1
  25. helm/benchmark/static/schema_bhasa.yaml +10 -10
  26. helm/benchmark/static/schema_legal.yaml +566 -0
  27. helm/benchmark/static/schema_safety.yaml +25 -6
  28. helm/benchmark/static/schema_tables.yaml +26 -2
  29. helm/benchmark/static/schema_vhelm.yaml +42 -11
  30. helm/benchmark/static_build/assets/index-3ee38b3d.js +10 -0
  31. helm/benchmark/static_build/assets/vhelm-aspects-1437d673.png +0 -0
  32. helm/benchmark/static_build/assets/vhelm-framework-a1ca3f3f.png +0 -0
  33. helm/benchmark/static_build/assets/vhelm-model-8afb7616.png +0 -0
  34. helm/benchmark/static_build/index.html +1 -1
  35. helm/benchmark/window_services/tokenizer_service.py +0 -5
  36. helm/clients/openai_client.py +16 -1
  37. helm/clients/palmyra_client.py +1 -2
  38. helm/clients/together_client.py +22 -0
  39. helm/common/cache.py +8 -30
  40. helm/common/key_value_store.py +9 -9
  41. helm/common/mongo_key_value_store.py +3 -3
  42. helm/common/test_cache.py +1 -48
  43. helm/common/tokenization_request.py +0 -9
  44. helm/config/model_deployments.yaml +135 -3
  45. helm/config/model_metadata.yaml +134 -6
  46. helm/config/tokenizer_configs.yaml +24 -0
  47. helm/proxy/server.py +0 -9
  48. helm/proxy/services/remote_service.py +0 -6
  49. helm/proxy/services/server_service.py +5 -18
  50. helm/proxy/services/service.py +0 -6
  51. helm/benchmark/data_overlap/__init__.py +0 -0
  52. helm/benchmark/data_overlap/data_overlap_spec.py +0 -86
  53. helm/benchmark/data_overlap/export_scenario_text.py +0 -119
  54. helm/benchmark/data_overlap/light_scenario.py +0 -60
  55. helm/benchmark/static_build/assets/index-58f97dcd.js +0 -10
  56. helm/benchmark/static_build/assets/vhelm-framework-cde7618a.png +0 -0
  57. helm/benchmark/static_build/assets/vhelm-model-6d812526.png +0 -0
  58. {crfm_helm-0.5.3.dist-info → crfm_helm-0.5.4.dist-info}/LICENSE +0 -0
  59. {crfm_helm-0.5.3.dist-info → crfm_helm-0.5.4.dist-info}/entry_points.txt +0 -0
  60. {crfm_helm-0.5.3.dist-info → crfm_helm-0.5.4.dist-info}/top_level.txt +0 -0
@@ -317,6 +317,14 @@ tokenizer_configs:
317
317
  prefix_token: "<|begin_of_text|>"
318
318
  end_of_text_token: "<|end_of_text|>"
319
319
 
320
+ - name: meta/llama-3-8b-instruct
321
+ tokenizer_spec:
322
+ class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
323
+ args:
324
+ pretrained_model_name_or_path: meta-llama/Meta-Llama-3.1-8B-Instruct
325
+ prefix_token: "<|begin_of_text|>"
326
+ end_of_text_token: "<|eot_id|>"
327
+
320
328
  - name: meta/llama-3.1-8b
321
329
  tokenizer_spec:
322
330
  class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
@@ -325,6 +333,22 @@ tokenizer_configs:
325
333
  prefix_token: "<|begin_of_text|>"
326
334
  end_of_text_token: "<|end_of_text|>"
327
335
 
336
+ - name: meta/llama-3.2-3b-instruct
337
+ tokenizer_spec:
338
+ class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
339
+ args:
340
+ pretrained_model_name_or_path: meta-llama/Llama-3.2-3B-Instruct
341
+ prefix_token: "<|begin_of_text|>"
342
+ end_of_text_token: "<|eot_id|>"
343
+
344
+ - name: meta/llama-3.2-11b-vision-instruct
345
+ tokenizer_spec:
346
+ class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
347
+ args:
348
+ pretrained_model_name_or_path: meta-llama/Llama-3.2-11B-Vision-Instruct
349
+ prefix_token: "<|begin_of_text|>"
350
+ end_of_text_token: "<|eot_id|>"
351
+
328
352
  # 01-ai
329
353
  - name: 01-ai/Yi-6B
330
354
  tokenizer_spec:
helm/proxy/server.py CHANGED
@@ -106,15 +106,6 @@ def handle_get_general_info():
106
106
  return safe_call(perform)
107
107
 
108
108
 
109
- @app.get("/api/window_service_info")
110
- def handle_get_window_service_info():
111
- def perform(args):
112
- global service
113
- return dataclasses.asdict(service.get_window_service_info(args["model_name"]))
114
-
115
- return safe_call(perform)
116
-
117
-
118
109
  @app.post("/api/account")
119
110
  def handle_create_account():
120
111
  def perform(args):
@@ -15,7 +15,6 @@ from helm.common.file_upload_request import FileUploadRequest, FileUploadResult
15
15
  from helm.common.perspective_api_request import PerspectiveAPIRequest, PerspectiveAPIRequestResult
16
16
  from helm.common.clip_score_request import CLIPScoreRequest, CLIPScoreResult
17
17
  from helm.common.tokenization_request import (
18
- WindowServiceInfo,
19
18
  TokenizationRequest,
20
19
  TokenizationRequestResult,
21
20
  DecodeRequestResult,
@@ -51,11 +50,6 @@ class RemoteService(Service):
51
50
  response = requests.get(f"{self.base_url}/api/general_info").json()
52
51
  return from_dict(GeneralInfo, response)
53
52
 
54
- def get_window_service_info(self, model_name) -> WindowServiceInfo:
55
- params = {"model_name": model_name}
56
- response = requests.get(f"{self.base_url}/api/window_service_info?{urllib.parse.urlencode(params)}").json()
57
- return from_dict(WindowServiceInfo, response)
58
-
59
53
  def expand_query(self, query: Query) -> QueryResult:
60
54
  params = asdict(query)
61
55
  response = requests.get(f"{self.base_url}/api/query?{urllib.parse.urlencode(params)}").json()
@@ -14,7 +14,6 @@ from helm.common.file_upload_request import FileUploadRequest, FileUploadResult
14
14
  from helm.common.general import ensure_directory_exists, parse_hocon, get_credentials
15
15
  from helm.common.perspective_api_request import PerspectiveAPIRequest, PerspectiveAPIRequestResult
16
16
  from helm.common.tokenization_request import (
17
- WindowServiceInfo,
18
17
  TokenizationRequest,
19
18
  TokenizationRequestResult,
20
19
  DecodeRequest,
@@ -85,22 +84,6 @@ class ServerService(Service):
85
84
  all_models = [dataclasses.replace(model_metadata, release_date=None) for model_metadata in ALL_MODELS_METADATA]
86
85
  return GeneralInfo(version=VERSION, example_queries=example_queries, all_models=all_models)
87
86
 
88
- def get_window_service_info(self, model_name) -> WindowServiceInfo:
89
- # The import statement is placed here to avoid two problems, please refer to the link for details
90
- # https://github.com/stanford-crfm/helm/pull/1430#discussion_r1156686624
91
- from helm.benchmark.window_services.tokenizer_service import TokenizerService
92
- from helm.benchmark.window_services.window_service_factory import WindowServiceFactory
93
-
94
- token_service = TokenizerService(self, Authentication(""))
95
- window_service = WindowServiceFactory.get_window_service(model_name, token_service)
96
- return WindowServiceInfo(
97
- tokenizer_name=window_service.tokenizer_name,
98
- max_sequence_length=window_service.max_sequence_length,
99
- max_request_length=window_service.max_request_length,
100
- end_of_text_token=window_service.end_of_text_token,
101
- prefix_token=window_service.prefix_token,
102
- )
103
-
104
87
  def expand_query(self, query: Query) -> QueryResult:
105
88
  """Turn the `query` into requests."""
106
89
  prompt = query.prompt
@@ -120,8 +103,12 @@ class ServerService(Service):
120
103
  return "dall_e"
121
104
  elif model_deployment.startswith("openai/gpt-4"):
122
105
  return "gpt4"
123
- else:
106
+ elif model_deployment.startswith("openai/gpt-3"):
124
107
  return "gpt3"
108
+ elif model_deployment.startswith("openai/o1"):
109
+ return "o1"
110
+ else:
111
+ return "openai"
125
112
  elif model_deployment.startswith("ai21/"):
126
113
  return "jurassic"
127
114
  else:
@@ -11,7 +11,6 @@ from helm.common.nudity_check_request import NudityCheckRequest, NudityCheckResu
11
11
  from helm.common.perspective_api_request import PerspectiveAPIRequestResult, PerspectiveAPIRequest
12
12
  from helm.common.moderations_api_request import ModerationAPIRequest, ModerationAPIRequestResult
13
13
  from helm.common.tokenization_request import (
14
- WindowServiceInfo,
15
14
  TokenizationRequest,
16
15
  TokenizationRequestResult,
17
16
  DecodeRequest,
@@ -85,11 +84,6 @@ class Service(ABC):
85
84
  """Get general info."""
86
85
  pass
87
86
 
88
- @abstractmethod
89
- def get_window_service_info(self, model_name: str) -> WindowServiceInfo:
90
- """Get window service info."""
91
- pass
92
-
93
87
  @abstractmethod
94
88
  def expand_query(self, query: Query) -> QueryResult:
95
89
  """Turn the `query` into requests."""
File without changes
@@ -1,86 +0,0 @@
1
- from dataclasses import dataclass
2
- from typing import List, Tuple
3
-
4
- try:
5
- from light_scenario import LightScenarioKey
6
- except Exception:
7
- from helm.benchmark.data_overlap.light_scenario import LightScenarioKey
8
-
9
-
10
- @dataclass(frozen=True)
11
- class GroupOverlapStats:
12
- """
13
- Dataclass that represents group data overlap stats
14
- e.g.
15
- {
16
- "group": "natural_qa_closedbook",
17
- "num_instances": 2144,
18
- "num_overlapping_inputs": 1,
19
- "num_overlapping_references": 100
20
- }
21
- """
22
-
23
- group: str
24
-
25
- num_instances: int
26
-
27
- num_overlapping_inputs: int
28
-
29
- num_overlapping_references: int
30
-
31
- @property
32
- def overlapping_input_ratio(self):
33
- return self.num_overlapping_inputs / self.num_instances
34
-
35
- @property
36
- def overlapping_reference_ratio(self):
37
- return self.num_overlapping_references / self.num_instances
38
-
39
-
40
- @dataclass(frozen=True)
41
- class OverlapProtocolSpec:
42
- """Specification for how we compute overlap"""
43
-
44
- # the N of the n_grams we're running
45
- n: int
46
-
47
-
48
- @dataclass(frozen=True)
49
- class DataOverlapStatsKey:
50
- """Dataclass that represents output data overlap stats"""
51
-
52
- light_scenario_key: LightScenarioKey
53
-
54
- overlap_protocol_spec: OverlapProtocolSpec
55
-
56
-
57
- @dataclass(frozen=True)
58
- class DataOverlapStats:
59
- """Dataclass that represents output data overlap stats"""
60
-
61
- data_overlap_stats_key: DataOverlapStatsKey
62
-
63
- num_instances: int
64
-
65
- instance_ids_with_overlapping_input: List[str]
66
-
67
- instance_ids_with_overlapping_reference: List[str]
68
-
69
-
70
- @dataclass(frozen=True)
71
- class EntryDataOverlapKey:
72
- """Unique key representing either the input or references of a single instance in a scenario."""
73
-
74
- stats_key: DataOverlapStatsKey
75
- part: str
76
- """Either PART_INPUT or PART_REF"""
77
- instance_id: str
78
-
79
-
80
- @dataclass(frozen=True)
81
- class EntryOverlapNgrams:
82
- """Dataclass that represents output data overlap stats"""
83
-
84
- entry_data_overlap_key: EntryDataOverlapKey
85
-
86
- overlapping_ngram_counts: List[Tuple[str, int]]
@@ -1,119 +0,0 @@
1
- import json
2
- import os
3
- import argparse
4
- from typing import List, DefaultDict, Set
5
- from collections import defaultdict
6
-
7
- from helm.common.general import asdict_without_nones, ensure_directory_exists
8
- from helm.common.hierarchical_logger import hlog, htrack_block
9
-
10
- from helm.benchmark.scenarios.scenario import (
11
- Scenario,
12
- Instance,
13
- create_scenario,
14
- TRAIN_SPLIT,
15
- VALID_SPLIT,
16
- TEST_SPLIT,
17
- ScenarioSpec,
18
- with_instance_ids,
19
- )
20
- from helm.benchmark.presentation.run_entry import read_run_entries
21
- from helm.benchmark.run import run_entries_to_run_specs
22
- from helm.benchmark.data_overlap.light_scenario import LightInstance, LightScenario, LightScenarioKey
23
-
24
-
25
- def create_light_instance_from_instance(instance: Instance) -> LightInstance:
26
- """Create a LightInstance given an Instance. Only keep the text attributes."""
27
- input_text: str = instance.input.text
28
- reference_texts: List[str] = [reference.output.text for reference in instance.references]
29
- return LightInstance(input=input_text, references=reference_texts, id=instance.id)
30
-
31
-
32
- def get_light_scenarios_from_scenario_spec(
33
- scenario_spec: ScenarioSpec, scenario_download_path: str = "exported_scenarios"
34
- ) -> List[LightScenario]:
35
- """
36
- Create a list of LightInstances given a ScenarioSpec. Only keep the text of the input and references.
37
- Note that one LightScenario object is created for each split of the Scenario for simplification.
38
- """
39
-
40
- scenario: Scenario = create_scenario(scenario_spec)
41
-
42
- ensure_directory_exists(scenario_download_path)
43
- scenario_output_path = os.path.join(scenario_download_path, scenario.name)
44
- ensure_directory_exists(scenario_output_path)
45
-
46
- # Load instances
47
- instances: List[Instance]
48
- with htrack_block("scenario.get_instances"):
49
- instances = scenario.get_instances(scenario_output_path)
50
-
51
- # Get instance ids
52
- instances = with_instance_ids(instances)
53
-
54
- # Classify instances into splits
55
- splits: List[str] = [TRAIN_SPLIT, VALID_SPLIT, TEST_SPLIT]
56
- split_mapping: DefaultDict[str, list] = defaultdict(list)
57
- for instance in instances:
58
- if instance.split is None or instance.split not in splits:
59
- raise ValueError(
60
- f"split should be one of {TRAIN_SPLIT}, {VALID_SPLIT}, or {TEST_SPLIT}, but got {instance.split}"
61
- )
62
- split_mapping[instance.split].append(instance)
63
-
64
- # Convert Scenarios to LightScenarios
65
- light_scenarios: List[LightScenario] = []
66
- for split, instances in split_mapping.items():
67
- light_instances: List[LightInstance] = [create_light_instance_from_instance(instance) for instance in instances]
68
- light_scenario_key: LightScenarioKey = LightScenarioKey(
69
- scenario_spec=scenario_spec,
70
- split=split,
71
- )
72
- light_scenario = LightScenario(
73
- scenario_key=light_scenario_key,
74
- instances=light_instances,
75
- )
76
- light_scenarios.append(light_scenario)
77
- return light_scenarios
78
-
79
-
80
- def save_scenarios_to_jsonl(light_scenarios: List[LightScenario], filename: str):
81
- """
82
- Save a list of LightInstance to a jsonl file where each line represents a LightScenario object.
83
- """
84
- with open(filename, "a") as f:
85
- for light_scenario in light_scenarios:
86
- f.write(json.dumps(asdict_without_nones(light_scenario), ensure_ascii=False) + "\n")
87
-
88
-
89
- if __name__ == "__main__":
90
- parser = argparse.ArgumentParser()
91
- parser.add_argument("--run-specs", nargs="+", required=True, help="Specifies what to export")
92
- parser.add_argument("--output-data", type=str, required=True, help="The path to the output file")
93
- args = parser.parse_args()
94
-
95
- hlog("Loading run_specs")
96
- run_entries = read_run_entries(args.run_specs).entries
97
- run_specs = run_entries_to_run_specs(
98
- run_entries=run_entries,
99
- priority=4,
100
- )
101
-
102
- try:
103
- os.remove(args.output_data)
104
- except OSError:
105
- pass
106
-
107
- scenario_specs: Set = set()
108
- for run_spec in run_specs:
109
- scenario_spec = run_spec.scenario_spec
110
- if (
111
- scenario_spec.class_name
112
- != "helm.benchmark.scenarios.synthetic_efficiency_scenario.SyntheticEfficiencyScenario"
113
- ):
114
- scenario_specs.add(scenario_spec)
115
-
116
- hlog("Generating light scenarios from scenarios")
117
- for scenario_spec in scenario_specs:
118
- light_scenarios: List[LightScenario] = get_light_scenarios_from_scenario_spec(scenario_spec)
119
- save_scenarios_to_jsonl(light_scenarios, args.output_data)
@@ -1,60 +0,0 @@
1
- from dataclasses import dataclass
2
- from typing import List, Optional
3
-
4
- try:
5
- from scenarios.scenario import ScenarioSpec
6
- except Exception:
7
- from helm.benchmark.scenarios.scenario import ScenarioSpec
8
-
9
-
10
- @dataclass(frozen=True)
11
- class LightInstance:
12
- """
13
- A lighter `Instance` with only text fields.
14
- """
15
-
16
- input: str
17
- """The input"""
18
-
19
- references: List[str]
20
- """References that help us evaluate"""
21
-
22
- id: Optional[str] = None
23
- """Helm instance id"""
24
-
25
-
26
- @dataclass(frozen=True)
27
- class LightScenarioKey:
28
- """
29
- Key for LightScenario
30
- """
31
-
32
- scenario_spec: ScenarioSpec
33
-
34
- split: str
35
-
36
- def __hash__(self):
37
- return hash((self.scenario_spec, self.split))
38
-
39
-
40
- @dataclass(frozen=True)
41
- class LightScenario:
42
- """
43
- A lighter `Scenario`.
44
- """
45
-
46
- scenario_key: LightScenarioKey
47
-
48
- instances: List[LightInstance]
49
- """Instances of this scenario"""
50
-
51
-
52
- @dataclass(frozen=True)
53
- class ScenarioSpecInstanceIds:
54
- """
55
- Instance ids associated with a scenario
56
- """
57
-
58
- scenario_spec: ScenarioSpec
59
-
60
- instance_ids: List[str]