crfm-helm 0.5.3__py3-none-any.whl → 0.5.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of crfm-helm might be problematic.
Files changed (60)
  1. {crfm_helm-0.5.3.dist-info → crfm_helm-0.5.4.dist-info}/METADATA +57 -62
  2. {crfm_helm-0.5.3.dist-info → crfm_helm-0.5.4.dist-info}/RECORD +53 -55
  3. {crfm_helm-0.5.3.dist-info → crfm_helm-0.5.4.dist-info}/WHEEL +1 -1
  4. helm/benchmark/annotation/anthropic_red_team_annotator.py +11 -24
  5. helm/benchmark/annotation/call_center_annotator.py +22 -11
  6. helm/benchmark/annotation/harm_bench_annotator.py +11 -24
  7. helm/benchmark/annotation/live_qa_annotator.py +9 -4
  8. helm/benchmark/annotation/medication_qa_annotator.py +9 -4
  9. helm/benchmark/annotation/model_as_judge.py +70 -19
  10. helm/benchmark/annotation/simple_safety_tests_annotator.py +11 -25
  11. helm/benchmark/annotation/xstest_annotator.py +20 -30
  12. helm/benchmark/metrics/safety_metrics.py +39 -17
  13. helm/benchmark/metrics/unitxt_metrics.py +17 -3
  14. helm/benchmark/metrics/vision_language/image_metrics.py +6 -2
  15. helm/benchmark/presentation/create_plots.py +1 -1
  16. helm/benchmark/presentation/schema.py +3 -0
  17. helm/benchmark/presentation/summarize.py +106 -256
  18. helm/benchmark/presentation/test_summarize.py +145 -3
  19. helm/benchmark/run_expander.py +27 -0
  20. helm/benchmark/run_specs/bhasa_run_specs.py +27 -13
  21. helm/benchmark/run_specs/finance_run_specs.py +6 -2
  22. helm/benchmark/run_specs/vlm_run_specs.py +8 -3
  23. helm/benchmark/scenarios/bhasa_scenario.py +226 -82
  24. helm/benchmark/scenarios/raft_scenario.py +1 -1
  25. helm/benchmark/static/schema_bhasa.yaml +10 -10
  26. helm/benchmark/static/schema_legal.yaml +566 -0
  27. helm/benchmark/static/schema_safety.yaml +25 -6
  28. helm/benchmark/static/schema_tables.yaml +26 -2
  29. helm/benchmark/static/schema_vhelm.yaml +42 -11
  30. helm/benchmark/static_build/assets/index-3ee38b3d.js +10 -0
  31. helm/benchmark/static_build/assets/vhelm-aspects-1437d673.png +0 -0
  32. helm/benchmark/static_build/assets/vhelm-framework-a1ca3f3f.png +0 -0
  33. helm/benchmark/static_build/assets/vhelm-model-8afb7616.png +0 -0
  34. helm/benchmark/static_build/index.html +1 -1
  35. helm/benchmark/window_services/tokenizer_service.py +0 -5
  36. helm/clients/openai_client.py +16 -1
  37. helm/clients/palmyra_client.py +1 -2
  38. helm/clients/together_client.py +22 -0
  39. helm/common/cache.py +8 -30
  40. helm/common/key_value_store.py +9 -9
  41. helm/common/mongo_key_value_store.py +3 -3
  42. helm/common/test_cache.py +1 -48
  43. helm/common/tokenization_request.py +0 -9
  44. helm/config/model_deployments.yaml +135 -3
  45. helm/config/model_metadata.yaml +134 -6
  46. helm/config/tokenizer_configs.yaml +24 -0
  47. helm/proxy/server.py +0 -9
  48. helm/proxy/services/remote_service.py +0 -6
  49. helm/proxy/services/server_service.py +5 -18
  50. helm/proxy/services/service.py +0 -6
  51. helm/benchmark/data_overlap/__init__.py +0 -0
  52. helm/benchmark/data_overlap/data_overlap_spec.py +0 -86
  53. helm/benchmark/data_overlap/export_scenario_text.py +0 -119
  54. helm/benchmark/data_overlap/light_scenario.py +0 -60
  55. helm/benchmark/static_build/assets/index-58f97dcd.js +0 -10
  56. helm/benchmark/static_build/assets/vhelm-framework-cde7618a.png +0 -0
  57. helm/benchmark/static_build/assets/vhelm-model-6d812526.png +0 -0
  58. {crfm_helm-0.5.3.dist-info → crfm_helm-0.5.4.dist-info}/LICENSE +0 -0
  59. {crfm_helm-0.5.3.dist-info → crfm_helm-0.5.4.dist-info}/entry_points.txt +0 -0
  60. {crfm_helm-0.5.3.dist-info → crfm_helm-0.5.4.dist-info}/top_level.txt +0 -0
helm/clients/together_client.py CHANGED
@@ -7,6 +7,7 @@ import requests
 from retrying import retry

 from helm.common.cache import CacheConfig
+from helm.common.media_object import IMAGE_TYPE, TEXT_TYPE
 from helm.common.optional_dependencies import handle_module_not_found_error
 from helm.common.request import wrap_request_time, Request, RequestResult, GeneratedOutput, Token
 from helm.clients.client import CachingClient, truncate_sequence, cleanup_str
@@ -323,8 +324,29 @@ class TogetherChatClient(CachingClient):
         self._together_model = together_model

     def convert_to_raw_chat_request(self, request: Request) -> TogetherRawChatRequest:
+        request.validate()
+        messages: List[Dict[str, Any]]
         if request.messages:
             messages = request.messages
+        elif request.multimodal_prompt:
+            message_contents = []
+            for media_object in request.multimodal_prompt.media_objects:
+                if media_object.is_type(IMAGE_TYPE) and media_object.location:
+                    assert media_object.location
+                    if media_object.is_local_file:
+                        from helm.common.images_utils import encode_base64
+
+                        base64_image: str = encode_base64(media_object.location)
+                        image_url = f"data:image/jpeg;base64,{base64_image}"
+                    else:
+                        image_url = media_object.location
+                    message_contents.append({"type": "image_url", "image_url": {"url": image_url}})
+                elif media_object.is_type(TEXT_TYPE):
+                    assert media_object.text
+                    message_contents.append({"type": "text", "text": media_object.text})
+                else:
+                    raise ValueError(f"Unrecognized MediaObject type {media_object.type}")
+            messages = [{"role": "user", "content": message_contents}]
         else:
             messages = [{"role": "user", "content": request.prompt}]
         if self._together_model is not None:
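The new elif request.multimodal_prompt branch flattens HELM media objects into Together's OpenAI-style content parts. A minimal sketch of the round trip, assuming the MediaObject/MultimediaObject helpers from helm.common.media_object (field names follow the diff above; the model name is illustrative):

from helm.common.media_object import MediaObject, MultimediaObject
from helm.common.request import Request

request = Request(
    model="meta/llama-3.2-11b-vision-instruct-turbo",  # illustrative deployment
    multimodal_prompt=MultimediaObject(
        media_objects=[
            MediaObject(content_type="image/jpeg", location="https://example.com/cat.jpg"),
            MediaObject(content_type="text/plain", text="What is in this picture?"),
        ]
    ),
)
# convert_to_raw_chat_request(request) should now produce one user message:
# {"role": "user", "content": [
#     {"type": "image_url", "image_url": {"url": "https://example.com/cat.jpg"}},
#     {"type": "text", "text": "What is in this picture?"}]}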
helm/common/cache.py CHANGED
@@ -1,6 +1,6 @@
 from collections import defaultdict
 from dataclasses import dataclass
-from typing import Dict, Callable, Generator, Mapping, Optional, Tuple
+from typing import Dict, Callable, Generator, Mapping, Tuple
 import json
 import threading

@@ -38,6 +38,12 @@ class CacheConfig:
 class KeyValueStoreCacheConfig(CacheConfig):
     """Configuration for a cache backed by a key-value store."""

+    # This was originally to distinguish between "primitive" cache configs
+    # and "compound" cache configs. But we don't have any "compound" cache configs currently.
+    # Hypothetical "compound" example: ReadOnlyCacheConfig(SqliteCacheConfig("path"))
+    # TODO: Maybe remove this eventually?
+    pass
+

 @dataclass(frozen=True)
 class SqliteCacheConfig(KeyValueStoreCacheConfig):
@@ -78,24 +84,6 @@ class MongoCacheConfig(KeyValueStoreCacheConfig):
         return f"{self.uri}/{self.collection_name}"


-@dataclass(frozen=True)
-class WithFollowerCacheConfig(CacheConfig):
-    """Configuration of a cache backed by a main cache and a follower cache."""
-
-    # Configuration for the main cache.
-    # Responses will be written to and served out of this cache.
-    main: KeyValueStoreCacheConfig
-
-    # Configuration for the follower cache.
-    # The follower cache is a write-only cache. Responses will be written to this cache,
-    # but not served out of this cache.
-    follower: KeyValueStoreCacheConfig
-
-    @property
-    def cache_stats_key(self) -> str:
-        return self.main.cache_stats_key
-
-
 def get_all_from_sqlite(path: str) -> Generator[Tuple[Dict, Dict], None, None]:
     """Yields all decoded key, value pairs from the SQLite cache.

@@ -126,7 +114,7 @@ def create_key_value_store(config: KeyValueStoreCacheConfig) -> KeyValueStore:
     elif isinstance(config, BlackHoleCacheConfig):
         return BlackHoleKeyValueStore()
     else:
-        raise ValueError(f"KeyValueStoreCacheConfig with unknown type: {config}")
+        raise ValueError(f"CacheConfig with unknown type: {config}")


 @retry
@@ -189,14 +177,8 @@ class Cache(object):

     def __init__(self, config: CacheConfig):
         hlog(f"Created cache with config: {config}")
-        self.config: KeyValueStoreCacheConfig
-        self.follower_config: Optional[KeyValueStoreCacheConfig]
         if isinstance(config, KeyValueStoreCacheConfig):
             self.config = config
-            self.follower_config = None
-        elif isinstance(config, WithFollowerCacheConfig):
-            self.config = config.main
-            self.follower_config = config.follower
         else:
             raise ValueError(f"CacheConfig with unknown type: {config}")

@@ -216,8 +198,4 @@ class Cache(object):
                 response = compute()

             write_to_key_value_store(key_value_store, request, response)
-        if self.follower_config is not None:
-            # TODO: Initialize follower_key_value_store in constructor
-            with create_key_value_store(self.follower_config) as follower_key_value_store:
-                write_to_key_value_store(follower_key_value_store, request, response)
         return response, cached
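With WithFollowerCacheConfig removed, Cache now only accepts a KeyValueStoreCacheConfig, so any mirroring of writes to a secondary store has to happen outside Cache itself. A minimal sketch of the surviving construction path, grounded in the test code below (the cache path is illustrative):

from helm.common.cache import Cache, SqliteCacheConfig

cache = Cache(SqliteCacheConfig("prod_env/cache/together.sqlite"))  # illustrative path
response, cached = cache.get({"name": "request1"}, lambda: {"response": "response1"})
assert response == {"response": "response1"} and not cached  # computed on first call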
helm/common/key_value_store.py CHANGED
@@ -15,11 +15,11 @@ class KeyValueStore(contextlib.AbstractContextManager):
     """Key value store that persists writes."""

     @abstractmethod
-    def contains(self, key: Dict) -> bool:
+    def contains(self, key: Mapping) -> bool:
         pass

     @abstractmethod
-    def get(self, key: Dict) -> Optional[Dict]:
+    def get(self, key: Mapping) -> Optional[Dict]:
         pass

     @abstractmethod
@@ -35,7 +35,7 @@ class KeyValueStore(contextlib.AbstractContextManager):
         pass

     @abstractmethod
-    def remove(self, key: Dict) -> None:
+    def remove(self, key: Mapping) -> None:
         pass


@@ -53,10 +53,10 @@ class SqliteKeyValueStore(KeyValueStore):
     def __exit__(self, exc_type, exc_value, traceback) -> None:
         self._sqlite_dict.__exit__(exc_type, exc_value, traceback)

-    def contains(self, key: Dict) -> bool:
+    def contains(self, key: Mapping) -> bool:
         return request_to_key(key) in self._sqlite_dict

-    def get(self, key: Dict) -> Optional[Dict]:
+    def get(self, key: Mapping) -> Optional[Dict]:
         key_string = request_to_key(key)
         result = self._sqlite_dict.get(key_string)
         if result is not None:
@@ -77,7 +77,7 @@ class SqliteKeyValueStore(KeyValueStore):
         for key, value in pairs:
             self.put(key, value)

-    def remove(self, key: Dict) -> None:
+    def remove(self, key: Mapping) -> None:
         del self._sqlite_dict[key]
         self._sqlite_dict.commit()

@@ -91,10 +91,10 @@ class BlackHoleKeyValueStore(KeyValueStore):
     def __exit__(self, exc_type, exc_value, traceback) -> None:
         pass

-    def contains(self, key: Dict) -> bool:
+    def contains(self, key: Mapping) -> bool:
         return False

-    def get(self, key: Dict) -> Optional[Dict]:
+    def get(self, key: Mapping) -> Optional[Dict]:
         return None

     def get_all(self) -> Generator[Tuple[Dict, Dict], None, None]:
@@ -109,5 +109,5 @@ class BlackHoleKeyValueStore(KeyValueStore):
     def multi_put(self, pairs: Iterable[Tuple[Dict, Dict]]) -> None:
         return None

-    def remove(self, key: Dict) -> None:
+    def remove(self, key: Mapping) -> None:
         return None
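The Dict to Mapping loosening across these stores is purely a type-contract change: keys only need read access, so read-only mappings are now acceptable. A small sketch of why that matters; the request_to_key function here mirrors, but is not, HELM's own serializer:

import json
from types import MappingProxyType
from typing import Mapping

def request_to_key(key: Mapping) -> str:
    # Keys are serialized deterministically for lookup; dict() copies the
    # read-only view because json.dumps only accepts real dicts.
    return json.dumps(dict(key), sort_keys=True)

frozen_key: Mapping = MappingProxyType({"prompt": "hello", "temperature": 0.0})
print(request_to_key(frozen_key))  # {"prompt": "hello", "temperature": 0.0}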
helm/common/mongo_key_value_store.py CHANGED
@@ -39,11 +39,11 @@ class MongoKeyValueStore(KeyValueStore):
         serialized = json.dumps(key, sort_keys=True)
         return json.loads(serialized, object_pairs_hook=SON)

-    def contains(self, key: Dict) -> bool:
+    def contains(self, key: Mapping) -> bool:
         query = {self._REQUEST_KEY: self._canonicalize_key(key)}
         return self._collection.find_one(query) is not None

-    def get(self, key: Dict) -> Optional[Dict]:
+    def get(self, key: Mapping) -> Optional[Dict]:
         query = {self._REQUEST_KEY: self._canonicalize_key(key)}
         document = self._collection.find_one(query)
         if document is not None:
@@ -84,6 +84,6 @@ class MongoKeyValueStore(KeyValueStore):
         # Note: unlike put, multi_put does not support documents with null bytes in keys.
         self._collection.bulk_write(operations)

-    def remove(self, key: Dict) -> None:
+    def remove(self, key: Mapping) -> None:
         query = {self._REQUEST_KEY: self._canonicalize_key(key)}
         self._collection.delete_one(query)
helm/common/test_cache.py CHANGED
@@ -3,9 +3,7 @@ import tempfile
 import unittest
 import threading

-from helm.common.cache import Cache, SqliteCacheConfig, WithFollowerCacheConfig, cache_stats, get_all_from_sqlite
-
-from sqlitedict import SqliteDict
+from helm.common.cache import Cache, SqliteCacheConfig, cache_stats, get_all_from_sqlite


 class TestCache(unittest.TestCase):
@@ -99,51 +97,6 @@ class TestCache(unittest.TestCase):
         assert cache_stats.num_computes[self.cache_path] >= num_items
         assert cache_stats.num_computes[self.cache_path] <= num_items * num_threads

-    def test_follower(self):
-        cache = Cache(SqliteCacheConfig(self.cache_path))
-        request_1 = {"name": "request1"}
-        compute_1 = lambda: {"response": "response1"}
-
-        response, cached = cache.get(request_1, compute_1)
-        assert response == {"response": "response1"}
-        assert not cached
-        assert cache_stats.num_queries[self.cache_path] == 1
-        assert cache_stats.num_computes[self.cache_path] == 1
-
-        follower_cache_file = tempfile.NamedTemporaryFile(delete=False)
-        follower_cache_path = follower_cache_file.name
-        with follower_cache_file:
-            cache_with_follower_config = WithFollowerCacheConfig(
-                main=SqliteCacheConfig(self.cache_path),
-                follower=SqliteCacheConfig(follower_cache_path),
-            )
-            cache_with_follower = Cache(cache_with_follower_config)
-
-            response, cached = cache_with_follower.get(request_1, compute_1)
-            assert response == {"response": "response1"}
-            assert cached
-            assert cache_stats.num_queries[self.cache_path] == 2
-            assert cache_stats.num_computes[self.cache_path] == 1
-            assert cache_stats.num_queries[follower_cache_path] == 0
-            assert cache_stats.num_computes[follower_cache_path] == 0
-
-            request_2 = {"name": "request2"}
-            compute_2 = lambda: {"response": "response2"}
-
-            response, cached = cache_with_follower.get(request_2, compute_2)
-            assert response == {"response": "response2"}
-            assert not cached
-            assert cache_stats.num_queries[self.cache_path] == 3
-            assert cache_stats.num_computes[self.cache_path] == 2
-            assert cache_stats.num_queries[follower_cache_path] == 0
-            assert cache_stats.num_computes[follower_cache_path] == 0
-
-            expected_dict = {
-                '{"name": "request1"}': {"response": "response1"},
-                '{"name": "request2"}': {"response": "response2"},
-            }
-            self.assertCountEqual(SqliteDict(follower_cache_path).items(), expected_dict.items())
-
     def test_get_all_from_sqlite(self):
         cache = Cache(SqliteCacheConfig(self.cache_path))
         num_items = 10  # TODO: Increase to 100
helm/common/tokenization_request.py CHANGED
@@ -2,15 +2,6 @@ from dataclasses import dataclass
 from typing import List, Optional, Union


-@dataclass(frozen=True)
-class WindowServiceInfo:
-    tokenizer_name: str
-    max_sequence_length: int
-    max_request_length: int
-    end_of_text_token: str
-    prefix_token: str
-
-
 @dataclass(frozen=True)
 class TokenizationRequest:
     """A `TokenizationRequest` specifies how to tokenize some text."""
helm/config/model_deployments.yaml CHANGED
@@ -626,6 +626,26 @@ model_deployments:
       args:
        trust_remote_code: true

+  - name: huggingface/llama3-8b-cpt-sea-lionv2-base
+    model_name: aisingapore/llama3-8b-cpt-sea-lionv2-base
+    tokenizer_name: meta/llama-3-8b-instruct
+    max_sequence_length: 8192
+    client_spec:
+      class_name: "helm.clients.huggingface_client.HuggingFaceClient"
+      args:
+        device_map: auto
+        torch_dtype: torch.bfloat16
+
+  - name: huggingface/llama3-8b-cpt-sea-lionv2.1-instruct
+    model_name: aisingapore/llama3-8b-cpt-sea-lionv2.1-instruct
+    tokenizer_name: meta/llama-3-8b-instruct
+    max_sequence_length: 8192
+    client_spec:
+      class_name: "helm.clients.huggingface_client.HuggingFaceClient"
+      args:
+        device_map: auto
+        torch_dtype: torch.bfloat16
+
   ## Bigcode
   - name: huggingface/santacoder
     model_name: bigcode/santacoder
@@ -1641,6 +1661,21 @@ model_deployments:
     client_spec:
       class_name: "helm.clients.openai_client.OpenAIClient"

+  ## o1 Models
+  - name: openai/o1-preview-2024-09-12
+    model_name: openai/o1-preview-2024-09-12
+    tokenizer_name: openai/cl100k_base
+    max_sequence_length: 128000
+    client_spec:
+      class_name: "helm.clients.openai_client.OpenAIClient"
+
+  - name: openai/o1-mini-2024-09-12
+    model_name: openai/o1-mini-2024-09-12
+    tokenizer_name: openai/cl100k_base
+    max_sequence_length: 128000
+    client_spec:
+      class_name: "helm.clients.openai_client.OpenAIClient"
+
   ## Text Similarity Models
   # OpenAI similarity embedding models: https://beta.openai.com/docs/guides/embeddings
   # The number of parameters is guessed based on the number of parameters of the
@@ -1831,7 +1866,25 @@ model_deployments:
     client_spec:
       class_name: "helm.clients.together_client.TogetherClient"
       args:
-        together_model: meta-llama/Meta-Llama-3-8B
+        together_model: meta-llama/Llama-3-8b-hf
+
+  - name: together/llama-3-8b-instruct-turbo
+    model_name: meta/llama-3-8b-instruct-turbo
+    tokenizer_name: meta/llama-3-8b
+    max_sequence_length: 8191
+    client_spec:
+      class_name: "helm.clients.together_client.TogetherClient"
+      args:
+        together_model: meta-llama/Meta-Llama-3-8B-Instruct-Turbo
+
+  - name: together/llama-3-8b-instruct-lite
+    model_name: meta/llama-3-8b-instruct-lite
+    tokenizer_name: meta/llama-3-8b
+    max_sequence_length: 8191
+    client_spec:
+      class_name: "helm.clients.together_client.TogetherClient"
+      args:
+        together_model: meta-llama/Meta-Llama-3-8B-Instruct-Lite

   - name: together/llama-3-70b
     model_name: meta/llama-3-70b
@@ -1842,6 +1895,24 @@ model_deployments:
       args:
         together_model: meta-llama/Meta-Llama-3-70B

+  - name: together/llama-3-70b-instruct-turbo
+    model_name: meta/llama-3-70b-instruct-turbo
+    tokenizer_name: meta/llama-3-8b
+    max_sequence_length: 8191
+    client_spec:
+      class_name: "helm.clients.together_client.TogetherClient"
+      args:
+        together_model: meta-llama/Meta-Llama-3-70B-Instruct-Turbo
+
+  - name: together/llama-3-70b-instruct-lite
+    model_name: meta/llama-3-70b-instruct-lite
+    tokenizer_name: meta/llama-3-8b
+    max_sequence_length: 8191
+    client_spec:
+      class_name: "helm.clients.together_client.TogetherClient"
+      args:
+        together_model: meta-llama/Meta-Llama-3-70B-Instruct-Lite
+
   - name: together/llama-3.1-8b-instruct-turbo
     model_name: meta/llama-3.1-8b-instruct-turbo
     tokenizer_name: meta/llama-3.1-8b
@@ -1871,7 +1942,7 @@ model_deployments:

   - name: together/llama-3-8b-chat
     model_name: meta/llama-3-8b-chat
-    tokenizer_name: meta/llama-3-8b
+    tokenizer_name: meta/llama-3-8b-instruct
     max_sequence_length: 8182
     client_spec:
       class_name: "helm.clients.together_client.TogetherChatClient"
@@ -1880,13 +1951,40 @@ model_deployments:

   - name: together/llama-3-70b-chat
     model_name: meta/llama-3-70b-chat
-    tokenizer_name: meta/llama-3-8b
+    tokenizer_name: meta/llama-3-8b-instruct
     max_sequence_length: 8182
     client_spec:
       class_name: "helm.clients.together_client.TogetherChatClient"
       args:
         together_model: meta-llama/Llama-3-70b-chat-hf

+  - name: together/llama-3.2-3b-instruct-turbo
+    model_name: meta/llama-3.2-3b-instruct-turbo
+    tokenizer_name: meta/llama-3.2-3b-instruct
+    max_sequence_length: 128000
+    client_spec:
+      class_name: "helm.clients.together_client.TogetherChatClient"
+      args:
+        together_model: meta-llama/Llama-3.2-3B-Instruct-Turbo
+
+  - name: together/llama-3.2-11b-vision-instruct-turbo
+    model_name: meta/llama-3.2-11b-vision-instruct-turbo
+    tokenizer_name: meta/llama-3.2-11b-vision-instruct
+    max_sequence_length: 128000
+    client_spec:
+      class_name: "helm.clients.together_client.TogetherChatClient"
+      args:
+        together_model: meta-llama/Llama-3.2-11B-Vision-Instruct-Turbo
+
+  - name: together/llama-3.2-90b-vision-instruct-turbo
+    model_name: meta/llama-3.2-90b-vision-instruct-turbo
+    tokenizer_name: meta/llama-3.2-11b-vision-instruct
+    max_sequence_length: 128000
+    client_spec:
+      class_name: "helm.clients.together_client.TogetherChatClient"
+      args:
+        together_model: meta-llama/Llama-3.2-90B-Vision-Instruct-Turbo
+
   - name: together/llama-guard-7b
     model_name: meta/llama-guard-7b
     tokenizer_name: meta-llama/Llama-2-7b-hf
@@ -2262,6 +2360,40 @@ model_deployments:
     client_spec:
       class_name: "helm.clients.vision_language.palmyra_vision_client.PalmyraVisionClient"

+  - name: writer/palmyra-x-004
+    model_name: writer/palmyra-x-004
+    # The actual tokenizer is Llama 2, but it cannot be used in HELM due to this issue:
+    # https://github.com/stanford-crfm/helm/issues/2467
+    # Work around by using the Llama 3 tokenizer for now.
+    tokenizer_name: meta/llama-3-8b
+    max_sequence_length: 8192
+    client_spec:
+      class_name: "helm.clients.palmyra_client.PalmyraChatClient"
+
+  - name: writer/palmyra-med-32k
+    model_name: writer/palmyra-med-32k
+    # Palmyra-Med uses "<|end_of_text|>" as the end-of-text token, which is used by meta/llama-3-8b,
+    # rather than "<|eot_id|>", which is used by meta/llama-3-8b-instruct.
+    tokenizer_name: meta/llama-3-8b
+    max_sequence_length: 32000
+    client_spec:
+      class_name: "helm.clients.palmyra_client.PalmyraChatClient"
+
+  - name: writer/palmyra-med
+    model_name: writer/palmyra-med
+    # Palmyra-Med uses "<|end_of_text|>" as the end-of-text token, which is used by meta/llama-3-8b,
+    # rather than "<|eot_id|>", which is used by meta/llama-3-8b-instruct.
+    tokenizer_name: meta/llama-3-8b
+    max_sequence_length: 4096
+    client_spec:
+      class_name: "helm.clients.palmyra_client.PalmyraChatClient"
+
+  - name: writer/palmyra-fin-32k
+    model_name: writer/palmyra-fin-32k
+    tokenizer_name: meta/llama-3-8b-instruct
+    max_sequence_length: 32000
+    client_spec:
+      class_name: "helm.clients.palmyra_client.PalmyraChatClient"

   # Qwen

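The Palmyra comments above hinge on the two Llama 3 tokenizer variants differing in their end-of-text token. A quick verification sketch, assuming access to the gated HuggingFace checkpoints (this is not part of the package diff):

from transformers import AutoTokenizer

base = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3-8B")
instruct = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3-8B-Instruct")
print(base.eos_token)      # <|end_of_text|>
print(instruct.eos_token)  # <|eot_id|>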
helm/config/model_metadata.yaml CHANGED
@@ -145,6 +145,23 @@ models:
     release_date: 2023-02-24
     tags: [TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG]

+  - name: aisingapore/llama3-8b-cpt-sea-lionv2-base
+    display_name: Llama 3 CPT SEA-Lion v2 (8B)
+    description: Llama 3 CPT SEA-Lion v2 (8B) is a multilingual model which was continued pre-trained on 48B additional tokens, including tokens in Southeast Asian languages.
+    creator_organization_name: AI Singapore
+    access: open
+    num_parameters: 8030000000
+    release_date: 2024-07-31
+    tags: [TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG]
+
+  - name: aisingapore/llama3-8b-cpt-sea-lionv2.1-instruct
+    display_name: Llama 3 CPT SEA-Lion v2.1 Instruct (8B)
+    description: Llama 3 CPT SEA-Lion v2.1 Instruct (8B) is a multilingual model which has been fine-tuned with around 100,000 English instruction-completion pairs alongside a smaller pool of around 50,000 instruction-completion pairs from other Southeast Asian languages, such as Indonesian, Thai and Vietnamese.
+    creator_organization_name: AI Singapore
+    access: open
+    num_parameters: 8030000000
+    release_date: 2024-08-21
+    tags: [TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG]

   # Aleph Alpha
   # Aleph Alpha's Luminous models: https://docs.aleph-alpha.com/docs/introduction/luminous
@@ -1427,6 +1444,24 @@ models:
     num_parameters: 8000000000
     release_date: 2024-04-18
     tags: [TEXT_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG]
+
+  - name: meta/llama-3-8b-instruct-turbo
+    display_name: Llama 3 Instruct Turbo (8B)
+    description: Llama 3 is a family of language models that have been trained on more than 15 trillion tokens, and use Grouped-Query Attention (GQA) for improved inference scalability. ([paper](https://ai.meta.com/research/publications/the-llama-3-herd-of-models/)) Turbo is Together's implementation, providing fast FP8 performance while maintaining quality, closely matching FP16 reference models. ([blog](https://www.together.ai/blog/together-inference-engine-2))
+    creator_organization_name: Meta
+    access: open
+    num_parameters: 8000000000
+    release_date: 2024-07-18
+    tags: [TEXT_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG]
+
+  - name: meta/llama-3-8b-instruct-lite
+    display_name: Llama 3 Instruct Lite (8B)
+    description: Llama 3 is a family of language models that have been trained on more than 15 trillion tokens, and use Grouped-Query Attention (GQA) for improved inference scalability. ([paper](https://ai.meta.com/research/publications/the-llama-3-herd-of-models/)) Lite is Together's implementation; it leverages a number of optimizations, including INT4 quantization, to provide the most cost-efficient and scalable Llama 3 models available, while maintaining excellent quality relative to full-precision reference implementations. ([blog](https://www.together.ai/blog/together-inference-engine-2))
+    creator_organization_name: Meta
+    access: open
+    num_parameters: 8000000000
+    release_date: 2024-07-18
+    tags: [TEXT_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG]

   - name: meta/llama-3-70b
     display_name: Llama 3 (70B)
@@ -1436,6 +1471,24 @@ models:
     num_parameters: 70000000000
     release_date: 2024-04-18
     tags: [TEXT_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG]
+
+  - name: meta/llama-3-70b-instruct-turbo
+    display_name: Llama 3 Instruct Turbo (70B)
+    description: Llama 3 is a family of language models that have been trained on more than 15 trillion tokens, and use Grouped-Query Attention (GQA) for improved inference scalability. ([paper](https://ai.meta.com/research/publications/the-llama-3-herd-of-models/)) Turbo is Together's implementation, providing fast FP8 performance while maintaining quality, closely matching FP16 reference models. ([blog](https://www.together.ai/blog/together-inference-engine-2))
+    creator_organization_name: Meta
+    access: open
+    num_parameters: 70000000000
+    release_date: 2024-07-18
+    tags: [TEXT_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG]
+
+  - name: meta/llama-3-70b-instruct-lite
+    display_name: Llama 3 Instruct Lite (70B)
+    description: Llama 3 is a family of language models that have been trained on more than 15 trillion tokens, and use Grouped-Query Attention (GQA) for improved inference scalability. ([paper](https://ai.meta.com/research/publications/the-llama-3-herd-of-models/)) Lite is Together's implementation; it leverages a number of optimizations, including INT4 quantization, to provide the most cost-efficient and scalable Llama 3 models available, while maintaining excellent quality relative to full-precision reference implementations. ([blog](https://www.together.ai/blog/together-inference-engine-2))
+    creator_organization_name: Meta
+    access: open
+    num_parameters: 70000000000
+    release_date: 2024-07-18
+    tags: [TEXT_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG]

   - name: meta/llama-3.1-8b-instruct-turbo
     display_name: Llama 3.1 Instruct Turbo (8B)
@@ -1444,7 +1497,7 @@ models:
     access: open
     num_parameters: 8000000000
     release_date: 2024-07-23
-    tags: [TEXT_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG]
+    tags: [TEXT_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG]

   - name: meta/llama-3.1-70b-instruct-turbo
     display_name: Llama 3.1 Instruct Turbo (70B)
@@ -1453,7 +1506,7 @@ models:
     access: open
     num_parameters: 70000000000
     release_date: 2024-07-23
-    tags: [TEXT_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG]
+    tags: [TEXT_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG]

   - name: meta/llama-3.1-405b-instruct-turbo
     display_name: Llama 3.1 Instruct Turbo (405B)
@@ -1462,7 +1515,34 @@ models:
     access: open
     num_parameters: 405000000000
     release_date: 2024-07-23
-    tags: [TEXT_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG]
+    tags: [TEXT_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG]
+
+  - name: meta/llama-3.2-3b-instruct-turbo
+    display_name: Llama 3.2 Instruct Turbo (3B)
+    description: The Meta Llama 3.2 collection of multilingual large language models (LLMs) is a collection of pretrained and instruction-tuned text-only generative models in 1B and 3B sizes. ([blog](https://ai.meta.com/blog/llama-3-2-connect-2024-vision-edge-mobile-devices/)) Turbo is Together's implementation, providing a near negligible difference in quality from the reference implementation with faster performance and lower cost, currently using FP8 quantization. ([blog](https://www.together.ai/blog/llama-31-quality))
+    creator_organization_name: Meta
+    access: open
+    num_parameters: 3210000000
+    release_date: 2024-09-25
+    tags: [TEXT_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG]
+
+  - name: meta/llama-3.2-11b-vision-instruct-turbo
+    display_name: Llama 3.2 Vision Instruct Turbo (11B)
+    description: The Llama 3.2 Vision collection of multimodal large language models (LLMs) is a collection of pretrained and instruction-tuned image reasoning generative models in 11B and 90B sizes. ([blog](https://ai.meta.com/blog/llama-3-2-connect-2024-vision-edge-mobile-devices/)) Turbo is Together's implementation, providing a near negligible difference in quality from the reference implementation with faster performance and lower cost, currently using FP8 quantization. ([blog](https://www.together.ai/blog/llama-31-quality))
+    creator_organization_name: Meta
+    access: open
+    num_parameters: 10700000000
+    release_date: 2024-09-25
+    tags: [TEXT_MODEL_TAG, VISION_LANGUAGE_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG]
+
+  - name: meta/llama-3.2-90b-vision-instruct-turbo
+    display_name: Llama 3.2 Vision Instruct Turbo (90B)
+    description: The Llama 3.2 Vision collection of multimodal large language models (LLMs) is a collection of pretrained and instruction-tuned image reasoning generative models in 11B and 90B sizes. ([blog](https://ai.meta.com/blog/llama-3-2-connect-2024-vision-edge-mobile-devices/)) Turbo is Together's implementation, providing a near negligible difference in quality from the reference implementation with faster performance and lower cost, currently using FP8 quantization. ([blog](https://www.together.ai/blog/llama-31-quality))
+    creator_organization_name: Meta
+    access: open
+    num_parameters: 88600000000
+    release_date: 2024-09-25
+    tags: [TEXT_MODEL_TAG, VISION_LANGUAGE_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG]

   - name: meta/llama-3-8b-chat
     display_name: Llama 3 Instruct (8B)
@@ -1510,9 +1590,6 @@ models:
     tags: [TEXT_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG]


-
-
-
   # Microsoft/NVIDIA
   - name: microsoft/TNLGv2_530B
     display_name: TNLG v2 (530B)
@@ -2218,6 +2295,23 @@ models:
     release_date: 2023-11-06
     tags: [VISION_LANGUAGE_MODEL_TAG, OPENAI_CHATGPT_MODEL_TAG, FULL_FUNCTIONALITY_VLM_TAG]

+  ## o1 Models
+  - name: openai/o1-preview-2024-09-12
+    display_name: o1-preview (2024-09-12)
+    description: o1-preview is a language model trained with reinforcement learning to perform complex reasoning that can produce a long internal chain of thought before responding to the user. ([model card](https://openai.com/index/openai-o1-system-card/), [blog post](https://openai.com/index/learning-to-reason-with-llms/))
+    creator_organization_name: OpenAI
+    access: limited
+    release_date: 2024-09-12
+    tags: [TEXT_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG]
+
+  - name: openai/o1-mini-2024-09-12
+    display_name: o1-mini (2024-09-12)
+    description: o1-mini is a cost-effective reasoning model for applications that require reasoning without broad world knowledge. ([model card](https://openai.com/index/openai-o1-system-card/), [blog post](https://openai.com/index/openai-o1-mini-advancing-cost-efficient-reasoning/))
+    creator_organization_name: OpenAI
+    access: limited
+    release_date: 2024-09-12
+    tags: [TEXT_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG]
+
   ## Codex Models
   # DEPRECATED: Codex models have been shut down on March 23 2023.

@@ -2928,6 +3022,40 @@ models:
     # Does not support echo
     tags: [VISION_LANGUAGE_MODEL_TAG, LIMITED_FUNCTIONALITY_VLM_TAG]

+  - name: writer/palmyra-x-004
+    display_name: Palmyra-X-004
+    description: Palmyra-X-004 is a language model with a large context window of up to 128,000 tokens that excels at processing and understanding complex tasks.
+    creator_organization_name: Writer
+    access: limited
+    release_date: 2024-09-12
+    tags: [TEXT_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG]
+
+  - name: writer/palmyra-med-32k
+    display_name: Palmyra-Med 32K (70B)
+    description: Palmyra-Med 32K (70B) is a model finetuned from Palmyra-X-003, intended for medical applications.
+    creator_organization_name: Writer
+    access: open
+    num_parameters: 70600000000
+    release_date: 2024-07-31
+    tags: [TEXT_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG]
+
+  - name: writer/palmyra-med
+    display_name: Palmyra-Med (70B)
+    description: Palmyra-Med (70B) is a model finetuned from Palmyra-X-003, intended for medical applications.
+    creator_organization_name: Writer
+    access: open
+    num_parameters: 70600000000
+    release_date: 2024-07-31
+    tags: [TEXT_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG]
+
+  - name: writer/palmyra-fin-32k
+    display_name: Palmyra-Fin 32K (70B)
+    description: Palmyra-Fin 32K (70B) is a model finetuned from Palmyra-X-003, intended for financial applications.
+    creator_organization_name: Writer
+    access: open
+    num_parameters: 70600000000
+    release_date: 2024-07-31
+    tags: [TEXT_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG]

   # Yandex
   - name: yandex/yalm