crfm-helm: crfm_helm-0.5.1-py3-none-any.whl → crfm_helm-0.5.3-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of crfm-helm might be problematic. More details were linked from the original page; the link is not preserved in this text.

Files changed (236)
  1. {crfm_helm-0.5.1.dist-info → crfm_helm-0.5.3.dist-info}/METADATA +41 -57
  2. {crfm_helm-0.5.1.dist-info → crfm_helm-0.5.3.dist-info}/RECORD +197 -152
  3. {crfm_helm-0.5.1.dist-info → crfm_helm-0.5.3.dist-info}/WHEEL +1 -1
  4. helm/benchmark/adaptation/adapter_spec.py +32 -31
  5. helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +12 -5
  6. helm/benchmark/adaptation/adapters/test_generation_adapter.py +12 -12
  7. helm/benchmark/adaptation/adapters/test_language_modeling_adapter.py +8 -8
  8. helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py +77 -9
  9. helm/benchmark/adaptation/common_adapter_specs.py +2 -0
  10. helm/benchmark/annotation/air_bench_annotator.py +64 -0
  11. helm/benchmark/annotation/annotator_factory.py +6 -0
  12. helm/benchmark/annotation/anthropic_red_team_annotator.py +70 -0
  13. helm/benchmark/annotation/call_center_annotator.py +247 -0
  14. helm/benchmark/annotation/financebench_annotator.py +79 -0
  15. helm/benchmark/annotation/harm_bench_annotator.py +68 -0
  16. helm/benchmark/annotation/{image2structure → image2struct}/latex_compiler_annotator.py +2 -2
  17. helm/benchmark/annotation/{image2structure → image2struct}/lilypond_compiler_annotator.py +5 -3
  18. helm/benchmark/annotation/{image2structure → image2struct}/webpage_compiler_annotator.py +5 -5
  19. helm/benchmark/annotation/live_qa_annotator.py +71 -0
  20. helm/benchmark/annotation/medication_qa_annotator.py +68 -0
  21. helm/benchmark/annotation/model_as_judge.py +45 -0
  22. helm/benchmark/annotation/simple_safety_tests_annotator.py +64 -0
  23. helm/benchmark/annotation/xstest_annotator.py +110 -0
  24. helm/benchmark/augmentations/translate_perturbation.py +1 -0
  25. helm/benchmark/huggingface_registration.py +16 -6
  26. helm/benchmark/metrics/air_bench_metrics.py +56 -0
  27. helm/benchmark/metrics/annotation_metrics.py +108 -0
  28. helm/benchmark/metrics/bhasa_metrics.py +188 -0
  29. helm/benchmark/metrics/bhasa_metrics_specs.py +10 -0
  30. helm/benchmark/metrics/code_metrics_helper.py +11 -1
  31. helm/benchmark/metrics/fin_qa_metrics.py +60 -0
  32. helm/benchmark/metrics/fin_qa_metrics_helper.py +398 -0
  33. helm/benchmark/metrics/gpt4v_originality_critique_metrics.py +126 -0
  34. helm/benchmark/metrics/instruction_following_critique_metrics.py +1 -0
  35. helm/benchmark/metrics/live_qa_metrics.py +23 -0
  36. helm/benchmark/metrics/medication_qa_metrics.py +23 -0
  37. helm/benchmark/metrics/prometheus_vision_critique_metrics.py +185 -0
  38. helm/benchmark/metrics/reka_vibe_critique_metrics.py +158 -0
  39. helm/benchmark/metrics/safety_metrics.py +57 -0
  40. helm/benchmark/metrics/summac/model_summac.py +3 -3
  41. helm/benchmark/metrics/tokens/test_ai21_token_cost_estimator.py +2 -2
  42. helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py +4 -4
  43. helm/benchmark/metrics/unitxt_metrics.py +20 -10
  44. helm/benchmark/metrics/vision_language/emd_utils.py +4 -0
  45. helm/benchmark/metrics/vision_language/image_metrics.py +30 -72
  46. helm/benchmark/metrics/vision_language/image_utils.py +1 -1
  47. helm/benchmark/model_metadata_registry.py +3 -3
  48. helm/benchmark/presentation/schema.py +54 -4
  49. helm/benchmark/presentation/test_run_entry.py +1 -0
  50. helm/benchmark/presentation/test_schema.py +11 -0
  51. helm/benchmark/run.py +31 -2
  52. helm/benchmark/run_expander.py +113 -10
  53. helm/benchmark/run_spec_factory.py +4 -0
  54. helm/benchmark/run_specs/air_bench_run_specs.py +40 -0
  55. helm/benchmark/run_specs/bhasa_run_specs.py +638 -0
  56. helm/benchmark/run_specs/call_center_run_specs.py +152 -0
  57. helm/benchmark/run_specs/classic_run_specs.py +15 -11
  58. helm/benchmark/run_specs/decodingtrust_run_specs.py +11 -9
  59. helm/benchmark/run_specs/experimental_run_specs.py +85 -0
  60. helm/benchmark/run_specs/finance_run_specs.py +110 -0
  61. helm/benchmark/run_specs/safety_run_specs.py +154 -0
  62. helm/benchmark/run_specs/vlm_run_specs.py +251 -57
  63. helm/benchmark/scenarios/air_bench_scenario.py +50 -0
  64. helm/benchmark/scenarios/anthropic_red_team_scenario.py +71 -0
  65. helm/benchmark/scenarios/banking77_scenario.py +51 -0
  66. helm/benchmark/scenarios/bhasa_scenario.py +1798 -0
  67. helm/benchmark/scenarios/call_center_scenario.py +84 -0
  68. helm/benchmark/scenarios/ci_mcqa_scenario.py +80 -0
  69. helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +2 -1
  70. helm/benchmark/scenarios/entity_data_imputation_scenario.py +8 -2
  71. helm/benchmark/scenarios/ewok_scenario.py +116 -0
  72. helm/benchmark/scenarios/fin_qa_scenario.py +119 -0
  73. helm/benchmark/scenarios/financebench_scenario.py +53 -0
  74. helm/benchmark/scenarios/harm_bench_scenario.py +59 -0
  75. helm/benchmark/scenarios/scenario.py +1 -1
  76. helm/benchmark/scenarios/simple_safety_tests_scenario.py +33 -0
  77. helm/benchmark/scenarios/test_air_bench_scenario.py +27 -0
  78. helm/benchmark/scenarios/test_commonsense_scenario.py +21 -0
  79. helm/benchmark/scenarios/test_ewok_scenario.py +25 -0
  80. helm/benchmark/scenarios/test_financebench_scenario.py +26 -0
  81. helm/benchmark/scenarios/test_gsm_scenario.py +31 -0
  82. helm/benchmark/scenarios/test_legalbench_scenario.py +30 -0
  83. helm/benchmark/scenarios/test_math_scenario.py +2 -8
  84. helm/benchmark/scenarios/test_med_qa_scenario.py +30 -0
  85. helm/benchmark/scenarios/test_mmlu_scenario.py +33 -0
  86. helm/benchmark/scenarios/test_narrativeqa_scenario.py +73 -0
  87. helm/benchmark/scenarios/thai_exam_scenario.py +4 -4
  88. helm/benchmark/scenarios/vision_language/a_okvqa_scenario.py +1 -1
  89. helm/benchmark/scenarios/vision_language/bingo_scenario.py +5 -5
  90. helm/benchmark/scenarios/vision_language/crossmodal_3600_scenario.py +2 -1
  91. helm/benchmark/scenarios/vision_language/exams_v_scenario.py +104 -0
  92. helm/benchmark/scenarios/vision_language/fair_face_scenario.py +136 -0
  93. helm/benchmark/scenarios/vision_language/flickr30k_scenario.py +1 -1
  94. helm/benchmark/scenarios/vision_language/gqa_scenario.py +2 -2
  95. helm/benchmark/scenarios/vision_language/hateful_memes_scenario.py +1 -1
  96. helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/chart2csv_scenario.py +1 -1
  97. helm/benchmark/scenarios/vision_language/{image2structure/image2structure_scenario.py → image2struct/image2struct_scenario.py} +13 -2
  98. helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/latex_scenario.py +3 -7
  99. helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/musicsheet_scenario.py +1 -5
  100. helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/utils_latex.py +31 -39
  101. helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/webpage/driver.py +1 -1
  102. helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/webpage/utils.py +1 -1
  103. helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/webpage_scenario.py +44 -13
  104. helm/benchmark/scenarios/vision_language/math_vista_scenario.py +1 -1
  105. helm/benchmark/scenarios/vision_language/mementos_scenario.py +3 -3
  106. helm/benchmark/scenarios/vision_language/mm_safety_bench_scenario.py +2 -2
  107. helm/benchmark/scenarios/vision_language/mme_scenario.py +21 -18
  108. helm/benchmark/scenarios/vision_language/mmmu_scenario.py +1 -1
  109. helm/benchmark/scenarios/vision_language/pairs_scenario.py +7 -6
  110. helm/benchmark/scenarios/vision_language/pope_scenario.py +2 -1
  111. helm/benchmark/scenarios/vision_language/real_world_qa_scenario.py +57 -0
  112. helm/benchmark/scenarios/vision_language/seed_bench_scenario.py +7 -5
  113. helm/benchmark/scenarios/vision_language/unicorn_scenario.py +5 -5
  114. helm/benchmark/scenarios/vision_language/vibe_eval_scenario.py +98 -0
  115. helm/benchmark/scenarios/vision_language/viz_wiz_scenario.py +1 -1
  116. helm/benchmark/scenarios/vision_language/vqa_scenario.py +3 -1
  117. helm/benchmark/scenarios/xstest_scenario.py +35 -0
  118. helm/benchmark/server.py +1 -6
  119. helm/benchmark/static/schema_air_bench.yaml +3149 -0
  120. helm/benchmark/static/schema_bhasa.yaml +709 -0
  121. helm/benchmark/static/schema_call_center.yaml +232 -0
  122. helm/benchmark/static/schema_classic.yaml +3 -59
  123. helm/benchmark/static/schema_cleva.yaml +768 -0
  124. helm/benchmark/static/schema_decodingtrust.yaml +444 -0
  125. helm/benchmark/static/schema_ewok.yaml +367 -0
  126. helm/benchmark/static/schema_finance.yaml +189 -0
  127. helm/benchmark/static/schema_image2struct.yaml +588 -0
  128. helm/benchmark/static/schema_instruction_following.yaml +3 -52
  129. helm/benchmark/static/schema_lite.yaml +3 -61
  130. helm/benchmark/static/schema_medical.yaml +255 -0
  131. helm/benchmark/static/schema_mmlu.yaml +3 -61
  132. helm/benchmark/static/schema_safety.yaml +247 -0
  133. helm/benchmark/static/schema_tables.yaml +317 -0
  134. helm/benchmark/static/schema_thai.yaml +244 -0
  135. helm/benchmark/static/schema_unitxt.yaml +3 -61
  136. helm/benchmark/static/{schema_vlm.yaml → schema_vhelm.yaml} +304 -298
  137. helm/benchmark/static/schema_vhelm_lite.yaml +4 -59
  138. helm/benchmark/static_build/assets/accenture-6f97eeda.png +0 -0
  139. helm/benchmark/static_build/assets/air-overview-d2e6c49f.png +0 -0
  140. helm/benchmark/static_build/assets/aisingapore-6dfc9acf.png +0 -0
  141. helm/benchmark/static_build/assets/cresta-9e22b983.png +0 -0
  142. helm/benchmark/static_build/assets/cuhk-8c5631e9.png +0 -0
  143. helm/benchmark/static_build/assets/index-05c76bb1.css +1 -0
  144. helm/benchmark/static_build/assets/index-58f97dcd.js +10 -0
  145. helm/benchmark/static_build/assets/overview-74aea3d8.png +0 -0
  146. helm/benchmark/static_build/assets/process-flow-bd2eba96.png +0 -0
  147. helm/benchmark/static_build/assets/scb10x-204bd786.png +0 -0
  148. helm/benchmark/static_build/assets/wellsfargo-a86a6c4a.png +0 -0
  149. helm/benchmark/static_build/index.html +2 -2
  150. helm/benchmark/window_services/test_openai_window_service.py +8 -8
  151. helm/clients/ai21_client.py +71 -1
  152. helm/clients/anthropic_client.py +50 -28
  153. helm/clients/auto_client.py +11 -0
  154. helm/clients/client.py +24 -7
  155. helm/clients/cohere_client.py +98 -3
  156. helm/clients/huggingface_client.py +79 -19
  157. helm/clients/nvidia_nim_client.py +35 -0
  158. helm/clients/openai_client.py +11 -5
  159. helm/clients/palmyra_client.py +25 -0
  160. helm/clients/perspective_api_client.py +11 -6
  161. helm/clients/reka_client.py +189 -0
  162. helm/clients/test_client.py +7 -9
  163. helm/clients/test_huggingface_client.py +19 -3
  164. helm/clients/test_together_client.py +72 -2
  165. helm/clients/together_client.py +129 -23
  166. helm/clients/vertexai_client.py +62 -18
  167. helm/clients/vision_language/huggingface_vlm_client.py +1 -0
  168. helm/clients/vision_language/open_flamingo_client.py +1 -2
  169. helm/clients/vision_language/paligemma_client.py +146 -0
  170. helm/clients/vision_language/palmyra_vision_client.py +99 -0
  171. helm/clients/yi_client.py +31 -0
  172. helm/common/critique_request.py +10 -1
  173. helm/common/images_utils.py +25 -0
  174. helm/common/mongo_key_value_store.py +2 -1
  175. helm/common/request.py +16 -0
  176. helm/config/model_deployments.yaml +740 -363
  177. helm/config/model_metadata.yaml +824 -128
  178. helm/config/tokenizer_configs.yaml +207 -10
  179. helm/proxy/critique/model_critique_client.py +32 -4
  180. helm/proxy/example_queries.py +14 -21
  181. helm/proxy/services/server_service.py +2 -3
  182. helm/proxy/token_counters/test_auto_token_counter.py +2 -2
  183. helm/tokenizers/ai21_tokenizer.py +51 -59
  184. helm/tokenizers/auto_tokenizer.py +1 -1
  185. helm/tokenizers/cohere_tokenizer.py +29 -62
  186. helm/tokenizers/huggingface_tokenizer.py +35 -13
  187. helm/tokenizers/test_ai21_tokenizer.py +48 -0
  188. helm/tokenizers/test_cohere_tokenizer.py +39 -0
  189. helm/tokenizers/test_huggingface_tokenizer.py +5 -1
  190. helm/benchmark/static/benchmarking.css +0 -156
  191. helm/benchmark/static/benchmarking.js +0 -1705
  192. helm/benchmark/static/config.js +0 -3
  193. helm/benchmark/static/general.js +0 -122
  194. helm/benchmark/static/images/crfm-logo.png +0 -0
  195. helm/benchmark/static/images/helm-logo-simple.png +0 -0
  196. helm/benchmark/static/images/helm-logo.png +0 -0
  197. helm/benchmark/static/images/language-model-helm.png +0 -0
  198. helm/benchmark/static/images/organizations/ai21.png +0 -0
  199. helm/benchmark/static/images/organizations/anthropic.png +0 -0
  200. helm/benchmark/static/images/organizations/bigscience.png +0 -0
  201. helm/benchmark/static/images/organizations/cohere.png +0 -0
  202. helm/benchmark/static/images/organizations/eleutherai.png +0 -0
  203. helm/benchmark/static/images/organizations/google.png +0 -0
  204. helm/benchmark/static/images/organizations/meta.png +0 -0
  205. helm/benchmark/static/images/organizations/microsoft.png +0 -0
  206. helm/benchmark/static/images/organizations/nvidia.png +0 -0
  207. helm/benchmark/static/images/organizations/openai.png +0 -0
  208. helm/benchmark/static/images/organizations/together.png +0 -0
  209. helm/benchmark/static/images/organizations/tsinghua-keg.png +0 -0
  210. helm/benchmark/static/images/organizations/yandex.png +0 -0
  211. helm/benchmark/static/images/scenarios-by-metrics.png +0 -0
  212. helm/benchmark/static/images/taxonomy-scenarios.png +0 -0
  213. helm/benchmark/static/index.html +0 -68
  214. helm/benchmark/static/info-icon.png +0 -0
  215. helm/benchmark/static/json-urls.js +0 -69
  216. helm/benchmark/static/plot-captions.js +0 -27
  217. helm/benchmark/static/schema_image2structure.yaml +0 -304
  218. helm/benchmark/static/utils.js +0 -285
  219. helm/benchmark/static_build/assets/index-737eef9e.js +0 -10
  220. helm/benchmark/static_build/assets/index-878a1094.css +0 -1
  221. helm/benchmark/window_services/ai21_window_service.py +0 -247
  222. helm/benchmark/window_services/cohere_window_service.py +0 -101
  223. helm/benchmark/window_services/test_ai21_window_service.py +0 -163
  224. helm/benchmark/window_services/test_cohere_window_service.py +0 -75
  225. helm/benchmark/window_services/test_cohere_window_service_utils.py +0 -8328
  226. helm/benchmark/window_services/test_ice_window_service.py +0 -327
  227. helm/tokenizers/ice_tokenizer.py +0 -30
  228. helm/tokenizers/test_ice_tokenizer.py +0 -57
  229. {crfm_helm-0.5.1.dist-info → crfm_helm-0.5.3.dist-info}/LICENSE +0 -0
  230. {crfm_helm-0.5.1.dist-info → crfm_helm-0.5.3.dist-info}/entry_points.txt +0 -0
  231. {crfm_helm-0.5.1.dist-info → crfm_helm-0.5.3.dist-info}/top_level.txt +0 -0
  232. helm/benchmark/annotation/{image2structure → image2struct}/__init__.py +0 -0
  233. helm/benchmark/annotation/{image2structure → image2struct}/image_compiler_annotator.py +0 -0
  234. helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/__init__.py +0 -0
  235. helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/webpage/__init__.py +0 -0
  236. helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/webpage/jekyll_server.py +0 -0
@@ -1,83 +1,50 @@
1
- import json
2
- import requests
3
- from typing import Any, Dict, List
1
+ from typing import Any, Dict, List, Optional
2
+
3
+ import cohere
4
+ from cohere.manually_maintained.tokenizers import get_hf_tokenizer
4
5
 
5
6
  from helm.common.cache import CacheConfig
6
7
  from helm.common.tokenization_request import (
7
8
  TokenizationRequest,
8
- DecodeRequest,
9
- DecodeRequestResult,
10
9
  TokenizationToken,
11
10
  )
12
- from helm.clients.cohere_utils import get_cohere_url, DEFAULT_COHERE_API_VERSION
13
- from .caching_tokenizer import CachingTokenizer
14
-
11
+ from helm.tokenizers.caching_tokenizer import CachingTokenizer
15
12
 
16
- class CohereTokenizer(CachingTokenizer):
17
- # From "https://docs.cohere.ai/versioning-reference",
18
- # "this version [2021-11-08] introduces multiple generations, meaning that the generations endpoint will
19
- # now accept a num_generations argument in the JSON and will always return an array of generations"
20
- # Note that the API version is decoupled from the model version.
21
- DEFAULT_API_VERSION: str = "2021-11-08"
22
13
 
23
- TOKENIZE_ENDPOINT: str = "tokenize"
14
+ class CohereLocalTokenizer(CachingTokenizer):
15
+ """Cohere tokenizer using the Cohere Python library."""
24
16
 
25
- # According to https://docs.cohere.ai/tokenize-reference#request, for tokenize, text: "the string to
26
- # be tokenized, the minimum text length is 1 character, and the maximum text length is 65536 characters."
27
- # However, even sending a request with 60,000 characters sometimes fails, so we set the
28
- # maximum length to 50,000, which is about 8,333 tokens.
29
- # TODO: followed up with Cohere support with an example of a failure case
30
- TOKENIZE_API_MAX_TEXT_LENGTH: int = 50_000
31
-
32
- def __init__(self, api_key: str, cache_config: CacheConfig) -> None:
17
+ def __init__(self, api_key: Optional[str], cache_config: CacheConfig) -> None:
33
18
  super().__init__(cache_config)
34
- self.api_key: str = api_key
19
+ self.client = cohere.Client(api_key)
35
20
 
36
21
  def _tokenization_request_to_cache_key(self, request: TokenizationRequest) -> Dict[str, Any]:
37
- # This cache key is used to preserve our existing Cache (10/17/2023)
38
- return {"text": request.text}
22
+ return {"text": request.text, "tokenizer": request.tokenizer}
39
23
 
40
24
  def _tokenize_do_it(self, request: Dict[str, Any]) -> Dict[str, Any]:
41
- """
42
- Send the request to the Cohere Tokenize API.
43
-
44
- From https://docs.cohere.ai/tokenize-reference, for text "tokenize me! :D", the response will be:
45
-
46
- {
47
- "tokens": [34160, 974, 514, 34, 1420, 69]
48
- "token_strings": ["token", "ize", " me", "!", " :", "D"]
49
- }
50
- """
51
- text: str = request["text"]
52
- assert (
53
- 1 <= len(text) <= CohereTokenizer.TOKENIZE_API_MAX_TEXT_LENGTH
54
- ), f"Invalid text length: {len(text)}. Valid length: [1..{CohereTokenizer.TOKENIZE_API_MAX_TEXT_LENGTH:,d}]"
55
-
56
- response = requests.request(
57
- method="POST",
58
- url=get_cohere_url(CohereTokenizer.TOKENIZE_ENDPOINT),
59
- headers={
60
- "Authorization": f"BEARER {self.api_key}",
61
- "Content-Type": "application/json",
62
- "Cohere-Version": DEFAULT_COHERE_API_VERSION,
63
- },
64
- data=json.dumps(request),
25
+ model: str = request["tokenizer"].split("/")[1]
26
+ # Workaround for https://github.com/cohere-ai/cohere-python/issues/493
27
+ # `token_strings` are always set to `[]`, so we have to populate it ourselves.
28
+ response = self.client.tokenize(text=request["text"], model=model)
29
+ response_dict = response.dict()
30
+ response_dict["token_strings"] = get_hf_tokenizer(self.client, model).decode_batch(
31
+ [[token] for token in response.tokens]
65
32
  )
66
- result = json.loads(response.text)
67
- assert "message" not in result.keys(), f"Request failed with error {result['message']}"
68
- assert "tokens" in result and "token_strings" in result, f"Invalid response: {result}"
69
- # This output format is used to preserve our existing Cache (10/17/2023)
70
- return result
33
+ return response_dict
71
34
 
72
35
  def _tokenization_raw_response_to_tokens(
73
36
  self, response: Dict[str, Any], request: TokenizationRequest
74
37
  ) -> List[TokenizationToken]:
75
- tokens = response["tokens" if request.encode else "token_strings"]
76
- return [TokenizationToken(token) for token in tokens]
38
+ tokens: List[TokenizationToken] = []
39
+ if request.encode:
40
+ tokens = [TokenizationToken(token) for token in response["tokens"]]
41
+ else:
42
+ tokens = [TokenizationToken(token) for token in response["token_strings"]]
43
+ if request.truncation:
44
+ tokens = tokens[: request.max_length]
45
+ return tokens
77
46
 
78
47
  def _decode_do_it(self, request: Dict[str, Any]) -> Dict[str, Any]:
79
- # Defined for mypy but decode() already raises NotImplementedError
80
- raise NotImplementedError("The Cohere API does not support decoding.")
81
-
82
- def decode(self, request: DecodeRequest) -> DecodeRequestResult:
83
- raise NotImplementedError("The Cohere API does not support decoding.")
48
+ model: str = request["tokenizer"].split("/")[1]
49
+ response = self.client.detokenize(tokens=request["tokens"], model=model)
50
+ return response.dict()
@@ -29,8 +29,17 @@ class HuggingFaceTokenizer(CachingTokenizer):
29
29
  _tokenizers: Dict[str, WrappedPreTrainedTokenizer] = {}
30
30
  _tokenizers_lock: Lock = Lock()
31
31
 
32
- def __init__(self, cache_config: CacheConfig, pretrained_model_name_or_path: Optional[str] = None, **kwargs):
32
+ def __init__(
33
+ self,
34
+ cache_config: CacheConfig,
35
+ tokenizer_name: str,
36
+ pretrained_model_name_or_path: Optional[str] = None,
37
+ **kwargs,
38
+ ):
33
39
  super().__init__(cache_config=cache_config)
40
+ self._helm_tokenizer_name = (
41
+ tokenizer_name # HELM tokenizer name (e.g. "huggingface/gpt2"), *not* Hugging Face Hub Model ID
42
+ )
34
43
  self._pretrained_model_name_or_path = pretrained_model_name_or_path
35
44
  self._kwargs = kwargs
36
45
 
@@ -40,7 +49,10 @@ class HuggingFaceTokenizer(CachingTokenizer):
40
49
  # To avoid deadlocks when using HuggingFace tokenizers with multiple processes
41
50
  # TODO: Figure out if we actually need this.
42
51
  os.environ["TOKENIZERS_PARALLELISM"] = "False"
43
-
52
+ from_pretrained_kwargs = {**kwargs}
53
+ # If unspecified, set `use_fast=True` by default.
54
+ if "use_fast" not in from_pretrained_kwargs:
55
+ from_pretrained_kwargs["use_fast"] = True
44
56
  try:
45
57
  # From the Hugging Face documentation, "local_files_only(defaults to False) —
46
58
  # Whether or not to only look at local files".
@@ -53,14 +65,14 @@ class HuggingFaceTokenizer(CachingTokenizer):
53
65
  # Tokenizers, which are written in Rust." So, use the "fast" version of the tokenizers if available.
54
66
  return WrappedPreTrainedTokenizer(
55
67
  AutoTokenizer.from_pretrained(
56
- pretrained_model_name_or_path, local_files_only=True, use_fast=True, **kwargs
68
+ pretrained_model_name_or_path, local_files_only=True, **from_pretrained_kwargs
57
69
  )
58
70
  )
59
71
  except OSError:
60
72
  hlog(f"Local files do not exist for HuggingFace tokenizer: {pretrained_model_name_or_path}. Downloading...")
61
73
  return WrappedPreTrainedTokenizer(
62
74
  AutoTokenizer.from_pretrained(
63
- pretrained_model_name_or_path, local_files_only=False, use_fast=True, **kwargs
75
+ pretrained_model_name_or_path, local_files_only=False, **from_pretrained_kwargs
64
76
  )
65
77
  )
66
78
 
@@ -84,21 +96,26 @@ class HuggingFaceTokenizer(CachingTokenizer):
84
96
  )
85
97
  return HuggingFaceTokenizer._tokenizers[helm_tokenizer_name]
86
98
 
87
- def _get_tokenizer_for_request(self, request: Dict[str, Any]) -> WrappedPreTrainedTokenizer:
88
- """Method used in both _tokenize_do_it and _decode_do_it to get the tokenizer."""
99
+ def get_wrapped_tokenizer(self) -> WrappedPreTrainedTokenizer:
100
+ """Get the underlying Hugging Face WrappedPreTrainedTokenizer."""
89
101
  pretrained_model_name_or_path = (
90
- self._pretrained_model_name_or_path if self._pretrained_model_name_or_path else request["tokenizer"]
102
+ self._pretrained_model_name_or_path if self._pretrained_model_name_or_path else self._helm_tokenizer_name
91
103
  )
92
104
  return HuggingFaceTokenizer.get_tokenizer(
93
- helm_tokenizer_name=request["tokenizer"],
105
+ helm_tokenizer_name=self._helm_tokenizer_name,
94
106
  pretrained_model_name_or_path=pretrained_model_name_or_path,
95
107
  **self._kwargs,
96
108
  )
97
109
 
98
110
  def _tokenize_do_it(self, request: Dict[str, Any]) -> Dict[str, Any]:
111
+ if request["tokenizer"] != self._helm_tokenizer_name:
112
+ raise ValueError(
113
+ f"This HuggingFaceTokenizer expects tokenizer to be {self._helm_tokenizer_name} "
114
+ "but instead the request has tokenizer {request['tokenizer']}"
115
+ )
99
116
  if request["encode"]:
100
117
  if request["truncation"]:
101
- with self._get_tokenizer_for_request(request) as tokenizer:
118
+ with self.get_wrapped_tokenizer() as tokenizer:
102
119
  tokens = tokenizer.encode(
103
120
  request["text"],
104
121
  truncation=request["truncation"],
@@ -106,7 +123,7 @@ class HuggingFaceTokenizer(CachingTokenizer):
106
123
  add_special_tokens=False,
107
124
  )
108
125
  else:
109
- with self._get_tokenizer_for_request(request) as tokenizer:
126
+ with self.get_wrapped_tokenizer() as tokenizer:
110
127
  tokens = tokenizer.encode(request["text"], add_special_tokens=False)
111
128
  else:
112
129
  if "gpt" in request["tokenizer"] or request["tokenizer"] in [
@@ -118,7 +135,7 @@ class HuggingFaceTokenizer(CachingTokenizer):
118
135
  # convert_tokens_to_string method. We prefer to use this method instead
119
136
  # of the hacky cleanup_tokens method below as it might handle cases
120
137
  # we haven't thought of in cleanup_tokens.
121
- with self._get_tokenizer_for_request(request) as tokenizer:
138
+ with self.get_wrapped_tokenizer() as tokenizer:
122
139
  tokens = [
123
140
  tokenizer.convert_tokens_to_string([token]) for token in tokenizer.tokenize(request["text"])
124
141
  ]
@@ -131,7 +148,7 @@ class HuggingFaceTokenizer(CachingTokenizer):
131
148
  # But this replaces all the "▁" characters by "", which is not what we want.
132
149
  # This would be problematic as tokenize(" Hello", encode=False) would return ["Hello"]
133
150
  # Just like tokenize("Hello", encode=False) would return ["Hello"].
134
- with self._get_tokenizer_for_request(request) as tokenizer:
151
+ with self.get_wrapped_tokenizer() as tokenizer:
135
152
  tokens = tokenizer.tokenize(request["text"])
136
153
  # Some tokenizers (e.g. Qwen/Qwen-7B) return the tokens as bytes, so we have to decode them to strings.
137
154
  if tokens and type(tokens[0]) == bytes:
@@ -140,7 +157,12 @@ class HuggingFaceTokenizer(CachingTokenizer):
140
157
  return {"tokens": tokens}
141
158
 
142
159
  def _decode_do_it(self, request: Dict[str, Any]) -> Dict[str, Any]:
143
- with self._get_tokenizer_for_request(request) as tokenizer:
160
+ if request["tokenizer"] != self._helm_tokenizer_name:
161
+ raise ValueError(
162
+ f"This HuggingFaceTokenizer expects tokenizer to be {self._helm_tokenizer_name} "
163
+ "but instead the request has tokenizer {request['tokenizer']}"
164
+ )
165
+ with self.get_wrapped_tokenizer() as tokenizer:
144
166
  text = tokenizer.decode(
145
167
  request["tokens"], clean_up_tokenization_spaces=request["clean_up_tokenization_spaces"]
146
168
  )
@@ -0,0 +1,48 @@
1
+ import pytest
2
+
3
+ from helm.common.cache import BlackHoleCacheConfig
4
+ from helm.common.tokenization_request import (
5
+ DecodeRequest,
6
+ TokenizationRequest,
7
+ TokenizationToken,
8
+ )
9
+
10
+
11
+ @pytest.mark.models
12
+ def test_tokenize():
13
+ from helm.tokenizers.ai21_tokenizer import AI21LocalTokenizer
14
+
15
+ tokenizer = AI21LocalTokenizer(cache_config=BlackHoleCacheConfig())
16
+ request = TokenizationRequest(tokenizer="ai21/jamba-instruct-tokenizer", text="otter 🦦")
17
+ result = tokenizer.tokenize(request)
18
+ assert result.success
19
+ assert not result.cached
20
+ assert result.tokens == [
21
+ TokenizationToken(token) for token in ["ot", "ter", "▁", "<0xF0>", "<0x9F>", "<0xA6>", "<0xA6>"]
22
+ ]
23
+
24
+
25
+ @pytest.mark.models
26
+ def test_encode():
27
+ from helm.tokenizers.ai21_tokenizer import AI21LocalTokenizer
28
+
29
+ tokenizer = AI21LocalTokenizer(cache_config=BlackHoleCacheConfig())
30
+ request = TokenizationRequest(tokenizer="ai21/jamba-instruct-tokenizer", text="otter 🦦", encode=True)
31
+ result = tokenizer.tokenize(request)
32
+ assert result.success
33
+ assert not result.cached
34
+ assert result.tokens == [TokenizationToken(token) for token in [1860, 1901, 62934, 1784, 1703, 1710, 1710]]
35
+
36
+
37
+ @pytest.mark.models
38
+ def test_decode():
39
+ from helm.tokenizers.ai21_tokenizer import AI21LocalTokenizer
40
+
41
+ tokenizer = AI21LocalTokenizer(cache_config=BlackHoleCacheConfig())
42
+ request = DecodeRequest(
43
+ tokenizer="ai21/jamba-instruct-tokenizer", tokens=[1860, 1901, 62934, 1784, 1703, 1710, 1710]
44
+ )
45
+ result = tokenizer.decode(request)
46
+ assert result.success
47
+ assert not result.cached
48
+ assert result.text == "otter 🦦"
@@ -0,0 +1,39 @@
1
+ import pytest
2
+
3
+ from helm.common.cache import BlackHoleCacheConfig
4
+ from helm.common.tokenization_request import (
5
+ DecodeRequest,
6
+ TokenizationRequest,
7
+ TokenizationToken,
8
+ )
9
+ from helm.tokenizers.cohere_tokenizer import CohereLocalTokenizer
10
+
11
+
12
+ @pytest.mark.models
13
+ def test_tokenize():
14
+ tokenizer = CohereLocalTokenizer(api_key=None, cache_config=BlackHoleCacheConfig())
15
+ request = TokenizationRequest(tokenizer="cohere/command", text="otter 🦦")
16
+ result = tokenizer.tokenize(request)
17
+ assert result.success
18
+ assert not result.cached
19
+ assert result.tokens == [TokenizationToken(token) for token in ["ot", "ter", " �", "�", "�"]]
20
+
21
+
22
+ @pytest.mark.models
23
+ def test_encode():
24
+ tokenizer = CohereLocalTokenizer(api_key=None, cache_config=BlackHoleCacheConfig())
25
+ request = TokenizationRequest(tokenizer="cohere/command", text="otter 🦦", encode=True)
26
+ result = tokenizer.tokenize(request)
27
+ assert result.success
28
+ assert not result.cached
29
+ assert result.tokens == [TokenizationToken(token) for token in [1741, 1779, 7728, 107, 107]]
30
+
31
+
32
+ @pytest.mark.models
33
+ def test_decode():
34
+ tokenizer = CohereLocalTokenizer(api_key=None, cache_config=BlackHoleCacheConfig())
35
+ request = DecodeRequest(tokenizer="cohere/command", tokens=[1741, 1779, 7728, 107, 107])
36
+ result = tokenizer.decode(request)
37
+ assert result.success
38
+ assert not result.cached
39
+ assert result.text == "otter 🦦"
@@ -17,7 +17,11 @@ class TestHuggingFaceGPT2Tokenizer:
17
17
  def setup_method(self, method):
18
18
  cache_file = tempfile.NamedTemporaryFile(delete=False)
19
19
  self.cache_path: str = cache_file.name
20
- self.tokenizer = HuggingFaceTokenizer(SqliteCacheConfig(self.cache_path))
20
+ self.tokenizer = HuggingFaceTokenizer(
21
+ SqliteCacheConfig(self.cache_path),
22
+ tokenizer_name="huggingface/gpt2",
23
+ pretrained_model_name_or_path="openai-community/gpt2",
24
+ )
21
25
 
22
26
  def teardown_method(self, method):
23
27
  os.remove(self.cache_path)
@@ -1,156 +0,0 @@
1
- .correct {
2
- background-color: #dfffdf;
3
- }
4
-
5
- .wrong {
6
- background-color: #ffdfdf;
7
- }
8
-
9
- .scenario-info {
10
- margin-top: 30px;
11
- margin-bottom: 30px;
12
- }
13
-
14
- td {
15
- padding-left: 15px;
16
- padding-right: 15px;
17
- padding-top: 5px;
18
- padding-bottom: 5px;
19
- word-wrap: break-word;
20
- max-width: 900px;
21
- vertical-align: top;
22
- }
23
-
24
- .results-table {
25
- }
26
-
27
- .table-container {
28
- margin-top: 30px;
29
- margin-bottom: 30px;
30
- }
31
-
32
- tr {
33
- border: solid;
34
- border-color: #f0f0f0;
35
- border-width: 1px 0;
36
- }
37
-
38
- .results-table thead tr {
39
- background-color: #f9f9f9;
40
- }
41
-
42
- .logprob {
43
- font-size: 8pt;
44
- font-style: italic;
45
- color: gray;
46
- }
47
-
48
- .list-header {
49
- font-size: 24px;
50
- font-weight: bold;
51
- }
52
- .list-item {
53
- color: black;
54
- font-size: 14px;
55
- white-space: nowrap;
56
- }
57
- .list-item-todo {
58
- color: lightgray;
59
- }
60
- .list-item:hover {
61
- color: black;
62
- text-decoration: none;
63
- background-color: lightgray;
64
- }
65
-
66
- .main-link {
67
- color: white;
68
- background-color: #53A0C0;
69
- }
70
- .main-link:hover {
71
- color: lightgray;
72
- }
73
-
74
- .access-open {
75
- background-color: lightgreen;
76
- width: 100px;
77
- }
78
- .access-limited {
79
- background-color: yellow;
80
- width: 100px;
81
- }
82
- .access-restricted {
83
- background-color: orange;
84
- width: 100px;
85
- }
86
- .access-closed {
87
- background-color: lightgray;
88
- width: 100px;
89
- }
90
-
91
- .technical-details {
92
- font-size: 10px;
93
- font-style: italic;
94
- color: gray;
95
- }
96
-
97
- .logo-container {
98
- display: flex;
99
- flex-flow: row wrap;
100
- justify-content: space-between;
101
- padding: 20px;
102
- }
103
-
104
- .logo-item {
105
- margin: auto;
106
- padding: 10px;
107
- }
108
-
109
- .instance-input {
110
- font-style: italic;
111
- background-color: #f5f5f5;
112
- margin-left: 20px;
113
- white-space: pre-wrap;
114
- }
115
-
116
- .instance-reference {
117
- font-style: italic;
118
- background-color: #f5f5f5;
119
- white-space: pre-wrap;
120
- }
121
-
122
- .taxonomy-table {
123
- margin: 10px;
124
- }
125
-
126
- thead .table-sort-column {
127
- background-color: #ffe599;
128
- }
129
-
130
- tbody .table-sort-column {
131
- background-color: #fff2cc;
132
- }
133
-
134
- .prompt {
135
- font-style: italic;
136
- background-color: #f5f5f5;
137
- white-space: pre-wrap;
138
- }
139
-
140
- .plot {
141
- margin: 15px;
142
- }
143
-
144
- .plot img {
145
- margin: 10px;
146
- }
147
-
148
- .plot-caption {
149
- color: #555;
150
- font-style: italic;
151
- margin: 5px;
152
- }
153
-
154
- .prediction-text {
155
- white-space: pre-wrap;
156
- }