crfm-helm 0.5.0__py3-none-any.whl → 0.5.2__py3-none-any.whl

This diff shows the contents of two publicly released versions of the package as they appear in their public registry, and is provided for informational purposes only.


Files changed (125)
  1. {crfm_helm-0.5.0.dist-info → crfm_helm-0.5.2.dist-info}/METADATA +19 -5
  2. {crfm_helm-0.5.0.dist-info → crfm_helm-0.5.2.dist-info}/RECORD +121 -76
  3. helm/benchmark/adaptation/adapter_spec.py +32 -31
  4. helm/benchmark/adaptation/adapters/multimodal/in_context_learning_multimodal_adapter.py +1 -0
  5. helm/benchmark/adaptation/adapters/multimodal/multimodal_prompt.py +7 -0
  6. helm/benchmark/adaptation/adapters/multimodal/test_multimodal_prompt.py +2 -0
  7. helm/benchmark/annotation/air_bench_annotator.py +64 -0
  8. helm/benchmark/annotation/annotator_factory.py +6 -0
  9. helm/benchmark/annotation/image2structure/lilypond_compiler_annotator.py +1 -1
  10. helm/benchmark/annotation/live_qa_annotator.py +84 -0
  11. helm/benchmark/annotation/medication_qa_annotator.py +81 -0
  12. helm/benchmark/augmentations/perturbation.py +17 -1
  13. helm/benchmark/augmentations/test_perturbation.py +30 -0
  14. helm/benchmark/augmentations/translate_perturbation.py +1 -0
  15. helm/benchmark/huggingface_registration.py +16 -6
  16. helm/benchmark/metrics/air_bench_metrics.py +56 -0
  17. helm/benchmark/metrics/efficiency_metrics.py +9 -2
  18. helm/benchmark/metrics/evaluate_reference_metrics.py +16 -0
  19. helm/benchmark/metrics/fin_qa_metrics.py +60 -0
  20. helm/benchmark/metrics/fin_qa_metrics_helper.py +398 -0
  21. helm/benchmark/metrics/gpt4v_originality_critique_metrics.py +126 -0
  22. helm/benchmark/metrics/instruction_following_critique_metrics.py +1 -0
  23. helm/benchmark/metrics/live_qa_metrics.py +23 -0
  24. helm/benchmark/metrics/medication_qa_metrics.py +23 -0
  25. helm/benchmark/metrics/prometheus_vision_critique_metrics.py +185 -0
  26. helm/benchmark/metrics/reka_vibe_critique_metrics.py +158 -0
  27. helm/benchmark/metrics/unitxt_metrics.py +20 -10
  28. helm/benchmark/metrics/vision_language/emd_utils.py +4 -0
  29. helm/benchmark/metrics/vision_language/image_metrics.py +104 -21
  30. helm/benchmark/model_metadata_registry.py +5 -1
  31. helm/benchmark/presentation/schema.py +54 -4
  32. helm/benchmark/presentation/test_schema.py +11 -0
  33. helm/benchmark/run.py +16 -2
  34. helm/benchmark/run_expander.py +112 -63
  35. helm/benchmark/run_spec_factory.py +15 -10
  36. helm/benchmark/run_specs/air_bench_run_specs.py +40 -0
  37. helm/benchmark/run_specs/classic_run_specs.py +15 -11
  38. helm/benchmark/run_specs/decodingtrust_run_specs.py +3 -1
  39. helm/benchmark/run_specs/experimental_run_specs.py +33 -0
  40. helm/benchmark/run_specs/finance_run_specs.py +33 -0
  41. helm/benchmark/run_specs/vlm_run_specs.py +444 -65
  42. helm/benchmark/scenarios/air_bench_scenario.py +50 -0
  43. helm/benchmark/scenarios/ci_mcqa_scenario.py +80 -0
  44. helm/benchmark/scenarios/entity_data_imputation_scenario.py +8 -2
  45. helm/benchmark/scenarios/fin_qa_scenario.py +117 -0
  46. helm/benchmark/scenarios/legalbench_scenario.py +6 -2
  47. helm/benchmark/scenarios/math_scenario.py +1 -1
  48. helm/benchmark/scenarios/test_air_bench_scenario.py +27 -0
  49. helm/benchmark/scenarios/vision_language/a_okvqa_scenario.py +83 -0
  50. helm/benchmark/scenarios/vision_language/bingo_scenario.py +3 -3
  51. helm/benchmark/scenarios/vision_language/crossmodal_3600_scenario.py +134 -0
  52. helm/benchmark/scenarios/vision_language/flickr30k_scenario.py +74 -0
  53. helm/benchmark/scenarios/vision_language/gqa_scenario.py +91 -0
  54. helm/benchmark/scenarios/vision_language/hateful_memes_scenario.py +4 -2
  55. helm/benchmark/scenarios/vision_language/image2structure/image2structure_scenario.py +13 -2
  56. helm/benchmark/scenarios/vision_language/image2structure/latex_scenario.py +1 -5
  57. helm/benchmark/scenarios/vision_language/image2structure/musicsheet_scenario.py +1 -5
  58. helm/benchmark/scenarios/vision_language/image2structure/webpage_scenario.py +5 -3
  59. helm/benchmark/scenarios/vision_language/math_vista_scenario.py +117 -0
  60. helm/benchmark/scenarios/vision_language/mm_safety_bench_scenario.py +103 -0
  61. helm/benchmark/scenarios/vision_language/mscoco_captioning_scenario.py +92 -0
  62. helm/benchmark/scenarios/vision_language/mscoco_categorization_scenario.py +117 -0
  63. helm/benchmark/scenarios/vision_language/originality_scenario.py +35 -0
  64. helm/benchmark/scenarios/vision_language/pairs_scenario.py +247 -0
  65. helm/benchmark/scenarios/vision_language/unicorn_scenario.py +3 -3
  66. helm/benchmark/scenarios/vision_language/vibe_eval_scenario.py +95 -0
  67. helm/benchmark/scenarios/vision_language/viz_wiz_scenario.py +2 -2
  68. helm/benchmark/scenarios/vision_language/vqa_scenario.py +4 -2
  69. helm/benchmark/static/schema_air_bench.yaml +3149 -0
  70. helm/benchmark/static/schema_classic.yaml +3 -59
  71. helm/benchmark/static/schema_finance.yaml +143 -0
  72. helm/benchmark/static/schema_image2structure.yaml +447 -0
  73. helm/benchmark/static/schema_instruction_following.yaml +3 -52
  74. helm/benchmark/static/schema_lite.yaml +3 -61
  75. helm/benchmark/static/schema_medical.yaml +255 -0
  76. helm/benchmark/static/schema_mmlu.yaml +3 -61
  77. helm/benchmark/static/schema_tables.yaml +200 -0
  78. helm/benchmark/static/schema_thai.yaml +223 -0
  79. helm/benchmark/static/schema_unitxt.yaml +3 -61
  80. helm/benchmark/static/schema_vhelm.yaml +824 -0
  81. helm/benchmark/static/schema_vhelm_lite.yaml +109 -0
  82. helm/benchmark/static_build/assets/air-overview-d2e6c49f.png +0 -0
  83. helm/benchmark/static_build/assets/index-30dbceba.js +10 -0
  84. helm/benchmark/static_build/assets/index-66b02d40.css +1 -0
  85. helm/benchmark/static_build/assets/overview-74aea3d8.png +0 -0
  86. helm/benchmark/static_build/assets/process-flow-bd2eba96.png +0 -0
  87. helm/benchmark/static_build/index.html +2 -2
  88. helm/clients/anthropic_client.py +78 -14
  89. helm/clients/auto_client.py +11 -0
  90. helm/clients/client.py +24 -7
  91. helm/clients/cohere_client.py +98 -3
  92. helm/clients/huggingface_client.py +71 -12
  93. helm/clients/openai_client.py +11 -5
  94. helm/clients/reka_client.py +189 -0
  95. helm/clients/test_client.py +3 -3
  96. helm/clients/test_huggingface_client.py +19 -3
  97. helm/clients/test_together_client.py +72 -2
  98. helm/clients/together_client.py +199 -2
  99. helm/clients/vertexai_client.py +117 -64
  100. helm/clients/vision_language/huggingface_vision2seq_client.py +145 -0
  101. helm/clients/vision_language/huggingface_vlm_client.py +12 -4
  102. helm/clients/vision_language/idefics_client.py +2 -2
  103. helm/clients/vision_language/paligemma_client.py +146 -0
  104. helm/clients/vision_language/palmyra_vision_client.py +84 -0
  105. helm/clients/yi_client.py +31 -0
  106. helm/common/critique_request.py +10 -1
  107. helm/common/images_utils.py +29 -3
  108. helm/config/model_deployments.yaml +504 -12
  109. helm/config/model_metadata.yaml +579 -52
  110. helm/config/tokenizer_configs.yaml +100 -1
  111. helm/proxy/critique/model_critique_client.py +32 -4
  112. helm/proxy/services/server_service.py +1 -1
  113. helm/tokenizers/auto_tokenizer.py +1 -1
  114. helm/tokenizers/cohere_tokenizer.py +44 -2
  115. helm/tokenizers/huggingface_tokenizer.py +36 -13
  116. helm/tokenizers/test_cohere_tokenizer.py +39 -0
  117. helm/tokenizers/test_huggingface_tokenizer.py +5 -1
  118. helm/benchmark/static/schema_vlm.yaml +0 -576
  119. helm/benchmark/static_build/assets/index-5088afcb.css +0 -1
  120. helm/benchmark/static_build/assets/index-d839df55.js +0 -9
  121. helm/benchmark/test_model_deployment_definition.py +0 -90
  122. {crfm_helm-0.5.0.dist-info → crfm_helm-0.5.2.dist-info}/LICENSE +0 -0
  123. {crfm_helm-0.5.0.dist-info → crfm_helm-0.5.2.dist-info}/WHEEL +0 -0
  124. {crfm_helm-0.5.0.dist-info → crfm_helm-0.5.2.dist-info}/entry_points.txt +0 -0
  125. {crfm_helm-0.5.0.dist-info → crfm_helm-0.5.2.dist-info}/top_level.txt +0 -0
helm/config/tokenizer_configs.yaml

@@ -69,7 +69,7 @@ tokenizer_configs:
     tokenizer_spec:
       class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
     end_of_text_token: "</s>"
-    prefix_token: "</s>"
+    prefix_token: "<s>"
   - name: bigscience/T0pp
     tokenizer_spec:
       class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
@@ -83,6 +83,46 @@ tokenizer_configs:
     end_of_text_token: ""
     prefix_token: ":"
 
+  - name: cohere/command
+    tokenizer_spec:
+      class_name: "helm.tokenizers.cohere_tokenizer.CohereLocalTokenizer"
+    end_of_text_token: "<EOS_TOKEN>"
+    prefix_token: "<BOS_TOKEN>"
+
+  - name: cohere/command-light
+    tokenizer_spec:
+      class_name: "helm.tokenizers.cohere_tokenizer.CohereLocalTokenizer"
+    end_of_text_token: "<EOS_TOKEN>"
+    prefix_token: "<BOS_TOKEN>"
+
+  - name: cohere/command-r
+    tokenizer_spec:
+      class_name: "helm.tokenizers.cohere_tokenizer.CohereLocalTokenizer"
+    end_of_text_token: "<EOS_TOKEN>"
+    prefix_token: "<BOS_TOKEN>"
+
+  - name: cohere/command-r-plus
+    tokenizer_spec:
+      class_name: "helm.tokenizers.cohere_tokenizer.CohereLocalTokenizer"
+    end_of_text_token: "<EOS_TOKEN>"
+    prefix_token: "<BOS_TOKEN>"
+
+  - name: cohere/c4ai-command-r-v01
+    tokenizer_spec:
+      class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
+      args:
+        pretrained_model_name_or_path: CohereForAI/c4ai-command-r-v01
+    end_of_text_token: "<EOS_TOKEN>"
+    prefix_token: "<BOS_TOKEN>"
+
+  - name: cohere/c4ai-command-r-plus
+    tokenizer_spec:
+      class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
+      args:
+        pretrained_model_name_or_path: CohereForAI/c4ai-command-r-plus
+    end_of_text_token: "<EOS_TOKEN>"
+    prefix_token: "<BOS_TOKEN>"
+
   # Databricks
   - name: databricks/dbrx-instruct
     tokenizer_spec:
@@ -247,6 +287,17 @@ tokenizer_configs:
     end_of_text_token: "</s>"
     prefix_token: "<s>"
 
+  # AI Singapore
+  - name: aisingapore/sea-lion-7b
+    tokenizer_spec:
+      class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
+      args:
+        trust_remote_code: true
+        use_fast: false
+    end_of_text_token: "<|endoftext|>"
+    prefix_token: ""
+
+
 
   # Allen Institute for AI
   # The allenai/olmo-7b requires Python 3.9 or newer.
@@ -259,6 +310,12 @@ tokenizer_configs:
     end_of_text_token: "<|endoftext|>"
     prefix_token: ""
 
+  - name: allenai/OLMo-1.7-7B-hf
+    tokenizer_spec:
+      class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
+    end_of_text_token: "<|endoftext|>"
+    prefix_token: ""
+
 
   # Microsoft
   - name: microsoft/phi-2
@@ -274,6 +331,24 @@ tokenizer_configs:
     end_of_text_token: "</s>"
     prefix_token: "<s>"
 
+  - name: mistralai/Mistral-7B-Instruct-v0.1
+    tokenizer_spec:
+      class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
+    end_of_text_token: "</s>"
+    prefix_token: "<s>"
+
+  - name: mistralai/Mistral-7B-Instruct-v0.2
+    tokenizer_spec:
+      class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
+    end_of_text_token: "</s>"
+    prefix_token: "<s>"
+
+  - name: mistralai/Mistral-7B-Instruct-v0.3
+    tokenizer_spec:
+      class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
+    end_of_text_token: "</s>"
+    prefix_token: "<s>"
+
   # Neurips
   - name: neurips/local
     tokenizer_spec:
@@ -288,6 +363,12 @@ tokenizer_configs:
     end_of_text_token: "<|endoftext|>"
     prefix_token: "<|endoftext|>"
 
+  - name: openai/o200k_base
+    tokenizer_spec:
+      class_name: "helm.tokenizers.tiktoken_tokenizer.TiktokenTokenizer"
+    end_of_text_token: "<|endoftext|>"
+    prefix_token: "<|endoftext|>"
+
   - name: openai/clip-vit-large-patch14
     tokenizer_spec:
       class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
@@ -311,6 +392,14 @@ tokenizer_configs:
     end_of_text_token: "<|endoftext|>"
     prefix_token: ""
 
+  - name: qwen/qwen2-72b-instruct
+    tokenizer_spec:
+      class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
+      args:
+        pretrained_model_name_or_path: Qwen/Qwen2-72B-Instruct
+    end_of_text_token: "<|im_end|>"
+    prefix_token: "<|im_start|>"
+
   - name: qwen/qwen-vl
     tokenizer_spec:
       class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
@@ -331,6 +420,16 @@ tokenizer_configs:
     end_of_text_token: "<|endoftext|>"
     prefix_token: ""
 
+  # Snowflake
+  - name: snowflake/snowflake-arctic-instruct
+    tokenizer_spec:
+      class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
+      args:
+        pretrained_model_name_or_path: Snowflake/snowflake-arctic-instruct
+        trust_remote_code: true
+    end_of_text_token: "<|im_end|>"
+    prefix_token: "<|im_start|>"
+
   # Tiiuae
   - name: tiiuae/falcon-7b
     tokenizer_spec:
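Note on the Cohere entries above: the four cohere/command* configs route through the new CohereLocalTokenizer class added to helm/tokenizers/cohere_tokenizer.py later in this diff, while the two c4ai-* configs reuse HuggingFaceTokenizer with checkpoints from the Hugging Face Hub. A hedged usage sketch based on the tests added in this release (BlackHoleCacheConfig disables caching; a real Cohere API key would normally be supplied instead of None):

from helm.common.cache import BlackHoleCacheConfig
from helm.common.tokenization_request import TokenizationRequest
from helm.tokenizers.cohere_tokenizer import CohereLocalTokenizer

# Construct the tokenizer directly, bypassing the YAML config wiring.
tokenizer = CohereLocalTokenizer(api_key=None, cache_config=BlackHoleCacheConfig())
result = tokenizer.tokenize(TokenizationRequest(tokenizer="cohere/command", text="otter"))
print(result.tokens)  # e.g. [TokenizationToken("ot"), TokenizationToken("ter")]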
helm/proxy/critique/model_critique_client.py

@@ -15,6 +15,7 @@ from helm.common.optional_dependencies import handle_module_not_found_error
 from helm.common.request import Request, RequestResult, GeneratedOutput
 from helm.clients.client import Client
 from helm.proxy.critique.critique_client import CritiqueClient
+from helm.common.media_object import MultimediaObject, MediaObject
 
 
 class CritiqueParseError(Exception):
@@ -24,6 +25,8 @@ class CritiqueParseError(Exception):
 class ModelCritiqueClient(CritiqueClient):
     """A CritiqueClient that queries a Model to answer CritiqueRequests."""
 
+    VISION_LANGUAGE_MODELS = ["openai/gpt-4-vision", "reka/reka", "huggingface/prometheus-vision"]
+
     def __init__(self, client: Client, model_name):
         self._client = client
         self._model_name = model_name
@@ -31,6 +34,11 @@ class ModelCritiqueClient(CritiqueClient):
             get_default_model_deployment_for_model(model_name, warn_arg_deprecated=False, ignore_deprecated=True)
             or self._model_name
         )
+        self.vision_language = False
+        for vision_language_model_name in self.VISION_LANGUAGE_MODELS:
+            if model_name.startswith(vision_language_model_name):
+                self.vision_language = True
+                break
 
     def _interpolate_fields(self, text: str, fields: Dict[str, str]) -> str:
         for key, value in fields.items():
@@ -58,10 +66,15 @@ class ModelCritiqueClient(CritiqueClient):
 
         requests: List[Request] = []
         for question in task.questions:
-            prompt: str = base_prompt + "\n\n" + self._question_to_prompt(question, fields)
+            prompt: str
+            if len(question.text) > 0:
+                prompt = base_prompt + "\n\n" + self._question_to_prompt(question, fields)
+            else:
+                # We may not want to add extra newlines and prompts
+                # if the question text is empty (e.g., the Vibe-Eval evaluator).
+                prompt = base_prompt
             if question.question_type == "free_response":
-                # TODO: Make max_tokens configurable
-                max_tokens = 100
+                max_tokens = 100 if task.max_tokens is None else task.max_tokens
             elif question.question_type == "checkbox":
                 # We multiply by 2 because the model will generate a comma after each option.
                 max_tokens = len(question.options) * 2
@@ -78,12 +91,21 @@ class ModelCritiqueClient(CritiqueClient):
 
                 prompt = anthropic.HUMAN_PROMPT + prompt + anthropic.AI_PROMPT
 
+            multimodal_prompt: Optional[MultimediaObject] = None
+            if self.vision_language:
+                assert question.media_object is not None, "Expect media_object for vision-language models"
+                image_media: MediaObject = question.media_object
+                text_media: MediaObject = MediaObject(text=prompt, content_type="text/plain")
+                multimodal_prompt = MultimediaObject(media_objects=[image_media, text_media])
+                prompt = ""  # set to empty string to avoid conflicts with multimodal_prompt
+
             request = Request(
                 model=self._model_name,
                 model_deployment=self._model_deployment_name,
                 prompt=prompt,
                 max_tokens=max_tokens,
                 echo_prompt=False,
+                multimodal_prompt=multimodal_prompt,
             )
             requests.append(request)
         return requests
@@ -124,7 +146,13 @@ class ModelCritiqueClient(CritiqueClient):
                     raise CritiqueParseError(
                         f"Invalid answer: {completion}. Multiple choice questions should have one answer."
                     )
-                return answers[0]
+                letter_answer = answers[0]
+                choice_rank = string.ascii_uppercase.index(letter_answer)
+                if choice_rank >= len(question.options):
+                    raise CritiqueParseError(
+                        f"Invalid answer: {completion}. The answer is out of range of the options: {question.options}"
+                    )
+                return letter_answer
             except CritiqueParseError as e:
                 # If there was an error parsing the answer, we assume the user did not answer the question.
                 hlog(f"Error parsing answer: {e}. Skipping question (and so the respondent entirely)")
helm/proxy/services/server_service.py

@@ -119,7 +119,7 @@ class ServerService(Service):
             return "codex"
         elif model_deployment.startswith("openai/dall-e-"):
             return "dall_e"
-        elif model_deployment.startswith("openai/gpt-4-"):
+        elif model_deployment.startswith("openai/gpt-4"):
             return "gpt4"
         else:
             return "gpt3"
helm/tokenizers/auto_tokenizer.py

@@ -41,7 +41,7 @@ class AutoTokenizer(Tokenizer):
         if tokenizer_config:
             tokenizer_spec = inject_object_spec_args(
                 tokenizer_config.tokenizer_spec,
-                constant_bindings={"cache_config": cache_config},
+                constant_bindings={"cache_config": cache_config, "tokenizer_name": tokenizer_name},
                 provider_bindings={
                     "api_key": lambda: provide_api_key(self.credentials, organization),
                     "project_id": lambda: self.credentials.get(organization + "ProjectId", None),  # VertexAI
helm/tokenizers/cohere_tokenizer.py

@@ -1,6 +1,9 @@
 import json
 import requests
-from typing import Any, Dict, List
+from typing import Any, Dict, List, Optional
+
+import cohere
+from cohere.manually_maintained.tokenizers import get_hf_tokenizer
 
 from helm.common.cache import CacheConfig
 from helm.common.tokenization_request import (
@@ -10,7 +13,7 @@ from helm.common.tokenization_request import (
     TokenizationToken,
 )
 from helm.clients.cohere_utils import get_cohere_url, DEFAULT_COHERE_API_VERSION
-from .caching_tokenizer import CachingTokenizer
+from helm.tokenizers.caching_tokenizer import CachingTokenizer
 
 
 class CohereTokenizer(CachingTokenizer):
@@ -81,3 +84,42 @@ class CohereTokenizer(CachingTokenizer):
 
     def decode(self, request: DecodeRequest) -> DecodeRequestResult:
         raise NotImplementedError("The Cohere API does not support decoding.")
+
+
+class CohereLocalTokenizer(CachingTokenizer):
+    """Cohere tokenizer using the Cohere Python library."""
+
+    def __init__(self, api_key: Optional[str], cache_config: CacheConfig) -> None:
+        super().__init__(cache_config)
+        self.client = cohere.Client(api_key)
+
+    def _tokenization_request_to_cache_key(self, request: TokenizationRequest) -> Dict[str, Any]:
+        return {"text": request.text, "tokenizer": request.tokenizer}
+
+    def _tokenize_do_it(self, request: Dict[str, Any]) -> Dict[str, Any]:
+        model: str = request["tokenizer"].split("/")[1]
+        # Workaround for https://github.com/cohere-ai/cohere-python/issues/493
+        # `token_strings` are always set to `[]`, so we have to populate it ourselves.
+        response = self.client.tokenize(text=request["text"], model=model)
+        response_dict = response.dict()
+        response_dict["token_strings"] = get_hf_tokenizer(self.client, model).decode_batch(
+            [[token] for token in response.tokens]
+        )
+        return response_dict
+
+    def _tokenization_raw_response_to_tokens(
+        self, response: Dict[str, Any], request: TokenizationRequest
+    ) -> List[TokenizationToken]:
+        tokens: List[TokenizationToken] = []
+        if request.encode:
+            tokens = [TokenizationToken(token) for token in response["tokens"]]
+        else:
+            tokens = [TokenizationToken(token) for token in response["token_strings"]]
+        if request.truncation:
+            tokens = tokens[: request.max_length]
+        return tokens
+
+    def _decode_do_it(self, request: Dict[str, Any]) -> Dict[str, Any]:
+        model: str = request["tokenizer"].split("/")[1]
+        response = self.client.detokenize(tokens=request["tokens"], model=model)
+        return response.dict()
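CohereLocalTokenizer follows the CachingTokenizer template used throughout this package: the *_do_it hooks return raw JSON-serializable dicts (these are what get cached), and _tokenization_raw_response_to_tokens converts a raw response into TokenizationToken objects. A toy subclass sketching that contract; WhitespaceLocalTokenizer is illustrative only and not part of HELM:

from typing import Any, Dict, List

from helm.common.tokenization_request import TokenizationRequest, TokenizationToken
from helm.tokenizers.caching_tokenizer import CachingTokenizer


class WhitespaceLocalTokenizer(CachingTokenizer):
    """Illustrative only: splits on whitespace instead of calling an API."""

    def _tokenize_do_it(self, request: Dict[str, Any]) -> Dict[str, Any]:
        # The returned dict is what the caching layer stores.
        return {"tokens": request["text"].split()}

    def _tokenization_raw_response_to_tokens(
        self, response: Dict[str, Any], request: TokenizationRequest
    ) -> List[TokenizationToken]:
        return [TokenizationToken(token) for token in response["tokens"]]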
helm/tokenizers/huggingface_tokenizer.py

@@ -29,8 +29,17 @@ class HuggingFaceTokenizer(CachingTokenizer):
     _tokenizers: Dict[str, WrappedPreTrainedTokenizer] = {}
     _tokenizers_lock: Lock = Lock()
 
-    def __init__(self, cache_config: CacheConfig, pretrained_model_name_or_path: Optional[str] = None, **kwargs):
+    def __init__(
+        self,
+        cache_config: CacheConfig,
+        tokenizer_name: str,
+        pretrained_model_name_or_path: Optional[str] = None,
+        **kwargs,
+    ):
         super().__init__(cache_config=cache_config)
+        self._helm_tokenizer_name = (
+            tokenizer_name  # HELM tokenizer name (e.g. "huggingface/gpt2"), *not* a Hugging Face Hub model ID
+        )
         self._pretrained_model_name_or_path = pretrained_model_name_or_path
         self._kwargs = kwargs
 
@@ -40,7 +49,11 @@ class HuggingFaceTokenizer(CachingTokenizer):
         # To avoid deadlocks when using HuggingFace tokenizers with multiple processes
         # TODO: Figure out if we actually need this.
         os.environ["TOKENIZERS_PARALLELISM"] = "False"
-
+        from_pretrained_kwargs = {**kwargs}
+        # If unspecified, set `use_fast=True` by default.
+        if "use_fast" not in from_pretrained_kwargs:
+            from_pretrained_kwargs["use_fast"] = True
+        print(from_pretrained_kwargs)
         try:
             # From the Hugging Face documentation, "local_files_only(defaults to False) —
             # Whether or not to only look at local files".
@@ -53,14 +66,14 @@ class HuggingFaceTokenizer(CachingTokenizer):
             # Tokenizers, which are written in Rust." So, use the "fast" version of the tokenizers if available.
             return WrappedPreTrainedTokenizer(
                 AutoTokenizer.from_pretrained(
-                    pretrained_model_name_or_path, local_files_only=True, use_fast=True, **kwargs
+                    pretrained_model_name_or_path, local_files_only=True, **from_pretrained_kwargs
                 )
             )
         except OSError:
             hlog(f"Local files do not exist for HuggingFace tokenizer: {pretrained_model_name_or_path}. Downloading...")
             return WrappedPreTrainedTokenizer(
                 AutoTokenizer.from_pretrained(
-                    pretrained_model_name_or_path, local_files_only=False, use_fast=True, **kwargs
+                    pretrained_model_name_or_path, local_files_only=False, **from_pretrained_kwargs
                 )
             )
 
@@ -84,21 +97,26 @@ class HuggingFaceTokenizer(CachingTokenizer):
                 )
             return HuggingFaceTokenizer._tokenizers[helm_tokenizer_name]
 
-    def _get_tokenizer_for_request(self, request: Dict[str, Any]) -> WrappedPreTrainedTokenizer:
-        """Method used in both _tokenize_do_it and _decode_do_it to get the tokenizer."""
+    def get_wrapped_tokenizer(self) -> WrappedPreTrainedTokenizer:
+        """Get the underlying Hugging Face WrappedPreTrainedTokenizer."""
         pretrained_model_name_or_path = (
-            self._pretrained_model_name_or_path if self._pretrained_model_name_or_path else request["tokenizer"]
+            self._pretrained_model_name_or_path if self._pretrained_model_name_or_path else self._helm_tokenizer_name
        )
         return HuggingFaceTokenizer.get_tokenizer(
-            helm_tokenizer_name=request["tokenizer"],
+            helm_tokenizer_name=self._helm_tokenizer_name,
             pretrained_model_name_or_path=pretrained_model_name_or_path,
             **self._kwargs,
         )
 
     def _tokenize_do_it(self, request: Dict[str, Any]) -> Dict[str, Any]:
+        if request["tokenizer"] != self._helm_tokenizer_name:
+            raise ValueError(
+                f"This HuggingFaceTokenizer expects tokenizer to be {self._helm_tokenizer_name} "
+                f"but instead the request has tokenizer {request['tokenizer']}"
+            )
         if request["encode"]:
             if request["truncation"]:
-                with self._get_tokenizer_for_request(request) as tokenizer:
+                with self.get_wrapped_tokenizer() as tokenizer:
                     tokens = tokenizer.encode(
                         request["text"],
                         truncation=request["truncation"],
@@ -106,7 +124,7 @@ class HuggingFaceTokenizer(CachingTokenizer):
                         add_special_tokens=False,
                     )
             else:
-                with self._get_tokenizer_for_request(request) as tokenizer:
+                with self.get_wrapped_tokenizer() as tokenizer:
                     tokens = tokenizer.encode(request["text"], add_special_tokens=False)
         else:
             if "gpt" in request["tokenizer"] or request["tokenizer"] in [
@@ -118,7 +136,7 @@ class HuggingFaceTokenizer(CachingTokenizer):
                 # convert_tokens_to_string method. We prefer to use this method instead
                 # of the hacky cleanup_tokens method below as it might handle cases
                 # we haven't thought of in cleanup_tokens.
-                with self._get_tokenizer_for_request(request) as tokenizer:
+                with self.get_wrapped_tokenizer() as tokenizer:
                     tokens = [
                         tokenizer.convert_tokens_to_string([token]) for token in tokenizer.tokenize(request["text"])
                     ]
@@ -131,7 +149,7 @@ class HuggingFaceTokenizer(CachingTokenizer):
                 # But this replaces all the "▁" characters by "", which is not what we want.
                 # This would be problematic as tokenize(" Hello", encode=False) would return ["Hello"]
                 # Just like tokenize("Hello", encode=False) would return ["Hello"].
-                with self._get_tokenizer_for_request(request) as tokenizer:
+                with self.get_wrapped_tokenizer() as tokenizer:
                     tokens = tokenizer.tokenize(request["text"])
                 # Some tokenizers (e.g. Qwen/Qwen-7B) return the tokens as bytes, so we have to decode them to strings.
                 if tokens and type(tokens[0]) == bytes:
@@ -140,7 +158,12 @@ class HuggingFaceTokenizer(CachingTokenizer):
         return {"tokens": tokens}
 
     def _decode_do_it(self, request: Dict[str, Any]) -> Dict[str, Any]:
-        with self._get_tokenizer_for_request(request) as tokenizer:
+        if request["tokenizer"] != self._helm_tokenizer_name:
+            raise ValueError(
+                f"This HuggingFaceTokenizer expects tokenizer to be {self._helm_tokenizer_name} "
+                f"but instead the request has tokenizer {request['tokenizer']}"
+            )
+        with self.get_wrapped_tokenizer() as tokenizer:
            text = tokenizer.decode(
                request["tokens"], clean_up_tokenization_spaces=request["clean_up_tokenization_spaces"]
            )
helm/tokenizers/test_cohere_tokenizer.py (new file)

@@ -0,0 +1,39 @@
+import pytest
+
+from helm.common.cache import BlackHoleCacheConfig
+from helm.common.tokenization_request import (
+    DecodeRequest,
+    TokenizationRequest,
+    TokenizationToken,
+)
+from helm.tokenizers.cohere_tokenizer import CohereLocalTokenizer
+
+
+@pytest.mark.models
+def test_tokenize():
+    tokenizer = CohereLocalTokenizer(api_key=None, cache_config=BlackHoleCacheConfig())
+    request = TokenizationRequest(tokenizer="cohere/command", text="otter 🦦")
+    result = tokenizer.tokenize(request)
+    assert result.success
+    assert not result.cached
+    assert result.tokens == [TokenizationToken(token) for token in ["ot", "ter", " �", "�", "�"]]
+
+
+@pytest.mark.models
+def test_encode():
+    tokenizer = CohereLocalTokenizer(api_key=None, cache_config=BlackHoleCacheConfig())
+    request = TokenizationRequest(tokenizer="cohere/command", text="otter 🦦", encode=True)
+    result = tokenizer.tokenize(request)
+    assert result.success
+    assert not result.cached
+    assert result.tokens == [TokenizationToken(token) for token in [1741, 1779, 7728, 107, 107]]
+
+
+@pytest.mark.models
+def test_decode():
+    tokenizer = CohereLocalTokenizer(api_key=None, cache_config=BlackHoleCacheConfig())
+    request = DecodeRequest(tokenizer="cohere/command", tokens=[1741, 1779, 7728, 107, 107])
+    result = tokenizer.decode(request)
+    assert result.success
+    assert not result.cached
+    assert result.text == "otter 🦦"
helm/tokenizers/test_huggingface_tokenizer.py

@@ -17,7 +17,11 @@ class TestHuggingFaceGPT2Tokenizer:
     def setup_method(self, method):
         cache_file = tempfile.NamedTemporaryFile(delete=False)
         self.cache_path: str = cache_file.name
-        self.tokenizer = HuggingFaceTokenizer(SqliteCacheConfig(self.cache_path))
+        self.tokenizer = HuggingFaceTokenizer(
+            SqliteCacheConfig(self.cache_path),
+            tokenizer_name="huggingface/gpt2",
+            pretrained_model_name_or_path="openai-community/gpt2",
+        )
 
     def teardown_method(self, method):
         os.remove(self.cache_path)