crfm-helm 0.5.1__py3-none-any.whl → 0.5.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of crfm-helm might be problematic.
- {crfm_helm-0.5.1.dist-info → crfm_helm-0.5.2.dist-info}/METADATA +13 -3
- {crfm_helm-0.5.1.dist-info → crfm_helm-0.5.2.dist-info}/RECORD +96 -63
- helm/benchmark/adaptation/adapter_spec.py +32 -31
- helm/benchmark/annotation/air_bench_annotator.py +64 -0
- helm/benchmark/annotation/annotator_factory.py +6 -0
- helm/benchmark/annotation/live_qa_annotator.py +84 -0
- helm/benchmark/annotation/medication_qa_annotator.py +81 -0
- helm/benchmark/augmentations/translate_perturbation.py +1 -0
- helm/benchmark/huggingface_registration.py +16 -6
- helm/benchmark/metrics/air_bench_metrics.py +56 -0
- helm/benchmark/metrics/fin_qa_metrics.py +60 -0
- helm/benchmark/metrics/fin_qa_metrics_helper.py +398 -0
- helm/benchmark/metrics/gpt4v_originality_critique_metrics.py +126 -0
- helm/benchmark/metrics/instruction_following_critique_metrics.py +1 -0
- helm/benchmark/metrics/live_qa_metrics.py +23 -0
- helm/benchmark/metrics/medication_qa_metrics.py +23 -0
- helm/benchmark/metrics/prometheus_vision_critique_metrics.py +185 -0
- helm/benchmark/metrics/reka_vibe_critique_metrics.py +158 -0
- helm/benchmark/metrics/unitxt_metrics.py +20 -10
- helm/benchmark/metrics/vision_language/emd_utils.py +4 -0
- helm/benchmark/metrics/vision_language/image_metrics.py +29 -71
- helm/benchmark/presentation/schema.py +54 -4
- helm/benchmark/presentation/test_schema.py +11 -0
- helm/benchmark/run.py +16 -2
- helm/benchmark/run_expander.py +77 -0
- helm/benchmark/run_spec_factory.py +4 -0
- helm/benchmark/run_specs/air_bench_run_specs.py +40 -0
- helm/benchmark/run_specs/classic_run_specs.py +15 -11
- helm/benchmark/run_specs/decodingtrust_run_specs.py +3 -1
- helm/benchmark/run_specs/experimental_run_specs.py +33 -0
- helm/benchmark/run_specs/finance_run_specs.py +33 -0
- helm/benchmark/run_specs/vlm_run_specs.py +168 -45
- helm/benchmark/scenarios/air_bench_scenario.py +50 -0
- helm/benchmark/scenarios/ci_mcqa_scenario.py +80 -0
- helm/benchmark/scenarios/entity_data_imputation_scenario.py +8 -2
- helm/benchmark/scenarios/fin_qa_scenario.py +117 -0
- helm/benchmark/scenarios/test_air_bench_scenario.py +27 -0
- helm/benchmark/scenarios/vision_language/bingo_scenario.py +3 -3
- helm/benchmark/scenarios/vision_language/image2structure/image2structure_scenario.py +13 -2
- helm/benchmark/scenarios/vision_language/image2structure/latex_scenario.py +1 -5
- helm/benchmark/scenarios/vision_language/image2structure/musicsheet_scenario.py +0 -4
- helm/benchmark/scenarios/vision_language/image2structure/webpage_scenario.py +4 -2
- helm/benchmark/scenarios/vision_language/pairs_scenario.py +6 -5
- helm/benchmark/scenarios/vision_language/unicorn_scenario.py +3 -3
- helm/benchmark/scenarios/vision_language/vibe_eval_scenario.py +95 -0
- helm/benchmark/static/schema_air_bench.yaml +3149 -0
- helm/benchmark/static/schema_classic.yaml +3 -59
- helm/benchmark/static/schema_finance.yaml +143 -0
- helm/benchmark/static/schema_image2structure.yaml +254 -111
- helm/benchmark/static/schema_instruction_following.yaml +3 -52
- helm/benchmark/static/schema_lite.yaml +3 -61
- helm/benchmark/static/schema_medical.yaml +255 -0
- helm/benchmark/static/schema_mmlu.yaml +3 -61
- helm/benchmark/static/schema_tables.yaml +200 -0
- helm/benchmark/static/schema_thai.yaml +223 -0
- helm/benchmark/static/schema_unitxt.yaml +3 -61
- helm/benchmark/static/{schema_vlm.yaml → schema_vhelm.yaml} +294 -293
- helm/benchmark/static/schema_vhelm_lite.yaml +4 -59
- helm/benchmark/static_build/assets/air-overview-d2e6c49f.png +0 -0
- helm/benchmark/static_build/assets/index-30dbceba.js +10 -0
- helm/benchmark/static_build/assets/index-66b02d40.css +1 -0
- helm/benchmark/static_build/assets/overview-74aea3d8.png +0 -0
- helm/benchmark/static_build/assets/process-flow-bd2eba96.png +0 -0
- helm/benchmark/static_build/index.html +2 -2
- helm/clients/anthropic_client.py +43 -9
- helm/clients/auto_client.py +11 -0
- helm/clients/client.py +24 -7
- helm/clients/cohere_client.py +98 -3
- helm/clients/huggingface_client.py +71 -12
- helm/clients/openai_client.py +9 -2
- helm/clients/reka_client.py +189 -0
- helm/clients/test_client.py +3 -3
- helm/clients/test_huggingface_client.py +19 -3
- helm/clients/test_together_client.py +72 -2
- helm/clients/together_client.py +129 -23
- helm/clients/vertexai_client.py +62 -18
- helm/clients/vision_language/huggingface_vlm_client.py +1 -0
- helm/clients/vision_language/paligemma_client.py +146 -0
- helm/clients/vision_language/palmyra_vision_client.py +84 -0
- helm/clients/yi_client.py +31 -0
- helm/common/critique_request.py +10 -1
- helm/common/images_utils.py +19 -0
- helm/config/model_deployments.yaml +412 -18
- helm/config/model_metadata.yaml +447 -25
- helm/config/tokenizer_configs.yaml +93 -1
- helm/proxy/critique/model_critique_client.py +32 -4
- helm/proxy/services/server_service.py +1 -1
- helm/tokenizers/auto_tokenizer.py +1 -1
- helm/tokenizers/cohere_tokenizer.py +44 -2
- helm/tokenizers/huggingface_tokenizer.py +36 -13
- helm/tokenizers/test_cohere_tokenizer.py +39 -0
- helm/tokenizers/test_huggingface_tokenizer.py +5 -1
- helm/benchmark/static_build/assets/index-737eef9e.js +0 -10
- helm/benchmark/static_build/assets/index-878a1094.css +0 -1
- {crfm_helm-0.5.1.dist-info → crfm_helm-0.5.2.dist-info}/LICENSE +0 -0
- {crfm_helm-0.5.1.dist-info → crfm_helm-0.5.2.dist-info}/WHEEL +0 -0
- {crfm_helm-0.5.1.dist-info → crfm_helm-0.5.2.dist-info}/entry_points.txt +0 -0
- {crfm_helm-0.5.1.dist-info → crfm_helm-0.5.2.dist-info}/top_level.txt +0 -0
helm/config/tokenizer_configs.yaml

@@ -69,7 +69,7 @@ tokenizer_configs:
     tokenizer_spec:
       class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
     end_of_text_token: "</s>"
-    prefix_token: "
+    prefix_token: "<s>"
   - name: bigscience/T0pp
     tokenizer_spec:
       class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
@@ -83,6 +83,46 @@ tokenizer_configs:
     end_of_text_token: ""
     prefix_token: ":"
 
+  - name: cohere/command
+    tokenizer_spec:
+      class_name: "helm.tokenizers.cohere_tokenizer.CohereLocalTokenizer"
+    end_of_text_token: "<EOS_TOKEN>"
+    prefix_token: "<BOS_TOKEN>"
+
+  - name: cohere/command-light
+    tokenizer_spec:
+      class_name: "helm.tokenizers.cohere_tokenizer.CohereLocalTokenizer"
+    end_of_text_token: "<EOS_TOKEN>"
+    prefix_token: "<BOS_TOKEN>"
+
+  - name: cohere/command-r
+    tokenizer_spec:
+      class_name: "helm.tokenizers.cohere_tokenizer.CohereLocalTokenizer"
+    end_of_text_token: "<EOS_TOKEN>"
+    prefix_token: "<BOS_TOKEN>"
+
+  - name: cohere/command-r-plus
+    tokenizer_spec:
+      class_name: "helm.tokenizers.cohere_tokenizer.CohereLocalTokenizer"
+    end_of_text_token: "<EOS_TOKEN>"
+    prefix_token: "<BOS_TOKEN>"
+
+  - name: cohere/c4ai-command-r-v01
+    tokenizer_spec:
+      class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
+      args:
+        pretrained_model_name_or_path: CohereForAI/c4ai-command-r-v01
+    end_of_text_token: "<EOS_TOKEN>"
+    prefix_token: "<BOS_TOKEN>"
+
+  - name: cohere/c4ai-command-r-plus
+    tokenizer_spec:
+      class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
+      args:
+        pretrained_model_name_or_path: CohereForAI/c4ai-command-r-plus
+    end_of_text_token: "<EOS_TOKEN>"
+    prefix_token: "<BOS_TOKEN>"
+
   # Databricks
   - name: databricks/dbrx-instruct
     tokenizer_spec:
@@ -247,6 +287,17 @@ tokenizer_configs:
     end_of_text_token: "</s>"
     prefix_token: "<s>"
 
+  # AI Singapore
+  - name: aisingapore/sea-lion-7b
+    tokenizer_spec:
+      class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
+      args:
+        trust_remote_code: true
+        use_fast: false
+    end_of_text_token: "<|endoftext|>"
+    prefix_token: ""
+
+
 
   # Allen Institute for AI
   # The allenai/olmo-7b requires Python 3.9 or newer.
@@ -259,6 +310,12 @@ tokenizer_configs:
     end_of_text_token: "<|endoftext|>"
     prefix_token: ""
 
+  - name: allenai/OLMo-1.7-7B-hf
+    tokenizer_spec:
+      class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
+    end_of_text_token: "<|endoftext|>"
+    prefix_token: ""
+
 
   # Microsoft
   - name: microsoft/phi-2
@@ -274,6 +331,24 @@ tokenizer_configs:
     end_of_text_token: "</s>"
     prefix_token: "<s>"
 
+  - name: mistralai/Mistral-7B-Instruct-v0.1
+    tokenizer_spec:
+      class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
+    end_of_text_token: "</s>"
+    prefix_token: "<s>"
+
+  - name: mistralai/Mistral-7B-Instruct-v0.2
+    tokenizer_spec:
+      class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
+    end_of_text_token: "</s>"
+    prefix_token: "<s>"
+
+  - name: mistralai/Mistral-7B-Instruct-v0.3
+    tokenizer_spec:
+      class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
+    end_of_text_token: "</s>"
+    prefix_token: "<s>"
+
   # Neurips
   - name: neurips/local
     tokenizer_spec:
@@ -288,6 +363,12 @@ tokenizer_configs:
     end_of_text_token: "<|endoftext|>"
     prefix_token: "<|endoftext|>"
 
+  - name: openai/o200k_base
+    tokenizer_spec:
+      class_name: "helm.tokenizers.tiktoken_tokenizer.TiktokenTokenizer"
+    end_of_text_token: "<|endoftext|>"
+    prefix_token: "<|endoftext|>"
+
   - name: openai/clip-vit-large-patch14
     tokenizer_spec:
       class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
@@ -311,6 +392,14 @@ tokenizer_configs:
     end_of_text_token: "<|endoftext|>"
     prefix_token: ""
 
+  - name: qwen/qwen2-72b-instruct
+    tokenizer_spec:
+      class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
+      args:
+        pretrained_model_name_or_path: Qwen/Qwen2-72B-Instruct
+    end_of_text_token: "<|im_end|>"
+    prefix_token: "<|im_start|>"
+
   - name: qwen/qwen-vl
     tokenizer_spec:
       class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
@@ -335,6 +424,9 @@ tokenizer_configs:
   - name: snowflake/snowflake-arctic-instruct
     tokenizer_spec:
       class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
+      args:
+        pretrained_model_name_or_path: Snowflake/snowflake-arctic-instruct
+        trust_remote_code: true
     end_of_text_token: "<|im_end|>"
     prefix_token: "<|im_start|>"
 
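Note: the new Cohere entries above are served by the CohereLocalTokenizer class added to helm/tokenizers/cohere_tokenizer.py later in this diff. As a rough, hedged sketch of what one of these config entries amounts to at runtime (patterned directly on the new test_cohere_tokenizer.py below; the no-op BlackHoleCacheConfig and api_key=None follow those tests and are not required in normal use):

    from helm.common.cache import BlackHoleCacheConfig
    from helm.common.tokenization_request import TokenizationRequest
    from helm.tokenizers.cohere_tokenizer import CohereLocalTokenizer

    # "cohere/command" resolves to CohereLocalTokenizer per the config entry above.
    tokenizer = CohereLocalTokenizer(api_key=None, cache_config=BlackHoleCacheConfig())
    result = tokenizer.tokenize(TokenizationRequest(tokenizer="cohere/command", text="otter"))
    print(result.tokens)  # a list of TokenizationToken objects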
helm/proxy/critique/model_critique_client.py

@@ -15,6 +15,7 @@ from helm.common.optional_dependencies import handle_module_not_found_error
 from helm.common.request import Request, RequestResult, GeneratedOutput
 from helm.clients.client import Client
 from helm.proxy.critique.critique_client import CritiqueClient
+from helm.common.media_object import MultimediaObject, MediaObject
 
 
 class CritiqueParseError(Exception):
@@ -24,6 +25,8 @@ class CritiqueParseError(Exception):
 class ModelCritiqueClient(CritiqueClient):
     """A CritiqueClient that queries a Model to answer CritiqueRequests."""
 
+    VISION_LANGUAGE_MODELS = ["openai/gpt-4-vision", "reka/reka", "huggingface/prometheus-vision"]
+
     def __init__(self, client: Client, model_name):
         self._client = client
         self._model_name = model_name
@@ -31,6 +34,11 @@ class ModelCritiqueClient(CritiqueClient):
             get_default_model_deployment_for_model(model_name, warn_arg_deprecated=False, ignore_deprecated=True)
             or self._model_name
         )
+        self.vision_language = False
+        for vision_language_model_name in self.VISION_LANGUAGE_MODELS:
+            if model_name.startswith(vision_language_model_name):
+                self.vision_language = True
+                break
 
     def _interpolate_fields(self, text: str, fields: Dict[str, str]) -> str:
         for key, value in fields.items():
@@ -58,10 +66,15 @@ class ModelCritiqueClient(CritiqueClient):
 
         requests: List[Request] = []
         for question in task.questions:
-            prompt: str = base_prompt + "\n\n" + self._question_to_prompt(question, fields)
+            prompt: str
+            if len(question.text) > 0:
+                prompt = base_prompt + "\n\n" + self._question_to_prompt(question, fields)
+            else:
+                # We may not want to add extra newlines and prompts
+                # if the question text is empty (e.g., the Vibe-Eval evaluator).
+                prompt = base_prompt
             if question.question_type == "free_response":
-
-                max_tokens = 100
+                max_tokens = 100 if task.max_tokens is None else task.max_tokens
             elif question.question_type == "checkbox":
                 # We multiply by 2 because the model will generate a comma after each option.
                 max_tokens = len(question.options) * 2
@@ -78,12 +91,21 @@ class ModelCritiqueClient(CritiqueClient):
 
             prompt = anthropic.HUMAN_PROMPT + prompt + anthropic.AI_PROMPT
 
+            multimodal_prompt: Optional[MultimediaObject] = None
+            if self.vision_language:
+                assert question.media_object is not None, "Expect media_object for vision-language models"
+                image_media: MediaObject = question.media_object
+                text_media: MediaObject = MediaObject(text=prompt, content_type="text/plain")
+                multimodal_prompt = MultimediaObject(media_objects=[image_media, text_media])
+                prompt = ""  # set to empty string to avoid conflicts with multimodal_prompt
+
             request = Request(
                 model=self._model_name,
                 model_deployment=self._model_deployment_name,
                 prompt=prompt,
                 max_tokens=max_tokens,
                 echo_prompt=False,
+                multimodal_prompt=multimodal_prompt,
             )
             requests.append(request)
         return requests
@@ -124,7 +146,13 @@ class ModelCritiqueClient(CritiqueClient):
                 raise CritiqueParseError(
                     f"Invalid answer: {completion}. Multiple choice questions should have one answer."
                 )
-            return answers[0]
+            letter_answer = answers[0]
+            choice_rank = string.ascii_uppercase.index(letter_answer)
+            if choice_rank >= len(question.options):
+                raise CritiqueParseError(
+                    f"Invalid answer: {completion}. The answer is out of range of the options: {question.options}"
+                )
+            return letter_answer
         except CritiqueParseError as e:
             # If there was an error parsing the answer, we assume the user did not answer the question.
             hlog(f"Error parsing answer: {e}. Skipping question (and so the respondent entirely)")
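Note: a hedged sketch of the request shape the new vision-language branch above assembles. The image location, prompt text, and the use of MediaObject's location field are illustrative assumptions; only the text/plain wrapping and the media_objects ordering are taken from the diff itself:

    from helm.common.media_object import MediaObject, MultimediaObject

    # Hypothetical image attached to a critique question.
    image_media = MediaObject(location="question_image.png", content_type="image/png")
    # The interpolated critique prompt travels as a plain-text media object.
    text_media = MediaObject(text="Rate the answer from A to E.", content_type="text/plain")
    # Passed to Request(multimodal_prompt=...) while prompt is set to "".
    multimodal_prompt = MultimediaObject(media_objects=[image_media, text_media])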
helm/proxy/services/server_service.py

@@ -119,7 +119,7 @@ class ServerService(Service):
             return "codex"
         elif model_deployment.startswith("openai/dall-e-"):
             return "dall_e"
-        elif model_deployment.startswith("openai/gpt-4
+        elif model_deployment.startswith("openai/gpt-4"):
             return "gpt4"
         else:
             return "gpt3"
helm/tokenizers/auto_tokenizer.py

@@ -41,7 +41,7 @@ class AutoTokenizer(Tokenizer):
         if tokenizer_config:
             tokenizer_spec = inject_object_spec_args(
                 tokenizer_config.tokenizer_spec,
-                constant_bindings={"cache_config": cache_config},
+                constant_bindings={"cache_config": cache_config, "tokenizer_name": tokenizer_name},
                 provider_bindings={
                     "api_key": lambda: provide_api_key(self.credentials, organization),
                     "project_id": lambda: self.credentials.get(organization + "ProjectId", None),  # VertexAI
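Note: a rough, hypothetical illustration of what the added constant binding achieves; build_tokenizer is not a real HELM function, it just mirrors how inject_object_spec_args ends up passing bound constants to the tokenizer class named in the config:

    # Hypothetical stand-in for HELM's ObjectSpec machinery.
    def build_tokenizer(tokenizer_class, spec_args, cache_config, tokenizer_name):
        # Every tokenizer constructed through AutoTokenizer now also receives the
        # HELM tokenizer name (HuggingFaceTokenizer requires it; see below).
        return tokenizer_class(cache_config=cache_config, tokenizer_name=tokenizer_name, **spec_args)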
helm/tokenizers/cohere_tokenizer.py

@@ -1,6 +1,9 @@
 import json
 import requests
-from typing import Any, Dict, List
+from typing import Any, Dict, List, Optional
+
+import cohere
+from cohere.manually_maintained.tokenizers import get_hf_tokenizer
 
 from helm.common.cache import CacheConfig
 from helm.common.tokenization_request import (
@@ -10,7 +13,7 @@ from helm.common.tokenization_request import (
     TokenizationToken,
 )
 from helm.clients.cohere_utils import get_cohere_url, DEFAULT_COHERE_API_VERSION
-from .caching_tokenizer import CachingTokenizer
+from helm.tokenizers.caching_tokenizer import CachingTokenizer
 
 
 class CohereTokenizer(CachingTokenizer):
@@ -81,3 +84,42 @@ class CohereTokenizer(CachingTokenizer):
 
     def decode(self, request: DecodeRequest) -> DecodeRequestResult:
         raise NotImplementedError("The Cohere API does not support decoding.")
+
+
+class CohereLocalTokenizer(CachingTokenizer):
+    """Cohere tokenizer using the Cohere Python library."""
+
+    def __init__(self, api_key: Optional[str], cache_config: CacheConfig) -> None:
+        super().__init__(cache_config)
+        self.client = cohere.Client(api_key)
+
+    def _tokenization_request_to_cache_key(self, request: TokenizationRequest) -> Dict[str, Any]:
+        return {"text": request.text, "tokenizer": request.tokenizer}
+
+    def _tokenize_do_it(self, request: Dict[str, Any]) -> Dict[str, Any]:
+        model: str = request["tokenizer"].split("/")[1]
+        # Workaround for https://github.com/cohere-ai/cohere-python/issues/493
+        # `token_strings` are always set to `[]`, so we have to populate it ourselves.
+        response = self.client.tokenize(text=request["text"], model=model)
+        response_dict = response.dict()
+        response_dict["token_strings"] = get_hf_tokenizer(self.client, model).decode_batch(
+            [[token] for token in response.tokens]
+        )
+        return response_dict
+
+    def _tokenization_raw_response_to_tokens(
+        self, response: Dict[str, Any], request: TokenizationRequest
+    ) -> List[TokenizationToken]:
+        tokens: List[TokenizationToken] = []
+        if request.encode:
+            tokens = [TokenizationToken(token) for token in response["tokens"]]
+        else:
+            tokens = [TokenizationToken(token) for token in response["token_strings"]]
+        if request.truncation:
+            tokens = tokens[: request.max_length]
+        return tokens
+
+    def _decode_do_it(self, request: Dict[str, Any]) -> Dict[str, Any]:
+        model: str = request["tokenizer"].split("/")[1]
+        response = self.client.detokenize(tokens=request["tokens"], model=model)
+        return response.dict()
helm/tokenizers/huggingface_tokenizer.py

@@ -29,8 +29,17 @@ class HuggingFaceTokenizer(CachingTokenizer):
     _tokenizers: Dict[str, WrappedPreTrainedTokenizer] = {}
     _tokenizers_lock: Lock = Lock()
 
-    def __init__(self, cache_config: CacheConfig, pretrained_model_name_or_path: Optional[str] = None, **kwargs):
+    def __init__(
+        self,
+        cache_config: CacheConfig,
+        tokenizer_name: str,
+        pretrained_model_name_or_path: Optional[str] = None,
+        **kwargs,
+    ):
         super().__init__(cache_config=cache_config)
+        self._helm_tokenizer_name = (
+            tokenizer_name  # HELM tokenizer name (e.g. "huggingface/gpt2"), *not* Hugging Face Hub Model ID
+        )
         self._pretrained_model_name_or_path = pretrained_model_name_or_path
         self._kwargs = kwargs
 
@@ -40,7 +49,11 @@ class HuggingFaceTokenizer(CachingTokenizer):
         # To avoid deadlocks when using HuggingFace tokenizers with multiple processes
         # TODO: Figure out if we actually need this.
         os.environ["TOKENIZERS_PARALLELISM"] = "False"
-
+        from_pretrained_kwargs = {**kwargs}
+        # If unspecified, set `use_fast=True` by default.
+        if "use_fast" not in from_pretrained_kwargs:
+            from_pretrained_kwargs["use_fast"] = True
+        print(from_pretrained_kwargs)
         try:
             # From the Hugging Face documentation, "local_files_only(defaults to False) —
             # Whether or not to only look at local files".
@@ -53,14 +66,14 @@ class HuggingFaceTokenizer(CachingTokenizer):
             # Tokenizers, which are written in Rust." So, use the "fast" version of the tokenizers if available.
             return WrappedPreTrainedTokenizer(
                 AutoTokenizer.from_pretrained(
-                    pretrained_model_name_or_path, local_files_only=True,
+                    pretrained_model_name_or_path, local_files_only=True, **from_pretrained_kwargs
                 )
             )
         except OSError:
             hlog(f"Local files do not exist for HuggingFace tokenizer: {pretrained_model_name_or_path}. Downloading...")
             return WrappedPreTrainedTokenizer(
                 AutoTokenizer.from_pretrained(
-                    pretrained_model_name_or_path, local_files_only=False,
+                    pretrained_model_name_or_path, local_files_only=False, **from_pretrained_kwargs
                 )
             )
 
@@ -84,21 +97,26 @@ class HuggingFaceTokenizer(CachingTokenizer):
             )
         return HuggingFaceTokenizer._tokenizers[helm_tokenizer_name]
 
-    def 
-        """
+    def get_wrapped_tokenizer(self) -> WrappedPreTrainedTokenizer:
+        """Get the underlying Hugging Face WrappedPreTrainedTokenizer."""
         pretrained_model_name_or_path = (
-            self._pretrained_model_name_or_path if self._pretrained_model_name_or_path else 
+            self._pretrained_model_name_or_path if self._pretrained_model_name_or_path else self._helm_tokenizer_name
         )
         return HuggingFaceTokenizer.get_tokenizer(
-            helm_tokenizer_name=
+            helm_tokenizer_name=self._helm_tokenizer_name,
             pretrained_model_name_or_path=pretrained_model_name_or_path,
             **self._kwargs,
         )
 
     def _tokenize_do_it(self, request: Dict[str, Any]) -> Dict[str, Any]:
+        if request["tokenizer"] != self._helm_tokenizer_name:
+            raise ValueError(
+                f"This HuggingFaceTokenizer expects tokenizer to be {self._helm_tokenizer_name} "
+                "but instead the request has tokenizer {request['tokenizer']}"
+            )
         if request["encode"]:
             if request["truncation"]:
-                with self.
+                with self.get_wrapped_tokenizer() as tokenizer:
                     tokens = tokenizer.encode(
                         request["text"],
                         truncation=request["truncation"],
@@ -106,7 +124,7 @@ class HuggingFaceTokenizer(CachingTokenizer):
                         add_special_tokens=False,
                     )
             else:
-                with self.
+                with self.get_wrapped_tokenizer() as tokenizer:
                     tokens = tokenizer.encode(request["text"], add_special_tokens=False)
         else:
             if "gpt" in request["tokenizer"] or request["tokenizer"] in [
@@ -118,7 +136,7 @@ class HuggingFaceTokenizer(CachingTokenizer):
                 # convert_tokens_to_string method. We prefer to use this method instead
                 # of the hacky cleanup_tokens method below as it might handle cases
                 # we haven't thought of in cleanup_tokens.
-                with self.
+                with self.get_wrapped_tokenizer() as tokenizer:
                     tokens = [
                         tokenizer.convert_tokens_to_string([token]) for token in tokenizer.tokenize(request["text"])
                     ]
@@ -131,7 +149,7 @@ class HuggingFaceTokenizer(CachingTokenizer):
                 # But this replaces all the "▁" characters by "", which is not what we want.
                 # This would be problematic as tokenize(" Hello", encode=False) would return ["Hello"]
                 # Just like tokenize("Hello", encode=False) would return ["Hello"].
-                with self.
+                with self.get_wrapped_tokenizer() as tokenizer:
                     tokens = tokenizer.tokenize(request["text"])
                 # Some tokenizers (e.g. Qwen/Qwen-7B) return the tokens as bytes, so we have to decode them to strings.
                 if tokens and type(tokens[0]) == bytes:
@@ -140,7 +158,12 @@ class HuggingFaceTokenizer(CachingTokenizer):
         return {"tokens": tokens}
 
     def _decode_do_it(self, request: Dict[str, Any]) -> Dict[str, Any]:
-        with self.
+        if request["tokenizer"] != self._helm_tokenizer_name:
+            raise ValueError(
+                f"This HuggingFaceTokenizer expects tokenizer to be {self._helm_tokenizer_name} "
+                "but instead the request has tokenizer {request['tokenizer']}"
+            )
+        with self.get_wrapped_tokenizer() as tokenizer:
             text = tokenizer.decode(
                 request["tokens"], clean_up_tokenization_spaces=request["clean_up_tokenization_spaces"]
             )
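Note: with the constructor change above, HuggingFaceTokenizer now requires the HELM tokenizer name. A minimal construction sketch, patterned on the updated test_huggingface_tokenizer.py below (the BlackHoleCacheConfig stand-in is this sketch's assumption; the test itself uses SqliteCacheConfig):

    from helm.common.cache import BlackHoleCacheConfig
    from helm.tokenizers.huggingface_tokenizer import HuggingFaceTokenizer

    tokenizer = HuggingFaceTokenizer(
        BlackHoleCacheConfig(),
        tokenizer_name="huggingface/gpt2",  # HELM tokenizer name, not a Hugging Face Hub ID
        pretrained_model_name_or_path="openai-community/gpt2",  # Hub ID; defaults to tokenizer_name if omitted
    )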
helm/tokenizers/test_cohere_tokenizer.py (new file)

@@ -0,0 +1,39 @@
+import pytest
+
+from helm.common.cache import BlackHoleCacheConfig
+from helm.common.tokenization_request import (
+    DecodeRequest,
+    TokenizationRequest,
+    TokenizationToken,
+)
+from helm.tokenizers.cohere_tokenizer import CohereLocalTokenizer
+
+
+@pytest.mark.models
+def test_tokenize():
+    tokenizer = CohereLocalTokenizer(api_key=None, cache_config=BlackHoleCacheConfig())
+    request = TokenizationRequest(tokenizer="cohere/command", text="otter 🦦")
+    result = tokenizer.tokenize(request)
+    assert result.success
+    assert not result.cached
+    assert result.tokens == [TokenizationToken(token) for token in ["ot", "ter", " �", "�", "�"]]
+
+
+@pytest.mark.models
+def test_encode():
+    tokenizer = CohereLocalTokenizer(api_key=None, cache_config=BlackHoleCacheConfig())
+    request = TokenizationRequest(tokenizer="cohere/command", text="otter 🦦", encode=True)
+    result = tokenizer.tokenize(request)
+    assert result.success
+    assert not result.cached
+    assert result.tokens == [TokenizationToken(token) for token in [1741, 1779, 7728, 107, 107]]
+
+
+@pytest.mark.models
+def test_decode():
+    tokenizer = CohereLocalTokenizer(api_key=None, cache_config=BlackHoleCacheConfig())
+    request = DecodeRequest(tokenizer="cohere/command", tokens=[1741, 1779, 7728, 107, 107])
+    result = tokenizer.decode(request)
+    assert result.success
+    assert not result.cached
+    assert result.text == "otter 🦦"
helm/tokenizers/test_huggingface_tokenizer.py

@@ -17,7 +17,11 @@ class TestHuggingFaceGPT2Tokenizer:
     def setup_method(self, method):
         cache_file = tempfile.NamedTemporaryFile(delete=False)
         self.cache_path: str = cache_file.name
-        self.tokenizer = HuggingFaceTokenizer(
+        self.tokenizer = HuggingFaceTokenizer(
+            SqliteCacheConfig(self.cache_path),
+            tokenizer_name="huggingface/gpt2",
+            pretrained_model_name_or_path="openai-community/gpt2",
+        )
 
     def teardown_method(self, method):
         os.remove(self.cache_path)