crfm-helm 0.5.2__py3-none-any.whl → 0.5.4__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as they appear in their respective public registries. It is provided for informational purposes only.
Potentially problematic release: this version of crfm-helm has been flagged as potentially problematic.
- {crfm_helm-0.5.2.dist-info → crfm_helm-0.5.4.dist-info}/METADATA +81 -112
- {crfm_helm-0.5.2.dist-info → crfm_helm-0.5.4.dist-info}/RECORD +165 -155
- {crfm_helm-0.5.2.dist-info → crfm_helm-0.5.4.dist-info}/WHEEL +1 -1
- helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +12 -5
- helm/benchmark/adaptation/adapters/test_generation_adapter.py +12 -12
- helm/benchmark/adaptation/adapters/test_language_modeling_adapter.py +8 -8
- helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py +77 -9
- helm/benchmark/adaptation/common_adapter_specs.py +2 -0
- helm/benchmark/annotation/anthropic_red_team_annotator.py +57 -0
- helm/benchmark/annotation/call_center_annotator.py +258 -0
- helm/benchmark/annotation/financebench_annotator.py +79 -0
- helm/benchmark/annotation/harm_bench_annotator.py +55 -0
- helm/benchmark/annotation/{image2structure → image2struct}/latex_compiler_annotator.py +2 -2
- helm/benchmark/annotation/{image2structure → image2struct}/lilypond_compiler_annotator.py +5 -3
- helm/benchmark/annotation/{image2structure → image2struct}/webpage_compiler_annotator.py +5 -5
- helm/benchmark/annotation/live_qa_annotator.py +37 -45
- helm/benchmark/annotation/medication_qa_annotator.py +36 -44
- helm/benchmark/annotation/model_as_judge.py +96 -0
- helm/benchmark/annotation/simple_safety_tests_annotator.py +50 -0
- helm/benchmark/annotation/xstest_annotator.py +100 -0
- helm/benchmark/metrics/annotation_metrics.py +108 -0
- helm/benchmark/metrics/bhasa_metrics.py +188 -0
- helm/benchmark/metrics/bhasa_metrics_specs.py +10 -0
- helm/benchmark/metrics/code_metrics_helper.py +11 -1
- helm/benchmark/metrics/safety_metrics.py +79 -0
- helm/benchmark/metrics/summac/model_summac.py +3 -3
- helm/benchmark/metrics/tokens/test_ai21_token_cost_estimator.py +2 -2
- helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py +4 -4
- helm/benchmark/metrics/unitxt_metrics.py +17 -3
- helm/benchmark/metrics/vision_language/image_metrics.py +7 -3
- helm/benchmark/metrics/vision_language/image_utils.py +1 -1
- helm/benchmark/model_metadata_registry.py +3 -3
- helm/benchmark/presentation/create_plots.py +1 -1
- helm/benchmark/presentation/schema.py +3 -0
- helm/benchmark/presentation/summarize.py +106 -256
- helm/benchmark/presentation/test_run_entry.py +1 -0
- helm/benchmark/presentation/test_summarize.py +145 -3
- helm/benchmark/run.py +15 -0
- helm/benchmark/run_expander.py +83 -30
- helm/benchmark/run_specs/bhasa_run_specs.py +652 -0
- helm/benchmark/run_specs/call_center_run_specs.py +152 -0
- helm/benchmark/run_specs/decodingtrust_run_specs.py +8 -8
- helm/benchmark/run_specs/experimental_run_specs.py +52 -0
- helm/benchmark/run_specs/finance_run_specs.py +82 -1
- helm/benchmark/run_specs/safety_run_specs.py +154 -0
- helm/benchmark/run_specs/vlm_run_specs.py +100 -24
- helm/benchmark/scenarios/anthropic_red_team_scenario.py +71 -0
- helm/benchmark/scenarios/banking77_scenario.py +51 -0
- helm/benchmark/scenarios/bhasa_scenario.py +1942 -0
- helm/benchmark/scenarios/call_center_scenario.py +84 -0
- helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +2 -1
- helm/benchmark/scenarios/ewok_scenario.py +116 -0
- helm/benchmark/scenarios/fin_qa_scenario.py +2 -0
- helm/benchmark/scenarios/financebench_scenario.py +53 -0
- helm/benchmark/scenarios/harm_bench_scenario.py +59 -0
- helm/benchmark/scenarios/raft_scenario.py +1 -1
- helm/benchmark/scenarios/scenario.py +1 -1
- helm/benchmark/scenarios/simple_safety_tests_scenario.py +33 -0
- helm/benchmark/scenarios/test_commonsense_scenario.py +21 -0
- helm/benchmark/scenarios/test_ewok_scenario.py +25 -0
- helm/benchmark/scenarios/test_financebench_scenario.py +26 -0
- helm/benchmark/scenarios/test_gsm_scenario.py +31 -0
- helm/benchmark/scenarios/test_legalbench_scenario.py +30 -0
- helm/benchmark/scenarios/test_math_scenario.py +2 -8
- helm/benchmark/scenarios/test_med_qa_scenario.py +30 -0
- helm/benchmark/scenarios/test_mmlu_scenario.py +33 -0
- helm/benchmark/scenarios/test_narrativeqa_scenario.py +73 -0
- helm/benchmark/scenarios/thai_exam_scenario.py +4 -4
- helm/benchmark/scenarios/vision_language/a_okvqa_scenario.py +1 -1
- helm/benchmark/scenarios/vision_language/bingo_scenario.py +2 -2
- helm/benchmark/scenarios/vision_language/crossmodal_3600_scenario.py +2 -1
- helm/benchmark/scenarios/vision_language/exams_v_scenario.py +104 -0
- helm/benchmark/scenarios/vision_language/fair_face_scenario.py +136 -0
- helm/benchmark/scenarios/vision_language/flickr30k_scenario.py +1 -1
- helm/benchmark/scenarios/vision_language/gqa_scenario.py +2 -2
- helm/benchmark/scenarios/vision_language/hateful_memes_scenario.py +1 -1
- helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/chart2csv_scenario.py +1 -1
- helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/latex_scenario.py +3 -3
- helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/musicsheet_scenario.py +1 -1
- helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/utils_latex.py +31 -39
- helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/webpage/driver.py +1 -1
- helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/webpage/utils.py +1 -1
- helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/webpage_scenario.py +41 -12
- helm/benchmark/scenarios/vision_language/math_vista_scenario.py +1 -1
- helm/benchmark/scenarios/vision_language/mementos_scenario.py +3 -3
- helm/benchmark/scenarios/vision_language/mm_safety_bench_scenario.py +2 -2
- helm/benchmark/scenarios/vision_language/mme_scenario.py +21 -18
- helm/benchmark/scenarios/vision_language/mmmu_scenario.py +1 -1
- helm/benchmark/scenarios/vision_language/pairs_scenario.py +1 -1
- helm/benchmark/scenarios/vision_language/pope_scenario.py +2 -1
- helm/benchmark/scenarios/vision_language/real_world_qa_scenario.py +57 -0
- helm/benchmark/scenarios/vision_language/seed_bench_scenario.py +7 -5
- helm/benchmark/scenarios/vision_language/unicorn_scenario.py +2 -2
- helm/benchmark/scenarios/vision_language/vibe_eval_scenario.py +6 -3
- helm/benchmark/scenarios/vision_language/viz_wiz_scenario.py +1 -1
- helm/benchmark/scenarios/vision_language/vqa_scenario.py +3 -1
- helm/benchmark/scenarios/xstest_scenario.py +35 -0
- helm/benchmark/server.py +1 -6
- helm/benchmark/static/schema_air_bench.yaml +750 -750
- helm/benchmark/static/schema_bhasa.yaml +709 -0
- helm/benchmark/static/schema_call_center.yaml +232 -0
- helm/benchmark/static/schema_cleva.yaml +768 -0
- helm/benchmark/static/schema_decodingtrust.yaml +444 -0
- helm/benchmark/static/schema_ewok.yaml +367 -0
- helm/benchmark/static/schema_finance.yaml +55 -9
- helm/benchmark/static/{schema_image2structure.yaml → schema_image2struct.yaml} +231 -90
- helm/benchmark/static/schema_legal.yaml +566 -0
- helm/benchmark/static/schema_safety.yaml +266 -0
- helm/benchmark/static/schema_tables.yaml +149 -8
- helm/benchmark/static/schema_thai.yaml +21 -0
- helm/benchmark/static/schema_vhelm.yaml +137 -101
- helm/benchmark/static_build/assets/accenture-6f97eeda.png +0 -0
- helm/benchmark/static_build/assets/aisingapore-6dfc9acf.png +0 -0
- helm/benchmark/static_build/assets/cresta-9e22b983.png +0 -0
- helm/benchmark/static_build/assets/cuhk-8c5631e9.png +0 -0
- helm/benchmark/static_build/assets/index-05c76bb1.css +1 -0
- helm/benchmark/static_build/assets/index-3ee38b3d.js +10 -0
- helm/benchmark/static_build/assets/scb10x-204bd786.png +0 -0
- helm/benchmark/static_build/assets/vhelm-aspects-1437d673.png +0 -0
- helm/benchmark/static_build/assets/vhelm-framework-a1ca3f3f.png +0 -0
- helm/benchmark/static_build/assets/vhelm-model-8afb7616.png +0 -0
- helm/benchmark/static_build/assets/wellsfargo-a86a6c4a.png +0 -0
- helm/benchmark/static_build/index.html +2 -2
- helm/benchmark/window_services/test_openai_window_service.py +8 -8
- helm/benchmark/window_services/tokenizer_service.py +0 -5
- helm/clients/ai21_client.py +71 -1
- helm/clients/anthropic_client.py +7 -19
- helm/clients/huggingface_client.py +38 -37
- helm/clients/nvidia_nim_client.py +35 -0
- helm/clients/openai_client.py +18 -4
- helm/clients/palmyra_client.py +24 -0
- helm/clients/perspective_api_client.py +11 -6
- helm/clients/test_client.py +4 -6
- helm/clients/together_client.py +22 -0
- helm/clients/vision_language/open_flamingo_client.py +1 -2
- helm/clients/vision_language/palmyra_vision_client.py +28 -13
- helm/common/cache.py +8 -30
- helm/common/images_utils.py +6 -0
- helm/common/key_value_store.py +9 -9
- helm/common/mongo_key_value_store.py +5 -4
- helm/common/request.py +16 -0
- helm/common/test_cache.py +1 -48
- helm/common/tokenization_request.py +0 -9
- helm/config/model_deployments.yaml +444 -329
- helm/config/model_metadata.yaml +513 -111
- helm/config/tokenizer_configs.yaml +140 -11
- helm/proxy/example_queries.py +14 -21
- helm/proxy/server.py +0 -9
- helm/proxy/services/remote_service.py +0 -6
- helm/proxy/services/server_service.py +6 -20
- helm/proxy/services/service.py +0 -6
- helm/proxy/token_counters/test_auto_token_counter.py +2 -2
- helm/tokenizers/ai21_tokenizer.py +51 -59
- helm/tokenizers/cohere_tokenizer.py +0 -75
- helm/tokenizers/huggingface_tokenizer.py +0 -1
- helm/tokenizers/test_ai21_tokenizer.py +48 -0
- helm/benchmark/data_overlap/data_overlap_spec.py +0 -86
- helm/benchmark/data_overlap/export_scenario_text.py +0 -119
- helm/benchmark/data_overlap/light_scenario.py +0 -60
- helm/benchmark/scenarios/vision_language/image2structure/webpage/__init__.py +0 -0
- helm/benchmark/static/benchmarking.css +0 -156
- helm/benchmark/static/benchmarking.js +0 -1705
- helm/benchmark/static/config.js +0 -3
- helm/benchmark/static/general.js +0 -122
- helm/benchmark/static/images/crfm-logo.png +0 -0
- helm/benchmark/static/images/helm-logo-simple.png +0 -0
- helm/benchmark/static/images/helm-logo.png +0 -0
- helm/benchmark/static/images/language-model-helm.png +0 -0
- helm/benchmark/static/images/organizations/ai21.png +0 -0
- helm/benchmark/static/images/organizations/anthropic.png +0 -0
- helm/benchmark/static/images/organizations/bigscience.png +0 -0
- helm/benchmark/static/images/organizations/cohere.png +0 -0
- helm/benchmark/static/images/organizations/eleutherai.png +0 -0
- helm/benchmark/static/images/organizations/google.png +0 -0
- helm/benchmark/static/images/organizations/meta.png +0 -0
- helm/benchmark/static/images/organizations/microsoft.png +0 -0
- helm/benchmark/static/images/organizations/nvidia.png +0 -0
- helm/benchmark/static/images/organizations/openai.png +0 -0
- helm/benchmark/static/images/organizations/together.png +0 -0
- helm/benchmark/static/images/organizations/tsinghua-keg.png +0 -0
- helm/benchmark/static/images/organizations/yandex.png +0 -0
- helm/benchmark/static/images/scenarios-by-metrics.png +0 -0
- helm/benchmark/static/images/taxonomy-scenarios.png +0 -0
- helm/benchmark/static/index.html +0 -68
- helm/benchmark/static/info-icon.png +0 -0
- helm/benchmark/static/json-urls.js +0 -69
- helm/benchmark/static/plot-captions.js +0 -27
- helm/benchmark/static/utils.js +0 -285
- helm/benchmark/static_build/assets/index-30dbceba.js +0 -10
- helm/benchmark/static_build/assets/index-66b02d40.css +0 -1
- helm/benchmark/static_build/assets/vhelm-framework-cde7618a.png +0 -0
- helm/benchmark/static_build/assets/vhelm-model-6d812526.png +0 -0
- helm/benchmark/window_services/ai21_window_service.py +0 -247
- helm/benchmark/window_services/cohere_window_service.py +0 -101
- helm/benchmark/window_services/test_ai21_window_service.py +0 -163
- helm/benchmark/window_services/test_cohere_window_service.py +0 -75
- helm/benchmark/window_services/test_cohere_window_service_utils.py +0 -8328
- helm/benchmark/window_services/test_ice_window_service.py +0 -327
- helm/tokenizers/ice_tokenizer.py +0 -30
- helm/tokenizers/test_ice_tokenizer.py +0 -57
- {crfm_helm-0.5.2.dist-info → crfm_helm-0.5.4.dist-info}/LICENSE +0 -0
- {crfm_helm-0.5.2.dist-info → crfm_helm-0.5.4.dist-info}/entry_points.txt +0 -0
- {crfm_helm-0.5.2.dist-info → crfm_helm-0.5.4.dist-info}/top_level.txt +0 -0
- /helm/benchmark/annotation/{image2structure → image2struct}/__init__.py +0 -0
- /helm/benchmark/annotation/{image2structure → image2struct}/image_compiler_annotator.py +0 -0
- /helm/benchmark/{data_overlap → scenarios/vision_language/image2struct}/__init__.py +0 -0
- /helm/benchmark/scenarios/vision_language/{image2structure/image2structure_scenario.py → image2struct/image2struct_scenario.py} +0 -0
- /helm/benchmark/scenarios/vision_language/{image2structure → image2struct/webpage}/__init__.py +0 -0
- /helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/webpage/jekyll_server.py +0 -0
helm/config/tokenizer_configs.yaml
CHANGED

@@ -17,11 +17,31 @@ tokenizer_configs:
     prefix_token: "<s>"
 
   # AI21
-  - name: ai21/
+  - name: ai21/j2-tokenizer
     tokenizer_spec:
-      class_name: "helm.tokenizers.ai21_tokenizer.
-    end_of_text_token: "
-    prefix_token: ""
+      class_name: "helm.tokenizers.ai21_tokenizer.AI21LocalTokenizer"
+    end_of_text_token: "<|endoftext|>"
+    prefix_token: "<|startoftext|>"
+  - name: ai21/jamba-tokenizer
+    tokenizer_spec:
+      class_name: "helm.tokenizers.ai21_tokenizer.AI21LocalTokenizer"
+    end_of_text_token: "<|endoftext|>"
+    prefix_token: "<|startoftext|>"
+  - name: ai21/jamba-instruct-tokenizer
+    tokenizer_spec:
+      class_name: "helm.tokenizers.ai21_tokenizer.AI21LocalTokenizer"
+    end_of_text_token: "<|endoftext|>"
+    prefix_token: "<|startoftext|>"
+  - name: ai21/jamba-1.5-mini-tokenizer
+    tokenizer_spec:
+      class_name: "helm.tokenizers.ai21_tokenizer.AI21LocalTokenizer"
+    end_of_text_token: "<|endoftext|>"
+    prefix_token: "<|startoftext|>"
+  - name: ai21/jamba-1.5-large-tokenizer
+    tokenizer_spec:
+      class_name: "helm.tokenizers.ai21_tokenizer.AI21LocalTokenizer"
+    end_of_text_token: "<|endoftext|>"
+    prefix_token: "<|startoftext|>"
 
   # AlephAlpha
   - name: AlephAlpha/luminous-base

@@ -45,6 +65,24 @@ tokenizer_configs:
     end_of_text_token: ""
     prefix_token: ""
 
+  # Alibaba DAMO Academy
+
+  - name: damo/seallm-7b-v2
+    tokenizer_spec:
+      class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
+      args:
+        pretrained_model_name_or_path: SeaLLMs/SeaLLM-7B-v2
+    end_of_text_token: "</s>"
+    prefix_token: "<s>"
+
+  - name: damo/seallm-7b-v2.5
+    tokenizer_spec:
+      class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
+      args:
+        pretrained_model_name_or_path: SeaLLMs/SeaLLM-7B-v2.5
+    end_of_text_token: "<eos>"
+    prefix_token: "<bos>"
+
   # Anthropic
   - name: anthropic/claude
     tokenizer_spec:

@@ -77,12 +115,6 @@ tokenizer_configs:
     prefix_token: ""
 
   # Cohere
-  - name: cohere/cohere
-    tokenizer_spec:
-      class_name: "helm.tokenizers.cohere_tokenizer.CohereTokenizer"
-    end_of_text_token: ""
-    prefix_token: ":"
-
   - name: cohere/command
     tokenizer_spec:
       class_name: "helm.tokenizers.cohere_tokenizer.CohereLocalTokenizer"

@@ -199,6 +231,11 @@ tokenizer_configs:
       class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
     end_of_text_token: "<eos>"
     prefix_token: "<bos>"
+  - name: google/gemma-2-9b
+    tokenizer_spec:
+      class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
+    end_of_text_token: "<eos>"
+    prefix_token: "<bos>"
 
   # Hf-internal-testing
 

@@ -280,6 +317,38 @@ tokenizer_configs:
     prefix_token: "<|begin_of_text|>"
     end_of_text_token: "<|end_of_text|>"
 
+  - name: meta/llama-3-8b-instruct
+    tokenizer_spec:
+      class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
+      args:
+        pretrained_model_name_or_path: meta-llama/Meta-Llama-3.1-8B-Instruct
+    prefix_token: "<|begin_of_text|>"
+    end_of_text_token: "<|eot_id|>"
+
+  - name: meta/llama-3.1-8b
+    tokenizer_spec:
+      class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
+      args:
+        pretrained_model_name_or_path: meta-llama/Meta-Llama-3.1-8B-Instruct
+    prefix_token: "<|begin_of_text|>"
+    end_of_text_token: "<|end_of_text|>"
+
+  - name: meta/llama-3.2-3b-instruct
+    tokenizer_spec:
+      class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
+      args:
+        pretrained_model_name_or_path: meta-llama/Llama-3.2-3B-Instruct
+    prefix_token: "<|begin_of_text|>"
+    end_of_text_token: "<|eot_id|>"
+
+  - name: meta/llama-3.2-11b-vision-instruct
+    tokenizer_spec:
+      class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
+      args:
+        pretrained_model_name_or_path: meta-llama/Llama-3.2-11B-Vision-Instruct
+    prefix_token: "<|begin_of_text|>"
+    end_of_text_token: "<|eot_id|>"
+
   # 01-ai
   - name: 01-ai/Yi-6B
     tokenizer_spec:

@@ -324,6 +393,20 @@ tokenizer_configs:
     end_of_text_token: "<|endoftext|>"
     prefix_token: "<|endoftext|>"
 
+  - name: microsoft/phi-3-small-8k-instruct
+    tokenizer_spec:
+      class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
+      args:
+        trust_remote_code: true
+    end_of_text_token: "<|endoftext|>"
+    prefix_token: "<|endoftext|>"
+
+  - name: microsoft/phi-3-medium-4k-instruct
+    tokenizer_spec:
+      class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
+    end_of_text_token: "<|endoftext|>"
+    prefix_token: "<s>"
+
   # Mistralai
   - name: mistralai/Mistral-7B-v0.1
     tokenizer_spec:

@@ -349,6 +432,18 @@ tokenizer_configs:
     end_of_text_token: "</s>"
     prefix_token: "<s>"
 
+  - name: mistralai/Mistral-Nemo-Base-2407
+    tokenizer_spec:
+      class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
+    end_of_text_token: "</s>"
+    prefix_token: "<s>"
+
+  - name: mistralai/Mistral-Large-Instruct-2407
+    tokenizer_spec:
+      class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
+    end_of_text_token: "</s>"
+    prefix_token: "<s>"
+
   # Neurips
   - name: neurips/local
     tokenizer_spec:

@@ -356,7 +451,17 @@ tokenizer_configs:
     end_of_text_token: "<|endoftext|>"
     prefix_token: "<|endoftext|>"
 
-  #
+  # NVIDIA
+  - name: nvidia/nemotron-4-340b-instruct
+    tokenizer_spec:
+      class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
+      args:
+        pretrained_model_name_or_path: Xenova/Nemotron-4-340B-Instruct-Tokenizer
+        revision: b7aa0de92cda9f9e722d58d6ca90f46ae17d4701
+    end_of_text_token: "<|endoftext|>"
+    prefix_token: "<|endoftext|>"
+
+  # OpenAI
   - name: openai/cl100k_base
     tokenizer_spec:
       class_name: "helm.tokenizers.tiktoken_tokenizer.TiktokenTokenizer"

@@ -375,6 +480,14 @@ tokenizer_configs:
     end_of_text_token: ""
     prefix_token: ""
 
+  # OpenThaiGPT
+  - name: openthaigpt/openthaigpt-1.0.0-7b-chat
+    tokenizer_spec:
+      class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
+    end_of_text_token: "</s>"
+    prefix_token: "<s>"
+
+  # Qwen
   - name: qwen/qwen-7b
     tokenizer_spec:
       class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"

@@ -420,6 +533,15 @@ tokenizer_configs:
     end_of_text_token: "<|endoftext|>"
     prefix_token: ""
 
+  # SambaLingo
+  - name: sambanova/sambalingo-thai-base
+    tokenizer_spec:
+      class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
+      args:
+        pretrained_model_name_or_path: sambanovasystems/SambaLingo-Thai-Base
+    end_of_text_token: "</s>"
+    prefix_token: "<s>"
+
   # Snowflake
   - name: snowflake/snowflake-arctic-instruct
     tokenizer_spec:

@@ -444,6 +566,13 @@ tokenizer_configs:
     end_of_text_token: "</s>"
     prefix_token: ""
 
+  # Typhoon
+  - name: scb10x/typhoon-7b
+    tokenizer_spec:
+      class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
+    end_of_text_token: "</s>"
+    prefix_token: "<s>"
+
   # Writer
   - name: writer/gpt2
     tokenizer_spec:
helm/proxy/example_queries.py
CHANGED
@@ -22,7 +22,6 @@ example_queries = [
             temperature: 0.5 # Medium amount of randomness
             stop_sequences: [.] # Stop when you hit a period
             model: openai/gpt-3.5-turbo-0613
-            model_deployment: openai/gpt-3.5-turbo-0613
             """
         ),
         environments="",
@@ -35,24 +34,24 @@ example_queries = [
             stop_sequences: [\\n] # Stop when you hit a newline
             num_completions: 5 # Generate many samples
             model: openai/gpt-3.5-turbo-0613
-            model_deployment: openai/gpt-3.5-turbo-0613
             """
         ),
         environments="",
     ),
-    Query(
-        prompt="The quick brown fox jumps over the lazy dog.",
-        settings=dedent(
-            """
-            echo_prompt: true # Analyze the prompt
-            max_tokens: 0 # Don't generate any more
-            top_k_per_token: 5 # Show alternatives for each position
-            model: openai/text-davinci-002
-            model_deployment: openai/text-davinci-002
-            """
-        ),
-        environments=dedent(""),
-    ),
+    # Disabled because `max_tokens: 0` no longer works on the OpenAI API
+    # Query(
+    #     prompt="The quick brown fox jumps over the lazy dog.",
+    #     settings=dedent(
+    #         """
+    #         echo_prompt: true # Analyze the prompt
+    #         max_tokens: 0 # Don't generate any more
+    #         top_k_per_token: 5 # Show alternatives for each position
+    #         model: openai/text-davinci-002
+    #         model_deployment: openai/text-davinci-002
+    #         """
+    #     ),
+    #     environments=dedent(""),
+    # ),
     Query(
         prompt="Odd numbers: 1 -> 3 -> 5",
         settings=dedent(
@@ -60,7 +59,6 @@ example_queries = [
             temperature: 0 # Deterministic
             max_tokens: 50
             model: openai/gpt-3.5-turbo-0613
-            model_deployment: openai/gpt-3.5-turbo-0613
             """
         ),
         environments="",
@@ -73,7 +71,6 @@ example_queries = [
             stop_sequences: [.]
             # Try out multiple models
             model: ${model}
-            model_deployment: ${model}
             """
         ),
         environments=dedent(
@@ -100,7 +97,6 @@ example_queries = [
             num_completions: 5
             # Try out multiple models
             model: ${model}
-            model_deployment: ${model}
             """
         ),
         environments=dedent(
@@ -136,7 +132,6 @@ example_queries = [
             top_k_per_token: 4
             # Try out multiple models
             model: ${model}
-            model_deployment: ${model}
             """
         ),
         environments=dedent(
@@ -150,7 +145,6 @@ example_queries = [
         settings=dedent(
             """
             model: openai/gpt-3.5-turbo-0613
-            model_deployment: openai/gpt-3.5-turbo-0613
             """
         ),
         environments="",
@@ -163,7 +157,6 @@ example_queries = [
             stop_sequences: [\\n]
             # Try out multiple models
             model: ${model}
-            model_deployment: ${model}
             """
         ),
         environments=dedent(
helm/proxy/server.py
CHANGED
@@ -106,15 +106,6 @@ def handle_get_general_info():
     return safe_call(perform)
 
 
-@app.get("/api/window_service_info")
-def handle_get_window_service_info():
-    def perform(args):
-        global service
-        return dataclasses.asdict(service.get_window_service_info(args["model_name"]))
-
-    return safe_call(perform)
-
-
 @app.post("/api/account")
 def handle_create_account():
     def perform(args):

helm/proxy/services/remote_service.py
CHANGED

@@ -15,7 +15,6 @@ from helm.common.file_upload_request import FileUploadRequest, FileUploadResult
 from helm.common.perspective_api_request import PerspectiveAPIRequest, PerspectiveAPIRequestResult
 from helm.common.clip_score_request import CLIPScoreRequest, CLIPScoreResult
 from helm.common.tokenization_request import (
-    WindowServiceInfo,
     TokenizationRequest,
     TokenizationRequestResult,
     DecodeRequestResult,
@@ -51,11 +50,6 @@ class RemoteService(Service):
         response = requests.get(f"{self.base_url}/api/general_info").json()
         return from_dict(GeneralInfo, response)
 
-    def get_window_service_info(self, model_name) -> WindowServiceInfo:
-        params = {"model_name": model_name}
-        response = requests.get(f"{self.base_url}/api/window_service_info?{urllib.parse.urlencode(params)}").json()
-        return from_dict(WindowServiceInfo, response)
-
     def expand_query(self, query: Query) -> QueryResult:
         params = asdict(query)
         response = requests.get(f"{self.base_url}/api/query?{urllib.parse.urlencode(params)}").json()

helm/proxy/services/server_service.py
CHANGED

@@ -14,7 +14,6 @@ from helm.common.file_upload_request import FileUploadRequest, FileUploadResult
 from helm.common.general import ensure_directory_exists, parse_hocon, get_credentials
 from helm.common.perspective_api_request import PerspectiveAPIRequest, PerspectiveAPIRequestResult
 from helm.common.tokenization_request import (
-    WindowServiceInfo,
     TokenizationRequest,
     TokenizationRequestResult,
     DecodeRequest,
@@ -25,7 +24,6 @@ from helm.common.hierarchical_logger import hlog
 from helm.proxy.accounts import Accounts, Account
 from helm.clients.auto_client import AutoClient
 from helm.clients.moderation_api_client import ModerationAPIClient
-from helm.clients.perspective_api_client import PerspectiveAPIClient
 from helm.clients.image_generation.nudity_check_client import NudityCheckClient
 from helm.clients.gcs_client import GCSClient
 from helm.clients.clip_score_client import CLIPScoreClient
@@ -75,7 +73,7 @@ class ServerService(Service):
         # Lazily instantiate the following clients
         self.moderation_api_client: Optional[ModerationAPIClient] = None
         self.toxicity_classifier_client: Optional[ToxicityClassifierClient] = None
-        self.perspective_api_client: Optional[
+        self.perspective_api_client: Optional[ToxicityClassifierClient] = None
         self.nudity_check_client: Optional[NudityCheckClient] = None
         self.clip_score_client: Optional[CLIPScoreClient] = None
         self.gcs_client: Optional[GCSClient] = None
@@ -86,22 +84,6 @@ class ServerService(Service):
         all_models = [dataclasses.replace(model_metadata, release_date=None) for model_metadata in ALL_MODELS_METADATA]
         return GeneralInfo(version=VERSION, example_queries=example_queries, all_models=all_models)
 
-    def get_window_service_info(self, model_name) -> WindowServiceInfo:
-        # The import statement is placed here to avoid two problems, please refer to the link for details
-        # https://github.com/stanford-crfm/helm/pull/1430#discussion_r1156686624
-        from helm.benchmark.window_services.tokenizer_service import TokenizerService
-        from helm.benchmark.window_services.window_service_factory import WindowServiceFactory
-
-        token_service = TokenizerService(self, Authentication(""))
-        window_service = WindowServiceFactory.get_window_service(model_name, token_service)
-        return WindowServiceInfo(
-            tokenizer_name=window_service.tokenizer_name,
-            max_sequence_length=window_service.max_sequence_length,
-            max_request_length=window_service.max_request_length,
-            end_of_text_token=window_service.end_of_text_token,
-            prefix_token=window_service.prefix_token,
-        )
-
     def expand_query(self, query: Query) -> QueryResult:
         """Turn the `query` into requests."""
         prompt = query.prompt
@@ -121,8 +103,12 @@ class ServerService(Service):
             return "dall_e"
         elif model_deployment.startswith("openai/gpt-4"):
             return "gpt4"
-
+        elif model_deployment.startswith("openai/gpt-3"):
             return "gpt3"
+        elif model_deployment.startswith("openai/o1"):
+            return "o1"
+        else:
+            return "openai"
         elif model_deployment.startswith("ai21/"):
             return "jurassic"
         else:
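With /api/window_service_info removed, /api/general_info remains the metadata entry point, called exactly as the RemoteService hunk above shows. A self-contained sketch of the same call (the base URL is a placeholder for wherever your proxy server is running):

import requests

base_url = "http://localhost:1959"  # placeholder; point this at a running proxy server
general_info = requests.get(f"{base_url}/api/general_info").json()
print(general_info["version"])  # GeneralInfo carries version, example_queries, all_models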
helm/proxy/services/service.py
CHANGED
@@ -11,7 +11,6 @@ from helm.common.nudity_check_request import NudityCheckRequest, NudityCheckResult
 from helm.common.perspective_api_request import PerspectiveAPIRequestResult, PerspectiveAPIRequest
 from helm.common.moderations_api_request import ModerationAPIRequest, ModerationAPIRequestResult
 from helm.common.tokenization_request import (
-    WindowServiceInfo,
     TokenizationRequest,
     TokenizationRequestResult,
     DecodeRequest,
@@ -85,11 +84,6 @@ class Service(ABC):
         """Get general info."""
         pass
 
-    @abstractmethod
-    def get_window_service_info(self, model_name: str) -> WindowServiceInfo:
-        """Get window service info."""
-        pass
-
     @abstractmethod
     def expand_query(self, query: Query) -> QueryResult:
         """Turn the `query` into requests."""
helm/proxy/token_counters/test_auto_token_counter.py
CHANGED

@@ -13,8 +13,8 @@ class TestAutoTokenCounter:
         )
         # The following prompt has 51 tokens according to the GPT-2 tokenizer
         request = Request(
-            model="openai/
-            model_deployment="
+            model="openai/gpt2",
+            model_deployment="huggingface/gpt2",
             prompt="The Center for Research on Foundation Models (CRFM) is "
             "an interdisciplinary initiative born out of the Stanford "
             "Institute for Human-Centered Artificial Intelligence (HAI) "
helm/tokenizers/ai21_tokenizer.py
CHANGED

@@ -1,60 +1,52 @@
-
-import
-
-from
-
-from helm.
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-        self.
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-            # If 'tokens' is not present in the response, assume request failed.
-            if "tokens" not in response:
-                handle_failed_request(api_type="tokenizer", response=response)
-
-            return response
-
-        try:
-            response, cached = self.cache.get(raw_request, do_it)
-        except AI21RequestError:
-            return TokenizationRequestResult(success=False, cached=False, text="", tokens=[])
-
-        # Each token is represented like this in the response:
-        # {'token': '▁Hello', 'textRange': {'start': 0, 'end': 5}}
-        tokens: List[TokenizationToken] = []
-        for token_dict in response["tokens"]:
-            tokens.append(
-                TokenizationToken(value=token_dict["token"], text_range=from_dict(TextRange, token_dict["textRange"]))
+import threading
+from typing import Any, Dict
+
+from helm.common.cache import CacheConfig
+from helm.common.optional_dependencies import handle_module_not_found_error
+from helm.tokenizers.caching_tokenizer import CachingTokenizer
+
+try:
+    from ai21_tokenizer import Tokenizer as SDKTokenizer
+    from ai21_tokenizer.base_tokenizer import BaseTokenizer
+except ModuleNotFoundError as e:
+    handle_module_not_found_error(e, ["ai21"])
+
+
+class AI21LocalTokenizer(CachingTokenizer):
+    """AI21 tokenizer using the AI21 Python library."""
+
+    def __init__(self, cache_config: CacheConfig) -> None:
+        super().__init__(cache_config)
+        self._tokenizers_lock = threading.Lock()
+        self.tokenizers: Dict[str, BaseTokenizer] = {}
+
+    def _get_tokenizer(self, tokenizer_name: str) -> BaseTokenizer:
+        with self._tokenizers_lock:
+            if tokenizer_name not in self.tokenizers:
+                self.tokenizers[tokenizer_name] = SDKTokenizer.get_tokenizer(tokenizer_name)
+        return self.tokenizers[tokenizer_name]
+
+    def _tokenize_do_it(self, request: Dict[str, Any]) -> Dict[str, Any]:
+        tokenizer_name = request["tokenizer"].split("/")[1]
+        tokenizer = self._get_tokenizer(tokenizer_name)
+        if request["truncation"]:
+            token_ids = tokenizer.encode(
+                text=request["text"],
+                truncation=request["truncation"],
+                max_length=request["max_length"],
+                add_special_tokens=False,
             )
-
-
-
-
-
+        else:
+            token_ids = tokenizer.encode(
+                text=request["text"],
+                add_special_tokens=False,
+            )
+        if request["encode"]:
+            return {"tokens": token_ids}
+        else:
+            return {"tokens": tokenizer.convert_ids_to_tokens(token_ids)}
+
+    def _decode_do_it(self, request: Dict[str, Any]) -> Dict[str, Any]:
+        tokenizer_name = request["tokenizer"].split("/")[1]
+        tokenizer = self._get_tokenizer(tokenizer_name)
+        return {"text": tokenizer.decode(request["tokens"])}
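For orientation, the new AI21LocalTokenizer is a thin wrapper over the ai21_tokenizer SDK calls visible above (get_tokenizer, encode, convert_ids_to_tokens, decode). A hedged standalone sketch of those same calls, with the tokenizer name taken from the ai21/jamba-tokenizer config entry added earlier in this diff:

from ai21_tokenizer import Tokenizer  # imported as SDKTokenizer in the diff above

# "jamba-tokenizer" is the suffix of the ai21/jamba-tokenizer config name.
tokenizer = Tokenizer.get_tokenizer("jamba-tokenizer")
token_ids = tokenizer.encode(text="Hello world", add_special_tokens=False)
print(tokenizer.convert_ids_to_tokens(token_ids))  # token strings
print(tokenizer.decode(token_ids))  # round-trip back to text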
helm/tokenizers/cohere_tokenizer.py
CHANGED

@@ -1,5 +1,3 @@
-import json
-import requests
 from typing import Any, Dict, List, Optional
 
 import cohere
@@ -8,84 +6,11 @@ from cohere.manually_maintained.tokenizers import get_hf_tokenizer
 from helm.common.cache import CacheConfig
 from helm.common.tokenization_request import (
     TokenizationRequest,
-    DecodeRequest,
-    DecodeRequestResult,
     TokenizationToken,
 )
-from helm.clients.cohere_utils import get_cohere_url, DEFAULT_COHERE_API_VERSION
 from helm.tokenizers.caching_tokenizer import CachingTokenizer
 
 
-class CohereTokenizer(CachingTokenizer):
-    # From "https://docs.cohere.ai/versioning-reference",
-    # "this version [2021-11-08] introduces multiple generations, meaning that the generations endpoint will
-    # now accept a num_generations argument in the JSON and will always return an array of generations"
-    # Note that the API version is decoupled from the model version.
-    DEFAULT_API_VERSION: str = "2021-11-08"
-
-    TOKENIZE_ENDPOINT: str = "tokenize"
-
-    # According to https://docs.cohere.ai/tokenize-reference#request, for tokenize, text: "the string to
-    # be tokenized, the minimum text length is 1 character, and the maximum text length is 65536 characters."
-    # However, even sending a request with 60,000 characters sometimes fails, so we set the
-    # maximum length to 50,000, which is about 8,333 tokens.
-    # TODO: followed up with Cohere support with an example of a failure case
-    TOKENIZE_API_MAX_TEXT_LENGTH: int = 50_000
-
-    def __init__(self, api_key: str, cache_config: CacheConfig) -> None:
-        super().__init__(cache_config)
-        self.api_key: str = api_key
-
-    def _tokenization_request_to_cache_key(self, request: TokenizationRequest) -> Dict[str, Any]:
-        # This cache key is used to preserve our existing Cache (10/17/2023)
-        return {"text": request.text}
-
-    def _tokenize_do_it(self, request: Dict[str, Any]) -> Dict[str, Any]:
-        """
-        Send the request to the Cohere Tokenize API.
-
-        From https://docs.cohere.ai/tokenize-reference, for text "tokenize me! :D", the response will be:
-
-        {
-            "tokens": [34160, 974, 514, 34, 1420, 69]
-            "token_strings": ["token", "ize", " me", "!", " :", "D"]
-        }
-        """
-        text: str = request["text"]
-        assert (
-            1 <= len(text) <= CohereTokenizer.TOKENIZE_API_MAX_TEXT_LENGTH
-        ), f"Invalid text length: {len(text)}. Valid length: [1..{CohereTokenizer.TOKENIZE_API_MAX_TEXT_LENGTH:,d}]"
-
-        response = requests.request(
-            method="POST",
-            url=get_cohere_url(CohereTokenizer.TOKENIZE_ENDPOINT),
-            headers={
-                "Authorization": f"BEARER {self.api_key}",
-                "Content-Type": "application/json",
-                "Cohere-Version": DEFAULT_COHERE_API_VERSION,
-            },
-            data=json.dumps(request),
-        )
-        result = json.loads(response.text)
-        assert "message" not in result.keys(), f"Request failed with error {result['message']}"
-        assert "tokens" in result and "token_strings" in result, f"Invalid response: {result}"
-        # This output format is used to preserve our existing Cache (10/17/2023)
-        return result
-
-    def _tokenization_raw_response_to_tokens(
-        self, response: Dict[str, Any], request: TokenizationRequest
-    ) -> List[TokenizationToken]:
-        tokens = response["tokens" if request.encode else "token_strings"]
-        return [TokenizationToken(token) for token in tokens]
-
-    def _decode_do_it(self, request: Dict[str, Any]) -> Dict[str, Any]:
-        # Defined for mypy but decode() already raises NotImplementedError
-        raise NotImplementedError("The Cohere API does not support decoding.")
-
-    def decode(self, request: DecodeRequest) -> DecodeRequestResult:
-        raise NotImplementedError("The Cohere API does not support decoding.")
-
-
 class CohereLocalTokenizer(CachingTokenizer):
     """Cohere tokenizer using the Cohere Python library."""
 
helm/tokenizers/huggingface_tokenizer.py
CHANGED

@@ -53,7 +53,6 @@ class HuggingFaceTokenizer(CachingTokenizer):
         # If unspecified, set `use_fast=True` by default.
         if "use_fast" not in from_pretrained_kwargs:
             from_pretrained_kwargs["use_fast"] = True
-        print(from_pretrained_kwargs)
         try:
             # From the Hugging Face documentation, "local_files_only(defaults to False) —
             # Whether or not to only look at local files".