crfm-helm 0.3.0__py3-none-any.whl → 0.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (546)
  1. {crfm_helm-0.3.0.dist-info → crfm_helm-0.5.0.dist-info}/METADATA +144 -36
  2. crfm_helm-0.5.0.dist-info/RECORD +642 -0
  3. {crfm_helm-0.3.0.dist-info → crfm_helm-0.5.0.dist-info}/WHEEL +1 -1
  4. helm/benchmark/adaptation/adapter_spec.py +37 -2
  5. helm/benchmark/adaptation/adapters/adapter.py +4 -42
  6. helm/benchmark/adaptation/adapters/adapter_factory.py +24 -27
  7. helm/benchmark/adaptation/adapters/binary_ranking_adapter.py +1 -0
  8. helm/benchmark/adaptation/adapters/generation_adapter.py +2 -0
  9. helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +21 -4
  10. helm/benchmark/adaptation/adapters/language_modeling_adapter.py +12 -5
  11. helm/benchmark/adaptation/adapters/multimodal/generation_multimodal_adapter.py +1 -0
  12. helm/benchmark/adaptation/adapters/multimodal/in_context_learning_multimodal_adapter.py +1 -0
  13. helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +104 -0
  14. helm/benchmark/adaptation/adapters/multimodal/test_in_context_learning_multimodal_adapter.py +5 -1
  15. helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +1 -0
  16. helm/benchmark/adaptation/adapters/multiple_choice_separate_adapter.py +1 -0
  17. helm/benchmark/adaptation/adapters/test_adapter.py +2 -1
  18. helm/benchmark/adaptation/adapters/test_generation_adapter.py +59 -14
  19. helm/benchmark/adaptation/adapters/test_language_modeling_adapter.py +40 -5
  20. helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py +78 -10
  21. helm/benchmark/adaptation/common_adapter_specs.py +376 -0
  22. helm/benchmark/adaptation/prompt.py +7 -1
  23. helm/benchmark/adaptation/request_state.py +6 -1
  24. helm/benchmark/adaptation/scenario_state.py +6 -2
  25. helm/benchmark/annotation/annotator.py +43 -0
  26. helm/benchmark/annotation/annotator_factory.py +61 -0
  27. helm/benchmark/annotation/image2structure/image_compiler_annotator.py +88 -0
  28. helm/benchmark/annotation/image2structure/latex_compiler_annotator.py +59 -0
  29. helm/benchmark/annotation/image2structure/lilypond_compiler_annotator.py +84 -0
  30. helm/benchmark/annotation/image2structure/webpage_compiler_annotator.py +132 -0
  31. helm/benchmark/annotation/test_annotator_factory.py +26 -0
  32. helm/benchmark/annotation/test_dummy_annotator.py +44 -0
  33. helm/benchmark/annotation_executor.py +124 -0
  34. helm/benchmark/augmentations/cleva_perturbation.py +7 -14
  35. helm/benchmark/augmentations/contraction_expansion_perturbation.py +3 -3
  36. helm/benchmark/augmentations/contrast_sets_perturbation.py +0 -3
  37. helm/benchmark/augmentations/data_augmenter.py +0 -2
  38. helm/benchmark/augmentations/dialect_perturbation.py +2 -2
  39. helm/benchmark/augmentations/extra_space_perturbation.py +2 -2
  40. helm/benchmark/augmentations/filler_words_perturbation.py +2 -2
  41. helm/benchmark/augmentations/gender_perturbation.py +3 -3
  42. helm/benchmark/augmentations/lowercase_perturbation.py +2 -2
  43. helm/benchmark/augmentations/mild_mix_perturbation.py +2 -2
  44. helm/benchmark/augmentations/misspelling_perturbation.py +2 -2
  45. helm/benchmark/augmentations/person_name_perturbation.py +0 -7
  46. helm/benchmark/augmentations/perturbation.py +20 -7
  47. helm/benchmark/augmentations/perturbation_description.py +1 -1
  48. helm/benchmark/augmentations/space_perturbation.py +2 -2
  49. helm/benchmark/augmentations/suffix_perturbation.py +29 -0
  50. helm/benchmark/augmentations/synonym_perturbation.py +2 -2
  51. helm/benchmark/augmentations/test_perturbation.py +11 -7
  52. helm/benchmark/augmentations/translate_perturbation.py +30 -0
  53. helm/benchmark/augmentations/typos_perturbation.py +2 -2
  54. helm/benchmark/config_registry.py +38 -0
  55. helm/benchmark/executor.py +46 -16
  56. helm/benchmark/huggingface_registration.py +37 -7
  57. helm/benchmark/metrics/basic_metrics.py +172 -641
  58. helm/benchmark/metrics/bbq_metrics.py +3 -4
  59. helm/benchmark/metrics/bias_metrics.py +6 -6
  60. helm/benchmark/metrics/classification_metrics.py +11 -8
  61. helm/benchmark/metrics/cleva_accuracy_metrics.py +8 -5
  62. helm/benchmark/metrics/cleva_harms_metrics.py +2 -2
  63. helm/benchmark/metrics/code_metrics.py +4 -3
  64. helm/benchmark/metrics/code_metrics_helper.py +0 -2
  65. helm/benchmark/metrics/common_metric_specs.py +167 -0
  66. helm/benchmark/metrics/decodingtrust_fairness_metrics.py +72 -0
  67. helm/benchmark/metrics/decodingtrust_ood_knowledge_metrics.py +66 -0
  68. helm/benchmark/metrics/decodingtrust_privacy_metrics.py +101 -0
  69. helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +202 -0
  70. helm/benchmark/metrics/disinformation_metrics.py +6 -112
  71. helm/benchmark/metrics/dry_run_metrics.py +5 -3
  72. helm/benchmark/metrics/efficiency_metrics.py +206 -0
  73. helm/benchmark/metrics/evaluate_instances_metric.py +59 -0
  74. helm/benchmark/metrics/evaluate_reference_metrics.py +376 -0
  75. helm/benchmark/metrics/image_generation/aesthetics_metrics.py +54 -0
  76. helm/benchmark/metrics/image_generation/aesthetics_scorer.py +66 -0
  77. helm/benchmark/metrics/image_generation/clip_score_metrics.py +73 -0
  78. helm/benchmark/metrics/image_generation/denoised_runtime_metric.py +42 -0
  79. helm/benchmark/metrics/image_generation/detection_metrics.py +57 -0
  80. helm/benchmark/metrics/image_generation/detectors/base_detector.py +8 -0
  81. helm/benchmark/metrics/image_generation/detectors/vitdet.py +178 -0
  82. helm/benchmark/metrics/image_generation/efficiency_metrics.py +41 -0
  83. helm/benchmark/metrics/image_generation/fidelity_metrics.py +168 -0
  84. helm/benchmark/metrics/image_generation/fractal_dimension/__init__.py +0 -0
  85. helm/benchmark/metrics/image_generation/fractal_dimension/fractal_dimension_util.py +63 -0
  86. helm/benchmark/metrics/image_generation/fractal_dimension/test_fractal_dimension_util.py +33 -0
  87. helm/benchmark/metrics/image_generation/fractal_dimension_metric.py +50 -0
  88. helm/benchmark/metrics/image_generation/gender_metrics.py +58 -0
  89. helm/benchmark/metrics/image_generation/image_critique_metrics.py +284 -0
  90. helm/benchmark/metrics/image_generation/lpips_metrics.py +82 -0
  91. helm/benchmark/metrics/image_generation/multi_scale_ssim_metrics.py +82 -0
  92. helm/benchmark/metrics/image_generation/nsfw_detector.py +96 -0
  93. helm/benchmark/metrics/image_generation/nsfw_metrics.py +103 -0
  94. helm/benchmark/metrics/image_generation/nudity_metrics.py +38 -0
  95. helm/benchmark/metrics/image_generation/photorealism_critique_metrics.py +153 -0
  96. helm/benchmark/metrics/image_generation/psnr_metrics.py +78 -0
  97. helm/benchmark/metrics/image_generation/q16/__init__.py +0 -0
  98. helm/benchmark/metrics/image_generation/q16/q16_toxicity_detector.py +90 -0
  99. helm/benchmark/metrics/image_generation/q16/test_q16.py +18 -0
  100. helm/benchmark/metrics/image_generation/q16_toxicity_metrics.py +48 -0
  101. helm/benchmark/metrics/image_generation/skin_tone_metrics.py +164 -0
  102. helm/benchmark/metrics/image_generation/uiqi_metrics.py +92 -0
  103. helm/benchmark/metrics/image_generation/watermark/__init__.py +0 -0
  104. helm/benchmark/metrics/image_generation/watermark/test_watermark_detector.py +16 -0
  105. helm/benchmark/metrics/image_generation/watermark/watermark_detector.py +87 -0
  106. helm/benchmark/metrics/image_generation/watermark_metrics.py +48 -0
  107. helm/benchmark/metrics/instruction_following_critique_metrics.py +3 -1
  108. helm/benchmark/metrics/language_modeling_metrics.py +99 -0
  109. helm/benchmark/metrics/machine_translation_metrics.py +5 -5
  110. helm/benchmark/metrics/metric.py +93 -172
  111. helm/benchmark/metrics/metric_name.py +0 -1
  112. helm/benchmark/metrics/metric_service.py +16 -0
  113. helm/benchmark/metrics/paraphrase_generation_metrics.py +3 -4
  114. helm/benchmark/metrics/ranking_metrics.py +6 -7
  115. helm/benchmark/metrics/reference_metric.py +148 -0
  116. helm/benchmark/metrics/summac/model_summac.py +0 -2
  117. helm/benchmark/metrics/summarization_metrics.py +8 -8
  118. helm/benchmark/metrics/test_classification_metrics.py +9 -6
  119. helm/benchmark/metrics/test_disinformation_metrics.py +78 -0
  120. helm/benchmark/metrics/test_evaluate_reference_metrics.py +30 -0
  121. helm/benchmark/metrics/test_metric.py +2 -2
  122. helm/benchmark/metrics/tokens/auto_token_cost_estimator.py +1 -1
  123. helm/benchmark/metrics/tokens/gooseai_token_cost_estimator.py +13 -3
  124. helm/benchmark/metrics/tokens/openai_token_cost_estimator.py +1 -1
  125. helm/benchmark/metrics/tokens/test_ai21_token_cost_estimator.py +2 -0
  126. helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py +9 -2
  127. helm/benchmark/metrics/toxicity_metrics.py +1 -1
  128. helm/benchmark/metrics/toxicity_utils.py +23 -0
  129. helm/benchmark/metrics/unitxt_metrics.py +81 -0
  130. helm/benchmark/metrics/vision_language/__init__.py +0 -0
  131. helm/benchmark/metrics/vision_language/emd_utils.py +341 -0
  132. helm/benchmark/metrics/vision_language/image_metrics.py +450 -0
  133. helm/benchmark/metrics/vision_language/image_utils.py +100 -0
  134. helm/benchmark/model_deployment_registry.py +164 -41
  135. helm/benchmark/model_metadata_registry.py +181 -35
  136. helm/benchmark/multi_gpu_runner.py +133 -0
  137. helm/benchmark/presentation/contamination.py +3 -3
  138. helm/benchmark/presentation/create_plots.py +8 -7
  139. helm/benchmark/presentation/run_display.py +50 -17
  140. helm/benchmark/presentation/schema.py +28 -46
  141. helm/benchmark/presentation/summarize.py +213 -96
  142. helm/benchmark/presentation/table.py +8 -8
  143. helm/benchmark/presentation/test_contamination.py +2 -2
  144. helm/benchmark/presentation/test_run_entry.py +14 -9
  145. helm/benchmark/presentation/test_summarize.py +5 -0
  146. helm/benchmark/run.py +66 -54
  147. helm/benchmark/run_expander.py +342 -31
  148. helm/benchmark/run_spec.py +93 -0
  149. helm/benchmark/run_spec_factory.py +162 -0
  150. helm/benchmark/run_specs/__init__.py +0 -0
  151. helm/benchmark/{run_specs.py → run_specs/classic_run_specs.py} +217 -1330
  152. helm/benchmark/run_specs/cleva_run_specs.py +277 -0
  153. helm/benchmark/run_specs/decodingtrust_run_specs.py +314 -0
  154. helm/benchmark/run_specs/heim_run_specs.py +623 -0
  155. helm/benchmark/run_specs/instruction_following_run_specs.py +129 -0
  156. helm/benchmark/run_specs/lite_run_specs.py +307 -0
  157. helm/benchmark/run_specs/simple_run_specs.py +104 -0
  158. helm/benchmark/run_specs/unitxt_run_specs.py +42 -0
  159. helm/benchmark/run_specs/vlm_run_specs.py +501 -0
  160. helm/benchmark/runner.py +116 -69
  161. helm/benchmark/runner_config_registry.py +21 -0
  162. helm/benchmark/scenarios/bbq_scenario.py +1 -1
  163. helm/benchmark/scenarios/bold_scenario.py +2 -2
  164. helm/benchmark/scenarios/cleva_scenario.py +43 -46
  165. helm/benchmark/scenarios/code_scenario.py +3 -2
  166. helm/benchmark/scenarios/commonsense_scenario.py +171 -191
  167. helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +169 -0
  168. helm/benchmark/scenarios/decodingtrust_adv_robustness_scenario.py +121 -0
  169. helm/benchmark/scenarios/decodingtrust_fairness_scenario.py +77 -0
  170. helm/benchmark/scenarios/decodingtrust_machine_ethics_scenario.py +324 -0
  171. helm/benchmark/scenarios/decodingtrust_ood_robustness_scenario.py +204 -0
  172. helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +559 -0
  173. helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +67 -0
  174. helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +78 -0
  175. helm/benchmark/scenarios/dialogue_scenarios.py +0 -1
  176. helm/benchmark/scenarios/entity_matching_scenario.py +1 -1
  177. helm/benchmark/scenarios/image_generation/__init__.py +0 -0
  178. helm/benchmark/scenarios/image_generation/common_syntactic_processes_scenario.py +105 -0
  179. helm/benchmark/scenarios/image_generation/cub200_scenario.py +95 -0
  180. helm/benchmark/scenarios/image_generation/daily_dalle_scenario.py +124 -0
  181. helm/benchmark/scenarios/image_generation/demographic_stereotypes_scenario.py +82 -0
  182. helm/benchmark/scenarios/image_generation/detection_scenario.py +83 -0
  183. helm/benchmark/scenarios/image_generation/draw_bench_scenario.py +74 -0
  184. helm/benchmark/scenarios/image_generation/i2p_scenario.py +57 -0
  185. helm/benchmark/scenarios/image_generation/landing_page_scenario.py +46 -0
  186. helm/benchmark/scenarios/image_generation/logos_scenario.py +223 -0
  187. helm/benchmark/scenarios/image_generation/magazine_cover_scenario.py +91 -0
  188. helm/benchmark/scenarios/image_generation/mental_disorders_scenario.py +46 -0
  189. helm/benchmark/scenarios/image_generation/mscoco_scenario.py +91 -0
  190. helm/benchmark/scenarios/image_generation/paint_skills_scenario.py +72 -0
  191. helm/benchmark/scenarios/image_generation/parti_prompts_scenario.py +94 -0
  192. helm/benchmark/scenarios/image_generation/radiology_scenario.py +42 -0
  193. helm/benchmark/scenarios/image_generation/relational_understanding_scenario.py +52 -0
  194. helm/benchmark/scenarios/image_generation/time_most_significant_historical_figures_scenario.py +124 -0
  195. helm/benchmark/scenarios/image_generation/winoground_scenario.py +62 -0
  196. helm/benchmark/scenarios/imdb_scenario.py +0 -1
  197. helm/benchmark/scenarios/legalbench_scenario.py +123 -0
  198. helm/benchmark/scenarios/live_qa_scenario.py +94 -0
  199. helm/benchmark/scenarios/lm_entry_scenario.py +185 -0
  200. helm/benchmark/scenarios/lsat_qa_scenario.py +4 -2
  201. helm/benchmark/scenarios/math_scenario.py +19 -2
  202. helm/benchmark/scenarios/medication_qa_scenario.py +60 -0
  203. helm/benchmark/scenarios/numeracy_scenario.py +3 -3
  204. helm/benchmark/scenarios/opinions_qa_scenario.py +6 -10
  205. helm/benchmark/scenarios/raft_scenario.py +2 -6
  206. helm/benchmark/scenarios/scenario.py +14 -2
  207. helm/benchmark/scenarios/simple_scenarios.py +122 -1
  208. helm/benchmark/scenarios/test_math_scenario.py +22 -0
  209. helm/benchmark/scenarios/test_scenario.py +6 -3
  210. helm/benchmark/scenarios/test_simple_scenarios.py +50 -0
  211. helm/benchmark/scenarios/thai_exam_scenario.py +135 -0
  212. helm/benchmark/scenarios/the_pile_scenario.py +6 -7
  213. helm/benchmark/scenarios/unitxt_scenario.py +56 -0
  214. helm/benchmark/scenarios/verifiability_judgment_scenario.py +3 -1
  215. helm/benchmark/scenarios/vicuna_scenario.py +1 -1
  216. helm/benchmark/scenarios/vision_language/bingo_scenario.py +103 -0
  217. helm/benchmark/scenarios/vision_language/hateful_memes_scenario.py +92 -0
  218. helm/benchmark/scenarios/vision_language/heim_human_eval_scenario.py +113 -0
  219. helm/benchmark/scenarios/vision_language/image2structure/__init__.py +0 -0
  220. helm/benchmark/scenarios/vision_language/image2structure/chart2csv_scenario.py +55 -0
  221. helm/benchmark/scenarios/vision_language/image2structure/image2structure_scenario.py +214 -0
  222. helm/benchmark/scenarios/vision_language/image2structure/latex_scenario.py +25 -0
  223. helm/benchmark/scenarios/vision_language/image2structure/musicsheet_scenario.py +20 -0
  224. helm/benchmark/scenarios/vision_language/image2structure/utils_latex.py +347 -0
  225. helm/benchmark/scenarios/vision_language/image2structure/webpage/__init__.py +0 -0
  226. helm/benchmark/scenarios/vision_language/image2structure/webpage/driver.py +84 -0
  227. helm/benchmark/scenarios/vision_language/image2structure/webpage/jekyll_server.py +182 -0
  228. helm/benchmark/scenarios/vision_language/image2structure/webpage/utils.py +31 -0
  229. helm/benchmark/scenarios/vision_language/image2structure/webpage_scenario.py +225 -0
  230. helm/benchmark/scenarios/vision_language/mementos_scenario.py +124 -0
  231. helm/benchmark/scenarios/vision_language/mme_scenario.py +145 -0
  232. helm/benchmark/scenarios/vision_language/mmmu_scenario.py +187 -0
  233. helm/benchmark/scenarios/vision_language/multipanelvqa_scenario.py +169 -0
  234. helm/benchmark/scenarios/vision_language/pope_scenario.py +104 -0
  235. helm/benchmark/scenarios/vision_language/seed_bench_scenario.py +129 -0
  236. helm/benchmark/scenarios/vision_language/unicorn_scenario.py +108 -0
  237. helm/benchmark/scenarios/vision_language/viz_wiz_scenario.py +107 -0
  238. helm/benchmark/scenarios/vision_language/vqa_scenario.py +1 -1
  239. helm/benchmark/scenarios/wmt_14_scenario.py +18 -18
  240. helm/benchmark/server.py +59 -2
  241. helm/benchmark/slurm_jobs.py +12 -0
  242. helm/benchmark/slurm_runner.py +79 -51
  243. helm/benchmark/static/benchmarking.js +3 -4
  244. helm/benchmark/static/contamination.yaml +1 -1
  245. helm/benchmark/static/images/organizations/together.png +0 -0
  246. helm/benchmark/static/json-urls.js +4 -0
  247. helm/benchmark/static/{schema.yaml → schema_classic.yaml} +346 -930
  248. helm/benchmark/static/schema_instruction_following.yaml +210 -0
  249. helm/benchmark/static/schema_lite.yaml +824 -0
  250. helm/benchmark/static/schema_mmlu.yaml +1507 -0
  251. helm/benchmark/static/schema_unitxt.yaml +428 -0
  252. helm/benchmark/static/schema_vlm.yaml +576 -0
  253. helm/benchmark/static_build/assets/01-694cb9b7.png +0 -0
  254. helm/benchmark/static_build/assets/ai21-0eb91ec3.png +0 -0
  255. helm/benchmark/static_build/assets/aleph-alpha-7ce10034.png +0 -0
  256. helm/benchmark/static_build/assets/anthropic-70d8bc39.png +0 -0
  257. helm/benchmark/static_build/assets/bigscience-7f0400c0.png +0 -0
  258. helm/benchmark/static_build/assets/cohere-3550c6cb.png +0 -0
  259. helm/benchmark/static_build/assets/crfm-logo-74391ab8.png +0 -0
  260. helm/benchmark/static_build/assets/eleutherai-b9451114.png +0 -0
  261. helm/benchmark/static_build/assets/google-06d997ad.png +0 -0
  262. helm/benchmark/static_build/assets/heim-logo-3e5e3aa4.png +0 -0
  263. helm/benchmark/static_build/assets/helm-logo-simple-2ed5400b.png +0 -0
  264. helm/benchmark/static_build/assets/helmhero-28e90f4d.png +0 -0
  265. helm/benchmark/static_build/assets/index-5088afcb.css +1 -0
  266. helm/benchmark/static_build/assets/index-d839df55.js +9 -0
  267. helm/benchmark/static_build/assets/meta-5580e9f1.png +0 -0
  268. helm/benchmark/static_build/assets/microsoft-f5ee5016.png +0 -0
  269. helm/benchmark/static_build/assets/mistral-18e1be23.png +0 -0
  270. helm/benchmark/static_build/assets/nvidia-86fa75c1.png +0 -0
  271. helm/benchmark/static_build/assets/openai-3f8653e4.png +0 -0
  272. helm/benchmark/static_build/assets/react-d4a0b69b.js +85 -0
  273. helm/benchmark/static_build/assets/recharts-6d337683.js +97 -0
  274. helm/benchmark/static_build/assets/tii-24de195c.png +0 -0
  275. helm/benchmark/static_build/assets/together-a665a35b.png +0 -0
  276. helm/benchmark/static_build/assets/tremor-54a99cc4.js +10 -0
  277. helm/benchmark/static_build/assets/tsinghua-keg-97d4b395.png +0 -0
  278. helm/benchmark/static_build/assets/vhelm-framework-cde7618a.png +0 -0
  279. helm/benchmark/static_build/assets/vhelm-model-6d812526.png +0 -0
  280. helm/benchmark/static_build/assets/yandex-38e09d70.png +0 -0
  281. helm/benchmark/static_build/config.js +4 -0
  282. helm/benchmark/static_build/index.html +20 -0
  283. helm/benchmark/test_data_preprocessor.py +3 -3
  284. helm/benchmark/test_model_deployment_definition.py +90 -0
  285. helm/benchmark/test_run_expander.py +1 -1
  286. helm/benchmark/tokenizer_config_registry.py +10 -14
  287. helm/benchmark/window_services/ai21_window_service.py +22 -33
  288. helm/benchmark/window_services/cohere_window_service.py +1 -63
  289. helm/benchmark/window_services/default_window_service.py +2 -35
  290. helm/benchmark/window_services/encoder_decoder_window_service.py +0 -11
  291. helm/benchmark/window_services/ice_window_service.py +0 -34
  292. helm/benchmark/window_services/image_generation/__init__.py +0 -0
  293. helm/benchmark/window_services/image_generation/clip_window_service.py +15 -0
  294. helm/benchmark/window_services/image_generation/lexica_search_window_service.py +9 -0
  295. helm/benchmark/window_services/image_generation/openai_dalle_window_service.py +9 -0
  296. helm/benchmark/window_services/image_generation/test_clip_window_service.py +29 -0
  297. helm/benchmark/window_services/image_generation/test_openai_dalle_window_service.py +30 -0
  298. helm/benchmark/window_services/local_window_service.py +21 -4
  299. helm/benchmark/window_services/no_decoding_window_service.py +32 -0
  300. helm/benchmark/window_services/test_anthropic_window_service.py +2 -1
  301. helm/benchmark/window_services/test_bloom_window_service.py +2 -1
  302. helm/benchmark/window_services/test_cohere_window_service.py +2 -1
  303. helm/benchmark/window_services/test_flan_t5_window_service.py +2 -1
  304. helm/benchmark/window_services/test_gpt2_window_service.py +2 -2
  305. helm/benchmark/window_services/test_gpt4_window_service.py +2 -1
  306. helm/benchmark/window_services/test_gptj_window_service.py +3 -2
  307. helm/benchmark/window_services/test_gptneox_window_service.py +3 -2
  308. helm/benchmark/window_services/test_ice_window_service.py +2 -1
  309. helm/benchmark/window_services/test_openai_window_service.py +2 -1
  310. helm/benchmark/window_services/test_opt_window_service.py +3 -2
  311. helm/benchmark/window_services/test_palmyra_window_service.py +2 -1
  312. helm/benchmark/window_services/test_t0pp_window_service.py +2 -1
  313. helm/benchmark/window_services/test_t511b_window_service.py +2 -1
  314. helm/benchmark/window_services/test_ul2_window_service.py +2 -1
  315. helm/benchmark/window_services/test_utils.py +3 -2
  316. helm/benchmark/window_services/test_yalm_window_service.py +2 -1
  317. helm/benchmark/window_services/window_service.py +42 -0
  318. helm/benchmark/window_services/window_service_factory.py +24 -269
  319. helm/benchmark/window_services/yalm_window_service.py +0 -27
  320. helm/clients/__init__.py +0 -0
  321. helm/{proxy/clients → clients}/ai21_client.py +5 -12
  322. helm/clients/aleph_alpha_client.py +112 -0
  323. helm/{proxy/clients → clients}/anthropic_client.py +213 -24
  324. helm/clients/auto_client.py +215 -0
  325. helm/clients/bedrock_client.py +128 -0
  326. helm/clients/bedrock_utils.py +72 -0
  327. helm/{proxy/clients → clients}/client.py +67 -55
  328. helm/clients/clip_score_client.py +49 -0
  329. helm/clients/clip_scorers/__init__.py +0 -0
  330. helm/clients/clip_scorers/base_clip_scorer.py +18 -0
  331. helm/clients/clip_scorers/clip_scorer.py +50 -0
  332. helm/clients/clip_scorers/multilingual_clip_scorer.py +50 -0
  333. helm/{proxy/clients → clients}/cohere_client.py +6 -17
  334. helm/clients/gcs_client.py +82 -0
  335. helm/{proxy/clients → clients}/google_client.py +7 -8
  336. helm/clients/google_translate_client.py +35 -0
  337. helm/{proxy/clients → clients}/http_model_client.py +6 -10
  338. helm/{proxy/clients → clients}/huggingface_client.py +134 -92
  339. helm/clients/image_generation/__init__.py +0 -0
  340. helm/clients/image_generation/adobe_vision_client.py +78 -0
  341. helm/clients/image_generation/aleph_alpha_image_generation_client.py +98 -0
  342. helm/clients/image_generation/cogview2/__init__.py +0 -0
  343. helm/clients/image_generation/cogview2/coglm_strategy.py +96 -0
  344. helm/clients/image_generation/cogview2/coglm_utils.py +82 -0
  345. helm/clients/image_generation/cogview2/sr_pipeline/__init__.py +15 -0
  346. helm/clients/image_generation/cogview2/sr_pipeline/direct_sr.py +96 -0
  347. helm/clients/image_generation/cogview2/sr_pipeline/dsr_model.py +254 -0
  348. helm/clients/image_generation/cogview2/sr_pipeline/dsr_sampling.py +190 -0
  349. helm/clients/image_generation/cogview2/sr_pipeline/iterative_sr.py +141 -0
  350. helm/clients/image_generation/cogview2/sr_pipeline/itersr_model.py +269 -0
  351. helm/clients/image_generation/cogview2/sr_pipeline/itersr_sampling.py +120 -0
  352. helm/clients/image_generation/cogview2/sr_pipeline/sr_group.py +42 -0
  353. helm/clients/image_generation/cogview2_client.py +191 -0
  354. helm/clients/image_generation/dalle2_client.py +192 -0
  355. helm/clients/image_generation/dalle3_client.py +108 -0
  356. helm/clients/image_generation/dalle_mini/__init__.py +3 -0
  357. helm/clients/image_generation/dalle_mini/data.py +442 -0
  358. helm/clients/image_generation/dalle_mini/model/__init__.py +5 -0
  359. helm/clients/image_generation/dalle_mini/model/configuration.py +175 -0
  360. helm/clients/image_generation/dalle_mini/model/modeling.py +1834 -0
  361. helm/clients/image_generation/dalle_mini/model/partitions.py +84 -0
  362. helm/clients/image_generation/dalle_mini/model/processor.py +63 -0
  363. helm/clients/image_generation/dalle_mini/model/text.py +251 -0
  364. helm/clients/image_generation/dalle_mini/model/tokenizer.py +9 -0
  365. helm/clients/image_generation/dalle_mini/model/utils.py +29 -0
  366. helm/clients/image_generation/dalle_mini/vqgan_jax/__init__.py +1 -0
  367. helm/clients/image_generation/dalle_mini/vqgan_jax/configuration_vqgan.py +40 -0
  368. helm/clients/image_generation/dalle_mini/vqgan_jax/convert_pt_model_to_jax.py +107 -0
  369. helm/clients/image_generation/dalle_mini/vqgan_jax/modeling_flax_vqgan.py +610 -0
  370. helm/clients/image_generation/dalle_mini_client.py +190 -0
  371. helm/clients/image_generation/deep_floyd_client.py +78 -0
  372. helm/clients/image_generation/huggingface_diffusers_client.py +249 -0
  373. helm/clients/image_generation/image_generation_client_utils.py +9 -0
  374. helm/clients/image_generation/lexica_client.py +86 -0
  375. helm/clients/image_generation/mindalle/__init__.py +0 -0
  376. helm/clients/image_generation/mindalle/models/__init__.py +216 -0
  377. helm/clients/image_generation/mindalle/models/stage1/__init__.py +0 -0
  378. helm/clients/image_generation/mindalle/models/stage1/layers.py +312 -0
  379. helm/clients/image_generation/mindalle/models/stage1/vqgan.py +103 -0
  380. helm/clients/image_generation/mindalle/models/stage2/__init__.py +0 -0
  381. helm/clients/image_generation/mindalle/models/stage2/layers.py +144 -0
  382. helm/clients/image_generation/mindalle/models/stage2/transformer.py +268 -0
  383. helm/clients/image_generation/mindalle/models/tokenizer.py +30 -0
  384. helm/clients/image_generation/mindalle/utils/__init__.py +3 -0
  385. helm/clients/image_generation/mindalle/utils/config.py +129 -0
  386. helm/clients/image_generation/mindalle/utils/sampling.py +149 -0
  387. helm/clients/image_generation/mindalle/utils/utils.py +89 -0
  388. helm/clients/image_generation/mindalle_client.py +115 -0
  389. helm/clients/image_generation/nudity_check_client.py +64 -0
  390. helm/clients/image_generation/together_image_generation_client.py +111 -0
  391. helm/{proxy/clients → clients}/lit_gpt_client.py +7 -5
  392. helm/{proxy/clients → clients}/megatron_client.py +13 -7
  393. helm/clients/mistral_client.py +134 -0
  394. helm/clients/moderation_api_client.py +109 -0
  395. helm/clients/open_lm_client.py +43 -0
  396. helm/clients/openai_client.py +302 -0
  397. helm/{proxy/clients → clients}/palmyra_client.py +15 -12
  398. helm/{proxy/clients → clients}/perspective_api_client.py +7 -8
  399. helm/clients/simple_client.py +64 -0
  400. helm/{proxy/clients → clients}/test_auto_client.py +15 -15
  401. helm/clients/test_client.py +100 -0
  402. helm/clients/test_huggingface_client.py +70 -0
  403. helm/clients/test_simple_client.py +19 -0
  404. helm/{proxy/clients → clients}/test_together_client.py +23 -12
  405. helm/{proxy/clients → clients}/together_client.py +18 -71
  406. helm/clients/vertexai_client.py +391 -0
  407. helm/clients/vision_language/__init__.py +0 -0
  408. helm/clients/vision_language/huggingface_vlm_client.py +104 -0
  409. helm/{proxy/clients → clients}/vision_language/idefics_client.py +59 -52
  410. helm/clients/vision_language/open_flamingo/__init__.py +2 -0
  411. helm/clients/vision_language/open_flamingo/src/__init__.py +0 -0
  412. helm/clients/vision_language/open_flamingo/src/factory.py +147 -0
  413. helm/clients/vision_language/open_flamingo/src/flamingo.py +337 -0
  414. helm/clients/vision_language/open_flamingo/src/flamingo_lm.py +155 -0
  415. helm/clients/vision_language/open_flamingo/src/helpers.py +267 -0
  416. helm/clients/vision_language/open_flamingo/src/utils.py +47 -0
  417. helm/clients/vision_language/open_flamingo_client.py +155 -0
  418. helm/clients/vision_language/qwen_vlm_client.py +171 -0
  419. helm/clients/vllm_client.py +46 -0
  420. helm/common/cache.py +24 -179
  421. helm/common/cache_backend_config.py +47 -0
  422. helm/common/clip_score_request.py +41 -0
  423. helm/common/concurrency.py +32 -0
  424. helm/common/credentials_utils.py +28 -0
  425. helm/common/file_caches/__init__.py +0 -0
  426. helm/common/file_caches/file_cache.py +16 -0
  427. helm/common/file_caches/local_file_cache.py +61 -0
  428. helm/common/file_caches/test_local_file_cache.py +25 -0
  429. helm/common/file_upload_request.py +27 -0
  430. helm/common/general.py +29 -10
  431. helm/common/image_generation_parameters.py +25 -0
  432. helm/common/images_utils.py +24 -1
  433. helm/common/key_value_store.py +113 -0
  434. helm/common/media_object.py +13 -0
  435. helm/common/moderations_api_request.py +71 -0
  436. helm/common/mongo_key_value_store.py +88 -0
  437. helm/common/multimodal_request_utils.py +31 -0
  438. helm/common/nudity_check_request.py +29 -0
  439. helm/common/object_spec.py +2 -2
  440. helm/common/request.py +36 -27
  441. helm/common/test_general.py +6 -0
  442. helm/common/tokenization_request.py +6 -3
  443. helm/config/__init__.py +0 -0
  444. helm/config/model_deployments.yaml +1942 -0
  445. helm/config/model_metadata.yaml +2201 -0
  446. helm/config/tokenizer_configs.yaml +362 -0
  447. helm/proxy/accounts.py +31 -4
  448. helm/proxy/critique/mechanical_turk_critique_importer.py +3 -0
  449. helm/proxy/critique/model_critique_client.py +13 -5
  450. helm/proxy/example_queries.py +29 -17
  451. helm/proxy/retry.py +8 -2
  452. helm/proxy/server.py +77 -5
  453. helm/proxy/services/remote_service.py +31 -0
  454. helm/proxy/services/server_service.py +103 -20
  455. helm/proxy/services/service.py +34 -2
  456. helm/proxy/services/test_remote_service.py +7 -6
  457. helm/proxy/services/test_service.py +27 -18
  458. helm/proxy/test_accounts.py +32 -0
  459. helm/proxy/token_counters/auto_token_counter.py +37 -37
  460. helm/proxy/token_counters/test_auto_token_counter.py +164 -0
  461. helm/proxy/token_counters/token_counter.py +3 -5
  462. helm/py.typed +0 -0
  463. helm/tokenizers/__init__.py +0 -0
  464. helm/{proxy/tokenizers → tokenizers}/ai21_tokenizer.py +3 -3
  465. helm/{proxy/tokenizers → tokenizers}/aleph_alpha_tokenizer.py +3 -1
  466. helm/{proxy/tokenizers → tokenizers}/anthropic_tokenizer.py +17 -11
  467. helm/tokenizers/auto_tokenizer.py +93 -0
  468. helm/{proxy/tokenizers → tokenizers}/caching_tokenizer.py +8 -2
  469. helm/{proxy/tokenizers → tokenizers}/cohere_tokenizer.py +1 -1
  470. helm/{proxy/tokenizers → tokenizers}/http_model_tokenizer.py +3 -3
  471. helm/{proxy/tokenizers → tokenizers}/huggingface_tokenizer.py +56 -60
  472. helm/tokenizers/simple_tokenizer.py +33 -0
  473. helm/tokenizers/test_anthropic_tokenizer.py +82 -0
  474. helm/tokenizers/test_huggingface_tokenizer.py +136 -0
  475. helm/tokenizers/test_simple_tokenizer.py +33 -0
  476. helm/tokenizers/vertexai_tokenizer.py +97 -0
  477. helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer.py +5 -3
  478. helm/tokenizers/yalm_tokenizer_data/__init__.py +0 -0
  479. helm/tokenizers/yalm_tokenizer_data/voc_100b.sp +0 -0
  480. helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer_data/yalm_tokenizer.py +1 -1
  481. crfm_helm-0.3.0.dist-info/RECORD +0 -396
  482. helm/benchmark/vlm_run_specs.py +0 -71
  483. helm/benchmark/window_services/anthropic_window_service.py +0 -68
  484. helm/benchmark/window_services/bloom_window_service.py +0 -35
  485. helm/benchmark/window_services/flan_t5_window_service.py +0 -29
  486. helm/benchmark/window_services/gpt2_window_service.py +0 -32
  487. helm/benchmark/window_services/gptj_window_service.py +0 -38
  488. helm/benchmark/window_services/gptneox_window_service.py +0 -41
  489. helm/benchmark/window_services/http_model_window_service.py +0 -28
  490. helm/benchmark/window_services/huggingface_window_service.py +0 -59
  491. helm/benchmark/window_services/lit_gpt_window_service.py +0 -27
  492. helm/benchmark/window_services/llama_window_service.py +0 -28
  493. helm/benchmark/window_services/luminous_window_service.py +0 -67
  494. helm/benchmark/window_services/megatron_window_service.py +0 -10
  495. helm/benchmark/window_services/mt_nlg_window_service.py +0 -27
  496. helm/benchmark/window_services/openai_window_service.py +0 -13
  497. helm/benchmark/window_services/opt_window_service.py +0 -35
  498. helm/benchmark/window_services/palmyra_window_service.py +0 -45
  499. helm/benchmark/window_services/remote_window_service.py +0 -48
  500. helm/benchmark/window_services/santacoder_window_service.py +0 -27
  501. helm/benchmark/window_services/starcoder_window_service.py +0 -27
  502. helm/benchmark/window_services/t0pp_window_service.py +0 -35
  503. helm/benchmark/window_services/t511b_window_service.py +0 -30
  504. helm/benchmark/window_services/test_mt_nlg_window_service.py +0 -48
  505. helm/benchmark/window_services/ul2_window_service.py +0 -30
  506. helm/benchmark/window_services/wider_ai21_window_service.py +0 -24
  507. helm/benchmark/window_services/wider_openai_window_service.py +0 -52
  508. helm/proxy/clients/aleph_alpha_client.py +0 -99
  509. helm/proxy/clients/auto_client.py +0 -461
  510. helm/proxy/clients/goose_ai_client.py +0 -100
  511. helm/proxy/clients/microsoft_client.py +0 -182
  512. helm/proxy/clients/openai_client.py +0 -206
  513. helm/proxy/clients/remote_model_registry.py +0 -28
  514. helm/proxy/clients/simple_client.py +0 -61
  515. helm/proxy/clients/test_anthropic_client.py +0 -63
  516. helm/proxy/clients/test_client.py +0 -31
  517. helm/proxy/clients/test_huggingface_client.py +0 -87
  518. helm/proxy/models.py +0 -963
  519. helm/proxy/test_models.py +0 -27
  520. helm/proxy/token_counters/ai21_token_counter.py +0 -20
  521. helm/proxy/token_counters/cohere_token_counter.py +0 -13
  522. helm/proxy/token_counters/free_token_counter.py +0 -12
  523. helm/proxy/token_counters/gooseai_token_counter.py +0 -24
  524. helm/proxy/token_counters/openai_token_counter.py +0 -22
  525. helm/proxy/token_counters/test_ai21_token_counter.py +0 -86
  526. helm/proxy/token_counters/test_openai_token_counter.py +0 -79
  527. helm/proxy/tokenizers/simple_tokenizer.py +0 -32
  528. helm/proxy/tokenizers/test_huggingface_tokenizer.py +0 -56
  529. {crfm_helm-0.3.0.dist-info → crfm_helm-0.5.0.dist-info}/LICENSE +0 -0
  530. {crfm_helm-0.3.0.dist-info → crfm_helm-0.5.0.dist-info}/entry_points.txt +0 -0
  531. {crfm_helm-0.3.0.dist-info → crfm_helm-0.5.0.dist-info}/top_level.txt +0 -0
  532. /helm/{proxy/clients → benchmark/annotation}/__init__.py +0 -0
  533. /helm/{proxy/clients/vision_language → benchmark/annotation/image2structure}/__init__.py +0 -0
  534. /helm/{proxy/tokenizers → benchmark/metrics/image_generation}/__init__.py +0 -0
  535. /helm/{proxy/tokenizers/yalm_tokenizer_data → benchmark/metrics/image_generation/detectors}/__init__.py +0 -0
  536. /helm/{proxy/clients → clients}/ai21_utils.py +0 -0
  537. /helm/{proxy/clients → clients}/cohere_utils.py +0 -0
  538. /helm/{proxy/clients → clients}/lit_gpt_generate.py +0 -0
  539. /helm/{proxy/clients → clients}/toxicity_classifier_client.py +0 -0
  540. /helm/{proxy/tokenizers → tokenizers}/ice_tokenizer.py +0 -0
  541. /helm/{proxy/tokenizers → tokenizers}/lit_gpt_tokenizer.py +0 -0
  542. /helm/{proxy/tokenizers → tokenizers}/test_ice_tokenizer.py +0 -0
  543. /helm/{proxy/tokenizers → tokenizers}/test_yalm_tokenizer.py +0 -0
  544. /helm/{proxy/tokenizers → tokenizers}/tiktoken_tokenizer.py +0 -0
  545. /helm/{proxy/tokenizers → tokenizers}/tokenizer.py +0 -0
  546. /helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer_data/test_yalm_tokenizer.py +0 -0
@@ -1,7 +1,8 @@
1
1
  import os
2
- from typing import Any, Dict, Optional
2
+ from typing import Any, Dict, Optional, cast
3
3
  from threading import Lock
4
4
  from helm.common.cache import CacheConfig
5
+ from helm.common.concurrency import ThreadSafeWrapper
5
6
 
6
7
  from transformers import AutoTokenizer, PreTrainedTokenizerBase
7
8
 
@@ -10,46 +11,36 @@ from .caching_tokenizer import CachingTokenizer
10
11
  from .tokenizer import cleanup_tokens
11
12
 
12
13
 
13
- # TODO: Delete this.
14
- _MODEL_NAME_ALIASES: Dict[str, str] = {
15
- "google/t5-11b": "t5-11b",
16
- "huggingface/gpt2": "gpt2",
17
- "huggingface/santacoder": "bigcode/santacoder",
18
- "huggingface/starcoder": "bigcode/starcoder",
19
- }
20
- """Mapping of some HELM model names to Hugging Face pretrained model name."""
14
+ WrappedPreTrainedTokenizer = ThreadSafeWrapper[PreTrainedTokenizerBase]
15
+ """Thread safe wrapper around Hugging Face PreTrainedTokenizerBase.
21
16
 
17
+ Hugging Face PreTrainedTokenizerBase is thread-hostile and using it from multiple threads
18
+ simultaneously can result in an "Already borrowed" error (#1421). This wrapper ensures
19
+ that a lock is held when using the PreTrainedTokenizerBase.
22
20
 
23
- # TODO: Delete this.
24
- def resolve_alias(model_name: str) -> str:
25
- """Resolve some HELM model names to Hugging Face pretrained model name."""
26
- return _MODEL_NAME_ALIASES.get(model_name, model_name)
21
+ Example usage:
22
+
23
+ with wrapped_tokenizer as tokenizer:
24
+ tokenizer.encode("...")
25
+ """
27
26
 
28
27
 
29
28
  class HuggingFaceTokenizer(CachingTokenizer):
30
- _tokenizers: Dict[str, PreTrainedTokenizerBase] = {}
29
+ _tokenizers: Dict[str, WrappedPreTrainedTokenizer] = {}
31
30
  _tokenizers_lock: Lock = Lock()
32
31
 
33
- def __init__(
34
- self,
35
- cache_config: CacheConfig,
36
- pretrained_model_name_or_path: Optional[str] = None,
37
- revision: Optional[str] = None,
38
- ):
32
+ def __init__(self, cache_config: CacheConfig, pretrained_model_name_or_path: Optional[str] = None, **kwargs):
39
33
  super().__init__(cache_config=cache_config)
40
34
  self._pretrained_model_name_or_path = pretrained_model_name_or_path
41
- self._revision = revision
35
+ self._kwargs = kwargs
42
36
 
43
37
  @staticmethod
44
- def create_tokenizer(pretrained_model_name_or_path: str, revision: Optional[str] = None) -> PreTrainedTokenizerBase:
38
+ def create_tokenizer(pretrained_model_name_or_path: str, **kwargs) -> WrappedPreTrainedTokenizer:
45
39
  """Loads tokenizer using files from disk if they exist. Otherwise, downloads from HuggingFace."""
46
40
  # To avoid deadlocks when using HuggingFace tokenizers with multiple processes
47
41
  # TODO: Figure out if we actually need this.
48
42
  os.environ["TOKENIZERS_PARALLELISM"] = "False"
49
43
 
50
- tokenizer_kwargs = {}
51
- if revision is not None:
52
- tokenizer_kwargs["revision"] = revision
53
44
  try:
54
45
  # From the Hugging Face documentation, "local_files_only(defaults to False) —
55
46
  # Whether or not to only look at local files".
@@ -60,19 +51,23 @@ class HuggingFaceTokenizer(CachingTokenizer):
60
51
  # From https://huggingface.co/course/chapter6/3, "slow tokenizers are those written in Python inside
61
52
  # the Hugging Face Transformers library, while the fast versions are the ones provided by Hugging Face
62
53
  # Tokenizers, which are written in Rust." So, use the "fast" version of the tokenizers if available.
63
- return AutoTokenizer.from_pretrained(
64
- pretrained_model_name_or_path, local_files_only=True, use_fast=True, **tokenizer_kwargs
54
+ return WrappedPreTrainedTokenizer(
55
+ AutoTokenizer.from_pretrained(
56
+ pretrained_model_name_or_path, local_files_only=True, use_fast=True, **kwargs
57
+ )
65
58
  )
66
59
  except OSError:
67
60
  hlog(f"Local files do not exist for HuggingFace tokenizer: {pretrained_model_name_or_path}. Downloading...")
68
- return AutoTokenizer.from_pretrained(
69
- pretrained_model_name_or_path, local_files_only=False, use_fast=True, **tokenizer_kwargs
61
+ return WrappedPreTrainedTokenizer(
62
+ AutoTokenizer.from_pretrained(
63
+ pretrained_model_name_or_path, local_files_only=False, use_fast=True, **kwargs
64
+ )
70
65
  )
71
66
 
72
67
  @staticmethod
73
68
  def get_tokenizer(
74
- helm_tokenizer_name: str, pretrained_model_name_or_path: str, revision: Optional[str] = None
75
- ) -> PreTrainedTokenizerBase:
69
+ helm_tokenizer_name: str, pretrained_model_name_or_path: str, **kwargs
70
+ ) -> WrappedPreTrainedTokenizer:
76
71
  """
77
72
  Checks if the desired tokenizer is cached. Creates the tokenizer if it's not cached.
78
73
  Returns the tokenizer.
@@ -80,42 +75,39 @@ class HuggingFaceTokenizer(CachingTokenizer):
80
75
  with HuggingFaceTokenizer._tokenizers_lock:
81
76
  if helm_tokenizer_name not in HuggingFaceTokenizer._tokenizers:
82
77
  with htrack_block(
83
- f"Loading {pretrained_model_name_or_path} (revision={revision}) "
78
+ f"Loading {pretrained_model_name_or_path} (kwargs={kwargs}) "
84
79
  f"for HELM tokenizer {helm_tokenizer_name} with Hugging Face Transformers"
85
80
  ):
86
81
  # Keep the tokenizer in memory, so we don't recreate it for future requests
87
82
  HuggingFaceTokenizer._tokenizers[helm_tokenizer_name] = HuggingFaceTokenizer.create_tokenizer(
88
- pretrained_model_name_or_path, revision
83
+ pretrained_model_name_or_path, **kwargs
89
84
  )
90
85
  return HuggingFaceTokenizer._tokenizers[helm_tokenizer_name]
91
86
 
92
- def _get_tokenizer_for_request(self, request: Dict[str, Any]) -> PreTrainedTokenizerBase:
87
+ def _get_tokenizer_for_request(self, request: Dict[str, Any]) -> WrappedPreTrainedTokenizer:
93
88
  """Method used in both _tokenize_do_it and _decode_do_it to get the tokenizer."""
94
- pretrained_model_name_or_path: str
95
- if self._pretrained_model_name_or_path:
96
- pretrained_model_name_or_path = self._pretrained_model_name_or_path
97
- else:
98
- pretrained_model_name_or_path = resolve_alias(request["tokenizer"])
99
- _tokenizer = HuggingFaceTokenizer.get_tokenizer(
89
+ pretrained_model_name_or_path = (
90
+ self._pretrained_model_name_or_path if self._pretrained_model_name_or_path else request["tokenizer"]
91
+ )
92
+ return HuggingFaceTokenizer.get_tokenizer(
100
93
  helm_tokenizer_name=request["tokenizer"],
101
94
  pretrained_model_name_or_path=pretrained_model_name_or_path,
102
- revision=self._revision,
95
+ **self._kwargs,
103
96
  )
104
- return _tokenizer
105
97
 
106
98
  def _tokenize_do_it(self, request: Dict[str, Any]) -> Dict[str, Any]:
107
- _tokenizer = self._get_tokenizer_for_request(request)
108
-
109
99
  if request["encode"]:
110
100
  if request["truncation"]:
111
- tokens = _tokenizer.encode(
112
- request["text"],
113
- truncation=request["truncation"],
114
- max_length=request["max_length"],
115
- add_special_tokens=False,
116
- )
101
+ with self._get_tokenizer_for_request(request) as tokenizer:
102
+ tokens = tokenizer.encode(
103
+ request["text"],
104
+ truncation=request["truncation"],
105
+ max_length=request["max_length"],
106
+ add_special_tokens=False,
107
+ )
117
108
  else:
118
- tokens = _tokenizer.encode(request["text"], add_special_tokens=False)
109
+ with self._get_tokenizer_for_request(request) as tokenizer:
110
+ tokens = tokenizer.encode(request["text"], add_special_tokens=False)
119
111
  else:
120
112
  if "gpt" in request["tokenizer"] or request["tokenizer"] in [
121
113
  "bigscience/bloom",
@@ -126,9 +118,10 @@ class HuggingFaceTokenizer(CachingTokenizer):
126
118
  # convert_tokens_to_string method. We prefer to use this method instead
127
119
  # of the hacky cleanup_tokens method below as it might handle cases
128
120
  # we haven't thought of in cleanup_tokens.
129
- tokens = [
130
- _tokenizer.convert_tokens_to_string([token]) for token in _tokenizer.tokenize(request["text"])
131
- ]
121
+ with self._get_tokenizer_for_request(request) as tokenizer:
122
+ tokens = [
123
+ tokenizer.convert_tokens_to_string([token]) for token in tokenizer.tokenize(request["text"])
124
+ ]
132
125
  else:
133
126
  # Tokenizes the text and returns the tokens as a list of strings,
134
127
  # not a list of token objects (otherwise "Hello world" would be"
@@ -138,14 +131,17 @@ class HuggingFaceTokenizer(CachingTokenizer):
138
131
  # But this replaces all the "▁" characters by "", which is not what we want.
139
132
  # This would be problematic as tokenize(" Hello", encode=False) would return ["Hello"]
140
133
  # Just like tokenize("Hello", encode=False) would return ["Hello"].
141
- tokens = _tokenizer.tokenize(request["text"])
134
+ with self._get_tokenizer_for_request(request) as tokenizer:
135
+ tokens = tokenizer.tokenize(request["text"])
136
+ # Some tokenizers (e.g. Qwen/Qwen-7B) return the tokens as bytes, so we have to decode them to strings.
137
+ if tokens and type(tokens[0]) == bytes:
138
+ tokens = [cast(bytes, token).decode(errors="ignore") for token in tokens]
142
139
  tokens = cleanup_tokens(tokens, request["tokenizer"])
143
140
  return {"tokens": tokens}
144
141
 
145
142
  def _decode_do_it(self, request: Dict[str, Any]) -> Dict[str, Any]:
146
- _tokenizer = self._get_tokenizer_for_request(request)
147
-
148
- text = _tokenizer.decode(
149
- request["tokens"], clean_up_tokenization_spaces=request["clean_up_tokenization_spaces"]
150
- )
143
+ with self._get_tokenizer_for_request(request) as tokenizer:
144
+ text = tokenizer.decode(
145
+ request["tokens"], clean_up_tokenization_spaces=request["clean_up_tokenization_spaces"]
146
+ )
151
147
  return {"text": text}
@@ -0,0 +1,33 @@
1
+ from helm.common.tokenization_request import (
2
+ DecodeRequest,
3
+ DecodeRequestResult,
4
+ TokenizationRequest,
5
+ TokenizationRequestResult,
6
+ TokenizationToken,
7
+ )
8
+ from helm.tokenizers.tokenizer import Tokenizer
9
+
10
+
11
+ class SimpleTokenizer(Tokenizer):
12
+ """Simple tokenizer for tutorials and for debugging."""
13
+
14
+ def tokenize(self, request: TokenizationRequest) -> TokenizationRequestResult:
15
+ if request.encode:
16
+ return TokenizationRequestResult(
17
+ success=True,
18
+ cached=False,
19
+ tokens=[TokenizationToken(ord(character)) for character in request.text],
20
+ text=request.text,
21
+ )
22
+ else:
23
+ return TokenizationRequestResult(
24
+ success=True,
25
+ cached=False,
26
+ tokens=[TokenizationToken(character) for character in request.text],
27
+ text=request.text,
28
+ )
29
+
30
+ def decode(self, request: DecodeRequest) -> DecodeRequestResult:
31
+ return DecodeRequestResult(
32
+ success=True, cached=False, text="".join([chr(code_point) for code_point in request.tokens])
33
+ )
@@ -0,0 +1,82 @@
1
+ import os
2
+ import tempfile
3
+ from typing import List
4
+
5
+ from helm.common.cache import SqliteCacheConfig
6
+ from helm.common.general import parallel_map
7
+ from helm.common.tokenization_request import (
8
+ DecodeRequest,
9
+ DecodeRequestResult,
10
+ TokenizationRequest,
11
+ TokenizationRequestResult,
12
+ )
13
+ from helm.tokenizers.anthropic_tokenizer import AnthropicTokenizer
14
+
15
+
16
+ class TestAnthropicTokenizer:
17
+ TEST_PROMPT: str = "I am a computer scientist."
18
+ TEST_ENCODED: List[int] = [45, 1413, 269, 6797, 22228, 18]
19
+ TEST_TOKENS: List[str] = ["I", " am", " a", " computer", " scientist", "."]
20
+
21
+ def setup_method(self, method):
22
+ cache_file = tempfile.NamedTemporaryFile(delete=False)
23
+ self.cache_path: str = cache_file.name
24
+ self.tokenizer = AnthropicTokenizer(SqliteCacheConfig(self.cache_path))
25
+
26
+ def teardown_method(self, method):
27
+ os.remove(self.cache_path)
28
+
29
+ def test_tokenize(self):
30
+ request = TokenizationRequest(text=self.TEST_PROMPT, tokenizer="anthropic/claude")
31
+ result: TokenizationRequestResult = self.tokenizer.tokenize(request)
32
+ assert not result.cached, "First time making the tokenize request. Result should not be cached"
33
+ assert result.raw_tokens == self.TEST_TOKENS
34
+ result = self.tokenizer.tokenize(request)
35
+ assert result.cached, "Result should be cached"
36
+ assert result.raw_tokens == self.TEST_TOKENS
37
+
38
+ def test_encode(self):
39
+ request = TokenizationRequest(
40
+ text=self.TEST_PROMPT, tokenizer="anthropic/claude", encode=True, truncation=True, max_length=1
41
+ )
42
+ result: TokenizationRequestResult = self.tokenizer.tokenize(request)
43
+ assert not result.cached, "First time making the tokenize request. Result should not be cached"
44
+ assert result.raw_tokens == [self.TEST_ENCODED[0]]
45
+ result = self.tokenizer.tokenize(request)
46
+ assert result.cached, "Result should be cached"
47
+ assert result.raw_tokens == [self.TEST_ENCODED[0]]
48
+
49
+ request = TokenizationRequest(
50
+ text=self.TEST_PROMPT, tokenizer="anthropic/claude", encode=True, truncation=True, max_length=1024
51
+ )
52
+ result = self.tokenizer.tokenize(request)
53
+ assert not result.cached, "First time making this particular request. Result should not be cached"
54
+ assert result.raw_tokens == self.TEST_ENCODED
55
+
56
+ def test_decode(self):
57
+ request = DecodeRequest(tokens=self.TEST_ENCODED, tokenizer="anthropic/claude")
58
+ result: DecodeRequestResult = self.tokenizer.decode(request)
59
+ assert not result.cached, "First time making the decode request. Result should not be cached"
60
+ assert result.text == self.TEST_PROMPT
61
+ result = self.tokenizer.decode(request)
62
+ assert result.cached, "Result should be cached"
63
+ assert result.text == self.TEST_PROMPT
64
+
65
+ def test_already_borrowed(self):
66
+ """Test workaround of the "Already borrowed" bug (#1421) caused by the thread-hostile Anthropic tokenizer,
67
+ which is a thin wrapper around a Hugging Face FastTokenizer"""
68
+
69
+ def make_tokenize_request(seed: int) -> None:
70
+ request_length = 10
71
+ truncation = bool(seed % 2)
72
+ self.tokenizer.tokenize(
73
+ # The truncation parameter requires setting a flag on the Rust FastTokenizer.
74
+                 # Concurrent requests cause concurrent mutations, which results in a Rust concurrency error.
75
+ TokenizationRequest(
76
+ text=str(seed) * request_length, tokenizer="anthropic/claude", encode=True, truncation=truncation
77
+ )
78
+ )
79
+
80
+ num_requests = 100
81
+ # Should not raise "Already borrowed" error
82
+ parallel_map(make_tokenize_request, list(range(num_requests)), parallelism=8)
@@ -0,0 +1,136 @@
1
+ import os
2
+ import tempfile
3
+ from typing import Optional
4
+
5
+ from helm.common.cache import SqliteCacheConfig
6
+ from helm.common.general import parallel_map, singleton
7
+ from helm.common.tokenization_request import (
8
+ DecodeRequest,
9
+ DecodeRequestResult,
10
+ TokenizationRequest,
11
+ TokenizationRequestResult,
12
+ )
13
+ from .huggingface_tokenizer import HuggingFaceTokenizer
14
+
15
+
16
+ class TestHuggingFaceGPT2Tokenizer:
17
+ def setup_method(self, method):
18
+ cache_file = tempfile.NamedTemporaryFile(delete=False)
19
+ self.cache_path: str = cache_file.name
20
+ self.tokenizer = HuggingFaceTokenizer(SqliteCacheConfig(self.cache_path))
21
+
22
+ def teardown_method(self, method):
23
+ os.remove(self.cache_path)
24
+
25
+ def test_tokenize(self):
26
+ request = TokenizationRequest(text="I am a computer scientist.", tokenizer="huggingface/gpt2")
27
+ result: TokenizationRequestResult = self.tokenizer.tokenize(request)
28
+ assert not result.cached, "First time making the tokenize request. Result should not be cached"
29
+ result = self.tokenizer.tokenize(request)
30
+ assert result.cached, "Result should be cached"
31
+ assert result.raw_tokens == ["I", " am", " a", " computer", " scientist", "."]
32
+
33
+ def test_encode(self):
34
+ request = TokenizationRequest(
35
+ text="I am a computer scientist.", tokenizer="huggingface/gpt2", encode=True, truncation=True, max_length=1
36
+ )
37
+ result: TokenizationRequestResult = self.tokenizer.tokenize(request)
38
+ assert not result.cached, "First time making the tokenize request. Result should not be cached"
39
+ result = self.tokenizer.tokenize(request)
40
+ assert result.cached, "Result should be cached"
41
+ assert result.raw_tokens == [40]
42
+
43
+ request = TokenizationRequest(
44
+ text="I am a computer scientist.",
45
+ tokenizer="huggingface/gpt2",
46
+ encode=True,
47
+ truncation=True,
48
+ max_length=1024,
49
+ )
50
+ result = self.tokenizer.tokenize(request)
51
+ assert not result.cached, "First time making this particular request. Result should not be cached"
52
+ assert result.raw_tokens == [40, 716, 257, 3644, 11444, 13]
53
+
54
+ def test_decode(self):
55
+ request = DecodeRequest(tokens=[40, 716, 257, 3644, 11444, 13], tokenizer="huggingface/gpt2")
56
+ result: DecodeRequestResult = self.tokenizer.decode(request)
57
+ assert not result.cached, "First time making the decode request. Result should not be cached"
58
+ result = self.tokenizer.decode(request)
59
+ assert result.cached, "Result should be cached"
60
+ assert result.text == "I am a computer scientist."
61
+
62
+ def test_already_borrowed(self):
63
+ """Test workaround of the "Already borrowed" bug (#1421) caused by the thread-hostile Hugging Face tokenizer"""
64
+
65
+ def make_tokenize_request(seed: int) -> None:
66
+ request_length = 10
67
+ truncation = bool(seed % 2)
68
+ self.tokenizer.tokenize(
69
+ # The truncation parameter requires setting a flag on the Rust FastTokenizer.
70
+                 # Concurrent requests cause concurrent mutations, which results in a Rust concurrency error.
71
+ TokenizationRequest(
72
+ text=str(seed) * request_length, tokenizer="huggingface/gpt2", encode=True, truncation=truncation
73
+ )
74
+ )
75
+
76
+ num_requests = 100
77
+ # Should not raise "Already borrowed" error
78
+ parallel_map(make_tokenize_request, list(range(num_requests)), parallelism=8)
79
+
80
+
81
+ class TestHuggingFaceTokenizer:
82
+ # The following prompt has 51 tokens according to the GPT-2 tokenizer
83
+ TEST_PROMPT: str = (
84
+ "The Center for Research on Foundation Models (CRFM) is "
85
+ "an interdisciplinary initiative born out of the Stanford "
86
+ "Institute for Human-Centered Artificial Intelligence (HAI) "
87
+ "that aims to make fundamental advances in the study, development, "
88
+ "and deployment of foundation models."
89
+ )
90
+
91
+ @staticmethod
92
+ def verify_get_tokenizer(
93
+ tokenizer_name: str, expected_num_tokens: int, pretrained_model_name_or_path: Optional[str] = None
94
+ ):
95
+ wrapped_tokenizer = HuggingFaceTokenizer.get_tokenizer(
96
+ helm_tokenizer_name=tokenizer_name,
97
+ pretrained_model_name_or_path=pretrained_model_name_or_path or tokenizer_name,
98
+ )
99
+ assert tokenizer_name in HuggingFaceTokenizer._tokenizers, "Tokenizer should be cached"
100
+ with wrapped_tokenizer as tokenizer:
101
+ assert len(tokenizer.encode(TestHuggingFaceTokenizer.TEST_PROMPT)) == expected_num_tokens
102
+
103
+ def test_get_tokenizer_gpt2(self):
104
+ TestHuggingFaceTokenizer.verify_get_tokenizer("huggingface/gpt2", 51, pretrained_model_name_or_path="gpt2")
105
+
106
+ def test_get_tokenizer_gptj(self):
107
+ TestHuggingFaceTokenizer.verify_get_tokenizer("EleutherAI/gpt-j-6B", 51)
108
+
109
+ def test_get_tokenizer_gptneox(self):
110
+ TestHuggingFaceTokenizer.verify_get_tokenizer("EleutherAI/gpt-neox-20b", 52)
111
+
112
+ def test_get_tokenizer_bloom(self):
113
+ TestHuggingFaceTokenizer.verify_get_tokenizer("bigscience/bloom", 51)
114
+
115
+ def test_get_tokenizer_t0pp(self):
116
+ TestHuggingFaceTokenizer.verify_get_tokenizer("bigscience/T0pp", 58)
117
+
118
+ def test_get_tokenizer_t511b(self):
119
+ TestHuggingFaceTokenizer.verify_get_tokenizer("google/t5-11b", 58, pretrained_model_name_or_path="t5-11b")
120
+
121
+ def test_get_tokenizer_ul2(self):
122
+ TestHuggingFaceTokenizer.verify_get_tokenizer("google/ul2", 58)
123
+
124
+ def test_get_santacoder(self):
125
+ TestHuggingFaceTokenizer.verify_get_tokenizer("bigcode/santacoder", 62)
126
+
127
+ def test_get_clip_tokenizer(self):
128
+ TestHuggingFaceTokenizer.verify_get_tokenizer("openai/clip-vit-large-patch14", 50)
129
+
130
+ def test_gpt2_tokenize_eos(self):
131
+ eos_token: str = "<|endoftext|>"
132
+ wrapped_tokenizer = HuggingFaceTokenizer.get_tokenizer("huggingface/gpt2", pretrained_model_name_or_path="gpt2")
133
+ with wrapped_tokenizer as tokenizer:
134
+ token_ids = tokenizer.encode(eos_token)
135
+ assert singleton(token_ids) == 50256
136
+ assert tokenizer.decode(token_ids) == eos_token
@@ -0,0 +1,33 @@
1
+ from helm.common.tokenization_request import (
2
+ DecodeRequest,
3
+ TokenizationRequest,
4
+ TokenizationToken,
5
+ )
6
+ from helm.tokenizers.simple_tokenizer import SimpleTokenizer
7
+
8
+
9
+ def test_simple_tokenizer_tokenize():
10
+ tokenizer = SimpleTokenizer()
11
+ request = TokenizationRequest(tokenizer="simple/tokenizer1", text="otter 🦦")
12
+ result = tokenizer.tokenize(request)
13
+ assert result.success
14
+ assert not result.cached
15
+ assert result.tokens == [TokenizationToken(token) for token in ["o", "t", "t", "e", "r", " ", "🦦"]]
16
+
17
+
18
+ def test_simple_tokenizer_encode():
19
+ tokenizer = SimpleTokenizer()
20
+ request = TokenizationRequest(tokenizer="simple/tokenizer1", text="otter 🦦", encode=True)
21
+ result = tokenizer.tokenize(request)
22
+ assert result.success
23
+ assert not result.cached
24
+ assert result.tokens == [TokenizationToken(token) for token in [111, 116, 116, 101, 114, 32, 129446]]
25
+
26
+
27
+ def test_simple_tokenizer_decode():
28
+ tokenizer = SimpleTokenizer()
29
+ request = DecodeRequest(tokenizer="simple/tokenizer1", tokens=[111, 116, 116, 101, 114, 32, 129446])
30
+ result = tokenizer.decode(request)
31
+ assert result.success
32
+ assert not result.cached
33
+ assert result.text == "otter 🦦"
@@ -0,0 +1,97 @@
1
+ import base64
2
+ import dataclasses
3
+ import requests
4
+ from typing import Any, Dict, List, Union, Optional
5
+
6
+ from helm.common.cache import CacheConfig
7
+ from helm.common.optional_dependencies import handle_module_not_found_error
8
+ from helm.common.tokenization_request import (
9
+ TokenizationRequest,
10
+ TokenizationToken,
11
+ )
12
+ from helm.tokenizers.caching_tokenizer import CachingTokenizer
13
+ from helm.proxy.retry import NonRetriableException
14
+
15
+ try:
16
+ import google.auth
17
+ import google.auth.transport.requests
18
+ from google.auth.exceptions import DefaultCredentialsError
19
+ except ModuleNotFoundError as e:
20
+ handle_module_not_found_error(e, ["google"])
21
+
22
+
23
+ class VertexAIAuthenticationException(NonRetriableException):
24
+ pass
25
+
26
+
27
+ class VertexAITokenizer(CachingTokenizer):
28
+ """Google Vertex AI API for tokenization.
29
+
30
+ Doc: https://cloud.google.com/vertex-ai/docs/generative-ai/compute-token"""
31
+
32
+ def __init__(self, project_id: Optional[str], location: Optional[str], cache_config: CacheConfig) -> None:
33
+ super().__init__(cache_config)
34
+ if not project_id:
35
+ raise VertexAIAuthenticationException("credentials.conf is missing googleProjectId")
36
+ if not location:
37
+ raise VertexAIAuthenticationException("credentials.conf is missing googleLocation")
38
+ self.project_id = project_id
39
+ self.location = location
40
+ try:
41
+ creds, _ = google.auth.default(quota_project_id=self.project_id)
42
+ auth_req = google.auth.transport.requests.Request()
43
+ creds.refresh(auth_req)
44
+ except DefaultCredentialsError as e:
45
+ raise VertexAIAuthenticationException(
46
+ "Log in using `gcloud auth application-default login` to use the Google Vertex tokenizer API"
47
+ ) from e
48
+ self.access_token = creds.token
49
+
50
+ def _tokenization_request_to_cache_key(self, request: TokenizationRequest) -> Dict[str, Any]:
51
+ cache_key = dataclasses.asdict(request)
52
+ # Delete encode because the Google Vertex AI API simulateously gives string and integer tokens.
53
+ del cache_key["encode"]
54
+ return cache_key
55
+
56
+ def _tokenize_do_it(self, request: Dict[str, Any]) -> Dict[str, Any]:
57
+ text: str = request["text"]
58
+ tokenizer_name = request["tokenizer"].split("/", maxsplit=1)[1]
59
+ url = (
60
+ f"https://{self.location}-aiplatform.googleapis.com/v1/projects/{self.project_id}/"
61
+ f"locations/{self.location}/publishers/google/models/{tokenizer_name}:computeTokens"
62
+ )
63
+
64
+ headers = {"Authorization": f"Bearer {self.access_token}"}
65
+ body = {
66
+ "instances": [{"prompt": text}],
67
+ }
68
+ response = requests.post(url, headers=headers, json=body)
69
+ response.raise_for_status()
70
+ return response.json()
71
+
72
+ def _tokenization_raw_response_to_tokens(
73
+ self, response: Dict[str, Any], request: TokenizationRequest
74
+ ) -> List[TokenizationToken]:
75
+ tokens: List[Union[int, str]]
76
+ response_instance = response["tokensInfo"][0]
77
+ if not response_instance:
78
+ # Response was empty
79
+ tokens = []
80
+ else:
81
+ if request.encode:
82
+ tokens = [int(token) for token in response_instance["tokenIds"]]
83
+ else:
84
+ # errors="ignore" is needed because the tokenizer is not guaranteed to tokenize on
85
+ # the boundary of UTF-8 characters. The tokenization boundary can be within the bytes of
86
+ # a UTF-8 character.
87
+ #
88
+ # TODO(#2141): Come up with a more correct way of doing this.
89
+ tokens = [
90
+ base64.decodebytes(token.encode()).decode("utf-8", errors="ignore")
91
+ for token in response_instance["tokens"]
92
+ ]
93
+ return [TokenizationToken(token) for token in tokens]
94
+
95
+ def _decode_do_it(self, request: Dict[str, Any]) -> Dict[str, Any]:
96
+ # Defined for mypy but decode() already raises NotImplementedError
97
+ raise NotImplementedError("The Google Vertex AI API does not support decoding.")
@@ -21,9 +21,11 @@ class YaLMTokenizer(CachingTokenizer):
21
21
  # This is a problem because then tokenize(" Hello", encode=False) == tokenize("Hello", encode=False)
22
22
  # That is why we manually replace "▁" with a space.
23
23
  return {
24
- "tokens": token_ids
25
- if request["encode"]
26
- else cleanup_tokens(self._tokenizer.convert_ids_to_tokens(token_ids), request["tokenizer"])
24
+ "tokens": (
25
+ token_ids
26
+ if request["encode"]
27
+ else cleanup_tokens(self._tokenizer.convert_ids_to_tokens(token_ids), request["tokenizer"])
28
+ )
27
29
  }
28
30
 
29
31
  def _decode_do_it(self, request: Dict[str, Any]) -> Dict[str, Any]:
File without changes
@@ -16,7 +16,7 @@ adapted from https://github.com/yandex/YaLM-100B/blob/main/megatron_lm/megatron/
16
16
  """
17
17
 
18
18
 
19
- YALM_TOKENIZER_PACKAGE: str = "helm.proxy.tokenizers.yalm_tokenizer_data"
19
+ YALM_TOKENIZER_PACKAGE: str = "helm.tokenizers.yalm_tokenizer_data"
20
20
  YALM_TOKENIZER_VOCAB_FILENAME: str = "voc_100b.sp"
21
21
 
22
22