langfun-0.1.1.dev20240822-py3-none-any.whl → langfun-0.1.1.dev20240825-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- langfun/core/language_model.py +66 -0
- langfun/core/language_model_test.py +66 -0
- langfun/core/llms/fake.py +3 -0
- langfun/core/llms/fake_test.py +7 -0
- langfun/core/structured/__init__.py +2 -0
- langfun/core/structured/tokenization.py +64 -0
- langfun/core/structured/tokenization_test.py +48 -0
- {langfun-0.1.1.dev20240822.dist-info → langfun-0.1.1.dev20240825.dist-info}/METADATA +1 -1
- {langfun-0.1.1.dev20240822.dist-info → langfun-0.1.1.dev20240825.dist-info}/RECORD +12 -10
- {langfun-0.1.1.dev20240822.dist-info → langfun-0.1.1.dev20240825.dist-info}/LICENSE +0 -0
- {langfun-0.1.1.dev20240822.dist-info → langfun-0.1.1.dev20240825.dist-info}/WHEEL +0 -0
- {langfun-0.1.1.dev20240822.dist-info → langfun-0.1.1.dev20240825.dist-info}/top_level.txt +0 -0
langfun/core/language_model.py
CHANGED
@@ -712,6 +712,72 @@ class LanguageModel(component.Component):
           color='blue',
       )
 
+  def tokenize(
+      self,
+      prompt: str | message_lib.Message,
+      **kwargs,
+  ) -> list[tuple[str | bytes, int]]:
+    """Tokenizes the given prompt."""
+    prompt = message_lib.UserMessage.from_value(prompt)
+    call_counter = self._call_counter
+    self._call_counter += 1
+
+    with component.context(override_attrs=True, **kwargs):
+      request_start = time.time()
+      tokens = self._tokenize(prompt)
+      elapse = time.time() - request_start
+      self._debug_tokenize(prompt, tokens, call_counter, elapse)
+    return tokens
+
+  def _tokenize(
+      self, prompt: message_lib.Message
+  ) -> list[tuple[str | bytes, int]]:
+    """Subclass to implement."""
+    raise NotImplementedError(
+        f'{self.__class__.__name__} does not support tokenization.'
+    )
+
+  def _debug_tokenize(
+      self,
+      prompt: message_lib.Message,
+      tokens: list[tuple[str | bytes, int]],
+      call_counter: int,
+      elapse: float,
+  ):
+    debug = self.debug
+    if isinstance(debug, bool):
+      debug = LMDebugMode.ALL if debug else LMDebugMode.NONE
+
+    if debug & LMDebugMode.INFO:
+      self._debug_model_info(call_counter, UsageNotAvailable())
+
+    if debug & LMDebugMode.PROMPT:
+      console.write(
+          prompt,
+          title=f'\n[{call_counter}] PROMPT TO TOKENIZE:',
+          color='green',
+      )
+      referred_modalities_lst = [prompt.referred_modalities(),]
+      if referred_modalities_lst:
+        for referred_modalities in referred_modalities_lst:
+          console.write(
+              pg.object_utils.kvlist_str(
+                  [(k, repr(v), None) for k, v in referred_modalities.items()]
+              ),
+              title=f'\n[{call_counter}] MODALITY OBJECTS SENT TO LM:',
+              color='green',
+          )
+
+    if debug & LMDebugMode.RESPONSE:
+      console.write(
+          tokens,
+          title=(
+              f'\n[{call_counter}] {len(tokens)} TOKENS RETURNED '
+              f'(in {elapse:.2f} seconds):'
+          ),
+          color='blue',
+      )
+
   def rate_to_max_concurrency(
       self, requests_per_min: float = 0, tokens_per_min: float = 0
   ) -> int:
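For context: the new public tokenize() entry point normalizes the prompt into a UserMessage, applies any per-call attribute overrides (such as debug), times the request, and delegates to the subclass hook _tokenize(), which by default raises NotImplementedError. A minimal usage sketch against the Echo fake model updated later in this diff; the printed result follows from the word-splitting _tokenize added to fake.py:

from langfun.core.llms import fake

# Echo._tokenize (added below) splits the prompt text on spaces and uses each
# word's position as its token id.
lm = fake.Echo()
print(lm.tokenize('hello world'))   # -> [('hello', 0), ('world', 1)]

# Per-call debug output works the same way as for sample() and score().
lm.tokenize('hello world', debug=True)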
langfun/core/language_model_test.py
CHANGED
@@ -81,6 +81,13 @@ class MockScoringModel(MockModel):
   ]
 
 
+class MockTokenizeModel(MockModel):
+
+  def _tokenize(
+      self, prompt: message_lib.Message) -> list[tuple[str | bytes, int]]:
+    return [(w, i) for i, w in enumerate(prompt.text.split(' '))]
+
+
 class LMSamplingOptionsTest(unittest.TestCase):
   """Tests for LMSamplingOptions."""
 
@@ -552,6 +559,65 @@ class LanguageModelTest(unittest.TestCase):
     with self.assertRaises(NotImplementedError):
       MockModel().score('hi', ['1', '2'])
 
+  def test_tokenize(self):
+    info_flag = lm_lib.LMDebugMode.INFO
+    prompt_flag = lm_lib.LMDebugMode.PROMPT
+    response_flag = lm_lib.LMDebugMode.RESPONSE
+    debug_prints = {
+        info_flag: 'LM INFO',
+        prompt_flag: 'PROMPT TO TOKENIZE',
+        response_flag: 'TOKENS RETURNED',
+    }
+    debug_modes = [
+        info_flag,
+        prompt_flag,
+        response_flag,
+        info_flag | prompt_flag,
+        info_flag | response_flag,
+        prompt_flag | response_flag,
+        info_flag | prompt_flag | response_flag,
+    ]
+
+    class Image(modality.Modality):
+      def to_bytes(self):
+        return b'fake_image'
+
+    for debug_mode in debug_modes:
+      string_io = io.StringIO()
+      lm = MockTokenizeModel()
+
+      with contextlib.redirect_stdout(string_io):
+        self.assertEqual(
+            lm.tokenize(
+                message_lib.UserMessage('hi <<[[image]]>>', image=Image()),
+                debug=debug_mode),
+            [('hi', 0), ('<<[[image]]>>', 1)],
+        )
+
+      debug_info = string_io.getvalue()
+      expected_included = [
+          debug_prints[f]
+          for f in lm_lib.LMDebugMode
+          if f != lm_lib.LMDebugMode.NONE and f in debug_mode
+      ]
+      expected_excluded = [
+          debug_prints[f]
+          for f in lm_lib.LMDebugMode
+          if f != lm_lib.LMDebugMode.NONE and f not in debug_mode
+      ]
+
+      for expected_include in expected_included:
+        self.assertIn(expected_include, debug_info)
+      for expected_exclude in expected_excluded:
+        self.assertNotIn(expected_exclude, debug_info)
+
+      if debug_mode & lm_lib.LMDebugMode.PROMPT:
+        self.assertIn('[0] MODALITY OBJECTS SENT TO LM', debug_info)
+
+  def test_tokenize_with_unsupported_model(self):
+    with self.assertRaises(NotImplementedError):
+      MockModel().tokenize('hi')
+
   def test_rate_to_max_concurrency_no_rpm_no_tpm(self) -> None:
     lm = MockModel()
     self.assertEqual(
langfun/core/llms/fake.py
CHANGED
@@ -25,6 +25,9 @@ class Fake(lf.LanguageModel):
                  completions: list[lf.Message]):
     return [lf.LMScoringResult(score=-i * 1.0) for i in range(len(completions))]
 
+  def _tokenize(self, prompt: lf.Message) -> list[tuple[str | bytes, int]]:
+    return [(w, i) for i, w in enumerate(prompt.text.split(' '))]
+
   def _sample(self, prompts: list[lf.Message]) -> list[lf.LMSamplingResult]:
     results = []
     for prompt in prompts:
langfun/core/llms/fake_test.py
CHANGED
@@ -62,6 +62,13 @@ class EchoTest(unittest.TestCase):
         [lf.LMScoringResult(0.0), lf.LMScoringResult(-1.0)],
     )
 
+  def test_tokenize(self):
+    lm = fakelm.Echo()
+    self.assertEqual(
+        lm.tokenize('hi'),
+        [('hi', 0)]
+    )
+
 
 class StaticResponseTest(unittest.TestCase):
 
langfun/core/structured/__init__.py
CHANGED
@@ -77,6 +77,8 @@ from langfun.core.structured.completion import complete
 
 from langfun.core.structured.scoring import score
 
+from langfun.core.structured.tokenization import tokenize
+
 # Expose default examples for structured operations so users could refer to
 # them.
 from langfun.core.structured.parsing import default_parse_examples
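With this export, the helper becomes importable directly from the structured package, alongside score and complete. A minimal import sketch; whether it is also re-exported from the top-level langfun namespace is not shown in this diff:

from langfun.core import structured
from langfun.core.llms import fake

# Same behavior as calling the tokenization module directly.
print(structured.tokenize('hi', lm=fake.Echo()))   # -> [('hi', 0)]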
langfun/core/structured/tokenization.py
ADDED
@@ -0,0 +1,64 @@
+# Copyright 2023 The Langfun Authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Tokenize the prompt for `lf.query`."""
+
+from typing import Any, Type, Union
+
+import langfun.core as lf
+from langfun.core.structured import mapping
+from langfun.core.structured import prompting
+from langfun.core.structured import schema as schema_lib
+import pyglove as pg
+
+
+def tokenize(
+    prompt: Union[str, pg.Symbolic] | list[str | pg.Symbolic],
+    schema: Union[
+        schema_lib.Schema, Type[Any], list[Type[Any]], dict[str, Any], None
+    ] = None,
+    *,
+    lm: lf.LanguageModel | None = None,
+    examples: list[mapping.MappingExample] | None = None,
+    protocol: schema_lib.SchemaProtocol = 'python',
+    **kwargs,
+) -> list[tuple[str | bytes, int]]:
+  """Tokenize the prompt for `lf.query`.
+
+  Args:
+    prompt: The prompt(s) based on which each completion will be scored.
+    schema: The schema as the output type. If None, it will be inferred from
+      the completions.
+    lm: The language model used for scoring.
+    examples: Fewshot exemplars used together with the prompt in getting the
+      completions.
+    protocol: The protocol for formulating the prompt based on objects.
+    **kwargs: Keyword arguments that are referred by the prompt.
+
+  Returns:
+    A list of (text, token_id) tuples.
+  """
+  input_message = prompting.query_prompt(
+      prompt,
+      schema,
+      examples=examples,
+      protocol=protocol,
+      **kwargs,
+  )
+  if lm is None:
+    lm_override = lf.get_contextual_override('lm')
+    if lm_override is None:
+      raise ValueError('`lm` must be specified or provided from `lf.context`.')
+    lm = lm_override.value
+
+  return lm.tokenize(input_message)
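In short, the helper renders the same prompt message that lf.query would send (via prompting.query_prompt), resolves the language model from the lm argument or the ambient lf.context, and returns that model's token list. A minimal usage sketch with the Echo fake model; the expected values follow the new tests below:

import langfun.core as lf
from langfun.core.llms import fake
from langfun.core.structured import tokenization

# Explicit model: a plain string with no schema tokenizes as-is.
assert tokenization.tokenize('hi', lm=fake.Echo()) == [('hi', 0)]

# The model can also be picked up from the surrounding context.
with lf.context(lm=fake.Echo()):
  tokens = tokenization.tokenize('hi')

# Passing a schema (e.g. a pg.Object subclass) tokenizes the fully rendered
# `lf.query` prompt, including the output-schema instructions.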
langfun/core/structured/tokenization_test.py
ADDED
@@ -0,0 +1,48 @@
+# Copyright 2023 The Langfun Authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import langfun.core as lf
+from langfun.core.llms import fake
+from langfun.core.structured import tokenization
+import pyglove as pg
+
+
+class Answer(pg.Object):
+  result: int
+
+
+class TokenizationTest(unittest.TestCase):
+
+  def test_bad_call(self):
+
+    with self.assertRaisesRegex(ValueError, '`lm` must be specified'):
+      tokenization.tokenize('hi')
+
+  def test_tokenize(self):
+    self.assertEqual(
+        tokenization.tokenize('hi', lm=fake.Echo()),
+        [('hi', 0)]
+    )
+
+  def test_tokenize_with_lm_from_the_context(self):
+    with lf.context(lm=fake.Echo()):
+      self.assertEqual(
+          tokenization.tokenize('hi'),
+          [('hi', 0)]
+      )
+
+
+if __name__ == '__main__':
+  unittest.main()
{langfun-0.1.1.dev20240822.dist-info → langfun-0.1.1.dev20240825.dist-info}/RECORD
CHANGED
@@ -8,8 +8,8 @@ langfun/core/console.py,sha256=bk5rNPNm9rMGW5YT2HixxU04p2umnoabn5SDz6Dqe88,2317
 langfun/core/console_test.py,sha256=5SYJdxpJGLgdSSQqqMPoA1X6jpsLD8rgcyk-EgI65oE,1077
 langfun/core/langfunc.py,sha256=RvIcRjIq0jWYRu1xim-FYe4HSrt97r3GMBO_PuagUmw,11060
 langfun/core/langfunc_test.py,sha256=lyt-UzkD8972cxZwzCkps0_RMLeSsOBrcUFIW-fB6us,8653
-langfun/core/language_model.py,sha256=
-langfun/core/language_model_test.py,sha256=
+langfun/core/language_model.py,sha256=oGni82fhYB3kUsL0okzvIXkKgXEMHVE-c0jR5LRmsIc,26039
+langfun/core/language_model_test.py,sha256=ebJ1vnaxKSKvlwi6v07yHjn91xMiDw2bQ9DBnyVorYw,23303
 langfun/core/logging.py,sha256=oDSeqGIQogZJ6xuPTcr9mkmLC2YnLP67UHtTdWbbiVY,4250
 langfun/core/logging_test.py,sha256=poSsNGKi6G9LWOcWnTY0BQjj0BtaQknH-NK6FcQrVT4,2152
 langfun/core/memory.py,sha256=f-asN1F7Vehgdn_fK84v73GrEUOxRtaW934keutTKjk,2416
@@ -55,8 +55,8 @@ langfun/core/eval/scoring_test.py,sha256=O8olHbrUEg60gMxwOkWzKBJZpZoUlmVnBANX5Se
 langfun/core/llms/__init__.py,sha256=a1AV3XWi2gY4UvmmaPP1GapaQxygA6xzQJvVQRp6EPA,4818
 langfun/core/llms/anthropic.py,sha256=Gon3fOi31RhZFgNd0ijyTnKnUdp9hrWrCoSXyO4UaLw,7316
 langfun/core/llms/anthropic_test.py,sha256=T-swuMkfnlgs8Fpif4rtXs579exGk0TsbLMirXDZCkg,5533
-langfun/core/llms/fake.py,sha256=
-langfun/core/llms/fake_test.py,sha256=
+langfun/core/llms/fake.py,sha256=gCHBYBLvBCsC78HI1hpoqXCS-p1FMTgY1P1qh_sGBPk,3070
+langfun/core/llms/fake_test.py,sha256=sIl_Mg7nFVjaN7AJhYCpA_qzDJpSnJzkazepGXpfQQg,7338
 langfun/core/llms/google_genai.py,sha256=Rl5a5CyF_6Y0BYYArKk8yMaenv1rH3MUQLy6b3dfMRI,10202
 langfun/core/llms/google_genai_test.py,sha256=iTISk3tJ4-3gjWmzcKQhEbH3ke4AkEiCu8rAGtB7SvU,7535
 langfun/core/llms/groq.py,sha256=pqtyOZ_1_OJMOg8xATWT_B_SVbuT9nMRf4VkH9GzW8g,6308
@@ -89,7 +89,7 @@ langfun/core/modalities/pdf.py,sha256=mfaeCbUA4JslFVTARiJh8hW7imvL4tLVw9gUhO5bAZ
 langfun/core/modalities/pdf_test.py,sha256=KE40zJD3Whe6ty2OULkp1J8jwLmB4ZjGXlGekluTP48,1952
 langfun/core/modalities/video.py,sha256=sKcXxbx9S1ERjH8yEzkbtySpcRJD40QiPIQiIBy-U5I,955
 langfun/core/modalities/video_test.py,sha256=GbsoefSeO7y8kCYhTtp4s9E3ah_eYrb6Z-MXpS01RFc,2046
-langfun/core/structured/__init__.py,sha256=
+langfun/core/structured/__init__.py,sha256=VeB0_yV8ZEkey8kizTmB0GdkLs_aag7D9bclP8Nntac,3835
 langfun/core/structured/completion.py,sha256=cS2PjG7sqzDu5x0xoTk8RmNcoeX55iVwH38NTefkMHg,8108
 langfun/core/structured/completion_test.py,sha256=2mUzDMKGF_WGfTtsnfmfMDx97dkJ-98y8leen__qWLA,19281
 langfun/core/structured/description.py,sha256=SXW4MJvshFjbR-0gw6rE21o6WXq12UlRXawvDBXMZFA,5211
@@ -108,6 +108,8 @@ langfun/core/structured/schema_generation_test.py,sha256=RM9s71kMNg2jTePwInkiW9f
 langfun/core/structured/schema_test.py,sha256=RjYhwTgktQgyqAjzLvo967nTiIK9KWgP-aNGg4e7ihE,25258
 langfun/core/structured/scoring.py,sha256=ae6SjLqoqsKFmcPnaJbsFmH4XFGKOQaJRjYZ1wm1Ywo,5860
 langfun/core/structured/scoring_test.py,sha256=QvlwDAzwuamKL5tCotm1L3Sx0cs3idoNK4aIEhaO4Yk,2272
+langfun/core/structured/tokenization.py,sha256=w6UeFGVcNSWJUPHdwgsKdYmiw7-k_PXX6kEv8TACPN4,2191
+langfun/core/structured/tokenization_test.py,sha256=dVW30kGYkX2HNtiRZe1oTmXFP7iIK6PrlKCttZ3QXe4,1311
 langfun/core/templates/__init__.py,sha256=bO0eMsVJbi7sxEB2YlInKRQ2EVP-RyyKUwcD-8msuN4,927
 langfun/core/templates/completion.py,sha256=mUqZHOEV3ag6-A08XghpeEltcrBvCDxXP004eDDfeag,1931
 langfun/core/templates/completion_test.py,sha256=vGnjnM38UHyVDUyaUYtmp20s9KBGOdbPVsX-H-ET11E,1636
@@ -117,8 +119,8 @@ langfun/core/templates/demonstration.py,sha256=vCrgYubdZM5Umqcgp8NUVGXgr4P_c-fik
 langfun/core/templates/demonstration_test.py,sha256=SafcDQ0WgI7pw05EmPI2S4v1t3ABKzup8jReCljHeK4,2162
 langfun/core/templates/selfplay.py,sha256=yhgrJbiYwq47TgzThmHrDQTF4nDrTI09CWGhuQPNv-s,2273
 langfun/core/templates/selfplay_test.py,sha256=rBW2Qr8yi-aWYwoTwRR-n1peKyMX9QXPZXURjLgoiRs,2264
-langfun-0.1.1.
-langfun-0.1.1.
-langfun-0.1.1.
-langfun-0.1.1.
-langfun-0.1.1.
+langfun-0.1.1.dev20240825.dist-info/LICENSE,sha256=WNHhf_5RCaeuKWyq_K39vmp9F28LxKsB4SpomwSZ2L0,11357
+langfun-0.1.1.dev20240825.dist-info/METADATA,sha256=EpsJgv9Qsf97MphfiC_IJA3ICOqNDfH8V1UI2s8RWdQ,5234
+langfun-0.1.1.dev20240825.dist-info/WHEEL,sha256=Mdi9PDNwEZptOjTlUcAth7XJDFtKrHYaQMPulZeBCiQ,91
+langfun-0.1.1.dev20240825.dist-info/top_level.txt,sha256=RhlEkHxs1qtzmmtWSwYoLVJAc1YrbPtxQ52uh8Z9VvY,8
+langfun-0.1.1.dev20240825.dist-info/RECORD,,
{langfun-0.1.1.dev20240822.dist-info → langfun-0.1.1.dev20240825.dist-info}/LICENSE
File without changes
{langfun-0.1.1.dev20240822.dist-info → langfun-0.1.1.dev20240825.dist-info}/WHEEL
File without changes
{langfun-0.1.1.dev20240822.dist-info → langfun-0.1.1.dev20240825.dist-info}/top_level.txt
File without changes