langfun 0.1.1.dev20240822__tar.gz → 0.1.1.dev20240825__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (131)
  1. {langfun-0.1.1.dev20240822 → langfun-0.1.1.dev20240825}/PKG-INFO +1 -1
  2. {langfun-0.1.1.dev20240822 → langfun-0.1.1.dev20240825}/langfun/core/language_model.py +66 -0
  3. {langfun-0.1.1.dev20240822 → langfun-0.1.1.dev20240825}/langfun/core/language_model_test.py +66 -0
  4. {langfun-0.1.1.dev20240822 → langfun-0.1.1.dev20240825}/langfun/core/llms/fake.py +3 -0
  5. {langfun-0.1.1.dev20240822 → langfun-0.1.1.dev20240825}/langfun/core/llms/fake_test.py +7 -0
  6. {langfun-0.1.1.dev20240822 → langfun-0.1.1.dev20240825}/langfun/core/structured/__init__.py +2 -0
  7. langfun-0.1.1.dev20240825/langfun/core/structured/tokenization.py +64 -0
  8. langfun-0.1.1.dev20240825/langfun/core/structured/tokenization_test.py +48 -0
  9. {langfun-0.1.1.dev20240822 → langfun-0.1.1.dev20240825}/langfun.egg-info/PKG-INFO +1 -1
  10. {langfun-0.1.1.dev20240822 → langfun-0.1.1.dev20240825}/langfun.egg-info/SOURCES.txt +2 -0
  11. {langfun-0.1.1.dev20240822 → langfun-0.1.1.dev20240825}/LICENSE +0 -0
  12. {langfun-0.1.1.dev20240822 → langfun-0.1.1.dev20240825}/README.md +0 -0
  13. {langfun-0.1.1.dev20240822 → langfun-0.1.1.dev20240825}/langfun/__init__.py +0 -0
  14. {langfun-0.1.1.dev20240822 → langfun-0.1.1.dev20240825}/langfun/core/__init__.py +0 -0
  15. {langfun-0.1.1.dev20240822 → langfun-0.1.1.dev20240825}/langfun/core/coding/__init__.py +0 -0
  16. {langfun-0.1.1.dev20240822 → langfun-0.1.1.dev20240825}/langfun/core/coding/python/__init__.py +0 -0
  17. {langfun-0.1.1.dev20240822 → langfun-0.1.1.dev20240825}/langfun/core/coding/python/correction.py +0 -0
  18. {langfun-0.1.1.dev20240822 → langfun-0.1.1.dev20240825}/langfun/core/coding/python/correction_test.py +0 -0
  19. {langfun-0.1.1.dev20240822 → langfun-0.1.1.dev20240825}/langfun/core/coding/python/errors.py +0 -0
  20. {langfun-0.1.1.dev20240822 → langfun-0.1.1.dev20240825}/langfun/core/coding/python/errors_test.py +0 -0
  21. {langfun-0.1.1.dev20240822 → langfun-0.1.1.dev20240825}/langfun/core/coding/python/execution.py +0 -0
  22. {langfun-0.1.1.dev20240822 → langfun-0.1.1.dev20240825}/langfun/core/coding/python/execution_test.py +0 -0
  23. {langfun-0.1.1.dev20240822 → langfun-0.1.1.dev20240825}/langfun/core/coding/python/generation.py +0 -0
  24. {langfun-0.1.1.dev20240822 → langfun-0.1.1.dev20240825}/langfun/core/coding/python/generation_test.py +0 -0
  25. {langfun-0.1.1.dev20240822 → langfun-0.1.1.dev20240825}/langfun/core/coding/python/parsing.py +0 -0
  26. {langfun-0.1.1.dev20240822 → langfun-0.1.1.dev20240825}/langfun/core/coding/python/parsing_test.py +0 -0
  27. {langfun-0.1.1.dev20240822 → langfun-0.1.1.dev20240825}/langfun/core/coding/python/permissions.py +0 -0
  28. {langfun-0.1.1.dev20240822 → langfun-0.1.1.dev20240825}/langfun/core/coding/python/permissions_test.py +0 -0
  29. {langfun-0.1.1.dev20240822 → langfun-0.1.1.dev20240825}/langfun/core/component.py +0 -0
  30. {langfun-0.1.1.dev20240822 → langfun-0.1.1.dev20240825}/langfun/core/component_test.py +0 -0
  31. {langfun-0.1.1.dev20240822 → langfun-0.1.1.dev20240825}/langfun/core/concurrent.py +0 -0
  32. {langfun-0.1.1.dev20240822 → langfun-0.1.1.dev20240825}/langfun/core/concurrent_test.py +0 -0
  33. {langfun-0.1.1.dev20240822 → langfun-0.1.1.dev20240825}/langfun/core/console.py +0 -0
  34. {langfun-0.1.1.dev20240822 → langfun-0.1.1.dev20240825}/langfun/core/console_test.py +0 -0
  35. {langfun-0.1.1.dev20240822 → langfun-0.1.1.dev20240825}/langfun/core/eval/__init__.py +0 -0
  36. {langfun-0.1.1.dev20240822 → langfun-0.1.1.dev20240825}/langfun/core/eval/base.py +0 -0
  37. {langfun-0.1.1.dev20240822 → langfun-0.1.1.dev20240825}/langfun/core/eval/base_test.py +0 -0
  38. {langfun-0.1.1.dev20240822 → langfun-0.1.1.dev20240825}/langfun/core/eval/matching.py +0 -0
  39. {langfun-0.1.1.dev20240822 → langfun-0.1.1.dev20240825}/langfun/core/eval/matching_test.py +0 -0
  40. {langfun-0.1.1.dev20240822 → langfun-0.1.1.dev20240825}/langfun/core/eval/patching.py +0 -0
  41. {langfun-0.1.1.dev20240822 → langfun-0.1.1.dev20240825}/langfun/core/eval/patching_test.py +0 -0
  42. {langfun-0.1.1.dev20240822 → langfun-0.1.1.dev20240825}/langfun/core/eval/scoring.py +0 -0
  43. {langfun-0.1.1.dev20240822 → langfun-0.1.1.dev20240825}/langfun/core/eval/scoring_test.py +0 -0
  44. {langfun-0.1.1.dev20240822 → langfun-0.1.1.dev20240825}/langfun/core/langfunc.py +0 -0
  45. {langfun-0.1.1.dev20240822 → langfun-0.1.1.dev20240825}/langfun/core/langfunc_test.py +0 -0
  46. {langfun-0.1.1.dev20240822 → langfun-0.1.1.dev20240825}/langfun/core/llms/__init__.py +0 -0
  47. {langfun-0.1.1.dev20240822 → langfun-0.1.1.dev20240825}/langfun/core/llms/anthropic.py +0 -0
  48. {langfun-0.1.1.dev20240822 → langfun-0.1.1.dev20240825}/langfun/core/llms/anthropic_test.py +0 -0
  49. {langfun-0.1.1.dev20240822 → langfun-0.1.1.dev20240825}/langfun/core/llms/cache/__init__.py +0 -0
  50. {langfun-0.1.1.dev20240822 → langfun-0.1.1.dev20240825}/langfun/core/llms/cache/base.py +0 -0
  51. {langfun-0.1.1.dev20240822 → langfun-0.1.1.dev20240825}/langfun/core/llms/cache/in_memory.py +0 -0
  52. {langfun-0.1.1.dev20240822 → langfun-0.1.1.dev20240825}/langfun/core/llms/cache/in_memory_test.py +0 -0
  53. {langfun-0.1.1.dev20240822 → langfun-0.1.1.dev20240825}/langfun/core/llms/google_genai.py +0 -0
  54. {langfun-0.1.1.dev20240822 → langfun-0.1.1.dev20240825}/langfun/core/llms/google_genai_test.py +0 -0
  55. {langfun-0.1.1.dev20240822 → langfun-0.1.1.dev20240825}/langfun/core/llms/groq.py +0 -0
  56. {langfun-0.1.1.dev20240822 → langfun-0.1.1.dev20240825}/langfun/core/llms/groq_test.py +0 -0
  57. {langfun-0.1.1.dev20240822 → langfun-0.1.1.dev20240825}/langfun/core/llms/llama_cpp.py +0 -0
  58. {langfun-0.1.1.dev20240822 → langfun-0.1.1.dev20240825}/langfun/core/llms/llama_cpp_test.py +0 -0
  59. {langfun-0.1.1.dev20240822 → langfun-0.1.1.dev20240825}/langfun/core/llms/openai.py +0 -0
  60. {langfun-0.1.1.dev20240822 → langfun-0.1.1.dev20240825}/langfun/core/llms/openai_test.py +0 -0
  61. {langfun-0.1.1.dev20240822 → langfun-0.1.1.dev20240825}/langfun/core/llms/rest.py +0 -0
  62. {langfun-0.1.1.dev20240822 → langfun-0.1.1.dev20240825}/langfun/core/llms/rest_test.py +0 -0
  63. {langfun-0.1.1.dev20240822 → langfun-0.1.1.dev20240825}/langfun/core/llms/vertexai.py +0 -0
  64. {langfun-0.1.1.dev20240822 → langfun-0.1.1.dev20240825}/langfun/core/llms/vertexai_test.py +0 -0
  65. {langfun-0.1.1.dev20240822 → langfun-0.1.1.dev20240825}/langfun/core/logging.py +0 -0
  66. {langfun-0.1.1.dev20240822 → langfun-0.1.1.dev20240825}/langfun/core/logging_test.py +0 -0
  67. {langfun-0.1.1.dev20240822 → langfun-0.1.1.dev20240825}/langfun/core/memories/__init__.py +0 -0
  68. {langfun-0.1.1.dev20240822 → langfun-0.1.1.dev20240825}/langfun/core/memories/conversation_history.py +0 -0
  69. {langfun-0.1.1.dev20240822 → langfun-0.1.1.dev20240825}/langfun/core/memories/conversation_history_test.py +0 -0
  70. {langfun-0.1.1.dev20240822 → langfun-0.1.1.dev20240825}/langfun/core/memory.py +0 -0
  71. {langfun-0.1.1.dev20240822 → langfun-0.1.1.dev20240825}/langfun/core/message.py +0 -0
  72. {langfun-0.1.1.dev20240822 → langfun-0.1.1.dev20240825}/langfun/core/message_test.py +0 -0
  73. {langfun-0.1.1.dev20240822 → langfun-0.1.1.dev20240825}/langfun/core/modalities/__init__.py +0 -0
  74. {langfun-0.1.1.dev20240822 → langfun-0.1.1.dev20240825}/langfun/core/modalities/audio.py +0 -0
  75. {langfun-0.1.1.dev20240822 → langfun-0.1.1.dev20240825}/langfun/core/modalities/audio_test.py +0 -0
  76. {langfun-0.1.1.dev20240822 → langfun-0.1.1.dev20240825}/langfun/core/modalities/image.py +0 -0
  77. {langfun-0.1.1.dev20240822 → langfun-0.1.1.dev20240825}/langfun/core/modalities/image_test.py +0 -0
  78. {langfun-0.1.1.dev20240822 → langfun-0.1.1.dev20240825}/langfun/core/modalities/mime.py +0 -0
  79. {langfun-0.1.1.dev20240822 → langfun-0.1.1.dev20240825}/langfun/core/modalities/mime_test.py +0 -0
  80. {langfun-0.1.1.dev20240822 → langfun-0.1.1.dev20240825}/langfun/core/modalities/ms_office.py +0 -0
  81. {langfun-0.1.1.dev20240822 → langfun-0.1.1.dev20240825}/langfun/core/modalities/ms_office_test.py +0 -0
  82. {langfun-0.1.1.dev20240822 → langfun-0.1.1.dev20240825}/langfun/core/modalities/pdf.py +0 -0
  83. {langfun-0.1.1.dev20240822 → langfun-0.1.1.dev20240825}/langfun/core/modalities/pdf_test.py +0 -0
  84. {langfun-0.1.1.dev20240822 → langfun-0.1.1.dev20240825}/langfun/core/modalities/video.py +0 -0
  85. {langfun-0.1.1.dev20240822 → langfun-0.1.1.dev20240825}/langfun/core/modalities/video_test.py +0 -0
  86. {langfun-0.1.1.dev20240822 → langfun-0.1.1.dev20240825}/langfun/core/modality.py +0 -0
  87. {langfun-0.1.1.dev20240822 → langfun-0.1.1.dev20240825}/langfun/core/modality_test.py +0 -0
  88. {langfun-0.1.1.dev20240822 → langfun-0.1.1.dev20240825}/langfun/core/natural_language.py +0 -0
  89. {langfun-0.1.1.dev20240822 → langfun-0.1.1.dev20240825}/langfun/core/natural_language_test.py +0 -0
  90. {langfun-0.1.1.dev20240822 → langfun-0.1.1.dev20240825}/langfun/core/repr_utils.py +0 -0
  91. {langfun-0.1.1.dev20240822 → langfun-0.1.1.dev20240825}/langfun/core/repr_utils_test.py +0 -0
  92. {langfun-0.1.1.dev20240822 → langfun-0.1.1.dev20240825}/langfun/core/sampling.py +0 -0
  93. {langfun-0.1.1.dev20240822 → langfun-0.1.1.dev20240825}/langfun/core/sampling_test.py +0 -0
  94. {langfun-0.1.1.dev20240822 → langfun-0.1.1.dev20240825}/langfun/core/structured/completion.py +0 -0
  95. {langfun-0.1.1.dev20240822 → langfun-0.1.1.dev20240825}/langfun/core/structured/completion_test.py +0 -0
  96. {langfun-0.1.1.dev20240822 → langfun-0.1.1.dev20240825}/langfun/core/structured/description.py +0 -0
  97. {langfun-0.1.1.dev20240822 → langfun-0.1.1.dev20240825}/langfun/core/structured/description_test.py +0 -0
  98. {langfun-0.1.1.dev20240822 → langfun-0.1.1.dev20240825}/langfun/core/structured/function_generation.py +0 -0
  99. {langfun-0.1.1.dev20240822 → langfun-0.1.1.dev20240825}/langfun/core/structured/function_generation_test.py +0 -0
  100. {langfun-0.1.1.dev20240822 → langfun-0.1.1.dev20240825}/langfun/core/structured/mapping.py +0 -0
  101. {langfun-0.1.1.dev20240822 → langfun-0.1.1.dev20240825}/langfun/core/structured/mapping_test.py +0 -0
  102. {langfun-0.1.1.dev20240822 → langfun-0.1.1.dev20240825}/langfun/core/structured/parsing.py +0 -0
  103. {langfun-0.1.1.dev20240822 → langfun-0.1.1.dev20240825}/langfun/core/structured/parsing_test.py +0 -0
  104. {langfun-0.1.1.dev20240822 → langfun-0.1.1.dev20240825}/langfun/core/structured/prompting.py +0 -0
  105. {langfun-0.1.1.dev20240822 → langfun-0.1.1.dev20240825}/langfun/core/structured/prompting_test.py +0 -0
  106. {langfun-0.1.1.dev20240822 → langfun-0.1.1.dev20240825}/langfun/core/structured/schema.py +0 -0
  107. {langfun-0.1.1.dev20240822 → langfun-0.1.1.dev20240825}/langfun/core/structured/schema_generation.py +0 -0
  108. {langfun-0.1.1.dev20240822 → langfun-0.1.1.dev20240825}/langfun/core/structured/schema_generation_test.py +0 -0
  109. {langfun-0.1.1.dev20240822 → langfun-0.1.1.dev20240825}/langfun/core/structured/schema_test.py +0 -0
  110. {langfun-0.1.1.dev20240822 → langfun-0.1.1.dev20240825}/langfun/core/structured/scoring.py +0 -0
  111. {langfun-0.1.1.dev20240822 → langfun-0.1.1.dev20240825}/langfun/core/structured/scoring_test.py +0 -0
  112. {langfun-0.1.1.dev20240822 → langfun-0.1.1.dev20240825}/langfun/core/subscription.py +0 -0
  113. {langfun-0.1.1.dev20240822 → langfun-0.1.1.dev20240825}/langfun/core/subscription_test.py +0 -0
  114. {langfun-0.1.1.dev20240822 → langfun-0.1.1.dev20240825}/langfun/core/template.py +0 -0
  115. {langfun-0.1.1.dev20240822 → langfun-0.1.1.dev20240825}/langfun/core/template_test.py +0 -0
  116. {langfun-0.1.1.dev20240822 → langfun-0.1.1.dev20240825}/langfun/core/templates/__init__.py +0 -0
  117. {langfun-0.1.1.dev20240822 → langfun-0.1.1.dev20240825}/langfun/core/templates/completion.py +0 -0
  118. {langfun-0.1.1.dev20240822 → langfun-0.1.1.dev20240825}/langfun/core/templates/completion_test.py +0 -0
  119. {langfun-0.1.1.dev20240822 → langfun-0.1.1.dev20240825}/langfun/core/templates/conversation.py +0 -0
  120. {langfun-0.1.1.dev20240822 → langfun-0.1.1.dev20240825}/langfun/core/templates/conversation_test.py +0 -0
  121. {langfun-0.1.1.dev20240822 → langfun-0.1.1.dev20240825}/langfun/core/templates/demonstration.py +0 -0
  122. {langfun-0.1.1.dev20240822 → langfun-0.1.1.dev20240825}/langfun/core/templates/demonstration_test.py +0 -0
  123. {langfun-0.1.1.dev20240822 → langfun-0.1.1.dev20240825}/langfun/core/templates/selfplay.py +0 -0
  124. {langfun-0.1.1.dev20240822 → langfun-0.1.1.dev20240825}/langfun/core/templates/selfplay_test.py +0 -0
  125. {langfun-0.1.1.dev20240822 → langfun-0.1.1.dev20240825}/langfun/core/text_formatting.py +0 -0
  126. {langfun-0.1.1.dev20240822 → langfun-0.1.1.dev20240825}/langfun/core/text_formatting_test.py +0 -0
  127. {langfun-0.1.1.dev20240822 → langfun-0.1.1.dev20240825}/langfun.egg-info/dependency_links.txt +0 -0
  128. {langfun-0.1.1.dev20240822 → langfun-0.1.1.dev20240825}/langfun.egg-info/requires.txt +0 -0
  129. {langfun-0.1.1.dev20240822 → langfun-0.1.1.dev20240825}/langfun.egg-info/top_level.txt +0 -0
  130. {langfun-0.1.1.dev20240822 → langfun-0.1.1.dev20240825}/setup.cfg +0 -0
  131. {langfun-0.1.1.dev20240822 → langfun-0.1.1.dev20240825}/setup.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: langfun
3
- Version: 0.1.1.dev20240822
3
+ Version: 0.1.1.dev20240825
4
4
  Summary: Langfun: Language as Functions.
5
5
  Home-page: https://github.com/google/langfun
6
6
  Author: Langfun Authors
@@ -712,6 +712,72 @@ class LanguageModel(component.Component):
712
712
  color='blue',
713
713
  )
714
714
 
715
def tokenize(
    self,
    prompt: str | message_lib.Message,
    **kwargs,
) -> list[tuple[str | bytes, int]]:
  """Tokenizes `prompt` with this model.

  Args:
    prompt: Text or message to tokenize. It is normalized through
      `message_lib.UserMessage.from_value` before being handed to
      `_tokenize`.
    **kwargs: Attribute overrides (for example `debug`) that are applied via
      `component.context` for the duration of this call.

  Returns:
    Whatever `_tokenize` returns for the normalized prompt: a list of
    `(token, token_id)` tuples.
  """
  message = message_lib.UserMessage.from_value(prompt)

  # Capture this call's index before bumping the counter so the debug output
  # is labeled with the value at entry.
  call_id = self._call_counter
  self._call_counter += 1

  with component.context(override_attrs=True, **kwargs):
    started_at = time.time()
    result = self._tokenize(message)
    # Elapsed time covers only the `_tokenize` call above.
    self._debug_tokenize(message, result, call_id, time.time() - started_at)
    return result
731
+
732
def _tokenize(
    self, prompt: message_lib.Message
) -> list[tuple[str | bytes, int]]:
  """Tokenization hook to be overridden by subclasses.

  Args:
    prompt: The message to tokenize.

  Returns:
    A list of `(token, token_id)` tuples (in overriding implementations).

  Raises:
    NotImplementedError: Always, in this default implementation.
  """
  model_name = self.__class__.__name__
  raise NotImplementedError(f'{model_name} does not support tokenization.')
739
+
740
def _debug_tokenize(
    self,
    prompt: message_lib.Message,
    tokens: list[tuple[str | bytes, int]],
    call_counter: int,
    elapse: float,
) -> None:
  """Writes debug output for a single `tokenize` call to the console.

  Which sections are written is controlled by `self.debug`, which may be a
  bool (True maps to `LMDebugMode.ALL`, False to `LMDebugMode.NONE`) or an
  `LMDebugMode` flag combination.

  Args:
    prompt: The message that was tokenized.
    tokens: The tokens returned for `prompt`.
    call_counter: Index of this call, used as the `[N]` prefix in titles.
    elapse: Seconds spent in tokenization, shown in the response title.
  """
  debug = self.debug
  if isinstance(debug, bool):
    debug = LMDebugMode.ALL if debug else LMDebugMode.NONE

  if debug & LMDebugMode.INFO:
    # No usage object is passed in for tokenization, so report it as
    # unavailable.
    self._debug_model_info(call_counter, UsageNotAvailable())

  if debug & LMDebugMode.PROMPT:
    console.write(
        prompt,
        title=f'\n[{call_counter}] PROMPT TO TOKENIZE:',
        color='green',
    )
    # Check the mapping itself: wrapping it in a one-element list (as was
    # done previously) is always truthy, which printed an empty modality
    # section for prompts that refer to no modality objects.
    referred_modalities = prompt.referred_modalities()
    if referred_modalities:
      console.write(
          pg.object_utils.kvlist_str(
              [(k, repr(v), None) for k, v in referred_modalities.items()]
          ),
          title=f'\n[{call_counter}] MODALITY OBJECTS SENT TO LM:',
          color='green',
      )

  if debug & LMDebugMode.RESPONSE:
    console.write(
        tokens,
        title=(
            f'\n[{call_counter}] {len(tokens)} TOKENS RETURNED '
            f'(in {elapse:.2f} seconds):'
        ),
        color='blue',
    )
780
+
715
781
  def rate_to_max_concurrency(
716
782
  self, requests_per_min: float = 0, tokens_per_min: float = 0
717
783
  ) -> int:
@@ -81,6 +81,13 @@ class MockScoringModel(MockModel):
81
81
  ]
82
82
 
83
83
 
84
class MockTokenizeModel(MockModel):
  """Mock model whose tokenizer splits the prompt text on single spaces."""

  def _tokenize(
      self, prompt: message_lib.Message
  ) -> list[tuple[str | bytes, int]]:
    # Pair each space-separated piece with its position in the split result.
    words = prompt.text.split(' ')
    return list(zip(words, range(len(words))))
89
+
90
+
84
91
  class LMSamplingOptionsTest(unittest.TestCase):
85
92
  """Tests for LMSamplingOptions."""
86
93
 
@@ -552,6 +559,65 @@ class LanguageModelTest(unittest.TestCase):
552
559
  with self.assertRaises(NotImplementedError):
553
560
  MockModel().score('hi', ['1', '2'])
554
561
 
562
def test_tokenize(self):
  # Verifies both the tokens returned by `tokenize` and which debug sections
  # are printed under each combination of debug flags.
  flags = lm_lib.LMDebugMode
  info_flag = flags.INFO
  prompt_flag = flags.PROMPT
  response_flag = flags.RESPONSE

  # Banner text expected on stdout for each individual debug flag.
  debug_prints = {
      info_flag: 'LM INFO',
      prompt_flag: 'PROMPT TO TOKENIZE',
      response_flag: 'TOKENS RETURNED',
  }
  # Every non-empty combination of the three flags.
  debug_modes = [
      info_flag,
      prompt_flag,
      response_flag,
      info_flag | prompt_flag,
      info_flag | response_flag,
      prompt_flag | response_flag,
      info_flag | prompt_flag | response_flag,
  ]

  class Image(modality.Modality):

    def to_bytes(self):
      return b'fake_image'

  for debug_mode in debug_modes:
    captured = io.StringIO()
    lm = MockTokenizeModel()

    with contextlib.redirect_stdout(captured):
      self.assertEqual(
          lm.tokenize(
              message_lib.UserMessage('hi <<[[image]]>>', image=Image()),
              debug=debug_mode,
          ),
          [('hi', 0), ('<<[[image]]>>', 1)],
      )
    debug_info = captured.getvalue()

    # Split the banners into those that must and must not appear for this
    # mode, walking the enum once and ignoring the NONE member.
    expected_included = []
    expected_excluded = []
    for flag in flags:
      if flag == flags.NONE:
        continue
      if flag in debug_mode:
        expected_included.append(debug_prints[flag])
      else:
        expected_excluded.append(debug_prints[flag])

    for banner in expected_included:
      self.assertIn(banner, debug_info)
    for banner in expected_excluded:
      self.assertNotIn(banner, debug_info)

    # The modality listing is part of the PROMPT section. A new model is
    # created on every iteration, and the expected call index is 0.
    if debug_mode & flags.PROMPT:
      self.assertIn('[0] MODALITY OBJECTS SENT TO LM', debug_info)
616
+
617
def test_tokenize_with_unsupported_model(self):
  # `MockModel` is expected to raise NotImplementedError from `tokenize`.
  with self.assertRaises(NotImplementedError):
    unsupported_lm = MockModel()
    unsupported_lm.tokenize('hi')
620
+
555
621
  def test_rate_to_max_concurrency_no_rpm_no_tpm(self) -> None:
556
622
  lm = MockModel()
557
623
  self.assertEqual(
@@ -25,6 +25,9 @@ class Fake(lf.LanguageModel):
25
25
  completions: list[lf.Message]):
26
26
  return [lf.LMScoringResult(score=-i * 1.0) for i in range(len(completions))]
27
27
 
28
def _tokenize(self, prompt: lf.Message) -> list[tuple[str | bytes, int]]:
  """Tokenizes by splitting the prompt text on single spaces.

  Each piece is paired with its position in the split result, which is used
  as the integer half of the returned tuple.
  """
  pieces = prompt.text.split(' ')
  return list(zip(pieces, range(len(pieces))))
30
+
28
31
  def _sample(self, prompts: list[lf.Message]) -> list[lf.LMSamplingResult]:
29
32
  results = []
30
33
  for prompt in prompts:
@@ -62,6 +62,13 @@ class EchoTest(unittest.TestCase):
62
62
  [lf.LMScoringResult(0.0), lf.LMScoringResult(-1.0)],
63
63
  )
64
64
 
65
def test_tokenize(self):
  # A one-word prompt yields a single (word, 0) tuple.
  tokens = fakelm.Echo().tokenize('hi')
  self.assertEqual(tokens, [('hi', 0)])
71
+
65
72
 
66
73
  class StaticResponseTest(unittest.TestCase):
67
74
 
@@ -77,6 +77,8 @@ from langfun.core.structured.completion import complete
77
77
 
78
78
  from langfun.core.structured.scoring import score
79
79
 
80
+ from langfun.core.structured.tokenization import tokenize
81
+
80
82
  # Expose default examples for structured operations so users could refer to
81
83
  # them.
82
84
  from langfun.core.structured.parsing import default_parse_examples
@@ -0,0 +1,64 @@
1
+ # Copyright 2023 The Langfun Authors
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ """Tokenize the prompt for `lf.query`."""
15
+
16
+ from typing import Any, Type, Union
17
+
18
+ import langfun.core as lf
19
+ from langfun.core.structured import mapping
20
+ from langfun.core.structured import prompting
21
+ from langfun.core.structured import schema as schema_lib
22
+ import pyglove as pg
23
+
24
+
25
def tokenize(
    prompt: Union[str, pg.Symbolic] | list[str | pg.Symbolic],
    schema: Union[
        schema_lib.Schema, Type[Any], list[Type[Any]], dict[str, Any], None
    ] = None,
    *,
    lm: lf.LanguageModel | None = None,
    examples: list[mapping.MappingExample] | None = None,
    protocol: schema_lib.SchemaProtocol = 'python',
    **kwargs,
) -> list[tuple[str | bytes, int]]:
  """Tokenizes the prompt that would be built for `lf.query`.

  The inputs are rendered into a message with `prompting.query_prompt`, and
  that message is passed to `lm.tokenize`.

  Args:
    prompt: The prompt(s) to render into the query message.
    schema: The schema for the query output, forwarded to
      `prompting.query_prompt`. May be None.
    lm: The language model used for tokenization. If None, the `lm` provided
      through `lf.context` is used.
    examples: Few-shot examples forwarded to `prompting.query_prompt`.
    protocol: The protocol for formulating the prompt based on objects.
    **kwargs: Keyword arguments that are referred by the prompt.

  Returns:
    A list of (text, token_id) tuples.

  Raises:
    ValueError: If `lm` is not specified and no `lm` is available from
      `lf.context`.
  """
  # Resolve the model first so that a missing `lm` is reported before any
  # prompt-rendering work is done.
  if lm is None:
    lm_override = lf.get_contextual_override('lm')
    if lm_override is None:
      raise ValueError('`lm` must be specified or provided from `lf.context`.')
    lm = lm_override.value

  input_message = prompting.query_prompt(
      prompt,
      schema,
      examples=examples,
      protocol=protocol,
      **kwargs,
  )
  return lm.tokenize(input_message)
@@ -0,0 +1,48 @@
1
+ # Copyright 2023 The Langfun Authors
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import unittest
16
+ import langfun.core as lf
17
+ from langfun.core.llms import fake
18
+ from langfun.core.structured import tokenization
19
+ import pyglove as pg
20
+
21
+
22
# Symbolic class with a single integer field, declared for this test module.
class Answer(pg.Object):
  result: int
24
+
25
+
26
class TokenizationTest(unittest.TestCase):
  """Tests for `tokenization.tokenize`."""

  def test_bad_call(self):
    # No `lm` argument and no `lf.context(lm=...)`: the call must be rejected.
    with self.assertRaisesRegex(ValueError, '`lm` must be specified'):
      tokenization.tokenize('hi')

  def test_tokenize(self):
    # The model is passed explicitly.
    tokens = tokenization.tokenize('hi', lm=fake.Echo())
    self.assertEqual(tokens, [('hi', 0)])

  def test_tokenize_with_lm_from_the_context(self):
    # The model is picked up from the enclosing `lf.context`.
    with lf.context(lm=fake.Echo()):
      self.assertEqual(tokenization.tokenize('hi'), [('hi', 0)])
45
+
46
+
47
+ if __name__ == '__main__':
48
+ unittest.main()
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: langfun
3
- Version: 0.1.1.dev20240822
3
+ Version: 0.1.1.dev20240825
4
4
  Summary: Langfun: Language as Functions.
5
5
  Home-page: https://github.com/google/langfun
6
6
  Author: Langfun Authors
@@ -116,6 +116,8 @@ langfun/core/structured/schema_generation_test.py
116
116
  langfun/core/structured/schema_test.py
117
117
  langfun/core/structured/scoring.py
118
118
  langfun/core/structured/scoring_test.py
119
+ langfun/core/structured/tokenization.py
120
+ langfun/core/structured/tokenization_test.py
119
121
  langfun/core/templates/__init__.py
120
122
  langfun/core/templates/completion.py
121
123
  langfun/core/templates/completion_test.py