langfun 0.1.1.dev20240822__py3-none-any.whl → 0.1.1.dev20240825__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
langfun/core/language_model.py CHANGED
@@ -712,6 +712,72 @@ class LanguageModel(component.Component):
          color='blue',
      )

+  def tokenize(
+      self,
+      prompt: str | message_lib.Message,
+      **kwargs,
+  ) -> list[tuple[str | bytes, int]]:
+    """Tokenizes the given prompt."""
+    prompt = message_lib.UserMessage.from_value(prompt)
+    call_counter = self._call_counter
+    self._call_counter += 1
+
+    with component.context(override_attrs=True, **kwargs):
+      request_start = time.time()
+      tokens = self._tokenize(prompt)
+      elapse = time.time() - request_start
+      self._debug_tokenize(prompt, tokens, call_counter, elapse)
+    return tokens
+
+  def _tokenize(
+      self, prompt: message_lib.Message
+  ) -> list[tuple[str | bytes, int]]:
+    """Subclass to implement."""
+    raise NotImplementedError(
+        f'{self.__class__.__name__} does not support tokenization.'
+    )
+
+  def _debug_tokenize(
+      self,
+      prompt: message_lib.Message,
+      tokens: list[tuple[str | bytes, int]],
+      call_counter: int,
+      elapse: float,
+  ):
+    debug = self.debug
+    if isinstance(debug, bool):
+      debug = LMDebugMode.ALL if debug else LMDebugMode.NONE
+
+    if debug & LMDebugMode.INFO:
+      self._debug_model_info(call_counter, UsageNotAvailable())
+
+    if debug & LMDebugMode.PROMPT:
+      console.write(
+          prompt,
+          title=f'\n[{call_counter}] PROMPT TO TOKENIZE:',
+          color='green',
+      )
+      referred_modalities_lst = [prompt.referred_modalities(),]
+      if referred_modalities_lst:
+        for referred_modalities in referred_modalities_lst:
+          console.write(
+              pg.object_utils.kvlist_str(
+                  [(k, repr(v), None) for k, v in referred_modalities.items()]
+              ),
+              title=f'\n[{call_counter}] MODALITY OBJECTS SENT TO LM:',
+              color='green',
+          )
+
+    if debug & LMDebugMode.RESPONSE:
+      console.write(
+          tokens,
+          title=(
+              f'\n[{call_counter}] {len(tokens)} TOKENS RETURNED '
+              f'(in {elapse:.2f} seconds):'
+          ),
+          color='blue',
+      )
+
  def rate_to_max_concurrency(
      self, requests_per_min: float = 0, tokens_per_min: float = 0
  ) -> int:
langfun/core/language_model_test.py CHANGED
@@ -81,6 +81,13 @@ class MockScoringModel(MockModel):
    ]


+class MockTokenizeModel(MockModel):
+
+  def _tokenize(
+      self, prompt: message_lib.Message) -> list[tuple[str | bytes, int]]:
+    return [(w, i) for i, w in enumerate(prompt.text.split(' '))]
+
+
class LMSamplingOptionsTest(unittest.TestCase):
  """Tests for LMSamplingOptions."""

@@ -552,6 +559,65 @@ class LanguageModelTest(unittest.TestCase):
    with self.assertRaises(NotImplementedError):
      MockModel().score('hi', ['1', '2'])

+  def test_tokenize(self):
+    info_flag = lm_lib.LMDebugMode.INFO
+    prompt_flag = lm_lib.LMDebugMode.PROMPT
+    response_flag = lm_lib.LMDebugMode.RESPONSE
+    debug_prints = {
+        info_flag: 'LM INFO',
+        prompt_flag: 'PROMPT TO TOKENIZE',
+        response_flag: 'TOKENS RETURNED',
+    }
+    debug_modes = [
+        info_flag,
+        prompt_flag,
+        response_flag,
+        info_flag | prompt_flag,
+        info_flag | response_flag,
+        prompt_flag | response_flag,
+        info_flag | prompt_flag | response_flag,
+    ]
+
+    class Image(modality.Modality):
+      def to_bytes(self):
+        return b'fake_image'
+
+    for debug_mode in debug_modes:
+      string_io = io.StringIO()
+      lm = MockTokenizeModel()
+
+      with contextlib.redirect_stdout(string_io):
+        self.assertEqual(
+            lm.tokenize(
+                message_lib.UserMessage('hi <<[[image]]>>', image=Image()),
+                debug=debug_mode),
+            [('hi', 0), ('<<[[image]]>>', 1)],
+        )
+
+      debug_info = string_io.getvalue()
+      expected_included = [
+          debug_prints[f]
+          for f in lm_lib.LMDebugMode
+          if f != lm_lib.LMDebugMode.NONE and f in debug_mode
+      ]
+      expected_excluded = [
+          debug_prints[f]
+          for f in lm_lib.LMDebugMode
+          if f != lm_lib.LMDebugMode.NONE and f not in debug_mode
+      ]
+
+      for expected_include in expected_included:
+        self.assertIn(expected_include, debug_info)
+      for expected_exclude in expected_excluded:
+        self.assertNotIn(expected_exclude, debug_info)
+
+      if debug_mode & lm_lib.LMDebugMode.PROMPT:
+        self.assertIn('[0] MODALITY OBJECTS SENT TO LM', debug_info)
+
+  def test_tokenize_with_unsupported_model(self):
+    with self.assertRaises(NotImplementedError):
+      MockModel().tokenize('hi')
+
  def test_rate_to_max_concurrency_no_rpm_no_tpm(self) -> None:
    lm = MockModel()
    self.assertEqual(
langfun/core/llms/fake.py CHANGED
@@ -25,6 +25,9 @@ class Fake(lf.LanguageModel):
             completions: list[lf.Message]):
    return [lf.LMScoringResult(score=-i * 1.0) for i in range(len(completions))]

+  def _tokenize(self, prompt: lf.Message) -> list[tuple[str | bytes, int]]:
+    return [(w, i) for i, w in enumerate(prompt.text.split(' '))]
+
  def _sample(self, prompts: list[lf.Message]) -> list[lf.LMSamplingResult]:
    results = []
    for prompt in prompts:
langfun/core/llms/fake_test.py CHANGED
@@ -62,6 +62,13 @@ class EchoTest(unittest.TestCase):
        [lf.LMScoringResult(0.0), lf.LMScoringResult(-1.0)],
    )

+  def test_tokenize(self):
+    lm = fakelm.Echo()
+    self.assertEqual(
+        lm.tokenize('hi'),
+        [('hi', 0)]
+    )
+

class StaticResponseTest(unittest.TestCase):

langfun/core/structured/__init__.py CHANGED
@@ -77,6 +77,8 @@ from langfun.core.structured.completion import complete

from langfun.core.structured.scoring import score

+from langfun.core.structured.tokenization import tokenize
+
# Expose default examples for structured operations so users could refer to
# them.
from langfun.core.structured.parsing import default_parse_examples
langfun/core/structured/tokenization.py ADDED
@@ -0,0 +1,64 @@
+# Copyright 2023 The Langfun Authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Tokenize the prompt for `lf.query`."""
+
+from typing import Any, Type, Union
+
+import langfun.core as lf
+from langfun.core.structured import mapping
+from langfun.core.structured import prompting
+from langfun.core.structured import schema as schema_lib
+import pyglove as pg
+
+
+def tokenize(
+    prompt: Union[str, pg.Symbolic] | list[str | pg.Symbolic],
+    schema: Union[
+        schema_lib.Schema, Type[Any], list[Type[Any]], dict[str, Any], None
+    ] = None,
+    *,
+    lm: lf.LanguageModel | None = None,
+    examples: list[mapping.MappingExample] | None = None,
+    protocol: schema_lib.SchemaProtocol = 'python',
+    **kwargs,
+) -> list[tuple[str | bytes, int]]:
+  """Tokenize the prompt for `lf.query`.
+
+  Args:
+ prompt: The prompt(s) based on which each completion will be scored.
40
+ schema: The schema as the output type. If None, it will be inferred from
41
+ the completions.
42
+ lm: The language model used for scoring.
43
+ examples: Fewshot exemplars used together with the prompt in getting the
44
+ completions.
45
+ protocol: The protocol for formulating the prompt based on objects.
46
+ **kwargs: Keyword arguments that are referred by the prompt.
47
+
48
+ Returns:
49
+ A list of (text, token_id) tuples.
50
+ """
51
+ input_message = prompting.query_prompt(
52
+ prompt,
53
+ schema,
54
+ examples=examples,
55
+ protocol=protocol,
56
+ **kwargs,
57
+ )
58
+ if lm is None:
59
+ lm_override = lf.get_contextual_override('lm')
60
+ if lm_override is None:
61
+ raise ValueError('`lm` must be specified or provided from `lf.context`.')
62
+ lm = lm_override.value
63
+
64
+ return lm.tokenize(input_message)
langfun/core/structured/tokenization_test.py ADDED
@@ -0,0 +1,48 @@
+# Copyright 2023 The Langfun Authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import langfun.core as lf
+from langfun.core.llms import fake
+from langfun.core.structured import tokenization
+import pyglove as pg
+
+
+class Answer(pg.Object):
+  result: int
+
+
+class TokenizationTest(unittest.TestCase):
+
+  def test_bad_call(self):
+
+    with self.assertRaisesRegex(ValueError, '`lm` must be specified'):
+      tokenization.tokenize('hi')
+
+  def test_tokenize(self):
+    self.assertEqual(
+        tokenization.tokenize('hi', lm=fake.Echo()),
+        [('hi', 0)]
+    )
+
+  def test_tokenize_with_lm_from_the_context(self):
+    with lf.context(lm=fake.Echo()):
+      self.assertEqual(
+          tokenization.tokenize('hi'),
+          [('hi', 0)]
+      )
+
+
+if __name__ == '__main__':
+  unittest.main()
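Taken together, the tests above show the two ways the new structured tokenization helper is meant to be reached: passing lm explicitly, or supplying it through lf.context. A minimal usage sketch, assuming the same imports as tokenization_test.py:

    # Sketch only: tokenize an `lf.query`-style prompt.
    import langfun.core as lf
    from langfun.core.llms import fake
    from langfun.core.structured import tokenization

    # Explicit model argument.
    assert tokenization.tokenize('hi', lm=fake.Echo()) == [('hi', 0)]

    # Or pick the model up from the surrounding `lf.context`.
    with lf.context(lm=fake.Echo()):
      assert tokenization.tokenize('hi') == [('hi', 0)]
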
@@ -1,6 +1,6 @@
Metadata-Version: 2.1
Name: langfun
-Version: 0.1.1.dev20240822
+Version: 0.1.1.dev20240825
Summary: Langfun: Language as Functions.
Home-page: https://github.com/google/langfun
Author: Langfun Authors
@@ -8,8 +8,8 @@ langfun/core/console.py,sha256=bk5rNPNm9rMGW5YT2HixxU04p2umnoabn5SDz6Dqe88,2317
langfun/core/console_test.py,sha256=5SYJdxpJGLgdSSQqqMPoA1X6jpsLD8rgcyk-EgI65oE,1077
langfun/core/langfunc.py,sha256=RvIcRjIq0jWYRu1xim-FYe4HSrt97r3GMBO_PuagUmw,11060
langfun/core/langfunc_test.py,sha256=lyt-UzkD8972cxZwzCkps0_RMLeSsOBrcUFIW-fB6us,8653
-langfun/core/language_model.py,sha256=ihcLy7WWrUByZ4Yfikb2OBppM6QGwMyjTYecBzelNCs,24028
-langfun/core/language_model_test.py,sha256=TlNmVUfBfDQZzIiiBqCBTrxgcoyj2qNp3kONvmr2pX4,21273
+langfun/core/language_model.py,sha256=oGni82fhYB3kUsL0okzvIXkKgXEMHVE-c0jR5LRmsIc,26039
+langfun/core/language_model_test.py,sha256=ebJ1vnaxKSKvlwi6v07yHjn91xMiDw2bQ9DBnyVorYw,23303
langfun/core/logging.py,sha256=oDSeqGIQogZJ6xuPTcr9mkmLC2YnLP67UHtTdWbbiVY,4250
langfun/core/logging_test.py,sha256=poSsNGKi6G9LWOcWnTY0BQjj0BtaQknH-NK6FcQrVT4,2152
langfun/core/memory.py,sha256=f-asN1F7Vehgdn_fK84v73GrEUOxRtaW934keutTKjk,2416
@@ -55,8 +55,8 @@ langfun/core/eval/scoring_test.py,sha256=O8olHbrUEg60gMxwOkWzKBJZpZoUlmVnBANX5Se
langfun/core/llms/__init__.py,sha256=a1AV3XWi2gY4UvmmaPP1GapaQxygA6xzQJvVQRp6EPA,4818
langfun/core/llms/anthropic.py,sha256=Gon3fOi31RhZFgNd0ijyTnKnUdp9hrWrCoSXyO4UaLw,7316
langfun/core/llms/anthropic_test.py,sha256=T-swuMkfnlgs8Fpif4rtXs579exGk0TsbLMirXDZCkg,5533
-langfun/core/llms/fake.py,sha256=Dd7-6ka9pFf3fcWZyczamjOqQ91MOI-m7We3Oc9Ffmo,2927
-langfun/core/llms/fake_test.py,sha256=ipKfdOcuqVcJ8lDXVpnBVb9HHG0hAVkFkMoHpWjC2cI,7212
+langfun/core/llms/fake.py,sha256=gCHBYBLvBCsC78HI1hpoqXCS-p1FMTgY1P1qh_sGBPk,3070
+langfun/core/llms/fake_test.py,sha256=sIl_Mg7nFVjaN7AJhYCpA_qzDJpSnJzkazepGXpfQQg,7338
langfun/core/llms/google_genai.py,sha256=Rl5a5CyF_6Y0BYYArKk8yMaenv1rH3MUQLy6b3dfMRI,10202
langfun/core/llms/google_genai_test.py,sha256=iTISk3tJ4-3gjWmzcKQhEbH3ke4AkEiCu8rAGtB7SvU,7535
langfun/core/llms/groq.py,sha256=pqtyOZ_1_OJMOg8xATWT_B_SVbuT9nMRf4VkH9GzW8g,6308
@@ -89,7 +89,7 @@ langfun/core/modalities/pdf.py,sha256=mfaeCbUA4JslFVTARiJh8hW7imvL4tLVw9gUhO5bAZ
langfun/core/modalities/pdf_test.py,sha256=KE40zJD3Whe6ty2OULkp1J8jwLmB4ZjGXlGekluTP48,1952
langfun/core/modalities/video.py,sha256=sKcXxbx9S1ERjH8yEzkbtySpcRJD40QiPIQiIBy-U5I,955
langfun/core/modalities/video_test.py,sha256=GbsoefSeO7y8kCYhTtp4s9E3ah_eYrb6Z-MXpS01RFc,2046
-langfun/core/structured/__init__.py,sha256=yp60yeDSVlyT0ElmLwbpBHnQtk_JX5udnjG1UGcsXKA,3776
+langfun/core/structured/__init__.py,sha256=VeB0_yV8ZEkey8kizTmB0GdkLs_aag7D9bclP8Nntac,3835
langfun/core/structured/completion.py,sha256=cS2PjG7sqzDu5x0xoTk8RmNcoeX55iVwH38NTefkMHg,8108
langfun/core/structured/completion_test.py,sha256=2mUzDMKGF_WGfTtsnfmfMDx97dkJ-98y8leen__qWLA,19281
langfun/core/structured/description.py,sha256=SXW4MJvshFjbR-0gw6rE21o6WXq12UlRXawvDBXMZFA,5211
@@ -108,6 +108,8 @@ langfun/core/structured/schema_generation_test.py,sha256=RM9s71kMNg2jTePwInkiW9f
langfun/core/structured/schema_test.py,sha256=RjYhwTgktQgyqAjzLvo967nTiIK9KWgP-aNGg4e7ihE,25258
langfun/core/structured/scoring.py,sha256=ae6SjLqoqsKFmcPnaJbsFmH4XFGKOQaJRjYZ1wm1Ywo,5860
langfun/core/structured/scoring_test.py,sha256=QvlwDAzwuamKL5tCotm1L3Sx0cs3idoNK4aIEhaO4Yk,2272
+langfun/core/structured/tokenization.py,sha256=w6UeFGVcNSWJUPHdwgsKdYmiw7-k_PXX6kEv8TACPN4,2191
+langfun/core/structured/tokenization_test.py,sha256=dVW30kGYkX2HNtiRZe1oTmXFP7iIK6PrlKCttZ3QXe4,1311
langfun/core/templates/__init__.py,sha256=bO0eMsVJbi7sxEB2YlInKRQ2EVP-RyyKUwcD-8msuN4,927
langfun/core/templates/completion.py,sha256=mUqZHOEV3ag6-A08XghpeEltcrBvCDxXP004eDDfeag,1931
langfun/core/templates/completion_test.py,sha256=vGnjnM38UHyVDUyaUYtmp20s9KBGOdbPVsX-H-ET11E,1636
@@ -117,8 +119,8 @@ langfun/core/templates/demonstration.py,sha256=vCrgYubdZM5Umqcgp8NUVGXgr4P_c-fik
langfun/core/templates/demonstration_test.py,sha256=SafcDQ0WgI7pw05EmPI2S4v1t3ABKzup8jReCljHeK4,2162
langfun/core/templates/selfplay.py,sha256=yhgrJbiYwq47TgzThmHrDQTF4nDrTI09CWGhuQPNv-s,2273
langfun/core/templates/selfplay_test.py,sha256=rBW2Qr8yi-aWYwoTwRR-n1peKyMX9QXPZXURjLgoiRs,2264
-langfun-0.1.1.dev20240822.dist-info/LICENSE,sha256=WNHhf_5RCaeuKWyq_K39vmp9F28LxKsB4SpomwSZ2L0,11357
-langfun-0.1.1.dev20240822.dist-info/METADATA,sha256=tldtlq7znDyRGiaq62EDI8aqpsKJSpPoSsl1cCE2OUc,5234
-langfun-0.1.1.dev20240822.dist-info/WHEEL,sha256=Mdi9PDNwEZptOjTlUcAth7XJDFtKrHYaQMPulZeBCiQ,91
-langfun-0.1.1.dev20240822.dist-info/top_level.txt,sha256=RhlEkHxs1qtzmmtWSwYoLVJAc1YrbPtxQ52uh8Z9VvY,8
-langfun-0.1.1.dev20240822.dist-info/RECORD,,
+langfun-0.1.1.dev20240825.dist-info/LICENSE,sha256=WNHhf_5RCaeuKWyq_K39vmp9F28LxKsB4SpomwSZ2L0,11357
+langfun-0.1.1.dev20240825.dist-info/METADATA,sha256=EpsJgv9Qsf97MphfiC_IJA3ICOqNDfH8V1UI2s8RWdQ,5234
+langfun-0.1.1.dev20240825.dist-info/WHEEL,sha256=Mdi9PDNwEZptOjTlUcAth7XJDFtKrHYaQMPulZeBCiQ,91
+langfun-0.1.1.dev20240825.dist-info/top_level.txt,sha256=RhlEkHxs1qtzmmtWSwYoLVJAc1YrbPtxQ52uh8Z9VvY,8
+langfun-0.1.1.dev20240825.dist-info/RECORD,,