langfun-0.1.1.dev20240822-py3-none-any.whl → langfun-0.1.1.dev20240825-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- langfun/core/language_model.py +66 -0
- langfun/core/language_model_test.py +66 -0
- langfun/core/llms/fake.py +3 -0
- langfun/core/llms/fake_test.py +7 -0
- langfun/core/structured/__init__.py +2 -0
- langfun/core/structured/tokenization.py +64 -0
- langfun/core/structured/tokenization_test.py +48 -0
- {langfun-0.1.1.dev20240822.dist-info → langfun-0.1.1.dev20240825.dist-info}/METADATA +1 -1
- {langfun-0.1.1.dev20240822.dist-info → langfun-0.1.1.dev20240825.dist-info}/RECORD +12 -10
- {langfun-0.1.1.dev20240822.dist-info → langfun-0.1.1.dev20240825.dist-info}/LICENSE +0 -0
- {langfun-0.1.1.dev20240822.dist-info → langfun-0.1.1.dev20240825.dist-info}/WHEEL +0 -0
- {langfun-0.1.1.dev20240822.dist-info → langfun-0.1.1.dev20240825.dist-info}/top_level.txt +0 -0
langfun/core/language_model.py
CHANGED
@@ -712,6 +712,72 @@ class LanguageModel(component.Component):
           color='blue',
       )
 
+  def tokenize(
+      self,
+      prompt: str | message_lib.Message,
+      **kwargs,
+  ) -> list[tuple[str | bytes, int]]:
+    """Tokenizes the given prompt."""
+    prompt = message_lib.UserMessage.from_value(prompt)
+    call_counter = self._call_counter
+    self._call_counter += 1
+
+    with component.context(override_attrs=True, **kwargs):
+      request_start = time.time()
+      tokens = self._tokenize(prompt)
+      elapse = time.time() - request_start
+      self._debug_tokenize(prompt, tokens, call_counter, elapse)
+    return tokens
+
+  def _tokenize(
+      self, prompt: message_lib.Message
+  ) -> list[tuple[str | bytes, int]]:
+    """Subclass to implement."""
+    raise NotImplementedError(
+        f'{self.__class__.__name__} does not support tokenization.'
+    )
+
+  def _debug_tokenize(
+      self,
+      prompt: message_lib.Message,
+      tokens: list[tuple[str | bytes, int]],
+      call_counter: int,
+      elapse: float,
+  ):
+    debug = self.debug
+    if isinstance(debug, bool):
+      debug = LMDebugMode.ALL if debug else LMDebugMode.NONE
+
+    if debug & LMDebugMode.INFO:
+      self._debug_model_info(call_counter, UsageNotAvailable())
+
+    if debug & LMDebugMode.PROMPT:
+      console.write(
+          prompt,
+          title=f'\n[{call_counter}] PROMPT TO TOKENIZE:',
+          color='green',
+      )
+      referred_modalities_lst = [prompt.referred_modalities(),]
+      if referred_modalities_lst:
+        for referred_modalities in referred_modalities_lst:
+          console.write(
+              pg.object_utils.kvlist_str(
+                  [(k, repr(v), None) for k, v in referred_modalities.items()]
+              ),
+              title=f'\n[{call_counter}] MODALITY OBJECTS SENT TO LM:',
+              color='green',
+          )
+
+    if debug & LMDebugMode.RESPONSE:
+      console.write(
+          tokens,
+          title=(
+              f'\n[{call_counter}] {len(tokens)} TOKENS RETURNED '
+              f'(in {elapse:.2f} seconds):'
+          ),
+          color='blue',
+      )
+
   def rate_to_max_concurrency(
       self, requests_per_min: float = 0, tokens_per_min: float = 0
   ) -> int:
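For context: the new public tokenize() entry point normalizes the prompt into a UserMessage, applies any per-call attribute overrides (such as debug), times the request, and delegates to the subclass hook _tokenize(), which by default raises NotImplementedError. A minimal usage sketch against the Echo fake model updated later in this diff; the printed result follows from the word-splitting _tokenize added to fake.py:

from langfun.core.llms import fake

# Echo._tokenize (added below) splits the prompt text on spaces and uses each
# word's position as its token id.
lm = fake.Echo()
print(lm.tokenize('hello world'))   # -> [('hello', 0), ('world', 1)]

# Per-call debug output works the same way as for sample() and score().
lm.tokenize('hello world', debug=True)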
langfun/core/language_model_test.py
CHANGED
@@ -81,6 +81,13 @@ class MockScoringModel(MockModel):
   ]
 
 
+class MockTokenizeModel(MockModel):
+
+  def _tokenize(
+      self, prompt: message_lib.Message) -> list[tuple[str | bytes, int]]:
+    return [(w, i) for i, w in enumerate(prompt.text.split(' '))]
+
+
 class LMSamplingOptionsTest(unittest.TestCase):
   """Tests for LMSamplingOptions."""
 
@@ -552,6 +559,65 @@ class LanguageModelTest(unittest.TestCase):
     with self.assertRaises(NotImplementedError):
       MockModel().score('hi', ['1', '2'])
 
+  def test_tokenize(self):
+    info_flag = lm_lib.LMDebugMode.INFO
+    prompt_flag = lm_lib.LMDebugMode.PROMPT
+    response_flag = lm_lib.LMDebugMode.RESPONSE
+    debug_prints = {
+        info_flag: 'LM INFO',
+        prompt_flag: 'PROMPT TO TOKENIZE',
+        response_flag: 'TOKENS RETURNED',
+    }
+    debug_modes = [
+        info_flag,
+        prompt_flag,
+        response_flag,
+        info_flag | prompt_flag,
+        info_flag | response_flag,
+        prompt_flag | response_flag,
+        info_flag | prompt_flag | response_flag,
+    ]
+
+    class Image(modality.Modality):
+      def to_bytes(self):
+        return b'fake_image'
+
+    for debug_mode in debug_modes:
+      string_io = io.StringIO()
+      lm = MockTokenizeModel()
+
+      with contextlib.redirect_stdout(string_io):
+        self.assertEqual(
+            lm.tokenize(
+                message_lib.UserMessage('hi <<[[image]]>>', image=Image()),
+                debug=debug_mode),
+            [('hi', 0), ('<<[[image]]>>', 1)],
+        )
+
+      debug_info = string_io.getvalue()
+      expected_included = [
+          debug_prints[f]
+          for f in lm_lib.LMDebugMode
+          if f != lm_lib.LMDebugMode.NONE and f in debug_mode
+      ]
+      expected_excluded = [
+          debug_prints[f]
+          for f in lm_lib.LMDebugMode
+          if f != lm_lib.LMDebugMode.NONE and f not in debug_mode
+      ]
+
+      for expected_include in expected_included:
+        self.assertIn(expected_include, debug_info)
+      for expected_exclude in expected_excluded:
+        self.assertNotIn(expected_exclude, debug_info)
+
+      if debug_mode & lm_lib.LMDebugMode.PROMPT:
+        self.assertIn('[0] MODALITY OBJECTS SENT TO LM', debug_info)
+
+  def test_tokenize_with_unsupported_model(self):
+    with self.assertRaises(NotImplementedError):
+      MockModel().tokenize('hi')
+
   def test_rate_to_max_concurrency_no_rpm_no_tpm(self) -> None:
     lm = MockModel()
     self.assertEqual(
langfun/core/llms/fake.py
CHANGED
@@ -25,6 +25,9 @@ class Fake(lf.LanguageModel):
                  completions: list[lf.Message]):
     return [lf.LMScoringResult(score=-i * 1.0) for i in range(len(completions))]
 
+  def _tokenize(self, prompt: lf.Message) -> list[tuple[str | bytes, int]]:
+    return [(w, i) for i, w in enumerate(prompt.text.split(' '))]
+
   def _sample(self, prompts: list[lf.Message]) -> list[lf.LMSamplingResult]:
     results = []
     for prompt in prompts:
langfun/core/llms/fake_test.py
CHANGED
@@ -62,6 +62,13 @@ class EchoTest(unittest.TestCase):
         [lf.LMScoringResult(0.0), lf.LMScoringResult(-1.0)],
     )
 
+  def test_tokenize(self):
+    lm = fakelm.Echo()
+    self.assertEqual(
+        lm.tokenize('hi'),
+        [('hi', 0)]
+    )
+
 
 class StaticResponseTest(unittest.TestCase):
 
langfun/core/structured/__init__.py
CHANGED
@@ -77,6 +77,8 @@ from langfun.core.structured.completion import complete
 
 from langfun.core.structured.scoring import score
 
+from langfun.core.structured.tokenization import tokenize
+
 # Expose default examples for structured operations so users could refer to
 # them.
 from langfun.core.structured.parsing import default_parse_examples
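With this export, the helper becomes importable directly from the structured package, alongside score and complete. A minimal import sketch; whether it is also re-exported from the top-level langfun namespace is not shown in this diff:

from langfun.core import structured
from langfun.core.llms import fake

# Same behavior as calling the tokenization module directly.
print(structured.tokenize('hi', lm=fake.Echo()))   # -> [('hi', 0)]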
langfun/core/structured/tokenization.py
ADDED
@@ -0,0 +1,64 @@
+# Copyright 2023 The Langfun Authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Tokenize the prompt for `lf.query`."""
+
+from typing import Any, Type, Union
+
+import langfun.core as lf
+from langfun.core.structured import mapping
+from langfun.core.structured import prompting
+from langfun.core.structured import schema as schema_lib
+import pyglove as pg
+
+
+def tokenize(
+    prompt: Union[str, pg.Symbolic] | list[str | pg.Symbolic],
+    schema: Union[
+        schema_lib.Schema, Type[Any], list[Type[Any]], dict[str, Any], None
+    ] = None,
+    *,
+    lm: lf.LanguageModel | None = None,
+    examples: list[mapping.MappingExample] | None = None,
+    protocol: schema_lib.SchemaProtocol = 'python',
+    **kwargs,
+) -> list[tuple[str | bytes, int]]:
+  """Tokenize the prompt for `lf.query`.
+
+  Args:
+    prompt: The prompt(s) based on which each completion will be scored.
+    schema: The schema as the output type. If None, it will be inferred from
+      the completions.
+    lm: The language model used for scoring.
+    examples: Fewshot exemplars used together with the prompt in getting the
+      completions.
+    protocol: The protocol for formulating the prompt based on objects.
+    **kwargs: Keyword arguments that are referred by the prompt.
+
+  Returns:
+    A list of (text, token_id) tuples.
+  """
+  input_message = prompting.query_prompt(
+      prompt,
+      schema,
+      examples=examples,
+      protocol=protocol,
+      **kwargs,
+  )
+  if lm is None:
+    lm_override = lf.get_contextual_override('lm')
+    if lm_override is None:
+      raise ValueError('`lm` must be specified or provided from `lf.context`.')
+    lm = lm_override.value
+
+  return lm.tokenize(input_message)
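In short, the helper renders the same prompt message that lf.query would send (via prompting.query_prompt), resolves the language model from the lm argument or the ambient lf.context, and returns that model's token list. A minimal usage sketch with the Echo fake model; the expected values follow the new tests below:

import langfun.core as lf
from langfun.core.llms import fake
from langfun.core.structured import tokenization

# Explicit model: a plain string with no schema tokenizes as-is.
assert tokenization.tokenize('hi', lm=fake.Echo()) == [('hi', 0)]

# The model can also be picked up from the surrounding context.
with lf.context(lm=fake.Echo()):
  tokens = tokenization.tokenize('hi')

# Passing a schema (e.g. a pg.Object subclass) tokenizes the fully rendered
# `lf.query` prompt, including the output-schema instructions.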
langfun/core/structured/tokenization_test.py
ADDED
@@ -0,0 +1,48 @@
+# Copyright 2023 The Langfun Authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import langfun.core as lf
+from langfun.core.llms import fake
+from langfun.core.structured import tokenization
+import pyglove as pg
+
+
+class Answer(pg.Object):
+  result: int
+
+
+class TokenizationTest(unittest.TestCase):
+
+  def test_bad_call(self):
+
+    with self.assertRaisesRegex(ValueError, '`lm` must be specified'):
+      tokenization.tokenize('hi')
+
+  def test_tokenize(self):
+    self.assertEqual(
+        tokenization.tokenize('hi', lm=fake.Echo()),
+        [('hi', 0)]
+    )
+
+  def test_tokenize_with_lm_from_the_context(self):
+    with lf.context(lm=fake.Echo()):
+      self.assertEqual(
+          tokenization.tokenize('hi'),
+          [('hi', 0)]
+      )
+
+
+if __name__ == '__main__':
+  unittest.main()
{langfun-0.1.1.dev20240822.dist-info → langfun-0.1.1.dev20240825.dist-info}/RECORD
CHANGED
@@ -8,8 +8,8 @@ langfun/core/console.py,sha256=bk5rNPNm9rMGW5YT2HixxU04p2umnoabn5SDz6Dqe88,2317
 langfun/core/console_test.py,sha256=5SYJdxpJGLgdSSQqqMPoA1X6jpsLD8rgcyk-EgI65oE,1077
 langfun/core/langfunc.py,sha256=RvIcRjIq0jWYRu1xim-FYe4HSrt97r3GMBO_PuagUmw,11060
 langfun/core/langfunc_test.py,sha256=lyt-UzkD8972cxZwzCkps0_RMLeSsOBrcUFIW-fB6us,8653
-langfun/core/language_model.py,sha256=
-langfun/core/language_model_test.py,sha256=
+langfun/core/language_model.py,sha256=oGni82fhYB3kUsL0okzvIXkKgXEMHVE-c0jR5LRmsIc,26039
+langfun/core/language_model_test.py,sha256=ebJ1vnaxKSKvlwi6v07yHjn91xMiDw2bQ9DBnyVorYw,23303
 langfun/core/logging.py,sha256=oDSeqGIQogZJ6xuPTcr9mkmLC2YnLP67UHtTdWbbiVY,4250
 langfun/core/logging_test.py,sha256=poSsNGKi6G9LWOcWnTY0BQjj0BtaQknH-NK6FcQrVT4,2152
 langfun/core/memory.py,sha256=f-asN1F7Vehgdn_fK84v73GrEUOxRtaW934keutTKjk,2416
@@ -55,8 +55,8 @@ langfun/core/eval/scoring_test.py,sha256=O8olHbrUEg60gMxwOkWzKBJZpZoUlmVnBANX5Se
 langfun/core/llms/__init__.py,sha256=a1AV3XWi2gY4UvmmaPP1GapaQxygA6xzQJvVQRp6EPA,4818
 langfun/core/llms/anthropic.py,sha256=Gon3fOi31RhZFgNd0ijyTnKnUdp9hrWrCoSXyO4UaLw,7316
 langfun/core/llms/anthropic_test.py,sha256=T-swuMkfnlgs8Fpif4rtXs579exGk0TsbLMirXDZCkg,5533
-langfun/core/llms/fake.py,sha256=
-langfun/core/llms/fake_test.py,sha256=
+langfun/core/llms/fake.py,sha256=gCHBYBLvBCsC78HI1hpoqXCS-p1FMTgY1P1qh_sGBPk,3070
+langfun/core/llms/fake_test.py,sha256=sIl_Mg7nFVjaN7AJhYCpA_qzDJpSnJzkazepGXpfQQg,7338
 langfun/core/llms/google_genai.py,sha256=Rl5a5CyF_6Y0BYYArKk8yMaenv1rH3MUQLy6b3dfMRI,10202
 langfun/core/llms/google_genai_test.py,sha256=iTISk3tJ4-3gjWmzcKQhEbH3ke4AkEiCu8rAGtB7SvU,7535
 langfun/core/llms/groq.py,sha256=pqtyOZ_1_OJMOg8xATWT_B_SVbuT9nMRf4VkH9GzW8g,6308
@@ -89,7 +89,7 @@ langfun/core/modalities/pdf.py,sha256=mfaeCbUA4JslFVTARiJh8hW7imvL4tLVw9gUhO5bAZ
 langfun/core/modalities/pdf_test.py,sha256=KE40zJD3Whe6ty2OULkp1J8jwLmB4ZjGXlGekluTP48,1952
 langfun/core/modalities/video.py,sha256=sKcXxbx9S1ERjH8yEzkbtySpcRJD40QiPIQiIBy-U5I,955
 langfun/core/modalities/video_test.py,sha256=GbsoefSeO7y8kCYhTtp4s9E3ah_eYrb6Z-MXpS01RFc,2046
-langfun/core/structured/__init__.py,sha256=
+langfun/core/structured/__init__.py,sha256=VeB0_yV8ZEkey8kizTmB0GdkLs_aag7D9bclP8Nntac,3835
 langfun/core/structured/completion.py,sha256=cS2PjG7sqzDu5x0xoTk8RmNcoeX55iVwH38NTefkMHg,8108
 langfun/core/structured/completion_test.py,sha256=2mUzDMKGF_WGfTtsnfmfMDx97dkJ-98y8leen__qWLA,19281
 langfun/core/structured/description.py,sha256=SXW4MJvshFjbR-0gw6rE21o6WXq12UlRXawvDBXMZFA,5211
@@ -108,6 +108,8 @@ langfun/core/structured/schema_generation_test.py,sha256=RM9s71kMNg2jTePwInkiW9f
 langfun/core/structured/schema_test.py,sha256=RjYhwTgktQgyqAjzLvo967nTiIK9KWgP-aNGg4e7ihE,25258
 langfun/core/structured/scoring.py,sha256=ae6SjLqoqsKFmcPnaJbsFmH4XFGKOQaJRjYZ1wm1Ywo,5860
 langfun/core/structured/scoring_test.py,sha256=QvlwDAzwuamKL5tCotm1L3Sx0cs3idoNK4aIEhaO4Yk,2272
+langfun/core/structured/tokenization.py,sha256=w6UeFGVcNSWJUPHdwgsKdYmiw7-k_PXX6kEv8TACPN4,2191
+langfun/core/structured/tokenization_test.py,sha256=dVW30kGYkX2HNtiRZe1oTmXFP7iIK6PrlKCttZ3QXe4,1311
 langfun/core/templates/__init__.py,sha256=bO0eMsVJbi7sxEB2YlInKRQ2EVP-RyyKUwcD-8msuN4,927
 langfun/core/templates/completion.py,sha256=mUqZHOEV3ag6-A08XghpeEltcrBvCDxXP004eDDfeag,1931
 langfun/core/templates/completion_test.py,sha256=vGnjnM38UHyVDUyaUYtmp20s9KBGOdbPVsX-H-ET11E,1636
@@ -117,8 +119,8 @@ langfun/core/templates/demonstration.py,sha256=vCrgYubdZM5Umqcgp8NUVGXgr4P_c-fik
 langfun/core/templates/demonstration_test.py,sha256=SafcDQ0WgI7pw05EmPI2S4v1t3ABKzup8jReCljHeK4,2162
 langfun/core/templates/selfplay.py,sha256=yhgrJbiYwq47TgzThmHrDQTF4nDrTI09CWGhuQPNv-s,2273
 langfun/core/templates/selfplay_test.py,sha256=rBW2Qr8yi-aWYwoTwRR-n1peKyMX9QXPZXURjLgoiRs,2264
-langfun-0.1.1.
-langfun-0.1.1.
-langfun-0.1.1.
-langfun-0.1.1.
-langfun-0.1.1.
+langfun-0.1.1.dev20240825.dist-info/LICENSE,sha256=WNHhf_5RCaeuKWyq_K39vmp9F28LxKsB4SpomwSZ2L0,11357
+langfun-0.1.1.dev20240825.dist-info/METADATA,sha256=EpsJgv9Qsf97MphfiC_IJA3ICOqNDfH8V1UI2s8RWdQ,5234
+langfun-0.1.1.dev20240825.dist-info/WHEEL,sha256=Mdi9PDNwEZptOjTlUcAth7XJDFtKrHYaQMPulZeBCiQ,91
+langfun-0.1.1.dev20240825.dist-info/top_level.txt,sha256=RhlEkHxs1qtzmmtWSwYoLVJAc1YrbPtxQ52uh8Z9VvY,8
+langfun-0.1.1.dev20240825.dist-info/RECORD,,
{langfun-0.1.1.dev20240822.dist-info → langfun-0.1.1.dev20240825.dist-info}/LICENSE
File without changes
{langfun-0.1.1.dev20240822.dist-info → langfun-0.1.1.dev20240825.dist-info}/WHEEL
File without changes
{langfun-0.1.1.dev20240822.dist-info → langfun-0.1.1.dev20240825.dist-info}/top_level.txt
File without changes