google-genai 1.30.0__py3-none-any.whl → 1.32.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- google/genai/_api_client.py +32 -32
- google/genai/_automatic_function_calling_util.py +12 -0
- google/genai/_base_transformers.py +26 -0
- google/genai/_live_converters.py +1 -0
- google/genai/_local_tokenizer_loader.py +223 -0
- google/genai/_operations_converters.py +307 -0
- google/genai/_tokens_converters.py +1 -0
- google/genai/_transformers.py +0 -10
- google/genai/batches.py +141 -0
- google/genai/caches.py +15 -2
- google/genai/files.py +11 -2
- google/genai/local_tokenizer.py +362 -0
- google/genai/models.py +518 -17
- google/genai/operations.py +1 -0
- google/genai/tunings.py +135 -0
- google/genai/types.py +781 -323
- google/genai/version.py +1 -1
- {google_genai-1.30.0.dist-info → google_genai-1.32.0.dist-info}/METADATA +6 -6
- google_genai-1.32.0.dist-info/RECORD +39 -0
- google_genai-1.30.0.dist-info/RECORD +0 -35
- {google_genai-1.30.0.dist-info → google_genai-1.32.0.dist-info}/WHEEL +0 -0
- {google_genai-1.30.0.dist-info → google_genai-1.32.0.dist-info}/licenses/LICENSE +0 -0
- {google_genai-1.30.0.dist-info → google_genai-1.32.0.dist-info}/top_level.txt +0 -0
google/genai/local_tokenizer.py (new file):

@@ -0,0 +1,362 @@

```python
# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

"""[Experimental] Text Only Local Tokenizer."""

import logging
from typing import Any, Iterable
from typing import Optional, Union

from sentencepiece import sentencepiece_model_pb2

from . import _common
from . import _local_tokenizer_loader as loader
from . import _transformers as t
from . import types
from . import types
from ._transformers import t_contents

logger = logging.getLogger("google_genai.local_tokenizer")


class _TextsAccumulator:
  """Accumulates countable texts from `Content` and `Tool` objects.

  This class is responsible for traversing complex `Content` and `Tool`
  objects and extracting all the text content that should be included when
  calculating token counts.

  A key feature of this class is its ability to detect unsupported fields in
  `Content` objects. If a user provides a `Content` object with fields that
  this local tokenizer doesn't recognize (e.g., new fields added in a future
  API update), this class will log a warning.

  The detection mechanism for `Content` objects works by recursively building
  a "counted" version of the input object. This "counted" object only
  contains the data that was successfully processed and added to the text
  list for tokenization. After traversing the input, the original `Content`
  object is compared to the "counted" object. If they don't match, it
  signifies the presence of unsupported fields, and a warning is logged.
  """

  def __init__(self) -> None:
    self._texts: list[str] = []

  def get_texts(self) -> Iterable[str]:
    return self._texts

  def add_contents(self, contents: Iterable[types.Content]) -> None:
    for content in contents:
      self.add_content(content)

  def add_content(self, content: types.Content) -> None:
    counted_content = types.Content(parts=[], role=content.role)
    if content.parts:
      for part in content.parts:
        assert counted_content.parts is not None
        counted_part = types.Part()
        if part.file_data is not None or part.inline_data is not None:
          raise ValueError(
              "LocalTokenizers do not support non-text content types."
          )
        if part.video_metadata is not None:
          counted_part.video_metadata = part.video_metadata
        if part.function_call is not None:
          self.add_function_call(part.function_call)
          counted_part.function_call = part.function_call
        if part.function_response is not None:
          self.add_function_response(part.function_response)
          counted_part.function_response = part.function_response
        if part.text is not None:
          counted_part.text = part.text
          self._texts.append(part.text)
        counted_content.parts.append(counted_part)

    if content.model_dump(exclude_none=True) != counted_content.model_dump(
        exclude_none=True
    ):
      logger.warning(
          "Content contains unsupported types for token counting. Supported"
          f" fields {counted_content}. Got {content}."
      )

  def add_function_call(self, function_call: types.FunctionCall) -> None:
    """Processes a function call and adds relevant text to the accumulator.

    Args:
      function_call: The function call to process.
    """
    if function_call.name:
      self._texts.append(function_call.name)
    counted_function_call = types.FunctionCall(name=function_call.name)
    if function_call.args:
      counted_args = self._dict_traverse(function_call.args)
      counted_function_call.args = counted_args

  def add_tool(self, tool: types.Tool) -> types.Tool:
    counted_tool = types.Tool(function_declarations=[])
    if tool.function_declarations:
      for function_declaration in tool.function_declarations:
        counted_function_declaration = self._function_declaration_traverse(
            function_declaration
        )
        if counted_tool.function_declarations is None:
          counted_tool.function_declarations = []
        counted_tool.function_declarations.append(counted_function_declaration)

    return counted_tool

  def add_tools(self, tools: Iterable[types.Tool]) -> None:
    for tool in tools:
      self.add_tool(tool)

  def add_function_responses(
      self, function_responses: Iterable[types.FunctionResponse]
  ) -> None:
    for function_response in function_responses:
      self.add_function_response(function_response)

  def add_function_response(
      self, function_response: types.FunctionResponse
  ) -> None:
    counted_function_response = types.FunctionResponse()
    if function_response.name:
      self._texts.append(function_response.name)
      counted_function_response.name = function_response.name
    if function_response.response:
      counted_response = self._dict_traverse(function_response.response)
      counted_function_response.response = counted_response

  def _function_declaration_traverse(
      self, function_declaration: types.FunctionDeclaration
  ) -> types.FunctionDeclaration:
    counted_function_declaration = types.FunctionDeclaration()
    if function_declaration.name:
      self._texts.append(function_declaration.name)
      counted_function_declaration.name = function_declaration.name
    if function_declaration.description:
      self._texts.append(function_declaration.description)
      counted_function_declaration.description = (
          function_declaration.description
      )
    if function_declaration.parameters:
      counted_parameters = self.add_schema(function_declaration.parameters)
      counted_function_declaration.parameters = counted_parameters
    if function_declaration.response:
      counted_response = self.add_schema(function_declaration.response)
      counted_function_declaration.response = counted_response
    return counted_function_declaration

  def add_schema(self, schema: types.Schema) -> types.Schema:
    """Processes a schema and adds relevant text to the accumulator.

    Args:
      schema: The schema to process.

    Returns:
      The new schema object with only countable fields.
    """
    counted_schema = types.Schema()
    if schema.type:
      counted_schema.type = schema.type
    if schema.title:
      counted_schema.title = schema.title
    if schema.default is not None:
      counted_schema.default = schema.default
    if schema.format:
      self._texts.append(schema.format)
      counted_schema.format = schema.format
    if schema.description:
      self._texts.append(schema.description)
      counted_schema.description = schema.description
    if schema.enum:
      self._texts.extend(schema.enum)
      counted_schema.enum = schema.enum
    if schema.required:
      self._texts.extend(schema.required)
      counted_schema.required = schema.required
    if schema.property_ordering:
      counted_schema.property_ordering = schema.property_ordering
    if schema.items:
      counted_schema_items = self.add_schema(schema.items)
      counted_schema.items = counted_schema_items
    if schema.properties:
      d = {}
      for key, value in schema.properties.items():
        self._texts.append(key)
        counted_value = self.add_schema(value)
        d[key] = counted_value
      counted_schema.properties = d
    if schema.example:
      counted_schema_example = self._any_traverse(schema.example)
      counted_schema.example = counted_schema_example
    return counted_schema

  def _dict_traverse(self, d: dict[str, Any]) -> dict[str, Any]:
    """Processes a dict and adds relevant text to the accumulator.

    Args:
      d: The dict to process.

    Returns:
      The new dict object with only countable fields.
    """
    counted_dict = {}
    self._texts.extend(list(d.keys()))
    for key, val in d.items():
      counted_dict[key] = self._any_traverse(val)
    return counted_dict

  def _any_traverse(self, value: Any) -> Any:
    """Processes a value and adds relevant text to the accumulator.

    Args:
      value: The value to process.

    Returns:
      The new value with only countable fields.
    """
    if isinstance(value, str):
      self._texts.append(value)
      return value
    elif isinstance(value, dict):
      return self._dict_traverse(value)
    elif isinstance(value, list):
      return [self._any_traverse(item) for item in value]
    else:
      return value
```
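To make the traversal concrete, here is a small sketch (not part of the diff) of which strings the accumulator collects for a content that mixes plain text with a function call. It assumes google-genai 1.32.0 is installed and reaches into the private `_TextsAccumulator` class, so treat it as an illustration of the docstring above rather than supported API:

```python
# Illustration only: _TextsAccumulator is a private helper of the new module.
from google.genai import types
from google.genai.local_tokenizer import _TextsAccumulator

acc = _TextsAccumulator()
acc.add_content(
    types.Content(
        role="user",
        parts=[
            types.Part(text="What is the weather in Paris?"),
            types.Part(
                function_call=types.FunctionCall(
                    name="get_weather", args={"city": "Paris"}
                )
            ),
        ],
    )
)

# The countable strings are the text part, the function name, and the
# argument keys and string values:
# ['What is the weather in Paris?', 'get_weather', 'city', 'Paris']
print(list(acc.get_texts()))
```

A part carrying `file_data` or `inline_data` would instead raise the `ValueError` shown above, and any field the accumulator does not copy into its "counted" object triggers the unsupported-field warning.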
google/genai/local_tokenizer.py (continued):

```python
def _token_str_to_bytes(
    token: str, type: sentencepiece_model_pb2.ModelProto.SentencePiece.Type
) -> bytes:
  if type == sentencepiece_model_pb2.ModelProto.SentencePiece.Type.BYTE:
    return _parse_hex_byte(token).to_bytes(length=1, byteorder="big")
  else:
    return token.replace("▁", " ").encode("utf-8")


def _parse_hex_byte(token: str) -> int:
  """Parses a hex byte string of the form '<0xXX>' and returns the integer value.

  Raises ValueError if the input is malformed or the byte value is invalid.
  """

  if len(token) != 6:
    raise ValueError(f"Invalid byte length: {token}")
  if not token.startswith("<0x") or not token.endswith(">"):
    raise ValueError(f"Invalid byte format: {token}")

  try:
    val = int(token[3:5], 16)  # Parse the hex part directly
  except ValueError:
    raise ValueError(f"Invalid hex value: {token}")

  if val >= 256:
    raise ValueError(f"Byte value out of range: {token}")

  return val
```
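SentencePiece represents characters outside its vocabulary with byte-fallback pieces of the form `<0xXX>`, and marks leading spaces with "▁"; the two helpers above map both kinds of pieces back to raw bytes. A quick sketch of the expected values (illustration only; it imports private helpers from the new module):

```python
# Illustration only: _parse_hex_byte and _token_str_to_bytes are private helpers.
from sentencepiece import sentencepiece_model_pb2

from google.genai.local_tokenizer import _parse_hex_byte, _token_str_to_bytes

_Type = sentencepiece_model_pb2.ModelProto.SentencePiece.Type

print(_parse_hex_byte("<0x41>"))                    # 65
print(_token_str_to_bytes("<0x41>", _Type.BYTE))    # b'A'  (byte-fallback piece)
print(_token_str_to_bytes("▁Hello", _Type.NORMAL))  # b' Hello'
```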
google/genai/local_tokenizer.py (continued):

```python
class LocalTokenizer:
  """[Experimental] Text Only Local Tokenizer.

  This class provides a local tokenizer for text only token counting.

  LIMITATIONS:
  - Only supports text based tokenization and no multimodal tokenization.
  - Forward compatibility depends on the open-source tokenizer models for future
  Gemini versions.
  - For token counting of tools and response schemas, the `LocalTokenizer` only
  supports `types.Tool` and `types.Schema` objects. Python functions or Pydantic
  models cannot be passed directly.
  """

  def __init__(self, model_name: str):
    self._tokenizer_name = loader.get_tokenizer_name(model_name)
    self._model_proto = loader.load_model_proto(self._tokenizer_name)
    self._tokenizer = loader.get_sentencepiece(self._tokenizer_name)

  @_common.experimental_warning(
      "The SDK's local tokenizer implementation is experimental and may change"
      " in the future. It only supports text based tokenization."
  )
  def count_tokens(
      self,
      contents: Union[types.ContentListUnion, types.ContentListUnionDict],
      *,
      config: Optional[types.CountTokensConfigOrDict] = None,
  ) -> types.CountTokensResult:
    """Counts the number of tokens in a given text.

    Args:
      contents: The contents to tokenize.

    Returns:
      A `CountTokensResult` containing the total number of tokens.
    """
    processed_contents = t.t_contents(contents)
    text_accumulator = _TextsAccumulator()
    config = types.CountTokensConfig.model_validate(config or {})
    text_accumulator.add_contents(processed_contents)
    if config.tools:
      text_accumulator.add_tools(config.tools)
    if config.generation_config and config.generation_config.response_schema:
      text_accumulator.add_schema(config.generation_config.response_schema)
    if config.system_instruction:
      text_accumulator.add_contents(t.t_contents([config.system_instruction]))
    tokens_list = self._tokenizer.encode(list(text_accumulator.get_texts()))
    return types.CountTokensResult(
        total_tokens=sum(len(tokens) for tokens in tokens_list)
    )

  @_common.experimental_warning(
      "The SDK's local tokenizer implementation is experimental and may change"
      " in the future. It only supports text based tokenization."
  )
  def compute_tokens(
      self,
      contents: Union[types.ContentListUnion, types.ContentListUnionDict],
  ) -> types.ComputeTokensResult:
    """Computes the tokens ids and string pieces in the input."""
    processed_contents = t.t_contents(contents)
    text_accumulator = _TextsAccumulator()
    for content in processed_contents:
      text_accumulator.add_content(content)
    tokens_protos = self._tokenizer.EncodeAsImmutableProto(
        text_accumulator.get_texts()
    )

    roles = []
    for content in processed_contents:
      if content.parts:
        for _ in content.parts:
          roles.append(content.role)

    token_infos = []
    for tokens_proto, role in zip(tokens_protos, roles):
      token_infos.append(
          types.TokensInfo(
              token_ids=[piece.id for piece in tokens_proto.pieces],
              tokens=[
                  _token_str_to_bytes(
                      piece.piece, self._model_proto.pieces[piece.id].type
                  )
                  for piece in tokens_proto.pieces
              ],
              role=role,
          )
      )
    return types.ComputeTokensResult(tokens_info=token_infos)
```
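Taken together, the new module gives the SDK an offline, text-only token counter. Below is a minimal usage sketch (not part of the diff): it assumes google-genai 1.32.0 is installed, that the model name maps to a published tokenizer via `_local_tokenizer_loader`, and that the tokenizer model can be fetched or is already cached locally:

```python
# Minimal sketch; the model name and tokenizer download behaviour are
# assumptions, not something this diff guarantees.
from google.genai import types
from google.genai.local_tokenizer import LocalTokenizer

tokenizer = LocalTokenizer(model_name="gemini-2.0-flash")

# Text-only token counting, done locally (no API call).
result = tokenizer.count_tokens("Why is the sky blue?")
print(result.total_tokens)

# Per the LIMITATIONS note, tools must be types.Tool / types.Schema objects.
weather_tool = types.Tool(
    function_declarations=[
        types.FunctionDeclaration(
            name="get_weather",
            description="Returns the weather for a city.",
            parameters=types.Schema(
                type=types.Type.OBJECT,
                properties={"city": types.Schema(type=types.Type.STRING)},
            ),
        )
    ]
)
with_tools = tokenizer.count_tokens(
    "What is the weather in Paris?",
    config=types.CountTokensConfig(tools=[weather_tool]),
)
print(with_tools.total_tokens)

# Token ids and byte pieces for each part of the input.
computed = tokenizer.compute_tokens("Why is the sky blue?")
for info in computed.tokens_info:
  print(info.role, info.token_ids, info.tokens)
```

`compute_tokens` returns one `TokensInfo` per content part, carrying the SentencePiece ids, the decoded byte pieces, and the originating role.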