agno-2.3.10-py3-none-any.whl → agno-2.3.12-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- agno/compression/manager.py +87 -16
- agno/db/base.py +5 -5
- agno/db/dynamo/dynamo.py +2 -2
- agno/db/firestore/firestore.py +2 -2
- agno/db/gcs_json/gcs_json_db.py +2 -2
- agno/db/in_memory/in_memory_db.py +2 -2
- agno/db/json/json_db.py +2 -2
- agno/db/mongo/async_mongo.py +170 -68
- agno/db/mongo/mongo.py +170 -76
- agno/db/mysql/async_mysql.py +93 -69
- agno/db/mysql/mysql.py +93 -68
- agno/db/postgres/async_postgres.py +104 -78
- agno/db/postgres/postgres.py +97 -69
- agno/db/redis/redis.py +2 -2
- agno/db/singlestore/singlestore.py +91 -66
- agno/db/sqlite/async_sqlite.py +101 -78
- agno/db/sqlite/sqlite.py +97 -69
- agno/db/surrealdb/surrealdb.py +2 -2
- agno/exceptions.py +1 -0
- agno/knowledge/chunking/fixed.py +4 -1
- agno/knowledge/knowledge.py +105 -24
- agno/knowledge/reader/csv_reader.py +2 -2
- agno/knowledge/reader/text_reader.py +15 -3
- agno/knowledge/reader/wikipedia_reader.py +33 -1
- agno/knowledge/utils.py +52 -7
- agno/memory/strategies/base.py +3 -4
- agno/models/anthropic/claude.py +44 -0
- agno/models/aws/bedrock.py +60 -0
- agno/models/base.py +124 -30
- agno/models/google/gemini.py +141 -23
- agno/models/litellm/chat.py +25 -0
- agno/models/openai/chat.py +21 -0
- agno/models/openai/responses.py +44 -0
- agno/os/routers/knowledge/knowledge.py +20 -9
- agno/run/agent.py +17 -0
- agno/run/requirement.py +89 -6
- agno/tracing/exporter.py +2 -2
- agno/utils/print_response/agent.py +4 -4
- agno/utils/print_response/team.py +12 -12
- agno/utils/tokens.py +643 -27
- agno/vectordb/base.py +15 -2
- agno/vectordb/chroma/chromadb.py +6 -2
- agno/vectordb/lancedb/lance_db.py +3 -37
- agno/vectordb/milvus/milvus.py +6 -32
- agno/vectordb/mongodb/mongodb.py +0 -27
- agno/vectordb/pgvector/pgvector.py +21 -11
- agno/vectordb/pineconedb/pineconedb.py +0 -17
- agno/vectordb/qdrant/qdrant.py +6 -29
- agno/vectordb/redis/redisdb.py +0 -26
- agno/vectordb/singlestore/singlestore.py +16 -8
- agno/vectordb/surrealdb/surrealdb.py +0 -36
- agno/vectordb/weaviate/weaviate.py +6 -2
- {agno-2.3.10.dist-info → agno-2.3.12.dist-info}/METADATA +4 -1
- {agno-2.3.10.dist-info → agno-2.3.12.dist-info}/RECORD +57 -57
- {agno-2.3.10.dist-info → agno-2.3.12.dist-info}/WHEEL +0 -0
- {agno-2.3.10.dist-info → agno-2.3.12.dist-info}/licenses/LICENSE +0 -0
- {agno-2.3.10.dist-info → agno-2.3.12.dist-info}/top_level.txt +0 -0
agno/utils/tokens.py
CHANGED
@@ -1,41 +1,657 @@
import json
import math
from functools import lru_cache
from pathlib import Path
from typing import Any, Dict, List, Optional, Sequence, Tuple, Type, Union

from pydantic import BaseModel

from agno.media import Audio, File, Image, Video
from agno.models.message import Message
from agno.tools.function import Function
from agno.utils.log import log_warning

# Default image dimensions used as fallback when actual dimensions cannot be determined.
# These values provide a more conservative estimate for high-detail image token counting.
DEFAULT_IMAGE_WIDTH = 1024
DEFAULT_IMAGE_HEIGHT = 1024


# Different models use different encodings
@lru_cache(maxsize=16)
def _get_tiktoken_encoding(model_id: str):
    model_id = model_id.lower()
    try:
        import tiktoken

        try:
            # Use model-specific encoding
            return tiktoken.encoding_for_model(model_id)
        except KeyError:
            return tiktoken.get_encoding("o200k_base")
    except ImportError:
        log_warning("tiktoken not installed. Please install it using `pip install tiktoken`.")
        return None


@lru_cache(maxsize=16)
def _get_hf_tokenizer(model_id: str):
    try:
        from tokenizers import Tokenizer

        model_id = model_id.lower()

        # Llama-3 models use a different tokenizer than Llama-2
        if "llama-3" in model_id or "llama3" in model_id:
            return Tokenizer.from_pretrained("Xenova/llama-3-tokenizer")

        # Llama-2 models and Replicate models (LiteLLM uses llama tokenizer for replicate)
        if "llama-2" in model_id or "llama2" in model_id or "replicate" in model_id:
            return Tokenizer.from_pretrained("hf-internal-testing/llama-tokenizer")

        # Cohere command-r models have their own tokenizer
        if "command-r" in model_id:
            return Tokenizer.from_pretrained("Xenova/c4ai-command-r-v01-tokenizer")

        return None
    except ImportError:
        log_warning("tokenizers not installed. Please install it using `pip install tokenizers`.")
        return None
    except Exception:
        return None


def _select_tokenizer(model_id: str) -> Tuple[str, Any]:
    # Priority 1: HuggingFace tokenizers for models with specific tokenizers
    hf_tokenizer = _get_hf_tokenizer(model_id)
    if hf_tokenizer is not None:
        return ("huggingface", hf_tokenizer)

    # Priority 2: tiktoken for OpenAI models
    tiktoken_enc = _get_tiktoken_encoding(model_id)
    if tiktoken_enc is not None:
        return ("tiktoken", tiktoken_enc)

    # Fallback: No tokenizer available, will use character-based estimation
    return ("none", None)
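For orientation, here is a minimal sketch (not part of the diff) of how the selection above resolves for a few model ids, assuming the optional tiktoken and tokenizers packages are installed and the Hugging Face tokenizer files can be downloaded:

# Illustrative sketch only; results depend on which optional tokenizer packages are available.
_select_tokenizer("gpt-4o")           # ("tiktoken", <encoding from tiktoken.encoding_for_model>)
_select_tokenizer("claude-sonnet-4")  # ("tiktoken", <o200k_base fallback encoding>)
_select_tokenizer("llama-3-8b")       # ("huggingface", <Xenova/llama-3-tokenizer>)
_select_tokenizer("llama-2-7b")       # ("huggingface", <hf-internal-testing/llama-tokenizer>)
# With neither package installed, every model id resolves to ("none", None).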
# =============================================================================
# Tool Token Counting
# =============================================================================
# OpenAI counts tool/function tokens by converting them to a TypeScript-like
# namespace format. This approach was reverse-engineered and documented from:
# https://github.com/forestwanglin/openai-java/blob/main/jtokkit/src/main/java/xyz/felh/openai/jtokkit/utils/TikTokenUtils.java
#
# The formatted output looks like:
# namespace functions {
# // {description}
# type {name} = (_: {
# // {param_description}
# {param_name}{?}: {type},
# }) => any;
# } // namespace functions
# =============================================================================


# OpenAI internally represents function/tool definitions in a TypeScript-like format for tokenization
def _format_function_definitions(tools: List[Dict[str, Any]]) -> str:
    """
    Formats tool definitions as a TypeScript namespace.

    Returns:
        A TypeScript namespace string representation of all tools.

    Example:
        Input tool: {"function": {"name": "get_weather", "parameters": {...}}}
        Output: "namespace functions {\ntype get_weather = (_: {...}) => any;\n}"
    """
    lines = []
    lines.append("namespace functions {")
    lines.append("")

    for tool in tools:
        # Handle both {"function": {...}} and direct function dict formats
        function = tool.get("function", tool)
        if function_description := function.get("description"):
            lines.append(f"// {function_description}")

        function_name = function.get("name", "")
        parameters = function.get("parameters", {})
        properties = parameters.get("properties", {})

        if properties:
            lines.append(f"type {function_name} = (_: {{")
            lines.append(_format_object_parameters(parameters, 0))
            lines.append("}) => any;")
        else:
            # Functions with no parameters
            lines.append(f"type {function_name} = () => any;")
        lines.append("")

    lines.append("} // namespace functions")
    return "\n".join(lines)


def _format_object_parameters(parameters: Dict[str, Any], indent: int) -> str:
    """
    Format JSON Schema object properties as TypeScript object properties.

    Args:
        parameters: A JSON Schema object with 'properties' and optional 'required' keys.
        indent: Number of spaces for indentation.

    Returns:
        TypeScript property definitions, one per line.

    Example:
        Input: {"properties": {"name": {"type": "string"}}, "required": ["name"]}
        Output: "name: string,"
    """
    properties = parameters.get("properties", {})
    if not properties:
        return ""

    required_params = parameters.get("required", [])
    lines = []

    for key, props in properties.items():
        # Add property description as a comment
        description = props.get("description")
        if description:
            lines.append(f"// {description}")

        # Required params have no "?", optional params have "?"
        question = "" if required_params and key in required_params else "?"
        lines.append(f"{key}{question}: {_format_type(props, indent)},")

    return "\n".join([" " * max(0, indent) + line for line in lines])


def _format_type(props: Dict[str, Any], indent: int) -> str:
    """
    Convert a JSON Schema type to its TypeScript equivalent.

    Recursively handles nested types including arrays and objects.

    Args:
        props: A JSON Schema property definition containing 'type' and optionally
            'enum', 'items' (for arrays), or 'properties' (for objects).
        indent: The current indentation level for nested object formatting.

    Returns:
        A TypeScript type string.

    Example:
        - {"type": "string"} -> "string"
        - {"type": "string", "enum": ["low", "high"]} -> '"low" | "high"'
        - {"type": "array", "items": {"type": "number"}} -> "number[]"
    """
    type_name = props.get("type", "any")

    if type_name == "string":
        if "enum" in props:
            # Convert enum to TypeScript union of string literals
            return " | ".join([f'"{item}"' for item in props["enum"]])
        return "string"
    elif type_name == "array":
        # Recursively format the array item type
        items = props.get("items", {})
        return f"{_format_type(items, indent)}[]"
    elif type_name == "object":
        # Recursively format nested object properties
        return f"{{\n{_format_object_parameters(props, indent + 2)}\n}}"
    elif type_name in ["integer", "number"]:
        if "enum" in props:
            return " | ".join([f'"{item}"' for item in props["enum"]])
        return "number"
    elif type_name == "boolean":
        return "boolean"
    elif type_name == "null":
        return "null"
    else:
        # Default to "any" for unknown types
        return "any"
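To make the namespace format concrete, here is a hedged sketch (not part of the diff) that runs a hypothetical weather tool through _format_function_definitions; the expected output follows from the code above:

# Illustrative sketch only; the tool definition is hypothetical.
weather_tool = {
    "function": {
        "name": "get_weather",
        "description": "Get the current weather",
        "parameters": {
            "type": "object",
            "properties": {
                "city": {"type": "string", "description": "City name"},
                "unit": {"type": "string", "enum": ["celsius", "fahrenheit"]},
            },
            "required": ["city"],
        },
    }
}
print(_format_function_definitions([weather_tool]))
# namespace functions {
#
# // Get the current weather
# type get_weather = (_: {
# // City name
# city: string,
# unit?: "celsius" | "fahrenheit",
# }) => any;
#
# } // namespace functions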
217
|
+
# =============================================================================
|
|
218
|
+
# Multi-modal Token Counting
|
|
219
|
+
# =============================================================================
|
|
220
|
+
# Image dimension parsing uses magic byte detection to identify file formats
|
|
221
|
+
# without relying on external libraries. This allows efficient header-only reads.
|
|
222
|
+
# =============================================================================
|
|
223
|
+
|
|
224
|
+
|
|
225
|
+
def _get_image_type(data: bytes) -> Optional[str]:
|
|
226
|
+
"""Returns the image format from magic bytes in the file header."""
|
|
227
|
+
if len(data) < 12:
|
|
228
|
+
return None
|
|
229
|
+
# PNG: 8-byte signature
|
|
230
|
+
if data[0:8] == b"\x89\x50\x4e\x47\x0d\x0a\x1a\x0a":
|
|
231
|
+
return "png"
|
|
232
|
+
# GIF: "GIF8" followed by "9a" or "7a" (we check for 'a')
|
|
233
|
+
if data[0:4] == b"GIF8" and data[5:6] == b"a":
|
|
234
|
+
return "gif"
|
|
235
|
+
# JPEG: SOI marker (Start of Image)
|
|
236
|
+
if data[0:3] == b"\xff\xd8\xff":
|
|
237
|
+
return "jpeg"
|
|
238
|
+
# HEIC/HEIF: ftyp box at offset 4
|
|
239
|
+
if data[4:8] == b"ftyp":
|
|
240
|
+
return "heic"
|
|
241
|
+
# WebP: RIFF container with WEBP identifier
|
|
242
|
+
if data[0:4] == b"RIFF" and data[8:12] == b"WEBP":
|
|
243
|
+
return "webp"
|
|
244
|
+
return None
|
|
245
|
+
|
|
246
|
+
|
|
247
|
+
def _parse_image_dimensions_from_bytes(data: bytes, img_type: Optional[str] = None) -> Tuple[int, int]:
|
|
248
|
+
"""Returns the image dimensions (width, height) from raw image bytes."""
|
|
249
|
+
import io
|
|
250
|
+
import struct
|
|
251
|
+
|
|
252
|
+
if img_type is None:
|
|
253
|
+
img_type = _get_image_type(data)
|
|
254
|
+
|
|
255
|
+
if img_type == "png":
|
|
256
|
+
# PNG IHDR chunk: width at offset 16, height at offset 20 (big-endian)
|
|
257
|
+
return struct.unpack(">LL", data[16:24])
|
|
258
|
+
elif img_type == "gif":
|
|
259
|
+
# GIF logical screen descriptor: width/height at offset 6 (little-endian)
|
|
260
|
+
return struct.unpack("<HH", data[6:10])
|
|
261
|
+
elif img_type == "jpeg":
|
|
262
|
+
# JPEG requires scanning for SOF (Start of Frame) markers
|
|
263
|
+
# SOF markers are 0xC0-0xCF, excluding 0xC4 (DHT), 0xC8 (JPG), 0xCC (DAC)
|
|
264
|
+
with io.BytesIO(data) as f:
|
|
265
|
+
f.seek(0)
|
|
266
|
+
size = 2
|
|
267
|
+
ftype = 0
|
|
268
|
+
while not 0xC0 <= ftype <= 0xCF or ftype in (0xC4, 0xC8, 0xCC):
|
|
269
|
+
f.seek(size, 1)
|
|
270
|
+
byte = f.read(1)
|
|
271
|
+
# Skip any padding 0xFF bytes
|
|
272
|
+
while ord(byte) == 0xFF:
|
|
273
|
+
byte = f.read(1)
|
|
274
|
+
ftype = ord(byte)
|
|
275
|
+
size = struct.unpack(">H", f.read(2))[0] - 2
|
|
276
|
+
f.seek(1, 1) # Skip precision byte
|
|
277
|
+
h, w = struct.unpack(">HH", f.read(4))
|
|
278
|
+
return w, h
|
|
279
|
+
elif img_type == "webp":
|
|
280
|
+
# WebP has three encoding formats with different dimension locations
|
|
281
|
+
if data[12:16] == b"VP8X":
|
|
282
|
+
# Extended format: 24-bit dimensions stored in 3 bytes each
|
|
283
|
+
w = struct.unpack("<I", data[24:27] + b"\x00")[0] + 1
|
|
284
|
+
h = struct.unpack("<I", data[27:30] + b"\x00")[0] + 1
|
|
285
|
+
return w, h
|
|
286
|
+
elif data[12:16] == b"VP8 ":
|
|
287
|
+
# Lossy format: dimensions in first frame header, 14-bit masked
|
|
288
|
+
w = struct.unpack("<H", data[26:28])[0] & 0x3FFF
|
|
289
|
+
h = struct.unpack("<H", data[28:30])[0] & 0x3FFF
|
|
290
|
+
return w, h
|
|
291
|
+
elif data[12:16] == b"VP8L":
|
|
292
|
+
# Lossless format: dimensions bit-packed in 4 bytes
|
|
293
|
+
bits = struct.unpack("<I", data[21:25])[0]
|
|
294
|
+
w = (bits & 0x3FFF) + 1
|
|
295
|
+
h = ((bits >> 14) & 0x3FFF) + 1
|
|
296
|
+
return w, h
|
|
297
|
+
|
|
298
|
+
return DEFAULT_IMAGE_WIDTH, DEFAULT_IMAGE_HEIGHT
|
|
299
|
+
|
|
300
|
+
|
|
301
|
+
def _get_image_dimensions(image: Image) -> Tuple[int, int]:
|
|
302
|
+
"""Returns the image dimensions (width, height) from an Image object."""
|
|
22
303
|
try:
|
|
23
|
-
|
|
304
|
+
# Try to get format hint from metadata to skip magic byte detection
|
|
305
|
+
img_format = image.format
|
|
306
|
+
if not img_format and image.mime_type:
|
|
307
|
+
img_format = image.mime_type.split("/")[-1] if "/" in image.mime_type else None
|
|
24
308
|
|
|
25
|
-
#
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
309
|
+
# Get raw bytes from the appropriate source
|
|
310
|
+
if image.content:
|
|
311
|
+
data = image.content
|
|
312
|
+
elif image.filepath:
|
|
313
|
+
with open(image.filepath, "rb") as f:
|
|
314
|
+
data = f.read(100) # Only need header bytes for dimension parsing
|
|
315
|
+
elif image.url:
|
|
316
|
+
import httpx
|
|
31
317
|
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
318
|
+
response = httpx.get(image.url, timeout=5)
|
|
319
|
+
data = response.content
|
|
320
|
+
else:
|
|
321
|
+
return DEFAULT_IMAGE_WIDTH, DEFAULT_IMAGE_HEIGHT
|
|
322
|
+
|
|
323
|
+
return _parse_image_dimensions_from_bytes(data, img_format)
|
|
324
|
+
except Exception:
|
|
325
|
+
return DEFAULT_IMAGE_WIDTH, DEFAULT_IMAGE_HEIGHT
|
|
39
326
|
|
|
40
|
-
|
|
327
|
+
|
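As a sanity check on the magic-byte path, a small sketch (not part of the diff) builds a synthetic PNG header and reads its dimensions back; only the standard PNG signature and IHDR layout are assumed here:

# Illustrative sketch only: a hand-built 800x600 PNG header (signature + IHDR chunk prefix).
header = (
    b"\x89PNG\r\n\x1a\n"             # 8-byte PNG signature
    + b"\x00\x00\x00\x0d" + b"IHDR"  # IHDR chunk length and type
    + (800).to_bytes(4, "big")       # width at offset 16
    + (600).to_bytes(4, "big")       # height at offset 20
)
_get_image_type(header)                      # "png"
_parse_image_dimensions_from_bytes(header)   # (800, 600)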
def count_file_tokens(file: File) -> int:
    """Estimate the number of tokens in a file based on its size and type."""
    # Determine file size from available source
    size = 0
    if file.content and isinstance(file.content, (str, bytes)):
        size = len(file.content)
    elif file.filepath:
        try:
            path = Path(file.filepath) if isinstance(file.filepath, str) else file.filepath
            if path.exists():
                size = path.stat().st_size
        except Exception:
            pass
    elif file.url:
        # Use HEAD request to get Content-Length without downloading
        try:
            import urllib.request

            req = urllib.request.Request(file.url, method="HEAD")
            with urllib.request.urlopen(req, timeout=5) as response:
                content_length = response.headers.get("Content-Length")
                if content_length:
                    size = int(content_length)
        except Exception:
            pass

    if size == 0:
        return 0

    # Determine file extension for type-based estimation
    ext = None
    if file.format:
        ext = file.format.lower().lstrip(".")
    elif file.filepath:
        path = Path(file.filepath) if isinstance(file.filepath, str) else file.filepath
        ext = path.suffix.lower().lstrip(".") if path.suffix else None
    elif file.url:
        url_path = file.url.split("?")[0]
        if "." in url_path:
            ext = url_path.rsplit(".", 1)[-1].lower()

    # Text files: ~4 characters per token (based on typical tiktoken ratios)
    if ext in {"txt", "csv", "md", "json", "xml", "html"}:
        return size // 4
    # Binary/other files: ~40 bytes per token (rough estimate)
    return size // 40
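A quick sketch (not part of the diff) of what the size heuristic yields; the file paths and constructor call are hypothetical, but the arithmetic mirrors count_file_tokens:

# Illustrative sketch only; File(...) construction and paths are hypothetical.
report_csv = File(filepath="data/report.csv")  # suppose 8 KB on disk -> 8192 // 4 = 2048 tokens
report_pdf = File(filepath="data/report.pdf")  # suppose 1 MB on disk -> 1048576 // 40 = 26214 tokens
count_file_tokens(report_csv)
count_file_tokens(report_pdf)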
def count_tool_tokens(
    tools: Sequence[Union[Function, Dict[str, Any]]],
    model_id: str = "gpt-4o",
) -> int:
    """Count tokens consumed by tool/function definitions"""
    if not tools:
        return 0

    # Convert Function objects to dict format for formatting
    tool_dicts = []
    for tool in tools:
        if isinstance(tool, Function):
            tool_dicts.append(tool.to_dict())
        else:
            tool_dicts.append(tool)

    # Format tools in TypeScript namespace format and count tokens
    formatted = _format_function_definitions(tool_dicts)
    tokens = count_text_tokens(formatted, model_id)
    return tokens


def count_schema_tokens(
    output_schema: Optional[Union[Dict, Type["BaseModel"]]],
    model_id: str = "gpt-4o",
) -> int:
    """Estimate tokens for output_schema/output_schema."""
    if output_schema is None:
        return 0

    try:
        from pydantic import BaseModel

        if isinstance(output_schema, type) and issubclass(output_schema, BaseModel):
            # Convert Pydantic model to JSON schema
            schema = output_schema.model_json_schema()
        elif isinstance(output_schema, dict):
            schema = output_schema
        else:
            return 0

        schema_json = json.dumps(schema)
        return count_text_tokens(schema_json, model_id)
    except Exception:
        return 0


def count_text_tokens(text: str, model_id: str = "gpt-4o") -> int:
    if not text:
        return 0
    tokenizer_type, tokenizer = _select_tokenizer(model_id)
    if tokenizer_type == "huggingface":
        return len(tokenizer.encode(text).ids)
    elif tokenizer_type == "tiktoken":
        # disallowed_special=() allows all special tokens to be encoded
        return len(tokenizer.encode(text, disallowed_special=()))
    else:
        # Fallback: ~4 characters per token (typical for English text)
        return len(text) // 4
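A hedged usage sketch (not part of the diff) of the text and schema counters; the exact counts depend on the tokenizer that ends up selected, and the Pydantic model here is hypothetical:

# Illustrative sketch only; counts vary with the installed tokenizer.
from pydantic import BaseModel

class WeatherReport(BaseModel):
    city: str
    temperature_c: float

count_text_tokens("Hello, world!", model_id="gpt-4o")       # counted with tiktoken if available, else len // 4
count_schema_tokens(WeatherReport, model_id="gpt-4o")       # tokens of json.dumps(WeatherReport.model_json_schema())
count_schema_tokens({"type": "object"}, model_id="gpt-4o")  # plain dict schemas are counted as-is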
# =============================================================================
# Image Token Counting
# =============================================================================
# OpenAI's vision models process images by dividing them into 512x512 tiles.
# The token count depends on the image dimensions and detail level.
# OpenAI's image token formula:
# 1. If max(width, height) > 2000: scale to fit in 2000px on longest side
# 2. If min(width, height) > 768: scale so shortest side is 768px
# 3. tiles = ceil(width/512) * ceil(height/512)
# 4. tokens = 85 + (170 * tiles)

# Token constants:
# - 85: Base tokens for any image (covers metadata, low-detail representation)
# - 170: Additional tokens per 512x512 tile (high-detail tile encoding)

# Detail modes:
# - "low": Fixed 85 tokens (thumbnail/overview only)
# - "high"/"auto": Full tile-based calculation


# Example:
# 1024x1024 image with high detail:
# - No scaling needed (within limits)
# - tiles = ceil(1024/512) * ceil(1024/512) = 2 * 2 = 4
# - tokens = 85 + (170 * 4) = 765
# =============================================================================
def count_image_tokens(image: Image) -> int:
    width, height = _get_image_dimensions(image)
    detail = image.detail or "auto"

    if width <= 0 or height <= 0:
        return 0

    # Low detail: fixed 85 tokens regardless of dimensions
    if detail == "low":
        return 85

    # For auto/high detail, calculate based on dimensions
    # Step 1: Scale down if longest side exceeds 2000px
    if max(width, height) > 2000:
        scale = 2000 / max(width, height)
        width, height = int(width * scale), int(height * scale)

    # Step 2: Scale down if shortest side exceeds 768px
    if min(width, height) > 768:
        scale = 768 / min(width, height)
        width, height = int(width * scale), int(height * scale)

    # Step 3: Calculate tiles (512x512 each)
    tiles = math.ceil(width / 512) * math.ceil(height / 512)

    # Step 4: 85 base tokens + 170 tokens per tile
    return 85 + (170 * tiles)
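One more worked case (a sketch, not from the diff) that exercises both rescaling steps of the formula above, for a 2048x4096 image at high detail:

import math

# Illustrative arithmetic only, mirroring count_image_tokens step by step for 2048 x 4096.
w, h = 1000, 2000                                # step 1: 4096 > 2000, scaled by 2000/4096
w, h = 768, 1536                                 # step 2: 1000 > 768, scaled by 768/1000
tiles = math.ceil(w / 512) * math.ceil(h / 512)  # 2 * 3 = 6
tokens = 85 + 170 * tiles                        # 1105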
# =============================================================================
# Audio Token Counting
# =============================================================================
# This is an Agno-specific implementation using a conservative estimate of 25 tokens per second of audio.
# OpenAI's Whisper model actually uses ~50 tokens/second (20ms per token), but this estimate is more conservative for context window planning.
# Example:
# 10 seconds of audio: 10 * 25 = 250 tokens


def count_audio_tokens(audio: Audio) -> int:
    """Estimate the number of tokens for an audio clip based on duration."""
    duration = audio.duration or 0
    if duration <= 0:
        return 0
    return int(duration * 25)


# =============================================================================
# Video Token Counting
# =============================================================================
# This is an Agno-specific implementation that treats video as a sequence of
# images, applying the OpenAI image token formula to each frame.
# Example:
# 5 second video at 1 fps with 512x512 resolution:
# - tiles = 1 (512/512 = 1)
# - tokens_per_frame = 85 + 170 = 255
# - num_frames = 5
# - total = 255 * 5 = 1275 tokens
# =============================================================================


def count_video_tokens(video: Video) -> int:
    duration = video.duration or 0
    if duration <= 0:
        return 0

    # Use defaults if dimensions/fps not specified
    width = video.width or 512
    height = video.height or 512
    fps = video.fps or 1.0

    # Calculate tokens per frame using the same formula as images (high detail)
    w, h = width, height
    # Scale down if longest side exceeds 2000px
    if max(w, h) > 2000:
        scale = 2000 / max(w, h)
        w, h = int(w * scale), int(h * scale)
    # Scale down if shortest side exceeds 768px
    if min(w, h) > 768:
        scale = 768 / min(w, h)
        w, h = int(w * scale), int(h * scale)
    tiles = math.ceil(w / 512) * math.ceil(h / 512)
    tokens_per_frame = 85 + (170 * tiles)

    # Calculate total tokens for all frames
    num_frames = max(int(duration * fps), 1)
    return num_frames * tokens_per_frame
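A quick sanity check (not part of the diff) of the media estimators; the Audio/Video construction shown is hypothetical, but the attribute names match the functions above:

# Illustrative sketch only; constructor arguments are hypothetical.
clip = Audio(content=b"", duration=60)            # 60 s of audio -> 60 * 25 = 1500 tokens
movie = Video(content=b"", duration=10, fps=1.0)  # 10 frames at the 512x512 default -> 10 * 255 = 2550 tokens
count_audio_tokens(clip)
count_video_tokens(movie)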
def _count_media_tokens(message: Message) -> int:
    tokens = 0

    if message.images:
        for image in message.images:
            tokens += count_image_tokens(image)

    if message.audio:
        for audio in message.audio:
            tokens += count_audio_tokens(audio)

    if message.videos:
        for video in message.videos:
            tokens += count_video_tokens(video)

    if message.files:
        for file in message.files:
            tokens += count_file_tokens(file)

    return tokens


def _count_message_tokens(message: Message, model_id: str = "gpt-4o") -> int:
    tokens = 0
    text_parts: List[str] = []

    # Collect content text
    content = message.get_content(use_compressed_content=True)
    if content:
        if isinstance(content, str):
            text_parts.append(content)
        elif isinstance(content, list):
            # Handle multimodal content blocks
            for item in content:
                if isinstance(item, str):
                    text_parts.append(item)
                elif isinstance(item, dict):
                    item_type = item.get("type", "")
                    if item_type == "text":
                        text_parts.append(item.get("text", ""))
                    elif item_type == "image_url":
                        # Handle OpenAI-style content lists without populating message.images
                        image_url_data = item.get("image_url", {})
                        url = image_url_data.get("url") if isinstance(image_url_data, dict) else None
                        detail = image_url_data.get("detail", "auto") if isinstance(image_url_data, dict) else "auto"

                        temp_image = Image(url=url, detail=detail)
                        tokens += count_image_tokens(temp_image)
                    else:
                        text_parts.append(json.dumps(item))
        else:
            text_parts.append(str(content))

    # Collect tool call arguments
    if message.tool_calls:
        for tool_call in message.tool_calls:
            if isinstance(tool_call, dict) and "function" in tool_call:
                args = tool_call["function"].get("arguments", "")
                text_parts.append(str(args))

    # Collect tool response id
    if message.tool_call_id:
        text_parts.append(message.tool_call_id)

    # Collect reasoning content
    if message.reasoning_content:
        text_parts.append(message.reasoning_content)
    if message.redacted_reasoning_content:
        text_parts.append(message.redacted_reasoning_content)

    # Collect name field
    if message.name:
        text_parts.append(message.name)

    # Count all text tokens in a single call
    if text_parts:
        tokens += count_text_tokens(" ".join(text_parts), model_id)

    # Count all media attachments
    tokens += _count_media_tokens(message)

    return tokens


def count_tokens(
    messages: List[Message],
    tools: Optional[List[Union[Function, Dict[str, Any]]]] = None,
    model_id: str = "gpt-4o",
    output_schema: Optional[Union[Dict, Type["BaseModel"]]] = None,
) -> int:
    total = 0
    model_id = model_id.lower()

    # Count message tokens
    if messages:
        for msg in messages:
            total += _count_message_tokens(msg, model_id)

    # Add tool tokens
    if tools:
        total += count_tool_tokens(tools, model_id)

    # Add output_schema/output_schema tokens
    if output_schema is not None:
        total += count_schema_tokens(output_schema, model_id)

    return total
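Putting the pieces together, a hedged end-to-end sketch (not part of the diff) of the new top-level entry point; the Message construction is simplified and reuses the hypothetical tool and schema from the earlier sketches:

# Illustrative sketch only; exact totals depend on the selected tokenizer.
from agno.models.message import Message

messages = [
    Message(role="system", content="You are a helpful assistant."),
    Message(role="user", content="Summarize the attached report."),
]
total = count_tokens(
    messages,
    tools=[weather_tool],         # hypothetical tool dict from the tool-counting sketch
    model_id="gpt-4o",
    output_schema=WeatherReport,  # hypothetical Pydantic model from the schema sketch
)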