flow_toon_format-0.9.0b2-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- flow_toon_format-0.9.0b2.dist-info/METADATA +200 -0
- flow_toon_format-0.9.0b2.dist-info/RECORD +24 -0
- flow_toon_format-0.9.0b2.dist-info/WHEEL +4 -0
- flow_toon_format-0.9.0b2.dist-info/entry_points.txt +2 -0
- flow_toon_format-0.9.0b2.dist-info/licenses/LICENSE +24 -0
- toon_format/__init__.py +40 -0
- toon_format/__main__.py +13 -0
- toon_format/_literal_utils.py +70 -0
- toon_format/_parsing_utils.py +167 -0
- toon_format/_scanner.py +289 -0
- toon_format/_string_utils.py +169 -0
- toon_format/_validation.py +150 -0
- toon_format/cli.py +217 -0
- toon_format/constants.py +84 -0
- toon_format/decoder.py +788 -0
- toon_format/encoder.py +56 -0
- toon_format/encoders.py +456 -0
- toon_format/logging_config.py +92 -0
- toon_format/normalize.py +237 -0
- toon_format/primitives.py +171 -0
- toon_format/py.typed +0 -0
- toon_format/types.py +64 -0
- toon_format/utils.py +187 -0
- toon_format/writer.py +53 -0
toon_format/normalize.py
ADDED
@@ -0,0 +1,237 @@
+# Copyright (c) 2025 TOON Format Organization
+# SPDX-License-Identifier: MIT
+"""Value normalization for TOON encoding.
+
+Converts Python-specific types to JSON-compatible values before encoding:
+- datetime/date → ISO 8601 strings
+- Decimal → float
+- tuple → list; set/frozenset → sorted lists
+- Infinity/NaN → null
+- Functions/callables → null
+- Negative zero → zero
+"""
+
+import math
+import sys
+from collections.abc import Mapping
+from datetime import date, datetime
+from decimal import Decimal
+from typing import Any
+
+# TypeGuard was added in Python 3.10, use typing_extensions for older versions
+if sys.version_info >= (3, 10):
+    from typing import TypeGuard
+else:
+    from typing_extensions import TypeGuard
+
+from .logging_config import get_logger
+from .types import JsonArray, JsonObject, JsonPrimitive, JsonValue
+
+# Module logger
+logger = get_logger(__name__)
+
+_MAX_SAFE_INTEGER = 2**53 - 1  # JavaScript Number.MAX_SAFE_INTEGER
+
+
+def normalize_value(value: Any) -> JsonValue:
+    """Normalize Python value to JSON-compatible type.
+
+    Converts Python-specific types to JSON-compatible equivalents:
+    - datetime objects → ISO 8601 strings
+    - sets → sorted lists
+    - Integers → passed through unchanged (Python ints have arbitrary precision)
+    - Non-finite floats (inf, -inf, NaN) → null
+    - Negative zero → positive zero
+    - Mapping types → dicts with string keys
+    - Unsupported types → null
+
+    Args:
+        value: Python value to normalize.
+
+    Returns:
+        JsonValue: Normalized value (None, bool, int, float, str, list, or dict).
+
+    Examples:
+        >>> normalize_value(datetime(2024, 1, 1))
+        '2024-01-01T00:00:00'
+
+        >>> normalize_value({1, 2, 3})
+        [1, 2, 3]
+
+        >>> normalize_value(float('inf'))
+        None
+
+        >>> normalize_value(2**60)  # Large integer
+        1152921504606846976
+
+    Note:
+        - Recursive: normalizes nested structures
+        - Sets are sorted for deterministic output
+        - Heterogeneous sets sorted by repr() if natural sorting fails
+    """
+    if value is None:
+        return None
+
+    if isinstance(value, bool):
+        return value
+    if isinstance(value, str):
+        return value
+
+    if isinstance(value, int):
+        # Python integers have arbitrary precision and are encoded directly
+        # Note: JavaScript BigInt types are converted to strings during normalization
+        # (per spec Section 3), but Python ints don't need this conversion
+        return value
+
+    if isinstance(value, float):
+        # Handle non-finite first
+        if not math.isfinite(value) or value != value:  # includes inf, -inf, NaN
+            logger.debug(f"Converting non-finite float to null: {value}")
+            return None
+        if value == 0.0 and math.copysign(1.0, value) == -1.0:
+            logger.debug("Converting negative zero to positive zero")
+            return 0
+        return value
+
+    # Handle Decimal
+    if isinstance(value, Decimal):
+        if not value.is_finite():
+            logger.debug(f"Converting non-finite Decimal to null: {value}")
+            return None
+        return float(value)
+
+    if isinstance(value, datetime):
+        try:
+            result = value.isoformat()
+            logger.debug(f"Converting datetime to ISO string: {value}")
+            return result
+        except Exception as e:
+            raise ValueError(f"Failed to convert datetime to ISO format: {e}") from e
+
+    if isinstance(value, date):
+        try:
+            result = value.isoformat()
+            logger.debug(f"Converting date to ISO string: {value}")
+            return result
+        except Exception as e:
+            raise ValueError(f"Failed to convert date to ISO format: {e}") from e
+
+    if isinstance(value, list):
+        if not value:
+            return []
+        return [normalize_value(item) for item in value]
+
+    if isinstance(value, tuple):
+        logger.debug(f"Converting tuple to list: {len(value)} items")
+        return [normalize_value(item) for item in value]
+
+    if isinstance(value, (set, frozenset)):
+        logger.debug(f"Converting {type(value).__name__} to sorted list: {len(value)} items")
+        try:
+            return [normalize_value(item) for item in sorted(value)]
+        except TypeError:
+            # Fall back to stable conversion for heterogeneous sets/frozensets
+            logger.debug(
+                f"{type(value).__name__} contains heterogeneous types, using repr() for sorting"
+            )
+            return [normalize_value(item) for item in sorted(value, key=lambda x: repr(x))]
+
+    # Handle generic mapping types (Map-like) and dicts
+    if isinstance(value, Mapping):
+        logger.debug(f"Converting {type(value).__name__} to dict: {len(value)} items")
+        try:
+            return {str(k): normalize_value(v) for k, v in value.items()}
+        except Exception as e:
+            raise ValueError(
+                f"Failed to convert mapping to dict: {e}. "
+                "Check that all keys can be converted to strings."
+            ) from e
+
+    # Handle callables -> null
+    if callable(value):
+        logger.debug(f"Converting callable {type(value).__name__} to null")
+        return None
+
+    # Fallback for other types
+    logger.warning(
+        f"Unsupported type {type(value).__name__}, converting to null. Value: {str(value)[:50]}"
+    )
+    return None
+
+
+def is_json_primitive(value: Any) -> TypeGuard[JsonPrimitive]:
+    """Check if value is a JSON primitive type.
+
+    Args:
+        value: Value to check.
+
+    Returns:
+        TypeGuard[JsonPrimitive]: True if value is None, str, int, float, or bool.
+    """
+    return value is None or isinstance(value, (str, int, float, bool))
+
+
+def is_json_array(value: Any) -> TypeGuard[JsonArray]:
+    """Check if value is a JSON array (Python list).
+
+    Args:
+        value: Value to check.
+
+    Returns:
+        TypeGuard[JsonArray]: True if value is a list.
+    """
+    return isinstance(value, list)
+
+
+def is_json_object(value: Any) -> TypeGuard[JsonObject]:
+    """Check if value is a JSON object (Python dict).
+
+    Args:
+        value: Value to check.
+
+    Returns:
+        TypeGuard[JsonObject]: True if value is a dict.
+    """
+    return isinstance(value, dict)
+
+
+def is_array_of_primitives(value: JsonArray) -> bool:
+    """Check if array contains only primitive values.
+
+    Args:
+        value: List to check.
+
+    Returns:
+        bool: True if all items are primitives. Empty arrays return True.
+    """
+    if not value:
+        return True
+    return all(is_json_primitive(item) for item in value)
+
+
+def is_array_of_arrays(value: JsonArray) -> bool:
+    """Check if array contains only arrays.
+
+    Args:
+        value: List to check.
+
+    Returns:
+        bool: True if all items are lists. Empty arrays return True.
+    """
+    if not value:
+        return True
+    return all(is_json_array(item) for item in value)
+
+
+def is_array_of_objects(value: JsonArray) -> bool:
+    """Check if array contains only objects.
+
+    Args:
+        value: List to check.
+
+    Returns:
+        bool: True if all items are dicts. Empty arrays return True.
+    """
+    if not value:
+        return True
+    return all(is_json_object(item) for item in value)
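
A minimal usage sketch for the module above, assuming the wheel is installed so that toon_format.normalize is importable; the expected results in the comments follow the docstrings and branches shown in the code.

    from datetime import datetime
    from decimal import Decimal

    from toon_format.normalize import is_array_of_objects, normalize_value

    record = {
        "created": datetime(2024, 1, 1),   # -> '2024-01-01T00:00:00'
        "price": Decimal("9.99"),          # -> 9.99 (float)
        "tags": {"beta", "alpha"},         # -> ['alpha', 'beta'] (sorted list)
        "ratio": float("inf"),             # -> None (non-finite)
        "callback": print,                 # -> None (callable)
    }
    print(normalize_value(record))

    # The type-guard helpers classify already-normalized values.
    print(is_array_of_objects([{"id": 1}, {"id": 2}]))  # True
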
toon_format/primitives.py
ADDED
@@ -0,0 +1,171 @@
+# Copyright (c) 2025 TOON Format Organization
+# SPDX-License-Identifier: MIT
+"""Primitive value encoding utilities.
+
+Handles encoding of primitive values (strings, numbers, booleans, null) and
+array headers. Implements quoting rules, escape sequences, and header formatting
+for inline and tabular array formats.
+"""
+
+import re
+from typing import List, Literal, Optional, Union
+
+from ._string_utils import escape_string
+from ._validation import is_safe_unquoted, is_valid_unquoted_key
+from .constants import (
+    CLOSE_BRACE,
+    CLOSE_BRACKET,
+    COLON,
+    COMMA,
+    CONTROL_CHARS_REGEX,
+    DOUBLE_QUOTE,
+    FALSE_LITERAL,
+    NULL_LITERAL,
+    NUMERIC_REGEX,
+    OCTAL_REGEX,
+    OPEN_BRACE,
+    OPEN_BRACKET,
+    STRUCTURAL_CHARS_REGEX,
+    TRUE_LITERAL,
+    VALID_KEY_REGEX,
+)
+from .logging_config import get_logger
+from .types import Delimiter, JsonPrimitive
+
+# Precompiled patterns for performance
+_STRUCTURAL_CHARS_PATTERN = re.compile(STRUCTURAL_CHARS_REGEX)
+_CONTROL_CHARS_PATTERN = re.compile(CONTROL_CHARS_REGEX)
+_NUMERIC_PATTERN = re.compile(NUMERIC_REGEX, re.IGNORECASE)
+_OCTAL_PATTERN = re.compile(OCTAL_REGEX)
+_VALID_KEY_PATTERN = re.compile(VALID_KEY_REGEX, re.IGNORECASE)
+
+
+logger = get_logger(__name__)
+
+
+def encode_primitive(value: JsonPrimitive, delimiter: str = COMMA) -> str:
+    """Encode a primitive value.
+
+    Args:
+        value: Primitive value
+        delimiter: Current delimiter being used
+
+    Returns:
+        Encoded string
+    """
+    if value is None:
+        return NULL_LITERAL
+    if isinstance(value, bool):
+        return TRUE_LITERAL if value else FALSE_LITERAL
+    if isinstance(value, (int, float)):
+        # Format numbers in decimal form without scientific notation
+        # Per spec Section 2: numbers must be rendered without exponent notation
+        if isinstance(value, int):
+            return str(value)
+        # For floats, use Python's default conversion first
+        formatted = str(value)
+        # Check if Python used scientific notation
+        if "e" in formatted or "E" in formatted:
+            # Convert to fixed-point decimal notation
+            # Use format with enough precision, then strip trailing zeros
+            from decimal import Decimal
+
+            # Convert through Decimal to get exact decimal representation
+            dec = Decimal(str(value))
+            formatted = format(dec, "f")
+        return formatted
+    if isinstance(value, str):
+        return encode_string_literal(value, delimiter)
+    return str(value)
+
+
+# Note: escape_string and is_safe_unquoted are now imported from _string_utils and _validation
+
+
+def encode_string_literal(value: str, delimiter: str = COMMA) -> str:
+    """Encode a string, quoting only if necessary.
+
+    Args:
+        value: String value
+        delimiter: Current delimiter being used
+
+    Returns:
+        Encoded string
+    """
+    if is_safe_unquoted(value, delimiter):
+        return value
+    return f"{DOUBLE_QUOTE}{escape_string(value)}{DOUBLE_QUOTE}"
+
+
+def encode_key(key: str) -> str:
+    """Encode an object key.
+
+    Args:
+        key: Key string
+
+    Returns:
+        Encoded key
+    """
+    # Keys matching /^[A-Z_][\w.]*$/i don't require quotes
+    if is_valid_unquoted_key(key):
+        return key
+    return f"{DOUBLE_QUOTE}{escape_string(key)}{DOUBLE_QUOTE}"
+
+
+def join_encoded_values(values: List[str], delimiter: Delimiter) -> str:
+    """Join encoded primitive values with a delimiter.
+
+    Args:
+        values: List of encoded values
+        delimiter: Delimiter to use
+
+    Returns:
+        Joined string
+    """
+    return delimiter.join(values)
+
+
+def format_header(
+    key: Optional[str],
+    length: int,
+    fields: Optional[List[str]],
+    delimiter: Delimiter,
+    length_marker: Union[str, Literal[False], None],
+) -> str:
+    """Format array/table header.
+
+    Args:
+        key: Optional key name
+        length: Array length
+        fields: Optional field names for tabular format
+        delimiter: Delimiter character
+        length_marker: Optional length marker prefix
+
+    Returns:
+        Formatted header string
+    """
+    # Build length marker
+    marker_prefix = length_marker if length_marker else ""
+
+    # Build fields if provided
+    fields_str = ""
+    if fields:
+        # Encode each field name as a key (may need quoting per Section 7.3)
+        encoded_fields = [encode_key(field) for field in fields]
+        fields_str = f"{OPEN_BRACE}{delimiter.join(encoded_fields)}{CLOSE_BRACE}"
+
+    # Build length string with delimiter when needed
+    # Rules per TOON spec: delimiter is optional in bracket [N<delim?>]
+    # - Only include delimiter if it's NOT comma (comma is the default)
+    # - This applies to both tabular and primitive arrays
+    if delimiter != COMMA:
+        # Non-comma delimiter: show delimiter in bracket
+        length_str = f"{OPEN_BRACKET}{marker_prefix}{length}{delimiter}{CLOSE_BRACKET}"
+    else:
+        # Comma delimiter (default): just [length]
+        length_str = f"{OPEN_BRACKET}{marker_prefix}{length}{CLOSE_BRACKET}"
+
+    # Combine parts
+    if key:
+        return f"{encode_key(key)}{length_str}{fields_str}{COLON}"
+    return f"{length_str}{fields_str}{COLON}"
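
A short sketch of how the encoders above fit together, assuming the constants imported from toon_format.constants hold the obvious characters (COMMA ",", OPEN_BRACKET "[", OPEN_BRACE "{", COLON ":", NULL_LITERAL "null"); the expected outputs below are inferred under that assumption rather than taken from the package.

    from toon_format.primitives import encode_primitive, format_header

    # Tabular array header: key, row count, field names, delimiter, no length marker.
    header = format_header(
        key="users",
        length=2,
        fields=["id", "name"],
        delimiter=",",
        length_marker=False,
    )
    print(header)  # expected: users[2]{id,name}:

    # Numbers are rendered without scientific notation; floats go through Decimal.
    print(encode_primitive(1.5e-07))  # expected: 0.00000015
    print(encode_primitive(None))     # expected: null
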
toon_format/py.typed
ADDED
File without changes
toon_format/types.py
ADDED
@@ -0,0 +1,64 @@
+# Copyright (c) 2025 TOON Format Organization
+# SPDX-License-Identifier: MIT
+"""Type definitions for TOON format.
+
+Defines type aliases and TypedDict classes for JSON values, encoding/decoding
+options, and internal types used throughout the package.
+"""
+
+from typing import Any, Dict, List, Literal, TypedDict, Union
+
+# JSON-compatible types
+JsonPrimitive = Union[str, int, float, bool, None]
+JsonObject = Dict[str, Any]
+JsonArray = List[Any]
+JsonValue = Union[JsonPrimitive, JsonArray, JsonObject]
+
+# Delimiter type
+Delimiter = str
+DelimiterKey = Literal["comma", "tab", "pipe"]
+
+
+class EncodeOptions(TypedDict, total=False):
+    """Options for TOON encoding.
+
+    Attributes:
+        indent: Number of spaces per indentation level (default: 2)
+        delimiter: Delimiter character for arrays (default: comma)
+        lengthMarker: Optional marker to prefix array lengths (default: False)
+    """
+
+    indent: int
+    delimiter: Delimiter
+    lengthMarker: Union[Literal["#"], Literal[False]]
+
+
+class ResolvedEncodeOptions:
+    """Resolved encoding options with defaults applied."""
+
+    def __init__(
+        self,
+        indent: int = 2,
+        delimiter: str = ",",
+        length_marker: Union[Literal["#"], Literal[False]] = False,
+    ) -> None:
+        self.indent = indent
+        self.delimiter = delimiter
+        self.lengthMarker: Union[str, Literal[False]] = length_marker
+
+
+class DecodeOptions:
+    """Options for TOON decoding.
+
+    Attributes:
+        indent: Number of spaces per indentation level (default: 2)
+        strict: Enable strict validation (default: True)
+    """
+
+    def __init__(self, indent: int = 2, strict: bool = True) -> None:
+        self.indent = indent
+        self.strict = strict
+
+
+# Depth type for tracking indentation level
+Depth = int
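
A small sketch of how the option types above fit together; the field names come directly from the class definitions, while the mapping from EncodeOptions keys to ResolvedEncodeOptions arguments is an assumption for illustration.

    from toon_format.types import DecodeOptions, EncodeOptions, ResolvedEncodeOptions

    # EncodeOptions is a total=False TypedDict, so any subset of keys is allowed.
    opts: EncodeOptions = {"indent": 4, "delimiter": "\t", "lengthMarker": "#"}

    # ResolvedEncodeOptions carries the same settings with defaults filled in.
    resolved = ResolvedEncodeOptions(
        indent=opts.get("indent", 2),
        delimiter=opts.get("delimiter", ","),
        length_marker=opts.get("lengthMarker", False),
    )
    print(resolved.indent, repr(resolved.delimiter), resolved.lengthMarker)  # 4 '\t' #

    decode_opts = DecodeOptions(strict=False)
    print(decode_opts.indent, decode_opts.strict)  # 2 False
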
toon_format/utils.py
ADDED
@@ -0,0 +1,187 @@
+# Copyright (c) 2025 TOON Format Organization
+# SPDX-License-Identifier: MIT
+"""Token analysis utilities for TOON format.
+
+This module provides utilities for counting tokens and comparing
+token efficiency between JSON and TOON formats. Useful for:
+- Estimating API costs (tokens are the primary cost driver)
+- Optimizing prompt sizes for LLM context windows
+- Benchmarking TOON's token efficiency
+
+Functions:
+    count_tokens: Count tokens in a text string
+    estimate_savings: Compare JSON vs TOON token counts
+    compare_formats: Generate formatted comparison table
+
+Requirements:
+    tiktoken: Install with `uv add tiktoken` or `uv add toon_format[benchmark]`
+
+Example:
+    >>> import toon_format
+    >>> data = {"name": "Alice", "age": 30}
+    >>> result = toon_format.estimate_savings(data)
+    >>> print(f"TOON saves {result['savings_percent']:.1f}% tokens")
+"""
+
+import functools
+import json
+from typing import Any, Dict
+
+# Import encode from parent package (defined in __init__.py before this module is imported)
+# __init__.py defines encode() before importing utils, so this is safe
+from . import encode
+
+__all__ = ["count_tokens", "estimate_savings", "compare_formats"]
+
+
+_TIKTOKEN_MISSING_MSG = (
+    "tiktoken is required for token counting. "
+    "Install with: uv add tiktoken or uv add toon_format[benchmark]"
+)
+
+
+def _require_tiktoken():
+    try:
+        import tiktoken  # type: ignore[import-not-found]
+    except ImportError as exc:  # pragma: no cover - exercised via count_tokens
+        raise RuntimeError(_TIKTOKEN_MISSING_MSG) from exc
+    return tiktoken
+
+
+@functools.lru_cache(maxsize=1)
+def _get_tokenizer():
+    """Get cached tiktoken tokenizer for o200k_base encoding.
+
+    Returns:
+        tiktoken.Encoding: The o200k_base tokenizer (gpt5/gpt5-mini).
+
+    Raises:
+        RuntimeError: If tiktoken is not installed.
+    """
+    tiktoken = _require_tiktoken()
+    return tiktoken.get_encoding("o200k_base")
+
+
+def count_tokens(text: str, encoding: str = "o200k_base") -> int:
+    """Count tokens in a text string using tiktoken.
+
+    Args:
+        text: The string to tokenize.
+        encoding: Tokenizer encoding name (default: 'o200k_base' for gpt5/gpt5-mini).
+            Other options include 'cl100k_base' (GPT-3.5), 'p50k_base' (older models).
+
+    Returns:
+        int: The number of tokens in the text.
+
+    Example:
+        >>> import toon_format
+        >>> text = "Hello, world!"
+        >>> toon_format.count_tokens(text)
+        4
+
+    Note:
+        Requires tiktoken to be installed: uv add tiktoken or uv add toon_format[benchmark]
+    """
+    if encoding == "o200k_base":
+        enc = _get_tokenizer()
+    else:
+        tiktoken = _require_tiktoken()
+        enc = tiktoken.get_encoding(encoding)
+
+    return len(enc.encode(text))
+
+
+def estimate_savings(data: Any, encoding: str = "o200k_base") -> Dict[str, Any]:
+    """Compare token counts between JSON and TOON formats.
+
+    Args:
+        data: Python dict or list to compare.
+        encoding: Tokenizer encoding name (default: 'o200k_base').
+
+    Returns:
+        dict: Dictionary containing:
+            - json_tokens (int): Token count for JSON format
+            - toon_tokens (int): Token count for TOON format
+            - savings (int): Absolute token savings (json_tokens - toon_tokens)
+            - savings_percent (float): Percentage savings
+
+    Example:
+        >>> import toon_format
+        >>> data = {"employees": [{"id": 1, "name": "Alice"}, {"id": 2, "name": "Bob"}]}
+        >>> result = toon_format.estimate_savings(data)
+        >>> print(f"Savings: {result['savings_percent']:.1f}%")
+        Savings: 42.3%
+
+    Note:
+        Significant savings are typically achieved with structured data,
+        especially arrays of uniform objects (tabular data).
+    """
+    # Encode as JSON
+    json_str = json.dumps(data, indent=2, ensure_ascii=False)
+    json_tokens = count_tokens(json_str, encoding)
+
+    # Encode as TOON
+    toon_str = encode(data)
+    toon_tokens = count_tokens(toon_str, encoding)
+
+    # Calculate savings
+    savings = max(0, json_tokens - toon_tokens)
+    savings_percent = (savings / json_tokens * 100.0) if json_tokens > 0 else 0.0
+
+    return {
+        "json_tokens": json_tokens,
+        "toon_tokens": toon_tokens,
+        "savings": savings,
+        "savings_percent": savings_percent,
+    }
+
+
+def compare_formats(data: Any, encoding: str = "o200k_base") -> str:
+    """Generate a formatted comparison table showing JSON vs TOON metrics.
+
+    Args:
+        data: Python dict or list to compare.
+        encoding: Tokenizer encoding name (default: 'o200k_base').
+
+    Returns:
+        str: Formatted table as multi-line string showing token counts,
+            character sizes, and savings percentage.
+
+    Example:
+        >>> import toon_format
+        >>> data = {"users": [{"id": 1, "name": "Alice"}]}
+        >>> print(toon_format.compare_formats(data))
+        Format Comparison
+        ────────────────────────────────────────────────
+        Format Tokens Size (chars)
+        JSON 1,234 5,678
+        TOON 789 3,456
+        ────────────────────────────────────────────────
+        Savings: 445 tokens (36.1%)
+
+    Note:
+        This is useful for quick visual comparison during development.
+    """
+    # Get token metrics
+    metrics = estimate_savings(data, encoding)
+
+    # Encode both formats to get character counts
+    json_str = json.dumps(data, indent=2, ensure_ascii=False)
+    toon_str = encode(data)
+
+    json_chars = len(json_str)
+    toon_chars = len(toon_str)
+
+    # Build formatted table
+    separator = "─" * 48
+    lines = [
+        "Format Comparison",
+        separator,
+        "Format Tokens Size (chars)",
+        f"JSON {metrics['json_tokens']:>7,} {json_chars:>11,}",
+        f"TOON {metrics['toon_tokens']:>7,} {toon_chars:>11,}",
+        separator,
+        f"Savings: {metrics['savings']:,} tokens ({metrics['savings_percent']:.1f}%)",
+    ]
+
+    return "\n".join(lines)
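
A usage sketch tying the helpers above together, following the module's own docstring examples. It assumes the package exposes these functions at the top level (as the docstrings do) and that the optional tiktoken dependency is installed; the printed numbers depend on the tokenizer and are not reproduced here.

    import toon_format

    data = {
        "users": [
            {"id": 1, "name": "Alice", "role": "admin"},
            {"id": 2, "name": "Bob", "role": "user"},
            {"id": 3, "name": "Carol", "role": "user"},
        ]
    }

    # Absolute and relative savings of TOON over pretty-printed JSON.
    result = toon_format.estimate_savings(data)
    print(f"JSON: {result['json_tokens']} tokens, TOON: {result['toon_tokens']} tokens")
    print(f"Saved {result['savings']} tokens ({result['savings_percent']:.1f}%)")

    # Ready-made comparison table for quick inspection during development.
    print(toon_format.compare_formats(data))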