flow_toon_format-0.9.0b2-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- flow_toon_format-0.9.0b2.dist-info/METADATA +200 -0
- flow_toon_format-0.9.0b2.dist-info/RECORD +24 -0
- flow_toon_format-0.9.0b2.dist-info/WHEEL +4 -0
- flow_toon_format-0.9.0b2.dist-info/entry_points.txt +2 -0
- flow_toon_format-0.9.0b2.dist-info/licenses/LICENSE +24 -0
- toon_format/__init__.py +40 -0
- toon_format/__main__.py +13 -0
- toon_format/_literal_utils.py +70 -0
- toon_format/_parsing_utils.py +167 -0
- toon_format/_scanner.py +289 -0
- toon_format/_string_utils.py +169 -0
- toon_format/_validation.py +150 -0
- toon_format/cli.py +217 -0
- toon_format/constants.py +84 -0
- toon_format/decoder.py +788 -0
- toon_format/encoder.py +56 -0
- toon_format/encoders.py +456 -0
- toon_format/logging_config.py +92 -0
- toon_format/normalize.py +237 -0
- toon_format/primitives.py +171 -0
- toon_format/py.typed +0 -0
- toon_format/types.py +64 -0
- toon_format/utils.py +187 -0
- toon_format/writer.py +53 -0
flow_toon_format-0.9.0b2.dist-info/METADATA
ADDED
@@ -0,0 +1,200 @@
Metadata-Version: 2.4
Name: flow_toon_format
Version: 0.9.0b2
Summary: A compact, human-readable serialization format designed for passing structured data to Large Language Models with significantly reduced token usage
Project-URL: Homepage, https://github.com/toon-format/toon-python
Project-URL: Repository, https://github.com/toon-format/toon-python
Project-URL: Documentation, https://github.com/toon-format/spec
Project-URL: Bug Tracker, https://github.com/toon-format/toon-python/issues
Author-email: Johann Schopplich <hello@johannschopplich.com>
License: MIT
License-File: LICENSE
Keywords: data-format,llm,serialization,token-efficient,toon
Classifier: Development Status :: 4 - Beta
Classifier: Intended Audience :: Developers
Classifier: License :: OSI Approved :: MIT License
Classifier: Programming Language :: Python :: 3
Classifier: Programming Language :: Python :: 3.8
Classifier: Programming Language :: Python :: 3.9
Classifier: Programming Language :: Python :: 3.10
Classifier: Programming Language :: Python :: 3.11
Classifier: Programming Language :: Python :: 3.12
Classifier: Programming Language :: Python :: 3.13
Classifier: Programming Language :: Python :: 3.14
Classifier: Topic :: Software Development :: Libraries :: Python Modules
Requires-Python: >=3.8
Requires-Dist: typing-extensions>=4.0.0; python_version < '3.10'
Description-Content-Type: text/markdown

# TOON Format for Python

[CI](https://github.com/toon-format/toon-python/actions)
[PyPI](https://pypi.org/project/toon_format/)

> **⚠️ Beta Status (v0.9.x):** This library is in active development and is working towards spec compliance. A beta is published to PyPI. The API may change before the 1.0.0 release.

Compact, human-readable serialization format for LLM contexts with **30-60% token reduction** vs JSON. It combines YAML-like indentation with CSV-like tabular arrays and is working towards full compatibility with the [official TOON specification](https://github.com/toon-format/spec).

**Key Features:** Minimal syntax • Tabular arrays for uniform data • Array length validation • Python 3.8+ • Comprehensive test coverage.

```bash
# Beta is published to PyPI; to install from source:
git clone https://github.com/toon-format/toon-python.git
cd toon-python
uv sync

# Or install directly from GitHub:
pip install git+https://github.com/toon-format/toon-python.git
```

## Quick Start

```python
from toon_format import encode, decode

# Simple object
encode({"name": "Alice", "age": 30})
# name: Alice
# age: 30

# Tabular array (uniform objects)
encode([{"id": 1, "name": "Alice"}, {"id": 2, "name": "Bob"}])
# [2,]{id,name}:
# 1,Alice
# 2,Bob

# Decode back to Python
decode("items[2]: apple,banana")
# {'items': ['apple', 'banana']}
```

## CLI Usage

```bash
# Auto-detect format by extension
toon input.json -o output.toon   # Encode
toon data.toon -o output.json    # Decode
echo '{"x": 1}' | toon -         # Stdin/stdout

# Options
toon data.json --encode --delimiter "\t" --length-marker
toon data.toon --decode --no-strict --indent 4
```

**Options:** `-e/--encode` `-d/--decode` `-o/--output` `--delimiter` `--indent` `--length-marker` `--no-strict`

## API Reference

### `encode(value, options=None)` → `str`

```python
encode({"id": 123}, {"delimiter": "\t", "indent": 4, "lengthMarker": "#"})
```

**Options:**
- `delimiter`: `","` (default), `"\t"`, `"|"`
- `indent`: Spaces per level (default: `2`)
- `lengthMarker`: `""` (default) or `"#"` to prefix array lengths
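
A quick sketch of how these options change the output, based only on the option descriptions above (illustrative; the exact output of this beta encoder may differ):

```python
from toon_format import encode

data = {"tags": ["a", "b", "c"]}

# Defaults: comma delimiter, 2-space indent, no length marker
print(encode(data))
# tags[3]: a,b,c

# Tab delimiter with the "#" length marker (illustrative output)
print(encode(data, {"delimiter": "\t", "lengthMarker": "#"}))
# tags[#3]: a\tb\tc   (length prefixed with "#", values separated by tabs)
```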

### `decode(input_str, options=None)` → `Any`

```python
decode("id: 123", {"indent": 2, "strict": True})
```

**Options:**
- `indent`: Expected indent size (default: `2`)
- `strict`: Validate syntax, lengths, delimiters (default: `True`)
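
A short sketch of what `strict` guards, based on the length-validation behaviour described above (the exact error message is not shown here):

```python
from toon_format import decode, ToonDecodeError

doc = "items[3]: apple,banana"  # declared length 3, but only 2 values

try:
    decode(doc)  # strict=True by default: the length mismatch should be rejected
except ToonDecodeError as err:
    print("strict decode failed:", err)

# With strict disabled, decoding is expected to be lenient about the mismatch
print(decode(doc, {"strict": False}))
```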

### Token Counting & Comparison

Measure token efficiency and compare formats:

```python
from toon_format import encode, estimate_savings, compare_formats, count_tokens

# Measure savings
data = {"users": [{"id": 1, "name": "Alice"}, {"id": 2, "name": "Bob"}]}
result = estimate_savings(data)
print(f"Saves {result['savings_percent']:.1f}% tokens")  # Saves 42.3% tokens

# Visual comparison
print(compare_formats(data))
# Format Comparison
# ────────────────────────────────────────────────
# Format    Tokens    Size (chars)
# JSON      45        123
# TOON      28        85
# ────────────────────────────────────────────────
# Savings: 17 tokens (37.8%)

# Count tokens directly
toon_str = encode(data)
tokens = count_tokens(toon_str)  # Uses tiktoken (gpt5/gpt5-mini)
```

**Requires tiktoken:** `uv add tiktoken` (benchmark features are optional)

## Format Specification

| Type | Example Input | TOON Output |
|------|---------------|-------------|
| **Object** | `{"name": "Alice", "age": 30}` | `name: Alice`<br>`age: 30` |
| **Primitive Array** | `[1, 2, 3]` | `[3]: 1,2,3` |
| **Tabular Array** | `[{"id": 1, "name": "A"}, {"id": 2, "name": "B"}]` | `[2,]{id,name}:`<br>`1,A`<br>`2,B` |
| **Mixed Array** | `[{"x": 1}, 42, "hi"]` | `[3]:`<br>`- x: 1`<br>`- 42`<br>`- hi` |

**Quoting:** Only when necessary (empty, keywords, numeric strings, whitespace, structural chars, delimiters)

**Type Normalization:** `Infinity/NaN/Functions` → `null` • `Decimal` → `float` • `datetime` → ISO 8601 • `-0` → `0`
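
A sketch of these normalization rules in practice (output shown is illustrative):

```python
from datetime import datetime
from decimal import Decimal

from toon_format import encode

print(encode({
    "price": Decimal("19.99"),        # Decimal -> float
    "created": datetime(2025, 1, 1),  # datetime -> ISO 8601 string
    "ratio": float("nan"),            # NaN -> null
    "offset": -0.0,                   # -0 -> 0
}))
# price: 19.99
# created: 2025-01-01T00:00:00
# ratio: null
# offset: 0
```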

## Development

```bash
# Setup (requires uv: https://docs.astral.sh/uv/)
git clone https://github.com/toon-format/toon-python.git
cd toon-python
uv sync

# Run tests (792 tests, 91% coverage, 85% enforced)
uv run pytest --cov=toon_format --cov-report=term

# Code quality
uv run ruff check src/ tests/    # Lint
uv run ruff format src/ tests/   # Format
uv run mypy src/                 # Type check
```

**CI/CD:** GitHub Actions • Python 3.8-3.14 • Coverage enforcement • PR coverage comments

## Project Status & Roadmap

Following semantic versioning towards 1.0.0:

- **v0.8.x** - Initial code set, tests, documentation ✅
- **v0.9.x** - Serializer improvements, spec compliance testing, publishing setup (current)
- **v1.0.0-rc.x** - Release candidates for production readiness
- **v1.0.0** - First stable release with full spec compliance

See [CONTRIBUTING.md](CONTRIBUTING.md) for detailed guidelines.

## Documentation

- [📘 Full Documentation](docs/) - Complete guides and references
- [🔧 API Reference](docs/api.md) - Detailed function documentation
- [📋 Format Specification](docs/format.md) - TOON syntax and rules
- [🤖 LLM Integration](docs/llm-integration.md) - Best practices for LLM usage
- [📜 TOON Spec](https://github.com/toon-format/spec) - Official specification
- [🐛 Issues](https://github.com/toon-format/toon-python/issues) - Bug reports and features
- [🤝 Contributing](CONTRIBUTING.md) - Contribution guidelines

## Contributors

- [Xavi Vinaixa](https://github.com/xaviviro)
- [David Pirogov](https://github.com/davidpirogov)
- [Justar](https://github.com/Justar96)
- [Johann Schopplich](https://github.com/johannschopplich)

## License

MIT License – see [LICENSE](LICENSE) for details
flow_toon_format-0.9.0b2.dist-info/RECORD
ADDED
@@ -0,0 +1,24 @@
toon_format/__init__.py,sha256=NBvplQppXveRV0Sy2oG4qogjqoy_2yCYGNOQge0Wp0c,1124
toon_format/__main__.py,sha256=e4hF2NKL3x7ZC1DG--XXkX90l71KMxCvE5jk_LJlVDM,264
toon_format/_literal_utils.py,sha256=jQxZAGcY14rWrZ3o5Ra0LwJ9RLiP3uW6aPgbqEHr3jE,2043
toon_format/_parsing_utils.py,sha256=Pq73LVs8UrDnSvBGOqRYoThUGfyIfI8PgbYi_wvOpzM,5153
toon_format/_scanner.py,sha256=qIjnfIi58zqKtfLeJEd5__cSjj7Plpoe83aJx_BIZ_g,8387
toon_format/_string_utils.py,sha256=eFr5fV1QOQiN332AaD486_7ZYNrYdruRFOEYZDzFVyg,4463
toon_format/_validation.py,sha256=UQzvAAq4ZvuzZdgKNugoXe8STUaA3GPy0sX8o5_XaqE,4129
toon_format/cli.py,sha256=-kOfsoIkR0AiiqXsTOHvWr1BiWEffCh7UFppJTKBEDw,5540
toon_format/constants.py,sha256=OGhLhHm-th9hLDrc07Aqv5TgT18SACqYscfpp7rnwtw,1765
toon_format/decoder.py,sha256=hz_aPaxsStr7DeJ66iZui--HhxFuogtyuDav8tZBtws,24474
toon_format/encoder.py,sha256=BB6bdgCV9dSQPjmiKpiFn9OMVdexKnvl0Y_G_TbKCJo,1770
toon_format/encoders.py,sha256=dSplBNgaZGwKH_DiaDiACZybCpwp5FiJNJZocIiBIQo,15995
toon_format/logging_config.py,sha256=ExOUSyY4wF-MaR6Vle5YQYFFN9I4nU6RY8b5wymfm0o,2886
toon_format/normalize.py,sha256=j82iVfAd45GYBktwx-89SNAmHyB20MNBNBjCtxMrpC8,7183
toon_format/primitives.py,sha256=2E_dCaKTygPBY8lQSOrlFVZlj0euSlRrXjIdwKY581c,5181
toon_format/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
toon_format/types.py,sha256=zB43j8-n5KALM5nKGlfvFB0wvAmIHbTPALn2KVFYCQY,1785
toon_format/utils.py,sha256=X1jgHerPe4KRpJa2zSd_OuiFNIuybEixFpmE6K3f0ps,6221
toon_format/writer.py,sha256=rQh3CVwttFkXLHvE0Jd4EntebTAc4YAntNnbuP0K_HQ,1752
flow_toon_format-0.9.0b2.dist-info/METADATA,sha256=dPnRfnlgHRCzp_cKxmFLdGCUiD3Y5xsltX1oQew2VV0,7443
flow_toon_format-0.9.0b2.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
flow_toon_format-0.9.0b2.dist-info/entry_points.txt,sha256=zZEiQ-mNPtmkya4I6IEGmxcnPH4hjEyf-mmXXHJfS6Y,46
flow_toon_format-0.9.0b2.dist-info/licenses/LICENSE,sha256=f_Jm96-xCJNqEx5Xu5zaijPWDcQ5S1Mu3vclEIOkmTg,1197
flow_toon_format-0.9.0b2.dist-info/RECORD,,
flow_toon_format-0.9.0b2.dist-info/licenses/LICENSE
ADDED
@@ -0,0 +1,24 @@
MIT License

Copyright (c) 2025-PRESENT Xavi Vinaixa
Copyright (c) 2025-PRESENT David Pirogov
Copyright (c) 2025-PRESENT Justar
Copyright (c) 2025-PRESENT Johann Schopplich

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
toon_format/__init__.py
ADDED
@@ -0,0 +1,40 @@
# Copyright (c) 2025 TOON Format Organization
# SPDX-License-Identifier: MIT
"""TOON Format for Python.

Token-Oriented Object Notation (TOON) is a compact, human-readable serialization
format optimized for LLM contexts. Achieves 30-60% token reduction vs JSON while
maintaining readability and structure.

This package provides encoding and decoding functionality with 100% compatibility
with the official TOON specification (v1.3).

Example:
    >>> from toon_format import encode, decode
    >>> data = {"name": "Alice", "age": 30}
    >>> toon = encode(data)
    >>> print(toon)
    name: Alice
    age: 30
    >>> decode(toon)
    {'name': 'Alice', 'age': 30}
"""

from .decoder import ToonDecodeError, decode
from .encoder import encode
from .types import DecodeOptions, Delimiter, DelimiterKey, EncodeOptions
from .utils import compare_formats, count_tokens, estimate_savings

__version__ = "0.9.0-beta.1"
__all__ = [
    "encode",
    "decode",
    "ToonDecodeError",
    "Delimiter",
    "DelimiterKey",
    "EncodeOptions",
    "DecodeOptions",
    "count_tokens",
    "estimate_savings",
    "compare_formats",
]
toon_format/__main__.py
ADDED
@@ -0,0 +1,13 @@
# Copyright (c) 2025 TOON Format Organization
# SPDX-License-Identifier: MIT
"""CLI entry point for TOON format.

Allows running the package as a module: python -m toon_format
"""

import sys

from .cli import main

if __name__ == "__main__":
    sys.exit(main())
toon_format/_literal_utils.py
ADDED
@@ -0,0 +1,70 @@
# Copyright (c) 2025 TOON Format Organization
# SPDX-License-Identifier: MIT
"""Utilities for detecting literal token types.

This module provides functions to identify different types of literal
values in TOON syntax, such as booleans, null, and numeric literals.
Used during decoding to distinguish between literal values and strings.
"""

from .constants import FALSE_LITERAL, NULL_LITERAL, TRUE_LITERAL


def is_boolean_or_null_literal(token: str) -> bool:
    """Check if a token is a boolean or null literal (`true`, `false`, `null`).

    Args:
        token: The token to check

    Returns:
        True if the token is a boolean or null literal

    Examples:
        >>> is_boolean_or_null_literal("true")
        True
        >>> is_boolean_or_null_literal("null")
        True
        >>> is_boolean_or_null_literal("hello")
        False
    """
    return token == TRUE_LITERAL or token == FALSE_LITERAL or token == NULL_LITERAL


def is_numeric_literal(token: str) -> bool:
    """Check if a token represents a valid numeric literal.

    Rejects numbers with leading zeros (except `"0"` itself or decimals like `"0.5"`).
    Per Section 7.3 of the TOON specification.

    Args:
        token: The token to check

    Returns:
        True if the token is a valid numeric literal

    Examples:
        >>> is_numeric_literal("42")
        True
        >>> is_numeric_literal("3.14")
        True
        >>> is_numeric_literal("0.5")
        True
        >>> is_numeric_literal("0123")  # Leading zero - not valid
        False
        >>> is_numeric_literal("hello")
        False
    """
    if not token:
        return False

    # Must not have leading zeros (except for `"0"` itself or decimals like `"0.5"`)
    if len(token) > 1 and token[0] == "0" and token[1] != ".":
        return False

    # Check if it's a valid number
    try:
        num = float(token)
        # Reject NaN and infinity
        return not (num != num or not (-float("inf") < num < float("inf")))
    except ValueError:
        return False
toon_format/_parsing_utils.py
ADDED
@@ -0,0 +1,167 @@
# Copyright (c) 2025 TOON Format Organization
# SPDX-License-Identifier: MIT
"""Parsing utilities for quote-aware string processing.

This module provides utilities for parsing TOON strings while respecting
quoted sections and escape sequences. Used extensively in decoder for
finding delimiters and structural characters outside of quoted strings.
"""

from typing import Iterator, List, Tuple

from .constants import BACKSLASH, DOUBLE_QUOTE


def iter_unquoted(line: str, start: int = 0) -> Iterator[Tuple[int, str, bool]]:
    """Iterate over characters in a line, tracking quote state.

    This is the core utility for quote-aware string processing. It handles:
    - Tracking quote boundaries
    - Skipping escaped characters within quotes
    - Yielding (index, character, is_quoted) tuples

    Args:
        line: The line to iterate over
        start: Starting position (default: 0)

    Yields:
        Tuple of (index, char, is_quoted) for each character

    Examples:
        >>> list(iter_unquoted('a"b:c"d'))
        [(0, 'a', False), (1, '"', False), (2, 'b', True), (3, ':', True),
         (4, 'c', True), (5, '"', True), (6, 'd', False)]
    """
    in_quotes = False
    i = start

    while i < len(line):
        char = line[i]

        if char == DOUBLE_QUOTE:
            # Yield quote with current state, THEN toggle for next char
            yield (i, char, in_quotes)
            in_quotes = not in_quotes
        elif char == BACKSLASH and i + 1 < len(line) and in_quotes:
            # Escaped character - yield backslash, then skip and yield next char
            yield (i, char, True)
            i += 1
            if i < len(line):
                yield (i, line[i], True)
        else:
            yield (i, char, in_quotes)

        i += 1


def find_unquoted_char(line: str, target_char: str, start: int = 0) -> int:
    """Find first occurrence of target character outside of quoted strings.

    Args:
        line: Line to search
        target_char: Character to find
        start: Starting position (default: 0)

    Returns:
        Index of character, or -1 if not found

    Examples:
        >>> find_unquoted_char('a:b"c:d"e', ':')
        1
        >>> find_unquoted_char('a"b:c"d:e', ':', 0)
        7
        >>> find_unquoted_char('"a:b":c', ':', 0)
        5
    """
    for i, char, is_quoted in iter_unquoted(line, start):
        if char == target_char and not is_quoted:
            return i
    return -1


def parse_delimited_values(line: str, delimiter: str) -> List[str]:
    """Parse delimiter-separated values, respecting quotes and escapes.

    This function splits a line on the delimiter, but only at unquoted positions.
    Quotes and escape sequences within quoted sections are preserved.

    Args:
        line: Line content
        delimiter: Active delimiter (e.g., ',', '\\t', '|')

    Returns:
        List of token strings (with quotes and escapes preserved)

    Examples:
        >>> parse_delimited_values('a,b,c', ',')
        ['a', 'b', 'c']
        >>> parse_delimited_values('a,"b,c",d', ',')
        ['a', '"b,c"', 'd']
        >>> parse_delimited_values('"a,b",c', ',')
        ['"a,b"', 'c']
    """
    tokens: List[str] = []
    current: List[str] = []

    for i, char, is_quoted in iter_unquoted(line):
        if char == delimiter and not is_quoted:
            # Split on unquoted delimiter
            tokens.append("".join(current))
            current = []
        else:
            current.append(char)

    # Add final token (always add, even if empty, to handle trailing delimiters)
    if current or tokens:
        tokens.append("".join(current))

    return tokens


def split_at_unquoted_char(line: str, target_char: str) -> Tuple[str, str]:
    """Split a line at the first unquoted occurrence of target character.

    Args:
        line: Line content
        target_char: Character to split on

    Returns:
        Tuple of (before, after) strings

    Raises:
        ValueError: If target character not found outside quotes

    Examples:
        >>> split_at_unquoted_char('key: value', ':')
        ('key', ' value')
        >>> split_at_unquoted_char('"key:1": value', ':')
        ('"key:1"', ' value')
    """
    idx = find_unquoted_char(line, target_char)
    if idx == -1:
        raise ValueError(f"Character '{target_char}' not found outside quotes")
    return (line[:idx], line[idx + 1 :])


def find_first_unquoted(line: str, chars: List[str], start: int = 0) -> Tuple[int, str]:
    """Find the first occurrence of any character in chars, outside quotes.

    Args:
        line: Line to search
        chars: List of characters to search for
        start: Starting position (default: 0)

    Returns:
        Tuple of (index, character) for first match, or (-1, '') if none found

    Examples:
        >>> find_first_unquoted('a:b,c', [':', ','])
        (1, ':')
        >>> find_first_unquoted('a"b:c",d', [':', ','])
        (7, ',')
    """
    char_set = set(chars)
    for i, char, is_quoted in iter_unquoted(line, start):
        if char in char_set and not is_quoted:
            return (i, char)
    return (-1, "")