cton 0.4.0 → 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +14 -0
- data/README.md +117 -345
- data/lib/cton/binary.rb +74 -0
- data/lib/cton/decoder.rb +18 -6
- data/lib/cton/encoder.rb +19 -3
- data/lib/cton/schema.rb +369 -0
- data/lib/cton/stream.rb +44 -0
- data/lib/cton/version.rb +1 -1
- data/lib/cton.rb +63 -0
- data/sig/cton/binary.rbs +12 -0
- data/sig/cton/schema.rbs +84 -0
- data/sig/cton/stream.rbs +13 -0
- data/sig/cton.rbs +15 -0
- metadata +8 -2
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: a1d32c4a3d5f55726071135d05705be7084d7c8332a1f03a5a724523981a7b81
|
|
4
|
+
data.tar.gz: '0813fb27119f36ab0117e7ca1401fe33d8826bc7bfbe09b68589dece8ae2f376'
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: e536052cfe992ab7a5844640b622093a5d6f091edfa6af6ce456761e229fa1eb949ce50f103e9aadca54fa6abce66b0fc535fb5335c884e5de5bd2b29408d747
|
|
7
|
+
data.tar.gz: 66be8d4e9ff12a3e99dd138c82aa676cdc6e8f9e61a07183137b24de07b2f2d177c9a11b6e27e2407476dc45a19e3b012b63be87f942981a34002a6a292f791e
|
data/CHANGELOG.md
CHANGED
|
@@ -5,6 +5,20 @@ All notable changes to this project will be documented in this file.
|
|
|
5
5
|
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
|
|
6
6
|
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
|
|
7
7
|
|
|
8
|
+
## [1.0.0] - 2026-01-17
|
|
9
|
+
|
|
10
|
+
### Added
|
|
11
|
+
|
|
12
|
+
- **Schema Validation DSL**: Define schemas via `Cton.schema` and validate data with `Cton.validate_schema` for LLM-safe outputs.
|
|
13
|
+
- **Streaming APIs**: `Cton.load_stream`, `Cton.dump_stream`, plus `StreamReader`/`StreamWriter` for newline-delimited documents.
|
|
14
|
+
- **CTON-B Binary Mode**: Optional binary envelope with compression via `Cton.dump_binary`/`Cton.load_binary`.
|
|
15
|
+
- **CLI Enhancements**: `--schema`, `--stream`, `--to-binary`, and `--from-binary` support.
|
|
16
|
+
|
|
17
|
+
### Changed
|
|
18
|
+
|
|
19
|
+
- **Performance**: Faster scalar scans in the decoder and reusable scalar buffers in the encoder.
|
|
20
|
+
- **Docs**: README refocused on LLM usage, schema validation, and streaming workflows.
|
|
21
|
+
|
|
8
22
|
## [0.4.0] - 2025-11-26
|
|
9
23
|
|
|
10
24
|
### Added
|
data/README.md
CHANGED
|
@@ -3,96 +3,73 @@
|
|
|
3
3
|
[Gem Version](https://badge.fury.io/rb/cton)
|
|
4
4
|
[License](https://github.com/davidesantangelo/cton/blob/master/LICENSE.txt)
|
|
5
5
|
|
|
6
|
-
|
|
6
|
+
CTON (Compact Token-Oriented Notation) is a token-efficient, JSON-compatible wire format built for LLM prompts. It keeps structure explicit (objects, arrays, table arrays) while removing syntactic noise, so prompts are shorter and outputs are easier to validate. CTON is deterministic and round-trippable, making it safe for LLM workflows.
|
|
7
|
+
|
|
8
|
+
**CTON is designed to be the reference language for LLM data exchange**: short, deterministic, schema-aware.
|
|
7
9
|
|
|
8
10
|
---
|
|
9
11
|
|
|
10
|
-
##
|
|
12
|
+
## Quickstart
|
|
11
13
|
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
- [Token Savings](#token-savings-vs-json--toon)
|
|
16
|
-
- [Installation](#installation)
|
|
17
|
-
- [Usage](#usage)
|
|
18
|
-
- [Performance & Benchmarks](#performance--benchmarks)
|
|
19
|
-
- [Teaching CTON to LLMs](#teaching-cton-to-llms)
|
|
20
|
-
- [Development](#development)
|
|
21
|
-
- [Contributing](#contributing)
|
|
22
|
-
- [License](#license)
|
|
14
|
+
```bash
|
|
15
|
+
bundle add cton
|
|
16
|
+
```
|
|
23
17
|
|
|
24
|
-
|
|
18
|
+
```ruby
|
|
19
|
+
require "cton"
|
|
25
20
|
|
|
26
|
-
|
|
21
|
+
payload = {
|
|
22
|
+
"user" => { "id" => 42, "name" => "Ada" },
|
|
23
|
+
"tags" => ["llm", "compact"],
|
|
24
|
+
"events" => [
|
|
25
|
+
{ "id" => 1, "action" => "login" },
|
|
26
|
+
{ "id" => 2, "action" => "upload" }
|
|
27
|
+
]
|
|
28
|
+
}
|
|
27
29
|
|
|
28
|
-
|
|
30
|
+
cton = Cton.dump(payload)
|
|
31
|
+
# => user(id=42,name=Ada)
|
|
32
|
+
# => tags[2]=llm,compact
|
|
33
|
+
# => events[2]{id,action}=1,login;2,upload
|
|
29
34
|
|
|
30
|
-
|
|
35
|
+
round_trip = Cton.load(cton)
|
|
36
|
+
# => same as payload
|
|
37
|
+
```
|
|
31
38
|
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
4. **Comments**: Single-line comments with `#` for annotating data.
|
|
39
|
-
5. **Validation API**: Check CTON syntax without full parsing for quick validation.
|
|
40
|
-
6. **Token Statistics**: Built-in measurement of token efficiency vs JSON.
|
|
41
|
-
7. **Custom Type Registry**: Register serializers for domain objects.
|
|
39
|
+
```bash
|
|
40
|
+
# CLI usage
|
|
41
|
+
cton input.json
|
|
42
|
+
cton --to-json data.cton
|
|
43
|
+
cton --stats input.json
|
|
44
|
+
```
|
|
42
45
|
|
|
43
46
|
---
|
|
44
47
|
|
|
45
|
-
##
|
|
48
|
+
## Why CTON for LLMs?
|
|
46
49
|
|
|
47
|
-
|
|
50
|
+
- **Shorter prompts**: CTON removes braces, indentation, and repeated keys.
|
|
51
|
+
- **Schema hints built-in**: arrays include length and tables include headers.
|
|
52
|
+
- **Deterministic output**: round-trip safe and validates structure.
|
|
53
|
+
- **LLM-friendly**: small grammar + clear guardrails for generation.
|
|
48
54
|
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
"id": 123
|
|
55
|
-
}
|
|
56
|
-
```
|
|
55
|
+
---
|
|
56
|
+
|
|
57
|
+
## CTON in 60 seconds
|
|
58
|
+
|
|
59
|
+
### Objects & Scalars
|
|
57
60
|
|
|
58
|
-
**CTON**
|
|
59
61
|
```text
|
|
60
62
|
task=planning,urgent=true,id=123
|
|
61
63
|
```
|
|
62
64
|
|
|
63
65
|
### Nested Objects
|
|
64
66
|
|
|
65
|
-
**JSON**
|
|
66
|
-
```json
|
|
67
|
-
{
|
|
68
|
-
"user": {
|
|
69
|
-
"name": "Davide",
|
|
70
|
-
"settings": {
|
|
71
|
-
"theme": "dark"
|
|
72
|
-
}
|
|
73
|
-
}
|
|
74
|
-
}
|
|
75
|
-
```
|
|
76
|
-
|
|
77
|
-
**CTON**
|
|
78
67
|
```text
|
|
79
|
-
user(name=
|
|
68
|
+
user(name=Ada,settings(theme=dark))
|
|
80
69
|
```
|
|
81
70
|
|
|
82
|
-
### Arrays
|
|
83
|
-
|
|
84
|
-
**JSON**
|
|
85
|
-
```json
|
|
86
|
-
{
|
|
87
|
-
"tags": ["ruby", "gem", "llm"],
|
|
88
|
-
"files": [
|
|
89
|
-
{ "name": "README.md", "size": 1024 },
|
|
90
|
-
{ "name": "lib/cton.rb", "size": 2048 }
|
|
91
|
-
]
|
|
92
|
-
}
|
|
93
|
-
```
|
|
71
|
+
### Arrays & Tables
|
|
94
72
|
|
|
95
|
-
**CTON**
|
|
96
73
|
```text
|
|
97
74
|
tags[3]=ruby,gem,llm
|
|
98
75
|
files[2]{name,size}=README.md,1024;lib/cton.rb,2048
|
|
@@ -100,351 +77,146 @@ files[2]{name,size}=README.md,1024;lib/cton.rb,2048
|
|
|
100
77
|
|
|
101
78
|
---
|
|
102
79
|
|
|
103
|
-
##
|
|
104
|
-
|
|
105
|
-
- **Less noise than YAML/JSON**: no indentation, no braces around the root, and optional quoting.
|
|
106
|
-
- **Schema guardrails**: arrays carry their length (`friends[3]`) and table headers (`{id,name,...}`) so downstream parsing can verify shape.
|
|
107
|
-
- **LLM-friendly**: works as a single string you can embed in a prompt together with short parsing instructions.
|
|
108
|
-
- **Token savings**: CTON compounds the JSON → TOON savings.
|
|
80
|
+
## LLM Prompt Kit (Recommended)
|
|
109
81
|
|
|
110
|
-
|
|
82
|
+
System prompt template:
|
|
111
83
|
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
- **Net effect**: In practice you can often reclaim **50–60% of the token budget** versus raw JSON, leaving more room for instructions or reasoning steps while keeping a deterministic schema.
|
|
84
|
+
```markdown
|
|
85
|
+
You are an expert in CTON (Compact Token-Oriented Notation). Convert between JSON and CTON following the rules below and preserve the schema exactly.
|
|
115
86
|
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
bundle add cton
|
|
87
|
+
Rules:
|
|
88
|
+
1. Do not wrap the root in `{}`.
|
|
89
|
+
2. Objects use `key=value` and nested objects use `key(...)`.
|
|
90
|
+
3. Arrays are `key[N]=v1,v2` and table arrays are `key[N]{k1,k2}=v1,v2;v1,v2`.
|
|
91
|
+
4. Use unquoted literals for `true`, `false`, and `null`.
|
|
92
|
+
5. Quote strings containing reserved characters (`,`, `;`, `=`, `(`, `)`) or whitespace.
|
|
93
|
+
6. Always keep array length and table headers accurate.
|
|
124
94
|
```
|
|
125
95
|
|
|
126
|
-
|
|
96
|
+
Few-shot example:
|
|
127
97
|
|
|
128
|
-
```
|
|
129
|
-
|
|
98
|
+
```text
|
|
99
|
+
JSON: {"team":[{"id":1,"name":"Ada"},{"id":2,"name":"Lin"}]}
|
|
100
|
+
CTON: team[2]{id,name}=1,Ada;2,Lin
|
|
130
101
|
```
|
|
131
102
|
|
|
132
103
|
---
|
|
133
104
|
|
|
134
|
-
##
|
|
135
|
-
|
|
136
|
-
```ruby
|
|
137
|
-
require "cton"
|
|
105
|
+
## Schema Validation (1.0.0)
|
|
138
106
|
|
|
139
|
-
|
|
140
|
-
"context" => {
|
|
141
|
-
"task" => "Our favorite hikes together",
|
|
142
|
-
"location" => "Boulder",
|
|
143
|
-
"season" => "spring_2025"
|
|
144
|
-
},
|
|
145
|
-
"friends" => %w[ana luis sam],
|
|
146
|
-
"hikes" => [
|
|
147
|
-
{ "id" => 1, "name" => "Blue Lake Trail", "distanceKm" => 7.5, "elevationGain" => 320, "companion" => "ana", "wasSunny" => true },
|
|
148
|
-
{ "id" => 2, "name" => "Ridge Overlook", "distanceKm" => 9.2, "elevationGain" => 540, "companion" => "luis", "wasSunny" => false },
|
|
149
|
-
{ "id" => 3, "name" => "Wildflower Loop", "distanceKm" => 5.1, "elevationGain" => 180, "companion" => "sam", "wasSunny" => true }
|
|
150
|
-
]
|
|
151
|
-
}
|
|
107
|
+
CTON ships with a schema DSL for validation inside your LLM pipeline.
|
|
152
108
|
|
|
153
|
-
|
|
154
|
-
|
|
155
|
-
|
|
156
|
-
|
|
157
|
-
|
|
158
|
-
|
|
159
|
-
|
|
160
|
-
|
|
161
|
-
|
|
162
|
-
|
|
163
|
-
|
|
164
|
-
|
|
165
|
-
inline = Cton.dump(payload, separator: "")
|
|
166
|
-
|
|
167
|
-
# Pretty print for human readability
|
|
168
|
-
pretty = Cton.dump(payload, pretty: true)
|
|
169
|
-
|
|
170
|
-
# Stream to an IO object (file, socket, etc.)
|
|
171
|
-
File.open("data.cton", "w") do |f|
|
|
172
|
-
Cton.dump(payload, f)
|
|
109
|
+
```ruby
|
|
110
|
+
schema = Cton.schema do
|
|
111
|
+
object do
|
|
112
|
+
key "user" do
|
|
113
|
+
object do
|
|
114
|
+
key "id", integer
|
|
115
|
+
key "name", string
|
|
116
|
+
optional "role", enum("admin", "viewer")
|
|
117
|
+
end
|
|
118
|
+
end
|
|
119
|
+
key "tags", array(of: string)
|
|
120
|
+
end
|
|
173
121
|
end
|
|
174
122
|
|
|
175
|
-
|
|
176
|
-
|
|
177
|
-
strict = Cton.dump(payload, decimal_mode: :precise)
|
|
123
|
+
result = Cton.validate_schema(payload, schema)
|
|
124
|
+
puts result.valid? # true/false
|
|
178
125
|
```
|
|
179
126
|
|
|
180
|
-
|
|
127
|
+
Schema files can be used from the CLI as well:
|
|
181
128
|
|
|
182
|
-
|
|
129
|
+
```ruby
|
|
130
|
+
# schema.rb
|
|
131
|
+
CTON_SCHEMA = Cton.schema do
|
|
132
|
+
object do
|
|
133
|
+
key "user", object { key "id", integer }
|
|
134
|
+
end
|
|
135
|
+
end
|
|
136
|
+
```
|
|
183
137
|
|
|
184
138
|
```bash
|
|
185
|
-
|
|
186
|
-
echo '{"hello": "world"}' | cton
|
|
187
|
-
# => hello=world
|
|
188
|
-
|
|
189
|
-
# Convert CTON to JSON
|
|
190
|
-
echo 'hello=world' | cton --to-json
|
|
191
|
-
# => {"hello":"world"}
|
|
192
|
-
|
|
193
|
-
# Pretty print
|
|
194
|
-
cton --pretty input.json
|
|
195
|
-
|
|
196
|
-
# Minify (fully inline, no separators)
|
|
197
|
-
cton --minify input.json
|
|
198
|
-
|
|
199
|
-
# Validate CTON syntax
|
|
200
|
-
cton --validate input.cton
|
|
201
|
-
# => ✓ Valid CTON
|
|
202
|
-
|
|
203
|
-
# Show token savings statistics
|
|
204
|
-
echo '{"name": "test", "items": [1,2,3]}' | cton --stats
|
|
205
|
-
# => JSON: 33 chars / 33 bytes (~9 tokens)
|
|
206
|
-
# => CTON: 26 chars / 26 bytes (~7 tokens)
|
|
207
|
-
# => Saved: 21.2% (7 chars, ~2 tokens)
|
|
139
|
+
cton --schema schema.rb input.cton
|
|
208
140
|
```
|
|
209
141
|
|
|
210
|
-
|
|
142
|
+
---
|
|
211
143
|
|
|
212
|
-
|
|
144
|
+
## Streaming IO (1.0.0)
|
|
213
145
|
|
|
214
|
-
|
|
146
|
+
Handle newline-delimited CTON streams efficiently:
|
|
215
147
|
|
|
216
148
|
```ruby
|
|
217
|
-
|
|
218
|
-
|
|
219
|
-
|
|
220
|
-
|
|
221
|
-
# Age is optional
|
|
222
|
-
age=30
|
|
223
|
-
)
|
|
224
|
-
CTON
|
|
225
|
-
|
|
226
|
-
Cton.load(cton_with_comments)
|
|
227
|
-
# => {"user" => {"name" => "Alice", "age" => 30}}
|
|
228
|
-
|
|
229
|
-
# Add comments when encoding
|
|
230
|
-
Cton.dump(data, comments: { "user" => "User configuration" })
|
|
149
|
+
io = File.open("events.cton", "r")
|
|
150
|
+
Cton.load_stream(io).each do |event|
|
|
151
|
+
# process event
|
|
152
|
+
end
|
|
231
153
|
```
|
|
232
154
|
|
|
233
|
-
#### Validation API
|
|
234
|
-
|
|
235
|
-
Validate CTON syntax without full parsing:
|
|
236
|
-
|
|
237
155
|
```ruby
|
|
238
|
-
|
|
239
|
-
Cton.
|
|
240
|
-
Cton.valid?("key=(broken") # => false
|
|
241
|
-
|
|
242
|
-
# Detailed validation with error info
|
|
243
|
-
result = Cton.validate("key=(broken")
|
|
244
|
-
result.valid? # => false
|
|
245
|
-
result.errors.first.message # => "Expected '=' in object"
|
|
246
|
-
result.errors.first.line # => 1
|
|
247
|
-
result.errors.first.column # => 5
|
|
156
|
+
io = File.open("events.cton", "w")
|
|
157
|
+
Cton.dump_stream(events, io)
|
|
248
158
|
```
|
|
249
159
|
|
|
250
|
-
|
|
251
|
-
|
|
252
|
-
Measure CTON's token efficiency compared to JSON:
|
|
253
|
-
|
|
254
|
-
```ruby
|
|
255
|
-
stats = Cton.stats(data)
|
|
256
|
-
puts stats.savings_percent # => 45.5
|
|
257
|
-
puts stats.estimated_token_savings # => 12
|
|
258
|
-
|
|
259
|
-
# Full comparison
|
|
260
|
-
puts stats.to_s
|
|
261
|
-
# => JSON: 100 chars / 100 bytes (~25 tokens)
|
|
262
|
-
# => CTON: 55 chars / 55 bytes (~14 tokens)
|
|
263
|
-
# => Saved: 45.0% (45 chars, ~11 tokens)
|
|
264
|
-
|
|
265
|
-
# Compare all format variants
|
|
266
|
-
Cton::Stats.compare(data)
|
|
267
|
-
# => { cton: {...}, cton_inline: {...}, json: {...}, ... }
|
|
268
|
-
```
|
|
160
|
+
---
|
|
269
161
|
|
|
270
|
-
|
|
162
|
+
## CTON-B (Binary Mode)
|
|
271
163
|
|
|
272
|
-
|
|
164
|
+
CTON-B is an optional binary envelope for compact transport (with optional compression):
|
|
273
165
|
|
|
274
166
|
```ruby
|
|
275
|
-
|
|
276
|
-
|
|
277
|
-
def initialize(cents, currency)
|
|
278
|
-
@cents = cents
|
|
279
|
-
@currency = currency
|
|
280
|
-
end
|
|
281
|
-
end
|
|
282
|
-
|
|
283
|
-
# Register as object
|
|
284
|
-
Cton.register_type(Money) do |money|
|
|
285
|
-
{ amount: money.cents, currency: money.currency }
|
|
286
|
-
end
|
|
287
|
-
|
|
288
|
-
Cton.dump("price" => Money.new(1999, "USD"))
|
|
289
|
-
# => "price(amount=1999,currency=USD)"
|
|
290
|
-
|
|
291
|
-
# Register as scalar
|
|
292
|
-
Cton.register_type(UUID, as: :scalar) { |uuid| uuid.to_s }
|
|
293
|
-
|
|
294
|
-
# Unregister when done
|
|
295
|
-
Cton.unregister_type(Money)
|
|
167
|
+
binary = Cton.dump_binary(payload)
|
|
168
|
+
round_trip = Cton.load_binary(binary)
|
|
296
169
|
```
|
|
297
170
|
|
|
298
|
-
|
|
171
|
+
CLI:
|
|
299
172
|
|
|
300
|
-
|
|
301
|
-
|
|
302
|
-
|
|
303
|
-
begin
|
|
304
|
-
Cton.load("user(name=Alice,invalid")
|
|
305
|
-
rescue Cton::ParseError => e
|
|
306
|
-
puts e.message # => "Unterminated object at line 1, column 20"
|
|
307
|
-
puts e.line # => 1
|
|
308
|
-
puts e.column # => 20
|
|
309
|
-
puts e.source_excerpt # => "...name=Alice,invalid"
|
|
310
|
-
puts e.suggestions # => ["Did you forget a closing ')'?"]
|
|
311
|
-
end
|
|
173
|
+
```bash
|
|
174
|
+
cton --to-binary input.json > output.ctonb
|
|
175
|
+
cton --from-binary output.ctonb
|
|
312
176
|
```
|
|
313
177
|
|
|
314
|
-
|
|
315
|
-
CTON natively supports serialization for:
|
|
316
|
-
- `Time` and `Date` (ISO8601 strings)
|
|
317
|
-
- `Set` (converted to Arrays)
|
|
318
|
-
- `OpenStruct` (converted to Objects)
|
|
319
|
-
|
|
320
|
-
#### Table detection
|
|
321
|
-
Whenever an array is made of hashes that all expose the same scalar keys, the encoder flattens it into a table to save tokens. Mixed or nested arrays fall back to `[N]=(value1,value2,...)`.
|
|
322
|
-
|
|
323
|
-
#### Separators & ambiguity
|
|
324
|
-
Removing every newline makes certain inputs ambiguous because `sam` and the next key `hikes` can merge into `samhikes`. The default `separator: "\n"` avoids that by inserting a single newline between root segments. You may pass `separator: ""` to `Cton.dump` for maximum compactness, but decoding such strings is only safe if you can guarantee extra quoting or whitespace between segments. When you intentionally omit separators, keep next-level keys alphabetic (e.g., `payload`, `k42`) so the decoder's boundary heuristic can split `...1payload...` without misclassifying numeric prefixes.
|
|
325
|
-
|
|
326
|
-
#### Literal safety & number normalization
|
|
327
|
-
Following the TOON specification's guardrails, the encoder now:
|
|
328
|
-
- Auto-quotes strings that would otherwise be parsed as booleans, `null`, or numbers (e.g., `"true"`, `"007"`, `"1e6"`, `"-5"`) so they round-trip as strings without extra work.
|
|
329
|
-
- Canonicalizes float/BigDecimal output: no exponent notation, no trailing zeros, and `-0` collapses to `0`.
|
|
330
|
-
- Converts `NaN` and `±Infinity` inputs to `null`, matching TOON's normalization guidance so downstream decoders don't explode on non-finite numbers.
|
|
331
|
-
|
|
332
|
-
#### Decimal normalization modes
|
|
333
|
-
- `decimal_mode: :fast` (default) prefers Ruby's native float representation and only falls back to `BigDecimal` when scientific notation is detected, minimizing allocations on tight loops.
|
|
334
|
-
- `decimal_mode: :precise` forces the legacy `BigDecimal` path for every float, which is slower but useful for audit-grade dumps where you want deterministic decimal expansion.
|
|
335
|
-
- Both modes share the same trailing-zero stripping and `-0 → 0` normalization, so switching modes never affects integer formatting.
|
|
178
|
+
Note: `--stream` with binary assumes newline-delimited binary frames.
|
|
336
179
|
|
|
337
180
|
---
|
|
338
181
|
|
|
339
182
|
## Performance & Benchmarks
|
|
340
183
|
|
|
341
|
-
CTON focuses on throughput:
|
|
184
|
+
CTON focuses on throughput: memoized table schemas, low-allocation scalar streams, and fast boundary detection for inline docs.
|
|
185
|
+
|
|
186
|
+
Run benchmarks:
|
|
342
187
|
|
|
343
188
|
```bash
|
|
344
189
|
bundle exec ruby bench/encode_decode_bench.rb
|
|
345
|
-
# customize input size / iterations
|
|
346
190
|
ITERATIONS=2000 STREAM_SIZE=400 bundle exec ruby bench/encode_decode_bench.rb
|
|
347
191
|
```
|
|
348
192
|
|
|
349
|
-
Latest results on Ruby 3.1.4/macOS (M-series), 1,000 iterations, `STREAM_SIZE=200`:
|
|
350
|
-
|
|
351
|
-
| Benchmark | Time (s) |
|
|
352
|
-
| --- | --- |
|
|
353
|
-
| `cton dump` (:fast) | 0.626 |
|
|
354
|
-
| `cton dump` (:precise) | 0.658 |
|
|
355
|
-
| `json generate` | 0.027 |
|
|
356
|
-
| `cton load` | 2.067 |
|
|
357
|
-
| `json parse` | 0.045 |
|
|
358
|
-
| `cton inline load` (separator=`""`, double payload) | 4.140 |
|
|
359
|
-
|
|
360
|
-
`cton inline load` deliberately concatenates documents without separators to stress the new boundary detector; it now finishes without the runaway allocations seen in earlier releases.
|
|
361
|
-
|
|
362
193
|
---
|
|
363
194
|
|
|
364
|
-
##
|
|
365
|
-
|
|
366
|
-
Use this system prompt to teach an LLM how to understand and generate CTON:
|
|
367
|
-
|
|
368
|
-
````markdown
|
|
369
|
-
You are an expert in data serialization and specifically in CTON (Compact Token-Oriented Notation). CTON is a token-efficient data format optimized for LLMs that serves as a compact alternative to JSON.
|
|
370
|
-
|
|
371
|
-
Your task is to interpret CTON input and convert it to JSON, or convert JSON input into valid CTON format, following the specification below.
|
|
372
|
-
|
|
373
|
-
### CTON Specification
|
|
374
|
-
|
|
375
|
-
CTON minimizes syntax characters (braces, quotes) while preserving structure and type safety.
|
|
376
|
-
|
|
377
|
-
**1. Basic Structure (Key-Value)**
|
|
378
|
-
- **Rule:** Do not use outer curly braces `{}` for the root object.
|
|
379
|
-
- **Rule:** Use `=` to separate keys and values.
|
|
380
|
-
- **Rule:** Use `,` to separate fields.
|
|
381
|
-
- **Rule:** Do not use quotes around "safe" strings (alphanumeric, simple text).
|
|
382
|
-
- **Example:** - JSON: `{"task": "planning", "urgent": true}`
|
|
383
|
-
- CTON: `task=planning,urgent=true`
|
|
384
|
-
|
|
385
|
-
**2. Nested Objects**
|
|
386
|
-
- **Rule:** Use parentheses `()` to denote a nested object instead of `{}`.
|
|
387
|
-
- **Example:**
|
|
388
|
-
- JSON: `{"context": {"user": "Davide", "theme": "dark"}}`
|
|
389
|
-
- CTON: `context(user=Davide,theme=dark)`
|
|
390
|
-
|
|
391
|
-
**3. Arrays of Objects (Table Compression)**
|
|
392
|
-
- **Rule:** Use the syntax `key[count]{columns}=values` for arrays of objects to avoid repeating keys.
|
|
393
|
-
- **Structure:** `key[Length]{col1,col2}=val1,val2;val1,val2`
|
|
394
|
-
- **Details:** - `[N]` denotes the number of items in the array.
|
|
395
|
-
- `{col1,col2}` defines the schema headers.
|
|
396
|
-
- `;` separates distinct objects (rows).
|
|
397
|
-
- `,` separates values within an object.
|
|
398
|
-
- **Example:**
|
|
195
|
+
## CLI Reference
|
|
399
196
|
|
|
400
|
-
|
|
401
|
-
|
|
402
|
-
|
|
403
|
-
|
|
404
|
-
|
|
405
|
-
|
|
406
|
-
|
|
407
|
-
|
|
408
|
-
|
|
409
|
-
|
|
410
|
-
CTON: `files[2]{name,size}=README.md,1024;lib.rb,2048`
|
|
411
|
-
|
|
412
|
-
**4. Type Safety & Literals**
|
|
413
|
-
- **Booleans/Null:** `true`, `false`, and `null` are preserved as literals (unquoted).
|
|
414
|
-
- **Numbers:** Integers and floats are written as is (e.g., `1024`, `3.14`).
|
|
415
|
-
- **Escaping:** If a string value looks like a boolean, number, or contains reserved characters (like `,`, `;`, `=`, `(`, `)`), it must be wrapped in double quotes (e.g., `"true"`).
|
|
416
|
-
|
|
417
|
-
### Examples for Training
|
|
418
|
-
|
|
419
|
-
**Input (JSON):**
|
|
420
|
-
```json
|
|
421
|
-
{
|
|
422
|
-
"id": 123,
|
|
423
|
-
"active": true,
|
|
424
|
-
"metadata": {
|
|
425
|
-
"created_at": "2023-01-01",
|
|
426
|
-
"tags": "admin"
|
|
427
|
-
}
|
|
428
|
-
}
|
|
197
|
+
```bash
|
|
198
|
+
cton [input] # auto-detect JSON/CTON
|
|
199
|
+
cton --to-json input.cton # CTON → JSON
|
|
200
|
+
cton --to-cton input.json # JSON → CTON
|
|
201
|
+
cton --to-binary input.json # JSON → CTON-B
|
|
202
|
+
cton --from-binary input.ctonb
|
|
203
|
+
cton --minify input.json # no separators
|
|
204
|
+
cton --pretty input.json
|
|
205
|
+
cton --stream input.ndjson
|
|
206
|
+
cton --schema schema.rb input.cton
|
|
429
207
|
```
|
|
430
|
-
````
|
|
431
208
|
|
|
432
209
|
---
|
|
433
210
|
|
|
434
|
-
## Type Safety
|
|
435
|
-
|
|
436
|
-
CTON ships with RBS signatures (`sig/cton.rbs`) to support type checking and IDE autocompletion.
|
|
437
|
-
|
|
438
211
|
## Development
|
|
439
212
|
|
|
440
213
|
```bash
|
|
441
214
|
bin/setup # install dependencies
|
|
442
215
|
bundle exec rake # run tests and rubocop
|
|
443
216
|
bin/console # interactive playground
|
|
444
|
-
bundle exec ruby bench/encode_decode_bench.rb # performance smoke test
|
|
445
217
|
```
|
|
446
218
|
|
|
447
|
-
|
|
219
|
+
---
|
|
448
220
|
|
|
449
221
|
## Contributing
|
|
450
222
|
|
data/lib/cton/binary.rb
ADDED
|
@@ -0,0 +1,74 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "zlib"
|
|
4
|
+
|
|
5
|
+
module Cton
  # CTON-B: a small binary envelope around an encoded CTON document.
  #
  # Layout, as produced by +dump+:
  #
  #   bytes 0..3   magic "CTON"
  #   byte  4      format version (VERSION)
  #   byte  5      flags bitfield (FLAG_COMPRESSED => payload is zlib-deflated)
  #   varint       payload length in bytes (7 bits per byte, low group first,
  #                0x80 marks "more bytes follow")
  #   payload      the CTON text, deflated when FLAG_COMPRESSED is set
  module Binary
    # Frozen so the shared constant cannot be mutated in place (+String#b+
    # returns an unfrozen copy even under frozen_string_literal).
    MAGIC = "CTON".b.freeze
    VERSION = 1
    FLAG_COMPRESSED = 1
    # Upper bound on the encoded length prefix; 10 groups of 7 bits cover any
    # 64-bit value, so anything longer is treated as malformed input.
    MAX_VARINT_BYTES = 10

    module_function

    # Encodes +data+ as CTON and wraps it in a CTON-B envelope.
    #
    # @param data [Object] value handed to +Cton.dump+
    # @param compress [Boolean] deflate the payload and set FLAG_COMPRESSED
    # @param options [Hash] forwarded untouched to +Cton.dump+
    # @return [String] binary string: header, varint length, payload
    def dump(data, compress: true, **options)
      payload = Cton.dump(data, **options).b
      flags = 0

      if compress
        payload = Zlib.deflate(payload)
        flags |= FLAG_COMPRESSED
      end

      header = MAGIC + [VERSION, flags].pack("CC")
      header + encode_varint(payload.bytesize) + payload
    end

    # Decodes a CTON-B envelope and parses the contained CTON document.
    #
    # @param binary [#to_s] envelope bytes
    # @return [Object] whatever +Cton.load+ returns for the payload
    # @raise [Cton::Error] on a bad/truncated header, unsupported version,
    #   malformed varint, or a declared length exceeding the available bytes
    #
    # Zlib::Error raised while inflating a corrupt payload is not rescued here
    # and propagates to the caller unchanged.
    def load(binary)
      source = binary.to_s.b
      header_size = MAGIC.bytesize + 2 # magic + version byte + flags byte

      unless source.start_with?(MAGIC) && source.bytesize >= header_size
        raise Cton::Error, "Invalid CTON-B header"
      end

      version = source.getbyte(MAGIC.bytesize)
      flags = source.getbyte(MAGIC.bytesize + 1)
      raise Cton::Error, "Unsupported CTON-B version" unless version == VERSION

      length, consumed = decode_varint(source, header_size)
      payload_start = header_size + consumed

      # Validate the declared length with plain Integer arithmetic *before*
      # slicing: byteslice raises RangeError for lengths that do not fit a
      # native long, which would leak a non-Cton error on hostile input.
      if length > source.bytesize - payload_start
        raise Cton::Error, "Invalid CTON-B payload length"
      end

      payload = source.byteslice(payload_start, length)
      payload = Zlib.inflate(payload) if (flags & FLAG_COMPRESSED).positive?

      Cton.load(payload)
    end

    # Encodes a non-negative integer as a base-128 varint.
    #
    # @param value [Integer] value to encode (callers pass a byte size)
    # @return [String] binary string, low 7-bit group first
    def encode_varint(value)
      bytes = []
      remaining = value
      while remaining >= 0x80
        bytes << ((remaining & 0x7f) | 0x80) # 0x80 => another byte follows
        remaining >>= 7
      end
      bytes << remaining
      bytes.pack("C*")
    end

    # Decodes a base-128 varint from +source+ starting at byte +offset+.
    #
    # @param source [String] bytes to read from
    # @param offset [Integer] index of the first varint byte
    # @return [Array(Integer, Integer)] decoded value and number of bytes read
    # @raise [Cton::Error] if the input ends mid-varint or the varint is longer
    #   than MAX_VARINT_BYTES
    def decode_varint(source, offset)
      result = 0

      MAX_VARINT_BYTES.times do |position|
        byte = source.getbyte(offset + position)
        raise Cton::Error, "Invalid CTON-B varint" unless byte

        result |= (byte & 0x7f) << (7 * position)
        return [result, position + 1] if (byte & 0x80).zero?
      end

      # Every permitted byte carried the continuation bit: refuse to keep
      # accumulating an unbounded integer from untrusted data.
      raise Cton::Error, "Invalid CTON-B varint"
    end
  end
end
|