@creationix/jot 0.0.1 → 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (83) hide show
  1. package/LICENSE +21 -0
  2. package/README.md +112 -23
  3. package/dist/jot.cjs +446 -0
  4. package/dist/jot.d.ts +6 -0
  5. package/dist/jot.js +442 -0
  6. package/package.json +34 -4
  7. package/SUMMARY.md +0 -151
  8. package/TOKEN_COUNTS.md +0 -97
  9. package/bun.lock +0 -19
  10. package/jot.test.ts +0 -133
  11. package/jot.ts +0 -650
  12. package/samples/chat.jot +0 -1
  13. package/samples/chat.json +0 -1
  14. package/samples/chat.pretty.jot +0 -6
  15. package/samples/chat.pretty.json +0 -16
  16. package/samples/firewall.jot +0 -1
  17. package/samples/firewall.json +0 -1
  18. package/samples/firewall.pretty.jot +0 -235
  19. package/samples/firewall.pretty.json +0 -344
  20. package/samples/github-issue.jot +0 -1
  21. package/samples/github-issue.json +0 -1
  22. package/samples/github-issue.pretty.jot +0 -15
  23. package/samples/github-issue.pretty.json +0 -20
  24. package/samples/hikes.jot +0 -1
  25. package/samples/hikes.json +0 -1
  26. package/samples/hikes.pretty.jot +0 -14
  27. package/samples/hikes.pretty.json +0 -38
  28. package/samples/irregular.jot +0 -1
  29. package/samples/irregular.json +0 -1
  30. package/samples/irregular.pretty.jot +0 -13
  31. package/samples/irregular.pretty.json +0 -23
  32. package/samples/json-counts-cache.jot +0 -1
  33. package/samples/json-counts-cache.json +0 -1
  34. package/samples/json-counts-cache.pretty.jot +0 -26
  35. package/samples/json-counts-cache.pretty.json +0 -26
  36. package/samples/key-folding-basic.jot +0 -1
  37. package/samples/key-folding-basic.json +0 -1
  38. package/samples/key-folding-basic.pretty.jot +0 -7
  39. package/samples/key-folding-basic.pretty.json +0 -25
  40. package/samples/key-folding-mixed.jot +0 -1
  41. package/samples/key-folding-mixed.json +0 -1
  42. package/samples/key-folding-mixed.pretty.jot +0 -16
  43. package/samples/key-folding-mixed.pretty.json +0 -24
  44. package/samples/key-folding-with-array.jot +0 -1
  45. package/samples/key-folding-with-array.json +0 -1
  46. package/samples/key-folding-with-array.pretty.jot +0 -6
  47. package/samples/key-folding-with-array.pretty.json +0 -29
  48. package/samples/large.jot +0 -1
  49. package/samples/large.json +0 -1
  50. package/samples/large.pretty.jot +0 -72
  51. package/samples/large.pretty.json +0 -93
  52. package/samples/logs.jot +0 -1
  53. package/samples/logs.json +0 -1
  54. package/samples/logs.pretty.jot +0 -96
  55. package/samples/logs.pretty.json +0 -350
  56. package/samples/medium.jot +0 -1
  57. package/samples/medium.json +0 -1
  58. package/samples/medium.pretty.jot +0 -13
  59. package/samples/medium.pretty.json +0 -30
  60. package/samples/metrics.jot +0 -1
  61. package/samples/metrics.json +0 -1
  62. package/samples/metrics.pretty.jot +0 -11
  63. package/samples/metrics.pretty.json +0 -25
  64. package/samples/package.jot +0 -1
  65. package/samples/package.json +0 -1
  66. package/samples/package.pretty.jot +0 -18
  67. package/samples/package.pretty.json +0 -18
  68. package/samples/products.jot +0 -1
  69. package/samples/products.json +0 -1
  70. package/samples/products.pretty.jot +0 -69
  71. package/samples/products.pretty.json +0 -235
  72. package/samples/routes.jot +0 -1
  73. package/samples/routes.json +0 -1
  74. package/samples/routes.pretty.jot +0 -142
  75. package/samples/routes.pretty.json +0 -354
  76. package/samples/small.jot +0 -1
  77. package/samples/small.json +0 -1
  78. package/samples/small.pretty.jot +0 -8
  79. package/samples/small.pretty.json +0 -12
  80. package/samples/users-50.jot +0 -1
  81. package/samples/users-50.json +0 -1
  82. package/samples/users-50.pretty.jot +0 -53
  83. package/samples/users-50.pretty.json +0 -354
package/dist/jot.js ADDED
@@ -0,0 +1,442 @@
1
// Bare words that the parser reads back as literals rather than strings.
const RESERVED = new Set("true false null".split(" "))
// Characters that force a string to be written in JSON-quoted form.
const UNSAFE = Array.from(':,{}[]";\\')
// Matches one whitespace character.
const WS_RE = /\s/
// Matches any character that ends an unquoted object key.
const KEY_TERM_RE = /[:,{}[\];\s]/
5
/**
 * Decide whether a string has to be written in JSON-quoted form.
 *
 * A string needs quotes when it is empty, has leading or trailing whitespace,
 * equals a reserved word, converts to a number via `Number()`, contains one of
 * the UNSAFE characters (or one of `extra`), or contains a control character
 * (char code below 32).
 *
 * @param {string} s - Candidate string.
 * @param {Iterable<string>} [extra] - Additional substrings that force quoting.
 * @returns {boolean} True when the bare form must not be used.
 */
function needsQuotes(s, extra = []) {
  const forbidden = [...UNSAFE, ...extra]
  if (s === "" || s.trim() !== s) {
    return true
  }
  if (RESERVED.has(s)) {
    return true
  }
  // Anything Number() accepts would be read back as a number, not a string.
  if (!Number.isNaN(Number(s))) {
    return true
  }
  for (const mark of forbidden) {
    if (s.includes(mark)) {
      return true
    }
  }
  for (const ch of s) {
    if (ch.charCodeAt(0) < 32) {
      return true
    }
  }
  return false
}
16
/**
 * Serialize a string value: JSON-quoted when `needsQuotes` says so, bare otherwise.
 * @param {string} s
 * @returns {string}
 */
function quote(s) {
  if (needsQuotes(s)) {
    return JSON.stringify(s)
  }
  return s
}
/**
 * Serialize an object key or table column name. Same as `quote`, except that a
 * "." also forces quoting.
 * @param {string} s
 * @returns {string}
 */
function quoteKey(s) {
  if (needsQuotes(s, ["."])) {
    return JSON.stringify(s)
  }
  return s
}
18
/**
 * Follow a chain of single-key objects starting at `value` and collect the keys
 * that can be joined into one dotted key.
 *
 * The walk stops at the first value that is not a plain object, at an object
 * that does not have exactly one key, or at a key that could not be read back
 * from a bare dotted key.
 *
 * @param {*} value - Value to inspect.
 * @returns {{path: string[], leaf: *} | null} The collected keys and the value
 *   found after them, or null when nothing can be folded.
 */
function getFoldPath(value) {
  // "." is the path separator; every other character here ends an unquoted key
  // when parsing, so a key containing one cannot be part of a bare dotted key.
  const unfoldable = /[.:,{}[\];]|\s/
  const path = []
  let current = value
  while (current !== null && typeof current === "object" && !Array.isArray(current)) {
    const keys = Object.keys(current)
    if (keys.length !== 1 || unfoldable.test(keys[0])) {
      break
    }
    path.push(keys[0])
    current = current[keys[0]]
  }
  return path.length > 0 ? { path, leaf: current } : null
}
31
/**
 * Split an array of objects into runs of consecutive objects that share the
 * same keys in the same order.
 *
 * Keys are compared element by element. Comparing joined strings would treat
 * the single key "a,b" and the two keys "a", "b" as the same schema.
 *
 * @param {object[]} arr - Objects to group.
 * @returns {{keys: string[], objects: object[]}[]} Groups in input order.
 */
function groupBySchema(arr) {
  const sameKeys = (a, b) => a.length === b.length && a.every((k, i) => k === b[i])
  const groups = []
  for (const obj of arr) {
    const keys = Object.keys(obj)
    const last = groups.at(-1)
    if (last && sameKeys(last.keys, keys)) {
      last.objects.push(obj)
    } else {
      groups.push({ keys, objects: [obj] })
    }
  }
  return groups
}
44
// Options of the stringify call in progress (assigned by stringify()).
let opts = {}
// Current nesting level while pretty-printing.
let depth = 0
/**
 * Indentation for the current depth.
 * @returns {string} `opts.indent` (a single space when unset) repeated `depth`
 *   times, or "" when pretty-printing is off.
 */
function ind() {
  if (!opts.pretty) {
    return ""
  }
  const unit = opts.indent ?? " "
  return unit.repeat(depth)
}
47
/**
 * Serialize any value by dispatching on its type.
 *
 * @param {*} value - Value to serialize.
 * @param {boolean} [atLineStart=false] - Passed through to `stringifyObject`
 *   when `value` is a plain object.
 * @returns {string} The serialized text.
 */
function stringifyValue(value, atLineStart = false) {
  if (value === null) {
    return "null"
  }
  switch (typeof value) {
    case "string":
      return quote(value)
    case "object":
      return Array.isArray(value) ? stringifyArray(value) : stringifyObject(value, atLineStart)
    default:
      // Booleans, numbers and every remaining type use their String() form.
      return String(value)
  }
}
68
/**
 * Serialize an array, using table form (`{{...}}`) when it holds at least two
 * plain objects and some run of consecutive objects shares a schema, and list
 * form (`[...]`) otherwise.
 *
 * @param {Array} arr - Array to serialize.
 * @returns {string} The serialized text.
 */
function stringifyArray(arr) {
  const isTable = arr.length >= 2 && arr.every((i) => i !== null && typeof i === "object" && !Array.isArray(i))
  if (isTable) {
    const groups = groupBySchema(arr)
    // A group of key-less objects would be written as an empty schema row
    // followed by empty data rows, which cannot be parsed back as a table.
    // Arrays containing such objects therefore stay in list form.
    const everyGroupHasKeys = groups.every((g) => g.keys.length > 0)
    if (everyGroupHasKeys && groups.some((g) => g.objects.length >= 2)) {
      return stringifyTable(groups)
    }
  }
  if (arr.length === 1) {
    return `[${stringifyValue(arr[0])}]`
  }
  const hasComplex = arr.some((i) => i !== null && typeof i === "object")
  if (opts.pretty && arr.length > 0 && hasComplex) {
    // One item per line, each indented one level deeper than the brackets.
    depth++
    const items = arr.map((i) => `${ind()}${stringifyValue(i, true)}`)
    depth--
    return `[\n${items.join(",\n")}\n${ind()}]`
  }
  const sep = opts.pretty ? ", " : ","
  const items = arr.map((v) => stringifyValue(v)).join(sep)
  return opts.pretty ? `[ ${items} ]` : `[${items}]`
}
90
/**
 * Serialize schema groups as a table: each group contributes one schema row
 * (":" followed by its column names) and one data row per object.
 *
 * Compact output joins rows with ";". Pretty output puts each row on its own
 * line, with schema rows one level and data rows two levels deeper than the
 * enclosing braces.
 *
 * @param {{keys: string[], objects: object[]}[]} groups - Output of `groupBySchema`.
 * @returns {string} The serialized table.
 */
function stringifyTable(groups) {
  const pretty = Boolean(opts.pretty)
  const sep = pretty ? ", " : ","
  let schemaPrefix = ""
  let dataPrefix = ""
  if (pretty) {
    depth++
    schemaPrefix = ind()
    depth++
    dataPrefix = ind()
  }
  const lines = []
  for (const { keys, objects } of groups) {
    const header = keys.map((k) => quoteKey(k)).join(sep)
    lines.push(`${schemaPrefix}:${header}`)
    for (const obj of objects) {
      const cells = keys.map((k) => stringifyValue(obj[k])).join(sep)
      lines.push(`${dataPrefix}${cells}`)
    }
  }
  if (!pretty) {
    return `{{${lines.join(";")}}}`
  }
  // Restore the caller's depth before indenting the closing braces.
  depth -= 2
  return `{{\n${lines.join("\n")}\n${ind()}}}`
}
114
/**
 * Serialize a plain object.
 *
 * A key whose value is a chain of single-key objects is folded into one dotted
 * key (`{a:{b:1}}` becomes `{a.b:1}`). Keys that cannot be written bare are
 * JSON-quoted and never folded.
 *
 * @param {object} obj - Object to serialize.
 * @param {boolean} [atLineStart=false] - In pretty mode, allows the compact
 *   layout where the first pair shares the line with the opening brace.
 * @returns {string} The serialized text.
 */
function stringifyObject(obj, atLineStart = false) {
  const keys = Object.keys(obj)
  // A key may be written bare only if it needs no quoting and contains no
  // whitespace, because the parser ends an unquoted key at the first
  // whitespace character.
  const isBareKey = (k) => !needsQuotes(k, ["."]) && !WS_RE.test(k)
  const pair = (k, pretty) => {
    const val = obj[k]
    const gap = pretty ? ": " : ":"
    const bare = isBareKey(k)
    if (bare && val !== null && typeof val === "object" && !Array.isArray(val)) {
      const fold = getFoldPath(val)
      if (fold) {
        return `${k}.${fold.path.join(".")}${gap}${stringifyValue(fold.leaf)}`
      }
    }
    const qk = bare ? k : JSON.stringify(k)
    return `${qk}${gap}${stringifyValue(val)}`
  }
  if (opts.pretty && keys.length > 1) {
    depth++
    const rawPairs = keys.map((k) => pair(k, true))
    // The compact layout is skipped when the last pair ends with a closing
    // brace or bracket.
    const lastMulti = rawPairs.at(-1)?.endsWith("}") || rawPairs.at(-1)?.endsWith("]")
    const compact = atLineStart && !lastMulti
    const pairs = rawPairs.map((p, i) => (i === 0 && compact ? p : `${ind()}${p}`))
    depth--
    return compact ? `{ ${pairs.join(",\n")} }` : `{\n${pairs.join(",\n")}\n${ind()}}`
  }
  if (opts.pretty && keys.length === 1) {
    return `{ ${pair(keys[0], true)} }`
  }
  return `{${keys.map((k) => pair(k, false)).join(",")}}`
}
142
/**
 * Serialize a value as Jot text.
 *
 * @param {*} data - Value to serialize.
 * @param {{pretty?: boolean, indent?: string}} [options] - `pretty` turns on
 *   multi-line output; `indent` is the string repeated per nesting level.
 * @returns {string} The serialized text.
 */
export function stringify(data, options = {}) {
  const settings = { pretty: false, indent: " ", ...options }
  opts = settings
  // Start every call at the top nesting level.
  depth = 0
  return stringifyValue(data)
}
147
+ // Parser
148
/**
 * Recursive-descent parser for Jot text.
 *
 * Create one instance per input string and call `parse()` once. `pos` is the
 * index of the next unread character of `input`.
 */
class JotParser {
  /** Single-character escapes accepted after a backslash in a quoted string. */
  static ESCAPES = { '"': '"', "\\": "\\", "/": "/", b: "\b", f: "\f", n: "\n", r: "\r", t: "\t" }
  /** Exactly four hexadecimal digits, as required after `\u`. */
  static HEX4_RE = /^[0-9a-fA-F]{4}$/

  input
  pos = 0

  /** @param {string} input - Jot text to parse. */
  constructor(input) {
    this.input = input
  }

  /** True for non-null, non-array objects. */
  static isRecord(v) {
    return typeof v === "object" && v !== null && !Array.isArray(v)
  }

  /**
   * Store `value` under `key` as an own property of `target`.
   * Plain assignment to "__proto__" would replace the object's prototype
   * instead of creating a property, so that key is defined explicitly.
   */
  static setOwn(target, key, value) {
    if (key === "__proto__") {
      Object.defineProperty(target, key, { value, writable: true, enumerable: true, configurable: true })
    } else {
      target[key] = value
    }
  }

  /**
   * Parse the whole input as a single value.
   * @returns {*} The parsed value.
   * @throws {Error} On malformed input or trailing characters.
   */
  parse() {
    this.ws()
    const result = this.value("")
    this.ws()
    if (this.pos < this.input.length) {
      throw new Error(`Unexpected character at position ${this.pos}: '${this.input[this.pos]}'`)
    }
    return result
  }

  /** Advance past any whitespace. */
  ws() {
    while (this.pos < this.input.length && WS_RE.test(this.input[this.pos])) this.pos++
  }

  /** Next unread character, or "" at end of input. */
  peek = () => this.input[this.pos] || ""

  /**
   * Parse one value of any kind.
   * @param {string} [terminators=""] - Characters that end a bare token; ""
   *   means a bare token runs to the end of the input.
   */
  value(terminators = "") {
    this.ws()
    const ch = this.peek()
    if (ch === "{") {
      return this.input[this.pos + 1] === "{" ? this.table() : this.object()
    }
    if (ch === "[") {
      return this.array()
    }
    if (ch === '"') {
      return this.quoted()
    }
    return this.atom(terminators)
  }

  /**
   * Parse a JSON-style quoted string; `pos` must be on the opening quote.
   * @returns {string} The decoded string.
   * @throws {Error} On an unknown escape, a malformed `\u` escape, or a
   *   missing closing quote.
   */
  quoted() {
    this.pos++
    let result = ""
    while (this.pos < this.input.length) {
      const ch = this.input[this.pos]
      if (ch === '"') {
        this.pos++
        return result
      }
      if (ch === "\\") {
        this.pos++
        const esc = this.input[this.pos]
        if (esc === undefined) {
          // Backslash was the last character: report the missing quote below.
          break
        }
        if (Object.hasOwn(JotParser.ESCAPES, esc)) {
          result += JotParser.ESCAPES[esc]
        } else if (esc === "u") {
          const hex = this.input.slice(this.pos + 1, this.pos + 5)
          // Without this check a bad escape would decode to U+0000.
          if (!JotParser.HEX4_RE.test(hex)) {
            throw new Error(`Invalid unicode escape '\\u${hex}' at position ${this.pos - 1}`)
          }
          result += String.fromCharCode(Number.parseInt(hex, 16))
          this.pos += 4
        } else {
          throw new Error(`Invalid escape sequence '\\${esc}'`)
        }
      } else {
        result += ch
      }
      this.pos++
    }
    throw new Error("Unterminated string")
  }

  /**
   * Read a bare token up to (not including) the first terminator character and
   * return it trimmed.
   * @param {string} terminators - Terminator characters; "" reads to end of input.
   * @throws {Error} When the token is empty.
   */
  parseToken(terminators) {
    const start = this.pos
    if (terminators === "") {
      const token = this.input.slice(start).trim()
      this.pos = this.input.length
      if (token === "") {
        throw new Error(`Unexpected end of input at position ${start}`)
      }
      return token
    }
    while (this.pos < this.input.length && !terminators.includes(this.input[this.pos])) {
      this.pos++
    }
    const token = this.input.slice(start, this.pos).trim()
    if (token === "") {
      throw new Error(`Unexpected character at position ${this.pos}: '${this.peek()}'`)
    }
    return token
  }

  /** Convert a bare token to null, a boolean, a number, or the token itself. */
  tokenToValue(token) {
    if (token === "null") {
      return null
    }
    if (token === "true") {
      return true
    }
    if (token === "false") {
      return false
    }
    const num = Number(token)
    if (!Number.isNaN(num) && token !== "") {
      return num
    }
    return token
  }

  /** Parse a bare (unquoted) scalar. */
  atom(terminators) {
    return this.tokenToValue(this.parseToken(terminators))
  }

  /**
   * Parse `[...]`; `pos` must be on the opening bracket.
   * @throws {Error} When the closing bracket is missing.
   */
  array() {
    this.pos++
    const result = []
    this.ws()
    while (this.peek() !== "]") {
      if (this.pos >= this.input.length) {
        throw new Error("Unterminated array")
      }
      result.push(this.value(",]"))
      this.ws()
      if (this.peek() === ",") {
        this.pos++
        this.ws()
      }
    }
    this.pos++
    return result
  }

  /**
   * Parse `{{...}}`; `pos` must be on the first opening brace. Rows starting
   * with ":" set the current schema; every other row becomes one object built
   * from that schema.
   * @returns {object[]} One object per data row.
   * @throws {Error} On a data row before any schema, a row that cannot be
   *   read, or a missing `}}`.
   */
  table() {
    this.pos += 2
    const result = []
    let schema = []
    this.ws()
    while (this.input.slice(this.pos, this.pos + 2) !== "}}") {
      if (this.pos >= this.input.length) {
        throw new Error("Unterminated table")
      }
      const rowStart = this.pos
      this.ws()
      if (this.peek() === ":") {
        this.pos++
        schema = this.schemaRow()
      } else {
        if (schema.length === 0) {
          throw new Error(`Data row without schema at position ${this.pos}`)
        }
        const values = this.dataRow(schema.length)
        const obj = {}
        for (let i = 0; i < schema.length; i++) {
          JotParser.setOwn(obj, schema[i], values[i])
        }
        result.push(obj)
      }
      this.ws()
      if (this.peek() === ";") {
        this.pos++
        this.ws()
      }
      // A row that consumed nothing (e.g. a lone "}") would repeat forever.
      if (this.pos === rowStart) {
        throw new Error(`Unexpected character at position ${this.pos}: '${this.peek()}'`)
      }
    }
    this.pos += 2
    return result
  }

  /**
   * Read the column names of a schema row, starting just after the ":".
   * Stops (without consuming) at ";", a newline, or "}}". A column that starts
   * with a double quote is read as a quoted string, so names that were written
   * quoted come back without the quote characters.
   * @returns {string[]} Column names in order; blank unquoted columns are skipped.
   */
  schemaRow() {
    const cols = []
    let col = ""
    // True while the current column has already been supplied as a quoted string.
    let quotedCol = false
    const endColumn = () => {
      if (!quotedCol && col.trim()) {
        cols.push(col.trim())
      }
      col = ""
      quotedCol = false
    }
    while (this.pos < this.input.length) {
      const ch = this.input[this.pos]
      if ((ch === "}" && this.input[this.pos + 1] === "}") || ch === ";" || ch === "\n") {
        endColumn()
        break
      }
      if (ch === ",") {
        endColumn()
        this.pos++
        continue
      }
      if (quotedCol) {
        // Only whitespace may follow a quoted name before the next delimiter.
        if (!WS_RE.test(ch)) {
          throw new Error(`Unexpected character at position ${this.pos}: '${ch}'`)
        }
        this.pos++
        continue
      }
      if (ch === '"' && col.trim() === "") {
        // quoted() leaves pos just past the closing quote.
        cols.push(this.quoted())
        quotedCol = true
        col = ""
        continue
      }
      col += ch
      this.pos++
    }
    return cols
  }

  /** Read `colCount` comma-separated values of one data row. */
  dataRow(colCount) {
    const values = []
    for (let i = 0; i < colCount; i++) {
      this.ws()
      // The last column does not stop at ",".
      values.push(this.tableValue(i < colCount - 1 ? ",;}\n" : ";}\n"))
      this.ws()
      if (this.peek() === ",") {
        this.pos++
      }
    }
    return values
  }

  /**
   * Parse one table cell. Like `value`, except that a bare token also ends at
   * "}}" and an empty cell yields null instead of throwing.
   */
  tableValue(terminators) {
    this.ws()
    const ch = this.peek()
    if (ch === '"') {
      return this.quoted()
    }
    if (ch === "{") {
      return this.input[this.pos + 1] === "{" ? this.table() : this.object()
    }
    if (ch === "[") {
      return this.array()
    }
    const start = this.pos
    while (this.pos < this.input.length) {
      const c = this.input[this.pos]
      if ((c === "}" && this.input[this.pos + 1] === "}") || terminators.includes(c)) {
        break
      }
      this.pos++
    }
    const token = this.input.slice(start, this.pos).trim()
    return token === "" ? null : this.tokenToValue(token)
  }

  /**
   * Parse `{...}`; `pos` must be on the opening brace. Quoted keys are stored
   * verbatim; unquoted keys are split on "." and merged as nested objects.
   * @throws {Error} On a missing key, a missing ":", or a missing closing brace.
   */
  object() {
    this.pos++
    const result = {}
    this.ws()
    while (this.peek() !== "}") {
      if (this.pos >= this.input.length) {
        throw new Error("Unterminated object")
      }
      const { key, quoted } = this.parseKey()
      this.ws()
      if (this.peek() !== ":") {
        throw new Error(`Expected ':' after key '${key}' at position ${this.pos}`)
      }
      this.pos++
      const value = this.value(",}")
      if (quoted) {
        JotParser.setOwn(result, key, value)
      } else {
        this.merge(result, this.unfold(key, value))
      }
      this.ws()
      if (this.peek() === ",") {
        this.pos++
        this.ws()
      }
    }
    this.pos++
    return result
  }

  /**
   * Read an object key.
   * @returns {{key: string, quoted: boolean}} The key and whether it was quoted.
   * @throws {Error} When an unquoted key is empty.
   */
  parseKey() {
    this.ws()
    if (this.peek() === '"') {
      return { key: this.quoted(), quoted: true }
    }
    const start = this.pos
    while (this.pos < this.input.length && !KEY_TERM_RE.test(this.input[this.pos])) this.pos++
    const key = this.input.slice(start, this.pos)
    if (key === "") {
      throw new Error(`Expected key at position ${this.pos}`)
    }
    return { key, quoted: false }
  }

  /** Expand a dotted key into nested objects: ("a.b", 1) gives {a: {b: 1}}. */
  unfold(keyPath, value) {
    const parts = keyPath.split(".")
    const result = {}
    let current = result
    for (let i = 0; i < parts.length - 1; i++) {
      const nested = {}
      JotParser.setOwn(current, parts[i], nested)
      current = nested
    }
    JotParser.setOwn(current, parts.at(-1), value)
    return result
  }

  /**
   * Merge `src` into `target` in place. Where both sides hold a plain object
   * under the same key the two are merged recursively; otherwise the value
   * from `src` replaces the one in `target`.
   */
  merge(target, src) {
    for (const key of Object.keys(src)) {
      const sv = src[key]
      // Only own properties count, so the merge never descends into an
      // inherited object such as Object.prototype.
      const tv = Object.hasOwn(target, key) ? target[key] : undefined
      if (JotParser.isRecord(tv) && JotParser.isRecord(sv)) {
        this.merge(tv, sv)
      } else {
        JotParser.setOwn(target, key, sv)
      }
    }
  }
}
440
/**
 * Parse Jot text into a JavaScript value.
 *
 * @param {string} input - Jot text.
 * @returns {*} The value produced by `JotParser.parse()`.
 */
export function parse(input) {
  const parser = new JotParser(input)
  return parser.parse()
}
package/package.json CHANGED
@@ -1,10 +1,40 @@
1
1
  {
2
2
  "name": "@creationix/jot",
3
- "version": "0.0.1",
3
+ "repository": {
4
+ "type": "git",
5
+ "url": "https://github.com/creationix/jot"
6
+ },
7
+ "version": "0.1.0",
8
+ "description": "LLM and human friendly JSON alternative",
9
+ "type": "module",
10
+ "main": "./dist/jot.cjs",
11
+ "module": "./dist/jot.js",
12
+ "types": "./dist/jot.d.ts",
13
+ "exports": {
14
+ "import": "./dist/jot.js",
15
+ "require": "./dist/jot.cjs"
16
+ },
17
+ "files": [
18
+ "dist/*",
19
+ "README.md"
20
+ ],
21
+ "scripts": {
22
+ "build:clean": "rm -rf dist/*",
23
+ "build:cjs": "tsc --project tsconfig.cjs.json && mv dist/jot.js dist/jot.cjs",
24
+ "build:esm": "tsc --project tsconfig.json",
25
+ "build:tidy": "biome format --fix dist && biome lint --fix dist",
26
+ "build": "npm run build:clean && npm run build:cjs && npm run build:esm && npm run build:tidy",
27
+ "format": "biome format --fix src && biome lint --fix src",
28
+ "lint": "biome lint src",
29
+ "test": "bun test",
30
+ "prepublish": "npm run format && npm run lint && npm run build && bun samples/measure-tokens.ts"
31
+ },
4
32
  "author": "Tim Caswell <tim@creationix.com>",
5
33
  "license": "MIT",
6
- "dependencies": {},
7
34
  "devDependencies": {
8
- "@types/bun": "^1.3.5"
35
+ "@anthropic-ai/tokenizer": "^0.0.4",
36
+ "@biomejs/biome": "^1.9.4",
37
+ "@types/bun": "^1.3.5",
38
+ "typescript": "^5.7.3"
9
39
  }
10
- }
40
+ }
package/SUMMARY.md DELETED
@@ -1,151 +0,0 @@
1
- # Encoding Format Comparison
2
-
3
- Token counts for 18 test documents across three tokenizers. For LLM systems, **tokens matter more than bytes**.
4
-
5
- ## Recommendation
6
-
7
- **Use Jot** for LLM contexts — saves 16-17% tokens vs JSON.
8
-
9
- ## Token Efficiency
10
-
11
- <!-- CHART_START -->
12
- ```mermaid
13
- xychart-beta
14
- title "Token Counts by Format"
15
- x-axis ["Jot", "JSON-m", "JSONito", "Jot-P", "D2", "TOON", "YAML", "TOML", "JSON-s", "JSON-p"]
16
- y-axis "Tokens" 0 --> 16000
17
- line "Qwen" [6525, 7748, 7757, 8239, 8292, 8315, 9543, 10180, 11799, 12656]
18
- line "Legacy" [6420, 7377, 7794, 7204, 7582, 7079, 7661, 11204, 10966, 11937]
19
- line "Claude" [6747, 8132, 8327, 8500, 7928, 8405, 9456, 11485, 12687, 14403]
20
- ```
21
- <!-- CHART_END -->
22
-
23
- ### Compact Formats
24
-
25
- For machine-to-machine or LLM contexts where readability isn't required.
26
-
27
- <!-- COMPACT_START -->
28
- | Format | Qwen | Legacy | Claude | Bytes |
29
- |-----------------------------------------------------|---------------:|---------------:|---------------:|---------------:|
30
- | **[Jot](jot/)** | 6,525 (-16%) | 6,420 (-13%) | 6,747 (-17%) | 16,621 (-28%) |
31
- | [JSON](https://www.json.org/) (mini) | 7,748 | 7,377 | 8,132 | 23,119 |
32
- | [JSONito](https://github.com/creationix/jsonito) | 7,757 (+0%) | 7,794 (+6%) | 8,327 (+2%) | 14,059 (-39%) |
33
- | [D2](https://github.com/creationix/d2) | 8,292 (+7%) | 7,582 (+3%) | 7,928 (-3%) | 17,328 (-25%) |
34
- <!-- COMPACT_END -->
35
-
36
- ### Pretty-Printed Formats
37
-
38
- For human-readable output or when LLMs need to read/write structured data.
39
-
40
- <!-- PRETTY_START -->
41
- | Format | Qwen | Legacy | Claude | Bytes |
42
- |-----------------------------------------------------|---------------:|---------------:|---------------:|---------------:|
43
- | **[Jot](jot/) (pretty)** | 8,239 (-35%) | 7,204 (-40%) | 8,500 (-41%) | 23,676 (-41%) |
44
- | [TOON](toon/) | 8,315 (-34%) | 7,079 (-41%) | 8,405 (-42%) | 22,780 (-43%) |
45
- | [YAML](https://yaml.org/) | 9,543 (-25%) | 7,661 (-36%) | 9,456 (-34%) | 26,757 (-33%) |
46
- | [TOML](https://toml.io/) | 10,180 (-20%) | 11,204 (-6%) | 11,485 (-20%) | 28,930 (-27%) |
47
- | [JSON](json/smart-json.ts) (smart) | 11,799 (-7%) | 10,966 (-8%) | 12,687 (-12%) | 32,657 (-18%) |
48
- | [JSON](https://www.json.org/) (pretty) | 12,656 | 11,937 | 14,403 | 39,884 |
49
- <!-- PRETTY_END -->
50
-
51
- ## Format Descriptions
52
-
53
- ### [Jot](jot/)
54
-
55
- JSON with three optimizations:
56
-
57
- 1. **Unquoted strings** — omit quotes unless value contains `: ; , { } [ ] "` or parses as number/boolean/null
58
- 2. **Key folding** — `{a:{b:1}}` → `{a.b:1}` for single-key nested objects
59
- 3. **Tables** — `[{a:1},{a:2}]` → `{{:a;1;2}}` for repeating object schemas
60
-
61
- ```jot
62
- {config.host:localhost,users:{{:id,name;1,Alice;2,Bob}}}
63
- ```
64
-
65
- It also has a pretty-printed variant that adds indentation and newlines for readability.
66
-
67
- ```jot
68
- {
69
- config.host: localhost,
70
- users: {{
71
- :id, name;
72
- 1, Alice;
73
- 2, Bob
74
- }}
75
- }
76
- ```
77
-
78
- ### [TOON](toon/)
79
-
80
- YAML-like indentation with optional table syntax and count guards.
81
-
82
- ```toon
83
- users[2]{id,name}:
84
- 1,Alice
85
- 2,Bob
86
- ```
87
-
88
- ### [JSONito](https://github.com/creationix/jsonito)
89
-
90
- Byte-optimized JSON with string deduplication via preamble dictionary.
91
-
92
- ```jito
93
- {name'config'version'5~1.0.0enabled'!a~maxRetries6.timeout'eFw.tags'[a~productionapi'v1']}
94
- ```
95
-
96
- ### [D2](https://github.com/creationix/d2)
97
-
98
- Declarative data format using `=` assignment and shell-like quoting.
99
-
100
- ## Why Not Byte-Optimized Formats?
101
-
102
- Formats like JSONito achieve excellent byte compression (-39%) but:
103
-
104
- - Token savings are inconsistent (small docs often cost more than JSON)
105
- - Deduplication preambles add overhead that doesn't scale down
106
- - LLMs cannot reliably generate formats requiring state tracking
107
-
108
- ## LLM Encoding Ability
109
-
110
- Tested Qwen3-30b's ability to encode JSON → Jot (3 runs per document, 17 docs):
111
-
112
- | Document Type | Semantic Accuracy |
113
- |----------------------------------------------|------------------:|
114
- | Simple configs (small, metrics, package) | 100% |
115
- | Key folding test cases | 100% |
116
- | Table-friendly (users-50) | 100% |
117
- | Text-heavy (chat) | 100% |
118
- | Complex/nested (large, firewall, routes) | 0% |
119
- | Irregular schemas (medium, hikes, irregular) | 0% |
120
- | **Overall** | **47%** |
121
-
122
- Small models struggle with Jot's advanced features on complex documents. For LLM-generated output, consider using simpler Jot (unquoted strings only) or providing FORMAT.md as context.
123
-
124
- ## Tokenizers
125
-
126
- - **Qwen**: Qwen3-Coder-30b via LM Studio API
127
- - **Legacy**: Anthropic legacy tokenizer (`@anthropic-ai/tokenizer`)
128
- - **Claude**: Claude API token counting endpoint (Sonnet/Opus/Haiku share tokenizer)
129
-
130
- ## Test Data
131
-
132
- 18 documents covering diverse structures:
133
-
134
- | Document | Description |
135
- |-------------------|----------------------------------|
136
- | small | Config object (6 fields) |
137
- | medium | User list with metadata |
138
- | large | Kubernetes deployment spec |
139
- | hikes | Tabular records (uniform schema) |
140
- | chat | LLM conversation (text-heavy) |
141
- | metrics | Time series (numeric-heavy) |
142
- | package | npm manifest (nested deps) |
143
- | github-issue | Mixed nesting with labels |
144
- | irregular | Event log (varying keys) |
145
- | users-50 | 50 user records (table-friendly) |
146
- | logs | 50 log entries (semi-uniform) |
147
- | firewall | WAF rules (deeply nested) |
148
- | products | E-commerce catalog (variants) |
149
- | routes | API routing config (large tables)|
150
- | key-folding-* | Key folding test cases |
151
- | json-counts-cache | Cached token counts |