slimjson 1.1.0 → 1.1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +34 -25
- package/README_EN.md +34 -25
- package/compress.js +44 -74
- package/coverage/clover.xml +275 -0
- package/coverage/coverage-final.json +2 -0
- package/coverage/lcov-report/base.css +224 -0
- package/coverage/lcov-report/block-navigation.js +87 -0
- package/coverage/lcov-report/compress.js.html +1630 -0
- package/coverage/lcov-report/favicon.png +0 -0
- package/coverage/lcov-report/index.html +116 -0
- package/coverage/lcov-report/prettify.css +1 -0
- package/coverage/lcov-report/prettify.js +2 -0
- package/coverage/lcov-report/sort-arrow-sprite.png +0 -0
- package/coverage/lcov-report/sorter.js +210 -0
- package/coverage/lcov.info +636 -0
- package/data/data.json +96365 -0
- package/data/data.json.slim +1 -0
- package/esm.mjs +1 -0
- package/package.json +1 -1
- package/test.js +719 -214
- package/.claude/settings.local.json +0 -11
package/README.md
CHANGED
|
@@ -440,61 +440,70 @@ console.log(`压缩率: ${ratio}%`);
|
|
|
440
440
|
|
|
441
441
|
## LLM 数据检索准确率
|
|
442
442
|
|
|
443
|
-
使用 209
|
|
443
|
+
使用 209 道数据检索题在 2 个模型上测试不同格式下 LLM 的理解准确率。
|
|
444
444
|
|
|
445
445
|
#### 效率排名(每 1K tokens 的准确率)
|
|
446
446
|
|
|
447
447
|
```
|
|
448
|
-
slimjson ████████████████████ 44.
|
|
449
|
-
TOON ███████████████░░░░░
|
|
448
|
+
slimjson ████████████████████ 44.3 acc%/1K tok │ 94.5% acc │ 2,133 tokens
|
|
449
|
+
TOON ███████████████░░░░░ 33.8 acc%/1K tok │ 92.3% acc │ 2,734 tokens
|
|
450
450
|
JSON compact ██████████████░░░░░░ 31.0 acc%/1K tok │ 95.2% acc │ 3,072 tokens
|
|
451
|
-
YAML ███████████░░░░░░░░░
|
|
452
|
-
JSON
|
|
453
|
-
XML ████████░░░░░░░░░░░░ 18.
|
|
451
|
+
YAML ███████████░░░░░░░░░ 24.9 acc%/1K tok │ 92.3% acc │ 3,716 tokens
|
|
452
|
+
JSON █████████░░░░░░░░░░░ 20.3 acc%/1K tok │ 92.3% acc │ 4,538 tokens
|
|
453
|
+
XML ████████░░░░░░░░░░░░ 18.1 acc%/1K tok │ 93.3% acc │ 5,162 tokens
|
|
454
454
|
```
|
|
455
455
|
|
|
456
456
|
*效率分数 = (准确率% ÷ tokens) × 1,000,越高越好。*
|
|
457
457
|
|
|
458
|
-
> slimjson 准确率 **94.
|
|
458
|
+
> slimjson 准确率 **94.5%**(vs JSON 的 92.3%),同时节省 **53.0%** tokens。
|
|
459
459
|
|
|
460
460
|
#### 各模型准确率
|
|
461
461
|
|
|
462
462
|
```
|
|
463
463
|
deepseek-v4-flash
|
|
464
|
-
JSON ███████████████████░ 95.7% (200/209)
|
|
465
464
|
XML ███████████████████░ 95.7% (200/209)
|
|
465
|
+
JSON ███████████████████░ 95.7% (200/209)
|
|
466
466
|
JSON compact ███████████████████░ 95.2% (199/209)
|
|
467
|
-
→ slimjson ███████████████████░ 94.7% (198/209)
|
|
468
467
|
YAML ███████████████████░ 94.3% (197/209)
|
|
468
|
+
→ slimjson ███████████████████░ 93.3% (195/209)
|
|
469
469
|
TOON ███████████████████░ 92.8% (194/209)
|
|
470
470
|
CSV ██████████████████░░ 91.7% (100/109)
|
|
471
|
+
|
|
472
|
+
mimo-v2.5-pro
|
|
473
|
+
→ slimjson ███████████████████░ 95.7% (200/209)
|
|
474
|
+
JSON compact ███████████████████░ 95.2% (199/209)
|
|
475
|
+
TOON ██████████████████░░ 91.9% (192/209)
|
|
476
|
+
XML ██████████████████░░ 90.9% (190/209)
|
|
477
|
+
YAML ██████████████████░░ 90.4% (189/209)
|
|
478
|
+
JSON ██████████████████░░ 89.0% (186/209)
|
|
479
|
+
CSV ██████████████████░░ 88.1% (96/109)
|
|
471
480
|
```
|
|
472
481
|
|
|
473
482
|
#### 按题型准确率
|
|
474
483
|
|
|
475
|
-
| 题型 | JSON | XML | JSON
|
|
476
|
-
|
|
477
|
-
| 字段检索 |
|
|
478
|
-
| 聚合计算 |
|
|
479
|
-
| 条件筛选 | 97.9% |
|
|
480
|
-
| 结构感知 | 88.0% |
|
|
481
|
-
| 结构验证 |
|
|
484
|
+
| 题型 | JSON compact | slimjson | XML | JSON | TOON | YAML | CSV |
|
|
485
|
+
|------|-------------|----------|-----|------|------|------|-----|
|
|
486
|
+
| 字段检索 | 99.3% | 98.5% | 98.5% | 99.3% | 95.6% | 98.5% | 98.4% |
|
|
487
|
+
| 聚合计算 | 94.4% | 96.0% | 88.9% | 89.7% | 92.9% | 90.5% | 84.5% |
|
|
488
|
+
| 条件筛选 | 97.9% | 96.9% | 94.8% | 91.7% | 93.8% | 92.7% | 88.9% |
|
|
489
|
+
| 结构感知 | 88.0% | 88.0% | 90.0% | 90.0% | 90.0% | 88.0% | 87.5% |
|
|
490
|
+
| 结构验证 | 60.0% | 30.0% | 80.0% | 50.0% | 40.0% | 50.0% | 80.0% |
|
|
482
491
|
|
|
483
492
|
#### 测试数据集
|
|
484
493
|
|
|
485
|
-
| 数据集 | 行数 | 结构类型 | CSV 支持 |
|
|
486
|
-
|
|
487
|
-
| 均匀员工记录 | 100 | 均匀 | ✓ |
|
|
488
|
-
| 电商订单(嵌套结构) | 50 | 嵌套 | ✗ |
|
|
489
|
-
| 时间序列分析数据 | 60 | 均匀 | ✓ |
|
|
490
|
-
| Top 100 GitHub 仓库 | 100 | 均匀 | ✓ |
|
|
491
|
-
| 半均匀事件日志 | 75 | 半均匀 | ✗ |
|
|
492
|
-
| 深层嵌套配置 | 11 | 深层 | ✗ |
|
|
494
|
+
| 数据集 | 行数 | 结构类型 | CSV 支持 | 表格化程度 |
|
|
495
|
+
|--------|------|----------|----------|-----------|
|
|
496
|
+
| 均匀员工记录 | 100 | 均匀 | ✓ | 100% |
|
|
497
|
+
| 电商订单(嵌套结构) | 50 | 嵌套 | ✗ | 33% |
|
|
498
|
+
| 时间序列分析数据 | 60 | 均匀 | ✓ | 100% |
|
|
499
|
+
| Top 100 GitHub 仓库 | 100 | 均匀 | ✓ | 100% |
|
|
500
|
+
| 半均匀事件日志 | 75 | 半均匀 | ✗ | 50% |
|
|
501
|
+
| 深层嵌套配置 | 11 | 深层 | ✗ | 0% |
|
|
493
502
|
|
|
494
503
|
## 开发
|
|
495
504
|
|
|
496
505
|
```bash
|
|
497
|
-
# 运行测试(
|
|
506
|
+
# 运行测试(209 个用例,100% 覆盖率)
|
|
498
507
|
npm test
|
|
499
508
|
|
|
500
509
|
# 运行压缩率基准测试(含 trim 对比)
|
package/README_EN.md
CHANGED
|
@@ -429,61 +429,70 @@ Flat tabular datasets where CSV is applicable.
|
|
|
429
429
|
|
|
430
430
|
## LLM Data Retrieval Accuracy
|
|
431
431
|
|
|
432
|
-
Accuracy tested with 209 data retrieval questions across different input formats.
|
|
432
|
+
Accuracy tested with 209 data retrieval questions across 2 LLMs on different input formats.
|
|
433
433
|
|
|
434
434
|
#### Efficiency Ranking (Accuracy per 1K Tokens)
|
|
435
435
|
|
|
436
436
|
```
|
|
437
|
-
slimjson ████████████████████ 44.
|
|
438
|
-
TOON ███████████████░░░░░
|
|
437
|
+
slimjson ████████████████████ 44.3 acc%/1K tok │ 94.5% acc │ 2,133 tokens
|
|
438
|
+
TOON ███████████████░░░░░ 33.8 acc%/1K tok │ 92.3% acc │ 2,734 tokens
|
|
439
439
|
JSON compact ██████████████░░░░░░ 31.0 acc%/1K tok │ 95.2% acc │ 3,072 tokens
|
|
440
|
-
YAML ███████████░░░░░░░░░
|
|
441
|
-
JSON
|
|
442
|
-
XML ████████░░░░░░░░░░░░ 18.
|
|
440
|
+
YAML ███████████░░░░░░░░░ 24.9 acc%/1K tok │ 92.3% acc │ 3,716 tokens
|
|
441
|
+
JSON █████████░░░░░░░░░░░ 20.3 acc%/1K tok │ 92.3% acc │ 4,538 tokens
|
|
442
|
+
XML ████████░░░░░░░░░░░░ 18.1 acc%/1K tok │ 93.3% acc │ 5,162 tokens
|
|
443
443
|
```
|
|
444
444
|
|
|
445
445
|
*Efficiency score = (Accuracy % ÷ Tokens) × 1,000. Higher is better.*
|
|
446
446
|
|
|
447
|
-
> slimjson achieves **94.
|
|
447
|
+
> slimjson achieves **94.5%** accuracy (vs JSON's 92.3%) while using **53.0% fewer tokens**.
|
|
448
448
|
|
|
449
449
|
#### Per-Model Accuracy
|
|
450
450
|
|
|
451
451
|
```
|
|
452
452
|
deepseek-v4-flash
|
|
453
|
-
JSON ███████████████████░ 95.7% (200/209)
|
|
454
453
|
XML ███████████████████░ 95.7% (200/209)
|
|
454
|
+
JSON ███████████████████░ 95.7% (200/209)
|
|
455
455
|
JSON compact ███████████████████░ 95.2% (199/209)
|
|
456
|
-
→ slimjson ███████████████████░ 94.7% (198/209)
|
|
457
456
|
YAML ███████████████████░ 94.3% (197/209)
|
|
457
|
+
→ slimjson ███████████████████░ 93.3% (195/209)
|
|
458
458
|
TOON ███████████████████░ 92.8% (194/209)
|
|
459
459
|
CSV ██████████████████░░ 91.7% (100/109)
|
|
460
|
+
|
|
461
|
+
mimo-v2.5-pro
|
|
462
|
+
→ slimjson ███████████████████░ 95.7% (200/209)
|
|
463
|
+
JSON compact ███████████████████░ 95.2% (199/209)
|
|
464
|
+
TOON ██████████████████░░ 91.9% (192/209)
|
|
465
|
+
XML ██████████████████░░ 90.9% (190/209)
|
|
466
|
+
YAML ██████████████████░░ 90.4% (189/209)
|
|
467
|
+
JSON ██████████████████░░ 89.0% (186/209)
|
|
468
|
+
CSV ██████████████████░░ 88.1% (96/109)
|
|
460
469
|
```
|
|
461
470
|
|
|
462
471
|
#### Accuracy by Question Type
|
|
463
472
|
|
|
464
|
-
| Question Type | JSON | XML | JSON
|
|
465
|
-
|
|
466
|
-
| Field Retrieval |
|
|
467
|
-
| Aggregation |
|
|
468
|
-
| Filtering | 97.9% |
|
|
469
|
-
| Structure Awareness | 88.0% |
|
|
470
|
-
| Structural Validation |
|
|
473
|
+
| Question Type | JSON compact | slimjson | XML | JSON | TOON | YAML | CSV |
|
|
474
|
+
|---------------|-------------|----------|-----|------|------|------|-----|
|
|
475
|
+
| Field Retrieval | 99.3% | 98.5% | 98.5% | 99.3% | 95.6% | 98.5% | 98.4% |
|
|
476
|
+
| Aggregation | 94.4% | 96.0% | 88.9% | 89.7% | 92.9% | 90.5% | 84.5% |
|
|
477
|
+
| Filtering | 97.9% | 96.9% | 94.8% | 91.7% | 93.8% | 92.7% | 88.9% |
|
|
478
|
+
| Structure Awareness | 88.0% | 88.0% | 90.0% | 90.0% | 90.0% | 88.0% | 87.5% |
|
|
479
|
+
| Structural Validation | 60.0% | 30.0% | 80.0% | 50.0% | 40.0% | 50.0% | 80.0% |
|
|
471
480
|
|
|
472
481
|
#### Datasets Tested
|
|
473
482
|
|
|
474
|
-
| Dataset | Rows | Structure | CSV Support |
|
|
475
|
-
|
|
476
|
-
| Uniform employee records | 100 | uniform | ✓ |
|
|
477
|
-
| E-commerce orders (nested) | 50 | nested | ✗ |
|
|
478
|
-
| Time-series analytics data | 60 | uniform | ✓ |
|
|
479
|
-
| Top 100 GitHub repositories | 100 | uniform | ✓ |
|
|
480
|
-
| Semi-uniform event logs | 75 | semi-uniform | ✗ |
|
|
481
|
-
| Deeply nested configuration | 11 | deep | ✗ |
|
|
483
|
+
| Dataset | Rows | Structure | CSV Support | Tabular % |
|
|
484
|
+
|---------|------|-----------|-------------|-----------|
|
|
485
|
+
| Uniform employee records | 100 | uniform | ✓ | 100% |
|
|
486
|
+
| E-commerce orders (nested) | 50 | nested | ✗ | 33% |
|
|
487
|
+
| Time-series analytics data | 60 | uniform | ✓ | 100% |
|
|
488
|
+
| Top 100 GitHub repositories | 100 | uniform | ✓ | 100% |
|
|
489
|
+
| Semi-uniform event logs | 75 | semi-uniform | ✗ | 50% |
|
|
490
|
+
| Deeply nested configuration | 11 | deep | ✗ | 0% |
|
|
482
491
|
|
|
483
492
|
## Development
|
|
484
493
|
|
|
485
494
|
```bash
|
|
486
|
-
# Run tests (
|
|
495
|
+
# Run tests (209 cases, 100% coverage)
|
|
487
496
|
npm test
|
|
488
497
|
|
|
489
498
|
# Run compression ratio benchmarks (with trim comparison)
|
package/compress.js
CHANGED
|
@@ -13,7 +13,6 @@
|
|
|
13
13
|
*/
|
|
14
14
|
function mergeSchemas(s1, s2) {
|
|
15
15
|
if (!Array.isArray(s1) || !Array.isArray(s2)) return s1;
|
|
16
|
-
|
|
17
16
|
const first1 = s1[0];
|
|
18
17
|
const first2 = s2[0];
|
|
19
18
|
|
|
@@ -40,12 +39,7 @@ function mergeSchemas(s1, s2) {
|
|
|
40
39
|
}
|
|
41
40
|
|
|
42
41
|
// 两者都是数组(不是对象 schema)→ 递归合并第一个元素
|
|
43
|
-
|
|
44
|
-
return [mergeSchemas(first1, first2)];
|
|
45
|
-
}
|
|
46
|
-
|
|
47
|
-
// 其他情况(原始值数组或类型不匹配)→ 取第一个
|
|
48
|
-
return s1;
|
|
42
|
+
return [mergeSchemas(first1, first2)];
|
|
49
43
|
}
|
|
50
44
|
|
|
51
45
|
/**
|
|
@@ -76,12 +70,10 @@ function inferSchema(value) {
|
|
|
76
70
|
return [inferObjectSchema(objects)];
|
|
77
71
|
}
|
|
78
72
|
// 原始值数组 - 不压缩,由父级处理
|
|
79
|
-
return
|
|
80
|
-
}
|
|
81
|
-
if (typeof value === 'object' && value !== null) {
|
|
82
|
-
return inferObjectSchema([value]);
|
|
73
|
+
return;
|
|
83
74
|
}
|
|
84
|
-
|
|
75
|
+
// value 是单个对象
|
|
76
|
+
return inferObjectSchema([value]);
|
|
85
77
|
}
|
|
86
78
|
|
|
87
79
|
/**
|
|
@@ -106,7 +98,7 @@ function inferObjectSchema(objects) {
|
|
|
106
98
|
}
|
|
107
99
|
|
|
108
100
|
return keyOrder.map(key => {
|
|
109
|
-
const values = keyValues.get(key)
|
|
101
|
+
const values = keyValues.get(key);
|
|
110
102
|
if (values.length === 0) return key;
|
|
111
103
|
|
|
112
104
|
const sample = values[0];
|
|
@@ -147,15 +139,12 @@ function inferObjectSchema(objects) {
|
|
|
147
139
|
for (const v of values) {
|
|
148
140
|
if (Array.isArray(v)) {
|
|
149
141
|
const s = inferSchema(v);
|
|
150
|
-
|
|
151
|
-
|
|
152
|
-
const inner = Array.isArray(s) && s.length === 1 ? s[0] : s;
|
|
153
|
-
merged = merged ? mergeSchemas(merged, inner) : inner;
|
|
154
|
-
}
|
|
142
|
+
const inner = s[0];
|
|
143
|
+
merged = merged ? mergeSchemas(merged, inner) : inner;
|
|
155
144
|
}
|
|
156
145
|
}
|
|
157
146
|
// 再包一层 [] 表示"数组的数组"
|
|
158
|
-
return { [key]: [merged
|
|
147
|
+
return { [key]: [merged] };
|
|
159
148
|
}
|
|
160
149
|
|
|
161
150
|
// 原始值数组(如 ["张三","李四"])→ 不压缩,直接用 key 名
|
|
@@ -180,31 +169,21 @@ function compressWithSchema(value, schema) {
|
|
|
180
169
|
return value.map(item => compressWithSchema(item, inner));
|
|
181
170
|
}
|
|
182
171
|
|
|
183
|
-
// schema 包含 undefined → 原始值数组,不压缩
|
|
184
|
-
if (Array.isArray(schema) && schema.some(s => s === undefined || s === null)) {
|
|
185
|
-
return value;
|
|
186
|
-
}
|
|
187
|
-
|
|
188
172
|
// schema 是数组(对象 schema)→ 值是对象
|
|
189
|
-
if (
|
|
190
|
-
|
|
191
|
-
|
|
192
|
-
|
|
193
|
-
|
|
194
|
-
|
|
195
|
-
|
|
196
|
-
|
|
197
|
-
|
|
198
|
-
|
|
199
|
-
|
|
200
|
-
|
|
201
|
-
|
|
202
|
-
|
|
203
|
-
return compressWithSchema(val, valueSchema);
|
|
204
|
-
});
|
|
205
|
-
}
|
|
206
|
-
|
|
207
|
-
return value;
|
|
173
|
+
if (!value || typeof value !== 'object') return value;
|
|
174
|
+
return schema.map(fieldDef => {
|
|
175
|
+
let key, valueSchema;
|
|
176
|
+
if (typeof fieldDef === 'string') {
|
|
177
|
+
key = fieldDef;
|
|
178
|
+
valueSchema = undefined;
|
|
179
|
+
} else {
|
|
180
|
+
key = Object.keys(fieldDef)[0];
|
|
181
|
+
valueSchema = fieldDef[key];
|
|
182
|
+
}
|
|
183
|
+
const val = value[key];
|
|
184
|
+
if (val == null) return null;
|
|
185
|
+
return compressWithSchema(val, valueSchema);
|
|
186
|
+
});
|
|
208
187
|
}
|
|
209
188
|
|
|
210
189
|
/**
|
|
@@ -257,41 +236,32 @@ function decompressWithSchema(data, schema) {
|
|
|
257
236
|
|
|
258
237
|
// schema 是 [innerSchema] → 还原为数组
|
|
259
238
|
if (Array.isArray(schema) && schema.length === 1 && Array.isArray(schema[0])) {
|
|
260
|
-
if (!Array.isArray(data)) return data;
|
|
261
239
|
const inner = schema[0];
|
|
262
240
|
return data.map(item => decompressWithSchema(item, inner));
|
|
263
241
|
}
|
|
264
242
|
|
|
265
|
-
//
|
|
266
|
-
if (
|
|
267
|
-
|
|
268
|
-
|
|
269
|
-
|
|
270
|
-
|
|
271
|
-
|
|
272
|
-
|
|
273
|
-
|
|
274
|
-
|
|
275
|
-
|
|
276
|
-
|
|
277
|
-
|
|
278
|
-
|
|
279
|
-
|
|
280
|
-
|
|
281
|
-
} else if (typeof fieldDef === 'object' && fieldDef !== null) {
|
|
282
|
-
key = Object.keys(fieldDef)[0];
|
|
283
|
-
valueSchema = fieldDef[key];
|
|
284
|
-
} else {
|
|
285
|
-
continue;
|
|
286
|
-
}
|
|
287
|
-
const val = data[i];
|
|
288
|
-
if (val === undefined) { obj[key] = null; continue; }
|
|
289
|
-
obj[key] = decompressWithSchema(val, valueSchema);
|
|
243
|
+
// 原始值(混合数组中的原始元素)→ 直接返回
|
|
244
|
+
if (typeof data !== 'object') return data;
|
|
245
|
+
|
|
246
|
+
// 对象 schema → 还原为对象
|
|
247
|
+
const obj = {};
|
|
248
|
+
for (let i = 0; i < schema.length; i++) {
|
|
249
|
+
const fieldDef = schema[i];
|
|
250
|
+
let key, valueSchema;
|
|
251
|
+
if (typeof fieldDef === 'string') {
|
|
252
|
+
key = fieldDef;
|
|
253
|
+
valueSchema = undefined;
|
|
254
|
+
} else if (typeof fieldDef === 'object' && fieldDef !== null) {
|
|
255
|
+
key = Object.keys(fieldDef)[0];
|
|
256
|
+
valueSchema = fieldDef[key];
|
|
257
|
+
} else {
|
|
258
|
+
continue;
|
|
290
259
|
}
|
|
291
|
-
|
|
260
|
+
const val = data[i];
|
|
261
|
+
if (val === undefined) { obj[key] = null; continue; }
|
|
262
|
+
obj[key] = decompressWithSchema(val, valueSchema);
|
|
292
263
|
}
|
|
293
|
-
|
|
294
|
-
return data;
|
|
264
|
+
return obj;
|
|
295
265
|
}
|
|
296
266
|
|
|
297
267
|
/**
|
|
@@ -494,8 +464,7 @@ function parse(text) {
|
|
|
494
464
|
skipWs();
|
|
495
465
|
if (text[pos] !== ':') error('Expected :');
|
|
496
466
|
pos++;
|
|
497
|
-
|
|
498
|
-
obj[key] = val;
|
|
467
|
+
obj[key] = parseValue();
|
|
499
468
|
skipWs();
|
|
500
469
|
if (text[pos] === '}') { pos++; return obj; }
|
|
501
470
|
if (text[pos] === ',') { pos++; continue; }
|
|
@@ -542,3 +511,4 @@ function parse(text) {
|
|
|
542
511
|
}
|
|
543
512
|
|
|
544
513
|
module.exports = { compress, decompress, stringify, parse };
|
|
514
|
+
module.exports.default = module.exports;
|