llmpress 0.1.0-alpha.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +282 -0
- package/dist/index.d.ts +2 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +955 -0
- package/dist/src/detect.d.ts +15 -0
- package/dist/src/detect.d.ts.map +1 -0
- package/dist/src/flatten.d.ts +2 -0
- package/dist/src/flatten.d.ts.map +1 -0
- package/dist/src/formatters/csv.d.ts +5 -0
- package/dist/src/formatters/csv.d.ts.map +1 -0
- package/dist/src/formatters/hybrid.d.ts +5 -0
- package/dist/src/formatters/hybrid.d.ts.map +1 -0
- package/dist/src/formatters/markdown-table.d.ts +4 -0
- package/dist/src/formatters/markdown-table.d.ts.map +1 -0
- package/dist/src/formatters/toon.d.ts +5 -0
- package/dist/src/formatters/toon.d.ts.map +1 -0
- package/dist/src/formatters/tsv.d.ts +5 -0
- package/dist/src/formatters/tsv.d.ts.map +1 -0
- package/dist/src/formatters/yaml.d.ts +6 -0
- package/dist/src/formatters/yaml.d.ts.map +1 -0
- package/dist/src/index.d.ts +45 -0
- package/dist/src/index.d.ts.map +1 -0
- package/dist/src/tokenizer.d.ts +3 -0
- package/dist/src/tokenizer.d.ts.map +1 -0
- package/package.json +64 -0
package/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Devansh Gandhi
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
package/README.md
ADDED
|
@@ -0,0 +1,282 @@
|
|
|
1
|
+
# llmpress
|
|
2
|
+
|
|
3
|
+
Auto-detects your data's shape and converts it to the most token-efficient format for LLM prompts. Feed it anything — arrays, objects, nested structures — and get back a compact string representation along with token savings stats.
|
|
4
|
+
|
|
5
|
+
## Install
|
|
6
|
+
|
|
7
|
+
```bash
|
|
8
|
+
bun add llmpress
|
|
9
|
+
# or: npm install llmpress
|
|
10
|
+
```
|
|
11
|
+
|
|
12
|
+
## Quick Start
|
|
13
|
+
|
|
14
|
+
```ts
|
|
15
|
+
import { format } from "llmpress";
|
|
16
|
+
|
|
17
|
+
const users = [
|
|
18
|
+
{ id: 1, name: "Alice", role: "admin", active: true },
|
|
19
|
+
{ id: 2, name: "Bob", role: "editor", active: true },
|
|
20
|
+
{ id: 3, name: "Carol", role: "viewer", active: false },
|
|
21
|
+
];
|
|
22
|
+
|
|
23
|
+
const result = await format(users);
|
|
24
|
+
|
|
25
|
+
console.log(result.output);
|
|
26
|
+
// [3]{id,name,role,active}:
|
|
27
|
+
// 1,Alice,admin,true
|
|
28
|
+
// 2,Bob,editor,true
|
|
29
|
+
// 3,Carol,viewer,false
|
|
30
|
+
|
|
31
|
+
console.log(result.format); // "toon"
|
|
32
|
+
console.log(result.tokensSaved); // 52 (% reduction vs raw JSON)
|
|
33
|
+
```
|
|
34
|
+
|
|
35
|
+
## API
|
|
36
|
+
|
|
37
|
+
### `format(data, options?)`
|
|
38
|
+
|
|
39
|
+
Auto-detects shape, picks the best format, returns the result.
|
|
40
|
+
|
|
41
|
+
```ts
|
|
42
|
+
const result = await format(data, {
|
|
43
|
+
as?: "toon" | "yaml" | "csv" | "tsv" | "markdown-table" | "json-compact" | "hybrid",
|
|
44
|
+
tokenizer?: "estimate" | "tiktoken", // default: "estimate"
|
|
45
|
+
maxDepth?: number, // default: 2
|
|
46
|
+
flatten?: boolean, // default: false
|
|
47
|
+
report?: boolean, // default: false
|
|
48
|
+
delimiter?: "," | "\t" | "|",
|
|
49
|
+
});
|
|
50
|
+
```
|
|
51
|
+
|
|
52
|
+
**`as`** — override auto-detection and force a specific format.
|
|
53
|
+
|
|
54
|
+
**`flatten`** — flatten nested keys with dot notation before formatting. `{ a: { b: 1 } }` becomes `{ "a.b": 1 }`. Useful for nested objects you want in TOON/CSV.
|
|
55
|
+
|
|
56
|
+
**`report`** — attach a human-readable explanation of why this format was chosen.
|
|
57
|
+
|
|
58
|
+
**`tokenizer`** — `"estimate"` uses `Math.ceil(length / 4)` (fast, ~94% accurate). `"tiktoken"` uses `cl100k_base` encoding via `@dqbd/tiktoken` (optional peer dep).
|
|
59
|
+
|
|
60
|
+
**Return value:**
|
|
61
|
+
|
|
62
|
+
```ts
|
|
63
|
+
{
|
|
64
|
+
output: string;
|
|
65
|
+
format: OutputFormat;
|
|
66
|
+
inputTokens: number;
|
|
67
|
+
outputTokens: number;
|
|
68
|
+
tokensSaved: number; // 0–100, clamped to 0 if output is larger
|
|
69
|
+
report?: {
|
|
70
|
+
inputShape: "tabular" | "nested" | "flat-object" | "primitive-array" | "scalar" | "mixed";
|
|
71
|
+
depth: number;
|
|
72
|
+
uniformityScore: number;
|
|
73
|
+
reason: string;
|
|
74
|
+
};
|
|
75
|
+
}
|
|
76
|
+
```
|
|
77
|
+
|
|
78
|
+
---
|
|
79
|
+
|
|
80
|
+
### `compare(data)`
|
|
81
|
+
|
|
82
|
+
Runs all formatters and returns token counts for each, sorted cheapest first. Use this to audit which format works best for your data.
|
|
83
|
+
|
|
84
|
+
```ts
|
|
85
|
+
const results = await compare(users);
|
|
86
|
+
// [
|
|
87
|
+
// { format: "toon", tokens: 38, savings: 52 },
|
|
88
|
+
// { format: "csv", tokens: 46, savings: 42 },
|
|
89
|
+
// { format: "tsv", tokens: 46, savings: 42 },
|
|
90
|
+
// { format: "yaml", tokens: 54, savings: 33 },
|
|
91
|
+
// { format: "markdown-table", tokens: 74, savings: 7 },
|
|
92
|
+
// { format: "json-compact", tokens: 75, savings: 6 },
|
|
93
|
+
// { format: "json", tokens: 80, savings: 0 },
|
|
94
|
+
// ]
|
|
95
|
+
```
|
|
96
|
+
|
|
97
|
+
---
|
|
98
|
+
|
|
99
|
+
### `detect(data)`
|
|
100
|
+
|
|
101
|
+
Inspect data shape without formatting.
|
|
102
|
+
|
|
103
|
+
```ts
|
|
104
|
+
import { detect } from "llmpress";
|
|
105
|
+
|
|
106
|
+
const info = detect(users);
|
|
107
|
+
// {
|
|
108
|
+
// shape: "tabular",
|
|
109
|
+
// depth: 1,
|
|
110
|
+
// uniformityScore: 1.0,
|
|
111
|
+
// recommendedFormat: "toon",
|
|
112
|
+
// reason: "Uniform tabular array (score=1.00) — TOON is most token-efficient"
|
|
113
|
+
// }
|
|
114
|
+
```
|
|
115
|
+
|
|
116
|
+
---
|
|
117
|
+
|
|
118
|
+
### Individual formatters
|
|
119
|
+
|
|
120
|
+
All formatters are exported for direct use:
|
|
121
|
+
|
|
122
|
+
```ts
|
|
123
|
+
import { toToon, toYaml, toCsv, toTsv, toMarkdownTable } from "llmpress";
|
|
124
|
+
|
|
125
|
+
toToon(data, options?)
|
|
126
|
+
toYaml(data, options?)
|
|
127
|
+
toCsv(data, options?)
|
|
128
|
+
toTsv(data, options?)
|
|
129
|
+
toMarkdownTable(data, options?)
|
|
130
|
+
```
|
|
131
|
+
|
|
132
|
+
---
|
|
133
|
+
|
|
134
|
+
## Output Formats
|
|
135
|
+
|
|
136
|
+
### TOON (Token-Oriented Object Notation)
|
|
137
|
+
|
|
138
|
+
The most compact format for structured data. Handles both arrays and objects natively.
|
|
139
|
+
|
|
140
|
+
**Uniform array** — header encodes row count and field names once; rows are bare values:
|
|
141
|
+
```
|
|
142
|
+
[3]{id,name,role,active}:
|
|
143
|
+
1,Alice,admin,true
|
|
144
|
+
2,Bob,editor,true
|
|
145
|
+
3,Carol,viewer,false
|
|
146
|
+
```
|
|
147
|
+
|
|
148
|
+
**Object with nested table** — scalars as `key: value`, arrays as inline headers:
|
|
149
|
+
```
|
|
150
|
+
title: Q1 Metrics
|
|
151
|
+
metrics[3]{date,views,clicks}:
|
|
152
|
+
2025-01-01,6138,174
|
|
153
|
+
2025-01-02,4616,274
|
|
154
|
+
2025-01-03,4460,143
|
|
155
|
+
```
|
|
156
|
+
|
|
157
|
+
Best for: uniform arrays, objects containing tabular data, composite profiles.
|
|
158
|
+
|
|
159
|
+
### YAML
|
|
160
|
+
|
|
161
|
+
Readable key-value format. Hand-rolled serializer — no dependencies. Strings are quoted only when necessary.
|
|
162
|
+
|
|
163
|
+
```yaml
|
|
164
|
+
- id: 1
|
|
165
|
+
name: Alice
|
|
166
|
+
role: admin
|
|
167
|
+
active: true
|
|
168
|
+
```
|
|
169
|
+
|
|
170
|
+
Best for: flat objects, config data, sparse/non-uniform arrays.
|
|
171
|
+
|
|
172
|
+
### CSV / TSV
|
|
173
|
+
|
|
174
|
+
Standard comma- or tab-separated values with a header row. Values containing delimiters or quotes are properly escaped.
|
|
175
|
+
|
|
176
|
+
```
|
|
177
|
+
id,name,role,active
|
|
178
|
+
1,Alice,admin,true
|
|
179
|
+
```
|
|
180
|
+
|
|
181
|
+
Best for: data destined for further parsing or spreadsheet-style display.
|
|
182
|
+
|
|
183
|
+
### Markdown Table
|
|
184
|
+
|
|
185
|
+
Padded table with alignment. Values containing `|` are escaped.
|
|
186
|
+
|
|
187
|
+
```
|
|
188
|
+
| id | name  | role   | active |
|
|
189
|
+
|----|-------|--------|--------|
|
|
190
|
+
| 1  | Alice | admin  | true   |
|
|
191
|
+
```
|
|
192
|
+
|
|
193
|
+
Best for: data being rendered directly in markdown (e.g. in a chat UI).
|
|
194
|
+
|
|
195
|
+
---
|
|
196
|
+
|
|
197
|
+
## Auto-Detection
|
|
198
|
+
|
|
199
|
+
`detect()` classifies data and picks the best format:
|
|
200
|
+
|
|
201
|
+
| Input shape | Condition | Format |
|
|
202
|
+
|---|---|---|
|
|
203
|
+
| Primitive / null | — | `json-compact` |
|
|
204
|
+
| Object | shallow flat scalars only | `yaml` |
|
|
205
|
+
| Object | has uniform array field(s) | `toon` |
|
|
206
|
+
| Object | composite: deeply nested fields (depth > 3) | `hybrid` |
|
|
207
|
+
| Object | composite: few complex sections (≤ 3 non-scalar fields) | `hybrid` |
|
|
208
|
+
| Object | composite: many complex sections | `toon` |
|
|
209
|
+
| Empty array | — | `toon` |
|
|
210
|
+
| Array of primitives | — | `csv` |
|
|
211
|
+
| Array of objects + primitives | — | `toon` |
|
|
212
|
+
| Array of objects | uniformity ≥ 0.85, no array columns | `toon` |
|
|
213
|
+
| Array of objects | uniformity ≥ 0.85, has array columns | `hybrid` |
|
|
214
|
+
| Array of objects | uniformity ≥ 0.50 and < 0.85 | `yaml` |
|
|
215
|
+
| Array of objects | uniformity < 0.50 | `json-compact` |
|
|
216
|
+
|
|
217
|
+
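**Uniformity score** = (number of keys present in every item) / (total number of unique keys across all items). It measures how consistent the schema is across rows; a score of 1.0 means every item has exactly the same set of keys.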
|
|
218
|
+
|
|
219
|
+
---
|
|
220
|
+
|
|
221
|
+
## Nested Data
|
|
222
|
+
|
|
223
|
+
Use `flatten: true` to dot-flatten nested objects before formatting:
|
|
224
|
+
|
|
225
|
+
```ts
|
|
226
|
+
const products = [
|
|
227
|
+
{ id: 1, name: "Widget", specs: { weight: "100g", dims: "10x5" } },
|
|
228
|
+
];
|
|
229
|
+
|
|
230
|
+
const result = await format(products, { flatten: true });
|
|
231
|
+
// Detects as tabular after flattening:
|
|
232
|
+
// [1]{id,name,specs.weight,specs.dims}:
|
|
233
|
+
// 1,Widget,100g,10x5
|
|
234
|
+
```
|
|
235
|
+
|
|
236
|
+
---
|
|
237
|
+
|
|
238
|
+
## Benchmarks
|
|
239
|
+
|
|
240
|
+
Measured on Bun 1.3.10, Apple Silicon. Selected highlights:
|
|
241
|
+
|
|
242
|
+
| Fixture | Format | Savings |
|
|
243
|
+
|---|---|---|
|
|
244
|
+
| User list (uniform array) | TOON | **53%** |
|
|
245
|
+
| Time-series analytics (object + table) | TOON | **51%** |
|
|
246
|
+
| Log entries (500 rows) | TOON | **48%** |
|
|
247
|
+
| GitHub repos (25 repos) | TOON | **38%** |
|
|
248
|
+
| Product catalog (flattened) | TOON | **39%** |
|
|
249
|
+
| LinkedIn profile (composite object) | TOON | **31%** |
|
|
250
|
+
| LinkedIn posts (array + nested cols) | hybrid | **13%** |
|
|
251
|
+
| Config object (flat key-value) | YAML | 10% |
|
|
252
|
+
| Search API response (exa) | hybrid | 4% |
|
|
253
|
+
| Deeply nested composite | hybrid | 4% |
|
|
254
|
+
|
|
255
|
+
`format()` runs in ~10ms on a 1000-row array. See [docs/BENCHMARKS.md](./docs/BENCHMARKS.md) for full results across all fixtures.
|
|
256
|
+
|
|
257
|
+
---
|
|
258
|
+
|
|
259
|
+
## Development
|
|
260
|
+
|
|
261
|
+
```bash
|
|
262
|
+
bun test # run all tests
|
|
263
|
+
bun test tests/unit # unit tests only
|
|
264
|
+
bun test tests/integration # integration tests only
|
|
265
|
+
bun test --update-snapshots # regenerate snapshots
|
|
266
|
+
bun run bench # performance + savings benchmarks
|
|
267
|
+
bun tsc --noEmit # type check
|
|
268
|
+
```
|
|
269
|
+
|
|
270
|
+
## Optional: tiktoken
|
|
271
|
+
|
|
272
|
+
For precise token counts instead of the character-length heuristic:
|
|
273
|
+
|
|
274
|
+
```bash
|
|
275
|
+
bun add @dqbd/tiktoken
|
|
276
|
+
```
|
|
277
|
+
|
|
278
|
+
```ts
|
|
279
|
+
const result = await format(data, { tokenizer: "tiktoken" });
|
|
280
|
+
```
|
|
281
|
+
|
|
282
|
+
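Uses the `cl100k_base` encoding (the GPT-4 / GPT-3.5 Turbo tokenizer). Claude uses a different tokenizer, so counts for Claude models are approximate.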
|
package/dist/index.d.ts
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../index.ts"],"names":[],"mappings":"AAAA,cAAc,gBAAgB,CAAC"}
|