@shrkcrft/compress 0.1.0-alpha.16
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +21 -0
- package/dist/cache/align-volatile-tokens.d.ts +13 -0
- package/dist/cache/align-volatile-tokens.d.ts.map +1 -0
- package/dist/cache/align-volatile-tokens.js +51 -0
- package/dist/cache/alignment-map.d.ts +23 -0
- package/dist/cache/alignment-map.d.ts.map +1 -0
- package/dist/cache/alignment-map.js +1 -0
- package/dist/cache/alignment-result.d.ts +11 -0
- package/dist/cache/alignment-result.d.ts.map +1 -0
- package/dist/cache/alignment-result.js +1 -0
- package/dist/cache/detect-volatile-tokens.d.ts +10 -0
- package/dist/cache/detect-volatile-tokens.d.ts.map +1 -0
- package/dist/cache/detect-volatile-tokens.js +41 -0
- package/dist/cache/placeholder.d.ts +28 -0
- package/dist/cache/placeholder.d.ts.map +1 -0
- package/dist/cache/placeholder.js +0 -0
- package/dist/cache/restore-volatile-tokens.d.ts +10 -0
- package/dist/cache/restore-volatile-tokens.d.ts.map +1 -0
- package/dist/cache/restore-volatile-tokens.js +21 -0
- package/dist/cache/volatile-classify.d.ts +11 -0
- package/dist/cache/volatile-classify.d.ts.map +1 -0
- package/dist/cache/volatile-classify.js +35 -0
- package/dist/cache/volatile-kind.d.ts +13 -0
- package/dist/cache/volatile-kind.d.ts.map +1 -0
- package/dist/cache/volatile-kind.js +13 -0
- package/dist/cache/volatile-token.d.ts +14 -0
- package/dist/cache/volatile-token.d.ts.map +1 -0
- package/dist/cache/volatile-token.js +1 -0
- package/dist/ccr/ccr-entry.d.ts +13 -0
- package/dist/ccr/ccr-entry.d.ts.map +1 -0
- package/dist/ccr/ccr-entry.js +1 -0
- package/dist/ccr/ccr-key.d.ts +9 -0
- package/dist/ccr/ccr-key.d.ts.map +1 -0
- package/dist/ccr/ccr-key.js +19 -0
- package/dist/ccr/ccr-marker.d.ts +23 -0
- package/dist/ccr/ccr-marker.d.ts.map +1 -0
- package/dist/ccr/ccr-marker.js +30 -0
- package/dist/ccr/ccr-store.d.ts +18 -0
- package/dist/ccr/ccr-store.d.ts.map +1 -0
- package/dist/ccr/ccr-store.js +1 -0
- package/dist/ccr/file-ccr-store.d.ts +19 -0
- package/dist/ccr/file-ccr-store.d.ts.map +1 -0
- package/dist/ccr/file-ccr-store.js +53 -0
- package/dist/ccr/in-memory-ccr-store.d.ts +21 -0
- package/dist/ccr/in-memory-ccr-store.d.ts.map +1 -0
- package/dist/ccr/in-memory-ccr-store.js +45 -0
- package/dist/ccr/ttl-file-ccr-store.d.ts +43 -0
- package/dist/ccr/ttl-file-ccr-store.d.ts.map +1 -0
- package/dist/ccr/ttl-file-ccr-store.js +117 -0
- package/dist/code/compress-code.d.ts +4 -0
- package/dist/code/compress-code.d.ts.map +1 -0
- package/dist/code/compress-code.js +294 -0
- package/dist/compress-content.d.ts +11 -0
- package/dist/compress-content.d.ts.map +1 -0
- package/dist/compress-content.js +79 -0
- package/dist/content/content-type.d.ts +28 -0
- package/dist/content/content-type.d.ts.map +1 -0
- package/dist/content/content-type.js +28 -0
- package/dist/content/detect-content-type.d.ts +9 -0
- package/dist/content/detect-content-type.d.ts.map +1 -0
- package/dist/content/detect-content-type.js +184 -0
- package/dist/content/segment.d.ts +21 -0
- package/dist/content/segment.d.ts.map +1 -0
- package/dist/content/segment.js +117 -0
- package/dist/index.d.ts +61 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +49 -0
- package/dist/json/compress-json.d.ts +18 -0
- package/dist/json/compress-json.d.ts.map +1 -0
- package/dist/json/compress-json.js +139 -0
- package/dist/json/render-compact-json.d.ts +10 -0
- package/dist/json/render-compact-json.d.ts.map +1 -0
- package/dist/json/render-compact-json.js +18 -0
- package/dist/relevance/bm25.d.ts +26 -0
- package/dist/relevance/bm25.d.ts.map +1 -0
- package/dist/relevance/bm25.js +115 -0
- package/dist/result/compress-options.d.ts +26 -0
- package/dist/result/compress-options.d.ts.map +1 -0
- package/dist/result/compress-options.js +1 -0
- package/dist/result/compression-result.d.ts +26 -0
- package/dist/result/compression-result.d.ts.map +1 -0
- package/dist/result/compression-result.js +1 -0
- package/dist/result/compression-strategy.d.ts +30 -0
- package/dist/result/compression-strategy.d.ts.map +1 -0
- package/dist/result/compression-strategy.js +30 -0
- package/dist/table/adaptive-size.d.ts +46 -0
- package/dist/table/adaptive-size.d.ts.map +1 -0
- package/dist/table/adaptive-size.js +170 -0
- package/dist/table/apply-value-dictionaries.d.ts +30 -0
- package/dist/table/apply-value-dictionaries.d.ts.map +1 -0
- package/dist/table/apply-value-dictionaries.js +99 -0
- package/dist/table/column-presence.d.ts +20 -0
- package/dist/table/column-presence.d.ts.map +1 -0
- package/dist/table/column-presence.js +52 -0
- package/dist/table/columnar-json.d.ts +24 -0
- package/dist/table/columnar-json.d.ts.map +1 -0
- package/dist/table/columnar-json.js +83 -0
- package/dist/table/columnar-table.d.ts +24 -0
- package/dist/table/columnar-table.d.ts.map +1 -0
- package/dist/table/columnar-table.js +1 -0
- package/dist/table/compact-object-array.d.ts +12 -0
- package/dist/table/compact-object-array.d.ts.map +1 -0
- package/dist/table/compact-object-array.js +88 -0
- package/dist/table/field-spec.d.ts +13 -0
- package/dist/table/field-spec.d.ts.map +1 -0
- package/dist/table/field-spec.js +1 -0
- package/dist/table/object-map.d.ts +28 -0
- package/dist/table/object-map.d.ts.map +1 -0
- package/dist/table/object-map.js +119 -0
- package/dist/table/render-table.d.ts +11 -0
- package/dist/table/render-table.d.ts.map +1 -0
- package/dist/table/render-table.js +39 -0
- package/dist/table/sample-object-array.d.ts +11 -0
- package/dist/table/sample-object-array.d.ts.map +1 -0
- package/dist/table/sample-object-array.js +171 -0
- package/dist/table/sample-options.d.ts +29 -0
- package/dist/table/sample-options.d.ts.map +1 -0
- package/dist/table/sample-options.js +1 -0
- package/dist/table/sampled-table.d.ts +33 -0
- package/dist/table/sampled-table.d.ts.map +1 -0
- package/dist/table/sampled-table.js +8 -0
- package/dist/table/table-compaction.d.ts +19 -0
- package/dist/table/table-compaction.d.ts.map +1 -0
- package/dist/table/table-compaction.js +1 -0
- package/dist/table/table-formats.d.ts +23 -0
- package/dist/table/table-formats.d.ts.map +1 -0
- package/dist/table/table-formats.js +233 -0
- package/dist/text/compress-diff.d.ts +20 -0
- package/dist/text/compress-diff.d.ts.map +1 -0
- package/dist/text/compress-diff.js +344 -0
- package/dist/text/compress-lines.d.ts +12 -0
- package/dist/text/compress-lines.d.ts.map +1 -0
- package/dist/text/compress-lines.js +44 -0
- package/dist/text/compress-log.d.ts +12 -0
- package/dist/text/compress-log.d.ts.map +1 -0
- package/dist/text/compress-log.js +202 -0
- package/dist/text/compress-markdown.d.ts +15 -0
- package/dist/text/compress-markdown.d.ts.map +1 -0
- package/dist/text/compress-markdown.js +96 -0
- package/dist/text/compress-search.d.ts +11 -0
- package/dist/text/compress-search.d.ts.map +1 -0
- package/dist/text/compress-search.js +78 -0
- package/dist/text/finalize.d.ts +21 -0
- package/dist/text/finalize.d.ts.map +1 -0
- package/dist/text/finalize.js +54 -0
- package/dist/text/line-utils.d.ts +20 -0
- package/dist/text/line-utils.d.ts.map +1 -0
- package/dist/text/line-utils.js +65 -0
- package/dist/text/lockfile-names.d.ts +3 -0
- package/dist/text/lockfile-names.d.ts.map +1 -0
- package/dist/text/lockfile-names.js +33 -0
- package/dist/text/log-template.d.ts +31 -0
- package/dist/text/log-template.d.ts.map +1 -0
- package/dist/text/log-template.js +239 -0
- package/dist/tokens/estimate-tokens.d.ts +17 -0
- package/dist/tokens/estimate-tokens.d.ts.map +1 -0
- package/dist/tokens/estimate-tokens.js +53 -0
- package/dist/tokens/token-savings.d.ts +20 -0
- package/dist/tokens/token-savings.d.ts.map +1 -0
- package/dist/tokens/token-savings.js +1 -0
- package/package.json +52 -0
|
@@ -0,0 +1,239 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Drain-style log-template mining — a LOSSLESS pre-pass for {@link compressLog}.
|
|
3
|
+
*
|
|
4
|
+
* Repeated structured log lines (`worker-3 processing batch 17 ok` × N) carry
|
|
5
|
+
* almost no new information per line: the fixed words repeat and only a few
|
|
6
|
+
* variable tokens move. This miner tokenizes each line, replaces its variable
|
|
7
|
+
* tokens (numbers, hex, UUIDs, ISO timestamps, quoted strings) with a `{}`
|
|
8
|
+
* placeholder to form a *template*, groups consecutive lines sharing a
|
|
9
|
+
* template, and collapses each run to one template plus a compact per-column
|
|
10
|
+
* encoding of the captured variables.
|
|
11
|
+
*
|
|
12
|
+
* It is **lossless by construction**: a template is exactly the original line
|
|
13
|
+
* with its variable matches replaced by `{}`, so `template ⋈ variables` rebuilds
|
|
14
|
+
* every original line, in order — no CCR needed. {@link reconstructLogTemplates}
|
|
15
|
+
* is the inverse and is exercised by the round-trip tests.
|
|
16
|
+
*
|
|
17
|
+
* Only runs with ≥1 variable column collapse; pure-identical repeats are left
|
|
18
|
+
* for the downstream signal-selector's de-duplication so its behaviour (and the
|
|
19
|
+
* `… N omitted …` markers callers rely on) is preserved.
|
|
20
|
+
*/
|
|
21
|
+
// Block sentinels. Chosen to never occur in real logs; if the input contains
// either, mining is skipped wholesale so a collision can't corrupt round-trip.
// Every marker line the miner emits (run header, column line, terminator) is
// wrapped in this pair, which is how the reconstructor recognises them.
const BLOCK_OPEN = '⟦';
const BLOCK_CLOSE = '⟧';
// Stand-in written where a variable token was captured. Lines that already
// contain this literal are never templated, so every `{}` found in a template
// is a genuine capture slot and splitting on it is unambiguous.
const PLACEHOLDER = '{}';
/** Minimum consecutive same-template lines before a run is worth collapsing. */
const MIN_RUN = 3;
// Variable token classes, specific → general. Correctness of the round-trip
// does NOT depend on this list (template+matches always rebuild the line); it
// only governs how MUCH collapses.
// Alternation order matters for *what* is captured: the first alternative that
// matches at a position wins, so the narrow classes (timestamp, UUID) must
// stay ahead of the bare-number fallback.
// The `g` flag is required by the `replace` / `match` call sites in this file.
// Both reset `lastIndex`; avoid `.test()` / `.exec()` on this shared instance,
// which would carry `lastIndex` state between calls.
const VAR_RE = new RegExp([
    '\\d{4}-\\d{2}-\\d{2}[T ]\\d{2}:\\d{2}:\\d{2}(?:\\.\\d+)?(?:Z|[+-]\\d{2}:?\\d{2})?', // ISO timestamp
    '[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{12}', // UUID
    '0x[0-9a-fA-F]+', // hex literal
    '\\b[0-9a-fA-F]{12,}\\b', // long hash
    '"(?:[^"\\\\]|\\\\.)*"', // double-quoted string
    "'(?:[^'\\\\]|\\\\.)*'", // single-quoted string
    '\\d+(?:\\.\\d+)?', // integer / decimal
].join('|'), 'g');
// High-signal lines the downstream selector keeps individually — never folded
// into a template, so an error / crash / stack frame always stays verbatim.
// Case-insensitive and word-bounded; not global, so `.test()` is stateless.
const SIGNAL_RE = /\b(?:ERROR|FATAL|FAIL(?:ED|URE)?|EXCEPTION|panic|segmentation fault|segfault|core dumped|bus error|out of memory|oom|traceback|undefined reference|undefined symbol|assertion (?:failed)|SIG(?:SEGV|ABRT|KILL|BUS|FPE|ILL))\b/i;
// Indented stack-frame lines: `    at <symbol>` or `  File "...", line N`.
const STACK_RE = /^\s+(?:at\s+\S+|File ".*", line \d+)/;
|
|
44
|
+
/**
 * Collapse consecutive same-template runs. Lossless; reversible via {@link reconstructLogTemplates}.
 *
 * Each line is reduced to a template by replacing every `VAR_RE` match with
 * `PLACEHOLDER`. Adjacent lines with an identical template form a run; a run of
 * at least `MIN_RUN` lines is handed to `encodeBlock`, and whatever that
 * returns replaces the run. Everything else is copied through unchanged.
 *
 * @param {string[]} lines Input log lines (no embedded newlines expected).
 * @returns {{ lines: string[], reduced: boolean }} The (possibly collapsed)
 *   lines, and whether at least one run was actually collapsed. `lines` is
 *   always a fresh array; the input is never mutated.
 */
export function mineLogTemplates(lines) {
    // A sentinel anywhere means we can't guarantee a clean round-trip — bail.
    for (const l of lines) {
        if (l.includes(BLOCK_OPEN) || l.includes(BLOCK_CLOSE)) {
            return { lines: [...lines], reduced: false };
        }
    }
    // `null` marks a line that must stay verbatim: it already contains the
    // placeholder literal (would make the template ambiguous) or it is a
    // high-signal / stack-frame line.
    const templateOf = (line) => {
        if (line.includes(PLACEHOLDER) || SIGNAL_RE.test(line) || STACK_RE.test(line)) {
            return null;
        }
        return line.replace(VAR_RE, PLACEHOLDER);
    };
    const templates = lines.map(templateOf);
    const out = [];
    let reduced = false;
    let i = 0;
    while (i < lines.length) {
        const tpl = templates[i];
        if (tpl === null || tpl === undefined) {
            out.push(lines[i] ?? '');
            i += 1;
            continue;
        }
        // Extend the run while the neighbouring template is identical.
        let j = i + 1;
        while (j < lines.length && templates[j] === tpl) {
            j += 1;
        }
        const block = j - i >= MIN_RUN ? encodeBlock(tpl, lines.slice(i, j)) : null;
        // Append element by element rather than `out.push(...items)`: a run of
        // identical lines (which encodeBlock declines) can be arbitrarily long,
        // and spreading it as call arguments can exceed the engine's argument
        // limit and throw a RangeError.
        if (block) {
            for (const encoded of block) {
                out.push(encoded);
            }
            reduced = true;
        }
        else {
            for (let k = i; k < j; k += 1) {
                out.push(lines[k]);
            }
        }
        i = j;
    }
    return { lines: out, reduced };
}
|
|
84
|
+
/**
 * Turn one same-template run into its block form, or return null when the run
 * has nothing that varies (identical repeats are left to the selector's dedup).
 *
 * Capture slots whose value is the same on every line are written back into
 * the template text; only slots that actually change keep a placeholder and
 * get a column line. The result is: header line, one line per varying column,
 * terminator line.
 */
function encodeBlock(template, run) {
    const pieces = template.split(PLACEHOLDER);
    const captured = run.map((line) => line.match(VAR_RE) ?? []);
    const moving = [];
    let text = pieces[0] ?? '';
    // pieces[slot + 1] is the literal text that follows capture slot `slot`.
    pieces.slice(1).forEach((tail, slot) => {
        const cells = captured.map((row) => row[slot] ?? '');
        const first = cells[0];
        if (cells.some((cell) => cell !== first)) {
            text += PLACEHOLDER;
            moving.push(cells);
        }
        else {
            // Constant across the run: fold the value back into the template.
            text += first ?? '';
        }
        text += tail ?? '';
    });
    // No moving parts → leave it for the selector's identical-line dedup.
    if (moving.length === 0) {
        return null;
    }
    return [
        `${BLOCK_OPEN}×${run.length}${BLOCK_CLOSE} ${text}`,
        ...moving.map((cells) => `${BLOCK_OPEN}c${BLOCK_CLOSE} ${encodeColumn(cells)}`),
        `${BLOCK_OPEN}/${BLOCK_CLOSE}`,
    ];
}
|
|
117
|
+
/**
 * Pick the most compact lossless spelling for one column of captured values.
 * Tried in order: `seq <start> <step>` (arithmetic progression), `cyc <pattern>`
 * (short repeating pattern), and finally `lit <every value>`. Pattern and
 * literal values are escaped and joined with `|`.
 */
function encodeColumn(values) {
    const pipeJoin = (items) => items.map(escapeValue).join('|');
    const progression = asArithmetic(values);
    if (progression) {
        const { start, step } = progression;
        return `seq ${start} ${step}`;
    }
    const pattern = asCycle(values);
    return pattern ? `cyc ${pipeJoin(pattern)}` : `lit ${pipeJoin(values)}`;
}
|
|
127
|
+
/**
 * Expand one column encoding back into its values.
 *
 * The text before the first space names the encoding; the remainder is its
 * payload. `seq` and `cyc` generate exactly `n` values; anything else is
 * treated as `lit`, whose payload already lists every value.
 */
function decodeColumn(enc, n) {
    const cut = enc.indexOf(' ');
    const [kind, rest] = cut < 0 ? [enc, ''] : [enc.slice(0, cut), enc.slice(cut + 1)];
    const generate = (valueAt) => Array.from({ length: n }, (_, k) => valueAt(k));
    switch (kind) {
        case 'seq': {
            const [start, step] = rest.split(' ').map((x) => Number(x));
            return generate((k) => String((start ?? 0) + k * (step ?? 0)));
        }
        case 'cyc': {
            const pattern = splitEscaped(rest);
            return generate((k) => pattern[k % pattern.length] ?? '');
        }
        default:
            // lit
            return splitEscaped(rest);
    }
}
|
|
143
|
+
/**
 * Detect an integer arithmetic progression that regenerates every value
 * exactly as written; returns `{ start, step }` or null.
 *
 * Only canonical non-negative integers qualify: digits only, no leading zero,
 * at most 15 digits (so the value is exactly representable as a Number). The
 * step is taken from the first two values and then verified against all of
 * them by string comparison.
 */
function asArithmetic(values) {
    const count = values.length;
    if (count < 2) {
        return null;
    }
    const digitsOnly = /^\d+$/;
    for (let idx = 0; idx < count; idx += 1) {
        const raw = values[idx];
        if (!digitsOnly.test(raw)) {
            return null;
        }
        if (raw.length > 15 || (raw.length > 1 && raw[0] === '0')) {
            return null;
        }
    }
    const start = Number(values[0]);
    const step = Number(values[1]) - start;
    const regenerates = values.every((raw, idx) => String(start + idx * step) === raw);
    return regenerates ? { start, step } : null;
}
|
|
161
|
+
/**
 * Find the shortest repeating pattern (period 2..8) that reproduces the whole
 * column; returns that pattern, or null when none fits.
 *
 * The period is also capped at half the column length, so a pattern is only
 * reported when it repeats at least twice.
 */
function asCycle(values) {
    const total = values.length;
    const longest = Math.min(8, Math.floor(total / 2));
    const repeatsEvery = (period) => {
        for (let idx = 0; idx < total; idx += 1) {
            if (values[idx] !== values[idx % period]) {
                return false;
            }
        }
        return true;
    };
    for (let period = 2; period <= longest; period += 1) {
        if (repeatsEvery(period)) {
            return values.slice(0, period);
        }
    }
    return null;
}
|
|
178
|
+
/** Backslash-escape every `\` and `|` so values can be joined with `|`. */
function escapeValue(s) {
    return s.replace(/[\\|]/g, (ch) => `\\${ch}`);
}
|
|
181
|
+
/**
 * Split on unescaped `|` and unescape each field. Inverse of {@link escapeValue}.
 *
 * A backslash takes the following character literally. A lone backslash at
 * the very end of the input has nothing to escape and is kept as-is. The
 * result always has at least one field (an empty input yields `['']`).
 */
function splitEscaped(s) {
    const fields = [];
    let field = '';
    let pos = 0;
    while (pos < s.length) {
        const ch = s[pos];
        if (ch === '\\' && pos + 1 < s.length) {
            // Escaped character: copy it verbatim and step over both.
            field += s[pos + 1];
            pos += 2;
            continue;
        }
        if (ch === '|') {
            fields.push(field);
            field = '';
        }
        else {
            field += ch;
        }
        pos += 1;
    }
    fields.push(field);
    return fields;
}
|
|
202
|
+
// Parsers for the marker lines the miner emits: run header, column line, and
// block terminator.
// The `s` (dotAll) flag is deliberate. Without it `.` refuses `\r`, U+2028 and
// U+2029, so a template or captured value carrying one of those — e.g. every
// line of a CRLF log that was split on `\n`, or a quoted string containing a
// carriage return — would fail to parse here and the block would be emitted
// raw instead of expanded. Input is split on `\n` before matching, so `.` can
// never run across a line break.
const HEADER_RE = new RegExp(`^${BLOCK_OPEN}×(\\d+)${BLOCK_CLOSE} (.*)$`, 's');
const COL_RE = new RegExp(`^${BLOCK_OPEN}c${BLOCK_CLOSE} (.*)$`, 's');
const CLOSE = `${BLOCK_OPEN}/${BLOCK_CLOSE}`;
/**
 * Inverse of {@link mineLogTemplates}: expand every block back to its original lines.
 *
 * Lines that are not a block header are copied through unchanged. For each
 * block, the column lines up to the terminator are decoded into N values each,
 * and the folded template is re-filled once per line, slot by slot.
 *
 * @param {string} text Mined output, lines joined with `\n`.
 * @returns {string} The reconstructed text, lines joined with `\n`.
 */
export function reconstructLogTemplates(text) {
    const lines = text.split('\n');
    const out = [];
    let i = 0;
    while (i < lines.length) {
        const header = HEADER_RE.exec(lines[i] ?? '');
        if (!header) {
            out.push(lines[i] ?? '');
            i += 1;
            continue;
        }
        const n = Number(header[1]);
        const folded = header[2] ?? '';
        i += 1;
        // Gather column encodings until the terminator; lines inside a block
        // that are not column lines are ignored.
        const encodings = [];
        while (i < lines.length && lines[i] !== CLOSE) {
            const col = COL_RE.exec(lines[i] ?? '');
            if (col) {
                encodings.push(col[1] ?? '');
            }
            i += 1;
        }
        i += 1; // skip CLOSE
        const columns = encodings.map((enc) => decodeColumn(enc, n));
        // Constant slots were folded into the template by the encoder, so the
        // remaining placeholders line up one-to-one with the decoded columns.
        const parts = folded.split(PLACEHOLDER);
        for (let k = 0; k < n; k += 1) {
            let line = parts[0] ?? '';
            for (let c = 0; c < columns.length; c += 1) {
                line += (columns[c]?.[k] ?? '') + (parts[c + 1] ?? '');
            }
            out.push(line);
        }
    }
    return out.join('\n');
}
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
import type { ITokenSavings } from './token-savings.js';
|
|
2
|
+
import { EContentType } from '../content/content-type.js';
|
|
3
|
+
/**
|
|
4
|
+
* Approximate token count. Average English token ≈ 4 chars; denser classes use
|
|
5
|
+
* a class-specific ratio when `contentType` is supplied. Only the character
|
|
6
|
+
* term is typed — the `words * 1.3` floor is content-independent. Pure: reads
|
|
7
|
+
* only its args + a frozen table.
|
|
8
|
+
*/
|
|
9
|
+
export declare function estimateTokens(text: string, contentType?: EContentType): number;
|
|
10
|
+
/**
|
|
11
|
+
* Measure the token delta between a before/after string (optionally typed).
|
|
12
|
+
* Clamps at zero so a compressor is never reported as a net loss. Both sides
|
|
13
|
+
* use the same `contentType`, so a uniform divisor cannot flip a real
|
|
14
|
+
* reduction into a false passthrough.
|
|
15
|
+
*/
|
|
16
|
+
export declare function measureSavings(before: string, after: string, contentType?: EContentType): ITokenSavings;
|
|
17
|
+
//# sourceMappingURL=estimate-tokens.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"estimate-tokens.d.ts","sourceRoot":"","sources":["../../src/tokens/estimate-tokens.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,aAAa,EAAE,MAAM,oBAAoB,CAAC;AACxD,OAAO,EAAE,YAAY,EAAE,MAAM,4BAA4B,CAAC;AA6B1D;;;;;GAKG;AACH,wBAAgB,cAAc,CAAC,IAAI,EAAE,MAAM,EAAE,WAAW,CAAC,EAAE,YAAY,GAAG,MAAM,CAO/E;AAED;;;;;GAKG;AACH,wBAAgB,cAAc,CAC5B,MAAM,EAAE,MAAM,EACd,KAAK,EAAE,MAAM,EACb,WAAW,CAAC,EAAE,YAAY,GACzB,aAAa,CAMf"}
|
|
@@ -0,0 +1,53 @@
|
|
|
1
|
+
import { EContentType } from "../content/content-type.js";
|
|
2
|
+
/**
 * Chars-per-token by content class. Denser content (JSON punctuation, code
 * operators) packs more characters per BPE token than prose, so a LARGER
 * divisor yields a LOWER (more accurate) token estimate. Prose stays at 4 —
 * the exact legacy value — so the untyped path is byte-identical.
 *
 * Frozen so the estimator can treat it as read-only shared state. Keys are
 * `EContentType` members; a member without an entry here has no divisor.
 */
const CHARS_PER_TOKEN = Object.freeze({
    [EContentType.JsonArray]: 2.5,
    [EContentType.Json]: 2.5,
    [EContentType.GitDiff]: 3.0,
    [EContentType.SearchResults]: 3.0,
    [EContentType.BuildLog]: 3.5,
    [EContentType.SourceCode]: 3.2,
    [EContentType.Markdown]: 4.0,
    [EContentType.Yaml]: 3.5,
    [EContentType.Csv]: 3.0,
    [EContentType.PlainText]: 4.0,
});
/**
 * The legacy divisor. With no content type, `estimateTokens` reproduces the
 * exact `max(ceil(chars/4), ceil(words*1.3))` formula bit-for-bit, keeping it
 * in lockstep with `@shrkcrft/context`'s separate estimator. DO NOT change this
 * default or the `words*1.3` floor without updating that peer in step.
 */
const DEFAULT_CHARS_PER_TOKEN = 4;
|
|
27
|
+
/**
 * Approximate token count. Average English token ≈ 4 chars; denser classes use
 * a class-specific ratio when `contentType` is supplied. Only the character
 * term is typed — the `words * 1.3` floor is content-independent. Pure: reads
 * only its args + a frozen table.
 *
 * @param {string} text Text to measure. Empty / falsy input counts as 0 tokens.
 * @param {EContentType} [contentType] Optional content class selecting the
 *   chars-per-token divisor. When omitted, or when the class has no numeric
 *   entry in the table, the default divisor is used.
 * @returns {number} `max(ceil(chars / divisor), ceil(words * 1.3))`.
 */
export function estimateTokens(text, contentType) {
    if (!text) {
        return 0;
    }
    // A content type missing from the table used to yield an `undefined`
    // divisor and therefore a NaN estimate. Fall back to the default divisor
    // instead; the untyped path is unchanged.
    const mapped = contentType === undefined ? undefined : CHARS_PER_TOKEN[contentType];
    const divisor = typeof mapped === 'number' ? mapped : DEFAULT_CHARS_PER_TOKEN;
    const chars = text.length;
    // Whitespace-only text still counts as one word: ''.split(...) is [''].
    const words = text.trim().split(/\s+/).length;
    return Math.max(Math.ceil(chars / divisor), Math.ceil(words * 1.3));
}
|
|
41
|
+
/**
 * Compare the estimated token counts of a before/after pair, using the same
 * (optional) `contentType` for both sides so the divisor cannot skew the
 * comparison.
 *
 * `saved` is clamped at zero, so output that grew is reported as no saving
 * rather than a negative one. `ratio` is `saved / before` rounded to four
 * decimal places, and 0 when `before` is 0.
 */
export function measureSavings(before, after, contentType) {
    const [tokensBefore, tokensAfter] = [before, after].map((text) => estimateTokens(text, contentType));
    const saved = Math.max(tokensBefore - tokensAfter, 0);
    let ratio = 0;
    if (tokensBefore !== 0) {
        ratio = Math.round((saved / tokensBefore) * 10000) / 10000;
    }
    return { before: tokensBefore, after: tokensAfter, saved, ratio };
}
|
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Token-accounting summary for a compression pass. Every compressor in this
|
|
3
|
+
* package reports its effect through this shape so savings are measured, not
|
|
4
|
+
* assumed.
|
|
5
|
+
*/
|
|
6
|
+
export interface ITokenSavings {
|
|
7
|
+
/** Estimated tokens of the input. */
|
|
8
|
+
before: number;
|
|
9
|
+
/** Estimated tokens of the output. */
|
|
10
|
+
after: number;
|
|
11
|
+
/** `before - after` (never negative; clamped at 0). */
|
|
12
|
+
saved: number;
|
|
13
|
+
/**
|
|
14
|
+
* Fraction saved in `[0, 1]`, rounded to 4 dp. `0` when the input was
|
|
15
|
+
* empty or the output grew (a compressor must never be reported as a
|
|
16
|
+
* net loss).
|
|
17
|
+
*/
|
|
18
|
+
ratio: number;
|
|
19
|
+
}
|
|
20
|
+
//# sourceMappingURL=token-savings.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"token-savings.d.ts","sourceRoot":"","sources":["../../src/tokens/token-savings.ts"],"names":[],"mappings":"AAAA;;;;GAIG;AACH,MAAM,WAAW,aAAa;IAC5B,qCAAqC;IACrC,MAAM,EAAE,MAAM,CAAC;IACf,sCAAsC;IACtC,KAAK,EAAE,MAAM,CAAC;IACd,uDAAuD;IACvD,KAAK,EAAE,MAAM,CAAC;IACd;;;;OAIG;IACH,KAAK,EAAE,MAAM,CAAC;CACf"}
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
export {};
|
package/package.json
ADDED
|
@@ -0,0 +1,52 @@
|
|
|
1
|
+
{
|
|
2
|
+
"name": "@shrkcrft/compress",
|
|
3
|
+
"version": "0.1.0-alpha.16",
|
|
4
|
+
"description": "SharkCraft deterministic context-compression engine: content routing, lossless columnar/table compaction, log/search/diff line reduction, and reversible Compress-Cache-Retrieve (CCR). No model inside — every transform is a pure function of its input.",
|
|
5
|
+
"license": "MIT",
|
|
6
|
+
"author": "SharkCraft contributors",
|
|
7
|
+
"type": "module",
|
|
8
|
+
"main": "./dist/index.js",
|
|
9
|
+
"types": "./dist/index.d.ts",
|
|
10
|
+
"exports": {
|
|
11
|
+
".": {
|
|
12
|
+
"types": "./dist/index.d.ts",
|
|
13
|
+
"bun": "./src/index.ts",
|
|
14
|
+
"import": "./dist/index.js",
|
|
15
|
+
"default": "./dist/index.js"
|
|
16
|
+
}
|
|
17
|
+
},
|
|
18
|
+
"files": [
|
|
19
|
+
"dist",
|
|
20
|
+
"README.md",
|
|
21
|
+
"LICENSE"
|
|
22
|
+
],
|
|
23
|
+
"repository": {
|
|
24
|
+
"type": "git",
|
|
25
|
+
"url": "git+https://github.com/shrkcrft/sharkcraft.git",
|
|
26
|
+
"directory": "packages/compress"
|
|
27
|
+
},
|
|
28
|
+
"homepage": "https://github.com/shrkcrft/sharkcraft",
|
|
29
|
+
"bugs": {
|
|
30
|
+
"url": "https://github.com/shrkcrft/sharkcraft/issues"
|
|
31
|
+
},
|
|
32
|
+
"keywords": [
|
|
33
|
+
"sharkcraft",
|
|
34
|
+
"compression",
|
|
35
|
+
"tokens",
|
|
36
|
+
"ccr",
|
|
37
|
+
"deterministic"
|
|
38
|
+
],
|
|
39
|
+
"engines": {
|
|
40
|
+
"bun": ">=1.1.0",
|
|
41
|
+
"node": ">=18"
|
|
42
|
+
},
|
|
43
|
+
"scripts": {
|
|
44
|
+
"typecheck": "tsc --noEmit -p tsconfig.json"
|
|
45
|
+
},
|
|
46
|
+
"dependencies": {
|
|
47
|
+
"@shrkcrft/core": "^0.1.0-alpha.16"
|
|
48
|
+
},
|
|
49
|
+
"publishConfig": {
|
|
50
|
+
"access": "public"
|
|
51
|
+
}
|
|
52
|
+
}
|