@shrkcrft/compress 0.1.0-alpha.16

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (162) hide show
  1. package/LICENSE +21 -0
  2. package/README.md +21 -0
  3. package/dist/cache/align-volatile-tokens.d.ts +13 -0
  4. package/dist/cache/align-volatile-tokens.d.ts.map +1 -0
  5. package/dist/cache/align-volatile-tokens.js +51 -0
  6. package/dist/cache/alignment-map.d.ts +23 -0
  7. package/dist/cache/alignment-map.d.ts.map +1 -0
  8. package/dist/cache/alignment-map.js +1 -0
  9. package/dist/cache/alignment-result.d.ts +11 -0
  10. package/dist/cache/alignment-result.d.ts.map +1 -0
  11. package/dist/cache/alignment-result.js +1 -0
  12. package/dist/cache/detect-volatile-tokens.d.ts +10 -0
  13. package/dist/cache/detect-volatile-tokens.d.ts.map +1 -0
  14. package/dist/cache/detect-volatile-tokens.js +41 -0
  15. package/dist/cache/placeholder.d.ts +28 -0
  16. package/dist/cache/placeholder.d.ts.map +1 -0
  17. package/dist/cache/placeholder.js +0 -0
  18. package/dist/cache/restore-volatile-tokens.d.ts +10 -0
  19. package/dist/cache/restore-volatile-tokens.d.ts.map +1 -0
  20. package/dist/cache/restore-volatile-tokens.js +21 -0
  21. package/dist/cache/volatile-classify.d.ts +11 -0
  22. package/dist/cache/volatile-classify.d.ts.map +1 -0
  23. package/dist/cache/volatile-classify.js +35 -0
  24. package/dist/cache/volatile-kind.d.ts +13 -0
  25. package/dist/cache/volatile-kind.d.ts.map +1 -0
  26. package/dist/cache/volatile-kind.js +13 -0
  27. package/dist/cache/volatile-token.d.ts +14 -0
  28. package/dist/cache/volatile-token.d.ts.map +1 -0
  29. package/dist/cache/volatile-token.js +1 -0
  30. package/dist/ccr/ccr-entry.d.ts +13 -0
  31. package/dist/ccr/ccr-entry.d.ts.map +1 -0
  32. package/dist/ccr/ccr-entry.js +1 -0
  33. package/dist/ccr/ccr-key.d.ts +9 -0
  34. package/dist/ccr/ccr-key.d.ts.map +1 -0
  35. package/dist/ccr/ccr-key.js +19 -0
  36. package/dist/ccr/ccr-marker.d.ts +23 -0
  37. package/dist/ccr/ccr-marker.d.ts.map +1 -0
  38. package/dist/ccr/ccr-marker.js +30 -0
  39. package/dist/ccr/ccr-store.d.ts +18 -0
  40. package/dist/ccr/ccr-store.d.ts.map +1 -0
  41. package/dist/ccr/ccr-store.js +1 -0
  42. package/dist/ccr/file-ccr-store.d.ts +19 -0
  43. package/dist/ccr/file-ccr-store.d.ts.map +1 -0
  44. package/dist/ccr/file-ccr-store.js +53 -0
  45. package/dist/ccr/in-memory-ccr-store.d.ts +21 -0
  46. package/dist/ccr/in-memory-ccr-store.d.ts.map +1 -0
  47. package/dist/ccr/in-memory-ccr-store.js +45 -0
  48. package/dist/ccr/ttl-file-ccr-store.d.ts +43 -0
  49. package/dist/ccr/ttl-file-ccr-store.d.ts.map +1 -0
  50. package/dist/ccr/ttl-file-ccr-store.js +117 -0
  51. package/dist/code/compress-code.d.ts +4 -0
  52. package/dist/code/compress-code.d.ts.map +1 -0
  53. package/dist/code/compress-code.js +294 -0
  54. package/dist/compress-content.d.ts +11 -0
  55. package/dist/compress-content.d.ts.map +1 -0
  56. package/dist/compress-content.js +79 -0
  57. package/dist/content/content-type.d.ts +28 -0
  58. package/dist/content/content-type.d.ts.map +1 -0
  59. package/dist/content/content-type.js +28 -0
  60. package/dist/content/detect-content-type.d.ts +9 -0
  61. package/dist/content/detect-content-type.d.ts.map +1 -0
  62. package/dist/content/detect-content-type.js +184 -0
  63. package/dist/content/segment.d.ts +21 -0
  64. package/dist/content/segment.d.ts.map +1 -0
  65. package/dist/content/segment.js +117 -0
  66. package/dist/index.d.ts +61 -0
  67. package/dist/index.d.ts.map +1 -0
  68. package/dist/index.js +49 -0
  69. package/dist/json/compress-json.d.ts +18 -0
  70. package/dist/json/compress-json.d.ts.map +1 -0
  71. package/dist/json/compress-json.js +139 -0
  72. package/dist/json/render-compact-json.d.ts +10 -0
  73. package/dist/json/render-compact-json.d.ts.map +1 -0
  74. package/dist/json/render-compact-json.js +18 -0
  75. package/dist/relevance/bm25.d.ts +26 -0
  76. package/dist/relevance/bm25.d.ts.map +1 -0
  77. package/dist/relevance/bm25.js +115 -0
  78. package/dist/result/compress-options.d.ts +26 -0
  79. package/dist/result/compress-options.d.ts.map +1 -0
  80. package/dist/result/compress-options.js +1 -0
  81. package/dist/result/compression-result.d.ts +26 -0
  82. package/dist/result/compression-result.d.ts.map +1 -0
  83. package/dist/result/compression-result.js +1 -0
  84. package/dist/result/compression-strategy.d.ts +30 -0
  85. package/dist/result/compression-strategy.d.ts.map +1 -0
  86. package/dist/result/compression-strategy.js +30 -0
  87. package/dist/table/adaptive-size.d.ts +46 -0
  88. package/dist/table/adaptive-size.d.ts.map +1 -0
  89. package/dist/table/adaptive-size.js +170 -0
  90. package/dist/table/apply-value-dictionaries.d.ts +30 -0
  91. package/dist/table/apply-value-dictionaries.d.ts.map +1 -0
  92. package/dist/table/apply-value-dictionaries.js +99 -0
  93. package/dist/table/column-presence.d.ts +20 -0
  94. package/dist/table/column-presence.d.ts.map +1 -0
  95. package/dist/table/column-presence.js +52 -0
  96. package/dist/table/columnar-json.d.ts +24 -0
  97. package/dist/table/columnar-json.d.ts.map +1 -0
  98. package/dist/table/columnar-json.js +83 -0
  99. package/dist/table/columnar-table.d.ts +24 -0
  100. package/dist/table/columnar-table.d.ts.map +1 -0
  101. package/dist/table/columnar-table.js +1 -0
  102. package/dist/table/compact-object-array.d.ts +12 -0
  103. package/dist/table/compact-object-array.d.ts.map +1 -0
  104. package/dist/table/compact-object-array.js +88 -0
  105. package/dist/table/field-spec.d.ts +13 -0
  106. package/dist/table/field-spec.d.ts.map +1 -0
  107. package/dist/table/field-spec.js +1 -0
  108. package/dist/table/object-map.d.ts +28 -0
  109. package/dist/table/object-map.d.ts.map +1 -0
  110. package/dist/table/object-map.js +119 -0
  111. package/dist/table/render-table.d.ts +11 -0
  112. package/dist/table/render-table.d.ts.map +1 -0
  113. package/dist/table/render-table.js +39 -0
  114. package/dist/table/sample-object-array.d.ts +11 -0
  115. package/dist/table/sample-object-array.d.ts.map +1 -0
  116. package/dist/table/sample-object-array.js +171 -0
  117. package/dist/table/sample-options.d.ts +29 -0
  118. package/dist/table/sample-options.d.ts.map +1 -0
  119. package/dist/table/sample-options.js +1 -0
  120. package/dist/table/sampled-table.d.ts +33 -0
  121. package/dist/table/sampled-table.d.ts.map +1 -0
  122. package/dist/table/sampled-table.js +8 -0
  123. package/dist/table/table-compaction.d.ts +19 -0
  124. package/dist/table/table-compaction.d.ts.map +1 -0
  125. package/dist/table/table-compaction.js +1 -0
  126. package/dist/table/table-formats.d.ts +23 -0
  127. package/dist/table/table-formats.d.ts.map +1 -0
  128. package/dist/table/table-formats.js +233 -0
  129. package/dist/text/compress-diff.d.ts +20 -0
  130. package/dist/text/compress-diff.d.ts.map +1 -0
  131. package/dist/text/compress-diff.js +344 -0
  132. package/dist/text/compress-lines.d.ts +12 -0
  133. package/dist/text/compress-lines.d.ts.map +1 -0
  134. package/dist/text/compress-lines.js +44 -0
  135. package/dist/text/compress-log.d.ts +12 -0
  136. package/dist/text/compress-log.d.ts.map +1 -0
  137. package/dist/text/compress-log.js +202 -0
  138. package/dist/text/compress-markdown.d.ts +15 -0
  139. package/dist/text/compress-markdown.d.ts.map +1 -0
  140. package/dist/text/compress-markdown.js +96 -0
  141. package/dist/text/compress-search.d.ts +11 -0
  142. package/dist/text/compress-search.d.ts.map +1 -0
  143. package/dist/text/compress-search.js +78 -0
  144. package/dist/text/finalize.d.ts +21 -0
  145. package/dist/text/finalize.d.ts.map +1 -0
  146. package/dist/text/finalize.js +54 -0
  147. package/dist/text/line-utils.d.ts +20 -0
  148. package/dist/text/line-utils.d.ts.map +1 -0
  149. package/dist/text/line-utils.js +65 -0
  150. package/dist/text/lockfile-names.d.ts +3 -0
  151. package/dist/text/lockfile-names.d.ts.map +1 -0
  152. package/dist/text/lockfile-names.js +33 -0
  153. package/dist/text/log-template.d.ts +31 -0
  154. package/dist/text/log-template.d.ts.map +1 -0
  155. package/dist/text/log-template.js +239 -0
  156. package/dist/tokens/estimate-tokens.d.ts +17 -0
  157. package/dist/tokens/estimate-tokens.d.ts.map +1 -0
  158. package/dist/tokens/estimate-tokens.js +53 -0
  159. package/dist/tokens/token-savings.d.ts +20 -0
  160. package/dist/tokens/token-savings.d.ts.map +1 -0
  161. package/dist/tokens/token-savings.js +1 -0
  162. package/package.json +52 -0
@@ -0,0 +1,239 @@
1
+ /**
2
+ * Drain-style log-template mining — a LOSSLESS pre-pass for {@link compressLog}.
3
+ *
4
+ * Repeated structured log lines (`worker-3 processing batch 17 ok` × N) carry
5
+ * almost no new information per line: the fixed words repeat and only a few
6
+ * variable tokens move. This miner tokenizes each line, replaces its variable
7
+ * tokens (numbers, hex, UUIDs, ISO timestamps, quoted strings) with a `{}`
8
+ * placeholder to form a *template*, groups consecutive lines sharing a
9
+ * template, and collapses each run to one template plus a compact per-column
10
+ * encoding of the captured variables.
11
+ *
12
+ * It is **lossless by construction**: a template is exactly the original line
13
+ * with its variable matches replaced by `{}`, so `template ⋈ variables` rebuilds
14
+ * every original line, in order — no CCR needed. {@link reconstructLogTemplates}
15
+ * is the inverse and is exercised by the round-trip tests.
16
+ *
17
+ * Only runs with ≥1 variable column collapse; pure-identical repeats are left
18
+ * for the downstream signal-selector's de-duplication so its behaviour (and the
19
+ * `… N omitted …` markers callers rely on) is preserved.
20
+ */
21
+ // Block sentinels (U+27E6 / U+27E7), assumed absent from real logs. If any input line
22
+ // contains either, mining is skipped wholesale so a collision can't corrupt round-trip.
23
+ const BLOCK_OPEN = '⟦';
24
+ const BLOCK_CLOSE = '⟧';
25
+ const PLACEHOLDER = '{}';
26
+ /** Minimum consecutive same-template lines before a run is worth collapsing. */
27
+ const MIN_RUN = 3;
28
+ // Variable token classes, specific → general. Correctness of the round-trip
29
+ // does NOT depend on this list (template+matches always rebuild the line); it
30
+ // only governs how MUCH collapses.
31
// One regex source per variable-token class. The alternatives are joined in
// array order, so the more specific shapes are tried before the generic number.
const VAR_PATTERNS = [
    '\\d{4}-\\d{2}-\\d{2}[T ]\\d{2}:\\d{2}:\\d{2}(?:\\.\\d+)?(?:Z|[+-]\\d{2}:?\\d{2})?', // ISO timestamp
    '[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{12}', // UUID
    '0x[0-9a-fA-F]+', // hex literal
    '\\b[0-9a-fA-F]{12,}\\b', // long hash
    '"(?:[^"\\\\]|\\\\.)*"', // double-quoted string
    "'(?:[^'\\\\]|\\\\.)*'", // single-quoted string
    '\\d+(?:\\.\\d+)?', // integer / decimal
];
const VAR_RE = new RegExp(VAR_PATTERNS.join('|'), 'g');
40
+ // High-signal lines: an error/crash keyword (SIGNAL_RE) or an indented `at …` /
41
+ // `File "…", line N` stack frame (STACK_RE). Never folded into a template, so they stay verbatim.
42
+ const SIGNAL_RE = /\b(?:ERROR|FATAL|FAIL(?:ED|URE)?|EXCEPTION|panic|segmentation fault|segfault|core dumped|bus error|out of memory|oom|traceback|undefined reference|undefined symbol|assertion (?:failed)|SIG(?:SEGV|ABRT|KILL|BUS|FPE|ILL))\b/i;
43
+ const STACK_RE = /^\s+(?:at\s+\S+|File ".*", line \d+)/;
44
/**
 * Collapse each run of consecutive lines that share a template into one block.
 * Lossless; {@link reconstructLogTemplates} is the inverse.
 *
 * @param {string[]} lines Log lines, one array entry per line.
 * @returns {{ lines: string[], reduced: boolean }} The output lines, and
 *   whether at least one run was collapsed. When any input line contains a
 *   block sentinel, a copy of the input is returned with `reduced: false`.
 */
export function mineLogTemplates(lines) {
    // A sentinel already present in the input could be mistaken for a block
    // marker on the way back, so hand the input back untouched.
    for (const line of lines) {
        const hasSentinel = line.includes(BLOCK_OPEN) || line.includes(BLOCK_CLOSE);
        if (hasSentinel) {
            return { lines: [...lines], reduced: false };
        }
    }
    // One template per line; null marks a line that must stay verbatim.
    const templates = lines.map((line) => {
        const keepVerbatim = line.includes(PLACEHOLDER) || SIGNAL_RE.test(line) || STACK_RE.test(line);
        return keepVerbatim ? null : line.replace(VAR_RE, PLACEHOLDER);
    });
    const result = [];
    let collapsedAny = false;
    let start = 0;
    while (start < lines.length) {
        const template = templates[start];
        if (template == null) {
            result.push(lines[start] ?? '');
            start += 1;
            continue;
        }
        // Extend the run while the following lines share this template.
        let end = start + 1;
        while (end < lines.length && templates[end] === template) {
            end += 1;
        }
        const run = lines.slice(start, end);
        const block = run.length < MIN_RUN ? null : encodeBlock(template, run);
        if (block) {
            result.push(...block);
            collapsedAny = true;
        }
        else {
            result.push(...run);
        }
        start = end;
    }
    return { lines: result, reduced: collapsedAny };
}
84
/**
 * Turn one same-template run into its block lines: a header carrying the run
 * length and the folded template, one line per variable column, and a closing
 * marker.
 *
 * @param {string} template The run's shared template.
 * @param {string[]} run The original lines of the run.
 * @returns {string[] | null} The block, or null when every column is constant
 *   (pure-identical repeats are left for the selector's dedup).
 */
function encodeBlock(template, run) {
    const literals = template.split(PLACEHOLDER);
    const matchesPerLine = run.map((line) => line.match(VAR_RE) ?? []);
    const movingColumns = [];
    let foldedTemplate = literals[0] ?? '';
    // Column c sits between literals[c] and literals[c + 1].
    literals.slice(1).forEach((literal, c) => {
        const column = matchesPerLine.map((matches) => matches[c] ?? '');
        const first = column[0];
        if (column.every((value) => value === first)) {
            // Constant across the run: write the value back into the template.
            foldedTemplate += first ?? '';
        }
        else {
            foldedTemplate += PLACEHOLDER;
            movingColumns.push(column);
        }
        foldedTemplate += literal;
    });
    // No moving parts → leave the run for the selector's identical-line dedup.
    if (movingColumns.length === 0) {
        return null;
    }
    return [
        `${BLOCK_OPEN}×${run.length}${BLOCK_CLOSE} ${foldedTemplate}`,
        ...movingColumns.map((column) => `${BLOCK_OPEN}c${BLOCK_CLOSE} ${encodeColumn(column)}`),
        `${BLOCK_OPEN}/${BLOCK_CLOSE}`,
    ];
}
117
/**
 * Encode one variable column, trying the forms in order: `seq` (arithmetic
 * progression), then `cyc` (short repeating pattern), then `lit` (every value
 * listed).
 *
 * @param {string[]} values The column's value on each line of the run.
 * @returns {string} The encoding: a kind tag, a space, then the payload.
 */
function encodeColumn(values) {
    const progression = asArithmetic(values);
    if (progression) {
        return `seq ${progression.start} ${progression.step}`;
    }
    const pattern = asCycle(values);
    const tag = pattern ? 'cyc' : 'lit';
    const fields = pattern ? pattern : values;
    return `${tag} ${fields.map(escapeValue).join('|')}`;
}
127
/**
 * Expand a column encoding back into its values.
 *
 * @param {string} enc The encoding: a kind tag (`seq`, `cyc`, or anything else
 *   for a literal list), optionally followed by a space and a payload.
 * @param {number} n Number of values to produce for `seq` and `cyc`.
 * @returns {string[]} The decoded values. A literal list returns however many
 *   fields its payload holds, regardless of `n`.
 */
function decodeColumn(enc, n) {
    const space = enc.indexOf(' ');
    const hasPayload = space >= 0;
    const kind = hasPayload ? enc.slice(0, space) : enc;
    const payload = hasPayload ? enc.slice(space + 1) : '';
    switch (kind) {
        case 'seq': {
            // Payload is "<start> <step>".
            const [start, step] = payload.split(' ').map((field) => Number(field));
            const first = start ?? 0;
            const delta = step ?? 0;
            return Array.from({ length: n }, (_, k) => String(first + k * delta));
        }
        case 'cyc': {
            const pattern = splitEscaped(payload);
            return Array.from({ length: n }, (_, k) => pattern[k % pattern.length] ?? '');
        }
        default:
            // lit
            return splitEscaped(payload);
    }
}
143
/**
 * Detect an integer arithmetic progression that reproduces every value exactly.
 *
 * @param {string[]} values Column values.
 * @returns {{ start: number, step: number } | null} The progression, or null
 *   when there are fewer than two values, a value is not a canonical
 *   non-negative integer, or the values do not follow one constant step.
 */
function asArithmetic(values) {
    if (values.length < 2) {
        return null;
    }
    // Canonical means digits only, no leading zero, and at most 15 digits so
    // the number converts exactly.
    const isCanonical = (v) => /^\d+$/.test(v) && !(v.length > 1 && v[0] === '0') && !(v.length > 15);
    if (!values.every(isCanonical)) {
        return null;
    }
    const numbers = values.map((v) => Number(v));
    const [first, second] = numbers;
    const step = second - first;
    // Compare the rendered strings, so decoding yields the same text.
    const reproducesAll = values.every((v, k) => String(first + k * step) === v);
    return reproducesAll ? { start: first, step } : null;
}
161
/**
 * Find the shortest repeating pattern (period 2..8) that covers the whole column.
 *
 * @param {string[]} values Column values.
 * @returns {string[] | null} The first period's values, or null when no period
 *   up to min(8, half the column length) reproduces every value.
 */
function asCycle(values) {
    const total = values.length;
    const longestPeriod = Math.min(8, Math.floor(total / 2));
    for (let period = 2; period <= longestPeriod; period += 1) {
        const repeats = values.every((value, k) => value === values[k % period]);
        if (repeats) {
            return values.slice(0, period);
        }
    }
    return null;
}
178
/**
 * Backslash-escape every `\` and `|` in a value so it can be joined with `|`.
 *
 * @param {string} s Raw column value.
 * @returns {string} The escaped value.
 */
function escapeValue(s) {
    return s.replace(/[\\|]/g, (ch) => `\\${ch}`);
}
181
/**
 * Split on unescaped `|` and drop the escaping backslashes. Inverse of
 * {@link escapeValue}.
 *
 * @param {string} s The `|`-joined, backslash-escaped fields.
 * @returns {string[]} The fields; always at least one, possibly empty.
 */
function splitEscaped(s) {
    const fields = [''];
    let pos = 0;
    while (pos < s.length) {
        const ch = s[pos];
        const last = fields.length - 1;
        if (ch === '\\' && pos + 1 < s.length) {
            // Escaped character: take the next character literally.
            fields[last] += s[pos + 1];
            pos += 2;
        }
        else if (ch === '|') {
            fields.push('');
            pos += 1;
        }
        else {
            // Ordinary character (or a lone trailing backslash).
            fields[last] += ch;
            pos += 1;
        }
    }
    return fields;
}
202
// Parsers for the block header line, a column line, and the close marker.
// The `s` (dotAll) flag matters: without it `.` does not match `\r`, U+2028 or
// U+2029, so a header or column line carrying one of them (for example CRLF
// input that was split on `\n`) would fail to match and its block would never
// be expanded, breaking the lossless round-trip.
const HEADER_RE = new RegExp(`^${BLOCK_OPEN}×(\\d+)${BLOCK_CLOSE} (.*)$`, 's');
const COL_RE = new RegExp(`^${BLOCK_OPEN}c${BLOCK_CLOSE} (.*)$`, 's');
const CLOSE = `${BLOCK_OPEN}/${BLOCK_CLOSE}`;
205
/**
 * Inverse of {@link mineLogTemplates}: expand every block back into the lines
 * it was built from; all other lines pass through unchanged.
 *
 * @param {string} text Mined output, with lines joined by `\n`.
 * @returns {string} The expanded text, with lines joined by `\n`.
 */
export function reconstructLogTemplates(text) {
    const input = text.split('\n');
    const restored = [];
    let cursor = 0;
    while (cursor < input.length) {
        const current = input[cursor] ?? '';
        const header = HEADER_RE.exec(current);
        cursor += 1;
        if (header === null) {
            restored.push(current);
            continue;
        }
        const count = Number(header[1]);
        const pieces = (header[2] ?? '').split(PLACEHOLDER);
        // Decode column lines up to the close marker; anything else inside the
        // block is skipped.
        const columns = [];
        for (; cursor < input.length && input[cursor] !== CLOSE; cursor += 1) {
            const col = COL_RE.exec(input[cursor] ?? '');
            if (col) {
                columns.push(decodeColumn(col[1] ?? '', count));
            }
        }
        cursor += 1; // step past the close marker
        // Rebuild each line by interleaving template pieces with column values.
        for (let row = 0; row < count; row += 1) {
            const line = columns.reduce(
                (acc, column, c) => acc + ((column?.[row] ?? '') + (pieces[c + 1] ?? '')),
                pieces[0] ?? '',
            );
            restored.push(line);
        }
    }
    return restored.join('\n');
}
@@ -0,0 +1,17 @@
1
+ import type { ITokenSavings } from './token-savings.js';
2
+ import { EContentType } from '../content/content-type.js';
3
+ /**
4
+ * Approximate token count. Average English token ≈ 4 chars; denser classes use
5
+ * a class-specific ratio when `contentType` is supplied. Only the character
6
+ * term is typed — the `words * 1.3` floor is content-independent. Pure: reads
7
+ * only its args + a frozen table.
8
+ */
9
+ export declare function estimateTokens(text: string, contentType?: EContentType): number;
10
+ /**
11
+ * Measure the token delta between a before/after string (optionally typed).
12
+ * Clamps at zero so a compressor is never reported as a net loss. Both sides
13
+ * use the same `contentType`, so a uniform divisor cannot flip a real
14
+ * reduction into a false passthrough.
15
+ */
16
+ export declare function measureSavings(before: string, after: string, contentType?: EContentType): ITokenSavings;
17
+ //# sourceMappingURL=estimate-tokens.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"estimate-tokens.d.ts","sourceRoot":"","sources":["../../src/tokens/estimate-tokens.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,aAAa,EAAE,MAAM,oBAAoB,CAAC;AACxD,OAAO,EAAE,YAAY,EAAE,MAAM,4BAA4B,CAAC;AA6B1D;;;;;GAKG;AACH,wBAAgB,cAAc,CAAC,IAAI,EAAE,MAAM,EAAE,WAAW,CAAC,EAAE,YAAY,GAAG,MAAM,CAO/E;AAED;;;;;GAKG;AACH,wBAAgB,cAAc,CAC5B,MAAM,EAAE,MAAM,EACd,KAAK,EAAE,MAAM,EACb,WAAW,CAAC,EAAE,YAAY,GACzB,aAAa,CAMf"}
@@ -0,0 +1,53 @@
1
+ import { EContentType } from "../content/content-type.js";
2
+ /**
3
+ * Chars-per-token by content class. Denser content (JSON punctuation, code
4
+ * operators) packs fewer characters per BPE token than prose, so a SMALLER
5
+ * divisor yields a HIGHER (more accurate) token estimate. Prose stays at 4 —
6
+ * the exact legacy value — so the untyped path is byte-identical.
7
+ */
8
+ const CHARS_PER_TOKEN = Object.freeze({
9
+ [EContentType.JsonArray]: 2.5,
10
+ [EContentType.Json]: 2.5,
11
+ [EContentType.GitDiff]: 3.0,
12
+ [EContentType.SearchResults]: 3.0,
13
+ [EContentType.BuildLog]: 3.5,
14
+ [EContentType.SourceCode]: 3.2,
15
+ [EContentType.Markdown]: 4.0,
16
+ [EContentType.Yaml]: 3.5,
17
+ [EContentType.Csv]: 3.0,
18
+ [EContentType.PlainText]: 4.0,
19
+ });
20
+ /**
21
+ * The legacy divisor. With no content type, `estimateTokens` reproduces the
22
+ * exact `max(ceil(chars/4), ceil(words*1.3))` formula bit-for-bit, keeping it
23
+ * in lockstep with `@shrkcrft/context`'s separate estimator. DO NOT change this
24
+ * default or the `words*1.3` floor without updating that peer in step.
25
+ */
26
+ const DEFAULT_CHARS_PER_TOKEN = 4;
27
/**
 * Approximate token count. Average English token ≈ 4 chars; denser classes use
 * a class-specific ratio when `contentType` is supplied. Only the character
 * term is typed — the `words * 1.3` floor is content-independent. Pure: reads
 * only its args + a frozen table.
 *
 * @param {string} text Text to measure; empty or otherwise falsy text yields 0.
 * @param {string} [contentType] Key into the chars-per-token table. When it is
 *   omitted, or is not a key of that table, the legacy divisor is used.
 * @returns {number} `max(ceil(chars / divisor), ceil(words * 1.3))`.
 */
export function estimateTokens(text, contentType) {
    if (!text) {
        return 0;
    }
    // A content type with no table entry used to give an `undefined` divisor,
    // which made the whole estimate NaN. Fall back to the legacy divisor.
    const isMapped = contentType !== undefined && Object.hasOwn(CHARS_PER_TOKEN, contentType);
    const divisor = isMapped ? CHARS_PER_TOKEN[contentType] : DEFAULT_CHARS_PER_TOKEN;
    const chars = text.length;
    const words = text.trim().split(/\s+/).length;
    return Math.max(Math.ceil(chars / divisor), Math.ceil(words * 1.3));
}
41
/**
 * Estimate the tokens saved by turning `before` into `after`, using the same
 * optional `contentType` for both sides.
 *
 * @param {string} before Original text.
 * @param {string} after Compressed text.
 * @param {string} [contentType] Passed through to {@link estimateTokens}.
 * @returns {{ before: number, after: number, saved: number, ratio: number }}
 *   Both estimates, their difference clamped at zero, and the saved fraction
 *   rounded to 4 decimal places (0 when the `before` estimate is 0).
 */
export function measureSavings(before, after, contentType) {
    const tokensBefore = estimateTokens(before, contentType);
    const tokensAfter = estimateTokens(after, contentType);
    // Clamp so an output that grew is reported as zero savings.
    const saved = Math.max(0, tokensBefore - tokensAfter);
    let ratio = 0;
    if (tokensBefore !== 0) {
        ratio = Math.round((saved / tokensBefore) * 10000) / 10000;
    }
    return { before: tokensBefore, after: tokensAfter, saved, ratio };
}
@@ -0,0 +1,20 @@
1
+ /**
2
+ * Token-accounting summary for a compression pass. Every compressor in this
3
+ * package reports its effect through this shape so savings are measured, not
4
+ * assumed.
5
+ */
6
+ export interface ITokenSavings {
7
+ /** Estimated tokens of the input. */
8
+ before: number;
9
+ /** Estimated tokens of the output. */
10
+ after: number;
11
+ /** `before - after` (never negative; clamped at 0). */
12
+ saved: number;
13
+ /**
14
+ * Fraction saved in `[0, 1]`, rounded to 4 dp. `0` when the input was
15
+ * empty or the output grew (a compressor must never be reported as a
16
+ * net loss).
17
+ */
18
+ ratio: number;
19
+ }
20
+ //# sourceMappingURL=token-savings.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"token-savings.d.ts","sourceRoot":"","sources":["../../src/tokens/token-savings.ts"],"names":[],"mappings":"AAAA;;;;GAIG;AACH,MAAM,WAAW,aAAa;IAC5B,qCAAqC;IACrC,MAAM,EAAE,MAAM,CAAC;IACf,sCAAsC;IACtC,KAAK,EAAE,MAAM,CAAC;IACd,uDAAuD;IACvD,KAAK,EAAE,MAAM,CAAC;IACd;;;;OAIG;IACH,KAAK,EAAE,MAAM,CAAC;CACf"}
@@ -0,0 +1 @@
1
+ export {};
package/package.json ADDED
@@ -0,0 +1,52 @@
1
+ {
2
+ "name": "@shrkcrft/compress",
3
+ "version": "0.1.0-alpha.16",
4
+ "description": "SharkCraft deterministic context-compression engine: content routing, lossless columnar/table compaction, log/search/diff line reduction, and reversible Compress-Cache-Retrieve (CCR). No model inside — every transform is a pure function of its input.",
5
+ "license": "MIT",
6
+ "author": "SharkCraft contributors",
7
+ "type": "module",
8
+ "main": "./dist/index.js",
9
+ "types": "./dist/index.d.ts",
10
+ "exports": {
11
+ ".": {
12
+ "types": "./dist/index.d.ts",
13
+ "bun": "./src/index.ts",
14
+ "import": "./dist/index.js",
15
+ "default": "./dist/index.js"
16
+ }
17
+ },
18
+ "files": [
19
+ "dist",
20
+ "README.md",
21
+ "LICENSE"
22
+ ],
23
+ "repository": {
24
+ "type": "git",
25
+ "url": "git+https://github.com/shrkcrft/sharkcraft.git",
26
+ "directory": "packages/compress"
27
+ },
28
+ "homepage": "https://github.com/shrkcrft/sharkcraft",
29
+ "bugs": {
30
+ "url": "https://github.com/shrkcrft/sharkcraft/issues"
31
+ },
32
+ "keywords": [
33
+ "sharkcraft",
34
+ "compression",
35
+ "tokens",
36
+ "ccr",
37
+ "deterministic"
38
+ ],
39
+ "engines": {
40
+ "bun": ">=1.1.0",
41
+ "node": ">=18"
42
+ },
43
+ "scripts": {
44
+ "typecheck": "tsc --noEmit -p tsconfig.json"
45
+ },
46
+ "dependencies": {
47
+ "@shrkcrft/core": "^0.1.0-alpha.16"
48
+ },
49
+ "publishConfig": {
50
+ "access": "public"
51
+ }
52
+ }