@shrkcrft/compress 0.1.0-alpha.16

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (162) hide show
  1. package/LICENSE +21 -0
  2. package/README.md +21 -0
  3. package/dist/cache/align-volatile-tokens.d.ts +13 -0
  4. package/dist/cache/align-volatile-tokens.d.ts.map +1 -0
  5. package/dist/cache/align-volatile-tokens.js +51 -0
  6. package/dist/cache/alignment-map.d.ts +23 -0
  7. package/dist/cache/alignment-map.d.ts.map +1 -0
  8. package/dist/cache/alignment-map.js +1 -0
  9. package/dist/cache/alignment-result.d.ts +11 -0
  10. package/dist/cache/alignment-result.d.ts.map +1 -0
  11. package/dist/cache/alignment-result.js +1 -0
  12. package/dist/cache/detect-volatile-tokens.d.ts +10 -0
  13. package/dist/cache/detect-volatile-tokens.d.ts.map +1 -0
  14. package/dist/cache/detect-volatile-tokens.js +41 -0
  15. package/dist/cache/placeholder.d.ts +28 -0
  16. package/dist/cache/placeholder.d.ts.map +1 -0
  17. package/dist/cache/placeholder.js +0 -0
  18. package/dist/cache/restore-volatile-tokens.d.ts +10 -0
  19. package/dist/cache/restore-volatile-tokens.d.ts.map +1 -0
  20. package/dist/cache/restore-volatile-tokens.js +21 -0
  21. package/dist/cache/volatile-classify.d.ts +11 -0
  22. package/dist/cache/volatile-classify.d.ts.map +1 -0
  23. package/dist/cache/volatile-classify.js +35 -0
  24. package/dist/cache/volatile-kind.d.ts +13 -0
  25. package/dist/cache/volatile-kind.d.ts.map +1 -0
  26. package/dist/cache/volatile-kind.js +13 -0
  27. package/dist/cache/volatile-token.d.ts +14 -0
  28. package/dist/cache/volatile-token.d.ts.map +1 -0
  29. package/dist/cache/volatile-token.js +1 -0
  30. package/dist/ccr/ccr-entry.d.ts +13 -0
  31. package/dist/ccr/ccr-entry.d.ts.map +1 -0
  32. package/dist/ccr/ccr-entry.js +1 -0
  33. package/dist/ccr/ccr-key.d.ts +9 -0
  34. package/dist/ccr/ccr-key.d.ts.map +1 -0
  35. package/dist/ccr/ccr-key.js +19 -0
  36. package/dist/ccr/ccr-marker.d.ts +23 -0
  37. package/dist/ccr/ccr-marker.d.ts.map +1 -0
  38. package/dist/ccr/ccr-marker.js +30 -0
  39. package/dist/ccr/ccr-store.d.ts +18 -0
  40. package/dist/ccr/ccr-store.d.ts.map +1 -0
  41. package/dist/ccr/ccr-store.js +1 -0
  42. package/dist/ccr/file-ccr-store.d.ts +19 -0
  43. package/dist/ccr/file-ccr-store.d.ts.map +1 -0
  44. package/dist/ccr/file-ccr-store.js +53 -0
  45. package/dist/ccr/in-memory-ccr-store.d.ts +21 -0
  46. package/dist/ccr/in-memory-ccr-store.d.ts.map +1 -0
  47. package/dist/ccr/in-memory-ccr-store.js +45 -0
  48. package/dist/ccr/ttl-file-ccr-store.d.ts +43 -0
  49. package/dist/ccr/ttl-file-ccr-store.d.ts.map +1 -0
  50. package/dist/ccr/ttl-file-ccr-store.js +117 -0
  51. package/dist/code/compress-code.d.ts +4 -0
  52. package/dist/code/compress-code.d.ts.map +1 -0
  53. package/dist/code/compress-code.js +294 -0
  54. package/dist/compress-content.d.ts +11 -0
  55. package/dist/compress-content.d.ts.map +1 -0
  56. package/dist/compress-content.js +79 -0
  57. package/dist/content/content-type.d.ts +28 -0
  58. package/dist/content/content-type.d.ts.map +1 -0
  59. package/dist/content/content-type.js +28 -0
  60. package/dist/content/detect-content-type.d.ts +9 -0
  61. package/dist/content/detect-content-type.d.ts.map +1 -0
  62. package/dist/content/detect-content-type.js +184 -0
  63. package/dist/content/segment.d.ts +21 -0
  64. package/dist/content/segment.d.ts.map +1 -0
  65. package/dist/content/segment.js +117 -0
  66. package/dist/index.d.ts +61 -0
  67. package/dist/index.d.ts.map +1 -0
  68. package/dist/index.js +49 -0
  69. package/dist/json/compress-json.d.ts +18 -0
  70. package/dist/json/compress-json.d.ts.map +1 -0
  71. package/dist/json/compress-json.js +139 -0
  72. package/dist/json/render-compact-json.d.ts +10 -0
  73. package/dist/json/render-compact-json.d.ts.map +1 -0
  74. package/dist/json/render-compact-json.js +18 -0
  75. package/dist/relevance/bm25.d.ts +26 -0
  76. package/dist/relevance/bm25.d.ts.map +1 -0
  77. package/dist/relevance/bm25.js +115 -0
  78. package/dist/result/compress-options.d.ts +26 -0
  79. package/dist/result/compress-options.d.ts.map +1 -0
  80. package/dist/result/compress-options.js +1 -0
  81. package/dist/result/compression-result.d.ts +26 -0
  82. package/dist/result/compression-result.d.ts.map +1 -0
  83. package/dist/result/compression-result.js +1 -0
  84. package/dist/result/compression-strategy.d.ts +30 -0
  85. package/dist/result/compression-strategy.d.ts.map +1 -0
  86. package/dist/result/compression-strategy.js +30 -0
  87. package/dist/table/adaptive-size.d.ts +46 -0
  88. package/dist/table/adaptive-size.d.ts.map +1 -0
  89. package/dist/table/adaptive-size.js +170 -0
  90. package/dist/table/apply-value-dictionaries.d.ts +30 -0
  91. package/dist/table/apply-value-dictionaries.d.ts.map +1 -0
  92. package/dist/table/apply-value-dictionaries.js +99 -0
  93. package/dist/table/column-presence.d.ts +20 -0
  94. package/dist/table/column-presence.d.ts.map +1 -0
  95. package/dist/table/column-presence.js +52 -0
  96. package/dist/table/columnar-json.d.ts +24 -0
  97. package/dist/table/columnar-json.d.ts.map +1 -0
  98. package/dist/table/columnar-json.js +83 -0
  99. package/dist/table/columnar-table.d.ts +24 -0
  100. package/dist/table/columnar-table.d.ts.map +1 -0
  101. package/dist/table/columnar-table.js +1 -0
  102. package/dist/table/compact-object-array.d.ts +12 -0
  103. package/dist/table/compact-object-array.d.ts.map +1 -0
  104. package/dist/table/compact-object-array.js +88 -0
  105. package/dist/table/field-spec.d.ts +13 -0
  106. package/dist/table/field-spec.d.ts.map +1 -0
  107. package/dist/table/field-spec.js +1 -0
  108. package/dist/table/object-map.d.ts +28 -0
  109. package/dist/table/object-map.d.ts.map +1 -0
  110. package/dist/table/object-map.js +119 -0
  111. package/dist/table/render-table.d.ts +11 -0
  112. package/dist/table/render-table.d.ts.map +1 -0
  113. package/dist/table/render-table.js +39 -0
  114. package/dist/table/sample-object-array.d.ts +11 -0
  115. package/dist/table/sample-object-array.d.ts.map +1 -0
  116. package/dist/table/sample-object-array.js +171 -0
  117. package/dist/table/sample-options.d.ts +29 -0
  118. package/dist/table/sample-options.d.ts.map +1 -0
  119. package/dist/table/sample-options.js +1 -0
  120. package/dist/table/sampled-table.d.ts +33 -0
  121. package/dist/table/sampled-table.d.ts.map +1 -0
  122. package/dist/table/sampled-table.js +8 -0
  123. package/dist/table/table-compaction.d.ts +19 -0
  124. package/dist/table/table-compaction.d.ts.map +1 -0
  125. package/dist/table/table-compaction.js +1 -0
  126. package/dist/table/table-formats.d.ts +23 -0
  127. package/dist/table/table-formats.d.ts.map +1 -0
  128. package/dist/table/table-formats.js +233 -0
  129. package/dist/text/compress-diff.d.ts +20 -0
  130. package/dist/text/compress-diff.d.ts.map +1 -0
  131. package/dist/text/compress-diff.js +344 -0
  132. package/dist/text/compress-lines.d.ts +12 -0
  133. package/dist/text/compress-lines.d.ts.map +1 -0
  134. package/dist/text/compress-lines.js +44 -0
  135. package/dist/text/compress-log.d.ts +12 -0
  136. package/dist/text/compress-log.d.ts.map +1 -0
  137. package/dist/text/compress-log.js +202 -0
  138. package/dist/text/compress-markdown.d.ts +15 -0
  139. package/dist/text/compress-markdown.d.ts.map +1 -0
  140. package/dist/text/compress-markdown.js +96 -0
  141. package/dist/text/compress-search.d.ts +11 -0
  142. package/dist/text/compress-search.d.ts.map +1 -0
  143. package/dist/text/compress-search.js +78 -0
  144. package/dist/text/finalize.d.ts +21 -0
  145. package/dist/text/finalize.d.ts.map +1 -0
  146. package/dist/text/finalize.js +54 -0
  147. package/dist/text/line-utils.d.ts +20 -0
  148. package/dist/text/line-utils.d.ts.map +1 -0
  149. package/dist/text/line-utils.js +65 -0
  150. package/dist/text/lockfile-names.d.ts +3 -0
  151. package/dist/text/lockfile-names.d.ts.map +1 -0
  152. package/dist/text/lockfile-names.js +33 -0
  153. package/dist/text/log-template.d.ts +31 -0
  154. package/dist/text/log-template.d.ts.map +1 -0
  155. package/dist/text/log-template.js +239 -0
  156. package/dist/tokens/estimate-tokens.d.ts +17 -0
  157. package/dist/tokens/estimate-tokens.d.ts.map +1 -0
  158. package/dist/tokens/estimate-tokens.js +53 -0
  159. package/dist/tokens/token-savings.d.ts +20 -0
  160. package/dist/tokens/token-savings.d.ts.map +1 -0
  161. package/dist/tokens/token-savings.js +1 -0
  162. package/package.json +52 -0
@@ -0,0 +1,344 @@
1
+ import { EContentType } from "../content/content-type.js";
2
+ import { ECompressionStrategy } from "../result/compression-strategy.js";
3
+ import { splitLines, queryTokens, queryOverlap, elide } from "./line-utils.js";
4
+ import { finalizeLossy, passthroughResult } from "./finalize.js";
5
+ import { formatCcrMarker } from "../ccr/ccr-marker.js";
6
+ import { isLockfileName } from "./lockfile-names.js";
7
// Section-header prefixes that can be recognised from column 0 alone: a hunk
// body line begins with ` `, `+` or `-`, so none of these can be diff content.
// `--- ` and `+++ ` are deliberately absent: they look exactly like removed /
// added content lines and are therefore handled separately.
const SAFE_HEADER_RE = /^(?:diff --git |index |new file|deleted file|old mode|new mode|similarity |rename |copy )/;
// Start of a hunk header line (`@@ -a,b +c,d @@`).
const HUNK_HEADER_RE = /^@@ /;
12
/**
 * Reports whether `line` is an added or removed content line of a unified diff.
 *
 * A file header is always `+++ ` / `--- ` followed by a path, so only that
 * exact form (three markers and a space) is excluded. Content that merely
 * starts with the same characters — an added `++i;` (rendered `+++i;`) or a
 * removed `--flag` (rendered `---flag`) — is a genuine change and is reported
 * as one. A removed line whose own text starts with `-- ` still renders as
 * `--- …` and cannot be told apart from a header here; callers that know they
 * are inside a hunk must resolve that case themselves.
 *
 * @param line One diff line, without its trailing newline.
 * @returns `true` for `+`/`-` content lines, `false` for everything else.
 */
function isChangeLine(line) {
    const marker = line.charAt(0);
    if (marker !== '+' && marker !== '-')
        return false;
    // `+++ path` / `--- path`: the marker three times, then a space.
    return !line.startsWith(`${marker.repeat(3)} `);
}
19
+ /**
20
+ * Reduce a unified diff to its highest-signal lines. Two passes compose:
21
+ *
22
+ * 1. **Diff-noise offload** (this wrapper): lockfile sections
23
+ * (`package-lock.json` & friends) collapse to a one-line marker, and
24
+ * whitespace-only hunks (pure reindentation) collapse to a marker — the
25
+ * single largest sources of useless diff tokens. Both are CCR-recoverable.
26
+ * 2. **Core hunk compression** ({@link compressDiffCore}): the remaining real
27
+ * changes keep their changed lines plus a tight context window, capped per
28
+ * file.
29
+ *
30
+ * The offload pass only engages when a `diff --git` section is actually a
31
+ * lockfile or contains a whitespace-only hunk; every other diff routes straight
32
+ * to the core compressor, byte-identical to before. Recoverable via CCR; output
33
+ * favours LLM readability over `git apply` fidelity.
34
+ */
35
+ export function compressDiff(text, opts = {}) {
36
+ const lines = splitLines(text);
37
+ const sections = segmentDiffSections(lines);
38
+ // Only take the offload path when the diff cleanly segments into `diff --git`
39
+ // sections AND at least one is noise. Anything else stays on the core path.
40
+ const hasNoise = sections !== null &&
41
+ sections.some((s) => s.isLockfile || s.hunks.some((h) => h.whitespaceOnly));
42
+ if (!sections || !hasNoise)
43
+ return compressDiffCore(text, opts);
44
+ const out = [];
45
+ let lockfileCount = 0;
46
+ let wsHunkCount = 0;
47
+ for (const s of sections) {
48
+ if (s.isLockfile) {
49
+ lockfileCount += 1;
50
+ const sectionText = s.lines.join('\n');
51
+ const elided = s.lines.length - s.headerLines.length;
52
+ const key = opts.store ? opts.store.put(sectionText) : undefined;
53
+ const marker = `[lockfile ${s.basename}: ${plural(elided, 'line')} elided${key ? ` ${formatCcrMarker(key)}` : ''}]`;
54
+ out.push([...s.headerLines, marker].join('\n'));
55
+ continue;
56
+ }
57
+ const wsHunks = s.hunks.filter((h) => h.whitespaceOnly);
58
+ const normalHunks = s.hunks.filter((h) => !h.whitespaceOnly);
59
+ if (wsHunks.length === 0) {
60
+ // No noise in this section — compress it with the core pass (per-section
61
+ // CCR is handled once over the whole diff by finalizeLossy below).
62
+ out.push(compressDiffCore(s.lines.join('\n'), { ...opts, store: undefined }).compressed);
63
+ continue;
64
+ }
65
+ wsHunkCount += wsHunks.length;
66
+ const normalSection = [...s.headerLines, ...normalHunks.flatMap((h) => h.lines)];
67
+ const compressedNormal = normalHunks.length > 0
68
+ ? compressDiffCore(normalSection.join('\n'), { ...opts, store: undefined }).compressed
69
+ : s.headerLines.join('\n');
70
+ const wsText = wsHunks.flatMap((h) => h.lines).join('\n');
71
+ const wsLines = wsHunks.reduce((n, h) => n + h.lines.length, 0);
72
+ const key = opts.store ? opts.store.put(wsText) : undefined;
73
+ const wsMarker = `[whitespace-only: ${plural(wsHunks.length, 'hunk')}, ${plural(wsLines, 'line')} elided${key ? ` ${formatCcrMarker(key)}` : ''}]`;
74
+ out.push([compressedNormal, wsMarker].join('\n'));
75
+ }
76
+ const note = `full diff: ${plural(lockfileCount, 'lockfile')} + ${plural(wsHunkCount, 'whitespace hunk')} offloaded`;
77
+ return finalizeLossy({
78
+ original: text,
79
+ body: out.join('\n'),
80
+ contentType: EContentType.GitDiff,
81
+ strategy: ECompressionStrategy.Diff,
82
+ opts,
83
+ note,
84
+ });
85
+ }
86
/** Formats a count with its noun, appending `s` unless the count is exactly 1. */
function plural(n, noun) {
    const suffix = n === 1 ? '' : 's';
    return `${n} ${noun}${suffix}`;
}
89
+ /**
90
+ * Split a diff into per-file sections at `diff --git` boundaries. Returns null
91
+ * when the diff doesn't cleanly start with a `diff --git` section (preamble,
92
+ * headerless `diff -u`, etc.) so the caller falls back to the core path rather
93
+ * than risk a fragile segmentation.
94
+ */
95
+ function segmentDiffSections(lines) {
96
+ const starts = [];
97
+ for (let i = 0; i < lines.length; i += 1) {
98
+ if ((lines[i] ?? '').startsWith('diff --git '))
99
+ starts.push(i);
100
+ }
101
+ if (starts.length === 0 || starts[0] !== 0)
102
+ return null;
103
+ const sections = [];
104
+ for (let k = 0; k < starts.length; k += 1) {
105
+ const begin = starts[k];
106
+ const end = k + 1 < starts.length ? starts[k + 1] : lines.length;
107
+ const sectionLines = lines.slice(begin, end);
108
+ sections.push(buildSection(sectionLines));
109
+ }
110
+ return sections;
111
+ }
112
+ function buildSection(sectionLines) {
113
+ let firstHunk = sectionLines.findIndex((l) => HUNK_HEADER_RE.test(l));
114
+ if (firstHunk < 0)
115
+ firstHunk = sectionLines.length;
116
+ const headerLines = sectionLines.slice(0, firstHunk);
117
+ const hunks = [];
118
+ let cur = null;
119
+ for (let i = firstHunk; i < sectionLines.length; i += 1) {
120
+ const line = sectionLines[i] ?? '';
121
+ if (HUNK_HEADER_RE.test(line)) {
122
+ if (cur)
123
+ hunks.push({ lines: cur, whitespaceOnly: isWhitespaceOnlyHunk(cur) });
124
+ cur = [line];
125
+ }
126
+ else if (cur) {
127
+ cur.push(line);
128
+ }
129
+ }
130
+ if (cur)
131
+ hunks.push({ lines: cur, whitespaceOnly: isWhitespaceOnlyHunk(cur) });
132
+ const basename = sectionPath(headerLines);
133
+ return {
134
+ lines: sectionLines,
135
+ headerLines,
136
+ hunks,
137
+ basename,
138
+ isLockfile: basename.length > 0 && isLockfileName(basename),
139
+ };
140
+ }
141
/**
 * Extract the changed file's basename from a section's header lines.
 *
 * The first `+++ b/<path>` line wins; if it yields no path, the `b/` side of
 * the leading `diff --git a/… b/…` line is used instead. Anything after a tab
 * is dropped and the rest is trimmed before the last `/`-separated component
 * is taken.
 *
 * @param headerLines Lines of the section that precede its first hunk.
 * @returns The basename, or `''` when no path could be parsed.
 */
function sectionPath(headerLines) {
    let plusMatch = null;
    for (const header of headerLines) {
        plusMatch = /^\+\+\+ b\/(.*)$/.exec(header);
        if (plusMatch)
            break;
    }
    let fullPath = plusMatch ? (plusMatch[1] ?? '') : '';
    if (fullPath === '') {
        const gitMatch = /^diff --git a\/.+ b\/(.+)$/.exec(headerLines[0] ?? '');
        fullPath = gitMatch ? (gitMatch[1] ?? '') : '';
    }
    // Guard against a stray tab-separated suffix after the path.
    const [beforeTab = ''] = fullPath.split('\t');
    const segments = beforeTab.trim().split('/');
    return segments[segments.length - 1] ?? '';
}
160
/**
 * True when a hunk's only real change is whitespace: after collapsing runs of
 * whitespace and trimming, the removed lines and the added lines form the same
 * multiset (every `-x` is answered by a `+x` that differs only in
 * leading/trailing/internal whitespace). A hunk with no removed lines, or with
 * unequal numbers of removed and added lines, is never whitespace-only — so a
 * pure-context hunk is left for the core pass.
 *
 * Every line starting with `-` or `+` counts as content, including a removed
 * `-- note` (rendered `--- note`) or an added `++i` (rendered `+++i`). Treating
 * those as file headers would let a real addition or deletion hide behind a
 * reindent and be offloaded as noise. Should a genuine `---`/`+++` header ever
 * appear in the input, counting it can only make the result `false`, which is
 * the safe direction (nothing gets elided).
 *
 * @param hunkLines The hunk's lines, starting with its `@@` header.
 * @returns Whether the hunk can be treated as whitespace-only noise.
 */
function isWhitespaceOnlyHunk(hunkLines) {
    const normalize = (content) => content.replace(/\s+/g, ' ').trim();
    const removed = [];
    const added = [];
    for (const line of hunkLines) {
        if (line.startsWith('-'))
            removed.push(normalize(line.slice(1)));
        else if (line.startsWith('+'))
            added.push(normalize(line.slice(1)));
    }
    if (removed.length === 0 || removed.length !== added.length)
        return false;
    // Both arrays are local, so sorting in place is safe.
    removed.sort();
    added.sort();
    return removed.every((value, index) => value === added[index]);
}
183
+ /**
184
+ * Reduce a unified diff to the changed lines plus a tight context window,
185
+ * capping hunks per file (first + last + highest-scoring kept). File headers
186
+ * are preserved so the diff stays attributable; trimmed context and dropped
187
+ * hunks are elided. Recoverable via CCR. Output favours LLM readability over
188
+ * `git apply` fidelity.
189
+ */
190
+ function compressDiffCore(text, opts = {}) {
191
+ const lines = splitLines(text);
192
+ const minLines = opts.minLines ?? 12;
193
+ if (lines.length < minLines)
194
+ return passthroughResult(text, EContentType.GitDiff);
195
+ const tokens = queryTokens(opts.query);
196
+ const maxContext = 3;
197
+ const maxHunks = opts.maxItems ?? 12;
198
+ const fileHeaderLines = new Map();
199
+ const hunks = [];
200
+ let currentFile = -1;
201
+ let current = null;
202
+ let sawGitHeader = false; // a `diff --git` just opened the current file
203
+ let expectPlusHeader = false; // a `--- ` file header was just seen; its `+++ ` partner is next
204
+ const registerHeader = (i) => {
205
+ if (currentFile < 0) {
206
+ currentFile = 0;
207
+ if (!fileHeaderLines.has(0))
208
+ fileHeaderLines.set(0, []);
209
+ }
210
+ const list = fileHeaderLines.get(currentFile) ?? [];
211
+ list.push(i);
212
+ fileHeaderLines.set(currentFile, list);
213
+ current = null; // header lines sit between hunks
214
+ };
215
+ for (let i = 0; i < lines.length; i += 1) {
216
+ const line = lines[i] ?? '';
217
+ if (line.startsWith('diff --git ')) {
218
+ currentFile += 1;
219
+ fileHeaderLines.set(currentFile, [i]);
220
+ current = null;
221
+ sawGitHeader = true;
222
+ expectPlusHeader = false;
223
+ continue;
224
+ }
225
+ if (HUNK_HEADER_RE.test(line)) {
226
+ if (currentFile < 0) {
227
+ currentFile = 0;
228
+ if (!fileHeaderLines.has(0))
229
+ fileHeaderLines.set(0, []);
230
+ }
231
+ current = { file: currentFile, header: i, body: [], changeCount: 0, score: 0 };
232
+ hunks.push(current);
233
+ sawGitHeader = false;
234
+ expectPlusHeader = false;
235
+ continue;
236
+ }
237
+ // A `--- ` line is a file header only when its `+++ ` partner follows AND a
238
+ // hunk header comes next — a real header is immediately followed by `@@`.
239
+ // This rejects an in-hunk deleted/added content pair (`--- foo` / `+++ bar`)
240
+ // that would otherwise be mistaken for a new file. It begins a NEW file
241
+ // unless a `diff --git` already opened this one (headerless `diff -u`).
242
+ if (line.startsWith('--- ') &&
243
+ (lines[i + 1] ?? '').startsWith('+++ ') &&
244
+ (lines[i + 2] ?? '').startsWith('@@')) {
245
+ if (!sawGitHeader) {
246
+ currentFile += 1;
247
+ fileHeaderLines.set(currentFile, []);
248
+ }
249
+ registerHeader(i);
250
+ sawGitHeader = false;
251
+ expectPlusHeader = true;
252
+ continue;
253
+ }
254
+ if (expectPlusHeader && line.startsWith('+++ ')) {
255
+ registerHeader(i);
256
+ expectPlusHeader = false;
257
+ continue;
258
+ }
259
+ if (SAFE_HEADER_RE.test(line)) {
260
+ registerHeader(i);
261
+ continue;
262
+ }
263
+ if (current) {
264
+ current.body.push(i);
265
+ if (isChangeLine(line)) {
266
+ current.changeCount += 1;
267
+ current.score += queryOverlap(line, tokens) * 0.3;
268
+ }
269
+ }
270
+ }
271
+ if (hunks.length === 0)
272
+ return passthroughResult(text, EContentType.GitDiff);
273
+ // Per-file hunk cap: always keep first + last, fill remainder by score.
274
+ const keptHunks = new Set();
275
+ const byFile = new Map();
276
+ for (const h of hunks) {
277
+ const list = byFile.get(h.file) ?? [];
278
+ list.push(h);
279
+ byFile.set(h.file, list);
280
+ }
281
+ for (const list of byFile.values()) {
282
+ if (list.length <= maxHunks) {
283
+ for (const h of list)
284
+ keptHunks.add(h);
285
+ continue;
286
+ }
287
+ // Select per file so the cap is honoured exactly: the first hunk, the last
288
+ // (only if the cap allows two), then the highest-scoring until full. A
289
+ // per-file set is the source of truth — a global counter has cross-file
290
+ // slack and lets one file overflow by one.
291
+ const fileKept = new Set();
292
+ fileKept.add(list[0]);
293
+ if (maxHunks >= 2)
294
+ fileKept.add(list[list.length - 1]);
295
+ const ranked = [...list].sort((a, b) => (b.score - a.score) || (b.changeCount - a.changeCount) || (a.header - b.header));
296
+ for (const h of ranked) {
297
+ if (fileKept.size >= maxHunks)
298
+ break;
299
+ fileKept.add(h);
300
+ }
301
+ for (const h of fileKept)
302
+ keptHunks.add(h);
303
+ }
304
+ const keep = new Set();
305
+ const filesWithKeptHunk = new Set();
306
+ for (const h of keptHunks)
307
+ filesWithKeptHunk.add(h.file);
308
+ for (const [file, headerLines] of fileHeaderLines) {
309
+ if (filesWithKeptHunk.has(file))
310
+ for (const i of headerLines)
311
+ keep.add(i);
312
+ }
313
+ for (const h of keptHunks) {
314
+ keep.add(h.header);
315
+ // Mark change-line positions, then keep context within ±maxContext.
316
+ const changePos = new Set();
317
+ for (let p = 0; p < h.body.length; p += 1) {
318
+ const li = h.body[p];
319
+ if (isChangeLine(lines[li] ?? ''))
320
+ changePos.add(p);
321
+ }
322
+ for (let p = 0; p < h.body.length; p += 1) {
323
+ const li = h.body[p];
324
+ let near = changePos.has(p);
325
+ if (!near) {
326
+ for (let d = 1; d <= maxContext && !near; d += 1) {
327
+ if (changePos.has(p - d) || changePos.has(p + d))
328
+ near = true;
329
+ }
330
+ }
331
+ if (near)
332
+ keep.add(li);
333
+ }
334
+ }
335
+ const body = elide(lines, keep);
336
+ return finalizeLossy({
337
+ original: text,
338
+ body,
339
+ contentType: EContentType.GitDiff,
340
+ strategy: ECompressionStrategy.Diff,
341
+ opts,
342
+ note: `full diff: ${hunks.length} hunks across ${byFile.size} files`,
343
+ });
344
+ }
@@ -0,0 +1,12 @@
1
+ import { EContentType } from '../content/content-type.js';
2
+ import type { ICompressionResult } from '../result/compression-result.js';
3
+ import type { ICompressOptions } from '../result/compress-options.js';
4
+ /**
5
+ * Conservative generic reduction for prose / plain text: drop exact-duplicate
6
+ * non-blank lines (keeping the first occurrence) and collapse runs of blank
7
+ * lines. Prose with little repetition passes through unchanged — which is the
8
+ * honest outcome; structured content should route to a typed compressor
9
+ * instead. Recoverable via CCR.
+ *
+ * @param text The text to reduce.
+ * @param contentType Content type recorded on the result; the implementation
+ * defaults it to `EContentType.PlainText`.
+ * @param opts Compression options; input with fewer lines than `opts.minLines`
+ * (default 8 in the implementation) is passed through.
+ * @returns The compression result.
10
+ */
11
+ export declare function compressLines(text: string, contentType?: EContentType, opts?: ICompressOptions): ICompressionResult;
12
+ //# sourceMappingURL=compress-lines.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"compress-lines.d.ts","sourceRoot":"","sources":["../../src/text/compress-lines.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,YAAY,EAAE,MAAM,4BAA4B,CAAC;AAE1D,OAAO,KAAK,EAAE,kBAAkB,EAAE,MAAM,iCAAiC,CAAC;AAC1E,OAAO,KAAK,EAAE,gBAAgB,EAAE,MAAM,+BAA+B,CAAC;AAItE;;;;;;GAMG;AACH,wBAAgB,aAAa,CAC3B,IAAI,EAAE,MAAM,EACZ,WAAW,GAAE,YAAqC,EAClD,IAAI,GAAE,gBAAqB,GAC1B,kBAAkB,CA+BpB"}
@@ -0,0 +1,44 @@
1
+ import { EContentType } from "../content/content-type.js";
2
+ import { ECompressionStrategy } from "../result/compression-strategy.js";
3
+ import { splitLines, elide } from "./line-utils.js";
4
+ import { finalizeLossy, passthroughResult } from "./finalize.js";
5
+ /**
6
+ * Conservative generic reduction for prose / plain text: drop exact-duplicate
7
+ * non-blank lines (keeping the first occurrence) and collapse runs of blank
8
+ * lines. Prose with little repetition passes through unchanged — which is the
9
+ * honest outcome; structured content should route to a typed compressor
10
+ * instead. Recoverable via CCR.
11
+ */
12
+ export function compressLines(text, contentType = EContentType.PlainText, opts = {}) {
13
+ const lines = splitLines(text);
14
+ const minLines = opts.minLines ?? 8;
15
+ if (lines.length < minLines)
16
+ return passthroughResult(text, contentType);
17
+ const keep = new Set();
18
+ const seen = new Set();
19
+ let prevBlank = false;
20
+ for (let i = 0; i < lines.length; i += 1) {
21
+ const line = lines[i] ?? '';
22
+ const blank = line.trim().length === 0;
23
+ if (blank) {
24
+ if (!prevBlank)
25
+ keep.add(i);
26
+ prevBlank = true;
27
+ continue;
28
+ }
29
+ prevBlank = false;
30
+ if (seen.has(line))
31
+ continue; // exact duplicate — drop
32
+ seen.add(line);
33
+ keep.add(i);
34
+ }
35
+ const body = elide(lines, keep);
36
+ return finalizeLossy({
37
+ original: text,
38
+ body,
39
+ contentType,
40
+ strategy: ECompressionStrategy.Lines,
41
+ opts,
42
+ note: `full text: ${lines.length} lines`,
43
+ });
44
+ }
@@ -0,0 +1,12 @@
1
+ import type { ICompressionResult } from '../result/compression-result.js';
2
+ import type { ICompressOptions } from '../result/compress-options.js';
3
+ /**
4
+ * Reduce build / test / runtime logs to their signal: errors and their FULL
5
+ * multi-frame stack traces, the exception punchline, de-duplicated warnings,
6
+ * summary lines, and first/last anchors. The rest is elided. When a hard
7
+ * `maxItems` cap applies, lines are dropped by PRIORITY, never by position: the
8
+ * closing summary is always kept, then errors > other summaries > query matches
9
+ * > anchors > other. Order-preserving; the full log is recoverable via CCR.
10
+ */
11
+ export declare function compressLog(text: string, opts?: ICompressOptions): ICompressionResult;
12
+ //# sourceMappingURL=compress-log.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"compress-log.d.ts","sourceRoot":"","sources":["../../src/text/compress-log.ts"],"names":[],"mappings":"AAEA,OAAO,KAAK,EAAE,kBAAkB,EAAE,MAAM,iCAAiC,CAAC;AAC1E,OAAO,KAAK,EAAE,gBAAgB,EAAE,MAAM,+BAA+B,CAAC;AA0BtE;;;;;;;GAOG;AACH,wBAAgB,WAAW,CAAC,IAAI,EAAE,MAAM,EAAE,IAAI,GAAE,gBAAqB,GAAG,kBAAkB,CAsIzF"}
@@ -0,0 +1,202 @@
1
+ import { EContentType } from "../content/content-type.js";
2
+ import { ECompressionStrategy } from "../result/compression-strategy.js";
3
+ import { splitLines, dedupeKey, queryTokens } from "./line-utils.js";
4
+ import { finalizeLossy, passthroughResult } from "./finalize.js";
5
+ import { mineLogTemplates } from "./log-template.js";
6
+ import { bm25Scores } from "../relevance/bm25.js";
7
+ import { formatCcrMarker } from "../ccr/ccr-marker.js";
8
// Generic failure keywords, matched as whole words, case-insensitively.
const ERROR_RE = /\b(?:ERROR|FATAL|FAIL(?:ED|URE)?|EXCEPTION|panic)\b/i;
// High-signal failure lines that often carry NONE of the ERROR/FATAL/FAIL
// keywords yet ARE the root cause: native crashes (segfault, core dump, bus
// error), the OOM killer, linker errors, fatal POSIX signals, and assertion
// failures. Treated exactly like an error line. Keeping an occasional benign
// match costs one extra line — far cheaper than dropping the actual cause.
//
// `cannot find -l` consumes the rest of its token (`-lfoo`) before the closing
// `\b`: with a bare `-l\b` the pattern could never match `-lfoo`, because `l`
// and `f` are both word characters and no boundary lies between them.
const FATAL_SIGNAL_RE = /\b(?:segmentation fault|segfault|core dumped|bus error|out of memory|oom[- ]?kill(?:er|ed)?|killed process|undefined reference to|undefined symbol|symbol\(s\) not found|cannot find -l\S*|assertion (?:failed|.*failed)|SIG(?:SEGV|ABRT|KILL|BUS|FPE|ILL)\b|signal \d+|Aborted)\b/i;
const WARN_RE = /\bWARN(?:ING)?\b/i;
// Result / summary lines. The closing `\b` is attached only to alternatives
// that END in a word character: after a colon (`Tests:`, `Test Suites:`,
// `Summary:`) a `\b` would demand an immediately following word character and
// so reject the usual `Tests: 3 passed` / `Summary: …` shape. `SUCCESS(?:FUL)?`
// also accepts Gradle's `BUILD SUCCESSFUL`.
const SUMMARY_RE = /\b(?:\d+ (?:passed|failed|error|errors|skipped)\b|Tests:|Test Suites:|collected \d+\b|BUILD (?:SUCCESS(?:FUL)?|FAIL(?:ED|URE)?)\b|Summary:)|^[✓✗×]/;
// One stack frame: an indented `at …` or `File "…", line N`.
const STACK_RE = /^\s+(?:at\s+\S+|File ".*", line \d+)/;
// Start of a multi-frame trace region.
const TRACEBACK_START = /^\s*Traceback\b|^\s*Caused by:|^\s*Exception in thread\b/;
// The punchline of a trace: `ValueError: boom`, `java.lang.NullPointerException: null`.
const EXCEPTION_SUMMARY = /^[\w.$]*(?:Error|Exception|Warning|Panic)\b.*:/;
24
+ /**
25
+ * Reduce build / test / runtime logs to their signal: errors and their FULL
26
+ * multi-frame stack traces, the exception punchline, de-duplicated warnings,
27
+ * summary lines, and first/last anchors. The rest is elided. When a hard
28
+ * `maxItems` cap applies, lines are dropped by PRIORITY (summaries > errors >
29
+ * anchors > other), never by position — so the closing summary always survives.
30
+ * Deterministic and order-preserving; the full log is recoverable via CCR.
31
+ */
32
+ export function compressLog(text, opts = {}) {
33
+ const lines = splitLines(text);
34
+ const minLines = opts.minLines ?? 12;
35
+ if (lines.length < minLines)
36
+ return passthroughResult(text, EContentType.BuildLog);
37
+ const tokens = queryTokens(opts.query);
38
+ // P3.2: BM25 relevance for the query (idf-weighted, length-normalized, ID-term
39
+ // boosted). Computed only when a query is present, so the no-query path is
40
+ // unchanged.
41
+ const relScores = opts.query ? bm25Scores(opts.query, lines) : null;
42
+ const keep = new Set();
43
+ const errorIdx = new Set();
44
+ const summaryIdx = new Set();
45
+ const anchorIdx = new Set();
46
+ const queryIdx = new Set();
47
+ const seenWarn = new Set();
48
+ let stackActive = false;
49
+ let inFrameSource = false; // we are inside a frame's indented source block
50
+ for (const i of [0, 1, lines.length - 2, lines.length - 1]) {
51
+ if (i >= 0 && i < lines.length) {
52
+ keep.add(i);
53
+ anchorIdx.add(i);
54
+ }
55
+ }
56
+ for (let i = 0; i < lines.length; i += 1) {
57
+ const line = lines[i] ?? '';
58
+ const isSummary = SUMMARY_RE.test(line);
59
+ // Errors / trace starts are handled FIRST so trace control flow is correct,
60
+ // but a line that is ALSO a summary (e.g. "Tests: 1 failed" — "failed"
61
+ // matches ERROR_RE) is still tagged into summaryIdx so the cap ranks it as
62
+ // a summary. This keeps multi-frame traces intact while letting the closing
63
+ // result survive a tight cap.
64
+ if (ERROR_RE.test(line) ||
65
+ FATAL_SIGNAL_RE.test(line) ||
66
+ TRACEBACK_START.test(line) ||
67
+ EXCEPTION_SUMMARY.test(line)) {
68
+ keep.add(i);
69
+ errorIdx.add(i);
70
+ if (isSummary)
71
+ summaryIdx.add(i);
72
+ if (i - 1 >= 0)
73
+ keep.add(i - 1);
74
+ stackActive = true;
75
+ inFrameSource = false;
76
+ continue;
77
+ }
78
+ if (stackActive) {
79
+ if (line.trim().length === 0) {
80
+ stackActive = false; // a blank line ends the trace region
81
+ inFrameSource = false;
82
+ }
83
+ else if (STACK_RE.test(line)) {
84
+ keep.add(i); // a real stack frame (`at …` / `File …`)
85
+ errorIdx.add(i);
86
+ inFrameSource = true;
87
+ continue;
88
+ }
89
+ else if (inFrameSource && /^\s/.test(line)) {
90
+ keep.add(i); // indented source line(s) under a frame — keep the whole block
91
+ errorIdx.add(i);
92
+ continue;
93
+ }
94
+ else if (/^\s/.test(line)) {
95
+ // Indented, but NOT after a frame (e.g. a captured-stdout / locals dump
96
+ // straight after the error) — drop it, but stay in the trace region.
97
+ continue;
98
+ }
99
+ else {
100
+ stackActive = false; // a dedented non-trace line ends the region — re-check it below
101
+ inFrameSource = false;
102
+ }
103
+ }
104
+ if (isSummary) {
105
+ keep.add(i);
106
+ summaryIdx.add(i);
107
+ continue;
108
+ }
109
+ if (WARN_RE.test(line)) {
110
+ const k = dedupeKey(line);
111
+ if (!seenWarn.has(k)) {
112
+ seenWarn.add(k);
113
+ keep.add(i);
114
+ }
115
+ continue;
116
+ }
117
+ if (relScores && relScores[i] > 0) {
118
+ keep.add(i);
119
+ queryIdx.add(i);
120
+ }
121
+ }
122
+ // Hard cap: force-keep the CLOSING summary (the last summary line) so the
123
+ // test/build result always survives, then fill the rest errors-first, then
124
+ // other summaries, then anchors, then the remainder. This keeps both the real
125
+ // error and the closing result even when summary-shaped noise is abundant.
126
+ if (opts.maxItems && keep.size > opts.maxItems) {
127
+ const cap = opts.maxItems;
128
+ const chosen = new Set();
129
+ const summaries = [...summaryIdx].sort((a, b) => a - b);
130
+ if (summaries.length > 0)
131
+ chosen.add(summaries[summaries.length - 1]);
132
+ const rank = (i) => errorIdx.has(i) ? 0 : summaryIdx.has(i) ? 1 : queryIdx.has(i) ? 2 : anchorIdx.has(i) ? 3 : 4;
133
+ // Within a tier, the more query-relevant line (higher BM25) wins; rel is 0
134
+ // for non-query lines, so this is a no-op tiebreak without a query.
135
+ const rel = (i) => (relScores ? relScores[i] : 0);
136
+ const rest = [...keep]
137
+ .filter((i) => !chosen.has(i))
138
+ .sort((a, b) => rank(a) - rank(b) || rel(b) - rel(a) || a - b);
139
+ for (const i of rest) {
140
+ if (chosen.size >= cap)
141
+ break;
142
+ chosen.add(i);
143
+ }
144
+ keep.clear();
145
+ for (const i of chosen)
146
+ keep.add(i);
147
+ }
148
+ // P2.2: collapse repetitive runs of KEPT lines (summary/query spam) into
149
+ // lossless template blocks. Mining only the *kept* runs is the key: lines the
150
+ // selector drops stay dropped (a one-line `… omitted …` always beats keeping
151
+ // a template block), so noise logs never regress — only signal the agent
152
+ // actually sees gets the lossless columnar collapse.
153
+ //
154
+ // P4.5: when a CCR store is present, cache the original up front and stamp its
155
+ // key into each elision hint, so the agent can tell a root cause was dropped
156
+ // RIGHT THERE and retrieve it. finalizeLossy reuses this same key (and skips
157
+ // its own trailing marker since the body already carries it).
158
+ const ccrKey = opts.store ? opts.store.put(text) : undefined;
159
+ const body = elideWithTemplates(lines, keep, ccrKey);
160
+ return finalizeLossy({
161
+ original: text,
162
+ body,
163
+ contentType: EContentType.BuildLog,
164
+ strategy: ECompressionStrategy.Log,
165
+ opts,
166
+ note: `full log: ${lines.length} lines`,
167
+ });
168
+ }
169
+ /**
170
+ * Like {@link elide}, but each maximal run of consecutive KEPT lines is passed
171
+ * through {@link mineLogTemplates} so repetitive kept lines collapse to a
172
+ * lossless template block. Each dropped run becomes a single hint; when
173
+ * `ccrKey` is given the hint carries `→ <<ccr:KEY>>` so the elided detail is
174
+ * retrievable in place (P4.5).
175
+ */
176
+ function elideWithTemplates(lines, keep, ccrKey) {
177
+ const out = [];
178
+ let dropped = 0;
179
+ const flush = () => {
180
+ if (dropped > 0) {
181
+ const hint = ccrKey ? ` → ${formatCcrMarker(ccrKey)}` : '';
182
+ out.push(`… ${dropped} line${dropped === 1 ? '' : 's'} omitted${hint}`);
183
+ dropped = 0;
184
+ }
185
+ };
186
+ let i = 0;
187
+ while (i < lines.length) {
188
+ if (!keep.has(i)) {
189
+ dropped += 1;
190
+ i += 1;
191
+ continue;
192
+ }
193
+ flush();
194
+ let j = i;
195
+ while (j < lines.length && keep.has(j))
196
+ j += 1;
197
+ out.push(...mineLogTemplates(lines.slice(i, j)).lines);
198
+ i = j;
199
+ }
200
+ flush();
201
+ return out.join('\n');
202
+ }
@@ -0,0 +1,15 @@
1
+ import type { ICompressionResult } from '../result/compression-result.js';
2
+ import type { ICompressOptions } from '../result/compress-options.js';
3
+ /**
4
+ * Markdown-aware reduction that keeps a document's SKELETON — every header, the
5
+ * first line of each section/paragraph, table rows, and a capped run of list
6
+ * items — while thinning paragraph continuations and collapsing fenced code
7
+ * block bodies. Structure is never dropped (headers always survive), so the
8
+ * outline stays navigable; the full document is recoverable via CCR.
9
+ *
10
+ * Note: this runs only when an agent explicitly compresses markdown (via
11
+ * `shrk compress` / `compress_context`). SharkCraft's own briefs/context are
12
+ * never silently passed through it.
+ *
+ * @param text Markdown source to reduce.
+ * @param opts Optional compression options.
+ * @returns The compression result for the document.
13
+ */
14
+ export declare function compressMarkdown(text: string, opts?: ICompressOptions): ICompressionResult;
15
+ //# sourceMappingURL=compress-markdown.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"compress-markdown.d.ts","sourceRoot":"","sources":["../../src/text/compress-markdown.ts"],"names":[],"mappings":"AAEA,OAAO,KAAK,EAAE,kBAAkB,EAAE,MAAM,iCAAiC,CAAC;AAC1E,OAAO,KAAK,EAAE,gBAAgB,EAAE,MAAM,+BAA+B,CAAC;AAStE;;;;;;;;;;GAUG;AACH,wBAAgB,gBAAgB,CAAC,IAAI,EAAE,MAAM,EAAE,IAAI,GAAE,gBAAqB,GAAG,kBAAkB,CA8E9F"}