@shrkcrft/compress 0.1.0-alpha.16
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +21 -0
- package/dist/cache/align-volatile-tokens.d.ts +13 -0
- package/dist/cache/align-volatile-tokens.d.ts.map +1 -0
- package/dist/cache/align-volatile-tokens.js +51 -0
- package/dist/cache/alignment-map.d.ts +23 -0
- package/dist/cache/alignment-map.d.ts.map +1 -0
- package/dist/cache/alignment-map.js +1 -0
- package/dist/cache/alignment-result.d.ts +11 -0
- package/dist/cache/alignment-result.d.ts.map +1 -0
- package/dist/cache/alignment-result.js +1 -0
- package/dist/cache/detect-volatile-tokens.d.ts +10 -0
- package/dist/cache/detect-volatile-tokens.d.ts.map +1 -0
- package/dist/cache/detect-volatile-tokens.js +41 -0
- package/dist/cache/placeholder.d.ts +28 -0
- package/dist/cache/placeholder.d.ts.map +1 -0
- package/dist/cache/placeholder.js +0 -0
- package/dist/cache/restore-volatile-tokens.d.ts +10 -0
- package/dist/cache/restore-volatile-tokens.d.ts.map +1 -0
- package/dist/cache/restore-volatile-tokens.js +21 -0
- package/dist/cache/volatile-classify.d.ts +11 -0
- package/dist/cache/volatile-classify.d.ts.map +1 -0
- package/dist/cache/volatile-classify.js +35 -0
- package/dist/cache/volatile-kind.d.ts +13 -0
- package/dist/cache/volatile-kind.d.ts.map +1 -0
- package/dist/cache/volatile-kind.js +13 -0
- package/dist/cache/volatile-token.d.ts +14 -0
- package/dist/cache/volatile-token.d.ts.map +1 -0
- package/dist/cache/volatile-token.js +1 -0
- package/dist/ccr/ccr-entry.d.ts +13 -0
- package/dist/ccr/ccr-entry.d.ts.map +1 -0
- package/dist/ccr/ccr-entry.js +1 -0
- package/dist/ccr/ccr-key.d.ts +9 -0
- package/dist/ccr/ccr-key.d.ts.map +1 -0
- package/dist/ccr/ccr-key.js +19 -0
- package/dist/ccr/ccr-marker.d.ts +23 -0
- package/dist/ccr/ccr-marker.d.ts.map +1 -0
- package/dist/ccr/ccr-marker.js +30 -0
- package/dist/ccr/ccr-store.d.ts +18 -0
- package/dist/ccr/ccr-store.d.ts.map +1 -0
- package/dist/ccr/ccr-store.js +1 -0
- package/dist/ccr/file-ccr-store.d.ts +19 -0
- package/dist/ccr/file-ccr-store.d.ts.map +1 -0
- package/dist/ccr/file-ccr-store.js +53 -0
- package/dist/ccr/in-memory-ccr-store.d.ts +21 -0
- package/dist/ccr/in-memory-ccr-store.d.ts.map +1 -0
- package/dist/ccr/in-memory-ccr-store.js +45 -0
- package/dist/ccr/ttl-file-ccr-store.d.ts +43 -0
- package/dist/ccr/ttl-file-ccr-store.d.ts.map +1 -0
- package/dist/ccr/ttl-file-ccr-store.js +117 -0
- package/dist/code/compress-code.d.ts +4 -0
- package/dist/code/compress-code.d.ts.map +1 -0
- package/dist/code/compress-code.js +294 -0
- package/dist/compress-content.d.ts +11 -0
- package/dist/compress-content.d.ts.map +1 -0
- package/dist/compress-content.js +79 -0
- package/dist/content/content-type.d.ts +28 -0
- package/dist/content/content-type.d.ts.map +1 -0
- package/dist/content/content-type.js +28 -0
- package/dist/content/detect-content-type.d.ts +9 -0
- package/dist/content/detect-content-type.d.ts.map +1 -0
- package/dist/content/detect-content-type.js +184 -0
- package/dist/content/segment.d.ts +21 -0
- package/dist/content/segment.d.ts.map +1 -0
- package/dist/content/segment.js +117 -0
- package/dist/index.d.ts +61 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +49 -0
- package/dist/json/compress-json.d.ts +18 -0
- package/dist/json/compress-json.d.ts.map +1 -0
- package/dist/json/compress-json.js +139 -0
- package/dist/json/render-compact-json.d.ts +10 -0
- package/dist/json/render-compact-json.d.ts.map +1 -0
- package/dist/json/render-compact-json.js +18 -0
- package/dist/relevance/bm25.d.ts +26 -0
- package/dist/relevance/bm25.d.ts.map +1 -0
- package/dist/relevance/bm25.js +115 -0
- package/dist/result/compress-options.d.ts +26 -0
- package/dist/result/compress-options.d.ts.map +1 -0
- package/dist/result/compress-options.js +1 -0
- package/dist/result/compression-result.d.ts +26 -0
- package/dist/result/compression-result.d.ts.map +1 -0
- package/dist/result/compression-result.js +1 -0
- package/dist/result/compression-strategy.d.ts +30 -0
- package/dist/result/compression-strategy.d.ts.map +1 -0
- package/dist/result/compression-strategy.js +30 -0
- package/dist/table/adaptive-size.d.ts +46 -0
- package/dist/table/adaptive-size.d.ts.map +1 -0
- package/dist/table/adaptive-size.js +170 -0
- package/dist/table/apply-value-dictionaries.d.ts +30 -0
- package/dist/table/apply-value-dictionaries.d.ts.map +1 -0
- package/dist/table/apply-value-dictionaries.js +99 -0
- package/dist/table/column-presence.d.ts +20 -0
- package/dist/table/column-presence.d.ts.map +1 -0
- package/dist/table/column-presence.js +52 -0
- package/dist/table/columnar-json.d.ts +24 -0
- package/dist/table/columnar-json.d.ts.map +1 -0
- package/dist/table/columnar-json.js +83 -0
- package/dist/table/columnar-table.d.ts +24 -0
- package/dist/table/columnar-table.d.ts.map +1 -0
- package/dist/table/columnar-table.js +1 -0
- package/dist/table/compact-object-array.d.ts +12 -0
- package/dist/table/compact-object-array.d.ts.map +1 -0
- package/dist/table/compact-object-array.js +88 -0
- package/dist/table/field-spec.d.ts +13 -0
- package/dist/table/field-spec.d.ts.map +1 -0
- package/dist/table/field-spec.js +1 -0
- package/dist/table/object-map.d.ts +28 -0
- package/dist/table/object-map.d.ts.map +1 -0
- package/dist/table/object-map.js +119 -0
- package/dist/table/render-table.d.ts +11 -0
- package/dist/table/render-table.d.ts.map +1 -0
- package/dist/table/render-table.js +39 -0
- package/dist/table/sample-object-array.d.ts +11 -0
- package/dist/table/sample-object-array.d.ts.map +1 -0
- package/dist/table/sample-object-array.js +171 -0
- package/dist/table/sample-options.d.ts +29 -0
- package/dist/table/sample-options.d.ts.map +1 -0
- package/dist/table/sample-options.js +1 -0
- package/dist/table/sampled-table.d.ts +33 -0
- package/dist/table/sampled-table.d.ts.map +1 -0
- package/dist/table/sampled-table.js +8 -0
- package/dist/table/table-compaction.d.ts +19 -0
- package/dist/table/table-compaction.d.ts.map +1 -0
- package/dist/table/table-compaction.js +1 -0
- package/dist/table/table-formats.d.ts +23 -0
- package/dist/table/table-formats.d.ts.map +1 -0
- package/dist/table/table-formats.js +233 -0
- package/dist/text/compress-diff.d.ts +20 -0
- package/dist/text/compress-diff.d.ts.map +1 -0
- package/dist/text/compress-diff.js +344 -0
- package/dist/text/compress-lines.d.ts +12 -0
- package/dist/text/compress-lines.d.ts.map +1 -0
- package/dist/text/compress-lines.js +44 -0
- package/dist/text/compress-log.d.ts +12 -0
- package/dist/text/compress-log.d.ts.map +1 -0
- package/dist/text/compress-log.js +202 -0
- package/dist/text/compress-markdown.d.ts +15 -0
- package/dist/text/compress-markdown.d.ts.map +1 -0
- package/dist/text/compress-markdown.js +96 -0
- package/dist/text/compress-search.d.ts +11 -0
- package/dist/text/compress-search.d.ts.map +1 -0
- package/dist/text/compress-search.js +78 -0
- package/dist/text/finalize.d.ts +21 -0
- package/dist/text/finalize.d.ts.map +1 -0
- package/dist/text/finalize.js +54 -0
- package/dist/text/line-utils.d.ts +20 -0
- package/dist/text/line-utils.d.ts.map +1 -0
- package/dist/text/line-utils.js +65 -0
- package/dist/text/lockfile-names.d.ts +3 -0
- package/dist/text/lockfile-names.d.ts.map +1 -0
- package/dist/text/lockfile-names.js +33 -0
- package/dist/text/log-template.d.ts +31 -0
- package/dist/text/log-template.d.ts.map +1 -0
- package/dist/text/log-template.js +239 -0
- package/dist/tokens/estimate-tokens.d.ts +17 -0
- package/dist/tokens/estimate-tokens.d.ts.map +1 -0
- package/dist/tokens/estimate-tokens.js +53 -0
- package/dist/tokens/token-savings.d.ts +20 -0
- package/dist/tokens/token-savings.d.ts.map +1 -0
- package/dist/tokens/token-savings.js +1 -0
- package/package.json +52 -0
|
@@ -0,0 +1,96 @@
|
|
|
1
|
+
import { EContentType } from "../content/content-type.js";
|
|
2
|
+
import { ECompressionStrategy } from "../result/compression-strategy.js";
|
|
3
|
+
import { splitLines, queryTokens, queryOverlap, elide } from "./line-utils.js";
|
|
4
|
+
import { finalizeLossy, passthroughResult } from "./finalize.js";
|
|
5
|
+
const HEADER = /^#{1,6}\s/;
const LIST_ITEM = /^\s*(?:[-*+]\s|\d+\.\s)/;
// Captures the fence run (three or more backticks or tildes) so a candidate
// closer can be compared against the opener's character and length.
const FENCE = /^\s*(`{3,}|~{3,})/;
const TABLE_ROW = /^\s*\|/;
// A setext underline: a line made only of `=` (h1) or only of `-` (h2).
const SETEXT_UNDERLINE = /^(?:=+|-+)$/;
/**
 * Decide whether `line` is a fenced-code-block boundary given the current state.
 *
 * - With no fence open (`openMarker === null`) any fence run opens a block,
 *   except a backtick run followed by more backticks on the same line — that
 *   is inline code, not a fence.
 * - With a fence open, only a run of the SAME character, at least as long as
 *   the opener and followed by nothing but whitespace, closes it. Anything
 *   else (a `~~~` inside a backtick block, a shorter nested fence, a fence
 *   carrying an info string) is ordinary block content.
 *
 * Returns the matched fence run when the line is a boundary, otherwise `null`.
 */
function matchFenceBoundary(line, openMarker) {
    const m = FENCE.exec(line);
    if (!m)
        return null;
    const marker = m[1] ?? '';
    const rest = line.slice(m[0].length);
    if (openMarker === null) {
        return marker.startsWith('`') && rest.includes('`') ? null : marker;
    }
    const closes = marker[0] === openMarker[0]
        && marker.length >= openMarker.length
        && rest.trim().length === 0;
    return closes ? marker : null;
}
/**
 * Markdown-aware reduction that keeps a document's skeleton and elides the rest.
 *
 * Kept: every ATX header, every setext header (title plus underline), both
 * fence lines of a code block, every table row, up to `maxItems` (default 8)
 * consecutive list items, and the first prose line after a blank line or a
 * header. When `opts.query` is given, prose lines and code lines inside a
 * fence that contain a query token are kept as well. Everything else is
 * handed to `elide`, which collapses each dropped run into a placeholder.
 *
 * Inputs with fewer than `opts.minLines` lines (default 12) are returned
 * untouched as a passthrough result. The final result is built by
 * `finalizeLossy`, which stores the original when `opts.store` is provided.
 *
 * Fences are tracked by opener: a block opened with backticks is only closed
 * by a backtick fence of at least the same length (likewise for tildes), so
 * fence-like lines inside a code block do not flip the parser state.
 *
 * Note: this runs only when an agent explicitly compresses markdown (via
 * `shrk compress` / `compress_context`). SharkCraft's own briefs/context are
 * never silently passed through it.
 *
 * @param text Markdown source.
 * @param opts Options read here: `minLines`, `maxItems`, `query`; the whole
 *   object is also forwarded to `finalizeLossy`.
 * @returns The compression result from `finalizeLossy` or `passthroughResult`.
 */
export function compressMarkdown(text, opts = {}) {
    const lines = splitLines(text);
    const minLines = opts.minLines ?? 12;
    if (lines.length < minLines)
        return passthroughResult(text, EContentType.Markdown);
    const tokens = queryTokens(opts.query);
    const maxListRun = opts.maxItems && opts.maxItems > 0 ? opts.maxItems : 8;
    const keep = new Set();
    let openFence = null; // fence run of the currently open code block, or null
    let atParagraphStart = true; // first prose line of a paragraph is the lead
    let listRun = 0;
    for (let i = 0; i < lines.length; i += 1) {
        const line = lines[i] ?? '';
        const trimmed = line.trim();
        const fenceMarker = matchFenceBoundary(line, openFence);
        if (fenceMarker !== null) {
            keep.add(i); // keep both fences; interior is elided
            openFence = openFence === null ? fenceMarker : null;
            atParagraphStart = false;
            listRun = 0;
            continue;
        }
        if (openFence !== null) {
            if (tokens.length > 0 && queryOverlap(line, tokens) > 0)
                keep.add(i); // keep query-relevant code lines
            continue;
        }
        if (trimmed.length === 0) {
            atParagraphStart = true;
            listRun = 0;
            continue; // blank runs collapse via elide
        }
        if (HEADER.test(line)) {
            keep.add(i);
            atParagraphStart = true; // the line after a header is a section lead
            listRun = 0;
            continue;
        }
        if (TABLE_ROW.test(line)) {
            keep.add(i); // tables are already dense structure — keep rows
            atParagraphStart = false;
            continue;
        }
        if (LIST_ITEM.test(line)) {
            listRun += 1;
            if (listRun <= maxListRun)
                keep.add(i);
            atParagraphStart = false;
            continue;
        }
        // Setext header: a text line underlined by a run of `=` (h1) or `-` (h2).
        // Keep the title AND its underline so the header survives intact.
        const underline = (lines[i + 1] ?? '').trim();
        if (SETEXT_UNDERLINE.test(underline)) {
            keep.add(i);
            keep.add(i + 1);
            atParagraphStart = true; // the line after a header is a section lead
            listRun = 0;
            i += 1; // consume the underline (the for-loop's increment skips it)
            continue;
        }
        // Prose: keep the lead line of a paragraph/section, drop continuations
        // unless they mention a query token.
        if (atParagraphStart || (tokens.length > 0 && queryOverlap(line, tokens) > 0)) {
            keep.add(i);
        }
        atParagraphStart = false;
        listRun = 0;
    }
    const body = elide(lines, keep);
    return finalizeLossy({
        original: text,
        body,
        contentType: EContentType.Markdown,
        strategy: ECompressionStrategy.Markdown,
        opts,
        note: `full document: ${lines.length} lines`,
    });
}
|
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
import type { ICompressionResult } from '../result/compression-result.js';
|
|
2
|
+
import type { ICompressOptions } from '../result/compress-options.js';
|
|
3
|
+
/**
|
|
4
|
+
* Reduce grep / ripgrep `file:line:` output to the highest-signal matches:
|
|
5
|
+
* the first hit in every file is always kept (so no file silently vanishes),
|
|
6
|
+
* then the top matches per file by query overlap and priority keywords. Lines
|
|
7
|
+
* that aren't matches (headers, blanks) are preserved as structure. Dropped
|
|
8
|
+
* matches are elided; the full output is recoverable via CCR.
|
|
9
|
+
*/
|
|
10
|
+
export declare function compressSearch(text: string, opts?: ICompressOptions): ICompressionResult;
|
|
11
|
+
//# sourceMappingURL=compress-search.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"compress-search.d.ts","sourceRoot":"","sources":["../../src/text/compress-search.ts"],"names":[],"mappings":"AAEA,OAAO,KAAK,EAAE,kBAAkB,EAAE,MAAM,iCAAiC,CAAC;AAC1E,OAAO,KAAK,EAAE,gBAAgB,EAAE,MAAM,+BAA+B,CAAC;AAkBtE;;;;;;GAMG;AACH,wBAAgB,cAAc,CAAC,IAAI,EAAE,MAAM,EAAE,IAAI,GAAE,gBAAqB,GAAG,kBAAkB,CA4D5F"}
|
|
@@ -0,0 +1,78 @@
|
|
|
1
|
+
import { EContentType } from "../content/content-type.js";
|
|
2
|
+
import { ECompressionStrategy } from "../result/compression-strategy.js";
|
|
3
|
+
import { splitLines, elide } from "./line-utils.js";
|
|
4
|
+
import { finalizeLossy, passthroughResult } from "./finalize.js";
|
|
5
|
+
import { computeOptimalK } from "../table/adaptive-size.js";
|
|
6
|
+
import { bm25Scores } from "../relevance/bm25.js";
|
|
7
|
+
// A hit looks like `path:line:text`. The path may start with a Windows drive
// prefix (`C:`), whose colon must not be mistaken for the field separator.
const SEARCH_LINE = /^((?:[A-Za-z]:)?[^\s:]+):(\d+):(.*)$/;
const PRIORITY_RE = /\b(?:ERROR|FAIL|TODO|FIXME|BUG|throw|panic|deprecated)\b/i;
/**
 * Trim grep / ripgrep `file:line:` output down to its most useful hits.
 *
 * Every line that parses as a hit is scored: a 0.5 bonus when its text
 * contains a priority keyword, plus its BM25 relevance when `opts.query` is
 * set. Per file, the first hit is always kept (so no file disappears), and
 * the remaining budget goes to the best-scoring other hits, ties broken by
 * original order. The per-file budget is `opts.maxItems` when provided,
 * otherwise it is sized by `computeOptimalK` between 2 and 8. Lines that are
 * not hits are all kept as structure. Inputs with fewer than two hits pass
 * through unchanged.
 */
export function compressSearch(text, opts = {}) {
    const lines = splitLines(text);
    const hits = [];
    for (const [index, raw] of lines.entries()) {
        const parsed = SEARCH_LINE.exec(raw ?? '');
        if (parsed === null)
            continue;
        const body = parsed[3] ?? '';
        const priorityBonus = PRIORITY_RE.test(body) ? 0.5 : 0;
        hits.push({ index, file: parsed[1] ?? '', body, score: priorityBonus });
    }
    if (hits.length < 2)
        return passthroughResult(text, EContentType.SearchResults);
    const hitBodies = () => hits.map((hit) => hit.body);
    // Query relevance is added on top of the keyword bonus; without a query
    // the keyword bonus alone drives the ranking.
    if (opts.query) {
        const relevance = bm25Scores(opts.query, hitBodies());
        hits.forEach((hit, k) => {
            hit.score += relevance[k];
        });
    }
    // An explicit cap wins; otherwise derive one from the hit texts.
    const perFile = opts.maxItems ?? computeOptimalK(hitBodies(), { min: 2, max: 8 });
    // Bucket hits by file, preserving first-seen file order and hit order.
    const byFile = new Map();
    for (const hit of hits) {
        const bucket = byFile.get(hit.file);
        if (bucket)
            bucket.push(hit);
        else
            byFile.set(hit.file, [hit]);
    }
    // Start from every structural (non-hit) line.
    const hitLines = new Set(hits.map((hit) => hit.index));
    const keep = new Set();
    for (const i of lines.keys()) {
        if (!hitLines.has(i))
            keep.add(i);
    }
    // The first hit of a file is unconditional, so only `perFile - 1` slots
    // remain for the ranked rest — keeping the total at `perFile`, not one more.
    const rankedSlots = Math.max(0, perFile - 1);
    for (const bucket of byFile.values()) {
        const [first, ...others] = bucket;
        keep.add(first.index);
        others.sort((a, b) => (b.score - a.score) || (a.index - b.index));
        for (const chosen of others.slice(0, rankedSlots))
            keep.add(chosen.index);
    }
    return finalizeLossy({
        original: text,
        body: elide(lines, keep),
        contentType: EContentType.SearchResults,
        strategy: ECompressionStrategy.Search,
        opts,
        note: `full results: ${hits.length} matches in ${byFile.size} files`,
    });
}
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
import type { EContentType } from '../content/content-type.js';
|
|
2
|
+
import { ECompressionStrategy } from '../result/compression-strategy.js';
|
|
3
|
+
import type { ICompressionResult } from '../result/compression-result.js';
|
|
4
|
+
import type { ICompressOptions } from '../result/compress-options.js';
|
|
5
|
+
/** A no-op result: the input wasn't worth compressing. */
|
|
6
|
+
export declare function passthroughResult(original: string, contentType: EContentType, note?: string): ICompressionResult;
|
|
7
|
+
/**
|
|
8
|
+
* Wrap a lossy compressor's body into a result: cache the original (when a
|
|
9
|
+
* store is given) and append a CCR retrieval marker, then verify the pass
|
|
10
|
+
* actually saved tokens — if it didn't, fall back to passthrough so a
|
|
11
|
+
* compressor is never a net loss.
|
|
12
|
+
*/
|
|
13
|
+
export declare function finalizeLossy(params: {
|
|
14
|
+
original: string;
|
|
15
|
+
body: string;
|
|
16
|
+
contentType: EContentType;
|
|
17
|
+
strategy: ECompressionStrategy;
|
|
18
|
+
opts: ICompressOptions;
|
|
19
|
+
note: string;
|
|
20
|
+
}): ICompressionResult;
|
|
21
|
+
//# sourceMappingURL=finalize.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"finalize.d.ts","sourceRoot":"","sources":["../../src/text/finalize.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,YAAY,EAAE,MAAM,4BAA4B,CAAC;AAC/D,OAAO,EAAE,oBAAoB,EAAE,MAAM,mCAAmC,CAAC;AACzE,OAAO,KAAK,EAAE,kBAAkB,EAAE,MAAM,iCAAiC,CAAC;AAC1E,OAAO,KAAK,EAAE,gBAAgB,EAAE,MAAM,+BAA+B,CAAC;AAItE,0DAA0D;AAC1D,wBAAgB,iBAAiB,CAC/B,QAAQ,EAAE,MAAM,EAChB,WAAW,EAAE,YAAY,EACzB,IAAI,SAAmC,GACtC,kBAAkB,CASpB;AAED;;;;;GAKG;AACH,wBAAgB,aAAa,CAAC,MAAM,EAAE;IACpC,QAAQ,EAAE,MAAM,CAAC;IACjB,IAAI,EAAE,MAAM,CAAC;IACb,WAAW,EAAE,YAAY,CAAC;IAC1B,QAAQ,EAAE,oBAAoB,CAAC;IAC/B,IAAI,EAAE,gBAAgB,CAAC;IACvB,IAAI,EAAE,MAAM,CAAC;CACd,GAAG,kBAAkB,CAgCrB"}
|
|
@@ -0,0 +1,54 @@
|
|
|
1
|
+
import { ECompressionStrategy } from "../result/compression-strategy.js";
|
|
2
|
+
import { measureSavings } from "../tokens/estimate-tokens.js";
|
|
3
|
+
import { formatCcrMarker } from "../ccr/ccr-marker.js";
|
|
4
|
+
/**
 * Build the result for input that is returned as-is.
 *
 * The `compressed` field is the original text, the strategy is
 * `Passthrough`, `lossy` is false, and savings are measured with the
 * original as both the "before" and the "after" text.
 */
export function passthroughResult(original, contentType, note = 'below threshold — no reduction') {
    const savings = measureSavings(original, original, contentType);
    const strategy = ECompressionStrategy.Passthrough;
    return { compressed: original, contentType, strategy, savings, lossy: false, note };
}
|
|
15
|
+
/**
 * Turn a lossy compressor's `body` into a finished result.
 *
 * Steps, in order:
 *  1. If `body` equals `original` once carriage returns are ignored, nothing
 *     was actually elided — return a passthrough of the original.
 *  2. If `opts.store` is set, store the original and, unless the body already
 *     contains a `<<ccr:` reference to the returned key, append a CCR marker
 *     on a new line.
 *  3. Measure savings; if the output is not strictly smaller than the input,
 *     return a passthrough instead so the compressor is never a net loss.
 *
 * The returned object carries `ccrKey` only when a store produced a key.
 */
export function finalizeLossy(params) {
    const { original, body, contentType, strategy, opts, note } = params;
    // Compressors work on LF-normalized lines, so CRLF input with no elision
    // differs from its body only by `\r`; that must not count as a reduction.
    const withoutCr = (value) => value.replace(/\r/g, '');
    if (withoutCr(body) === withoutCr(original))
        return passthroughResult(original, contentType);
    let key;
    let compressed = body;
    if (opts.store) {
        key = opts.store.put(original);
        // A body that already points at this key inline needs no trailing
        // marker; a body referencing only other keys still gets one. The
        // marker carries just the key — `note` travels in the result object.
        const alreadyReferenced = body.includes(`<<ccr:${key}`);
        if (!alreadyReferenced)
            compressed = `${body}\n${formatCcrMarker(key)}`;
    }
    const savings = measureSavings(original, compressed, contentType);
    const isNetLoss = savings.after >= savings.before;
    if (isNetLoss) {
        return passthroughResult(original, contentType);
    }
    const ccrField = key ? { ccrKey: key } : {};
    return {
        compressed,
        contentType,
        strategy,
        savings,
        lossy: true,
        ...ccrField,
        note,
    };
}
|
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
/** Split into lines, tolerant of CRLF, without inventing a trailing line. */
|
|
2
|
+
export declare function splitLines(text: string): string[];
|
|
3
|
+
/**
|
|
4
|
+
* A normalization key for near-duplicate detection: lowercased, with numbers,
|
|
5
|
+
 * and hex blobs collapsed to placeholders, and whitespace
|
|
6
|
+
* squeezed. Two log/warning lines that differ only in a counter or address
|
|
7
|
+
* share a key, so repeated noise dedupes to one representative.
|
|
8
|
+
*/
|
|
9
|
+
export declare function dedupeKey(line: string): string;
|
|
10
|
+
/** Tokenize a query into lowercase words worth matching (length ≥ 2). */
|
|
11
|
+
export declare function queryTokens(query: string | undefined): string[];
|
|
12
|
+
/** How many query tokens appear in `text` (case-insensitive substring). */
|
|
13
|
+
export declare function queryOverlap(text: string, tokens: readonly string[]): number;
|
|
14
|
+
/**
|
|
15
|
+
* Collapse a set of kept line indices into an elided block: kept lines verbatim,
|
|
16
|
+
 * each dropped run replaced by a single `… N line(s) omitted` placeholder line.
|
|
17
|
+
* Deterministic and order-preserving.
|
|
18
|
+
*/
|
|
19
|
+
export declare function elide(lines: readonly string[], keep: ReadonlySet<number>): string;
|
|
20
|
+
//# sourceMappingURL=line-utils.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"line-utils.d.ts","sourceRoot":"","sources":["../../src/text/line-utils.ts"],"names":[],"mappings":"AAAA,6EAA6E;AAC7E,wBAAgB,UAAU,CAAC,IAAI,EAAE,MAAM,GAAG,MAAM,EAAE,CAEjD;AAED;;;;;GAKG;AACH,wBAAgB,SAAS,CAAC,IAAI,EAAE,MAAM,GAAG,MAAM,CAQ9C;AAED,yEAAyE;AACzE,wBAAgB,WAAW,CAAC,KAAK,EAAE,MAAM,GAAG,SAAS,GAAG,MAAM,EAAE,CAO/D;AAED,2EAA2E;AAC3E,wBAAgB,YAAY,CAAC,IAAI,EAAE,MAAM,EAAE,MAAM,EAAE,SAAS,MAAM,EAAE,GAAG,MAAM,CAM5E;AAED;;;;GAIG;AACH,wBAAgB,KAAK,CAAC,KAAK,EAAE,SAAS,MAAM,EAAE,EAAE,IAAI,EAAE,WAAW,CAAC,MAAM,CAAC,GAAG,MAAM,CAgBjF"}
|
|
@@ -0,0 +1,65 @@
|
|
|
1
|
+
/**
 * Break text into lines on LF or CRLF terminators.
 *
 * Nothing is added or trimmed: text ending in a newline yields a final empty
 * string, and empty text yields `['']`. A lone `\r` is not a terminator.
 */
export function splitLines(text) {
    return text.split(/\r?\n/);
}
|
|
5
|
+
/**
 * Compute a normalization key for near-duplicate detection.
 *
 * The line is lowercased, then rewritten in this order: `0x…` hex literals
 * and bare hex runs of eight or more characters become `<x>`, remaining
 * digit runs become `<n>`, whitespace runs become a single space, and the
 * ends are trimmed. Lines that differ only in a counter or an address
 * therefore share a key.
 */
export function dedupeKey(line) {
    // Order matters: hex forms must be replaced before plain digit runs.
    const rewrites = [
        { pattern: /0x[0-9a-f]+/g, placeholder: '<x>' },
        { pattern: /\b[0-9a-f]{8,}\b/g, placeholder: '<x>' },
        { pattern: /\d+/g, placeholder: '<n>' },
        { pattern: /\s+/g, placeholder: ' ' },
    ];
    let key = line.toLowerCase();
    for (const { pattern, placeholder } of rewrites) {
        key = key.replace(pattern, placeholder);
    }
    return key.trim();
}
|
|
20
|
+
/**
 * Tokenize a query into distinct lowercase words worth matching.
 *
 * The query is lowercased and split on every run of characters that are not
 * letters, combining marks, digits or `_` — in any script, so accented and
 * non-Latin words stay whole instead of being shredded or dropped. Words
 * shorter than two characters are discarded and duplicates are removed,
 * keeping first-seen order.
 *
 * @param query Free-text query; `undefined` or empty yields `[]`.
 * @returns The unique tokens, in order of first appearance.
 */
export function queryTokens(query) {
    if (!query)
        return [];
    const words = query.toLowerCase().split(/[^\p{L}\p{M}\p{N}_]+/u);
    // A Set preserves insertion order, giving de-duplication for free.
    return [...new Set(words.filter((word) => word.length >= 2))];
}
|
|
31
|
+
/**
 * Count the entries of `tokens` that occur in `text` as substrings.
 *
 * `text` is lowercased before matching; tokens are used as given, so they
 * are expected to be lowercase already. Each token entry counts at most once.
 */
export function queryOverlap(text, tokens) {
    if (tokens.length === 0)
        return 0;
    const haystack = text.toLowerCase();
    return tokens.filter((token) => haystack.includes(token)).length;
}
|
|
42
|
+
/**
 * Render `lines` with everything outside `keep` collapsed.
 *
 * Lines whose index is in `keep` are emitted verbatim and in order. Each
 * maximal run of other lines is replaced by one placeholder line of the form
 * `… N lines omitted` (`… 1 line omitted` for a single line). The pieces are
 * joined with `\n`.
 */
export function elide(lines, keep) {
    const placeholder = (count) => `… ${count} line${count === 1 ? '' : 's'} omitted`;
    const out = [];
    let gap = 0; // length of the dropped run currently being accumulated
    for (const [i, line] of lines.entries()) {
        if (!keep.has(i)) {
            gap += 1;
            continue;
        }
        if (gap > 0) {
            out.push(placeholder(gap));
            gap = 0;
        }
        out.push(line ?? '');
    }
    // A dropped run at the very end still needs its placeholder.
    if (gap > 0)
        out.push(placeholder(gap));
    return out.join('\n');
}
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"lockfile-names.d.ts","sourceRoot":"","sources":["../../src/text/lockfile-names.ts"],"names":[],"mappings":"AA8BA,8EAA8E;AAC9E,wBAAgB,cAAc,CAAC,QAAQ,EAAE,MAAM,GAAG,OAAO,CAExD"}
|
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
/**
 * Known dependency-lockfile basenames, stored lowercase.
 *
 * Lockfile diffs are nearly all churn (integrity hashes, resolved versions),
 * which is why the diff compressor is meant to collapse them to a one-line,
 * CCR-recoverable marker. Entries are whole basenames, not extensions, so an
 * unrelated hand-written file such as `versions.lock` is not caught.
 */
const LOCKFILE_BASENAMES = new Set([
    // JavaScript / TypeScript package managers
    'package-lock.json',
    'npm-shrinkwrap.json',
    'yarn.lock',
    'pnpm-lock.yaml',
    'bun.lock',
    'bun.lockb',
    'deno.lock',
    // Python
    'poetry.lock',
    'pipfile.lock',
    'pdm.lock',
    // Other ecosystems
    'packages.lock.json',
    'cargo.lock',
    'go.sum',
    'composer.lock',
    'gemfile.lock',
    'gradle.lockfile',
    'mix.lock',
    'flake.lock',
    'pubspec.lock',
    'packwiz.lock',
]);
/**
 * Report whether `basename` names a known dependency lockfile.
 * The comparison ignores case and expects a bare file name, not a path.
 */
export function isLockfileName(basename) {
    const normalized = basename.toLowerCase();
    return LOCKFILE_BASENAMES.has(normalized);
}
|
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Drain-style log-template mining — a LOSSLESS pre-pass for {@link compressLog}.
|
|
3
|
+
*
|
|
4
|
+
* Repeated structured log lines (`worker-3 processing batch 17 ok` × N) carry
|
|
5
|
+
* almost no new information per line: the fixed words repeat and only a few
|
|
6
|
+
* variable tokens move. This miner tokenizes each line, replaces its variable
|
|
7
|
+
* tokens (numbers, hex, UUIDs, ISO timestamps, quoted strings) with a `{}`
|
|
8
|
+
* placeholder to form a *template*, groups consecutive lines sharing a
|
|
9
|
+
* template, and collapses each run to one template plus a compact per-column
|
|
10
|
+
* encoding of the captured variables.
|
|
11
|
+
*
|
|
12
|
+
* It is **lossless by construction**: a template is exactly the original line
|
|
13
|
+
* with its variable matches replaced by `{}`, so `template ⋈ variables` rebuilds
|
|
14
|
+
* every original line, in order — no CCR needed. {@link reconstructLogTemplates}
|
|
15
|
+
* is the inverse and is exercised by the round-trip tests.
|
|
16
|
+
*
|
|
17
|
+
* Only runs with ≥1 variable column collapse; pure-identical repeats are left
|
|
18
|
+
* for the downstream signal-selector's de-duplication so its behaviour (and the
|
|
19
|
+
* `… N omitted …` markers callers rely on) is preserved.
|
|
20
|
+
*/
|
|
21
|
+
export interface IMinedLog {
|
|
22
|
+
/** The transformed line list with collapsible runs replaced by blocks. */
|
|
23
|
+
lines: string[];
|
|
24
|
+
/** True when at least one run collapsed. */
|
|
25
|
+
reduced: boolean;
|
|
26
|
+
}
|
|
27
|
+
/** Collapse consecutive same-template runs. Lossless; reversible via {@link reconstructLogTemplates}. */
|
|
28
|
+
export declare function mineLogTemplates(lines: readonly string[]): IMinedLog;
|
|
29
|
+
/** Inverse of {@link mineLogTemplates}: expand every block back to its original lines. */
|
|
30
|
+
export declare function reconstructLogTemplates(text: string): string;
|
|
31
|
+
//# sourceMappingURL=log-template.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"log-template.d.ts","sourceRoot":"","sources":["../../src/text/log-template.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;GAmBG;AAiCH,MAAM,WAAW,SAAS;IACxB,0EAA0E;IAC1E,KAAK,EAAE,MAAM,EAAE,CAAC;IAChB,4CAA4C;IAC5C,OAAO,EAAE,OAAO,CAAC;CAClB;AAED,yGAAyG;AACzG,wBAAgB,gBAAgB,CAAC,KAAK,EAAE,SAAS,MAAM,EAAE,GAAG,SAAS,CAuCpE;AA4HD,0FAA0F;AAC1F,wBAAgB,uBAAuB,CAAC,IAAI,EAAE,MAAM,GAAG,MAAM,CAgC5D"}
|