@shrkcrft/compress 0.1.0-alpha.16
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +21 -0
- package/dist/cache/align-volatile-tokens.d.ts +13 -0
- package/dist/cache/align-volatile-tokens.d.ts.map +1 -0
- package/dist/cache/align-volatile-tokens.js +51 -0
- package/dist/cache/alignment-map.d.ts +23 -0
- package/dist/cache/alignment-map.d.ts.map +1 -0
- package/dist/cache/alignment-map.js +1 -0
- package/dist/cache/alignment-result.d.ts +11 -0
- package/dist/cache/alignment-result.d.ts.map +1 -0
- package/dist/cache/alignment-result.js +1 -0
- package/dist/cache/detect-volatile-tokens.d.ts +10 -0
- package/dist/cache/detect-volatile-tokens.d.ts.map +1 -0
- package/dist/cache/detect-volatile-tokens.js +41 -0
- package/dist/cache/placeholder.d.ts +28 -0
- package/dist/cache/placeholder.d.ts.map +1 -0
- package/dist/cache/placeholder.js +0 -0
- package/dist/cache/restore-volatile-tokens.d.ts +10 -0
- package/dist/cache/restore-volatile-tokens.d.ts.map +1 -0
- package/dist/cache/restore-volatile-tokens.js +21 -0
- package/dist/cache/volatile-classify.d.ts +11 -0
- package/dist/cache/volatile-classify.d.ts.map +1 -0
- package/dist/cache/volatile-classify.js +35 -0
- package/dist/cache/volatile-kind.d.ts +13 -0
- package/dist/cache/volatile-kind.d.ts.map +1 -0
- package/dist/cache/volatile-kind.js +13 -0
- package/dist/cache/volatile-token.d.ts +14 -0
- package/dist/cache/volatile-token.d.ts.map +1 -0
- package/dist/cache/volatile-token.js +1 -0
- package/dist/ccr/ccr-entry.d.ts +13 -0
- package/dist/ccr/ccr-entry.d.ts.map +1 -0
- package/dist/ccr/ccr-entry.js +1 -0
- package/dist/ccr/ccr-key.d.ts +9 -0
- package/dist/ccr/ccr-key.d.ts.map +1 -0
- package/dist/ccr/ccr-key.js +19 -0
- package/dist/ccr/ccr-marker.d.ts +23 -0
- package/dist/ccr/ccr-marker.d.ts.map +1 -0
- package/dist/ccr/ccr-marker.js +30 -0
- package/dist/ccr/ccr-store.d.ts +18 -0
- package/dist/ccr/ccr-store.d.ts.map +1 -0
- package/dist/ccr/ccr-store.js +1 -0
- package/dist/ccr/file-ccr-store.d.ts +19 -0
- package/dist/ccr/file-ccr-store.d.ts.map +1 -0
- package/dist/ccr/file-ccr-store.js +53 -0
- package/dist/ccr/in-memory-ccr-store.d.ts +21 -0
- package/dist/ccr/in-memory-ccr-store.d.ts.map +1 -0
- package/dist/ccr/in-memory-ccr-store.js +45 -0
- package/dist/ccr/ttl-file-ccr-store.d.ts +43 -0
- package/dist/ccr/ttl-file-ccr-store.d.ts.map +1 -0
- package/dist/ccr/ttl-file-ccr-store.js +117 -0
- package/dist/code/compress-code.d.ts +4 -0
- package/dist/code/compress-code.d.ts.map +1 -0
- package/dist/code/compress-code.js +294 -0
- package/dist/compress-content.d.ts +11 -0
- package/dist/compress-content.d.ts.map +1 -0
- package/dist/compress-content.js +79 -0
- package/dist/content/content-type.d.ts +28 -0
- package/dist/content/content-type.d.ts.map +1 -0
- package/dist/content/content-type.js +28 -0
- package/dist/content/detect-content-type.d.ts +9 -0
- package/dist/content/detect-content-type.d.ts.map +1 -0
- package/dist/content/detect-content-type.js +184 -0
- package/dist/content/segment.d.ts +21 -0
- package/dist/content/segment.d.ts.map +1 -0
- package/dist/content/segment.js +117 -0
- package/dist/index.d.ts +61 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +49 -0
- package/dist/json/compress-json.d.ts +18 -0
- package/dist/json/compress-json.d.ts.map +1 -0
- package/dist/json/compress-json.js +139 -0
- package/dist/json/render-compact-json.d.ts +10 -0
- package/dist/json/render-compact-json.d.ts.map +1 -0
- package/dist/json/render-compact-json.js +18 -0
- package/dist/relevance/bm25.d.ts +26 -0
- package/dist/relevance/bm25.d.ts.map +1 -0
- package/dist/relevance/bm25.js +115 -0
- package/dist/result/compress-options.d.ts +26 -0
- package/dist/result/compress-options.d.ts.map +1 -0
- package/dist/result/compress-options.js +1 -0
- package/dist/result/compression-result.d.ts +26 -0
- package/dist/result/compression-result.d.ts.map +1 -0
- package/dist/result/compression-result.js +1 -0
- package/dist/result/compression-strategy.d.ts +30 -0
- package/dist/result/compression-strategy.d.ts.map +1 -0
- package/dist/result/compression-strategy.js +30 -0
- package/dist/table/adaptive-size.d.ts +46 -0
- package/dist/table/adaptive-size.d.ts.map +1 -0
- package/dist/table/adaptive-size.js +170 -0
- package/dist/table/apply-value-dictionaries.d.ts +30 -0
- package/dist/table/apply-value-dictionaries.d.ts.map +1 -0
- package/dist/table/apply-value-dictionaries.js +99 -0
- package/dist/table/column-presence.d.ts +20 -0
- package/dist/table/column-presence.d.ts.map +1 -0
- package/dist/table/column-presence.js +52 -0
- package/dist/table/columnar-json.d.ts +24 -0
- package/dist/table/columnar-json.d.ts.map +1 -0
- package/dist/table/columnar-json.js +83 -0
- package/dist/table/columnar-table.d.ts +24 -0
- package/dist/table/columnar-table.d.ts.map +1 -0
- package/dist/table/columnar-table.js +1 -0
- package/dist/table/compact-object-array.d.ts +12 -0
- package/dist/table/compact-object-array.d.ts.map +1 -0
- package/dist/table/compact-object-array.js +88 -0
- package/dist/table/field-spec.d.ts +13 -0
- package/dist/table/field-spec.d.ts.map +1 -0
- package/dist/table/field-spec.js +1 -0
- package/dist/table/object-map.d.ts +28 -0
- package/dist/table/object-map.d.ts.map +1 -0
- package/dist/table/object-map.js +119 -0
- package/dist/table/render-table.d.ts +11 -0
- package/dist/table/render-table.d.ts.map +1 -0
- package/dist/table/render-table.js +39 -0
- package/dist/table/sample-object-array.d.ts +11 -0
- package/dist/table/sample-object-array.d.ts.map +1 -0
- package/dist/table/sample-object-array.js +171 -0
- package/dist/table/sample-options.d.ts +29 -0
- package/dist/table/sample-options.d.ts.map +1 -0
- package/dist/table/sample-options.js +1 -0
- package/dist/table/sampled-table.d.ts +33 -0
- package/dist/table/sampled-table.d.ts.map +1 -0
- package/dist/table/sampled-table.js +8 -0
- package/dist/table/table-compaction.d.ts +19 -0
- package/dist/table/table-compaction.d.ts.map +1 -0
- package/dist/table/table-compaction.js +1 -0
- package/dist/table/table-formats.d.ts +23 -0
- package/dist/table/table-formats.d.ts.map +1 -0
- package/dist/table/table-formats.js +233 -0
- package/dist/text/compress-diff.d.ts +20 -0
- package/dist/text/compress-diff.d.ts.map +1 -0
- package/dist/text/compress-diff.js +344 -0
- package/dist/text/compress-lines.d.ts +12 -0
- package/dist/text/compress-lines.d.ts.map +1 -0
- package/dist/text/compress-lines.js +44 -0
- package/dist/text/compress-log.d.ts +12 -0
- package/dist/text/compress-log.d.ts.map +1 -0
- package/dist/text/compress-log.js +202 -0
- package/dist/text/compress-markdown.d.ts +15 -0
- package/dist/text/compress-markdown.d.ts.map +1 -0
- package/dist/text/compress-markdown.js +96 -0
- package/dist/text/compress-search.d.ts +11 -0
- package/dist/text/compress-search.d.ts.map +1 -0
- package/dist/text/compress-search.js +78 -0
- package/dist/text/finalize.d.ts +21 -0
- package/dist/text/finalize.d.ts.map +1 -0
- package/dist/text/finalize.js +54 -0
- package/dist/text/line-utils.d.ts +20 -0
- package/dist/text/line-utils.d.ts.map +1 -0
- package/dist/text/line-utils.js +65 -0
- package/dist/text/lockfile-names.d.ts +3 -0
- package/dist/text/lockfile-names.d.ts.map +1 -0
- package/dist/text/lockfile-names.js +33 -0
- package/dist/text/log-template.d.ts +31 -0
- package/dist/text/log-template.d.ts.map +1 -0
- package/dist/text/log-template.js +239 -0
- package/dist/tokens/estimate-tokens.d.ts +17 -0
- package/dist/tokens/estimate-tokens.d.ts.map +1 -0
- package/dist/tokens/estimate-tokens.js +53 -0
- package/dist/tokens/token-savings.d.ts +20 -0
- package/dist/tokens/token-savings.d.ts.map +1 -0
- package/dist/tokens/token-savings.js +1 -0
- package/package.json +52 -0
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
/**
 * Coarse content classes recognised by the router. The detected class decides
 * which deterministic compressor is run. Members are listed loosely from the
 * most specific class (cheapest to over-trigger) down to the least specific.
 */
export var EContentType;
(function (kinds) {
    /** Top-level JSON array (`[ ... ]`) — the table compactor's prime target. */
    kinds.JsonArray = "json-array";
    /** JSON object or scalar (top-level `{ ... }` / value). */
    kinds.Json = "json";
    /** Unified / `git` diff. */
    kinds.GitDiff = "git-diff";
    /** Search output in grep/ripgrep `file:line:` style. */
    kinds.SearchResults = "search-results";
    /** Output of a build, test run, or runtime log. */
    kinds.BuildLog = "build-log";
    /** Source code written in a recognised language. */
    kinds.SourceCode = "source-code";
    /** Markdown prose / documentation. */
    kinds.Markdown = "markdown";
    /** YAML configuration / manifests (`key: value` mappings + `- ` lists). */
    kinds.Yaml = "yaml";
    /** Delimiter-separated values (CSV / TSV): a stable column count per line. */
    kinds.Csv = "csv";
    /** Fallback for everything else. */
    kinds.PlainText = "plain-text";
})(EContentType || (EContentType = {}));
|
|
@@ -0,0 +1,9 @@
|
|
|
1
|
+
import { EContentType } from './content-type.js';
|
|
2
|
+
/**
 * Classify a blob deterministically. Order is significant — the first step
 * that matches wins: JSON is checked first (it round-trips cleanly through
 * `JSON.parse`), then structural formats (diff/search), then heuristic ones
 * (log/code, then CSV/TSV, YAML and markdown), with plain text as the floor.
 * Pure — same bytes in, same class out.
 *
 * @param text The raw blob to classify; empty / whitespace-only input is
 *   classified as plain text.
 * @returns The detected content class.
 */
export declare function detectContentType(text: string): EContentType;
|
|
9
|
+
//# sourceMappingURL=detect-content-type.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"detect-content-type.d.ts","sourceRoot":"","sources":["../../src/content/detect-content-type.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,YAAY,EAAE,MAAM,mBAAmB,CAAC;AA0FjD;;;;;GAKG;AACH,wBAAgB,iBAAiB,CAAC,IAAI,EAAE,MAAM,GAAG,YAAY,CAkG5D"}
|
|
@@ -0,0 +1,184 @@
|
|
|
1
|
+
import { EContentType } from "./content-type.js";
|
|
2
|
+
// grep / ripgrep style `path:line:` prefix; the optional leading `X:` lets a
// drive-letter path through.
const SEARCH_LINE = /^(?:[A-Za-z]:)?[^\s:]+:\d+:/;
// Compiler diagnostics that aren't `path:line:` shaped: tsc / MSVC
// `src/a.ts(10,5): error TS2322` and the `path(line):` family. These are search
// output, not source code — routing them to SourceCode mangled them.
const DIAGNOSTIC_LINE = /^(?:[A-Za-z]:)?[^\s:()]+\(\d+(?:,\d+)?\):\s/;
// Unified-diff hunk header: `@@ -a[,b] +c[,d] @@`.
const DIFF_HUNK = /^@@ -\d+(?:,\d+)? \+\d+(?:,\d+)? @@/;
// YAML: `key:` / `key: value` mappings, `- ` sequence items, `---` doc markers.
// YAML_KEY matches only mapping lines; YAML_LINE additionally accepts sequence
// items, `#` comments and the `---` / `...` document markers.
const YAML_KEY = /^\s*[\w.-]+:(?:\s|$)/;
const YAML_LINE = /^\s*(?:[\w.-]+:(?:\s|$)|-\s|#|---\s*$|\.\.\.\s*$)/;
// A block-introducing key (`items:` with no inline value) followed by indented
// sequence items is unambiguously YAML — a Markdown list never has a bare
// `word:` line introducing indented bullets. Distinguishes list-heavy YAML
// (low key density) from a Markdown bullet list with an incidental `Note: x`.
const YAML_BLOCK_KEY = /^\s*[\w.-]+:\s*$/;
const YAML_INDENTED_SEQ = /^\s{2,}-\s/;
// Log levels must appear as a LINE PREFIX (optionally after a leading
// timestamp / bracket), not anywhere on the line — otherwise common code
// identifiers (`const ERROR = 500`, `enum { INFO, DEBUG }`) misroute to logs.
// The remaining alternatives accept a `name[123]: ` prefix, a `Traceback`
// line, an indented `at fn (` stack frame, and a bare leading timestamp.
const LOG_MARKER = /^\s*(?:(?:\[?\d{4}-\d{2}-\d{2}[T ][\d:.,]+\]?|\S+\[\d+\]:)\s+)?\[?(?:ERROR|FATAL|FAIL(?:ED|URE)?|WARN(?:ING)?|INFO|DEBUG|NOTICE|TRACE)\b|^\S+\[\d+\]:\s|^\s*Traceback\b|^\s+at\s+\S+\s*\(|^\d{4}-\d{2}-\d{2}[T ]\d{2}:\d{2}/;
// Markdown line openers: ATX heading, `-`/`*` bullet, ordered-list item,
// ``` fence, `> ` blockquote, or a leading table pipe.
const MARKDOWN_MARKER = /^(?:#{1,6}\s|\s*[-*]\s|\s*\d+\.\s|```|>\s|\|)/;
// Code signals: declaration keywords + structural/statement shapes that real
// code has but prose / markdown / INI / env / TOML / nginx do NOT (each new
// signal measures 0.00 on those). Built as a union of annotated sources.
// The sources are joined with `|` into one flag-less RegExp, so every
// alternative keeps its own `^` / `$` anchors.
const CODE_MARKER = new RegExp([
    // declaration / punctuation (original)
    /\b(?:function|const|let|var|class|interface|enum|import|export|def|return|public|private|func|impl|struct|package|namespace)\b/,
    // arrow token
    /=>/,
    // double-colon token
    /::/,
    // line opening with `@word`
    /^\s*@\w+/,
    // line made only of brackets / separators / whitespace
    /^[\s{}()\[\];,]+$/,
    // typed return/param annotation: `): Foo {` / `): Foo =>` / `]: Bar =`
    /[)\]]\s*:\s*[A-Za-z_$][\w$.<>\[\], ]*\s*(?:=>|\{|=|$)/,
    // assignment statement terminated by `;` (rejects ==/=== via [^=]; needs trailing ;)
    /^\s*[A-Za-z_$][\w$.[\]]*\s*(?:\+|-|\*|\/|%|\?\?|\|\||&&|<<|>>|\||&|\^)?=\s*[^=].*;\s*$/,
    // member/method call AT LINE START: `obj.method(` (anchored so prose
    // "the system.config()" / "noun.verb(" embedded mid-sentence doesn't match)
    /^\s*[A-Za-z_$][\w$]*\.[A-Za-z_$][\w$]*\s*\(/,
    // bare call statement AT LINE START ending in `;`: `doThing(args);` (anchored
    // so log lines like "Calling fetchUser(42);" don't match)
    /^\s*[A-Za-z_$][\w$]*\([^()]*\)\s*;\s*$/,
    // control-flow header: `if (x) {`, `for (...)`, `while/switch/catch (...)`
    /^\s*(?:if|for|while|switch|catch)\s*\(.*\)\s*\{?\s*$/,
]
    .map((r) => r.source)
    .join('|'));
|
|
47
|
+
/**
 * Fraction of `lines` accepted by `test`.
 *
 * @param {string[]} lines Lines to examine.
 * @param {RegExp | ((line: string) => boolean)} test Either a predicate or a
 *   regular expression applied with `.test(line)`.
 * @returns {number} Accepted lines divided by total lines; `0` for an empty list.
 */
function lineHitRatio(lines, test) {
    const total = lines.length;
    if (total === 0) {
        return 0;
    }
    // Normalise both accepted shapes of `test` into a single predicate.
    const accepts = typeof test === 'function' ? test : (candidate) => test.test(candidate);
    let matched = 0;
    for (let pos = 0; pos < total; pos += 1) {
        if (accepts(lines[pos])) {
            matched += 1;
        }
    }
    return matched / total;
}
|
|
57
|
+
/**
 * Count the occurrences of `delim` in `line` that act as field separators.
 * Delimiters inside a double-quoted field (`"Smith, John"`) are ignored. When
 * the line's quotes are unbalanced (e.g. an inch mark in `5" pipe,12`) the
 * quoting cannot be trusted, so every occurrence is counted instead.
 *
 * @param {string} line A single line of text.
 * @param {string} delim The one-character delimiter to count.
 * @returns {number} Number of separating delimiters on the line.
 */
function countFieldDelimiters(line, delim) {
    let all = 0;
    let outsideQuotes = 0;
    let inQuotes = false;
    for (const ch of line) {
        if (ch === '"') {
            // An escaped quote (`""`) toggles twice, leaving the state unchanged.
            inQuotes = !inQuotes;
        }
        else if (ch === delim) {
            all += 1;
            if (!inQuotes)
                outsideQuotes += 1;
        }
    }
    return inQuotes ? all : outsideQuotes;
}
/**
 * Delimiter-separated values: a stable column count per line. Returns true when
 * one of `,` / `\t` yields the SAME separator count (≥1) on ≥90% of the
 * non-blank lines (≥2 of them) — a shape prose and config never have.
 * Separators inside double-quoted fields are not counted, so quoted CSV cells
 * that contain the delimiter do not destabilise the column count.
 *
 * @param {string[]} nonBlank The non-blank lines of the blob.
 * @returns {boolean} True when the lines look like CSV / TSV.
 */
function looksDelimited(nonBlank) {
    if (nonBlank.length < 2)
        return false;
    // `;` is excluded: semicolon-terminated prose/code lines have a stable count
    // of 1 and would masquerade as 2-column CSV. Real CSV/TSV uses `,` or tab.
    for (const delim of [',', '\t']) {
        // Histogram: separator count -> number of lines with that count.
        const freq = new Map();
        for (const line of nonBlank) {
            const c = countFieldDelimiters(line, delim);
            freq.set(c, (freq.get(c) ?? 0) + 1);
        }
        // Most frequent count; ties resolve to the larger count.
        let modal = -1;
        let modalFreq = 0;
        for (const [c, f] of freq) {
            if (f > modalFreq || (f === modalFreq && c > modal)) {
                modal = c;
                modalFreq = f;
            }
        }
        // A real CSV/TSV has the same column count (≥2 columns ⇒ ≥1 delimiter) on
        // almost every line.
        if (modal >= 1 && modalFreq / nonBlank.length >= 0.9)
            return true;
    }
    return false;
}
|
|
87
|
+
/**
 * Classify a blob deterministically. Order is significant — the first step
 * that matches wins: JSON is checked first (it round-trips cleanly through
 * `JSON.parse`), then structural formats (diff/search), then heuristic ones
 * (log/code, then CSV/TSV, YAML and markdown), with plain text as the floor.
 * Pure — same bytes in, same class out.
 *
 * @param {string} text The raw blob; it is trimmed before classification.
 * @returns The detected `EContentType` member; empty / whitespace-only input
 *   yields `PlainText`.
 */
export function detectContentType(text) {
    const trimmed = text.trim();
    if (trimmed.length === 0)
        return EContentType.PlainText;
    // 1. JSON — only when it actually parses, so we never mis-route prose that
    // merely starts with a bracket.
    const first = trimmed[0];
    if (first === '[' || first === '{') {
        try {
            const parsed = JSON.parse(trimmed);
            if (Array.isArray(parsed))
                return EContentType.JsonArray;
            // NOTE(review): both outcomes below return Json, so this check is
            // redundant as written.
            if (parsed !== null && typeof parsed === 'object')
                return EContentType.Json;
            return EContentType.Json;
        }
        catch {
            // fall through — not valid JSON
        }
    }
    // Interior blank lines are kept here; steps 2-4 and 6 measure over ALL lines,
    // while steps 5-5c filter blanks out first.
    const lines = trimmed.split('\n');
    // 2. Git diff — a `diff --git` header or a clear hunk header set. Scan ALL
    // `@@` lines (a leading malformed hunk must not defeat detection).
    if (/^diff --git /m.test(trimmed) ||
        (lines.some((l) => DIFF_HUNK.test(l)) &&
            /^--- /m.test(trimmed) &&
            /^\+\+\+ /m.test(trimmed))) {
        return EContentType.GitDiff;
    }
    // 3. grep / ripgrep search output (`path:line:` prefix) OR compiler
    // diagnostics (`path(line,col):`). Count either shape toward the ratio.
    if (lineHitRatio(lines, (l) => SEARCH_LINE.test(l) || DIAGNOSTIC_LINE.test(l)) >= 0.6) {
        return EContentType.SearchResults;
    }
    // 4. Build / test log (error / warn / timestamp markers dense enough).
    if (lineHitRatio(lines, LOG_MARKER) >= 0.25)
        return EContentType.BuildLog;
    // 5. Source code — keyword / structural density over non-blank lines OUTSIDE
    // fenced code blocks (a markdown doc's ``` examples must not be counted as
    // the doc's own code). A real source file has no fences, so its basis is
    // unchanged. EOL punctuation alone must NOT count (prose/config ending in
    // `;` is not code).
    // Only TOP-LEVEL fences (CommonMark allows ≤3 leading spaces) count — an
    // indented backtick line shown as a prose example must not toggle the fence
    // state and skew the balance check.
    const fenceRe = /^ {0,3}(?:```|~~~)/;
    // Only trust fence exclusion when fences are balanced. An odd (unterminated)
    // count — e.g. a stray ``` inside a source file's string/comment — would
    // otherwise flip `inFence` forever and exclude the rest of the file.
    const fenceCount = lines.reduce((n, l) => (fenceRe.test(l) ? n + 1 : n), 0);
    const excludeFences = fenceCount > 0 && fenceCount % 2 === 0;
    let inFence = false;
    // Lines the code ratio is measured over: non-blank and outside any fence.
    const codeBasis = [];
    for (const l of lines) {
        if (excludeFences && fenceRe.test(l)) {
            // The fence line itself is dropped; it only flips the state.
            inFence = !inFence;
            continue;
        }
        if (!inFence && l.trim().length > 0)
            codeBasis.push(l);
    }
    const codeRatio = codeBasis.length > 0 ? lineHitRatio(codeBasis, CODE_MARKER) : 0;
    if (codeRatio >= 0.45)
        return EContentType.SourceCode;
    const nonBlank = lines.filter((l) => l.trim().length > 0);
    // 5b. CSV / TSV — a stable column count per line (checked before YAML/markdown
    // so a 2-column file isn't mistaken for `key: value` or a list).
    if (looksDelimited(nonBlank))
        return EContentType.Csv;
    // 5c. YAML — ≥80% of non-blank lines are YAML-shaped AND either ≥30% are
    // actual `key:` mappings (mapping-heavy config) OR there's a block-key →
    // indented-sequence shape (list-heavy config). Both reject a plain
    // Markdown bullet list. Checked before markdown, which would otherwise
    // grab YAML's `- ` sequence items and lossily cap them.
    if (nonBlank.length >= 2) {
        const yamlShaped = lineHitRatio(nonBlank, YAML_LINE);
        const keyDensity = lineHitRatio(nonBlank, YAML_KEY);
        const blockSeq = nonBlank.some((l) => YAML_BLOCK_KEY.test(l)) && nonBlank.some((l) => YAML_INDENTED_SEQ.test(l));
        if (yamlShaped >= 0.8 && (keyDensity >= 0.3 || blockSeq))
            return EContentType.Yaml;
    }
    // 6. Markdown — a marker-dense blob, OR a prose doc with ≥2 ATX headers. The
    // header rule is gated so a commented script (Python/shell `# …` lines, or
    // a `#!`-shebang file) with low code-syntax density isn't mistaken for a doc.
    const headerCount = lines.reduce((n, l) => (/^#{1,6}\s/.test(l) ? n + 1 : n), 0);
    const looksLikeScript = (lines[0] ?? '').startsWith('#!');
    if (lineHitRatio(lines, MARKDOWN_MARKER) >= 0.3 ||
        (headerCount >= 2 && codeRatio < 0.15 && !looksLikeScript)) {
        return EContentType.Markdown;
    }
    // Floor: nothing matched.
    return EContentType.PlainText;
}
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
import { EContentType } from './content-type.js';
|
|
2
|
+
/**
 * A typed run of a mixed blob. {@link segmentContent} splits a heterogeneous
 * dump — prose interleaved with a JSON block and a stack trace, say — into
 * contiguous runs so each can be compressed by its own strategy instead of
 * forcing the whole blob through one. (P4.3)
 */
export interface IContentSegment {
    /** Content class detected for this run's text. */
    type: EContentType;
    /** The run's source lines, joined with `\n`. */
    text: string;
}
|
|
12
|
+
/**
 * Split `text` into typed segments. Contiguous multi-line JSON blocks are
 * isolated; the remaining lines are grouped into runs of one coarse class
 * (blank lines extend the current run), and each run's real type is detected.
 * A blob whose lines all fall into one coarse class is intended to yield a
 * single segment.
 *
 * @param text The mixed blob to split.
 * @returns The segments in source order.
 */
export declare function segmentContent(text: string): IContentSegment[];
|
|
19
|
+
/**
 * Content classes that have a dedicated, materially-better compressor.
 *
 * @param type The content class to test.
 * @returns `true` for `Json`, `JsonArray`, `BuildLog`, `GitDiff`,
 *   `SearchResults` and `SourceCode`; `false` for every other class.
 */
export declare function isRichSegmentType(type: EContentType): boolean;
|
|
21
|
+
//# sourceMappingURL=segment.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"segment.d.ts","sourceRoot":"","sources":["../../src/content/segment.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,YAAY,EAAE,MAAM,mBAAmB,CAAC;AAIjD;;;;;GAKG;AACH,MAAM,WAAW,eAAe;IAC9B,IAAI,EAAE,YAAY,CAAC;IACnB,IAAI,EAAE,MAAM,CAAC;CACd;AA0DD;;;;;GAKG;AACH,wBAAgB,cAAc,CAAC,IAAI,EAAE,MAAM,GAAG,eAAe,EAAE,CAoC9D;AAED,2EAA2E;AAC3E,wBAAgB,iBAAiB,CAAC,IAAI,EAAE,YAAY,GAAG,OAAO,CAS7D"}
|
|
@@ -0,0 +1,117 @@
|
|
|
1
|
+
import { EContentType } from "./content-type.js";
|
|
2
|
+
import { detectContentType } from "./detect-content-type.js";
|
|
3
|
+
import { splitLines } from "../text/line-utils.js";
|
|
4
|
+
/**
 * Coarse per-line class used to group adjacent non-JSON lines.
 *
 * Checks run in order and the first match wins:
 *   - `'diff'`   — `diff --git `, `@@ `, `--- ` or `+++ ` line prefix;
 *   - `'search'` — grep-style `path:line:` prefix, or a compiler diagnostic
 *                  shaped `path(line[,col]): ` (tsc / MSVC), matching what
 *                  content-type detection counts as search output;
 *   - `'log'`    — leading date, level keyword, indented `at …` stack frame,
 *                  `Traceback`, or an `XxxError:` / `XxxException:` prefix;
 *   - `'prose'`  — everything else.
 *
 * @param {string} line A single line of the blob.
 * @returns {'diff' | 'search' | 'log' | 'prose'} The line's coarse class.
 */
function coarseClass(line) {
    if (/^(?:diff --git |@@ |--- |\+\+\+ )/.test(line))
        return 'diff';
    // `path:line:` search hits and `path(line,col): ` compiler diagnostics are
    // both search-shaped; without the second form a run of diagnostics would be
    // lumped in with the surrounding prose.
    if (/^(?:[A-Za-z]:)?[^\s:]+:\d+:/.test(line) ||
        /^(?:[A-Za-z]:)?[^\s:()]+\(\d+(?:,\d+)?\):\s/.test(line)) {
        return 'search';
    }
    if (/^\s*\[?\d{4}-\d{2}-\d{2}[T ]/.test(line) ||
        /^\s*\[?(?:ERROR|FATAL|FAIL(?:ED|URE)?|WARN(?:ING)?|INFO|DEBUG|NOTICE|TRACE)\b/.test(line) ||
        /^\s+at\s+\S/.test(line) ||
        /^\s*Traceback\b/.test(line) ||
        /^[\w.$]*(?:Error|Exception):/.test(line)) {
        return 'log';
    }
    return 'prose';
}
|
|
19
|
+
/**
 * Locate the end of a multi-line JSON value that opens on line `start`.
 *
 * The line must begin (after leading whitespace) with `{` or `[`. Brackets are
 * balanced across following lines with awareness of double-quoted strings and
 * backslash escapes, so brackets inside strings are ignored. At most 2000
 * lines are scanned.
 *
 * @param {string[]} lines All lines of the blob.
 * @param {number} start Index of the candidate opening line.
 * @returns {number | null} Index of the closing line when the span balances
 *   and parses as JSON; `null` when the line is not an opener, the value
 *   closes on the opening line itself, the span fails to parse, or no
 *   balanced close is found within the scan limit.
 */
function findJsonBlock(lines, start) {
    const opener = (lines[start] ?? '').trimStart().charAt(0);
    if (opener !== '{' && opener !== '[') {
        return null;
    }
    const stopAt = Math.min(lines.length, start + 2000);
    let nesting = 0;
    let insideString = false;
    let escaped = false;
    let sawOpener = false;
    let row = start;
    while (row < stopAt) {
        for (const symbol of lines[row] ?? '') {
            if (insideString) {
                // Inside a string only escapes and the closing quote matter.
                if (escaped) {
                    escaped = false;
                }
                else if (symbol === '\\') {
                    escaped = true;
                }
                else if (symbol === '"') {
                    insideString = false;
                }
            }
            else {
                switch (symbol) {
                    case '"':
                        insideString = true;
                        break;
                    case '{':
                    case '[':
                        nesting += 1;
                        sawOpener = true;
                        break;
                    case '}':
                    case ']':
                        nesting -= 1;
                        break;
                    default:
                        break;
                }
            }
        }
        // Balance is only evaluated at line ends.
        if (sawOpener && nesting <= 0) {
            if (row === start) {
                return null; // single-line JSON stays inline with prose
            }
            const candidate = lines.slice(start, row + 1).join('\n').trim();
            try {
                JSON.parse(candidate);
            }
            catch {
                return null;
            }
            return row;
        }
        row += 1;
    }
    return null;
}
|
|
67
|
+
/**
 * Split `text` into typed segments. Contiguous multi-line JSON blocks are
 * isolated — each block gets its own segment, even when two blocks are
 * adjacent — and the remaining lines are grouped into runs of one coarse class
 * (blank lines extend the current run). Each run's real type is then detected
 * from its joined text.
 *
 * @param {string} text The mixed blob to split.
 * @returns {{ type: string, text: string }[]} Segments in source order; each
 *   `text` is the run's lines joined with `\n`.
 */
export function segmentContent(text) {
    const lines = splitLines(text);
    const n = lines.length;
    // Per-line class label. JSON blocks get a label unique to the block so two
    // neighbouring blocks can never be merged into one (unparseable) run.
    const cls = new Array(n).fill('');
    let i = 0;
    while (i < n) {
        const trimmed = (lines[i] ?? '').trimStart();
        if (trimmed.startsWith('{') || trimmed.startsWith('[')) {
            const end = findJsonBlock(lines, i);
            if (end !== null) {
                cls.fill(`json:${i}`, i, end + 1);
                i = end + 1;
                continue;
            }
        }
        cls[i] = coarseClass(lines[i] ?? '');
        i += 1;
    }
    // Group consecutive same-class lines; blank lines extend the current group.
    // A group opened by a blank line has no class yet (`null`) and adopts the
    // class of its first non-blank line, so leading blank lines do not become a
    // spurious segment of their own.
    const groups = [];
    for (let idx = 0; idx < n; idx += 1) {
        const blank = (lines[idx] ?? '').trim().length === 0;
        const last = groups[groups.length - 1];
        if (last && (blank || last.cls === null || cls[idx] === last.cls)) {
            last.end = idx + 1;
            if (!blank && last.cls === null)
                last.cls = cls[idx];
        }
        else {
            groups.push({ start: idx, end: idx + 1, cls: blank ? null : cls[idx] });
        }
    }
    return groups.map(({ start, end }) => {
        const segText = lines.slice(start, end).join('\n');
        return { type: detectContentType(segText), text: segText };
    });
}
|
|
109
|
+
/**
 * Tell whether a content class has a dedicated, materially-better compressor.
 *
 * @param type The content class to test.
 * @returns {boolean} `true` for the JSON, JSON-array, build-log, git-diff,
 *   search-results and source-code classes; `false` otherwise.
 */
export function isRichSegmentType(type) {
    switch (type) {
        case EContentType.Json:
        case EContentType.JsonArray:
        case EContentType.BuildLog:
        case EContentType.GitDiff:
        case EContentType.SearchResults:
        case EContentType.SourceCode:
            return true;
        default:
            return false;
    }
}
|
package/dist/index.d.ts
ADDED
|
@@ -0,0 +1,61 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* `@shrkcrft/compress` — SharkCraft's deterministic context-compression
|
|
3
|
+
* engine. Built to honour the engine's hard rule: no model inside. Every
|
|
4
|
+
* transform is a pure function of its input — content routing, lossless
|
|
5
|
+
* columnar/table compaction of object arrays, log/search/diff/line reduction,
|
|
6
|
+
* and reversible Compress-Cache-Retrieve (CCR). Used by the CLI, MCP server,
|
|
7
|
+
* and inspector to cut the tokens an agent pays for the same information.
|
|
8
|
+
*/
|
|
9
|
+
export { estimateTokens, measureSavings } from './tokens/estimate-tokens.js';
|
|
10
|
+
export type { ITokenSavings } from './tokens/token-savings.js';
|
|
11
|
+
export { EContentType } from './content/content-type.js';
|
|
12
|
+
export { detectContentType } from './content/detect-content-type.js';
|
|
13
|
+
export type { IContentSegment } from './content/segment.js';
|
|
14
|
+
export { segmentContent, isRichSegmentType } from './content/segment.js';
|
|
15
|
+
export type { ICcrEntry } from './ccr/ccr-entry.js';
|
|
16
|
+
export type { ICcrStore } from './ccr/ccr-store.js';
|
|
17
|
+
export { ccrKey } from './ccr/ccr-key.js';
|
|
18
|
+
export { CCR_MARKER_RE, formatCcrMarker, parseCcrMarkers } from './ccr/ccr-marker.js';
|
|
19
|
+
export type { ICcrMarkerRef } from './ccr/ccr-marker.js';
|
|
20
|
+
export { InMemoryCcrStore } from './ccr/in-memory-ccr-store.js';
|
|
21
|
+
export { FileCcrStore } from './ccr/file-ccr-store.js';
|
|
22
|
+
export type { ITtlFileCcrStoreOptions } from './ccr/ttl-file-ccr-store.js';
|
|
23
|
+
export { TtlFileCcrStore } from './ccr/ttl-file-ccr-store.js';
|
|
24
|
+
export type { IFieldSpec } from './table/field-spec.js';
|
|
25
|
+
export type { ITableCompaction } from './table/table-compaction.js';
|
|
26
|
+
export { compactObjectArray } from './table/compact-object-array.js';
|
|
27
|
+
export type { IColumnarTable } from './table/columnar-table.js';
|
|
28
|
+
export { tableToColumnar, compactArrayToColumnar, isColumnarTable, expandColumnar, } from './table/columnar-json.js';
|
|
29
|
+
export { renderTable } from './table/render-table.js';
|
|
30
|
+
export { renderCompactJson } from './json/render-compact-json.js';
|
|
31
|
+
export { compressJson } from './json/compress-json.js';
|
|
32
|
+
export { columnarToCsv, csvToObjects, columnarToMarkdownKv, markdownKvToObjects, } from './table/table-formats.js';
|
|
33
|
+
export type { IObjectMap } from './table/object-map.js';
|
|
34
|
+
export { compactObjectMap, expandObjectMap, isObjectMap } from './table/object-map.js';
|
|
35
|
+
export type { AdaptiveBias, IAdaptiveOptions } from './table/adaptive-size.js';
|
|
36
|
+
export { computeOptimalK, simhash, hammingDistance, kneedle, bigramCoverageCurve } from './table/adaptive-size.js';
|
|
37
|
+
export type { IBm25Options } from './relevance/bm25.js';
|
|
38
|
+
export { bm25Scores, topByBm25 } from './relevance/bm25.js';
|
|
39
|
+
export type { ISampleOptions } from './table/sample-options.js';
|
|
40
|
+
export type { ISampledTable } from './table/sampled-table.js';
|
|
41
|
+
export { isSampledTable } from './table/sampled-table.js';
|
|
42
|
+
export { sampleObjectArray } from './table/sample-object-array.js';
|
|
43
|
+
export { ECompressionStrategy } from './result/compression-strategy.js';
|
|
44
|
+
export type { ICompressionResult } from './result/compression-result.js';
|
|
45
|
+
export type { ICompressOptions } from './result/compress-options.js';
|
|
46
|
+
export { compressLog } from './text/compress-log.js';
|
|
47
|
+
export { compressSearch } from './text/compress-search.js';
|
|
48
|
+
export { compressDiff } from './text/compress-diff.js';
|
|
49
|
+
export { compressLines } from './text/compress-lines.js';
|
|
50
|
+
export { compressMarkdown } from './text/compress-markdown.js';
|
|
51
|
+
export { compressCode } from './code/compress-code.js';
|
|
52
|
+
export { EVolatileKind } from './cache/volatile-kind.js';
|
|
53
|
+
export type { IVolatileToken } from './cache/volatile-token.js';
|
|
54
|
+
export { detectVolatileTokens } from './cache/detect-volatile-tokens.js';
|
|
55
|
+
export { PLACEHOLDER_RE, formatPlaceholder } from './cache/placeholder.js';
|
|
56
|
+
export type { IAlignmentBinding, IAlignmentMap } from './cache/alignment-map.js';
|
|
57
|
+
export type { IAlignmentResult } from './cache/alignment-result.js';
|
|
58
|
+
export { alignVolatileTokens } from './cache/align-volatile-tokens.js';
|
|
59
|
+
export { restoreVolatileTokens } from './cache/restore-volatile-tokens.js';
|
|
60
|
+
export { compressContent } from './compress-content.js';
|
|
61
|
+
//# sourceMappingURL=index.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../src/index.ts"],"names":[],"mappings":"AAAA;;;;;;;GAOG;AAGH,OAAO,EAAE,cAAc,EAAE,cAAc,EAAE,MAAM,6BAA6B,CAAC;AAC7E,YAAY,EAAE,aAAa,EAAE,MAAM,2BAA2B,CAAC;AAG/D,OAAO,EAAE,YAAY,EAAE,MAAM,2BAA2B,CAAC;AACzD,OAAO,EAAE,iBAAiB,EAAE,MAAM,kCAAkC,CAAC;AACrE,YAAY,EAAE,eAAe,EAAE,MAAM,sBAAsB,CAAC;AAC5D,OAAO,EAAE,cAAc,EAAE,iBAAiB,EAAE,MAAM,sBAAsB,CAAC;AAGzE,YAAY,EAAE,SAAS,EAAE,MAAM,oBAAoB,CAAC;AACpD,YAAY,EAAE,SAAS,EAAE,MAAM,oBAAoB,CAAC;AACpD,OAAO,EAAE,MAAM,EAAE,MAAM,kBAAkB,CAAC;AAC1C,OAAO,EAAE,aAAa,EAAE,eAAe,EAAE,eAAe,EAAE,MAAM,qBAAqB,CAAC;AACtF,YAAY,EAAE,aAAa,EAAE,MAAM,qBAAqB,CAAC;AACzD,OAAO,EAAE,gBAAgB,EAAE,MAAM,8BAA8B,CAAC;AAChE,OAAO,EAAE,YAAY,EAAE,MAAM,yBAAyB,CAAC;AACvD,YAAY,EAAE,uBAAuB,EAAE,MAAM,6BAA6B,CAAC;AAC3E,OAAO,EAAE,eAAe,EAAE,MAAM,6BAA6B,CAAC;AAG9D,YAAY,EAAE,UAAU,EAAE,MAAM,uBAAuB,CAAC;AACxD,YAAY,EAAE,gBAAgB,EAAE,MAAM,6BAA6B,CAAC;AACpE,OAAO,EAAE,kBAAkB,EAAE,MAAM,iCAAiC,CAAC;AACrE,YAAY,EAAE,cAAc,EAAE,MAAM,2BAA2B,CAAC;AAChE,OAAO,EACL,eAAe,EACf,sBAAsB,EACtB,eAAe,EACf,cAAc,GACf,MAAM,0BAA0B,CAAC;AAClC,OAAO,EAAE,WAAW,EAAE,MAAM,yBAAyB,CAAC;AACtD,OAAO,EAAE,iBAAiB,EAAE,MAAM,+BAA+B,CAAC;AAClE,OAAO,EAAE,YAAY,EAAE,MAAM,yBAAyB,CAAC;AAGvD,OAAO,EACL,aAAa,EACb,YAAY,EACZ,oBAAoB,EACpB,mBAAmB,GACpB,MAAM,0BAA0B,CAAC;AAGlC,YAAY,EAAE,UAAU,EAAE,MAAM,uBAAuB,CAAC;AACxD,OAAO,EAAE,gBAAgB,EAAE,eAAe,EAAE,WAAW,EAAE,MAAM,uBAAuB,CAAC;AAGvF,YAAY,EAAE,YAAY,EAAE,gBAAgB,EAAE,MAAM,0BAA0B,CAAC;AAC/E,OAAO,EAAE,eAAe,EAAE,OAAO,EAAE,eAAe,EAAE,OAAO,EAAE,mBAAmB,EAAE,MAAM,0BAA0B,CAAC;AAGnH,YAAY,EAAE,YAAY,EAAE,MAAM,qBAAqB,CAAC;AACxD,OAAO,EAAE,UAAU,EAAE,SAAS,EAAE,MAAM,qBAAqB,CAAC;AAG5D,YAAY,EAAE,cAAc,EAAE,MAAM,2BAA2B,CAAC;AAChE,YAAY,EAAE,aAAa,EAAE,MAAM,0BAA0B,CAAC;AAC9D,OAAO,EAAE,cAAc,EAAE,MAAM,0BAA0B,CAAC;AAC1D,OAAO,EAAE,iBAAiB,EAAE,MAAM,gCAAgC,CAAC;AAGnE,OAAO,EAAE,oBAAoB,EAAE,MAAM,kCAAkC,CAAC;AACxE,YAAY,EAAE,kBAAkB,EAAE,MAAM,gCAAgC,CAAC;AACzE,YAAY,EAAE,gBAAgB,EAAE,MAAM,8BAA8B,CAAC;AAGrE,OAAO,EAAE,WAAW,EAAE,MAAM,wBAAwB,CAAC;AACrD,OAAO,EAAE,cAAc,EAAE,MAAM,2
BAA2B,CAAC;AAC3D,OAAO,EAAE,YAAY,EAAE,MAAM,yBAAyB,CAAC;AACvD,OAAO,EAAE,aAAa,EAAE,MAAM,0BAA0B,CAAC;AACzD,OAAO,EAAE,gBAAgB,EAAE,MAAM,6BAA6B,CAAC;AAG/D,OAAO,EAAE,YAAY,EAAE,MAAM,yBAAyB,CAAC;AAGvD,OAAO,EAAE,aAAa,EAAE,MAAM,0BAA0B,CAAC;AACzD,YAAY,EAAE,cAAc,EAAE,MAAM,2BAA2B,CAAC;AAChE,OAAO,EAAE,oBAAoB,EAAE,MAAM,mCAAmC,CAAC;AACzE,OAAO,EAAE,cAAc,EAAE,iBAAiB,EAAE,MAAM,wBAAwB,CAAC;AAC3E,YAAY,EAAE,iBAAiB,EAAE,aAAa,EAAE,MAAM,0BAA0B,CAAC;AACjF,YAAY,EAAE,gBAAgB,EAAE,MAAM,6BAA6B,CAAC;AACpE,OAAO,EAAE,mBAAmB,EAAE,MAAM,kCAAkC,CAAC;AACvE,OAAO,EAAE,qBAAqB,EAAE,MAAM,oCAAoC,CAAC;AAG3E,OAAO,EAAE,eAAe,EAAE,MAAM,uBAAuB,CAAC"}
|
package/dist/index.js
ADDED
|
@@ -0,0 +1,49 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* `@shrkcrft/compress` — SharkCraft's deterministic context-compression
|
|
3
|
+
* engine. Built to honour the engine's hard rule: no model inside. Every
|
|
4
|
+
* transform is a pure function of its input — content routing, lossless
|
|
5
|
+
* columnar/table compaction of object arrays, log/search/diff/line reduction,
|
|
6
|
+
* and reversible Compress-Cache-Retrieve (CCR). Used by the CLI, MCP server,
|
|
7
|
+
* and inspector to cut the tokens an agent pays for the same information.
|
|
8
|
+
*/
|
|
9
|
+
// Tokens / accounting
|
|
10
|
+
export { estimateTokens, measureSavings } from "./tokens/estimate-tokens.js";
|
|
11
|
+
// Content routing
|
|
12
|
+
export { EContentType } from "./content/content-type.js";
|
|
13
|
+
export { detectContentType } from "./content/detect-content-type.js";
|
|
14
|
+
export { segmentContent, isRichSegmentType } from "./content/segment.js";
|
|
15
|
+
export { ccrKey } from "./ccr/ccr-key.js";
|
|
16
|
+
export { CCR_MARKER_RE, formatCcrMarker, parseCcrMarkers } from "./ccr/ccr-marker.js";
|
|
17
|
+
export { InMemoryCcrStore } from "./ccr/in-memory-ccr-store.js";
|
|
18
|
+
export { FileCcrStore } from "./ccr/file-ccr-store.js";
|
|
19
|
+
export { TtlFileCcrStore } from "./ccr/ttl-file-ccr-store.js";
|
|
20
|
+
export { compactObjectArray } from "./table/compact-object-array.js";
|
|
21
|
+
export { tableToColumnar, compactArrayToColumnar, isColumnarTable, expandColumnar, } from "./table/columnar-json.js";
|
|
22
|
+
export { renderTable } from "./table/render-table.js";
|
|
23
|
+
export { renderCompactJson } from "./json/render-compact-json.js";
|
|
24
|
+
export { compressJson } from "./json/compress-json.js";
|
|
25
|
+
// Read-accuracy table encodings (P4.2): reversible CSV / Markdown-KV views.
|
|
26
|
+
export { columnarToCsv, csvToObjects, columnarToMarkdownKv, markdownKvToObjects, } from "./table/table-formats.js";
|
|
27
|
+
export { compactObjectMap, expandObjectMap, isObjectMap } from "./table/object-map.js";
|
|
28
|
+
export { computeOptimalK, simhash, hammingDistance, kneedle, bigramCoverageCurve } from "./table/adaptive-size.js";
|
|
29
|
+
export { bm25Scores, topByBm25 } from "./relevance/bm25.js";
|
|
30
|
+
export { isSampledTable } from "./table/sampled-table.js";
|
|
31
|
+
export { sampleObjectArray } from "./table/sample-object-array.js";
|
|
32
|
+
// Result shapes / options
|
|
33
|
+
export { ECompressionStrategy } from "./result/compression-strategy.js";
|
|
34
|
+
// Text compressors
|
|
35
|
+
export { compressLog } from "./text/compress-log.js";
|
|
36
|
+
export { compressSearch } from "./text/compress-search.js";
|
|
37
|
+
export { compressDiff } from "./text/compress-diff.js";
|
|
38
|
+
export { compressLines } from "./text/compress-lines.js";
|
|
39
|
+
export { compressMarkdown } from "./text/compress-markdown.js";
|
|
40
|
+
// Code-aware compression (outline: keep imports/types/signatures, elide bodies)
|
|
41
|
+
export { compressCode } from "./code/compress-code.js";
|
|
42
|
+
// Cache alignment — volatile-token detection + active reversible substitution
|
|
43
|
+
export { EVolatileKind } from "./cache/volatile-kind.js";
|
|
44
|
+
export { detectVolatileTokens } from "./cache/detect-volatile-tokens.js";
|
|
45
|
+
export { PLACEHOLDER_RE, formatPlaceholder } from "./cache/placeholder.js";
|
|
46
|
+
export { alignVolatileTokens } from "./cache/align-volatile-tokens.js";
|
|
47
|
+
export { restoreVolatileTokens } from "./cache/restore-volatile-tokens.js";
|
|
48
|
+
// Router
|
|
49
|
+
export { compressContent } from "./compress-content.js";
|
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
import type { ICompressionResult } from '../result/compression-result.js';
|
|
2
|
+
import type { ICompressOptions } from '../result/compress-options.js';
|
|
3
|
+
/**
|
|
4
|
+
* Compress JSON losslessly. A homogeneous object array becomes a *columnar*
|
|
5
|
+
* encoding — the shared schema is hoisted once and each row carries only
|
|
6
|
+
* values — which is still valid JSON and exactly reconstructable via
|
|
7
|
+
* `expandColumnar` (absent keys, nulls and empty strings are all preserved
|
|
8
|
+
* distinctly). Anything else is minified. No detail is dropped, so no CCR
|
|
9
|
+
* marker is needed. Falls back to line dedup if the text isn't valid JSON, and
|
|
10
|
+
* passes through untouched when re-serialization would lose precision (integers
|
|
11
|
+
* beyond 2^53, numbers with more than 15 significant digits, or exponent overflow/underflow), so the lossless guarantee always holds.
|
|
12
|
+
*
|
|
13
|
+
* (The dense text table from `renderCompactJson` is *not* used here: it
|
|
14
|
+
* renders null / "" / absent identically, so it cannot carry the lossless
|
|
15
|
+
* guarantee this function advertises.)
|
|
16
|
+
*/
|
|
17
|
+
export declare function compressJson(text: string, opts?: ICompressOptions): ICompressionResult;
|
|
18
|
+
//# sourceMappingURL=compress-json.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"compress-json.d.ts","sourceRoot":"","sources":["../../src/json/compress-json.ts"],"names":[],"mappings":"AAEA,OAAO,KAAK,EAAE,kBAAkB,EAAE,MAAM,iCAAiC,CAAC;AAC1E,OAAO,KAAK,EAAE,gBAAgB,EAAE,MAAM,+BAA+B,CAAC;AAiEtE;;;;;;;;;;;;;GAaG;AACH,wBAAgB,YAAY,CAAC,IAAI,EAAE,MAAM,EAAE,IAAI,GAAE,gBAAqB,GAAG,kBAAkB,CA6D1F"}
|
|
@@ -0,0 +1,139 @@
|
|
|
1
|
+
import { EContentType } from "../content/content-type.js";
|
|
2
|
+
import { ECompressionStrategy } from "../result/compression-strategy.js";
|
|
3
|
+
import { estimateTokens, measureSavings } from "../tokens/estimate-tokens.js";
|
|
4
|
+
import { compactArrayToColumnar } from "../table/columnar-json.js";
|
|
5
|
+
import { compactObjectMap } from "../table/object-map.js";
|
|
6
|
+
import { sampleObjectArray } from "../table/sample-object-array.js";
|
|
7
|
+
import { compressLines } from "../text/compress-lines.js";
|
|
8
|
+
import { finalizeLossy, passthroughResult } from "../text/finalize.js";
|
|
9
|
+
// Matches one JSON number literal: optional sign, integer part, optional
// fraction, optional exponent. Global so it can drive `matchAll`.
const NUMBER_TOKEN = /-?\d+(?:\.\d+)?(?:[eE][+-]?\d+)?/g;
/**
 * Remove every JSON string (object keys AND string values, quotes included)
 * from `text`, leaving structural punctuation, whitespace, and bare literals
 * in place. After this, the only digit runs left are genuine JSON *number*
 * literals — a digit run that lived inside a string (a record id, a git SHA,
 * a numeric-looking code) is gone. Backslash escapes are honoured, so `"a\""`
 * is treated as a single string.
 *
 * @param {string} text - JSON source text.
 * @returns {string} The text with all string literals removed.
 */
function stripJsonStrings(text) {
    const out = [];
    let inString = false;
    for (let i = 0; i < text.length; i++) {
        const ch = text[i];
        if (inString) {
            if (ch === '\\') {
                i++; // skip the escaped char too, so `\"` cannot end the string
                continue;
            }
            if (ch === '"') {
                inString = false;
            }
            continue; // drop the string's contents (and its closing quote)
        }
        if (ch === '"') {
            inString = true;
            continue;
        }
        out.push(ch);
    }
    return out.join('');
}
/**
 * True if any JSON-number *literal* in the text would NOT survive a
 * parse→serialize round trip. A literal is risky when it:
 *
 *  - overflows to Infinity (which `JSON.stringify` emits as `null`);
 *  - underflows: a nonzero literal that parses to 0;
 *  - is a negative zero (`-0`, `-0.0`, `-0e0`): it parses to `-0`, but
 *    `JSON.stringify(-0)` emits `0`, silently dropping the sign;
 *  - carries more than 15 significant mantissa digits (precision loss for big
 *    integers AND decimal-split floats like `90071992547409.93`). Counting
 *    mantissa digits rather than a contiguous digit run is what catches floats
 *    whose digits straddle the dot.
 *
 * Only actual number literals are inspected — strings are stripped first,
 * because a numeric-looking STRING round-trips verbatim and is never at risk.
 * The check is deliberately conservative: a `true` result means the caller
 * should keep the original bytes rather than claim a false "lossless".
 *
 * @param {string} text - JSON source text (expected to be valid JSON).
 * @returns {boolean} Whether at least one number literal is unsafe to re-serialize.
 */
function hasRiskyNumber(text) {
    const scannable = stripJsonStrings(text);
    for (const match of scannable.matchAll(NUMBER_TOKEN)) {
        const token = match[0];
        const n = Number(token);
        if (!Number.isFinite(n)) {
            return true; // overflow → Infinity → null
        }
        if (n === 0 && /[1-9]/.test(token)) {
            return true; // underflow: nonzero literal → 0
        }
        if (Object.is(n, -0)) {
            return true; // negative zero: JSON.stringify(-0) === "0", sign is lost
        }
        // Significant digits = mantissa without exponent, sign, dot, or leading zeros.
        const sig = token.replace(/[eE].*$/, '').replace(/[-.]/g, '').replace(/^0+/, '');
        if (sig.length > 15) {
            return true;
        }
    }
    return false;
}
|
|
67
|
+
/**
 * Losslessly compress a JSON document.
 *
 * Processing order:
 *  1. Text that does not parse as JSON is handed to `compressLines` as plain text.
 *  2. If any number literal would not survive re-serialization, the original
 *     text is passed through untouched.
 *  3. An array is offered to `compactArrayToColumnar`; a non-array is offered
 *     to `compactObjectMap` (emitted inside an `_omap` envelope). When neither
 *     produces a compact form, the value is simply minified.
 *  4. Only for arrays, and only when an explicit positive `maxTokens` budget
 *     is still exceeded by the lossless form, the lossy sampler is tried as a
 *     last resort.
 *  5. If the lossless form does not save tokens, the original is passed through.
 *
 * (The dense text table from `renderCompactJson` is *not* used here: it
 * renders null / "" / absent identically, so it cannot carry the lossless
 * guarantee this function advertises.)
 */
export function compressJson(text, opts = {}) {
    let parsed;
    try {
        parsed = JSON.parse(text);
    }
    catch {
        // Not valid JSON: fall back to line-level dedup as plain text.
        return compressLines(text, EContentType.PlainText, opts);
    }
    const isArray = Array.isArray(parsed);
    // Honour a caller-forced content type only when it is one of the JSON kinds.
    const requestedType = opts.contentType;
    const inferredType = isArray ? EContentType.JsonArray : EContentType.Json;
    const honoursRequest = requestedType === EContentType.Json || requestedType === EContentType.JsonArray;
    const contentType = honoursRequest ? (requestedType ?? inferredType) : inferredType;
    if (hasRiskyNumber(text)) {
        return passthroughResult(text, contentType, 'precision-preserving passthrough');
    }
    // Arrays get the columnar treatment; everything else is tried as an
    // id-keyed object map (P2.3) — the array columnar's analogue.
    let columnar = null;
    let objectMap = null;
    if (isArray) {
        columnar = compactArrayToColumnar(parsed);
    }
    else {
        objectMap = compactObjectMap(parsed);
    }
    // Derive the serialized form together with the strategy/note describing it.
    let lossless;
    let strategy;
    let note;
    if (columnar) {
        lossless = JSON.stringify(columnar);
        strategy = ECompressionStrategy.Table;
        note = 'lossless columnar table (valid JSON; schema hoisted, keys deduped)';
    }
    else if (objectMap) {
        lossless = JSON.stringify({ _omap: objectMap });
        strategy = ECompressionStrategy.Table;
        note = 'lossless columnar object-map (valid JSON; schema hoisted, keys deduped)';
    }
    else {
        lossless = JSON.stringify(parsed) ?? 'null';
        strategy = ECompressionStrategy.MinifiedJson;
        note = 'minified JSON (whitespace removed)';
    }
    // Lossy sampling is a LAST resort: arrays only, and only when the lossless
    // form still exceeds an explicit positive `maxTokens` budget.
    const budget = opts.maxTokens;
    if (isArray && budget && budget > 0) {
        const overBudget = estimateTokens(lossless, contentType) > budget;
        if (overBudget) {
            const samplerOpts = {};
            if (opts.query !== undefined) {
                samplerOpts.query = opts.query;
            }
            if (opts.maxItems !== undefined) {
                samplerOpts.maxItems = opts.maxItems;
            }
            const sampled = sampleObjectArray(parsed, samplerOpts);
            if (sampled) {
                return finalizeLossy({
                    original: text,
                    body: JSON.stringify(sampled),
                    contentType,
                    strategy: ECompressionStrategy.Sample,
                    opts,
                    note: `${sampled._table.sample.dropped} of ${sampled._table.n} rows sampled`,
                });
            }
        }
    }
    const savings = measureSavings(text, lossless, contentType);
    if (savings.after >= savings.before) {
        // No gain from re-encoding: keep the caller's original bytes.
        return passthroughResult(text, contentType);
    }
    return {
        compressed: lossless,
        contentType,
        strategy,
        savings,
        lossy: false,
        note,
    };
}
|