@shrkcrft/compress 0.1.0-alpha.16

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (162) hide show
  1. package/LICENSE +21 -0
  2. package/README.md +21 -0
  3. package/dist/cache/align-volatile-tokens.d.ts +13 -0
  4. package/dist/cache/align-volatile-tokens.d.ts.map +1 -0
  5. package/dist/cache/align-volatile-tokens.js +51 -0
  6. package/dist/cache/alignment-map.d.ts +23 -0
  7. package/dist/cache/alignment-map.d.ts.map +1 -0
  8. package/dist/cache/alignment-map.js +1 -0
  9. package/dist/cache/alignment-result.d.ts +11 -0
  10. package/dist/cache/alignment-result.d.ts.map +1 -0
  11. package/dist/cache/alignment-result.js +1 -0
  12. package/dist/cache/detect-volatile-tokens.d.ts +10 -0
  13. package/dist/cache/detect-volatile-tokens.d.ts.map +1 -0
  14. package/dist/cache/detect-volatile-tokens.js +41 -0
  15. package/dist/cache/placeholder.d.ts +28 -0
  16. package/dist/cache/placeholder.d.ts.map +1 -0
  17. package/dist/cache/placeholder.js +0 -0
  18. package/dist/cache/restore-volatile-tokens.d.ts +10 -0
  19. package/dist/cache/restore-volatile-tokens.d.ts.map +1 -0
  20. package/dist/cache/restore-volatile-tokens.js +21 -0
  21. package/dist/cache/volatile-classify.d.ts +11 -0
  22. package/dist/cache/volatile-classify.d.ts.map +1 -0
  23. package/dist/cache/volatile-classify.js +35 -0
  24. package/dist/cache/volatile-kind.d.ts +13 -0
  25. package/dist/cache/volatile-kind.d.ts.map +1 -0
  26. package/dist/cache/volatile-kind.js +13 -0
  27. package/dist/cache/volatile-token.d.ts +14 -0
  28. package/dist/cache/volatile-token.d.ts.map +1 -0
  29. package/dist/cache/volatile-token.js +1 -0
  30. package/dist/ccr/ccr-entry.d.ts +13 -0
  31. package/dist/ccr/ccr-entry.d.ts.map +1 -0
  32. package/dist/ccr/ccr-entry.js +1 -0
  33. package/dist/ccr/ccr-key.d.ts +9 -0
  34. package/dist/ccr/ccr-key.d.ts.map +1 -0
  35. package/dist/ccr/ccr-key.js +19 -0
  36. package/dist/ccr/ccr-marker.d.ts +23 -0
  37. package/dist/ccr/ccr-marker.d.ts.map +1 -0
  38. package/dist/ccr/ccr-marker.js +30 -0
  39. package/dist/ccr/ccr-store.d.ts +18 -0
  40. package/dist/ccr/ccr-store.d.ts.map +1 -0
  41. package/dist/ccr/ccr-store.js +1 -0
  42. package/dist/ccr/file-ccr-store.d.ts +19 -0
  43. package/dist/ccr/file-ccr-store.d.ts.map +1 -0
  44. package/dist/ccr/file-ccr-store.js +53 -0
  45. package/dist/ccr/in-memory-ccr-store.d.ts +21 -0
  46. package/dist/ccr/in-memory-ccr-store.d.ts.map +1 -0
  47. package/dist/ccr/in-memory-ccr-store.js +45 -0
  48. package/dist/ccr/ttl-file-ccr-store.d.ts +43 -0
  49. package/dist/ccr/ttl-file-ccr-store.d.ts.map +1 -0
  50. package/dist/ccr/ttl-file-ccr-store.js +117 -0
  51. package/dist/code/compress-code.d.ts +4 -0
  52. package/dist/code/compress-code.d.ts.map +1 -0
  53. package/dist/code/compress-code.js +294 -0
  54. package/dist/compress-content.d.ts +11 -0
  55. package/dist/compress-content.d.ts.map +1 -0
  56. package/dist/compress-content.js +79 -0
  57. package/dist/content/content-type.d.ts +28 -0
  58. package/dist/content/content-type.d.ts.map +1 -0
  59. package/dist/content/content-type.js +28 -0
  60. package/dist/content/detect-content-type.d.ts +9 -0
  61. package/dist/content/detect-content-type.d.ts.map +1 -0
  62. package/dist/content/detect-content-type.js +184 -0
  63. package/dist/content/segment.d.ts +21 -0
  64. package/dist/content/segment.d.ts.map +1 -0
  65. package/dist/content/segment.js +117 -0
  66. package/dist/index.d.ts +61 -0
  67. package/dist/index.d.ts.map +1 -0
  68. package/dist/index.js +49 -0
  69. package/dist/json/compress-json.d.ts +18 -0
  70. package/dist/json/compress-json.d.ts.map +1 -0
  71. package/dist/json/compress-json.js +139 -0
  72. package/dist/json/render-compact-json.d.ts +10 -0
  73. package/dist/json/render-compact-json.d.ts.map +1 -0
  74. package/dist/json/render-compact-json.js +18 -0
  75. package/dist/relevance/bm25.d.ts +26 -0
  76. package/dist/relevance/bm25.d.ts.map +1 -0
  77. package/dist/relevance/bm25.js +115 -0
  78. package/dist/result/compress-options.d.ts +26 -0
  79. package/dist/result/compress-options.d.ts.map +1 -0
  80. package/dist/result/compress-options.js +1 -0
  81. package/dist/result/compression-result.d.ts +26 -0
  82. package/dist/result/compression-result.d.ts.map +1 -0
  83. package/dist/result/compression-result.js +1 -0
  84. package/dist/result/compression-strategy.d.ts +30 -0
  85. package/dist/result/compression-strategy.d.ts.map +1 -0
  86. package/dist/result/compression-strategy.js +30 -0
  87. package/dist/table/adaptive-size.d.ts +46 -0
  88. package/dist/table/adaptive-size.d.ts.map +1 -0
  89. package/dist/table/adaptive-size.js +170 -0
  90. package/dist/table/apply-value-dictionaries.d.ts +30 -0
  91. package/dist/table/apply-value-dictionaries.d.ts.map +1 -0
  92. package/dist/table/apply-value-dictionaries.js +99 -0
  93. package/dist/table/column-presence.d.ts +20 -0
  94. package/dist/table/column-presence.d.ts.map +1 -0
  95. package/dist/table/column-presence.js +52 -0
  96. package/dist/table/columnar-json.d.ts +24 -0
  97. package/dist/table/columnar-json.d.ts.map +1 -0
  98. package/dist/table/columnar-json.js +83 -0
  99. package/dist/table/columnar-table.d.ts +24 -0
  100. package/dist/table/columnar-table.d.ts.map +1 -0
  101. package/dist/table/columnar-table.js +1 -0
  102. package/dist/table/compact-object-array.d.ts +12 -0
  103. package/dist/table/compact-object-array.d.ts.map +1 -0
  104. package/dist/table/compact-object-array.js +88 -0
  105. package/dist/table/field-spec.d.ts +13 -0
  106. package/dist/table/field-spec.d.ts.map +1 -0
  107. package/dist/table/field-spec.js +1 -0
  108. package/dist/table/object-map.d.ts +28 -0
  109. package/dist/table/object-map.d.ts.map +1 -0
  110. package/dist/table/object-map.js +119 -0
  111. package/dist/table/render-table.d.ts +11 -0
  112. package/dist/table/render-table.d.ts.map +1 -0
  113. package/dist/table/render-table.js +39 -0
  114. package/dist/table/sample-object-array.d.ts +11 -0
  115. package/dist/table/sample-object-array.d.ts.map +1 -0
  116. package/dist/table/sample-object-array.js +171 -0
  117. package/dist/table/sample-options.d.ts +29 -0
  118. package/dist/table/sample-options.d.ts.map +1 -0
  119. package/dist/table/sample-options.js +1 -0
  120. package/dist/table/sampled-table.d.ts +33 -0
  121. package/dist/table/sampled-table.d.ts.map +1 -0
  122. package/dist/table/sampled-table.js +8 -0
  123. package/dist/table/table-compaction.d.ts +19 -0
  124. package/dist/table/table-compaction.d.ts.map +1 -0
  125. package/dist/table/table-compaction.js +1 -0
  126. package/dist/table/table-formats.d.ts +23 -0
  127. package/dist/table/table-formats.d.ts.map +1 -0
  128. package/dist/table/table-formats.js +233 -0
  129. package/dist/text/compress-diff.d.ts +20 -0
  130. package/dist/text/compress-diff.d.ts.map +1 -0
  131. package/dist/text/compress-diff.js +344 -0
  132. package/dist/text/compress-lines.d.ts +12 -0
  133. package/dist/text/compress-lines.d.ts.map +1 -0
  134. package/dist/text/compress-lines.js +44 -0
  135. package/dist/text/compress-log.d.ts +12 -0
  136. package/dist/text/compress-log.d.ts.map +1 -0
  137. package/dist/text/compress-log.js +202 -0
  138. package/dist/text/compress-markdown.d.ts +15 -0
  139. package/dist/text/compress-markdown.d.ts.map +1 -0
  140. package/dist/text/compress-markdown.js +96 -0
  141. package/dist/text/compress-search.d.ts +11 -0
  142. package/dist/text/compress-search.d.ts.map +1 -0
  143. package/dist/text/compress-search.js +78 -0
  144. package/dist/text/finalize.d.ts +21 -0
  145. package/dist/text/finalize.d.ts.map +1 -0
  146. package/dist/text/finalize.js +54 -0
  147. package/dist/text/line-utils.d.ts +20 -0
  148. package/dist/text/line-utils.d.ts.map +1 -0
  149. package/dist/text/line-utils.js +65 -0
  150. package/dist/text/lockfile-names.d.ts +3 -0
  151. package/dist/text/lockfile-names.d.ts.map +1 -0
  152. package/dist/text/lockfile-names.js +33 -0
  153. package/dist/text/log-template.d.ts +31 -0
  154. package/dist/text/log-template.d.ts.map +1 -0
  155. package/dist/text/log-template.js +239 -0
  156. package/dist/tokens/estimate-tokens.d.ts +17 -0
  157. package/dist/tokens/estimate-tokens.d.ts.map +1 -0
  158. package/dist/tokens/estimate-tokens.js +53 -0
  159. package/dist/tokens/token-savings.d.ts +20 -0
  160. package/dist/tokens/token-savings.d.ts.map +1 -0
  161. package/dist/tokens/token-savings.js +1 -0
  162. package/package.json +52 -0
@@ -0,0 +1,28 @@
1
/**
 * Coarse content classes recognised by the router. The detected class decides
 * which deterministic compressor runs. Members are listed loosely from the
 * most specific class to the least specific one.
 */
export var EContentType;
(function (members) {
    /** A JSON array (top-level `[ ... ]`). The table compactor's prime target. */
    members.JsonArray = "json-array";
    /** A JSON object or scalar (top-level `{ ... }` / value). */
    members.Json = "json";
    /** A unified/`git` diff. */
    members.GitDiff = "git-diff";
    /** grep/ripgrep `file:line:` style search output. */
    members.SearchResults = "search-results";
    /** Build / test / runtime log output. */
    members.BuildLog = "build-log";
    /** Source code in a recognised language. */
    members.SourceCode = "source-code";
    /** Markdown prose / docs. */
    members.Markdown = "markdown";
    /** YAML configuration / manifests (`key: value` mappings + `- ` lists). */
    members.Yaml = "yaml";
    /** Delimiter-separated values (CSV / TSV): a stable column count per line. */
    members.Csv = "csv";
    /** Anything else. */
    members.PlainText = "plain-text";
})(EContentType || (EContentType = {}));
@@ -0,0 +1,9 @@
1
+ import { EContentType } from './content-type.js';
2
+ /**
3
+ * Classify a blob deterministically. Order is significant: JSON is checked
4
+ * first (it round-trips cleanly through `JSON.parse`), then structural
5
+ * formats (diff/search), then heuristic ones (log/code/CSV/YAML/markdown), with
6
+ * plain text as the floor. Pure — same bytes in, same class out.
7
+ */
8
+ export declare function detectContentType(text: string): EContentType;
9
+ //# sourceMappingURL=detect-content-type.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"detect-content-type.d.ts","sourceRoot":"","sources":["../../src/content/detect-content-type.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,YAAY,EAAE,MAAM,mBAAmB,CAAC;AA0FjD;;;;;GAKG;AACH,wBAAgB,iBAAiB,CAAC,IAAI,EAAE,MAAM,GAAG,YAAY,CAkG5D"}
@@ -0,0 +1,184 @@
1
+ import { EContentType } from "./content-type.js";
2
// grep / ripgrep output: `path:line:` prefix, with an optional drive letter.
const SEARCH_LINE = /^(?:[A-Za-z]:)?[^\s:]+:\d+:/;
// Compiler diagnostics in the `path(line):` / `path(line,col):` family, e.g.
// tsc / MSVC `src/a.ts(10,5): error TS2322`. They are treated as search output
// rather than source code.
const DIAGNOSTIC_LINE = /^(?:[A-Za-z]:)?[^\s:()]+\(\d+(?:,\d+)?\):\s/;
// Unified-diff hunk header: `@@ -a,b +c,d @@`.
const DIFF_HUNK = /^@@ -\d+(?:,\d+)? \+\d+(?:,\d+)? @@/;
// YAML shapes: `key:` / `key: value` mappings, `- ` sequence items, comments,
// and the `---` / `...` document markers.
const YAML_KEY = /^\s*[\w.-]+:(?:\s|$)/;
const YAML_LINE = /^\s*(?:[\w.-]+:(?:\s|$)|-\s|#|---\s*$|\.\.\.\s*$)/;
// A key with no inline value (`items:`) plus an indented `- ` item is the
// list-heavy YAML shape; the detector uses the pair to tell such YAML apart
// from a Markdown bullet list that merely contains an incidental `Note: x`.
const YAML_BLOCK_KEY = /^\s*[\w.-]+:\s*$/;
const YAML_INDENTED_SEQ = /^\s{2,}-\s/;
// Log levels only count as a LINE PREFIX (optionally after a timestamp or a
// `proc[pid]:` tag), so identifiers such as `const ERROR = 500` do not match.
const LOG_MARKER = /^\s*(?:(?:\[?\d{4}-\d{2}-\d{2}[T ][\d:.,]+\]?|\S+\[\d+\]:)\s+)?\[?(?:ERROR|FATAL|FAIL(?:ED|URE)?|WARN(?:ING)?|INFO|DEBUG|NOTICE|TRACE)\b|^\S+\[\d+\]:\s|^\s*Traceback\b|^\s+at\s+\S+\s*\(|^\d{4}-\d{2}-\d{2}[T ]\d{2}:\d{2}/;
// Markdown line openers: ATX header, bullet, ordered item, fence, quote, table.
const MARKDOWN_MARKER = /^(?:#{1,6}\s|\s*[-*]\s|\s*\d+\.\s|```|>\s|\|)/;
// Individual code signals; CODE_MARKER below is the alternation of all of them.
const CODE_SIGNALS = [
    // declaration keywords
    /\b(?:function|const|let|var|class|interface|enum|import|export|def|return|public|private|func|impl|struct|package|namespace)\b/,
    // arrow
    /=>/,
    // scope resolution
    /::/,
    // decorator / annotation at line start
    /^\s*@\w+/,
    // a line made only of brackets / punctuation
    /^[\s{}()\[\];,]+$/,
    // typed return/param annotation: `): Foo {` / `): Foo =>` / `]: Bar =`
    /[)\]]\s*:\s*[A-Za-z_$][\w$.<>\[\], ]*\s*(?:=>|\{|=|$)/,
    // assignment statement ending in `;` (`[^=]` rejects `==` / `===`)
    /^\s*[A-Za-z_$][\w$.[\]]*\s*(?:\+|-|\*|\/|%|\?\?|\|\||&&|<<|>>|\||&|\^)?=\s*[^=].*;\s*$/,
    // member/method call anchored at line start: `obj.method(`
    /^\s*[A-Za-z_$][\w$]*\.[A-Za-z_$][\w$]*\s*\(/,
    // bare call statement anchored at line start and ending in `;`
    /^\s*[A-Za-z_$][\w$]*\([^()]*\)\s*;\s*$/,
    // control-flow header: `if (x) {`, `for (...)`, `while/switch/catch (...)`
    /^\s*(?:if|for|while|switch|catch)\s*\(.*\)\s*\{?\s*$/,
];
const CODE_MARKER = new RegExp(CODE_SIGNALS.map(({ source }) => source).join('|'));
47
/**
 * Fraction of `lines` accepted by `test`.
 *
 * @param {string[]} lines - Lines to examine.
 * @param {RegExp | ((line: string) => boolean)} test - A predicate, or a regex
 *   that is applied with `.test()`.
 * @returns {number} Hits divided by the line count; 0 for an empty list.
 */
function lineHitRatio(lines, test) {
    const total = lines.length;
    if (total === 0) {
        return 0;
    }
    const accepts = typeof test === 'function' ? test : (line) => test.test(line);
    const matched = lines.reduce((count, line) => (accepts(line) ? count + 1 : count), 0);
    return matched / total;
}
57
/**
 * Delimiter-separated values check: does the blob have a stable column count?
 *
 * For each candidate delimiter (`,` then tab) the per-line delimiter count is
 * tallied; the result is true when the most frequent count is at least 1 and is
 * shared by ≥90% of the lines. `;` is deliberately not a candidate, because
 * semicolon-terminated prose/code lines all have a count of 1 and would pass as
 * 2-column CSV.
 *
 * @param {string[]} nonBlank - The non-blank lines of the blob.
 * @returns {boolean} True when the lines look like CSV/TSV; always false for
 *   fewer than two lines.
 */
function looksDelimited(nonBlank) {
    const total = nonBlank.length;
    if (total < 2) {
        return false;
    }
    return [',', '\t'].some((delim) => {
        // delimiter count -> number of lines having that count
        const tally = new Map();
        for (const line of nonBlank) {
            const delimiters = line.split(delim).length - 1;
            tally.set(delimiters, (tally.get(delimiters) ?? 0) + 1);
        }
        // Most frequent count; ties go to the larger count.
        let modal = -1;
        let modalFreq = 0;
        for (const [count, freq] of tally) {
            const wins = freq > modalFreq || (freq === modalFreq && count > modal);
            if (wins) {
                modal = count;
                modalFreq = freq;
            }
        }
        // ≥2 columns means ≥1 delimiter, present on almost every line.
        return modal >= 1 && modalFreq / total >= 0.9;
    });
}
87
/**
 * Classify a blob deterministically. Order is significant: JSON is checked
 * first (only when it actually parses), then structural formats (diff/search),
 * then heuristic ones (log/code/CSV/YAML/markdown), with plain text as the
 * floor. Pure — same bytes in, same class out.
 *
 * @param {string} text - The blob to classify.
 * @returns {string} One of the `EContentType` values.
 */
export function detectContentType(text) {
    const trimmed = text.trim();
    if (trimmed.length === 0)
        return EContentType.PlainText;
    // 1. JSON — only when it actually parses, so we never mis-route prose that
    //    merely starts with a bracket.
    const first = trimmed[0];
    if (first === '[' || first === '{') {
        try {
            const parsed = JSON.parse(trimmed);
            // Text that opens with `[` / `{` and parses is always an array or an
            // object, so one array-vs-object decision covers every outcome.
            return Array.isArray(parsed) ? EContentType.JsonArray : EContentType.Json;
        }
        catch {
            // fall through — not valid JSON
        }
    }
    const lines = trimmed.split('\n');
    // 2. Git diff — a `diff --git` header or a clear hunk header set. Scan ALL
    //    `@@` lines (a leading malformed hunk must not defeat detection).
    if (/^diff --git /m.test(trimmed) ||
        (lines.some((l) => DIFF_HUNK.test(l)) &&
            /^--- /m.test(trimmed) &&
            /^\+\+\+ /m.test(trimmed))) {
        return EContentType.GitDiff;
    }
    // 3. grep / ripgrep search output (`path:line:` prefix) OR compiler
    //    diagnostics (`path(line,col):`). Count either shape toward the ratio.
    if (lineHitRatio(lines, (l) => SEARCH_LINE.test(l) || DIAGNOSTIC_LINE.test(l)) >= 0.6) {
        return EContentType.SearchResults;
    }
    // 4. Build / test log (error / warn / timestamp markers dense enough).
    if (lineHitRatio(lines, LOG_MARKER) >= 0.25)
        return EContentType.BuildLog;
    // 5. Source code — keyword / structural density over non-blank lines OUTSIDE
    //    fenced code blocks (a markdown doc's ``` examples must not be counted as
    //    the doc's own code). A real source file has no fences, so its basis is
    //    unchanged. EOL punctuation alone must NOT count (prose/config ending in
    //    `;` is not code).
    // Only TOP-LEVEL fences (CommonMark allows ≤3 leading spaces) count — an
    // indented backtick line shown as a prose example must not toggle the fence
    // state and skew the balance check.
    const fenceRe = /^ {0,3}(?:```|~~~)/;
    // Only trust fence exclusion when fences are balanced. An odd (unterminated)
    // count — e.g. a stray ``` inside a source file's string/comment — would
    // otherwise flip `inFence` forever and exclude the rest of the file.
    const fenceCount = lines.reduce((n, l) => (fenceRe.test(l) ? n + 1 : n), 0);
    const excludeFences = fenceCount > 0 && fenceCount % 2 === 0;
    let inFence = false;
    const codeBasis = [];
    for (const l of lines) {
        if (excludeFences && fenceRe.test(l)) {
            inFence = !inFence;
            continue;
        }
        if (!inFence && l.trim().length > 0)
            codeBasis.push(l);
    }
    const codeRatio = codeBasis.length > 0 ? lineHitRatio(codeBasis, CODE_MARKER) : 0;
    if (codeRatio >= 0.45)
        return EContentType.SourceCode;
    const nonBlank = lines.filter((l) => l.trim().length > 0);
    // 5b. CSV / TSV — a stable column count per line (checked before YAML/markdown
    //     so a 2-column file isn't mistaken for `key: value` or a list).
    if (looksDelimited(nonBlank))
        return EContentType.Csv;
    // 5c. YAML — ≥80% of non-blank lines are YAML-shaped AND either ≥30% are
    //     actual `key:` mappings (mapping-heavy config) OR there's a block-key →
    //     indented-sequence shape (list-heavy config). Both reject a plain
    //     Markdown bullet list. Checked before markdown, which would otherwise
    //     grab YAML's `- ` sequence items and lossily cap them.
    if (nonBlank.length >= 2) {
        const yamlShaped = lineHitRatio(nonBlank, YAML_LINE);
        const keyDensity = lineHitRatio(nonBlank, YAML_KEY);
        const blockSeq = nonBlank.some((l) => YAML_BLOCK_KEY.test(l)) && nonBlank.some((l) => YAML_INDENTED_SEQ.test(l));
        if (yamlShaped >= 0.8 && (keyDensity >= 0.3 || blockSeq))
            return EContentType.Yaml;
    }
    // 6. Markdown — a marker-dense blob, OR a prose doc with ≥2 ATX headers. The
    //    header rule is gated so a commented script (Python/shell `# …` lines, or
    //    a `#!`-shebang file) with low code-syntax density isn't mistaken for a doc.
    const headerCount = lines.reduce((n, l) => (/^#{1,6}\s/.test(l) ? n + 1 : n), 0);
    const looksLikeScript = (lines[0] ?? '').startsWith('#!');
    if (lineHitRatio(lines, MARKDOWN_MARKER) >= 0.3 ||
        (headerCount >= 2 && codeRatio < 0.15 && !looksLikeScript)) {
        return EContentType.Markdown;
    }
    return EContentType.PlainText;
}
@@ -0,0 +1,21 @@
1
+ import { EContentType } from './content-type.js';
2
+ /**
3
+ * A typed run of a mixed blob. {@link segmentContent} splits a heterogeneous
4
+ * dump — prose interleaved with a JSON block and a stack trace, say — into
5
+ * contiguous runs so each can be compressed by its own strategy instead of
6
+ * forcing the whole blob through one. (P4.3)
7
+ */
8
+ export interface IContentSegment {
9
+ type: EContentType;
10
+ text: string;
11
+ }
12
+ /**
13
+ * Split `text` into typed segments. Contiguous multi-line JSON blocks are
14
+ * isolated; the remaining lines are grouped into runs of one coarse class
15
+ * (blank lines extend the current run), and each run's real type is detected.
16
+ * A single-type blob yields exactly one segment.
17
+ */
18
+ export declare function segmentContent(text: string): IContentSegment[];
19
+ /** Content classes that have a dedicated, materially-better compressor. */
20
+ export declare function isRichSegmentType(type: EContentType): boolean;
21
+ //# sourceMappingURL=segment.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"segment.d.ts","sourceRoot":"","sources":["../../src/content/segment.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,YAAY,EAAE,MAAM,mBAAmB,CAAC;AAIjD;;;;;GAKG;AACH,MAAM,WAAW,eAAe;IAC9B,IAAI,EAAE,YAAY,CAAC;IACnB,IAAI,EAAE,MAAM,CAAC;CACd;AA0DD;;;;;GAKG;AACH,wBAAgB,cAAc,CAAC,IAAI,EAAE,MAAM,GAAG,eAAe,EAAE,CAoC9D;AAED,2EAA2E;AAC3E,wBAAgB,iBAAiB,CAAC,IAAI,EAAE,YAAY,GAAG,OAAO,CAS7D"}
@@ -0,0 +1,117 @@
1
+ import { EContentType } from "./content-type.js";
2
+ import { detectContentType } from "./detect-content-type.js";
3
+ import { splitLines } from "../text/line-utils.js";
4
/**
 * Ordered rule table for {@link coarseClass}: the first class with a matching
 * pattern wins.
 */
const COARSE_LINE_RULES = [
    ['diff', [/^(?:diff --git |@@ |--- |\+\+\+ )/]],
    ['search', [/^(?:[A-Za-z]:)?[^\s:]+:\d+:/]],
    ['log', [
            /^\s*\[?\d{4}-\d{2}-\d{2}[T ]/,
            /^\s*\[?(?:ERROR|FATAL|FAIL(?:ED|URE)?|WARN(?:ING)?|INFO|DEBUG|NOTICE|TRACE)\b/,
            /^\s+at\s+\S/,
            /^\s*Traceback\b/,
            /^[\w.$]*(?:Error|Exception):/,
        ]],
];
/**
 * Coarse per-line class used to group adjacent non-JSON lines.
 *
 * @param {string} line - A single line of the blob.
 * @returns {'diff' | 'search' | 'log' | 'prose'} The first matching class, or
 *   `'prose'` when nothing matches.
 */
function coarseClass(line) {
    const rule = COARSE_LINE_RULES.find(([, patterns]) => patterns.some((re) => re.test(line)));
    return rule ? rule[0] : 'prose';
}
19
/**
 * Locate the end of a multi-line JSON value that opens on line `start`.
 *
 * Brackets are counted while skipping string contents (escape-aware), so braces
 * inside strings do not unbalance the scan. At most 2000 lines are examined.
 *
 * @param {string[]} lines - All lines of the blob.
 * @param {number} start - Index of the candidate opening line.
 * @returns {number | null} Index of the line on which the value closes, when
 *   the spanned text parses as JSON. Null when the line does not begin with
 *   `{`/`[`, when the value closes on the opening line itself, when the spanned
 *   text fails to parse, or when no balanced close is found within the limit.
 */
function findJsonBlock(lines, start) {
    const firstChar = (lines[start] ?? '').trimStart().charAt(0);
    if (firstChar !== '{' && firstChar !== '[') {
        return null;
    }
    const lastExclusive = Math.min(lines.length, start + 2000);
    let nesting = 0;
    let insideString = false;
    let escaped = false;
    let sawOpener = false;
    for (let row = start; row < lastExclusive; row += 1) {
        const line = lines[row] ?? '';
        for (let col = 0; col < line.length; col += 1) {
            const ch = line[col];
            if (insideString) {
                if (escaped) {
                    escaped = false;
                }
                else if (ch === '\\') {
                    escaped = true;
                }
                else if (ch === '"') {
                    insideString = false;
                }
                continue;
            }
            switch (ch) {
                case '"':
                    insideString = true;
                    break;
                case '{':
                case '[':
                    nesting += 1;
                    sawOpener = true;
                    break;
                case '}':
                case ']':
                    nesting -= 1;
                    break;
                default:
                    break;
            }
        }
        if (!sawOpener || nesting > 0) {
            continue;
        }
        if (row === start) {
            // single-line JSON stays inline with prose
            return null;
        }
        try {
            JSON.parse(lines.slice(start, row + 1).join('\n').trim());
        }
        catch {
            return null;
        }
        return row;
    }
    return null;
}
67
/**
 * Split `text` into typed segments. Each multi-line JSON block is isolated as
 * its own segment; the remaining lines are grouped into runs of one coarse
 * class (blank lines extend the current run), and each run's real type is then
 * detected from its text.
 *
 * @param {string} text - The possibly heterogeneous blob.
 * @returns {{ type: string, text: string }[]} Segments in source order; joining
 *   their `text` values with `'\n'` reproduces the split lines.
 */
export function segmentContent(text) {
    const lines = splitLines(text);
    const n = lines.length;
    const cls = new Array(n).fill('');
    let i = 0;
    while (i < n) {
        const trimmed = (lines[i] ?? '').trimStart();
        if (trimmed.startsWith('{') || trimmed.startsWith('[')) {
            const end = findJsonBlock(lines, i);
            if (end !== null) {
                // Tag the block with its start line. A shared 'json' class would
                // let two back-to-back JSON values merge into one group whose
                // concatenated text no longer parses and gets misrouted.
                const tag = `json:${i}`;
                for (let k = i; k <= end; k += 1)
                    cls[k] = tag;
                i = end + 1;
                continue;
            }
        }
        cls[i] = coarseClass(lines[i] ?? '');
        i += 1;
    }
    // Group consecutive same-class lines; blank lines extend the current group.
    const groups = [];
    for (let idx = 0; idx < n; idx += 1) {
        const blank = (lines[idx] ?? '').trim().length === 0;
        const last = groups[groups.length - 1];
        if (last && (blank || cls[idx] === cls[last.start])) {
            last.end = idx + 1;
        }
        else {
            groups.push({ start: idx, end: idx + 1 });
        }
    }
    return groups.map(({ start, end }) => {
        const segText = lines.slice(start, end).join('\n');
        return { type: detectContentType(segText), text: segText };
    });
}
109
/**
 * Tell whether a content class has a dedicated, materially-better compressor.
 *
 * @param {string} type - An `EContentType` value.
 * @returns {boolean} True for JSON, JSON arrays, build logs, git diffs, search
 *   results and source code; false for every other value.
 */
export function isRichSegmentType(type) {
    switch (type) {
        case EContentType.Json:
        case EContentType.JsonArray:
        case EContentType.BuildLog:
        case EContentType.GitDiff:
        case EContentType.SearchResults:
        case EContentType.SourceCode:
            return true;
        default:
            return false;
    }
}
@@ -0,0 +1,61 @@
1
+ /**
2
+ * `@shrkcrft/compress` — SharkCraft's deterministic context-compression
3
+ * engine. Built to honour the engine's hard rule: no model inside. Every
4
+ * transform is a pure function of its input — content routing, lossless
5
+ * columnar/table compaction of object arrays, log/search/diff/line reduction,
6
+ * and reversible Compress-Cache-Retrieve (CCR). Used by the CLI, MCP server,
7
+ * and inspector to cut the tokens an agent pays for the same information.
8
+ */
9
+ export { estimateTokens, measureSavings } from './tokens/estimate-tokens.js';
10
+ export type { ITokenSavings } from './tokens/token-savings.js';
11
+ export { EContentType } from './content/content-type.js';
12
+ export { detectContentType } from './content/detect-content-type.js';
13
+ export type { IContentSegment } from './content/segment.js';
14
+ export { segmentContent, isRichSegmentType } from './content/segment.js';
15
+ export type { ICcrEntry } from './ccr/ccr-entry.js';
16
+ export type { ICcrStore } from './ccr/ccr-store.js';
17
+ export { ccrKey } from './ccr/ccr-key.js';
18
+ export { CCR_MARKER_RE, formatCcrMarker, parseCcrMarkers } from './ccr/ccr-marker.js';
19
+ export type { ICcrMarkerRef } from './ccr/ccr-marker.js';
20
+ export { InMemoryCcrStore } from './ccr/in-memory-ccr-store.js';
21
+ export { FileCcrStore } from './ccr/file-ccr-store.js';
22
+ export type { ITtlFileCcrStoreOptions } from './ccr/ttl-file-ccr-store.js';
23
+ export { TtlFileCcrStore } from './ccr/ttl-file-ccr-store.js';
24
+ export type { IFieldSpec } from './table/field-spec.js';
25
+ export type { ITableCompaction } from './table/table-compaction.js';
26
+ export { compactObjectArray } from './table/compact-object-array.js';
27
+ export type { IColumnarTable } from './table/columnar-table.js';
28
+ export { tableToColumnar, compactArrayToColumnar, isColumnarTable, expandColumnar, } from './table/columnar-json.js';
29
+ export { renderTable } from './table/render-table.js';
30
+ export { renderCompactJson } from './json/render-compact-json.js';
31
+ export { compressJson } from './json/compress-json.js';
32
+ export { columnarToCsv, csvToObjects, columnarToMarkdownKv, markdownKvToObjects, } from './table/table-formats.js';
33
+ export type { IObjectMap } from './table/object-map.js';
34
+ export { compactObjectMap, expandObjectMap, isObjectMap } from './table/object-map.js';
35
+ export type { AdaptiveBias, IAdaptiveOptions } from './table/adaptive-size.js';
36
+ export { computeOptimalK, simhash, hammingDistance, kneedle, bigramCoverageCurve } from './table/adaptive-size.js';
37
+ export type { IBm25Options } from './relevance/bm25.js';
38
+ export { bm25Scores, topByBm25 } from './relevance/bm25.js';
39
+ export type { ISampleOptions } from './table/sample-options.js';
40
+ export type { ISampledTable } from './table/sampled-table.js';
41
+ export { isSampledTable } from './table/sampled-table.js';
42
+ export { sampleObjectArray } from './table/sample-object-array.js';
43
+ export { ECompressionStrategy } from './result/compression-strategy.js';
44
+ export type { ICompressionResult } from './result/compression-result.js';
45
+ export type { ICompressOptions } from './result/compress-options.js';
46
+ export { compressLog } from './text/compress-log.js';
47
+ export { compressSearch } from './text/compress-search.js';
48
+ export { compressDiff } from './text/compress-diff.js';
49
+ export { compressLines } from './text/compress-lines.js';
50
+ export { compressMarkdown } from './text/compress-markdown.js';
51
+ export { compressCode } from './code/compress-code.js';
52
+ export { EVolatileKind } from './cache/volatile-kind.js';
53
+ export type { IVolatileToken } from './cache/volatile-token.js';
54
+ export { detectVolatileTokens } from './cache/detect-volatile-tokens.js';
55
+ export { PLACEHOLDER_RE, formatPlaceholder } from './cache/placeholder.js';
56
+ export type { IAlignmentBinding, IAlignmentMap } from './cache/alignment-map.js';
57
+ export type { IAlignmentResult } from './cache/alignment-result.js';
58
+ export { alignVolatileTokens } from './cache/align-volatile-tokens.js';
59
+ export { restoreVolatileTokens } from './cache/restore-volatile-tokens.js';
60
+ export { compressContent } from './compress-content.js';
61
+ //# sourceMappingURL=index.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../src/index.ts"],"names":[],"mappings":"AAAA;;;;;;;GAOG;AAGH,OAAO,EAAE,cAAc,EAAE,cAAc,EAAE,MAAM,6BAA6B,CAAC;AAC7E,YAAY,EAAE,aAAa,EAAE,MAAM,2BAA2B,CAAC;AAG/D,OAAO,EAAE,YAAY,EAAE,MAAM,2BAA2B,CAAC;AACzD,OAAO,EAAE,iBAAiB,EAAE,MAAM,kCAAkC,CAAC;AACrE,YAAY,EAAE,eAAe,EAAE,MAAM,sBAAsB,CAAC;AAC5D,OAAO,EAAE,cAAc,EAAE,iBAAiB,EAAE,MAAM,sBAAsB,CAAC;AAGzE,YAAY,EAAE,SAAS,EAAE,MAAM,oBAAoB,CAAC;AACpD,YAAY,EAAE,SAAS,EAAE,MAAM,oBAAoB,CAAC;AACpD,OAAO,EAAE,MAAM,EAAE,MAAM,kBAAkB,CAAC;AAC1C,OAAO,EAAE,aAAa,EAAE,eAAe,EAAE,eAAe,EAAE,MAAM,qBAAqB,CAAC;AACtF,YAAY,EAAE,aAAa,EAAE,MAAM,qBAAqB,CAAC;AACzD,OAAO,EAAE,gBAAgB,EAAE,MAAM,8BAA8B,CAAC;AAChE,OAAO,EAAE,YAAY,EAAE,MAAM,yBAAyB,CAAC;AACvD,YAAY,EAAE,uBAAuB,EAAE,MAAM,6BAA6B,CAAC;AAC3E,OAAO,EAAE,eAAe,EAAE,MAAM,6BAA6B,CAAC;AAG9D,YAAY,EAAE,UAAU,EAAE,MAAM,uBAAuB,CAAC;AACxD,YAAY,EAAE,gBAAgB,EAAE,MAAM,6BAA6B,CAAC;AACpE,OAAO,EAAE,kBAAkB,EAAE,MAAM,iCAAiC,CAAC;AACrE,YAAY,EAAE,cAAc,EAAE,MAAM,2BAA2B,CAAC;AAChE,OAAO,EACL,eAAe,EACf,sBAAsB,EACtB,eAAe,EACf,cAAc,GACf,MAAM,0BAA0B,CAAC;AAClC,OAAO,EAAE,WAAW,EAAE,MAAM,yBAAyB,CAAC;AACtD,OAAO,EAAE,iBAAiB,EAAE,MAAM,+BAA+B,CAAC;AAClE,OAAO,EAAE,YAAY,EAAE,MAAM,yBAAyB,CAAC;AAGvD,OAAO,EACL,aAAa,EACb,YAAY,EACZ,oBAAoB,EACpB,mBAAmB,GACpB,MAAM,0BAA0B,CAAC;AAGlC,YAAY,EAAE,UAAU,EAAE,MAAM,uBAAuB,CAAC;AACxD,OAAO,EAAE,gBAAgB,EAAE,eAAe,EAAE,WAAW,EAAE,MAAM,uBAAuB,CAAC;AAGvF,YAAY,EAAE,YAAY,EAAE,gBAAgB,EAAE,MAAM,0BAA0B,CAAC;AAC/E,OAAO,EAAE,eAAe,EAAE,OAAO,EAAE,eAAe,EAAE,OAAO,EAAE,mBAAmB,EAAE,MAAM,0BAA0B,CAAC;AAGnH,YAAY,EAAE,YAAY,EAAE,MAAM,qBAAqB,CAAC;AACxD,OAAO,EAAE,UAAU,EAAE,SAAS,EAAE,MAAM,qBAAqB,CAAC;AAG5D,YAAY,EAAE,cAAc,EAAE,MAAM,2BAA2B,CAAC;AAChE,YAAY,EAAE,aAAa,EAAE,MAAM,0BAA0B,CAAC;AAC9D,OAAO,EAAE,cAAc,EAAE,MAAM,0BAA0B,CAAC;AAC1D,OAAO,EAAE,iBAAiB,EAAE,MAAM,gCAAgC,CAAC;AAGnE,OAAO,EAAE,oBAAoB,EAAE,MAAM,kCAAkC,CAAC;AACxE,YAAY,EAAE,kBAAkB,EAAE,MAAM,gCAAgC,CAAC;AACzE,YAAY,EAAE,gBAAgB,EAAE,MAAM,8BAA8B,CAAC;AAGrE,OAAO,EAAE,WAAW,EAAE,MAAM,wBAAwB,CAAC;AACrD,OAAO,EAAE,cAAc,EAAE,MAAM
,2BAA2B,CAAC;AAC3D,OAAO,EAAE,YAAY,EAAE,MAAM,yBAAyB,CAAC;AACvD,OAAO,EAAE,aAAa,EAAE,MAAM,0BAA0B,CAAC;AACzD,OAAO,EAAE,gBAAgB,EAAE,MAAM,6BAA6B,CAAC;AAG/D,OAAO,EAAE,YAAY,EAAE,MAAM,yBAAyB,CAAC;AAGvD,OAAO,EAAE,aAAa,EAAE,MAAM,0BAA0B,CAAC;AACzD,YAAY,EAAE,cAAc,EAAE,MAAM,2BAA2B,CAAC;AAChE,OAAO,EAAE,oBAAoB,EAAE,MAAM,mCAAmC,CAAC;AACzE,OAAO,EAAE,cAAc,EAAE,iBAAiB,EAAE,MAAM,wBAAwB,CAAC;AAC3E,YAAY,EAAE,iBAAiB,EAAE,aAAa,EAAE,MAAM,0BAA0B,CAAC;AACjF,YAAY,EAAE,gBAAgB,EAAE,MAAM,6BAA6B,CAAC;AACpE,OAAO,EAAE,mBAAmB,EAAE,MAAM,kCAAkC,CAAC;AACvE,OAAO,EAAE,qBAAqB,EAAE,MAAM,oCAAoC,CAAC;AAG3E,OAAO,EAAE,eAAe,EAAE,MAAM,uBAAuB,CAAC"}
package/dist/index.js ADDED
@@ -0,0 +1,49 @@
1
+ /**
2
+ * `@shrkcrft/compress` — SharkCraft's deterministic context-compression
3
+ * engine. Built to honour the engine's hard rule: no model inside. Every
4
+ * transform is a pure function of its input — content routing, lossless
5
+ * columnar/table compaction of object arrays, log/search/diff/line reduction,
6
+ * and reversible Compress-Cache-Retrieve (CCR). Used by the CLI, MCP server,
7
+ * and inspector to cut the tokens an agent pays for the same information.
8
+ */
9
+ // Tokens / accounting
10
+ export { estimateTokens, measureSavings } from "./tokens/estimate-tokens.js";
11
+ // Content routing
12
+ export { EContentType } from "./content/content-type.js";
13
+ export { detectContentType } from "./content/detect-content-type.js";
14
+ export { segmentContent, isRichSegmentType } from "./content/segment.js";
15
+ export { ccrKey } from "./ccr/ccr-key.js";
16
+ export { CCR_MARKER_RE, formatCcrMarker, parseCcrMarkers } from "./ccr/ccr-marker.js";
17
+ export { InMemoryCcrStore } from "./ccr/in-memory-ccr-store.js";
18
+ export { FileCcrStore } from "./ccr/file-ccr-store.js";
19
+ export { TtlFileCcrStore } from "./ccr/ttl-file-ccr-store.js";
20
+ export { compactObjectArray } from "./table/compact-object-array.js";
21
+ export { tableToColumnar, compactArrayToColumnar, isColumnarTable, expandColumnar, } from "./table/columnar-json.js";
22
+ export { renderTable } from "./table/render-table.js";
23
+ export { renderCompactJson } from "./json/render-compact-json.js";
24
+ export { compressJson } from "./json/compress-json.js";
25
+ // Read-accuracy table encodings (P4.2): reversible CSV / Markdown-KV views.
26
+ export { columnarToCsv, csvToObjects, columnarToMarkdownKv, markdownKvToObjects, } from "./table/table-formats.js";
27
+ export { compactObjectMap, expandObjectMap, isObjectMap } from "./table/object-map.js";
28
+ export { computeOptimalK, simhash, hammingDistance, kneedle, bigramCoverageCurve } from "./table/adaptive-size.js";
29
+ export { bm25Scores, topByBm25 } from "./relevance/bm25.js";
30
+ export { isSampledTable } from "./table/sampled-table.js";
31
+ export { sampleObjectArray } from "./table/sample-object-array.js";
32
+ // Result shapes / options
33
+ export { ECompressionStrategy } from "./result/compression-strategy.js";
34
+ // Text compressors
35
+ export { compressLog } from "./text/compress-log.js";
36
+ export { compressSearch } from "./text/compress-search.js";
37
+ export { compressDiff } from "./text/compress-diff.js";
38
+ export { compressLines } from "./text/compress-lines.js";
39
+ export { compressMarkdown } from "./text/compress-markdown.js";
40
+ // Code-aware compression (outline: keep imports/types/signatures, elide bodies)
41
+ export { compressCode } from "./code/compress-code.js";
42
+ // Cache alignment — volatile-token detection + active reversible substitution
43
+ export { EVolatileKind } from "./cache/volatile-kind.js";
44
+ export { detectVolatileTokens } from "./cache/detect-volatile-tokens.js";
45
+ export { PLACEHOLDER_RE, formatPlaceholder } from "./cache/placeholder.js";
46
+ export { alignVolatileTokens } from "./cache/align-volatile-tokens.js";
47
+ export { restoreVolatileTokens } from "./cache/restore-volatile-tokens.js";
48
+ // Router
49
+ export { compressContent } from "./compress-content.js";
@@ -0,0 +1,18 @@
1
+ import type { ICompressionResult } from '../result/compression-result.js';
2
+ import type { ICompressOptions } from '../result/compress-options.js';
3
+ /**
4
+ * Compress JSON losslessly. A homogeneous object array becomes a *columnar*
5
+ * encoding — the shared schema is hoisted once and each row carries only
6
+ * values — which is still valid JSON and exactly reconstructable via
7
+ * `expandColumnar` (absent keys, nulls and empty strings are all preserved
8
+ * distinctly). Anything else is minified. No detail is dropped, so no CCR
9
+ * marker is needed. Falls back to line dedup if the text isn't valid JSON, and
10
+ * passes through untouched when re-serialization would lose precision (integers
11
+ * beyond 2^53). An array still over an explicit `opts.maxTokens` budget may instead be row-sampled (lossy).
12
+ *
13
+ * (The dense text table from `renderCompactJson` is *not* used here: it
14
+ * renders null / "" / absent identically, so it cannot carry the lossless
15
+ * guarantee this function advertises.)
16
+ */
17
+ export declare function compressJson(text: string, opts?: ICompressOptions): ICompressionResult;
18
+ //# sourceMappingURL=compress-json.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"compress-json.d.ts","sourceRoot":"","sources":["../../src/json/compress-json.ts"],"names":[],"mappings":"AAEA,OAAO,KAAK,EAAE,kBAAkB,EAAE,MAAM,iCAAiC,CAAC;AAC1E,OAAO,KAAK,EAAE,gBAAgB,EAAE,MAAM,+BAA+B,CAAC;AAiEtE;;;;;;;;;;;;;GAaG;AACH,wBAAgB,YAAY,CAAC,IAAI,EAAE,MAAM,EAAE,IAAI,GAAE,gBAAqB,GAAG,kBAAkB,CA6D1F"}
@@ -0,0 +1,139 @@
1
+ import { EContentType } from "../content/content-type.js";
2
+ import { ECompressionStrategy } from "../result/compression-strategy.js";
3
+ import { estimateTokens, measureSavings } from "../tokens/estimate-tokens.js";
4
+ import { compactArrayToColumnar } from "../table/columnar-json.js";
5
+ import { compactObjectMap } from "../table/object-map.js";
6
+ import { sampleObjectArray } from "../table/sample-object-array.js";
7
+ import { compressLines } from "../text/compress-lines.js";
8
+ import { finalizeLossy, passthroughResult } from "../text/finalize.js";
9
+ const NUMBER_TOKEN = /-?\d+(?:\.\d+)?(?:[eE][+-]?\d+)?/g; // number-literal shape: optional "-", digits, optional fraction, optional exponent; the `g` flag is required by matchAll
10
/**
 * Remove every double-quoted JSON string (keys and values alike, quotes
 * included) from `text`, keeping all other characters exactly as they are.
 * What remains is structural punctuation, whitespace and bare literals, so any
 * digit run still present cannot have come from inside a string. A backslash
 * inside a string consumes the character after it, which keeps an escaped
 * quote such as `"a\""` from ending the string early. An unterminated string
 * simply swallows the rest of the input.
 *
 * @param {string} text Raw JSON text.
 * @returns {string} The text with all string literals deleted.
 */
function stripJsonStrings(text) {
    const end = text.length;
    let kept = '';
    let pos = 0;
    while (pos < end) {
        const current = text[pos];
        if (current !== '"') {
            // Outside any string: copy the character through untouched.
            kept += current;
            pos += 1;
            continue;
        }
        // Opening quote: advance to the matching closing quote, hopping over
        // backslash escapes two characters at a time.
        pos += 1;
        while (pos < end && text[pos] !== '"') {
            pos += text[pos] === '\\' ? 2 : 1;
        }
        // Step past the closing quote (or off the end when unterminated).
        pos += 1;
    }
    return kept;
}
40
/**
 * Report whether any JSON *number literal* in `text` would change value when
 * pushed through `JSON.parse` followed by `JSON.stringify`.
 *
 * A literal is considered risky when it:
 *  - overflows to ±Infinity (which serializes as `null`);
 *  - underflows to zero although its mantissa is nonzero;
 *  - is a negative zero (`-0`, `-0.0`, ...): it parses to `-0` but is
 *    serialized as `0`, so the sign is lost;
 *  - carries more than 15 significant mantissa digits, which covers both big
 *    integers and floats whose digits straddle the decimal point
 *    (e.g. `90071992547409.93`).
 *
 * String contents are stripped before scanning, so numeric-looking strings
 * (ids, SHAs, codes) never trigger a positive: they round-trip verbatim.
 * The check is deliberately conservative; a `true` result means the caller
 * should keep the original bytes rather than claim a lossless rewrite.
 *
 * @param {string} text Raw JSON text.
 * @returns {boolean} `true` if at least one number literal is at risk.
 */
function hasRiskyNumber(text) {
    const scannable = stripJsonStrings(text);
    for (const [token] of scannable.matchAll(NUMBER_TOKEN)) {
        const value = Number(token);
        if (!Number.isFinite(value)) {
            return true; // overflow → Infinity → null
        }
        // Significant digits of the mantissa only: drop the exponent, the sign
        // and the dot, then any leading zeros.
        const mantissaDigits = token
            .replace(/[eE].*$/, '')
            .replace(/[-.]/g, '')
            .replace(/^0+/, '');
        if (value === 0) {
            // A nonzero mantissa that evaluates to zero has underflowed. Looking
            // at the mantissa (not the whole token) keeps exponent digits such
            // as the 5 in `0e5` from being mistaken for a nonzero value.
            if (mantissaDigits.length > 0) {
                return true;
            }
            // A signed zero parses to -0, which JSON.stringify emits as "0".
            if (token.startsWith('-')) {
                return true;
            }
            continue;
        }
        if (mantissaDigits.length > 15) {
            return true;
        }
    }
    return false;
}
67
/**
 * Compress a JSON document, losslessly whenever possible.
 *
 * Pipeline, in order:
 *  1. Text that does not parse as JSON is handed to `compressLines` as plain
 *     text.
 *  2. Text containing a number literal that would not survive re-serialization
 *     (see `hasRiskyNumber`) is passed through byte-for-byte.
 *  3. An array is offered to `compactArrayToColumnar`, a non-array to
 *     `compactObjectMap` (wrapped in an `_omap` envelope); if neither applies
 *     the parsed value is simply re-serialized without whitespace. All three
 *     outputs are valid JSON.
 *  4. Only when the input is an array, an explicit positive `opts.maxTokens`
 *     is given, and the lossless form still exceeds it, the rows are sampled
 *     via `sampleObjectArray` and the result is finalized as lossy.
 *  5. If the lossless form is not smaller than the input, the input is passed
 *     through unchanged.
 *
 * (The dense text table from `renderCompactJson` is intentionally not used:
 * it renders null / "" / absent identically and so cannot be lossless.)
 *
 * @param {string} text Raw JSON text.
 * @param {object} [opts] Options; this function reads `contentType`,
 *   `maxTokens`, `query` and `maxItems`, and forwards `opts` to the helpers.
 * @returns {object} A compression result describing the chosen output.
 */
export function compressJson(text, opts = {}) {
    let value;
    try {
        value = JSON.parse(text);
    }
    catch {
        // Not JSON at all: treat it as plain text lines.
        return compressLines(text, EContentType.PlainText, opts);
    }
    const isArray = Array.isArray(value);
    // A caller-supplied content type is honoured only if it is a JSON flavour.
    const requested = opts.contentType;
    const honoured = requested === EContentType.Json || requested === EContentType.JsonArray
        ? requested
        : undefined;
    const contentType = honoured ?? (isArray ? EContentType.JsonArray : EContentType.Json);
    if (hasRiskyNumber(text)) {
        return passthroughResult(text, contentType, 'precision-preserving passthrough');
    }
    // Arrays may hoist to a columnar table; id-keyed objects to an `_omap`.
    const columnar = isArray ? compactArrayToColumnar(value) : null;
    const objectMap = isArray ? null : compactObjectMap(value);
    let lossless;
    if (columnar) {
        lossless = JSON.stringify(columnar);
    }
    else if (objectMap) {
        lossless = JSON.stringify({ _omap: objectMap });
    }
    else {
        lossless = JSON.stringify(value) ?? 'null';
    }
    // Lossy sampling is the last resort, reserved for arrays that are still
    // over an explicit token budget after lossless compaction.
    const budget = opts.maxTokens;
    if (isArray && budget && budget > 0 && estimateTokens(lossless, contentType) > budget) {
        const samplerOptions = {};
        if (opts.query !== undefined) {
            samplerOptions.query = opts.query;
        }
        if (opts.maxItems !== undefined) {
            samplerOptions.maxItems = opts.maxItems;
        }
        const sampled = sampleObjectArray(value, samplerOptions);
        if (sampled) {
            return finalizeLossy({
                original: text,
                body: JSON.stringify(sampled),
                contentType,
                strategy: ECompressionStrategy.Sample,
                opts,
                note: `${sampled._table.sample.dropped} of ${sampled._table.n} rows sampled`,
            });
        }
    }
    const savings = measureSavings(text, lossless, contentType);
    if (savings.after >= savings.before) {
        // No gain: hand back the original text.
        return passthroughResult(text, contentType);
    }
    let strategy = ECompressionStrategy.MinifiedJson;
    let note = 'minified JSON (whitespace removed)';
    if (columnar) {
        strategy = ECompressionStrategy.Table;
        note = 'lossless columnar table (valid JSON; schema hoisted, keys deduped)';
    }
    else if (objectMap) {
        strategy = ECompressionStrategy.Table;
        note = 'lossless columnar object-map (valid JSON; schema hoisted, keys deduped)';
    }
    return {
        compressed: lossless,
        contentType,
        strategy,
        savings,
        lossy: false,
        note,
    };
}