@shrkcrft/compress 0.1.0-alpha.16

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (162) hide show
  1. package/LICENSE +21 -0
  2. package/README.md +21 -0
  3. package/dist/cache/align-volatile-tokens.d.ts +13 -0
  4. package/dist/cache/align-volatile-tokens.d.ts.map +1 -0
  5. package/dist/cache/align-volatile-tokens.js +51 -0
  6. package/dist/cache/alignment-map.d.ts +23 -0
  7. package/dist/cache/alignment-map.d.ts.map +1 -0
  8. package/dist/cache/alignment-map.js +1 -0
  9. package/dist/cache/alignment-result.d.ts +11 -0
  10. package/dist/cache/alignment-result.d.ts.map +1 -0
  11. package/dist/cache/alignment-result.js +1 -0
  12. package/dist/cache/detect-volatile-tokens.d.ts +10 -0
  13. package/dist/cache/detect-volatile-tokens.d.ts.map +1 -0
  14. package/dist/cache/detect-volatile-tokens.js +41 -0
  15. package/dist/cache/placeholder.d.ts +28 -0
  16. package/dist/cache/placeholder.d.ts.map +1 -0
  17. package/dist/cache/placeholder.js +0 -0
  18. package/dist/cache/restore-volatile-tokens.d.ts +10 -0
  19. package/dist/cache/restore-volatile-tokens.d.ts.map +1 -0
  20. package/dist/cache/restore-volatile-tokens.js +21 -0
  21. package/dist/cache/volatile-classify.d.ts +11 -0
  22. package/dist/cache/volatile-classify.d.ts.map +1 -0
  23. package/dist/cache/volatile-classify.js +35 -0
  24. package/dist/cache/volatile-kind.d.ts +13 -0
  25. package/dist/cache/volatile-kind.d.ts.map +1 -0
  26. package/dist/cache/volatile-kind.js +13 -0
  27. package/dist/cache/volatile-token.d.ts +14 -0
  28. package/dist/cache/volatile-token.d.ts.map +1 -0
  29. package/dist/cache/volatile-token.js +1 -0
  30. package/dist/ccr/ccr-entry.d.ts +13 -0
  31. package/dist/ccr/ccr-entry.d.ts.map +1 -0
  32. package/dist/ccr/ccr-entry.js +1 -0
  33. package/dist/ccr/ccr-key.d.ts +9 -0
  34. package/dist/ccr/ccr-key.d.ts.map +1 -0
  35. package/dist/ccr/ccr-key.js +19 -0
  36. package/dist/ccr/ccr-marker.d.ts +23 -0
  37. package/dist/ccr/ccr-marker.d.ts.map +1 -0
  38. package/dist/ccr/ccr-marker.js +30 -0
  39. package/dist/ccr/ccr-store.d.ts +18 -0
  40. package/dist/ccr/ccr-store.d.ts.map +1 -0
  41. package/dist/ccr/ccr-store.js +1 -0
  42. package/dist/ccr/file-ccr-store.d.ts +19 -0
  43. package/dist/ccr/file-ccr-store.d.ts.map +1 -0
  44. package/dist/ccr/file-ccr-store.js +53 -0
  45. package/dist/ccr/in-memory-ccr-store.d.ts +21 -0
  46. package/dist/ccr/in-memory-ccr-store.d.ts.map +1 -0
  47. package/dist/ccr/in-memory-ccr-store.js +45 -0
  48. package/dist/ccr/ttl-file-ccr-store.d.ts +43 -0
  49. package/dist/ccr/ttl-file-ccr-store.d.ts.map +1 -0
  50. package/dist/ccr/ttl-file-ccr-store.js +117 -0
  51. package/dist/code/compress-code.d.ts +4 -0
  52. package/dist/code/compress-code.d.ts.map +1 -0
  53. package/dist/code/compress-code.js +294 -0
  54. package/dist/compress-content.d.ts +11 -0
  55. package/dist/compress-content.d.ts.map +1 -0
  56. package/dist/compress-content.js +79 -0
  57. package/dist/content/content-type.d.ts +28 -0
  58. package/dist/content/content-type.d.ts.map +1 -0
  59. package/dist/content/content-type.js +28 -0
  60. package/dist/content/detect-content-type.d.ts +9 -0
  61. package/dist/content/detect-content-type.d.ts.map +1 -0
  62. package/dist/content/detect-content-type.js +184 -0
  63. package/dist/content/segment.d.ts +21 -0
  64. package/dist/content/segment.d.ts.map +1 -0
  65. package/dist/content/segment.js +117 -0
  66. package/dist/index.d.ts +61 -0
  67. package/dist/index.d.ts.map +1 -0
  68. package/dist/index.js +49 -0
  69. package/dist/json/compress-json.d.ts +18 -0
  70. package/dist/json/compress-json.d.ts.map +1 -0
  71. package/dist/json/compress-json.js +139 -0
  72. package/dist/json/render-compact-json.d.ts +10 -0
  73. package/dist/json/render-compact-json.d.ts.map +1 -0
  74. package/dist/json/render-compact-json.js +18 -0
  75. package/dist/relevance/bm25.d.ts +26 -0
  76. package/dist/relevance/bm25.d.ts.map +1 -0
  77. package/dist/relevance/bm25.js +115 -0
  78. package/dist/result/compress-options.d.ts +26 -0
  79. package/dist/result/compress-options.d.ts.map +1 -0
  80. package/dist/result/compress-options.js +1 -0
  81. package/dist/result/compression-result.d.ts +26 -0
  82. package/dist/result/compression-result.d.ts.map +1 -0
  83. package/dist/result/compression-result.js +1 -0
  84. package/dist/result/compression-strategy.d.ts +30 -0
  85. package/dist/result/compression-strategy.d.ts.map +1 -0
  86. package/dist/result/compression-strategy.js +30 -0
  87. package/dist/table/adaptive-size.d.ts +46 -0
  88. package/dist/table/adaptive-size.d.ts.map +1 -0
  89. package/dist/table/adaptive-size.js +170 -0
  90. package/dist/table/apply-value-dictionaries.d.ts +30 -0
  91. package/dist/table/apply-value-dictionaries.d.ts.map +1 -0
  92. package/dist/table/apply-value-dictionaries.js +99 -0
  93. package/dist/table/column-presence.d.ts +20 -0
  94. package/dist/table/column-presence.d.ts.map +1 -0
  95. package/dist/table/column-presence.js +52 -0
  96. package/dist/table/columnar-json.d.ts +24 -0
  97. package/dist/table/columnar-json.d.ts.map +1 -0
  98. package/dist/table/columnar-json.js +83 -0
  99. package/dist/table/columnar-table.d.ts +24 -0
  100. package/dist/table/columnar-table.d.ts.map +1 -0
  101. package/dist/table/columnar-table.js +1 -0
  102. package/dist/table/compact-object-array.d.ts +12 -0
  103. package/dist/table/compact-object-array.d.ts.map +1 -0
  104. package/dist/table/compact-object-array.js +88 -0
  105. package/dist/table/field-spec.d.ts +13 -0
  106. package/dist/table/field-spec.d.ts.map +1 -0
  107. package/dist/table/field-spec.js +1 -0
  108. package/dist/table/object-map.d.ts +28 -0
  109. package/dist/table/object-map.d.ts.map +1 -0
  110. package/dist/table/object-map.js +119 -0
  111. package/dist/table/render-table.d.ts +11 -0
  112. package/dist/table/render-table.d.ts.map +1 -0
  113. package/dist/table/render-table.js +39 -0
  114. package/dist/table/sample-object-array.d.ts +11 -0
  115. package/dist/table/sample-object-array.d.ts.map +1 -0
  116. package/dist/table/sample-object-array.js +171 -0
  117. package/dist/table/sample-options.d.ts +29 -0
  118. package/dist/table/sample-options.d.ts.map +1 -0
  119. package/dist/table/sample-options.js +1 -0
  120. package/dist/table/sampled-table.d.ts +33 -0
  121. package/dist/table/sampled-table.d.ts.map +1 -0
  122. package/dist/table/sampled-table.js +8 -0
  123. package/dist/table/table-compaction.d.ts +19 -0
  124. package/dist/table/table-compaction.d.ts.map +1 -0
  125. package/dist/table/table-compaction.js +1 -0
  126. package/dist/table/table-formats.d.ts +23 -0
  127. package/dist/table/table-formats.d.ts.map +1 -0
  128. package/dist/table/table-formats.js +233 -0
  129. package/dist/text/compress-diff.d.ts +20 -0
  130. package/dist/text/compress-diff.d.ts.map +1 -0
  131. package/dist/text/compress-diff.js +344 -0
  132. package/dist/text/compress-lines.d.ts +12 -0
  133. package/dist/text/compress-lines.d.ts.map +1 -0
  134. package/dist/text/compress-lines.js +44 -0
  135. package/dist/text/compress-log.d.ts +12 -0
  136. package/dist/text/compress-log.d.ts.map +1 -0
  137. package/dist/text/compress-log.js +202 -0
  138. package/dist/text/compress-markdown.d.ts +15 -0
  139. package/dist/text/compress-markdown.d.ts.map +1 -0
  140. package/dist/text/compress-markdown.js +96 -0
  141. package/dist/text/compress-search.d.ts +11 -0
  142. package/dist/text/compress-search.d.ts.map +1 -0
  143. package/dist/text/compress-search.js +78 -0
  144. package/dist/text/finalize.d.ts +21 -0
  145. package/dist/text/finalize.d.ts.map +1 -0
  146. package/dist/text/finalize.js +54 -0
  147. package/dist/text/line-utils.d.ts +20 -0
  148. package/dist/text/line-utils.d.ts.map +1 -0
  149. package/dist/text/line-utils.js +65 -0
  150. package/dist/text/lockfile-names.d.ts +3 -0
  151. package/dist/text/lockfile-names.d.ts.map +1 -0
  152. package/dist/text/lockfile-names.js +33 -0
  153. package/dist/text/log-template.d.ts +31 -0
  154. package/dist/text/log-template.d.ts.map +1 -0
  155. package/dist/text/log-template.js +239 -0
  156. package/dist/tokens/estimate-tokens.d.ts +17 -0
  157. package/dist/tokens/estimate-tokens.d.ts.map +1 -0
  158. package/dist/tokens/estimate-tokens.js +53 -0
  159. package/dist/tokens/token-savings.d.ts +20 -0
  160. package/dist/tokens/token-savings.d.ts.map +1 -0
  161. package/dist/tokens/token-savings.js +1 -0
  162. package/package.json +52 -0
@@ -0,0 +1,10 @@
1
+ /**
2
+ * Render a JSON value as the densest deterministic text. A homogeneous object
3
+ * array becomes a table block when that's shorter than minified JSON;
4
+ * everything else is minified JSON (whitespace stripped, structure intact).
5
+ * Lossless in information, though a table block is not itself re-parseable as
6
+ * JSON — use this only where the consumer reads text (the `compress` surface),
7
+ * not where a client calls `JSON.parse`.
8
+ * The table form is chosen only when it is strictly shorter by string length; on a tie the minified JSON is returned. */
9
+ export declare function renderCompactJson(value: unknown): string;
10
+ //# sourceMappingURL=render-compact-json.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"render-compact-json.d.ts","sourceRoot":"","sources":["../../src/json/render-compact-json.ts"],"names":[],"mappings":"AAGA;;;;;;;GAOG;AACH,wBAAgB,iBAAiB,CAAC,KAAK,EAAE,OAAO,GAAG,MAAM,CAMxD"}
@@ -0,0 +1,18 @@
1
+ import { compactObjectArray } from "../table/compact-object-array.js";
2
+ import { renderTable } from "../table/render-table.js";
3
+ /**
4
+ * Render a JSON value as the densest deterministic text. A homogeneous object
5
+ * array becomes a table block when that's shorter than minified JSON;
6
+ * everything else is minified JSON (whitespace stripped, structure intact).
7
+ * Lossless in information, though a table block is not itself re-parseable as
8
+ * JSON — use this only where the consumer reads text (the `compress` surface),
9
+ * not where a client calls `JSON.parse`.
10
+ * The value is handed to both JSON.stringify and compactObjectArray. Returns the rendered table only when compactObjectArray yields one and its text is strictly shorter (by string length) than the minified JSON; otherwise returns the minified JSON. */
11
+ export function renderCompactJson(value) {
12
+ const minified = JSON.stringify(value) ?? 'null'; // JSON.stringify yields undefined for inputs it cannot represent (for example undefined or a function); fall back to the text null
13
+ const table = compactObjectArray(value);
14
+ if (!table) // a falsy result means no table form is available for this value
15
+ return minified;
16
+ const text = renderTable(table);
17
+ return text.length < minified.length ? text : minified; // strict comparison: a tie keeps the minified JSON
18
+ }
@@ -0,0 +1,26 @@
1
+ /**
2
+ * A small, pure BM25 relevance scorer for query-biased compression. Plain
3
+ * token-overlap counting ({@link queryOverlap}) is weak: it can't tell a term
4
+ * that appears in every row from a rare, discriminating one, and it under-
5
+ * weights single-term and ID/UUID exact matches. BM25 weights each query term
6
+ * by inverse document frequency and normalizes for row/line length, so a
7
+ * uniquely-relevant row outranks a row that merely repeats a common word.
8
+ *
9
+ * Deterministic: a pure function of (query, documents). No clock, no RNG, no
10
+ * learned state. When the query is empty every score is 0, so callers fall
11
+ * straight back to their no-query behaviour.
12
+ */
13
+ export interface IBm25Options { // Tuning knobs for bm25Scores / topByBm25; a default applies only when the option is undefined or null, so an explicit 0 is used as given.
14
+ /** Term-frequency saturation. Default 1.2. */
15
+ k1?: number;
16
+ /** Length-normalization strength. Default 0.75. */
17
+ b?: number;
18
+ }
19
+ /** BM25 score per document for `query`. Empty query → all zeros. Returns one score per document in input order; an empty `documents` list yields `[]`. ID-shaped query terms (UUID / long hex / email) are matched as case-insensitive substrings, other terms as whole lowercase a-z0-9 tokens. */
20
+ export declare function bm25Scores(query: string, documents: readonly string[], opts?: IBm25Options): number[];
21
+ /**
22
+ * Indices of the top-`k` documents by BM25 score (score > 0 only), highest
23
+ * first, ties broken by ascending index for determinism. Empty query → `[]`.
24
+ * A negative `k` is treated as 0, and fewer than `k` indices come back when fewer documents score above 0. */
25
+ export declare function topByBm25(query: string, documents: readonly string[], k: number, opts?: IBm25Options): number[];
26
+ //# sourceMappingURL=bm25.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"bm25.d.ts","sourceRoot":"","sources":["../../src/relevance/bm25.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;GAWG;AAEH,MAAM,WAAW,YAAY;IAC3B,8CAA8C;IAC9C,EAAE,CAAC,EAAE,MAAM,CAAC;IACZ,mDAAmD;IACnD,CAAC,CAAC,EAAE,MAAM,CAAC;CACZ;AAuDD,oEAAoE;AACpE,wBAAgB,UAAU,CACxB,KAAK,EAAE,MAAM,EACb,SAAS,EAAE,SAAS,MAAM,EAAE,EAC5B,IAAI,GAAE,YAAiB,GACtB,MAAM,EAAE,CAsCV;AAED;;;GAGG;AACH,wBAAgB,SAAS,CACvB,KAAK,EAAE,MAAM,EACb,SAAS,EAAE,SAAS,MAAM,EAAE,EAC5B,CAAC,EAAE,MAAM,EACT,IAAI,GAAE,YAAiB,GACtB,MAAM,EAAE,CAQV"}
@@ -0,0 +1,115 @@
1
+ /**
2
+ * A small, pure BM25 relevance scorer for query-biased compression. Plain
3
+ * token-overlap counting ({@link queryOverlap}) is weak: it can't tell a term
4
+ * that appears in every row from a rare, discriminating one, and it under-
5
+ * weights single-term and ID/UUID exact matches. BM25 weights each query term
6
+ * by inverse document frequency and normalizes for row/line length, so a
7
+ * uniquely-relevant row outranks a row that merely repeats a common word.
8
+ *
9
+ * Deterministic: a pure function of (query, documents). No clock, no RNG, no
10
+ * learned state. When the query is empty every score is 0, so callers fall
11
+ * straight back to their no-query behaviour.
12
+ */
13
+ /** Extra idf weight for an exact ID-shaped term (UUID / long hex / email). */
14
+ const ID_BOOST = 2.5; // multiplied into the idf of ID-shaped query terms inside bm25Scores
15
+ const UUID_RE = /^[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}$/i; // 8-4-4-4-12 hex groups, any letter case, whole string
16
+ const HEX_RE = /^(?:0x)?[0-9a-f]{8,}$/i; // optional 0x prefix then 8 or more hex digits; note an all-decimal string of 8+ digits also matches
17
+ const EMAIL_RE = /^[\w.+-]+@[\w-]+\.[\w.-]+$/; // loose local@host.tld shape check, not full address validation
18
+ function isIdShaped(term) { // True when the whole term matches the UUID, hex or email pattern (all three regexes are anchored at both ends).
19
+ return UUID_RE.test(term) || HEX_RE.test(term) || EMAIL_RE.test(term);
20
+ }
21
+ /** Parse a query into terms, preserving ID-shaped chunks (UUIDs, emails) whole. Returns an array of `{ term, id }` objects: lowercased, de-duplicated, in first-appearance order, with `id` true for ID-shaped chunks. Non-ID chunks are split on every run of characters outside a-z0-9, so characters outside that set never form part of a term. An empty or whitespace-only query yields `[]`. */
22
+ function parseQueryTerms(query) {
23
+ const seen = new Set(); // terms already emitted; shared by ID-shaped and plain terms
24
+ const out = [];
25
+ for (const chunk of query.trim().split(/\s+/)) {
26
+ if (chunk.length === 0) // an empty query splits into a single empty chunk
27
+ continue;
28
+ if (isIdShaped(chunk)) {
29
+ const t = chunk.toLowerCase();
30
+ if (!seen.has(t)) {
31
+ seen.add(t);
32
+ out.push({ term: t, id: true });
33
+ }
34
+ continue; // an ID-shaped chunk is never broken into sub-tokens
35
+ }
36
+ for (const sub of chunk.toLowerCase().split(/[^a-z0-9]+/)) {
37
+ if (sub.length < 1 || seen.has(sub)) // skip empty pieces (from leading or trailing separators) and repeats
38
+ continue;
39
+ seen.add(sub);
40
+ out.push({ term: sub, id: false });
41
+ }
42
+ }
43
+ return out;
44
+ }
45
+ function countOccurrences(haystack, needle) { // Count non-overlapping, case-sensitive occurrences of needle inside haystack.
46
+ if (needle.length === 0) // guard: indexOf with an empty needle always matches without advancing, so the loop below would never end
47
+ return 0;
48
+ let count = 0;
49
+ let from = 0;
50
+ for (;;) {
51
+ const idx = haystack.indexOf(needle, from);
52
+ if (idx < 0)
53
+ break;
54
+ count += 1;
55
+ from = idx + needle.length; // resume after the match so occurrences never overlap
56
+ }
57
+ return count;
58
+ }
59
+ /** BM25 score per document for `query`. Empty query → all zeros. Returns one score per document in input order (`[]` when there are no documents). `opts.k1` defaults to 1.2 and `opts.b` to 0.75. ID-shaped terms are counted as substrings of the lowercased document and get their idf multiplied by ID_BOOST; other terms are counted by exact token equality. */
60
+ export function bm25Scores(query, documents, opts = {}) {
61
+ const n = documents.length;
62
+ if (n === 0)
63
+ return [];
64
+ const terms = parseQueryTerms(query);
65
+ if (terms.length === 0) // nothing to score against: every document gets 0
66
+ return documents.map(() => 0);
67
+ const k1 = opts.k1 ?? 1.2;
68
+ const b = opts.b ?? 0.75;
69
+ // Lowercase each document ONCE, then derive tokens from the lowercased form
70
+ // (was lowercased twice: in the tokenizer and again for ID substring matching).
71
+ const docLower = documents.map((d) => d.toLowerCase());
72
+ const docTokens = docLower.map((l) => l.split(/[^a-z0-9]+/).filter((t) => t.length > 0));
73
+ const dl = docTokens.map((t) => t.length); // document length measured in tokens
74
+ const avgdl = dl.reduce((s, x) => s + x, 0) / n || 1; // the fallback to 1 keeps the length-normalisation divisor non-zero when no document has any token
75
+ const scores = new Array(n).fill(0);
76
+ // One reusable frequency buffer for all terms — every slot is overwritten each
77
+ // term below, so no per-term allocation (or reset) is needed.
78
+ const f = new Array(n).fill(0);
79
+ for (const { term, id } of terms) {
80
+ let df = 0; // number of documents containing this term
81
+ for (let d = 0; d < n; d += 1) {
82
+ const count = id // ID terms: substring count in the lowercased text; plain terms: exact token matches
83
+ ? countOccurrences(docLower[d], term)
84
+ : docTokens[d].reduce((s, t) => s + (t === term ? 1 : 0), 0);
85
+ f[d] = count;
86
+ if (count > 0)
87
+ df += 1;
88
+ }
89
+ if (df === 0) // the term occurs nowhere, so it contributes to no score
90
+ continue;
91
+ let idf = Math.log(1 + (n - df + 0.5) / (df + 0.5)); // the 1 inside the log keeps idf positive even when the term is in every document
92
+ if (id)
93
+ idf *= ID_BOOST;
94
+ for (let d = 0; d < n; d += 1) {
95
+ if (f[d] === 0)
96
+ continue;
97
+ const denom = f[d] + k1 * (1 - b + (b * dl[d]) / avgdl); // term-frequency saturation with length normalisation relative to the average document
98
+ scores[d] += (idf * (f[d] * (k1 + 1))) / denom;
99
+ }
100
+ }
101
+ return scores;
102
+ }
103
+ /**
104
+ * Indices of the top-`k` documents by BM25 score (score > 0 only), highest
105
+ * first, ties broken by ascending index for determinism. Empty query → `[]`.
106
+ * A negative `k` is treated as 0, and fewer than `k` indices come back when fewer documents score above 0. */
107
+ export function topByBm25(query, documents, k, opts = {}) {
108
+ const scores = bm25Scores(query, documents, opts);
109
+ return scores
110
+ .map((score, index) => ({ score, index })) // remember each original position before sorting
111
+ .filter((x) => x.score > 0)
112
+ .sort((a, b) => b.score - a.score || a.index - b.index)
113
+ .slice(0, Math.max(0, k))
114
+ .map((x) => x.index);
115
+ }
@@ -0,0 +1,26 @@
1
+ import type { EContentType } from '../content/content-type.js';
2
+ import type { ICcrStore } from '../ccr/ccr-store.js';
3
+ /**
4
+ * Knobs for a compression pass. All optional — the defaults produce a safe,
5
+ * deterministic reduction. `store` is what makes a lossy pass reversible: when
6
+ * present, the original is cached and a `<<ccr:…>>` marker is appended.
7
+ */
8
+ export interface ICompressOptions {
9
+ /** Cache originals here so lossy output stays retrievable (CCR). */
10
+ store?: ICcrStore;
11
+ /** Task / query text that biases which lines or matches are kept. */
12
+ query?: string;
13
+ /** Force a content class instead of auto-detecting. */
14
+ contentType?: EContentType;
15
+ /** Soft cap on retained items/lines/matches/hunks (compressor-specific). */
16
+ maxItems?: number;
17
+ /** Below this many lines a lossy text pass returns the input untouched. */
18
+ minLines?: number;
19
+ /**
20
+ * Token budget for a JSON array. When set and the lossless columnar form
21
+ * still exceeds it, `compressJson` falls back to the lossy SmartCrusher
22
+ * row-sampler (kept rows + CCR original). Without it, JSON stays lossless.
23
+ */
24
+ maxTokens?: number;
25
+ }
26
+ //# sourceMappingURL=compress-options.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"compress-options.d.ts","sourceRoot":"","sources":["../../src/result/compress-options.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,YAAY,EAAE,MAAM,4BAA4B,CAAC;AAC/D,OAAO,KAAK,EAAE,SAAS,EAAE,MAAM,qBAAqB,CAAC;AAErD;;;;GAIG;AACH,MAAM,WAAW,gBAAgB;IAC/B,oEAAoE;IACpE,KAAK,CAAC,EAAE,SAAS,CAAC;IAClB,qEAAqE;IACrE,KAAK,CAAC,EAAE,MAAM,CAAC;IACf,uDAAuD;IACvD,WAAW,CAAC,EAAE,YAAY,CAAC;IAC3B,4EAA4E;IAC5E,QAAQ,CAAC,EAAE,MAAM,CAAC;IAClB,2EAA2E;IAC3E,QAAQ,CAAC,EAAE,MAAM,CAAC;IAClB;;;;OAIG;IACH,SAAS,CAAC,EAAE,MAAM,CAAC;CACpB"}
@@ -0,0 +1 @@
1
+ export {};
@@ -0,0 +1,26 @@
1
+ import type { EContentType } from '../content/content-type.js';
2
+ import type { ECompressionStrategy } from './compression-strategy.js';
3
+ import type { ITokenSavings } from '../tokens/token-savings.js';
4
+ /**
5
+ * The outcome of compressing one blob. `compressed` is always safe to hand to
6
+ * a model as-is. When a lossy strategy dropped detail, `ccrKey` points at the
7
+ * cached original so the agent can call `retrieve_original` / `shrk expand`
8
+ * to get it back (Compress-Cache-Retrieve).
9
+ */
10
+ export interface ICompressionResult {
11
+ /** The compressed text, ready for the model. */
12
+ compressed: string;
13
+ /** Detected (or caller-forced) content class. */
14
+ contentType: EContentType;
15
+ /** Strategy that produced `compressed`. */
16
+ strategy: ECompressionStrategy;
17
+ /** Token accounting for the pass. */
18
+ savings: ITokenSavings;
19
+ /** True when detail was dropped (and an original was cached, if a store was given). */
20
+ lossy: boolean;
21
+ /** CCR key for the cached original, when a lossy pass stored one. */
22
+ ccrKey?: string;
23
+ /** A one-line, human/agent-readable note about what happened. */
24
+ note: string;
25
+ }
26
+ //# sourceMappingURL=compression-result.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"compression-result.d.ts","sourceRoot":"","sources":["../../src/result/compression-result.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,YAAY,EAAE,MAAM,4BAA4B,CAAC;AAC/D,OAAO,KAAK,EAAE,oBAAoB,EAAE,MAAM,2BAA2B,CAAC;AACtE,OAAO,KAAK,EAAE,aAAa,EAAE,MAAM,4BAA4B,CAAC;AAEhE;;;;;GAKG;AACH,MAAM,WAAW,kBAAkB;IACjC,gDAAgD;IAChD,UAAU,EAAE,MAAM,CAAC;IACnB,iDAAiD;IACjD,WAAW,EAAE,YAAY,CAAC;IAC1B,2CAA2C;IAC3C,QAAQ,EAAE,oBAAoB,CAAC;IAC/B,qCAAqC;IACrC,OAAO,EAAE,aAAa,CAAC;IACvB,uFAAuF;IACvF,KAAK,EAAE,OAAO,CAAC;IACf,qEAAqE;IACrE,MAAM,CAAC,EAAE,MAAM,CAAC;IAChB,iEAAiE;IACjE,IAAI,EAAE,MAAM,CAAC;CACd"}
@@ -0,0 +1 @@
1
+ export {};
@@ -0,0 +1,30 @@
1
+ /**
2
+ * Which deterministic strategy produced a compression result. Surfaced so
3
+ * callers (and tests) can assert on *how* a payload shrank, not just that it
4
+ * did.
5
+ */
6
+ export declare enum ECompressionStrategy {
7
+ /** No transform applied — output equals input (below threshold, or unknown shape). */
8
+ Passthrough = "passthrough",
9
+ /** Lossless columnar/table compaction of a homogeneous object array. */
10
+ Table = "table",
11
+ /** Log line-reduction: kept errors/warnings/summaries, dropped the rest. */
12
+ Log = "log",
13
+ /** Search-result reduction: kept the highest-signal `file:line` matches. */
14
+ Search = "search",
15
+ /** Diff reduction: capped files/hunks and trimmed surrounding context. */
16
+ Diff = "diff",
17
+ /** Generic line dedup for prose / plain text. */
18
+ Lines = "lines",
19
+ /** Code outline: kept imports / types / signatures, elided function bodies. */
20
+ Code = "code",
21
+ /** Markdown distilled to headers, section leads, list/table structure; bodies thinned. */
22
+ Markdown = "markdown",
23
+ /** Minified JSON (whitespace removed, structure preserved). */
24
+ MinifiedJson = "minified-json",
25
+ /** Lossy statistical row-sample of a huge homogeneous array (SmartCrusher). */
26
+ Sample = "sample",
27
+ /** Mixed blob segmented by type; each run compressed with its own strategy. */
28
+ Mixed = "mixed"
29
+ }
30
+ //# sourceMappingURL=compression-strategy.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"compression-strategy.d.ts","sourceRoot":"","sources":["../../src/result/compression-strategy.ts"],"names":[],"mappings":"AAAA;;;;GAIG;AACH,oBAAY,oBAAoB;IAC9B,sFAAsF;IACtF,WAAW,gBAAgB;IAC3B,wEAAwE;IACxE,KAAK,UAAU;IACf,4EAA4E;IAC5E,GAAG,QAAQ;IACX,4EAA4E;IAC5E,MAAM,WAAW;IACjB,0EAA0E;IAC1E,IAAI,SAAS;IACb,iDAAiD;IACjD,KAAK,UAAU;IACf,+EAA+E;IAC/E,IAAI,SAAS;IACb,0FAA0F;IAC1F,QAAQ,aAAa;IACrB,+DAA+D;IAC/D,YAAY,kBAAkB;IAC9B,+EAA+E;IAC/E,MAAM,WAAW;IACjB,+EAA+E;IAC/E,KAAK,UAAU;CAChB"}
@@ -0,0 +1,30 @@
1
+ /**
2
+ * Which deterministic strategy produced a compression result. Surfaced so
3
+ * callers (and tests) can assert on *how* a payload shrank, not just that it
4
+ * did.
5
+ */
6
+ export var ECompressionStrategy; // populated by the function below: each member name maps to its string value, and no value-to-name reverse entries are written
7
+ (function (ECompressionStrategy) {
8
+ /** No transform applied — output equals input (below threshold, or unknown shape). */
9
+ ECompressionStrategy["Passthrough"] = "passthrough";
10
+ /** Lossless columnar/table compaction of a homogeneous object array. */
11
+ ECompressionStrategy["Table"] = "table";
12
+ /** Log line-reduction: kept errors/warnings/summaries, dropped the rest. */
13
+ ECompressionStrategy["Log"] = "log";
14
+ /** Search-result reduction: kept the highest-signal `file:line` matches. */
15
+ ECompressionStrategy["Search"] = "search";
16
+ /** Diff reduction: capped files/hunks and trimmed surrounding context. */
17
+ ECompressionStrategy["Diff"] = "diff";
18
+ /** Generic line dedup for prose / plain text. */
19
+ ECompressionStrategy["Lines"] = "lines";
20
+ /** Code outline: kept imports / types / signatures, elided function bodies. */
21
+ ECompressionStrategy["Code"] = "code";
22
+ /** Markdown distilled to headers, section leads, list/table structure; bodies thinned. */
23
+ ECompressionStrategy["Markdown"] = "markdown";
24
+ /** Minified JSON (whitespace removed, structure preserved). */
25
+ ECompressionStrategy["MinifiedJson"] = "minified-json";
26
+ /** Lossy statistical row-sample of a huge homogeneous array (SmartCrusher). */
27
+ ECompressionStrategy["Sample"] = "sample";
28
+ /** Mixed blob segmented by type; each run compressed with its own strategy. */
29
+ ECompressionStrategy["Mixed"] = "mixed";
30
+ })(ECompressionStrategy || (ECompressionStrategy = {}));
@@ -0,0 +1,46 @@
1
+ /**
2
+ * Adaptive sample sizing — pick how many items to keep from the *shape of the
3
+ * information*, not a fixed K. Fixed caps keep too much on redundant data and
4
+ * too little on diverse data. {@link computeOptimalK} finds the knee of the
5
+ * unique-bigram coverage curve (where adding more items stops adding
6
+ * information), cross-checked against simhash near-duplicate collapse and a
7
+ * zlib redundancy bound.
8
+ *
9
+ * Pure and deterministic: a function of the input bytes only (no clock, no RNG,
10
+ * no learned state). zlib `deflate` is a fixed, deterministic transform — a
11
+ * lookup-driven coder, not a model — used only as a redundancy *measure*.
12
+ */
13
+ export type AdaptiveBias = 'conservative' | 'moderate' | 'aggressive'; // the chosen K is multiplied by 1.5, 1.0 and 0.6 respectively before clamping
14
+ export interface IAdaptiveOptions {
15
+ /** Lower bound on the result (e.g. a representative-rows floor). Default 1. Values below 1 are raised to 1. */
16
+ min?: number;
17
+ /** Upper bound on the result (e.g. an existing fixed cap). Default items.length. Values above items.length are lowered to it. */
18
+ max?: number;
19
+ /** Shifts the knee: keep more (conservative) or fewer (aggressive). Default 'moderate'. */
20
+ bias?: AdaptiveBias;
21
+ }
22
+ /**
23
+ * A 32-bit simhash over an item's token bigrams. Near-identical items map to
24
+ * near-identical hashes (small Hamming distance), so redundant rows don't each
25
+ * count as "new information".
26
+ */
27
+ export declare function simhash(text: string): number;
28
+ export declare function hammingDistance(a: number, b: number): number; // number of bit positions in which a and b differ, compared as 32-bit integers
29
+ /**
30
+ * Index of the knee of a monotone-nondecreasing curve via Kneedle: the point of
31
+ * maximum deflection from the chord between its endpoints. Returns `{ index,
32
+ * deflection }` where `deflection` is ~0 for a straight (no-knee) curve.
33
+ * Curves of length 2 or less return the last index (which is -1 for an empty curve) with deflection 0; a curve whose last value does not exceed its first returns index 0. */
34
+ export declare function kneedle(curve: readonly number[]): {
35
+ index: number;
36
+ deflection: number;
37
+ };
38
+ /** The cumulative count of distinct bigrams as items are added in order. */
39
+ export declare function bigramCoverageCurve(items: readonly string[]): number[];
40
+ /**
41
+ * Choose how many of `items` to keep. Deterministic. The result is clamped to
42
+ * `[min, max]` (and never exceeds `items.length`), so an explicit cap passed as
43
+ * `max` is honoured whenever it is not below the floor.
44
+ * If the bounds conflict (`max` below the effective `min`, which is never less than 1) the floor takes precedence. */
45
+ export declare function computeOptimalK(items: readonly string[], opts?: IAdaptiveOptions): number;
46
+ //# sourceMappingURL=adaptive-size.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"adaptive-size.d.ts","sourceRoot":"","sources":["../../src/table/adaptive-size.ts"],"names":[],"mappings":"AAGA;;;;;;;;;;;GAWG;AAEH,MAAM,MAAM,YAAY,GAAG,cAAc,GAAG,UAAU,GAAG,YAAY,CAAC;AAEtE,MAAM,WAAW,gBAAgB;IAC/B,+EAA+E;IAC/E,GAAG,CAAC,EAAE,MAAM,CAAC;IACb,oFAAoF;IACpF,GAAG,CAAC,EAAE,MAAM,CAAC;IACb,2FAA2F;IAC3F,IAAI,CAAC,EAAE,YAAY,CAAC;CACrB;AA0BD;;;;GAIG;AACH,wBAAgB,OAAO,CAAC,IAAI,EAAE,MAAM,GAAG,MAAM,CAE5C;AAwBD,wBAAgB,eAAe,CAAC,CAAC,EAAE,MAAM,EAAE,CAAC,EAAE,MAAM,GAAG,MAAM,CAQ5D;AAED;;;;GAIG;AACH,wBAAgB,OAAO,CAAC,KAAK,EAAE,SAAS,MAAM,EAAE,GAAG;IAAE,KAAK,EAAE,MAAM,CAAC;IAAC,UAAU,EAAE,MAAM,CAAA;CAAE,CAmBvF;AAED,4EAA4E;AAC5E,wBAAgB,mBAAmB,CAAC,KAAK,EAAE,SAAS,MAAM,EAAE,GAAG,MAAM,EAAE,CAEtE;AA2CD;;;;GAIG;AACH,wBAAgB,eAAe,CAAC,KAAK,EAAE,SAAS,MAAM,EAAE,EAAE,IAAI,GAAE,gBAAqB,GAAG,MAAM,CA+B7F"}
@@ -0,0 +1,170 @@
1
+ import { deflateSync } from 'node:zlib';
2
+ import { Buffer } from 'node:buffer';
3
+ const BIAS_FACTOR = { // multiplier applied to the chosen K in computeOptimalK, keyed by the bias option
4
+ conservative: 1.5,
5
+ moderate: 1.0,
6
+ aggressive: 0.6,
7
+ };
8
+ /** Hamming distance at or below which two simhashes are "the same information" (the comparison in uniqueFromGrams is inclusive). */
9
+ const NEAR_DUP_HAMMING = 6;
10
+ /** A coverage curve flatter than this (max knee deflection) is "diverse" → keep max. */
11
+ const FLATNESS_EPSILON = 0.08;
12
+ function tokenize(text) { // Lowercase the text, split on every run of characters outside a-z0-9, and drop empty pieces.
13
+ return text.toLowerCase().split(/[^a-z0-9]+/).filter((t) => t.length > 0);
14
+ }
15
+ /** The set of adjacent token bigrams in one item (unigrams when it has one token, an empty set when it has none). NOTE(review): as rendered here the two tokens of a bigram are concatenated with no separator, so distinct pairs such as (ab, c) and (a, bc) produce the same key — confirm whether the published source carries a delimiter that this rendering dropped. */
16
+ function bigrams(text) {
17
+ const toks = tokenize(text);
18
+ if (toks.length <= 1) // zero or one token: the set holds the tokens themselves
19
+ return new Set(toks);
20
+ const out = new Set();
21
+ for (let i = 0; i + 1 < toks.length; i += 1)
22
+ out.add(`${toks[i]}${toks[i + 1]}`);
23
+ return out;
24
+ }
25
+ /**
26
+ * A 32-bit simhash over an item's token bigrams. Near-identical items map to
27
+ * near-identical hashes (small Hamming distance), so redundant rows don't each
28
+ * count as "new information".
29
+ * Returns an unsigned 32-bit integer; text with no a-z0-9 tokens hashes to 0. */
30
+ export function simhash(text) {
31
+ return simhashOfGrams(bigrams(text));
32
+ }
33
+ /** Simhash over an already-computed bigram set (so callers can reuse the set). Each gram is hashed with hash32 and votes on all 32 bit positions; the result has a bit set where the set-bit votes outnumber the clear-bit votes. */
34
+ function simhashOfGrams(grams) {
35
+ const v = new Array(32).fill(0); // per-bit vote tally
36
+ for (const g of grams) {
37
+ const h = hash32(g);
38
+ for (let b = 0; b < 32; b += 1)
39
+ v[b] += (h >>> b) & 1 ? 1 : -1; // +1 when bit b of the gram hash is set, -1 when it is clear
40
+ }
41
+ let out = 0;
42
+ for (let b = 0; b < 32; b += 1)
43
+ if (v[b] > 0) // a tie (tally of 0) leaves the bit clear
44
+ out |= 1 << b;
45
+ return out >>> 0; // reinterpret as unsigned so a set bit 31 does not yield a negative number
46
+ }
47
+ function hash32(s) { // Returns an unsigned 32-bit hash of the string s.
48
+ // FNV-1a — a fixed, deterministic non-cryptographic hash.
49
+ let h = 0x811c9dc5; // 32-bit FNV offset basis
50
+ for (let i = 0; i < s.length; i += 1) {
51
+ h ^= s.charCodeAt(i); // mixes in one UTF-16 code unit per step rather than one byte
52
+ h = Math.imul(h, 0x01000193); // multiply by the 32-bit FNV prime with 32-bit wraparound
53
+ }
54
+ return h >>> 0;
55
+ }
56
+ export function hammingDistance(a, b) { // Number of bit positions in which a and b differ, both taken as 32-bit integers.
57
+ let x = (a ^ b) >>> 0; // set bits mark the positions that differ
58
+ let count = 0;
59
+ while (x !== 0) {
60
+ x &= x - 1; // clears the lowest set bit, so the loop runs once per differing bit
61
+ count += 1;
62
+ }
63
+ return count;
64
+ }
65
+ /**
66
+ * Index of the knee of a monotone-nondecreasing curve via Kneedle: the point of
67
+ * maximum deflection from the chord between its endpoints. Returns `{ index,
68
+ * deflection }` where `deflection` is ~0 for a straight (no-knee) curve.
69
+ * Curves of length 2 or less return the last index (which is -1 for an empty curve) with deflection 0; a curve whose last value does not exceed its first returns index 0. Only points above the chord can win, and the earliest point wins a tie. */
70
+ export function kneedle(curve) {
71
+ const n = curve.length;
72
+ if (n <= 2) // too short to have an interior knee
73
+ return { index: n - 1, deflection: 0 };
74
+ const first = curve[0];
75
+ const last = curve[n - 1];
76
+ const span = last - first;
77
+ if (span <= 0) // no overall rise, so the y values cannot be normalised
78
+ return { index: 0, deflection: 0 };
79
+ let bestIdx = n - 1; // stays at the last point when nothing rises above the chord
80
+ let bestDef = 0;
81
+ for (let i = 0; i < n; i += 1) {
82
+ const xNorm = i / (n - 1);
83
+ const yNorm = (curve[i] - first) / span;
84
+ const def = yNorm - xNorm; // concave (saturating) curves bulge above the chord
85
+ if (def > bestDef) {
86
+ bestDef = def;
87
+ bestIdx = i;
88
+ }
89
+ }
90
+ return { index: bestIdx, deflection: bestDef };
91
+ }
92
+ /** The cumulative count of distinct bigrams as items are added in order. The returned array has one entry per item and never decreases. */
93
+ export function bigramCoverageCurve(items) {
94
+ return coverageFromGrams(items.map(bigrams));
95
+ }
96
+ /** Cumulative distinct-bigram curve over already-computed gram sets. Entry i is the number of distinct grams seen in sets 0 through i. */
97
+ function coverageFromGrams(gramSets) {
98
+ const seen = new Set(); // every gram encountered so far, across all items
99
+ const curve = [];
100
+ for (const grams of gramSets) {
101
+ for (const g of grams)
102
+ seen.add(g);
103
+ curve.push(seen.size); // running total after this item
104
+ }
105
+ return curve;
106
+ }
107
+ /** Recent-hash window for the near-dup scan, keeping it O(n) on large inputs. */
108
+ const NEAR_DUP_WINDOW = 64;
109
+ /**
110
+ * Count of items that are NOT a simhash near-duplicate of a RECENT earlier item.
111
+ * Bounded to a sliding window so it stays linear; near-duplicates in sampler
112
+ * data cluster locally, and this is only an upper-bound cross-check anyway.
113
+ * The window holds only hashes that were counted as unique (duplicates are not added), and two hashes are near-duplicates when their Hamming distance is at most NEAR_DUP_HAMMING. */
114
+ function uniqueFromGrams(gramSets) {
115
+ const window = [];
116
+ let unique = 0;
117
+ for (const grams of gramSets) {
118
+ const h = simhashOfGrams(grams);
119
+ if (!window.some((k) => hammingDistance(k, h) <= NEAR_DUP_HAMMING)) {
120
+ unique += 1;
121
+ window.push(h);
122
+ if (window.length > NEAR_DUP_WINDOW) // evict the oldest hash once the window is over capacity
123
+ window.shift();
124
+ }
125
+ }
126
+ return unique;
127
+ }
128
+ /** zlib redundancy ratio of the joined corpus in (0, 1]; lower ⇒ more redundant. Computed as deflated byte length over UTF-8 byte length of the items joined by newlines, capped at 1 because deflate output can be larger than a very small input. */
129
+ function zlibRedundancy(items) {
130
+ const raw = items.join('\n');
131
+ if (raw.length === 0) // nothing to compress: report no redundancy and avoid dividing by zero
132
+ return 1;
133
+ const deflated = deflateSync(Buffer.from(raw, 'utf8')).length;
134
+ return Math.min(1, deflated / Buffer.byteLength(raw, 'utf8'));
135
+ }
136
+ /**
137
+ * Choose how many of `items` to keep. Deterministic. The result is clamped to
138
+ * `[min, max]` (and never exceeds `items.length`), so an explicit cap passed as
139
+ * `max` is honoured whenever it is not below the floor.
140
+ * If the bounds conflict (`max` below the effective `min`, which is never less than 1) the floor wins: the function returns `min`, or `items.length` when there are no more than `min` items. */
141
+ export function computeOptimalK(items, opts = {}) {
142
+ const n = items.length;
143
+ const min = Math.max(1, opts.min ?? 1); // the floor is never below 1
144
+ const max = Math.min(opts.max ?? n, n); // the cap is never above the item count
145
+ if (n <= min) // no more items than the floor: keep them all
146
+ return n;
147
+ if (max <= min) // no room between floor and cap: the floor wins
148
+ return min;
149
+ // Tokenize + bigram each item ONCE, then reuse the sets for both the coverage
150
+ // curve and the simhash near-dup scan (was computed twice).
151
+ const gramSets = items.map(bigrams);
152
+ const curve = coverageFromGrams(gramSets);
153
+ const knee = kneedle(curve);
154
+ let k;
155
+ if (knee.deflection < FLATNESS_EPSILON) {
156
+ // Near-linear coverage ⇒ each item adds information ⇒ keep the most.
157
+ k = max;
158
+ }
159
+ else {
160
+ k = knee.index + 1; // knee.index is zero-based; adding 1 turns it into a count
161
+ }
162
+ // Bias shifts how far past the knee we keep (it also scales the k = max fallback above; the final clamp restores the cap).
163
+ k = Math.round(k * BIAS_FACTOR[opts.bias ?? 'moderate']);
164
+ // Never keep more than the number of non-near-duplicate items.
165
+ k = Math.min(k, uniqueFromGrams(gramSets));
166
+ // Very redundant corpora (low zlib ratio) pull K down toward the floor.
167
+ if (zlibRedundancy(items) < 0.25)
168
+ k = Math.min(k, Math.max(min, Math.ceil(max * 0.25)));
169
+ return Math.max(min, Math.min(max, k)); // min is applied last, so it takes precedence over max
170
+ }
@@ -0,0 +1,30 @@
1
+ /**
2
+ * Per-column value-dictionary (enum) encoding for the columnar table — a
3
+ * lossless token reduction for low-cardinality columns. The columnar form
4
+ * already hoists the schema (column names written once), but a column like the
5
+ * knowledge graph's `kind`/`relation`/`source` still writes its value once per
6
+ * row. This pass replaces those repeats with a one-time dictionary plus a small
7
+ * integer index per row.
8
+ *
9
+ * Disambiguation: a cell is a dict INDEX iff its column name is a key of the
10
+ * returned `dict` (decided structurally, never by the cell's value/type), so a
11
+ * literal-number column and a dict-encoded numeric enum never collide. Absent
12
+ * cells get no index (they stay in `absent` and are skipped before any deref).
13
+ *
14
+ * Pure and deterministic (dict values in first-appearance order). Two guards
15
+ * ensure it NEVER inflates: a per-column byte check, and a table-level byte
16
+ * fallback that returns the dict-free rows when the dict didn't actually shrink.
17
+ */
18
+ export interface IValueDictResult {
19
+ /** Rows with dict-encoded columns rewritten to small integer indices into `dict` (or the input rows unchanged when no column was encoded). */
20
+ rows: unknown[][];
21
+ /** Per-column value tables keyed by column name, values in first-appearance order; present only when at least one column was encoded. A cell is a dict index iff its column name is a key here. */
22
+ dict?: Record<string, unknown[]>;
23
+ }
24
+ /**
25
+ * Dictionary-encode every low-cardinality column that strictly shrinks. Returns
26
+ * the (possibly unchanged) rows and an optional `dict` keyed by column name. The input `rows` is
27
+ * never mutated — it is cloned lazily only if a column is encoded. `cols` holds the column names; `absent` lists the cells that get no index (NOTE(review): presumably `[row, column]` pairs — confirm against the implementation).
28
+ */
29
+ export declare function applyValueDictionaries(cols: readonly string[], rows: readonly unknown[][], absent: ReadonlyArray<readonly [number, number]>): IValueDictResult;
30
+ //# sourceMappingURL=apply-value-dictionaries.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"apply-value-dictionaries.d.ts","sourceRoot":"","sources":["../../src/table/apply-value-dictionaries.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;GAgBG;AAOH,MAAM,WAAW,gBAAgB;IAC/B,yFAAyF;IACzF,IAAI,EAAE,OAAO,EAAE,EAAE,CAAC;IAClB,kFAAkF;IAClF,IAAI,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,OAAO,EAAE,CAAC,CAAC;CAClC;AAOD;;;;GAIG;AACH,wBAAgB,sBAAsB,CACpC,IAAI,EAAE,SAAS,MAAM,EAAE,EACvB,IAAI,EAAE,SAAS,OAAO,EAAE,EAAE,EAC1B,MAAM,EAAE,aAAa,CAAC,SAAS,CAAC,MAAM,EAAE,MAAM,CAAC,CAAC,GAC/C,gBAAgB,CAkElB"}