@shrkcrft/compress 0.1.0-alpha.16
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +21 -0
- package/dist/cache/align-volatile-tokens.d.ts +13 -0
- package/dist/cache/align-volatile-tokens.d.ts.map +1 -0
- package/dist/cache/align-volatile-tokens.js +51 -0
- package/dist/cache/alignment-map.d.ts +23 -0
- package/dist/cache/alignment-map.d.ts.map +1 -0
- package/dist/cache/alignment-map.js +1 -0
- package/dist/cache/alignment-result.d.ts +11 -0
- package/dist/cache/alignment-result.d.ts.map +1 -0
- package/dist/cache/alignment-result.js +1 -0
- package/dist/cache/detect-volatile-tokens.d.ts +10 -0
- package/dist/cache/detect-volatile-tokens.d.ts.map +1 -0
- package/dist/cache/detect-volatile-tokens.js +41 -0
- package/dist/cache/placeholder.d.ts +28 -0
- package/dist/cache/placeholder.d.ts.map +1 -0
- package/dist/cache/placeholder.js +0 -0
- package/dist/cache/restore-volatile-tokens.d.ts +10 -0
- package/dist/cache/restore-volatile-tokens.d.ts.map +1 -0
- package/dist/cache/restore-volatile-tokens.js +21 -0
- package/dist/cache/volatile-classify.d.ts +11 -0
- package/dist/cache/volatile-classify.d.ts.map +1 -0
- package/dist/cache/volatile-classify.js +35 -0
- package/dist/cache/volatile-kind.d.ts +13 -0
- package/dist/cache/volatile-kind.d.ts.map +1 -0
- package/dist/cache/volatile-kind.js +13 -0
- package/dist/cache/volatile-token.d.ts +14 -0
- package/dist/cache/volatile-token.d.ts.map +1 -0
- package/dist/cache/volatile-token.js +1 -0
- package/dist/ccr/ccr-entry.d.ts +13 -0
- package/dist/ccr/ccr-entry.d.ts.map +1 -0
- package/dist/ccr/ccr-entry.js +1 -0
- package/dist/ccr/ccr-key.d.ts +9 -0
- package/dist/ccr/ccr-key.d.ts.map +1 -0
- package/dist/ccr/ccr-key.js +19 -0
- package/dist/ccr/ccr-marker.d.ts +23 -0
- package/dist/ccr/ccr-marker.d.ts.map +1 -0
- package/dist/ccr/ccr-marker.js +30 -0
- package/dist/ccr/ccr-store.d.ts +18 -0
- package/dist/ccr/ccr-store.d.ts.map +1 -0
- package/dist/ccr/ccr-store.js +1 -0
- package/dist/ccr/file-ccr-store.d.ts +19 -0
- package/dist/ccr/file-ccr-store.d.ts.map +1 -0
- package/dist/ccr/file-ccr-store.js +53 -0
- package/dist/ccr/in-memory-ccr-store.d.ts +21 -0
- package/dist/ccr/in-memory-ccr-store.d.ts.map +1 -0
- package/dist/ccr/in-memory-ccr-store.js +45 -0
- package/dist/ccr/ttl-file-ccr-store.d.ts +43 -0
- package/dist/ccr/ttl-file-ccr-store.d.ts.map +1 -0
- package/dist/ccr/ttl-file-ccr-store.js +117 -0
- package/dist/code/compress-code.d.ts +4 -0
- package/dist/code/compress-code.d.ts.map +1 -0
- package/dist/code/compress-code.js +294 -0
- package/dist/compress-content.d.ts +11 -0
- package/dist/compress-content.d.ts.map +1 -0
- package/dist/compress-content.js +79 -0
- package/dist/content/content-type.d.ts +28 -0
- package/dist/content/content-type.d.ts.map +1 -0
- package/dist/content/content-type.js +28 -0
- package/dist/content/detect-content-type.d.ts +9 -0
- package/dist/content/detect-content-type.d.ts.map +1 -0
- package/dist/content/detect-content-type.js +184 -0
- package/dist/content/segment.d.ts +21 -0
- package/dist/content/segment.d.ts.map +1 -0
- package/dist/content/segment.js +117 -0
- package/dist/index.d.ts +61 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +49 -0
- package/dist/json/compress-json.d.ts +18 -0
- package/dist/json/compress-json.d.ts.map +1 -0
- package/dist/json/compress-json.js +139 -0
- package/dist/json/render-compact-json.d.ts +10 -0
- package/dist/json/render-compact-json.d.ts.map +1 -0
- package/dist/json/render-compact-json.js +18 -0
- package/dist/relevance/bm25.d.ts +26 -0
- package/dist/relevance/bm25.d.ts.map +1 -0
- package/dist/relevance/bm25.js +115 -0
- package/dist/result/compress-options.d.ts +26 -0
- package/dist/result/compress-options.d.ts.map +1 -0
- package/dist/result/compress-options.js +1 -0
- package/dist/result/compression-result.d.ts +26 -0
- package/dist/result/compression-result.d.ts.map +1 -0
- package/dist/result/compression-result.js +1 -0
- package/dist/result/compression-strategy.d.ts +30 -0
- package/dist/result/compression-strategy.d.ts.map +1 -0
- package/dist/result/compression-strategy.js +30 -0
- package/dist/table/adaptive-size.d.ts +46 -0
- package/dist/table/adaptive-size.d.ts.map +1 -0
- package/dist/table/adaptive-size.js +170 -0
- package/dist/table/apply-value-dictionaries.d.ts +30 -0
- package/dist/table/apply-value-dictionaries.d.ts.map +1 -0
- package/dist/table/apply-value-dictionaries.js +99 -0
- package/dist/table/column-presence.d.ts +20 -0
- package/dist/table/column-presence.d.ts.map +1 -0
- package/dist/table/column-presence.js +52 -0
- package/dist/table/columnar-json.d.ts +24 -0
- package/dist/table/columnar-json.d.ts.map +1 -0
- package/dist/table/columnar-json.js +83 -0
- package/dist/table/columnar-table.d.ts +24 -0
- package/dist/table/columnar-table.d.ts.map +1 -0
- package/dist/table/columnar-table.js +1 -0
- package/dist/table/compact-object-array.d.ts +12 -0
- package/dist/table/compact-object-array.d.ts.map +1 -0
- package/dist/table/compact-object-array.js +88 -0
- package/dist/table/field-spec.d.ts +13 -0
- package/dist/table/field-spec.d.ts.map +1 -0
- package/dist/table/field-spec.js +1 -0
- package/dist/table/object-map.d.ts +28 -0
- package/dist/table/object-map.d.ts.map +1 -0
- package/dist/table/object-map.js +119 -0
- package/dist/table/render-table.d.ts +11 -0
- package/dist/table/render-table.d.ts.map +1 -0
- package/dist/table/render-table.js +39 -0
- package/dist/table/sample-object-array.d.ts +11 -0
- package/dist/table/sample-object-array.d.ts.map +1 -0
- package/dist/table/sample-object-array.js +171 -0
- package/dist/table/sample-options.d.ts +29 -0
- package/dist/table/sample-options.d.ts.map +1 -0
- package/dist/table/sample-options.js +1 -0
- package/dist/table/sampled-table.d.ts +33 -0
- package/dist/table/sampled-table.d.ts.map +1 -0
- package/dist/table/sampled-table.js +8 -0
- package/dist/table/table-compaction.d.ts +19 -0
- package/dist/table/table-compaction.d.ts.map +1 -0
- package/dist/table/table-compaction.js +1 -0
- package/dist/table/table-formats.d.ts +23 -0
- package/dist/table/table-formats.d.ts.map +1 -0
- package/dist/table/table-formats.js +233 -0
- package/dist/text/compress-diff.d.ts +20 -0
- package/dist/text/compress-diff.d.ts.map +1 -0
- package/dist/text/compress-diff.js +344 -0
- package/dist/text/compress-lines.d.ts +12 -0
- package/dist/text/compress-lines.d.ts.map +1 -0
- package/dist/text/compress-lines.js +44 -0
- package/dist/text/compress-log.d.ts +12 -0
- package/dist/text/compress-log.d.ts.map +1 -0
- package/dist/text/compress-log.js +202 -0
- package/dist/text/compress-markdown.d.ts +15 -0
- package/dist/text/compress-markdown.d.ts.map +1 -0
- package/dist/text/compress-markdown.js +96 -0
- package/dist/text/compress-search.d.ts +11 -0
- package/dist/text/compress-search.d.ts.map +1 -0
- package/dist/text/compress-search.js +78 -0
- package/dist/text/finalize.d.ts +21 -0
- package/dist/text/finalize.d.ts.map +1 -0
- package/dist/text/finalize.js +54 -0
- package/dist/text/line-utils.d.ts +20 -0
- package/dist/text/line-utils.d.ts.map +1 -0
- package/dist/text/line-utils.js +65 -0
- package/dist/text/lockfile-names.d.ts +3 -0
- package/dist/text/lockfile-names.d.ts.map +1 -0
- package/dist/text/lockfile-names.js +33 -0
- package/dist/text/log-template.d.ts +31 -0
- package/dist/text/log-template.d.ts.map +1 -0
- package/dist/text/log-template.js +239 -0
- package/dist/tokens/estimate-tokens.d.ts +17 -0
- package/dist/tokens/estimate-tokens.d.ts.map +1 -0
- package/dist/tokens/estimate-tokens.js +53 -0
- package/dist/tokens/token-savings.d.ts +20 -0
- package/dist/tokens/token-savings.d.ts.map +1 -0
- package/dist/tokens/token-savings.js +1 -0
- package/package.json +52 -0
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
import type { ISampleOptions } from './sample-options.js';
|
|
2
|
+
import type { ISampledTable } from './sampled-table.js';
|
|
3
|
+
/**
|
|
4
|
+
* SmartCrusher-style lossy sampler for a homogeneous object array. Keeps
|
|
5
|
+
* representative rows — front/back anchors, query matches, numeric outliers,
|
|
6
|
+
* one per dedup class — and drops the rest, in ascending original order, with
|
|
7
|
+
* full provenance. Pure and deterministic (no RNG/clock; ties break by index).
|
|
8
|
+
* Returns `null` if the input isn't a homogeneous object array.
|
|
9
|
+
*/
|
|
10
|
+
export declare function sampleObjectArray(items: unknown, opts?: ISampleOptions): ISampledTable | null;
|
|
11
|
+
//# sourceMappingURL=sample-object-array.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"sample-object-array.d.ts","sourceRoot":"","sources":["../../src/table/sample-object-array.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,cAAc,EAAE,MAAM,qBAAqB,CAAC;AAC1D,OAAO,KAAK,EAAE,aAAa,EAAE,MAAM,oBAAoB,CAAC;AAqCxD;;;;;;GAMG;AACH,wBAAgB,iBAAiB,CAAC,KAAK,EAAE,OAAO,EAAE,IAAI,GAAE,cAAmB,GAAG,aAAa,GAAG,IAAI,CA8HjG"}
|
|
@@ -0,0 +1,171 @@
|
|
|
1
|
+
import { compactObjectArray } from "./compact-object-array.js";
|
|
2
|
+
import { applyValueDictionaries } from "./apply-value-dictionaries.js";
|
|
3
|
+
import { computeOptimalK } from "./adaptive-size.js";
|
|
4
|
+
import { topByBm25 } from "../relevance/bm25.js";
|
|
5
|
+
import { queryTokens } from "../text/line-utils.js";
|
|
6
|
+
// Bucket precedence used when the keep-set has to be trimmed to the cap: the
// candidates are sorted ascending by this number, so a LOWER value is kept first.
// Query matches rank ABOVE front/back anchors: when the caller asked about
// something specific, a relevant row matters more than a positional anchor
// under a tight cap. With no query there are no matches, so anchors lead as before.
const PRECEDENCE = { match: 0, head: 1, tail: 2, outlier: 3 };
|
|
10
|
+
/**
 * Choose the column to rank outliers on when the caller did not name one.
 *
 * Only columns typed `int` or `float` are considered, and only their finite
 * numeric cells contribute. A column needs at least two such cells. The column
 * with the largest population variance wins; an exact tie goes to the
 * lexicographically smaller column name.
 *
 * @param cols Column specs (`name`, `type`), positionally aligned with each row.
 * @param rows Row-major cell values.
 * @returns The winning column name, or `undefined` when no column qualifies.
 */
function pickOutlierField(cols, rows) {
    let winner;
    for (const [colIdx, spec] of cols.entries()) {
        const isNumericColumn = spec.type === 'int' || spec.type === 'float';
        if (!isNumericColumn)
            continue;
        const samples = rows
            .map((row) => row[colIdx])
            .filter((cell) => typeof cell === 'number' && Number.isFinite(cell));
        if (samples.length < 2)
            continue;
        let total = 0;
        for (const sample of samples)
            total += sample;
        const mean = total / samples.length;
        let squaredError = 0;
        for (const sample of samples)
            squaredError += (sample - mean) * (sample - mean);
        const variance = squaredError / samples.length;
        const beatsWinner = winner === undefined ||
            variance > winner.variance ||
            (variance === winner.variance && spec.name < winner.name);
        if (beatsWinner)
            winner = { name: spec.name, variance };
    }
    return winner === undefined ? undefined : winner.name;
}
|
|
33
|
+
/**
 * Coerce an optional row-count option to a non-negative integer.
 *
 * `undefined`/`null` fall back to the default. Anything that is not a positive
 * number (negative, zero, NaN, non-number) becomes 0, because a negative count
 * fed to `Array.prototype.slice` would select "everything but the last k"
 * instead of nothing. Fractions are floored so head and tail sizes agree.
 */
function normalizeSampleCount(value, fallback) {
    if (value === undefined || value === null)
        return fallback;
    if (typeof value !== 'number' || !(value > 0))
        return 0;
    return Math.floor(value);
}
/**
 * SmartCrusher-style lossy sampler for a homogeneous object array.
 *
 * Pipeline:
 *  1. Compact the input to columnar form; optionally collapse byte-identical
 *     rows to their earliest occurrence (the "representatives").
 *  2. Let each bucket claim representatives: front/back anchors, BM25-ranked
 *     query matches, and the smallest/largest values of one numeric column.
 *     A row claimed by several buckets is attributed to the one with the best
 *     `PRECEDENCE`, so a query match that happens to sit inside the anchors is
 *     still treated (and counted) as a match.
 *  3. If more rows were claimed than the cap allows, keep the first (and, room
 *     permitting, the last) representative and fill the rest by precedence,
 *     ties broken by original index.
 *  4. Emit the kept rows in ascending original order with provenance counts.
 *
 * Deterministic: no RNG or clock is used in this function.
 *
 * @param items Candidate input; anything `compactObjectArray` rejects yields `null`.
 * @param opts  Sampling knobs (see `ISampleOptions`); all optional.
 * @returns The sampled table envelope, or `null` if the input isn't a
 *          homogeneous object array.
 */
export function sampleObjectArray(items, opts = {}) {
    const table = compactObjectArray(items);
    if (!table)
        return null;
    const { cols, rows, absent, originalCount } = table;
    const anchors = normalizeSampleCount(opts.anchors, 8);
    const outliersN = normalizeSampleCount(opts.outliers, 8);
    const matchesN = normalizeSampleCount(opts.matches, 16);
    // Serialize each row at most once; the adaptive sizer, the dedup pass and the
    // BM25 documents all need the same JSON text.
    let rowKeyCache;
    const rowKeys = () => {
        if (rowKeyCache === undefined)
            rowKeyCache = rows.map((r) => JSON.stringify(r));
        return rowKeyCache;
    };
    // P3.1: with no explicit cap, size the keep-set from the data's information
    // curve instead of a flat 200. Floored at the representative budget so the
    // adaptive cap never trims the bucket reps (anchors/matches/outliers) — it
    // only declines to over-keep on large redundant arrays. An explicit
    // `maxItems` always wins.
    const repBudget = anchors * 2 + matchesN + outliersN * 2;
    const maxItems = opts.maxItems && opts.maxItems > 0
        ? opts.maxItems
        // Hand the helper its own copy so the cached keys cannot be disturbed.
        : computeOptimalK(rowKeys().slice(), {
            min: Math.min(repBudget, rows.length),
            max: 200,
            ...(opts.bias ? { bias: opts.bias } : {}),
        });
    const tokens = queryTokens(opts.query);
    // 1. Dedup → representatives (smallest index per byte-identical row).
    let repIdx;
    let deduped = 0;
    if (opts.dedup !== false) {
        const seen = new Map();
        const keys = rowKeys();
        for (let i = 0; i < keys.length; i += 1) {
            if (!seen.has(keys[i]))
                seen.set(keys[i], i);
            else
                deduped += 1;
        }
        repIdx = [...seen.values()].sort((a, b) => a - b);
    }
    else {
        repIdx = rows.map((_, i) => i);
    }
    // 2. Bucket selection. Each index remembers the highest-precedence bucket
    //    that claimed it (lower PRECEDENCE number wins), independent of the order
    //    in which the buckets run below.
    const bucketOf = new Map();
    const claim = (i, b) => {
        const current = bucketOf.get(i);
        if (current === undefined || PRECEDENCE[b] < PRECEDENCE[current])
            bucketOf.set(i, b);
    };
    for (const i of repIdx.slice(0, anchors))
        claim(i, 'head');
    for (const i of repIdx.slice(Math.max(0, repIdx.length - anchors)))
        claim(i, 'tail');
    if (tokens.length > 0 && opts.query) {
        // P3.2: rank query matches by BM25 (idf-weighted, length-normalized, with an
        // exact-match boost for ID-shaped terms) instead of bare token overlap, so a
        // uniquely-relevant row outranks one that merely repeats a common word.
        const keys = rowKeys();
        const docs = repIdx.map((i) => keys[i]);
        for (const localIdx of topByBm25(opts.query, docs, matchesN)) {
            const orig = repIdx[localIdx];
            // Ignore an out-of-range position rather than registering `undefined`.
            if (orig !== undefined)
                claim(orig, 'match');
        }
    }
    const sortField = opts.outlierField ?? pickOutlierField(cols, rows);
    if (sortField) {
        const col = cols.findIndex((c) => c.name === sortField);
        if (col >= 0) {
            const vals = repIdx
                .map((i) => ({ i, v: rows[i]?.[col] }))
                .filter((x) => typeof x.v === 'number' && Number.isFinite(x.v))
                .sort((a, b) => a.v - b.v || a.i - b.i);
            // Lowest values, then highest values.
            for (const { i } of vals.slice(0, outliersN))
                claim(i, 'outlier');
            for (const { i } of vals.slice(Math.max(0, vals.length - outliersN)))
                claim(i, 'outlier');
        }
    }
    // 3. Hard cap by precedence, always keeping the first & last representative.
    let chosen = [...bucketOf.keys()];
    if (chosen.length > maxItems) {
        const firstRep = repIdx[0];
        const lastRep = repIdx[repIdx.length - 1];
        const forced = new Set();
        if (firstRep !== undefined)
            forced.add(firstRep);
        // Only force the last endpoint if the cap has room — otherwise maxItems=1
        // would over-keep 2 rows.
        if (lastRep !== undefined && forced.size < maxItems)
            forced.add(lastRep);
        const ranked = chosen
            .filter((i) => !forced.has(i))
            .sort((a, b) => PRECEDENCE[bucketOf.get(a)] - PRECEDENCE[bucketOf.get(b)] || a - b);
        const out = new Set(forced);
        for (const i of ranked) {
            if (out.size >= maxItems)
                break;
            out.add(i);
        }
        chosen = [...out];
    }
    const keptSorted = chosen.sort((a, b) => a - b);
    // 4. Build the sampled table (ascending original order; remap absent).
    const newIndexOf = new Map();
    keptSorted.forEach((orig, idx) => newIndexOf.set(orig, idx));
    const keptRows = keptSorted.map((i) => rows[i] ?? []);
    const keptAbsent = [];
    for (const [r, c] of absent) {
        const nr = newIndexOf.get(r);
        if (nr !== undefined)
            keptAbsent.push([nr, c]);
    }
    // Number of kept rows attributed to bucket `b`.
    const count = (b) => keptSorted.filter((i) => bucketOf.get(i) === b).length;
    // Value-dictionary encode the kept rows (over kept indices, so dict + indices
    // align). Decoded by the same expandColumnar deref the lossless path uses.
    const colNames = cols.map((c) => c.name);
    const { rows: dictRows, dict } = applyValueDictionaries(colNames, keptRows, keptAbsent);
    return {
        _table: {
            cols: colNames,
            rows: dictRows,
            absent: keptAbsent,
            ...(dict ? { dict } : {}),
            n: originalCount,
            sample: {
                kept: keptSorted.length,
                dropped: originalCount - keptSorted.length,
                anchorsHead: count('head'),
                anchorsTail: count('tail'),
                outliers: count('outlier'),
                matches: count('match'),
                deduped,
                srcRows: keptSorted,
                ...(sortField ? { sortField } : {}),
            },
        },
    };
}
|
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
/**
 * Knobs for the SmartCrusher row-sampler. All optional; defaults are
 * deterministic. The sampler keeps representative rows (anchors / outliers /
 * query matches / one-per-dedup-class) and drops the rest, recording why.
 */
export interface ISampleOptions {
    /**
     * Hard cap on KEPT rows. Only a positive value counts as set. When omitted
     * (or zero/negative) the cap is sized adaptively from the data, with 200
     * passed to the adaptive sizer as its upper bound — it is NOT a flat 200.
     */
    maxItems?: number;
    /** Rows kept from each of the front and back. Default 8. */
    anchors?: number;
    /**
     * Numeric column to rank outliers on; auto-picked (highest variance among
     * int/float columns, ties broken by column name) if omitted. A name that
     * matches no column disables outlier selection.
     */
    outlierField?: string;
    /** Extreme rows kept per tail (min side + max side). Default 8. */
    outliers?: number;
    /**
     * Query used to pick relevant rows: rows are ranked against it with BM25
     * and the best ones (up to `matches`) are kept as query matches. Under an
     * over-full cap, matches take precedence over anchors and outliers.
     */
    query?: string;
    /** Max query-matched rows to force-keep. Default 16. */
    matches?: number;
    /** Collapse byte-identical rows to one representative (earliest). Default true. */
    dedup?: boolean;
    /**
     * When no explicit `maxItems` is given, the cap is chosen adaptively from the
     * data's information curve (P3.1). `bias` shifts that knee: keep more
     * (`conservative`) or fewer (`aggressive`). Default `moderate`. Ignored when
     * `maxItems` is set.
     */
    bias?: 'conservative' | 'moderate' | 'aggressive';
}
|
|
29
|
+
//# sourceMappingURL=sample-options.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"sample-options.d.ts","sourceRoot":"","sources":["../../src/table/sample-options.ts"],"names":[],"mappings":"AAAA;;;;GAIG;AACH,MAAM,WAAW,cAAc;IAC7B,0CAA0C;IAC1C,QAAQ,CAAC,EAAE,MAAM,CAAC;IAClB,4DAA4D;IAC5D,OAAO,CAAC,EAAE,MAAM,CAAC;IACjB,qFAAqF;IACrF,YAAY,CAAC,EAAE,MAAM,CAAC;IACtB,mEAAmE;IACnE,QAAQ,CAAC,EAAE,MAAM,CAAC;IAClB,qEAAqE;IACrE,KAAK,CAAC,EAAE,MAAM,CAAC;IACf,wDAAwD;IACxD,OAAO,CAAC,EAAE,MAAM,CAAC;IACjB,mFAAmF;IACnF,KAAK,CAAC,EAAE,OAAO,CAAC;IAChB;;;;;OAKG;IACH,IAAI,CAAC,EAAE,cAAc,GAAG,UAAU,GAAG,YAAY,CAAC;CACnD"}
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
export {};
|
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
/**
 * A lossily-sampled columnar table. Extends the `_table` envelope (so
 * `expandColumnar` still yields the KEPT rows verbatim) with a `sample` block
 * recording how many rows were dropped and why. The FULL array is recovered
 * from the CCR original, not from this structure.
 */
export interface ISampledTable {
    _table: {
        /** Column names; `rows[r][c]` belongs to `cols[c]`. */
        cols: string[];
        /** Kept rows only, in ascending original order. */
        rows: unknown[][];
        /** `[row, col]` positions with no key on the source object; `row` indexes the KEPT rows. */
        absent: Array<[number, number]>;
        /** Per-column value dictionaries (see {@link IColumnarTable}); decode via the same deref. */
        dict?: Record<string, unknown[]>;
        /** Original row count (kept + dropped) — surfaced in the sampling note. */
        n: number;
        sample: {
            /** Number of rows present in `rows`. */
            kept: number;
            /** Rows not kept: `n - kept` (includes collapsed duplicates). */
            dropped: number;
            /** Kept rows attributed to the front-anchor bucket. A kept row is attributed to at most one bucket. */
            anchorsHead: number;
            /** Kept rows attributed to the back-anchor bucket. */
            anchorsTail: number;
            /** Kept rows attributed to the numeric-outlier bucket. */
            outliers: number;
            /** Kept rows attributed to the query-match bucket. */
            matches: number;
            /** Source rows that were byte-identical to an earlier row; 0 when dedup is disabled. */
            deduped: number;
            /** Original index of each kept row, strictly ascending. */
            srcRows: number[];
            /** Numeric column used for outlier selection, if any. */
            sortField?: string;
        };
    };
}
|
|
31
|
+
/** Type guard: a columnar table carrying sampling provenance. */
|
|
32
|
+
export declare function isSampledTable(value: unknown): value is ISampledTable;
|
|
33
|
+
//# sourceMappingURL=sampled-table.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"sampled-table.d.ts","sourceRoot":"","sources":["../../src/table/sampled-table.ts"],"names":[],"mappings":"AAEA;;;;;GAKG;AACH,MAAM,WAAW,aAAa;IAC5B,MAAM,EAAE;QACN,IAAI,EAAE,MAAM,EAAE,CAAC;QACf,IAAI,EAAE,OAAO,EAAE,EAAE,CAAC;QAClB,MAAM,EAAE,KAAK,CAAC,CAAC,MAAM,EAAE,MAAM,CAAC,CAAC,CAAC;QAChC,6FAA6F;QAC7F,IAAI,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,OAAO,EAAE,CAAC,CAAC;QACjC,2EAA2E;QAC3E,CAAC,EAAE,MAAM,CAAC;QACV,MAAM,EAAE;YACN,IAAI,EAAE,MAAM,CAAC;YACb,OAAO,EAAE,MAAM,CAAC;YAChB,WAAW,EAAE,MAAM,CAAC;YACpB,WAAW,EAAE,MAAM,CAAC;YACpB,QAAQ,EAAE,MAAM,CAAC;YACjB,OAAO,EAAE,MAAM,CAAC;YAChB,OAAO,EAAE,MAAM,CAAC;YAChB,2DAA2D;YAC3D,OAAO,EAAE,MAAM,EAAE,CAAC;YAClB,yDAAyD;YACzD,SAAS,CAAC,EAAE,MAAM,CAAC;SACpB,CAAC;KACH,CAAC;CACH;AAED,iEAAiE;AACjE,wBAAgB,cAAc,CAAC,KAAK,EAAE,OAAO,GAAG,KAAK,IAAI,aAAa,CAIrE"}
|
|
@@ -0,0 +1,8 @@
|
|
|
1
|
+
import { isColumnarTable } from "./columnar-json.js";
|
|
2
|
+
/**
 * Type guard for a sampled table: the value must first pass `isColumnarTable`,
 * and its `_table` envelope must carry a non-null object under `sample`.
 */
export function isSampledTable(value) {
    if (isColumnarTable(value)) {
        const envelope = value._table;
        if (!envelope)
            return false;
        return typeof envelope.sample === 'object' && envelope.sample !== null;
    }
    return false;
}
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
import type { IFieldSpec } from './field-spec.js';
|
|
2
|
+
/**
|
|
3
|
+
* Lossless columnar form of a homogeneous object array. The shared schema is
|
|
4
|
+
* hoisted once into {@link cols}; each row carries only values, positionally.
|
|
5
|
+
* `absent` records `[row, col]` positions whose key was missing on the source
|
|
6
|
+
* object (distinct from a present null) so the original array reconstructs
|
|
7
|
+
* exactly.
|
|
8
|
+
*/
|
|
9
|
+
export interface ITableCompaction {
|
|
10
|
+
/** Hoisted column schema, in a deterministic order. */
|
|
11
|
+
cols: IFieldSpec[];
|
|
12
|
+
/** Row-major values; `rows[r][c]` aligns to `cols[c]`. */
|
|
13
|
+
rows: unknown[][];
|
|
14
|
+
/** `[row, col]` positions where the source object had no such key. */
|
|
15
|
+
absent: Array<[number, number]>;
|
|
16
|
+
/** Number of source objects (equals `rows.length`). */
|
|
17
|
+
originalCount: number;
|
|
18
|
+
}
|
|
19
|
+
//# sourceMappingURL=table-compaction.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"table-compaction.d.ts","sourceRoot":"","sources":["../../src/table/table-compaction.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,UAAU,EAAE,MAAM,iBAAiB,CAAC;AAElD;;;;;;GAMG;AACH,MAAM,WAAW,gBAAgB;IAC/B,uDAAuD;IACvD,IAAI,EAAE,UAAU,EAAE,CAAC;IACnB,0DAA0D;IAC1D,IAAI,EAAE,OAAO,EAAE,EAAE,CAAC;IAClB,sEAAsE;IACtE,MAAM,EAAE,KAAK,CAAC,CAAC,MAAM,EAAE,MAAM,CAAC,CAAC,CAAC;IAChC,uDAAuD;IACvD,aAAa,EAAE,MAAM,CAAC;CACvB"}
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
export {};
|
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
import type { IColumnarTable } from './columnar-table.js';
|
|
2
|
+
/**
|
|
3
|
+
* Alternative, model-read-friendly encodings of a columnar table (P4.2). The
|
|
4
|
+
* default wire shape is the columnar JSON envelope; these offer the same data
|
|
5
|
+
* as CSV or Markdown key/value blocks for shapes where a flat layout reads more
|
|
6
|
+
* accurately. Both are reversible: their inverse rebuilds the original object
|
|
7
|
+
* array exactly (cell values are JSON-encoded, so strings, numbers, booleans,
|
|
8
|
+
* null, and nested values all round-trip; an absent key stays absent).
|
|
9
|
+
*
|
|
10
|
+
* Whether any of these becomes a default awaits the P4.1 comprehension eval —
|
|
11
|
+
* until then they are opt-in. Pure and deterministic.
|
|
12
|
+
*/
|
|
13
|
+
type Row = Record<string, unknown>;
|
|
14
|
+
/** Encode a columnar table as CSV: a header row of column names, then one row per record. */
|
|
15
|
+
export declare function columnarToCsv(table: IColumnarTable): string;
|
|
16
|
+
/** Inverse of {@link columnarToCsv}: rebuild the original object array. */
|
|
17
|
+
export declare function csvToObjects(csv: string): Row[];
|
|
18
|
+
/** Encode a columnar table as Markdown key/value blocks, one block per record. */
|
|
19
|
+
export declare function columnarToMarkdownKv(table: IColumnarTable): string;
|
|
20
|
+
/** Inverse of {@link columnarToMarkdownKv}: rebuild the original object array. */
|
|
21
|
+
export declare function markdownKvToObjects(md: string): Row[];
|
|
22
|
+
export {};
|
|
23
|
+
//# sourceMappingURL=table-formats.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"table-formats.d.ts","sourceRoot":"","sources":["../../src/table/table-formats.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,cAAc,EAAE,MAAM,qBAAqB,CAAC;AAE1D;;;;;;;;;;GAUG;AAEH,KAAK,GAAG,GAAG,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,CAAC;AA4EnC,6FAA6F;AAC7F,wBAAgB,aAAa,CAAC,KAAK,EAAE,cAAc,GAAG,MAAM,CAc3D;AAED,2EAA2E;AAC3E,wBAAgB,YAAY,CAAC,GAAG,EAAE,MAAM,GAAG,GAAG,EAAE,CAyB/C;AAaD,kFAAkF;AAClF,wBAAgB,oBAAoB,CAAC,KAAK,EAAE,cAAc,GAAG,MAAM,CAkBlE;AAED,kFAAkF;AAClF,wBAAgB,mBAAmB,CAAC,EAAE,EAAE,MAAM,GAAG,GAAG,EAAE,CAiDrD"}
|
|
@@ -0,0 +1,233 @@
|
|
|
1
|
+
/**
 * Quote a CSV field only when it has to be: a field containing a double quote,
 * comma, LF or CR is wrapped in double quotes with inner quotes doubled;
 * anything else is returned untouched.
 */
function csvEscapeField(s) {
    const mustQuote = /[",\n\r]/.test(s);
    if (!mustQuote)
        return s;
    const doubled = s.replace(/"/g, '""');
    return '"' + doubled + '"';
}
|
|
4
|
+
/**
 * Split a complete CSV document into records, each an array of field strings.
 *
 * Quoting is resolved while scanning, not after splitting on lines, so a
 * `"`-quoted field may contain commas, `\r`, `\n` and `""`-escaped quotes
 * without being torn apart. Outside quotes, `\n`, `\r\n` and a lone `\r` each
 * end a record. A document that ends exactly on a record separator yields no
 * extra empty record.
 *
 * @param csv The CSV text.
 * @returns Records in document order; an empty string yields `[]`.
 */
function parseCsvRecords(csv) {
    const parsed = [];
    let fields = [];
    let buffer = '';
    let quoted = false;
    // True once the record being built has seen any content or a field separator.
    let started = false;
    const closeField = () => {
        fields.push(buffer);
        buffer = '';
        started = true;
    };
    const closeRecord = () => {
        closeField();
        parsed.push(fields);
        fields = [];
        started = false;
    };
    let pos = 0;
    while (pos < csv.length) {
        const ch = csv[pos];
        pos += 1; // `csv[pos]` is now the look-ahead character
        if (quoted) {
            started = true;
            if (ch !== '"') {
                buffer += ch;
            }
            else if (csv[pos] === '"') {
                // `""` inside quotes is one literal quote.
                buffer += '"';
                pos += 1;
            }
            else {
                quoted = false;
            }
            continue;
        }
        switch (ch) {
            case '"':
                quoted = true;
                started = true;
                break;
            case ',':
                closeField();
                break;
            case '\n':
                closeRecord();
                break;
            case '\r':
                closeRecord();
                // Swallow the LF of a CRLF pair so it doesn't open a second record.
                if (csv[pos] === '\n')
                    pos += 1;
                break;
            default:
                buffer += ch;
                started = true;
        }
    }
    // Emit the last record unless the text stopped right after a separator.
    if (started || buffer.length > 0 || fields.length > 0)
        closeRecord();
    return parsed;
}
|
|
70
|
+
/**
 * Look up the value a reader should see for cell (`r`, `c`).
 *
 * When the table has a `dict` with an own entry for the cell's column, the
 * stored cell is used as an index into that entry; otherwise the stored cell is
 * returned as-is. A missing row yields `undefined`.
 */
function cellValue(table, r, c) {
    const envelope = table._table;
    const row = envelope.rows[r];
    const stored = row === undefined || row === null ? undefined : row[c];
    const column = envelope.cols[c];
    const dictionaries = envelope.dict;
    if (!dictionaries || column === undefined)
        return stored;
    // Own-property test so a column called e.g. "toString" isn't mistaken for a dictionary.
    if (!Object.prototype.hasOwnProperty.call(dictionaries, column))
        return stored;
    return dictionaries[column][stored];
}
|
|
83
|
+
/**
 * Encode a columnar table as CSV: a header row of column names, then one row
 * per record, joined with `\n` and no trailing newline.
 *
 * Cell encoding:
 *  - absent key → a truly empty field;
 *  - present value → its JSON text (dictionary-encoded cells are dereferenced
 *    first), CSV-quoted when it contains a quote, comma or line break.
 *
 * A record whose whole line would be empty (a single-column table with that
 * cell absent, or a table with no columns) is written as `""`. The reader
 * treats a document that ends on a record separator as having no further
 * record, so a bare empty last line would silently drop that record; a quoted
 * empty field still parses back to one empty — i.e. absent — field.
 *
 * @param table Columnar table envelope (`_table.cols`, `rows`, `absent`, optional `dict`).
 * @returns The CSV text.
 */
export function columnarToCsv(table) {
    const { cols, rows, absent } = table._table;
    const w = cols.length;
    // Flatten [row, col] to a single number for O(1) membership tests.
    const absentSet = new Set(absent.map(([r, c]) => r * w + c));
    const out = [cols.map(csvEscapeField).join(',')];
    for (let r = 0; r < rows.length; r += 1) {
        const cells = cols.map((_, c) => 
        // Absent → a truly empty field; present → JSON-encoded value (always
        // quoted by the escaper when it would otherwise be ambiguous).
        absentSet.has(r * w + c) ? '' : csvEscapeField(JSON.stringify(cellValue(table, r, c))));
        const line = cells.join(',');
        // Keep an all-empty record visible so it survives as the final line.
        out.push(line === '' ? '""' : line);
    }
    return out.join('\n');
}
|
|
98
|
+
/**
 * Inverse of {@link columnarToCsv}: rebuild the original object array.
 *
 * The first record is the header; every later record becomes one object. An
 * empty (or missing) field means the key was absent on the source object,
 * because a present value is always non-empty JSON text (even `""`). Any other
 * field is parsed with `JSON.parse`, which throws on malformed input.
 *
 * @param csv CSV text as produced by `columnarToCsv`.
 * @returns One plain object per data record; `[]` for an empty document.
 */
export function csvToObjects(csv) {
    const [header = [], ...dataRecords] = parseCsvRecords(csv);
    return dataRecords.map((fields) => {
        const obj = {};
        header.forEach((name, c) => {
            if (name === undefined)
                return;
            const cell = fields[c];
            const keyWasAbsent = cell === undefined || cell === '';
            if (keyWasAbsent)
                return;
            // defineProperty (not assignment) so a column named e.g. "__proto__"
            // becomes an ordinary own property.
            Object.defineProperty(obj, name, {
                value: JSON.parse(cell),
                writable: true,
                enumerable: true,
                configurable: true,
            });
        });
        return obj;
    });
}
|
|
128
|
+
/**
 * Decide whether a column name must be JSON-encoded in a `key: value` line.
 *
 * A key can be written bare only if the decoder will read it back unchanged.
 * It must be quoted when it:
 *  - contains `: ` — the decoder splits a bare line on the FIRST `: `;
 *  - contains `\n` or `\r` — it would break the line structure;
 *  - starts with `"` — a leading quote is the decoder's signal for a JSON key;
 *  - starts with whitespace — the decoder strips leading whitespace from
 *    continuation lines to remove their indent, which would also eat the
 *    key's own leading whitespace.
 *
 * @param key Column name.
 * @returns `true` if the key has to be JSON-encoded to survive the round trip.
 */
function markdownKeyNeedsQuoting(key) {
    return (key.includes(': ') ||
        key.includes('\n') ||
        key.includes('\r') ||
        key.startsWith('"') ||
        /^\s/.test(key));
}
|
|
138
|
+
/**
 * Encode a columnar table as Markdown key/value blocks, one block per record.
 *
 * Each present cell becomes a `key: <JSON value>` line (dictionary-encoded
 * cells are dereferenced first; keys are JSON-encoded when
 * `markdownKeyNeedsQuoting` says so). Absent cells are omitted. The first line
 * of a record is prefixed with `- `, later lines are indented beneath it, and a
 * record with no present keys is written as a lone `-`.
 */
export function columnarToMarkdownKv(table) {
    const { cols, rows, absent } = table._table;
    const width = cols.length;
    const missing = new Set();
    for (const [r, c] of absent)
        missing.add(r * width + c);
    const blocks = Array.from(rows, (_, r) => {
        const lines = [];
        for (let c = 0; c < width; c += 1) {
            if (missing.has(r * width + c))
                continue;
            const name = String(cols[c]);
            const label = markdownKeyNeedsQuoting(name) ? JSON.stringify(name) : name;
            // The first emitted line opens the record; the others are continuation lines.
            const prefix = lines.length === 0 ? '- ' : ' ';
            lines.push(prefix + label + ': ' + JSON.stringify(cellValue(table, r, c)));
        }
        return lines.length === 0 ? '-' : lines.join('\n');
    });
    return blocks.join('\n');
}
|
|
159
|
+
/**
 * Inverse of {@link columnarToMarkdownKv}: rebuild the original object array.
 *
 * The input is read line by line (split on `\n` or `\r\n`):
 *  - a line equal to `-` starts a new, empty record;
 *  - a line starting with `- ` starts a new record and carries its first key;
 *  - any other line has its leading whitespace removed and is read as a
 *    further `key: value` pair of the current record (ignored when no record
 *    has been opened yet).
 *
 * A key beginning with `"` is decoded as a JSON string; otherwise the key is
 * the text before the first `: `. Lines without a usable key or separator are
 * skipped. The value is decoded with `JSON.parse`, which throws on malformed
 * input.
 *
 * @param md - Markdown key/value text.
 * @returns The decoded records, in input order.
 */
export function markdownKvToObjects(md) {
    const records = [];
    let record = null;
    // Flush the record in progress (if any) and begin a fresh one.
    const startRecord = () => {
        if (record !== null) {
            records.push(record);
        }
        record = {};
    };
    for (const rawLine of md.split(/\r?\n/)) {
        if (rawLine === '-') {
            startRecord();
            continue;
        }
        let text;
        if (rawLine.startsWith('- ')) {
            startRecord();
            text = rawLine.slice(2);
        }
        else {
            if (record === null) {
                continue;
            }
            text = rawLine.replace(/^\s+/, '');
        }
        let key;
        let keyEnd;
        if (text.startsWith('"')) {
            // JSON-encoded key: locate the end of the string token, require the
            // `: ` separator right after it, then decode the token.
            keyEnd = jsonStringEnd(text);
            if (keyEnd < 0) {
                continue;
            }
            if (!text.startsWith(': ', keyEnd)) {
                continue;
            }
            try {
                key = JSON.parse(text.slice(0, keyEnd));
            }
            catch {
                continue;
            }
        }
        else {
            keyEnd = text.indexOf(': ');
            if (keyEnd < 0) {
                continue;
            }
            key = text.slice(0, keyEnd);
        }
        const decoded = JSON.parse(text.slice(keyEnd + 2));
        // defineProperty creates an own data property for any key text.
        Object.defineProperty(record, key, {
            value: decoded,
            writable: true,
            enumerable: true,
            configurable: true,
        });
    }
    if (record !== null) {
        records.push(record);
    }
    return records;
}
|
|
217
|
+
/**
 * Find where the JSON string token at the start of `s` ends.
 *
 * `s[0]` is taken to be the opening `"` and is not inspected. Scanning starts
 * at index 1; a backslash consumes the character after it, so `\"` does not
 * close the token.
 *
 * @param s - Text whose first character is the opening quote.
 * @returns The index just past the closing quote, or -1 when the token is
 *   unterminated.
 */
function jsonStringEnd(s) {
    let pos = 1;
    while (pos < s.length) {
        const c = s.charAt(pos);
        if (c === '"') {
            return pos + 1;
        }
        // Step over an escape pair as a unit, otherwise over a single character.
        pos += c === '\\' ? 2 : 1;
    }
    return -1;
}
|
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
import type { ICompressionResult } from '../result/compression-result.js';
|
|
2
|
+
import type { ICompressOptions } from '../result/compress-options.js';
|
|
3
|
+
/**
 * Reduce a unified diff to its highest-signal lines. Two passes compose:
 *
 * 1. **Diff-noise offload** (this wrapper): lockfile sections
 * (`package-lock.json` & friends) collapse to a one-line marker, and
 * whitespace-only hunks (pure reindentation) collapse to a marker — the
 * single largest sources of useless diff tokens. Both are CCR-recoverable.
 * 2. **Core hunk compression** ({@link compressDiffCore}): the remaining real
 * changes keep their changed lines plus a tight context window, capped per
 * file.
 *
 * The offload pass only engages when a `diff --git` section is actually a
 * lockfile or contains a whitespace-only hunk; every other diff routes straight
 * to the core compressor, byte-identical to before. Recoverable via CCR; output
 * favours LLM readability over `git apply` fidelity.
 *
 * @param text - Unified diff text to reduce.
 * @param opts - Optional compression options.
 * @returns The compression result for the reduced diff.
 */
export declare function compressDiff(text: string, opts?: ICompressOptions): ICompressionResult;
|
|
20
|
+
//# sourceMappingURL=compress-diff.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"compress-diff.d.ts","sourceRoot":"","sources":["../../src/text/compress-diff.ts"],"names":[],"mappings":"AAEA,OAAO,KAAK,EAAE,kBAAkB,EAAE,MAAM,iCAAiC,CAAC;AAC1E,OAAO,KAAK,EAAE,gBAAgB,EAAE,MAAM,+BAA+B,CAAC;AA0BtE;;;;;;;;;;;;;;;GAeG;AACH,wBAAgB,YAAY,CAAC,IAAI,EAAE,MAAM,EAAE,IAAI,GAAE,gBAAqB,GAAG,kBAAkB,CAiE1F"}
|