scxq2-cc 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +340 -0
- package/dist/base64.js +83 -0
- package/dist/canon.js +60 -0
- package/dist/cli.mjs +192 -0
- package/dist/engine.js +753 -0
- package/dist/index.d.ts +426 -0
- package/dist/index.js +48 -0
- package/dist/sha.js +71 -0
- package/dist/verify.js +480 -0
- package/dist/wasm-decoder.js +232 -0
- package/package.json +64 -0
- package/src/base64.js +83 -0
- package/src/canon.js +60 -0
- package/src/engine.js +753 -0
- package/src/index.js +48 -0
- package/src/sha.js +71 -0
package/src/engine.js
ADDED
|
@@ -0,0 +1,753 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* SCXQ2 Compression Calculus Engine (CC-v1)
|
|
3
|
+
*
|
|
4
|
+
* A deterministic, proof-generating compression engine that produces
|
|
5
|
+
* content-addressable language packs. Implements the frozen SCXQ2 specification.
|
|
6
|
+
*
|
|
7
|
+
* Features:
|
|
8
|
+
* - Single-lane and multi-lane compression
|
|
9
|
+
* - CC operators: NORM, DICT, FIELD, LANE, EDGE
|
|
10
|
+
* - Cryptographic proof of reversibility
|
|
11
|
+
* - Universal runtime (Node.js, Browser, Worker)
|
|
12
|
+
*
|
|
13
|
+
* @module @asx/scxq2-cc/engine
|
|
14
|
+
* @version 1.0.0
|
|
15
|
+
*/
|
|
16
|
+
|
|
17
|
+
import { canon, strip } from "./canon.js";
|
|
18
|
+
import { sha256HexUtf8, sha256HexUtf8Sync, getNodeCrypto } from "./sha.js";
|
|
19
|
+
import { bytesToBase64, base64ToBytes } from "./base64.js";
|
|
20
|
+
|
|
21
|
+
/* =============================================================================
|
|
22
|
+
Engine Identity (FROZEN)
|
|
23
|
+
============================================================================= */
|
|
24
|
+
|
|
25
|
+
/**
 * Identity record of this engine. The "@id" value is copied into every
 * proof and audit object produced by this module; the record is frozen so
 * it cannot be modified at runtime.
 */
export const CC_ENGINE = Object.freeze({
  "@id": "asx://cc/engine/scxq2.v1",
  "@type": "cc.engine",
  "@version": "1.0.0",
  "@status": "frozen",
  "$schema": "xjson://schema/core/v1"
});
|
|
32
|
+
|
|
33
|
+
/**
 * Mode and encoding labels stamped onto every emitted dict and block.
 * verifyPack() rejects any pack whose labels differ from these values.
 */
export const SCXQ2_ENCODING = Object.freeze({
  mode: "SCXQ2-DICT16-B64",
  encoding: "SCXQ2-1"
});
|
|
37
|
+
|
|
38
|
+
/**
 * Operator identifiers recorded in the `steps` list of generated proofs.
 */
export const CC_OPS = Object.freeze({
  NORM: "cc.norm.v1",
  DICT: "cc.dict.v1",
  FIELD: "cc.field.v1",
  LANE: "cc.lane.v1",
  EDGE: "cc.edge.v1"
});
|
|
45
|
+
|
|
46
|
+
/* =============================================================================
|
|
47
|
+
Public API - Single Lane Compression
|
|
48
|
+
============================================================================= */
|
|
49
|
+
|
|
50
|
+
/**
 * Builds a single-lane SCXQ2 pack (dictionary + encoded block) from `input`
 * and immediately decodes it again to produce a roundtrip proof.
 *
 * All hashing goes through the awaited `sha256HexUtf8` helper. A failed
 * roundtrip does not throw here; it is reported as `proof.ok === false`.
 *
 * @param {string|Uint8Array} input - Source text to compress
 * @param {Object} [opts] - Compression options
 * @param {number} [opts.maxDict=1024] - Maximum dictionary entries (1-65535)
 * @param {number} [opts.minLen=3] - Minimum token length (2-128)
 * @param {boolean} [opts.noStrings] - Skip string literal tokens
 * @param {boolean} [opts.noWS] - Skip whitespace tokens
 * @param {boolean} [opts.noPunct] - Skip punctuation tokens
 * @param {boolean} [opts.enableFieldOps] - Enable JSON key extraction
 * @param {boolean} [opts.enableEdgeOps] - Enable edge witnesses
 * @param {string} [opts.created_utc] - ISO timestamp (auto-generated if omitted)
 * @param {string} [opts.source_file] - Source file identifier
 * @returns {Promise<CCResult>} Object with `dict`, `block`, `proof`, `audit`
 */
export async function ccCompress(input, opts = {}) {
  const settings = normalizeOpts(opts);
  const source = canonicalizeInput(input, settings);
  const sourceSha = await sha256HexUtf8(source);

  // Rank candidate tokens, keep the best ones, then encode against them.
  const rankedTokens = collectTokens(source, settings);
  const tokens = buildDict(rankedTokens, settings);
  const { b64: payload, edges } = encodeSCXQ2(source, tokens, {
    enableEdgeOps: settings.enableEdgeOps
  });

  // Each canonical hash is computed over the object with its own hash
  // field stripped, then stored back on that object.
  const dictJson = makeDictJson(tokens, sourceSha, settings);
  const dictCanon = canon(strip(dictJson, ["dict_sha256_canon"]));
  dictJson.dict_sha256_canon = await sha256HexUtf8(dictCanon);

  const blockJson = makeBlockJson(
    payload,
    sourceSha,
    dictJson.dict_sha256_canon,
    source,
    settings,
    edges
  );
  const blockCanon = canon(strip(blockJson, ["block_sha256_canon"]));
  blockJson.block_sha256_canon = await sha256HexUtf8(blockCanon);

  // Decode what was just produced so the proof can compare both hashes.
  const decoded = ccDecompress(dictJson, blockJson);
  const decodedSha = await sha256HexUtf8(decoded);

  return {
    dict: dictJson,
    block: blockJson,
    proof: makeProof(sourceSha, decodedSha, dictJson, blockJson, settings),
    audit: makeAudit(source, rankedTokens, dictJson, blockJson, settings)
  };
}
|
|
103
|
+
|
|
104
|
+
/**
 * Synchronous counterpart of ccCompress.
 *
 * Requires the handle returned by `getNodeCrypto()`; that handle is passed
 * to `sha256HexUtf8Sync` for every hash.
 *
 * @param {string|Uint8Array} input - Source text to compress
 * @param {Object} [opts] - Compression options (same as ccCompress)
 * @returns {CCResult} Object with `dict`, `block`, `proof`, `audit`
 * @throws {Error} When `getNodeCrypto()` returns a falsy value
 */
export function ccCompressSync(input, opts = {}) {
  const nodeCrypto = getNodeCrypto();
  if (!nodeCrypto) {
    throw new Error("SCXQ2: ccCompressSync requires Node.js crypto module");
  }
  const hash = (text) => sha256HexUtf8Sync(text, nodeCrypto);

  const settings = normalizeOpts(opts);
  const source = canonicalizeInput(input, settings);
  const sourceSha = hash(source);

  // Rank candidate tokens, keep the best ones, then encode against them.
  const rankedTokens = collectTokens(source, settings);
  const tokens = buildDict(rankedTokens, settings);
  const { b64: payload, edges } = encodeSCXQ2(source, tokens, {
    enableEdgeOps: settings.enableEdgeOps
  });

  // Canonical hashes exclude the field that will hold the hash itself.
  const dictJson = makeDictJson(tokens, sourceSha, settings);
  dictJson.dict_sha256_canon = hash(
    canon(strip(dictJson, ["dict_sha256_canon"]))
  );

  const blockJson = makeBlockJson(
    payload,
    sourceSha,
    dictJson.dict_sha256_canon,
    source,
    settings,
    edges
  );
  blockJson.block_sha256_canon = hash(
    canon(strip(blockJson, ["block_sha256_canon"]))
  );

  // Decode what was just produced so the proof can compare both hashes.
  const decodedSha = hash(ccDecompress(dictJson, blockJson));

  return {
    dict: dictJson,
    block: blockJson,
    proof: makeProof(sourceSha, decodedSha, dictJson, blockJson, settings),
    audit: makeAudit(source, rankedTokens, dictJson, blockJson, settings)
  };
}
|
|
155
|
+
|
|
156
|
+
/* =============================================================================
|
|
157
|
+
Public API - Multi-Lane Compression
|
|
158
|
+
============================================================================= */
|
|
159
|
+
|
|
160
|
+
/**
 * Compresses several lanes against one shared dictionary.
 *
 * The dictionary is trained on all canonicalized lane texts joined by a
 * lane-break marker; each lane is then encoded on its own and verified by
 * decoding it again. Lanes are processed in the order returned by
 * normalizeLanes (sorted by lane_id).
 *
 * @param {Object} laneInput - Lane input object
 * @param {Array<{lane_id: string, text: string|Uint8Array}>} laneInput.lanes
 * @param {Object} [opts] - Compression options
 * @returns {Promise<CCLanesResult>} Object with `dict`, `lanes`, `proof`, `audit`
 * @throws {Error} When a lane does not decode back to its source text
 */
export async function ccCompressLanes(laneInput, opts = {}) {
  const settings = normalizeOpts(opts);
  const orderedLanes = normalizeLanes(laneInput);

  // Training corpus for the shared dictionary: every lane, separated by a marker.
  const corpus = orderedLanes
    .map(({ text }) => canonicalizeInput(text, settings))
    .join("\n\n/*__LANE_BREAK__*/\n\n");
  const corpusSha = await sha256HexUtf8(corpus);

  const tokens = buildDict(collectTokens(corpus, settings), settings);

  const dictJson = makeDictJson(tokens, corpusSha, {
    ...settings,
    source_file: settings.source_file ?? "lanes"
  });
  dictJson.dict_sha256_canon = await sha256HexUtf8(
    canon(strip(dictJson, ["dict_sha256_canon"]))
  );

  const laneBlocks = [];
  for (const { lane_id: laneId, text } of orderedLanes) {
    const laneSource = canonicalizeInput(text, settings);
    const laneSha = await sha256HexUtf8(laneSource);

    const encoded = encodeSCXQ2(laneSource, tokens, {
      enableEdgeOps: settings.enableEdgeOps
    });
    const laneBlock = makeLaneBlockJson(
      laneId,
      encoded.b64,
      laneSha,
      dictJson.dict_sha256_canon,
      laneSource,
      settings,
      encoded.edges
    );
    laneBlock.block_sha256_canon = await sha256HexUtf8(
      canon(strip(laneBlock, ["block_sha256_canon"]))
    );

    // A lane whose decoded text hashes differently aborts the whole call.
    const decodedSha = await sha256HexUtf8(ccDecompress(dictJson, laneBlock));
    if (decodedSha !== laneSha) {
      throw new Error(`SCXQ2: lane roundtrip mismatch: ${laneId}`);
    }

    laneBlocks.push(laneBlock);
  }

  const dictEntries = dictJson.dict.length;
  const laneCount = laneBlocks.length;

  const proof = {
    "@type": "cc.lanes.proof",
    "@version": "1.0.0",
    engine: CC_ENGINE["@id"],
    created_utc: settings.created_utc,
    dict_sha256_canon: dictJson.dict_sha256_canon,
    lanes: laneBlocks.map(
      ({ lane_id, source_sha256_utf8, block_sha256_canon }) => ({
        lane_id,
        source_sha256_utf8,
        block_sha256_canon
      })
    ),
    ok: true,
    steps: [
      { op: CC_OPS.LANE, lanes: laneCount },
      { op: CC_OPS.DICT, dict_entries: dictEntries }
    ]
  };

  const audit = {
    "@type": "cc.lanes.audit",
    "@version": "1.0.0",
    engine: CC_ENGINE["@id"],
    created_utc: settings.created_utc,
    dict_entries: dictEntries,
    lane_count: laneCount
  };

  return { dict: dictJson, lanes: laneBlocks, proof, audit };
}
|
|
248
|
+
|
|
249
|
+
/**
 * Synchronous multi-lane compression (Node.js only).
 *
 * Mirrors ccCompressLanes: one dictionary is trained on all canonicalized
 * lane texts joined by a lane-break marker, then every lane is encoded and
 * verified by decoding it again. The returned proof carries the same
 * `steps` list as the async variant so both entry points produce the same
 * proof shape for the same input.
 *
 * @param {Object} laneInput - Lane input object
 * @param {Array<{lane_id: string, text: string|Uint8Array}>} laneInput.lanes
 * @param {Object} [opts] - Compression options
 * @returns {CCLanesResult} Object with `dict`, `lanes`, `proof`, `audit`
 * @throws {Error} When `getNodeCrypto()` returns a falsy value, or when a
 *   lane does not decode back to its source text
 */
export function ccCompressLanesSync(laneInput, opts = {}) {
  const node = getNodeCrypto();
  if (!node) {
    throw new Error("SCXQ2: ccCompressLanesSync requires Node.js crypto");
  }

  const o = normalizeOpts(opts);
  const lanes = normalizeLanes(laneInput);

  // Training corpus for the shared dictionary: every lane, separated by a marker.
  const joined = lanes
    .map((l) => canonicalizeInput(l.text, o))
    .join("\n\n/*__LANE_BREAK__*/\n\n");
  const joinedSha = sha256HexUtf8Sync(joined, node);

  const tokenStats = collectTokens(joined, o);
  const dict = buildDict(tokenStats, o);

  const dictJson = makeDictJson(dict, joinedSha, {
    ...o,
    source_file: o.source_file ?? "lanes"
  });
  dictJson.dict_sha256_canon = sha256HexUtf8Sync(
    canon(strip(dictJson, ["dict_sha256_canon"])),
    node
  );

  const laneBlocks = [];
  for (const lane of lanes) {
    const src = canonicalizeInput(lane.text, o);
    const srcSha = sha256HexUtf8Sync(src, node);

    const enc = encodeSCXQ2(src, dict, { enableEdgeOps: o.enableEdgeOps });
    const blockJson = makeLaneBlockJson(
      lane.lane_id,
      enc.b64,
      srcSha,
      dictJson.dict_sha256_canon,
      src,
      o,
      enc.edges
    );
    blockJson.block_sha256_canon = sha256HexUtf8Sync(
      canon(strip(blockJson, ["block_sha256_canon"])),
      node
    );

    // A lane whose decoded text hashes differently aborts the whole call.
    const rt = ccDecompress(dictJson, blockJson);
    const rtSha = sha256HexUtf8Sync(rt, node);
    if (rtSha !== srcSha) {
      throw new Error(`SCXQ2: lane roundtrip mismatch: ${lane.lane_id}`);
    }

    laneBlocks.push(blockJson);
  }

  const proof = {
    "@type": "cc.lanes.proof",
    "@version": "1.0.0",
    engine: CC_ENGINE["@id"],
    created_utc: o.created_utc,
    dict_sha256_canon: dictJson.dict_sha256_canon,
    lanes: laneBlocks.map((b) => ({
      lane_id: b.lane_id,
      source_sha256_utf8: b.source_sha256_utf8,
      block_sha256_canon: b.block_sha256_canon
    })),
    ok: true,
    // Same step list as ccCompressLanes; it was previously missing here.
    steps: [
      { op: CC_OPS.LANE, lanes: laneBlocks.length },
      { op: CC_OPS.DICT, dict_entries: dictJson.dict.length }
    ]
  };

  const audit = {
    "@type": "cc.lanes.audit",
    "@version": "1.0.0",
    engine: CC_ENGINE["@id"],
    created_utc: o.created_utc,
    dict_entries: dictJson.dict.length,
    lane_count: laneBlocks.length
  };

  return { dict: dictJson, lanes: laneBlocks, proof, audit };
}
|
|
336
|
+
|
|
337
|
+
/* =============================================================================
|
|
338
|
+
Public API - Decompression
|
|
339
|
+
============================================================================= */
|
|
340
|
+
|
|
341
|
+
/**
 * Decompresses an SCXQ2 block using its dictionary.
 *
 * Byte stream layout, as produced by encodeSCXQ2:
 *   - 0x00-0x7F          one ASCII character
 *   - 0x80 [hi] [lo]     dictionary reference, big-endian 16-bit index
 *   - 0x81 [hi] [lo]     one UTF-16 code unit, big-endian
 * Any other byte, or a marker without its two operand bytes, is treated as
 * a corrupt stream and rejected instead of being decoded silently.
 *
 * @param {SCXQ2Dict} dictJson - SCXQ2 dictionary object
 * @param {SCXQ2Block} blockJson - SCXQ2 block object
 * @returns {string} Decompressed text
 * @throws {Error} When verifyPack rejects the pack, the stream is truncated,
 *   a reserved byte is found, or a dictionary index has no string entry
 */
export function ccDecompress(dictJson, blockJson) {
  verifyPack(dictJson, blockJson);

  const dict = dictJson.dict;
  const bytes = base64ToBytes(blockJson.b64);
  const total = bytes.length;

  // Reads the 16-bit operand that follows the marker byte at `at`.
  // Without this check a missing operand byte is `undefined`, which the
  // bitwise operators coerce to 0 and the corruption goes unnoticed.
  const readOperand = (at) => {
    if (at + 2 >= total) {
      throw new Error(`SCXQ2: truncated block at byte ${at}`);
    }
    return (bytes[at + 1] << 8) | bytes[at + 2];
  };

  let out = "";
  for (let i = 0; i < total; i++) {
    const b = bytes[i];

    // ASCII literal: byte < 128
    if (b < 0x80) {
      out += String.fromCharCode(b);
      continue;
    }

    // Dictionary reference: 0x80 [hi] [lo]
    if (b === 0x80) {
      const idx = readOperand(i);
      i += 2;
      const tok = dict[idx];
      if (typeof tok !== "string") {
        throw new Error(`SCXQ2: invalid dict reference at index ${idx}`);
      }
      out += tok;
      continue;
    }

    // UTF-16 literal: 0x81 [hi] [lo]
    if (b === 0x81) {
      out += String.fromCharCode(readOperand(i));
      i += 2;
      continue;
    }

    // 0x82-0xFF are never emitted by the encoder.
    throw new Error(`SCXQ2: invalid byte 0x${Number(b).toString(16)} at offset ${i}`);
  }

  return out;
}
|
|
381
|
+
|
|
382
|
+
/* =============================================================================
|
|
383
|
+
Public API - Verification
|
|
384
|
+
============================================================================= */
|
|
385
|
+
|
|
386
|
+
/**
 * Checks that a dict/block pair is structurally usable for decoding.
 *
 * The checks run in a fixed order and the first failing one throws:
 * presence, "@type", payload field types, mode, encoding, and finally the
 * dictionary hash linkage. The linkage is only compared when both objects
 * carry a `dict_sha256_canon` value.
 *
 * @param {SCXQ2Dict} dictJson - Dictionary object
 * @param {SCXQ2Block} blockJson - Block object
 * @returns {{ok: true}} Success indicator
 * @throws {Error} On the first failed check
 */
export function verifyPack(dictJson, blockJson) {
  const isObject = (value) => Boolean(value) && typeof value === "object";
  const { mode, encoding } = SCXQ2_ENCODING;

  // [failure predicate, error message], evaluated lazily and in order so
  // later predicates may assume the earlier ones passed.
  const checks = [
    [() => !isObject(dictJson), "SCXQ2: missing dict"],
    [() => !isObject(blockJson), "SCXQ2: missing block"],
    [() => dictJson["@type"] !== "scxq2.dict", "SCXQ2: invalid dict @type"],
    [() => blockJson["@type"] !== "scxq2.block", "SCXQ2: invalid block @type"],
    [() => !Array.isArray(dictJson.dict), "SCXQ2: dict must be array"],
    [() => typeof blockJson.b64 !== "string", "SCXQ2: block b64 must be string"],
    [() => dictJson.mode !== mode, "SCXQ2: invalid dict mode"],
    [() => blockJson.mode !== mode, "SCXQ2: invalid block mode"],
    [() => dictJson.encoding !== encoding, "SCXQ2: invalid dict encoding"],
    [() => blockJson.encoding !== encoding, "SCXQ2: invalid block encoding"],
    [
      () => {
        const wanted = blockJson.dict_sha256_canon;
        const actual = dictJson.dict_sha256_canon;
        return Boolean(wanted && actual && wanted !== actual);
      },
      "SCXQ2: dict linkage mismatch"
    ]
  ];

  for (const [fails, message] of checks) {
    if (fails()) {
      throw new Error(message);
    }
  }

  return { ok: true };
}
|
|
441
|
+
|
|
442
|
+
/* =============================================================================
|
|
443
|
+
Internal - Options Normalization
|
|
444
|
+
============================================================================= */
|
|
445
|
+
|
|
446
|
+
/**
 * Turns caller-supplied options into the fully populated settings object
 * used by the rest of the engine.
 *
 * `maxDict` and `minLen` are coerced to integers and clamped to 1-65535 and
 * 2-128. A value that does not convert to a number falls back to the
 * default (1024 / 3) instead of becoming NaN, which would otherwise switch
 * off the dictionary size cap and the minimum-length filter.
 *
 * @param {Object|null|undefined} opts - Raw options; nullish is treated as {}
 * @returns {{maxDict: number, minLen: number, created_utc: string,
 *   source_file: (string|null), enableFieldOps: boolean,
 *   enableEdgeOps: boolean,
 *   flags: {noStrings: boolean, noWS: boolean, noPunct: boolean}}}
 */
function normalizeOpts(opts) {
  const given = opts ?? {};

  // Integer within [lo, hi]; unparseable values yield `fallback`.
  const boundedInt = (value, fallback, lo, hi) => {
    const n = Math.trunc(Number(value ?? fallback));
    return Number.isNaN(n) ? fallback : Math.min(hi, Math.max(lo, n));
  };

  return {
    maxDict: boundedInt(given.maxDict, 1024, 1, 65535),
    minLen: boundedInt(given.minLen, 3, 2, 128),
    created_utc: given.created_utc ?? isoUtc(),
    source_file: given.source_file ?? null,
    enableFieldOps: !!given.enableFieldOps,
    enableEdgeOps: !!given.enableEdgeOps,
    flags: {
      noStrings: !!given.noStrings,
      noWS: !!given.noWS,
      noPunct: !!given.noPunct
    }
  };
}
|
|
461
|
+
|
|
462
|
+
/**
 * Validates the lane list and returns a new array of `{lane_id, text}`
 * records sorted by lane_id (plain `<` / `>` string comparison).
 *
 * lane_id is stringified and trimmed; text must be a string or Uint8Array.
 *
 * @param {Object} laneInput - Object carrying a non-empty `lanes` array
 * @returns {Array<{lane_id: string, text: string|Uint8Array}>}
 * @throws {Error} On a missing/empty lane list, an empty lane_id, or an
 *   unsupported text type
 */
function normalizeLanes(laneInput) {
  if (!(laneInput && typeof laneInput === "object")) {
    throw new Error("SCXQ2: lanes input invalid");
  }

  const { lanes } = laneInput;
  if (!(Array.isArray(lanes) && lanes.length !== 0)) {
    throw new Error("SCXQ2: lanes missing");
  }

  // Array.from visits every index, including holes, like the iterator does.
  const normalized = Array.from(lanes, (entry) => {
    const laneId = String(entry?.lane_id ?? "").trim();
    if (laneId === "") {
      throw new Error("SCXQ2: lane_id missing");
    }

    const text = entry?.text;
    const textIsValid = typeof text === "string" || text instanceof Uint8Array;
    if (!textIsValid) {
      throw new Error("SCXQ2: lane text invalid");
    }

    return { lane_id: laneId, text };
  });

  // Deterministic lane order
  normalized.sort((left, right) => {
    if (left.lane_id < right.lane_id) return -1;
    if (left.lane_id > right.lane_id) return 1;
    return 0;
  });

  return normalized;
}
|
|
490
|
+
|
|
491
|
+
/* =============================================================================
|
|
492
|
+
Internal - Input Canonicalization (CC.NORM)
|
|
493
|
+
============================================================================= */
|
|
494
|
+
|
|
495
|
+
/**
 * Converts the input to a string and normalizes line endings to "\n".
 *
 * Uint8Array input is decoded as UTF-8 in non-fatal mode. Both "\r\n" and
 * a lone "\r" become "\n".
 *
 * @param {string|Uint8Array} input - Raw source
 * @param {Object} o - Normalized options (not read by this function)
 * @returns {string} Text with normalized newlines
 * @throws {Error} When input is neither a string nor a Uint8Array
 */
function canonicalizeInput(input, o) {
  let decoded;
  if (input instanceof Uint8Array) {
    decoded = new TextDecoder("utf-8", { fatal: false }).decode(input);
  } else if (typeof input === "string") {
    decoded = input;
  } else {
    throw new Error("SCXQ2: input must be string or Uint8Array");
  }

  // One pass: a "\r" together with an optional following "\n" is one newline.
  return decoded.replace(/\r\n?/g, "\n");
}
|
|
509
|
+
|
|
510
|
+
/* =============================================================================
|
|
511
|
+
Internal - Token Collection (CC.DICT + CC.FIELD)
|
|
512
|
+
============================================================================= */
|
|
513
|
+
|
|
514
|
+
function collectTokens(text, o) {
|
|
515
|
+
const freq = new Map();
|
|
516
|
+
|
|
517
|
+
const add = (tok) => {
|
|
518
|
+
if (!tok || tok.length < o.minLen) return;
|
|
519
|
+
if (tok.indexOf("\u0000") >= 0) return;
|
|
520
|
+
freq.set(tok, (freq.get(tok) || 0) + 1);
|
|
521
|
+
};
|
|
522
|
+
|
|
523
|
+
// Identifiers / words
|
|
524
|
+
for (const m of text.matchAll(/[A-Za-z_$][A-Za-z0-9_$]{2,}/g)) {
|
|
525
|
+
add(m[0]);
|
|
526
|
+
}
|
|
527
|
+
|
|
528
|
+
// Whitespace runs
|
|
529
|
+
if (!o.flags.noWS) {
|
|
530
|
+
for (const m of text.matchAll(/[ \t]{2,}/g)) {
|
|
531
|
+
add(m[0]);
|
|
532
|
+
}
|
|
533
|
+
}
|
|
534
|
+
|
|
535
|
+
// Punctuation clusters
|
|
536
|
+
if (!o.flags.noPunct) {
|
|
537
|
+
for (const m of text.matchAll(/[{}()[\];,.=:+\-*/<>!&|%^]{2,}/g)) {
|
|
538
|
+
add(m[0]);
|
|
539
|
+
}
|
|
540
|
+
}
|
|
541
|
+
|
|
542
|
+
// String literal contents
|
|
543
|
+
if (!o.flags.noStrings) {
|
|
544
|
+
for (const m of text.matchAll(/"([^"\n]{3,64})"|'([^'\n]{3,64})'/g)) {
|
|
545
|
+
const candidate = (m[1] || m[2] || "").trim();
|
|
546
|
+
if (candidate) add(candidate);
|
|
547
|
+
}
|
|
548
|
+
}
|
|
549
|
+
|
|
550
|
+
// FIELD operator: JSON keys
|
|
551
|
+
if (o.enableFieldOps) {
|
|
552
|
+
for (const m of text.matchAll(/"([^"\\\n]{1,64})"\s*:/g)) {
|
|
553
|
+
const k = m[1];
|
|
554
|
+
if (k) {
|
|
555
|
+
add(k);
|
|
556
|
+
add(`"${k}"`);
|
|
557
|
+
}
|
|
558
|
+
}
|
|
559
|
+
}
|
|
560
|
+
|
|
561
|
+
// Score by estimated savings
|
|
562
|
+
const scored = [];
|
|
563
|
+
for (const [tok, count] of freq.entries()) {
|
|
564
|
+
if (count < 2) continue;
|
|
565
|
+
const tokenBytes = estimateBytes(tok);
|
|
566
|
+
const savings = (tokenBytes - 3) * count;
|
|
567
|
+
if (savings > 0) {
|
|
568
|
+
scored.push({ tok, count, totalSavings: savings });
|
|
569
|
+
}
|
|
570
|
+
}
|
|
571
|
+
|
|
572
|
+
scored.sort(
|
|
573
|
+
(a, b) =>
|
|
574
|
+
b.totalSavings - a.totalSavings ||
|
|
575
|
+
b.tok.length - a.tok.length ||
|
|
576
|
+
(a.tok < b.tok ? -1 : 1)
|
|
577
|
+
);
|
|
578
|
+
|
|
579
|
+
return scored;
|
|
580
|
+
}
|
|
581
|
+
|
|
582
|
+
/* =============================================================================
|
|
583
|
+
Internal - Dictionary Build
|
|
584
|
+
============================================================================= */
|
|
585
|
+
|
|
586
|
+
/**
 * Takes the leading entries of the ranked token list, up to o.maxDict, and
 * returns their token strings ordered longest first (ties in ascending
 * string order) so the encoder's first match is the longest one.
 *
 * @param {Array<{tok: string}>} scored - Ranked tokens, best first
 * @param {Object} o - Normalized options; only `maxDict` is read
 * @returns {string[]} Dictionary tokens
 */
function buildDict(scored, o) {
  const chosen = [];
  for (let k = 0; k < scored.length; k++) {
    if (chosen.length >= o.maxDict) {
      break;
    }
    chosen.push(scored[k].tok);
  }

  // Longest-first for greedy matching
  chosen.sort((left, right) => {
    const byLength = right.length - left.length;
    if (byLength) return byLength;
    return left < right ? -1 : 1;
  });

  return chosen;
}
|
|
596
|
+
|
|
597
|
+
/* =============================================================================
|
|
598
|
+
Internal - SCXQ2 Encoding
|
|
599
|
+
============================================================================= */
|
|
600
|
+
|
|
601
|
+
/**
 * Encodes `text` into the SCXQ2 byte stream using `dict`.
 *
 * At each position the first dictionary token (in `dict` order) that
 * matches is emitted as 0x80 [hi] [lo]; otherwise one UTF-16 code unit is
 * emitted, either as a single byte (< 128) or as 0x81 [hi] [lo].
 *
 * Tokens are grouped by their first code unit so that only tokens that can
 * possibly match the current position are tested; `dict` order is kept
 * inside each group, so the chosen token is the same as with a full scan.
 * Empty or non-string entries can never match and are skipped (an empty
 * token would otherwise match without consuming input).
 *
 * @param {string} text - Canonicalized source text
 * @param {string[]} dict - Dictionary tokens, in matching priority order
 * @param {Object} [opts]
 * @param {boolean} [opts.enableEdgeOps] - Record pairs of consecutive
 *   dictionary indices
 * @returns {{bytes: number[], b64: string, edges: (Array<[number, number]>|null)}}
 * @throws {Error} When `dict` has more entries than a 16-bit index can address
 */
function encodeSCXQ2(text, dict, opts = {}) {
  if (dict.length > 0x10000) {
    // Indices are written as two bytes; a larger index would corrupt the stream.
    throw new Error("SCXQ2: dictionary exceeds 65536 entries");
  }

  const indexOfToken = new Map(dict.map((t, i) => [t, i]));

  const byFirstUnit = new Map();
  for (const tok of dict) {
    if (typeof tok !== "string" || tok.length === 0) continue;
    const unit = tok.charCodeAt(0);
    const bucket = byFirstUnit.get(unit);
    if (bucket) {
      bucket.push(tok);
    } else {
      byFirstUnit.set(unit, [tok]);
    }
  }

  const bytes = [];
  const edges = opts.enableEdgeOps ? [] : null;
  let lastDictIdx = -1;

  let i = 0;
  while (i < text.length) {
    const unit = text.charCodeAt(i);
    const at = i;
    const hit = byFirstUnit.get(unit)?.find((tok) => text.startsWith(tok, at));

    if (hit !== undefined) {
      const idx = indexOfToken.get(hit);
      bytes.push(0x80, idx >> 8, idx & 255);

      // Edge witness: only between two directly adjacent dictionary hits.
      if (edges && lastDictIdx >= 0) {
        edges.push([lastDictIdx, idx]);
      }
      lastDictIdx = idx;
      i += hit.length;
      continue;
    }

    if (unit < 128) {
      bytes.push(unit);
    } else {
      bytes.push(0x81, unit >> 8, unit & 255);
    }
    lastDictIdx = -1; // a literal breaks the chain of adjacent hits
    i += 1;
  }

  return { bytes, b64: bytesToBase64(bytes), edges };
}
|
|
639
|
+
|
|
640
|
+
/* =============================================================================
|
|
641
|
+
Internal - Utilities
|
|
642
|
+
============================================================================= */
|
|
643
|
+
|
|
644
|
+
/**
 * Estimates the encoded size of `s` when written as literals: 1 byte per
 * UTF-16 code unit below 128, 3 bytes for any other code unit.
 *
 * @param {string} s - Token text
 * @returns {number} Estimated literal byte count
 */
function estimateBytes(s) {
  let total = 0;
  let pos = s.length;
  while (pos-- > 0) {
    total += s.charCodeAt(pos) >= 128 ? 3 : 1;
  }
  return total;
}
|
|
651
|
+
|
|
652
|
+
/**
 * Returns the number of bytes `s` occupies when encoded as UTF-8.
 *
 * @param {string} s - Text to measure
 * @returns {number} UTF-8 byte length
 */
function utf8Bytes(s) {
  const encoded = new TextEncoder().encode(s);
  return encoded.length;
}
|
|
655
|
+
|
|
656
|
+
/**
 * Limits `v` to the inclusive range [lo, hi] using Math.max / Math.min.
 *
 * @param {number} v - Value to limit
 * @param {number} lo - Lower bound
 * @param {number} hi - Upper bound
 * @returns {number} The limited value
 */
function clamp(v, lo, hi) {
  const raisedToFloor = Math.max(lo, v);
  return Math.min(hi, raisedToFloor);
}
|
|
659
|
+
|
|
660
|
+
/**
 * Returns the current time as an ISO-8601 UTC string with the trailing
 * millisecond part removed (for example "2024-01-01T00:00:00Z").
 *
 * @returns {string} Current UTC timestamp
 */
function isoUtc() {
  const withMillis = new Date().toISOString();
  return withMillis.replace(/\.\d{3}Z$/, "Z");
}
|
|
663
|
+
|
|
664
|
+
/* =============================================================================
|
|
665
|
+
Internal - JSON Emitters
|
|
666
|
+
============================================================================= */
|
|
667
|
+
|
|
668
|
+
/**
 * Assembles the `scxq2.dict` object. The canonical hash field
 * (`dict_sha256_canon`) is not set here; callers add it afterwards.
 *
 * @param {string[]} dict - Dictionary tokens
 * @param {string} srcSha - SHA-256 hex of the source the dictionary was built from
 * @param {Object} o - Normalized options (created_utc, maxDict, minLen, flags)
 * @returns {Object} Dictionary object without its canonical hash
 */
function makeDictJson(dict, srcSha, o) {
  const { mode, encoding } = SCXQ2_ENCODING;
  const { created_utc, maxDict: max_dict, minLen: min_len, flags } = o;

  return {
    "@type": "scxq2.dict",
    "@version": "1.0.0",
    mode,
    encoding,
    created_utc,
    source_sha256_utf8: srcSha,
    max_dict,
    min_len,
    flags,
    dict
  };
}
|
|
682
|
+
|
|
683
|
+
/**
 * Assembles the `scxq2.block` object. The canonical hash field
 * (`block_sha256_canon`) is not set here; callers add it afterwards.
 *
 * An `edges` property is attached only when edge witnesses were collected,
 * and it holds at most the first 1000 of them.
 *
 * @param {string} b64 - Base64 payload
 * @param {string} srcSha - SHA-256 hex of the source text
 * @param {string} dictSha - Canonical hash of the dictionary this block uses
 * @param {string} src - Source text (used for its UTF-8 byte length)
 * @param {Object} o - Normalized options; only `created_utc` is read
 * @param {Array|null} edges - Edge witnesses, or null when disabled
 * @returns {Object} Block object without its canonical hash
 */
function makeBlockJson(b64, srcSha, dictSha, src, o, edges) {
  const { mode, encoding } = SCXQ2_ENCODING;

  // Limit edge witnesses
  const witnesses = edges && edges.length > 0 ? edges.slice(0, 1000) : null;

  return {
    "@type": "scxq2.block",
    "@version": "1.0.0",
    mode,
    encoding,
    created_utc: o.created_utc,
    source_sha256_utf8: srcSha,
    dict_sha256_canon: dictSha,
    original_bytes_utf8: utf8Bytes(src),
    b64,
    ...(witnesses ? { edges: witnesses } : {})
  };
}
|
|
700
|
+
|
|
701
|
+
/**
 * Builds a block via makeBlockJson and tags it with its lane identifier.
 *
 * @param {string} laneId - Lane identifier stored as `lane_id`
 * @param {string} b64 - Base64 payload
 * @param {string} srcSha - SHA-256 hex of the lane's source text
 * @param {string} dictSha - Canonical hash of the shared dictionary
 * @param {string} src - Lane source text
 * @param {Object} o - Normalized options
 * @param {Array|null} edges - Edge witnesses, or null when disabled
 * @returns {Object} Block object carrying `lane_id`
 */
function makeLaneBlockJson(laneId, b64, srcSha, dictSha, src, o, edges) {
  return Object.assign(makeBlockJson(b64, srcSha, dictSha, src, o, edges), {
    lane_id: laneId
  });
}
|
|
706
|
+
|
|
707
|
+
/**
 * Assembles the `cc.proof` record for a single-lane compression.
 *
 * `ok` is true exactly when the source hash equals the roundtrip hash.
 *
 * @param {string} srcSha - SHA-256 hex of the canonicalized source
 * @param {string} rtSha - SHA-256 hex of the decoded output
 * @param {Object} dictJson - Dictionary object (with canonical hash)
 * @param {Object} blockJson - Block object (with canonical hash)
 * @param {Object} o - Normalized options; only `created_utc` is read
 * @returns {Object} Proof record
 */
function makeProof(srcSha, rtSha, dictJson, blockJson, o) {
  const dictSha = dictJson.dict_sha256_canon;
  const blockSha = blockJson.block_sha256_canon;

  const steps = [
    { op: CC_OPS.NORM, sha: srcSha },
    { op: CC_OPS.DICT, dict_entries: dictJson.dict.length },
    { op: "scxq2.encode.v1", block_sha: blockSha },
    { op: "scxq2.decode.v1", roundtrip_sha: rtSha }
  ];

  return {
    "@type": "cc.proof",
    "@version": "1.0.0",
    engine: CC_ENGINE["@id"],
    created_utc: o.created_utc,
    source_sha256_utf8: srcSha,
    dict_sha256_canon: dictSha,
    block_sha256_canon: blockSha,
    roundtrip_sha256_utf8: rtSha,
    ok: srcSha === rtSha,
    steps
  };
}
|
|
726
|
+
|
|
727
|
+
/**
 * Assembles the `cc.audit` record: size figures, dictionary parameters and
 * the first 25 ranked tokens.
 *
 * `sizes.ratio` is base64 length divided by source length (both in UTF-8
 * bytes), rounded to 6 decimals, or null when the source is empty.
 *
 * @param {string} src - Canonicalized source text
 * @param {Array<{tok: string, count: number, totalSavings: number}>} tokenStats
 *   Ranked token statistics
 * @param {Object} dictJson - Dictionary object
 * @param {Object} blockJson - Block object
 * @param {Object} o - Normalized options; only `created_utc` is read
 * @returns {Object} Audit record
 */
function makeAudit(src, tokenStats, dictJson, blockJson, o) {
  const originalSize = utf8Bytes(src);
  const encodedSize = utf8Bytes(blockJson.b64);

  let ratio = null;
  if (originalSize) {
    ratio = Number((encodedSize / originalSize).toFixed(6));
  }

  const { max_dict, min_len, flags } = dictJson;
  const topTokens = tokenStats
    .slice(0, 25)
    .map(({ tok, count, totalSavings }) => ({ tok, count, totalSavings }));

  return {
    "@type": "cc.audit",
    "@version": "1.0.0",
    engine: CC_ENGINE["@id"],
    created_utc: o.created_utc,
    sizes: {
      original_bytes_utf8: originalSize,
      encoded_b64_bytes_utf8: encodedSize,
      ratio
    },
    dict: {
      entries: dictJson.dict.length,
      max_dict,
      min_len,
      flags
    },
    top_tokens: topTokens
  };
}
|