scxq2-cc 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/engine.js ADDED
@@ -0,0 +1,753 @@
1
+ /**
2
+ * SCXQ2 Compression Calculus Engine (CC-v1)
3
+ *
4
+ * A deterministic, proof-generating compression engine that produces
5
+ * content-addressable language packs. Implements the frozen SCXQ2 specification.
6
+ *
7
+ * Features:
8
+ * - Single-lane and multi-lane compression
9
+ * - CC operators: NORM, DICT, FIELD, LANE, EDGE
10
+ * - Cryptographic proof of reversibility
11
+ * - Universal runtime (Node.js, Browser, Worker)
12
+ *
13
+ * @module @asx/scxq2-cc/engine
14
+ * @version 1.0.0
15
+ */
16
+
17
+ import { canon, strip } from "./canon.js";
18
+ import { sha256HexUtf8, sha256HexUtf8Sync, getNodeCrypto } from "./sha.js";
19
+ import { bytesToBase64, base64ToBytes } from "./base64.js";
20
+
21
+ /* =============================================================================
22
+ Engine Identity (FROZEN)
23
+ ============================================================================= */
24
+
25
/**
 * Engine identity descriptor. Marked frozen: these identifiers are part of
 * the SCXQ2 v1 specification and are embedded in every proof/audit record,
 * so they must not change within this major version.
 */
export const CC_ENGINE = Object.freeze({
  "@id": "asx://cc/engine/scxq2.v1",
  "@type": "cc.engine",
  "@version": "1.0.0",
  "@status": "frozen",
  "$schema": "xjson://schema/core/v1"
});

/**
 * Wire-format identifiers stamped into every dict and block object and
 * checked by verifyPack(); a mode/encoding mismatch rejects the pack.
 */
export const SCXQ2_ENCODING = Object.freeze({
  mode: "SCXQ2-DICT16-B64",
  encoding: "SCXQ2-1"
});

/**
 * Compression-calculus operator identifiers, recorded in proof "steps"
 * entries (NORM/DICT/LANE are emitted by the compressors in this module).
 */
export const CC_OPS = Object.freeze({
  NORM: "cc.norm.v1",
  DICT: "cc.dict.v1",
  FIELD: "cc.field.v1",
  LANE: "cc.lane.v1",
  EDGE: "cc.edge.v1"
});
45
+
46
+ /* =============================================================================
47
+ Public API - Single Lane Compression
48
+ ============================================================================= */
49
+
50
/**
 * Compresses input text into an SCXQ2 language pack.
 * Async version; hashing goes through the universal (WebCrypto-capable)
 * sha256HexUtf8, so it runs in Node.js, browsers, and workers.
 *
 * Pipeline: canonicalize -> hash source -> collect tokens -> build dict ->
 * encode -> hash dict/block canonically -> roundtrip-decode -> emit
 * proof + audit. The proof's `ok` flag records whether the roundtrip hash
 * matched the source hash.
 *
 * @param {string|Uint8Array} input - Source text to compress
 * @param {Object} [opts] - Compression options
 * @param {number} [opts.maxDict=1024] - Maximum dictionary entries (1-65535)
 * @param {number} [opts.minLen=3] - Minimum token length (2-128)
 * @param {boolean} [opts.noStrings] - Skip string literal tokens
 * @param {boolean} [opts.noWS] - Skip whitespace tokens
 * @param {boolean} [opts.noPunct] - Skip punctuation tokens
 * @param {boolean} [opts.enableFieldOps] - Enable JSON key extraction
 * @param {boolean} [opts.enableEdgeOps] - Enable edge witnesses
 * @param {string} [opts.created_utc] - ISO timestamp (auto-generated if omitted)
 * @param {string} [opts.source_file] - Source file identifier
 * @returns {Promise<CCResult>} Compression result with dict, block, proof, audit
 */
export async function ccCompress(input, opts = {}) {
  const o = normalizeOpts(opts);
  const src = canonicalizeInput(input, o);
  const srcSha = await sha256HexUtf8(src);

  // Token statistics drive both the dictionary and the audit report.
  const tokenStats = collectTokens(src, o);
  const dict = buildDict(tokenStats, o);
  const enc = encodeSCXQ2(src, dict, { enableEdgeOps: o.enableEdgeOps });

  // The canonical hash is computed over the object *without* its own
  // hash field, then attached.
  const dictJson = makeDictJson(dict, srcSha, o);
  dictJson.dict_sha256_canon = await sha256HexUtf8(
    canon(strip(dictJson, ["dict_sha256_canon"]))
  );

  const blockJson = makeBlockJson(
    enc.b64,
    srcSha,
    dictJson.dict_sha256_canon,
    src,
    o,
    enc.edges
  );
  blockJson.block_sha256_canon = await sha256HexUtf8(
    canon(strip(blockJson, ["block_sha256_canon"]))
  );

  // Decode what we just encoded; the proof records whether it matches.
  const roundtrip = ccDecompress(dictJson, blockJson);
  const rtSha = await sha256HexUtf8(roundtrip);

  return {
    dict: dictJson,
    block: blockJson,
    proof: makeProof(srcSha, rtSha, dictJson, blockJson, o),
    audit: makeAudit(src, tokenStats, dictJson, blockJson, o)
  };
}
103
+
104
/**
 * Synchronous compression (Node.js only).
 * Identical pipeline to ccCompress(), but all hashing is done through the
 * Node.js crypto module so no awaits are needed.
 *
 * @param {string|Uint8Array} input - Source text to compress
 * @param {Object} [opts] - Compression options (same as ccCompress)
 * @returns {CCResult} Compression result
 * @throws {Error} When the Node.js crypto module is unavailable
 */
export function ccCompressSync(input, opts = {}) {
  const node = getNodeCrypto();
  if (!node) {
    throw new Error("SCXQ2: ccCompressSync requires Node.js crypto module");
  }

  const o = normalizeOpts(opts);
  const src = canonicalizeInput(input, o);
  const srcSha = sha256HexUtf8Sync(src, node);

  // Token statistics drive both the dictionary and the audit report.
  const tokenStats = collectTokens(src, o);
  const dict = buildDict(tokenStats, o);
  const enc = encodeSCXQ2(src, dict, { enableEdgeOps: o.enableEdgeOps });

  // Canonical hashes are computed over each object minus its own hash field.
  const dictJson = makeDictJson(dict, srcSha, o);
  dictJson.dict_sha256_canon = sha256HexUtf8Sync(
    canon(strip(dictJson, ["dict_sha256_canon"])),
    node
  );

  const blockJson = makeBlockJson(
    enc.b64,
    srcSha,
    dictJson.dict_sha256_canon,
    src,
    o,
    enc.edges
  );
  blockJson.block_sha256_canon = sha256HexUtf8Sync(
    canon(strip(blockJson, ["block_sha256_canon"])),
    node
  );

  // Decode what we just encoded; the proof records whether it matches.
  const roundtrip = ccDecompress(dictJson, blockJson);
  const rtSha = sha256HexUtf8Sync(roundtrip, node);

  return {
    dict: dictJson,
    block: blockJson,
    proof: makeProof(srcSha, rtSha, dictJson, blockJson, o),
    audit: makeAudit(src, tokenStats, dictJson, blockJson, o)
  };
}
155
+
156
+ /* =============================================================================
157
+ Public API - Multi-Lane Compression
158
+ ============================================================================= */
159
+
160
/**
 * Compresses multiple lanes sharing a single dictionary.
 *
 * All lane texts are joined with a fixed break marker so the shared-dict
 * source (and hence the dictionary) is deterministic; each lane is then
 * encoded individually against that shared dictionary. Unlike ccCompress,
 * a roundtrip mismatch here throws rather than being recorded in the proof.
 *
 * @param {Object} laneInput - Lane input object
 * @param {Array<{lane_id: string, text: string|Uint8Array}>} laneInput.lanes
 * @param {Object} [opts] - Compression options
 * @returns {Promise<CCLanesResult>} Multi-lane compression result
 * @throws {Error} On invalid lane input or lane roundtrip mismatch
 */
export async function ccCompressLanes(laneInput, opts = {}) {
  const o = normalizeOpts(opts);
  const lanes = normalizeLanes(laneInput);

  // Build shared dictionary from all lanes
  const joined = lanes
    .map((l) => canonicalizeInput(l.text, o))
    .join("\n\n/*__LANE_BREAK__*/\n\n");
  const joinedSha = await sha256HexUtf8(joined);

  const tokenStats = collectTokens(joined, o);
  const dict = buildDict(tokenStats, o);

  // source_file defaults to "lanes" for the shared dictionary.
  const dictJson = makeDictJson(dict, joinedSha, {
    ...o,
    source_file: o.source_file ?? "lanes"
  });
  // Canonical hash is computed over the dict minus its own hash field.
  dictJson.dict_sha256_canon = await sha256HexUtf8(
    canon(strip(dictJson, ["dict_sha256_canon"]))
  );

  // Encode each lane
  const laneBlocks = [];
  for (const lane of lanes) {
    const src = canonicalizeInput(lane.text, o);
    const srcSha = await sha256HexUtf8(src);

    const enc = encodeSCXQ2(src, dict, { enableEdgeOps: o.enableEdgeOps });
    const blockJson = makeLaneBlockJson(
      lane.lane_id,
      enc.b64,
      srcSha,
      dictJson.dict_sha256_canon,
      src,
      o,
      enc.edges
    );
    blockJson.block_sha256_canon = await sha256HexUtf8(
      canon(strip(blockJson, ["block_sha256_canon"]))
    );

    // Verify roundtrip: decoding must reproduce the lane source exactly.
    const rt = ccDecompress(dictJson, blockJson);
    const rtSha = await sha256HexUtf8(rt);
    if (rtSha !== srcSha) {
      throw new Error(`SCXQ2: lane roundtrip mismatch: ${lane.lane_id}`);
    }

    laneBlocks.push(blockJson);
  }

  // Proof links each lane block back to the shared dictionary.
  const proof = {
    "@type": "cc.lanes.proof",
    "@version": "1.0.0",
    engine: CC_ENGINE["@id"],
    created_utc: o.created_utc,
    dict_sha256_canon: dictJson.dict_sha256_canon,
    lanes: laneBlocks.map((b) => ({
      lane_id: b.lane_id,
      source_sha256_utf8: b.source_sha256_utf8,
      block_sha256_canon: b.block_sha256_canon
    })),
    ok: true,
    steps: [
      { op: CC_OPS.LANE, lanes: laneBlocks.length },
      { op: CC_OPS.DICT, dict_entries: dictJson.dict.length }
    ]
  };

  const audit = {
    "@type": "cc.lanes.audit",
    "@version": "1.0.0",
    engine: CC_ENGINE["@id"],
    created_utc: o.created_utc,
    dict_entries: dictJson.dict.length,
    lane_count: laneBlocks.length
  };

  return { dict: dictJson, lanes: laneBlocks, proof, audit };
}
248
+
249
/**
 * Synchronous multi-lane compression (Node.js only).
 * Same pipeline as ccCompressLanes(), using Node.js crypto for hashing.
 * Note: the sync proof omits the `steps` array present in the async variant.
 *
 * @param {Object} laneInput - Lane input object
 * @param {Object} [opts] - Compression options
 * @returns {CCLanesResult} Multi-lane compression result
 * @throws {Error} When Node crypto is unavailable, on invalid lane input,
 *   or on a lane roundtrip mismatch
 */
export function ccCompressLanesSync(laneInput, opts = {}) {
  const node = getNodeCrypto();
  if (!node) {
    throw new Error("SCXQ2: ccCompressLanesSync requires Node.js crypto");
  }

  const o = normalizeOpts(opts);
  const lanes = normalizeLanes(laneInput);

  // Join lanes with a fixed break marker so the shared dictionary source
  // is deterministic.
  const joined = lanes
    .map((l) => canonicalizeInput(l.text, o))
    .join("\n\n/*__LANE_BREAK__*/\n\n");
  const joinedSha = sha256HexUtf8Sync(joined, node);

  const tokenStats = collectTokens(joined, o);
  const dict = buildDict(tokenStats, o);

  // source_file defaults to "lanes" for the shared dictionary.
  const dictJson = makeDictJson(dict, joinedSha, {
    ...o,
    source_file: o.source_file ?? "lanes"
  });
  // Canonical hash is computed over the dict minus its own hash field.
  dictJson.dict_sha256_canon = sha256HexUtf8Sync(
    canon(strip(dictJson, ["dict_sha256_canon"])),
    node
  );

  const laneBlocks = [];
  for (const lane of lanes) {
    const src = canonicalizeInput(lane.text, o);
    const srcSha = sha256HexUtf8Sync(src, node);

    const enc = encodeSCXQ2(src, dict, { enableEdgeOps: o.enableEdgeOps });
    const blockJson = makeLaneBlockJson(
      lane.lane_id,
      enc.b64,
      srcSha,
      dictJson.dict_sha256_canon,
      src,
      o,
      enc.edges
    );
    blockJson.block_sha256_canon = sha256HexUtf8Sync(
      canon(strip(blockJson, ["block_sha256_canon"])),
      node
    );

    // Verify roundtrip: decoding must reproduce the lane source exactly.
    const rt = ccDecompress(dictJson, blockJson);
    const rtSha = sha256HexUtf8Sync(rt, node);
    if (rtSha !== srcSha) {
      throw new Error(`SCXQ2: lane roundtrip mismatch: ${lane.lane_id}`);
    }

    laneBlocks.push(blockJson);
  }

  // Proof links each lane block back to the shared dictionary.
  const proof = {
    "@type": "cc.lanes.proof",
    "@version": "1.0.0",
    engine: CC_ENGINE["@id"],
    created_utc: o.created_utc,
    dict_sha256_canon: dictJson.dict_sha256_canon,
    lanes: laneBlocks.map((b) => ({
      lane_id: b.lane_id,
      source_sha256_utf8: b.source_sha256_utf8,
      block_sha256_canon: b.block_sha256_canon
    })),
    ok: true
  };

  const audit = {
    "@type": "cc.lanes.audit",
    "@version": "1.0.0",
    engine: CC_ENGINE["@id"],
    created_utc: o.created_utc,
    dict_entries: dictJson.dict.length,
    lane_count: laneBlocks.length
  };

  return { dict: dictJson, lanes: laneBlocks, proof, audit };
}
336
+
337
+ /* =============================================================================
338
+ Public API - Decompression
339
+ ============================================================================= */
340
+
341
/**
 * Decompresses an SCXQ2 block using its dictionary.
 *
 * Byte stream format:
 *   0x80 [hi] [lo]  -> dictionary reference (16-bit big-endian index)
 *   0x81 [hi] [lo]  -> single UTF-16 code unit (big-endian)
 *   byte < 0x80     -> literal ASCII character
 *
 * Fix vs. original: a truncated stream now throws instead of reading past
 * the end of the byte array (which previously decoded `undefined` bytes to
 * a spurious "\u0000" in the 0x81 path).
 *
 * @param {SCXQ2Dict} dictJson - SCXQ2 dictionary object
 * @param {SCXQ2Block} blockJson - SCXQ2 block object
 * @returns {string} Decompressed text
 * @throws {Error} On pack verification failure, truncated stream,
 *   or out-of-range dictionary reference
 */
export function ccDecompress(dictJson, blockJson) {
  verifyPack(dictJson, blockJson);

  const dict = dictJson.dict;
  const bytes = base64ToBytes(blockJson.b64);
  const len = bytes.length;

  let out = "";
  for (let i = 0; i < len; i++) {
    const b = bytes[i];

    // Dictionary reference: 0x80 [hi] [lo]
    if (b === 0x80) {
      if (i + 2 >= len) {
        throw new Error("SCXQ2: truncated dict reference");
      }
      const idx = (bytes[++i] << 8) | bytes[++i];
      const tok = dict[idx];
      if (typeof tok !== "string") {
        throw new Error(`SCXQ2: invalid dict reference at index ${idx}`);
      }
      out += tok;
      continue;
    }

    // UTF-16 literal: 0x81 [hi] [lo]
    if (b === 0x81) {
      if (i + 2 >= len) {
        throw new Error("SCXQ2: truncated UTF-16 literal");
      }
      out += String.fromCharCode((bytes[++i] << 8) | bytes[++i]);
      continue;
    }

    // ASCII literal: byte < 128
    out += String.fromCharCode(b);
  }

  return out;
}
381
+
382
+ /* =============================================================================
383
+ Public API - Verification
384
+ ============================================================================= */
385
+
386
/**
 * Verifies structural validity of an SCXQ2 pack.
 * Checks run in a fixed order; the first failing check throws its message.
 *
 * @param {SCXQ2Dict} dictJson - Dictionary object
 * @param {SCXQ2Block} blockJson - Block object
 * @returns {{ok: true}} Success indicator
 * @throws {Error} On verification failure
 */
export function verifyPack(dictJson, blockJson) {
  // Ordered [predicate, message] pairs; predicates are lazy closures so
  // later checks never run (or dereference null) once an earlier one fails.
  const checks = [
    [() => !!dictJson && typeof dictJson === "object", "SCXQ2: missing dict"],
    [() => !!blockJson && typeof blockJson === "object", "SCXQ2: missing block"],
    [() => dictJson["@type"] === "scxq2.dict", "SCXQ2: invalid dict @type"],
    [() => blockJson["@type"] === "scxq2.block", "SCXQ2: invalid block @type"],
    [() => Array.isArray(dictJson.dict), "SCXQ2: dict must be array"],
    [() => typeof blockJson.b64 === "string", "SCXQ2: block b64 must be string"],
    [() => dictJson.mode === SCXQ2_ENCODING.mode, "SCXQ2: invalid dict mode"],
    [() => blockJson.mode === SCXQ2_ENCODING.mode, "SCXQ2: invalid block mode"],
    [() => dictJson.encoding === SCXQ2_ENCODING.encoding, "SCXQ2: invalid dict encoding"],
    [() => blockJson.encoding === SCXQ2_ENCODING.encoding, "SCXQ2: invalid block encoding"],
    // Dictionary linkage: only enforced when both sides carry a hash.
    [
      () =>
        !(
          blockJson.dict_sha256_canon &&
          dictJson.dict_sha256_canon &&
          blockJson.dict_sha256_canon !== dictJson.dict_sha256_canon
        ),
      "SCXQ2: dict linkage mismatch"
    ]
  ];

  for (const [passes, message] of checks) {
    if (!passes()) {
      throw new Error(message);
    }
  }

  return { ok: true };
}
441
+
442
+ /* =============================================================================
443
+ Internal - Options Normalization
444
+ ============================================================================= */
445
+
446
/**
 * Normalizes user-supplied compression options into the internal shape.
 * Numeric limits are clamped to spec ranges; booleans are coerced;
 * created_utc defaults to the current UTC time when not provided.
 *
 * @param {Object} opts - Raw caller options
 * @returns {Object} Normalized options object
 */
function normalizeOpts(opts) {
  const maxDict = clamp(opts.maxDict ?? 1024, 1, 65535);
  const minLen = clamp(opts.minLen ?? 3, 2, 128);

  return {
    maxDict,
    minLen,
    created_utc: opts.created_utc ?? isoUtc(),
    source_file: opts.source_file ?? null,
    enableFieldOps: Boolean(opts.enableFieldOps),
    enableEdgeOps: Boolean(opts.enableEdgeOps),
    flags: {
      noStrings: Boolean(opts.noStrings),
      noWS: Boolean(opts.noWS),
      noPunct: Boolean(opts.noPunct)
    }
  };
}
461
+
462
/**
 * Validates and normalizes multi-lane input.
 *
 * Fix vs. original: duplicate lane_id values are now rejected — two lanes
 * with the same id would otherwise produce an ambiguous pack (proof entries
 * keyed by lane_id could not be distinguished).
 *
 * @param {Object} laneInput - Object with a non-empty `lanes` array
 * @returns {Array<{lane_id: string, text: string|Uint8Array}>}
 *   Lanes sorted by lane_id for deterministic output
 * @throws {Error} On missing/empty lanes, blank or duplicate lane_id,
 *   or non-string/non-Uint8Array lane text
 */
function normalizeLanes(laneInput) {
  if (!laneInput || typeof laneInput !== "object") {
    throw new Error("SCXQ2: lanes input invalid");
  }
  const lanes = laneInput.lanes;
  if (!Array.isArray(lanes) || lanes.length === 0) {
    throw new Error("SCXQ2: lanes missing");
  }

  const seen = new Set();
  const out = [];
  for (const l of lanes) {
    const lane_id = String(l?.lane_id ?? "").trim();
    if (!lane_id) {
      throw new Error("SCXQ2: lane_id missing");
    }
    if (seen.has(lane_id)) {
      throw new Error(`SCXQ2: duplicate lane_id: ${lane_id}`);
    }
    seen.add(lane_id);

    const text = l?.text;
    if (typeof text !== "string" && !(text instanceof Uint8Array)) {
      throw new Error("SCXQ2: lane text invalid");
    }
    out.push({ lane_id, text });
  }

  // Deterministic lane order
  out.sort((a, b) =>
    a.lane_id < b.lane_id ? -1 : a.lane_id > b.lane_id ? 1 : 0
  );
  return out;
}
490
+
491
+ /* =============================================================================
492
+ Internal - Input Canonicalization (CC.NORM)
493
+ ============================================================================= */
494
+
495
/**
 * Canonicalizes raw input into a string with normalized newlines (CC.NORM).
 * Uint8Array input is decoded as UTF-8 (non-fatal, so invalid sequences
 * become replacement characters rather than throwing).
 *
 * @param {string|Uint8Array} input - Source text or bytes
 * @param {Object} o - Normalized options (unused here; kept for symmetry)
 * @returns {string} Text with all CRLF/CR newlines converted to LF
 * @throws {Error} If input is neither a string nor a Uint8Array
 */
function canonicalizeInput(input, o) {
  let text;
  if (input instanceof Uint8Array) {
    text = new TextDecoder("utf-8", { fatal: false }).decode(input);
  } else if (typeof input === "string") {
    text = input;
  } else {
    throw new Error("SCXQ2: input must be string or Uint8Array");
  }

  // CRLF first so a lone-CR pass afterwards cannot double-convert.
  return text.replace(/\r\n/g, "\n").replace(/\r/g, "\n");
}
509
+
510
+ /* =============================================================================
511
+ Internal - Token Collection (CC.DICT + CC.FIELD)
512
+ ============================================================================= */
513
+
514
/**
 * Collects candidate dictionary tokens from text and scores them by
 * estimated byte savings (CC.DICT + CC.FIELD).
 *
 * Token sources: identifiers, whitespace runs, punctuation clusters,
 * string-literal contents, and (when enableFieldOps) JSON keys both bare
 * and quoted. Tokens shorter than minLen or containing NUL are dropped.
 * A token must occur at least twice and save bytes net of the 3-byte
 * dictionary-reference cost to be scored.
 *
 * @param {string} text - Canonicalized source text
 * @param {Object} o - Normalized options
 * @returns {Array<{tok: string, count: number, totalSavings: number}>}
 *   Candidates sorted by savings desc, then length desc, then token asc
 */
function collectTokens(text, o) {
  const freq = new Map();

  const add = (tok) => {
    if (!tok || tok.length < o.minLen) return;
    if (tok.indexOf("\u0000") >= 0) return;
    freq.set(tok, (freq.get(tok) || 0) + 1);
  };

  // Run a regex over the whole text and feed each (optionally projected)
  // match into the frequency table.
  const scan = (re, pick = (m) => m[0]) => {
    for (const m of text.matchAll(re)) {
      add(pick(m));
    }
  };

  // Identifiers / words
  scan(/[A-Za-z_$][A-Za-z0-9_$]{2,}/g);

  // Whitespace runs
  if (!o.flags.noWS) {
    scan(/[ \t]{2,}/g);
  }

  // Punctuation clusters
  if (!o.flags.noPunct) {
    scan(/[{}()[\];,.=:+\-*/<>!&|%^]{2,}/g);
  }

  // String literal contents (trimmed; add() rejects empty results)
  if (!o.flags.noStrings) {
    scan(/"([^"\n]{3,64})"|'([^'\n]{3,64})'/g, (m) => (m[1] || m[2] || "").trim());
  }

  // FIELD operator: JSON keys, added both bare and with quotes
  if (o.enableFieldOps) {
    for (const m of text.matchAll(/"([^"\\\n]{1,64})"\s*:/g)) {
      const k = m[1];
      if (k) {
        add(k);
        add(`"${k}"`);
      }
    }
  }

  // Score by estimated savings: each occurrence is replaced by a 3-byte
  // dictionary reference, so net savings = (tokenBytes - 3) * count.
  const scored = [];
  for (const [tok, count] of freq) {
    if (count < 2) continue;
    const savings = (estimateBytes(tok) - 3) * count;
    if (savings > 0) {
      scored.push({ tok, count, totalSavings: savings });
    }
  }

  // Total order (savings, length, token text) => deterministic output.
  scored.sort(
    (a, b) =>
      b.totalSavings - a.totalSavings ||
      b.tok.length - a.tok.length ||
      (a.tok < b.tok ? -1 : 1)
  );

  return scored;
}
581
+
582
+ /* =============================================================================
583
+ Internal - Dictionary Build
584
+ ============================================================================= */
585
+
586
/**
 * Builds the dictionary token table from scored candidates.
 * Takes the top-scoring tokens up to the configured capacity, then orders
 * the table longest-first so greedy encoding prefers the biggest match
 * (with a lexicographic tiebreak for determinism).
 *
 * @param {Array<{tok: string}>} scored - Candidates, best-first
 * @param {Object} o - Normalized options (reads maxDict)
 * @returns {string[]} Dictionary tokens, longest-first
 */
function buildDict(scored, o) {
  const tokens = [];
  for (const { tok } of scored) {
    if (tokens.length >= o.maxDict) break;
    tokens.push(tok);
  }
  tokens.sort((a, b) => b.length - a.length || (a < b ? -1 : 1));
  return tokens;
}
596
+
597
+ /* =============================================================================
598
+ Internal - SCXQ2 Encoding
599
+ ============================================================================= */
600
+
601
/**
 * Encodes text into the SCXQ2 byte stream using greedy dictionary matching.
 *
 * Byte stream format:
 *   0x80 [hi] [lo]  -> dictionary reference (16-bit big-endian index)
 *   0x81 [hi] [lo]  -> single UTF-16 code unit (big-endian)
 *   byte < 0x80     -> literal ASCII character
 *
 * Perf fix vs. original: instead of scanning the entire dictionary at every
 * text position (O(|text| * |dict|)), tokens are bucketed by their first
 * character. Only tokens whose first character equals text[i] can satisfy
 * startsWith(tok, i), and relative dict order is preserved within each
 * bucket, so the greedy "first match in dict order" result is identical.
 *
 * @param {string} text - Canonicalized source text
 * @param {string[]} dict - Dictionary tokens (longest-first)
 * @param {Object} [opts] - Encoding options
 * @param {boolean} [opts.enableEdgeOps] - Record [prevIdx, idx] witnesses
 *   for consecutive dictionary hits
 * @returns {{bytes: number[], b64: string, edges: Array<[number,number]>|null}}
 */
function encodeSCXQ2(text, dict, opts = {}) {
  const map = new Map(dict.map((t, i) => [t, i]));

  // Bucket tokens by first character, preserving dict order per bucket.
  const byFirstChar = new Map();
  for (const tok of dict) {
    const bucket = byFirstChar.get(tok[0]);
    if (bucket) {
      bucket.push(tok);
    } else {
      byFirstChar.set(tok[0], [tok]);
    }
  }

  const bytes = [];
  const edges = opts.enableEdgeOps ? [] : null;
  let lastDictIdx = -1;

  for (let i = 0; i < text.length; ) {
    let matched = false;

    const candidates = byFirstChar.get(text[i]);
    if (candidates) {
      for (const tok of candidates) {
        if (text.startsWith(tok, i)) {
          const idx = map.get(tok);
          bytes.push(0x80, idx >> 8, idx & 255);
          i += tok.length;
          matched = true;

          // Edge witness: consecutive dictionary hits only
          if (edges && lastDictIdx >= 0) {
            edges.push([lastDictIdx, idx]);
          }
          lastDictIdx = idx;
          break;
        }
      }
    }

    if (!matched) {
      const c = text.charCodeAt(i++);
      if (c < 128) {
        bytes.push(c);
      } else {
        bytes.push(0x81, c >> 8, c & 255);
      }
      lastDictIdx = -1; // literal breaks the edge chain
    }
  }

  return { bytes, b64: bytesToBase64(bytes), edges };
}
639
+
640
+ /* =============================================================================
641
+ Internal - Utilities
642
+ ============================================================================= */
643
+
644
/**
 * Estimates the encoded size of a string in the SCXQ2 stream:
 * 1 byte per ASCII UTF-16 unit, 3 bytes (0x81 hi lo) per non-ASCII unit.
 * Note this is the stream cost model, not UTF-8 length.
 *
 * @param {string} s - Token text
 * @returns {number} Estimated encoded byte count
 */
function estimateBytes(s) {
  let total = 0;
  let i = s.length;
  while (i--) {
    total += s.charCodeAt(i) < 128 ? 1 : 3;
  }
  return total;
}
651
+
652
/**
 * Returns the UTF-8 byte length of a string (used for size metadata).
 *
 * @param {string} s - Input text
 * @returns {number} UTF-8 encoded byte count
 */
function utf8Bytes(s) {
  return new TextEncoder().encode(s).byteLength;
}
655
+
656
/**
 * Clamps a numeric value into [lo, hi].
 *
 * Fix vs. original: NaN previously propagated straight through
 * (Math.min/max both return NaN), so a malformed opts.maxDict produced
 * `maxDict: NaN`, which silently disabled the dictionary cap in
 * buildDict(). Non-numbers / NaN now fail safe to `lo`.
 *
 * @param {number} v - Value to clamp (coerced to number)
 * @param {number} lo - Inclusive lower bound (also the NaN fallback)
 * @param {number} hi - Inclusive upper bound
 * @returns {number} Clamped value
 */
function clamp(v, lo, hi) {
  const n = Number(v);
  if (Number.isNaN(n)) {
    return lo;
  }
  return Math.min(hi, Math.max(lo, n));
}
659
+
660
/**
 * Returns the current UTC time as an ISO-8601 string with the
 * millisecond component stripped (e.g. "2024-01-02T03:04:05Z").
 *
 * @returns {string} Second-precision ISO UTC timestamp
 */
function isoUtc() {
  const stamp = new Date().toISOString();
  return stamp.replace(/\.\d{3}Z$/, "Z");
}
663
+
664
+ /* =============================================================================
665
+ Internal - JSON Emitters
666
+ ============================================================================= */
667
+
668
/**
 * Builds the scxq2.dict JSON object. The caller computes and attaches
 * dict_sha256_canon afterwards (it is excluded from its own hash).
 *
 * NOTE(review): property insertion order is preserved as written here;
 * if canon() is order-sensitive, reordering these keys would change the
 * canonical hash — confirm before restructuring.
 *
 * @param {string[]} dict - Token table (longest-first)
 * @param {string} srcSha - SHA-256 hex of the canonicalized source
 * @param {Object} o - Normalized options
 * @returns {Object} Dictionary JSON (without dict_sha256_canon)
 */
function makeDictJson(dict, srcSha, o) {
  return {
    "@type": "scxq2.dict",
    "@version": "1.0.0",
    mode: SCXQ2_ENCODING.mode,
    encoding: SCXQ2_ENCODING.encoding,
    created_utc: o.created_utc,
    source_sha256_utf8: srcSha,
    max_dict: o.maxDict,
    min_len: o.minLen,
    flags: o.flags,
    dict
  };
}
682
+
683
/**
 * Builds the scxq2.block JSON object. The caller computes and attaches
 * block_sha256_canon afterwards (it is excluded from its own hash).
 *
 * NOTE(review): property insertion order is preserved as written here;
 * if canon() is order-sensitive, reordering keys would change the
 * canonical hash — confirm before restructuring.
 *
 * @param {string} b64 - Base64 of the SCXQ2 byte stream
 * @param {string} srcSha - SHA-256 hex of the canonicalized source
 * @param {string} dictSha - Canonical hash linking to the paired dictionary
 * @param {string} src - Canonicalized source (used only for size metadata)
 * @param {Object} o - Normalized options
 * @param {Array<[number,number]>|null} edges - Edge witnesses, or null
 * @returns {Object} Block JSON (without block_sha256_canon)
 */
function makeBlockJson(b64, srcSha, dictSha, src, o, edges) {
  const block = {
    "@type": "scxq2.block",
    "@version": "1.0.0",
    mode: SCXQ2_ENCODING.mode,
    encoding: SCXQ2_ENCODING.encoding,
    created_utc: o.created_utc,
    source_sha256_utf8: srcSha,
    dict_sha256_canon: dictSha,
    original_bytes_utf8: utf8Bytes(src),
    b64
  };
  // Edge witnesses are optional and capped at 1000 entries to bound size.
  if (edges && edges.length > 0) {
    block.edges = edges.slice(0, 1000); // Limit edge witnesses
  }
  return block;
}
700
+
701
/**
 * Builds a lane block: a standard scxq2.block with a lane_id attached.
 * lane_id is appended after the standard fields (key order feeds the
 * canonical hash if canon() is order-sensitive — see makeBlockJson).
 *
 * @param {string} laneId - Lane identifier
 * @param {string} b64 - Base64 of the SCXQ2 byte stream
 * @param {string} srcSha - SHA-256 hex of the lane's canonicalized source
 * @param {string} dictSha - Canonical hash of the shared dictionary
 * @param {string} src - Canonicalized lane source (size metadata only)
 * @param {Object} o - Normalized options
 * @param {Array<[number,number]>|null} edges - Edge witnesses, or null
 * @returns {Object} Lane block JSON (without block_sha256_canon)
 */
function makeLaneBlockJson(laneId, b64, srcSha, dictSha, src, o, edges) {
  const block = makeBlockJson(b64, srcSha, dictSha, src, o, edges);
  block.lane_id = laneId;
  return block;
}
706
+
707
/**
 * Builds the single-lane compression proof record.
 * `ok` asserts reversibility: the roundtrip (decompress) hash must equal
 * the source hash. The steps array traces the operator pipeline.
 *
 * @param {string} srcSha - SHA-256 hex of the canonicalized source
 * @param {string} rtSha - SHA-256 hex of the roundtrip decompression
 * @param {Object} dictJson - Finished dictionary JSON (with canonical hash)
 * @param {Object} blockJson - Finished block JSON (with canonical hash)
 * @param {Object} o - Normalized options
 * @returns {Object} cc.proof record
 */
function makeProof(srcSha, rtSha, dictJson, blockJson, o) {
  return {
    "@type": "cc.proof",
    "@version": "1.0.0",
    engine: CC_ENGINE["@id"],
    created_utc: o.created_utc,
    source_sha256_utf8: srcSha,
    dict_sha256_canon: dictJson.dict_sha256_canon,
    block_sha256_canon: blockJson.block_sha256_canon,
    roundtrip_sha256_utf8: rtSha,
    // Reversibility claim: true iff decompression reproduced the source.
    ok: srcSha === rtSha,
    steps: [
      { op: CC_OPS.NORM, sha: srcSha },
      { op: CC_OPS.DICT, dict_entries: dictJson.dict.length },
      { op: "scxq2.encode.v1", block_sha: blockJson.block_sha256_canon },
      { op: "scxq2.decode.v1", roundtrip_sha: rtSha }
    ]
  };
}
726
+
727
/**
 * Builds the single-lane compression audit record: size comparison,
 * dictionary configuration, and the 25 highest-scoring tokens.
 *
 * Note: the ratio compares UTF-8 bytes of the base64 payload against the
 * original source bytes (it excludes the surrounding JSON envelope).
 *
 * @param {string} src - Canonicalized source text
 * @param {Array<{tok: string, count: number, totalSavings: number}>} tokenStats
 *   Scored token candidates, best-first (from collectTokens)
 * @param {Object} dictJson - Finished dictionary JSON
 * @param {Object} blockJson - Finished block JSON
 * @param {Object} o - Normalized options
 * @returns {Object} cc.audit record
 */
function makeAudit(src, tokenStats, dictJson, blockJson, o) {
  const srcBytes = utf8Bytes(src);
  const b64Bytes = utf8Bytes(blockJson.b64);

  return {
    "@type": "cc.audit",
    "@version": "1.0.0",
    engine: CC_ENGINE["@id"],
    created_utc: o.created_utc,
    sizes: {
      original_bytes_utf8: srcBytes,
      encoded_b64_bytes_utf8: b64Bytes,
      // null (not division-by-zero Infinity/NaN) for empty input
      ratio: srcBytes ? Number((b64Bytes / srcBytes).toFixed(6)) : null
    },
    dict: {
      entries: dictJson.dict.length,
      max_dict: dictJson.max_dict,
      min_len: dictJson.min_len,
      flags: dictJson.flags
    },
    top_tokens: tokenStats.slice(0, 25).map((t) => ({
      tok: t.tok,
      count: t.count,
      totalSavings: t.totalSavings
    }))
  };
}