querysub 0.355.0 → 0.357.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (70) hide show
  1. package/.cursorrules +8 -0
  2. package/bin/movelogs.js +4 -0
  3. package/package.json +12 -6
  4. package/scripts/postinstall.js +23 -0
  5. package/src/-a-archives/archiveCache.ts +10 -12
  6. package/src/-a-archives/archives.ts +29 -0
  7. package/src/-a-archives/archivesBackBlaze.ts +60 -12
  8. package/src/-a-archives/archivesDisk.ts +27 -8
  9. package/src/-a-archives/archivesLimitedCache.ts +21 -0
  10. package/src/-a-archives/archivesMemoryCache.ts +350 -0
  11. package/src/-a-archives/archivesPrivateFileSystem.ts +22 -0
  12. package/src/-g-core-values/NodeCapabilities.ts +3 -0
  13. package/src/0-path-value-core/auditLogs.ts +5 -1
  14. package/src/0-path-value-core/pathValueCore.ts +7 -7
  15. package/src/4-dom/qreact.tsx +1 -0
  16. package/src/4-querysub/Querysub.ts +1 -5
  17. package/src/config.ts +5 -0
  18. package/src/diagnostics/MachineThreadInfo.tsx +235 -0
  19. package/src/diagnostics/NodeViewer.tsx +3 -2
  20. package/src/diagnostics/logs/FastArchiveAppendable.ts +79 -42
  21. package/src/diagnostics/logs/FastArchiveController.ts +102 -63
  22. package/src/diagnostics/logs/FastArchiveViewer.tsx +36 -8
  23. package/src/diagnostics/logs/IndexedLogs/BufferIndex.ts +461 -0
  24. package/src/diagnostics/logs/IndexedLogs/BufferIndexCPP.cpp +327 -0
  25. package/src/diagnostics/logs/IndexedLogs/BufferIndexCPP.d.ts +18 -0
  26. package/src/diagnostics/logs/IndexedLogs/BufferIndexCPP.js +1 -0
  27. package/src/diagnostics/logs/IndexedLogs/BufferIndexHelpers.ts +140 -0
  28. package/src/diagnostics/logs/IndexedLogs/BufferIndexLogsOptimizationConstants.ts +22 -0
  29. package/src/diagnostics/logs/IndexedLogs/BufferIndexWAT.wat +1145 -0
  30. package/src/diagnostics/logs/IndexedLogs/BufferIndexWAT.wat.d.ts +178 -0
  31. package/src/diagnostics/logs/IndexedLogs/BufferListStreamer.ts +206 -0
  32. package/src/diagnostics/logs/IndexedLogs/BufferUnitIndex.ts +719 -0
  33. package/src/diagnostics/logs/IndexedLogs/BufferUnitSet.ts +146 -0
  34. package/src/diagnostics/logs/IndexedLogs/FilePathSelector.tsx +408 -0
  35. package/src/diagnostics/logs/IndexedLogs/FindProgressTracker.ts +45 -0
  36. package/src/diagnostics/logs/IndexedLogs/IndexedLogs.ts +598 -0
  37. package/src/diagnostics/logs/IndexedLogs/LogStreamer.ts +47 -0
  38. package/src/diagnostics/logs/IndexedLogs/LogViewer3.tsx +702 -0
  39. package/src/diagnostics/logs/IndexedLogs/TimeFileTree.ts +236 -0
  40. package/src/diagnostics/logs/IndexedLogs/binding.gyp +23 -0
  41. package/src/diagnostics/logs/IndexedLogs/moveIndexLogsToPublic.ts +221 -0
  42. package/src/diagnostics/logs/IndexedLogs/moveLogsEntry.ts +10 -0
  43. package/src/diagnostics/logs/LogViewer2.tsx +120 -55
  44. package/src/diagnostics/logs/TimeRangeSelector.tsx +5 -2
  45. package/src/diagnostics/logs/diskLogger.ts +32 -48
  46. package/src/diagnostics/logs/errorNotifications/ErrorNotificationController.ts +3 -2
  47. package/src/diagnostics/logs/errorNotifications/errorDigests.tsx +1 -0
  48. package/src/diagnostics/logs/lifeCycleAnalysis/LifeCyclePages.tsx +150 -0
  49. package/src/diagnostics/logs/lifeCycleAnalysis/lifeCycles.tsx +133 -0
  50. package/src/diagnostics/logs/lifeCycleAnalysis/test.ts +180 -0
  51. package/src/diagnostics/logs/lifeCycleAnalysis/test.wat +106 -0
  52. package/src/diagnostics/logs/lifeCycleAnalysis/test.wat.d.ts +2 -0
  53. package/src/diagnostics/logs/lifeCycleAnalysis/testHoist.ts +5 -0
  54. package/src/diagnostics/logs/logViewerExtractField.ts +2 -3
  55. package/src/diagnostics/managementPages.tsx +11 -1
  56. package/src/diagnostics/trackResources.ts +1 -1
  57. package/src/misc/lz4_wasm_nodejs.d.ts +34 -0
  58. package/src/misc/lz4_wasm_nodejs.js +178 -0
  59. package/src/misc/lz4_wasm_nodejs_bg.js +94 -0
  60. package/src/misc/lz4_wasm_nodejs_bg.wasm +0 -0
  61. package/src/misc/lz4_wasm_nodejs_bg.wasm.d.ts +15 -0
  62. package/src/storage/CompressedStream.ts +13 -0
  63. package/src/storage/LZ4.ts +32 -0
  64. package/src/storage/ZSTD.ts +10 -0
  65. package/src/wat/watCompiler.ts +1716 -0
  66. package/src/wat/watGrammar.pegjs +93 -0
  67. package/src/wat/watHandler.ts +179 -0
  68. package/src/wat/watInstructions.txt +707 -0
  69. package/src/zip.ts +3 -89
  70. package/src/diagnostics/logs/lifeCycleAnalysis/spec.md +0 -125
@@ -0,0 +1,719 @@
1
+
2
+
3
+ import { LZ4 } from "../../../storage/LZ4";
4
+ import { measureBlock, measureFnc } from "socket-function/src/profiling/measure";
5
+ import { Zip } from "../../../zip";
6
+ import { BufferReader, Reader, WILD_CARD_BYTE, createMatchesPattern, SearchParams } from "./BufferIndexHelpers";
7
+ import { formatNumber, formatPercent } from "socket-function/src/formatting/format";
8
+ import { lazy } from "socket-function/src/caching";
9
+ import { list, sort } from "socket-function/src/misc";
10
+ import { testDisableCache } from "../../../-a-archives/archivesMemoryCache";
11
+ import { devDebugbreak } from "../../../config";
12
+ import { BufferUnitIndexParallelSearchCount, DEFAULT_BLOCK_SIZE, DEFAULT_TARGET_UNITS_PER_BUCKET } from "./BufferIndexLogsOptimizationConstants";
13
+ import { runInParallel } from "socket-function/src/batching";
14
+
15
+ const USE_COMPRESSION = true;
16
+
17
+
18
/**
 * Lazily resolves the optional C++ addon (`./BufferIndexCPP`).
 *
 * The wrapped callback returns `undefined` when `process.versions.node` is not
 * available (so the addon is never required outside Node.js), or when
 * requiring the addon throws; in the latter case a warning is logged and the
 * callers fall back to their TypeScript implementations.
 */
let getCppAddon = lazy(() => {
    try {
        // Only attempt the require when we are actually running under Node.js
        const runningInNode = typeof process !== "undefined" && !!process.versions && !!process.versions.node;
        if (!runningInNode) {
            return undefined;
        }
        return require("./BufferIndexCPP") as typeof import("./BufferIndexCPP");
    } catch (e) {
        console.warn("C++ addon not available, falling back to TypeScript implementation", e);
        return undefined;
    }
});
33
+
34
/**
 * Compresses `buffer` with LZ4, or returns it untouched when the module-level
 * `USE_COMPRESSION` switch is off.
 */
function compressBuffer(buffer: Buffer): Buffer {
    if (USE_COMPRESSION) {
        return LZ4.compress(buffer);
    }
    return buffer;
}
40
+
41
/**
 * Inverse of `compressBuffer`: LZ4-decompresses `buffer`, or returns it
 * untouched when the module-level `USE_COMPRESSION` switch is off.
 */
function decompressBuffer(buffer: Buffer): Buffer {
    if (USE_COMPRESSION) {
        return LZ4.decompress(buffer);
    }
    return buffer;
}
47
+
48
/**
 * Hashes a 32-bit unit and reduces it to a bucket number with `mask`.
 *
 * The hash is a single 32-bit multiply followed by an xor with the value
 * shifted right by 15 bits. A result of 0 is remapped to 1, because 0 is
 * used as the "empty" marker by the hash table.
 *
 * @param unit The 4-byte unit, read as an unsigned 32-bit integer.
 * @param mask Bit mask selecting the bucket bits (hash table capacity - 1).
 * @returns A bucket number; never 0.
 */
function getMaskedHash(unit: number, mask: number): number {
    const multiplied = Math.imul(unit, 0x27d4eb2d);
    const mixed = multiplied ^ (multiplied >>> 15);
    const bucket = mixed & mask;
    // Bucket 0 is reserved for "empty", so fold it into bucket 1
    return bucket === 0 ? 1 : bucket;
}
61
+
62
+
63
+ /*
64
+ - If you store every single position, the index is going to really struggle to be significantly smaller than your input data. Because you'll be storing a unit value per byte, which is just a lot of data.
65
+ - SO, we hash each unit, only taking some bits. By only storing some of the units, we massively decrease the storage size, making this significantly smaller than the input data, or even the compressed data.
66
+ - This doesn't affect most searches, because most searches have multiple unique units, and the intersection of them likely only contains the actual blocks.
67
+ - We also store the list of blocks associated with each hash unit outside the hash table in a list, and each one of those lists are compressed.
68
+ - As a result, our compression ratio is usually amazing, although we do have some false positives.
69
+ - The index is often easily 100X smaller than the input data. And then this can increase multiple times if you compress the index (you can't compress it and still use it, however, this will reduce your network usage and disk usage).
70
+
71
+ PERFORMANCE OPTIMIZATION:
72
+ - Uses Uint32Arrays for hash table and linked lists for fast, cache-friendly memory access
73
+ - Linked lists handle collisions more efficiently than linear probing, with better cache locality
74
+ - Dynamic memory allocation (starting at totalUnits/100, growing 4x when needed) keeps memory usage optimal
75
+
76
+
77
+ BlocksBuffer {
78
+ byte type
79
+ byte[3] padding
80
+ uint32 blockCount
81
+ uint32[blockCount + 1] blockOffsets
82
+ compress(Block)[]
83
+ }
84
+
85
+ Block {
86
+ uint32 bufferCount
87
+ uint32[bufferCount + 1] bufferOffsets (offsets from the start of the uncompressed block)
88
+ byte[][] bufferData
89
+ }
90
+
91
+ Index {
92
+ byte type
93
+ byte[3] padding
94
+ uint32 mask
95
+ uint32 dataOffset
96
+ uint32[] hashTable (data offsets for each bucket, 0 = empty)
97
+ DataEntry[] dataSection
98
+ }
99
+
100
+ DataEntry {
101
+ uint32 compressedSize
102
+ compress(uint16[] deltaEncodedBlockIndices)
103
+ }
104
+ */
105
/**
 * Block store plus a lossy hash index over every 4-byte "unit" of the stored
 * buffers.
 *
 * `encode` groups buffers into compressed blocks and builds an index mapping
 * a masked hash of each unit to the list of blocks that contain it. `find`
 * uses that index to narrow a search down to candidate blocks, and then
 * verifies every buffer in those blocks against the real pattern, so index
 * false positives only cost time, they never produce wrong matches.
 *
 * Index layout produced by `encode`:
 *   byte type, byte[3] padding, uint32 mask, uint32 dataOffset,
 *   uint32[mask + 1] hashTable (offset into the data section, 0 = empty),
 *   data section: 4 reserved zero bytes, then one entry per non-empty bucket
 *   (uint32 compressedSize, compress(uint16 LE delta-encoded block indices)).
 */
export class BufferUnitIndex {

    /**
     * Number of zero bytes written at the start of the data section. They
     * guarantee that no real entry lives at offset 0, which the hash table
     * uses as the "empty bucket" marker.
     */
    private static readonly DATA_SECTION_RESERVED_BYTES = 4;

    /**
     * Block indices (and their deltas) are stored as uint16 values, so this
     * is the largest block count whose indices 0..65535 all fit.
     */
    private static readonly MAX_BLOCK_COUNT = 0x10000;

    /**
     * Encodes buffers into a compressed blocks buffer and a search index.
     *
     * IMPORTANT! The input data values should be sorted from newest to oldest.
     *
     * @param allData Buffers to store. They are packed, in order, into blocks
     *   of roughly `blockSize` bytes (a block always holds at least one buffer).
     * @param options.type Byte written at offset 0 of both outputs.
     * @param options.blockSize Target uncompressed block size, defaults to DEFAULT_BLOCK_SIZE.
     * @param options.targetUnitsPerBucket Used to size the hash table, defaults to DEFAULT_TARGET_UNITS_PER_BUCKET.
     * @returns `blocks` (header + compressed blocks) and `index` (see class comment).
     * @throws Error when the input produces more blocks than the uint16 posting lists can address.
     */
    @measureFnc
    public static encode(allData: Buffer[], options: {
        type: number;
        blockSize?: number;
        targetUnitsPerBucket?: number;
    }): {
        blocks: Buffer;
        index: Buffer;
    } {
        const type = options.type;
        const BLOCK_SIZE = options.blockSize ?? DEFAULT_BLOCK_SIZE;
        const TARGET_UNITS_PER_BUCKET = options.targetUnitsPerBucket ?? DEFAULT_TARGET_UNITS_PER_BUCKET;
        const reservedBytes = BufferUnitIndex.DATA_SECTION_RESERVED_BYTES;
        const maxBlockCount = BufferUnitIndex.MAX_BLOCK_COUNT;

        // Step 1: Group buffers into blocks
        const blocks: Buffer[] = [];
        const blocksUncompressed: Buffer[] = [];
        measureBlock(() => {
            function createBlockWithMetadata(buffers: Buffer[]): { block: Buffer; uncompressed: Buffer } {
                const bufferCount = buffers.length;
                // Header: bufferCount, then bufferCount + 1 offsets (all uint32 little-endian)
                const headerBuffer = Buffer.alloc(4 * (bufferCount + 2));
                headerBuffer.writeUInt32LE(bufferCount, 0);
                let currentOffset = headerBuffer.length;
                let headerPos = 4;
                for (const buffer of buffers) {
                    headerBuffer.writeUInt32LE(currentOffset, headerPos);
                    headerPos += 4;
                    currentOffset += buffer.length;
                }
                // Trailing offset marks the end of the last buffer
                headerBuffer.writeUInt32LE(currentOffset, headerPos);

                const concatenatedData = Buffer.concat(buffers);
                return {
                    block: Buffer.concat([headerBuffer, concatenatedData]),
                    // The index is built over the raw data only, without the header
                    uncompressed: concatenatedData,
                };
            }

            function flush(buffers: Buffer[]) {
                const created = createBlockWithMetadata(buffers);
                blocks.push(created.block);
                blocksUncompressed.push(created.uncompressed);
            }

            let currentBlockBuffers: Buffer[] = [];
            let currentBlockSize = 0;
            for (const buffer of allData) {
                // Close the current block before it would exceed the target size
                if (currentBlockBuffers.length > 0 && currentBlockSize + buffer.length > BLOCK_SIZE) {
                    flush(currentBlockBuffers);
                    currentBlockBuffers = [];
                    currentBlockSize = 0;
                }
                currentBlockBuffers.push(buffer);
                currentBlockSize += buffer.length;
            }
            if (currentBlockBuffers.length > 0) {
                flush(currentBlockBuffers);
            }
        }, `createBlocks`);

        // Posting lists store block indices as uint16 deltas. Beyond this count they would
        //  silently wrap around and searches would skip blocks, so fail loudly instead.
        if (blocks.length > maxBlockCount) {
            throw new Error(`Cannot index ${blocks.length} blocks: block indices are stored as uint16 values, so at most ${maxBlockCount} blocks are supported. Increase blockSize or split the input.`);
        }

        // Step 2: Create final blocks buffer with compressed blocks
        const finalBlocksBuffer = measureBlock(() => {
            // Compress each entire block (metadata + data)
            const compressedBlocks = blocks.map(block => compressBuffer(block));

            // Header: type, 3 padding bytes, blockCount, then blockCount + 1 offsets
            const blocksHeaderBuffer = Buffer.alloc(4 + 4 + ((compressedBlocks.length + 1) * 4));
            blocksHeaderBuffer.writeUInt8(type, 0);
            blocksHeaderBuffer.writeUInt32LE(compressedBlocks.length, 4);

            let currentBlockOffset = blocksHeaderBuffer.length;
            for (let i = 0; i < compressedBlocks.length; i++) {
                blocksHeaderBuffer.writeUInt32LE(currentBlockOffset, 8 + (i * 4));
                currentBlockOffset += compressedBlocks[i].length;
            }
            blocksHeaderBuffer.writeUInt32LE(currentBlockOffset, 8 + (compressedBlocks.length * 4));

            return Buffer.concat([blocksHeaderBuffer, ...compressedBlocks]);
        }, `createFinalBlocksBuffer`);

        const estimatedUniqueUnits = measureBlock(() => {
            const cppAddon = getCppAddon();
            if (cppAddon) {
                // Use C++ addon for performance
                return cppAddon.estimateUniqueUnits(allData);
            }
            // TypeScript fallback implementation
            // Use a 64KB table to track which 16-bit masked hashes we've seen
            const seenMaskedHashes = Buffer.alloc(65536);
            let uniqueCount = 0;
            for (const buffer of allData) {
                for (let i = 0; i <= buffer.length - 4; i++) {
                    const maskedHash = getMaskedHash(buffer.readUInt32LE(i), 0xFFFF);
                    if (seenMaskedHashes[maskedHash] === 0) {
                        seenMaskedHashes[maskedHash] = 1;
                        uniqueCount++;
                    }
                }
            }
            // Estimate actual unique units by multiplying by 2
            // This accounts for collisions in the 16-bit space
            // Only inaccurate if unique units approaches or exceeds 65k
            return uniqueCount * 2;
        }, `collectUniqueUnits`);

        // Step 3: Calculate hash table size and mask
        const { hashTableCapacity, mask } = measureBlock(() => {
            const uniqueUnitCount = estimatedUniqueUnits;
            const targetBuckets = Math.max(1, Math.floor(uniqueUnitCount / TARGET_UNITS_PER_BUCKET));
            let hashTableCapacity = Math.pow(2, Math.floor(Math.log2(targetBuckets * 2)) + 1);
            let mask = hashTableCapacity - 1;

            // Cap mask at 16 bits (2 bytes)
            if (mask > 0xFFFF) {
                mask = 0xFFFF;
                hashTableCapacity = (mask + 1);
            }

            console.log(`UnitBlockLookup: ${blocks.length} blocks, ${uniqueUnitCount} unique units, ${hashTableCapacity} hash table capacity (~${(uniqueUnitCount / hashTableCapacity).toFixed(1)} units/bucket), mask: ${mask}`);

            return { hashTableCapacity, mask };
        }, `calculateHashTableSize`);

        // Step 4: Build hash table with linked lists using Uint32Arrays for performance
        const { hashTable, linkedListData, filledSlots } = measureBlock(() => {
            const cppAddon = getCppAddon();
            if (cppAddon) {
                // Use C++ addon for performance
                return cppAddon.buildHashTable(blocksUncompressed, hashTableCapacity, mask);
            }
            // TypeScript fallback implementation
            // Calculate initial linked list capacity based on total units
            let totalUnits = 0;
            for (const buffer of blocksUncompressed) {
                totalUnits += Math.max(0, buffer.length - 3); // number of 4-byte units
            }
            let linkedListCapacity = Math.max(1000, Math.floor(totalUnits / 100));

            // Hash table: stores head node index for each bucket (0 means empty)
            const hashTable = new Uint32Array(hashTableCapacity);

            // Linked list storage: each node is 3 uint32s: [maskedHash, blockIndex, nextNodeIndex]
            let linkedListData = new Uint32Array(linkedListCapacity * 3);
            let linkedListSize = 0;
            let filledSlots = 0;

            function allocateNode(maskedHash: number, blockIndex: number, nextNodeIndex: number): number {
                if (linkedListSize >= linkedListCapacity) {
                    console.log(`Growing linked list buffer`);
                    // Reallocate with 4x capacity
                    const newCapacity = linkedListCapacity * 4;
                    const newData = new Uint32Array(newCapacity * 3);
                    newData.set(linkedListData);
                    linkedListData = newData;
                    linkedListCapacity = newCapacity;
                }
                const nodeIndex = linkedListSize + 1; // +1 because 0 means empty
                const nodeOffset = linkedListSize * 3;
                linkedListData[nodeOffset] = maskedHash;
                linkedListData[nodeOffset + 1] = blockIndex;
                linkedListData[nodeOffset + 2] = nextNodeIndex;
                linkedListSize++;
                return nodeIndex;
            }

            // Iterate backwards and prepend, so every chain ends up in ascending block order
            for (let blockIndex = blocksUncompressed.length - 1; blockIndex >= 0; blockIndex--) {
                const blockUncompressed = blocksUncompressed[blockIndex];
                for (let i = 0; i <= blockUncompressed.length - 4; i++) {
                    const maskedHash = getMaskedHash(blockUncompressed.readUInt32LE(i), mask);
                    const currentHead = hashTable[maskedHash];

                    if (currentHead === 0) {
                        // Filling a previously empty slot
                        filledSlots++;
                    } else if (linkedListData[(currentHead - 1) * 3 + 1] === blockIndex) {
                        // The head node already records this block for this hash
                        continue;
                    }

                    // Create new node and prepend to the chain
                    hashTable[maskedHash] = allocateNode(maskedHash, blockIndex, currentHead);
                }
            }

            return {
                hashTable,
                linkedListData: linkedListData.slice(0, linkedListSize * 3),
                nodeCount: linkedListSize,
                filledSlots,
            };
        }, `buildHashTableWithLinkedLists`);

        console.log(`Hash table filled slots: ${filledSlots} / ${hashTableCapacity} (${formatPercent(filledSlots / hashTableCapacity)} utilization)`);

        // Step 5: Flatten linked lists and encode to binary
        const { dataSection, hashTableBuffer } = measureBlock(() => {
            // The data section starts with reserved zero bytes, so the first real entry gets a
            //  non-zero offset. Offset 0 in the hash table then unambiguously means "empty bucket".
            //  (Previously the first entry was written at offset 0 and could never be looked up.)
            const dataBuffers: Buffer[] = [Buffer.alloc(reservedBytes)];
            let currentDataOffset = reservedBytes;

            // Hash table stores data offset for each bucket (0 means empty)
            const hashTableBuffer = Buffer.alloc(hashTableCapacity * 4);

            let totalBlocks = 0;
            let nonEmptyBuckets = 0;

            for (let bucketIndex = 0; bucketIndex < hashTableCapacity; bucketIndex++) {
                let nodeIndex = hashTable[bucketIndex];
                if (nodeIndex === 0) {
                    // Empty bucket
                    continue;
                }
                nonEmptyBuckets++;

                // Walk the chain and collect its block indices
                const chain: number[] = [];
                while (nodeIndex !== 0) {
                    const nodeOffset = (nodeIndex - 1) * 3;
                    chain.push(linkedListData[nodeOffset + 1]);
                    nodeIndex = linkedListData[nodeOffset + 2];
                }
                totalBlocks += chain.length;

                // Delta-encode as little-endian uint16 values (the first value is the absolute index)
                const deltaBuffer = Buffer.alloc(chain.length * 2);
                let prevBlock = 0;
                for (let i = 0; i < chain.length; i++) {
                    deltaBuffer.writeUInt16LE((chain[i] - prevBlock) & 0xFFFF, i * 2);
                    prevBlock = chain[i];
                }

                // Store: [dataSize, data]
                const dataBuffer = compressBuffer(deltaBuffer);
                const sizeBuffer = Buffer.alloc(4);
                sizeBuffer.writeUInt32LE(dataBuffer.length, 0);
                const entryBuffer = Buffer.concat([sizeBuffer, dataBuffer]);
                dataBuffers.push(entryBuffer);

                // Store data offset in hash table
                hashTableBuffer.writeUInt32LE(currentDataOffset, bucketIndex * 4);
                currentDataOffset += entryBuffer.length;
            }

            const dataSection = Buffer.concat(dataBuffers);

            // Avoid logging NaN when there is no data at all
            const avgBlocks = nonEmptyBuckets > 0 ? totalBlocks / nonEmptyBuckets : 0;
            console.log(`Block data buffers: ${nonEmptyBuckets}, avg blocks per hash: ${formatNumber(avgBlocks)}`);

            return { dataSection, hashTableBuffer };
        }, `flattenLinkedListsAndBuildDataSection`);

        // Build header: [type (byte), padding (byte[3]), mask (uint32), dataOffset (uint32)]
        const index = measureBlock(() => {
            const header = Buffer.alloc(4 + 8);
            header.writeUInt8(type, 0);
            header.writeUInt32LE(mask, 4);
            // dataOffset: after type+padding+mask+dataOffset+hashTable
            header.writeUInt32LE(12 + hashTableBuffer.length, 8);
            return Buffer.concat([header, hashTableBuffer, dataSection]);
        }, `buildFinalIndex`);

        return { blocks: finalBlocksBuffer, index };
    }

    /**
     * Decodes every buffer stored in a blocks buffer produced by `encode`.
     *
     * @param blocks The `blocks` output of `encode`.
     * @returns All stored buffers, in block order and then in buffer order within each block.
     */
    @measureFnc
    public static async decodeAll(blocks: Buffer): Promise<Buffer[]> {
        const reader = new BufferReader(blocks);
        const blockCount = await this.getBlockCount(reader);
        const allBuffers: Buffer[] = [];

        for (let blockIndex = 0; blockIndex < blockCount; blockIndex++) {
            const { block } = await this.getBlock(reader, blockIndex);
            const blockReader = new BufferReader(block);
            const bufferCount = await this.getBufferCountFromBlock(blockReader);
            for (let i = 0; i < bufferCount; i++) {
                allBuffers.push(await this.getBufferFromBlock(blockReader, i));
            }
        }

        return allBuffers;
    }

    /**
     * Searches the blocks behind `reader` for buffers matching `params.findBuffer`.
     *
     * The index is consulted first: every wildcard-free segment of at least 4
     * bytes is looked up and only blocks that the index reports for all such
     * segments are read and decompressed. Each buffer of those blocks is then
     * checked with the real pattern matcher, and matches are reported through
     * `onResult`.
     * NOTE(review): this assumes the matcher built by `createMatchesPattern`
     * compares the segment bytes exactly (the index is built over raw bytes) — confirm.
     *
     * Blocks are searched in parallel, so `onResult` may be called out of
     * block order. A block stops producing results once the blocks at or
     * before it have reached `params.limit` matches, or `keepIterating()`
     * returns false.
     *
     * @param config.params Search parameters (`findBuffer`, `disableWildCards`, `limit` are used here).
     * @param config.keepIterating Polled during the search; returning false stops it.
     * @param config.onResult Called once per matching buffer.
     * @param config.index The `index` output of `encode`.
     * @param config.reader Reader over the `blocks` output of `encode`.
     * @returns Search statistics; per-block failures are collected in `blocksWithErrors` instead of being thrown.
     * @throws Error when no segment of the pattern is at least 4 bytes long.
     */
    @measureFnc
    public static async find(config: {
        params: SearchParams;
        keepIterating: () => boolean;
        onResult: (match: Buffer) => void;
        index: Buffer;
        reader: Reader;
    }): Promise<{
        blocksChecked: number;
        blocksCheckedCompressedSize: number;
        blocksCheckedDecompressedSize: number;
        totalBlockCount: number;
        blockSearchTime: number;
        blocksWithErrors: string[];
    }> {
        const { params, index, reader, keepIterating } = config;

        // Split on wildcards if present
        function splitOnWildcard(buffer: Buffer): Buffer[] {
            const segments: Buffer[] = [];
            let start = 0;
            for (let i = 0; i <= buffer.length; i++) {
                if (i === buffer.length || buffer[i] === WILD_CARD_BYTE) {
                    segments.push(buffer.subarray(start, i));
                    start = i + 1;
                }
            }
            return segments;
        }

        const segments = params.disableWildCards
            ? [params.findBuffer]
            : splitOnWildcard(params.findBuffer).filter(s => s.length > 0);

        // Find blocks for each segment >= 4 bytes.
        //  The result is undefined when the index cannot constrain the search at all.
        const candidateBlocks = measureBlock(() => {
            let indexableSegmentCount = 0;
            let candidates: number[] | undefined;
            for (const segment of segments) {
                if (segment.length < 4) continue;
                indexableSegmentCount++;
                const blockIndices = this.findBlocks({ findBuffer: segment, index });
                // undefined means the segment has no usable units, so it filters nothing
                if (blockIndices === undefined) continue;
                if (candidates === undefined) {
                    candidates = Array.from(new Set(blockIndices));
                } else {
                    const allowed = new Set(blockIndices);
                    candidates = candidates.filter(x => allowed.has(x));
                }
            }

            if (indexableSegmentCount === 0) {
                throw new Error("Search pattern too short: all segments are fewer than 4 bytes, cannot use index");
            }

            // Search the first blocks first, as moveLogsToPublic should have made it so these are the newest.
            return candidates?.sort((a, b) => a - b);
        }, `findCandidateBlocks`);

        const matchesPattern = createMatchesPattern(params.findBuffer, !!params.disableWildCards);

        const headerBuffer = await reader.read(0, 8);
        const blockCount = headerBuffer.readUInt32LE(4);
        let blocksCheckedCompressedSize = 0;
        let blocksCheckedDecompressedSize = 0;

        // Only the blocks the index points at are searched. Previously the candidates were
        //  computed and then ignored, so every block was decompressed on every search.
        const blocksToSearch = candidateBlocks ?? list(blockCount);

        // Read blocks and search for matches
        let blocksChecked = 0;
        const blocksWithErrors: string[] = [];
        const blockSearchTimeStart = Date.now();
        await measureBlock(async () => {
            const matchCounts = list(blockCount).fill(0);

            const searchBlock = async (blockIndex: number) => {
                // This is kind of a weird thing. Basically, because we search in parallel, we might search out of order. So we can only look at the counts before or at us, as if we match a whole bunch after us, we should still keep going as our matches are going to take precedence.
                const stopIterating = () => {
                    let countBefore = 0;
                    for (let i = 0; i <= blockIndex; i++) {
                        countBefore += matchCounts[i];
                    }
                    return countBefore >= params.limit || !keepIterating();
                };
                if (stopIterating()) return;

                const debugOffsets = {
                    startOffset: 0,
                    endOffset: 0,
                };
                try {
                    // Guards against an index that refers to blocks the blocks file does not have
                    if (blockIndex >= blockCount) {
                        throw new Error(`Block index ${blockIndex} out of range, blockCount is ${blockCount}`);
                    }
                    const obj = await this.getBlock(reader, blockIndex, debugOffsets);

                    blocksCheckedDecompressedSize += obj.block.length;
                    blocksCheckedCompressedSize += obj.compressedSize;
                    blocksChecked++;

                    const blockReader = new BufferReader(obj.block);
                    const bufferCount = await this.getBufferCountFromBlock(blockReader);

                    // Check each buffer for a match
                    for (let i = 0; i < bufferCount; i++) {
                        if (stopIterating()) break;

                        const buffer = await this.getBufferFromBlock(blockReader, i);
                        if (matchesPattern(buffer)) {
                            config.onResult(buffer);
                            matchCounts[blockIndex]++;
                        }
                    }
                } catch (e: unknown) {
                    console.warn(`Error decompressing/searching block: ${e}`);
                    const detail = e instanceof Error && e.stack ? e.stack : String(e);
                    blocksWithErrors.push(`(for block ${blockIndex + 1} / ${blockCount}, ${debugOffsets.startOffset}-${debugOffsets.endOffset}) ${detail}`);
                }
            };

            const runSearchBlock = runInParallel(
                { parallelCount: BufferUnitIndexParallelSearchCount },
                searchBlock
            );
            await Promise.all(blocksToSearch.map(blockIndex => runSearchBlock(blockIndex)));

        }, `searchBlocks`);
        const blockSearchTime = Date.now() - blockSearchTimeStart;

        return { blocksChecked, blocksCheckedCompressedSize, blocksCheckedDecompressedSize, totalBlockCount: blockCount, blockSearchTime, blocksWithErrors };
    }

    /**
     * Returns the blocks that, according to the index, may contain `findBuffer`.
     *
     * Every distinct non-zero 4-byte unit of `findBuffer` is looked up and the
     * resulting block lists are intersected.
     *
     * @returns The candidate block indices, or `undefined` when `findBuffer`
     *   has no non-zero unit, in which case the index says nothing about it and
     *   the caller must not treat it as "no blocks".
     */
    private static findBlocks(config: {
        findBuffer: Buffer;
        index: Buffer;
    }): number[] | undefined {
        const { findBuffer, index } = config;

        // Extract all unique units from findBuffer
        const units = measureBlock(() => {
            const uniqueUnits = new Set<number>();
            for (let i = 0; i <= findBuffer.length - 4; i++) {
                const unit = findBuffer.readUInt32LE(i);
                if (unit !== 0) {
                    uniqueUnits.add(unit);
                }
            }
            return Array.from(uniqueUnits);
        }, `extractUnits`);

        if (units.length === 0) {
            return undefined;
        }

        // Get blocks for each unit and intersect
        return measureBlock(() => {
            let candidates: number[] | undefined;
            for (const unit of units) {
                const blocksForUnit = this.getBlocksForUnit(index, unit);
                if (candidates === undefined) {
                    candidates = blocksForUnit;
                } else {
                    const allowed = new Set(blocksForUnit);
                    candidates = candidates.filter(x => allowed.has(x));
                }
                // Nothing can be added back by later units
                if (candidates.length === 0) break;
            }
            return candidates ?? [];
        }, `intersectBlocks`);
    }

    /**
     * Looks up the block list stored in the index for the bucket `unit` hashes to.
     *
     * The result may contain blocks that do not contain the unit (other units
     * share the bucket), but must never omit a block that does.
     *
     * Indexes written before the data section got its reserved zero prefix
     * stored their first entry at offset 0, which is indistinguishable from an
     * empty bucket. For those, a bucket with offset 0 that lies before the
     * first bucket with a non-zero offset might be the owner of that first
     * entry, so the first entry is returned for it (a safe superset).
     */
    private static getBlocksForUnit(index: Buffer, unit: number): number[] {
        // Read header (skip type + padding at offset 0-3)
        const mask = index.readUInt32LE(4);
        const dataOffset = index.readUInt32LE(8);

        const maskedHash = getMaskedHash(unit, mask);

        // Hash table starts after header (type + padding + mask + dataOffset = 12 bytes)
        const bucketDataOffset = index.readUInt32LE(12 + maskedHash * 4);
        const dataSection = index.subarray(dataOffset);

        if (bucketDataOffset !== 0) {
            return this.decodeDataEntry(dataSection, bucketDataOffset);
        }

        // No entries at all, or the current format (reserved zero prefix): the bucket is empty
        if (dataSection.length < 4 || dataSection.readUInt32LE(0) === 0) {
            return [];
        }

        // Legacy index: entries are written in bucket order, so the entry at offset 0 belongs to
        //  a bucket before the first one with a non-zero offset.
        for (let bucket = 0; bucket < maskedHash; bucket++) {
            if (index.readUInt32LE(12 + bucket * 4) !== 0) {
                // An earlier bucket already has a later entry, so this bucket is really empty
                return [];
            }
        }
        return this.decodeDataEntry(dataSection, 0);
    }

    /**
     * Decodes one data section entry (uint32 size + compressed uint16 LE
     * deltas) into absolute block indices.
     *
     * Values are read with `readUInt16LE` rather than through a `Uint16Array`
     * view, because a view throws when the decompressed buffer happens to
     * start at an odd byte offset.
     */
    private static decodeDataEntry(dataSection: Buffer, entryOffset: number): number[] {
        const dataSize = dataSection.readUInt32LE(entryOffset);
        if (dataSize === 0) {
            return [];
        }

        const compressed = dataSection.subarray(entryOffset + 4, entryOffset + 4 + dataSize);
        const decompressed = decompressBuffer(compressed);

        const valueCount = Math.floor(decompressed.length / 2);
        const blocks: number[] = [];
        // The first value is the absolute index, every following value is a delta
        let currentBlock = 0;
        for (let i = 0; i < valueCount; i++) {
            currentBlock += decompressed.readUInt16LE(i * 2);
            blocks.push(currentBlock);
        }
        return blocks;
    }

    /** Reads the block count (uint32 at byte offset 4 of the blocks header). */
    private static async getBlockCount(reader: Reader): Promise<number> {
        // Only the 4 bytes of the count are needed
        const countBuffer = await reader.read(4, 4);
        return countBuffer.readUInt32LE(0);
    }

    /**
     * Reads and decompresses one block.
     *
     * @param reader Reader over the blocks buffer.
     * @param blockIndex Index of the block; the caller is responsible for range checking.
     * @param debugOffsets When given, filled with the block's byte range before the
     *   data is read, so a caller can report it even if reading fails.
     * @throws Error when the offsets are inconsistent or the reader returns a different amount of data than requested.
     */
    private static async getBlock(reader: Reader, blockIndex: number, debugOffsets?: {
        startOffset: number;
        endOffset: number;
    }): Promise<{
        compressedSize: number;
        block: Buffer;
    }> {
        // The offset table starts at byte 8; entry i and i + 1 delimit block i
        const startOffset = (await reader.read(8 + blockIndex * 4, 4)).readUInt32LE(0);
        const endOffset = (await reader.read(8 + (blockIndex + 1) * 4, 4)).readUInt32LE(0);
        if (debugOffsets) {
            debugOffsets.startOffset = startOffset;
            debugOffsets.endOffset = endOffset;
        }
        const size = endOffset - startOffset;
        if (size < 0) {
            throw new Error(`Block ${blockIndex} has an invalid offset range ${startOffset}-${endOffset}, the blocks header is likely corrupt.`);
        }
        const compressedBlock = await reader.read(startOffset, size);
        if (compressedBlock.length !== size) {
            devDebugbreak();
            // Repeated read, kept so the failing call can be stepped into while debugging
            let test = await reader.read(startOffset, size);
            throw new Error(`Reading block data at index ${blockIndex} we were given an incorrect amount of data, expected ${size}, got ${compressedBlock.length}. ${size > compressedBlock.length ? "It is likely the file is missing trailing data?" : "It's likely that the reader function is incorrect."}`);
        }
        return {
            compressedSize: size,
            block: decompressBuffer(compressedBlock),
        };
    }

    /** Reads the buffer count (uint32 at byte offset 0) of a decompressed block. */
    public static async getBufferCountFromBlock(reader: Reader): Promise<number> {
        const countBuffer = await reader.read(0, 4);
        return countBuffer.readUInt32LE(0);
    }

    /**
     * Reads buffer number `bufferIndex` from a decompressed block, using the
     * offset table that follows the buffer count.
     */
    private static async getBufferFromBlock(reader: Reader, bufferIndex: number): Promise<Buffer> {
        // Offsets are relative to the start of the decompressed block
        const startOffset = (await reader.read(4 + bufferIndex * 4, 4)).readUInt32LE(0);
        const endOffset = (await reader.read(4 + (bufferIndex + 1) * 4, 4)).readUInt32LE(0);
        return await reader.read(startOffset, endOffset - startOffset);
    }

}
719
+