querysub 0.355.0 → 0.357.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.cursorrules +8 -0
- package/bin/movelogs.js +4 -0
- package/package.json +12 -6
- package/scripts/postinstall.js +23 -0
- package/src/-a-archives/archiveCache.ts +10 -12
- package/src/-a-archives/archives.ts +29 -0
- package/src/-a-archives/archivesBackBlaze.ts +60 -12
- package/src/-a-archives/archivesDisk.ts +27 -8
- package/src/-a-archives/archivesLimitedCache.ts +21 -0
- package/src/-a-archives/archivesMemoryCache.ts +350 -0
- package/src/-a-archives/archivesPrivateFileSystem.ts +22 -0
- package/src/-g-core-values/NodeCapabilities.ts +3 -0
- package/src/0-path-value-core/auditLogs.ts +5 -1
- package/src/0-path-value-core/pathValueCore.ts +7 -7
- package/src/4-dom/qreact.tsx +1 -0
- package/src/4-querysub/Querysub.ts +1 -5
- package/src/config.ts +5 -0
- package/src/diagnostics/MachineThreadInfo.tsx +235 -0
- package/src/diagnostics/NodeViewer.tsx +3 -2
- package/src/diagnostics/logs/FastArchiveAppendable.ts +79 -42
- package/src/diagnostics/logs/FastArchiveController.ts +102 -63
- package/src/diagnostics/logs/FastArchiveViewer.tsx +36 -8
- package/src/diagnostics/logs/IndexedLogs/BufferIndex.ts +461 -0
- package/src/diagnostics/logs/IndexedLogs/BufferIndexCPP.cpp +327 -0
- package/src/diagnostics/logs/IndexedLogs/BufferIndexCPP.d.ts +18 -0
- package/src/diagnostics/logs/IndexedLogs/BufferIndexCPP.js +1 -0
- package/src/diagnostics/logs/IndexedLogs/BufferIndexHelpers.ts +140 -0
- package/src/diagnostics/logs/IndexedLogs/BufferIndexLogsOptimizationConstants.ts +22 -0
- package/src/diagnostics/logs/IndexedLogs/BufferIndexWAT.wat +1145 -0
- package/src/diagnostics/logs/IndexedLogs/BufferIndexWAT.wat.d.ts +178 -0
- package/src/diagnostics/logs/IndexedLogs/BufferListStreamer.ts +206 -0
- package/src/diagnostics/logs/IndexedLogs/BufferUnitIndex.ts +719 -0
- package/src/diagnostics/logs/IndexedLogs/BufferUnitSet.ts +146 -0
- package/src/diagnostics/logs/IndexedLogs/FilePathSelector.tsx +408 -0
- package/src/diagnostics/logs/IndexedLogs/FindProgressTracker.ts +45 -0
- package/src/diagnostics/logs/IndexedLogs/IndexedLogs.ts +598 -0
- package/src/diagnostics/logs/IndexedLogs/LogStreamer.ts +47 -0
- package/src/diagnostics/logs/IndexedLogs/LogViewer3.tsx +702 -0
- package/src/diagnostics/logs/IndexedLogs/TimeFileTree.ts +236 -0
- package/src/diagnostics/logs/IndexedLogs/binding.gyp +23 -0
- package/src/diagnostics/logs/IndexedLogs/moveIndexLogsToPublic.ts +221 -0
- package/src/diagnostics/logs/IndexedLogs/moveLogsEntry.ts +10 -0
- package/src/diagnostics/logs/LogViewer2.tsx +120 -55
- package/src/diagnostics/logs/TimeRangeSelector.tsx +5 -2
- package/src/diagnostics/logs/diskLogger.ts +32 -48
- package/src/diagnostics/logs/errorNotifications/ErrorNotificationController.ts +3 -2
- package/src/diagnostics/logs/errorNotifications/errorDigests.tsx +1 -0
- package/src/diagnostics/logs/lifeCycleAnalysis/LifeCyclePages.tsx +150 -0
- package/src/diagnostics/logs/lifeCycleAnalysis/lifeCycles.tsx +133 -0
- package/src/diagnostics/logs/lifeCycleAnalysis/test.ts +180 -0
- package/src/diagnostics/logs/lifeCycleAnalysis/test.wat +106 -0
- package/src/diagnostics/logs/lifeCycleAnalysis/test.wat.d.ts +2 -0
- package/src/diagnostics/logs/lifeCycleAnalysis/testHoist.ts +5 -0
- package/src/diagnostics/logs/logViewerExtractField.ts +2 -3
- package/src/diagnostics/managementPages.tsx +11 -1
- package/src/diagnostics/trackResources.ts +1 -1
- package/src/misc/lz4_wasm_nodejs.d.ts +34 -0
- package/src/misc/lz4_wasm_nodejs.js +178 -0
- package/src/misc/lz4_wasm_nodejs_bg.js +94 -0
- package/src/misc/lz4_wasm_nodejs_bg.wasm +0 -0
- package/src/misc/lz4_wasm_nodejs_bg.wasm.d.ts +15 -0
- package/src/storage/CompressedStream.ts +13 -0
- package/src/storage/LZ4.ts +32 -0
- package/src/storage/ZSTD.ts +10 -0
- package/src/wat/watCompiler.ts +1716 -0
- package/src/wat/watGrammar.pegjs +93 -0
- package/src/wat/watHandler.ts +179 -0
- package/src/wat/watInstructions.txt +707 -0
- package/src/zip.ts +3 -89
- package/src/diagnostics/logs/lifeCycleAnalysis/spec.md +0 -125
|
@@ -0,0 +1,719 @@
|
|
|
1
|
+
|
|
2
|
+
|
|
3
|
+
import { LZ4 } from "../../../storage/LZ4";
|
|
4
|
+
import { measureBlock, measureFnc } from "socket-function/src/profiling/measure";
|
|
5
|
+
import { Zip } from "../../../zip";
|
|
6
|
+
import { BufferReader, Reader, WILD_CARD_BYTE, createMatchesPattern, SearchParams } from "./BufferIndexHelpers";
|
|
7
|
+
import { formatNumber, formatPercent } from "socket-function/src/formatting/format";
|
|
8
|
+
import { lazy } from "socket-function/src/caching";
|
|
9
|
+
import { list, sort } from "socket-function/src/misc";
|
|
10
|
+
import { testDisableCache } from "../../../-a-archives/archivesMemoryCache";
|
|
11
|
+
import { devDebugbreak } from "../../../config";
|
|
12
|
+
import { BufferUnitIndexParallelSearchCount, DEFAULT_BLOCK_SIZE, DEFAULT_TARGET_UNITS_PER_BUCKET } from "./BufferIndexLogsOptimizationConstants";
|
|
13
|
+
import { runInParallel } from "socket-function/src/batching";
|
|
14
|
+
|
|
15
|
+
// Global toggle: when false, compressBuffer/decompressBuffer become pass-throughs
// (the LZ4 calls are skipped entirely).
const USE_COMPRESSION = true;
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
// Lazy-load the C++ addon (only loaded when actually needed, not in client-side code)
|
|
19
|
+
let getCppAddon = lazy(() => {
|
|
20
|
+
try {
|
|
21
|
+
// Check if we're in Node.js environment
|
|
22
|
+
if (typeof process === "undefined" || !process.versions || !process.versions.node) {
|
|
23
|
+
return undefined;
|
|
24
|
+
}
|
|
25
|
+
// Try to require the C++ addon
|
|
26
|
+
const addon = require("./BufferIndexCPP") as typeof import("./BufferIndexCPP");
|
|
27
|
+
return addon;
|
|
28
|
+
} catch (e) {
|
|
29
|
+
console.warn("C++ addon not available, falling back to TypeScript implementation", e);
|
|
30
|
+
return undefined;
|
|
31
|
+
}
|
|
32
|
+
});
|
|
33
|
+
|
|
34
|
+
function compressBuffer(buffer: Buffer): Buffer {
|
|
35
|
+
if (!USE_COMPRESSION) {
|
|
36
|
+
return buffer;
|
|
37
|
+
}
|
|
38
|
+
return LZ4.compress(buffer);
|
|
39
|
+
}
|
|
40
|
+
|
|
41
|
+
function decompressBuffer(buffer: Buffer): Buffer {
|
|
42
|
+
if (!USE_COMPRESSION) {
|
|
43
|
+
return buffer;
|
|
44
|
+
}
|
|
45
|
+
return LZ4.decompress(buffer);
|
|
46
|
+
}
|
|
47
|
+
|
|
48
|
+
function getMaskedHash(unit: number, mask: number): number {
|
|
49
|
+
// Optimized: Simple multiply and shift
|
|
50
|
+
let hash = unit;
|
|
51
|
+
hash = Math.imul(hash, 0x27d4eb2d);
|
|
52
|
+
hash = hash ^ (hash >>> 15);
|
|
53
|
+
|
|
54
|
+
let maskedHash = hash & mask;
|
|
55
|
+
// Avoid collision with empty marker (0)
|
|
56
|
+
if (maskedHash === 0) {
|
|
57
|
+
maskedHash = 1;
|
|
58
|
+
}
|
|
59
|
+
return maskedHash;
|
|
60
|
+
}
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
/*
|
|
64
|
+
- If you store every single position, the index is going to really struggle to be significantly smaller than your input data. Because you'll be storing a unit value per byte, which is just a lot of data.
|
|
65
|
+
- SO, we hash each unit, only taking some bits. By only storing some of the units, we massively decrease the storage size, making this significantly smaller than the input data, or even the compressed data.
|
|
66
|
+
- This doesn't affect most searches, because most searches have multiple unique units, and the intersection of them likely only contains the actual blocks.
|
|
67
|
+
- We also store the list of blocks associated with each hash unit outside the hash table in a list, and each one of those lists are compressed.
|
|
68
|
+
- As a result, our compression ratio is usually amazing, although we do have some false positives.
|
|
69
|
+
- The index is often easily 100X smaller than the input data. And then this can increase multiple times if you compress the index (you can't compress it and still use it, however, this will reduce your network usage and disk usage).
|
|
70
|
+
|
|
71
|
+
PERFORMANCE OPTIMIZATION:
|
|
72
|
+
- Uses Uint32Arrays for hash table and linked lists for fast, cache-friendly memory access
|
|
73
|
+
- Linked lists handle collisions more efficiently than linear probing, with better cache locality
|
|
74
|
+
- Dynamic memory allocation (starting at totalUnits/100, growing 4x when needed) keeps memory usage optimal
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
BlocksBuffer {
|
|
78
|
+
byte type
|
|
79
|
+
byte[3] padding
|
|
80
|
+
uint32 blockCount
|
|
81
|
+
uint32[blockCount + 1] blockOffsets
|
|
82
|
+
compress(Block)[]
|
|
83
|
+
}
|
|
84
|
+
|
|
85
|
+
Block {
|
|
86
|
+
uint32 bufferCount
|
|
87
|
+
uint32[bufferCount + 1] bufferOffsets (offsets from the start of the uncompressed block)
|
|
88
|
+
byte[][] bufferData
|
|
89
|
+
}
|
|
90
|
+
|
|
91
|
+
Index {
|
|
92
|
+
byte type
|
|
93
|
+
byte[3] padding
|
|
94
|
+
uint32 mask
|
|
95
|
+
uint32 dataOffset
|
|
96
|
+
uint32[] hashTable (data offsets for each bucket, 0 = empty)
|
|
97
|
+
DataEntry[] dataSection
|
|
98
|
+
}
|
|
99
|
+
|
|
100
|
+
DataEntry {
|
|
101
|
+
uint32 compressedSize
|
|
102
|
+
compress(uint16[] deltaEncodedBlockIndices)
|
|
103
|
+
}
|
|
104
|
+
*/
|
|
105
|
+
export class BufferUnitIndex {
|
|
106
|
+
|
|
107
|
+
// IMPORTANT! The input data values should be sorted from newest to oldest.
//
// Encodes a list of buffers into two artifacts (layouts documented in the big
// comment above this class):
//  - blocks: the buffers grouped into ~BLOCK_SIZE chunks, each chunk compressed,
//    with an offset header so individual blocks can be read back independently.
//  - index: a hash table keyed by masked hashes of every 4-byte sliding-window
//    "unit" in the data; each bucket maps to a compressed, delta-encoded list of
//    the block indices containing such a unit.
@measureFnc
public static encode(allData: Buffer[], options: {
    // Format tag byte written at the start of both the blocks and index buffers.
    type: number;
    // Target uncompressed bytes per block; defaults to DEFAULT_BLOCK_SIZE.
    blockSize?: number;
    // Desired average unique units per hash bucket; defaults to
    // DEFAULT_TARGET_UNITS_PER_BUCKET. Smaller => larger table, fewer collisions.
    targetUnitsPerBucket?: number;
}): {
    blocks: Buffer;
    index: Buffer;
} {
    const type = options.type;
    const BLOCK_SIZE = options.blockSize ?? DEFAULT_BLOCK_SIZE;
    const TARGET_UNITS_PER_BUCKET = options.targetUnitsPerBucket ?? DEFAULT_TARGET_UNITS_PER_BUCKET;

    // Step 1: Group buffers into blocks (Block layout: bufferCount, bufferOffsets, data)
    const blocks: Buffer[] = [];
    const blocksUncompressed: Buffer[] = [];
    measureBlock(() => {
        // Builds one Block: uint32 bufferCount, (bufferCount + 1) uint32 offsets
        // (relative to the start of the block, trailing offset marks the end), then
        // the concatenated buffer data. Also returns the bare concatenated data,
        // which is what the unit/hash extraction below operates on.
        function createBlockWithMetadata(buffers: Buffer[]): { block: Buffer; uncompressed: Buffer } {
            const bufferCount = buffers.length;
            // Allocate the header zero-filled; offsets are patched in below.
            const headerBuffer = Buffer.from(new Uint32Array([
                bufferCount,
                ...list(bufferCount + 1).map(i => 0),
            ]).buffer);
            let currentOffset = headerBuffer.length;
            let headerPos = 4;
            for (const buffer of buffers) {
                headerBuffer.writeUInt32LE(currentOffset, headerPos);
                headerPos += 4;
                currentOffset += buffer.length;
            }
            // Trailing end offset, so the last buffer's length can be derived.
            headerBuffer.writeUInt32LE(currentOffset, headerPos);

            const concatenatedData = Buffer.concat(buffers);

            const block = Buffer.concat([headerBuffer, concatenatedData]);

            return {
                block: block,
                uncompressed: concatenatedData
            };
        }

        let currentBlockBuffers: Buffer[] = [];
        let currentBlockSize = 0;

        for (const buffer of allData) {
            let nextSize = currentBlockSize + buffer.length;
            // Flush the current block before it would exceed BLOCK_SIZE. A single
            // buffer larger than BLOCK_SIZE still becomes its own block.
            if (currentBlockBuffers.length > 0 && nextSize > BLOCK_SIZE) {
                const block = createBlockWithMetadata(currentBlockBuffers);
                blocks.push(block.block);
                blocksUncompressed.push(block.uncompressed);
                currentBlockBuffers = [];
                currentBlockSize = 0;
            }

            currentBlockBuffers.push(buffer);
            currentBlockSize += buffer.length;
        }

        // Flush the final partial block.
        if (currentBlockBuffers.length > 0) {
            const block = createBlockWithMetadata(currentBlockBuffers);
            blocks.push(block.block);
            blocksUncompressed.push(block.uncompressed);
        }
    }, `createBlocks`);

    // Step 2: Create final blocks buffer with compressed blocks
    // (BlocksBuffer layout: type byte, 3 padding bytes, blockCount, blockOffsets, blocks).
    const finalBlocksBuffer = measureBlock(() => {
        // Compress each entire block (metadata + data)
        const compressedBlocks: Buffer[] = [];
        for (const block of blocks) {
            const compressed = compressBuffer(block);
            compressedBlocks.push(compressed);
        }

        // type + padding (4) + blockCount (4) + (blockCount + 1) uint32 offsets.
        const blocksHeaderBuffer = Buffer.allocUnsafe(4 + 4 + ((compressedBlocks.length + 1) * 4));

        blocksHeaderBuffer.writeUInt8(type, 0);
        blocksHeaderBuffer.writeUInt8(0, 1); // padding
        blocksHeaderBuffer.writeUInt8(0, 2); // padding
        blocksHeaderBuffer.writeUInt8(0, 3); // padding
        blocksHeaderBuffer.writeUInt32LE(compressedBlocks.length, 4);

        // Absolute offset of each compressed block, plus one trailing end offset.
        let currentBlockOffset = blocksHeaderBuffer.length;
        for (let i = 0; i < compressedBlocks.length; i++) {
            blocksHeaderBuffer.writeUInt32LE(currentBlockOffset, 8 + (i * 4));
            currentBlockOffset += compressedBlocks[i].length;
        }
        blocksHeaderBuffer.writeUInt32LE(currentBlockOffset, 8 + (compressedBlocks.length * 4));

        return Buffer.concat([blocksHeaderBuffer, ...compressedBlocks]);
    }, `createFinalBlocksBuffer`);

    // Rough count of DISTINCT units across all input; used only to size the table.
    const estimatedUniqueUnits = measureBlock(() => {
        const cppAddon = getCppAddon();
        if (cppAddon) {
            // Use C++ addon for performance
            return cppAddon.estimateUniqueUnits(allData);
        } else {
            // TypeScript fallback implementation
            // Use a 64KB bitmap to track which 16-bit masked hashes we've seen
            const seenMaskedHashes = Buffer.allocUnsafe(65536);
            seenMaskedHashes.fill(0);
            let uniqueCount = 0;

            for (const buffer of allData) {
                // Sliding 4-byte window: every byte offset yields one unit.
                for (let i = 0; i <= buffer.length - 4; i++) {
                    const unit = buffer.readUint32LE(i);
                    const maskedHash = getMaskedHash(unit, 0xFFFF);

                    if (seenMaskedHashes[maskedHash] === 0) {
                        seenMaskedHashes[maskedHash] = 1;
                        uniqueCount++;
                    }
                }
            }

            // Estimate actual unique units by multiplying by 2
            // This accounts for collisions in the 16-bit space
            // Only inaccurate if unique units approaches or exceeds 65k
            return uniqueCount * 2;
        }
    }, `collectUniqueUnits`);

    // Step 3: Calculate hash table size and mask. Capacity is a power of two, so
    // mask = capacity - 1; capped at 16 bits to match the uint16 delta encoding.
    const { hashTableCapacity, mask } = measureBlock(() => {
        const uniqueUnitCount = estimatedUniqueUnits;
        const targetBuckets = Math.max(1, Math.floor(uniqueUnitCount / TARGET_UNITS_PER_BUCKET));
        let hashTableCapacity = Math.pow(2, Math.floor(Math.log2(targetBuckets * 2)) + 1);
        let mask = hashTableCapacity - 1;

        // Cap mask at 16 bits (2 bytes)
        if (mask > 0xFFFF) {
            mask = 0xFFFF;
            hashTableCapacity = (mask + 1);
        }

        console.log(`UnitBlockLookup: ${blocks.length} blocks, ${uniqueUnitCount} unique units, ${hashTableCapacity} hash table capacity (~${(uniqueUnitCount / hashTableCapacity).toFixed(1)} units/bucket), mask: ${mask}`);

        return { hashTableCapacity, mask };
    }, `calculateHashTableSize`);

    // Step 4: Build hash table with linked lists using Uint32Arrays for performance
    const { hashTable, linkedListData, nodeCount, filledSlots } = measureBlock(() => {
        const cppAddon = getCppAddon();
        if (cppAddon) {
            // Use C++ addon for performance
            return cppAddon.buildHashTable(blocksUncompressed, hashTableCapacity, mask);
        } else {
            // TypeScript fallback implementation
            // Calculate initial linked list capacity based on total units
            let totalUnits = 0;
            for (const buffer of blocksUncompressed) {
                totalUnits += Math.max(0, buffer.length - 3); // number of 4-byte units
            }
            // Start small (~1% of total units); allocateNode grows 4x on demand.
            let linkedListCapacity = Math.max(1000, Math.floor(totalUnits / 100));

            // Hash table: stores head node index for each bucket (0 means empty)
            const hashTable = new Uint32Array(hashTableCapacity);

            // Linked list storage: each node is 3 uint32s: [maskedHash, blockIndex, nextNodeIndex]
            let linkedListData = new Uint32Array(linkedListCapacity * 3);
            let linkedListSize = 0;
            let filledSlots = 0;

            // Appends a node, growing the backing array when full.
            // Returns the 1-based node index (0 is reserved for "empty").
            function allocateNode(maskedHash: number, blockIndex: number, nextNodeIndex: number): number {
                if (linkedListSize >= linkedListCapacity) {
                    console.log(`Growing linked list buffer`);
                    // Reallocate with 4x capacity
                    const newCapacity = linkedListCapacity * 4;
                    const newData = new Uint32Array(newCapacity * 3);
                    newData.set(linkedListData);
                    linkedListData = newData;
                    linkedListCapacity = newCapacity;
                }
                const nodeIndex = linkedListSize + 1; // +1 because 0 means empty
                const nodeOffset = linkedListSize * 3;
                linkedListData[nodeOffset] = maskedHash;
                linkedListData[nodeOffset + 1] = blockIndex;
                linkedListData[nodeOffset + 2] = nextNodeIndex;
                linkedListSize++;
                return nodeIndex;
            }

            // Build the hash table with linked lists (iterate backwards so linked lists are in forward order)
            for (let blockIndex = blocksUncompressed.length - 1; blockIndex >= 0; blockIndex--) {
                const blockUncompressed = blocksUncompressed[blockIndex];

                // Iterate through all units and add them directly
                for (let i = 0; i <= blockUncompressed.length - 4; i++) {
                    const unit = blockUncompressed.readUint32LE(i);
                    const maskedHash = getMaskedHash(unit, mask);

                    const currentHead = hashTable[maskedHash];

                    // Skip if we already added this masked hash for the current block
                    // (check if head node has the same blockIndex)
                    if (currentHead !== 0) {
                        const headNodeOffset = (currentHead - 1) * 3;
                        const headBlockIndex = linkedListData[headNodeOffset + 1];
                        if (headBlockIndex === blockIndex) {
                            continue;
                        }
                    }

                    // Track when we're filling a previously empty slot
                    if (currentHead === 0) {
                        filledSlots++;
                    }

                    // Create new node and prepend to the chain
                    const newNodeIndex = allocateNode(maskedHash, blockIndex, currentHead);
                    hashTable[maskedHash] = newNodeIndex;
                }
            }

            return {
                hashTable,
                // Trim the over-allocated backing array to the nodes actually used.
                linkedListData: linkedListData.slice(0, linkedListSize * 3),
                nodeCount: linkedListSize,
                filledSlots
            };
        }
    }, `buildHashTableWithLinkedLists`);

    console.log(`Hash table filled slots: ${filledSlots} / ${hashTableCapacity} (${formatPercent(filledSlots / hashTableCapacity)} utilization)`);

    // Step 5: Flatten linked lists and encode to binary
    // Build data section and hash table by traversing linked lists
    const { dataSection, hashTableBuffer } = measureBlock(() => {
        const dataBuffers: Buffer[] = [];
        let currentDataOffset = 0;

        // Hash table stores data offset for each bucket (0 means empty)
        const hashTableBuffer = Buffer.allocUnsafe(hashTableCapacity * 4);
        hashTableBuffer.fill(0);

        let totalBlocks = 0;
        let nonEmptyBuckets = 0;

        // Traverse each bucket and flatten its linked list
        for (let bucketIndex = 0; bucketIndex < hashTableCapacity; bucketIndex++) {
            let nodeIndex = hashTable[bucketIndex];

            if (nodeIndex === 0) {
                // Empty bucket
                continue;
            }

            nonEmptyBuckets++;

            // First pass: count blocks in this chain
            let blockCount = 0;
            let tempNodeIndex = nodeIndex;
            while (tempNodeIndex !== 0) {
                blockCount++;
                const nodeOffset = (tempNodeIndex - 1) * 3;
                tempNodeIndex = linkedListData[nodeOffset + 2];
            }

            totalBlocks += blockCount;

            // Allocate buffer for delta-encoded blocks
            const deltaEncoded = new Uint16Array(blockCount);

            // Second pass: write delta-encoded values directly
            // (chains were built in ascending blockIndex order, so deltas are non-negative)
            let writeIndex = 0;
            let prevBlock = 0;
            nodeIndex = hashTable[bucketIndex];
            while (nodeIndex !== 0) {
                const nodeOffset = (nodeIndex - 1) * 3;
                const blockIndex = linkedListData[nodeOffset + 1];

                if (writeIndex === 0) {
                    deltaEncoded[writeIndex] = blockIndex;
                    prevBlock = blockIndex;
                } else {
                    deltaEncoded[writeIndex] = blockIndex - prevBlock;
                    prevBlock = blockIndex;
                }

                writeIndex++;
                nodeIndex = linkedListData[nodeOffset + 2];
            }

            // Compress the delta-encoded blocks
            const uncompressedBuffer = Buffer.from(deltaEncoded.buffer, deltaEncoded.byteOffset, deltaEncoded.byteLength);
            const dataBuffer = compressBuffer(uncompressedBuffer);

            // Store: [dataSize, data]
            const sizeBuffer = Buffer.allocUnsafe(4);
            sizeBuffer.writeUInt32LE(dataBuffer.length, 0);

            const entryBuffer = Buffer.concat([sizeBuffer, dataBuffer]);
            dataBuffers.push(entryBuffer);

            // Store data offset in hash table
            // NOTE(review): data offset 0 doubles as the "empty bucket" marker, so the
            // very first non-empty bucket (written at offset 0) looks empty to readers
            // that treat 0 as empty — presumably an accepted rare false-negative; confirm.
            hashTableBuffer.writeUInt32LE(currentDataOffset, bucketIndex * 4);
            currentDataOffset += entryBuffer.length;
        }

        const dataSection = Buffer.concat(dataBuffers);

        const avgBlocks = totalBlocks / nonEmptyBuckets;
        console.log(`Block data buffers: ${dataBuffers.length}, avg blocks per hash: ${formatNumber(avgBlocks)}`);

        return { dataSection, hashTableBuffer };
    }, `flattenLinkedListsAndBuildDataSection`);

    // Build header: [type (byte), padding (byte[3]), mask (uint32), dataOffset (uint32)]
    const index = measureBlock(() => {
        const header = Buffer.allocUnsafe(4 + 8);
        header.writeUInt8(type, 0);
        header.writeUInt8(0, 1); // padding
        header.writeUInt8(0, 2); // padding
        header.writeUInt8(0, 3); // padding
        header.writeUInt32LE(mask, 4);
        header.writeUInt32LE(12 + hashTableBuffer.length, 8); // dataOffset: after type+padding+mask+dataOffset+hashTable

        return Buffer.concat([header, hashTableBuffer, dataSection]);
    }, `buildFinalIndex`);

    return { blocks: finalBlocksBuffer, index };
}
|
|
434
|
+
|
|
435
|
+
// Fully expands a BlocksBuffer produced by encode(): decompresses every block and
// returns every original input buffer, in stored order.
// NOTE(review): relies on the private getBlockCount/getBlock/getBufferCountFromBlock/
// getBufferFromBlock helpers declared elsewhere in this class (not visible here);
// presumably they parse the BlocksBuffer/Block headers documented above the class.
@measureFnc
public static async decodeAll(blocks: Buffer): Promise<Buffer[]> {
    let reader = new BufferReader(blocks);
    const blockCount = await this.getBlockCount(reader);
    const allBuffers: Buffer[] = [];

    for (let blockIndex = 0; blockIndex < blockCount; blockIndex++) {
        // Decompress one block, then walk its internal buffer-offset table.
        let obj = await this.getBlock(reader, blockIndex);
        let blockReader = new BufferReader(obj.block);
        let bufferCount = await this.getBufferCountFromBlock(blockReader);
        for (let i = 0; i < bufferCount; i++) {
            let buffer = await this.getBufferFromBlock(blockReader, i);
            allBuffers.push(buffer);
        }
    }

    return allBuffers;
}
|
|
453
|
+
|
|
454
|
+
@measureFnc
|
|
455
|
+
public static async find(config: {
|
|
456
|
+
params: SearchParams;
|
|
457
|
+
keepIterating: () => boolean;
|
|
458
|
+
onResult: (match: Buffer) => void;
|
|
459
|
+
index: Buffer;
|
|
460
|
+
reader: Reader;
|
|
461
|
+
}): Promise<{
|
|
462
|
+
blocksChecked: number;
|
|
463
|
+
blocksCheckedCompressedSize: number;
|
|
464
|
+
blocksCheckedDecompressedSize: number;
|
|
465
|
+
totalBlockCount: number;
|
|
466
|
+
blockSearchTime: number;
|
|
467
|
+
blocksWithErrors: string[];
|
|
468
|
+
}> {
|
|
469
|
+
const { params, index, reader, keepIterating } = config;
|
|
470
|
+
|
|
471
|
+
// Split on wildcards if present
|
|
472
|
+
function splitOnWildcard(buffer: Buffer): Buffer[] {
|
|
473
|
+
const segments: Buffer[] = [];
|
|
474
|
+
let start = 0;
|
|
475
|
+
for (let i = 0; i <= buffer.length; i++) {
|
|
476
|
+
if (i === buffer.length || buffer[i] === WILD_CARD_BYTE) {
|
|
477
|
+
segments.push(buffer.slice(start, i));
|
|
478
|
+
start = i + 1;
|
|
479
|
+
}
|
|
480
|
+
}
|
|
481
|
+
return segments;
|
|
482
|
+
}
|
|
483
|
+
|
|
484
|
+
const segments = params.disableWildCards && [params.findBuffer] || splitOnWildcard(params.findBuffer).filter(s => s.length > 0);
|
|
485
|
+
|
|
486
|
+
// Find blocks for each segment >= 4 bytes
|
|
487
|
+
const candidateBlocks = measureBlock(() => {
|
|
488
|
+
const candidateBlocksPerSegment: number[][] = [];
|
|
489
|
+
for (const segment of segments) {
|
|
490
|
+
if (segment.length < 4) continue;
|
|
491
|
+
const blockIndices = this.findBlocks({ findBuffer: segment, index });
|
|
492
|
+
candidateBlocksPerSegment.push(blockIndices);
|
|
493
|
+
}
|
|
494
|
+
|
|
495
|
+
if (candidateBlocksPerSegment.length === 0) {
|
|
496
|
+
throw new Error("Search pattern too short: all segments are fewer than 4 bytes, cannot use index");
|
|
497
|
+
}
|
|
498
|
+
|
|
499
|
+
let intersectionSet = new Set<number>(candidateBlocksPerSegment[0]);
|
|
500
|
+
for (let i = 1; i < candidateBlocksPerSegment.length; i++) {
|
|
501
|
+
const currentSet = new Set(candidateBlocksPerSegment[i]);
|
|
502
|
+
intersectionSet = new Set([...intersectionSet].filter(x => currentSet.has(x)));
|
|
503
|
+
}
|
|
504
|
+
|
|
505
|
+
// Search first first, as moveLogsToPublic should have made it so this is the newest.
|
|
506
|
+
return Array.from(intersectionSet).sort((a, b) => a - b);
|
|
507
|
+
}, `findCandidateBlocks`);
|
|
508
|
+
|
|
509
|
+
|
|
510
|
+
const matchesPattern = createMatchesPattern(params.findBuffer, !!params.disableWildCards);
|
|
511
|
+
|
|
512
|
+
const headerBuffer = await reader.read(0, 8);
|
|
513
|
+
const blockCount = headerBuffer.readUInt32LE(4);
|
|
514
|
+
let blocksCheckedCompressedSize = 0;
|
|
515
|
+
let blocksCheckedDecompressedSize = 0;
|
|
516
|
+
|
|
517
|
+
// Read blocks and search for matches
|
|
518
|
+
let blocksChecked = 0;
|
|
519
|
+
let blocksWithErrors: string[] = [];
|
|
520
|
+
let blockSearchTimeStart = Date.now();
|
|
521
|
+
await measureBlock(async () => {
|
|
522
|
+
let matchCount = 0;
|
|
523
|
+
let matchCounts = list(blockCount).fill(0);
|
|
524
|
+
|
|
525
|
+
const searchBlock = async (blockIndex: number) => {
|
|
526
|
+
// This is kind of a weird thing. Basically, because we search in parallel, we might search out of order. So we can only look at the counts before or at us, as if we match a whole bunch after us, but we should still keep going as our matches are going to take precedence.
|
|
527
|
+
let stopIterating = () => {
|
|
528
|
+
let countBefore = 0;
|
|
529
|
+
for (let i = 0; i <= blockIndex; i++) {
|
|
530
|
+
countBefore += matchCounts[i];
|
|
531
|
+
}
|
|
532
|
+
return countBefore >= params.limit || !keepIterating();
|
|
533
|
+
};
|
|
534
|
+
if (stopIterating()) return;
|
|
535
|
+
|
|
536
|
+
let debugOffsets = {
|
|
537
|
+
startOffset: 0,
|
|
538
|
+
endOffset: 0,
|
|
539
|
+
};
|
|
540
|
+
try {
|
|
541
|
+
if (blockIndex >= blockCount) {
|
|
542
|
+
throw new Error(`Block index ${blockIndex} out of range, blockCount is ${blockCount}`);
|
|
543
|
+
}
|
|
544
|
+
let obj = await this.getBlock(reader, blockIndex, debugOffsets);
|
|
545
|
+
|
|
546
|
+
blocksCheckedDecompressedSize += obj.block.length;
|
|
547
|
+
blocksCheckedCompressedSize += obj.compressedSize;
|
|
548
|
+
blocksChecked++;
|
|
549
|
+
|
|
550
|
+
let blockReader = new BufferReader(obj.block);
|
|
551
|
+
let bufferCount = await this.getBufferCountFromBlock(blockReader);
|
|
552
|
+
|
|
553
|
+
// Check each buffer for a match
|
|
554
|
+
for (let i = 0; i < bufferCount; i++) {
|
|
555
|
+
if (stopIterating()) break;
|
|
556
|
+
|
|
557
|
+
const buffer = await this.getBufferFromBlock(blockReader, i);
|
|
558
|
+
if (matchesPattern(buffer)) {
|
|
559
|
+
config.onResult(buffer);
|
|
560
|
+
matchCount++;
|
|
561
|
+
matchCounts[blockIndex]++;
|
|
562
|
+
}
|
|
563
|
+
}
|
|
564
|
+
} catch (e: any) {
|
|
565
|
+
console.warn(`Error decompressing/searching block: ${e}`);
|
|
566
|
+
blocksWithErrors.push(`(for block ${blockIndex + 1} / ${blockCount}, ${debugOffsets.startOffset}-${debugOffsets.endOffset}) ${String(e?.stack || e)}`);
|
|
567
|
+
}
|
|
568
|
+
};
|
|
569
|
+
|
|
570
|
+
const runSearchBlock = runInParallel(
|
|
571
|
+
{ parallelCount: BufferUnitIndexParallelSearchCount },
|
|
572
|
+
searchBlock
|
|
573
|
+
);
|
|
574
|
+
await Promise.all(list(blockCount).map(runSearchBlock));
|
|
575
|
+
|
|
576
|
+
}, `searchBlocks`);
|
|
577
|
+
let blockSearchTime = Date.now() - blockSearchTimeStart;
|
|
578
|
+
|
|
579
|
+
return { blocksChecked, blocksCheckedCompressedSize, blocksCheckedDecompressedSize, totalBlockCount: blockCount, blockSearchTime, blocksWithErrors };
|
|
580
|
+
}
|
|
581
|
+
|
|
582
|
+
// Uses the unit index to find the candidate block indices that may contain
// findBuffer. Extracts every non-zero 4-byte sliding-window unit from the pattern,
// looks each up in the index, and intersects the resulting block lists. The result
// can contain false positives (hash collisions) but no false negatives, so callers
// must still verify matches against actual block contents.
private static findBlocks(config: {
    findBuffer: Buffer;
    index: Buffer;
}): number[] {
    const { findBuffer, index } = config;

    // Extract all unique units from findBuffer.
    // Uses a Set for dedupe: the previous `units.includes(unit)` scan inside the
    // loop was accidentally O(n^2) in the number of distinct units for long
    // patterns. Set preserves insertion order, so the output order is unchanged.
    const units = measureBlock(() => {
        const uniqueUnits = new Set<number>();
        for (let i = 0; i <= findBuffer.length - 4; i++) {
            const unit = findBuffer.readUint32LE(i);
            // Zero units are skipped — presumably too common to be discriminating; confirm.
            if (unit !== 0) {
                uniqueUnits.add(unit);
            }
        }
        return [...uniqueUnits];
    }, `extractUnits`);

    if (units.length === 0) {
        return [];
    }

    // Get blocks for each unit and intersect
    return measureBlock(() => {
        const candidateBlocksPerUnit: number[][] = [];
        for (const unit of units) {
            const blockIndices = this.getBlocksForUnit(index, unit);
            candidateBlocksPerUnit.push(blockIndices);
        }

        // Intersect all block sets
        let intersectionSet = new Set<number>(candidateBlocksPerUnit[0]);
        for (let i = 1; i < candidateBlocksPerUnit.length; i++) {
            const currentSet = new Set(candidateBlocksPerUnit[i]);
            intersectionSet = new Set([...intersectionSet].filter(x => currentSet.has(x)));
        }

        // Sorted per-unit candidate counts, kept for the diagnostic log below.
        let allCounts = candidateBlocksPerUnit.map(b => b.length);
        sort(allCounts, x => x);

        //console.log(`Candidate blocks ${intersectionSet.size}, minimum: ${Math.min(...candidateBlocksPerUnit.map(b => b.length))}, best 4 counts: ${allCounts.slice(0, 4).join(", ")}`);

        return Array.from(intersectionSet);
    }, `intersectBlocks`);
}
|
|
627
|
+
|
|
628
|
+
/**
 * Looks up the block indices that contain the given 32-bit unit, by probing
 * the hash-table index buffer.
 *
 * Index layout (as read here): bytes 0-3 type + padding (skipped), bytes 4-7
 * hash mask, bytes 8-11 offset of the data section, then the hash table of
 * 4-byte bucket offsets starting at byte 12. Bucket offsets are relative to
 * the data section; a bucket stores a 4-byte compressed size followed by the
 * compressed, delta-encoded uint16 block list.
 *
 * Returns [] for an empty/absent bucket.
 */
private static getBlocksForUnit(index: Buffer, unit: number): number[] {
    // Read header (skip type + padding at offset 0-3)
    const mask = index.readUInt32LE(4);
    const dataOffset = index.readUInt32LE(8);

    const maskedHash = getMaskedHash(unit, mask);

    // Hash table starts after header (type + padding + mask + dataOffset = 12 bytes)
    const hashTableOffset = 12 + maskedHash * 4;
    const bucketDataOffset = index.readUInt32LE(hashTableOffset);

    if (bucketDataOffset === 0) {
        // Empty bucket
        return [];
    }

    // Read from data section
    // NOTE: bucketDataOffset is interpreted relative to dataOffset, which
    // implies offset 0 within the data section is reserved as the "empty"
    // sentinel — TODO confirm against the index writer.
    const dataSection = index.slice(dataOffset);
    const dataSize = dataSection.readUInt32LE(bucketDataOffset);

    if (dataSize === 0) {
        return [];
    }

    const data = dataSection.slice(bucketDataOffset + 4, bucketDataOffset + 4 + dataSize);
    const decompressedData = decompressBuffer(data);

    const blocks: number[] = [];
    // NOTE(review): the Uint16Array view requires decompressedData.byteOffset
    // to be 2-aligned, and silently drops a trailing odd byte (length / 2 is
    // fractional then truncated by the view length) — assumes decompressBuffer
    // returns an aligned, even-length buffer. TODO confirm.
    const uint16View = new Uint16Array(decompressedData.buffer, decompressedData.byteOffset, decompressedData.length / 2);

    if (uint16View.length === 0) {
        return [];
    }

    // Decode delta encoding: first entry is an absolute block index, each
    // subsequent entry is the (positive) gap to the next block.
    let currentBlock = uint16View[0];
    blocks.push(currentBlock);

    for (let i = 1; i < uint16View.length; i++) {
        currentBlock += uint16View[i];
        blocks.push(currentBlock);
    }

    return blocks;
}
|
|
673
|
+
|
|
674
|
+
|
|
675
|
+
private static async getBlockCount(reader: Reader): Promise<number> {
|
|
676
|
+
const headerBuffer = await reader.read(4, 8);
|
|
677
|
+
return headerBuffer.readUInt32LE(0);
|
|
678
|
+
}
|
|
679
|
+
|
|
680
|
+
private static async getBlock(reader: Reader, blockIndex: number, debugOffsets?: {
|
|
681
|
+
startOffset: number;
|
|
682
|
+
endOffset: number;
|
|
683
|
+
}): Promise<{
|
|
684
|
+
compressedSize: number;
|
|
685
|
+
block: Buffer;
|
|
686
|
+
}> {
|
|
687
|
+
const startOffset = (await reader.read(8 + blockIndex * 4, 4)).readUInt32LE(0);
|
|
688
|
+
const endOffset = (await reader.read(8 + (blockIndex + 1) * 4, 4)).readUInt32LE(0);
|
|
689
|
+
if (debugOffsets) {
|
|
690
|
+
debugOffsets.startOffset = startOffset;
|
|
691
|
+
debugOffsets.endOffset = endOffset;
|
|
692
|
+
}
|
|
693
|
+
let size = endOffset - startOffset;
|
|
694
|
+
let compressedBlock = await reader.read(startOffset, size);
|
|
695
|
+
if (compressedBlock.length !== size) {
|
|
696
|
+
devDebugbreak();
|
|
697
|
+
let test = await reader.read(startOffset, size);
|
|
698
|
+
throw new Error(`Reading block data at index ${blockIndex} we were given an incorrect amount of data, expected ${size}, got ${compressedBlock.length}. ${size > compressedBlock.length ? "It is likely the file is missing trailing data?" : "It's likely that the reader function is incorrect."}`);
|
|
699
|
+
}
|
|
700
|
+
let block = decompressBuffer(compressedBlock);
|
|
701
|
+
return {
|
|
702
|
+
compressedSize: endOffset - startOffset,
|
|
703
|
+
block: block,
|
|
704
|
+
};
|
|
705
|
+
}
|
|
706
|
+
|
|
707
|
+
public static async getBufferCountFromBlock(reader: Reader): Promise<number> {
|
|
708
|
+
return (await reader.read(0, 4)).readUInt32LE(0);
|
|
709
|
+
}
|
|
710
|
+
|
|
711
|
+
private static async getBufferFromBlock(reader: Reader, bufferIndex: number): Promise<Buffer> {
|
|
712
|
+
let startOffset = (await reader.read(4 + bufferIndex * 4, 4)).readUInt32LE(0);
|
|
713
|
+
let endOffset = (await reader.read(4 + (bufferIndex + 1) * 4, 4)).readUInt32LE(0);
|
|
714
|
+
let buffer = await reader.read(startOffset, endOffset - startOffset);
|
|
715
|
+
return buffer;
|
|
716
|
+
}
|
|
717
|
+
|
|
718
|
+
}
|
|
719
|
+
|