@blamejs/core 0.11.24 → 0.11.26
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +4 -0
- package/index.js +5 -0
- package/lib/auth/bot-challenge.js +573 -0
- package/lib/framework-error.js +6 -0
- package/lib/fsm.js +469 -0
- package/lib/guard-mail-query.js +14 -0
- package/lib/mail-agent.js +24 -10
- package/lib/mail-server-mx.js +12 -7
- package/lib/mail-server-submission.js +199 -11
- package/lib/mail-store-fts.js +394 -0
- package/lib/mail-store.js +142 -4
- package/lib/money.js +699 -0
- package/lib/webhook.js +229 -0
- package/package.json +1 -1
- package/sbom.cdx.json +6 -6
|
@@ -428,9 +428,27 @@ function create(opts) {
|
|
|
428
428
|
authPending: null,
|
|
429
429
|
};
|
|
430
430
|
|
|
431
|
-
|
|
431
|
+
// RAW byte buffer — NOT a string. The BDAT-CHUNKING path (RFC 3030)
|
|
432
|
+
// requires lossless byte preservation when the BDAT command line +
|
|
433
|
+
// payload arrive in the same TCP segment, and DATA-body 8BITMIME
|
|
434
|
+
// payloads can contain bytes that are invalid UTF-8. Decoding the
|
|
435
|
+
// socket-bytes through a string layer replaces invalid sequences
|
|
436
|
+
// with U+FFFD and corrupts the body. Keep the raw bytes; decode to
|
|
437
|
+
// string only for the per-command parse.
|
|
438
|
+
var lineBuffer = Buffer.alloc(0);
|
|
432
439
|
var bodyCollector = null;
|
|
433
440
|
var inDataBody = false;
|
|
441
|
+
// RFC 3030 CHUNKING — state for the BDAT command. `bdatCollector`
|
|
442
|
+
// accumulates the message body across multiple BDAT chunks; it lives
|
|
443
|
+
// for the lifetime of the SMTP transaction (i.e., between MAIL FROM
|
|
444
|
+
// and the BDAT ... LAST that finalises). `bdatRemaining` counts down
|
|
445
|
+
// bytes still owed by the current BDAT chunk; `bdatIsLast` flags
|
|
446
|
+
// whether the current chunk is the terminator.
|
|
447
|
+
var inBdatChunk = false;
|
|
448
|
+
var bdatRemaining = 0;
|
|
449
|
+
var bdatIsLast = false;
|
|
450
|
+
var bdatCollector = null;
|
|
451
|
+
var bdatTotalBytes = 0;
|
|
434
452
|
|
|
435
453
|
socket.setTimeout(idleTimeoutMs);
|
|
436
454
|
socket.on("timeout", function () {
|
|
@@ -465,6 +483,57 @@ function create(opts) {
|
|
|
465
483
|
});
|
|
466
484
|
|
|
467
485
|
function _ingestBytes(state, socket, chunk) {
|
|
486
|
+
// RFC 3030 — when a BDAT chunk is in progress we consume exactly
|
|
487
|
+
// `bdatRemaining` bytes off the wire, no dot-stuffing, no end-of-
|
|
488
|
+
// data marker. Any excess bytes in the chunk after the BDAT
|
|
489
|
+
// payload completes get fed back through the command line buffer
|
|
490
|
+
// (typical when a pipelined `BDAT N LAST\r\n<payload>\r\nNOOP\r\n`
|
|
491
|
+
// arrives in a single TCP segment).
|
|
492
|
+
if (inBdatChunk) {
|
|
493
|
+
var consumeN = Math.min(chunk.length, bdatRemaining);
|
|
494
|
+
var consumed = chunk.subarray(0, consumeN);
|
|
495
|
+
try { bdatCollector.push(consumed); }
|
|
496
|
+
catch (_e) {
|
|
497
|
+
_emit("mail.server.submission.bdat_refused",
|
|
498
|
+
{ connectionId: state.id, reason: "body-too-large", maxBytes: maxMessageBytes },
|
|
499
|
+
"denied");
|
|
500
|
+
_writeReply(socket, REPLY_552_SIZE_EXCEEDED,
|
|
501
|
+
"5.3.4 BDAT body exceeds maxMessageBytes (" + maxMessageBytes + " bytes)");
|
|
502
|
+
_resetTransaction(state);
|
|
503
|
+
inBdatChunk = false; bdatCollector = null; bdatRemaining = 0; bdatTotalBytes = 0;
|
|
504
|
+
return;
|
|
505
|
+
}
|
|
506
|
+
bdatRemaining -= consumeN;
|
|
507
|
+
bdatTotalBytes += consumeN;
|
|
508
|
+
if (bdatRemaining === 0) {
|
|
509
|
+
var wasLast = bdatIsLast;
|
|
510
|
+
inBdatChunk = false;
|
|
511
|
+
if (wasLast) {
|
|
512
|
+
// RFC 3030 §2.2 — ONE reply per BDAT command. When LAST,
|
|
513
|
+
// the single reply is the "message queued" finalize reply
|
|
514
|
+
// (emitted from _finalizeAcceptedBody), not the per-chunk
|
|
515
|
+
// "<N> octets received" reply. Emitting both would
|
|
516
|
+
// desynchronise the client (the second 250 would be
|
|
517
|
+
// consumed as the response to the next command).
|
|
518
|
+
// No dot-unstuff for BDAT — RFC 3030 §3 explicitly defines
|
|
519
|
+
// BDAT payloads as opaque byte streams.
|
|
520
|
+
var bdatBody = bdatCollector.result();
|
|
521
|
+
bdatCollector = null;
|
|
522
|
+
bdatTotalBytes = 0;
|
|
523
|
+
_finalizeAcceptedBody(state, socket, bdatBody, "BDAT");
|
|
524
|
+
} else {
|
|
525
|
+
// Non-final chunk — per-chunk acknowledgement only.
|
|
526
|
+
_writeReply(socket, REPLY_250_OK,
|
|
527
|
+
"2.0.0 " + bdatTotalBytes + " octets received");
|
|
528
|
+
}
|
|
529
|
+
// Any tail bytes after this BDAT chunk get re-fed as commands.
|
|
530
|
+
if (consumeN < chunk.length) {
|
|
531
|
+
var tail = chunk.subarray(consumeN);
|
|
532
|
+
_ingestBytes(state, socket, tail);
|
|
533
|
+
}
|
|
534
|
+
}
|
|
535
|
+
return;
|
|
536
|
+
}
|
|
468
537
|
if (inDataBody) {
|
|
469
538
|
try { bodyCollector.push(chunk); }
|
|
470
539
|
catch (_e) {
|
|
@@ -491,13 +560,15 @@ function create(opts) {
|
|
|
491
560
|
var endIdx = safeSmtp.findDotTerminator(collected);
|
|
492
561
|
if (endIdx !== -1) {
|
|
493
562
|
var body = collected.subarray(0, endIdx);
|
|
494
|
-
|
|
563
|
+
// DATA path dot-unstuffs here; BDAT path skips this step.
|
|
564
|
+
var dedotted = safeSmtp.dotUnstuff(body);
|
|
565
|
+
_finalizeAcceptedBody(state, socket, dedotted, "DATA");
|
|
495
566
|
inDataBody = false; bodyCollector = null;
|
|
496
567
|
}
|
|
497
568
|
return;
|
|
498
569
|
}
|
|
499
570
|
|
|
500
|
-
lineBuffer
|
|
571
|
+
lineBuffer = lineBuffer.length === 0 ? chunk : Buffer.concat([lineBuffer, chunk]);
|
|
501
572
|
if (lineBuffer.length > maxLineBytes * 4) {
|
|
502
573
|
_writeReply(socket, REPLY_500_SYNTAX,
|
|
503
574
|
"5.5.6 Line too long (>" + maxLineBytes + " bytes)");
|
|
@@ -505,11 +576,29 @@ function create(opts) {
|
|
|
505
576
|
return;
|
|
506
577
|
}
|
|
507
578
|
var crlf;
|
|
508
|
-
|
|
509
|
-
|
|
510
|
-
|
|
579
|
+
var crlfNeedle = Buffer.from("\r\n", "ascii");
|
|
580
|
+
while ((crlf = lineBuffer.indexOf(crlfNeedle)) !== -1) {
|
|
581
|
+
// Decode just the per-command line to a string — keeps the
|
|
582
|
+
// wire-protocol parser working in UTF-8 while leaving the
|
|
583
|
+
// RAW lineBuffer intact for any binary payload that follows.
|
|
584
|
+
var line = lineBuffer.subarray(0, crlf).toString("utf8");
|
|
585
|
+
lineBuffer = lineBuffer.subarray(crlf + 2);
|
|
511
586
|
_handleCommand(state, socket, line);
|
|
512
587
|
if (inDataBody) return;
|
|
588
|
+
if (inBdatChunk) {
|
|
589
|
+
// RFC 3030 — `BDAT <N> [LAST]\r\n` is immediately followed by
|
|
590
|
+
// exactly <N> raw bytes (no dot-stuffing, no terminator). When
|
|
591
|
+
// those bytes arrived in the SAME TCP segment as the BDAT
|
|
592
|
+
// command, drain them straight from the raw byte buffer
|
|
593
|
+
// (NOT through a UTF-8 string round-trip — would corrupt
|
|
594
|
+
// 8-bit / binary payloads).
|
|
595
|
+
if (lineBuffer.length > 0) {
|
|
596
|
+
var pendingBytes = lineBuffer;
|
|
597
|
+
lineBuffer = Buffer.alloc(0);
|
|
598
|
+
_ingestBytes(state, socket, pendingBytes);
|
|
599
|
+
}
|
|
600
|
+
return;
|
|
601
|
+
}
|
|
513
602
|
}
|
|
514
603
|
}
|
|
515
604
|
|
|
@@ -555,6 +644,8 @@ function create(opts) {
|
|
|
555
644
|
return _handleRcptTo(state, socket, line);
|
|
556
645
|
case "DATA":
|
|
557
646
|
return _handleData(state, socket);
|
|
647
|
+
case "BDAT":
|
|
648
|
+
return _handleBdat(state, socket, line);
|
|
558
649
|
case "NOOP":
|
|
559
650
|
return _writeReply(socket, REPLY_250_OK, "2.0.0 OK");
|
|
560
651
|
case "RSET":
|
|
@@ -592,7 +683,7 @@ function create(opts) {
|
|
|
592
683
|
state.helo = helo;
|
|
593
684
|
state.stage = "ehlo";
|
|
594
685
|
if (verb === "EHLO") {
|
|
595
|
-
var caps = ["PIPELINING", "SIZE " + maxMessageBytes, "8BITMIME", "ENHANCEDSTATUSCODES"];
|
|
686
|
+
var caps = ["PIPELINING", "SIZE " + maxMessageBytes, "8BITMIME", "ENHANCEDSTATUSCODES", "CHUNKING"];
|
|
596
687
|
// STARTTLS advertised only on explicit-STARTTLS port (587),
|
|
597
688
|
// not on implicit-TLS (465 already wrapped). RFC 8314 §3.3.
|
|
598
689
|
if (!state.tls && !implicitTls) caps.unshift("STARTTLS");
|
|
@@ -627,7 +718,12 @@ function create(opts) {
|
|
|
627
718
|
// body collector AND strip the plain-socket "data" listener
|
|
628
719
|
// before wrapping in TLSSocket so bytes the peer pipelined
|
|
629
720
|
// pre-handshake cannot reach the post-TLS state machine.
|
|
630
|
-
lineBuffer =
|
|
721
|
+
lineBuffer = Buffer.alloc(0); bodyCollector = null; inDataBody = false;
|
|
722
|
+
// BDAT-side state cleared on STARTTLS upgrade too — same threat
|
|
723
|
+
// model as CVE-2021-38371 (Exim) / CVE-2021-33515 (Dovecot):
|
|
724
|
+
// pre-handshake bytes the peer pipelined MUST NOT reach the
|
|
725
|
+
// post-TLS state machine via the BDAT collector either.
|
|
726
|
+
inBdatChunk = false; bdatRemaining = 0; bdatCollector = null; bdatTotalBytes = 0;
|
|
631
727
|
mailServerTls.upgradeSocket({
|
|
632
728
|
plainSocket: socket,
|
|
633
729
|
secureContext: opts.tlsContext,
|
|
@@ -1033,8 +1129,7 @@ function create(opts) {
|
|
|
1033
1129
|
});
|
|
1034
1130
|
}
|
|
1035
1131
|
|
|
1036
|
-
function
|
|
1037
|
-
var dedotted = safeSmtp.dotUnstuff(body);
|
|
1132
|
+
function _finalizeAcceptedBody(state, socket, dedotted, source) {
|
|
1038
1133
|
|
|
1039
1134
|
// Outbound DKIM-required gate. Scan the header block for a
|
|
1040
1135
|
// `DKIM-Signature:` line; under `self` mode also require at
|
|
@@ -1108,16 +1203,109 @@ function create(opts) {
|
|
|
1108
1203
|
}
|
|
1109
1204
|
_emit("mail.server.submission.data_accepted",
|
|
1110
1205
|
{ connectionId: state.id, mailFrom: state.mailFrom,
|
|
1111
|
-
rcptCount: state.rcpts.length, sizeBytes: dedotted.length });
|
|
1206
|
+
rcptCount: state.rcpts.length, sizeBytes: dedotted.length, source: source || "DATA" });
|
|
1112
1207
|
_writeReply(socket, REPLY_250_OK, "2.6.0 Message queued (audit-only)");
|
|
1113
1208
|
_resetTransaction(state);
|
|
1114
1209
|
}
|
|
1115
1210
|
|
|
1211
|
+
// RFC 3030 §2 — handler for `BDAT <chunk-size> [LAST]`.
//
// Validates the command and, when it is acceptable, arms the
// connection-level chunk reader: the byte-ingest path then appends the
// next <chunk-size> raw bytes to `bdatCollector` (no dot-stuffing, no
// end-of-data marker). LAST (case-insensitive) marks the chunk that
// completes the message body.
//
// Replies written here: an error reply on every refusal, the
// "0 octets received" ack for an empty non-LAST chunk, and (through
// _finalizeAcceptedBody) the finalize reply for an empty LAST chunk.
// For a non-empty chunk nothing is written here — the function only
// sets `inBdatChunk` and the reply follows once the bytes are read.
//
// NOTE(review): every refusal path returns without arming the chunk
// reader, so payload bytes the client already pipelined behind the
// refused BDAT line are parsed as SMTP command lines by the ingest
// loop. RFC 3030 expects the receiver to accept and discard that data;
// confirm whether a discard mode is needed.
//
//   state  — per-connection session record (stage, rcpts, rcptsPending, id)
//   socket — client socket the reply is written to
//   line   — the command line with its CRLF already removed
function _handleBdat(state, socket, line) {
  // Sequencing: BDAT is legal right after RCPT TO, or while a chunked
  // body is already being collected.
  var inSequence = state.stage === "rcpt" || state.stage === "bdat";
  if (!inSequence) {
    _writeReply(socket, REPLY_503_BAD_SEQUENCE, "5.5.1 BDAT requires MAIL FROM + RCPT TO");
    return;
  }
  if (state.rcpts.length === 0) {
    _writeReply(socket, REPLY_503_BAD_SEQUENCE, "5.5.1 No valid recipients");
    return;
  }

  // Pipelining race: recipient verdicts still outstanding.
  var unsettledRcpts = state.rcptsPending || 0;
  if (unsettledRcpts > 0) {
    _emit("mail.server.submission.pipelining_bdat_race", {
      connectionId: state.id,
      rcptsPending: state.rcptsPending,
      rcptsCommitted: state.rcpts.length,
    }, "denied");
    _writeReply(socket, REPLY_451_LOCAL_ERROR,
      "4.5.0 RCPT TO verdicts pending; reissue BDAT after recipient replies");
    return;
  }

  // Argument parsing: verb, size, optional LAST — nothing else.
  var argv = line.split(/\s+/);
  var argc = argv.length;
  if (argc !== 2 && argc !== 3) {
    _writeReply(socket, REPLY_501_BAD_ARGS, "5.5.4 BDAT requires <chunk-size> [LAST]");
    return;
  }

  var sizeArg = argv[1];
  var chunkSize = parseInt(sizeArg, 10);
  var sizeIsValid = /^\d+$/.test(sizeArg) && isFinite(chunkSize) && chunkSize >= 0;
  if (!sizeIsValid) {
    _writeReply(socket, REPLY_501_BAD_ARGS, "5.5.4 BDAT chunk-size must be a non-negative integer");
    return;
  }

  var finalChunk = false;
  if (argc === 3) {
    if (argv[2].toUpperCase() !== "LAST") {
      _writeReply(socket, REPLY_501_BAD_ARGS, "5.5.4 BDAT third arg must be 'LAST' (RFC 3030 §2)");
      return;
    }
    finalChunk = true;
  }

  // Cumulative cap, checked against the declared size so an oversized
  // chunk is refused before any of its bytes are collected.
  var projectedTotal = bdatTotalBytes + chunkSize;
  if (projectedTotal > maxMessageBytes) {
    _emit("mail.server.submission.bdat_refused", {
      connectionId: state.id,
      reason: "body-too-large",
      requestedTotal: projectedTotal,
      maxBytes: maxMessageBytes,
    }, "denied");
    _writeReply(socket, REPLY_552_SIZE_EXCEEDED,
      "5.3.4 BDAT cumulative size " + projectedTotal +
      " exceeds maxMessageBytes (" + maxMessageBytes + ")");
    _resetTransaction(state);
    bdatCollector = null;
    bdatTotalBytes = 0;
    return;
  }

  // First accepted chunk of the transaction creates the bounded collector.
  if (!bdatCollector) {
    bdatCollector = safeBuffer.boundedChunkCollector({
      maxBytes: maxMessageBytes,
      errorClass: MailServerSubmissionError,
      sizeCode: "mail-server-submission/body-too-large",
      sizeMessage: "BDAT body exceeded maxMessageBytes (" + maxMessageBytes + ")",
    });
  }

  state.stage = "bdat";
  bdatRemaining = chunkSize;
  bdatIsLast = finalChunk;

  // Non-empty chunk: hand over to the byte reader, which replies later.
  if (chunkSize !== 0) {
    inBdatChunk = true;
    return;
  }

  // Empty chunk: there are no payload bytes to wait for, so the single
  // reply for this command is produced right here.
  if (!finalChunk) {
    _writeReply(socket, REPLY_250_OK, "2.0.0 0 octets received");
    return;
  }
  var collectedBody = bdatCollector ? bdatCollector.result() : Buffer.alloc(0);
  bdatCollector = null;
  bdatTotalBytes = 0;
  _finalizeAcceptedBody(state, socket, collectedBody, "BDAT");
}
|
|
1295
|
+
|
|
1116
1296
|
// Return the session to the post-EHLO state: drop the envelope held on
// `state` and clear the connection-level BDAT bookkeeping, so bytes
// collected for an abandoned or refused chunked body cannot carry over
// into the next transaction.
function _resetTransaction(state) {
  // Envelope fields kept on the session record.
  state.mailFrom = null;
  state.rcpts = [];
  state.rcptsPending = 0;
  state.stage = "ehlo";

  // Chunking state is held in connection-scoped variables rather than
  // on `state`, so it has to be cleared explicitly.
  inBdatChunk = bdatIsLast = false;
  bdatRemaining = bdatTotalBytes = 0;
  bdatCollector = null;
}
|
|
1122
1310
|
}
|
|
1123
1311
|
|
|
@@ -0,0 +1,394 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
/**
|
|
3
|
+
* @module b.mailStore.fts
|
|
4
|
+
* @nav Mail
|
|
5
|
+
* @title Mail-store FTS (sealed-token full-text index)
|
|
6
|
+
* @order 250
|
|
7
|
+
* @slug mail-store-fts
|
|
8
|
+
*
|
|
9
|
+
* @intro
|
|
10
|
+
* Sealed-token full-text search index for `b.mailStore`. At
|
|
11
|
+
* `appendMessage` time the row's plaintext subject + addresses + body
|
|
12
|
+
* are tokenized, each token is hashed with the per-deployment vault
|
|
13
|
+
* salt (the same scheme `b.cryptoField` uses for derived-hash mirrors
|
|
14
|
+
* on sealed columns), and the resulting space-separated token-hash
|
|
15
|
+
* string is inserted into a SQLite FTS5 virtual table. Search runs
|
|
16
|
+
* the same tokenize → hash transform on the operator's query terms
|
|
17
|
+
* and issues `MATCH` against the FTS5 table — never against
|
|
18
|
+
* plaintext.
|
|
19
|
+
*
|
|
20
|
+
* The index is unrecoverable without the vault salt. A database
|
|
21
|
+
* dump leaks zero readable text — the FTS5 rows are byte-for-byte
|
|
22
|
+
* indistinguishable from random hashes. Per-tenant separation rides
|
|
23
|
+
* on the cryptoField namespace prefix (`bj-<table>-<field>:`), so
|
|
24
|
+
* tokens from one tenant's row can never collide with another's
|
|
25
|
+
* under the same vault key.
|
|
26
|
+
*
|
|
27
|
+
* Limitations of sealed-token FTS — operator-facing constraints:
|
|
28
|
+
*
|
|
29
|
+
* - Exact-token match only. No SQLite FTS5 stemmer, no porter,
|
|
30
|
+
* no Unicode-fold-then-stem, no NEAR with offsets. The token
|
|
31
|
+
* boundary IS the search granularity. Operators that need
|
|
32
|
+
* linguistic search at the cost of plaintext-at-rest opt in to
|
|
33
|
+
* a separate plaintext-FTS layer on top — not part of this
|
|
34
|
+
* primitive.
|
|
35
|
+
* - No prefix wildcard (`MATCH 'kub*'`). Token hashes don't
|
|
36
|
+
* preserve substring relationships. The cost of partial-match
|
|
37
|
+
* search is sealed-at-rest; operators get either-or.
|
|
38
|
+
 *   - Stopword filter is conservative (e.g. a / the / of / to / in /
 *     for / on / and / or / is / are / be / by — see `STOPWORDS` for
 *     the full list). Stopwords stay in the plaintext message but
 *     never reach the FTS row.
|
|
41
|
+
* - Token length capped at 2..64 unicode codepoints after
|
|
42
|
+
* NFC normalisation. Tokens outside the band are dropped (too
|
|
43
|
+
* short = high-collision noise; too long = file-bomb shape).
|
|
44
|
+
*
|
|
45
|
+
* Posture cascade. The primitive is on by default for every
|
|
46
|
+
* posture (`hipaa` / `pci-dss` / `gdpr` / `soc2`) — the token
|
|
47
|
+
* index uses the same vault key already protecting sealed-row
|
|
48
|
+
* storage, so adding the FTS index doesn't widen the cryptographic
|
|
49
|
+
* trust boundary. A future opt-in plaintext-FTS overlay would be
|
|
50
|
+
* gated by a relaxed posture; this module ships sealed-only.
|
|
51
|
+
*
|
|
52
|
+
* @card
|
|
53
|
+
* Tokenize → vault-salted hash → FTS5 MATCH. The DB dump leaks
|
|
54
|
+
* nothing readable; search works against ciphertext.
|
|
55
|
+
*/
|
|
56
|
+
|
|
57
|
+
var bCrypto = require("./crypto");
|
|
58
|
+
var vault = require("./vault");
|
|
59
|
+
var C = require("./constants");
|
|
60
|
+
|
|
61
|
+
// Stopwords are dropped before hashing — they'd dominate every row's
|
|
62
|
+
// token set without adding query selectivity. Kept conservative to
|
|
63
|
+
// stay locale-neutral for v1.
|
|
64
|
+
// Stopword set. These words are removed before hashing because they
// would appear in nearly every row without helping a query narrow its
// results. The list is deliberately short. The set has no prototype,
// so a lookup can only ever hit a listed word.
var STOPWORDS = Object.create(null);
[
  "a", "an", "the", "of", "to", "in", "for", "on", "and", "or", "but",
  "is", "are", "was", "were", "be", "been", "by", "with",
  "as", "at", "from", "this", "that", "it", "its", "their", "our",
  "your", "his", "her", "him", "us", "we", "i", "you",
  "do", "does", "did", "not", "no", "yes", "if", "so", "up", "down",
  "out", "over", "under", "than", "then", "them",
].forEach(function (word) {
  STOPWORDS[word] = true;
});

// Accepted token length, counted in codepoints rather than bytes so
// multi-byte scripts are treated the same as ASCII.
var MIN_TOKEN_LEN = 2;
var MAX_TOKEN_LEN = 64;

// Upper bound on tokens kept per field. It is applied after the
// stopword and length filters, so only tokens that passed those
// filters count against it.
var MAX_TOKENS_PER_FIELD = 8192; // allow:raw-byte-literal — token-count cap, not bytes

// Plaintext message field → FTS column that stores its token hashes.
// `from` and `to` share one address column.
var FTS_FIELDS = {
  subject: "subject_toks",
  from: "addr_toks",
  to: "addr_toks",
  body: "body_toks",
};
|
|
91
|
+
|
|
92
|
+
// Token splitter — Unicode-aware. Splits on every non-letter,
|
|
93
|
+
// non-digit code-point (Unicode category L* or N*). Apostrophes inside
|
|
94
|
+
// a word survive (`don't` → `don't`); leading/trailing punctuation is
|
|
95
|
+
// stripped. Email addresses are split on `@` + `.` so both the local
|
|
96
|
+
// part and each domain label produce independent tokens — operators
|
|
97
|
+
// searching for `example.com` find rows whose from/to header carries
|
|
98
|
+
// `alice@example.com` AND rows that mention `example` or `com` in
|
|
99
|
+
// body prose. Stopwords prune the noisy ones.
|
|
100
|
+
//
|
|
101
|
+
// Input larger than MAX_INPUT_BYTES is truncated (not refused) before
// tokenizing, which bounds tokenizer work — protects against DoS-shaped
// messages whose body is 50 MiB of a single token boundary.
|
|
104
|
+
var MAX_INPUT_BYTES = C.BYTES.mib(8); // 8 MiB
|
|
105
|
+
|
|
106
|
+
/**
 * @primitive b.mailStore.fts.tokenize
 * @signature b.mailStore.fts.tokenize(text)
 * @since     0.11.25
 * @status    stable
 *
 * Split `text` into a deduplicated, lowercased, NFC-normalised token
 * array. Drops stopwords + tokens outside the 2..64-codepoint band.
 * Splits on every non-letter / non-digit codepoint, including the
 * `@` + `.` boundaries of email addresses so local-part + domain
 * labels become independent tokens. Apostrophes inside a word are
 * kept; leading and trailing ones are stripped.
 *
 * Non-string and empty input yields `[]`. Input longer than
 * `MAX_INPUT_BYTES` UTF-8 bytes is truncated to at most that many
 * bytes, cut on a code point boundary, before tokenizing. At most
 * `MAX_TOKENS_PER_FIELD` tokens are returned, in first-seen order.
 *
 * @example
 *   b.mailStore.fts.tokenize("Hello world from alice@example.com");
 *   // → ["hello", "world", "alice", "example", "com"]
 */
function tokenize(text) {
  if (typeof text !== "string" || text.length === 0) return [];

  if (Buffer.byteLength(text, "utf8") > MAX_INPUT_BYTES) {
    // Truncate rather than refuse: the prefix still indexes a large
    // message usefully. The limit is a BYTE budget, so cutting by
    // string length alone would let multi-byte text through at several
    // times the budget. Every UTF-16 unit encodes to at least one
    // byte, so the first MAX_INPUT_BYTES units always cover the first
    // MAX_INPUT_BYTES bytes — encode only that prefix, then trim.
    var prefix = text.slice(0, MAX_INPUT_BYTES);
    var encoded = Buffer.from(prefix, "utf8");
    if (encoded.length > MAX_INPUT_BYTES) {
      var cut = MAX_INPUT_BYTES;
      // Back up while the first excluded byte is a UTF-8 continuation
      // byte (10xxxxxx) so the cut never lands inside a code point.
      while (cut > 0 && (encoded[cut] & 0xC0) === 0x80) cut -= 1;
      prefix = encoded.toString("utf8", 0, cut);
    }
    text = prefix;
  }

  // NFC first so visually identical tokens hash to the same value
  // whatever composition form the source used; then case-fold.
  var normalised = text.normalize("NFC").toLowerCase();

  // Separators are runs of anything that is not a letter, a digit or
  // an apostrophe. `\p{L}` / `\p{N}` require the `u` flag.
  var candidates = normalised.split(/[^\p{L}\p{N}']+/u);

  var seen = Object.create(null);
  var tokens = [];
  for (var i = 0; i < candidates.length && tokens.length < MAX_TOKENS_PER_FIELD; i++) {
    var token = candidates[i];
    if (!token) continue;
    // Apostrophes are kept by the split only so intra-word ones
    // survive; strip the ones at either edge.
    token = token.replace(/^[']+/, "").replace(/[']+$/, "");
    if (!token) continue;
    // Length is measured in codepoints, not UTF-16 units.
    var codepoints = Array.from(token).length;
    if (codepoints < MIN_TOKEN_LEN || codepoints > MAX_TOKEN_LEN) continue;
    if (STOPWORDS[token] || seen[token]) continue;
    seen[token] = true;
    tokens.push(token);
  }
  return tokens;
}
|
|
156
|
+
|
|
157
|
+
// Hash one token using the same scheme cryptoField uses for derived-
|
|
158
|
+
// hash mirrors: `sha3Hash(vaultSalt + namespace + token)`. The
|
|
159
|
+
// namespace is per-table, per-field, per-purpose ("fts") so that
|
|
160
|
+
// rotating an operator's vault salt invalidates every FTS row in
|
|
161
|
+
// the same step as every sealed column. Returns a 16-char hex prefix
|
|
162
|
+
// — full 64-char SHA3 is overkill for FTS hash space, and shorter
|
|
163
|
+
// tokens compress the FTS5 row 4x without observable collision risk
|
|
164
|
+
// at corpus sizes the framework targets (≤ 10^9 unique tokens, where
|
|
165
|
+
// 64-bit collision space leaves the birthday bound > 10^9).
|
|
166
|
+
/**
 * @primitive b.mailStore.fts.hashToken
 * @signature b.mailStore.fts.hashToken(table, field, token)
 * @since     0.11.25
 * @status    stable
 *
 * Hash a single token under the `(table, field)` namespace. The hash
 * input is the hex form of the vault's derived-hash salt, followed by
 * the namespace string `bj-<table>-<field>:fts:`, followed by the
 * token. The result is the first 16 characters of the SHA3 hash.
 * Returns `""` for a non-string or empty token.
 *
 * @example
 *   var h = b.mailStore.fts.hashToken("mail_messages", "body", "kubernetes");
 *   /^[0-9a-f]{16}$/.test(h);   // → true
 */
function hashToken(table, field, token) {
  var usable = typeof token === "string" && token.length > 0;
  if (!usable) return "";

  // FTS fields are pseudo-fields with no sealed-column registration,
  // so the namespace is always built with this fixed pattern.
  var namespace = "bj-" + table + "-" + field + ":fts:";

  // NOTE(review): when the vault returns no salt the hash silently
  // falls back to an empty salt, i.e. an unsalted hash — confirm this
  // fail-open behaviour is intended.
  var salt = vault.getDerivedHashSalt();
  var saltHex = "";
  if (salt && typeof salt.toString === "function") {
    saltHex = salt.toString("hex");
  }

  var digest = bCrypto.sha3Hash(saltHex + namespace + token);
  return digest.slice(0, 16); // allow:raw-byte-literal — 16-char hex prefix length, not bytes
}
|
|
192
|
+
|
|
193
|
+
// Hash a token array → space-separated string suitable for FTS5
|
|
194
|
+
// row insertion. The output is what gets MATCH'd at query time.
|
|
195
|
+
/**
 * @primitive b.mailStore.fts.hashTokens
 * @signature b.mailStore.fts.hashTokens(table, field, tokens)
 * @since     0.11.25
 * @status    stable
 *
 * Hash every entry of `tokens` with `hashToken` and join the hashes
 * with single spaces, ready to store in an FTS5 column. Entries that
 * hash to `""` and repeated hashes are left out; first-seen order is
 * kept. Returns `""` when `tokens` is not an array or is empty.
 *
 * @example
 *   b.mailStore.fts.hashTokens("t", "subject", ["hello", "world"]);
 *   // → "<16hex> <16hex>"
 */
function hashTokens(table, field, tokens) {
  if (!Array.isArray(tokens) || tokens.length === 0) return "";

  var emitted = Object.create(null);
  var hashes = [];
  tokens.forEach(function (token) {
    var digest = hashToken(table, field, token);
    if (digest && !emitted[digest]) {
      emitted[digest] = true;
      hashes.push(digest);
    }
  });
  return hashes.join(" ");
}
|
|
221
|
+
|
|
222
|
+
// Tokenize + hash + join in one step (the common path for both
|
|
223
|
+
// append-side index updates and search-side query rewriting).
|
|
224
|
+
/**
 * @primitive b.mailStore.fts.hashText
 * @signature b.mailStore.fts.hashText(table, field, text)
 * @since     0.11.25
 * @status    stable
 *
 * One-call shortcut: run `text` through `tokenize`, then hash the
 * resulting tokens with `hashTokens` under the `(table, field)`
 * namespace. Returns the space-separated hash string.
 *
 * @example
 *   b.mailStore.fts.hashText("mail_messages", "body", "kubernetes deploy");
 *   // → "<16hex> <16hex>"
 */
function hashText(table, field, text) {
  var tokens = tokenize(text);
  return hashTokens(table, field, tokens);
}
|
|
240
|
+
|
|
241
|
+
// Build the FTS row body for one message. Subject + body tokens get
|
|
242
|
+
// their own FTS columns; from + to addresses share `addr_toks` so a
|
|
243
|
+
// search for an address hits regardless of which side it's on. The
|
|
244
|
+
// `addr_toks` namespace is a single pseudo-field "addr" so the index
|
|
245
|
+
// + query sides hash identically regardless of which header carried
|
|
246
|
+
// the token — `{from: "alice@x"}` and `{to: "alice@x"}` BOTH hit a
|
|
247
|
+
// row that mentions alice@x in EITHER header.
|
|
248
|
+
/**
 * @primitive b.mailStore.fts.rowFromMessage
 * @signature b.mailStore.fts.rowFromMessage(table, msg)
 * @since     0.11.25
 * @status    stable
 *
 * Turn a plaintext message `{ objectid, subject, from, to, body }`
 * into the FTS5 row payload `{ objectid, subject_toks, addr_toks,
 * body_toks }`. Missing text fields are treated as empty strings.
 * Tokens from `from` and `to` are pooled and hashed under the single
 * `"addr"` namespace, so both headers feed the same `addr_toks` value.
 *
 * @example
 *   b.mailStore.fts.rowFromMessage("t", { objectid:"o1", subject:"Hi", from:"a@x", to:"b@x", body:"hello" });
 *   // → { objectid:"o1", subject_toks:"<hash>", addr_toks:"<hash> <hash>", body_toks:"<hash>" }
 */
function rowFromMessage(table, msg) {
  var senderTokens = tokenize(msg.from || "");
  var recipientTokens = tokenize(msg.to || "");

  var row = { objectid: msg.objectid };
  row.subject_toks = hashText(table, "subject", msg.subject || "");
  row.addr_toks = hashTokens(table, "addr", senderTokens.concat(recipientTokens));
  row.body_toks = hashText(table, "body", msg.body || "");
  return row;
}
|
|
271
|
+
|
|
272
|
+
// Map a query-side filter key onto the (FTS5 column, namespace pseudo-
|
|
273
|
+
// field) pair the indexer used. Keeps the index + query in lock-step
|
|
274
|
+
// so future column additions only touch this table.
|
|
275
|
+
//
|
|
276
|
+
// filter key → FTS5 column + namespace field
|
|
277
|
+
// subject → subject_toks + "subject"
|
|
278
|
+
// body → body_toks + "body"
|
|
279
|
+
// from / to → addr_toks + "addr"
|
|
280
|
+
//
|
|
281
|
+
// For a broad cross-column `text` query the caller iterates this
|
|
282
|
+
// mapping and OR's the per-column MATCH clauses.
|
|
283
|
+
var QUERY_KEY_MAP = {
  subject: { column: "subject_toks", field: "subject" },
  body:    { column: "body_toks",    field: "body"    },
  from:    { column: "addr_toks",    field: "addr"    },
  to:      { column: "addr_toks",    field: "addr"    },
};

/**
 * @primitive b.mailStore.fts.columnAndFieldFor
 * @signature b.mailStore.fts.columnAndFieldFor(filterKey)
 * @since     0.11.25
 * @status    stable
 *
 * Map a search filter key (`subject` / `body` / `from` / `to`) to
 * the FTS5 column it indexes into PLUS the namespace pseudo-field
 * used when hashing tokens for that column. Returns the entry from
 * `QUERY_KEY_MAP`, or `null` for any other key — including names
 * that exist only on `Object.prototype` (`constructor`, `toString`,
 * `__proto__`, ...), which must never be mistaken for a mapping.
 *
 * @example
 *   b.mailStore.fts.columnAndFieldFor("from");
 *   // → { column: "addr_toks", field: "addr" }
 */
function columnAndFieldFor(key) {
  // QUERY_KEY_MAP is a plain object literal, so a bare `map[key]`
  // lookup would also resolve inherited members. Only own entries are
  // real mappings.
  if (!Object.prototype.hasOwnProperty.call(QUERY_KEY_MAP, key)) return null;
  return QUERY_KEY_MAP[key] || null;
}
|
|
309
|
+
|
|
310
|
+
// Rewrite an operator query term into a FTS5 MATCH expression. The
|
|
311
|
+
// term is tokenized + hashed exactly like an index value, then the
|
|
312
|
+
// hashes are AND'd together so multi-word queries require every
|
|
313
|
+
// token to appear in the row. Returns null when no tokens survive
|
|
314
|
+
// the filter (caller should skip the FTS join in that case).
|
|
315
|
+
/**
 * @primitive b.mailStore.fts.buildMatchExpression
 * @signature b.mailStore.fts.buildMatchExpression(table, field, term)
 * @since     0.11.25
 * @status    stable
 *
 * Convert an operator's query `term` into an FTS5 MATCH expression.
 * The term goes through `tokenize`, each token through `hashToken`
 * under the `(table, field)` namespace, and the distinct non-empty
 * hashes are joined with ` AND `. Returns `null` when nothing is left
 * to match on (no token survives tokenizing, or every hash is empty).
 *
 * @example
 *   var expr = b.mailStore.fts.buildMatchExpression("t", "body", "kubernetes deploy");
 *   // → "<16hex> AND <16hex>"
 */
function buildMatchExpression(table, field, term) {
  var alreadyUsed = Object.create(null);
  var hashes = tokenize(term)
    .map(function (token) { return hashToken(table, field, token); })
    .filter(function (digest) {
      if (!digest || alreadyUsed[digest]) return false;
      alreadyUsed[digest] = true;
      return true;
    });

  // AND is spelled out for readability.
  return hashes.length === 0 ? null : hashes.join(" AND ");
}
|
|
345
|
+
|
|
346
|
+
// SQL builder — creates the FTS5 virtual table. Caller supplies the
|
|
347
|
+
// quoted parent table identifier; this module owns the FTS table
|
|
348
|
+
// name and column layout.
|
|
349
|
+
/**
 * @primitive b.mailStore.fts.createSql
 * @signature b.mailStore.fts.createSql(qFtsTable)
 * @since     0.11.25
 * @status    stable
 *
 * Build the `CREATE VIRTUAL TABLE IF NOT EXISTS` statement for the
 * sealed-token FTS5 table. `qFtsTable` is spliced into the SQL as-is,
 * so the caller must pass an already-quoted identifier
 * (e.g. `"blamejs_mail_messages_fts"`). The column layout and the
 * tokenizer option are fixed by this module.
 *
 * @example
 *   db.prepare(b.mailStore.fts.createSql('"mail_fts"')).run();
 */
function createSql(qFtsTable) {
  var tableArgs = [
    "objectid UNINDEXED",
    "subject_toks",
    "addr_toks",
    "body_toks",
    "tokenize = 'unicode61 remove_diacritics 2'",
  ];
  return "CREATE VIRTUAL TABLE IF NOT EXISTS " + qFtsTable +
    " USING fts5(" + tableArgs.join(", ") + ")";
}
|
|
371
|
+
|
|
372
|
+
module.exports = {
|
|
373
|
+
// SQL primitives
|
|
374
|
+
createSql: createSql,
|
|
375
|
+
|
|
376
|
+
// Index-side
|
|
377
|
+
tokenize: tokenize,
|
|
378
|
+
hashToken: hashToken,
|
|
379
|
+
hashTokens: hashTokens,
|
|
380
|
+
hashText: hashText,
|
|
381
|
+
rowFromMessage: rowFromMessage,
|
|
382
|
+
|
|
383
|
+
// Query-side
|
|
384
|
+
buildMatchExpression: buildMatchExpression,
|
|
385
|
+
columnAndFieldFor: columnAndFieldFor,
|
|
386
|
+
QUERY_KEY_MAP: QUERY_KEY_MAP,
|
|
387
|
+
|
|
388
|
+
// Constants surfaced for tests + adjacent modules.
|
|
389
|
+
STOPWORDS: STOPWORDS,
|
|
390
|
+
MIN_TOKEN_LEN: MIN_TOKEN_LEN,
|
|
391
|
+
MAX_TOKEN_LEN: MAX_TOKEN_LEN,
|
|
392
|
+
MAX_TOKENS_PER_FIELD: MAX_TOKENS_PER_FIELD,
|
|
393
|
+
FTS_FIELDS: FTS_FIELDS,
|
|
394
|
+
};
|