@blamejs/core 0.11.24 → 0.11.26

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -428,9 +428,27 @@ function create(opts) {
428
428
  authPending: null,
429
429
  };
430
430
 
431
- var lineBuffer = "";
431
+ // RAW byte buffer — NOT a string. The BDAT-CHUNKING path (RFC 3030)
432
+ // requires lossless byte preservation when the BDAT command line +
433
+ // payload arrive in the same TCP segment, and DATA-body 8BITMIME
434
+ // payloads can contain bytes that are invalid UTF-8. Decoding the
435
+ // socket-bytes through a string layer replaces invalid sequences
436
+ // with U+FFFD and corrupts the body. Keep the raw bytes; decode to
437
+ // string only for the per-command parse.
438
+ var lineBuffer = Buffer.alloc(0);
432
439
  var bodyCollector = null;
433
440
  var inDataBody = false;
441
+ // RFC 3030 CHUNKING — state for the BDAT command. `bdatCollector`
442
+ // accumulates the message body across multiple BDAT chunks; it lives
443
+ // for the lifetime of the SMTP transaction (i.e., between MAIL FROM
444
+ // and the BDAT ... LAST that finalises). `bdatRemaining` counts down
445
+ // bytes still owed by the current BDAT chunk; `bdatIsLast` flags
446
+ // whether the current chunk is the terminator.
447
+ var inBdatChunk = false;
448
+ var bdatRemaining = 0;
449
+ var bdatIsLast = false;
450
+ var bdatCollector = null;
451
+ var bdatTotalBytes = 0;
434
452
 
435
453
  socket.setTimeout(idleTimeoutMs);
436
454
  socket.on("timeout", function () {
@@ -465,6 +483,57 @@ function create(opts) {
465
483
  });
466
484
 
467
485
  function _ingestBytes(state, socket, chunk) {
486
+ // RFC 3030 — when a BDAT chunk is in progress we consume exactly
487
+ // `bdatRemaining` bytes off the wire, no dot-stuffing, no end-of-
488
+ // data marker. Any excess bytes in the chunk after the BDAT
489
+ // payload completes get fed back through the command line buffer
490
+ // (typical when a pipelined `BDAT N LAST\r\n<payload>\r\nNOOP\r\n`
491
+ // arrives in a single TCP segment).
492
+ if (inBdatChunk) {
493
+ var consumeN = Math.min(chunk.length, bdatRemaining);
494
+ var consumed = chunk.subarray(0, consumeN);
495
+ try { bdatCollector.push(consumed); }
496
+ catch (_e) {
497
+ _emit("mail.server.submission.bdat_refused",
498
+ { connectionId: state.id, reason: "body-too-large", maxBytes: maxMessageBytes },
499
+ "denied");
500
+ _writeReply(socket, REPLY_552_SIZE_EXCEEDED,
501
+ "5.3.4 BDAT body exceeds maxMessageBytes (" + maxMessageBytes + " bytes)");
502
+ _resetTransaction(state);
503
+ inBdatChunk = false; bdatCollector = null; bdatRemaining = 0; bdatTotalBytes = 0;
504
+ return;
505
+ }
506
+ bdatRemaining -= consumeN;
507
+ bdatTotalBytes += consumeN;
508
+ if (bdatRemaining === 0) {
509
+ var wasLast = bdatIsLast;
510
+ inBdatChunk = false;
511
+ if (wasLast) {
512
+ // RFC 3030 §2.2 — ONE reply per BDAT command. When LAST,
513
+ // the single reply is the "message queued" finalize reply
514
+ // (emitted from _finalizeAcceptedBody), not the per-chunk
515
+ // "<N> octets received" reply. Emitting both would
516
+ // desynchronise the client (the second 250 would be
517
+ // consumed as the response to the next command).
518
+ // No dot-unstuff for BDAT — RFC 3030 §3 explicitly defines
519
+ // BDAT payloads as opaque byte streams.
520
+ var bdatBody = bdatCollector.result();
521
+ bdatCollector = null;
522
+ bdatTotalBytes = 0;
523
+ _finalizeAcceptedBody(state, socket, bdatBody, "BDAT");
524
+ } else {
525
+ // Non-final chunk — per-chunk acknowledgement only.
526
+ _writeReply(socket, REPLY_250_OK,
527
+ "2.0.0 " + bdatTotalBytes + " octets received");
528
+ }
529
+ // Any tail bytes after this BDAT chunk get re-fed as commands.
530
+ if (consumeN < chunk.length) {
531
+ var tail = chunk.subarray(consumeN);
532
+ _ingestBytes(state, socket, tail);
533
+ }
534
+ }
535
+ return;
536
+ }
468
537
  if (inDataBody) {
469
538
  try { bodyCollector.push(chunk); }
470
539
  catch (_e) {
@@ -491,13 +560,15 @@ function create(opts) {
491
560
  var endIdx = safeSmtp.findDotTerminator(collected);
492
561
  if (endIdx !== -1) {
493
562
  var body = collected.subarray(0, endIdx);
494
- _finalizeDataBody(state, socket, body);
563
+ // DATA path dot-unstuffs here; BDAT path skips this step.
564
+ var dedotted = safeSmtp.dotUnstuff(body);
565
+ _finalizeAcceptedBody(state, socket, dedotted, "DATA");
495
566
  inDataBody = false; bodyCollector = null;
496
567
  }
497
568
  return;
498
569
  }
499
570
 
500
- lineBuffer += chunk.toString("utf8");
571
+ lineBuffer = lineBuffer.length === 0 ? chunk : Buffer.concat([lineBuffer, chunk]);
501
572
  if (lineBuffer.length > maxLineBytes * 4) {
502
573
  _writeReply(socket, REPLY_500_SYNTAX,
503
574
  "5.5.6 Line too long (>" + maxLineBytes + " bytes)");
@@ -505,11 +576,29 @@ function create(opts) {
505
576
  return;
506
577
  }
507
578
  var crlf;
508
- while ((crlf = lineBuffer.indexOf("\r\n")) !== -1) {
509
- var line = lineBuffer.slice(0, crlf);
510
- lineBuffer = lineBuffer.slice(crlf + 2);
579
+ var crlfNeedle = Buffer.from("\r\n", "ascii");
580
+ while ((crlf = lineBuffer.indexOf(crlfNeedle)) !== -1) {
581
+ // Decode just the per-command line to a string — keeps the
582
+ // wire-protocol parser working in UTF-8 while leaving the
583
+ // RAW lineBuffer intact for any binary payload that follows.
584
+ var line = lineBuffer.subarray(0, crlf).toString("utf8");
585
+ lineBuffer = lineBuffer.subarray(crlf + 2);
511
586
  _handleCommand(state, socket, line);
512
587
  if (inDataBody) return;
588
+ if (inBdatChunk) {
589
+ // RFC 3030 — `BDAT <N> [LAST]\r\n` is immediately followed by
590
+ // exactly <N> raw bytes (no dot-stuffing, no terminator). When
591
+ // those bytes arrived in the SAME TCP segment as the BDAT
592
+ // command, drain them straight from the raw byte buffer
593
+ // (NOT through a UTF-8 string round-trip — would corrupt
594
+ // 8-bit / binary payloads).
595
+ if (lineBuffer.length > 0) {
596
+ var pendingBytes = lineBuffer;
597
+ lineBuffer = Buffer.alloc(0);
598
+ _ingestBytes(state, socket, pendingBytes);
599
+ }
600
+ return;
601
+ }
513
602
  }
514
603
  }
515
604
 
@@ -555,6 +644,8 @@ function create(opts) {
555
644
  return _handleRcptTo(state, socket, line);
556
645
  case "DATA":
557
646
  return _handleData(state, socket);
647
+ case "BDAT":
648
+ return _handleBdat(state, socket, line);
558
649
  case "NOOP":
559
650
  return _writeReply(socket, REPLY_250_OK, "2.0.0 OK");
560
651
  case "RSET":
@@ -592,7 +683,7 @@ function create(opts) {
592
683
  state.helo = helo;
593
684
  state.stage = "ehlo";
594
685
  if (verb === "EHLO") {
595
- var caps = ["PIPELINING", "SIZE " + maxMessageBytes, "8BITMIME", "ENHANCEDSTATUSCODES"];
686
+ var caps = ["PIPELINING", "SIZE " + maxMessageBytes, "8BITMIME", "ENHANCEDSTATUSCODES", "CHUNKING"];
596
687
  // STARTTLS advertised only on explicit-STARTTLS port (587),
597
688
  // not on implicit-TLS (465 already wrapped). RFC 8314 §3.3.
598
689
  if (!state.tls && !implicitTls) caps.unshift("STARTTLS");
@@ -627,7 +718,12 @@ function create(opts) {
627
718
  // body collector AND strip the plain-socket "data" listener
628
719
  // before wrapping in TLSSocket so bytes the peer pipelined
629
720
  // pre-handshake cannot reach the post-TLS state machine.
630
- lineBuffer = ""; bodyCollector = null; inDataBody = false;
721
+ lineBuffer = Buffer.alloc(0); bodyCollector = null; inDataBody = false;
722
+ // BDAT-side state cleared on STARTTLS upgrade too — same threat
723
+ // model as CVE-2021-38371 (Exim) / CVE-2021-33515 (Dovecot):
724
+ // pre-handshake bytes the peer pipelined MUST NOT reach the
725
+ // post-TLS state machine via the BDAT collector either.
726
+ inBdatChunk = false; bdatRemaining = 0; bdatCollector = null; bdatTotalBytes = 0;
631
727
  mailServerTls.upgradeSocket({
632
728
  plainSocket: socket,
633
729
  secureContext: opts.tlsContext,
@@ -1033,8 +1129,7 @@ function create(opts) {
1033
1129
  });
1034
1130
  }
1035
1131
 
1036
- function _finalizeDataBody(state, socket, body) {
1037
- var dedotted = safeSmtp.dotUnstuff(body);
1132
+ function _finalizeAcceptedBody(state, socket, dedotted, source) {
1038
1133
 
1039
1134
  // Outbound DKIM-required gate. Scan the header block for a
1040
1135
  // `DKIM-Signature:` line; under `self` mode also require at
@@ -1108,16 +1203,109 @@ function create(opts) {
1108
1203
  }
1109
1204
  _emit("mail.server.submission.data_accepted",
1110
1205
  { connectionId: state.id, mailFrom: state.mailFrom,
1111
- rcptCount: state.rcpts.length, sizeBytes: dedotted.length });
1206
+ rcptCount: state.rcpts.length, sizeBytes: dedotted.length, source: source || "DATA" });
1112
1207
  _writeReply(socket, REPLY_250_OK, "2.6.0 Message queued (audit-only)");
1113
1208
  _resetTransaction(state);
1114
1209
  }
1115
1210
 
1211
// RFC 3030 §2 — BDAT <chunk-size> [LAST].
//
// Validates one BDAT command line and arms the connection-level BDAT
// state (`bdatRemaining`, `bdatIsLast`, `bdatCollector`, `inBdatChunk`).
// The payload itself is NOT read here: once `inBdatChunk` is set the
// byte-ingest path consumes exactly `bdatRemaining` raw bytes (no
// dot-stuffing, no end-of-data marker) and sends the per-chunk reply.
//
//   state  — per-connection SMTP state (reads `stage`, `rcpts`,
//            `rcptsPending`, `id`; writes `stage`).
//   socket — client socket the reply is written to.
//   line   — the decoded command line, without its CRLF.
//
// Replies written from this function:
//   503 — BDAT outside the rcpt/bdat stage, or no committed recipients.
//   451 — RCPT TO verdicts still pending (pipelining race).
//   501 — malformed arguments.
//   552 — cumulative size would exceed maxMessageBytes (transaction reset).
//   250 — zero-length, non-LAST chunk.
// A zero-length LAST chunk is handed to _finalizeAcceptedBody, which
// writes the single reply for that command.
//
// The first accepted chunk moves `state.stage` to "bdat".
// NOTE(review): the DATA handler is not visible from here — confirm it
// rejects the "bdat" stage so DATA and BDAT cannot be mixed in one
// transaction.
//
// NOTE(review): RFC 3030 §2 requires a receiver to accept and discard
// the chunk payload before sending a 4xx/5xx for a BDAT command. The
// refusal paths below reply immediately and leave `inBdatChunk` false,
// so payload bytes the client already sent would be parsed as SMTP
// commands. Fixing that needs a discard mode in the byte-ingest path.
function _handleBdat(state, socket, line) {
  if (state.stage !== "rcpt" && state.stage !== "bdat") {
    _writeReply(socket, REPLY_503_BAD_SEQUENCE, "5.5.1 BDAT requires MAIL FROM + RCPT TO");
    return;
  }
  if (state.rcpts.length === 0) {
    _writeReply(socket, REPLY_503_BAD_SEQUENCE, "5.5.1 No valid recipients");
    return;
  }
  // Pipelining race — RCPT TO verdicts are still outstanding, so the
  // recipient set is not final yet.
  if ((state.rcptsPending || 0) > 0) {
    _emit("mail.server.submission.pipelining_bdat_race", {
      connectionId: state.id, rcptsPending: state.rcptsPending,
      rcptsCommitted: state.rcpts.length,
    }, "denied");
    _writeReply(socket, REPLY_451_LOCAL_ERROR,
      "4.5.0 RCPT TO verdicts pending; reissue BDAT after recipient replies");
    return;
  }
  // Parse `BDAT <size>[ LAST]`. Trim first: RFC 5321 §4.1.1 asks
  // receivers to tolerate trailing white space before the CRLF, and an
  // untrimmed split would turn `BDAT 10 ` into a bogus empty third
  // argument (501) instead of a valid command.
  var parts = line.trim().split(/\s+/);
  if (parts.length < 2 || parts.length > 3) {
    _writeReply(socket, REPLY_501_BAD_ARGS, "5.5.4 BDAT requires <chunk-size> [LAST]");
    return;
  }
  var sizeStr = parts[1];
  var sizeN = parseInt(sizeStr, 10);
  // The digits-only regex rejects signs, decimals and hex; isFinite
  // catches digit strings long enough to parse as Infinity.
  if (!/^\d+$/.test(sizeStr) || !isFinite(sizeN) || sizeN < 0) {
    _writeReply(socket, REPLY_501_BAD_ARGS, "5.5.4 BDAT chunk-size must be a non-negative integer");
    return;
  }
  var isLast = parts.length === 3 && parts[2].toUpperCase() === "LAST";
  if (parts.length === 3 && !isLast) {
    _writeReply(socket, REPLY_501_BAD_ARGS, "5.5.4 BDAT third arg must be 'LAST' (RFC 3030 §2)");
    return;
  }
  // Cumulative-size cap. The collector is bounded too, but checking
  // up-front refuses the chunk before any payload byte is buffered —
  // important when sizeN >> maxMessageBytes.
  if (bdatTotalBytes + sizeN > maxMessageBytes) {
    _emit("mail.server.submission.bdat_refused",
      { connectionId: state.id, reason: "body-too-large",
        requestedTotal: bdatTotalBytes + sizeN, maxBytes: maxMessageBytes }, "denied");
    _writeReply(socket, REPLY_552_SIZE_EXCEEDED,
      "5.3.4 BDAT cumulative size " + (bdatTotalBytes + sizeN) +
      " exceeds maxMessageBytes (" + maxMessageBytes + ")");
    _resetTransaction(state);
    bdatCollector = null; bdatTotalBytes = 0;
    return;
  }
  // The collector is created on the first chunk and reused by later
  // chunks of the same transaction.
  if (!bdatCollector) {
    bdatCollector = safeBuffer.boundedChunkCollector({
      maxBytes: maxMessageBytes,
      errorClass: MailServerSubmissionError,
      sizeCode: "mail-server-submission/body-too-large",
      sizeMessage: "BDAT body exceeded maxMessageBytes (" + maxMessageBytes + ")",
    });
  }
  state.stage = "bdat";
  bdatRemaining = sizeN;
  bdatIsLast = isLast;
  // size=0 is valid. With LAST it finalises the body collected so far
  // (the previous chunk carried the final payload); without LAST it is
  // acknowledged as an empty chunk. RFC 3030 §2.2 — ONE reply per
  // command, so the LAST case defers its reply to _finalizeAcceptedBody.
  if (sizeN === 0) {
    if (isLast) {
      var emptyBody = bdatCollector ? bdatCollector.result() : Buffer.alloc(0);
      bdatCollector = null; bdatTotalBytes = 0;
      _finalizeAcceptedBody(state, socket, emptyBody, "BDAT");
    } else {
      _writeReply(socket, REPLY_250_OK, "2.0.0 0 octets received");
    }
    return;
  }
  // Non-empty chunk: hand the following bytes to the byte-ingest path.
  inBdatChunk = true;
}
1295
+
1116
1296
function _resetTransaction(state) {
  // Connection-level BDAT state (closure variables, not on `state`).
  // Cleared together with the envelope so a RSET / failed BDAT can't
  // leak collected bytes into the next transaction.
  bdatCollector = null;
  bdatRemaining = bdatTotalBytes = 0;
  inBdatChunk = bdatIsLast = false;

  // Envelope fields: back to the post-EHLO stage with no sender and
  // no recipients.
  state.stage = "ehlo";
  state.rcptsPending = 0;
  state.rcpts = [];
  state.mailFrom = null;
}
1122
1310
  }
1123
1311
 
@@ -0,0 +1,394 @@
1
+ "use strict";
2
+ /**
3
+ * @module b.mailStore.fts
4
+ * @nav Mail
5
+ * @title Mail-store FTS (sealed-token full-text index)
6
+ * @order 250
7
+ * @slug mail-store-fts
8
+ *
9
+ * @intro
10
+ * Sealed-token full-text search index for `b.mailStore`. At
11
+ * `appendMessage` time the row's plaintext subject + addresses + body
12
+ * are tokenized, each token is hashed with the per-deployment vault
13
+ * salt (the same scheme `b.cryptoField` uses for derived-hash mirrors
14
+ * on sealed columns), and the resulting space-separated token-hash
15
+ * string is inserted into a SQLite FTS5 virtual table. Search runs
16
+ * the same tokenize → hash transform on the operator's query terms
17
+ * and issues `MATCH` against the FTS5 table — never against
18
+ * plaintext.
19
+ *
20
+ * The index is unrecoverable without the vault salt. A database
21
+ * dump leaks zero readable text — the FTS5 rows are byte-for-byte
22
+ * indistinguishable from random hashes. Per-tenant separation rides
23
+ * on the cryptoField namespace prefix (`bj-<table>-<field>:`), so
24
+ * tokens from one tenant's row can never collide with another's
25
+ * under the same vault key.
26
+ *
27
+ * Limitations of sealed-token FTS — operator-facing constraints:
28
+ *
29
+ * - Exact-token match only. No SQLite FTS5 stemmer, no porter,
30
+ * no Unicode-fold-then-stem, no NEAR with offsets. The token
31
+ * boundary IS the search granularity. Operators that need
32
+ * linguistic search at the cost of plaintext-at-rest opt in to
33
+ * a separate plaintext-FTS layer on top — not part of this
34
+ * primitive.
35
+ * - No prefix wildcard (`MATCH 'kub*'`). Token hashes don't
36
+ * preserve substring relationships. The cost of partial-match
37
+ * search is sealed-at-rest; operators get either-or.
38
+ * - Stopword filter is conservative (a / the / of / to / in / for /
39
+ * on / and / or / is / are / be / by, among others — see STOPWORDS).
40
+ * Stopwords remain in the unsealed plaintext but never reach the FTS row.
41
+ * - Token length capped at 2..64 unicode codepoints after
42
+ * NFC normalisation. Tokens outside the band are dropped (too
43
+ * short = high-collision noise; too long = file-bomb shape).
44
+ *
45
+ * Posture cascade. The primitive is on by default for every
46
+ * posture (`hipaa` / `pci-dss` / `gdpr` / `soc2`) — the token
47
+ * index uses the same vault key already protecting sealed-row
48
+ * storage, so adding the FTS index doesn't widen the cryptographic
49
+ * trust boundary. A future opt-in plaintext-FTS overlay would be
50
+ * gated by a relaxed posture; this module ships sealed-only.
51
+ *
52
+ * @card
53
+ * Tokenize → vault-salted hash → FTS5 MATCH. The DB dump leaks
54
+ * nothing readable; search works against ciphertext.
55
+ */
56
+
57
+ var bCrypto = require("./crypto");
58
+ var vault = require("./vault");
59
+ var C = require("./constants");
60
+
61
+ // Stopwords are dropped before hashing — they'd dominate every row's
62
+ // token set without adding query selectivity. Kept conservative to
63
+ // stay locale-neutral for v1.
64
// Null-prototype lookup table: membership is tested with a plain
// `STOPWORDS[token]`, so inherited names such as "constructor" must
// never read as stopwords.
var STOPWORDS = Object.create(null);
var STOPWORD_LINES = [
  "a an the of to in for on and or but is are was were be been by with",
  "as at from this that it its their our your his her him us we i you",
  "do does did not no yes if so up down out over under than then them",
];
for (var lineIdx = 0; lineIdx < STOPWORD_LINES.length; lineIdx++) {
  var words = STOPWORD_LINES[lineIdx].split(" ");
  for (var wordIdx = 0; wordIdx < words.length; wordIdx++) {
    STOPWORDS[words[wordIdx]] = true;
  }
}
70
+
71
+ // Per-token bounds. NFC-normalised codepoint count, not byte length —
72
+ // tokens carrying multi-byte UTF-8 are not penalised relative to ASCII.
73
+ var MIN_TOKEN_LEN = 2;
74
+ var MAX_TOKEN_LEN = 64;
75
+
76
+ // Per-row token-set cap. A single 50 MiB message can produce
77
+ // millions of tokens; FTS5 row insert + index update must stay
78
+ // bounded. The cap is applied AFTER stopword + length filter so the
79
+ // surviving tokens are the highest-signal subset.
80
+ var MAX_TOKENS_PER_FIELD = 8192; // allow:raw-byte-literal — token-count cap, not bytes
81
+
82
+ // Per-field FTS column names. Kept symmetric with the messages table
83
+ // columns so callers can reason about which FTS column corresponds
84
+ // to which plaintext source field.
85
+ var FTS_FIELDS = {
86
+ subject: "subject_toks",
87
+ from: "addr_toks",
88
+ to: "addr_toks",
89
+ body: "body_toks",
90
+ };
91
+
92
+ // Token splitter — Unicode-aware. Splits on every non-letter,
93
+ // non-digit code-point (Unicode category L* or N*). Apostrophes inside
94
+ // a word survive (`don't` → `don't`); leading/trailing punctuation is
95
+ // stripped. Email addresses are split on `@` + `.` so both the local
96
+ // part and each domain label produce independent tokens — operators
97
+ // searching for `example.com` find rows whose from/to header carries
98
+ // `alice@example.com` AND rows that mention `example` or `com` in
99
+ // body prose. Stopwords prune the noisy ones.
100
+ //
101
+ // Input larger than MAX_INPUT_BYTES is truncated (not refused) to
102
+ // bound tokenizer work — protects against DoS-shaped messages whose
103
+ // body is 50 MiB of a single token boundary.
104
+ var MAX_INPUT_BYTES = C.BYTES.mib(8); // 8 MiB
105
+
106
+ /**
107
+ * @primitive b.mailStore.fts.tokenize
108
+ * @signature b.mailStore.fts.tokenize(text)
109
+ * @since 0.11.25
110
+ * @status stable
111
+ *
112
+ * Split `text` into a deduplicated, lowercased, NFC-normalised token
113
+ * array. Drops stopwords + tokens outside the 2..64-codepoint band.
114
+ * Splits on every non-letter / non-digit codepoint, including the
115
+ * `@` + `.` boundaries of email addresses so local-part + domain
116
+ * labels become independent tokens.
117
+ *
118
+ * @example
119
+ * b.mailStore.fts.tokenize("Hello world from alice@example.com");
120
+ * // → ["hello", "world", "alice", "example", "com"]
121
+ */
122
function tokenize(text) {
  // Non-strings and empty strings yield no tokens.
  if (typeof text !== "string" || text.length === 0) return [];
  if (Buffer.byteLength(text, "utf8") > MAX_INPUT_BYTES) {
    // Oversized input is truncated rather than refused: tokenizing the
    // prefix still fingerprints the content, and refusing outright
    // would weaken indexing on legitimately large messages.
    //
    // The limit is in UTF-8 BYTES, so the cut is made in bytes too.
    // `text.slice(0, MAX_INPUT_BYTES)` counts UTF-16 code units, which
    // leaves multi-byte text up to ~3x over the limit and can split a
    // surrogate pair. `buf.write` stops before any character that
    // would not fit completely, so the prefix decoded below is at most
    // MAX_INPUT_BYTES bytes and ends on a whole character.
    var scratch = Buffer.allocUnsafe(MAX_INPUT_BYTES);
    var written = scratch.write(text, 0, MAX_INPUT_BYTES, "utf8");
    text = scratch.toString("utf8", 0, written);
  }
  // NFC normalise so visually-identical tokens hash to the same value
  // regardless of the source's encoding form.
  var nfc = text.normalize("NFC").toLowerCase();
  // Split on any run of characters that is NOT a letter, digit, or
  // intra-word apostrophe. `\p{L}` + `\p{N}` need the `u` flag.
  var rawTokens = nfc.split(/[^\p{L}\p{N}']+/u);
  var seen = Object.create(null);
  var out = [];
  // The per-field cap counts SURVIVING tokens, so dropped candidates
  // (stopwords, out-of-band lengths, duplicates) do not use up budget.
  for (var i = 0; i < rawTokens.length && out.length < MAX_TOKENS_PER_FIELD; i++) {
    var t = rawTokens[i];
    if (!t) continue;
    // Drop leading/trailing apostrophes that survived the split.
    t = t.replace(/^[']+/, "").replace(/[']+$/, "");
    if (!t) continue;
    // Count CODEPOINTS, not UTF-16 units.
    var len = Array.from(t).length;
    if (len < MIN_TOKEN_LEN || len > MAX_TOKEN_LEN) continue;
    if (STOPWORDS[t]) continue;
    if (seen[t]) continue;
    seen[t] = true;
    out.push(t);
  }
  return out;
}
156
+
157
+ // Hash one token using the same scheme cryptoField uses for derived-
158
+ // hash mirrors: `sha3Hash(vaultSalt + namespace + token)`. The
159
+ // namespace is per-table, per-field, per-purpose ("fts") so that
160
+ // rotating an operator's vault salt invalidates every FTS row in
161
+ // the same step as every sealed column. Returns a 16-char hex prefix
162
+ // — full 64-char SHA3 is overkill for FTS hash space, and shorter
163
+ // tokens compress the FTS5 row 4x without observable collision risk
164
+ // at corpus sizes the framework targets (≤ 10^9 unique tokens, where
165
+ // 64-bit collision space leaves the birthday bound > 10^9).
166
+ /**
167
+ * @primitive b.mailStore.fts.hashToken
168
+ * @signature b.mailStore.fts.hashToken(table, field, token)
169
+ * @since 0.11.25
170
+ * @status stable
171
+ *
172
+ * Vault-salted hash of one token under the (table, field) namespace.
173
+ * The same scheme `b.cryptoField.computeDerived` uses for derived-
174
+ * hash mirrors on sealed columns — rotating the vault salt
175
+ * invalidates every FTS hash in step with every sealed-column hash.
176
+ * Returns a 16-char hex prefix.
177
+ *
178
+ * @example
179
+ * var h = b.mailStore.fts.hashToken("mail_messages", "body", "kubernetes");
180
+ * /^[0-9a-f]{16}$/.test(h); // → true
181
+ */
182
function hashToken(table, field, token) {
  // Non-string and empty tokens have no hash; the callers in this
  // module skip an empty result.
  if (typeof token !== "string" || token.length === 0) {
    return "";
  }
  // Namespace prefix: per table, per pseudo-field, per purpose ("fts").
  var namespace = "bj-" + table + "-" + field + ":fts:";
  var rawSalt = vault.getDerivedHashSalt();
  // NOTE(review): when the vault returns no usable salt the hash is
  // computed with an empty salt instead of failing — confirm that this
  // fallback is intended for a sealed index.
  var saltHex = "";
  if (rawSalt && typeof rawSalt.toString === "function") {
    saltHex = rawSalt.toString("hex");
  }
  var digest = bCrypto.sha3Hash(saltHex + namespace + token);
  // Keep only the first 16 hex characters of the digest.
  return digest.slice(0, 16); // allow:raw-byte-literal — 16-char hex prefix length, not bytes
}
192
+
193
+ // Hash a token array → space-separated string suitable for FTS5
194
+ // row insertion. The output is what gets MATCH'd at query time.
195
+ /**
196
+ * @primitive b.mailStore.fts.hashTokens
197
+ * @signature b.mailStore.fts.hashTokens(table, field, tokens)
198
+ * @since 0.11.25
199
+ * @status stable
200
+ *
201
+ * Hash an array of tokens → space-separated hash string suitable for
202
+ * direct insertion into an FTS5 column. Empty + duplicate token-
203
+ * hashes drop on the way out.
204
+ *
205
+ * @example
206
+ * b.mailStore.fts.hashTokens("t", "subject", ["hello", "world"]);
207
+ * // → "<16hex> <16hex>"
208
+ */
209
function hashTokens(table, field, tokens) {
  if (!Array.isArray(tokens) || tokens.length === 0) {
    return "";
  }
  // A Set keeps first-seen order, so the joined output lists each
  // distinct hash once, in the order its token first appeared.
  var unique = new Set();
  for (var token of tokens) {
    var hashed = hashToken(table, field, token);
    // hashToken returns "" for unusable tokens — skip those.
    if (hashed) unique.add(hashed);
  }
  return Array.from(unique).join(" ");
}
221
+
222
+ // Tokenize + hash + join in one step (the common path for both
223
+ // append-side index updates and search-side query rewriting).
224
+ /**
225
+ * @primitive b.mailStore.fts.hashText
226
+ * @signature b.mailStore.fts.hashText(table, field, text)
227
+ * @since 0.11.25
228
+ * @status stable
229
+ *
230
+ * Tokenize + hash + join in one step. Convenience wrapper —
231
+ * equivalent to `hashTokens(table, field, tokenize(text))`.
232
+ *
233
+ * @example
234
+ * b.mailStore.fts.hashText("mail_messages", "body", "kubernetes deploy");
235
+ * // → "<16hex> <16hex>"
236
+ */
237
function hashText(table, field, text) {
  // Two-step pipeline: split the text into tokens, then hash and join
  // them under the (table, field) namespace.
  var tokens = tokenize(text);
  return hashTokens(table, field, tokens);
}
240
+
241
+ // Build the FTS row body for one message. Subject + body tokens get
242
+ // their own FTS columns; from + to addresses share `addr_toks` so a
243
+ // search for an address hits regardless of which side it's on. The
244
+ // `addr_toks` namespace is a single pseudo-field "addr" so the index
245
+ // + query sides hash identically regardless of which header carried
246
+ // the token — `{from: "alice@x"}` and `{to: "alice@x"}` BOTH hit a
247
+ // row that mentions alice@x in EITHER header.
248
+ /**
249
+ * @primitive b.mailStore.fts.rowFromMessage
250
+ * @signature b.mailStore.fts.rowFromMessage(table, msg)
251
+ * @since 0.11.25
252
+ * @status stable
253
+ *
254
+ * Build the FTS5 row payload `{ objectid, subject_toks, addr_toks,
255
+ * body_toks }` from a `{ objectid, subject, from, to, body }`
256
+ * plaintext message. `from` + `to` share `addr_toks`.
257
+ *
258
+ * @example
259
+ * b.mailStore.fts.rowFromMessage("t", { objectid:"o1", subject:"Hi", from:"a@x", to:"b@x", body:"hello" });
260
+ * // → { objectid:"o1", subject_toks:"<hash>", addr_toks:"<hash> <hash>", body_toks:"<hash>" }
261
+ */
262
function rowFromMessage(table, msg) {
  // Sender and recipient tokens are pooled and hashed under the single
  // "addr" pseudo-field, so an address hashes the same whichever
  // header carried it. Missing fields are treated as empty strings.
  var senderTokens = tokenize(msg.from || "");
  var recipientTokens = tokenize(msg.to || "");

  var row = { objectid: msg.objectid };
  row.subject_toks = hashText(table, "subject", msg.subject || "");
  row.addr_toks = hashTokens(table, "addr", senderTokens.concat(recipientTokens));
  row.body_toks = hashText(table, "body", msg.body || "");
  return row;
}
271
+
272
+ // Map a query-side filter key onto the (FTS5 column, namespace pseudo-
273
+ // field) pair the indexer used. Keeps the index + query in lock-step
274
+ // so future column additions only touch this table.
275
+ //
276
+ // filter key → FTS5 column + namespace field
277
+ // subject → subject_toks + "subject"
278
+ // body → body_toks + "body"
279
+ // from / to → addr_toks + "addr"
280
+ //
281
+ // For a broad cross-column `text` query the caller iterates this
282
+ // mapping and OR's the per-column MATCH clauses.
283
var QUERY_KEY_MAP = {
  subject: { column: "subject_toks", field: "subject" },
  body: { column: "body_toks", field: "body" },
  from: { column: "addr_toks", field: "addr" },
  to: { column: "addr_toks", field: "addr" },
};

/**
 * @primitive b.mailStore.fts.columnAndFieldFor
 * @signature b.mailStore.fts.columnAndFieldFor(filterKey)
 * @since     0.11.25
 * @status    stable
 *
 * Map a search filter key (`subject` / `body` / `from` / `to`) to
 * the FTS5 column it indexes into PLUS the namespace pseudo-field
 * the indexer uses when hashing tokens. Used by the search path so
 * the query-side hash transform matches the index-side one byte-
 * for-byte. Returns `null` for any key that is not one of the four
 * mapped filter keys.
 *
 * @example
 *   b.mailStore.fts.columnAndFieldFor("from");
 *   // → { column: "addr_toks", field: "addr" }
 *   b.mailStore.fts.columnAndFieldFor("constructor");
 *   // → null
 */
function columnAndFieldFor(key) {
  // Own-property check: QUERY_KEY_MAP is a plain object, so a bare
  // `QUERY_KEY_MAP[key]` would return inherited members for keys like
  // "constructor", "toString" or "__proto__" — truthy values with no
  // `column` / `field` that a caller would mistake for a mapping.
  if (!Object.prototype.hasOwnProperty.call(QUERY_KEY_MAP, key)) return null;
  return QUERY_KEY_MAP[key] || null;
}
309
+
310
+ // Rewrite an operator query term into a FTS5 MATCH expression. The
311
+ // term is tokenized + hashed exactly like an index value, then the
312
+ // hashes are AND'd together so multi-word queries require every
313
+ // token to appear in the row. Returns null when no tokens survive
314
+ // the filter (caller should skip the FTS join in that case).
315
+ /**
316
+ * @primitive b.mailStore.fts.buildMatchExpression
317
+ * @signature b.mailStore.fts.buildMatchExpression(table, field, term)
318
+ * @since 0.11.25
319
+ * @status stable
320
+ *
321
+ * Tokenize + hash an operator's query `term` and produce the FTS5
322
+ * MATCH expression that selects rows containing every surviving
323
+ * token. Returns `null` when no tokens survive the tokenize +
324
+ * stopword filter (caller skips the FTS join in that case).
325
+ *
326
+ * @example
327
+ * var expr = b.mailStore.fts.buildMatchExpression("t", "body", "kubernetes deploy");
328
+ * // → "<16hex> AND <16hex>"
329
+ */
330
function buildMatchExpression(table, field, term) {
  // Same tokenize → hash transform the index side applies, so the
  // hashes produced here line up with the stored ones.
  var alreadyEmitted = new Set();
  var hashes = tokenize(term)
    .map(function (token) { return hashToken(table, field, token); })
    .filter(function (hash) {
      // Drop empty hashes and repeats, keeping first-seen order.
      if (!hash || alreadyEmitted.has(hash)) return false;
      alreadyEmitted.add(hash);
      return true;
    });
  // Nothing survived — null tells the caller to skip the FTS join.
  if (hashes.length === 0) return null;
  // FTS5 default operator is AND; explicit for readability.
  return hashes.join(" AND ");
}
345
+
346
+ // SQL builder — creates the FTS5 virtual table. Caller supplies the
347
+ // quoted FTS table identifier; this module owns the column layout
348
+ // and the tokenizer configuration.
349
+ /**
350
+ * @primitive b.mailStore.fts.createSql
351
+ * @signature b.mailStore.fts.createSql(qFtsTable)
352
+ * @since 0.11.25
353
+ * @status stable
354
+ *
355
+ * Returns the `CREATE VIRTUAL TABLE IF NOT EXISTS` SQL for the
356
+ * sealed-token FTS5 table. The caller passes the quoted table
357
+ * identifier (e.g. `"blamejs_mail_messages_fts"`).
358
+ *
359
+ * @example
360
+ * db.prepare(b.mailStore.fts.createSql('"mail_fts"')).run();
361
+ */
362
function createSql(qFtsTable) {
  // Argument list of the fts5(...) module: the unindexed row key, the
  // three token-hash columns, then the tokenizer option.
  // `qFtsTable` is spliced into the statement verbatim, so it must
  // already be a quoted identifier.
  var ftsArgs = [
    "objectid UNINDEXED",
    "subject_toks",
    "addr_toks",
    "body_toks",
    "tokenize = 'unicode61 remove_diacritics 2'",
  ];
  return "CREATE VIRTUAL TABLE IF NOT EXISTS " + qFtsTable +
    " USING fts5(" + ftsArgs.join(", ") + ")";
}
371
+
372
+ module.exports = {
373
+ // SQL primitives
374
+ createSql: createSql,
375
+
376
+ // Index-side
377
+ tokenize: tokenize,
378
+ hashToken: hashToken,
379
+ hashTokens: hashTokens,
380
+ hashText: hashText,
381
+ rowFromMessage: rowFromMessage,
382
+
383
+ // Query-side
384
+ buildMatchExpression: buildMatchExpression,
385
+ columnAndFieldFor: columnAndFieldFor,
386
+ QUERY_KEY_MAP: QUERY_KEY_MAP,
387
+
388
+ // Constants surfaced for tests + adjacent modules.
389
+ STOPWORDS: STOPWORDS,
390
+ MIN_TOKEN_LEN: MIN_TOKEN_LEN,
391
+ MAX_TOKEN_LEN: MAX_TOKEN_LEN,
392
+ MAX_TOKENS_PER_FIELD: MAX_TOKENS_PER_FIELD,
393
+ FTS_FIELDS: FTS_FIELDS,
394
+ };