@blamejs/core 0.11.23 → 0.11.25

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,394 @@
1
+ "use strict";
2
+ /**
3
+ * @module b.mailStore.fts
4
+ * @nav Mail
5
+ * @title Mail-store FTS (sealed-token full-text index)
6
+ * @order 250
7
+ * @slug mail-store-fts
8
+ *
9
+ * @intro
10
+ * Sealed-token full-text search index for `b.mailStore`. At
11
+ * `appendMessage` time the row's plaintext subject + addresses + body
12
+ * are tokenized, each token is hashed with the per-deployment vault
13
+ * salt (the same scheme `b.cryptoField` uses for derived-hash mirrors
14
+ * on sealed columns), and the resulting space-separated token-hash
15
+ * string is inserted into a SQLite FTS5 virtual table. Search runs
16
+ * the same tokenize → hash transform on the operator's query terms
17
+ * and issues `MATCH` against the FTS5 table — never against
18
+ * plaintext.
19
+ *
20
+ * The index is unrecoverable without the vault salt. A database
21
+ * dump leaks zero readable text — the FTS5 rows are byte-for-byte
22
+ * indistinguishable from random hashes. Per-tenant separation rides
23
+ * on the cryptoField namespace prefix (`bj-<table>-<field>:`), so
24
+ * tokens from one tenant's row can never collide with another's
25
+ * under the same vault key.
26
+ *
27
+ * Limitations of sealed-token FTS — operator-facing constraints:
28
+ *
29
+ * - Exact-token match only. No SQLite FTS5 stemmer, no porter,
30
+ * no Unicode-fold-then-stem, no NEAR with offsets. The token
31
+ * boundary IS the search granularity. Operators that need
32
+ * linguistic search at the cost of plaintext-at-rest opt in to
33
+ * a separate plaintext-FTS layer on top — not part of this
34
+ * primitive.
35
+ * - No prefix wildcard (`MATCH 'kub*'`). Token hashes don't
36
+ * preserve substring relationships. The cost of partial-match
37
+ * search is sealed-at-rest; operators get either-or.
38
+ * - Stopword filter is conservative (a / the / of / to / in / for /
39
+ * on / and / or / …; full list in `STOPWORDS`). Stopwords stay in
40
+ * the unsealed plaintext but never reach the FTS row.
41
+ * - Token length capped at 2..64 unicode codepoints after
42
+ * NFC normalisation. Tokens outside the band are dropped (too
43
+ * short = high-collision noise; too long = file-bomb shape).
44
+ *
45
+ * Posture cascade. The primitive is on by default for every
46
+ * posture (`hipaa` / `pci-dss` / `gdpr` / `soc2`) — the token
47
+ * index uses the same vault key already protecting sealed-row
48
+ * storage, so adding the FTS index doesn't widen the cryptographic
49
+ * trust boundary. A future opt-in plaintext-FTS overlay would be
50
+ * gated by a relaxed posture; this module ships sealed-only.
51
+ *
52
+ * @card
53
+ * Tokenize → vault-salted hash → FTS5 MATCH. The DB dump leaks
54
+ * nothing readable; search works against ciphertext.
55
+ */
56
+
57
+ var bCrypto = require("./crypto");
58
+ var vault = require("./vault");
59
+ var C = require("./constants");
60
+
61
// Stopword table. These words would appear in almost every row's token
// set without making any query more selective, so tokenize() drops them
// before hashing. Deliberately short and English-only for v1. The table
// has no prototype, so a lookup such as STOPWORDS["constructor"] is
// falsy rather than an inherited member.
var STOPWORDS = (
  "a an the of to in for on and or but is are was were be been by with " +
  "as at from this that it its their our your his her him us we i you " +
  "do does did not no yes if so up down out over under than then them"
).split(" ").reduce(function (table, word) {
  table[word] = true;
  return table;
}, Object.create(null));

// Per-token length band, measured in codepoints of the NFC-normalised
// token (not bytes), so multi-byte scripts are treated like ASCII.
var MIN_TOKEN_LEN = 2;
var MAX_TOKEN_LEN = 64;

// Upper bound on the number of tokens tokenize() returns for one field.
// A single 50 MiB message can produce millions of tokens; the FTS5 row
// insert + index update must stay bounded. tokenize() applies the cap
// to tokens that already passed the length + stopword + dedupe filters.
var MAX_TOKENS_PER_FIELD = 8192; // allow:raw-byte-literal — token-count cap, not bytes

// Plaintext source field → FTS5 column name. `from` and `to` share the
// single `addr_toks` column.
var FTS_FIELDS = {
  subject: "subject_toks",
  from:    "addr_toks",
  to:      "addr_toks",
  body:    "body_toks",
};

// Tokenizer input budget. tokenize() splits on every codepoint that is
// not a letter (\p{L}), a digit (\p{N}) or an apostrophe, so an email
// address breaks at `@` and `.` into local part + domain labels, and
// leading/trailing apostrophes are stripped from each token. Input
// larger than MAX_INPUT_BYTES is truncated (not rejected) before the
// split, which bounds tokenizer work on DoS-shaped messages such as a
// 50 MiB body made of a single token.
var MAX_INPUT_BYTES = C.BYTES.mib(8); // 8 MiB
105
+
106
/**
 * @primitive b.mailStore.fts.tokenize
 * @signature b.mailStore.fts.tokenize(text)
 * @since     0.11.25
 * @status    stable
 *
 * Split `text` into a deduplicated, lowercased, NFC-normalised token
 * array. Drops stopwords + tokens outside the MIN_TOKEN_LEN..
 * MAX_TOKEN_LEN codepoint band, and returns at most
 * MAX_TOKENS_PER_FIELD tokens. Splits on every codepoint that is not a
 * letter, digit or apostrophe, including the `@` + `.` boundaries of
 * email addresses, so local-part + domain labels become independent
 * tokens.
 *
 * Non-string and empty input yields `[]`. Input whose UTF-8 encoding
 * exceeds MAX_INPUT_BYTES is truncated to the longest prefix that fits
 * in that many bytes (cut on a character boundary) rather than
 * rejected, so large messages are still indexed by their prefix.
 *
 * @example
 *   b.mailStore.fts.tokenize("Hello world from alice@example.com");
 *   // → ["hello", "world", "alice", "example", "com"]
 */
function tokenize(text) {
  if (typeof text !== "string" || text.length === 0) return [];
  if (Buffer.byteLength(text, "utf8") > MAX_INPUT_BYTES) {
    // The budget is expressed in BYTES; slicing by UTF-16 code units
    // would let multi-byte text through at up to 3x the budget.
    text = _truncateToUtf8Bytes(text, MAX_INPUT_BYTES);
  }
  // NFC normalise so visually-identical tokens hash to the same value
  // regardless of the source's encoding form.
  var nfc = text.normalize("NFC").toLowerCase();
  // Split on any run of characters that is NOT a letter, digit, or
  // apostrophe. `\p{L}` + `\p{N}` need the `u` flag.
  var rawTokens = nfc.split(/[^\p{L}\p{N}']+/u);
  var seen = Object.create(null);
  var out = [];
  for (var i = 0; i < rawTokens.length && out.length < MAX_TOKENS_PER_FIELD; i++) {
    // Drop leading/trailing apostrophes that survived the split;
    // intra-word apostrophes (`don't`) are kept.
    var t = rawTokens[i].replace(/^[']+/, "").replace(/[']+$/, "");
    if (!t) continue;
    // Count CODEPOINTS, not UTF-16 units.
    var len = Array.from(t).length;
    if (len < MIN_TOKEN_LEN || len > MAX_TOKEN_LEN) continue;
    if (STOPWORDS[t] || seen[t]) continue;
    seen[t] = true;
    out.push(t);
  }
  return out;
}

// Longest prefix of `text` whose UTF-8 encoding fits in `maxBytes`,
// never ending in the middle of a multi-byte character.
function _truncateToUtf8Bytes(text, maxBytes) {
  // Every UTF-16 code unit encodes to at least one UTF-8 byte, so the
  // first `maxBytes` code units always cover the byte budget. Encoding
  // only that head keeps the temporary buffer at <= 3 * maxBytes.
  var head = text.slice(0, maxBytes);
  var encoded = Buffer.from(head, "utf8");
  if (encoded.length <= maxBytes) return head;
  // Back off while the first excluded byte is a UTF-8 continuation
  // byte (10xxxxxx) — that means the cut would split a character.
  var end = maxBytes;
  while (end > 0 && (encoded[end] & 0xC0) === 0x80) end -= 1;
  return encoded.toString("utf8", 0, end);
}
156
+
157
+ // Hash one token using the same scheme cryptoField uses for derived-
158
+ // hash mirrors: `sha3Hash(vaultSalt + namespace + token)`. The
159
+ // namespace is per-table, per-field, per-purpose ("fts") so that
160
+ // rotating an operator's vault salt invalidates every FTS row in
161
+ // the same step as every sealed column. Returns a 16-char hex prefix
162
+ // — full 64-char SHA3 is overkill for FTS hash space, and shorter
163
+ // tokens compress the FTS5 row 4x without observable collision risk
164
+ // at corpus sizes the framework targets (≤ 10^9 unique tokens, where
165
+ // 64-bit collision space leaves the birthday bound > 10^9).
166
/**
 * @primitive b.mailStore.fts.hashToken
 * @signature b.mailStore.fts.hashToken(table, field, token)
 * @since     0.11.25
 * @status    stable
 *
 * Salted hash of one token under the (table, field) namespace. The
 * hashed input is `<salt hex> + "bj-<table>-<field>:fts:" + token`,
 * where the salt comes from `vault.getDerivedHashSalt()`. Returns the
 * first 16 characters of the `bCrypto.sha3Hash` output, or `""` when
 * `token` is not a non-empty string.
 *
 * @example
 *   var h = b.mailStore.fts.hashToken("mail_messages", "body", "kubernetes");
 *   /^[0-9a-f]{16}$/.test(h);   // → true
 */
function hashToken(table, field, token) {
  var hasToken = typeof token === "string" && token.length > 0;
  if (!hasToken) return "";
  // FTS fields are pseudo-fields with no sealed-column registration, so
  // the namespace is always built from the literal table + field names.
  var namespace = "bj-" + table + "-" + field + ":fts:";
  var rawSalt = vault.getDerivedHashSalt();
  // NOTE(review): a missing salt silently degrades to an UNSALTED hash
  // (empty prefix). Confirm the vault can never return a falsy salt
  // here, otherwise the index becomes dictionary-attackable.
  var saltHex = "";
  if (rawSalt && typeof rawSalt.toString === "function") {
    saltHex = rawSalt.toString("hex");
  }
  var digest = bCrypto.sha3Hash(saltHex + namespace + token);
  return digest.slice(0, 16); // allow:raw-byte-literal — 16-char hex prefix length, not bytes
}
192
+
193
+ // Hash a token array → space-separated string suitable for FTS5
194
+ // row insertion. The output is what gets MATCH'd at query time.
195
/**
 * @primitive b.mailStore.fts.hashTokens
 * @signature b.mailStore.fts.hashTokens(table, field, tokens)
 * @since     0.11.25
 * @status    stable
 *
 * Hash every entry of `tokens` with `hashToken(table, field, …)` and
 * join the results with single spaces, in first-seen order. Entries
 * that hash to `""` and repeated hashes are left out. Returns `""`
 * when `tokens` is not an array or is empty.
 *
 * @example
 *   b.mailStore.fts.hashTokens("t", "subject", ["hello", "world"]);
 *   // → "<16hex> <16hex>"
 */
function hashTokens(table, field, tokens) {
  if (!Array.isArray(tokens) || tokens.length === 0) return "";
  // A Set keeps insertion order, which gives dedupe + stable output.
  var emitted = new Set();
  tokens.forEach(function (token) {
    var digest = hashToken(table, field, token);
    if (digest) emitted.add(digest);
  });
  return Array.from(emitted).join(" ");
}
221
+
222
+ // Tokenize + hash + join in one step (the common path for both
223
+ // append-side index updates and search-side query rewriting).
224
/**
 * @primitive b.mailStore.fts.hashText
 * @signature b.mailStore.fts.hashText(table, field, text)
 * @since     0.11.25
 * @status    stable
 *
 * One-step convenience: run `tokenize(text)` and feed the resulting
 * token array to `hashTokens(table, field, …)`. Returns whatever
 * `hashTokens` returns for those tokens.
 *
 * @example
 *   b.mailStore.fts.hashText("mail_messages", "body", "kubernetes deploy");
 *   // → "<16hex> <16hex>"
 */
function hashText(table, field, text) {
  var tokens = tokenize(text);
  return hashTokens(table, field, tokens);
}
240
+
241
+ // Build the FTS row body for one message. Subject + body tokens get
242
+ // their own FTS columns; from + to addresses share `addr_toks` so a
243
+ // search for an address hits regardless of which side it's on. The
244
+ // `addr_toks` namespace is a single pseudo-field "addr" so the index
245
+ // + query sides hash identically regardless of which header carried
246
+ // the token — `{from: "alice@x"}` and `{to: "alice@x"}` BOTH hit a
247
+ // row that mentions alice@x in EITHER header.
248
/**
 * @primitive b.mailStore.fts.rowFromMessage
 * @signature b.mailStore.fts.rowFromMessage(table, msg)
 * @since     0.11.25
 * @status    stable
 *
 * Build the FTS5 row payload `{ objectid, subject_toks, addr_toks,
 * body_toks }` from a `{ objectid, subject, from, to, body }`
 * plaintext message. `from` + `to` share `addr_toks` and are hashed
 * under the single pseudo-field `"addr"`. Each of `from` / `to` may be
 * a string or an array of strings (e.g. a recipient list); missing
 * fields contribute no tokens.
 *
 * @example
 *   b.mailStore.fts.rowFromMessage("t", { objectid:"o1", subject:"Hi", from:"a@x", to:"b@x", body:"hello" });
 *   // → { objectid:"o1", subject_toks:"<hash>", addr_toks:"<hash> <hash>", body_toks:"<hash>" }
 */
function rowFromMessage(table, msg) {
  var addrTokens = _addressTokens(msg.from).concat(_addressTokens(msg.to));
  return {
    objectid:     msg.objectid,
    subject_toks: hashText(table, "subject", msg.subject || ""),
    addr_toks:    hashTokens(table, "addr", addrTokens),
    body_toks:    hashText(table, "body", msg.body || ""),
  };
}

// Tokenize one address header value. An array-valued header (a list of
// address strings) is tokenized entry by entry; tokenize() itself
// returns [] for anything that is not a string, so passing the array
// straight through would silently index nothing.
function _addressTokens(value) {
  if (!Array.isArray(value)) return tokenize(value || "");
  var collected = [];
  for (var i = 0; i < value.length; i++) {
    var entryTokens = tokenize(value[i]);
    for (var j = 0; j < entryTokens.length; j++) collected.push(entryTokens[j]);
  }
  return collected;
}
271
+
272
// Map a query-side filter key onto the (FTS5 column, namespace pseudo-
// field) pair the indexer used. Keeps the index + query in lock-step
// so future column additions only touch this table.
//
//   filter key  →  FTS5 column   +  namespace field
//   subject     →  subject_toks  +  "subject"
//   body        →  body_toks     +  "body"
//   from / to   →  addr_toks     +  "addr"
//
// For a broad cross-column `text` query the caller iterates this
// mapping and OR's the per-column MATCH clauses.
var QUERY_KEY_MAP = {
  subject: { column: "subject_toks", field: "subject" },
  body:    { column: "body_toks",    field: "body"    },
  from:    { column: "addr_toks",    field: "addr"    },
  to:      { column: "addr_toks",    field: "addr"    },
};

/**
 * @primitive b.mailStore.fts.columnAndFieldFor
 * @signature b.mailStore.fts.columnAndFieldFor(filterKey)
 * @since     0.11.25
 * @status    stable
 *
 * Map a search filter key (`subject` / `body` / `from` / `to`) to
 * the FTS5 column it indexes into PLUS the namespace pseudo-field
 * the indexer uses when hashing tokens. Used by the search path so
 * the query-side hash transform matches the index-side one byte-
 * for-byte. Returns `null` for any key that is not one of the four
 * mapped keys — including names such as `toString` / `constructor`
 * that a plain-object lookup would otherwise resolve through
 * `Object.prototype`.
 *
 * @example
 *   b.mailStore.fts.columnAndFieldFor("from");
 *   // → { column: "addr_toks", field: "addr" }
 */
function columnAndFieldFor(key) {
  // Own-property check: QUERY_KEY_MAP is a plain object literal, so a
  // bare `QUERY_KEY_MAP[key]` would return inherited members.
  if (!Object.prototype.hasOwnProperty.call(QUERY_KEY_MAP, key)) return null;
  return QUERY_KEY_MAP[key] || null;
}
309
+
310
+ // Rewrite an operator query term into a FTS5 MATCH expression. The
311
+ // term is tokenized + hashed exactly like an index value, then the
312
+ // hashes are AND'd together so multi-word queries require every
313
+ // token to appear in the row. Returns null when no tokens survive
314
+ // the filter (caller should skip the FTS join in that case).
315
/**
 * @primitive b.mailStore.fts.buildMatchExpression
 * @signature b.mailStore.fts.buildMatchExpression(table, field, term)
 * @since     0.11.25
 * @status    stable
 *
 * Turn an operator's query `term` into an FTS5 MATCH expression: the
 * term goes through `tokenize`, each token through
 * `hashToken(table, field, …)`, and the distinct non-empty hashes are
 * joined with ` AND ` so a row must contain every one of them.
 * Returns `null` when no token (or no hash) survives; the caller
 * skips the FTS join in that case.
 *
 * @example
 *   var expr = b.mailStore.fts.buildMatchExpression("t", "body", "kubernetes deploy");
 *   // → "<16hex> AND <16hex>"
 */
function buildMatchExpression(table, field, term) {
  var tokens = tokenize(term);
  if (tokens.length === 0) return null;
  // Insertion-ordered Set: dedupes hashes while keeping token order.
  var distinct = new Set();
  tokens.forEach(function (token) {
    var digest = hashToken(table, field, token);
    if (digest) distinct.add(digest);
  });
  if (distinct.size === 0) return null;
  // FTS5 default operator is AND; explicit for readability.
  return Array.from(distinct).join(" AND ");
}
345
+
346
+ // SQL builder — creates the FTS5 virtual table. Caller supplies the
347
+ // quoted FTS table identifier; this module owns the column layout
348
+ // and tokenizer configuration.
349
/**
 * @primitive b.mailStore.fts.createSql
 * @signature b.mailStore.fts.createSql(qFtsTable)
 * @since     0.11.25
 * @status    stable
 *
 * Build the `CREATE VIRTUAL TABLE IF NOT EXISTS … USING fts5(…)`
 * statement for the sealed-token index. `qFtsTable` is spliced into
 * the SQL verbatim, so the caller must pass an already-quoted
 * identifier (e.g. `"blamejs_mail_messages_fts"`). The table has an
 * unindexed `objectid` column plus `subject_toks`, `addr_toks` and
 * `body_toks`, and uses the `unicode61 remove_diacritics 2` tokenizer.
 *
 * @example
 *   db.prepare(b.mailStore.fts.createSql('"mail_fts"')).run();
 */
function createSql(qFtsTable) {
  var definition = [
    "objectid UNINDEXED",
    "subject_toks",
    "addr_toks",
    "body_toks",
    "tokenize = 'unicode61 remove_diacritics 2'",
  ].join(", ");
  return "CREATE VIRTUAL TABLE IF NOT EXISTS " + qFtsTable + " USING fts5(" + definition + ")";
}
371
+
372
+ module.exports = {
373
+ // SQL primitives
374
+ createSql: createSql,
375
+
376
+ // Index-side
377
+ tokenize: tokenize,
378
+ hashToken: hashToken,
379
+ hashTokens: hashTokens,
380
+ hashText: hashText,
381
+ rowFromMessage: rowFromMessage,
382
+
383
+ // Query-side
384
+ buildMatchExpression: buildMatchExpression,
385
+ columnAndFieldFor: columnAndFieldFor,
386
+ QUERY_KEY_MAP: QUERY_KEY_MAP,
387
+
388
+ // Constants surfaced for tests + adjacent modules.
389
+ STOPWORDS: STOPWORDS,
390
+ MIN_TOKEN_LEN: MIN_TOKEN_LEN,
391
+ MAX_TOKEN_LEN: MAX_TOKEN_LEN,
392
+ MAX_TOKENS_PER_FIELD: MAX_TOKENS_PER_FIELD,
393
+ FTS_FIELDS: FTS_FIELDS,
394
+ };
package/lib/mail-store.js CHANGED
@@ -63,6 +63,7 @@ var cryptoField = require("./crypto-field");
63
63
  var safeMime = require("./safe-mime");
64
64
  var safeSql = require("./safe-sql");
65
65
  var guardMessageId = require("./guard-message-id");
66
+ var mailStoreFts = require("./mail-store-fts");
66
67
  var { defineClass } = require("./framework-error");
67
68
 
68
69
  var MailStoreError = defineClass("MailStoreError", { alwaysPermanent: true });
@@ -126,6 +127,7 @@ function create(opts) {
126
127
  var qFolders = safeSql.quoteIdentifier(prefix + "_folders", "sqlite");
127
128
  var qFlags = safeSql.quoteIdentifier(prefix + "_flags", "sqlite");
128
129
  var qQuota = safeSql.quoteIdentifier(prefix + "_quota", "sqlite");
130
+ var qFts = safeSql.quoteIdentifier(prefix + "_messages_fts", "sqlite");
129
131
  var messagesTable = prefix + "_messages";
130
132
 
131
133
  var maxMessageBytes = opts.maxMessageBytes !== undefined ? opts.maxMessageBytes : DEFAULT_MAX_MESSAGE_BYTES;
@@ -147,7 +149,7 @@ function create(opts) {
147
149
  });
148
150
 
149
151
  if (doInit) {
150
- _ensureSchema(db, qMsgs, qFolders, qFlags, qQuota);
152
+ _ensureSchema(db, qMsgs, qFolders, qFlags, qQuota, qFts);
151
153
  _ensureDefaultFolders(db, qFolders);
152
154
  }
153
155
 
@@ -195,12 +197,27 @@ function create(opts) {
195
197
  " WHERE folder_id = ? AND objectid IN (SELECT value FROM json_each(?))");
196
198
  var stmtDeleteMsg = db.prepare("DELETE FROM " + qMsgs + " WHERE objectid = ?");
197
199
  var stmtDeleteFlags = db.prepare("DELETE FROM " + qFlags + " WHERE objectid = ?");
200
+ // Sealed-token FTS5 prepared statements — index sync runs in the
201
+ // same transaction window as the canonical row mutation so a crash
202
+ // between the two cannot leave the FTS index out of step with the
203
+ // messages table. See lib/mail-store-fts.js for the tokenize +
204
+ // vault-salted-hash transform applied here.
205
+ var stmtInsertFts = db.prepare(
206
+ "INSERT INTO " + qFts + " (objectid, subject_toks, addr_toks, body_toks) VALUES (?, ?, ?, ?)");
207
+ var stmtDeleteFts = db.prepare("DELETE FROM " + qFts + " WHERE objectid = ?");
198
208
 
199
209
  return {
200
210
  appendMessage: function (folderName, rawBytes, appendOpts) {
201
- return _appendMessage({
211
+ // Wrap canonical row insert + FTS row insert in a single backend
212
+ // transaction so a crash / FTS-row failure CANNOT leave a message
213
+ // persisted but unsearchable (state drift). better-sqlite3-style
214
+ // backends expose `.transaction(fn)()`; backends without
215
+ // transactions fall back to per-statement (the FTS insert is the
216
+ // last write, so partial state == still consistent to the reader).
217
+ var args = {
202
218
  db: db, qMsgs: qMsgs, qFlags: qFlags, messagesTable: messagesTable,
203
219
  stmtInsertMsg: stmtInsertMsg,
220
+ stmtInsertFts: stmtInsertFts,
204
221
  stmtBumpFolderModseq: stmtBumpFolderModseq,
205
222
  stmtGetFolderByName: stmtGetFolderByName,
206
223
  stmtFindThreadByMsgId: stmtFindThreadByMsgId,
@@ -209,7 +226,13 @@ function create(opts) {
209
226
  safeMimeOpts: safeMimeOpts,
210
227
  maxMessageBytes: maxMessageBytes,
211
228
  maxBodyBytes: maxBodyBytes,
212
- });
229
+ };
230
+ if (typeof db.transaction === "function") {
231
+ var result;
232
+ db.transaction(function () { result = _appendMessage(args); })();
233
+ return result;
234
+ }
235
+ return _appendMessage(args);
213
236
  },
214
237
  fetchByObjectId: function (folderName, objectid) {
215
238
  return _fetchByObjectId({
@@ -220,6 +243,97 @@ function create(opts) {
220
243
  folderName: folderName, objectid: objectid,
221
244
  });
222
245
  },
246
+ /**
247
+ * search — sealed-token full-text search inside a single folder.
248
+ *
249
+ * Composes the FTS5 virtual table populated by `appendMessage`.
250
+ * Each filter term is tokenized + vault-salted-hashed exactly like
251
+ * the index side, then issued as an FTS5 `MATCH` expression
252
+ * intersected with the modseq + flag window. Result rows carry the
253
+ * SAME shape as `queryByModseq` so operators iterate either path
254
+ * symmetrically.
255
+ *
256
+ * `filter` accepts (any subset; all present terms AND-combine):
257
+ * - text: match across subject + addr + body
258
+ * - subject: match against `subject_toks` column only
259
+ * - body: match against `body_toks` column only
260
+ * - from / to: match against `addr_toks`
261
+ * - sinceModseq: integer floor
262
+ * - limit: result cap (default 100, hard cap 1000)
263
+ *
264
+ * When NO text-side filter is present, falls through to the
265
+ * `queryByModseq` path — search is purely additive on the existing
266
+ * modseq cursor.
267
+ */
268
+ search: function (folderName, filter) {
269
+ var folder = stmtGetFolderByName.get(folderName);
270
+ if (!folder) {
271
+ throw new MailStoreError("mail-store/no-folder",
272
+ "search: folder '" + folderName + "' not found");
273
+ }
274
+ var f = filter || {};
275
+ var sinceModseq = f.sinceModseq || 0;
276
+ var limit = f.limit || 100;
277
+ if (limit > 1000) limit = 1000; // allow:raw-byte-literal — query row cap, not bytes
278
+
279
+ var matchClauses = [];
280
+ function addMatch(filterKey, term) {
281
+ if (!term) return;
282
+ var m = mailStoreFts.columnAndFieldFor(filterKey);
283
+ if (!m) return;
284
+ var expr = mailStoreFts.buildMatchExpression(messagesTable, m.field, term);
285
+ if (expr) matchClauses.push(m.column + ":(" + expr + ")");
286
+ }
287
+ if (f.subject) addMatch("subject", f.subject);
288
+ if (f.body) addMatch("body", f.body);
289
+ if (f.from) addMatch("from", f.from);
290
+ if (f.to) addMatch("to", f.to);
291
+ if (f.text) {
292
+ var perCol = ["subject", "body", "from"].map(function (key) {
293
+ var m = mailStoreFts.columnAndFieldFor(key);
294
+ var perColExpr = mailStoreFts.buildMatchExpression(messagesTable, m.field, f.text);
295
+ return perColExpr ? "(" + m.column + ":(" + perColExpr + "))" : null;
296
+ }).filter(Boolean);
297
+ if (perCol.length > 0) {
298
+ matchClauses.push("(" + perCol.join(" OR ") + ")");
299
+ }
300
+ }
301
+
302
+ if (matchClauses.length === 0) {
303
+ var fallback = stmtQueryByModseq.all(folder.id, sinceModseq, limit);
304
+ return {
305
+ rows: fallback.map(function (r) {
306
+ return {
307
+ objectid: r.objectid, modseq: r.modseq, sizeBytes: r.size_bytes,
308
+ internalDate: r.internal_date, legalHold: r.legal_hold === 1,
309
+ };
310
+ }),
311
+ nextModseq: fallback.length > 0 ? fallback[fallback.length - 1].modseq : sinceModseq,
312
+ };
313
+ }
314
+
315
+ var matchExpr = matchClauses.join(" AND ");
316
+ // FTS5 MATCH binds to the virtual-table name — aliases / joined-
317
+ // table refs are parsed as ordinary column refs and fail. The
318
+ // IN-subquery shape sidesteps that.
319
+ var sql =
320
+ "SELECT objectid, modseq, size_bytes, internal_date, legal_hold " +
321
+ "FROM " + qMsgs + " " +
322
+ "WHERE folder_id = ? AND modseq > ? " +
323
+ "AND objectid IN (SELECT objectid FROM " + qFts + " WHERE " + qFts + " MATCH ?) " +
324
+ "ORDER BY modseq ASC LIMIT ?";
325
+ var rows = db.prepare(sql).all(folder.id, sinceModseq, matchExpr, limit);
326
+ return {
327
+ rows: rows.map(function (r) {
328
+ return {
329
+ objectid: r.objectid, modseq: r.modseq, sizeBytes: r.size_bytes,
330
+ internalDate: r.internal_date, legalHold: r.legal_hold === 1,
331
+ };
332
+ }),
333
+ nextModseq: rows.length > 0 ? rows[rows.length - 1].modseq : sinceModseq,
334
+ matchExpr: matchExpr,
335
+ };
336
+ },
223
337
  queryByModseq: function (folderName, queryOpts) {
224
338
  var folder = stmtGetFolderByName.get(folderName);
225
339
  if (!folder) {
@@ -368,6 +482,7 @@ function create(opts) {
368
482
  function _runTxn() {
369
483
  for (var di = 0; di < toDelete.length; di += 1) {
370
484
  stmtDeleteFlags.run(toDelete[di].objectid);
485
+ stmtDeleteFts.run(toDelete[di].objectid);
371
486
  stmtDeleteMsg.run(toDelete[di].objectid);
372
487
  totalBytes += toDelete[di].size_bytes || 0;
373
488
  }
@@ -512,6 +627,18 @@ function _appendMessage(args) {
512
627
  args.stmtBumpFolderModseq.run(modseq, args.folderName);
513
628
  args.stmtBumpQuota.run(folder.id, buf.length, 1);
514
629
 
630
+ // FTS index update — tokenize the PRE-seal plaintext, hash each
631
+ // token with the per-deployment vault salt, insert into the FTS5
632
+ // virtual table.
633
+ var ftsRow = mailStoreFts.rowFromMessage(args.messagesTable, {
634
+ objectid: objectid,
635
+ subject: subject,
636
+ from: fromAddr,
637
+ to: toAddrs,
638
+ body: bodyText,
639
+ });
640
+ args.stmtInsertFts.run(ftsRow.objectid, ftsRow.subject_toks, ftsRow.addr_toks, ftsRow.body_toks);
641
+
515
642
  return { objectid: objectid, modseq: modseq, sizeBytes: buf.length, threadRootId: threadRootId };
516
643
  }
517
644
 
@@ -706,7 +833,7 @@ function _normalizeMsgId(s) {
706
833
 
707
834
  // ---- Schema bootstrap ----------------------------------------------------
708
835
 
709
- function _ensureSchema(db, qMsgs, qFolders, qFlags, qQuota) {
836
+ function _ensureSchema(db, qMsgs, qFolders, qFlags, qQuota, qFts) {
710
837
  // Folders table — created first since messages reference folder_id.
711
838
  db.prepare(
712
839
  "CREATE TABLE IF NOT EXISTS " + qFolders + " (" +
@@ -775,6 +902,13 @@ function _ensureSchema(db, qMsgs, qFolders, qFlags, qQuota) {
775
902
  "cap_count INTEGER, " +
776
903
  "FOREIGN KEY(folder_id) REFERENCES " + qFolders + "(id))"
777
904
  ).run();
905
+
906
+ // Sealed-token FTS5 virtual table. The token-hash transform lives in
907
+ // `lib/mail-store-fts.js`; this is the storage layer. Tokenizer is
908
+ // `unicode61 remove_diacritics 2` so FTS5's segmenter splits hash-
909
+ // tokens on whitespace exactly — hashes are ASCII-hex-only, so no
910
+ // Unicode case-fold runs at MATCH time.
911
+ db.prepare(mailStoreFts.createSql(qFts)).run();
778
912
  }
779
913
 
780
914
  function _ensureDefaultFolders(db, qFolders) {
@@ -788,4 +922,8 @@ module.exports = {
788
922
  create: create,
789
923
  DEFAULT_FOLDERS: DEFAULT_FOLDERS,
790
924
  MailStoreError: MailStoreError,
925
+ // Sealed-token FTS substrate. Exposed for adjacent primitives (e.g.
926
+ // wire-protocol adapters translating IMAP SEARCH TEXT into the
927
+ // store's FTS5 column expression).
928
+ fts: mailStoreFts,
791
929
  };