@blamejs/core 0.11.24 → 0.11.25
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +2 -0
- package/index.js +5 -0
- package/lib/auth/bot-challenge.js +573 -0
- package/lib/framework-error.js +6 -0
- package/lib/fsm.js +469 -0
- package/lib/guard-mail-query.js +14 -0
- package/lib/mail-agent.js +24 -10
- package/lib/mail-store-fts.js +394 -0
- package/lib/mail-store.js +142 -4
- package/lib/money.js +699 -0
- package/lib/webhook.js +229 -0
- package/package.json +1 -1
- package/sbom.cdx.json +6 -6
|
@@ -0,0 +1,394 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
/**
|
|
3
|
+
* @module b.mailStore.fts
|
|
4
|
+
* @nav Mail
|
|
5
|
+
* @title Mail-store FTS (sealed-token full-text index)
|
|
6
|
+
* @order 250
|
|
7
|
+
* @slug mail-store-fts
|
|
8
|
+
*
|
|
9
|
+
* @intro
|
|
10
|
+
* Sealed-token full-text search index for `b.mailStore`. At
|
|
11
|
+
* `appendMessage` time the row's plaintext subject + addresses + body
|
|
12
|
+
* are tokenized, each token is hashed with the per-deployment vault
|
|
13
|
+
* salt (the same scheme `b.cryptoField` uses for derived-hash mirrors
|
|
14
|
+
* on sealed columns), and the resulting space-separated token-hash
|
|
15
|
+
* string is inserted into a SQLite FTS5 virtual table. Search runs
|
|
16
|
+
* the same tokenize → hash transform on the operator's query terms
|
|
17
|
+
* and issues `MATCH` against the FTS5 table — never against
|
|
18
|
+
* plaintext.
|
|
19
|
+
*
|
|
20
|
+
* The index is unrecoverable without the vault salt. A database
|
|
21
|
+
* dump leaks zero readable text — the FTS5 rows are byte-for-byte
|
|
22
|
+
* indistinguishable from random hashes. Per-tenant separation rides
|
|
23
|
+
* on the cryptoField namespace prefix (`bj-<table>-<field>:`), so
|
|
24
|
+
* tokens from one tenant's row can never collide with another's
|
|
25
|
+
* under the same vault key.
|
|
26
|
+
*
|
|
27
|
+
* Limitations of sealed-token FTS — operator-facing constraints:
|
|
28
|
+
*
|
|
29
|
+
* - Exact-token match only. No SQLite FTS5 stemmer, no porter,
|
|
30
|
+
* no Unicode-fold-then-stem, no NEAR with offsets. The token
|
|
31
|
+
* boundary IS the search granularity. Operators that need
|
|
32
|
+
* linguistic search at the cost of plaintext-at-rest opt in to
|
|
33
|
+
* a separate plaintext-FTS layer on top — not part of this
|
|
34
|
+
* primitive.
|
|
35
|
+
* - No prefix wildcard (`MATCH 'kub*'`). Token hashes don't
|
|
36
|
+
* preserve substring relationships. The cost of partial-match
|
|
37
|
+
* search is sealed-at-rest; operators get either-or.
|
|
38
|
+
* - Stopword filter is conservative (e.g. a / the / of / to / in /
|
|
39
|
+
* for / on / and / or / is / are / be / by). Stopwords land
|
|
40
|
+
* in the unsealed plaintext but never reach the FTS row.
|
|
41
|
+
* - Token length capped at 2..64 unicode codepoints after
|
|
42
|
+
* NFC normalisation. Tokens outside the band are dropped (too
|
|
43
|
+
* short = high-collision noise; too long = file-bomb shape).
|
|
44
|
+
*
|
|
45
|
+
* Posture cascade. The primitive is on by default for every
|
|
46
|
+
* posture (`hipaa` / `pci-dss` / `gdpr` / `soc2`) — the token
|
|
47
|
+
* index uses the same vault key already protecting sealed-row
|
|
48
|
+
* storage, so adding the FTS index doesn't widen the cryptographic
|
|
49
|
+
* trust boundary. A future opt-in plaintext-FTS overlay would be
|
|
50
|
+
* gated by a relaxed posture; this module ships sealed-only.
|
|
51
|
+
*
|
|
52
|
+
* @card
|
|
53
|
+
* Tokenize → vault-salted hash → FTS5 MATCH. The DB dump leaks
|
|
54
|
+
* nothing readable; search works against ciphertext.
|
|
55
|
+
*/
|
|
56
|
+
|
|
57
|
+
var bCrypto = require("./crypto");
|
|
58
|
+
var vault = require("./vault");
|
|
59
|
+
var C = require("./constants");
|
|
60
|
+
|
|
61
|
+
// Stopword table. These words are filtered out before any hashing
// happens: they would show up in almost every row's token set while
// adding no selectivity to a query. The list is intentionally small
// so v1 stays locale-neutral. The table has a null prototype, so a
// lookup such as STOPWORDS["constructor"] cannot hit an inherited
// member.
var STOPWORDS = (
  "a an the of to in for on and or but is are was were be been by with " +
  "as at from this that it its their our your his her him us we i you " +
  "do does did not no yes if so up down out over under than then them"
).split(" ").reduce(function (table, word) {
  table[word] = true;
  return table;
}, Object.create(null));
|
|
70
|
+
|
|
71
|
+
// Per-token bounds. NFC-normalised codepoint count, not byte length —
|
|
72
|
+
// tokens carrying multi-byte UTF-8 are not penalised relative to ASCII.
|
|
73
|
+
var MIN_TOKEN_LEN = 2;
|
|
74
|
+
var MAX_TOKEN_LEN = 64;
|
|
75
|
+
|
|
76
|
+
// Per-row token-set cap. A single 50 MiB message can produce
|
|
77
|
+
// millions of tokens; FTS5 row insert + index update must stay
|
|
78
|
+
// bounded. The cap is applied AFTER stopword + length filter so the
|
|
79
|
+
// surviving tokens are the highest-signal subset.
|
|
80
|
+
var MAX_TOKENS_PER_FIELD = 8192; // allow:raw-byte-literal — token-count cap, not bytes
|
|
81
|
+
|
|
82
|
+
// Per-field FTS column names. Kept symmetric with the messages table
|
|
83
|
+
// columns so callers can reason about which FTS column corresponds
|
|
84
|
+
// to which plaintext source field.
|
|
85
|
+
var FTS_FIELDS = {
|
|
86
|
+
subject: "subject_toks",
|
|
87
|
+
from: "addr_toks",
|
|
88
|
+
to: "addr_toks",
|
|
89
|
+
body: "body_toks",
|
|
90
|
+
};
|
|
91
|
+
|
|
92
|
+
// Token splitter — Unicode-aware. Splits on every non-letter,
|
|
93
|
+
// non-digit code-point (Unicode category L* or N*). Apostrophes inside
|
|
94
|
+
// a word survive (`don't` → `don't`); leading/trailing punctuation is
|
|
95
|
+
// stripped. Email addresses are split on `@` + `.` so both the local
|
|
96
|
+
// part and each domain label produce independent tokens — operators
|
|
97
|
+
// searching for `example.com` find rows whose from/to header carries
|
|
98
|
+
// `alice@example.com` AND rows that mention `example` or `com` in
|
|
99
|
+
// body prose. Stopwords prune the noisy ones.
|
|
100
|
+
//
|
|
101
|
+
// Refuses input larger than MAX_INPUT_BYTES to bound tokenizer work
|
|
102
|
+
// — protects against DoS-shaped messages whose body is 50 MiB of a
|
|
103
|
+
// single token boundary.
|
|
104
|
+
var MAX_INPUT_BYTES = C.BYTES.mib(8); // 8 MiB
|
|
105
|
+
|
|
106
|
+
/**
 * @primitive b.mailStore.fts.tokenize
 * @signature b.mailStore.fts.tokenize(text)
 * @since     0.11.25
 * @status    stable
 *
 * Split `text` into a deduplicated, lowercased, NFC-normalised token
 * array, in first-occurrence order. Non-string and empty input yield
 * `[]`. Tokens are runs of letters / digits (plus intra-word
 * apostrophes), so the `@` + `.` boundaries of an email address make
 * the local part and each domain label independent tokens. Stopwords
 * and tokens outside the MIN_TOKEN_LEN..MAX_TOKEN_LEN codepoint band
 * are dropped, and at most MAX_TOKENS_PER_FIELD tokens are returned.
 *
 * Input larger than MAX_INPUT_BYTES (measured in UTF-8 bytes) is
 * truncated to that many bytes rather than refused; only the prefix
 * is tokenized.
 *
 * @example
 *   b.mailStore.fts.tokenize("Hello world from alice@example.com");
 *   // → ["hello", "world", "alice", "example", "com"]
 */
function tokenize(text) {
  if (typeof text !== "string" || text.length === 0) return [];
  if (Buffer.byteLength(text, "utf8") > MAX_INPUT_BYTES) {
    // Truncate by BYTES, not UTF-16 code units: `text.slice(0, N)`
    // alone would let multi-byte text through at up to ~3x the cap.
    // Every code unit encodes to at least one byte, so the first N
    // bytes always come from the first N code units — slicing first
    // keeps the temporary buffer bounded. A character cut in half at
    // the byte boundary decodes to U+FFFD, which is neither a letter
    // nor a digit and therefore simply acts as a separator below.
    text = Buffer.from(text.slice(0, MAX_INPUT_BYTES), "utf8")
      .subarray(0, MAX_INPUT_BYTES)
      .toString("utf8");
  }
  // NFC normalise so visually-identical tokens hash to the same value
  // regardless of the source's encoding form.
  var nfc = text.normalize("NFC").toLowerCase();
  // Split on any run of characters that is NOT a letter, digit, or
  // apostrophe. `\p{L}` + `\p{N}` need the `u` flag.
  // NOTE(review): combining marks (`\p{M}`) count as separators here,
  // which fragments words in scripts that rely on them — confirm
  // this is intended.
  var rawTokens = nfc.split(/[^\p{L}\p{N}']+/u);
  var seen = Object.create(null);
  var out = [];
  // The cap is checked against SURVIVING tokens, i.e. after the
  // length, stopword and duplicate filters.
  for (var i = 0; i < rawTokens.length && out.length < MAX_TOKENS_PER_FIELD; i++) {
    // Drop leading/trailing apostrophes that survived the split.
    var t = rawTokens[i].replace(/^[']+/, "").replace(/[']+$/, "");
    if (!t) continue;
    // Count CODEPOINTS, not UTF-16 units.
    var len = Array.from(t).length;
    if (len < MIN_TOKEN_LEN || len > MAX_TOKEN_LEN) continue;
    if (STOPWORDS[t] || seen[t]) continue;
    seen[t] = true;
    out.push(t);
  }
  return out;
}
|
|
156
|
+
|
|
157
|
+
// Hash one token using the same scheme cryptoField uses for derived-
|
|
158
|
+
// hash mirrors: `sha3Hash(vaultSalt + namespace + token)`. The
|
|
159
|
+
// namespace is per-table, per-field, per-purpose ("fts") so that
|
|
160
|
+
// rotating an operator's vault salt invalidates every FTS row in
|
|
161
|
+
// the same step as every sealed column. Returns a 16-char hex prefix
|
|
162
|
+
// — full 64-char SHA3 is overkill for FTS hash space, and shorter
|
|
163
|
+
// tokens compress the FTS5 row 4x without observable collision risk
|
|
164
|
+
// at corpus sizes the framework targets (≤ 10^9 unique tokens, where
|
|
165
|
+
// 64-bit collision space leaves the birthday bound > 10^9).
|
|
166
|
+
/**
 * @primitive b.mailStore.fts.hashToken
 * @signature b.mailStore.fts.hashToken(table, field, token)
 * @since     0.11.25
 * @status    stable
 *
 * Produce the salted hash for a single token inside the
 * (table, field) namespace. The hashed input is the hex form of the
 * vault's derived-hash salt, followed by the namespace string
 * `bj-<table>-<field>:fts:`, followed by the token; the result is
 * the first 16 hex characters of the SHA3 digest. Because the salt is
 * part of the input, a salt rotation changes every FTS hash.
 *
 * A non-string or empty `token` returns `""`.
 *
 * @example
 *   var h = b.mailStore.fts.hashToken("mail_messages", "body", "kubernetes");
 *   /^[0-9a-f]{16}$/.test(h);   // → true
 */
function hashToken(table, field, token) {
  var usable = typeof token === "string" && token.length > 0;
  if (!usable) return "";
  // Intended to mirror cryptoField's namespace scheme; the FTS fields
  // are pseudo-fields with no sealed-column registration.
  var namespace = "bj-" + table + "-" + field + ":fts:";
  var rawSalt = vault.getDerivedHashSalt();
  // NOTE(review): when the vault hands back no salt the hash is
  // computed with an empty salt prefix — confirm this fallback is
  // intended rather than an error condition.
  var saltPrefix = "";
  if (rawSalt && typeof rawSalt.toString === "function") {
    saltPrefix = rawSalt.toString("hex");
  }
  var digest = bCrypto.sha3Hash(saltPrefix + namespace + token);
  return digest.slice(0, 16); // allow:raw-byte-literal — 16-char hex prefix length, not bytes
}
|
|
192
|
+
|
|
193
|
+
// Hash a token array → space-separated string suitable for FTS5
|
|
194
|
+
// row insertion. The output is what gets MATCH'd at query time.
|
|
195
|
+
/**
 * @primitive b.mailStore.fts.hashTokens
 * @signature b.mailStore.fts.hashTokens(table, field, tokens)
 * @since     0.11.25
 * @status    stable
 *
 * Run every entry of `tokens` through `hashToken` under the given
 * (table, field) namespace and join the results with single spaces,
 * ready to be stored in an FTS5 column. Empty hashes are skipped and
 * a hash that has already been emitted is not repeated; order of
 * first occurrence is kept. Anything other than a non-empty array
 * yields `""`.
 *
 * @example
 *   b.mailStore.fts.hashTokens("t", "subject", ["hello", "world"]);
 *   // → "<16hex> <16hex>"
 */
function hashTokens(table, field, tokens) {
  if (!Array.isArray(tokens) || tokens.length === 0) return "";
  // A Set keeps insertion order and ignores repeats, which is exactly
  // the "first occurrence wins" rule.
  var emitted = new Set();
  tokens.forEach(function (tok) {
    var digest = hashToken(table, field, tok);
    if (digest) emitted.add(digest);
  });
  return Array.from(emitted).join(" ");
}
|
|
221
|
+
|
|
222
|
+
// Tokenize + hash + join in one step (the common path for both
|
|
223
|
+
// append-side index updates and search-side query rewriting).
|
|
224
|
+
/**
 * @primitive b.mailStore.fts.hashText
 * @signature b.mailStore.fts.hashText(table, field, text)
 * @since     0.11.25
 * @status    stable
 *
 * One-call shortcut: tokenize `text`, then hash and join the tokens
 * under the (table, field) namespace. Gives the same result as
 * calling `hashTokens(table, field, tokenize(text))` by hand.
 *
 * @example
 *   b.mailStore.fts.hashText("mail_messages", "body", "kubernetes deploy");
 *   // → "<16hex> <16hex>"
 */
function hashText(table, field, text) {
  var tokens = tokenize(text);
  return hashTokens(table, field, tokens);
}
|
|
240
|
+
|
|
241
|
+
// Build the FTS row body for one message. Subject + body tokens get
|
|
242
|
+
// their own FTS columns; from + to addresses share `addr_toks` so a
|
|
243
|
+
// search for an address hits regardless of which side it's on. The
|
|
244
|
+
// `addr_toks` namespace is a single pseudo-field "addr" so the index
|
|
245
|
+
// + query sides hash identically regardless of which header carried
|
|
246
|
+
// the token — `{from: "alice@x"}` and `{to: "alice@x"}` BOTH hit a
|
|
247
|
+
// row that mentions alice@x in EITHER header.
|
|
248
|
+
/**
 * @primitive b.mailStore.fts.rowFromMessage
 * @signature b.mailStore.fts.rowFromMessage(table, msg)
 * @since     0.11.25
 * @status    stable
 *
 * Build the FTS5 row payload `{ objectid, subject_toks, addr_toks,
 * body_toks }` from a `{ objectid, subject, from, to, body }`
 * plaintext message. `from` + `to` are hashed under the single
 * pseudo-field `"addr"` and share `addr_toks`, so an address is
 * found regardless of which header carried it.
 *
 * `from` and `to` may each be a string or an array of strings (for
 * example a list of recipients); non-string array entries are
 * ignored. Missing fields produce an empty token string.
 *
 * @example
 *   b.mailStore.fts.rowFromMessage("t", { objectid:"o1", subject:"Hi", from:"a@x", to:["b@x", "c@x"], body:"hello" });
 *   // → { objectid:"o1", subject_toks:"<hash>", addr_toks:"<hash> <hash> <hash>", body_toks:"<hash>" }
 */
function rowFromMessage(table, msg) {
  // `tokenize` only accepts strings, so an address LIST would be
  // silently dropped from the index. Flatten it to one space-joined
  // string first; the space is a token boundary, so addresses never
  // merge, and tokenizing once keeps the per-field token cap in force.
  function addressTokens(value) {
    if (Array.isArray(value)) {
      value = value
        .filter(function (entry) { return typeof entry === "string"; })
        .join(" ");
    }
    return tokenize(value || "");
  }

  var addrTokens = addressTokens(msg.from).concat(addressTokens(msg.to));
  return {
    objectid:     msg.objectid,
    subject_toks: hashText(table, "subject", msg.subject || ""),
    // Duplicate tokens across from/to are collapsed by hashTokens.
    addr_toks:    hashTokens(table, "addr", addrTokens),
    body_toks:    hashText(table, "body", msg.body || ""),
  };
}
|
|
271
|
+
|
|
272
|
+
// Map a query-side filter key onto the (FTS5 column, namespace pseudo-
// field) pair the indexer used. Keeps the index + query in lock-step
// so future column additions only touch this table.
//
//   filter key  →  FTS5 column   +  namespace field
//   subject     →  subject_toks  +  "subject"
//   body        →  body_toks     +  "body"
//   from / to   →  addr_toks     +  "addr"
//
// For a broad cross-column `text` query the caller iterates this
// mapping and OR's the per-column MATCH clauses.
var QUERY_KEY_MAP = {
  subject: { column: "subject_toks", field: "subject" },
  body:    { column: "body_toks",    field: "body" },
  from:    { column: "addr_toks",    field: "addr" },
  to:      { column: "addr_toks",    field: "addr" },
};

/**
 * @primitive b.mailStore.fts.columnAndFieldFor
 * @signature b.mailStore.fts.columnAndFieldFor(filterKey)
 * @since     0.11.25
 * @status    stable
 *
 * Map a search filter key (`subject` / `body` / `from` / `to`) to
 * the FTS5 column it indexes into PLUS the namespace pseudo-field
 * the indexer uses when hashing tokens, so the query-side hash
 * transform can match the index-side one. Returns `null` for any
 * other key — including names such as `constructor` or `toString`
 * that exist only on the object prototype.
 *
 * @example
 *   b.mailStore.fts.columnAndFieldFor("from");
 *   // → { column: "addr_toks", field: "addr" }
 *   b.mailStore.fts.columnAndFieldFor("constructor");
 *   // → null
 */
function columnAndFieldFor(key) {
  // Own-property check: QUERY_KEY_MAP is a plain object literal, so a
  // bare `QUERY_KEY_MAP[key]` would also resolve inherited members and
  // hand back a truthy non-mapping value for an unknown key.
  if (!Object.prototype.hasOwnProperty.call(QUERY_KEY_MAP, key)) return null;
  return QUERY_KEY_MAP[key];
}
|
|
309
|
+
|
|
310
|
+
// Rewrite an operator query term into a FTS5 MATCH expression. The
|
|
311
|
+
// term is tokenized + hashed exactly like an index value, then the
|
|
312
|
+
// hashes are AND'd together so multi-word queries require every
|
|
313
|
+
// token to appear in the row. Returns null when no tokens survive
|
|
314
|
+
// the filter (caller should skip the FTS join in that case).
|
|
315
|
+
/**
 * @primitive b.mailStore.fts.buildMatchExpression
 * @signature b.mailStore.fts.buildMatchExpression(table, field, term)
 * @since     0.11.25
 * @status    stable
 *
 * Turn an operator's query `term` into an FTS5 MATCH expression. The
 * term goes through `tokenize`, each token through `hashToken` under
 * the (table, field) namespace, and the distinct non-empty hashes are
 * joined with ` AND ` so a row must contain every one of them.
 * Returns `null` when nothing survives (the caller is expected to
 * skip the FTS join in that case).
 *
 * @example
 *   var expr = b.mailStore.fts.buildMatchExpression("t", "body", "kubernetes deploy");
 *   // → "<16hex> AND <16hex>"
 */
function buildMatchExpression(table, field, term) {
  var distinct = [];
  var known = new Set();
  tokenize(term).forEach(function (tok) {
    var digest = hashToken(table, field, tok);
    if (digest && !known.has(digest)) {
      known.add(digest);
      distinct.push(digest);
    }
  });
  // AND is written out explicitly for readability.
  return distinct.length > 0 ? distinct.join(" AND ") : null;
}
|
|
345
|
+
|
|
346
|
+
// SQL builder — creates the FTS5 virtual table. Caller supplies the
|
|
347
|
+
// quoted parent table identifier; this module owns the FTS table
|
|
348
|
+
// name and column layout.
|
|
349
|
+
/**
 * @primitive b.mailStore.fts.createSql
 * @signature b.mailStore.fts.createSql(qFtsTable)
 * @since     0.11.25
 * @status    stable
 *
 * Build the `CREATE VIRTUAL TABLE IF NOT EXISTS … USING fts5(…)`
 * statement for the sealed-token index. `qFtsTable` is spliced into
 * the SQL verbatim, so the caller must pass an already-quoted
 * identifier (e.g. `"blamejs_mail_messages_fts"`). The column layout
 * (`objectid` unindexed, then `subject_toks`, `addr_toks`,
 * `body_toks`) and the `unicode61` tokenizer option are fixed here.
 *
 * @example
 *   db.prepare(b.mailStore.fts.createSql('"mail_fts"')).run();
 */
function createSql(qFtsTable) {
  var tableArgs = [
    "objectid UNINDEXED",
    "subject_toks",
    "addr_toks",
    "body_toks",
    "tokenize = 'unicode61 remove_diacritics 2'",
  ];
  return "CREATE VIRTUAL TABLE IF NOT EXISTS " + qFtsTable +
    " USING fts5(" + tableArgs.join(", ") + ")";
}
|
|
371
|
+
|
|
372
|
+
module.exports = {
|
|
373
|
+
// SQL primitives
|
|
374
|
+
createSql: createSql,
|
|
375
|
+
|
|
376
|
+
// Index-side
|
|
377
|
+
tokenize: tokenize,
|
|
378
|
+
hashToken: hashToken,
|
|
379
|
+
hashTokens: hashTokens,
|
|
380
|
+
hashText: hashText,
|
|
381
|
+
rowFromMessage: rowFromMessage,
|
|
382
|
+
|
|
383
|
+
// Query-side
|
|
384
|
+
buildMatchExpression: buildMatchExpression,
|
|
385
|
+
columnAndFieldFor: columnAndFieldFor,
|
|
386
|
+
QUERY_KEY_MAP: QUERY_KEY_MAP,
|
|
387
|
+
|
|
388
|
+
// Constants surfaced for tests + adjacent modules.
|
|
389
|
+
STOPWORDS: STOPWORDS,
|
|
390
|
+
MIN_TOKEN_LEN: MIN_TOKEN_LEN,
|
|
391
|
+
MAX_TOKEN_LEN: MAX_TOKEN_LEN,
|
|
392
|
+
MAX_TOKENS_PER_FIELD: MAX_TOKENS_PER_FIELD,
|
|
393
|
+
FTS_FIELDS: FTS_FIELDS,
|
|
394
|
+
};
|
package/lib/mail-store.js
CHANGED
|
@@ -63,6 +63,7 @@ var cryptoField = require("./crypto-field");
|
|
|
63
63
|
var safeMime = require("./safe-mime");
|
|
64
64
|
var safeSql = require("./safe-sql");
|
|
65
65
|
var guardMessageId = require("./guard-message-id");
|
|
66
|
+
var mailStoreFts = require("./mail-store-fts");
|
|
66
67
|
var { defineClass } = require("./framework-error");
|
|
67
68
|
|
|
68
69
|
var MailStoreError = defineClass("MailStoreError", { alwaysPermanent: true });
|
|
@@ -126,6 +127,7 @@ function create(opts) {
|
|
|
126
127
|
var qFolders = safeSql.quoteIdentifier(prefix + "_folders", "sqlite");
|
|
127
128
|
var qFlags = safeSql.quoteIdentifier(prefix + "_flags", "sqlite");
|
|
128
129
|
var qQuota = safeSql.quoteIdentifier(prefix + "_quota", "sqlite");
|
|
130
|
+
var qFts = safeSql.quoteIdentifier(prefix + "_messages_fts", "sqlite");
|
|
129
131
|
var messagesTable = prefix + "_messages";
|
|
130
132
|
|
|
131
133
|
var maxMessageBytes = opts.maxMessageBytes !== undefined ? opts.maxMessageBytes : DEFAULT_MAX_MESSAGE_BYTES;
|
|
@@ -147,7 +149,7 @@ function create(opts) {
|
|
|
147
149
|
});
|
|
148
150
|
|
|
149
151
|
if (doInit) {
|
|
150
|
-
_ensureSchema(db, qMsgs, qFolders, qFlags, qQuota);
|
|
152
|
+
_ensureSchema(db, qMsgs, qFolders, qFlags, qQuota, qFts);
|
|
151
153
|
_ensureDefaultFolders(db, qFolders);
|
|
152
154
|
}
|
|
153
155
|
|
|
@@ -195,12 +197,27 @@ function create(opts) {
|
|
|
195
197
|
" WHERE folder_id = ? AND objectid IN (SELECT value FROM json_each(?))");
|
|
196
198
|
var stmtDeleteMsg = db.prepare("DELETE FROM " + qMsgs + " WHERE objectid = ?");
|
|
197
199
|
var stmtDeleteFlags = db.prepare("DELETE FROM " + qFlags + " WHERE objectid = ?");
|
|
200
|
+
// Sealed-token FTS5 prepared statements — index sync runs in the
|
|
201
|
+
// same transaction window as the canonical row mutation so a crash
|
|
202
|
+
// between the two cannot leave the FTS index out of step with the
|
|
203
|
+
// messages table. See lib/mail-store-fts.js for the tokenize +
|
|
204
|
+
// vault-salted-hash transform applied here.
|
|
205
|
+
var stmtInsertFts = db.prepare(
|
|
206
|
+
"INSERT INTO " + qFts + " (objectid, subject_toks, addr_toks, body_toks) VALUES (?, ?, ?, ?)");
|
|
207
|
+
var stmtDeleteFts = db.prepare("DELETE FROM " + qFts + " WHERE objectid = ?");
|
|
198
208
|
|
|
199
209
|
return {
|
|
200
210
|
appendMessage: function (folderName, rawBytes, appendOpts) {
|
|
201
|
-
|
|
211
|
+
// Wrap canonical row insert + FTS row insert in a single backend
|
|
212
|
+
// transaction so a crash / FTS-row failure CANNOT leave a message
|
|
213
|
+
// persisted but unsearchable (state drift). better-sqlite3-style
|
|
214
|
+
// backends expose `.transaction(fn)()`; backends without
|
|
215
|
+
// transactions fall back to per-statement (the FTS insert is the
|
|
216
|
+
// last write, so partial state == still consistent to the reader).
|
|
217
|
+
var args = {
|
|
202
218
|
db: db, qMsgs: qMsgs, qFlags: qFlags, messagesTable: messagesTable,
|
|
203
219
|
stmtInsertMsg: stmtInsertMsg,
|
|
220
|
+
stmtInsertFts: stmtInsertFts,
|
|
204
221
|
stmtBumpFolderModseq: stmtBumpFolderModseq,
|
|
205
222
|
stmtGetFolderByName: stmtGetFolderByName,
|
|
206
223
|
stmtFindThreadByMsgId: stmtFindThreadByMsgId,
|
|
@@ -209,7 +226,13 @@ function create(opts) {
|
|
|
209
226
|
safeMimeOpts: safeMimeOpts,
|
|
210
227
|
maxMessageBytes: maxMessageBytes,
|
|
211
228
|
maxBodyBytes: maxBodyBytes,
|
|
212
|
-
}
|
|
229
|
+
};
|
|
230
|
+
if (typeof db.transaction === "function") {
|
|
231
|
+
var result;
|
|
232
|
+
db.transaction(function () { result = _appendMessage(args); })();
|
|
233
|
+
return result;
|
|
234
|
+
}
|
|
235
|
+
return _appendMessage(args);
|
|
213
236
|
},
|
|
214
237
|
fetchByObjectId: function (folderName, objectid) {
|
|
215
238
|
return _fetchByObjectId({
|
|
@@ -220,6 +243,97 @@ function create(opts) {
|
|
|
220
243
|
folderName: folderName, objectid: objectid,
|
|
221
244
|
});
|
|
222
245
|
},
|
|
246
|
+
/**
|
|
247
|
+
* search — sealed-token full-text search inside a single folder.
|
|
248
|
+
*
|
|
249
|
+
* Composes the FTS5 virtual table populated by `appendMessage`.
|
|
250
|
+
* Each filter term is tokenized + vault-salted-hashed exactly like
|
|
251
|
+
* the index side, then issued as an FTS5 `MATCH` expression
|
|
252
|
+
* intersected with the modseq + flag window. Result rows carry the
|
|
253
|
+
* SAME shape as `queryByModseq` so operators iterate either path
|
|
254
|
+
* symmetrically.
|
|
255
|
+
*
|
|
256
|
+
* `filter` accepts (any subset; all present terms AND-combine):
|
|
257
|
+
* - text: match across subject + addr + body
|
|
258
|
+
* - subject: match against `subject_toks` column only
|
|
259
|
+
* - body: match against `body_toks` column only
|
|
260
|
+
* - from / to: match against `addr_toks`
|
|
261
|
+
* - sinceModseq: integer floor
|
|
262
|
+
* - limit: result cap (default 100, hard cap 1000)
|
|
263
|
+
*
|
|
264
|
+
* When NO text-side filter is present, falls through to the
|
|
265
|
+
* `queryByModseq` path — search is purely additive on the existing
|
|
266
|
+
* modseq cursor.
|
|
267
|
+
*/
|
|
268
|
+
search: function (folderName, filter) {
|
|
269
|
+
var folder = stmtGetFolderByName.get(folderName);
|
|
270
|
+
if (!folder) {
|
|
271
|
+
throw new MailStoreError("mail-store/no-folder",
|
|
272
|
+
"search: folder '" + folderName + "' not found");
|
|
273
|
+
}
|
|
274
|
+
var f = filter || {};
|
|
275
|
+
var sinceModseq = f.sinceModseq || 0;
|
|
276
|
+
var limit = f.limit || 100;
|
|
277
|
+
if (limit > 1000) limit = 1000; // allow:raw-byte-literal — query row cap, not bytes
|
|
278
|
+
|
|
279
|
+
var matchClauses = [];
|
|
280
|
+
function addMatch(filterKey, term) {
|
|
281
|
+
if (!term) return;
|
|
282
|
+
var m = mailStoreFts.columnAndFieldFor(filterKey);
|
|
283
|
+
if (!m) return;
|
|
284
|
+
var expr = mailStoreFts.buildMatchExpression(messagesTable, m.field, term);
|
|
285
|
+
if (expr) matchClauses.push(m.column + ":(" + expr + ")");
|
|
286
|
+
}
|
|
287
|
+
if (f.subject) addMatch("subject", f.subject);
|
|
288
|
+
if (f.body) addMatch("body", f.body);
|
|
289
|
+
if (f.from) addMatch("from", f.from);
|
|
290
|
+
if (f.to) addMatch("to", f.to);
|
|
291
|
+
if (f.text) {
|
|
292
|
+
var perCol = ["subject", "body", "from"].map(function (key) {
|
|
293
|
+
var m = mailStoreFts.columnAndFieldFor(key);
|
|
294
|
+
var perColExpr = mailStoreFts.buildMatchExpression(messagesTable, m.field, f.text);
|
|
295
|
+
return perColExpr ? "(" + m.column + ":(" + perColExpr + "))" : null;
|
|
296
|
+
}).filter(Boolean);
|
|
297
|
+
if (perCol.length > 0) {
|
|
298
|
+
matchClauses.push("(" + perCol.join(" OR ") + ")");
|
|
299
|
+
}
|
|
300
|
+
}
|
|
301
|
+
|
|
302
|
+
if (matchClauses.length === 0) {
|
|
303
|
+
var fallback = stmtQueryByModseq.all(folder.id, sinceModseq, limit);
|
|
304
|
+
return {
|
|
305
|
+
rows: fallback.map(function (r) {
|
|
306
|
+
return {
|
|
307
|
+
objectid: r.objectid, modseq: r.modseq, sizeBytes: r.size_bytes,
|
|
308
|
+
internalDate: r.internal_date, legalHold: r.legal_hold === 1,
|
|
309
|
+
};
|
|
310
|
+
}),
|
|
311
|
+
nextModseq: fallback.length > 0 ? fallback[fallback.length - 1].modseq : sinceModseq,
|
|
312
|
+
};
|
|
313
|
+
}
|
|
314
|
+
|
|
315
|
+
var matchExpr = matchClauses.join(" AND ");
|
|
316
|
+
// FTS5 MATCH binds to the virtual-table name — aliases / joined-
|
|
317
|
+
// table refs are parsed as ordinary column refs and fail. The
|
|
318
|
+
// IN-subquery shape sidesteps that.
|
|
319
|
+
var sql =
|
|
320
|
+
"SELECT objectid, modseq, size_bytes, internal_date, legal_hold " +
|
|
321
|
+
"FROM " + qMsgs + " " +
|
|
322
|
+
"WHERE folder_id = ? AND modseq > ? " +
|
|
323
|
+
"AND objectid IN (SELECT objectid FROM " + qFts + " WHERE " + qFts + " MATCH ?) " +
|
|
324
|
+
"ORDER BY modseq ASC LIMIT ?";
|
|
325
|
+
var rows = db.prepare(sql).all(folder.id, sinceModseq, matchExpr, limit);
|
|
326
|
+
return {
|
|
327
|
+
rows: rows.map(function (r) {
|
|
328
|
+
return {
|
|
329
|
+
objectid: r.objectid, modseq: r.modseq, sizeBytes: r.size_bytes,
|
|
330
|
+
internalDate: r.internal_date, legalHold: r.legal_hold === 1,
|
|
331
|
+
};
|
|
332
|
+
}),
|
|
333
|
+
nextModseq: rows.length > 0 ? rows[rows.length - 1].modseq : sinceModseq,
|
|
334
|
+
matchExpr: matchExpr,
|
|
335
|
+
};
|
|
336
|
+
},
|
|
223
337
|
queryByModseq: function (folderName, queryOpts) {
|
|
224
338
|
var folder = stmtGetFolderByName.get(folderName);
|
|
225
339
|
if (!folder) {
|
|
@@ -368,6 +482,7 @@ function create(opts) {
|
|
|
368
482
|
function _runTxn() {
|
|
369
483
|
for (var di = 0; di < toDelete.length; di += 1) {
|
|
370
484
|
stmtDeleteFlags.run(toDelete[di].objectid);
|
|
485
|
+
stmtDeleteFts.run(toDelete[di].objectid);
|
|
371
486
|
stmtDeleteMsg.run(toDelete[di].objectid);
|
|
372
487
|
totalBytes += toDelete[di].size_bytes || 0;
|
|
373
488
|
}
|
|
@@ -512,6 +627,18 @@ function _appendMessage(args) {
|
|
|
512
627
|
args.stmtBumpFolderModseq.run(modseq, args.folderName);
|
|
513
628
|
args.stmtBumpQuota.run(folder.id, buf.length, 1);
|
|
514
629
|
|
|
630
|
+
// FTS index update — tokenize the PRE-seal plaintext, hash each
|
|
631
|
+
// token with the per-deployment vault salt, insert into the FTS5
|
|
632
|
+
// virtual table.
|
|
633
|
+
var ftsRow = mailStoreFts.rowFromMessage(args.messagesTable, {
|
|
634
|
+
objectid: objectid,
|
|
635
|
+
subject: subject,
|
|
636
|
+
from: fromAddr,
|
|
637
|
+
to: toAddrs,
|
|
638
|
+
body: bodyText,
|
|
639
|
+
});
|
|
640
|
+
args.stmtInsertFts.run(ftsRow.objectid, ftsRow.subject_toks, ftsRow.addr_toks, ftsRow.body_toks);
|
|
641
|
+
|
|
515
642
|
return { objectid: objectid, modseq: modseq, sizeBytes: buf.length, threadRootId: threadRootId };
|
|
516
643
|
}
|
|
517
644
|
|
|
@@ -706,7 +833,7 @@ function _normalizeMsgId(s) {
|
|
|
706
833
|
|
|
707
834
|
// ---- Schema bootstrap ----------------------------------------------------
|
|
708
835
|
|
|
709
|
-
function _ensureSchema(db, qMsgs, qFolders, qFlags, qQuota) {
|
|
836
|
+
function _ensureSchema(db, qMsgs, qFolders, qFlags, qQuota, qFts) {
|
|
710
837
|
// Folders table — created first since messages reference folder_id.
|
|
711
838
|
db.prepare(
|
|
712
839
|
"CREATE TABLE IF NOT EXISTS " + qFolders + " (" +
|
|
@@ -775,6 +902,13 @@ function _ensureSchema(db, qMsgs, qFolders, qFlags, qQuota) {
|
|
|
775
902
|
"cap_count INTEGER, " +
|
|
776
903
|
"FOREIGN KEY(folder_id) REFERENCES " + qFolders + "(id))"
|
|
777
904
|
).run();
|
|
905
|
+
|
|
906
|
+
// Sealed-token FTS5 virtual table. The token-hash transform lives in
|
|
907
|
+
// `lib/mail-store-fts.js`; this is the storage layer. Tokenizer is
|
|
908
|
+
// `unicode61 remove_diacritics 2` so FTS5's segmenter splits hash-
|
|
909
|
+
// tokens on whitespace exactly — hashes are ASCII-hex-only, so no
|
|
910
|
+
// Unicode case-fold runs at MATCH time.
|
|
911
|
+
db.prepare(mailStoreFts.createSql(qFts)).run();
|
|
778
912
|
}
|
|
779
913
|
|
|
780
914
|
function _ensureDefaultFolders(db, qFolders) {
|
|
@@ -788,4 +922,8 @@ module.exports = {
|
|
|
788
922
|
create: create,
|
|
789
923
|
DEFAULT_FOLDERS: DEFAULT_FOLDERS,
|
|
790
924
|
MailStoreError: MailStoreError,
|
|
925
|
+
// Sealed-token FTS substrate. Exposed for adjacent primitives (e.g.
|
|
926
|
+
// wire-protocol adapters translating IMAP SEARCH TEXT into the
|
|
927
|
+
// store's FTS5 column expression).
|
|
928
|
+
fts: mailStoreFts,
|
|
791
929
|
};
|