@remnic/core 9.3.597 → 9.3.598
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/access-cli.js +17 -17
- package/dist/access-http.js +6 -6
- package/dist/access-mcp.js +5 -5
- package/dist/access-service.js +4 -4
- package/dist/behavior-learner.js +2 -1
- package/dist/behavior-learner.js.map +1 -1
- package/dist/causal-behavior.js +3 -3
- package/dist/causal-chain.js +3 -3
- package/dist/causal-consolidation.js +4 -4
- package/dist/causal-retrieval.js +3 -3
- package/dist/causal-trajectory.js +2 -2
- package/dist/{chunk-A2Z6UCWT.js → chunk-33JBK2XP.js} +2 -2
- package/dist/{chunk-D2MMMTDV.js → chunk-5SQ5CQJP.js} +2 -2
- package/dist/{chunk-F4LM4ULA.js → chunk-65JSA4MP.js} +12 -12
- package/dist/{chunk-SKGV326D.js → chunk-6GDHLVJC.js} +2 -2
- package/dist/chunk-6HEM6HTQ.js +359 -0
- package/dist/chunk-6HEM6HTQ.js.map +1 -0
- package/dist/{chunk-WXACKLKP.js → chunk-75O6YQ63.js} +22 -7
- package/dist/chunk-75O6YQ63.js.map +1 -0
- package/dist/{chunk-D65TSG24.js → chunk-7DZRO2DC.js} +2 -2
- package/dist/{chunk-TYICDVQW.js → chunk-BDCCWRHR.js} +4 -4
- package/dist/{chunk-LYPDMKUT.js → chunk-CL3MWNNQ.js} +2 -2
- package/dist/{chunk-YQMZ7IH2.js → chunk-D4KJ74JJ.js} +65 -27
- package/dist/chunk-D4KJ74JJ.js.map +1 -0
- package/dist/{chunk-W5O2FQTZ.js → chunk-GUPISBV2.js} +2 -2
- package/dist/{chunk-472U7RDF.js → chunk-JGSKJHF7.js} +2 -2
- package/dist/{chunk-IEFHBIU2.js → chunk-KDUVQU6Y.js} +14 -14
- package/dist/{chunk-6HZ6AO2P.js → chunk-LBJBNWS2.js} +37 -10
- package/dist/chunk-LBJBNWS2.js.map +1 -0
- package/dist/{chunk-Z4R6RI2N.js → chunk-NSKYFGDL.js} +2 -2
- package/dist/{chunk-OD5LFAPZ.js → chunk-V67GWXM2.js} +1 -1
- package/dist/{chunk-5NXIJZFX.js → chunk-WR64DQFE.js} +3 -3
- package/dist/{chunk-5BUGGPBR.js → chunk-WZA5Y6AC.js} +3 -3
- package/dist/chunk-ZBJMUXZH.js +121 -0
- package/dist/chunk-ZBJMUXZH.js.map +1 -0
- package/dist/{chunk-XPXEJRUB.js → chunk-ZRWB5D4H.js} +2 -2
- package/dist/{chunk-MA5MWGKP.js → chunk-ZT3EGNLR.js} +2 -2
- package/dist/{chunk-LMPHTYJC.js → chunk-ZZYF3BUL.js} +2 -2
- package/dist/cli.js +14 -14
- package/dist/compounding/engine.js +1 -1
- package/dist/direct-answer-wiring.js +3 -3
- package/dist/direct-answer.d.ts +1 -1
- package/dist/direct-answer.js +2 -2
- package/dist/harmonic-retrieval.js +2 -2
- package/dist/index.js +21 -21
- package/dist/orchestrator.js +16 -16
- package/dist/policy-runtime.js +3 -2
- package/dist/recall-query-policy.js +2 -1
- package/dist/recall-tokenization.d.ts +5 -1
- package/dist/recall-tokenization.js +3 -1
- package/dist/resume-bundles.js +3 -3
- package/dist/retrieval-agents.js +2 -2
- package/dist/schemas.d.ts +22 -22
- package/dist/semantic-consolidation.js +2 -2
- package/dist/semantic-rule-verifier.js +2 -2
- package/dist/temporal-index.js +1 -1
- package/dist/transfer/types.d.ts +12 -12
- package/dist/trust-zones.js +2 -2
- package/dist/verified-recall.js +2 -2
- package/dist/work-product-ledger.js +2 -2
- package/package.json +1 -1
- package/src/causal-chain.ts +80 -42
- package/src/direct-answer.test.ts +618 -15
- package/src/direct-answer.ts +259 -20
- package/src/recall-query-policy.ts +49 -27
- package/src/recall-tokenization.ts +131 -21
- package/src/temporal-index.ts +23 -6
- package/dist/chunk-6HZ6AO2P.js.map +0 -1
- package/dist/chunk-DT5TVLJE.js +0 -32
- package/dist/chunk-DT5TVLJE.js.map +0 -1
- package/dist/chunk-WXACKLKP.js.map +0 -1
- package/dist/chunk-Y4FHOFJ2.js +0 -140
- package/dist/chunk-Y4FHOFJ2.js.map +0 -1
- package/dist/chunk-YQMZ7IH2.js.map +0 -1
- /package/dist/{chunk-A2Z6UCWT.js.map → chunk-33JBK2XP.js.map} +0 -0
- /package/dist/{chunk-D2MMMTDV.js.map → chunk-5SQ5CQJP.js.map} +0 -0
- /package/dist/{chunk-F4LM4ULA.js.map → chunk-65JSA4MP.js.map} +0 -0
- /package/dist/{chunk-SKGV326D.js.map → chunk-6GDHLVJC.js.map} +0 -0
- /package/dist/{chunk-D65TSG24.js.map → chunk-7DZRO2DC.js.map} +0 -0
- /package/dist/{chunk-TYICDVQW.js.map → chunk-BDCCWRHR.js.map} +0 -0
- /package/dist/{chunk-LYPDMKUT.js.map → chunk-CL3MWNNQ.js.map} +0 -0
- /package/dist/{chunk-W5O2FQTZ.js.map → chunk-GUPISBV2.js.map} +0 -0
- /package/dist/{chunk-472U7RDF.js.map → chunk-JGSKJHF7.js.map} +0 -0
- /package/dist/{chunk-IEFHBIU2.js.map → chunk-KDUVQU6Y.js.map} +0 -0
- /package/dist/{chunk-Z4R6RI2N.js.map → chunk-NSKYFGDL.js.map} +0 -0
- /package/dist/{chunk-OD5LFAPZ.js.map → chunk-V67GWXM2.js.map} +0 -0
- /package/dist/{chunk-5NXIJZFX.js.map → chunk-WR64DQFE.js.map} +0 -0
- /package/dist/{chunk-5BUGGPBR.js.map → chunk-WZA5Y6AC.js.map} +0 -0
- /package/dist/{chunk-XPXEJRUB.js.map → chunk-ZRWB5D4H.js.map} +0 -0
- /package/dist/{chunk-MA5MWGKP.js.map → chunk-ZT3EGNLR.js.map} +0 -0
- /package/dist/{chunk-LMPHTYJC.js.map → chunk-ZZYF3BUL.js.map} +0 -0
package/src/direct-answer.ts
CHANGED
|
@@ -18,12 +18,9 @@
|
|
|
18
18
|
* Not wired into retrieval yet — see slice 3.
|
|
19
19
|
*/
|
|
20
20
|
|
|
21
|
-
import
|
|
21
|
+
import { normalizeRecallTokenSet, normalizeRecallTokens } from "./recall-tokenization.js";
|
|
22
22
|
import type { TrustZoneName } from "./trust-zones.js";
|
|
23
|
-
import {
|
|
24
|
-
countRecallTokenOverlap,
|
|
25
|
-
normalizeRecallTokens,
|
|
26
|
-
} from "./recall-tokenization.js";
|
|
23
|
+
import type { MemoryFile, MemoryStatus } from "./types.js";
|
|
27
24
|
|
|
28
25
|
/**
|
|
29
26
|
* Caller-supplied candidate.
|
|
@@ -105,9 +102,246 @@ export const FILTER_LABELS = {
|
|
|
105
102
|
belowTokenOverlapFloor: "below-token-overlap-floor",
|
|
106
103
|
} as const;
|
|
107
104
|
|
|
105
|
+
/**
 * Prompt-phrasing words. Tokens found in this set are skipped when the
 * required-token helpers in this file decide which query tokens a candidate
 * must contain.
 */
const PROMPT_RECALL_WORDS = new Set([
  // Interrogatives.
  "what", "who", "where", "when", "why", "how",
  // Auxiliary verbs.
  "is", "are", "was", "were", "do", "does", "did",
  // Recall / lookup verbs and nouns.
  "find", "get", "show", "search", "lookup", "recall", "remember", "list", "status", "include",
  // Conversational filler.
  "tell", "me", "give", "about", "please",
  // Common function words.
  "the", "and", "for", "with", "from", "into", "that", "this",
  // Cyrillic (Russian) prompt words.
  "найди", "найти", "поиск", "покажи", "статус", "включи",
]);
|
|
149
|
+
|
|
108
150
|
/** A direct-answer candidate paired with the lexical match signals computed for it. */
interface ScoredCandidate {
  /** The candidate these signals were computed for. */
  candidate: DirectAnswerCandidate;
  /** Number of matching query tokens divided by the total number of query tokens. */
  tokenOverlap: number;
  /** True when at least one required phrase, mixed-script or Unicode token is absent from the candidate. */
  requiredTokenMismatch: boolean;
}
|
|
155
|
+
|
|
156
|
+
/**
 * Reports whether `token` contains at least one character from a script that
 * is written without spaces between words (Han, Hiragana, Katakana), or a
 * Japanese prolonged sound mark.
 *
 * @param token - Any string; may be a single character or a whole token.
 * @returns `true` when such a character is present, `false` otherwise
 *          (including for the empty string).
 */
function hasUnsegmentableRecallChar(token: string): boolean {
  // U+30FC (fullwidth) and U+FF70 (halfwidth) prolonged sound marks have the
  // Unicode Script value "Common", so the script classes below do not match
  // them. They are written as escapes so the two visually similar code points
  // cannot be collapsed into one by an editor or a normalization pass.
  if (token.includes("\u30fc") || token.includes("\uff70")) return true;
  return /[\p{Script=Han}\p{Script=Hiragana}\p{Script=Katakana}]/u.test(token);
}
|
|
160
|
+
|
|
161
|
+
/**
 * Extracts the long unsegmentable-script phrases of a query.
 *
 * The query is lower-cased and NFC-normalized, then scanned for stretches made
 * of unsegmentable characters, combining marks attached to them, and
 * whitespace. Within a stretch, every whitespace-separated run of at least
 * four code points becomes a phrase; consecutive shorter runs are concatenated
 * and become a phrase when their combined length reaches four code points.
 *
 * @param query - Raw query text.
 * @returns The distinct phrases, in first-seen order.
 */
function requiredCjkPhraseTokens(query: string): string[] {
  const MIN_PHRASE_CODE_POINTS = 4;
  const collected = new Set<string>();
  const codePointCount = (text: string): number => [...text].length;
  const collectIfLongEnough = (text: string): void => {
    if (codePointCount(text) >= MIN_PHRASE_CODE_POINTS) collected.add(text);
  };

  let pending = "";

  // Turns the pending stretch into phrases and resets it.
  const drainPending = (): void => {
    let shortRuns = "";
    const runs = pending.split(/\s+/).filter((run) => run.length > 0);
    for (const run of runs) {
      if (codePointCount(run) < MIN_PHRASE_CODE_POINTS) {
        shortRuns += run;
      } else {
        // A long run ends whatever short runs were being joined before it.
        collectIfLongEnough(shortRuns);
        shortRuns = "";
        collected.add(run);
      }
    }
    collectIfLongEnough(shortRuns);
    pending = "";
  };

  for (const ch of query.toLowerCase().normalize("NFC")) {
    // A combining mark only extends the stretch when it directly follows a
    // non-whitespace character of that stretch.
    const isMarkContinuation = /\p{M}/u.test(ch) && pending.length > 0 && !/\s$/u.test(pending);
    if (hasUnsegmentableRecallChar(ch) || isMarkContinuation) {
      pending += ch;
    } else if (/\s/u.test(ch)) {
      pending += " ";
    } else {
      drainPending();
    }
  }
  drainPending();

  return [...collected];
}
|
|
206
|
+
|
|
207
|
+
/**
 * Collects the tokens a candidate must contain when a query mixes
 * unsegmentable-script text with space-delimited text.
 *
 * The lower-cased, NFC-normalized query is split into parts: maximal runs of
 * letters, numbers, marks and unsegmentable characters. Tokens (minimum length
 * 1) of a part become required when any of these holds:
 *
 * 1. the part itself contains unsegmentable characters and also yields at
 *    least one segmentable token;
 * 2. the part is adjacent to a part of the other kind, and the segmentable one
 *    yields a boundary token: not a prompt word, and either at least three
 *    UTF-16 units long, containing a digit, or containing a non-ASCII code
 *    point;
 * 3. the part has no unsegmentable characters, yields at least one segmentable
 *    token, and sits between two parts that do have unsegmentable characters;
 *    then all three parts contribute.
 *
 * @param query - Raw query text.
 * @returns The distinct required tokens, in first-added order.
 */
function requiredMixedScriptTokens(query: string): string[] {
  const requiredTokens = new Set<string>();

  // Tokens drawn only from the letter/number runs of `value` that contain no
  // unsegmentable characters.
  const collectSegmentableTokens = (value: string): Set<string> => {
    const found = new Set<string>();
    let run = "";
    const drainRun = (): void => {
      for (const token of normalizeRecallTokenSet(run, [], { minTokenLength: 1 })) {
        found.add(token);
      }
      run = "";
    };

    for (const ch of value) {
      const extendsRun = /[\p{L}\p{N}]/u.test(ch) && !hasUnsegmentableRecallChar(ch);
      const attachesMark = /\p{M}/u.test(ch) && run.length > 0;
      if (extendsRun || attachesMark) {
        run += ch;
      } else {
        drainRun();
      }
    }
    drainRun();
    return found;
  };

  const yieldsSegmentableToken = (value: string): boolean => collectSegmentableTokens(value).size > 0;

  const yieldsBoundaryToken = (value: string): boolean =>
    [...collectSegmentableTokens(value)].some(
      (token) =>
        !PROMPT_RECALL_WORDS.has(token) &&
        (token.length >= 3 ||
          /\p{N}/u.test(token) ||
          [...token].some((ch) => (ch.codePointAt(0) ?? 0) > 0x7f))
    );

  const requireTokensOf = (value: string): void => {
    for (const token of normalizeRecallTokenSet(value, [], { minTokenLength: 1 })) {
      requiredTokens.add(token);
    }
  };

  // Split the query into parts.
  const parts: string[] = [];
  let pendingPart = "";
  const closePart = (): void => {
    if (pendingPart) {
      parts.push(pendingPart);
    }
    pendingPart = "";
  };
  for (const ch of query.toLowerCase().normalize("NFC")) {
    if (/[\p{L}\p{N}\p{M}]/u.test(ch) || hasUnsegmentableRecallChar(ch)) {
      pendingPart += ch;
    } else {
      closePart();
    }
  }
  closePart();

  // Rule 1: a single part that mixes both kinds of text.
  for (const part of parts) {
    if (hasUnsegmentableRecallChar(part) && yieldsSegmentableToken(part)) {
      requireTokensOf(part);
    }
  }

  // Rule 2: neighbouring parts of different kinds.
  for (let left = 0; left + 1 < parts.length; left += 1) {
    const first = parts[left];
    const second = parts[left + 1];
    const firstIsUnsegmentable = hasUnsegmentableRecallChar(first);
    const secondIsUnsegmentable = hasUnsegmentableRecallChar(second);
    if (firstIsUnsegmentable === secondIsUnsegmentable) {
      continue;
    }
    const segmentablePart = firstIsUnsegmentable ? second : first;
    if (yieldsBoundaryToken(segmentablePart)) {
      requireTokensOf(first);
      requireTokensOf(second);
    }
  }

  // Rule 3: a segmentable part sandwiched between two unsegmentable parts.
  for (let middle = 1; middle + 1 < parts.length; middle += 1) {
    const before = parts[middle - 1];
    const between = parts[middle];
    const after = parts[middle + 1];
    const isSandwiched =
      hasUnsegmentableRecallChar(before) &&
      !hasUnsegmentableRecallChar(between) &&
      hasUnsegmentableRecallChar(after);
    if (isSandwiched && yieldsSegmentableToken(between)) {
      requireTokensOf(before);
      requireTokensOf(between);
      requireTokensOf(after);
    }
  }

  return [...requiredTokens];
}
|
|
315
|
+
|
|
316
|
+
/**
 * Picks the query tokens a candidate must contain when the query includes
 * non-ASCII text from a space-delimited script.
 *
 * Tokens containing unsegmentable characters are ignored. If none of the
 * remaining tokens has a non-ASCII code point the result is empty. Otherwise
 * every remaining token that is not a prompt word is required when it has a
 * non-ASCII code point, or when it is purely `[a-z0-9]` and is at least three
 * characters long or contains a digit.
 *
 * @param queryTokens - Normalized query tokens.
 * @returns The required tokens, in the iteration order of `queryTokens`.
 */
function requiredSegmentableUnicodeTokens(queryTokens: Set<string>): string[] {
  const containsNonAscii = (token: string): boolean =>
    [...token].some((ch) => (ch.codePointAt(0) ?? 0) > 0x7f);

  const spaceDelimited = [...queryTokens].filter((token) => !hasUnsegmentableRecallChar(token));
  if (!spaceDelimited.some((token) => containsNonAscii(token))) {
    return [];
  }

  return spaceDelimited.filter((token) => {
    if (PROMPT_RECALL_WORDS.has(token)) return false;
    if (containsNonAscii(token)) return true;
    return /^[a-z0-9]+$/u.test(token) && (token.length >= 3 || /\p{N}/u.test(token));
  });
}
|
|
338
|
+
|
|
339
|
+
/**
 * Counts how many of the query tokens also occur in the value tokens.
 *
 * @param queryTokens - Tokens derived from the query.
 * @param valueTokens - Tokens derived from the text being matched.
 * @returns The size of the intersection of the two sets.
 */
function countTokenOverlap(queryTokens: Set<string>, valueTokens: Set<string>): number {
  return [...queryTokens].reduce((hits, token) => (valueTokens.has(token) ? hits + 1 : hits), 0);
}
|
|
112
346
|
|
|
113
347
|
/**
|
|
@@ -123,9 +357,7 @@ interface ScoredCandidate {
|
|
|
123
357
|
* 6. top two candidates within ambiguityMargin → "ambiguous"
|
|
124
358
|
* 7. otherwise → "eligible"
|
|
125
359
|
*/
|
|
126
|
-
export function isDirectAnswerEligible(
|
|
127
|
-
input: DirectAnswerInput,
|
|
128
|
-
): DirectAnswerResult {
|
|
360
|
+
export function isDirectAnswerEligible(input: DirectAnswerInput): DirectAnswerResult {
|
|
129
361
|
const { query, candidates, config, queryEntityRefs } = input;
|
|
130
362
|
|
|
131
363
|
if (!config.enabled) {
|
|
@@ -164,17 +396,13 @@ export function isDirectAnswerEligible(
|
|
|
164
396
|
return status === "active";
|
|
165
397
|
});
|
|
166
398
|
|
|
167
|
-
working = applyFilter(working, filteredBy, FILTER_LABELS.notTrustedZone, (c) =>
|
|
168
|
-
c.trustZone === "trusted",
|
|
169
|
-
);
|
|
399
|
+
working = applyFilter(working, filteredBy, FILTER_LABELS.notTrustedZone, (c) => c.trustZone === "trusted");
|
|
170
400
|
|
|
171
401
|
working = applyFilter(
|
|
172
402
|
working,
|
|
173
403
|
filteredBy,
|
|
174
404
|
FILTER_LABELS.ineligibleTaxonomyBucket,
|
|
175
|
-
(c) =>
|
|
176
|
-
c.taxonomyBucket !== null &&
|
|
177
|
-
config.eligibleTaxonomyBuckets.includes(c.taxonomyBucket),
|
|
405
|
+
(c) => c.taxonomyBucket !== null && config.eligibleTaxonomyBuckets.includes(c.taxonomyBucket)
|
|
178
406
|
);
|
|
179
407
|
|
|
180
408
|
working = applyFilter(working, filteredBy, FILTER_LABELS.belowImportanceFloor, (c) => {
|
|
@@ -201,13 +429,24 @@ export function isDirectAnswerEligible(
|
|
|
201
429
|
}
|
|
202
430
|
|
|
203
431
|
const scored: ScoredCandidate[] = working.map((candidate) => {
|
|
204
|
-
const searchable =
|
|
205
|
-
|
|
206
|
-
const
|
|
207
|
-
|
|
432
|
+
const searchable = `${candidate.memory.frontmatter.tags?.join(" ") ?? ""} ${candidate.memory.content}`.trim();
|
|
433
|
+
const searchableTokens = normalizeRecallTokenSet(searchable);
|
|
434
|
+
const requiredSearchableTokens = normalizeRecallTokenSet(searchable, [], { minTokenLength: 1 });
|
|
435
|
+
const requiredPhrases = requiredCjkPhraseTokens(query);
|
|
436
|
+
const requiredMixedTokens = requiredMixedScriptTokens(query);
|
|
437
|
+
const requiredUnicodeTokens = requiredSegmentableUnicodeTokens(queryTokens);
|
|
438
|
+
const hasRequiredPhrase =
|
|
439
|
+
requiredPhrases.length === 0 || requiredPhrases.every((token) => searchableTokens.has(token));
|
|
440
|
+
const hasRequiredMixedTokens =
|
|
441
|
+
requiredMixedTokens.length === 0 || requiredMixedTokens.every((token) => requiredSearchableTokens.has(token));
|
|
442
|
+
const hasRequiredUnicodeTokens =
|
|
443
|
+
requiredUnicodeTokens.length === 0 || requiredUnicodeTokens.every((token) => searchableTokens.has(token));
|
|
444
|
+
const requiredTokenMismatch = !hasRequiredPhrase || !hasRequiredMixedTokens || !hasRequiredUnicodeTokens;
|
|
445
|
+
const matches = requiredTokenMismatch ? 0 : countTokenOverlap(queryTokens, searchableTokens);
|
|
446
|
+
return { candidate, tokenOverlap: matches / queryTokens.size, requiredTokenMismatch };
|
|
208
447
|
});
|
|
209
448
|
|
|
210
|
-
const overlapSurvivors = scored.filter((s) => s.tokenOverlap >= config.tokenOverlapFloor);
|
|
449
|
+
const overlapSurvivors = scored.filter((s) => !s.requiredTokenMismatch && s.tokenOverlap >= config.tokenOverlapFloor);
|
|
211
450
|
if (overlapSurvivors.length < scored.length) {
|
|
212
451
|
filteredBy.push(FILTER_LABELS.belowTokenOverlapFloor);
|
|
213
452
|
}
|
|
@@ -252,7 +491,7 @@ function applyFilter(
|
|
|
252
491
|
working: DirectAnswerCandidate[],
|
|
253
492
|
filteredBy: string[],
|
|
254
493
|
label: string,
|
|
255
|
-
keep: (c: DirectAnswerCandidate) => boolean
|
|
494
|
+
keep: (c: DirectAnswerCandidate) => boolean
|
|
256
495
|
): DirectAnswerCandidate[] {
|
|
257
496
|
const before = working.length;
|
|
258
497
|
const next = working.filter(keep);
|
|
@@ -1,3 +1,5 @@
|
|
|
1
|
+
import { normalizeRecallTokens } from "./recall-tokenization.js";
|
|
2
|
+
|
|
1
3
|
export type RecallPromptShape = "standard" | "instruction_heavy";
|
|
2
4
|
export type CronConversationRecallMode = "auto" | "always" | "never";
|
|
3
5
|
export type RecallBudgetMode = "full" | "minimal";
|
|
@@ -16,7 +18,7 @@ export interface RecallQueryPolicyResult {
|
|
|
16
18
|
retrievalBudgetMode: RecallBudgetMode;
|
|
17
19
|
}
|
|
18
20
|
|
|
19
|
-
const DEFAULT_STOPWORDS =
|
|
21
|
+
const DEFAULT_STOPWORDS = [
|
|
20
22
|
"the",
|
|
21
23
|
"and",
|
|
22
24
|
"for",
|
|
@@ -72,7 +74,9 @@ const DEFAULT_STOPWORDS = new Set([
|
|
|
72
74
|
"data",
|
|
73
75
|
"gathering",
|
|
74
76
|
"context",
|
|
75
|
-
]
|
|
77
|
+
];
|
|
78
|
+
/** Largest number of normalized tokens a single source term may contribute to a compact query. */
const MAX_COMPACT_TOKENS_PER_SOURCE_TERM = 8;

/**
 * Matches a compound identifier: two or more ASCII alphanumeric runs joined by
 * single `:`, `_` or `-` separators (case-insensitive).
 */
const COMPACT_IDENTIFIER_RE = /^[a-z0-9]+(?:[:_-][a-z0-9]+)+$/i;
|
|
76
80
|
|
|
77
81
|
function collapseWhitespace(text: string): string {
|
|
78
82
|
return text.replace(/\s+/g, " ").trim();
|
|
@@ -85,6 +89,24 @@ function stripFilesystemLikePaths(text: string): string {
|
|
|
85
89
|
.replace(/(?:^|\s)([A-Za-z]:\\[^\s)]+)(?=\s|$)/g, " ");
|
|
86
90
|
}
|
|
87
91
|
|
|
92
|
+
/**
 * Strips leading and trailing characters that are not letters, numbers or
 * combining marks from a term; interior characters are left untouched.
 *
 * @param term - Raw term.
 * @returns The trimmed term, possibly empty.
 */
function trimCompactSourceTerm(term: string): string {
  const edgeNoise = /^[^\p{L}\p{N}\p{M}]+|[^\p{L}\p{N}\p{M}]+$/gu;
  return term.replace(edgeNoise, "");
}
|
|
95
|
+
|
|
96
|
+
/**
 * Breaks a term into pieces at every run of characters other than letters,
 * numbers, combining marks, `:`, `_` and `-`, trims each piece with
 * `trimCompactSourceTerm`, and drops pieces that end up empty.
 *
 * @param term - Term to split.
 * @returns The non-empty trimmed pieces, in order.
 */
function splitCompactSourceTerm(term: string): string[] {
  const pieces: string[] = [];
  for (const raw of term.split(/[^\p{L}\p{N}\p{M}:_-]+/gu)) {
    const piece = trimCompactSourceTerm(raw);
    if (piece.length > 0) {
      pieces.push(piece);
    }
  }
  return pieces;
}
|
|
102
|
+
|
|
103
|
+
/**
 * Limits a token list to `MAX_COMPACT_TOKENS_PER_SOURCE_TERM` entries.
 *
 * Lists within the limit are returned unchanged (same array). Longer lists are
 * reduced to the leading tokens plus the final token, so the last token
 * survives the cut; when the final token is empty the leading tokens alone
 * fill the limit.
 *
 * @param tokens - Tokens produced from one source term.
 * @returns At most `MAX_COMPACT_TOKENS_PER_SOURCE_TERM` tokens.
 */
function capCompactSourceTermTokens(tokens: string[]): string[] {
  const limit = MAX_COMPACT_TOKENS_PER_SOURCE_TERM;
  if (tokens.length <= limit) return tokens;

  const tail = tokens[tokens.length - 1];
  const head = tokens.slice(0, tail ? limit - 1 : limit);
  return tail ? [...head, tail] : head;
}
|
|
109
|
+
|
|
88
110
|
function isBulletOrNumberedLine(line: string): boolean {
|
|
89
111
|
if (line.startsWith("-") || line.startsWith("*")) {
|
|
90
112
|
return true;
|
|
@@ -112,16 +134,14 @@ function scoreInstructionHeavyShape(prompt: string): number {
|
|
|
112
134
|
const headingLineCount = lines.filter(
|
|
113
135
|
(line) =>
|
|
114
136
|
/^(goal|output format|tone rules|grounding rules|data gathering|date computation|crm context|follow-up|social|current time|return)\b/i.test(
|
|
115
|
-
line
|
|
116
|
-
) || /^[A-Z][A-Z\s/-]{4,}:$/.test(line)
|
|
137
|
+
line
|
|
138
|
+
) || /^[A-Z][A-Z\s/-]{4,}:$/.test(line)
|
|
117
139
|
).length;
|
|
118
140
|
const bulletLineCount = lines.filter((line) => isBulletOrNumberedLine(line)).length;
|
|
119
141
|
const longLineCount = lines.filter((line) => line.length >= 180).length;
|
|
120
|
-
const hasPathDensity =
|
|
121
|
-
(prompt.match(/(?:~\/|\/Users\/|[A-Za-z]:\\)/g)?.length ?? 0) >= 2;
|
|
142
|
+
const hasPathDensity = (prompt.match(/(?:~\/|\/Users\/|[A-Za-z]:\\)/g)?.length ?? 0) >= 2;
|
|
122
143
|
const hasImperativeDensity =
|
|
123
|
-
(prompt.match(/\b(run|extract|read|parse|determine|include|omit|skip)\b/gi)?.length ?? 0) >=
|
|
124
|
-
8;
|
|
144
|
+
(prompt.match(/\b(run|extract|read|parse|determine|include|omit|skip)\b/gi)?.length ?? 0) >= 8;
|
|
125
145
|
|
|
126
146
|
let score = 0;
|
|
127
147
|
if (lineCount >= 24) score += 2;
|
|
@@ -139,27 +159,29 @@ export function classifyRecallPromptShape(prompt: string): RecallPromptShape {
|
|
|
139
159
|
}
|
|
140
160
|
|
|
141
161
|
/**
 * Produces the ordered, de-duplicated token list used to build a compact
 * query.
 *
 * The text is split on whitespace; each term is trimmed and split into source
 * terms. A source term matching `COMPACT_IDENTIFIER_RE` is kept whole, in
 * lower case. Any other source term is tokenized with `normalizeRecallTokens`
 * (using `DEFAULT_STOPWORDS`) and capped by `capCompactSourceTermTokens`.
 *
 * @param text - Text to tokenize.
 * @returns Distinct non-empty tokens in first-seen order.
 */
function tokenizeForCompactQuery(text: string): string[] {
  // A Set keeps insertion order, so it serves as both the de-duplication
  // guard and the ordered result.
  const ordered = new Set<string>();

  for (const rawTerm of text.split(/\s+/)) {
    for (const sourceTerm of splitCompactSourceTerm(trimCompactSourceTerm(rawTerm))) {
      const emitted = COMPACT_IDENTIFIER_RE.test(sourceTerm)
        ? [sourceTerm.toLowerCase()]
        : capCompactSourceTermTokens(normalizeRecallTokens(sourceTerm, DEFAULT_STOPWORDS));
      for (const token of emitted) {
        if (token.length > 0) {
          ordered.add(token);
        }
      }
    }
  }

  return [...ordered];
}
|
|
157
183
|
|
|
158
|
-
function buildInstructionHeavyQuery(
|
|
159
|
-
prompt: string,
|
|
160
|
-
tokenCap: number,
|
|
161
|
-
maxChars: number,
|
|
162
|
-
): string {
|
|
184
|
+
function buildInstructionHeavyQuery(prompt: string, tokenCap: number, maxChars: number): string {
|
|
163
185
|
const cleaned = stripFilesystemLikePaths(prompt);
|
|
164
186
|
const tokens = tokenizeForCompactQuery(cleaned).slice(0, Math.max(8, tokenCap));
|
|
165
187
|
const joined = tokens.join(" ");
|
|
@@ -182,7 +204,7 @@ function buildStandardQuery(prompt: string, maxChars: number): string {
|
|
|
182
204
|
export function buildRecallQueryPolicy(
|
|
183
205
|
prompt: string,
|
|
184
206
|
sessionKey: string | undefined,
|
|
185
|
-
cfg: RecallQueryPolicyConfig
|
|
207
|
+
cfg: RecallQueryPolicyConfig
|
|
186
208
|
): RecallQueryPolicyResult {
|
|
187
209
|
const normalizedPrompt = collapseWhitespace(prompt);
|
|
188
210
|
const isCron = (sessionKey ?? "").includes(":cron:");
|
|
@@ -207,8 +229,8 @@ export function buildRecallQueryPolicy(
|
|
|
207
229
|
cfg.cronConversationRecallMode === "never"
|
|
208
230
|
? true
|
|
209
231
|
: cfg.cronConversationRecallMode === "always"
|
|
210
|
-
|
|
211
|
-
|
|
232
|
+
? false
|
|
233
|
+
: promptShape === "instruction_heavy";
|
|
212
234
|
|
|
213
235
|
const retrievalBudgetMode = promptShape === "instruction_heavy" ? "minimal" : "full";
|
|
214
236
|
|
|
@@ -1,32 +1,142 @@
|
|
|
1
|
-
export
|
|
2
|
-
|
|
3
|
-
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
1
|
+
/** Options accepted by `normalizeRecallTokenSet`. */
export interface NormalizeRecallTokenOptions {
  /**
   * Minimum length, in UTF-16 code units, for a token to be kept.
   * `normalizeRecallTokenSet` floors the value, clamps it to at least 1, and
   * uses 3 when it is omitted.
   */
  minTokenLength?: number;
}

/** Stop words that are always removed, before any caller-supplied extras are added. */
const DEFAULT_RECALL_STOP_WORDS = [
  "the",
  "and",
  "for",
  "with",
  "from",
  "into",
  "that",
  "this",
  "why",
  "did",
];
|
|
6
|
+
|
|
7
|
+
/**
 * Reports whether a single character belongs to a script written without
 * spaces between words (Han, Hiragana, Katakana) or is a Japanese prolonged
 * sound mark.
 *
 * @param char - One character (one code point).
 * @returns `true` for such a character, `false` otherwise.
 */
function isUnsegmentableRecallChar(char: string): boolean {
  // U+30FC (fullwidth) and U+FF70 (halfwidth) prolonged sound marks have the
  // Unicode Script value "Common", so the script classes below do not match
  // them. They are written as escapes so the two visually similar code points
  // stay distinct and agree with the `\u30fc\uff70` pair in the cleaning regex
  // of normalizeRecallTokenSet.
  if (char === "\u30fc" || char === "\uff70") return true;
  return /[\p{Script=Han}\p{Script=Hiragana}\p{Script=Katakana}]/u.test(char);
}
|
|
11
|
+
|
|
12
|
+
/**
 * Reports whether the given string contains a Unicode combining mark
 * (general category M).
 *
 * @param char - Character to inspect.
 * @returns `true` when a mark is present.
 */
function isRecallCombiningMark(char: string): boolean {
  const combiningMark = /\p{M}/u;
  return combiningMark.exec(char) !== null;
}
|
|
15
|
+
|
|
16
|
+
/**
 * Builds the stop-word set for one tokenization call: the built-in defaults
 * plus the caller's extra words.
 *
 * Extra words are lower-cased and NFC-normalized. Tokens are compared against
 * this set after the tokenizer has applied the same two steps to its input, so
 * without the normalization a stop word supplied in decomposed form (for
 * example "e" followed by U+0301) could never match its token.
 *
 * @param extraStopWords - Additional stop words, in any case or normalization form.
 * @returns The combined stop-word set.
 */
function buildRecallStopWords(extraStopWords: string[]): Set<string> {
  const normalizedExtras = extraStopWords.map((word) => word.toLowerCase().normalize("NFC"));
  return new Set([...DEFAULT_RECALL_STOP_WORDS, ...normalizedExtras]);
}
|
|
19
|
+
|
|
20
|
+
/**
 * Decides whether a normalized token is kept.
 *
 * Stop words are always dropped. Tokens of at least `minTokenLength` UTF-16
 * units are kept. Shorter tokens are kept only when they are at least two
 * UTF-16 units long, contain a letter, and contain a non-ASCII code point.
 *
 * @param token - Normalized token.
 * @param minTokenLength - Minimum length for an unconditional keep.
 * @param stopWords - Words to drop.
 * @returns `true` when the token should be kept.
 */
function shouldKeepRecallToken(token: string, minTokenLength: number, stopWords: Set<string>): boolean {
  if (stopWords.has(token)) return false;
  if (token.length >= minTokenLength) return true;

  // Exception for short tokens: non-ASCII words of two or more units.
  if (token.length < 2 || !/\p{L}/u.test(token)) return false;
  for (const ch of token) {
    if ((ch.codePointAt(0) ?? 0) > 0x7f) return true;
  }
  return false;
}
|
|
26
|
+
|
|
27
|
+
/**
 * Adds the character n-grams of an unsegmentable-script segment to `tokens`.
 *
 * After discarding characters that are neither letters, numbers, marks nor
 * unsegmentable characters, every window of one to four consecutive
 * characters is added, followed by the whole segment when it is longer than
 * three UTF-16 units. Anything present in `stopWords` is skipped.
 *
 * @param tokens - Set that receives the tokens (mutated).
 * @param segment - Segment text; an empty segment adds nothing.
 * @param stopWords - Tokens that must not be added.
 */
function addUnsegmentableRecallSegment(tokens: Set<string>, segment: string, stopWords: Set<string>): void {
  const units = [...segment].filter((ch) => /[\p{L}\p{N}\p{M}]/u.test(ch) || isUnsegmentableRecallChar(ch));
  const offer = (candidate: string): void => {
    if (!stopWords.has(candidate)) tokens.add(candidate);
  };

  // Widths are visited in ascending order so the insertion order is
  // single characters, then bigrams, trigrams and four-grams.
  for (let width = 1; width <= 4; width += 1) {
    for (let start = 0; start + width <= units.length; start += 1) {
      offer(units.slice(start, start + width).join(""));
    }
  }

  const joined = units.join("");
  if (joined.length > 3) {
    offer(joined);
  }
}
|
|
44
|
+
|
|
45
|
+
/**
 * Reports whether a token consists solely of unsegmentable-script text.
 *
 * Characters that are neither letters, numbers, marks nor unsegmentable
 * characters are ignored. Of the rest, at least one must be unsegmentable and
 * every other one must be a combining mark.
 *
 * @param token - Whitespace-delimited token.
 * @returns `true` when the token is purely unsegmentable text.
 */
function isUnsegmentableRecallToken(token: string): boolean {
  let sawUnsegmentable = false;
  for (const ch of token) {
    if (isUnsegmentableRecallChar(ch)) {
      sawUnsegmentable = true;
      continue;
    }
    if (!/[\p{L}\p{N}\p{M}]/u.test(ch)) {
      continue; // not a character this check considers
    }
    if (!isRecallCombiningMark(ch)) {
      return false; // a letter or number from a space-delimited script
    }
  }
  return sawUnsegmentable;
}
|
|
53
|
+
|
|
54
|
+
/**
 * Joins consecutive whitespace-separated tokens that are purely unsegmentable
 * text into one segment and adds that segment's n-grams to `tokens`, so that
 * n-grams can span the whitespace between such tokens.
 *
 * Any other token ends the current segment.
 *
 * @param tokens - Set that receives the tokens (mutated).
 * @param cleaned - Cleaned, whitespace-separated text.
 * @param stopWords - Tokens that must not be added.
 */
function addBridgedUnsegmentableRecallSegments(tokens: Set<string>, cleaned: string, stopWords: Set<string>): void {
  let bridged = "";
  for (const word of cleaned.split(/\s+/)) {
    if (isUnsegmentableRecallToken(word)) {
      bridged += word;
      continue;
    }
    addUnsegmentableRecallSegment(tokens, bridged, stopWords);
    bridged = "";
  }
  addUnsegmentableRecallSegment(tokens, bridged, stopWords);
}
|
|
70
|
+
|
|
71
|
+
/**
 * Normalizes text into a set of recall tokens.
 *
 * The text is lower-cased and NFC-normalized, and every run of characters
 * other than letters, numbers, marks and the two prolonged sound marks is
 * replaced by a space. Tokens are then gathered in this order:
 *
 * 1. n-grams of unsegmentable-script segments bridged across whitespace;
 * 2. for each whitespace-separated word, either the word itself (when it has
 *    no unsegmentable characters and passes `shouldKeepRecallToken`), or, for
 *    a word containing unsegmentable characters, the n-grams of its
 *    unsegmentable stretches plus its remaining stretches that pass
 *    `shouldKeepRecallToken`.
 *
 * @param value - Text to tokenize.
 * @param extraStopWords - Stop words added to the built-in defaults.
 * @param options - See {@link NormalizeRecallTokenOptions}.
 * @returns The tokens, in insertion order; empty when nothing survives cleaning.
 */
export function normalizeRecallTokenSet(
  value: string,
  extraStopWords: string[] = [],
  options: NormalizeRecallTokenOptions = {}
): Set<string> {
  const minTokenLength = Math.max(1, Math.floor(options.minTokenLength ?? 3));
  const stopWords = buildRecallStopWords(extraStopWords);

  const lowered = value.toLowerCase().normalize("NFC");
  const cleaned = lowered.replace(/[^\p{L}\p{N}\p{M}\u30fc\uff70]+/gu, " ").trim();
  if (cleaned.length === 0) return new Set();

  const out = new Set<string>();

  const keepIfEligible = (candidate: string): void => {
    if (shouldKeepRecallToken(candidate, minTokenLength, stopWords)) {
      out.add(candidate);
    }
  };

  // Handles a word that mixes unsegmentable characters with other text by
  // alternating between a space-delimited stretch and an unsegmentable one.
  const splitMixedScriptWord = (word: string): void => {
    let spaced = "";
    let unspaced = "";
    for (const ch of word) {
      const unsegmentable = isUnsegmentableRecallChar(ch);
      if (!unsegmentable && !/[\p{L}\p{N}\p{M}]/u.test(ch)) continue;

      if (unsegmentable) {
        keepIfEligible(spaced);
        spaced = "";
        unspaced += ch;
      } else if (!isRecallCombiningMark(ch)) {
        addUnsegmentableRecallSegment(out, unspaced, stopWords);
        unspaced = "";
        spaced += ch;
      } else if (unspaced.length > 0) {
        // A mark stays with the unsegmentable stretch it follows.
        unspaced += ch;
      } else {
        spaced += ch;
      }
    }
    addUnsegmentableRecallSegment(out, unspaced, stopWords);
    keepIfEligible(spaced);
  };

  addBridgedUnsegmentableRecallSegments(out, cleaned, stopWords);
  for (const word of cleaned.split(/\s+/)) {
    if (!word) continue;
    if ([...word].some(isUnsegmentableRecallChar)) {
      splitMixedScriptWord(word);
    } else {
      keepIfEligible(word);
    }
  }
  return out;
}
|
|
128
|
+
|
|
129
|
+
/**
 * Array form of `normalizeRecallTokenSet`, using the default options.
 *
 * @param value - Text to tokenize.
 * @param extraStopWords - Stop words added to the built-in defaults.
 * @returns The distinct tokens in the set's insertion order.
 */
export function normalizeRecallTokens(value: string, extraStopWords: string[] = []): string[] {
  return [...normalizeRecallTokenSet(value, extraStopWords)];
}
|
|
22
132
|
|
|
23
133
|
export function countRecallTokenOverlap(
|
|
24
134
|
queryTokens: Set<string>,
|
|
25
135
|
value: string | undefined,
|
|
26
|
-
extraStopWords: string[] = []
|
|
136
|
+
extraStopWords: string[] = []
|
|
27
137
|
): number {
|
|
28
138
|
if (!value) return 0;
|
|
29
|
-
const tokens =
|
|
139
|
+
const tokens = normalizeRecallTokenSet(value, extraStopWords);
|
|
30
140
|
let matches = 0;
|
|
31
141
|
for (const token of queryTokens) {
|
|
32
142
|
if (tokens.has(token)) matches += 1;
|
package/src/temporal-index.ts
CHANGED
|
@@ -76,9 +76,7 @@ function tagIndexPath(memoryDir: string): string {
|
|
|
76
76
|
|
|
77
77
|
/**
 * Creates the state directory for `memoryDir`, including missing parents.
 * The call is made unconditionally; with `recursive: true` it does not fail
 * when the directory already exists.
 *
 * @param memoryDir - Memory directory whose state directory is needed.
 */
function ensureStateDir(memoryDir: string): void {
  fs.mkdirSync(stateDir(memoryDir), { recursive: true });
}
|
|
83
81
|
|
|
84
82
|
function readJsonSafe<T>(filePath: string, fallback: T): T {
|
|
@@ -187,6 +185,13 @@ function lockOwnerIsRunning(owner: IndexLockOwner): boolean {
|
|
|
187
185
|
return runningStartedAtMs <= owner.processStartedAtMs + INDEX_PROCESS_START_TOLERANCE_MS;
|
|
188
186
|
}
|
|
189
187
|
|
|
188
|
+
/**
 * Reports whether a lock is younger than `INDEX_LOCK_STALE_MS`.
 *
 * The lock's age is measured from the owner's `createdAt` timestamp when that
 * is a non-empty string that `Date.parse` can read, and from the lock
 * directory's modification time otherwise.
 *
 * @param lockInfo - Stat result for the lock directory.
 * @param owner - Owner record of the lock, or `null` when none is available.
 * @returns `true` while the lock's age is below the stale threshold.
 */
function lockIsFresh(lockInfo: fs.Stats, owner: IndexLockOwner | null): boolean {
  const createdAt = owner?.createdAt;
  const parsedMs = typeof createdAt === "string" && createdAt.length > 0 ? Date.parse(createdAt) : Number.NaN;
  const ageMs = Date.now() - (Number.isFinite(parsedMs) ? parsedMs : lockInfo.mtimeMs);
  return ageMs < INDEX_LOCK_STALE_MS;
}
|
|
194
|
+
|
|
190
195
|
function removeAbandonedIndexLock(lockDir: string): IndexLockCleanupResult {
|
|
191
196
|
try {
|
|
192
197
|
const info = fs.lstatSync(lockDir);
|
|
@@ -196,11 +201,14 @@ function removeAbandonedIndexLock(lockDir: string): IndexLockCleanupResult {
|
|
|
196
201
|
return "removed";
|
|
197
202
|
}
|
|
198
203
|
const owner = readIndexLockOwner(lockDir);
|
|
199
|
-
if (owner !== null
|
|
200
|
-
|
|
204
|
+
if (owner !== null) {
|
|
205
|
+
if (lockOwnerIsRunning(owner)) return "wait";
|
|
206
|
+
}
|
|
207
|
+
if (owner === null && lockIsFresh(info, null)) return "wait";
|
|
201
208
|
fs.rmSync(lockDir, { recursive: true, force: true });
|
|
202
209
|
return "removed";
|
|
203
|
-
} catch {
|
|
210
|
+
} catch (error) {
|
|
211
|
+
if ((error as NodeJS.ErrnoException)?.code === "ENOENT") return "removed";
|
|
204
212
|
// Fail silently — indexes are advisory only
|
|
205
213
|
return "blocked";
|
|
206
214
|
}
|
|
@@ -217,6 +225,15 @@ function withIndexFileLock(filePath: string, update: () => void): void {
|
|
|
217
225
|
acquired = true;
|
|
218
226
|
} catch (error) {
|
|
219
227
|
const code = (error as NodeJS.ErrnoException)?.code;
|
|
228
|
+
if (code === "ENOENT") {
|
|
229
|
+
try {
|
|
230
|
+
fs.mkdirSync(path.dirname(lockDir), { recursive: true });
|
|
231
|
+
} catch {
|
|
232
|
+
return;
|
|
233
|
+
}
|
|
234
|
+
sleepSync(INDEX_LOCK_POLL_MS);
|
|
235
|
+
continue;
|
|
236
|
+
}
|
|
220
237
|
if (code !== "EEXIST") return;
|
|
221
238
|
const cleanupResult = removeAbandonedIndexLock(lockDir);
|
|
222
239
|
if (cleanupResult === "blocked") return;
|