@remnic/core 9.3.597 → 9.3.599

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (89)
  1. package/dist/access-cli.js +17 -17
  2. package/dist/access-http.js +6 -6
  3. package/dist/access-mcp.js +5 -5
  4. package/dist/access-service.js +4 -4
  5. package/dist/behavior-learner.js +2 -1
  6. package/dist/behavior-learner.js.map +1 -1
  7. package/dist/causal-behavior.js +3 -3
  8. package/dist/causal-chain.js +3 -3
  9. package/dist/causal-consolidation.js +4 -4
  10. package/dist/causal-retrieval.js +3 -3
  11. package/dist/causal-trajectory.js +2 -2
  12. package/dist/{chunk-A2Z6UCWT.js → chunk-33JBK2XP.js} +2 -2
  13. package/dist/{chunk-D2MMMTDV.js → chunk-5SQ5CQJP.js} +2 -2
  14. package/dist/{chunk-F4LM4ULA.js → chunk-65JSA4MP.js} +12 -12
  15. package/dist/{chunk-SKGV326D.js → chunk-6GDHLVJC.js} +2 -2
  16. package/dist/chunk-6HEM6HTQ.js +359 -0
  17. package/dist/chunk-6HEM6HTQ.js.map +1 -0
  18. package/dist/{chunk-WXACKLKP.js → chunk-75O6YQ63.js} +22 -7
  19. package/dist/chunk-75O6YQ63.js.map +1 -0
  20. package/dist/{chunk-D65TSG24.js → chunk-7DZRO2DC.js} +2 -2
  21. package/dist/{chunk-TYICDVQW.js → chunk-BDCCWRHR.js} +4 -4
  22. package/dist/{chunk-LYPDMKUT.js → chunk-CL3MWNNQ.js} +2 -2
  23. package/dist/{chunk-YQMZ7IH2.js → chunk-D4KJ74JJ.js} +65 -27
  24. package/dist/chunk-D4KJ74JJ.js.map +1 -0
  25. package/dist/{chunk-W5O2FQTZ.js → chunk-GUPISBV2.js} +2 -2
  26. package/dist/{chunk-472U7RDF.js → chunk-JGSKJHF7.js} +2 -2
  27. package/dist/{chunk-IEFHBIU2.js → chunk-KDUVQU6Y.js} +14 -14
  28. package/dist/{chunk-6HZ6AO2P.js → chunk-LBJBNWS2.js} +37 -10
  29. package/dist/chunk-LBJBNWS2.js.map +1 -0
  30. package/dist/{chunk-Z4R6RI2N.js → chunk-NSKYFGDL.js} +2 -2
  31. package/dist/{chunk-OD5LFAPZ.js → chunk-V67GWXM2.js} +1 -1
  32. package/dist/{chunk-5NXIJZFX.js → chunk-WR64DQFE.js} +3 -3
  33. package/dist/{chunk-5BUGGPBR.js → chunk-WZA5Y6AC.js} +3 -3
  34. package/dist/chunk-ZBJMUXZH.js +121 -0
  35. package/dist/chunk-ZBJMUXZH.js.map +1 -0
  36. package/dist/{chunk-XPXEJRUB.js → chunk-ZRWB5D4H.js} +2 -2
  37. package/dist/{chunk-MA5MWGKP.js → chunk-ZT3EGNLR.js} +2 -2
  38. package/dist/{chunk-LMPHTYJC.js → chunk-ZZYF3BUL.js} +2 -2
  39. package/dist/cli.js +14 -14
  40. package/dist/compounding/engine.js +1 -1
  41. package/dist/direct-answer-wiring.js +3 -3
  42. package/dist/direct-answer.d.ts +1 -1
  43. package/dist/direct-answer.js +2 -2
  44. package/dist/harmonic-retrieval.js +2 -2
  45. package/dist/index.js +21 -21
  46. package/dist/orchestrator.js +16 -16
  47. package/dist/policy-runtime.js +3 -2
  48. package/dist/recall-query-policy.js +2 -1
  49. package/dist/recall-tokenization.d.ts +5 -1
  50. package/dist/recall-tokenization.js +3 -1
  51. package/dist/resume-bundles.js +3 -3
  52. package/dist/retrieval-agents.js +2 -2
  53. package/dist/semantic-consolidation.js +2 -2
  54. package/dist/semantic-rule-verifier.js +2 -2
  55. package/dist/temporal-index.js +1 -1
  56. package/dist/trust-zones.js +2 -2
  57. package/dist/verified-recall.js +2 -2
  58. package/dist/work-product-ledger.js +2 -2
  59. package/package.json +1 -1
  60. package/src/causal-chain.ts +80 -42
  61. package/src/direct-answer.test.ts +618 -15
  62. package/src/direct-answer.ts +259 -20
  63. package/src/recall-query-policy.ts +49 -27
  64. package/src/recall-tokenization.ts +131 -21
  65. package/src/temporal-index.ts +23 -6
  66. package/dist/chunk-6HZ6AO2P.js.map +0 -1
  67. package/dist/chunk-DT5TVLJE.js +0 -32
  68. package/dist/chunk-DT5TVLJE.js.map +0 -1
  69. package/dist/chunk-WXACKLKP.js.map +0 -1
  70. package/dist/chunk-Y4FHOFJ2.js +0 -140
  71. package/dist/chunk-Y4FHOFJ2.js.map +0 -1
  72. package/dist/chunk-YQMZ7IH2.js.map +0 -1
  73. /package/dist/{chunk-A2Z6UCWT.js.map → chunk-33JBK2XP.js.map} +0 -0
  74. /package/dist/{chunk-D2MMMTDV.js.map → chunk-5SQ5CQJP.js.map} +0 -0
  75. /package/dist/{chunk-F4LM4ULA.js.map → chunk-65JSA4MP.js.map} +0 -0
  76. /package/dist/{chunk-SKGV326D.js.map → chunk-6GDHLVJC.js.map} +0 -0
  77. /package/dist/{chunk-D65TSG24.js.map → chunk-7DZRO2DC.js.map} +0 -0
  78. /package/dist/{chunk-TYICDVQW.js.map → chunk-BDCCWRHR.js.map} +0 -0
  79. /package/dist/{chunk-LYPDMKUT.js.map → chunk-CL3MWNNQ.js.map} +0 -0
  80. /package/dist/{chunk-W5O2FQTZ.js.map → chunk-GUPISBV2.js.map} +0 -0
  81. /package/dist/{chunk-472U7RDF.js.map → chunk-JGSKJHF7.js.map} +0 -0
  82. /package/dist/{chunk-IEFHBIU2.js.map → chunk-KDUVQU6Y.js.map} +0 -0
  83. /package/dist/{chunk-Z4R6RI2N.js.map → chunk-NSKYFGDL.js.map} +0 -0
  84. /package/dist/{chunk-OD5LFAPZ.js.map → chunk-V67GWXM2.js.map} +0 -0
  85. /package/dist/{chunk-5NXIJZFX.js.map → chunk-WR64DQFE.js.map} +0 -0
  86. /package/dist/{chunk-5BUGGPBR.js.map → chunk-WZA5Y6AC.js.map} +0 -0
  87. /package/dist/{chunk-XPXEJRUB.js.map → chunk-ZRWB5D4H.js.map} +0 -0
  88. /package/dist/{chunk-MA5MWGKP.js.map → chunk-ZT3EGNLR.js.map} +0 -0
  89. /package/dist/{chunk-LMPHTYJC.js.map → chunk-ZZYF3BUL.js.map} +0 -0
@@ -18,12 +18,9 @@
18
18
  * Not wired into retrieval yet — see slice 3.
19
19
  */
20
20
 
21
- import type { MemoryFile, MemoryStatus } from "./types.js";
21
+ import { normalizeRecallTokenSet, normalizeRecallTokens } from "./recall-tokenization.js";
22
22
  import type { TrustZoneName } from "./trust-zones.js";
23
- import {
24
- countRecallTokenOverlap,
25
- normalizeRecallTokens,
26
- } from "./recall-tokenization.js";
23
+ import type { MemoryFile, MemoryStatus } from "./types.js";
27
24
 
28
25
  /**
29
26
  * Caller-supplied candidate.
@@ -105,9 +102,246 @@ export const FILTER_LABELS = {
105
102
  belowTokenOverlapFloor: "below-token-overlap-floor",
106
103
  } as const;
107
104
 
105
+ const PROMPT_RECALL_WORDS = new Set([
106
+ "what",
107
+ "who",
108
+ "where",
109
+ "when",
110
+ "why",
111
+ "how",
112
+ "is",
113
+ "are",
114
+ "was",
115
+ "were",
116
+ "do",
117
+ "does",
118
+ "did",
119
+ "find",
120
+ "get",
121
+ "show",
122
+ "search",
123
+ "lookup",
124
+ "recall",
125
+ "remember",
126
+ "list",
127
+ "status",
128
+ "include",
129
+ "tell",
130
+ "me",
131
+ "give",
132
+ "about",
133
+ "please",
134
+ "the",
135
+ "and",
136
+ "for",
137
+ "with",
138
+ "from",
139
+ "into",
140
+ "that",
141
+ "this",
142
+ "найди",
143
+ "найти",
144
+ "поиск",
145
+ "покажи",
146
+ "статус",
147
+ "включи",
148
+ ]);
149
+
108
150
  interface ScoredCandidate {
109
151
  candidate: DirectAnswerCandidate;
110
152
  tokenOverlap: number;
153
+ requiredTokenMismatch: boolean;
154
+ }
155
+
156
+ function hasUnsegmentableRecallChar(token: string): boolean {
157
+ if (token.includes("ー") || token.includes("ｰ")) return true;
158
+ return /[\p{Script=Han}\p{Script=Hiragana}\p{Script=Katakana}]/u.test(token);
159
+ }
160
+
161
+ function requiredCjkPhraseTokens(query: string): string[] {
162
+ const phrases = new Set<string>();
163
+ let segment = "";
164
+
165
+ const addPhrase = (phrase: string) => {
166
+ if ([...phrase].length >= 4) {
167
+ phrases.add(phrase);
168
+ }
169
+ };
170
+
171
+ const flushSegment = () => {
172
+ let buffered = "";
173
+ for (const run of segment.split(/\s+/)) {
174
+ if (!run) continue;
175
+ if ([...run].length >= 4) {
176
+ addPhrase(buffered);
177
+ buffered = "";
178
+ addPhrase(run);
179
+ continue;
180
+ }
181
+ buffered += run;
182
+ }
183
+ addPhrase(buffered);
184
+ segment = "";
185
+ };
186
+
187
+ for (const ch of query.toLowerCase().normalize("NFC")) {
188
+ if (hasUnsegmentableRecallChar(ch)) {
189
+ segment += ch;
190
+ continue;
191
+ }
192
+ if (/\p{M}/u.test(ch) && segment.length > 0 && !/\s$/u.test(segment)) {
193
+ segment += ch;
194
+ continue;
195
+ }
196
+ if (/\s/u.test(ch)) {
197
+ segment += " ";
198
+ continue;
199
+ }
200
+ flushSegment();
201
+ }
202
+ flushSegment();
203
+
204
+ return [...phrases];
205
+ }
206
+
207
+ function requiredMixedScriptTokens(query: string): string[] {
208
+ const required = new Set<string>();
209
+ const parts: string[] = [];
210
+ let segment = "";
211
+
212
+ const flushSegment = () => {
213
+ if (segment) {
214
+ parts.push(segment);
215
+ }
216
+ segment = "";
217
+ };
218
+
219
+ const segmentableRecallTokens = (value: string) => {
220
+ const tokens = new Set<string>();
221
+ let segment = "";
222
+ const flushSegmentableSegment = () => {
223
+ for (const token of normalizeRecallTokenSet(segment, [], { minTokenLength: 1 })) {
224
+ tokens.add(token);
225
+ }
226
+ segment = "";
227
+ };
228
+
229
+ for (const ch of value) {
230
+ if (/[\p{L}\p{N}]/u.test(ch) && !hasUnsegmentableRecallChar(ch)) {
231
+ segment += ch;
232
+ continue;
233
+ }
234
+ if (/\p{M}/u.test(ch) && segment.length > 0) {
235
+ segment += ch;
236
+ continue;
237
+ }
238
+ flushSegmentableSegment();
239
+ }
240
+ flushSegmentableSegment();
241
+ return tokens;
242
+ };
243
+
244
+ const hasRequiredSegmentableToken = (value: string) => {
245
+ return segmentableRecallTokens(value).size > 0;
246
+ };
247
+
248
+ const hasBoundarySegmentableToken = (value: string) => {
249
+ for (const token of segmentableRecallTokens(value)) {
250
+ if (PROMPT_RECALL_WORDS.has(token)) {
251
+ continue;
252
+ }
253
+ const hasNonAsciiCodepoint = [...token].some((ch) => (ch.codePointAt(0) ?? 0) > 0x7f);
254
+ if (token.length >= 3 || /\p{N}/u.test(token) || hasNonAsciiCodepoint) {
255
+ return true;
256
+ }
257
+ }
258
+ return false;
259
+ };
260
+
261
+ const addRequiredTokens = (value: string) => {
262
+ for (const token of normalizeRecallTokenSet(value, [], { minTokenLength: 1 })) {
263
+ required.add(token);
264
+ }
265
+ };
266
+
267
+ for (const ch of query.toLowerCase().normalize("NFC")) {
268
+ if (/[\p{L}\p{N}\p{M}]/u.test(ch) || hasUnsegmentableRecallChar(ch)) {
269
+ segment += ch;
270
+ continue;
271
+ }
272
+ flushSegment();
273
+ }
274
+ flushSegment();
275
+
276
+ for (const part of parts) {
277
+ if (hasUnsegmentableRecallChar(part) && hasRequiredSegmentableToken(part)) {
278
+ addRequiredTokens(part);
279
+ }
280
+ }
281
+
282
+ for (let i = 0; i < parts.length - 1; i += 1) {
283
+ const current = parts[i];
284
+ const next = parts[i + 1];
285
+ const currentHasUnsegmentable = hasUnsegmentableRecallChar(current);
286
+ const nextHasUnsegmentable = hasUnsegmentableRecallChar(next);
287
+ if (currentHasUnsegmentable === nextHasUnsegmentable) {
288
+ continue;
289
+ }
290
+ const segmentablePart = currentHasUnsegmentable ? next : current;
291
+ if (!hasBoundarySegmentableToken(segmentablePart)) {
292
+ continue;
293
+ }
294
+ addRequiredTokens(current);
295
+ addRequiredTokens(next);
296
+ }
297
+
298
+ for (let i = 1; i < parts.length - 1; i += 1) {
299
+ const prev = parts[i - 1];
300
+ const current = parts[i];
301
+ const next = parts[i + 1];
302
+ if (!hasUnsegmentableRecallChar(prev) || hasUnsegmentableRecallChar(current) || !hasUnsegmentableRecallChar(next)) {
303
+ continue;
304
+ }
305
+ if (!hasRequiredSegmentableToken(current)) {
306
+ continue;
307
+ }
308
+ addRequiredTokens(prev);
309
+ addRequiredTokens(current);
310
+ addRequiredTokens(next);
311
+ }
312
+
313
+ return [...required];
314
+ }
315
+
316
+ function requiredSegmentableUnicodeTokens(queryTokens: Set<string>): string[] {
317
+ const segmentableTokens = [...queryTokens].filter((token) => !hasUnsegmentableRecallChar(token));
318
+ const hasSegmentableUnicodeToken = segmentableTokens.some((token) =>
319
+ [...token].some((ch) => (ch.codePointAt(0) ?? 0) > 0x7f)
320
+ );
321
+ if (!hasSegmentableUnicodeToken) {
322
+ return [];
323
+ }
324
+ return segmentableTokens.filter((token) => {
325
+ if (PROMPT_RECALL_WORDS.has(token)) {
326
+ return false;
327
+ }
328
+ const hasNonAsciiCodepoint = [...token].some((ch) => (ch.codePointAt(0) ?? 0) > 0x7f);
329
+ if (hasNonAsciiCodepoint) {
330
+ return true;
331
+ }
332
+ if (!/^[a-z0-9]+$/u.test(token)) {
333
+ return false;
334
+ }
335
+ return token.length >= 3 || /\p{N}/u.test(token);
336
+ });
337
+ }
338
+
339
+ function countTokenOverlap(queryTokens: Set<string>, valueTokens: Set<string>): number {
340
+ let matches = 0;
341
+ for (const token of queryTokens) {
342
+ if (valueTokens.has(token)) matches += 1;
343
+ }
344
+ return matches;
111
345
  }
112
346
 
113
347
  /**
@@ -123,9 +357,7 @@ interface ScoredCandidate {
123
357
  * 6. top two candidates within ambiguityMargin → "ambiguous"
124
358
  * 7. otherwise → "eligible"
125
359
  */
126
- export function isDirectAnswerEligible(
127
- input: DirectAnswerInput,
128
- ): DirectAnswerResult {
360
+ export function isDirectAnswerEligible(input: DirectAnswerInput): DirectAnswerResult {
129
361
  const { query, candidates, config, queryEntityRefs } = input;
130
362
 
131
363
  if (!config.enabled) {
@@ -164,17 +396,13 @@ export function isDirectAnswerEligible(
164
396
  return status === "active";
165
397
  });
166
398
 
167
- working = applyFilter(working, filteredBy, FILTER_LABELS.notTrustedZone, (c) =>
168
- c.trustZone === "trusted",
169
- );
399
+ working = applyFilter(working, filteredBy, FILTER_LABELS.notTrustedZone, (c) => c.trustZone === "trusted");
170
400
 
171
401
  working = applyFilter(
172
402
  working,
173
403
  filteredBy,
174
404
  FILTER_LABELS.ineligibleTaxonomyBucket,
175
- (c) =>
176
- c.taxonomyBucket !== null &&
177
- config.eligibleTaxonomyBuckets.includes(c.taxonomyBucket),
405
+ (c) => c.taxonomyBucket !== null && config.eligibleTaxonomyBuckets.includes(c.taxonomyBucket)
178
406
  );
179
407
 
180
408
  working = applyFilter(working, filteredBy, FILTER_LABELS.belowImportanceFloor, (c) => {
@@ -201,13 +429,24 @@ export function isDirectAnswerEligible(
201
429
  }
202
430
 
203
431
  const scored: ScoredCandidate[] = working.map((candidate) => {
204
- const searchable =
205
- `${candidate.memory.frontmatter.tags?.join(" ") ?? ""} ${candidate.memory.content}`.trim();
206
- const matches = countRecallTokenOverlap(queryTokens, searchable);
207
- return { candidate, tokenOverlap: matches / queryTokens.size };
432
+ const searchable = `${candidate.memory.frontmatter.tags?.join(" ") ?? ""} ${candidate.memory.content}`.trim();
433
+ const searchableTokens = normalizeRecallTokenSet(searchable);
434
+ const requiredSearchableTokens = normalizeRecallTokenSet(searchable, [], { minTokenLength: 1 });
435
+ const requiredPhrases = requiredCjkPhraseTokens(query);
436
+ const requiredMixedTokens = requiredMixedScriptTokens(query);
437
+ const requiredUnicodeTokens = requiredSegmentableUnicodeTokens(queryTokens);
438
+ const hasRequiredPhrase =
439
+ requiredPhrases.length === 0 || requiredPhrases.every((token) => searchableTokens.has(token));
440
+ const hasRequiredMixedTokens =
441
+ requiredMixedTokens.length === 0 || requiredMixedTokens.every((token) => requiredSearchableTokens.has(token));
442
+ const hasRequiredUnicodeTokens =
443
+ requiredUnicodeTokens.length === 0 || requiredUnicodeTokens.every((token) => searchableTokens.has(token));
444
+ const requiredTokenMismatch = !hasRequiredPhrase || !hasRequiredMixedTokens || !hasRequiredUnicodeTokens;
445
+ const matches = requiredTokenMismatch ? 0 : countTokenOverlap(queryTokens, searchableTokens);
446
+ return { candidate, tokenOverlap: matches / queryTokens.size, requiredTokenMismatch };
208
447
  });
209
448
 
210
- const overlapSurvivors = scored.filter((s) => s.tokenOverlap >= config.tokenOverlapFloor);
449
+ const overlapSurvivors = scored.filter((s) => !s.requiredTokenMismatch && s.tokenOverlap >= config.tokenOverlapFloor);
211
450
  if (overlapSurvivors.length < scored.length) {
212
451
  filteredBy.push(FILTER_LABELS.belowTokenOverlapFloor);
213
452
  }
@@ -252,7 +491,7 @@ function applyFilter(
252
491
  working: DirectAnswerCandidate[],
253
492
  filteredBy: string[],
254
493
  label: string,
255
- keep: (c: DirectAnswerCandidate) => boolean,
494
+ keep: (c: DirectAnswerCandidate) => boolean
256
495
  ): DirectAnswerCandidate[] {
257
496
  const before = working.length;
258
497
  const next = working.filter(keep);
@@ -1,3 +1,5 @@
1
+ import { normalizeRecallTokens } from "./recall-tokenization.js";
2
+
1
3
  export type RecallPromptShape = "standard" | "instruction_heavy";
2
4
  export type CronConversationRecallMode = "auto" | "always" | "never";
3
5
  export type RecallBudgetMode = "full" | "minimal";
@@ -16,7 +18,7 @@ export interface RecallQueryPolicyResult {
16
18
  retrievalBudgetMode: RecallBudgetMode;
17
19
  }
18
20
 
19
- const DEFAULT_STOPWORDS = new Set([
21
+ const DEFAULT_STOPWORDS = [
20
22
  "the",
21
23
  "and",
22
24
  "for",
@@ -72,7 +74,9 @@ const DEFAULT_STOPWORDS = new Set([
72
74
  "data",
73
75
  "gathering",
74
76
  "context",
75
- ]);
77
+ ];
78
+ const MAX_COMPACT_TOKENS_PER_SOURCE_TERM = 8;
79
+ const COMPACT_IDENTIFIER_RE = /^[a-z0-9]+(?:[:_-][a-z0-9]+)+$/i;
76
80
 
77
81
  function collapseWhitespace(text: string): string {
78
82
  return text.replace(/\s+/g, " ").trim();
@@ -85,6 +89,24 @@ function stripFilesystemLikePaths(text: string): string {
85
89
  .replace(/(?:^|\s)([A-Za-z]:\\[^\s)]+)(?=\s|$)/g, " ");
86
90
  }
87
91
 
92
+ function trimCompactSourceTerm(term: string): string {
93
+ return term.replace(/^[^\p{L}\p{N}\p{M}]+|[^\p{L}\p{N}\p{M}]+$/gu, "");
94
+ }
95
+
96
+ function splitCompactSourceTerm(term: string): string[] {
97
+ return term
98
+ .split(/[^\p{L}\p{N}\p{M}:_-]+/gu)
99
+ .map((part) => trimCompactSourceTerm(part))
100
+ .filter((part) => part.length > 0);
101
+ }
102
+
103
+ function capCompactSourceTermTokens(tokens: string[]): string[] {
104
+ if (tokens.length <= MAX_COMPACT_TOKENS_PER_SOURCE_TERM) return tokens;
105
+ const last = tokens.at(-1);
106
+ if (!last) return tokens.slice(0, MAX_COMPACT_TOKENS_PER_SOURCE_TERM);
107
+ return [...tokens.slice(0, MAX_COMPACT_TOKENS_PER_SOURCE_TERM - 1), last];
108
+ }
109
+
88
110
  function isBulletOrNumberedLine(line: string): boolean {
89
111
  if (line.startsWith("-") || line.startsWith("*")) {
90
112
  return true;
@@ -112,16 +134,14 @@ function scoreInstructionHeavyShape(prompt: string): number {
112
134
  const headingLineCount = lines.filter(
113
135
  (line) =>
114
136
  /^(goal|output format|tone rules|grounding rules|data gathering|date computation|crm context|follow-up|social|current time|return)\b/i.test(
115
- line,
116
- ) || /^[A-Z][A-Z\s/-]{4,}:$/.test(line),
137
+ line
138
+ ) || /^[A-Z][A-Z\s/-]{4,}:$/.test(line)
117
139
  ).length;
118
140
  const bulletLineCount = lines.filter((line) => isBulletOrNumberedLine(line)).length;
119
141
  const longLineCount = lines.filter((line) => line.length >= 180).length;
120
- const hasPathDensity =
121
- (prompt.match(/(?:~\/|\/Users\/|[A-Za-z]:\\)/g)?.length ?? 0) >= 2;
142
+ const hasPathDensity = (prompt.match(/(?:~\/|\/Users\/|[A-Za-z]:\\)/g)?.length ?? 0) >= 2;
122
143
  const hasImperativeDensity =
123
- (prompt.match(/\b(run|extract|read|parse|determine|include|omit|skip)\b/gi)?.length ?? 0) >=
124
- 8;
144
+ (prompt.match(/\b(run|extract|read|parse|determine|include|omit|skip)\b/gi)?.length ?? 0) >= 8;
125
145
 
126
146
  let score = 0;
127
147
  if (lineCount >= 24) score += 2;
@@ -139,27 +159,29 @@ export function classifyRecallPromptShape(prompt: string): RecallPromptShape {
139
159
  }
140
160
 
141
161
  function tokenizeForCompactQuery(text: string): string[] {
142
- const raw = text
143
- .toLowerCase()
144
- .replace(/[^a-z0-9\s:_-]+/g, " ")
145
- .split(/\s+/)
146
- .filter((token) => token.length >= 3);
147
- const deduped: string[] = [];
162
+ const tokens: string[] = [];
148
163
  const seen = new Set<string>();
149
- for (const token of raw) {
150
- if (DEFAULT_STOPWORDS.has(token)) continue;
151
- if (seen.has(token)) continue;
164
+ const addToken = (token: string) => {
165
+ if (token.length === 0 || seen.has(token)) return;
152
166
  seen.add(token);
153
- deduped.push(token);
167
+ tokens.push(token);
168
+ };
169
+
170
+ for (const rawTerm of text.split(/\s+/)) {
171
+ for (const sourceTerm of splitCompactSourceTerm(trimCompactSourceTerm(rawTerm))) {
172
+ if (COMPACT_IDENTIFIER_RE.test(sourceTerm)) {
173
+ addToken(sourceTerm.toLowerCase());
174
+ continue;
175
+ }
176
+ for (const token of capCompactSourceTermTokens(normalizeRecallTokens(sourceTerm, DEFAULT_STOPWORDS))) {
177
+ addToken(token);
178
+ }
179
+ }
154
180
  }
155
- return deduped;
181
+ return tokens;
156
182
  }
157
183
 
158
- function buildInstructionHeavyQuery(
159
- prompt: string,
160
- tokenCap: number,
161
- maxChars: number,
162
- ): string {
184
+ function buildInstructionHeavyQuery(prompt: string, tokenCap: number, maxChars: number): string {
163
185
  const cleaned = stripFilesystemLikePaths(prompt);
164
186
  const tokens = tokenizeForCompactQuery(cleaned).slice(0, Math.max(8, tokenCap));
165
187
  const joined = tokens.join(" ");
@@ -182,7 +204,7 @@ function buildStandardQuery(prompt: string, maxChars: number): string {
182
204
  export function buildRecallQueryPolicy(
183
205
  prompt: string,
184
206
  sessionKey: string | undefined,
185
- cfg: RecallQueryPolicyConfig,
207
+ cfg: RecallQueryPolicyConfig
186
208
  ): RecallQueryPolicyResult {
187
209
  const normalizedPrompt = collapseWhitespace(prompt);
188
210
  const isCron = (sessionKey ?? "").includes(":cron:");
@@ -207,8 +229,8 @@ export function buildRecallQueryPolicy(
207
229
  cfg.cronConversationRecallMode === "never"
208
230
  ? true
209
231
  : cfg.cronConversationRecallMode === "always"
210
- ? false
211
- : promptShape === "instruction_heavy";
232
+ ? false
233
+ : promptShape === "instruction_heavy";
212
234
 
213
235
  const retrievalBudgetMode = promptShape === "instruction_heavy" ? "minimal" : "full";
214
236
 
@@ -1,32 +1,142 @@
1
- export function normalizeRecallTokens(value: string, extraStopWords: string[] = []): string[] {
2
- const stopWords = new Set([
3
- "the",
4
- "and",
5
- "for",
6
- "with",
7
- "from",
8
- "into",
9
- "that",
10
- "this",
11
- "why",
12
- "did",
13
- ...extraStopWords,
14
- ]);
15
-
16
- return value
1
+ export interface NormalizeRecallTokenOptions {
2
+ minTokenLength?: number;
3
+ }
4
+
5
+ const DEFAULT_RECALL_STOP_WORDS = ["the", "and", "for", "with", "from", "into", "that", "this", "why", "did"];
6
+
7
+ function isUnsegmentableRecallChar(char: string): boolean {
8
+ if (char === "ｰ" || char === "ー") return true;
9
+ return /[\p{Script=Han}\p{Script=Hiragana}\p{Script=Katakana}]/u.test(char);
10
+ }
11
+
12
+ function isRecallCombiningMark(char: string): boolean {
13
+ return /\p{M}/u.test(char);
14
+ }
15
+
16
+ function buildRecallStopWords(extraStopWords: string[]): Set<string> {
17
+ return new Set([...DEFAULT_RECALL_STOP_WORDS, ...extraStopWords.map((word) => word.toLowerCase())]);
18
+ }
19
+
20
+ function shouldKeepRecallToken(token: string, minTokenLength: number, stopWords: Set<string>): boolean {
21
+ if (stopWords.has(token)) return false;
22
+ if (token.length >= minTokenLength) return true;
23
+ const hasNonAsciiCodepoint = [...token].some((ch) => (ch.codePointAt(0) ?? 0) > 0x7f);
24
+ return token.length >= 2 && hasNonAsciiCodepoint && /\p{L}/u.test(token);
25
+ }
26
+
27
+ function addUnsegmentableRecallSegment(tokens: Set<string>, segment: string, stopWords: Set<string>): void {
28
+ const chars = [...segment].filter((ch) => /[\p{L}\p{N}\p{M}]/u.test(ch) || isUnsegmentableRecallChar(ch));
29
+ for (const ch of chars) {
30
+ if (!stopWords.has(ch)) tokens.add(ch);
31
+ }
32
+ for (const size of [2, 3, 4]) {
33
+ if (chars.length < size) continue;
34
+ for (let index = 0; index <= chars.length - size; index += 1) {
35
+ const token = chars.slice(index, index + size).join("");
36
+ if (!stopWords.has(token)) tokens.add(token);
37
+ }
38
+ }
39
+ const whole = chars.join("");
40
+ if (whole.length > 3 && !stopWords.has(whole)) {
41
+ tokens.add(whole);
42
+ }
43
+ }
44
+
45
+ function isUnsegmentableRecallToken(token: string): boolean {
46
+ const chars = [...token].filter((ch) => /[\p{L}\p{N}\p{M}]/u.test(ch) || isUnsegmentableRecallChar(ch));
47
+ return (
48
+ chars.length > 0 &&
49
+ chars.some(isUnsegmentableRecallChar) &&
50
+ chars.every((ch) => isUnsegmentableRecallChar(ch) || isRecallCombiningMark(ch))
51
+ );
52
+ }
53
+
54
+ function addBridgedUnsegmentableRecallSegments(tokens: Set<string>, cleaned: string, stopWords: Set<string>): void {
55
+ let segment = "";
56
+ const flushSegment = () => {
57
+ addUnsegmentableRecallSegment(tokens, segment, stopWords);
58
+ segment = "";
59
+ };
60
+
61
+ for (const token of cleaned.split(/\s+/)) {
62
+ if (isUnsegmentableRecallToken(token)) {
63
+ segment += token;
64
+ } else {
65
+ flushSegment();
66
+ }
67
+ }
68
+ flushSegment();
69
+ }
70
+
71
+ export function normalizeRecallTokenSet(
72
+ value: string,
73
+ extraStopWords: string[] = [],
74
+ options: NormalizeRecallTokenOptions = {}
75
+ ): Set<string> {
76
+ const minTokenLength = Math.max(1, Math.floor(options.minTokenLength ?? 3));
77
+ const stopWords = buildRecallStopWords(extraStopWords);
78
+ const cleaned = value
17
79
  .toLowerCase()
18
- .split(/[^a-z0-9]+/)
19
- .map((token) => token.trim())
20
- .filter((token) => token.length >= 3 && !stopWords.has(token));
80
+ .normalize("NFC")
81
+ .replace(/[^\p{L}\p{N}\p{M}\u30fc\uff70]+/gu, " ")
82
+ .trim();
83
+ if (cleaned.length === 0) return new Set();
84
+
85
+ const tokens = new Set<string>();
86
+ addBridgedUnsegmentableRecallSegments(tokens, cleaned, stopWords);
87
+ for (const token of cleaned.split(/\s+/)) {
88
+ if (!token) continue;
89
+ if ([...token].some(isUnsegmentableRecallChar)) {
90
+ let segment = "";
91
+ let unsegmentableSegment = "";
92
+ const flushSegment = () => {
93
+ if (shouldKeepRecallToken(segment, minTokenLength, stopWords)) {
94
+ tokens.add(segment);
95
+ }
96
+ segment = "";
97
+ };
98
+ const flushUnsegmentableSegment = () => {
99
+ addUnsegmentableRecallSegment(tokens, unsegmentableSegment, stopWords);
100
+ unsegmentableSegment = "";
101
+ };
102
+ for (const ch of token) {
103
+ if (!/[\p{L}\p{N}\p{M}]/u.test(ch) && !isUnsegmentableRecallChar(ch)) continue;
104
+ if (isUnsegmentableRecallChar(ch)) {
105
+ flushSegment();
106
+ unsegmentableSegment += ch;
107
+ } else if (isRecallCombiningMark(ch)) {
108
+ if (unsegmentableSegment.length > 0) {
109
+ unsegmentableSegment += ch;
110
+ } else {
111
+ segment += ch;
112
+ }
113
+ } else {
114
+ flushUnsegmentableSegment();
115
+ segment += ch;
116
+ }
117
+ }
118
+ flushUnsegmentableSegment();
119
+ flushSegment();
120
+ continue;
121
+ }
122
+ if (shouldKeepRecallToken(token, minTokenLength, stopWords)) {
123
+ tokens.add(token);
124
+ }
125
+ }
126
+ return tokens;
127
+ }
128
+
129
+ export function normalizeRecallTokens(value: string, extraStopWords: string[] = []): string[] {
130
+ return Array.from(normalizeRecallTokenSet(value, extraStopWords));
21
131
  }
22
132
 
23
133
  export function countRecallTokenOverlap(
24
134
  queryTokens: Set<string>,
25
135
  value: string | undefined,
26
- extraStopWords: string[] = [],
136
+ extraStopWords: string[] = []
27
137
  ): number {
28
138
  if (!value) return 0;
29
- const tokens = new Set(normalizeRecallTokens(value, extraStopWords));
139
+ const tokens = normalizeRecallTokenSet(value, extraStopWords);
30
140
  let matches = 0;
31
141
  for (const token of queryTokens) {
32
142
  if (tokens.has(token)) matches += 1;
@@ -76,9 +76,7 @@ function tagIndexPath(memoryDir: string): string {
76
76
 
77
77
  function ensureStateDir(memoryDir: string): void {
78
78
  const dir = stateDir(memoryDir);
79
- if (!fs.existsSync(dir)) {
80
- fs.mkdirSync(dir, { recursive: true });
81
- }
79
+ fs.mkdirSync(dir, { recursive: true });
82
80
  }
83
81
 
84
82
  function readJsonSafe<T>(filePath: string, fallback: T): T {
@@ -187,6 +185,13 @@ function lockOwnerIsRunning(owner: IndexLockOwner): boolean {
187
185
  return runningStartedAtMs <= owner.processStartedAtMs + INDEX_PROCESS_START_TOLERANCE_MS;
188
186
  }
189
187
 
188
+ function lockIsFresh(lockInfo: fs.Stats, owner: IndexLockOwner | null): boolean {
189
+ const ownerCreatedAtMs =
190
+ typeof owner?.createdAt === "string" && owner.createdAt.length > 0 ? Date.parse(owner.createdAt) : Number.NaN;
191
+ const referenceMs = Number.isFinite(ownerCreatedAtMs) ? ownerCreatedAtMs : lockInfo.mtimeMs;
192
+ return Date.now() - referenceMs < INDEX_LOCK_STALE_MS;
193
+ }
194
+
190
195
  function removeAbandonedIndexLock(lockDir: string): IndexLockCleanupResult {
191
196
  try {
192
197
  const info = fs.lstatSync(lockDir);
@@ -196,11 +201,14 @@ function removeAbandonedIndexLock(lockDir: string): IndexLockCleanupResult {
196
201
  return "removed";
197
202
  }
198
203
  const owner = readIndexLockOwner(lockDir);
199
- if (owner !== null && lockOwnerIsRunning(owner)) return "wait";
200
- if (owner === null && Date.now() - info.mtimeMs < INDEX_LOCK_STALE_MS) return "wait";
204
+ if (owner !== null) {
205
+ if (lockOwnerIsRunning(owner)) return "wait";
206
+ }
207
+ if (owner === null && lockIsFresh(info, null)) return "wait";
201
208
  fs.rmSync(lockDir, { recursive: true, force: true });
202
209
  return "removed";
203
- } catch {
210
+ } catch (error) {
211
+ if ((error as NodeJS.ErrnoException)?.code === "ENOENT") return "removed";
204
212
  // Fail silently — indexes are advisory only
205
213
  return "blocked";
206
214
  }
@@ -217,6 +225,15 @@ function withIndexFileLock(filePath: string, update: () => void): void {
217
225
  acquired = true;
218
226
  } catch (error) {
219
227
  const code = (error as NodeJS.ErrnoException)?.code;
228
+ if (code === "ENOENT") {
229
+ try {
230
+ fs.mkdirSync(path.dirname(lockDir), { recursive: true });
231
+ } catch {
232
+ return;
233
+ }
234
+ sleepSync(INDEX_LOCK_POLL_MS);
235
+ continue;
236
+ }
220
237
  if (code !== "EEXIST") return;
221
238
  const cleanupResult = removeAbandonedIndexLock(lockDir);
222
239
  if (cleanupResult === "blocked") return;