npm - @remnic/core - Versions diffs - 9.3.597 → 9.3.599 - Mend

@remnic/core 9.3.597 → 9.3.599

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (89)

package/dist/access-cli.js +17 -17
package/dist/access-http.js +6 -6
package/dist/access-mcp.js +5 -5
package/dist/access-service.js +4 -4
package/dist/behavior-learner.js +2 -1
package/dist/behavior-learner.js.map +1 -1
package/dist/causal-behavior.js +3 -3
package/dist/causal-chain.js +3 -3
package/dist/causal-consolidation.js +4 -4
package/dist/causal-retrieval.js +3 -3
package/dist/causal-trajectory.js +2 -2
package/dist/{chunk-A2Z6UCWT.js → chunk-33JBK2XP.js} +2 -2
package/dist/{chunk-D2MMMTDV.js → chunk-5SQ5CQJP.js} +2 -2
package/dist/{chunk-F4LM4ULA.js → chunk-65JSA4MP.js} +12 -12
package/dist/{chunk-SKGV326D.js → chunk-6GDHLVJC.js} +2 -2
package/dist/chunk-6HEM6HTQ.js +359 -0
package/dist/chunk-6HEM6HTQ.js.map +1 -0
package/dist/{chunk-WXACKLKP.js → chunk-75O6YQ63.js} +22 -7
package/dist/chunk-75O6YQ63.js.map +1 -0
package/dist/{chunk-D65TSG24.js → chunk-7DZRO2DC.js} +2 -2
package/dist/{chunk-TYICDVQW.js → chunk-BDCCWRHR.js} +4 -4
package/dist/{chunk-LYPDMKUT.js → chunk-CL3MWNNQ.js} +2 -2
package/dist/{chunk-YQMZ7IH2.js → chunk-D4KJ74JJ.js} +65 -27
package/dist/chunk-D4KJ74JJ.js.map +1 -0
package/dist/{chunk-W5O2FQTZ.js → chunk-GUPISBV2.js} +2 -2
package/dist/{chunk-472U7RDF.js → chunk-JGSKJHF7.js} +2 -2
package/dist/{chunk-IEFHBIU2.js → chunk-KDUVQU6Y.js} +14 -14
package/dist/{chunk-6HZ6AO2P.js → chunk-LBJBNWS2.js} +37 -10
package/dist/chunk-LBJBNWS2.js.map +1 -0
package/dist/{chunk-Z4R6RI2N.js → chunk-NSKYFGDL.js} +2 -2
package/dist/{chunk-OD5LFAPZ.js → chunk-V67GWXM2.js} +1 -1
package/dist/{chunk-5NXIJZFX.js → chunk-WR64DQFE.js} +3 -3
package/dist/{chunk-5BUGGPBR.js → chunk-WZA5Y6AC.js} +3 -3
package/dist/chunk-ZBJMUXZH.js +121 -0
package/dist/chunk-ZBJMUXZH.js.map +1 -0
package/dist/{chunk-XPXEJRUB.js → chunk-ZRWB5D4H.js} +2 -2
package/dist/{chunk-MA5MWGKP.js → chunk-ZT3EGNLR.js} +2 -2
package/dist/{chunk-LMPHTYJC.js → chunk-ZZYF3BUL.js} +2 -2
package/dist/cli.js +14 -14
package/dist/compounding/engine.js +1 -1
package/dist/direct-answer-wiring.js +3 -3
package/dist/direct-answer.d.ts +1 -1
package/dist/direct-answer.js +2 -2
package/dist/harmonic-retrieval.js +2 -2
package/dist/index.js +21 -21
package/dist/orchestrator.js +16 -16
package/dist/policy-runtime.js +3 -2
package/dist/recall-query-policy.js +2 -1
package/dist/recall-tokenization.d.ts +5 -1
package/dist/recall-tokenization.js +3 -1
package/dist/resume-bundles.js +3 -3
package/dist/retrieval-agents.js +2 -2
package/dist/semantic-consolidation.js +2 -2
package/dist/semantic-rule-verifier.js +2 -2
package/dist/temporal-index.js +1 -1
package/dist/trust-zones.js +2 -2
package/dist/verified-recall.js +2 -2
package/dist/work-product-ledger.js +2 -2
package/package.json +1 -1
package/src/causal-chain.ts +80 -42
package/src/direct-answer.test.ts +618 -15
package/src/direct-answer.ts +259 -20
package/src/recall-query-policy.ts +49 -27
package/src/recall-tokenization.ts +131 -21
package/src/temporal-index.ts +23 -6
package/dist/chunk-6HZ6AO2P.js.map +0 -1
package/dist/chunk-DT5TVLJE.js +0 -32
package/dist/chunk-DT5TVLJE.js.map +0 -1
package/dist/chunk-WXACKLKP.js.map +0 -1
package/dist/chunk-Y4FHOFJ2.js +0 -140
package/dist/chunk-Y4FHOFJ2.js.map +0 -1
package/dist/chunk-YQMZ7IH2.js.map +0 -1
/package/dist/{chunk-A2Z6UCWT.js.map → chunk-33JBK2XP.js.map} +0 -0
/package/dist/{chunk-D2MMMTDV.js.map → chunk-5SQ5CQJP.js.map} +0 -0
/package/dist/{chunk-F4LM4ULA.js.map → chunk-65JSA4MP.js.map} +0 -0
/package/dist/{chunk-SKGV326D.js.map → chunk-6GDHLVJC.js.map} +0 -0
/package/dist/{chunk-D65TSG24.js.map → chunk-7DZRO2DC.js.map} +0 -0
/package/dist/{chunk-TYICDVQW.js.map → chunk-BDCCWRHR.js.map} +0 -0
/package/dist/{chunk-LYPDMKUT.js.map → chunk-CL3MWNNQ.js.map} +0 -0
/package/dist/{chunk-W5O2FQTZ.js.map → chunk-GUPISBV2.js.map} +0 -0
/package/dist/{chunk-472U7RDF.js.map → chunk-JGSKJHF7.js.map} +0 -0
/package/dist/{chunk-IEFHBIU2.js.map → chunk-KDUVQU6Y.js.map} +0 -0
/package/dist/{chunk-Z4R6RI2N.js.map → chunk-NSKYFGDL.js.map} +0 -0
/package/dist/{chunk-OD5LFAPZ.js.map → chunk-V67GWXM2.js.map} +0 -0
/package/dist/{chunk-5NXIJZFX.js.map → chunk-WR64DQFE.js.map} +0 -0
/package/dist/{chunk-5BUGGPBR.js.map → chunk-WZA5Y6AC.js.map} +0 -0
/package/dist/{chunk-XPXEJRUB.js.map → chunk-ZRWB5D4H.js.map} +0 -0
/package/dist/{chunk-MA5MWGKP.js.map → chunk-ZT3EGNLR.js.map} +0 -0
/package/dist/{chunk-LMPHTYJC.js.map → chunk-ZZYF3BUL.js.map} +0 -0

package/src/direct-answer.ts CHANGED Viewed

@@ -18,12 +18,9 @@
  * Not wired into retrieval yet — see slice 3.
  */
-import type { MemoryFile, MemoryStatus } from "./types.js";
+import { normalizeRecallTokenSet, normalizeRecallTokens } from "./recall-tokenization.js";
 import type { TrustZoneName } from "./trust-zones.js";
-import {
-  countRecallTokenOverlap,
-  normalizeRecallTokens,
-} from "./recall-tokenization.js";
+import type { MemoryFile, MemoryStatus } from "./types.js";
 /**
  * Caller-supplied candidate.
@@ -105,9 +102,246 @@ export const FILTER_LABELS = {
   belowTokenOverlapFloor: "below-token-overlap-floor",
 } as const;
+const PROMPT_RECALL_WORDS = new Set([
+  "what",
+  "who",
+  "where",
+  "when",
+  "why",
+  "how",
+  "is",
+  "are",
+  "was",
+  "were",
+  "do",
+  "does",
+  "did",
+  "find",
+  "get",
+  "show",
+  "search",
+  "lookup",
+  "recall",
+  "remember",
+  "list",
+  "status",
+  "include",
+  "tell",
+  "me",
+  "give",
+  "about",
+  "please",
+  "the",
+  "and",
+  "for",
+  "with",
+  "from",
+  "into",
+  "that",
+  "this",
+  "найди",
+  "найти",
+  "поиск",
+  "покажи",
+  "статус",
+  "включи",
+]);
 interface ScoredCandidate {
   candidate: DirectAnswerCandidate;
   tokenOverlap: number;
+  requiredTokenMismatch: boolean;
+}
+function hasUnsegmentableRecallChar(token: string): boolean {
+  if (token.includes("ー") || token.includes("ｰ")) return true;
+  return /[\p{Script=Han}\p{Script=Hiragana}\p{Script=Katakana}]/u.test(token);
+}
+function requiredCjkPhraseTokens(query: string): string[] {
+  const phrases = new Set<string>();
+  let segment = "";
+  const addPhrase = (phrase: string) => {
+    if ([...phrase].length >= 4) {
+      phrases.add(phrase);
+    }
+  };
+  const flushSegment = () => {
+    let buffered = "";
+    for (const run of segment.split(/\s+/)) {
+      if (!run) continue;
+      if ([...run].length >= 4) {
+        addPhrase(buffered);
+        buffered = "";
+        addPhrase(run);
+        continue;
+      }
+      buffered += run;
+    }
+    addPhrase(buffered);
+    segment = "";
+  };
+  for (const ch of query.toLowerCase().normalize("NFC")) {
+    if (hasUnsegmentableRecallChar(ch)) {
+      segment += ch;
+      continue;
+    }
+    if (/\p{M}/u.test(ch) && segment.length > 0 && !/\s$/u.test(segment)) {
+      segment += ch;
+      continue;
+    }
+    if (/\s/u.test(ch)) {
+      segment += " ";
+      continue;
+    }
+    flushSegment();
+  }
+  flushSegment();
+  return [...phrases];
+}
+function requiredMixedScriptTokens(query: string): string[] {
+  const required = new Set<string>();
+  const parts: string[] = [];
+  let segment = "";
+  const flushSegment = () => {
+    if (segment) {
+      parts.push(segment);
+    }
+    segment = "";
+  };
+  const segmentableRecallTokens = (value: string) => {
+    const tokens = new Set<string>();
+    let segment = "";
+    const flushSegmentableSegment = () => {
+      for (const token of normalizeRecallTokenSet(segment, [], { minTokenLength: 1 })) {
+        tokens.add(token);
+      }
+      segment = "";
+    };
+    for (const ch of value) {
+      if (/[\p{L}\p{N}]/u.test(ch) && !hasUnsegmentableRecallChar(ch)) {
+        segment += ch;
+        continue;
+      }
+      if (/\p{M}/u.test(ch) && segment.length > 0) {
+        segment += ch;
+        continue;
+      }
+      flushSegmentableSegment();
+    }
+    flushSegmentableSegment();
+    return tokens;
+  };
+  const hasRequiredSegmentableToken = (value: string) => {
+    return segmentableRecallTokens(value).size > 0;
+  };
+  const hasBoundarySegmentableToken = (value: string) => {
+    for (const token of segmentableRecallTokens(value)) {
+      if (PROMPT_RECALL_WORDS.has(token)) {
+        continue;
+      }
+      const hasNonAsciiCodepoint = [...token].some((ch) => (ch.codePointAt(0) ?? 0) > 0x7f);
+      if (token.length >= 3 || /\p{N}/u.test(token) || hasNonAsciiCodepoint) {
+        return true;
+      }
+    }
+    return false;
+  };
+  const addRequiredTokens = (value: string) => {
+    for (const token of normalizeRecallTokenSet(value, [], { minTokenLength: 1 })) {
+      required.add(token);
+    }
+  };
+  for (const ch of query.toLowerCase().normalize("NFC")) {
+    if (/[\p{L}\p{N}\p{M}]/u.test(ch) || hasUnsegmentableRecallChar(ch)) {
+      segment += ch;
+      continue;
+    }
+    flushSegment();
+  }
+  flushSegment();
+  for (const part of parts) {
+    if (hasUnsegmentableRecallChar(part) && hasRequiredSegmentableToken(part)) {
+      addRequiredTokens(part);
+    }
+  }
+  for (let i = 0; i < parts.length - 1; i += 1) {
+    const current = parts[i];
+    const next = parts[i + 1];
+    const currentHasUnsegmentable = hasUnsegmentableRecallChar(current);
+    const nextHasUnsegmentable = hasUnsegmentableRecallChar(next);
+    if (currentHasUnsegmentable === nextHasUnsegmentable) {
+      continue;
+    }
+    const segmentablePart = currentHasUnsegmentable ? next : current;
+    if (!hasBoundarySegmentableToken(segmentablePart)) {
+      continue;
+    }
+    addRequiredTokens(current);
+    addRequiredTokens(next);
+  }
+  for (let i = 1; i < parts.length - 1; i += 1) {
+    const prev = parts[i - 1];
+    const current = parts[i];
+    const next = parts[i + 1];
+    if (!hasUnsegmentableRecallChar(prev) || hasUnsegmentableRecallChar(current) || !hasUnsegmentableRecallChar(next)) {
+      continue;
+    }
+    if (!hasRequiredSegmentableToken(current)) {
+      continue;
+    }
+    addRequiredTokens(prev);
+    addRequiredTokens(current);
+    addRequiredTokens(next);
+  }
+  return [...required];
+}
+function requiredSegmentableUnicodeTokens(queryTokens: Set<string>): string[] {
+  const segmentableTokens = [...queryTokens].filter((token) => !hasUnsegmentableRecallChar(token));
+  const hasSegmentableUnicodeToken = segmentableTokens.some((token) =>
+    [...token].some((ch) => (ch.codePointAt(0) ?? 0) > 0x7f)
+  );
+  if (!hasSegmentableUnicodeToken) {
+    return [];
+  }
+  return segmentableTokens.filter((token) => {
+    if (PROMPT_RECALL_WORDS.has(token)) {
+      return false;
+    }
+    const hasNonAsciiCodepoint = [...token].some((ch) => (ch.codePointAt(0) ?? 0) > 0x7f);
+    if (hasNonAsciiCodepoint) {
+      return true;
+    }
+    if (!/^[a-z0-9]+$/u.test(token)) {
+      return false;
+    }
+    return token.length >= 3 || /\p{N}/u.test(token);
+  });
+}
+function countTokenOverlap(queryTokens: Set<string>, valueTokens: Set<string>): number {
+  let matches = 0;
+  for (const token of queryTokens) {
+    if (valueTokens.has(token)) matches += 1;
+  }
+  return matches;
 }
 /**
@@ -123,9 +357,7 @@ interface ScoredCandidate {
  *   6. top two candidates within ambiguityMargin → "ambiguous"
  *   7. otherwise → "eligible"
  */
-export function isDirectAnswerEligible(
-  input: DirectAnswerInput,
-): DirectAnswerResult {
+export function isDirectAnswerEligible(input: DirectAnswerInput): DirectAnswerResult {
   const { query, candidates, config, queryEntityRefs } = input;
   if (!config.enabled) {
@@ -164,17 +396,13 @@ export function isDirectAnswerEligible(
     return status === "active";
   });
-  working = applyFilter(working, filteredBy, FILTER_LABELS.notTrustedZone, (c) =>
-    c.trustZone === "trusted",
-  );
+  working = applyFilter(working, filteredBy, FILTER_LABELS.notTrustedZone, (c) => c.trustZone === "trusted");
   working = applyFilter(
     working,
     filteredBy,
     FILTER_LABELS.ineligibleTaxonomyBucket,
-    (c) =>
-      c.taxonomyBucket !== null &&
-      config.eligibleTaxonomyBuckets.includes(c.taxonomyBucket),
+    (c) => c.taxonomyBucket !== null && config.eligibleTaxonomyBuckets.includes(c.taxonomyBucket)
   );
   working = applyFilter(working, filteredBy, FILTER_LABELS.belowImportanceFloor, (c) => {
@@ -201,13 +429,24 @@ export function isDirectAnswerEligible(
   }
   const scored: ScoredCandidate[] = working.map((candidate) => {
-    const searchable =
-      `${candidate.memory.frontmatter.tags?.join(" ") ?? ""} ${candidate.memory.content}`.trim();
-    const matches = countRecallTokenOverlap(queryTokens, searchable);
-    return { candidate, tokenOverlap: matches / queryTokens.size };
+    const searchable = `${candidate.memory.frontmatter.tags?.join(" ") ?? ""} ${candidate.memory.content}`.trim();
+    const searchableTokens = normalizeRecallTokenSet(searchable);
+    const requiredSearchableTokens = normalizeRecallTokenSet(searchable, [], { minTokenLength: 1 });
+    const requiredPhrases = requiredCjkPhraseTokens(query);
+    const requiredMixedTokens = requiredMixedScriptTokens(query);
+    const requiredUnicodeTokens = requiredSegmentableUnicodeTokens(queryTokens);
+    const hasRequiredPhrase =
+      requiredPhrases.length === 0 || requiredPhrases.every((token) => searchableTokens.has(token));
+    const hasRequiredMixedTokens =
+      requiredMixedTokens.length === 0 || requiredMixedTokens.every((token) => requiredSearchableTokens.has(token));
+    const hasRequiredUnicodeTokens =
+      requiredUnicodeTokens.length === 0 || requiredUnicodeTokens.every((token) => searchableTokens.has(token));
+    const requiredTokenMismatch = !hasRequiredPhrase || !hasRequiredMixedTokens || !hasRequiredUnicodeTokens;
+    const matches = requiredTokenMismatch ? 0 : countTokenOverlap(queryTokens, searchableTokens);
+    return { candidate, tokenOverlap: matches / queryTokens.size, requiredTokenMismatch };
   });
-  const overlapSurvivors = scored.filter((s) => s.tokenOverlap >= config.tokenOverlapFloor);
+  const overlapSurvivors = scored.filter((s) => !s.requiredTokenMismatch && s.tokenOverlap >= config.tokenOverlapFloor);
   if (overlapSurvivors.length < scored.length) {
     filteredBy.push(FILTER_LABELS.belowTokenOverlapFloor);
   }
@@ -252,7 +491,7 @@ function applyFilter(
   working: DirectAnswerCandidate[],
   filteredBy: string[],
   label: string,
-  keep: (c: DirectAnswerCandidate) => boolean,
+  keep: (c: DirectAnswerCandidate) => boolean
 ): DirectAnswerCandidate[] {
   const before = working.length;
   const next = working.filter(keep);

package/src/recall-query-policy.ts CHANGED Viewed

@@ -1,3 +1,5 @@
+import { normalizeRecallTokens } from "./recall-tokenization.js";
 export type RecallPromptShape = "standard" | "instruction_heavy";
 export type CronConversationRecallMode = "auto" | "always" | "never";
 export type RecallBudgetMode = "full" | "minimal";
@@ -16,7 +18,7 @@ export interface RecallQueryPolicyResult {
   retrievalBudgetMode: RecallBudgetMode;
 }
-const DEFAULT_STOPWORDS = new Set([
+const DEFAULT_STOPWORDS = [
   "the",
   "and",
   "for",
@@ -72,7 +74,9 @@ const DEFAULT_STOPWORDS = new Set([
   "data",
   "gathering",
   "context",
-]);
+];
+const MAX_COMPACT_TOKENS_PER_SOURCE_TERM = 8;
+const COMPACT_IDENTIFIER_RE = /^[a-z0-9]+(?:[:_-][a-z0-9]+)+$/i;
 function collapseWhitespace(text: string): string {
   return text.replace(/\s+/g, " ").trim();
@@ -85,6 +89,24 @@ function stripFilesystemLikePaths(text: string): string {
     .replace(/(?:^|\s)([A-Za-z]:\\[^\s)]+)(?=\s|$)/g, " ");
 }
+function trimCompactSourceTerm(term: string): string {
+  return term.replace(/^[^\p{L}\p{N}\p{M}]+|[^\p{L}\p{N}\p{M}]+$/gu, "");
+}
+function splitCompactSourceTerm(term: string): string[] {
+  return term
+    .split(/[^\p{L}\p{N}\p{M}:_-]+/gu)
+    .map((part) => trimCompactSourceTerm(part))
+    .filter((part) => part.length > 0);
+}
+function capCompactSourceTermTokens(tokens: string[]): string[] {
+  if (tokens.length <= MAX_COMPACT_TOKENS_PER_SOURCE_TERM) return tokens;
+  const last = tokens.at(-1);
+  if (!last) return tokens.slice(0, MAX_COMPACT_TOKENS_PER_SOURCE_TERM);
+  return [...tokens.slice(0, MAX_COMPACT_TOKENS_PER_SOURCE_TERM - 1), last];
+}
 function isBulletOrNumberedLine(line: string): boolean {
   if (line.startsWith("-") || line.startsWith("*")) {
     return true;
@@ -112,16 +134,14 @@ function scoreInstructionHeavyShape(prompt: string): number {
   const headingLineCount = lines.filter(
     (line) =>
       /^(goal|output format|tone rules|grounding rules|data gathering|date computation|crm context|follow-up|social|current time|return)\b/i.test(
-        line,
-      ) || /^[A-Z][A-Z\s/-]{4,}:$/.test(line),
+        line
+      ) || /^[A-Z][A-Z\s/-]{4,}:$/.test(line)
   ).length;
   const bulletLineCount = lines.filter((line) => isBulletOrNumberedLine(line)).length;
   const longLineCount = lines.filter((line) => line.length >= 180).length;
-  const hasPathDensity =
-    (prompt.match(/(?:~\/|\/Users\/|[A-Za-z]:\\)/g)?.length ?? 0) >= 2;
+  const hasPathDensity = (prompt.match(/(?:~\/|\/Users\/|[A-Za-z]:\\)/g)?.length ?? 0) >= 2;
   const hasImperativeDensity =
-    (prompt.match(/\b(run|extract|read|parse|determine|include|omit|skip)\b/gi)?.length ?? 0) >=
-    8;
+    (prompt.match(/\b(run|extract|read|parse|determine|include|omit|skip)\b/gi)?.length ?? 0) >= 8;
   let score = 0;
   if (lineCount >= 24) score += 2;
@@ -139,27 +159,29 @@ export function classifyRecallPromptShape(prompt: string): RecallPromptShape {
 }
 function tokenizeForCompactQuery(text: string): string[] {
-  const raw = text
-    .toLowerCase()
-    .replace(/[^a-z0-9\s:_-]+/g, " ")
-    .split(/\s+/)
-    .filter((token) => token.length >= 3);
-  const deduped: string[] = [];
+  const tokens: string[] = [];
   const seen = new Set<string>();
-  for (const token of raw) {
-    if (DEFAULT_STOPWORDS.has(token)) continue;
-    if (seen.has(token)) continue;
+  const addToken = (token: string) => {
+    if (token.length === 0 || seen.has(token)) return;
     seen.add(token);
-    deduped.push(token);
+    tokens.push(token);
+  };
+  for (const rawTerm of text.split(/\s+/)) {
+    for (const sourceTerm of splitCompactSourceTerm(trimCompactSourceTerm(rawTerm))) {
+      if (COMPACT_IDENTIFIER_RE.test(sourceTerm)) {
+        addToken(sourceTerm.toLowerCase());
+        continue;
+      }
+      for (const token of capCompactSourceTermTokens(normalizeRecallTokens(sourceTerm, DEFAULT_STOPWORDS))) {
+        addToken(token);
+      }
+    }
   }
-  return deduped;
+  return tokens;
 }
-function buildInstructionHeavyQuery(
-  prompt: string,
-  tokenCap: number,
-  maxChars: number,
-): string {
+function buildInstructionHeavyQuery(prompt: string, tokenCap: number, maxChars: number): string {
   const cleaned = stripFilesystemLikePaths(prompt);
   const tokens = tokenizeForCompactQuery(cleaned).slice(0, Math.max(8, tokenCap));
   const joined = tokens.join(" ");
@@ -182,7 +204,7 @@ function buildStandardQuery(prompt: string, maxChars: number): string {
 export function buildRecallQueryPolicy(
   prompt: string,
   sessionKey: string | undefined,
-  cfg: RecallQueryPolicyConfig,
+  cfg: RecallQueryPolicyConfig
 ): RecallQueryPolicyResult {
   const normalizedPrompt = collapseWhitespace(prompt);
   const isCron = (sessionKey ?? "").includes(":cron:");
@@ -207,8 +229,8 @@ export function buildRecallQueryPolicy(
     cfg.cronConversationRecallMode === "never"
       ? true
       : cfg.cronConversationRecallMode === "always"
-      ? false
-      : promptShape === "instruction_heavy";
+        ? false
+        : promptShape === "instruction_heavy";
   const retrievalBudgetMode = promptShape === "instruction_heavy" ? "minimal" : "full";

package/src/recall-tokenization.ts CHANGED Viewed

@@ -1,32 +1,142 @@
-export function normalizeRecallTokens(value: string, extraStopWords: string[] = []): string[] {
-  const stopWords = new Set([
-    "the",
-    "and",
-    "for",
-    "with",
-    "from",
-    "into",
-    "that",
-    "this",
-    "why",
-    "did",
-    ...extraStopWords,
-  ]);
-  return value
+export interface NormalizeRecallTokenOptions {
+  minTokenLength?: number;
+}
+const DEFAULT_RECALL_STOP_WORDS = ["the", "and", "for", "with", "from", "into", "that", "this", "why", "did"];
+function isUnsegmentableRecallChar(char: string): boolean {
+  if (char === "ー" || char === "ｰ") return true;
+  return /[\p{Script=Han}\p{Script=Hiragana}\p{Script=Katakana}]/u.test(char);
+}
+function isRecallCombiningMark(char: string): boolean {
+  return /\p{M}/u.test(char);
+}
+function buildRecallStopWords(extraStopWords: string[]): Set<string> {
+  return new Set([...DEFAULT_RECALL_STOP_WORDS, ...extraStopWords.map((word) => word.toLowerCase())]);
+}
+function shouldKeepRecallToken(token: string, minTokenLength: number, stopWords: Set<string>): boolean {
+  if (stopWords.has(token)) return false;
+  if (token.length >= minTokenLength) return true;
+  const hasNonAsciiCodepoint = [...token].some((ch) => (ch.codePointAt(0) ?? 0) > 0x7f);
+  return token.length >= 2 && hasNonAsciiCodepoint && /\p{L}/u.test(token);
+}
+function addUnsegmentableRecallSegment(tokens: Set<string>, segment: string, stopWords: Set<string>): void {
+  const chars = [...segment].filter((ch) => /[\p{L}\p{N}\p{M}]/u.test(ch) || isUnsegmentableRecallChar(ch));
+  for (const ch of chars) {
+    if (!stopWords.has(ch)) tokens.add(ch);
+  }
+  for (const size of [2, 3, 4]) {
+    if (chars.length < size) continue;
+    for (let index = 0; index <= chars.length - size; index += 1) {
+      const token = chars.slice(index, index + size).join("");
+      if (!stopWords.has(token)) tokens.add(token);
+    }
+  }
+  const whole = chars.join("");
+  if (whole.length > 3 && !stopWords.has(whole)) {
+    tokens.add(whole);
+  }
+}
+function isUnsegmentableRecallToken(token: string): boolean {
+  const chars = [...token].filter((ch) => /[\p{L}\p{N}\p{M}]/u.test(ch) || isUnsegmentableRecallChar(ch));
+  return (
+    chars.length > 0 &&
+    chars.some(isUnsegmentableRecallChar) &&
+    chars.every((ch) => isUnsegmentableRecallChar(ch) || isRecallCombiningMark(ch))
+  );
+}
+function addBridgedUnsegmentableRecallSegments(tokens: Set<string>, cleaned: string, stopWords: Set<string>): void {
+  let segment = "";
+  const flushSegment = () => {
+    addUnsegmentableRecallSegment(tokens, segment, stopWords);
+    segment = "";
+  };
+  for (const token of cleaned.split(/\s+/)) {
+    if (isUnsegmentableRecallToken(token)) {
+      segment += token;
+    } else {
+      flushSegment();
+    }
+  }
+  flushSegment();
+}
+export function normalizeRecallTokenSet(
+  value: string,
+  extraStopWords: string[] = [],
+  options: NormalizeRecallTokenOptions = {}
+): Set<string> {
+  const minTokenLength = Math.max(1, Math.floor(options.minTokenLength ?? 3));
+  const stopWords = buildRecallStopWords(extraStopWords);
+  const cleaned = value
     .toLowerCase()
-    .split(/[^a-z0-9]+/)
-    .map((token) => token.trim())
-    .filter((token) => token.length >= 3 && !stopWords.has(token));
+    .normalize("NFC")
+    .replace(/[^\p{L}\p{N}\p{M}\u30fc\uff70]+/gu, " ")
+    .trim();
+  if (cleaned.length === 0) return new Set();
+  const tokens = new Set<string>();
+  addBridgedUnsegmentableRecallSegments(tokens, cleaned, stopWords);
+  for (const token of cleaned.split(/\s+/)) {
+    if (!token) continue;
+    if ([...token].some(isUnsegmentableRecallChar)) {
+      let segment = "";
+      let unsegmentableSegment = "";
+      const flushSegment = () => {
+        if (shouldKeepRecallToken(segment, minTokenLength, stopWords)) {
+          tokens.add(segment);
+        }
+        segment = "";
+      };
+      const flushUnsegmentableSegment = () => {
+        addUnsegmentableRecallSegment(tokens, unsegmentableSegment, stopWords);
+        unsegmentableSegment = "";
+      };
+      for (const ch of token) {
+        if (!/[\p{L}\p{N}\p{M}]/u.test(ch) && !isUnsegmentableRecallChar(ch)) continue;
+        if (isUnsegmentableRecallChar(ch)) {
+          flushSegment();
+          unsegmentableSegment += ch;
+        } else if (isRecallCombiningMark(ch)) {
+          if (unsegmentableSegment.length > 0) {
+            unsegmentableSegment += ch;
+          } else {
+            segment += ch;
+          }
+        } else {
+          flushUnsegmentableSegment();
+          segment += ch;
+        }
+      }
+      flushUnsegmentableSegment();
+      flushSegment();
+      continue;
+    }
+    if (shouldKeepRecallToken(token, minTokenLength, stopWords)) {
+      tokens.add(token);
+    }
+  }
+  return tokens;
+}
+export function normalizeRecallTokens(value: string, extraStopWords: string[] = []): string[] {
+  return Array.from(normalizeRecallTokenSet(value, extraStopWords));
 }
 export function countRecallTokenOverlap(
   queryTokens: Set<string>,
   value: string | undefined,
-  extraStopWords: string[] = [],
+  extraStopWords: string[] = []
 ): number {
   if (!value) return 0;
-  const tokens = new Set(normalizeRecallTokens(value, extraStopWords));
+  const tokens = normalizeRecallTokenSet(value, extraStopWords);
   let matches = 0;
   for (const token of queryTokens) {
     if (tokens.has(token)) matches += 1;

package/src/temporal-index.ts CHANGED Viewed

@@ -76,9 +76,7 @@ function tagIndexPath(memoryDir: string): string {
 function ensureStateDir(memoryDir: string): void {
   const dir = stateDir(memoryDir);
-  if (!fs.existsSync(dir)) {
-    fs.mkdirSync(dir, { recursive: true });
-  }
+  fs.mkdirSync(dir, { recursive: true });
 }
 function readJsonSafe<T>(filePath: string, fallback: T): T {
@@ -187,6 +185,13 @@ function lockOwnerIsRunning(owner: IndexLockOwner): boolean {
   return runningStartedAtMs <= owner.processStartedAtMs + INDEX_PROCESS_START_TOLERANCE_MS;
 }
+function lockIsFresh(lockInfo: fs.Stats, owner: IndexLockOwner | null): boolean {
+  const ownerCreatedAtMs =
+    typeof owner?.createdAt === "string" && owner.createdAt.length > 0 ? Date.parse(owner.createdAt) : Number.NaN;
+  const referenceMs = Number.isFinite(ownerCreatedAtMs) ? ownerCreatedAtMs : lockInfo.mtimeMs;
+  return Date.now() - referenceMs < INDEX_LOCK_STALE_MS;
+}
 function removeAbandonedIndexLock(lockDir: string): IndexLockCleanupResult {
   try {
     const info = fs.lstatSync(lockDir);
@@ -196,11 +201,14 @@ function removeAbandonedIndexLock(lockDir: string): IndexLockCleanupResult {
       return "removed";
     }
     const owner = readIndexLockOwner(lockDir);
-    if (owner !== null && lockOwnerIsRunning(owner)) return "wait";
-    if (owner === null && Date.now() - info.mtimeMs < INDEX_LOCK_STALE_MS) return "wait";
+    if (owner !== null) {
+      if (lockOwnerIsRunning(owner)) return "wait";
+    }
+    if (owner === null && lockIsFresh(info, null)) return "wait";
     fs.rmSync(lockDir, { recursive: true, force: true });
     return "removed";
-  } catch {
+  } catch (error) {
+    if ((error as NodeJS.ErrnoException)?.code === "ENOENT") return "removed";
     // Fail silently — indexes are advisory only
     return "blocked";
   }
@@ -217,6 +225,15 @@ function withIndexFileLock(filePath: string, update: () => void): void {
       acquired = true;
     } catch (error) {
       const code = (error as NodeJS.ErrnoException)?.code;
+      if (code === "ENOENT") {
+        try {
+          fs.mkdirSync(path.dirname(lockDir), { recursive: true });
+        } catch {
+          return;
+        }
+        sleepSync(INDEX_LOCK_POLL_MS);
+        continue;
+      }
       if (code !== "EEXIST") return;
       const cleanupResult = removeAbandonedIndexLock(lockDir);
       if (cleanupResult === "blocked") return;