npm - @rubytech/create-realagent - Versions diffs - 1.0.759 → 1.0.761 - Mend

@rubytech/create-realagent 1.0.759 → 1.0.761

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (46) hide show

package/payload/platform/plugins/whatsapp-import/lib/src/parse-export.ts ADDED Viewed

@@ -0,0 +1,385 @@
+import { createHash } from "node:crypto";
+import { readFileSync } from "node:fs";
+// ---------------------------------------------------------------------------
+// parse-export — deterministic WhatsApp `_chat.txt` parser (Task 805).
+//
+// Pure function. No LLM in the per-line decision path. Replaces the prose
+// grammar that lived in references/export-parse.md when the database-operator
+// specialist's Sonnet was the line tokeniser. Every grammar branch here is
+// exercised by the vitest grid in `__tests__/parse-export.test.ts`; that
+// grid IS the contract — extending the grammar means a new test first.
+//
+// Doctrine alignment:
+//   - feedback_deterministic_means_remove_llm.md — the LLM is no longer in
+//     the per-line decision path.
+//   - feedback_deterministic_is_a_shell_script.md — TypeScript is the right
+//     deliverable shape here (UTF-8 decode + multi-line body assembly + sha256
+//     would be cumbersome in shell); the LITERAL-MAPPING rule yields to
+//     "Node module" because the per-line decision path is the deliverable, not
+//     a one-shot orchestrator.
+//   - feedback_loud_failures.md — encoding errors, empty files, and lines
+//     that match a timestamp prefix but cannot be tokenised throw with named
+//     reasons rather than degrading silently.
+// ---------------------------------------------------------------------------
+/**
+ * Arguments accepted by `parseExport`.
+ *
+ * `accountId` and `timezone` are rejected when blank; `filePath` is read
+ * synchronously and in full.
+ */
+export interface ParseExportInput {
+  /** Absolute path to the `_chat.txt` file. */
+  filePath: string;
+  /** Account scope used to compose `conversationId`. */
+  accountId: string;
+  /** IANA timezone the operator confirmed (e.g. `Europe/London`). */
+  timezone: string;
+  /** Defaults to `DD/MM/YY`; operator confirms when locale is ambiguous. */
+  dateFormat?: "DD/MM/YY" | "MM/DD/YY";
+}
+/** One emitted chat message; system and media-only messages never appear here. */
+export interface ParsedLine {
+  /** Text before the first `: ` separator, trimmed. */
+  senderName: string;
+  /** ISO 8601 with timezone offset for the supplied IANA zone. */
+  dateSent: string;
+  /** Message text with trailing whitespace removed; continuation lines are joined with `\n`. */
+  body: string;
+  /** Position within emitted (post-skip) messages, 0-based. */
+  sequenceIndex: number;
+}
+/** Tallies of how each tokenised message was classified. */
+export interface ParseExportCounters {
+  /** Messages emitted into `parsedLines`. */
+  parsed: number;
+  /** Group/security events, empty bodies, and deleted-message placeholders. */
+  systemSkipped: number;
+  /** Messages whose whole body is a media/attachment placeholder. */
+  mediaSkipped: number;
+  /** Incremented immediately before the parser throws, so a returned result always carries 0. */
+  parseErrors: number;
+}
+/** Value returned by `parseExport`. */
+export interface ParseExportResult {
+  /** `whatsapp-export:<sha256-hex>:<accountId>`. */
+  conversationId: string;
+  /** `whatsapp-export:<sha256-hex>` of the raw file bytes. */
+  archiveSourceFile: string;
+  /** Emitted messages in file order. */
+  parsedLines: ParsedLine[];
+  /** Classification tallies for the whole file. */
+  counters: ParseExportCounters;
+}
+// Bracketed timestamp prefix: `[NN/NN/YY, H:MM]` or `[NN/NN/YY, H:MM:SS]`.
+// The two date fields and the year must each be exactly two digits; the hour
+// may be one or two. There is no AM/PM group and no four-digit-year form.
+// Capture groups: 1-2 date fields, 3 year, 4 hour, 5 minute, 6 optional
+// seconds, 7 the rest of the line after `]` (leading whitespace dropped).
+const TIMESTAMP_PREFIX_DDMMYY =
+  /^\[(\d{2})\/(\d{2})\/(\d{2}),\s+(\d{1,2}):(\d{2})(?::(\d{2}))?\]\s*(.*)$/;
+const TIMESTAMP_PREFIX_MMDDYY = TIMESTAMP_PREFIX_DDMMYY; // shape is identical; ordering differs in interpretation only
+// System-message patterns that appear WITHOUT a `: ` sender/body separator.
+// WhatsApp emits group-event and security-code lines as `<Sender> <verb> ...`
+// (no colon). Lines that match the timestamp prefix but lack `: ` and do not
+// match one of these patterns are LOUD-FAIL parse errors — never silently
+// dropped.
+//
+// The patterns are tested against the remainder with trailing whitespace
+// stripped. Several entries (` added `, ` removed `) are unanchored and match
+// anywhere in the text; this list is consulted only for remainders that
+// contain no `: ` separator, so ordinary message bodies never reach it.
+const LINE_LEVEL_SYSTEM_PATTERNS: RegExp[] = [
+  /^Messages and calls are end-to-end encrypted/i,
+  /'s security code changed\.?$/i,
+  / created group ["“”]/,
+  / added /,
+  / removed /,
+  / left$/,
+  / changed the subject from /,
+  / changed this group's icon/,
+  / joined using this group's invite link/,
+  /^You're now an admin$/i,
+  /^You created group/i,
+];
+// Body-level patterns evaluated after `Sender: body` split. These are real
+// messages syntactically but carry no graph value (deletion placeholders).
+// A match is counted under `systemSkipped`. Media-only bodies are handled by
+// the separate MEDIA_ONLY_PATTERNS list.
+const BODY_LEVEL_SYSTEM_PATTERNS: RegExp[] = [
+  /^You deleted this message\.?$/,
+  /^This message was deleted\.?$/,
+];
+// Bodies that consist solely of a media/attachment placeholder. A match is
+// counted under `mediaSkipped`. Every pattern is anchored at both ends of the
+// whole body, so a placeholder followed by extra text on the same line does
+// not match and is emitted as a normal message.
+const MEDIA_ONLY_PATTERNS: RegExp[] = [
+  /^<Media omitted>$/,
+  /^IMG-\d+-\w+\.(jpg|jpeg|png|heic|gif)\s*\(file attached\)$/i,
+  /^VID-\d+-\w+\.mp4\s*\(file attached\)$/i,
+  /^PTT-\d+-\w+\.opus\s*\(file attached\)$/i,
+  /^AUD-\d+-\w+\.opus\s*\(file attached\)$/i,
+  /^STK-\d+-\w+\.webp\s*\(file attached\)$/i,
+  /^.+\.(pdf|docx|doc|xlsx|xls|pptx|ppt|zip|csv|txt)\s*\(file attached\)$/i,
+  /^‎.+attached:\s*.+$/, // alternative LRM-prefixed format on some platforms
+];
+/**
+ * Parse a WhatsApp `_chat.txt` export into ordered messages plus skip counters.
+ *
+ * Reads the file synchronously, hashes the raw bytes (sha256) to derive
+ * `archiveSourceFile` / `conversationId`, strictly decodes the bytes as UTF-8,
+ * then runs two stages: tokenise timestamp-prefixed lines (folding
+ * continuation lines into the preceding message) and classify each message
+ * as system, media-only, or real.
+ *
+ * @param input - File path, account scope, IANA timezone and optional date order.
+ * @returns Conversation identifiers, emitted messages in file order, and counters.
+ * @throws Error when `accountId` or `timezone` is blank, the file is empty or
+ *   not valid UTF-8, a line has the timestamp shape but out-of-range
+ *   date/time fields, a timestamped line has no `: ` separator and matches no
+ *   system pattern, or nothing in the file could be classified.
+ */
+export function parseExport(input: ParseExportInput): ParseExportResult {
+  const { filePath, accountId, timezone, dateFormat = "DD/MM/YY" } = input;
+  if (!accountId || !accountId.trim()) {
+    throw new Error("parse-export: accountId is required.");
+  }
+  if (!timezone || !timezone.trim()) {
+    throw new Error("parse-export: timezone is required (e.g. 'Europe/London').");
+  }
+  // Hash the RAW bytes (before decoding/normalising) so the identifier is a
+  // property of the file on disk, not of this parser's text handling.
+  const rawBytes = readFileSync(filePath);
+  const sha256Hex = createHash("sha256").update(rawBytes).digest("hex");
+  const archiveSourceFile = `whatsapp-export:${sha256Hex}`;
+  const conversationId = `whatsapp-export:${sha256Hex}:${accountId}`;
+  const text = decodeAndNormalise(rawBytes);
+  if (text.length === 0) {
+    throw new Error(
+      `parse-export: file is empty — not a _chat.txt. file=${filePath}`,
+    );
+  }
+  const lines = text.split("\n");
+  const counters: ParseExportCounters = {
+    parsed: 0,
+    systemSkipped: 0,
+    mediaSkipped: 0,
+    parseErrors: 0,
+  };
+  // Stage 1 — tokenise into raw messages (timestamp + remainder), accumulating
+  // continuation lines into the previous remainder. Stage 2 then categorises
+  // each tokenised message (system / media / real) so the counter increments
+  // happen exactly once per tokenised message.
+  interface RawMessage {
+    rawLineIndex: number; // 1-based file line number for LOUD-FAIL diagnostics
+    year: number;
+    month: number;
+    day: number;
+    hour: number;
+    minute: number;
+    second: number;
+    remainder: string; // everything after `]` on the prefix line, plus continuation lines
+  }
+  const raw: RawMessage[] = [];
+  for (let i = 0; i < lines.length; i++) {
+    const line = lines[i];
+    if (line.length === 0 && i === lines.length - 1) continue; // trailing newline
+    const prefixMatch = matchTimestampPrefix(line, dateFormat);
+    if (prefixMatch) {
+      raw.push({
+        rawLineIndex: i + 1,
+        ...prefixMatch.dateParts,
+        remainder: prefixMatch.remainder,
+      });
+    } else {
+      // A line that has the bracketed-timestamp shape but was rejected by the
+      // range check is NOT a continuation line. Folding it into the previous
+      // body would silently merge two messages (typical cause: the wrong
+      // `dateFormat` for the export's locale), so fail loudly instead.
+      if (TIMESTAMP_PREFIX_DDMMYY.test(line)) {
+        counters.parseErrors++;
+        throw new Error(
+          `parse-export: parse-error file=${filePath} line=${i + 1} reason=timestamp-out-of-range dateFormat=${dateFormat} content="${line.slice(0, 80)}"`,
+        );
+      }
+      // Continuation of the previous message body. If there is no previous
+      // message, this line is leading garbage — ignore it (matches the
+      // export-parse.md edge case where a leading BOM or blank line precedes
+      // the first timestamp).
+      const last = raw[raw.length - 1];
+      if (last) {
+        last.remainder += "\n" + line;
+      }
+    }
+  }
+  // Stage 2 — categorise each raw message. Do NOT trim trailing whitespace
+  // from the remainder before splitting — `Joel: ` (sender + colon + trailing
+  // space + newline) collapses to `Joel:` after a `\s+$` trim and the `: `
+  // separator disappears, turning an empty-body system skip into a LOUD-FAIL.
+  const parsedLines: ParsedLine[] = [];
+  for (const r of raw) {
+    const remainder = r.remainder;
+    const colonIdx = findFirstColonSeparator(remainder);
+    if (colonIdx === -1) {
+      // No `: ` separator. Must match a known system pattern or LOUD-FAIL.
+      const trimmed = remainder.replace(/\s+$/, "");
+      if (matchesAny(trimmed, LINE_LEVEL_SYSTEM_PATTERNS)) {
+        counters.systemSkipped++;
+        continue;
+      }
+      counters.parseErrors++;
+      throw new Error(
+        `parse-export: parse-error file=${filePath} line=${r.rawLineIndex} reason=no-sender-body-separator content="${trimmed.slice(0, 80)}"`,
+      );
+    }
+    const senderName = remainder.slice(0, colonIdx).trim();
+    // `+ 2` skips the two-character `: ` separator itself.
+    const body = remainder.slice(colonIdx + 2).replace(/\s+$/, "");
+    if (body.length === 0) {
+      counters.systemSkipped++;
+      continue;
+    }
+    if (matchesAny(body, BODY_LEVEL_SYSTEM_PATTERNS)) {
+      counters.systemSkipped++;
+      continue;
+    }
+    if (matchesAny(body, MEDIA_ONLY_PATTERNS)) {
+      counters.mediaSkipped++;
+      continue;
+    }
+    const dateSent = isoWithOffset(
+      r.year,
+      r.month,
+      r.day,
+      r.hour,
+      r.minute,
+      r.second,
+      timezone,
+    );
+    parsedLines.push({
+      senderName,
+      dateSent,
+      body,
+      sequenceIndex: parsedLines.length,
+    });
+    counters.parsed++;
+  }
+  // Nothing at all was classified (no real, system, or media message): the
+  // file never produced a recognisable timestamped line. A file made up only
+  // of skipped system/media lines is still a valid export and returns normally.
+  if (parsedLines.length === 0 && counters.systemSkipped === 0 && counters.mediaSkipped === 0) {
+    throw new Error(
+      `parse-export: zero parsed lines after walking ${filePath} — not a _chat.txt or all lines failed grammar.`,
+    );
+  }
+  return {
+    conversationId,
+    archiveSourceFile,
+    parsedLines,
+    counters,
+  };
+}
+// ---------------------------------------------------------------------------
+// Internals
+// ---------------------------------------------------------------------------
+/**
+ * Turn the raw export bytes into LF-delimited text.
+ *
+ * Decoding is strict: an invalid UTF-8 sequence raises a named error rather
+ * than being replaced with U+FFFD (which is what `Buffer.toString('utf8')`
+ * would do). A leading U+FEFF is removed and CRLF / lone CR become LF.
+ *
+ * @param bytes - Raw file contents.
+ * @returns Decoded text without a leading BOM and with LF-only line endings.
+ * @throws Error when the bytes are not valid UTF-8.
+ */
+function decodeAndNormalise(bytes: Buffer): string {
+  let decoded: string;
+  try {
+    const strictDecoder = new TextDecoder("utf-8", { fatal: true });
+    decoded = strictDecoder.decode(bytes);
+  } catch (err) {
+    throw new Error(
+      `parse-export: UTF-8 decode failed — ${err instanceof Error ? err.message : String(err)}. The file is not valid UTF-8; re-export from WhatsApp.`,
+    );
+  }
+  // Drop a byte-order mark if one is still at the front of the text.
+  const withoutBom = decoded.charCodeAt(0) === 0xfeff ? decoded.slice(1) : decoded;
+  // One pass covers both CRLF and bare CR.
+  return withoutBom.replace(/\r\n?/g, "\n");
+}
+/** Result of recognising a timestamp prefix on one line. */
+interface TimestampMatch {
+  /** Wall-clock fields as written in the export; `year` is already the full four-digit year. */
+  dateParts: {
+    year: number;
+    month: number;
+    day: number;
+    hour: number;
+    minute: number;
+    second: number;
+  };
+  /** Text that follows the closing `]` on the same line. */
+  remainder: string;
+}
+/**
+ * Recognise a bracketed timestamp at the start of `line` and decode it.
+ *
+ * @param line - One LF-delimited line of the export.
+ * @param dateFormat - Whether the first date field is the day or the month.
+ * @returns The decoded date/time fields plus the text after `]`, or `null`
+ *   when the line lacks the prefix shape or any field is out of range. What
+ *   happens to a rejected line is the caller's decision.
+ */
+function matchTimestampPrefix(
+  line: string,
+  dateFormat: "DD/MM/YY" | "MM/DD/YY",
+): TimestampMatch | null {
+  const monthFirst = dateFormat === "MM/DD/YY";
+  const pattern = monthFirst ? TIMESTAMP_PREFIX_MMDDYY : TIMESTAMP_PREFIX_DDMMYY;
+  const groups = pattern.exec(line);
+  if (groups === null) return null;
+  const [, leadField, nextField, yearField, hourField, minuteField, secondsField, rest] = groups;
+  const lead = parseInt(leadField, 10); // day or month, depending on dateFormat
+  const next = parseInt(nextField, 10); // the other one
+  const month = monthFirst ? lead : next;
+  const day = monthFirst ? next : lead;
+  const hour = parseInt(hourField, 10);
+  const minute = parseInt(minuteField, 10);
+  // Seconds are optional in the prefix; absent means :00.
+  const second = secondsField === undefined ? 0 : parseInt(secondsField, 10);
+  // Validate here because Date.UTC would quietly roll an overflowing field
+  // into the next unit (Date.UTC(2026, 13, 1) → 2027-02-01) instead of
+  // failing. The day is bounded to 1–31 only; month length is not checked.
+  const dateInRange = month >= 1 && month <= 12 && day >= 1 && day <= 31;
+  const timeInRange = hour <= 23 && minute <= 59 && second <= 59;
+  if (!dateInRange || !timeInRange) return null;
+  return {
+    dateParts: {
+      // Two-digit years are read as 20YY.
+      year: 2000 + parseInt(yearField, 10),
+      month,
+      day,
+      hour,
+      minute,
+      second,
+    },
+    remainder: rest ?? "",
+  };
+}
+/**
+ * Locate the sender/body separator in a tokenised message.
+ *
+ * The separator is the first occurrence of colon-space. A bare `:` inside a
+ * display name ("Joel:Work") is skipped because no space follows it; a name
+ * that itself contains `: ` would be split at that earlier position.
+ *
+ * @param remainder - Text after the timestamp prefix, including continuation lines.
+ * @returns Index of the `:` in the first `: `, or -1 when there is none.
+ */
+function findFirstColonSeparator(remainder: string): number {
+  return remainder.indexOf(": ");
+}
+/**
+ * Report whether at least one pattern matches the text.
+ *
+ * Patterns are tried in array order and evaluation stops at the first hit.
+ *
+ * @param text - String to test.
+ * @param patterns - Candidate regular expressions; an empty list yields `false`.
+ */
+function matchesAny(text: string, patterns: RegExp[]): boolean {
+  return patterns.some((pattern) => pattern.test(text));
+}
+/**
+ * Render wall-clock fields as ISO 8601 with the UTC offset that `timezone`
+ * observes at that local time.
+ *
+ * The offset is resolved in two passes. The fields are first treated as if
+ * they were UTC to obtain a provisional offset; that offset is subtracted to
+ * approximate the true UTC instant, and the offset is read again at that
+ * instant. The second read is what corrects results near a DST change.
+ *
+ * @returns `YYYY-MM-DDTHH:MM:SS±HH:MM`; the date/time digits are the inputs
+ *   unchanged, only the offset suffix is computed.
+ */
+function isoWithOffset(
+  year: number,
+  month: number,
+  day: number,
+  hour: number,
+  minute: number,
+  second: number,
+  timezone: string,
+): string {
+  const pad = (value: number, width: number): string => String(value).padStart(width, "0");
+  const wallClockAsUtcMs = Date.UTC(year, month - 1, day, hour, minute, second);
+  const provisionalOffset = offsetMinutesAt(new Date(wallClockAsUtcMs), timezone);
+  const offsetMinutes = offsetMinutesAt(
+    new Date(wallClockAsUtcMs - provisionalOffset * 60_000),
+    timezone,
+  );
+  const magnitude = Math.abs(offsetMinutes);
+  const datePart = [pad(year, 4), pad(month, 2), pad(day, 2)].join("-");
+  const timePart = [pad(hour, 2), pad(minute, 2), pad(second, 2)].join(":");
+  const offsetPart =
+    (offsetMinutes >= 0 ? "+" : "-") +
+    pad(Math.floor(magnitude / 60), 2) +
+    ":" +
+    pad(magnitude % 60, 2);
+  return `${datePart}T${timePart}${offsetPart}`;
+}
+// One formatter per IANA zone. Constructing Intl.DateTimeFormat is costly and
+// this helper runs twice for every emitted message, always with the same zone.
+const OFFSET_FORMATTERS = new Map<string, Intl.DateTimeFormat>();
+/**
+ * Read the UTC offset, in minutes, that `timezone` observes at `date`.
+ *
+ * Uses Intl.DateTimeFormat with `timeZoneName: "longOffset"`, whose output is
+ * "GMT+01:00", "GMT-05:00", or plain "GMT".
+ *
+ * @param date - The UTC instant to inspect.
+ * @param timezone - IANA zone name (e.g. `Europe/London`).
+ * @returns Minutes east of UTC (negative for zones west of UTC).
+ * @throws RangeError when `timezone` is not a zone the runtime recognises.
+ * @throws Error when the formatter output has no offset part or cannot be parsed.
+ */
+function offsetMinutesAt(date: Date, timezone: string): number {
+  let formatter = OFFSET_FORMATTERS.get(timezone);
+  if (formatter === undefined) {
+    try {
+      formatter = new Intl.DateTimeFormat("en-US", {
+        timeZone: timezone,
+        timeZoneName: "longOffset",
+      });
+    } catch (err) {
+      // The runtime's own RangeError carries no parser context; rethrow the
+      // same error type with a named, attributable message.
+      throw new RangeError(
+        `parse-export: invalid IANA timezone "${timezone}" — ${err instanceof Error ? err.message : String(err)}`,
+      );
+    }
+    OFFSET_FORMATTERS.set(timezone, formatter);
+  }
+  const parts = formatter.formatToParts(date);
+  const tzPart = parts.find((p) => p.type === "timeZoneName");
+  if (!tzPart) {
+    throw new Error(`parse-export: unable to read offset for timezone "${timezone}".`);
+  }
+  const value = tzPart.value;
+  // A zero offset is rendered without a numeric suffix.
+  if (value === "GMT" || value === "UTC") return 0;
+  const m = value.match(/^(?:GMT|UTC)([+-])(\d{1,2}):?(\d{2})?$/);
+  if (!m) {
+    throw new Error(
+      `parse-export: cannot parse timezone offset "${value}" for IANA zone "${timezone}".`,
+    );
+  }
+  const sign = m[1] === "+" ? 1 : -1;
+  const hh = parseInt(m[2], 10);
+  const mm = m[3] ? parseInt(m[3], 10) : 0; // minutes group is optional
+  return sign * (hh * 60 + mm);
+}

package/payload/platform/plugins/whatsapp-import/lib/tsconfig.json ADDED Viewed

@@ -0,0 +1,9 @@
+{
+  "extends": "../../../tsconfig.base.json",
+  "compilerOptions": {
+    "outDir": "dist",
+    "rootDir": "src"
+  },
+  "include": ["src"],
+  "exclude": ["src/__tests__"]
+}

package/payload/platform/plugins/whatsapp-import/lib/vitest.config.ts ADDED Viewed

@@ -0,0 +1,9 @@
+import { defineConfig } from "vitest/config";
+// Vitest configuration for the parser library: Node environment, vitest
+// globals switched off, and only `*.test.ts` files under `src/__tests__/`
+// are collected.
+export default defineConfig({
+  test: {
+    environment: "node",
+    globals: false,
+    include: ["src/__tests__/**/*.test.ts"],
+  },
+});

package/payload/platform/plugins/whatsapp-import/skills/whatsapp-import/SKILL.md CHANGED Viewed

@@ -27,7 +27,9 @@ The owner is metadata: who exported this chat. Stamped on the `:Conversation` no
 ### Step 2 — Participants
-Parse the `_chat.txt` per [export-parse.md](references/export-parse.md). For each distinct sender name, capture: `{senderName, firstSeen, lastSeen, messageCount}`. Display the list in chat with these counts; the operator sees who they're about to ingest before any write.
+Parse the `_chat.txt` by invoking `mcp__memory__whatsapp-export-parse(filePath: <path>, timezone: <iana-zone>)` (Task 805). The tool returns `{conversationId, archiveSourceFile, parsedLines[], counters}` — the deterministic Node parser in [platform/plugins/whatsapp-import/lib/](../../../lib/) walks the line grammar; the agent does not tokenise lines itself. See [export-parse.md](references/export-parse.md) for the parser's behaviour reference.
+For each distinct sender name in `parsedLines[]`, capture: `{senderName, firstSeen, lastSeen, messageCount}`. Display the list in chat with these counts; the operator sees who they're about to ingest before any write.
 For each distinct sender, ask the operator to choose:
@@ -80,17 +82,17 @@ Convert each parsed timestamp to ISO 8601 with the supplied offset before passin
 ## Execution model
-1. **Parse** — Read `_chat.txt` per [export-parse.md](references/export-parse.md). Build the parsed-line structure: `{senderName, dateSent, body, sequenceIndex}`. Skip system messages and media-only lines, increment counters.
+1. **Parse** — Invoke `mcp__memory__whatsapp-export-parse(filePath, timezone, dateFormat?)`. The deterministic parser walks the line grammar, returns `{conversationId, archiveSourceFile, parsedLines[], counters}`. LOUD-FAIL on encoding error / empty file / malformed timestamp surfaces as the tool's `isError` content; the skill aborts the import without further work. The `archiveSourceFile` is `whatsapp-export:<sha256-of-file-bytes>` — keep this exact value; `memory-archive-write` will recompute and assert it matches in Step 6.
 2. **Owner+participant confirmation** — Steps 1–3 above. Persist `$ownerNodeId` + `$participantNodeIds`.
 3. **Selective-ingest gate** — If `parsedLines.length > 100`, pause for filter selection. Apply filter.
 4. **Build rows[]** — Map each parsed line to `{messageId, conversationId, senderNodeId, senderName, dateSent (ISO 8601), body, sequenceIndex}`. Compute `messageId` per line.
-5. **Build conversation block** — `{conversationId, archiveSourceFile, firstMessageAt, lastMessageAt, participantCount, messageCount}` from the rows[].
-6. **Dispatch** `mcp__memory__memory-archive-write` once with `archiveType='whatsapp-export'`, `ownerNodeId`, `conversation`, `participantNodeIds` (the distinct elementIds from the map), `rows`, `sessionId`. The tool MERGEs the Conversation, MERGEs Messages, links PART_OF + SENT + PARTICIPANT_IN edges per row, and runs the `finalize` hook to MERGE the NEXT chronology by dateSent ordering.
+5. **Build conversation block** — `{conversationId, archiveSourceFile, firstMessageAt, lastMessageAt, participantCount, messageCount}` from the rows[]. `conversationId` and `archiveSourceFile` come straight from the parser's return value.
+6. **Dispatch** `mcp__memory__memory-archive-write` once with `archiveType='whatsapp-export'`, `ownerNodeId`, `conversation`, `participantNodeIds` (the distinct elementIds from the map), `rows`, `sessionId`, **and `archiveFilePath: <same path you passed to whatsapp-export-parse>`**. The server re-computes `sha256(file)` and asserts it matches `conversation.archiveSourceFile` before any write — mismatch is a hard reject (Task 805 silent-substitution gate). The tool MERGEs the Conversation, MERGEs Messages, links PART_OF + SENT + PARTICIPANT_IN edges per row, and runs the `finalize` hook to MERGE the NEXT chronology by dateSent ordering.
 7. **Emit per-export log line:**
    ```
    [whatsapp-import] file=<chat.txt> conversationId=<cid> participants=<n> messages-parsed=<n> media-skipped=<n> system-skipped=<n> ms=<elapsed>
    ```
-8. **Insight pass** — Run pass 2 per [insight-extraction.md](references/insight-extraction.md). Read the just-written messages via `memory-search`, classify within the specialist's own LLM turn, and write typed observations through `memory-write` / `memory-update`. Emit:
+8. **Insight pass** — Run pass 2 per [insight-extraction.md](references/insight-extraction.md). Read the just-written messages via `memory-search`, classify within the specialist's own LLM turn, and write typed observations through `memory-write` / `memory-update`. **`:MENTIONS` and `:RELATED_TO` edges route through `mcp__memory__whatsapp-export-insight-write` (Task 805) — that tool re-runs `memory-search` server-side and asserts the agent's claimed candidate elementIds appear in the live result; rejects single-first-name names without `disambiguatorOk=true`; refuses `:RELATED_TO` writes without `operatorConfirmed=true`. The agent never authors `:MENTIONS` / `:RELATED_TO` Cypher directly.** Emit:
    ```
    [whatsapp-import] insight-pass model=sonnet chunks=<n> mentions=<n> preferences=<n> tasks=<n> observed-relationships=<n> novel-insights=<n> ms=<elapsed>
    ```
@@ -101,13 +103,9 @@ All writes route through `mcp__memory__memory-archive-write` (bulk Conversation+
 ## LOUD-FAIL on parse errors
-If [export-parse.md](references/export-parse.md)'s grammar doesn't match a line (genuine parser failure, not a documented skip case), emit:
-```
-[whatsapp-import] parse-error file=<chat.txt> line=<n> reason=<r>
-```
+`mcp__memory__whatsapp-export-parse` is the LOUD-FAIL surface (Task 805). When the grammar can't classify a line, the tool throws with `parse-error file=<...> line=<n> reason=<r>` and the MCP layer returns `isError: true` with that message. The skill MUST abort the import on a parse-error response — do not retry, do not "best effort" the rest of the file. The operator gets a named error and re-exports if necessary.
-…and abort the import. Do NOT silently truncate or guess. The operator gets a named error; we keep no half-truths in the graph.
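+The deterministic parser also LOUD-FAILs on UTF-8 decode failure (`parse-export: UTF-8 decode failed — …`), on an empty file or a file in which nothing could be classified (`parse-export: file is empty — …` / `parse-export: zero parsed lines after walking …`), and on missing required arguments (`parse-export: accountId is required.` / `parse-export: timezone is required …`). Unlike line-level parse errors, the parser's messages for these cases carry no `reason=` token — match on the `parse-export:` prefix. All of these surface through the same tool error path; the agent does not need to detect them itself.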
 ## Idempotency contract

package/payload/platform/plugins/whatsapp-import/skills/whatsapp-import/references/export-parse.md CHANGED Viewed

@@ -1,6 +1,8 @@
-# Reference: `_chat.txt` parsing
+# Reference: `_chat.txt` parsing — implementation reference
-WhatsApp's "Export Chat" produces a UTF-8 text file with a deterministic line grammar. This reference is the contract for converting that file into the parsed-line structure the [SKILL.md](../SKILL.md) builds rows from.
+> **Task 805 — this is no longer operator instruction.** The agent does NOT walk this grammar in its own LLM turn. Parsing runs deterministically in [`platform/plugins/whatsapp-import/lib/src/parse-export.ts`](../../../lib/src/parse-export.ts), invoked via `mcp__memory__whatsapp-export-parse`. The vitest grid in [`lib/src/__tests__/parse-export.test.ts`](../../../lib/src/__tests__/parse-export.test.ts) is the executable contract; this prose is the human-readable companion. Extend the grammar by adding a failing test first.
+WhatsApp's "Export Chat" produces a UTF-8 text file with a deterministic line grammar. This reference describes what the parser library does when it converts that file into the `{senderName, dateSent, body, sequenceIndex}[]` structure the SKILL.md consumes.
 ## File-open invariants
@@ -95,8 +97,11 @@ The skill consumes this directly. The `messageId` is computed by the skill (not
 ## When to LOUD-FAIL
-- Encoding error at file open (UTF-8 decode fails partway).
-- Zero parsed lines after walking the file (the file isn't a `_chat.txt`).
-- A timestamp prefix matches but the body parse fails (no `: ` separator after the closing `]`) — emit `[whatsapp-import] parse-error file=<...> line=<n> reason=<r>` and abort.
+The parser throws (and `whatsapp-export-parse` returns `isError: true`) on:
+- Encoding error at file open (UTF-8 decode fails — the parser uses `TextDecoder` with `fatal: true`, so any invalid byte sequence aborts loudly rather than silently substituting U+FFFD).
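+- Empty file, or a file in which no line could be classified at all — zero parsed messages AND zero system/media skips (the file isn't a `_chat.txt`). A file that contains only skipped system or media lines is not an error: the parser returns normally with `counters.parsed = 0`.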
+- A timestamp prefix matches but the body parse fails (no `: ` separator after the closing `]` AND no system-pattern match) — emits `parse-error file=<...> line=<n> reason=no-sender-body-separator content="<...>"`.
+- Missing required input (`accountId`, `timezone`).
 Never silently drop data the parser couldn't classify. The operator chooses to skip; the parser does not choose for them.

package/payload/platform/plugins/whatsapp-import/skills/whatsapp-import/references/insight-extraction.md CHANGED Viewed

@@ -15,28 +15,27 @@ This pass runs INLINE in the database-operator specialist's own LLM turn — Son
 | Inter-person relationship | (matched `:Person` nodes) | `:Person` | `:Person` | `:RELATED_TO` (with `kind`, `evidenceMessageIds[]`) | Operator-confirmation gate before write — see below. |
 | Genuinely novel finding | `:Insight` (new label, last resort only) | `:Insight` | `:Message` | `:DERIVED_FROM` | Only when reuse-over-invent fails for every existing label. Self-rated `confidence` 0–1. |
-## Anti-hallucination gates
+## Anti-hallucination gates — server-enforced (Task 805)
-The biggest risk in this pass is Sonnet writing edges to wrong-Person nodes. Two gates protect the graph:
+The biggest risk in this pass is Sonnet writing edges to wrong-Person nodes. **Three gates protect the graph and they live in code, not prose.** `mcp__memory__whatsapp-export-insight-write` enforces all of them server-side; the agent cannot bypass them by skipping this section. This file describes how the gates work and what the agent must supply to pass them; the [tool source](../../../../memory/mcp/src/tools/whatsapp-export-insight-write.ts) is the canonical contract.
-### Gate 1: `memory-search` BEFORE every `:MENTIONS` edge
+`:MENTIONS` and `:RELATED_TO` writes ROUTE THROUGH this tool. Other observation kinds (`:Preference`, `:Task`, `:DefinedTerm`, `:Insight`) keep using `memory-write` because their adjacency is not subject to wrong-Person ambiguity.
-For every `:MENTIONS` edge candidate, the skill turn MUST run `memory-search(query=<mentioned-name>, kind='person')` first. The result determines what happens:
+### Gate 1: candidate-overlap (re-run `memory-search` before every `:MENTIONS` write)
-- **0 hits** — the mentioned name doesn't exist in the graph. Skip silently. Do not auto-mint a `:Person` from a chat-mention alone.
-- **1 hit** — proceed if the match is unambiguous; surface to operator if ambiguous (see Gate 2).
-- **2+ hits** — ambiguous. Surface to operator before any edge writes.
+For every `:MENTIONS` edge candidate the agent runs `memory-search(query=<mentioned-name>, labels=['Person','AdminUser'])` first, then calls `whatsapp-export-insight-write` with the resulting candidate `nodeId`s in `candidateElementIds`. **The server re-runs the same search and asserts at least one of those IDs appears in the live result.** Mismatch → `gate-rejected reason=candidate-mismatch`.
-The agent never writes a `:MENTIONS` edge without the prior `memory-search`. This is a discipline gate: the inline classification is just `memory-search → memory-write`, never `memory-write` directly from raw classification.
+- **0 hits** — the mentioned name doesn't exist in the graph. Don't write — the server would reject with `candidate-mismatch` because there is nothing to overlap. Do not auto-mint a `:Person` from a chat-mention alone.
+- **1+ hits** — supply the `nodeId`(s) the agent expects to be the referent. Server confirms by re-running the search.
-### Gate 2: First-name-only matches surface to operator regardless of hit count
+### Gate 2: first-name-only rejection (independent of hit count)
-Single-first-name references in chat ("ask Sarah about Q3") have ambiguous referents even when `memory-search` returns one match — that one match might be the wrong Sarah. The rule:
+Single-token names ("Sarah") without an explicit disambiguator are rejected at the tool boundary regardless of memory-search hit count — that one match might be the wrong Sarah. The rule lives in code: if `name` lacks whitespace AND lacks digits AND `disambiguatorOk` is not `true`, the tool returns `gate-rejected reason=first-name-only` and writes nothing.
-- The mention has a **disambiguator** (full name "Sarah Chen", phone, email, role context "Sarah at Acme") → `memory-search` 1-hit → write the edge.
-- The mention is a **first-name only** without disambiguator → ALWAYS surface to operator confirmation, regardless of `memory-search` result count.
+- The mention has a **disambiguator** (full name "Sarah Chen", phone, email, role context "Sarah at Acme") → set `disambiguatorOk: true` in the tool call, gate passes if Gate 1 also passes.
+- The mention is a **first-name only** without disambiguator → omit `disambiguatorOk` (or set `false`); the tool refuses the write. The agent surfaces this to the operator as ambiguous and asks for confirmation before retrying with `disambiguatorOk: true`.
-Surface format:
+Surface format when the operator must disambiguate:
 ```
 [whatsapp-import] mention-ambiguous name="Sarah" reason=first-name-only candidates=1 awaiting-operator-resolution
@@ -44,12 +43,16 @@ Surface format:
 Followed by a chat prompt: `"Sarah" mentioned in message <messageId>. Found 1 :Person candidate: Sarah Chen (sarah@acme.com). Confirm? Yes / No / Pick another.`
-### Gate 3: `:RELATED_TO` between two existing distinct Persons
+### Gate 3: `:RELATED_TO` requires `operatorConfirmed: true`
-When the second pass infers a relationship between two `:Person` nodes who both already exist in the graph (e.g., chat says "Joel and Sarah are working on Q3 together" → `(joel)-[:RELATED_TO {kind:'collaborator'}]->(sarah)`), surface to operator confirmation before write. The operator sees the inferred edge with both endpoints' names + the supporting message excerpts; on yes, the edge writes with `evidenceMessageIds: [...]`.
+When the second pass infers a relationship between two `:Person` nodes who both already exist in the graph (e.g., chat says "Joel and Sarah are working on Q3 together" → `(joel)-[:RELATED_TO {kind:'collaborator'}]->(sarah)`), the agent surfaces the inferred edge with both endpoints' names + supporting message excerpts. On operator yes, the agent calls `whatsapp-export-insight-write(kind='RELATED_TO', operatorConfirmed: true, evidenceMessageIds: [...])`. **Without `operatorConfirmed=true` the tool returns `gate-rejected reason=relationship-needs-confirm` and writes nothing.**
 The default for this gate is conservative — when in doubt, surface. False-positive RELATED_TO edges are graph noise; false-negative skips can be re-run.
+### Endpoint label + accountId checks (free with the tool)
+`whatsapp-export-insight-write` also rejects writes whose endpoints are missing, cross-account, or wrong-labelled (a MENTIONS source must be a :Message; the target must be :Person/:AdminUser; RELATED_TO requires both endpoints to be :Person/:AdminUser). These are tool-level invariants — the agent does not need to re-check them in skill code.
 ## Chunking strategy
 For conversations with 100+ messages, chunk the input to the inline LLM turn at ~50 messages per chunk. The classifier processes each chunk independently; the skill aggregates observations across chunks before writing. Aggregation deduplicates (the same `:MENTIONS` edge would otherwise be proposed once per chunk that referenced the same person).

package/payload/platform/templates/specialists/agents/database-operator.md CHANGED Viewed

@@ -3,7 +3,7 @@ name: database-operator
 description: "Document and archive ingestion and ad-hoc graph operations — running the universal `document-ingest` skill for any unstructured document (PDF, text, transcript, web page, audio, video) and per-source archive-import skills (LinkedIn Basic Data Export today; CRM-type seed archives as each plugin ships), plus operator-driven graph hygiene (prune orphans, deduplicate entities, add edges, normalise labels). Delegate when the operator uploads any document, drops an archive directory into chat, or asks for any graph operation that is not a routine per-turn write."
 summary: "Ingests every unstructured document and external archive into your graph (LinkedIn today; other CRM sources in future) and handles ad-hoc graph tidy-ups on request. For example, when you upload a CV, a pricing guide, or a contract; when you drop a LinkedIn export folder into chat; or when you ask to prune orphan nodes, merge duplicate people, or add edges between entities."
 model: claude-sonnet-4-6
-tools: Read, Bash, Glob, Grep, mcp__graph__maxy-graph-read_neo4j_cypher, mcp__graph__maxy-graph-write_neo4j_cypher, mcp__graph__maxy-graph-get_neo4j_schema, mcp__memory__memory-write, mcp__memory__memory-update, mcp__memory__memory-delete, mcp__memory__memory-search, mcp__memory__memory-rank, mcp__memory__memory-reindex, mcp__memory__memory-find-candidates, mcp__memory__memory-ingest, mcp__memory__memory-ingest-extract, mcp__memory__memory-ingest-web, mcp__memory__memory-classify, mcp__memory__memory-archive-write, mcp__memory__graph-prune-denylist-list, mcp__memory__graph-prune-denylist-add, mcp__memory__graph-prune-denylist-remove, mcp__contacts__contact-create, mcp__contacts__contact-update, mcp__contacts__contact-lookup, mcp__contacts__contact-list, mcp__admin__file-attach, mcp__admin__plugin-read
+tools: Read, Bash, Glob, Grep, mcp__graph__maxy-graph-read_neo4j_cypher, mcp__graph__maxy-graph-write_neo4j_cypher, mcp__graph__maxy-graph-get_neo4j_schema, mcp__memory__memory-write, mcp__memory__memory-update, mcp__memory__memory-delete, mcp__memory__memory-search, mcp__memory__memory-rank, mcp__memory__memory-reindex, mcp__memory__memory-find-candidates, mcp__memory__memory-ingest, mcp__memory__memory-ingest-extract, mcp__memory__memory-ingest-web, mcp__memory__memory-classify, mcp__memory__memory-archive-write, mcp__memory__whatsapp-export-parse, mcp__memory__whatsapp-export-insight-write, mcp__memory__graph-prune-denylist-list, mcp__memory__graph-prune-denylist-add, mcp__memory__graph-prune-denylist-remove, mcp__contacts__contact-create, mcp__contacts__contact-update, mcp__contacts__contact-lookup, mcp__contacts__contact-list, mcp__admin__file-attach, mcp__admin__plugin-read
 ---
 # Database Operator