@rubytech/create-maxy 1.0.808 → 1.0.809
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/payload/platform/plugins/memory/mcp/dist/index.js +86 -0
- package/payload/platform/plugins/memory/mcp/dist/index.js.map +1 -1
- package/payload/platform/plugins/memory/mcp/dist/tools/whatsapp-export-insight-pass.d.ts +23 -0
- package/payload/platform/plugins/memory/mcp/dist/tools/whatsapp-export-insight-pass.d.ts.map +1 -0
- package/payload/platform/plugins/memory/mcp/dist/tools/whatsapp-export-insight-pass.js +401 -0
- package/payload/platform/plugins/memory/mcp/dist/tools/whatsapp-export-insight-pass.js.map +1 -0
- package/payload/platform/plugins/memory/mcp/dist/tools/whatsapp-export-preview.d.ts +28 -0
- package/payload/platform/plugins/memory/mcp/dist/tools/whatsapp-export-preview.d.ts.map +1 -0
- package/payload/platform/plugins/memory/mcp/dist/tools/whatsapp-export-preview.js +34 -0
- package/payload/platform/plugins/memory/mcp/dist/tools/whatsapp-export-preview.js.map +1 -0
- package/payload/platform/plugins/memory/references/schema-base.md +12 -0
- package/payload/platform/plugins/whatsapp/PLUGIN.md +3 -1
- package/payload/platform/plugins/whatsapp-import/bin/ingest.mjs +225 -346
- package/payload/platform/plugins/whatsapp-import/bin/whatsapp-ingest.sh +28 -10
- package/payload/platform/plugins/whatsapp-import/lib/dist/derive-keys.d.ts +21 -0
- package/payload/platform/plugins/whatsapp-import/lib/dist/derive-keys.d.ts.map +1 -0
- package/payload/platform/plugins/whatsapp-import/lib/dist/derive-keys.js +41 -0
- package/payload/platform/plugins/whatsapp-import/lib/dist/derive-keys.js.map +1 -0
- package/payload/platform/plugins/whatsapp-import/lib/dist/filter.d.ts +29 -0
- package/payload/platform/plugins/whatsapp-import/lib/dist/filter.d.ts.map +1 -0
- package/payload/platform/plugins/whatsapp-import/lib/dist/filter.js +123 -0
- package/payload/platform/plugins/whatsapp-import/lib/dist/filter.js.map +1 -0
- package/payload/platform/plugins/whatsapp-import/lib/dist/index.d.ts +4 -0
- package/payload/platform/plugins/whatsapp-import/lib/dist/index.d.ts.map +1 -1
- package/payload/platform/plugins/whatsapp-import/lib/dist/index.js +9 -1
- package/payload/platform/plugins/whatsapp-import/lib/dist/index.js.map +1 -1
- package/payload/platform/plugins/whatsapp-import/lib/src/__tests__/filter-gate.test.ts +170 -0
- package/payload/platform/plugins/whatsapp-import/lib/src/__tests__/ingest-idempotence.test.ts +141 -0
- package/payload/platform/plugins/whatsapp-import/lib/src/derive-keys.ts +59 -0
- package/payload/platform/plugins/whatsapp-import/lib/src/filter.ts +136 -0
- package/payload/platform/plugins/whatsapp-import/lib/src/index.ts +12 -0
- package/payload/platform/plugins/whatsapp-import/skills/whatsapp-import/SKILL.md +80 -25
- package/payload/platform/plugins/whatsapp-import/skills/whatsapp-import-enrich/SKILL.md +22 -3
- package/payload/platform/templates/specialists/agents/database-operator.md +9 -4
- package/payload/server/public/assets/admin-Bwrd2DBq.js +352 -0
- package/payload/server/public/index.html +1 -1
- package/payload/server/server.js +271 -188
- package/payload/server/public/assets/admin-MxaCgGHZ.js +0 -352
@@ -0,0 +1,141 @@
+import { describe, it, expect } from "vitest";
+import {
+  normaliseSenderName,
+  sha256Hex,
+  deriveMessageId,
+  observationContentHash,
+} from "../derive-keys.js";
+
+describe("normaliseSenderName", () => {
+  it("returns NFKC-trim-lower form", () => {
+    expect(normaliseSenderName(" Adam Mackay ")).toBe("adam mackay");
+  });
+
+  it("collapses NFKC equivalent forms (composed vs decomposed accents)", () => {
+    const composed = "Adám";
+    const decomposed = "Adám";
+    expect(normaliseSenderName(composed)).toBe(normaliseSenderName(decomposed));
+  });
+
+  it("collapses full-width characters to ASCII via NFKC", () => {
+    expect(normaliseSenderName("Adam")).toBe("adam");
+  });
+
+  it("returns empty string for empty input without throwing", () => {
+    expect(normaliseSenderName("")).toBe("");
+  });
+});
+
+describe("sha256Hex", () => {
+  it("matches the canonical sha256 of an empty string", () => {
+    expect(sha256Hex("")).toBe(
+      "e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855",
+    );
+  });
+
+  it("produces a deterministic hex digest", () => {
+    expect(sha256Hex("hello")).toBe(sha256Hex("hello"));
+    expect(sha256Hex("hello")).not.toBe(sha256Hex("world"));
+  });
+});
+
+describe("deriveMessageId", () => {
+  const baseInputs = {
+    conversationSha256: "abc123",
+    dateSent: "2026-03-14T10:15:23+00:00",
+    senderName: "Adam Mackay",
+    body: "Hello there",
+  };
+
+  it("produces a stable id for identical inputs", () => {
+    const id1 = deriveMessageId(baseInputs);
+    const id2 = deriveMessageId({ ...baseInputs });
+    expect(id1).toBe(id2);
+  });
+
+  it("collapses identical (sender, dateSent, body) tuples to one id under NFKC-trim-lower (correct for export duplicates)", () => {
+    const id1 = deriveMessageId(baseInputs);
+    const id2 = deriveMessageId({ ...baseInputs, senderName: " ADAM Mackay " });
+    expect(id1).toBe(id2);
+  });
+
+  it("changes when the body differs", () => {
+    const id1 = deriveMessageId(baseInputs);
+    const id2 = deriveMessageId({ ...baseInputs, body: "Hello there!" });
+    expect(id1).not.toBe(id2);
+  });
+
+  it("changes when the sender differs (after normalisation)", () => {
+    const id1 = deriveMessageId(baseInputs);
+    const id2 = deriveMessageId({ ...baseInputs, senderName: "Joel" });
+    expect(id1).not.toBe(id2);
+  });
+
+  it("changes when the dateSent differs", () => {
+    const id1 = deriveMessageId(baseInputs);
+    const id2 = deriveMessageId({
+      ...baseInputs,
+      dateSent: "2026-03-14T10:15:24+00:00",
+    });
+    expect(id1).not.toBe(id2);
+  });
+
+  it("changes when the conversation changes", () => {
+    const id1 = deriveMessageId(baseInputs);
+    const id2 = deriveMessageId({ ...baseInputs, conversationSha256: "def456" });
+    expect(id1).not.toBe(id2);
+  });
+
+  it("starts with the whatsapp-export:msg prefix and embeds normalised sender", () => {
+    const id = deriveMessageId(baseInputs);
+    expect(id.startsWith("whatsapp-export:msg:")).toBe(true);
+    expect(id).toContain(":adam mackay:");
+  });
+
+  it("does not embed array-position or FNV32 collapse (Task 870 contract)", () => {
+    const id = deriveMessageId(baseInputs);
+    expect(id).toContain(":msg:");
+    expect(id).toMatch(/:[a-f0-9]{64}$/);
+    expect(id).not.toMatch(/:\d+:[a-f0-9]{8}$/);
+  });
+
+  it("produces a stable id for empty body", () => {
+    const id = deriveMessageId({ ...baseInputs, body: "" });
+    expect(id).toBe(deriveMessageId({ ...baseInputs, body: "" }));
+  });
+});
+
+describe("observationContentHash", () => {
+  it("is deterministic for identical inputs", () => {
+    const fields = { summary: "Adam said hi", from: "Adam", to: "Joel", subject: null };
+    expect(observationContentHash(fields)).toBe(observationContentHash(fields));
+  });
+
+  it("normalises NFKC + trim + lowercase across all fields", () => {
+    const a = { summary: " Hello ", from: "ADAM", to: null, subject: null };
+    const b = { summary: "hello", from: "adam", to: null, subject: null };
+    expect(observationContentHash(a)).toBe(observationContentHash(b));
+  });
+
+  it("treats null and empty string equivalently", () => {
+    const withNull = { summary: "x", from: null, to: null, subject: null };
+    const withEmpty = { summary: "x", from: "", to: "", subject: "" };
+    expect(observationContentHash(withNull)).toBe(observationContentHash(withEmpty));
+  });
+
+  it("changes when any field changes", () => {
+    const base = { summary: "x", from: null, to: null, subject: null };
+    expect(observationContentHash(base)).not.toBe(
+      observationContentHash({ ...base, summary: "y" }),
+    );
+    expect(observationContentHash(base)).not.toBe(
+      observationContentHash({ ...base, from: "z" }),
+    );
+  });
+
+  it("collapses NFKC equivalent forms in summary", () => {
+    const composed = { summary: "Adám", from: null, to: null, subject: null };
+    const decomposed = { summary: "Adám", from: null, to: null, subject: null };
+    expect(observationContentHash(composed)).toBe(observationContentHash(decomposed));
+  });
+});
@@ -0,0 +1,59 @@
+import { createHash } from "node:crypto";
+
+// ---------------------------------------------------------------------------
+// derive-keys — natural-key derivation for whatsapp-import (Task 870).
+//
+// Pure functions. No I/O. The whole point is that re-imports of the same
+// archive collapse to the same Message identity regardless of release-level
+// drift in array indices, hash widths, or arbitrary tiebreakers.
+//
+// Key shape (Task 870 brief):
+//
+//   messageId = whatsapp-export:msg:<conversationSha256>:<dateSentISO>
+//                 :<NFKC-trim-lower(senderName)>
+//                 :<sha256-hex(body)>
+//
+// Operator constraint: the same archive must be re-imported with the same
+// `--timezone` flag. Different timezones reinterpret wall-clock instants and
+// will produce drifted messageIds — that is correct semantics, not a bug.
+// Documented in .docs/whatsapp.md natural-key contract section.
+// ---------------------------------------------------------------------------
+
+export function normaliseSenderName(name: string): string {
+  return name.normalize("NFKC").trim().toLowerCase();
+}
+
+export function sha256Hex(input: string): string {
+  return createHash("sha256").update(input).digest("hex");
+}
+
+export interface DeriveMessageIdInput {
+  /** SHA-256 of the source `_chat.txt` bytes — stable across re-imports. */
+  conversationSha256: string;
+  /** ISO 8601 with timezone offset, as emitted by parseExport. */
+  dateSent: string;
+  /** Raw senderName from the export line. Normalised internally. */
+  senderName: string;
+  /** Raw message body. Hashed internally. */
+  body: string;
+}
+
+export function deriveMessageId(input: DeriveMessageIdInput): string {
+  const norm = normaliseSenderName(input.senderName);
+  const bodyHash = sha256Hex(input.body);
+  return `whatsapp-export:msg:${input.conversationSha256}:${input.dateSent}:${norm}:${bodyHash}`;
+}
+
+export interface ObservationContentFields {
+  summary?: string | null;
+  from?: string | null;
+  to?: string | null;
+  subject?: string | null;
+}
+
+export function observationContentHash(fields: ObservationContentFields): string {
+  const parts = [fields.summary, fields.from, fields.to, fields.subject].map(
+    (p) => (p ?? "").normalize("NFKC").trim().toLowerCase(),
+  );
+  return sha256Hex(parts.join("|"));
+}
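The hunk above defines the natural-key contract. As an illustration only (not part of the published package), the sketch below shows what that contract buys: whitespace and case drift in `senderName` on a re-import still collapses to the same `messageId`, and `observationContentHash` treats `null` and `""` as equivalent. The input values are placeholders; the import path assumes a sibling module.

```ts
import { deriveMessageId, observationContentHash } from "./derive-keys.js";

// Placeholder inputs — a real conversationSha256 is the sha256 of the _chat.txt bytes.
const row = {
  conversationSha256: "abc123",
  dateSent: "2026-03-14T10:15:23+00:00",
  senderName: "Adam Mackay",
  body: "Hello there",
};

// The same row seen again on a re-import, with incidental whitespace/case drift.
const reimported = { ...row, senderName: "  ADAM Mackay " };

console.log(deriveMessageId(row) === deriveMessageId(reimported)); // true — a MERGE keyed on messageId is a no-op

// Observation fields share the NFKC-trim-lower normalisation, and null hashes like "".
console.log(
  observationContentHash({ summary: "x", from: null, to: null, subject: null }) ===
    observationContentHash({ summary: "x", from: "", to: "", subject: "" }),
); // true
```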
@@ -0,0 +1,136 @@
+// ---------------------------------------------------------------------------
+// filter — operator-supplied gate over ParsedLine[] (Task 871).
+//
+// Phase 1 ingest is now mandatory-filter: the deterministic Bash entry refuses
+// to write a bulk archive without `--filter`. Three forms cover the operator
+// patterns named in the brief:
+//
+//   --filter all                                 → no row drop
+//   --filter senders=Alice,Bob Carter            → keep rows whose
+//                                                  senderName matches
+//                                                  any csv entry exactly
+//   --filter date-range=2024-01-01..2024-06-30   → keep rows whose
+//                                                  dateSent ISO falls
+//                                                  inside the inclusive
+//                                                  range (date or full
+//                                                  ISO 8601)
+//
+// Doctrine alignment:
+//   - feedback_compress_at_ingest_for_bulk_archives.md — the gate is
+//     mandatory at write-time, not after.
+//   - feedback_deterministic_means_remove_llm.md — the filter parser is a
+//     pure function, no LLM in the per-row decision path.
+//   - feedback_loud_failures.md — malformed `--filter` raises a structured
+//     error with a named reason rather than silently coercing to `all`.
+// ---------------------------------------------------------------------------
+
+import type { ParsedLine } from "./parse-export.js";
+
+export type Filter =
+  | { kind: "all" }
+  | { kind: "senders"; senders: string[] }
+  | { kind: "date-range"; fromIso: string; toIso: string };
+
+/**
+ * Parse a CLI `--filter` argument into a structured Filter.
+ *
+ * Throws Error with message starting "filter: …" on malformed input. The
+ * caller (ingest.mjs / vitest) surfaces the reason verbatim — the brief
+ * mandates `[whatsapp-ingest] FAIL filter-required reason="…"` so the
+ * operator can grep one line.
+ */
+export function parseFilterArg(raw: string | undefined | null): Filter {
+  if (raw == null || raw.trim() === "") {
+    throw new Error(
+      'filter: --filter is required (one of "all", "senders=<csv>", "date-range=<isoFrom>..<isoTo>")',
+    );
+  }
+  const value = raw.trim();
+  if (value === "all") return { kind: "all" };
+  if (value.startsWith("senders=")) {
+    const csv = value.slice("senders=".length);
+    const senders = csv
+      .split(",")
+      .map((s) => s.trim())
+      .filter((s) => s.length > 0);
+    if (senders.length === 0) {
+      throw new Error('filter: senders= requires at least one comma-separated name');
+    }
+    return { kind: "senders", senders };
+  }
+  if (value.startsWith("date-range=")) {
+    const range = value.slice("date-range=".length);
+    const parts = range.split("..");
+    if (parts.length !== 2) {
+      throw new Error(
+        `filter: date-range must be "<isoFrom>..<isoTo>" — got "${range}"`,
+      );
+    }
+    const [fromIso, toIso] = parts.map((p) => p.trim());
+    if (!fromIso || !toIso) {
+      throw new Error(
+        `filter: date-range requires both endpoints — got "${range}"`,
+      );
+    }
+    if (Number.isNaN(Date.parse(fromIso))) {
+      throw new Error(`filter: date-range fromIso="${fromIso}" is not parseable as ISO 8601`);
+    }
+    if (Number.isNaN(Date.parse(toIso))) {
+      throw new Error(`filter: date-range toIso="${toIso}" is not parseable as ISO 8601`);
+    }
+    if (Date.parse(fromIso) > Date.parse(toIso)) {
+      throw new Error(`filter: date-range fromIso="${fromIso}" is later than toIso="${toIso}"`);
+    }
+    return { kind: "date-range", fromIso, toIso };
+  }
+  throw new Error(
+    `filter: unrecognised form "${value}" — must be "all", "senders=<csv>", or "date-range=<isoFrom>..<isoTo>"`,
+  );
+}
+
+/**
+ * Apply a parsed Filter to ParsedLine[]. Returns a new array of kept lines
+ * with the parser's original `sequenceIndex` preserved (the filter never
+ * reorders). ingest.mjs re-stamps `sequenceIndex` to its post-filter position
+ * during row construction for archive-write — re-stamping here too would be
+ * redundant.
+ */
+export function applyFilter(
+  parsedLines: readonly ParsedLine[],
+  filter: Filter,
+): ParsedLine[] {
+  const predicate = makePredicate(filter);
+  const kept: ParsedLine[] = [];
+  for (const line of parsedLines) {
+    if (!predicate(line)) continue;
+    kept.push(line);
+  }
+  return kept;
+}
+
+function makePredicate(filter: Filter): (line: ParsedLine) => boolean {
+  if (filter.kind === "all") return () => true;
+  if (filter.kind === "senders") {
+    const allow = new Set(filter.senders);
+    return (line) => allow.has(line.senderName);
+  }
+  // date-range: inclusive on both ends. Date-only endpoints widen to whole-
+  // day semantics: `from=YYYY-MM-DD` → `T00:00:00Z`, `to=YYYY-MM-DD` →
+  // `T23:59:59.999Z`. Full ISO 8601 endpoints with `T` are passed through.
+  // Without this widening, `--filter date-range=2024-01-01..2024-06-30`
+  // would silently drop every message later than 2024-06-30T00:00:00Z on the
+  // last day — a UX trap that contradicts the operator's reading.
+  const fromMs = parseRangeEndpoint(filter.fromIso, "start");
+  const toMs = parseRangeEndpoint(filter.toIso, "end");
+  return (line) => {
+    const ms = Date.parse(line.dateSent);
+    return ms >= fromMs && ms <= toMs;
+  };
+}
+
+function parseRangeEndpoint(iso: string, edge: "start" | "end"): number {
+  if (/T/.test(iso)) return Date.parse(iso);
+  // Date-only — widen to whole-day inclusive on the requested edge.
+  const suffix = edge === "start" ? "T00:00:00.000Z" : "T23:59:59.999Z";
+  return Date.parse(iso + suffix);
+}
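Illustrative only (not part of the package): a sketch of how the two exported functions compose, assuming a minimal `ParsedLine` shape — `filter.ts` only reads `senderName` and `dateSent`, but the real type from `parse-export.js` (not shown in this diff) carries more fields. It also shows the loud failure on a malformed `--filter` value and the whole-day widening of a date-only endpoint.

```ts
import { parseFilterArg, applyFilter } from "./filter.js";
import type { ParsedLine } from "./parse-export.js";

// Assumed minimal rows for illustration; cast because the full ParsedLine shape is not in this diff.
const lines = [
  { senderName: "Adam Mackay", dateSent: "2024-06-30T18:05:00+01:00", body: "late on the last day" },
  { senderName: "Joel", dateSent: "2024-07-01T09:00:00+01:00", body: "outside the range" },
] as unknown as ParsedLine[];

// Date-only endpoints widen to whole days, so 2024-06-30T18:05 local time is kept.
const filter = parseFilterArg("date-range=2024-01-01..2024-06-30");
console.log(applyFilter(lines, filter).length); // 1

// Malformed input fails loudly with a named reason instead of coercing to "all".
try {
  parseFilterArg("senders=");
} catch (err) {
  console.log((err as Error).message); // filter: senders= requires at least one comma-separated name
}
```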
@@ -5,3 +5,15 @@ export type {
   ParseExportCounters,
   ParsedLine,
 } from "./parse-export.js";
+export { parseFilterArg, applyFilter } from "./filter.js";
+export type { Filter } from "./filter.js";
+export {
+  normaliseSenderName,
+  sha256Hex,
+  deriveMessageId,
+  observationContentHash,
+} from "./derive-keys.js";
+export type {
+  DeriveMessageIdInput,
+  ObservationContentFields,
+} from "./derive-keys.js";
@@ -1,15 +1,15 @@
 ---
 name: whatsapp-import
-description:
+description: Phase 1 of the WhatsApp `_chat.txt` ingest contract — deterministic, LLM-free. Preview the archive (parsed counts, date range, sender histogram), ask the operator to choose a filter (`all`, `senders=<csv>`, `date-range=<isoFrom>..<isoTo>`), then write Conversation + Messages + NEXT chain + auto-Person participants via the single Bash entry `whatsapp-ingest.sh`. NO observations and NO LLM at this phase — semantic enrichment lives in the `whatsapp-import-enrich` skill (Phase 2). Triggers when the user asks to import a WhatsApp chat, ingest a `_chat.txt` file, or drops the contents of an "Export Chat" folder into chat. Distinct from the live `whatsapp` plugin (Baileys); this is import-from-export only.
 ---

-# WhatsApp Import
+# WhatsApp Import — Phase 1 (Load)

-
+Phase 1 of the two-phase WhatsApp ingest contract. Deterministic only: parse → preview → operator-supplied filter → archive-write. NO LLM is invoked at this phase. The chunked Haiku insight pass moved to Phase 2 (`whatsapp-import-enrich` skill) so one ingest cannot blow the operator's context window with `:Observation` enumeration prose.

 ## Owner confirmation (mandatory first step)

-A WhatsApp export belongs to exactly one operator (the person whose phone produced the export). The owner is metadata stamped on the `:Conversation` node — the row-level participants are auto-created by the script and promoted in
+A WhatsApp export belongs to exactly one operator (the person whose phone produced the export). The owner is metadata stamped on the `:Conversation` node — the row-level participants are auto-created by the script and promoted in Phase 2.

 1. List every `:AdminUser` in the graph via `mcp__graph__maxy-graph-read_neo4j_cypher`:
    `MATCH (u:AdminUser) RETURN elementId(u) AS elementId, u.name AS name, u.userId AS userId, u.accountId AS accountId`
@@ -17,58 +17,113 @@ A WhatsApp export belongs to exactly one operator (the person whose phone produc
 3. Echo the chosen owner back verbatim. Require explicit yes/no confirmation.
 4. Persist the resolved owner's `elementId` for the script invocation as `--owner-element-id`.

-##
+## Step 1 — preview (mandatory before any write)
+
+Call `mcp__memory__whatsapp-export-preview` with the operator-supplied path:
+
+```json
+{
+  "filePath": "/abs/path/to/_chat.txt",
+  "timezone": "Europe/London"
+}
+```
+
+Returns: `{conversationSha256, archiveSourceFile, archiveBytes, parsed, mediaSkipped, systemSkipped, totalMessages, dateRange:{first,last}, senders:[{name,messageCount}, …]}`. No Cypher writes; the call is read-only and does NOT touch Neo4j.
+
+Surface to the operator as one chat message — counters and the histogram, no prose:
+
+> Preview of `<archive>`: `<parsed>` messages parsed, `<mediaSkipped>` media skipped, `<systemSkipped>` system skipped. Date range: `<first>` → `<last>`. Senders (top by count): `Joel (812), Adam (895)`. File hash: `<conversationSha256>` (`<archiveBytes>` bytes).
+
+## Step 2 — operator chooses a filter
+
+Ask exactly: "Filter to apply: `all`, `senders=<csv>`, or `date-range=<isoFrom>..<isoTo>`?" — no defaults, no menu of "or shall I just write everything". The operator picks one of the three forms verbatim:
+
+| Filter | Effect |
+|--------|--------|
+| `all` | Write every parsed row. Operator's explicit "I want the full archive" choice. |
+| `senders=Alice,Bob Carter` | Keep only rows whose senderName matches one of the comma-separated names exactly (whitespace trimmed). |
+| `date-range=2024-01-01..2024-06-30` | Keep only rows whose `dateSent` falls inside the inclusive range (date-only or full ISO 8601 endpoints both accepted). |
+
+Echo the chosen filter back; require explicit yes/no confirmation before the write.
+
+## Step 3 — archive-write

 Single Bash call:

 ```bash
 bash platform/plugins/whatsapp-import/bin/whatsapp-ingest.sh <archive.zip|dir|_chat.txt> \
   --owner-element-id <id> \
-  --scope <admin|public>
+  --scope <admin|public> \
+  --filter <all|senders=<csv>|date-range=<isoFrom>..<isoTo>>
 ```

 Optional flags:
 - `--account-id <id>` — explicit account id when more than one exists under `data/accounts/` (Phase 0 has one).
 - `--timezone <iana>` — IANA zone for timestamps (default `Europe/London`).
 - `--date-format <DD/MM/YY|MM/DD/YY|DD/MM/YYYY|MM/DD/YYYY>` — override auto-detect for ambiguous locales.
-- `--no-insight` — skip the Haiku insight pass (parse + archive-write only).

 The script:
 - Unzips the archive if needed; locates `_chat.txt`.
 - Parses the file deterministically (year shape, sender/body grammar, timezone offset).
--
+- Applies the operator-supplied filter to `parseResult.parsedLines` BEFORE archive-write.
+- Auto-creates one `:Person {participantStatus:'auto-created'}` per distinct senderName in the filtered set, scoped to the account, MERGEd on `(accountId, source, name)`.
 - Writes the Conversation + Messages + edges + NEXT chronology via `memoryArchiveWrite` directly (no MCP envelope between steps).
-- Runs the insight pass: chunked Haiku calls (1500 messages per chunk), in-process Cypher writes of `:Observation` nodes connected `:OBSERVED_IN`→`:Conversation`. Insight failures log and proceed; archive-write success is what determines exit code.

-
+NO insight pass runs. The `--no-insight` flag of older releases is gone — Phase 1 always means parse + filter + archive-write, nothing else.
+
+## Phase 1 agent-return — counters only
+
+Stdout JSON shape (success — full diagnostic counters per Task 871 success criterion 5):
+
+```json
+{
+  "conversationElementId": "4:abcd…:42",
+  "conversationId": "whatsapp-export:<sha>:<accountId>",
+  "parsed": 1707,
+  "mediaSkipped": 0,
+  "systemSkipped": 0,
+  "filtered": 1707,
+  "written": 1707,
+  "messagesAlreadyExisted": 0,
+  "nextEdgesProcessed": 1706,
+  "nextEdgesCreated": 1706,
+  "participantsAlreadyExisted": 0,
+  "ms": 6800
+}
+```
+
+Surface to the admin agent as exactly one message (the agent-return shape per Task 871 success criterion 6 — counters first, one sentence pointing at the Phase 2 surface):
+
+> Imported `<written>` messages from `<archive>` into conversation `<conversationElementId>` (`<conversationId>`); already existed: `<messagesAlreadyExisted>`; NEXT edges created: `<nextEdgesCreated>`. Use `mcp__memory__whatsapp-export-preview` for any future re-import preview; trigger semantic enrichment via the `whatsapp-import-enrich` skill ("enrich the `<chat-name>` chat") when ready.
+
+NO inline enumeration of mention/task/preference/relationship counts. NO multi-paragraph "ask to enrich" prose. The above shape is load-bearing — the brief's `feedback_concision_over_completeness.md` and the Task-871 root incident (one ingest blew the operator's context with the count enumeration) require this discipline.
+
+### Re-import signal
+
+A second invocation against the same archive should report `messagesAlreadyExisted > 0 AND written > 0` (after Task 870's stable-messageId contract lands; pre-Task 870 the messageId is unstable and re-imports double). The subagent asserts both counters appear non-trivially before claiming a re-import landed cleanly.
+
+## Failure path — single FAIL line

-- **Exit
-```json
-{"conversationId": "whatsapp-export:<sha>:<accountId>",
-"parsed": 1707, "mediaSkipped": 0, "systemSkipped": 0,
-"createdMessages": 1707,
-"insightCounters": {"chunks": 2, "mentions": 12, "tasks": 3, "preferences": 1, "observedRelationships": 0},
-"ms": 67000}
-```
-Surface this to the operator as one chat message: `Imported 1707 messages from <archive> into conversation <conversationId>; insights: 12 mentions, 3 tasks, 1 preference, 0 relationships.`
+- **Exit non-zero** + one stderr line: `[whatsapp-ingest] FAIL phase=<argv|filter|parse|archive-write|import|uncaught> reason="<sanitised first 80c>" ...`. Surface this verbatim to the operator and yield. **Do not retry. Do not edit parser source.** The archive-ingest-surface-gate denies parser-source edits, JS test runners, and the legacy `whatsapp-export-parse` / `whatsapp-export-insight-write` / `memory-archive-write{archiveType:whatsapp-export}` MCP tools — none of those are escape hatches in your surface.

-
+Missing `--filter` emits the pinned line `[whatsapp-ingest] FAIL filter-required reason="bulk-archive-gate (Task 871) — operator must specify --filter (one of all, senders=<csv>, date-range=<isoFrom>..<isoTo>)"`. Re-invoke with the operator's chosen filter — never fabricate a default.

 ## Idempotency

-Re-running
+Re-running with the same `<archive>` + `--filter` is a no-op once Task 870's stable-messageId contract lands: `written: 0`, `nextEdgesCreated: 0`, conversation scalars refreshed via `lastImportedAt` / `lastImportedBySession`. Re-exports with appended messages add only the delta and extend the NEXT chain. Pre-Task-870 the messageId is unstable (`hashLine` is a 32-bit FNV collapse; `sequenceIndex` is array-position) so re-imports double the message set — Task 870 is the natural-key fix that makes the contract real.

 ## Verification (post-write)

 Run via `mcp__graph__maxy-graph-read_neo4j_cypher`:

 - `MATCH (c:Conversation:WhatsAppConversation {conversationId: $cid}) RETURN c.messageCount, c.participantCount, c.firstMessageAt, c.lastMessageAt` — agrees with the JSON summary.
-- `MATCH (m:Message)-[:PART_OF]->(c {conversationId: $cid}) RETURN count(m)` — equals `
-- `MATCH p=(:Message {conversationId: $cid})-[:NEXT*]->() WITH max(length(p)) AS chain RETURN chain` — equals `
-- `MATCH (o:Observation
+- `MATCH (m:Message)-[:PART_OF]->(c {conversationId: $cid}) RETURN count(m)` — equals `written + messagesAlreadyExisted` (post-filter).
+- `MATCH p=(:Message {conversationId: $cid})-[:NEXT*]->() WITH max(length(p)) AS chain RETURN chain` — equals `messageCount - 1`.
+- Phase 1 wrote ZERO observations: `MATCH (o:Observation)-[:OBSERVED_IN]->(:Conversation {conversationId: $cid}) RETURN count(o)` — should be 0 immediately after Phase 1. Observations land only when the operator triggers Phase 2.

 ## What this is not

 - **Not** the live `whatsapp` plugin. That plugin (Baileys QR pairing) holds messages in an in-memory store cleared on restart. This plugin imports historical exports into Neo4j as persistent graph nodes.
 - **Not** a media-transcription pipeline. Voice notes, photos, PDFs are skipped at parse with a counter logged.
-- **Not** the operator-level semantic enrichment pass.
+- **Not** the operator-level semantic enrichment pass. Auto-created participants and `:Observation` nodes are deliberately raw — Phase 2 (`whatsapp-import-enrich`) lays down the observations via `whatsapp-export-insight-pass` and walks them through operator-confirmed wiring.
+- **Not** an LLM entry. Phase 1 has no Haiku call, no OAuth call, no model surface. The single sanctioned LLM entry for WhatsApp ingest is `mcp__memory__whatsapp-export-insight-pass`, invoked by the Phase 2 skill.
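Illustrative only (not shipped in the package): a sketch of how a caller could exercise the re-import and idempotency contract described in the hunk above by running the Bash entry twice and comparing stdout counters. The wrapper, archive path, and owner id are hypothetical; the flags and counter names follow this SKILL.md, and it assumes stdout carries only the JSON summary.

```ts
import { execFileSync } from "node:child_process";

// Hypothetical wrapper — runs Phase 1 and parses the stdout counters documented above.
function runIngest(archive: string, filter: string): Record<string, unknown> {
  const stdout = execFileSync(
    "bash",
    [
      "platform/plugins/whatsapp-import/bin/whatsapp-ingest.sh",
      archive,
      "--owner-element-id", "<owner-element-id>", // placeholder
      "--scope", "admin",
      "--filter", filter,
    ],
    { encoding: "utf8" },
  );
  return JSON.parse(stdout);
}

const first = runIngest("/tmp/chat-export.zip", "all");
const second = runIngest("/tmp/chat-export.zip", "all");

// Per the Idempotency section, an unchanged archive re-imported with the same filter
// should refresh conversation scalars only; compare the counters the script reports.
console.log({ firstWritten: first.written, secondWritten: second.written, secondAlreadyExisted: second.messagesAlreadyExisted });
```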
@@ -1,16 +1,34 @@
 ---
 name: whatsapp-import-enrich
-description: Operator-driven semantic enrichment pass over an already-loaded WhatsApp Conversation.
+description: Operator-driven semantic enrichment pass over an already-loaded WhatsApp Conversation. Owns the LLM half of the WhatsApp ingest pipeline (Task 871) — first runs `mcp__memory__whatsapp-export-insight-pass` (chunkSize=50, overlap=5, server-side confidence>=0.8 gate) to lay down `:Observation {observationStatus:'auto-extracted'}` rows, then walks `:Person {participantStatus:'auto-created'}` and the auto-extracted observations, surfaces evidence per row, and writes operator-confirmed wiring (participant promotion/merge, `:MENTIONS` / `:RELATED_TO` edges, `:Task` and `:Preference` nodes). Triggers on operator phrases like "enrich the X chat", "promote the auto-created participants from Y", "wire the observations from yesterday's import". Runs against a Conversation already imported by `whatsapp-import` (Task 855 + Task 871 Phase 1); never re-runs parse.
 ---

 # WhatsApp Import — Enrich

-Phase 2 of the two-phase WhatsApp ingest contract. Phase 1 (`whatsapp-import`) is the deterministic Bash entry that lands raw shape: Conversation + Messages + chronological NEXT chain + auto-created `:Person` participants
+Phase 2 of the two-phase WhatsApp ingest contract. Phase 1 (`whatsapp-import`) is the deterministic, LLM-FREE Bash entry that lands raw shape: Conversation + Messages + chronological NEXT chain + auto-created `:Person` participants. Phase 2 (this skill) owns the LLM half: it runs the chunked Haiku insight pass on demand to lay down `:Observation` nodes, then operator-driven semantic resolution disambiguates participants, wires observations to typed entities, and reattributes the operator's own messages from the auto-Person to their `:AdminUser`.
+
+The split was the Task 871 collapse: Phase 1 used to run the insight pass inline (1500 msgs/chunk, no operator gate), which polluted the parent's tool_result with `:Observation` enumeration prose and blew operator context (`max-turns-retry-budget-exhausted`, log `0d5442b4`). Phase 1 is now mute on insights; this skill triggers them consciously with `mcp__memory__whatsapp-export-insight-pass` when the operator asks.

 ## When this applies

 The operator triggers this skill against a single, already-loaded `:Conversation:WhatsAppConversation`. Acceptable phrases include any reference to enriching, promoting participants from, or wiring observations against a conversation the operator can name (display name, recent timestamp, conversationId). When the conversation reference is ambiguous, list the recent WhatsApp conversations and require operator selection before any walk begins. Never run against a conversation whose `whatsapp-import` Phase 1 has not completed (`MATCH (c:WhatsAppConversation {conversationId:$cid}) WHERE c.lastImportedAt IS NULL` is a blocker — surface "Phase 1 has not completed for <cid>; run whatsapp-import first" and yield).

+## Step 0 — run the chunked Haiku insight pass (Phase 2a)
+
+Phase 1 writes ZERO `:Observation` rows. Before any walk, lay them down via `mcp__memory__whatsapp-export-insight-pass`:
+
+```json
+{ "conversationId": "whatsapp-export:<sha>:<accountId>" }
+```
+
+The tool walks the Messages of the conversation in chronological order, chunks them at **chunkSize=50** with **overlap=5** (vs the Task 855 implementation's 1500 msgs/chunk that lost per-message attention), runs Haiku per chunk, applies a server-side `confidence>=0.8` gate, and MERGE-keys `:Observation` rows. Returns `{conversationId, chunks, chunkSize, overlap, confidenceThreshold, totals:{mentions, tasks, preferences, observedRelationships, rejectedLowConfidence, written}, ms}`.
+
+Surface to the operator as one chat message — counters only, no enumeration:
+
+> Insight pass complete on `<conversationId>`: `<chunks>` chunks at chunkSize=50 / overlap=5 / confidenceThreshold=0.8. Wrote `<written>` observations (`<mentions>` mentions, `<tasks>` tasks, `<preferences>` preferences, `<observedRelationships>` relationships); rejected `<rejectedLowConfidence>` low-confidence items.
+
+Idempotent — re-running collapses identical `(conversationId, sourceMessageRef, kind, contentHash)` tuples into one row (Task 870 contract). Re-runs are safe; the operator can tune the conversation by re-importing extra rows in Phase 1, then re-running the pass here.
+
 ## Bulk preview (mandatory, before any walk)

 Before walking a single row, count the work and offer a yield. Two read-only Cyphers via `mcp__graph__maxy-graph-read_neo4j_cypher`:
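Illustrative only (not the plugin's implementation): a sketch of the chunkSize=50 / overlap=5 windowing the tool describes — consecutive chunks advance by chunkSize minus overlap (45) messages, so each chunk shares its last five messages with the next.

```ts
// Illustration of the windowing arithmetic only; the real chunking lives inside
// the whatsapp-export-insight-pass MCP tool and is not shown in this diff.
function chunkWithOverlap<T>(items: T[], chunkSize = 50, overlap = 5): T[][] {
  const step = chunkSize - overlap; // 45 — how far each successive window advances
  const chunks: T[][] = [];
  for (let start = 0; start < items.length; start += step) {
    chunks.push(items.slice(start, start + chunkSize));
    if (start + chunkSize >= items.length) break; // last window already reaches the end
  }
  return chunks;
}

const messages = Array.from({ length: 1707 }, (_, i) => `msg-${i}`);
console.log(chunkWithOverlap(messages).length); // 38 windows for 1707 messages at step 45
```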
@@ -297,6 +315,7 @@ Every line emitted to chat is mirrored into the per-conversation agent-stream lo

 Every prescribed tool resolves on database-operator's frontmatter `tools:` list. The pre-publish gate `platform/scripts/verify-skill-tool-surface.sh` asserts this statically:

+- `mcp__memory__whatsapp-export-insight-pass` — Phase 2a chunked-Haiku insight extraction (chunkSize=50, overlap=5, confidence>=0.8). Lays down `:Observation` rows the rest of this skill walks. Owns the LLM half of WhatsApp ingest — Phase 1 has none.
 - `mcp__graph__maxy-graph-read_neo4j_cypher` — bulk preview, evidence reads, messageId recovery, owner-reconciliation lookup.
 - `mcp__graph__maxy-graph-write_neo4j_cypher` — `apoc.refactor.mergeNodes`, `:MENTIONS` and `:RELATED_TO` MERGEs, status-update SETs.
 - `mcp__memory__memory-search` — entity disambiguation for mentions and observed-relationship endpoints.
@@ -308,7 +327,7 @@ Raw Cypher and `cypher-shell` are forbidden in this skill (per [database-operato

 ## What this is not

-- **Not** Phase 1. Parse
+- **Not** Phase 1. Parse and archive-write live in `whatsapp-import` (the deterministic Bash entry, LLM-FREE). This skill never re-parses. The Haiku insight pass moved here in Task 871 — Step 0 above is the one sanctioned LLM entry for WhatsApp ingest, and it is invoked consciously by the operator, not silently on archive-write.
 - **Not** automatic. Every transition out of `auto-created` / `auto-extracted` requires an operator action — no auto-promotion, no auto-mention-acceptance, no batch confirmation. Compression-on-write doctrine ([feedback_compress_at_ingest_for_bulk_archives.md](../../../../../.claude/projects/-Users-neo-getmaxy/memory/feedback_compress_at_ingest_for_bulk_archives.md)) requires per-row operator judgement.
 - **Not** cross-conversation. The walk is scoped to one Conversation. Cross-conversation participant deduplication (the same person under two conversations) is operator-driven graph hygiene via [database-operator.md §Dedup merges](../../../../templates/specialists/agents/database-operator.md#dedup-merges), not this skill.
 - **Not** a backfill tool. Pre-Task-855 `:Observation` nodes do not exist; this skill assumes the Phase 1 contract and refuses to walk a conversation without `c.lastImportedAt`.
@@ -3,7 +3,7 @@ name: database-operator
 description: "Document and archive ingestion and ad-hoc graph operations — running the universal `document-ingest` skill for any unstructured document (PDF, text, transcript, web page, audio, video) and per-source archive-import skills (LinkedIn Basic Data Export today; CRM-type seed archives as each plugin ships), plus operator-driven graph hygiene (prune orphans, deduplicate entities, add edges, normalise labels). Delegate when the operator uploads any document, drops an archive directory into chat, or asks for any graph operation that is not a routine per-turn write."
 summary: "Ingests every unstructured document and external archive into your graph (LinkedIn today; other CRM sources in future) and handles ad-hoc graph tidy-ups on request. For example, when you upload a CV, a pricing guide, or a contract; when you drop a LinkedIn export folder into chat; or when you ask to prune orphan nodes, merge duplicate people, or add edges between entities."
 model: claude-sonnet-4-6
-tools: Read, Bash, Glob, Grep, mcp__graph__maxy-graph-read_neo4j_cypher, mcp__graph__maxy-graph-write_neo4j_cypher, mcp__graph__maxy-graph-get_neo4j_schema, mcp__memory__memory-write, mcp__memory__memory-update, mcp__memory__memory-delete, mcp__memory__memory-search, mcp__memory__memory-rank, mcp__memory__memory-reindex, mcp__memory__memory-find-candidates, mcp__memory__memory-ingest, mcp__memory__memory-ingest-extract, mcp__memory__memory-ingest-web, mcp__memory__memory-classify, mcp__memory__memory-archive-write, mcp__memory__graph-prune-denylist-list, mcp__memory__graph-prune-denylist-add, mcp__memory__graph-prune-denylist-remove, mcp__contacts__contact-create, mcp__contacts__contact-update, mcp__contacts__contact-lookup, mcp__contacts__contact-list, mcp__tasks__task-create, mcp__admin__file-attach, mcp__admin__plugin-read
+tools: Read, Bash, Glob, Grep, mcp__graph__maxy-graph-read_neo4j_cypher, mcp__graph__maxy-graph-write_neo4j_cypher, mcp__graph__maxy-graph-get_neo4j_schema, mcp__memory__memory-write, mcp__memory__memory-update, mcp__memory__memory-delete, mcp__memory__memory-search, mcp__memory__memory-rank, mcp__memory__memory-reindex, mcp__memory__memory-find-candidates, mcp__memory__memory-ingest, mcp__memory__memory-ingest-extract, mcp__memory__memory-ingest-web, mcp__memory__memory-classify, mcp__memory__memory-archive-write, mcp__memory__whatsapp-export-preview, mcp__memory__whatsapp-export-insight-pass, mcp__memory__graph-prune-denylist-list, mcp__memory__graph-prune-denylist-add, mcp__memory__graph-prune-denylist-remove, mcp__contacts__contact-create, mcp__contacts__contact-update, mcp__contacts__contact-lookup, mcp__contacts__contact-list, mcp__tasks__task-create, mcp__admin__file-attach, mcp__admin__plugin-read
 ---

 # Database Operator
@@ -119,9 +119,14 @@ The classifier maps document sections to typed ontology labels. It does not inve
 Per-source archive imports keep their own skill because their CSVs already encode entity types deterministically and need no LLM classifier. Currently shipped:

 - **linkedin-import** — LinkedIn Basic Data Export. Ships with references for `Profile.csv` and `Connections.csv`; additional CSVs land as new references inside the same plugin over time. Path: `platform/plugins/linkedin-import/skills/linkedin-import/SKILL.md`. Load via `plugin-read` before any ingestion.
-- **whatsapp-import** — WhatsApp `_chat.txt` export ingestion. **Two-phase contract** (Task 855 + Task
-  - **Phase 1 —
-
+- **whatsapp-import** — WhatsApp `_chat.txt` export ingestion. **Two-phase contract** (Task 855 + Task 871 — Phase 1 deterministic, Phase 2 operator-triggered):
+  - **Phase 1 — preview-then-filtered-write** (`whatsapp-import` skill). Phase 1 is LLM-FREE. Three steps:
+    1. **Preview** via `mcp__memory__whatsapp-export-preview` — read-only parse that returns `{conversationSha256, parsed, mediaSkipped, systemSkipped, dateRange:{first,last}, senders:[{name,messageCount}], totalMessages, archiveBytes}`. No Cypher writes.
+    2. **Operator chooses a filter.** Surface the preview to the operator and ask: "Filter to apply: `all`, `senders=<csv>`, or `date-range=<isoFrom>..<isoTo>`?". `--filter` is mandatory — the deterministic Bash entry refuses to write without it (`feedback_compress_at_ingest_for_bulk_archives.md`).
+    3. **Archive-write** via `bash platform/plugins/whatsapp-import/bin/whatsapp-ingest.sh <archive> --owner-element-id <id> --scope <admin|public> --filter <chosen>`. Parses, applies the filter, writes Conversation + Messages with chronological NEXT chain, auto-creates one `:Person {participantStatus:'auto-created'}` per distinct senderName. ZERO `:Observation` writes — the LLM insight pass moved to Phase 2.
+
+    Phase 1 agent-return is COUNTERS ONLY — no inline enumeration of mention/task/preference counts, no multi-paragraph "ask to enrich" prose. Surface as one chat message: the JSON shape `{conversationElementId, conversationId, parsed, written, alreadyExisted, nextEdgesCreated, ms}` plus one sentence: "Preview before any future re-import via `mcp__memory__whatsapp-export-preview`; enrich semantically when ready via the `whatsapp-import-enrich` skill." The legacy `mcp__memory__whatsapp-export-parse` / `whatsapp-export-insight-write` / `memory-archive-write{archiveType:whatsapp-export}` MCP tools remain blocked at the harness; the Bash script is the only supported archive-write invocation. SKILL: `platform/plugins/whatsapp-import/skills/whatsapp-import/SKILL.md`.
+  - **Phase 2 — enrich** (`whatsapp-import-enrich` skill). Operator-triggered ("enrich the X chat"). First runs `mcp__memory__whatsapp-export-insight-pass` against the already-loaded Conversation (chunkSize=50, overlap=5, server-side `confidence>=0.8` gate) to lay down `:Observation {observationStatus:'auto-extracted'}` rows. Then walks the auto-created participants and auto-extracted observations, surfacing evidence per row, and writes operator-confirmed wiring (`apoc.refactor.mergeNodes` for participant promotion/merge, `:MENTIONS` and `:RELATED_TO` edges with `evidenceSnippet`/`evidenceMessageIds`, `:Task` via `mcp__tasks__task-create`, `:Preference` via `memory-write`). Idempotent — re-running surfaces only items still in `auto-created`/`auto-extracted` state. SKILL: `platform/plugins/whatsapp-import/skills/whatsapp-import-enrich/SKILL.md`.
   - Distinct from the live `whatsapp` plugin (Baileys QR pairing, in-memory store). Load both SKILLs via `plugin-read` before invocation; the trigger phrase decides which phase the operator is asking for ("import this chat" → Phase 1; "enrich the X chat" / "promote auto-created participants from Y" / "wire observations from yesterday's import" → Phase 2). Phase 2 refuses to run against a Conversation whose `c.lastImportedAt` is null (Phase 1 never completed).

 Future CRM-type seed plugins (HubSpot, Salesforce, Pipedrive, iCloud contacts, Gmail CSV, etc.) will ship under the same pattern — each as its own opt-in plugin, each with its own `SKILL.md` path under `platform/plugins/<name>/skills/`. When the admin adds a new archive-import skill, its PLUGIN.md will name itself here and in the admin's `<plugin-manifest>`. No prompt change required.