npm - mojulo - Versions diffs - 0.0.0 → 0.1.1 - Mend

mojulo 0.0.0 → 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (121) hide show

package/README.md +54 -4
package/lib/audit-logger-new.js +11 -0
package/lib/auth/gate.js +25 -0
package/lib/auth/service.js +17 -0
package/lib/auth/session.js +63 -0
package/lib/builder/chat-processor.js +607 -0
package/lib/builder/composer-bridge.js +82 -0
package/lib/builder/evaluator.js +159 -0
package/lib/builder/executor.js +252 -0
package/lib/builder/index.js +48 -0
package/lib/builder/session.js +248 -0
package/lib/builder/system-prompt.js +422 -0
package/lib/builder/tone-presets.js +75 -0
package/lib/builder/tool-executors.js +1527 -0
package/lib/builder/tools.js +338 -0
package/lib/builder/validators.js +239 -0
package/lib/composer/composer.js +225 -0
package/lib/composer/index.js +40 -0
package/lib/composer/protocols/00_base.txt +19 -0
package/lib/composer/protocols/01_knowledge.txt +9 -0
package/lib/composer/protocols/02_form-gathering.txt +32 -0
package/lib/composer/protocols/03_appointments.txt +16 -0
package/lib/composer/protocols/04_triage.txt +15 -0
package/lib/composer/protocols/05_optical-read.txt +22 -0
package/lib/composer/response-builder.js +98 -0
package/lib/config-builder.js +650 -0
package/lib/db/ids.js +10 -0
package/lib/db/index.js +179 -0
package/lib/db/repositories/apiKeys.js +72 -0
package/lib/db/repositories/auditLogs.js +12 -0
package/lib/db/repositories/botSpaces.js +12 -0
package/lib/db/repositories/builderSessions.js +312 -0
package/lib/db/repositories/deploymentEvents.js +12 -0
package/lib/db/repositories/deployments.js +385 -0
package/lib/db/repositories/documents.js +68 -0
package/lib/db/repositories/mcpJobs.js +84 -0
package/lib/deployers/bot-fleet.js +110 -0
package/lib/deployers/bot-proxy.js +72 -0
package/lib/deployers/build.js +89 -0
package/lib/deployers/cloud-deploy.js +310 -0
package/lib/deployers/docker.js +439 -0
package/lib/deployers/fly.js +432 -0
package/lib/deployers/index.js +38 -0
package/lib/deployment-auth.js +36 -0
package/lib/document-parser.js +171 -0
package/lib/embedder/chunker.js +93 -0
package/lib/embedder/local.js +101 -0
package/lib/embedder/preview-rag.js +93 -0
package/lib/envelope-schema.js +54 -0
package/lib/fleet/scoped-sql.js +342 -0
package/lib/form-schema-config/base.js +135 -0
package/lib/form-schema-config/index.js +286 -0
package/lib/form-schema-config/locales/af-ZA.js +153 -0
package/lib/form-schema-config/locales/ar-AE.js +142 -0
package/lib/form-schema-config/locales/ar-SA.js +164 -0
package/lib/form-schema-config/locales/de-DE.js +152 -0
package/lib/form-schema-config/locales/en-AU.js +161 -0
package/lib/form-schema-config/locales/en-CA.js +115 -0
package/lib/form-schema-config/locales/en-GB.js +132 -0
package/lib/form-schema-config/locales/en-IN.js +219 -0
package/lib/form-schema-config/locales/en-MY.js +171 -0
package/lib/form-schema-config/locales/en-NG.js +198 -0
package/lib/form-schema-config/locales/en-PH.js +186 -0
package/lib/form-schema-config/locales/en-SG.js +153 -0
package/lib/form-schema-config/locales/en-US.js +138 -0
package/lib/form-schema-config/locales/es-ES.js +171 -0
package/lib/form-schema-config/locales/es-MX.js +193 -0
package/lib/form-schema-config/locales/fr-CA.js +138 -0
package/lib/form-schema-config/locales/fr-FR.js +155 -0
package/lib/form-schema-config/locales/hi-IN.js +219 -0
package/lib/form-schema-config/locales/it-IT.js +157 -0
package/lib/form-schema-config/locales/ja-JP.js +169 -0
package/lib/form-schema-config/locales/ko-KR.js +140 -0
package/lib/form-schema-config/locales/nl-NL.js +149 -0
package/lib/form-schema-config/locales/pt-BR.js +168 -0
package/lib/form-schema-config/locales/zh-CN.js +172 -0
package/lib/form-schema-config/locales/zh-HK.js +142 -0
package/lib/form-structure-schema.js +191 -0
package/lib/llm-providers.js +828 -0
package/lib/markdown.js +197 -0
package/lib/mcp/catalysts/appointment-to-calendar.md +84 -0
package/lib/mcp/catalysts/conversations-to-channel-digest.md +104 -0
package/lib/mcp/catalysts/document-extract-to-store.md +92 -0
package/lib/mcp/catalysts/knowledge-gap-miner.md +96 -0
package/lib/mcp/catalysts/loader.js +144 -0
package/lib/mcp/catalysts/qualify-lead-to-crm.md +83 -0
package/lib/mcp/catalysts/scan-conversations-for-signal.md +92 -0
package/lib/mcp/catalysts/submission-to-ticket.md +83 -0
package/lib/mcp/catalysts/submissions-to-warehouse.md +103 -0
package/lib/mcp/catalysts/weekly-submissions-digest.md +82 -0
package/lib/mcp/jobs.js +64 -0
package/lib/mcp/server.js +184 -0
package/lib/mcp/session-binding.js +130 -0
package/lib/mcp/tools/build.js +123 -0
package/lib/mcp/tools/catalysts.js +477 -0
package/lib/mcp/tools/context.js +325 -0
package/lib/mcp/tools/fleet.js +391 -0
package/lib/mcp/tools/jobs-tools.js +240 -0
package/lib/mcp/tools/operate.js +314 -0
package/lib/preview/build-preview-config.js +136 -0
package/lib/rate-limiter.js +11 -0
package/lib/resolve-api-key.js +142 -0
package/lib/storage/index.js +40 -0
package/messages/de.json +2136 -0
package/messages/en.json +2136 -0
package/messages/es.json +2136 -0
package/messages/fr.json +2136 -0
package/messages/it.json +2136 -0
package/messages/ja.json +2136 -0
package/messages/ko.json +2136 -0
package/messages/nl.json +2136 -0
package/messages/pl.json +2136 -0
package/messages/pt.json +2136 -0
package/messages/ru.json +2136 -0
package/messages/uk.json +2136 -0
package/messages/zh.json +2136 -0
package/package.json +68 -5
package/scripts/mcp-config.mjs +162 -0
package/scripts/mcp-stdio-loader.mjs +42 -0
package/scripts/mcp-stdio.mjs +108 -0
package/scripts/mojulo-paths.mjs +48 -0

package/lib/markdown.js ADDED Viewed

@@ -0,0 +1,197 @@
/**
 * Simple markdown to HTML converter for legal documents.
 * Handles: headers, tables, lists, bold, italic, inline code, links, horizontal rules.
 *
 * The input is HTML-escaped first, so every tag in the output is one this
 * converter generated. Inline syntax is rewritten next, then the block-level
 * passes (tables, lists, paragraphs) run over the result line by line.
 *
 * @param {string} markdown - Markdown source. `null`/`undefined` is treated as an empty document.
 * @returns {string} The rendered HTML fragment.
 */
export function markdownToHtml(markdown) {
  // Schemes that run script or embed arbitrary content; never emitted as an href.
  const unsafeUrlScheme = /^(?:javascript|vbscript|data):/i;

  // Normalise CRLF / lone CR so the line-anchored patterns and the
  // split('\n') passes below all see exactly one line per element.
  let html = (markdown == null ? '' : String(markdown)).replace(/\r\n?/g, '\n');

  // Escape HTML entities first (except for our own tags). Double quotes are
  // escaped too, because link targets are written into a quoted attribute.
  html = html
    .replace(/&/g, '&amp;')
    .replace(/</g, '&lt;')
    .replace(/>/g, '&gt;')
    .replace(/"/g, '&quot;');

  // Headers (must be at start of line); longest marker first so "###" is not
  // consumed by the "#" rule.
  html = html.replace(/^### (.+)$/gm, '<h3>$1</h3>');
  html = html.replace(/^## (.+)$/gm, '<h2>$1</h2>');
  html = html.replace(/^# (.+)$/gm, '<h1>$1</h1>');

  // Bold before italic, so "**" is not read as two italic markers.
  html = html.replace(/\*\*(.+?)\*\*/g, '<strong>$1</strong>');
  // An opening "*" followed by whitespace is a list bullet (or a bare
  // asterisk), not emphasis, so it must not start an <em> span.
  html = html.replace(/\*(?!\s)(.+?)\*/g, '<em>$1</em>');

  // Inline code
  html = html.replace(/`([^`]+)`/g, '<code>$1</code>');

  // Links. Targets with a script-capable scheme are rendered as plain text.
  // Whitespace and control characters are stripped before the scheme check
  // because browsers ignore them inside a URL scheme.
  html = html.replace(/\[([^\]]+)\]\(([^)]+)\)/g, (match, text, url) => {
    const compactUrl = url.replace(/[\u0000-\u0020]+/g, '');
    return unsafeUrlScheme.test(compactUrl) ? text : `<a href="${url}">${text}</a>`;
  });

  // Horizontal rules
  html = html.replace(/^---$/gm, '<hr />');

  // Block-level passes: tables, then lists, then paragraphs for whatever
  // plain text remains.
  html = convertTables(html);
  html = convertLists(html);
  html = convertParagraphs(html);
  return html;
}
/**
 * Replace each run of pipe-delimited lines with a single-line HTML table.
 *
 * A table row is a line that, once trimmed, starts and ends with "|" and has
 * at least two characters. Separator rows (only "-", ":", "|" and whitespace
 * between the outer pipes) are dropped when they occur inside a table and are
 * passed through untouched otherwise. Any other line ends the current table.
 *
 * @param {string} html - Text with one logical line per "\n"-separated element.
 * @returns {string} The text with each table run replaced by `buildTable` output.
 */
function convertTables(html) {
  const separatorRow = /^\|[-:\s|]+\|$/;
  const output = [];
  // Rows collected for the table currently being read; null when outside a table.
  let pendingRows = null;

  const flushTable = () => {
    if (pendingRows !== null) {
      output.push(buildTable(pendingRows));
      pendingRows = null;
    }
  };

  for (const line of html.split('\n')) {
    const trimmed = line.trim();
    const isSeparator = separatorRow.test(trimmed);
    // The length guard keeps a lone "|" from counting as both the opening
    // and the closing pipe of an (empty) row.
    const isTableRow =
      !isSeparator && trimmed.length > 1 && trimmed.startsWith('|') && trimmed.endsWith('|');

    if (isTableRow) {
      if (pendingRows === null) {
        pendingRows = [];
      }
      pendingRows.push(line);
    } else if (isSeparator && pendingRows !== null) {
      // Header/body separator: carries no content of its own.
      continue;
    } else {
      flushTable();
      output.push(line);
    }
  }

  // Handle a table that runs to the end of the content.
  flushTable();
  return output.join('\n');
}
/**
 * Split one pipe-delimited table row into trimmed cell strings.
 *
 * The text before the first pipe and after the last pipe is discarded. A pipe
 * preceded by a backslash ("\|") is treated as a literal "|" inside the cell
 * rather than as a cell boundary.
 *
 * @param {string} row - A single table line such as "| a | b |".
 * @returns {string[]} The cell contents, in order.
 */
function splitTableRow(row) {
  const pieces = row.split('|');
  const cells = [];
  let carry = '';
  pieces.forEach((piece, index) => {
    // A trailing backslash escapes the pipe that followed this piece; the
    // final piece has no following pipe, so a backslash there is literal.
    const endsWithEscapedPipe = piece.endsWith('\\') && index < pieces.length - 1;
    if (endsWithEscapedPipe) {
      carry += `${piece.slice(0, -1)}|`;
    } else {
      cells.push(carry + piece);
      carry = '';
    }
  });
  return cells.slice(1, -1).map((cell) => cell.trim());
}

/**
 * Render table rows as a single-line HTML `<table>`.
 *
 * The first row becomes the `<thead>` (with `<th>` cells); all remaining rows
 * go into `<tbody>` as `<td>` cells. Cell text is inserted as-is.
 *
 * @param {string[]} rows - Pipe-delimited table lines, header first.
 * @returns {string} The table markup, or '' when `rows` is empty.
 */
function buildTable(rows) {
  if (rows.length === 0) return '';

  const renderRow = (row, cellTag) => {
    const cells = splitTableRow(row)
      .map((cell) => `<${cellTag}>${cell}</${cellTag}>`)
      .join('');
    return `<tr>${cells}</tr>`;
  };

  const [headerRow, ...bodyRows] = rows;
  const head = `<thead>${renderRow(headerRow, 'th')}</thead>`;
  const body = `<tbody>${bodyRows.map((row) => renderRow(row, 'td')).join('')}</tbody>`;
  return `<table>${head}${body}</table>`;
}
/**
 * Wrap runs of list-item lines in `<ul>` / `<ol>` and turn each item into `<li>`.
 *
 * Lines starting with "-" or "*" plus whitespace are unordered items; lines
 * starting with digits, a dot and whitespace are ordered items. Any other
 * line closes the open list and is passed through unchanged. When the item
 * type changes from one line to the next, the open list is closed and a list
 * of the new type is opened, so items never land in a list of the wrong kind.
 *
 * @param {string} html - Text with one logical line per "\n"-separated element.
 * @returns {string} The text with list runs converted to list markup.
 */
function convertLists(html) {
  const output = [];
  // Tag name of the list currently open: 'ul', 'ol', or null for none.
  let openTag = null;

  // Close the open list and/or open a new one so that `tag` is the open list.
  const switchTo = (tag) => {
    if (openTag === tag) return;
    if (openTag !== null) output.push(`</${openTag}>`);
    if (tag !== null) output.push(`<${tag}>`);
    openTag = tag;
  };

  for (const line of html.split('\n')) {
    // Tolerate CRLF input: "." does not match "\r", so a trailing carriage
    // return would otherwise stop every item from matching.
    const text = line.replace(/\r$/, '');
    const bulletMatch = text.match(/^[-*]\s+(.+)$/);
    const numberedMatch = bulletMatch ? null : text.match(/^\d+\.\s+(.+)$/);
    const itemMatch = bulletMatch || numberedMatch;

    if (itemMatch) {
      switchTo(bulletMatch ? 'ul' : 'ol');
      output.push(`<li>${itemMatch[1]}</li>`);
    } else {
      switchTo(null);
      output.push(line);
    }
  }

  // Close a list that runs to the end of the content.
  switchTo(null);
  return output.join('\n');
}
/**
 * Wrap runs of plain-text lines in `<p>` elements.
 *
 * A line is a block line when, once trimmed, it is empty, is exactly
 * "<tbody>", or starts with one of the generated block tags (headings and
 * rules via "<h", tables, lists, table rows/cells) or with any closing tag.
 * Block lines end the current paragraph; empty ones are dropped, the rest are
 * emitted on their own line. Consecutive non-block lines are trimmed and
 * joined with a single space into one paragraph, so indentation and stray
 * carriage returns never end up inside the `<p>`.
 *
 * @param {string} html - Text with one logical line per "\n"-separated element.
 * @returns {string} The text with remaining plain-text runs wrapped in `<p>`.
 */
function convertParagraphs(html) {
  // "<h" covers <h1>-<h3> and <hr />; "<th" also covers <thead>; "</" covers
  // every closing tag.
  const blockPrefixes = ['<h', '<table', '<ul', '<ol', '<li', '</', '<tr', '<th', '<td'];
  const isBlockElement = (trimmed) =>
    trimmed === '' ||
    trimmed === '<tbody>' ||
    blockPrefixes.some((prefix) => trimmed.startsWith(prefix));

  const output = [];
  let pending = [];

  const flushParagraph = () => {
    if (pending.length > 0) {
      output.push(`<p>${pending.join(' ')}</p>`);
      pending = [];
    }
  };

  for (const line of html.split('\n')) {
    const trimmed = line.trim();
    if (!isBlockElement(trimmed)) {
      pending.push(trimmed);
      continue;
    }
    flushParagraph();
    // Blank lines only separate paragraphs; they are not emitted.
    if (trimmed !== '') {
      output.push(trimmed);
    }
  }

  flushParagraph();
  return output.join('\n');
}

package/lib/mcp/catalysts/appointment-to-calendar.md ADDED Viewed

@@ -0,0 +1,84 @@
+---
+{
+  "id": "appointment-to-calendar",
+  "name": "Appointment booking to calendar",
+  "summary": "Sync new appointment-protocol bookings into a calendar MCP (Google Calendar, Cal.com, Outlook), with attendee + reminder wiring.",
+  "valueHook": "Bookings the bot collects show up as real calendar events for your team, with attendees invited and reminders set — no manual transfer.",
+  "version": 1,
+  "category": "calendar",
+  "requires": {
+    "protocols": ["appointments"],
+    "destinationMcpCategory": "calendar-like",
+    "destinationExamples": ["Google Calendar", "Cal.com", "Outlook Calendar", "Fastmail Calendar"]
+  },
+  "parameters": [
+    {
+      "name": "calendarId",
+      "prompt": "Which calendar should bookings land in? (calendar id, email, or workspace identifier — depends on the calendar MCP)"
+    },
+    {
+      "name": "defaultDuration",
+      "prompt": "Default appointment duration in minutes when the submission doesn't specify one?",
+      "default": 30
+    },
+    {
+      "name": "attendeeFields",
+      "prompt": "Which submission fields hold attendee identity? (typically email and name)"
+    },
+    {
+      "name": "sendInvites",
+      "prompt": "Should the calendar event send email invites to attendees? (true/false)",
+      "default": false
+    }
+  ],
+  "mcpTools": {
+    "mojulo": ["query_submissions", "get_deployment"],
+    "destination": {
+      "description": "A calendar-like MCP exposing event create with start/end, attendees, and (optionally) reminders. Examples: Google Calendar, Cal.com, Outlook."
+    }
+  }
+}
+---
+# Appointment booking to calendar
+The `appointments` protocol captures a user's preferred time and contact info into a submission. This catalyst lifts that submission into a real calendar event so the user (or the booked party) sees it on their schedule.
+## How to synthesize the skill
+1. `get_deployment(deploymentId)` — read the appointments config and form schema. The appointments protocol stores the captured slot in a known field shape; map it before guessing.
+2. Ask the user the four `parameters` questions.
+3. Inspect the destination MCP's event-create surface — timezone handling is the part that varies most. Google Calendar wants `start.dateTime` + `start.timeZone`; Cal.com handles it implicitly via booking type.
+4. Write `.claude/skills/<bot-slug>-calendar-sync/SKILL.md`.
+## Mapping intent
+The appointment submission typically holds:
+- A datetime (the booked slot) — UTC ISO or a local time + timezone. **Always normalize to UTC before passing to the calendar MCP**, even if the MCP accepts local; calendar-MCP timezone bugs are the #1 source of off-by-an-hour incidents.
+- Attendee identity (name + email at minimum) from `attendeeFields`.
+- Optional context (chief complaint, service type, notes) → event description.
+Event composition:
+- **Title:** `{serviceType or 'Appointment'} — {attendeeName}`
+- **Description:** the submission notes plus a mojulo trace footer (submission id, conversation id, deployment id) so the calendar entry is traceable back to the source conversation.
+- **Duration:** the submission's `duration` field if present, else `defaultDuration`.
+- **Attendees:** the user's calendar always; the booked party only if `sendInvites=true` AND the submission includes a valid email.
+## Idempotency
+Each event create should attach the `mojulo_submission_id` as a custom property (Google Calendar `extendedProperties.private`; Cal.com booking metadata). Search-before-create on that property to avoid duplicates on re-run. The `since` cursor is the primary defense; the property is the safety net.
+## Pitfalls
+- **Timezone bugs.** Already called out above — surface this prominently in the synthesized skill. If the bot serves users across timezones, the appointment slot's timezone has to be carried, not assumed.
+- **`sendInvites` is irreversible.** Once an invite email is sent, it can't be unsent. Default to `false`. Make the user explicitly opt in per run, not just at synthesis time.
+- **Cancellations.** This skill creates events; cancellations through the bot (if any) aren't propagated. If the user needs that flow, it's a separate skill — note this as a limitation.
+- **No-shows / reschedules.** Mojulo doesn't currently observe these. The calendar is the source of truth post-booking.
+## Skill behavior contract
+- **Inputs:** `deploymentId` (required), `since` (optional ISO), `dryRun` (default true), `sendInvites` (default false, requires explicit per-run flag for true)
+- **Outputs:** per-submission decision log `{ submissionId, calendarEventId?, action: 'created' | 'duplicate-skipped' | 'invalid-slot' }`
+- **Side effects (live mode):** calendar event create via destination MCP. Email invites only when `sendInvites=true`.

package/lib/mcp/catalysts/conversations-to-channel-digest.md ADDED Viewed

@@ -0,0 +1,104 @@
+---
+{
+  "id": "conversations-to-channel-digest",
+  "name": "Conversation digest to channel",
+  "summary": "Generate a recurring narrative summary of what end users have been saying to the bot — themes, recurring questions, sentiment, notable conversations — and post to a channel (Slack/email/Notion).",
+  "valueHook": "A recurring narrative of what users are actually saying to the bot, posted where your team already pays attention.",
+  "version": 1,
+  "category": "digest",
+  "requires": {
+    "protocols": [],
+    "destinationMcpCategory": "channel-like",
+    "destinationExamples": ["Slack", "Gmail", "Notion", "Microsoft Teams", "Discord"]
+  },
+  "parameters": [
+    {
+      "name": "cadenceDescription",
+      "prompt": "How often will this run and what window should each digest cover? (e.g., 'weekly, covering the prior 7 days')"
+    },
+    {
+      "name": "summaryAxes",
+      "prompt": "What dimensions of conversation should the digest highlight? (e.g., 'recurring questions, sentiment trends, novel topics, escalation candidates' — pick 2-4)"
+    },
+    {
+      "name": "sampleCeiling",
+      "prompt": "If the window has more conversations than this number, sample at this size rather than read everything. Defaults to 100; lower for cost control, higher for completeness.",
+      "default": 100
+    },
+    {
+      "name": "outputChannel",
+      "prompt": "Where does this land? (e.g., 'Slack #cs-insights', 'email to team@example.com', 'Notion page in workspace X')"
+    },
+    {
+      "name": "audienceTone",
+      "prompt": "Who reads this and how formal should the summary be? (e.g., 'engineering team — terse, bullet-heavy', 'leadership — narrative, qualitative', 'support manager — actionable, ticket-oriented')"
+    }
+  ],
+  "mcpTools": {
+    "mojulo": ["query_conversations", "get_conversation", "get_deployment"],
+    "destination": {
+      "description": "A channel-like MCP that posts narrative content. Slack (post_message), Gmail (send_email), Notion (create_page or append_block), Microsoft Teams, Discord, or any messaging surface."
+    }
+  }
+}
+---
+# Conversation digest to channel
+This catalyst is distinct from `weekly-submissions-digest`: that one summarizes *structured submissions* (counts, breakdowns, notable rows). This one summarizes *conversation content* — what end users actually said to the bot, in their own words. Two very different sources, two very different digest shapes. Many bots benefit from both running in parallel — submissions tell you *what was captured*, conversations tell you *what was asked*.
+The output is a narrative report posted to a channel where the audience reads it without clicking through to the dashboard. The value is keeping the operating team aware of how the bot is being *used* without anyone manually scrubbing conversations.
+## How to synthesize the skill
+1. `get_deployment(deploymentId)` — read the bot's identity and protocols. The identity (industry, role, customer base) shapes how you interpret what users are saying; "frustration" means different things on a dental-intake bot vs. a SaaS-support bot.
+2. Ask the user the five `parameters` questions, batched.
+3. Inspect the destination MCP's post surface — markdown support, message length limits, threading capability. Slack's `post_message` has length limits and benefits from a `blocks` payload; email allows long-form HTML; Notion allows arbitrarily long structured pages. The digest's render form adapts to the destination.
+4. Write `.claude/skills/<bot-slug>-conv-digest/SKILL.md`. The skill takes `deploymentId`, `windowStart`, `windowEnd` as inputs.
+## Digest composition
+Four sections, in this order:
+1. **Header** — bot name, window covered, total conversations, total turns, average conversation length. One line each.
+2. **Recurring questions / themes** — cluster conversations by the user's underlying question or topic. Surface the top 3-7 clusters with: a canonical phrasing of the question, observation count, 1-2 representative quotes (PII-redacted), and any pattern in how the bot handled them. This is the section the audience reads most carefully — it's the closest thing to "voice of the customer" from the bot's vantage.
+3. **Sentiment / friction signals** — conversations where the user expressed frustration, repeated the same question, gave up, or escalated. Bounded list (top 3-5), each with conversation id and a one-line summary. Distinguish "user gave up because bot couldn't help" from "user got what they needed and left" — the former is the actionable signal.
+4. **Novel topics** (optional, if window > 2 weeks) — questions or topics that appeared this window but not in prior windows. Catches drift in customer concerns over time. Skip in narrow-window digests; the signal-to-noise is bad below ~2 weeks.
+## Sampling discipline
+Conversation reading is expensive (every conversation requires a `get_conversation` call + LLM read). The `sampleCeiling` defaults to 100 to keep cost predictable. If the window has more conversations than that:
+- For clustering (themes/questions): random sample to ceiling. Quality plateaus around 100 for most clustering work; doubling rarely doubles signal.
+- For friction signals: prioritize keeping the most recent N rather than random — fresh frustration matters more than old.
+`query_conversations` returns summaries cheaply; use those to make the sampling decision before calling `get_conversation` for the full turns. This is the key efficiency trick for this catalyst.
+## Output adaptation per destination
+- **Slack** — `blocks` payload, bullet-heavy, each theme in its own section. Length cap matters; if the digest is long, post a summary in the channel and link to a thread with the full content.
+- **Email** — long-form HTML or markdown is fine. Include a TL;DR at the top for the inbox preview.
+- **Notion** — structured page with headings per section. Notion preserves rich-text and tables well; lean into that. Search-before-create on the page title to update an existing digest rather than spawn duplicates per run.
+- **Teams/Discord** — similar to Slack but the API shapes differ; adapt to what the bound MCP exposes.
+## Idempotency
+Less critical than for write-side catalysts — re-running just re-posts. But:
+- **Notion/Doc destinations:** search-before-create on the page title to update rather than spawn duplicates.
+- **Slack/email destinations:** no idempotency surface. Default the synthesized skill to `--dry-run` mode that prints the digest to stdout; `--send` required for live posting.
+- **Empty windows:** a bot with no conversations in the window shouldn't produce a noisy "0 conversations" digest. Default to skip-when-empty unless the user explicitly wants the heartbeat.
+## Pitfalls
+- **PII in quotes.** Sample utterances may contain names, emails, account numbers, location. The digest's value is the *pattern*, not the asker. Redact aggressively before including any direct quote — substitute placeholders for identity. The redaction step is non-negotiable in the synthesized skill; don't make it optional.
+- **Over-summarization hides the signal.** Resist the urge to compress every quote to a generic "users asked about pricing." A specific quote — properly redacted — communicates the texture of what users actually said, which is the point. Aim for 1-2 lightly-edited verbatim quotes per cluster.
+- **Calibration drift.** "Frustration" or "novel topic" are model judgements. If the bot's domain shifts (new product launches, new customer segment), the model's calibration drifts. Recommend the user re-run the catalyst flow when the bot's identity or domain changes substantially.
+- **Don't surface conversations that ended in handoff.** If `triage` is enabled, conversations that handed off to another bot already got attention from that downstream — including them as "friction" double-counts. Filter handoffs out of the friction signal section unless the user wants them.
+- **Volume bias.** A loud, repeating user can dominate a recurring-question cluster. When sampling, deduplicate by conversation id (one observation per conversation) before counting frequency, so a question repeated many times within a single conversation is counted once.
+## Skill behavior contract
+- **Inputs:** `deploymentId` (required), `windowStart` and `windowEnd` (optional ISO — defaults derived from cadence), `sampleCeiling` (default from parameter), `dryRun` (default true)
+- **Outputs:** the rendered digest (printed in dry-run mode; posted otherwise)
+- **Side effects (live mode):** one document/message create or update via destination MCP. No mojulo-side writes.

package/lib/mcp/catalysts/document-extract-to-store.md ADDED Viewed

@@ -0,0 +1,92 @@
+---
+{
+  "id": "document-extract-to-store",
+  "name": "Optical extraction to durable store",
+  "summary": "Persist optical-read extractions to a structured store (Notion/Airtable/Sheets rows) or a vector store (Pinecone/Qdrant/Chroma chunks), preserving traceability back to the source image and submission.",
+  "valueHook": "Photos and screenshots the bot reads become queryable rows or searchable embeddings — extractions stop being one-shot.",
+  "version": 1,
+  "category": "extraction-pipeline",
+  "requires": {
+    "protocols": ["opticalRead"],
+    "optionalProtocols": ["formGathering"],
+    "destinationMcpCategory": "data-store-like",
+    "destinationExamples": ["Notion", "Airtable", "Google Sheets", "Pinecone", "Qdrant"]
+  },
+  "parameters": [
+    {
+      "name": "destinationMode",
+      "prompt": "Where should extracted fields land — a structured table (Notion/Airtable/Sheets, rows + columns), or a vector store (Pinecone/Qdrant/Chroma, chunks + embeddings)? If the user has only one of the two installed, pick that and confirm."
+    },
+    {
+      "name": "recordKey",
+      "prompt": "Which field uniquely identifies a record for dedupe? (typically a document number, claim id, policy id, or a hash of the extracted-field tuple when no natural key exists)"
+    },
+    {
+      "name": "fieldMapping",
+      "prompt": "How should the bot's extractedFields map to the destination? For table mode: field name → column name pairs. For vector mode: which fields are chunked, which become metadata filters?"
+    },
+    {
+      "name": "imageRetention",
+      "prompt": "Should the synthesized skill include a URL/path back to the original image in each record? (true/false — depends on whether the bot serves the image bytes long-term)",
+      "default": false
+    }
+  ],
+  "mcpTools": {
+    "mojulo": ["query_submissions", "get_conversation", "get_deployment"],
+    "destination": {
+      "description": "A data-store-like MCP. Two shapes are supported: (a) structured table MCPs (Notion, Airtable, Google Sheets, Coda) exposing row create/upsert with named columns; (b) vector store MCPs (Pinecone, Qdrant, Chroma, Weaviate) exposing embed + upsert with metadata. The synthesized skill commits to one shape per skill instance — write two skills if the user wants both."
+    }
+  }
+}
+---
+# Optical extraction to durable store
+The `opticalRead` protocol turns uploaded images (claim forms, IDs, lab results, receipts, contracts) into a structured `extractedFields` payload that gets attached to the submission. This catalyst takes that structured output and persists it to a long-term store where downstream systems — analytics, lookup tools, RAG corpora — can use it.
+## How to synthesize the skill
+1. `get_deployment(deploymentId)` — read the optical-read configuration. The `extractedFields` schema (`idName`, `label`, `hint`) tells you exactly what fields each scan produces. **This is your source-of-truth for `fieldMapping`** — never invent fields the bot doesn't extract.
+2. Ask the user the four `parameters` questions, batched. The `destinationMode` answer is the load-bearing branch — table mode and vector mode synthesize different skills.
+3. Inspect the bound destination MCP. Confirm it matches `destinationMode` (a row-creation surface for table mode, an embed+upsert surface for vector mode). If the user has a vector store MCP but answered "table," ask — don't force-fit.
+4. Write `.claude/skills/<bot-slug>-extract-to-<destination-slug>/SKILL.md`. The skill takes `deploymentId` and `since` as inputs.
+## Mapping intent — table mode (Notion, Airtable, Sheets, Coda)
+Each submission with an `extractedFields` payload becomes one row. Columns are derived from the `fieldMapping`:
+- **Identity column** — the `recordKey` field. Used for upsert (search-before-create); this is the row's primary key from the destination's perspective.
+- **Data columns** — one per extracted field. Map `idName` to the destination column. Preserve types: dates as dates, currency as numbers, strings as strings. Do not coerce everything to text.
+- **Mojulo trace columns** — `mojulo_submission_id`, `mojulo_deployment_id`, `mojulo_captured_at`, optionally `mojulo_conversation_id`. Always include. The reviewer downstream needs to walk back to the source conversation when an extraction looks wrong.
+- **Confidence/quality columns (optional)** — if the optical-read output carries per-field confidence, surface it. A column like `extraction_quality: 'high' | 'medium' | 'low'` lets the reviewer prioritize what to spot-check.
+Field-to-column mapping that doesn't fit — extracted fields with no destination column — should prompt the user during synthesis, not be silently dropped. If the destination has a JSON/blob column, fall back to a `raw_extraction` JSON dump for unmapped fields; otherwise ask.
+## Mapping intent — vector mode (Pinecone, Qdrant, Chroma, Weaviate)
+Each submission with an `extractedFields` payload becomes one **or more** vector records. The chunking and metadata design is where vector mode earns its keep:
+- **Chunking choice.** Two reasonable defaults: (a) one chunk per submission, concatenating `label: value` pairs into a single text string for embedding; (b) one chunk per extracted field, embedded as `<field label>: <value>` so semantic search can find documents matching a specific field pattern. Default to (a) unless the user's intent (per `fieldMapping`) names specific fields as standalone search targets.
+- **Metadata.** Every chunk carries: `submission_id`, `deployment_id`, `captured_at`, `record_key` (the value of the `recordKey` field). Also any extracted field the user named as a metadata filter — these become the structured-filter dimensions for hybrid retrieval (e.g., `claim_year: 2026`).
+- **Embedding choice.** The destination MCP usually exposes embedding internally (Pinecone has its own; Qdrant integrates with several). Use the destination's own embedding pipeline rather than re-embedding from Claude. If the destination requires pre-embedded vectors, the user has to provide an embedding tool (separate MCP or local helper) — this is the one case to ask before assuming.
+- **Namespace / collection.** Default to per-deployment namespace (`mojulo_<deploymentId>`), so multiple bots writing to the same vector store don't pollute each other.
+## Idempotency
+**Both modes** use `since` as the primary high-water cursor on submission timestamp. Search-before-upsert on `recordKey` is the safety net for re-runs and duplicate submissions.
+**Vector mode adds a wrinkle:** if the chunking choice is per-field (option (b) under "Chunking choice" above) and the same submission is reprocessed, you get N chunks per submission and need to delete the prior N before re-upserting. Most vector MCPs expose a `delete-by-metadata` operation (filter on `submission_id`) — use it before upsert. The synthesized skill should make this explicit; silent N+N+N growth on re-runs is the most common bug here.
+## Pitfalls
+- **Extraction confidence is variable.** Optical-read is not perfect. Documents with low confidence shouldn't be auto-promoted to a system-of-record store. Recommend the synthesized skill default to a confidence threshold (e.g., skip-and-log when any required field is below `medium`), with the user opting into "include all" if they're staging for review.
+- **PII in the destination.** Optical-read often captures sensitive fields (DOB, SSN, insurance ids, addresses). Tables and vector stores typically have broader access than the bot's own SQLite. Confirm with the user during synthesis which fields should be redacted, hashed, or excluded entirely before landing. Default to including everything the user says to include — but the question is non-skippable.
+- **Vector store costs scale with rerun.** Vector upserts cost per-vector and per-embedding-call. A wide `since` window on first run can be expensive. Recommend starting with a 1-day window, validating the chunk shape, then widening.
+- **Schema drift.** If the bot's `opticalRead` extraction fields change later (new field added, label renamed), the table schema or vector metadata schema will silently misalign. The synthesized skill should fail-loud on schema mismatch rather than silently dropping fields — and recommend the user re-run the catalyst flow when the bot's extraction config changes.
+- **Image retention is a side concern.** If `imageRetention=true`, the URL/path included in each record only stays valid as long as the bot serves the image. If the bot rotates or deletes old uploads, the link breaks. Don't promise long-term access the bot doesn't deliver.
+## Skill behavior contract
+- **Inputs:** `deploymentId` (required), `since` (optional ISO, default 24h ago or last-cursor), `confidenceThreshold` (string, default `medium`), `dryRun` (default true)
+- **Outputs:** per-submission decision log: `{ submissionId, recordKey, action: 'inserted' | 'updated' | 'skipped-low-confidence' | 'skipped-duplicate' | 'failed', destinationRecordId?, chunkCount? }`. Vector mode adds `chunkCount` per record.
+- **Side effects (live mode):** row create/upsert (table mode) or chunk delete+upsert (vector mode) via destination MCP. No mojulo-side writes.

package/lib/mcp/catalysts/knowledge-gap-miner.md ADDED Viewed

@@ -0,0 +1,96 @@
+---
+{
+  "id": "knowledge-gap-miner",
+  "name": "Knowledge gap miner",
+  "summary": "Analyze recent conversations on a knowledge-protocol bot to find questions the RAG corpus answered poorly, and propose additions to the user's documentation backlog.",
+  "valueHook": "Find the questions your RAG corpus is answering badly, so your docs can fill the gap before users notice.",
+  "version": 1,
+  "category": "rag-curation",
+  "requires": {
+    "protocols": ["knowledge"],
+    "destinationMcpCategory": "optional-doc-backlog",
+    "destinationExamples": ["Notion", "Linear", "GitHub Issues"]
+  },
+  "parameters": [
+    {
+      "name": "lookbackWindow",
+      "prompt": "How far back should this scan? (e.g., '7 days', '30 days')",
+      "default": "14 days"
+    },
+    {
+      "name": "minOccurrences",
+      "prompt": "How many times must a gap be observed before it's worth surfacing?",
+      "default": 2
+    },
+    {
+      "name": "backlogDestination",
+      "prompt": "Where should proposed doc additions go? (e.g., 'a Notion page', 'a Linear ticket per gap', 'just print to stdout' — leave empty for stdout-only)"
+    }
+  ],
+  "mcpTools": {
+    "mojulo": ["query_conversations", "get_conversation", "get_deployment"],
+    "destination": {
+      "description": "Optional. If specified, a doc/backlog MCP that can accept proposed additions. Examples: Notion (create_page), Linear (issue_create), GitHub (create_issue)."
+    }
+  }
+}
+---
+# Knowledge gap miner
+A `knowledge`-protocol bot answers from its RAG corpus. When it doesn't have a good answer — vague reply, hedged response, "I don't have information about that" — that's a signal the corpus is missing something users actually ask about. This catalyst mines those signals and turns them into a deduplicated, prioritized backlog of doc additions.
+Unlike the other catalysts, the destination is **optional**. The most useful output is often just the printed list — a focused weekly review by whoever owns the corpus. A backlog MCP is a nice-to-have.
+## How to synthesize the skill
+1. `get_deployment(deploymentId)` — confirm the `knowledge` protocol is active. Read the bot's domain identity; it shapes how you interpret "gap."
+2. Ask the user the three `parameters` questions.
+3. If `backlogDestination` was given, inspect that MCP's create surface.
+4. Write `.claude/skills/<bot-slug>-gap-miner/SKILL.md`.
+## Detection logic
+Walk recent conversations (`query_conversations` with `since` derived from `lookbackWindow`, then `get_conversation` per id). For each conversation, scan the bot's turns for **weak-answer signals**:
+- Explicit declines: "I don't have information about that," "I can't find that in my knowledge base," "you'd need to contact support for that"
+- Hedging: "based on what I can tell," "I'm not entirely sure," "you may want to verify"
+- Topic-deflection: bot answers a *related* question rather than the one asked
+- User dissatisfaction cues: user re-phrases the same question, user says "that's not what I asked," user abandons the conversation after a vague answer
+For each weak-answer turn, extract the **user's underlying question** as a short canonical phrasing (not a quote — a generalization). This is the gap.
+## Clustering and dedup
+Cluster gaps by semantic similarity across the window. One user asking "what are your hours" three times is one gap, three observations. Three different users asking variations of "how do I cancel" is one gap, three observations.
+Surface only clusters with ≥ `minOccurrences` observations. This filters out one-off questions so that only genuine corpus gaps remain.
+## Proposal composition
+For each surfaced gap, generate:
+- **Canonical question** — the gap as a documentable Q
+- **Observation count** — how many observations fell into this cluster (the same number compared against `minOccurrences`)
+- **Sample utterances** — 2-3 actual phrasings from real conversations (with conversation ids for traceability)
+- **Proposed addition** — a short paragraph the user could paste into their docs as a starting point. Mark this clearly as **proposed, not authoritative** — the user must review before adding to the corpus.
+The user re-uploads accepted additions through the normal mojulo document-upload flow (the [`upload_document_from_url`](docs/mcp-integration.md) tool) — this skill does **not** modify the bot's corpus directly.
+## Output
+- **Always:** a markdown report printed to stdout (or, in Claude Code, returned as the skill's result text). The user reads it.
+- **If `backlogDestination` is configured:** one entry per surfaced gap in the destination. For Linear: one issue per gap. For Notion: one page (or one row in a database). Each entry includes the conversation ids so the reviewer can drill back.
+## Pitfalls
+- **Weak-answer false positives.** A bot that's been told to hedge ("I'm an AI, please verify with...") will look like it has gaps everywhere. Calibrate by reading the bot's identity prompt — if hedging is configured behavior, raise the bar for what counts as weak.
+- **PII in the report.** Sample utterances may contain identity. Redact aggressively — the report's value is the *question pattern*, not the asker. Replace names/emails/specific identifiers with placeholders before including.
+- **Don't auto-add to corpus.** The corpus is the bot's behavior. Silent additions are surprise behavior changes. Always go through the user — propose, never inject.
+- **Cadence.** Once-a-week or once-a-month is plenty. Running this daily produces noise and the corpus doesn't change that fast.
+## Skill behavior contract
+- **Inputs:** `deploymentId` (required), `lookbackWindow` (default `14 days`), `minOccurrences` (default 2), `backlogDestination` (optional; stdout-only when empty), `dryRun` (default true)
+- **Outputs:** the gap report (always), per-gap destination action results (when configured)
+- **Side effects (live mode, only if destination configured):** one entry per gap in the destination. **Never writes to the bot's corpus** — that path is user-mediated through document upload.