npm - membot - Versions diffs - 0.5.2 → 0.7.0 - Mend

membot 0.5.2 → 0.7.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (45)

package/.claude/skills/membot.md +25 -10
package/.cursor/rules/membot.mdc +25 -10
package/README.md +36 -4
package/package.json +8 -5
package/scripts/apply-patches.sh +0 -11
package/scripts/build-test-docx.ts +84 -0
package/src/cli.ts +2 -2
package/src/commands/login-page.mustache +50 -0
package/src/commands/login.ts +83 -0
package/src/config/schemas.ts +23 -5
package/src/constants.ts +20 -1
package/src/context.ts +1 -24
package/src/db/files.ts +21 -25
package/src/db/migrations/003-downloader-columns.ts +58 -0
package/src/db/migrations.ts +2 -1
package/src/ingest/converter/docx.ts +47 -5
package/src/ingest/converter/html.ts +10 -3
package/src/ingest/converter/image.ts +40 -3
package/src/ingest/converter/images-inline.ts +132 -0
package/src/ingest/converter/index.ts +13 -3
package/src/ingest/converter/xlsx.ts +111 -0
package/src/ingest/downloaders/browser.ts +180 -0
package/src/ingest/downloaders/generic-web.ts +81 -0
package/src/ingest/downloaders/github.ts +178 -0
package/src/ingest/downloaders/google-docs.ts +56 -0
package/src/ingest/downloaders/google-shared.ts +86 -0
package/src/ingest/downloaders/google-sheets.ts +58 -0
package/src/ingest/downloaders/google-slides.ts +53 -0
package/src/ingest/downloaders/index.ts +182 -0
package/src/ingest/downloaders/linear.ts +291 -0
package/src/ingest/fetcher.ts +104 -129
package/src/ingest/ingest.ts +44 -71
package/src/mcp/instructions.ts +4 -2
package/src/operations/add.ts +6 -4
package/src/operations/info.ts +4 -6
package/src/operations/move.ts +2 -3
package/src/operations/refresh.ts +2 -4
package/src/operations/remove.ts +23 -2
package/src/operations/tree.ts +1 -1
package/src/operations/types.ts +1 -1
package/src/refresh/runner.ts +60 -115
package/src/types/text-modules.d.ts +5 -0
package/patches/@evantahler%2Fmcpx@0.21.4.patch +0 -51
package/src/commands/mcpx.ts +0 -112
package/src/ingest/agent-fetcher.ts +0 -639

package/.claude/skills/membot.md CHANGED Viewed

@@ -26,15 +26,26 @@ membot search "<question>"          # hybrid search (semantic + keyword)
 ## 2. Ingest
 ```bash
-membot add ./README.md                            # single file
-membot add ./docs                                 # recursive directory walk
-membot add "docs/**/*.md"                         # glob
-membot add a.md b.md "docs/**/*.md"               # any number of args; each resolved independently
-membot add https://example.com/spec.pdf           # URL (auto-converted to markdown)
-membot add "inline:Decision: use X because Y"     # literal text
-membot add ./docs --refresh-frequency 24h         # auto-refresh every day
+membot add ./README.md                                            # single file
+membot add ./docs                                                 # recursive directory walk
+membot add "docs/**/*.md"                                         # glob
+membot add a.md b.md "docs/**/*.md"                               # any number of args; each resolved independently
+membot add https://docs.google.com/document/d/<ID>/edit           # Google Docs/Sheets/Slides via export endpoints
+membot add https://github.com/<owner>/<repo>/issues/<n>           # GitHub issues + PRs (with comments)
+membot add https://linear.app/<workspace>/issue/<KEY>             # Linear issues + projects
+membot add https://example.com/spec.pdf                           # any other URL (browser print-to-PDF fallback)
+membot add "inline:Decision: use X because Y"                     # literal text
+membot add ./docs --refresh-frequency 24h                         # auto-refresh every day
 ```
+Remote URLs go through per-service downloaders. Google needs cookies
+captured by `membot login` (one-time browser sign-in); GitHub and
+Linear need API keys set via
+`membot config set downloaders.<svc>.api_key`. If a fetch fails with
+an auth error, the `HelpfulError` will tell you exactly which command
+to run. Fetches are non-interactive — they never open a browser
+during ingest or refresh.
 Each entry becomes a new version under its own `logical_path`. PDFs/DOCX/HTML are converted to markdown; images get vision captions; original bytes are kept and reachable via `membot read --bytes`.
 The default `logical_path` mirrors the source path so files with the same basename in different projects don't collide:
@@ -77,6 +88,7 @@ membot refresh                         # refresh all rows whose schedule has ela
 membot mv old/path new/path            # rename (history preserved under both)
 membot rm <paths...>                   # tombstone one or more paths/globs (history still queryable)
 membot rm "docs/**/*.md" notes/old.md  # globs match logical_paths in the DB; literals + globs can mix
+membot rm -r remotes/docs.google.com   # --recursive removes every path under a directory prefix
 membot prune --before <iso-ts>         # drop non-current versions older than cutoff (irreversible)
 ```
@@ -116,16 +128,17 @@ Tombstones hide a path from `ls` / `tree` / `search` but `versions` and `read --
 | `membot read <path>`                  | Read current markdown surrogate (or `--bytes` for original)                    |
 | `membot write <path> --content <txt>` | Write inline agent-authored markdown as a new version                          |
 | `membot search <query>`               | Hybrid search (semantic + BM25); add `--include-history` to search older versions |
-| `membot info <path>`                  | Inspect metadata (source, fetcher, refresh schedule, digests) without content  |
+| `membot info <path>`                  | Inspect metadata (source, downloader, refresh schedule, digests) without content |
 | `membot versions <path>`              | List every version newest-first with version_id and change notes               |
 | `membot diff <path> --a <ts>`         | Unified diff between two versions                                              |
 | `membot mv <old> <new>`               | Rename a logical_path (history preserved)                                      |
-| `membot rm <paths...>`                | Tombstone one or more logical_paths or globs (e.g. `"docs/**/*.md"`); history kept |
+| `membot rm <paths...>`                | Tombstone one or more logical_paths or globs (e.g. `"docs/**/*.md"`); pass `-r` / `--recursive` to remove a directory prefix; history kept |
 | `membot refresh [path]`               | Re-read source; create new version only if bytes changed                       |
 | `membot prune --before <ts>`          | Permanently drop non-current versions older than cutoff (irreversible)         |
 | `membot serve`                        | Start MCP server (stdio default, `--http <port>` for HTTP)                     |
 | `membot reindex`                      | Rebuild the FTS keyword index over current chunks                              |
 | `membot config <subcommand>`          | Host-side config management (`get` / `set` / `unset` / `list` / `path`). **Don't run** — this is for the human operator, not for agents |
+| `membot login`                        | Open a browser to sign into Google / GitHub / Linear / etc. (one-time host-side setup). **Don't run** — this is for the human operator |
 ## Output formats
@@ -137,7 +150,9 @@ Tombstones hide a path from `ls` / `tree` / `search` but `versions` and `read --
 ## Troubleshooting
 - **"ingest failed: unsupported mime"** → Add a converter or pass `--bytes` to keep the original; LLM-fallback only runs when `ANTHROPIC_API_KEY` is set.
-- **"refresh failed: auth"** → The original fetch used an authenticated mcpx tool; re-auth via `mcpx auth <server>`.
+- **"refresh failed: auth"** for a Google URL → cookies expired. Run `membot login` to refresh the browser session.
+- **"refresh failed: auth"** for a GitHub URL → set the PAT via `membot config set downloaders.github.api_key <PAT>` (or export `GITHUB_TOKEN`).
+- **"refresh failed: auth"** for a Linear URL → set the personal API key via `membot config set downloaders.linear.api_key <KEY>` (create one at `linear.app/settings/api`).
 - **Search returns nothing** → Confirm the file ingested with `membot info <path>`; if needed, run `membot reindex` to rebuild the FTS keyword index.
 - **Stale results after manual DB edits** → `membot reindex`.
 - **Two paths point at the same content** → `membot mv` doesn't merge; tombstone one with `membot rm`.

package/.cursor/rules/membot.mdc CHANGED Viewed

@@ -26,15 +26,26 @@ membot search "<question>"          # hybrid search (semantic + keyword)
 ## 2. Ingest
 ```bash
-membot add ./README.md                            # single file
-membot add ./docs                                 # recursive directory walk
-membot add "docs/**/*.md"                         # glob
-membot add a.md b.md "docs/**/*.md"               # any number of args; each resolved independently
-membot add https://example.com/spec.pdf           # URL (auto-converted to markdown)
-membot add "inline:Decision: use X because Y"     # literal text
-membot add ./docs --refresh-frequency 24h         # auto-refresh every day
+membot add ./README.md                                            # single file
+membot add ./docs                                                 # recursive directory walk
+membot add "docs/**/*.md"                                         # glob
+membot add a.md b.md "docs/**/*.md"                               # any number of args; each resolved independently
+membot add https://docs.google.com/document/d/<ID>/edit           # Google Docs/Sheets/Slides via export endpoints
+membot add https://github.com/<owner>/<repo>/issues/<n>           # GitHub issues + PRs (with comments)
+membot add https://linear.app/<workspace>/issue/<KEY>             # Linear issues + projects
+membot add https://example.com/spec.pdf                           # any other URL (browser print-to-PDF fallback)
+membot add "inline:Decision: use X because Y"                     # literal text
+membot add ./docs --refresh-frequency 24h                         # auto-refresh every day
 ```
+Remote URLs go through per-service downloaders. Google needs cookies
+captured by `membot login` (one-time browser sign-in); GitHub and
+Linear need API keys set via
+`membot config set downloaders.<svc>.api_key`. If a fetch fails with
+an auth error, the `HelpfulError` will tell you exactly which command
+to run. Fetches are non-interactive — they never open a browser
+during ingest or refresh.
 Each entry becomes a new version under its own `logical_path`. PDFs/DOCX/HTML are converted to markdown; images get vision captions; original bytes are kept and reachable via `membot read --bytes`.
 The default `logical_path` mirrors the source path so files with the same basename in different projects don't collide:
@@ -77,6 +88,7 @@ membot refresh                         # refresh all rows whose schedule has ela
 membot mv old/path new/path            # rename (history preserved under both)
 membot rm <paths...>                   # tombstone one or more paths/globs (history still queryable)
 membot rm "docs/**/*.md" notes/old.md  # globs match logical_paths in the DB; literals + globs can mix
+membot rm -r remotes/docs.google.com   # --recursive removes every path under a directory prefix
 membot prune --before <iso-ts>         # drop non-current versions older than cutoff (irreversible)
 ```
@@ -116,16 +128,17 @@ Tombstones hide a path from `ls` / `tree` / `search` but `versions` and `read --
 | `membot read <path>`                  | Read current markdown surrogate (or `--bytes` for original)                    |
 | `membot write <path> --content <txt>` | Write inline agent-authored markdown as a new version                          |
 | `membot search <query>`               | Hybrid search (semantic + BM25); add `--include-history` to search older versions |
-| `membot info <path>`                  | Inspect metadata (source, fetcher, refresh schedule, digests) without content  |
+| `membot info <path>`                  | Inspect metadata (source, downloader, refresh schedule, digests) without content |
 | `membot versions <path>`              | List every version newest-first with version_id and change notes               |
 | `membot diff <path> --a <ts>`         | Unified diff between two versions                                              |
 | `membot mv <old> <new>`               | Rename a logical_path (history preserved)                                      |
-| `membot rm <paths...>`                | Tombstone one or more logical_paths or globs (e.g. `"docs/**/*.md"`); history kept |
+| `membot rm <paths...>`                | Tombstone one or more logical_paths or globs (e.g. `"docs/**/*.md"`); pass `-r` / `--recursive` to remove a directory prefix; history kept |
 | `membot refresh [path]`               | Re-read source; create new version only if bytes changed                       |
 | `membot prune --before <ts>`          | Permanently drop non-current versions older than cutoff (irreversible)         |
 | `membot serve`                        | Start MCP server (stdio default, `--http <port>` for HTTP)                     |
 | `membot reindex`                      | Rebuild the FTS keyword index over current chunks                              |
 | `membot config <subcommand>`          | Host-side config management (`get` / `set` / `unset` / `list` / `path`). **Don't run** — this is for the human operator, not for agents |
+| `membot login`                        | Open a browser to sign into Google / GitHub / Linear / etc. (one-time host-side setup). **Don't run** — this is for the human operator |
 ## Output formats
@@ -137,7 +150,9 @@ Tombstones hide a path from `ls` / `tree` / `search` but `versions` and `read --
 ## Troubleshooting
 - **"ingest failed: unsupported mime"** → Add a converter or pass `--bytes` to keep the original; LLM-fallback only runs when `ANTHROPIC_API_KEY` is set.
-- **"refresh failed: auth"** → The original fetch used an authenticated mcpx tool; re-auth via `mcpx auth <server>`.
+- **"refresh failed: auth"** for a Google URL → cookies expired. Run `membot login` to refresh the browser session.
+- **"refresh failed: auth"** for a GitHub URL → set the PAT via `membot config set downloaders.github.api_key <PAT>` (or export `GITHUB_TOKEN`).
+- **"refresh failed: auth"** for a Linear URL → set the personal API key via `membot config set downloaders.linear.api_key <KEY>` (create one at `linear.app/settings/api`).
 - **Search returns nothing** → Confirm the file ingested with `membot info <path>`; if needed, run `membot reindex` to rebuild the FTS keyword index.
 - **Stale results after manual DB edits** → `membot reindex`.
 - **Two paths point at the same content** → `membot mv` doesn't merge; tombstone one with `membot rm`.

package/README.md CHANGED Viewed

@@ -16,15 +16,43 @@
 ```bash
 bun install -g membot
+bunx playwright install chromium    # one-time browser binary download (~150 MB)
 ```
-This pulls in DuckDB's per-platform native bindings alongside membot. The build externalizes `@duckdb/*` (those `.node` bindings can't be embedded by `bun build --compile`), so a global Bun install is the supported path.
+This pulls in DuckDB's per-platform native bindings and Playwright's Chromium binary alongside membot. The build externalizes `@duckdb/*` (those `.node` bindings can't be embedded by `bun build --compile`) and `playwright*` (the browser binary lives in `~/.cache/ms-playwright`), so a global Bun install is the supported path.
+After installing, set up the services you want to ingest from:
+```bash
+membot login
+```
+A real Chromium window opens with two sections:
+- **Browser sign-in** — Google Docs / Sheets / Slides. Click the Google link in the window, sign in, close the window. Cookies + IndexedDB persist to `~/.membot/auth/browser-profile/` and are reused by every browser-based downloader.
+- **API-key services** — GitHub and Linear. The window shows the settings URL where you create a token and the `membot config set …` command to run in your terminal:
+```bash
+# GitHub: settings/tokens → fine-grained, repo:read
+membot config set downloaders.github.api_key <PAT>
+# or read from environment
+export GITHUB_TOKEN=<PAT>
+# Linear: linear.app/settings/api → personal API key
+membot config set downloaders.linear.api_key <KEY>
+```
+Public GitHub repos work without a token (rate-limited at 60 req/hr). Linear always needs a key.
 ## Quick start
 ```bash
+membot login                                     # one-time: sign into Google / GitHub / Linear in a browser
 membot add ./docs                                # ingest a directory recursively
-membot add https://example.com/spec.pdf          # ingest a URL (auto-converted to markdown)
+membot add https://docs.google.com/document/d/.. # Google Docs / Sheets / Slides via export endpoints
+membot add https://github.com/o/r/issues/123     # GitHub issues + PRs (with comments)
+membot add https://linear.app/w/issue/ABC-12     # Linear issues + projects
+membot add https://example.com/spec.pdf          # any other URL (browser print-to-PDF fallback)
 membot add a.md b.md "docs/**/*.md"              # any number of files / globs in one call
 membot ls                                        # list current files
 membot search "how does refresh work?"           # hybrid search
@@ -59,13 +87,13 @@ The skill files describe the discover → ingest → search → read → write w
 | `membot diff <path> <a> [b]`    | Unified diff between two versions                                                 |
 | `membot write <path>`           | Write inline agent-authored markdown as a new version                             |
 | `membot mv <from> <to>`         | Rename a logical_path (history preserved under both)                              |
-| `membot rm <paths...>`          | Tombstone one or more logical_paths or globs (e.g. `"docs/**/*.md"`); history kept |
+| `membot rm <paths...>`          | Tombstone one or more logical_paths or globs (e.g. `"docs/**/*.md"`); pass `-r` / `--recursive` to remove a directory prefix; history kept |
 | `membot refresh [path]`         | Re-read source; new version only if bytes changed                                 |
 | `membot prune --before <ts>`    | Permanently drop non-current versions older than cutoff (irreversible)            |
 | `membot serve`                  | Run the MCP server (stdio default; `--http <port>` for HTTP)                      |
 | `membot reindex`                | Rebuild the FTS keyword index over current chunks                                 |
 | `membot config <subcommand>`    | Get / set values in `~/.membot/config.json` (`get`, `set`, `unset`, `list`, `path`) |
-| `membot mcpx <subcommand>`      | Forward to the bundled `mcpx` CLI for managing remote MCP servers                 |
+| `membot login`                  | Open a Chromium window to sign into Google / GitHub / Linear / etc. — closing the window saves the session |
 | `membot skill install`          | Install the Claude Code / Cursor agent skill                                      |
 Run `membot <command> --help` for full flags and arguments. Every command produces JSON when piped, when `--json` is set, or when `CI=true`.
@@ -108,15 +136,19 @@ Add `--watch` (and optional `--tick <sec>`) to also run the refresh daemon, whic
   membot config list                                            # show every value (secrets masked)
   membot config set llm.anthropic_api_key sk-ant-...            # enable LLM-fallback paths
   membot config set chunker.target_chars 800                    # tweak any nested value
+  membot config set converters.max_inline_image_captions 50     # raise per-doc cap on vision captions for embedded images
   membot config get llm.anthropic_api_key --show-secrets        # reveal the masked key
   membot config unset chunker.target_chars                      # back to schema default
   membot config path                                            # print the absolute config path
   ```
   Values are written with file mode `0600`. `ANTHROPIC_API_KEY` set in the environment still wins on read, so existing env-var setups keep working.
+- **Browser session:** `~/.membot/auth/browser-profile/` (Playwright persistent profile — cookies, localStorage, IndexedDB). Captured by `membot login`; cookie-based downloaders (Google) reuse it on every fetch. Delete the directory to force a fresh login.
+- **API keys:** stored under `downloaders.<service>.api_key` in `~/.membot/config.json`. Read by API-based downloaders (GitHub, Linear).
 - **Environment variables:**
   - `ANTHROPIC_API_KEY` — optional. Enables LLM fallback for messy / scanned input (vision captions for images, last-resort markdown conversion). Without it, the pipeline degrades to deterministic native conversion. Equivalent to `membot config set llm.anthropic_api_key ...`; the env var takes precedence on read.
   - `MEMBOT_HOME` — override the data directory.
+  - `MEMBOT_SKIP_E2E` — skip live-network E2E downloader tests in `bun test`.
   - `NO_COLOR`, `CI`, `FORCE_COLOR` — standard output controls.
 ## Development

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
 	"name": "membot",
-	"version": "0.5.2",
+	"version": "0.7.0",
 	"description": "Versioned context store with hybrid search for AI agents. Stdio + HTTP MCP server and CLI.",
 	"type": "module",
 	"exports": {
@@ -27,7 +27,7 @@
 		"lint": "biome ci . && tsc --noEmit",
 		"format": "biome check --write .",
 		"prebuild": "bash scripts/apply-patches.sh",
-		"build": "bun build --compile --minify --sourcemap --external '@duckdb/*' ./src/cli.ts --outfile dist/membot"
+		"build": "bun build --compile --minify --sourcemap --external '@duckdb/*' --external 'playwright*' ./src/cli.ts --outfile dist/membot"
 	},
 	"keywords": [
 		"mcp",
@@ -56,27 +56,30 @@
 	"dependencies": {
 		"@anthropic-ai/sdk": "^0.32.0",
 		"@duckdb/node-api": "1.5.2-r.1",
-		"@evantahler/mcpx": "^0.21.4",
 		"@huggingface/transformers": "^4.2.0",
 		"@modelcontextprotocol/sdk": "^1.29.0",
+		"@types/picomatch": "^4.0.3",
+		"@types/turndown": "^5.0.5",
 		"ansis": "^4.2.0",
 		"commander": "^14.0.3",
 		"gray-matter": "^4.0.3",
 		"mammoth": "^1.8.0",
+		"mustache": "^4.2.0",
 		"nanospinner": "^1.2.2",
 		"onnxruntime-web": "1.26.0-dev.20260416-b7804b056c",
 		"picomatch": "^4.0.4",
-		"@types/picomatch": "^4.0.3",
+		"playwright": "^1.59.1",
 		"tesseract.js": "^5.1.0",
 		"turndown": "^7.2.0",
-		"@types/turndown": "^5.0.5",
 		"unpdf": "^0.12.0",
+		"xlsx": "^0.18.5",
 		"zod": "^4.0.0",
 		"zod-to-json-schema": "^3.23.0"
 	},
 	"devDependencies": {
 		"@biomejs/biome": "^2.4.14",
 		"@types/bun": "latest",
+		"@types/mustache": "^4.2.6",
 		"typescript": "^6"
 	},
 	"peerDependencies": {

package/scripts/apply-patches.sh CHANGED Viewed

@@ -38,14 +38,3 @@ apply_patch \
 	"node_modules/@huggingface/transformers" \
 	".membot-transformers-patch-applied"
-# @evantahler/mcpx — rewrite `src/search/onnx-wasm-paths.ts` so its static
-# `with { type: "file" }` imports of onnxruntime-web's WASM resolve from the
-# consumer's hoisted node_modules layout (../../../../onnxruntime-web/...)
-# instead of mcpx's own repo layout (../../node_modules/...). With this
-# patch in place, mcpx's semantic search runs end-to-end inside membot
-# (the agent fetcher's `mcp_search` exercises it) and `bun build --compile`
-# can bundle the WASM assets into the standalone binary.
-apply_patch \
-	"patches/@evantahler%2Fmcpx@0.21.4.patch" \
-	"node_modules/@evantahler/mcpx" \
-	".membot-mcpx-patch-applied"

package/scripts/build-test-docx.ts ADDED Viewed

@@ -0,0 +1,84 @@
+#!/usr/bin/env bun
+/**
+ * One-shot generator for `test/fixtures/sample-with-image.docx`. Run this
+ * (`bun scripts/build-test-docx.ts`) when the fixture is missing or when
+ * the embedded test image needs to change. The DOCX itself is committed
+ * to the repo so test runs don't depend on jszip-as-transitive-dep.
+ */
+import { mkdirSync, writeFileSync } from "node:fs";
+import { dirname } from "node:path";
+// jszip ships transitively via mammoth; this script is run by hand, not in tests.
+import JSZip from "../node_modules/jszip/lib/index.js";
+const TINY_PNG_BASE64 = "iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAQAAAC1HAwCAAAAC0lEQVR42mNkYAAAAAYAAjCB0C8AAAAASUVORK5CYII=";
+const documentXml = `<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
+<w:document xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main"
+            xmlns:r="http://schemas.openxmlformats.org/officeDocument/2006/relationships"
+            xmlns:wp="http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing"
+            xmlns:a="http://schemas.openxmlformats.org/drawingml/2006/main"
+            xmlns:pic="http://schemas.openxmlformats.org/drawingml/2006/picture">
+  <w:body>
+    <w:p><w:r><w:t>Lead paragraph before the diagram.</w:t></w:r></w:p>
+    <w:p><w:r><w:drawing>
+      <wp:inline>
+        <wp:extent cx="635" cy="635"/>
+        <wp:docPr id="1" name="Picture 1" descr="architecture diagram"/>
+        <a:graphic>
+          <a:graphicData uri="http://schemas.openxmlformats.org/drawingml/2006/picture">
+            <pic:pic>
+              <pic:nvPicPr>
+                <pic:cNvPr id="1" name="img.png" descr="architecture diagram"/>
+                <pic:cNvPicPr/>
+              </pic:nvPicPr>
+              <pic:blipFill>
+                <a:blip r:embed="rId1"/>
+                <a:stretch><a:fillRect/></a:stretch>
+              </pic:blipFill>
+              <pic:spPr>
+                <a:xfrm><a:off x="0" y="0"/><a:ext cx="635" cy="635"/></a:xfrm>
+                <a:prstGeom prst="rect"><a:avLst/></a:prstGeom>
+              </pic:spPr>
+            </pic:pic>
+          </a:graphicData>
+        </a:graphic>
+      </wp:inline>
+    </w:drawing></w:r></w:p>
+    <w:p><w:r><w:t>Trailing paragraph after the diagram.</w:t></w:r></w:p>
+  </w:body>
+</w:document>`;
+const documentRels = `<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
+<Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">
+  <Relationship Id="rId1" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/image" Target="media/image1.png"/>
+</Relationships>`;
+const rootRels = `<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
+<Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">
+  <Relationship Id="rId1" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument" Target="word/document.xml"/>
+</Relationships>`;
+const contentTypes = `<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
+<Types xmlns="http://schemas.openxmlformats.org/package/2006/content-types">
+  <Default Extension="rels" ContentType="application/vnd.openxmlformats-package.relationships+xml"/>
+  <Default Extension="xml" ContentType="application/xml"/>
+  <Default Extension="png" ContentType="image/png"/>
+  <Override PartName="/word/document.xml" ContentType="application/vnd.openxmlformats-officedocument.wordprocessingml.document.main+xml"/>
+</Types>`;
+async function main(): Promise<void> {
+	const zip = new JSZip();
+	zip.file("[Content_Types].xml", contentTypes);
+	zip.file("_rels/.rels", rootRels);
+	zip.file("word/document.xml", documentXml);
+	zip.file("word/_rels/document.xml.rels", documentRels);
+	zip.file("word/media/image1.png", Buffer.from(TINY_PNG_BASE64, "base64"));
+	const buffer = await zip.generateAsync({ type: "nodebuffer" });
+	const out = "test/fixtures/sample-with-image.docx";
+	mkdirSync(dirname(out), { recursive: true });
+	writeFileSync(out, buffer);
+	console.log(`wrote ${out} (${buffer.byteLength} bytes)`);
+}
+await main();

package/src/cli.ts CHANGED Viewed

@@ -5,7 +5,7 @@ import { program } from "commander";
 import pkg from "../package.json" with { type: "json" };
 import { registerCheckUpdateCommand } from "./commands/check-update.ts";
 import { registerConfigCommand } from "./commands/config.ts";
-import { registerMcpxCommand } from "./commands/mcpx.ts";
+import { registerLoginCommand } from "./commands/login.ts";
 import { registerReindexCommand } from "./commands/reindex.ts";
 import { registerServeCommand } from "./commands/serve.ts";
 import { registerSkillCommand } from "./commands/skill.ts";
@@ -59,7 +59,7 @@ for (const op of OPERATIONS) {
 registerServeCommand(program);
 registerReindexCommand(program);
 registerConfigCommand(program);
-registerMcpxCommand(program);
+registerLoginCommand(program);
 registerSkillCommand(program);
 registerCheckUpdateCommand(program);
 registerUpgradeCommand(program);

package/src/commands/login-page.mustache ADDED Viewed

@@ -0,0 +1,50 @@
+<!DOCTYPE html>
+<html lang="en">
+<head>
+<meta charset="utf-8" />
+<title>membot login</title>
+<style>
+  body { font-family: -apple-system, BlinkMacSystemFont, system-ui, sans-serif; padding: 2.5rem; max-width: 720px; margin: auto; color: #222; line-height: 1.5; }
+  h1 { font-size: 1.5rem; margin-bottom: 0.5rem; }
+  h2 { font-size: 1.05rem; margin-top: 2rem; margin-bottom: 0.5rem; color: #444; }
+  .hint { color: #666; margin-bottom: 1.5rem; }
+  ul { padding: 0; list-style: none; }
+  li { padding: 0.7rem 0.9rem; border: 1px solid #ddd; border-radius: 6px; margin-bottom: 0.5rem; }
+  a { color: #2855ff; text-decoration: none; font-weight: 500; }
+  a:hover { text-decoration: underline; }
+  .desc { color: #666; }
+  .footer { color: #888; font-size: 0.9rem; margin-top: 2rem; }
+  code { background: #f5f5f5; padding: 0.1rem 0.35rem; border-radius: 3px; font-size: 0.85rem; }
+  pre { background: #f5f5f5; padding: 0.6rem 0.8rem; border-radius: 4px; font-size: 0.85rem; overflow-x: auto; margin: 0.4rem 0 0 0; }
+</style>
+</head>
+<body>
+<h1>Set up the services membot will fetch from</h1>
+<p class="hint">This is a separate browser session from your daily Chrome — even if you're signed in there, you have to set things up here.</p>
+{{#hasBrowser}}
+<h2>Sign in via this browser</h2>
+<p class="hint">Click any link, complete the login, then close this window when you're done. Cookies + IndexedDB land in your membot profile.</p>
+<ul>
+{{#browser}}
+  <li><a href="{{url}}">{{name}}</a>{{#description}} <span class="desc">— {{description}}</span>{{/description}}</li>
+{{/browser}}
+</ul>
+{{/hasBrowser}}
+{{#hasApiKey}}
+<h2>API-key services</h2>
+<p class="hint">Open the settings page, create a token, then run the command shown in your terminal (not in this browser).</p>
+<ul>
+{{#apiKey}}
+  <li>
+    <a href="{{url}}">{{name}}</a>{{#description}} <span class="desc">— {{description}}</span>{{/description}}
+    <pre>{{setupCommand}}</pre>
+  </li>
+{{/apiKey}}
+</ul>
+{{/hasApiKey}}
+<p class="footer">Closing this window saves the browser-session profile. Run <code>membot login</code> again to refresh expired sessions.</p>
+</body>
+</html>

package/src/commands/login.ts ADDED Viewed

@@ -0,0 +1,83 @@
+import { join } from "node:path";
+import type { Command } from "commander";
+import Mustache from "mustache";
+import { FILES } from "../constants.ts";
+import { buildContext, closeContext } from "../context.ts";
+import { HelpfulError } from "../errors.ts";
+import { BrowserPool } from "../ingest/downloaders/browser.ts";
+import { collectLoginEntries } from "../ingest/downloaders/index.ts";
+import { logger } from "../output/logger.ts";
+import LOGIN_PAGE_TEMPLATE from "./login-page.mustache" with { type: "text" };
+/**
+ * `membot login`
+ *
+ * Open a real Chromium window backed by membot's persistent profile
+ * (cookies + localStorage + IndexedDB + service workers all stored
+ * under `~/.membot/auth/browser-profile/`) and pre-navigate to a
+ * small intro page that lists every login button declared by the
+ * registered downloaders. Adding a new downloader with `logins: […]`
+ * automatically gets a button on this page — login.ts knows nothing
+ * service-specific itself.
+ *
+ * Why a persistent profile instead of `storageState` JSON: SPA-heavy
+ * services like Linear stash session/sync state in IndexedDB, which
+ * `storageState` doesn't capture. A fresh headless context with
+ * cookies but no IndexedDB hangs on Linear's "Loading…" placeholder
+ * forever. The persistent profile carries IDB along with cookies, so
+ * the next headless run finds Linear's app fully bootstrapped.
+ *
+ * Window-close detection uses page-close events because on macOS
+ * closing the last chromium window doesn't quit the process —
+ * `browser.on('disconnected')` never fires. See `BrowserPool.waitForUserDone`.
+ *
+ * Flow of the action handler:
+ *   1. Build the app context and derive the profile directory from
+ *      `ctx.dataDir` + `FILES.BROWSER_PROFILE`.
+ *   2. Render the intro page from the Mustache template, passing the
+ *      `browser` / `apiKey` entry lists plus `hasBrowser` / `hasApiKey`
+ *      flags.
+ *   3. Open a non-headless `BrowserPool` page, load the rendered HTML
+ *      through a `data:` URL, and wait for `pool.waitForUserDone(page)`.
+ *   4. Read `pool.cookieCount()`, then always dispose the pool and close
+ *      the context in `finally`.
+ *   5. Throw an `auth_error` `HelpfulError` when the count is zero,
+ *      otherwise log how many cookies were saved.
+ *
+ * Errors: a `HelpfulError` raised inside the `try` is rethrown as-is;
+ * anything else is wrapped in an `internal_error` `HelpfulError` whose
+ * hint suggests installing the Playwright Chromium binary.
+ *
+ * @param program - commander program the `login` subcommand is attached to.
+ */
+export function registerLoginCommand(program: Command): void {
+	program
+		.command("login")
+		.description(
+			"Open a browser to sign into the services membot fetches from — closing the window saves your session.",
+		)
+		.action(async () => {
+			const ctx = await buildContext({});
+			const userDataDir = join(ctx.dataDir, FILES.BROWSER_PROFILE);
+			const pool = new BrowserPool({ userDataDir, headless: false }); // visible window: the user signs in by hand
+			const entries = collectLoginEntries(); // exposes `browser` and `apiKey` entry lists
+			const html = Mustache.render(LOGIN_PAGE_TEMPLATE, {
+				browser: entries.browser,
+				apiKey: entries.apiKey,
+				hasBrowser: entries.browser.length > 0, // flags gate the template's `{{#hasBrowser}}` / `{{#hasApiKey}}` sections
+				hasApiKey: entries.apiKey.length > 0,
+			});
+			let cookieCount = 0; // declared outside `try` so it can be inspected after cleanup
+			try {
+				const page = await pool.newPage();
+				await page.goto(`data:text/html;charset=utf-8,${encodeURIComponent(html)}`).catch(() => {}); // navigation failure is ignored on purpose
+				logger.info("Sign into the services you want membot to fetch from, then close the browser window.");
+				logger.info(`Session profile will be stored at ${userDataDir}.`);
+				await pool.waitForUserDone(page);
+				cookieCount = await pool.cookieCount();
+			} catch (err) {
+				if (err instanceof HelpfulError) throw err; // already user-facing; do not re-wrap
+				throw new HelpfulError({
+					kind: "internal_error",
+					message: `login failed: ${err instanceof Error ? err.message : String(err)}`,
+					hint: "Run `bunx playwright install chromium` to ensure the browser binary is installed, then retry.",
+				});
+			} finally {
+				await pool.dispose();
+				await closeContext(ctx);
+			}
+			if (cookieCount === 0) { // reached only after `finally`, so pool and context are already released
+				throw new HelpfulError({
+					kind: "auth_error",
+					message: `Browser profile at ${userDataDir} has no cookies — no service was signed in.`,
+					hint: "Run `membot login` again and sign in (Google / GitHub / Linear / …) before closing the window.",
+				});
+			}
+			logger.info(`Saved session profile (${cookieCount} cookie${cookieCount === 1 ? "" : "s"}).`);
+		});
+}

package/src/config/schemas.ts CHANGED Viewed

@@ -7,6 +7,10 @@ export const ChunkerConfigSchema = z.object({
 	max_chars: z.number().int().positive().default(DEFAULTS.CHUNKER_MAX_CHARS),
 });
+export const ConvertersConfigSchema = z.object({
+	max_inline_image_captions: z.number().int().nonnegative().default(DEFAULTS.MAX_INLINE_IMAGE_CAPTIONS), // integer >= 0 (zero is accepted); defaults to DEFAULTS.MAX_INLINE_IMAGE_CAPTIONS
+});
 export const LlmConfigSchema = z.object({
 	anthropic_api_key: z.string().meta({ secret: true }).default(""),
 	converter_model: z.string().default(DEFAULTS.CONVERTER_MODEL),
@@ -15,14 +19,23 @@ export const LlmConfigSchema = z.object({
 	vision_model: z.string().default(DEFAULTS.VISION_MODEL),
 });
-export const McpxConfigSchema = z.object({
-	config_path: z.string().default(""),
-});
 export const DaemonConfigSchema = z.object({
 	tick_interval_sec: z.number().int().positive().default(DEFAULTS.DAEMON_TICK_SEC),
 });
+export const LinearDownloaderConfigSchema = z.object({
+	api_key: z.string().meta({ secret: true }).default(""), // tagged `secret: true` in schema metadata; empty string when unset
+});
+export const GithubDownloaderConfigSchema = z.object({
+	api_key: z.string().meta({ secret: true }).default(""), // tagged `secret: true` in schema metadata; empty string when unset
+});
+export const DownloadersConfigSchema = z.object({
+	linear: LinearDownloaderConfigSchema.default(() => LinearDownloaderConfigSchema.parse({})), // omitted section is filled by parsing `{}`, so field-level defaults apply
+	github: GithubDownloaderConfigSchema.default(() => GithubDownloaderConfigSchema.parse({})), // same defaulting as `linear`
+});
 export const DbLockRetryConfigSchema = z.object({
 	max_attempts: z.number().int().positive().default(30),
 	base_delay_ms: z.number().int().positive().default(100),
@@ -34,8 +47,9 @@ export const MembotConfigSchema = z.object({
 	embedding_model: z.string().default(EMBEDDING_MODEL),
 	embedding_dimension: z.number().int().positive().default(EMBEDDING_DIMENSION),
 	chunker: ChunkerConfigSchema.default(() => ChunkerConfigSchema.parse({})),
+	converters: ConvertersConfigSchema.default(() => ConvertersConfigSchema.parse({})),
 	llm: LlmConfigSchema.default(() => LlmConfigSchema.parse({})),
-	mcpx: McpxConfigSchema.default(() => McpxConfigSchema.parse({})),
+	downloaders: DownloadersConfigSchema.default(() => DownloadersConfigSchema.parse({})),
 	daemon: DaemonConfigSchema.default(() => DaemonConfigSchema.parse({})),
 	db_lock_retry: DbLockRetryConfigSchema.default(() => DbLockRetryConfigSchema.parse({})),
 	default_refresh_frequency_sec: z.number().int().positive().nullable().default(null),
@@ -43,4 +57,8 @@ export const MembotConfigSchema = z.object({
 export type MembotConfig = z.infer<typeof MembotConfigSchema>;
 export type ChunkerConfig = z.infer<typeof ChunkerConfigSchema>;
+export type ConvertersConfig = z.infer<typeof ConvertersConfigSchema>;
 export type LlmConfig = z.infer<typeof LlmConfigSchema>;
+export type DownloadersConfig = z.infer<typeof DownloadersConfigSchema>;
+export type LinearDownloaderConfig = z.infer<typeof LinearDownloaderConfigSchema>;
+export type GithubDownloaderConfig = z.infer<typeof GithubDownloaderConfigSchema>;

package/src/constants.ts CHANGED Viewed

@@ -13,7 +13,6 @@ export const ENV = {
 	CONFIG: "MEMBOT_CONFIG",
 	DEBUG: "MEMBOT_DEBUG",
 	ANTHROPIC_API_KEY: "ANTHROPIC_API_KEY",
-	MCPX_CONFIG_PATH: "MCP_CONFIG_PATH",
 	NO_UPDATE_CHECK: "MEMBOT_NO_UPDATE_CHECK",
 } as const;
@@ -41,6 +40,13 @@ export const DEFAULTS = {
 	VISION_MODEL: "claude-haiku-4-5-20251001",
 	UPDATE_CHECK_INTERVAL_MS: 24 * 60 * 60 * 1000,
 	UPDATE_CHECK_TIMEOUT_MS: 5_000,
+	/**
+	 * Per-document cap on Claude vision caption calls when expanding inline
+	 * images during DOCX/HTML conversion. Beyond this, images get a small
+	 * deterministic placeholder so a slide-deck-shaped doc with hundreds of
+	 * embedded images doesn't fan out into hundreds of vision requests.
+	 */
+	MAX_INLINE_IMAGE_CAPTIONS: 20,
 } as const;
 export const FILES = {
@@ -48,4 +54,17 @@ export const FILES = {
 	INDEX_DUCKDB: "index.duckdb",
 	MODELS_DIR: "models",
 	LOGS_DIR: "logs",
+	AUTH_DIR: "auth",
+	/**
+	 * Persistent Chromium profile directory. We use
+	 * `chromium.launchPersistentContext(userDataDir)` rather than the
+	 * lighter `storageState` JSON snapshot because Linear (and other
+	 * SPA-heavy services) stash critical session state in IndexedDB —
+	 * which `storageState` doesn't capture. A persistent profile
+	 * survives the full set: cookies, localStorage, IndexedDB, service
+	 * workers, etc. Trade-off: directory-sized state instead of a tiny
+	 * JSON file, and only one process can have the profile open at a
+	 * time (chromium's single-instance lock).
+	 */
+	BROWSER_PROFILE: "auth/browser-profile",
 } as const;