membot 0.5.1 → 0.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (40) hide show
  1. package/.claude/skills/membot.md +25 -10
  2. package/.cursor/rules/membot.mdc +25 -10
  3. package/README.md +35 -4
  4. package/package.json +8 -5
  5. package/scripts/apply-patches.sh +0 -11
  6. package/src/cli.ts +2 -2
  7. package/src/commands/login-page.mustache +50 -0
  8. package/src/commands/login.ts +83 -0
  9. package/src/config/schemas.ts +17 -5
  10. package/src/constants.ts +13 -1
  11. package/src/context.ts +1 -24
  12. package/src/db/files.ts +21 -25
  13. package/src/db/migrations/003-downloader-columns.ts +58 -0
  14. package/src/db/migrations.ts +2 -1
  15. package/src/ingest/converter/index.ts +9 -0
  16. package/src/ingest/converter/xlsx.ts +111 -0
  17. package/src/ingest/downloaders/browser.ts +180 -0
  18. package/src/ingest/downloaders/generic-web.ts +81 -0
  19. package/src/ingest/downloaders/github.ts +178 -0
  20. package/src/ingest/downloaders/google-docs.ts +56 -0
  21. package/src/ingest/downloaders/google-shared.ts +86 -0
  22. package/src/ingest/downloaders/google-sheets.ts +58 -0
  23. package/src/ingest/downloaders/google-slides.ts +53 -0
  24. package/src/ingest/downloaders/index.ts +182 -0
  25. package/src/ingest/downloaders/linear.ts +291 -0
  26. package/src/ingest/fetcher.ts +107 -127
  27. package/src/ingest/ingest.ts +43 -69
  28. package/src/mcp/instructions.ts +4 -2
  29. package/src/operations/add.ts +6 -4
  30. package/src/operations/info.ts +4 -6
  31. package/src/operations/move.ts +2 -3
  32. package/src/operations/refresh.ts +2 -4
  33. package/src/operations/remove.ts +23 -2
  34. package/src/operations/tree.ts +1 -1
  35. package/src/operations/types.ts +1 -1
  36. package/src/refresh/runner.ts +59 -114
  37. package/src/types/text-modules.d.ts +5 -0
  38. package/patches/@evantahler%2Fmcpx@0.21.4.patch +0 -51
  39. package/src/commands/mcpx.ts +0 -112
  40. package/src/ingest/agent-fetcher.ts +0 -564
package/src/db/files.ts CHANGED
@@ -1,7 +1,7 @@
1
1
  import type { DbConnection, SqlParam } from "./connection.ts";
2
2
 
3
3
  export type SourceType = "local" | "remote" | "inline";
4
- export type FetcherKind = "http" | "mcpx" | "local" | "inline";
4
+ export type FetcherKind = "downloader" | "local" | "inline";
5
5
 
6
6
  export interface FileRow {
7
7
  logical_path: string;
@@ -18,9 +18,8 @@ export interface FileRow {
18
18
  mime_type: string | null;
19
19
  size_bytes: number | null;
20
20
  fetcher: FetcherKind | null;
21
- fetcher_server: string | null;
22
- fetcher_tool: string | null;
23
- fetcher_args: Record<string, unknown> | null;
21
+ downloader: string | null;
22
+ downloader_args: Record<string, unknown> | null;
24
23
  refresh_frequency_sec: number | null;
25
24
  refreshed_at: string | null;
26
25
  last_refresh_status: string | null;
@@ -43,9 +42,8 @@ export interface NewFileVersion {
43
42
  mime_type?: string | null;
44
43
  size_bytes?: number | null;
45
44
  fetcher?: FetcherKind | null;
46
- fetcher_server?: string | null;
47
- fetcher_tool?: string | null;
48
- fetcher_args?: Record<string, unknown> | null;
45
+ downloader?: string | null;
46
+ downloader_args?: Record<string, unknown> | null;
49
47
  refresh_frequency_sec?: number | null;
50
48
  refreshed_at?: string | null;
51
49
  last_refresh_status?: string | null;
@@ -67,9 +65,8 @@ const ROW_COLUMNS = [
67
65
  "mime_type",
68
66
  "size_bytes",
69
67
  "fetcher",
70
- "fetcher_server",
71
- "fetcher_tool",
72
- "fetcher_args",
68
+ "downloader",
69
+ "downloader_args",
73
70
  "refresh_frequency_sec",
74
71
  "refreshed_at",
75
72
  "last_refresh_status",
@@ -86,21 +83,21 @@ const COLUMN_LIST = ROW_COLUMNS.join(", ");
86
83
  */
87
84
  export async function insertVersion(db: DbConnection, file: NewFileVersion): Promise<string> {
88
85
  const versionId = file.version_id ?? millisIso(Date.now());
89
- const fetcherArgsJson = file.fetcher_args ? JSON.stringify(file.fetcher_args) : null;
86
+ const downloaderArgsJson = file.downloader_args ? JSON.stringify(file.downloader_args) : null;
90
87
 
91
88
  await db.queryRun(
92
89
  `INSERT INTO files (
93
90
  logical_path, version_id, tombstone, source_type,
94
91
  source_path, source_mtime_ms, source_sha256, blob_sha256,
95
92
  content_sha256, content, description, mime_type, size_bytes,
96
- fetcher, fetcher_server, fetcher_tool, fetcher_args,
93
+ fetcher, downloader, downloader_args,
97
94
  refresh_frequency_sec, refreshed_at, last_refresh_status, change_note
98
95
  ) VALUES (
99
96
  ?1, CAST(?2 AS TIMESTAMP), ?3, ?4,
100
97
  ?5, ?6, ?7, ?8,
101
98
  ?9, ?10, ?11, ?12, ?13,
102
- ?14, ?15, ?16, ?17,
103
- ?18, ?19, ?20, ?21
99
+ ?14, ?15, ?16,
100
+ ?17, ?18, ?19, ?20
104
101
  )`,
105
102
  file.logical_path,
106
103
  versionId,
@@ -116,9 +113,8 @@ export async function insertVersion(db: DbConnection, file: NewFileVersion): Pro
116
113
  file.mime_type ?? null,
117
114
  file.size_bytes ?? null,
118
115
  file.fetcher ?? null,
119
- file.fetcher_server ?? null,
120
- file.fetcher_tool ?? null,
121
- fetcherArgsJson,
116
+ file.downloader ?? null,
117
+ downloaderArgsJson,
122
118
  file.refresh_frequency_sec ?? null,
123
119
  file.refreshed_at ?? null,
124
120
  file.last_refresh_status ?? null,
@@ -132,33 +128,33 @@ export function millisIso(ms: number): string {
132
128
  return new Date(ms).toISOString();
133
129
  }
134
130
 
135
/**
 * Shape of a `files` row as handed back by the driver, before coercion:
 * `downloader_args` may still be JSON text (or an already-parsed value)
 * and `tombstone` may be a 0/1 integer instead of a boolean.
 */
interface RawFileRow extends Omit<FileRow, "downloader_args" | "tombstone"> {
  downloader_args: string | null | Record<string, unknown>;
  tombstone: boolean | number;
  [key: string]: unknown;
}

/**
 * Normalize a raw `downloader_args` value into a plain object or `null`.
 *
 * Strings are JSON-parsed; anything that is not a non-array object after
 * parsing (malformed JSON, `"null"`, numbers, arrays, empty string) maps
 * to `null` so callers never see a value that contradicts the declared
 * `Record<string, unknown> | null` type.
 */
function parseDownloaderArgs(raw: unknown): Record<string, unknown> | null {
  let value: unknown = raw;
  if (typeof value === "string") {
    if (value === "") return null;
    try {
      value = JSON.parse(value);
    } catch {
      // Unparseable text is treated the same as "no args".
      return null;
    }
  }
  if (value === null || typeof value !== "object" || Array.isArray(value)) return null;
  return value as Record<string, unknown>;
}

/**
 * Coerce a raw DuckDB row into a typed `FileRow`. JSON-parses the
 * `downloader_args` column (DuckDB returns it as text or a parsed object
 * depending on driver version) and normalizes `tombstone` to a boolean
 * (some drivers return 0/1).
 *
 * @param row Raw row from the driver, or `null` when no row matched.
 * @returns The typed row, or `null` when `row` is `null`.
 */
function toFileRow(row: RawFileRow | null): FileRow | null {
  if (!row) return null;
  return {
    ...row,
    downloader_args: parseDownloaderArgs(row.downloader_args),
    tombstone: !!row.tombstone,
  };
}
@@ -0,0 +1,58 @@
1
+ import type { Migration } from "../migrations.ts";
2
+
3
/**
 * Replace the old mcpx-era fetcher metadata triple
 * (`fetcher_server` / `fetcher_tool` / `fetcher_args`) with a flat
 * `(downloader, downloader_args)` shape. The mcpx-driven agent fetcher
 * is gone; per-service downloaders match a URL → run a known fetch
 * (Playwright export endpoints for Google, rendered HTML for GitHub /
 * Linear, headless print-to-PDF for everything else) → return bytes
 * for the existing native converter pipeline.
 *
 * Existing rows whose `fetcher` was `'http'` or `'mcpx'` are migrated
 * to `'downloader'` with `downloader=NULL`. The mcpx-driven ones
 * become refresh-broken (the `fetcher_*` arguments that drove them no
 * longer exist) but their stored `content` is still readable; the
 * plain-HTTP ones will be re-routed through the generic-web downloader
 * the next time refresh runs. The `fetcher` enum loses both `'http'`
 * and `'mcpx'` — every remote row is `'downloader'` now, since even
 * the plain-HTTP fallback is wrapped by the generic-web downloader.
 *
 * The `current_files` view is `SELECT f.* FROM files f`, so it pins the
 * old column shape; we drop and recreate it (and the dependent
 * `current_chunks` view) around the schema change.
 */
export const MIGRATION_003: Migration = {
  id: 3,
  name: "downloader-columns",
  statements: [
    // DuckDB refuses DROP COLUMN when an index covers any column that
    // comes AFTER the dropped one in the schema, so the indexes have
    // to come down first. The view drops are needed for the same
    // reason — `current_files` is `SELECT f.*`, which pins every
    // column at view-creation time.
    `DROP VIEW IF EXISTS current_chunks`,
    `DROP VIEW IF EXISTS current_files`,
    `DROP INDEX IF EXISTS files_refresh_due_idx`,
    `DROP INDEX IF EXISTS files_blob_sha256_idx`,
    `DROP INDEX IF EXISTS files_logical_path_idx`,
    // Collapse the two retired fetcher kinds into the single
    // 'downloader' kind while the old columns are still present.
    `UPDATE files SET fetcher = 'downloader' WHERE fetcher IN ('http', 'mcpx')`,
    // Swap the column triple for the new pair. The ADD COLUMN
    // statements declare no default.
    `ALTER TABLE files DROP COLUMN fetcher_server`,
    `ALTER TABLE files DROP COLUMN fetcher_tool`,
    `ALTER TABLE files DROP COLUMN fetcher_args`,
    `ALTER TABLE files ADD COLUMN downloader TEXT`,
    `ALTER TABLE files ADD COLUMN downloader_args JSON`,
    // Restore the three indexes dropped above.
    `CREATE INDEX files_logical_path_idx ON files (logical_path)`,
    `CREATE INDEX files_blob_sha256_idx ON files (blob_sha256)`,
    `CREATE INDEX files_refresh_due_idx ON files (refresh_frequency_sec, refreshed_at)`,
    // Recreate the views against the new column shape: `current_files`
    // keeps the row with the greatest `version_id` per `logical_path`,
    // and only when that row is not a tombstone; `current_chunks` is
    // `chunks` joined to those rows.
    `CREATE VIEW current_files AS
      SELECT f.* FROM files f
      WHERE (f.logical_path, f.version_id) IN (
        SELECT logical_path, MAX(version_id) FROM files GROUP BY logical_path
      )
      AND f.tombstone = FALSE`,
    `CREATE VIEW current_chunks AS
      SELECT c.* FROM chunks c
      JOIN current_files cf USING (logical_path, version_id)`,
  ],
};
@@ -2,6 +2,7 @@ import { logger } from "../output/logger.ts";
2
2
  import type { DbConnection } from "./connection.ts";
3
3
  import { MIGRATION_001 } from "./migrations/001-init.ts";
4
4
  import { MIGRATION_002 } from "./migrations/002-fts.ts";
5
+ import { MIGRATION_003 } from "./migrations/003-downloader-columns.ts";
5
6
 
6
7
  /**
7
8
  * One DDL/DML migration step. The id is monotonically increasing; the name
@@ -14,7 +15,7 @@ export interface Migration {
14
15
  statements: string[];
15
16
  }
16
17
 
17
- const MIGRATIONS: Migration[] = [MIGRATION_001, MIGRATION_002];
18
+ const MIGRATIONS: Migration[] = [MIGRATION_001, MIGRATION_002, MIGRATION_003];
18
19
 
19
20
  /**
20
21
  * Process-level cache of paths whose migrations have been applied (or
@@ -6,6 +6,7 @@ import { convertWithLlm } from "./llm.ts";
6
6
  import { ocrImage } from "./ocr.ts";
7
7
  import { convertPdf, shouldOcrPdf } from "./pdf.ts";
8
8
  import { convertText } from "./text.ts";
9
+ import { convertXlsx } from "./xlsx.ts";
9
10
 
10
11
  export interface ConvertResult {
11
12
  markdown: string;
@@ -25,6 +26,10 @@ const STRUCTURED_TEXT_MIMES = new Set([
25
26
  "application/typescript",
26
27
  ]);
27
28
  const DOCX_MIMES = new Set(["application/vnd.openxmlformats-officedocument.wordprocessingml.document"]);
29
+ const XLSX_MIMES = new Set([
30
+ "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
31
+ "application/vnd.ms-excel",
32
+ ]);
28
33
  const PDF_MIMES = new Set(["application/pdf"]);
29
34
 
30
35
  /**
@@ -54,6 +59,10 @@ export async function convert(
54
59
  return { markdown: await convertDocx(bytes), contentMimeType: "text/markdown" };
55
60
  }
56
61
 
62
+ if (XLSX_MIMES.has(mt)) {
63
+ return { markdown: await convertXlsx(bytes), contentMimeType: "text/markdown" };
64
+ }
65
+
57
66
  if (PDF_MIMES.has(mt)) {
58
67
  const conversion = await convertPdf(bytes);
59
68
  if (!shouldOcrPdf(conversion)) {
@@ -0,0 +1,111 @@
1
+ import * as XLSX from "xlsx";
2
+
3
export interface ConvertXlsxOptions {
  /** Optional sublabel callback driven per-sheet (`parsing 3/8 tabs`). */
  onProgress?: (sublabel: string) => void;
}

/**
 * Turn an XLSX workbook into markdown. Every non-empty tab becomes a
 * `## <SheetName>` heading followed by a GitHub-flavored pipe table
 * whose header is the first non-empty row; tabs with no content are
 * left out. Cell text is the displayed value
 * (`XLSX.utils.format_cell`), not the raw stored value.
 *
 * Parsing is pure JS (SheetJS). After each tab the function yields one
 * macrotask so timer-driven UI such as a spinner can repaint while a
 * large workbook is processed.
 *
 * @param bytes Raw workbook bytes.
 * @param opts  Optional progress callback, invoked once per tab.
 * @returns The markdown, or `"(empty workbook)"` when no tab has content.
 */
export async function convertXlsx(bytes: Uint8Array, opts: ConvertXlsxOptions = {}): Promise<string> {
  const workbook = XLSX.read(bytes, { type: "array", cellDates: true });
  const tabs = workbook.SheetNames;
  const sections: string[] = [];

  for (const [index, tabName] of tabs.entries()) {
    opts.onProgress?.(`parsing ${index + 1}/${tabs.length} tabs`);

    const sheet = workbook.Sheets[tabName];
    const table = sheet ? trimEmptyEdges(sheetToMatrix(sheet)) : [];
    if (table.length > 0) {
      sections.push(`## ${tabName}\n\n${renderTable(table)}`);
    }

    // Hand control back to the event loop between tabs so the spinner
    // keeps animating instead of freezing for the whole parse.
    await new Promise<void>((resolve) => setTimeout(resolve, 0));
  }

  return sections.length > 0 ? sections.join("\n\n") : "(empty workbook)";
}
43
+
44
/**
 * Build a rectangular grid of display strings covering the sheet's
 * declared used range (`!ref`). Each entry is the cell's formatted
 * text from `XLSX.utils.format_cell`, so the markdown shows what a
 * person sees in the spreadsheet rather than the raw stored value;
 * missing cells become `""`. A sheet without `!ref` yields `[]`.
 */
function sheetToMatrix(sheet: XLSX.WorkSheet): string[][] {
  const ref = sheet["!ref"];
  if (!ref) return [];

  const { s: start, e: end } = XLSX.utils.decode_range(ref);
  const height = end.r - start.r + 1;
  const width = end.c - start.c + 1;

  return Array.from({ length: height }, (_row, dr) =>
    Array.from({ length: width }, (_col, dc) => {
      const cell = sheet[XLSX.utils.encode_cell({ r: start.r + dr, c: start.c + dc })];
      return cell ? XLSX.utils.format_cell(cell) : "";
    }),
  );
}
65
+
66
/**
 * Drop fully-empty leading/trailing rows and columns. Spreadsheets
 * commonly have the used range padded out beyond the actual data.
 *
 * Column bounds are computed across the widest remaining row, with
 * missing cells treated as empty, so a ragged matrix never loses data
 * that sits beyond the length of its first row.
 *
 * @param rows Matrix of cell strings; `""` marks an empty cell.
 * @returns The trimmed matrix, or `[]` when every cell is empty.
 */
function trimEmptyEdges(rows: string[][]): string[][] {
  if (rows.length === 0) return rows;

  const isBlankRow = (row: string[] | undefined): boolean => row !== undefined && row.every((v) => v === "");

  let firstRow = 0;
  let lastRow = rows.length - 1;
  while (firstRow <= lastRow && isBlankRow(rows[firstRow])) firstRow++;
  while (lastRow >= firstRow && isBlankRow(rows[lastRow])) lastRow--;
  if (firstRow > lastRow) return [];

  const sliced = rows.slice(firstRow, lastRow + 1);
  // Measure against the widest row, not just the first one.
  const width = sliced.reduce((max, r) => Math.max(max, r.length), 0);
  const isBlankCol = (c: number): boolean => sliced.every((r) => (r[c] ?? "") === "");

  let firstCol = 0;
  let lastCol = width - 1;
  while (firstCol <= lastCol && isBlankCol(firstCol)) firstCol++;
  while (lastCol >= firstCol && isBlankCol(lastCol)) lastCol--;
  if (firstCol > lastCol) return [];

  return sliced.map((r) => r.slice(firstCol, lastCol + 1));
}
86
+
87
/**
 * Render a 2-D matrix as a GitHub pipe table. The first row becomes
 * the header; shorter rows are right-padded with empty cells so every
 * line has the same column count. Pipe and line-break characters in
 * cells are escaped so they don't break the table layout.
 *
 * The column count is folded with `reduce` rather than
 * `Math.max(...lengths)`: spreading one argument per row can exceed
 * the engine's argument limit on sheets with very many rows.
 *
 * @param rows Matrix of cell strings; may be ragged.
 * @returns The table text, or `""` when `rows` is empty.
 */
function renderTable(rows: string[][]): string {
  if (rows.length === 0) return "";

  const colCount = rows.reduce((widest, r) => Math.max(widest, r.length), 0);
  const lines = rows.map((r) => {
    const padded = [...r];
    while (padded.length < colCount) padded.push("");
    return `| ${padded.map(escapeCell).join(" | ")} |`;
  });

  // The delimiter row sits directly under the header.
  lines.splice(1, 0, `| ${Array(colCount).fill("---").join(" | ")} |`);
  return lines.join("\n");
}

/**
 * Make a cell value safe for a single pipe-table row: escape `|` and
 * replace every line break (`\r\n`, `\n`, or a bare `\r`) with a space.
 */
function escapeCell(value: string): string {
  return value.replace(/\|/g, "\\|").replace(/\r\n?|\n/g, " ");
}
@@ -0,0 +1,180 @@
1
+ import { mkdir } from "node:fs/promises";
2
+ import type { APIRequestContext, BrowserContext, Page } from "playwright";
3
+ import { HelpfulError } from "../../errors.ts";
4
+
5
/** Cached `playwright.chromium`, populated by the first successful `loadChromium()` call. */
let chromiumModule: typeof import("playwright").chromium | null = null;

/**
 * Resolve `playwright.chromium` on demand and cache it for later calls.
 * The dynamic import keeps the heavy module off code paths that never
 * touch a browser, and lets a failed import surface as a `HelpfulError`
 * with install instructions rather than a module-load stack trace.
 *
 * @throws HelpfulError (`internal_error`) when the import fails.
 */
async function loadChromium(): Promise<typeof import("playwright").chromium> {
  if (chromiumModule) return chromiumModule;

  let loaded: typeof import("playwright").chromium;
  try {
    loaded = (await import("playwright")).chromium;
  } catch (cause) {
    const detail = cause instanceof Error ? cause.message : String(cause);
    throw new HelpfulError({
      kind: "internal_error",
      message: `failed to load playwright: ${detail}`,
      hint: "Run `bun add -g membot` to reinstall, then `bunx playwright install chromium` to fetch the browser binary.",
    });
  }

  chromiumModule = loaded;
  return loaded;
}
27
+
28
export interface BrowserPoolOptions {
  /** Directory holding the persistent chromium profile; created on first launch if missing. */
  userDataDir: string;
  /** Launch chromium without a visible window. Defaults to `true`. */
  headless?: boolean;
}

/**
 * Process-scoped lazy-launched chromium context backed by a *persistent
 * profile directory* (`launchPersistentContext`). Persistent profiles
 * preserve cookies, localStorage, sessionStorage, IndexedDB, and service
 * worker state across runs — necessary for SPA-heavy services like
 * Linear that stash critical session/sync state in IndexedDB (which
 * the lighter `storageState` JSON snapshot doesn't capture).
 *
 * Trade-offs:
 * - The profile is a directory, not a single JSON file (a few MBs).
 * - Chromium's single-instance lock means only one BrowserPool can
 *   have the profile open at a time. Sequential `membot add` calls
 *   are fine; concurrent CLI processes against the same profile will
 *   fail to launch.
 */
export class BrowserPool {
  private readonly userDataDir: string;
  private readonly headless: boolean;
  private context: BrowserContext | null = null;
  /** In-flight launch, shared by callers that race on first use. */
  private launching: Promise<BrowserContext> | null = null;

  constructor(options: BrowserPoolOptions) {
    this.userDataDir = options.userDataDir;
    this.headless = options.headless ?? true;
  }

  /**
   * Lazy-init the persistent context. The first call launches
   * chromium against `userDataDir` (creating it if needed); subsequent
   * calls reuse the same context so cookies, IDB, and inflight
   * navigation state stay shared across downloaders within one run.
   *
   * Callers that arrive while the first launch is still pending share
   * that launch instead of starting a second chromium against the same
   * profile, which would fail on the profile lock.
   */
  private async ensureContext(): Promise<BrowserContext> {
    if (this.context) return this.context;
    if (!this.launching) {
      this.launching = this.launchContext().finally(() => {
        // Clear the marker on success and failure alike so a failed
        // launch can be retried by a later call.
        this.launching = null;
      });
    }
    return this.launching;
  }

  /**
   * Launch chromium against the profile directory and remember the context.
   *
   * @throws HelpfulError (`internal_error`) when chromium fails to launch.
   */
  private async launchContext(): Promise<BrowserContext> {
    const chromium = await loadChromium();
    await mkdir(this.userDataDir, { recursive: true });
    try {
      const launched = await chromium.launchPersistentContext(this.userDataDir, {
        headless: this.headless,
      });
      this.context = launched;
      return launched;
    } catch (err) {
      throw new HelpfulError({
        kind: "internal_error",
        message: `chromium failed to launch: ${err instanceof Error ? err.message : String(err)}`,
        hint: this.headless
          ? "Run `bunx playwright install chromium` to download the browser binary."
          : "Close any other membot process holding the browser profile, then retry.",
      });
    }
  }

  /** Return the request context for downloaders that just need authenticated HTTP. */
  async request(): Promise<APIRequestContext> {
    const ctx = await this.ensureContext();
    return ctx.request;
  }

  /** Open a fresh page (caller is responsible for `page.close()`). */
  async newPage(): Promise<Page> {
    const ctx = await this.ensureContext();
    return ctx.newPage();
  }

  /**
   * How many cookies are in the live context. Used by the auth-prompt
   * flow to detect "user closed the window without logging in" — must
   * be called BEFORE `dispose()` since the context closes its own
   * stores when shutting down. Returns 0 when no context has been
   * launched or when reading the cookies fails.
   */
  async cookieCount(): Promise<number> {
    if (!this.context) return 0;
    try {
      const cookies = await this.context.cookies();
      return cookies.length;
    } catch {
      return 0;
    }
  }

  /**
   * Return the cookies stored in the persistent profile that apply to
   * `url`, serialized as a `Cookie` request-header value
   * (`name=value; name2=value2`). Used by downloaders that call
   * services with their own HTTP client (e.g. Node's built-in
   * `fetch`) — they read the cookies once here and pass them via a
   * `Cookie` header. Bypasses Playwright's APIRequestContext, which
   * has a known cookie-parser bug on Google's same-origin redirects.
   */
  async cookieHeader(url: string): Promise<string> {
    const ctx = await this.ensureContext();
    const cookies = await ctx.cookies(url);
    return cookies.map((c) => `${c.name}=${c.value}`).join("; ");
  }

  /**
   * Resolve when the user is "done" with the headed browser session,
   * detected as: the supplied page closes, OR its context closes, OR
   * the underlying browser disconnects — whichever fires first. We
   * can't rely on the browser-disconnect event alone: on macOS,
   * closing the last window does NOT quit chromium (the app stays
   * alive in the background), so the disconnect event never fires
   * and the caller hangs forever. The page-close event is the only
   * signal that's consistent across macOS, Linux, and Windows.
   *
   * All three listeners are detached as soon as the first one fires.
   */
  async waitForUserDone(page: Page): Promise<void> {
    const ctx = page.context();
    const browser = ctx.browser();
    await new Promise<void>((resolve) => {
      let done = false;
      const finish = () => {
        if (done) return;
        done = true;
        page.off("close", finish);
        ctx.off("close", finish);
        browser?.off("disconnected", finish);
        resolve();
      };
      page.on("close", finish);
      ctx.on("close", finish);
      browser?.on("disconnected", finish);
      // Cover the case where the session ended before we subscribed.
      if (page.isClosed() || (browser && !browser.isConnected())) finish();
    });
  }

  /**
   * Close the context (which releases the userDataDir lock). Idempotent.
   * Waits for a launch that is still in flight so its context is closed
   * too rather than left holding the profile.
   */
  async dispose(): Promise<void> {
    const pending = this.launching;
    if (pending) {
      try {
        await pending;
      } catch {
        // launch failed — nothing to close
      }
    }
    try {
      await this.context?.close();
    } catch {
      // best-effort
    }
    this.context = null;
  }
}
163
+
164
/**
 * Parse `maybeRelative` as a URL, resolving it against `base` when it
 * is not already absolute. Returns `null` instead of throwing when no
 * valid URL can be formed. Playwright's `APIResponse.url()` sometimes
 * hands back a path-only string (`"/"`) after a same-origin redirect,
 * so every downloader that inspects the final URL goes through this
 * helper and the relative-URL handling lives in one place.
 * Login-redirect detection is left to each downloader, which is the
 * only code that knows where its export endpoint redirects when the
 * session is missing.
 */
export function safeResolveUrl(maybeRelative: string, base: string): URL | null {
  let resolved: URL | null = null;
  try {
    resolved = new URL(maybeRelative, base);
  } catch {
    // Not parseable — fall through with `null`.
  }
  return resolved;
}
@@ -0,0 +1,81 @@
1
+ import { HelpfulError } from "../../errors.ts";
2
+ import { sha256Hex } from "../local-reader.ts";
3
+ import type { DownloadedRemote, Downloader } from "./index.ts";
4
+
5
/**
 * Catch-all downloader. Always matches HTTP/HTTPS URLs that no
 * specific downloader claimed. Strategy:
 * - Issue an authenticated GET via Playwright's request context
 *   (cookies from `membot login` flow through automatically).
 * - If the server returned `text/html` (or XHTML), the page is probably
 *   a SPA or auth-gated render — open a real `page`, wait for
 *   `networkidle`, and `page.pdf()` the visible result. The rendered
 *   PDF goes through `convertPdf` so SPAs and login-walled docs
 *   work uniformly.
 * - Otherwise the response IS the file (markdown, JSON, PDF, image,
 *   docx, …) — return its bytes verbatim and let the mime
 *   dispatcher pick the right native converter.
 *
 * The media type is lower-cased before it is compared or returned
 * (media types are case-insensitive), and the probe response is always
 * disposed so its body does not stay in memory for the lifetime of the
 * browser context.
 *
 * This is what gives "no specific downloader needed" coverage to any
 * URL the user throws at `membot add`.
 */
export const genericWebDownloader: Downloader = {
  name: "generic-web",
  description:
    "Catch-all for any URL no other downloader handled — HEAD/GET, then either page.pdf() the rendered HTML or stream the raw bytes through the mime converter.",
  matches(url) {
    return url.protocol === "http:" || url.protocol === "https:";
  },
  /**
   * Fetch `url` and return either the raw response bytes or, for HTML
   * responses, a PDF print of the rendered page.
   *
   * @throws HelpfulError (`network_error`) on any non-2xx status other than 304.
   */
  async download(url, ctx): Promise<DownloadedRemote> {
    const target = url.toString();
    ctx.onProgress?.("fetching");
    const request = await ctx.pool.request();
    const response = await request.get(target, { timeout: 30_000 });
    try {
      // As the catch-all we don't know which login page each unknown
      // service redirects to. If the user lands on a rendered login
      // page, it goes through the print-to-PDF path and they'll see
      // an obviously-wrong "Sign in" PDF — the cue to run `membot login`.
      // Specific downloaders own auth-redirect detection for the services
      // they understand.
      if (!response.ok() && response.status() !== 304) {
        throw new HelpfulError({
          kind: "network_error",
          message: `HTTP ${response.status()} ${response.statusText()}: ${target}`,
          hint: "Open the URL in your browser to verify it exists. For auth-gated content, run `membot login` first.",
        });
      }

      // Strip parameters (`; charset=…`), normalize case, and fall back
      // to octet-stream when the header is missing or empty.
      const rawType = response.headers()["content-type"] ?? "";
      const contentType = rawType.split(";")[0]?.trim().toLowerCase() || "application/octet-stream";

      if (contentType === "text/html" || contentType === "application/xhtml+xml") {
        const page = await ctx.pool.newPage();
        try {
          ctx.onProgress?.("rendering page");
          await page.goto(target, { waitUntil: "networkidle", timeout: 45_000 });
          ctx.onProgress?.("printing to pdf");
          const pdfBuf = await page.pdf({ format: "A4", printBackground: true, preferCSSPageSize: false });
          return {
            bytes: new Uint8Array(pdfBuf),
            sha256: sha256Hex(pdfBuf),
            mimeType: "application/pdf",
            downloader: "generic-web",
            downloaderArgs: { rendered: true, source_content_type: contentType },
            sourceUrl: target,
          };
        } finally {
          await page.close().catch(() => {});
        }
      }

      const body = Buffer.from(await response.body());
      return {
        bytes: new Uint8Array(body),
        sha256: sha256Hex(body),
        mimeType: contentType,
        downloader: "generic-web",
        downloaderArgs: { rendered: false, source_content_type: contentType },
        sourceUrl: target,
      };
    } finally {
      // Release the buffered response body; best-effort.
      await response.dispose().catch(() => {});
    }
  },
};