membot 0.5.2 → 0.7.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.claude/skills/membot.md +25 -10
- package/.cursor/rules/membot.mdc +25 -10
- package/README.md +36 -4
- package/package.json +8 -5
- package/scripts/apply-patches.sh +0 -11
- package/scripts/build-test-docx.ts +84 -0
- package/src/cli.ts +2 -2
- package/src/commands/login-page.mustache +50 -0
- package/src/commands/login.ts +83 -0
- package/src/config/schemas.ts +23 -5
- package/src/constants.ts +20 -1
- package/src/context.ts +1 -24
- package/src/db/files.ts +21 -25
- package/src/db/migrations/003-downloader-columns.ts +58 -0
- package/src/db/migrations.ts +2 -1
- package/src/ingest/converter/docx.ts +47 -5
- package/src/ingest/converter/html.ts +10 -3
- package/src/ingest/converter/image.ts +40 -3
- package/src/ingest/converter/images-inline.ts +132 -0
- package/src/ingest/converter/index.ts +13 -3
- package/src/ingest/converter/xlsx.ts +111 -0
- package/src/ingest/downloaders/browser.ts +180 -0
- package/src/ingest/downloaders/generic-web.ts +81 -0
- package/src/ingest/downloaders/github.ts +178 -0
- package/src/ingest/downloaders/google-docs.ts +56 -0
- package/src/ingest/downloaders/google-shared.ts +86 -0
- package/src/ingest/downloaders/google-sheets.ts +58 -0
- package/src/ingest/downloaders/google-slides.ts +53 -0
- package/src/ingest/downloaders/index.ts +182 -0
- package/src/ingest/downloaders/linear.ts +291 -0
- package/src/ingest/fetcher.ts +104 -129
- package/src/ingest/ingest.ts +44 -71
- package/src/mcp/instructions.ts +4 -2
- package/src/operations/add.ts +6 -4
- package/src/operations/info.ts +4 -6
- package/src/operations/move.ts +2 -3
- package/src/operations/refresh.ts +2 -4
- package/src/operations/remove.ts +23 -2
- package/src/operations/tree.ts +1 -1
- package/src/operations/types.ts +1 -1
- package/src/refresh/runner.ts +60 -115
- package/src/types/text-modules.d.ts +5 -0
- package/patches/@evantahler%2Fmcpx@0.21.4.patch +0 -51
- package/src/commands/mcpx.ts +0 -112
- package/src/ingest/agent-fetcher.ts +0 -639

package/src/ingest/converter/xlsx.ts (new file, @@ -0,0 +1,111 @@)

```ts
import * as XLSX from "xlsx";

export interface ConvertXlsxOptions {
  /** Optional sublabel callback driven per-sheet (`parsing 3/8 tabs`). */
  onProgress?: (sublabel: string) => void;
}

/**
 * Convert an XLSX workbook into markdown — one `## <SheetName>`
 * section per tab, each tab rendered as a GitHub-flavored pipe table
 * with the first non-empty row treated as the header. Empty sheets
 * are skipped. Cell values are stringified (numbers, dates, formulas
 * use their displayed value via `XLSX.utils.format_cell`).
 *
 * Pure-JS via SheetJS — no native deps, bundles cleanly with
 * `bun build --compile`. Yields a macrotask between sheets so
 * nanospinner's setInterval keeps animating during big workbooks
 * (otherwise the spinner visibly freezes).
 */
export async function convertXlsx(bytes: Uint8Array, opts: ConvertXlsxOptions = {}): Promise<string> {
  const workbook = XLSX.read(bytes, { type: "array", cellDates: true });
  const sections: string[] = [];
  const sheetNames = workbook.SheetNames;

  for (let i = 0; i < sheetNames.length; i++) {
    const sheetName = sheetNames[i] as string;
    opts.onProgress?.(`parsing ${i + 1}/${sheetNames.length} tabs`);
    const sheet = workbook.Sheets[sheetName];
    if (sheet) {
      const rows = sheetToMatrix(sheet);
      const trimmed = trimEmptyEdges(rows);
      if (trimmed.length > 0) sections.push(`## ${sheetName}\n\n${renderTable(trimmed)}`);
    }
    // Yield so the spinner can repaint between sheets — large
    // workbooks would otherwise freeze the UI for the duration of
    // the parse.
    await new Promise<void>((resolve) => setTimeout(resolve, 0));
  }

  if (sections.length === 0) return "(empty workbook)";
  return sections.join("\n\n");
}

/**
 * Walk every cell in the sheet's used range and produce a 2-D array
 * of display strings. Uses the cell's formatted text (e.g. dates as
 * "2026-05-09", percentages as "12.5%") rather than raw values, so
 * the markdown matches what a human sees in the spreadsheet.
 */
function sheetToMatrix(sheet: XLSX.WorkSheet): string[][] {
  if (!sheet["!ref"]) return [];
  const range = XLSX.utils.decode_range(sheet["!ref"]);
  const out: string[][] = [];
  for (let r = range.s.r; r <= range.e.r; r++) {
    const row: string[] = [];
    for (let c = range.s.c; c <= range.e.c; c++) {
      const addr = XLSX.utils.encode_cell({ r, c });
      const cell = sheet[addr];
      row.push(cell ? XLSX.utils.format_cell(cell) : "");
    }
    out.push(row);
  }
  return out;
}

/**
 * Drop fully-empty leading/trailing rows and columns. Spreadsheets
 * commonly have the used range padded out beyond the actual data.
 */
function trimEmptyEdges(rows: string[][]): string[][] {
  if (rows.length === 0) return rows;
  let firstRow = 0;
  let lastRow = rows.length - 1;
  while (firstRow <= lastRow && rows[firstRow]?.every((v) => v === "")) firstRow++;
  while (lastRow >= firstRow && rows[lastRow]?.every((v) => v === "")) lastRow--;
  if (firstRow > lastRow) return [];
  const sliced = rows.slice(firstRow, lastRow + 1);
  const cols = sliced[0]?.length ?? 0;
  let firstCol = 0;
  let lastCol = cols - 1;
  while (firstCol <= lastCol && sliced.every((r) => (r[firstCol] ?? "") === "")) firstCol++;
  while (lastCol >= firstCol && sliced.every((r) => (r[lastCol] ?? "") === "")) lastCol--;
  if (firstCol > lastCol) return [];
  return sliced.map((r) => r.slice(firstCol, lastCol + 1));
}

/**
 * Render a 2-D matrix as a GitHub pipe table. The first row becomes
 * the header. Pipe and newline characters in cells are escaped so
 * they don't break the table layout.
 */
function renderTable(rows: string[][]): string {
  const colCount = Math.max(...rows.map((r) => r.length));
  const norm = rows.map((r) => {
    const padded = [...r];
    while (padded.length < colCount) padded.push("");
    return padded.map(escapeCell);
  });
  const lines: string[] = [];
  const header = norm[0] ?? Array(colCount).fill("");
  lines.push(`| ${header.join(" | ")} |`);
  lines.push(`| ${Array(colCount).fill("---").join(" | ")} |`);
  for (let i = 1; i < norm.length; i++) {
    lines.push(`| ${(norm[i] as string[]).join(" | ")} |`);
  }
  return lines.join("\n");
}

function escapeCell(value: string): string {
  return value.replace(/\|/g, "\\|").replace(/\r?\n/g, " ");
}
```
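A quick way to read the contract: bytes in, markdown out, one progress sublabel per sheet. A minimal usage sketch; the file name and spinner wiring are illustrative, not part of the diff:

```ts
import { readFile } from "node:fs/promises";
import { convertXlsx } from "./src/ingest/converter/xlsx.ts";

// Hypothetical caller: read a workbook from disk and print the markdown.
const bytes = new Uint8Array(await readFile("budget.xlsx"));
const markdown = await convertXlsx(bytes, {
  // Fires once per sheet with sublabels like "parsing 3/8 tabs".
  onProgress: (sublabel) => process.stdout.write(`\r${sublabel}`),
});
console.log(markdown); // one "## <SheetName>" pipe table per non-empty tab
```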

package/src/ingest/downloaders/browser.ts (new file, @@ -0,0 +1,180 @@)

```ts
import { mkdir } from "node:fs/promises";
import type { APIRequestContext, BrowserContext, Page } from "playwright";
import { HelpfulError } from "../../errors.ts";

let chromiumModule: typeof import("playwright").chromium | null = null;

/**
 * Lazy-import `playwright.chromium`. Keeping the import deferred means
 * the heavy module isn't loaded on cold paths (e.g. `membot list`); it
 * also lets us produce a `HelpfulError` if Playwright isn't installed
 * yet instead of a stack trace at module-load time.
 */
async function loadChromium(): Promise<typeof import("playwright").chromium> {
  if (chromiumModule) return chromiumModule;
  try {
    const playwright = await import("playwright");
    chromiumModule = playwright.chromium;
    return chromiumModule;
  } catch (err) {
    throw new HelpfulError({
      kind: "internal_error",
      message: `failed to load playwright: ${err instanceof Error ? err.message : String(err)}`,
      hint: "Run `bun add -g membot` to reinstall, then `bunx playwright install chromium` to fetch the browser binary.",
    });
  }
}

export interface BrowserPoolOptions {
  userDataDir: string;
  headless?: boolean;
}

/**
 * Process-scoped lazy-launched chromium context backed by a *persistent
 * profile directory* (`launchPersistentContext`). Persistent profiles
 * carry cookies, localStorage, sessionStorage, IndexedDB, and service
 * worker state across runs — necessary for SPA-heavy services like
 * Linear that stash critical session/sync state in IndexedDB (which
 * the lighter `storageState` JSON snapshot doesn't capture).
 *
 * Trade-offs:
 * - The profile is a directory, not a single JSON file (a few MBs).
 * - Chromium's single-instance lock means only one BrowserPool can
 *   have the profile open at a time. Sequential `membot add` calls
 *   are fine; concurrent CLI processes against the same profile will
 *   fail to launch.
 */
export class BrowserPool {
  private readonly userDataDir: string;
  private readonly headless: boolean;
  private context: BrowserContext | null = null;

  constructor(options: BrowserPoolOptions) {
    this.userDataDir = options.userDataDir;
    this.headless = options.headless ?? true;
  }

  /**
   * Lazy-init the persistent context. The first call launches
   * chromium against `userDataDir` (creating it if needed); subsequent
   * calls reuse the same context so cookies, IDB, and in-flight
   * navigation state stay shared across downloaders within one run.
   */
  private async ensureContext(): Promise<BrowserContext> {
    if (this.context) return this.context;
    const chromium = await loadChromium();
    await mkdir(this.userDataDir, { recursive: true });
    try {
      this.context = await chromium.launchPersistentContext(this.userDataDir, {
        headless: this.headless,
      });
    } catch (err) {
      throw new HelpfulError({
        kind: "internal_error",
        message: `chromium failed to launch: ${err instanceof Error ? err.message : String(err)}`,
        hint: this.headless
          ? "Run `bunx playwright install chromium` to download the browser binary."
          : "Close any other membot process holding the browser profile, then retry.",
      });
    }
    return this.context;
  }

  /** Return the request context for downloaders that just need authenticated HTTP. */
  async request(): Promise<APIRequestContext> {
    const ctx = await this.ensureContext();
    return ctx.request;
  }

  /** Open a fresh page (caller is responsible for `page.close()`). */
  async newPage(): Promise<Page> {
    const ctx = await this.ensureContext();
    return ctx.newPage();
  }

  /**
   * How many cookies are in the live context. Used by the auth-prompt
   * flow to detect "user closed the window without logging in" — must
   * be called BEFORE `dispose()` since the context closes its own
   * stores when shutting down.
   */
  async cookieCount(): Promise<number> {
    if (!this.context) return 0;
    try {
      const cookies = await this.context.cookies();
      return cookies.length;
    } catch {
      return 0;
    }
  }

  /**
   * Return the cookies stored in the persistent profile for a given
   * URL/origin (or all cookies when omitted). Used by downloaders that
   * call services with their own HTTP client (e.g. Node's built-in
   * `fetch`) — they read the cookies once here and pass them via a
   * `Cookie` header. Bypasses Playwright's APIRequestContext, which
   * has a known cookie-parser bug on Google's same-origin redirects.
   */
  async cookieHeader(url: string): Promise<string> {
    const ctx = await this.ensureContext();
    const cookies = await ctx.cookies(url);
    return cookies.map((c) => `${c.name}=${c.value}`).join("; ");
  }

  /**
   * Resolve when the user is "done" with the headed browser session,
   * detected as: the supplied page closes, OR its context closes, OR
   * the underlying browser disconnects — whichever fires first. We
   * can't rely on the browser-disconnect event alone: on macOS,
   * closing the last window does NOT quit chromium (the app stays
   * alive in the background), so the disconnect event never fires
   * and the caller hangs forever. The page-close event is the only
   * signal that's consistent across macOS, Linux, and Windows.
   */
  async waitForUserDone(page: Page): Promise<void> {
    const ctx = page.context();
    const browser = ctx.browser();
    await new Promise<void>((resolve) => {
      let done = false;
      const finish = () => {
        if (done) return;
        done = true;
        resolve();
      };
      page.on("close", finish);
      ctx.on("close", finish);
      browser?.on("disconnected", finish);
      if (page.isClosed() || (browser && !browser.isConnected())) finish();
    });
  }

  /** Close the context (which releases the userDataDir lock). Idempotent. */
  async dispose(): Promise<void> {
    try {
      await this.context?.close();
    } catch {
      // best-effort
    }
    this.context = null;
  }
}

/**
 * Resolve `maybeRelative` against `base` and return a `URL`, or `null`
 * if resolution fails. Playwright's `APIResponse.url()` sometimes hands
 * back a path-only string (`"/"`) instead of an absolute URL after a
 * same-origin redirect — every downloader that wants to inspect the
 * final URL goes through this helper so the relative-URL handling
 * lives in one place. Login-redirect detection itself is each
 * downloader's responsibility — it's the only code that knows which
 * host its export endpoint redirects to when the session is missing.
 */
export function safeResolveUrl(maybeRelative: string, base: string): URL | null {
  try {
    return new URL(maybeRelative, base);
  } catch {
    return null;
  }
}
```
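The pool's main consumer is the headed-login flow described in the `waitForUserDone` comment. A minimal sketch of that flow, with the profile path and login URL as stand-in assumptions (the real values come from membot's config and `membot login`):

```ts
import { BrowserPool } from "./src/ingest/downloaders/browser.ts";

const pool = new BrowserPool({
  userDataDir: "/tmp/membot-profile", // assumption: the real path is config-derived
  headless: false, // headed, so the user can type credentials
});

const page = await pool.newPage();
await page.goto("https://accounts.google.com/signin");
await pool.waitForUserDone(page); // resolves when the user closes the window

// Count cookies before dispose(): the context drops its stores on close.
const loggedIn = (await pool.cookieCount()) > 0;
await pool.dispose(); // releases the userDataDir single-instance lock
console.log(loggedIn ? "session saved" : "window closed without logging in");
```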

package/src/ingest/downloaders/generic-web.ts (new file, @@ -0,0 +1,81 @@)

```ts
import { HelpfulError } from "../../errors.ts";
import { sha256Hex } from "../local-reader.ts";
import type { DownloadedRemote, Downloader } from "./index.ts";

/**
 * Catch-all downloader. Always matches HTTP/HTTPS URLs that no
 * specific downloader claimed. Strategy:
 * - Issue an authenticated GET via Playwright's request context
 *   (cookies from `membot login` flow through automatically).
 * - If the server returned `text/html`, the page is probably a SPA
 *   or auth-gated render — open a real `page`, wait for
 *   `networkidle`, and `page.pdf()` the visible result. The rendered
 *   PDF goes through `convertPdf` so SPAs and login-walled docs
 *   work uniformly.
 * - Otherwise the response IS the file (markdown, JSON, PDF, image,
 *   docx, …) — return its bytes verbatim and let the mime
 *   dispatcher pick the right native converter.
 *
 * This is what gives "no specific downloader needed" coverage to any
 * URL the user throws at `membot add`.
 */
export const genericWebDownloader: Downloader = {
  name: "generic-web",
  description:
    "Catch-all for any URL no other downloader handled — HEAD/GET, then either page.pdf() the rendered HTML or stream the raw bytes through the mime converter.",
  matches(url) {
    return url.protocol === "http:" || url.protocol === "https:";
  },
  async download(url, ctx): Promise<DownloadedRemote> {
    ctx.onProgress?.("fetching");
    const request = await ctx.pool.request();
    const response = await request.get(url.toString(), { timeout: 30_000 });
    // As the catch-all we don't know which login page each unknown
    // service redirects to. If the user lands on a rendered login
    // page, it goes through the print-to-PDF path and they'll see
    // an obviously-wrong "Sign in" PDF — the cue to run `membot login`.
    // Specific downloaders own auth-redirect detection for the services
    // they understand.
    if (!response.ok() && response.status() !== 304) {
      throw new HelpfulError({
        kind: "network_error",
        message: `HTTP ${response.status()} ${response.statusText()}: ${url.toString()}`,
        hint: "Open the URL in your browser to verify it exists. For auth-gated content, run `membot login` first.",
      });
    }
    const headers = response.headers();
    const contentType =
      (headers["content-type"] ?? "application/octet-stream").split(";")[0]?.trim() ?? "application/octet-stream";

    if (contentType === "text/html" || contentType === "application/xhtml+xml") {
      const page = await ctx.pool.newPage();
      try {
        ctx.onProgress?.("rendering page");
        await page.goto(url.toString(), { waitUntil: "networkidle", timeout: 45_000 });
        ctx.onProgress?.("printing to pdf");
        const pdfBuf = await page.pdf({ format: "A4", printBackground: true, preferCSSPageSize: false });
        const bytes = new Uint8Array(pdfBuf);
        return {
          bytes,
          sha256: sha256Hex(pdfBuf),
          mimeType: "application/pdf",
          downloader: "generic-web",
          downloaderArgs: { rendered: true, source_content_type: contentType },
          sourceUrl: url.toString(),
        };
      } finally {
        await page.close().catch(() => {});
      }
    }

    const body = Buffer.from(await response.body());
    return {
      bytes: new Uint8Array(body),
      sha256: sha256Hex(body),
      mimeType: contentType,
      downloader: "generic-web",
      downloaderArgs: { rendered: false, source_content_type: contentType },
      sourceUrl: url.toString(),
    };
  },
};
```
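A sketch of driving the catch-all directly. The `DownloaderCtx` shape is inferred from the calls visible in this diff (`ctx.pool`, `ctx.onProgress`), so the cast hedges any fields not shown in this excerpt:

```ts
import { BrowserPool } from "./src/ingest/downloaders/browser.ts";
import { genericWebDownloader } from "./src/ingest/downloaders/generic-web.ts";

const url = new URL("https://example.com/report.pdf"); // illustrative URL
if (genericWebDownloader.matches(url)) {
  const pool = new BrowserPool({ userDataDir: "/tmp/membot-profile" });
  const result = await genericWebDownloader.download(url, {
    pool,
    onProgress: (label: string) => console.log(label),
  } as any); // assumption: DownloaderCtx carries more fields than used here
  // A text/html response would have come back as a rendered application/pdf instead.
  console.log(result.mimeType, result.bytes.byteLength, "bytes");
  await pool.dispose();
}
```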

package/src/ingest/downloaders/github.ts (new file, @@ -0,0 +1,178 @@)

```ts
import { HelpfulError } from "../../errors.ts";
import { sha256Hex } from "../local-reader.ts";
import type { DownloadedRemote, Downloader, DownloaderCtx } from "./index.ts";

const ISSUE_OR_PR = /^\/([^/]+)\/([^/]+)\/(issues|pull)\/(\d+)(?:$|\/|#|\?)/;

const API_BASE = "https://api.github.com";

/**
 * GitHub issues and PRs via the REST API. The user sets a personal
 * access token once via `membot config set downloaders.github.api_key
 * <PAT>` (or via the `GITHUB_TOKEN` env var, which `gh auth token`
 * happens to populate), and we fetch the issue/PR + every comment as
 * structured JSON, then render to markdown.
 *
 * Why API instead of rendering github.com HTML: the rendered page
 * works for public, network-cooperative cases but stalls when GitHub
 * shows interstitials (rate-limit, abuse, login challenges) and
 * captures hundreds of KB of GitHub chrome that the embedder doesn't
 * care about. The API gives us the exact body and comment thread in
 * a few KB.
 *
 * Public repos: the `api_key` is optional — we'll send unauthenticated
 * requests if it's blank, which works for public content but gets
 * rate-limited at 60 req/hr. Private repos require the token.
 */
export const githubDownloader: Downloader = {
  name: "github",
  description: "GitHub issues + PRs (github.com/<owner>/<repo>/(issues|pull)/<n>) — uses the GitHub REST API.",
  logins: [
    {
      kind: "api_key",
      name: "GitHub",
      url: "https://github.com/settings/tokens",
      setupCommand: "membot config set downloaders.github.api_key <PAT>",
      description: "create a fine-grained token with repo:read access (or use GITHUB_TOKEN env var)",
    },
  ],
  requiresApiKey: false,
  matches(url) {
    return url.hostname === "github.com" && ISSUE_OR_PR.test(url.pathname);
  },
  async download(url, ctx): Promise<DownloadedRemote> {
    const args = parseIssueUrl(url);
    const owner = args.owner as string;
    const repo = args.repo as string;
    const number = args.number as number;

    const token = (ctx.config.downloaders.github.api_key || process.env.GITHUB_TOKEN || "").trim();
    ctx.onProgress?.("fetching issue");
    const issue = await getJson<GithubIssue>(`/repos/${owner}/${repo}/issues/${number}`, token, url);
    ctx.onProgress?.("fetching comments");
    const comments = await getJson<GithubComment[]>(
      `/repos/${owner}/${repo}/issues/${number}/comments?per_page=100`,
      token,
      url,
    );

    const isPullRequest = !!issue.pull_request;
    const markdown = renderIssue(issue, comments, isPullRequest);
    const bytes = new TextEncoder().encode(markdown);
    return {
      bytes,
      sha256: sha256Hex(bytes),
      mimeType: "text/markdown",
      downloader: "github",
      downloaderArgs: args,
      sourceUrl: url.toString(),
    };
  },
};

interface GithubIssue {
  number: number;
  title: string;
  body: string | null;
  state: string;
  html_url: string;
  user: { login: string } | null;
  assignees: Array<{ login: string }> | null;
  labels: Array<{ name: string } | string> | null;
  created_at: string;
  updated_at: string;
  closed_at: string | null;
  pull_request?: unknown;
}

interface GithubComment {
  body: string | null;
  user: { login: string } | null;
  created_at: string;
}

async function getJson<T>(path: string, token: string, url: URL): Promise<T> {
  const headers: Record<string, string> = {
    Accept: "application/vnd.github+json",
    "X-GitHub-Api-Version": "2022-11-28",
    "User-Agent": "membot",
  };
  if (token !== "") headers.Authorization = `Bearer ${token}`;

  const response = await fetch(`${API_BASE}${path}`, { headers });
  if (response.status === 401 || response.status === 403) {
    throw new HelpfulError({
      kind: "auth_error",
      message: `GitHub API returned ${response.status} for ${url.toString()}.`,
      hint:
        token === ""
          ? "Set a personal access token: create one at https://github.com/settings/tokens, then `membot config set downloaders.github.api_key <PAT>` (or set $GITHUB_TOKEN)."
          : "The configured API key is missing repo:read access for this repo, or has expired. Re-create the token and run `membot config set downloaders.github.api_key <PAT>`.",
    });
  }
  if (response.status === 404) {
    throw new HelpfulError({
      kind: "not_found",
      message: `GitHub returned 404 for ${url.toString()}.`,
      hint: "Verify the URL exists. Private repos require an API key with the right scope.",
    });
  }
  if (!response.ok) {
    throw new HelpfulError({
      kind: "network_error",
      message: `GitHub API returned ${response.status} ${response.statusText} for ${url.toString()}.`,
      hint: "Retry; if the failure persists, run with --verbose for the full response.",
    });
  }
  return (await response.json()) as T;
}

function parseIssueUrl(url: URL): Record<string, unknown> {
  const match = url.pathname.match(ISSUE_OR_PR);
  if (!match) {
    throw new HelpfulError({
      kind: "input_error",
      message: `not a GitHub issue/PR URL: ${url.toString()}`,
      hint: "Pass a URL like https://github.com/<owner>/<repo>/issues/<n> or .../pull/<n>.",
    });
  }
  return { owner: match[1], repo: match[2], kind: match[3], number: Number(match[4]) };
}

function renderIssue(issue: GithubIssue, comments: GithubComment[], isPr: boolean): string {
  const lines: string[] = [];
  const kind = isPr ? "PR" : "Issue";
  lines.push(`# ${kind} #${issue.number}: ${issue.title}`);
  lines.push("");
  lines.push(`- URL: ${issue.html_url}`);
  lines.push(`- State: ${issue.state}${issue.closed_at ? ` (closed ${issue.closed_at})` : ""}`);
  if (issue.user) lines.push(`- Author: @${issue.user.login}`);
  if (issue.assignees && issue.assignees.length > 0) {
    lines.push(`- Assignees: ${issue.assignees.map((a) => `@${a.login}`).join(", ")}`);
  }
  if (issue.labels && issue.labels.length > 0) {
    const labels = issue.labels.map((l) => (typeof l === "string" ? l : l.name)).filter(Boolean);
    if (labels.length > 0) lines.push(`- Labels: ${labels.join(", ")}`);
  }
  lines.push(`- Created: ${issue.created_at}`);
  lines.push(`- Updated: ${issue.updated_at}`);
  lines.push("");
  if (issue.body && issue.body.trim() !== "") {
    lines.push("## Description");
    lines.push("");
    lines.push(issue.body.trim());
    lines.push("");
  }
  if (comments.length > 0) {
    lines.push(`## Comments (${comments.length})`);
    lines.push("");
    for (const c of comments) {
      const author = c.user ? `@${c.user.login}` : "(unknown)";
      lines.push(`### ${author} — ${c.created_at}`);
      lines.push("");
      lines.push((c.body ?? "").trim());
      lines.push("");
    }
  }
  return lines.join("\n").trim();
}
```
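Calling the downloader directly looks like this sketch; the issue URL is arbitrary public content, and the `ctx` literal carries only the fields this file reads (`config.downloaders.github.api_key` and `onProgress`):

```ts
import { githubDownloader } from "./src/ingest/downloaders/github.ts";

const url = new URL("https://github.com/oven-sh/bun/issues/1"); // illustrative URL
console.log(githubDownloader.matches(url)); // true: /<owner>/<repo>/issues/<n>

const result = await githubDownloader.download(url, {
  onProgress: (label: string) => console.log(label),
  // "" means unauthenticated: fine for public repos, 60 req/hr rate limit.
  config: { downloaders: { github: { api_key: "" } } },
} as any); // assumption: DownloaderCtx carries more fields than used here
console.log(new TextDecoder().decode(result.bytes).split("\n")[0]); // "# Issue #1: …"
```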

package/src/ingest/downloaders/google-docs.ts (new file, @@ -0,0 +1,56 @@)

```ts
import { HelpfulError } from "../../errors.ts";
import { sha256Hex } from "../local-reader.ts";
import { fetchWithBrowserCookies } from "./google-shared.ts";
import type { DownloadedRemote, Downloader } from "./index.ts";

const DOC_PATH = /^\/document\/d\/([a-zA-Z0-9_-]+)/;
const DOCX_MIME = "application/vnd.openxmlformats-officedocument.wordprocessingml.document";

/**
 * Download a Google Doc as a `.docx` blob via the canonical export
 * endpoint. Authentication uses cookies pulled from the persistent
 * chromium profile (populated by `membot login`); the fetch itself
 * is a plain Node `fetch`, not Playwright's APIRequestContext, to
 * dodge a Playwright bug that crashes parsing Set-Cookie headers
 * from Google's same-origin redirects.
 */
export const googleDocsDownloader: Downloader = {
  name: "google-docs",
  description: "Google Docs (docs.google.com/document/d/<id>) — exports as .docx via the user's logged-in session.",
  logins: [
    {
      kind: "browser",
      name: "Google",
      url: "https://accounts.google.com/signin",
      description: "covers Docs, Sheets, and Slides",
    },
  ],
  matches(url) {
    return url.hostname === "docs.google.com" && DOC_PATH.test(url.pathname);
  },
  async download(url, ctx): Promise<DownloadedRemote> {
    const docId = extractDocId(url);
    const exportUrl = `https://docs.google.com/document/d/${docId}/export?format=docx`;
    const body = await fetchWithBrowserCookies(exportUrl, ctx, "Google Docs", url);
    return {
      bytes: new Uint8Array(body),
      sha256: sha256Hex(body),
      mimeType: DOCX_MIME,
      downloader: "google-docs",
      downloaderArgs: { document_id: docId },
      sourceUrl: url.toString(),
    };
  },
};

function extractDocId(url: URL): string {
  const match = url.pathname.match(DOC_PATH);
  if (!match || !match[1]) {
    throw new HelpfulError({
      kind: "input_error",
      message: `not a Google Docs URL: ${url.toString()}`,
      hint: "Pass a URL like https://docs.google.com/document/d/<DOC_ID>/edit.",
    });
  }
  return match[1];
}
```
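The match/extract contract in miniature, using a made-up document id:

```ts
import { googleDocsDownloader } from "./src/ingest/downloaders/google-docs.ts";

const url = new URL("https://docs.google.com/document/d/1AbC_dEf-123/edit#heading=h.x");
console.log(googleDocsDownloader.matches(url)); // true

// download() would then fetch (with profile cookies; see google-shared.ts below):
//   https://docs.google.com/document/d/1AbC_dEf-123/export?format=docx
// and hand the .docx bytes to the docx converter via the mime dispatcher.
```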

package/src/ingest/downloaders/google-shared.ts (new file, @@ -0,0 +1,86 @@)

```ts
import { HelpfulError } from "../../errors.ts";
import type { DownloaderCtx } from "./index.ts";

const USER_AGENT =
  "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/147.0.0.0 Safari/537.36";

/**
 * Fetch a Google export URL using cookies from the persistent
 * chromium profile. Uses Node's built-in `fetch` (not Playwright's
 * APIRequestContext) because Playwright crashes when parsing
 * Set-Cookie headers on same-origin Google redirects (its
 * `_parseSetCookieHeader` calls `new URL(responseUrl)` with a
 * relative path and throws `ERR_INVALID_URL`).
 *
 * Applies the same redirect-handling rules the Playwright path used
 * to: follow same-origin internal redirects (Google may bounce the
 * download via `/exportInternal` or similar) but bail with a clean
 * `auth_error` if Google sends us to `accounts.google.com/ServiceLogin`
 * because the user isn't signed in.
 */
export async function fetchWithBrowserCookies(
  exportUrl: string,
  ctx: DownloaderCtx,
  serviceName: string,
  sourceUrl: URL,
): Promise<Buffer> {
  ctx.onProgress?.(`downloading from ${serviceName.toLowerCase()}`);
  const cookieHeader = await ctx.pool.cookieHeader(exportUrl);

  let currentUrl = exportUrl;
  for (let hop = 0; hop < 5; hop++) {
    const response = await fetch(currentUrl, {
      headers: {
        Cookie: cookieHeader,
        "User-Agent": USER_AGENT,
        Accept: "*/*",
      },
      redirect: "manual",
    });

    if (response.status >= 200 && response.status < 300) {
      return Buffer.from(await response.arrayBuffer());
    }

    if (response.status >= 300 && response.status < 400) {
      const location = response.headers.get("location");
      if (!location) {
        throw new HelpfulError({
          kind: "network_error",
          message: `${serviceName} returned ${response.status} for ${sourceUrl.toString()} with no Location header.`,
          hint: "Open the URL in your browser to verify it exists and is shared with you.",
        });
      }
      const next = new URL(location, currentUrl);
      if (next.hostname === "accounts.google.com" || /\/ServiceLogin/i.test(next.pathname)) {
        throw new HelpfulError({
          kind: "auth_error",
          message: `${serviceName} redirected ${sourceUrl.toString()} to a Google login page.`,
          hint: "Run `membot login` and sign into Google in the browser that opens, then re-run.",
        });
      }
      currentUrl = next.toString();
      continue;
    }

    if (response.status === 401 || response.status === 403) {
      throw new HelpfulError({
        kind: "auth_error",
        message: `${serviceName} returned ${response.status} for ${sourceUrl.toString()}.`,
        hint: "Run `membot login` and sign into Google in the browser that opens, then re-run.",
      });
    }

    throw new HelpfulError({
      kind: "network_error",
      message: `${serviceName} returned ${response.status} ${response.statusText} for ${sourceUrl.toString()}.`,
      hint: "Open the URL in your browser to verify it's accessible to your account.",
    });
  }

  throw new HelpfulError({
    kind: "network_error",
    message: `${serviceName} bounced through too many redirects for ${sourceUrl.toString()}.`,
    hint: "Re-run the command; if the failure persists, open the URL in your browser to investigate.",
  });
}
```
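A hypothetical caller in the style of the Sheets downloader (google-sheets.ts ships in this release but isn't shown in this excerpt, so the export format below is an assumption):

```ts
import type { DownloaderCtx } from "./src/ingest/downloaders/index.ts";
import { fetchWithBrowserCookies } from "./src/ingest/downloaders/google-shared.ts";

// Sketch: export a spreadsheet through the shared cookie-bearing fetch.
async function downloadSheet(ctx: DownloaderCtx, sheetId: string): Promise<Buffer> {
  const source = new URL(`https://docs.google.com/spreadsheets/d/${sheetId}/edit`);
  const exportUrl = `https://docs.google.com/spreadsheets/d/${sheetId}/export?format=xlsx`;
  // 2xx returns the bytes; a bounce to accounts.google.com throws an
  // auth_error HelpfulError pointing at `membot login`; more than five
  // same-origin hops throws a network_error.
  return fetchWithBrowserCookies(exportUrl, ctx, "Google Sheets", source);
}
```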