membot 0.5.2 → 0.6.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.claude/skills/membot.md +25 -10
- package/.cursor/rules/membot.mdc +25 -10
- package/README.md +35 -4
- package/package.json +8 -5
- package/scripts/apply-patches.sh +0 -11
- package/src/cli.ts +2 -2
- package/src/commands/login-page.mustache +50 -0
- package/src/commands/login.ts +83 -0
- package/src/config/schemas.ts +17 -5
- package/src/constants.ts +13 -1
- package/src/context.ts +1 -24
- package/src/db/files.ts +21 -25
- package/src/db/migrations/003-downloader-columns.ts +58 -0
- package/src/db/migrations.ts +2 -1
- package/src/ingest/converter/index.ts +9 -0
- package/src/ingest/converter/xlsx.ts +111 -0
- package/src/ingest/downloaders/browser.ts +180 -0
- package/src/ingest/downloaders/generic-web.ts +81 -0
- package/src/ingest/downloaders/github.ts +178 -0
- package/src/ingest/downloaders/google-docs.ts +56 -0
- package/src/ingest/downloaders/google-shared.ts +86 -0
- package/src/ingest/downloaders/google-sheets.ts +58 -0
- package/src/ingest/downloaders/google-slides.ts +53 -0
- package/src/ingest/downloaders/index.ts +182 -0
- package/src/ingest/downloaders/linear.ts +291 -0
- package/src/ingest/fetcher.ts +104 -129
- package/src/ingest/ingest.ts +43 -70
- package/src/mcp/instructions.ts +4 -2
- package/src/operations/add.ts +6 -4
- package/src/operations/info.ts +4 -6
- package/src/operations/move.ts +2 -3
- package/src/operations/refresh.ts +2 -4
- package/src/operations/remove.ts +23 -2
- package/src/operations/tree.ts +1 -1
- package/src/operations/types.ts +1 -1
- package/src/refresh/runner.ts +59 -114
- package/src/types/text-modules.d.ts +5 -0
- package/patches/@evantahler%2Fmcpx@0.21.4.patch +0 -51
- package/src/commands/mcpx.ts +0 -112
- package/src/ingest/agent-fetcher.ts +0 -639
package/src/db/files.ts
CHANGED

@@ -1,7 +1,7 @@
 import type { DbConnection, SqlParam } from "./connection.ts";
 
 export type SourceType = "local" | "remote" | "inline";
-export type FetcherKind = "http" | "mcpx" | "local" | "inline";
+export type FetcherKind = "downloader" | "local" | "inline";
 
 export interface FileRow {
   logical_path: string;
@@ -18,9 +18,8 @@ export interface FileRow {
   mime_type: string | null;
   size_bytes: number | null;
   fetcher: FetcherKind | null;
-  fetcher_server: string | null;
-  fetcher_tool: string | null;
-  fetcher_args: Record<string, unknown> | null;
+  downloader: string | null;
+  downloader_args: Record<string, unknown> | null;
   refresh_frequency_sec: number | null;
   refreshed_at: string | null;
   last_refresh_status: string | null;
@@ -43,9 +42,8 @@ export interface NewFileVersion {
   mime_type?: string | null;
   size_bytes?: number | null;
   fetcher?: FetcherKind | null;
-  fetcher_server?: string | null;
-  fetcher_tool?: string | null;
-  fetcher_args?: Record<string, unknown> | null;
+  downloader?: string | null;
+  downloader_args?: Record<string, unknown> | null;
   refresh_frequency_sec?: number | null;
   refreshed_at?: string | null;
   last_refresh_status?: string | null;
@@ -67,9 +65,8 @@ const ROW_COLUMNS = [
   "mime_type",
   "size_bytes",
   "fetcher",
-  "fetcher_server",
-  "fetcher_tool",
-  "fetcher_args",
+  "downloader",
+  "downloader_args",
   "refresh_frequency_sec",
   "refreshed_at",
   "last_refresh_status",
@@ -86,21 +83,21 @@ const COLUMN_LIST = ROW_COLUMNS.join(", ");
  */
 export async function insertVersion(db: DbConnection, file: NewFileVersion): Promise<string> {
   const versionId = file.version_id ?? millisIso(Date.now());
-  const fetcherArgsJson = file.fetcher_args ? JSON.stringify(file.fetcher_args) : null;
+  const downloaderArgsJson = file.downloader_args ? JSON.stringify(file.downloader_args) : null;
 
   await db.queryRun(
     `INSERT INTO files (
       logical_path, version_id, tombstone, source_type,
       source_path, source_mtime_ms, source_sha256, blob_sha256,
       content_sha256, content, description, mime_type, size_bytes,
-      fetcher, fetcher_server, fetcher_tool, fetcher_args,
+      fetcher, downloader, downloader_args,
       refresh_frequency_sec, refreshed_at, last_refresh_status, change_note
     ) VALUES (
       ?1, CAST(?2 AS TIMESTAMP), ?3, ?4,
       ?5, ?6, ?7, ?8,
       ?9, ?10, ?11, ?12, ?13,
-      ?14, ?15, ?16, ?17,
-      ?18, ?19, ?20, ?21
+      ?14, ?15, ?16,
+      ?17, ?18, ?19, ?20
     )`,
     file.logical_path,
     versionId,
@@ -116,9 +113,8 @@ export async function insertVersion(db: DbConnection, file: NewFileVersion): Promise<string> {
     file.mime_type ?? null,
     file.size_bytes ?? null,
     file.fetcher ?? null,
-    file.fetcher_server ?? null,
-    file.fetcher_tool ?? null,
-    fetcherArgsJson,
+    file.downloader ?? null,
+    downloaderArgsJson,
     file.refresh_frequency_sec ?? null,
     file.refreshed_at ?? null,
     file.last_refresh_status ?? null,
@@ -132,33 +128,33 @@ export function millisIso(ms: number): string {
   return new Date(ms).toISOString();
 }
 
-interface RawFileRow extends Omit<FileRow, "fetcher_args" | "tombstone"> {
-  fetcher_args: string | null | Record<string, unknown>;
+interface RawFileRow extends Omit<FileRow, "downloader_args" | "tombstone"> {
+  downloader_args: string | null | Record<string, unknown>;
   tombstone: boolean | number;
   [key: string]: unknown;
 }
 
 /**
  * Coerce a raw DuckDB row into a typed `FileRow`. JSON-parses the
- * `fetcher_args` column (DuckDB returns it as text or a parsed object
+ * `downloader_args` column (DuckDB returns it as text or a parsed object
  * depending on driver version) and normalizes `tombstone` to a boolean
  * (some drivers return 0/1).
  */
 function toFileRow(row: RawFileRow | null): FileRow | null {
   if (!row) return null;
   let parsed: Record<string, unknown> | null = null;
-  if (row.fetcher_args && typeof row.fetcher_args === "string") {
+  if (row.downloader_args && typeof row.downloader_args === "string") {
     try {
-      parsed = JSON.parse(row.fetcher_args);
+      parsed = JSON.parse(row.downloader_args);
     } catch {
       parsed = null;
     }
-  } else if (row.fetcher_args && typeof row.fetcher_args === "object") {
-    parsed = row.fetcher_args;
+  } else if (row.downloader_args && typeof row.downloader_args === "object") {
+    parsed = row.downloader_args;
   }
   return {
     ...row,
-    fetcher_args: parsed,
+    downloader_args: parsed,
     tombstone: !!row.tombstone,
   };
 }
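The schema change is easiest to see at a call site. A minimal sketch of writing a remote row under the new shape, assuming a live `DbConnection`; `insertVersion` and the field names come from the diff above, while the path and argument values are purely illustrative:

```ts
// Sketch only (not from the package): recording a remote fetch post-0.6.0.
import type { DbConnection } from "./connection.ts";
import { insertVersion } from "./files.ts";

async function recordRemoteRow(db: DbConnection): Promise<string> {
  return insertVersion(db, {
    logical_path: "docs/roadmap.md",      // illustrative path
    source_type: "remote",
    fetcher: "downloader",                // the only remote FetcherKind now
    downloader: "generic-web",            // which downloader produced the bytes
    downloader_args: { rendered: true },  // serialized to JSON by insertVersion
    mime_type: "application/pdf",
  });
}
```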
package/src/db/migrations/003-downloader-columns.ts
ADDED

@@ -0,0 +1,58 @@
+import type { Migration } from "../migrations.ts";
+
+/**
+ * Replace the old mcpx-era fetcher metadata triple
+ * (`fetcher_server` / `fetcher_tool` / `fetcher_args`) with a flat
+ * `(downloader, downloader_args)` shape. The mcpx-driven agent fetcher
+ * is gone; per-service downloaders match a URL → run a known fetch
+ * (Playwright export endpoints for Google, rendered HTML for GitHub /
+ * Linear, headless print-to-PDF for everything else) → return bytes
+ * for the existing native converter pipeline.
+ *
+ * Existing rows whose `fetcher` was `'http'` or `'mcpx'` are migrated
+ * to `'downloader'` with `downloader=NULL`. The mcpx-driven ones
+ * become refresh-broken (the `fetcher_*` arguments that drove them no
+ * longer exist) but their stored `content` is still readable; the
+ * plain-HTTP ones will be re-routed through the generic-web downloader
+ * the next time refresh runs. The `fetcher` enum loses both `'http'`
+ * and `'mcpx'` — every remote row is `'downloader'` now, since even
+ * the plain-HTTP fallback is wrapped by the generic-web downloader.
+ *
+ * The `current_files` view is `SELECT f.* FROM files f`, so it pins the
+ * old column shape; we drop and recreate it (and the dependent
+ * `current_chunks` view) around the schema change.
+ */
+export const MIGRATION_003: Migration = {
+  id: 3,
+  name: "downloader-columns",
+  statements: [
+    // DuckDB refuses DROP COLUMN when an index covers any column that
+    // comes AFTER the dropped one in the schema, so the indexes have
+    // to come down first. The view drops are needed for the same
+    // reason — `current_files` is `SELECT f.*`, which pins every
+    // column at view-creation time.
+    `DROP VIEW IF EXISTS current_chunks`,
+    `DROP VIEW IF EXISTS current_files`,
+    `DROP INDEX IF EXISTS files_refresh_due_idx`,
+    `DROP INDEX IF EXISTS files_blob_sha256_idx`,
+    `DROP INDEX IF EXISTS files_logical_path_idx`,
+    `UPDATE files SET fetcher = 'downloader' WHERE fetcher IN ('http', 'mcpx')`,
+    `ALTER TABLE files DROP COLUMN fetcher_server`,
+    `ALTER TABLE files DROP COLUMN fetcher_tool`,
+    `ALTER TABLE files DROP COLUMN fetcher_args`,
+    `ALTER TABLE files ADD COLUMN downloader TEXT`,
+    `ALTER TABLE files ADD COLUMN downloader_args JSON`,
+    `CREATE INDEX files_logical_path_idx ON files (logical_path)`,
+    `CREATE INDEX files_blob_sha256_idx ON files (blob_sha256)`,
+    `CREATE INDEX files_refresh_due_idx ON files (refresh_frequency_sec, refreshed_at)`,
+    `CREATE VIEW current_files AS
+      SELECT f.* FROM files f
+      WHERE (f.logical_path, f.version_id) IN (
+        SELECT logical_path, MAX(version_id) FROM files GROUP BY logical_path
+      )
+      AND f.tombstone = FALSE`,
+    `CREATE VIEW current_chunks AS
+      SELECT c.* FROM chunks c
+      JOIN current_files cf USING (logical_path, version_id)`,
+  ],
+};
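The statement ordering above is load-bearing. A minimal sketch of the assumed runner shape (not membot's actual code) makes that explicit: statements execute one at a time, in array order, so the view and index drops must land before the `DROP COLUMN`s.

```ts
// Sketch: sequential application of MIGRATION_003's statements, using the
// queryRun signature seen elsewhere in this diff.
import type { DbConnection } from "../connection.ts";
import { MIGRATION_003 } from "./003-downloader-columns.ts";

async function applyStatements(db: DbConnection): Promise<void> {
  for (const sql of MIGRATION_003.statements) {
    await db.queryRun(sql); // a failure here leaves later statements unapplied
  }
}
```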
package/src/db/migrations.ts
CHANGED

@@ -2,6 +2,7 @@ import { logger } from "../output/logger.ts";
 import type { DbConnection } from "./connection.ts";
 import { MIGRATION_001 } from "./migrations/001-init.ts";
 import { MIGRATION_002 } from "./migrations/002-fts.ts";
+import { MIGRATION_003 } from "./migrations/003-downloader-columns.ts";
 
 /**
  * One DDL/DML migration step. The id is monotonically increasing; the name
@@ -14,7 +15,7 @@ export interface Migration {
   statements: string[];
 }
 
-const MIGRATIONS: Migration[] = [MIGRATION_001, MIGRATION_002];
+const MIGRATIONS: Migration[] = [MIGRATION_001, MIGRATION_002, MIGRATION_003];
 
 /**
  * Process-level cache of paths whose migrations have been applied (or
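The doc comment on `Migration` says ids are monotonically increasing, which the array order now has to respect. A hypothetical sanity check (not in the package) of what that invariant means:

```ts
// Hypothetical guard: registered migrations must keep strictly increasing ids.
function assertMonotonic(migrations: { id: number; name: string }[]): void {
  for (let i = 1; i < migrations.length; i++) {
    const prev = migrations[i - 1]!;
    const cur = migrations[i]!;
    if (cur.id <= prev.id) {
      throw new Error(`migration ${cur.name} (id ${cur.id}) out of order after ${prev.name}`);
    }
  }
}
```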
package/src/ingest/converter/index.ts
CHANGED

@@ -6,6 +6,7 @@ import { convertWithLlm } from "./llm.ts";
 import { ocrImage } from "./ocr.ts";
 import { convertPdf, shouldOcrPdf } from "./pdf.ts";
 import { convertText } from "./text.ts";
+import { convertXlsx } from "./xlsx.ts";
 
 export interface ConvertResult {
   markdown: string;
@@ -25,6 +26,10 @@ const STRUCTURED_TEXT_MIMES = new Set([
   "application/typescript",
 ]);
 const DOCX_MIMES = new Set(["application/vnd.openxmlformats-officedocument.wordprocessingml.document"]);
+const XLSX_MIMES = new Set([
+  "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
+  "application/vnd.ms-excel",
+]);
 const PDF_MIMES = new Set(["application/pdf"]);
 
 /**
@@ -54,6 +59,10 @@ export async function convert(
     return { markdown: await convertDocx(bytes), contentMimeType: "text/markdown" };
   }
 
+  if (XLSX_MIMES.has(mt)) {
+    return { markdown: await convertXlsx(bytes), contentMimeType: "text/markdown" };
+  }
+
   if (PDF_MIMES.has(mt)) {
     const conversion = await convertPdf(bytes);
     if (!shouldOcrPdf(conversion)) {
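Isolated for clarity, the new dispatch step looks like this. The real `convert()` signature is truncated in the diff, so the `bytes`/`mimeType` parameter names here are assumptions:

```ts
// Sketch of the branch this hunk adds to the mime dispatcher.
import { convertXlsx } from "./xlsx.ts";

const XLSX_MIMES = new Set([
  "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
  "application/vnd.ms-excel",
]);

async function maybeConvertSpreadsheet(bytes: Uint8Array, mimeType: string): Promise<string | null> {
  // Match on the normalized mime, hand the raw bytes to the SheetJS-backed
  // converter, emit markdown; return null so other branches can try.
  return XLSX_MIMES.has(mimeType) ? convertXlsx(bytes) : null;
}
```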
package/src/ingest/converter/xlsx.ts
ADDED

@@ -0,0 +1,111 @@
+import * as XLSX from "xlsx";
+
+export interface ConvertXlsxOptions {
+  /** Optional sublabel callback driven per-sheet (`parsing 3/8 tabs`). */
+  onProgress?: (sublabel: string) => void;
+}
+
+/**
+ * Convert an XLSX workbook into markdown — one `## <SheetName>`
+ * section per tab, each tab rendered as a GitHub-flavored pipe table
+ * with the first non-empty row treated as the header. Empty sheets
+ * are skipped. Cell values are stringified (numbers, dates, formulas
+ * use their displayed value via `XLSX.utils.format_cell`).
+ *
+ * Pure-JS via SheetJS — no native deps, bundles cleanly with
+ * `bun build --compile`. Yields a macrotask between sheets so
+ * nanospinner's setInterval keeps animating during big workbooks
+ * (otherwise the spinner visibly freezes).
+ */
+export async function convertXlsx(bytes: Uint8Array, opts: ConvertXlsxOptions = {}): Promise<string> {
+  const workbook = XLSX.read(bytes, { type: "array", cellDates: true });
+  const sections: string[] = [];
+  const sheetNames = workbook.SheetNames;
+
+  for (let i = 0; i < sheetNames.length; i++) {
+    const sheetName = sheetNames[i] as string;
+    opts.onProgress?.(`parsing ${i + 1}/${sheetNames.length} tabs`);
+    const sheet = workbook.Sheets[sheetName];
+    if (sheet) {
+      const rows = sheetToMatrix(sheet);
+      const trimmed = trimEmptyEdges(rows);
+      if (trimmed.length > 0) sections.push(`## ${sheetName}\n\n${renderTable(trimmed)}`);
+    }
+    // Yield so the spinner can repaint between sheets — large
+    // workbooks would otherwise freeze the UI for the duration of
+    // the parse.
+    await new Promise<void>((resolve) => setTimeout(resolve, 0));
+  }
+
+  if (sections.length === 0) return "(empty workbook)";
+  return sections.join("\n\n");
+}
+
+/**
+ * Walk every cell in the sheet's used range and produce a 2-D array
+ * of display strings. Uses the cell's formatted text (e.g. dates as
+ * "2026-05-09", percentages as "12.5%") rather than raw values, so
+ * the markdown matches what a human sees in the spreadsheet.
+ */
+function sheetToMatrix(sheet: XLSX.WorkSheet): string[][] {
+  if (!sheet["!ref"]) return [];
+  const range = XLSX.utils.decode_range(sheet["!ref"]);
+  const out: string[][] = [];
+  for (let r = range.s.r; r <= range.e.r; r++) {
+    const row: string[] = [];
+    for (let c = range.s.c; c <= range.e.c; c++) {
+      const addr = XLSX.utils.encode_cell({ r, c });
+      const cell = sheet[addr];
+      row.push(cell ? XLSX.utils.format_cell(cell) : "");
+    }
+    out.push(row);
+  }
+  return out;
+}
+
+/**
+ * Drop fully-empty leading/trailing rows and columns. Spreadsheets
+ * commonly have the used range padded out beyond the actual data.
+ */
+function trimEmptyEdges(rows: string[][]): string[][] {
+  if (rows.length === 0) return rows;
+  let firstRow = 0;
+  let lastRow = rows.length - 1;
+  while (firstRow <= lastRow && rows[firstRow]?.every((v) => v === "")) firstRow++;
+  while (lastRow >= firstRow && rows[lastRow]?.every((v) => v === "")) lastRow--;
+  if (firstRow > lastRow) return [];
+  const sliced = rows.slice(firstRow, lastRow + 1);
+  const cols = sliced[0]?.length ?? 0;
+  let firstCol = 0;
+  let lastCol = cols - 1;
+  while (firstCol <= lastCol && sliced.every((r) => (r[firstCol] ?? "") === "")) firstCol++;
+  while (lastCol >= firstCol && sliced.every((r) => (r[lastCol] ?? "") === "")) lastCol--;
+  if (firstCol > lastCol) return [];
+  return sliced.map((r) => r.slice(firstCol, lastCol + 1));
+}
+
+/**
+ * Render a 2-D matrix as a GitHub pipe table. The first row becomes
+ * the header. Pipe and newline characters in cells are escaped so
+ * they don't break the table layout.
+ */
+function renderTable(rows: string[][]): string {
+  const colCount = Math.max(...rows.map((r) => r.length));
+  const norm = rows.map((r) => {
+    const padded = [...r];
+    while (padded.length < colCount) padded.push("");
+    return padded.map(escapeCell);
+  });
+  const lines: string[] = [];
+  const header = norm[0] ?? Array(colCount).fill("");
+  lines.push(`| ${header.join(" | ")} |`);
+  lines.push(`| ${Array(colCount).fill("---").join(" | ")} |`);
+  for (let i = 1; i < norm.length; i++) {
+    lines.push(`| ${(norm[i] as string[]).join(" | ")} |`);
+  }
+  return lines.join("\n");
+}
+
+function escapeCell(value: string): string {
+  return value.replace(/\|/g, "\\|").replace(/\r?\n/g, " ");
+}
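A usage sketch, converting a workbook from disk with the per-sheet progress callback. Bun's file API is assumed (the package builds with `bun build --compile`), and the filename is illustrative:

```ts
import { convertXlsx } from "./xlsx.ts";

const bytes = new Uint8Array(await Bun.file("report.xlsx").arrayBuffer());
const markdown = await convertXlsx(bytes, {
  onProgress: (label) => console.log(label), // "parsing 1/8 tabs", ...
});
console.log(markdown); // one "## SheetName" pipe table per non-empty tab
```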
package/src/ingest/downloaders/browser.ts
ADDED

@@ -0,0 +1,180 @@
+import { mkdir } from "node:fs/promises";
+import type { APIRequestContext, BrowserContext, Page } from "playwright";
+import { HelpfulError } from "../../errors.ts";
+
+let chromiumModule: typeof import("playwright").chromium | null = null;
+
+/**
+ * Lazy-import `playwright.chromium`. Keeping the import deferred so the
+ * heavy module isn't loaded on cold paths (e.g. `membot list`); ALSO
+ * lets us produce a `HelpfulError` if Playwright isn't installed yet
+ * instead of a stack trace at module-load time.
+ */
+async function loadChromium(): Promise<typeof import("playwright").chromium> {
+  if (chromiumModule) return chromiumModule;
+  try {
+    const playwright = await import("playwright");
+    chromiumModule = playwright.chromium;
+    return chromiumModule;
+  } catch (err) {
+    throw new HelpfulError({
+      kind: "internal_error",
+      message: `failed to load playwright: ${err instanceof Error ? err.message : String(err)}`,
+      hint: "Run `bun add -g membot` to reinstall, then `bunx playwright install chromium` to fetch the browser binary.",
+    });
+  }
+}
+
+export interface BrowserPoolOptions {
+  userDataDir: string;
+  headless?: boolean;
+}
+
+/**
+ * Process-scoped lazy-launched chromium context backed by a *persistent
+ * profile directory* (`launchPersistentContext`). Persistent profiles
+ * survive cookies, localStorage, sessionStorage, IndexedDB, and service
+ * worker state across runs — necessary for SPA-heavy services like
+ * Linear that stash critical session/sync state in IndexedDB (which
+ * the lighter `storageState` JSON snapshot doesn't capture).
+ *
+ * Trade-offs:
+ * - The profile is a directory, not a single JSON file (a few MBs).
+ * - Chromium's single-instance lock means only one BrowserPool can
+ *   have the profile open at a time. Sequential `membot add` calls
+ *   are fine; concurrent CLI processes against the same profile will
+ *   fail to launch.
+ */
+export class BrowserPool {
+  private readonly userDataDir: string;
+  private readonly headless: boolean;
+  private context: BrowserContext | null = null;
+
+  constructor(options: BrowserPoolOptions) {
+    this.userDataDir = options.userDataDir;
+    this.headless = options.headless ?? true;
+  }
+
+  /**
+   * Lazy-init the persistent context. The first call launches
+   * chromium against `userDataDir` (creating it if needed); subsequent
+   * calls reuse the same context so cookies, IDB, and inflight
+   * navigation state stay shared across downloaders within one run.
+   */
+  private async ensureContext(): Promise<BrowserContext> {
+    if (this.context) return this.context;
+    const chromium = await loadChromium();
+    await mkdir(this.userDataDir, { recursive: true });
+    try {
+      this.context = await chromium.launchPersistentContext(this.userDataDir, {
+        headless: this.headless,
+      });
+    } catch (err) {
+      throw new HelpfulError({
+        kind: "internal_error",
+        message: `chromium failed to launch: ${err instanceof Error ? err.message : String(err)}`,
+        hint: this.headless
+          ? "Run `bunx playwright install chromium` to download the browser binary."
+          : "Close any other membot process holding the browser profile, then retry.",
+      });
+    }
+    return this.context;
+  }
+
+  /** Return the request context for downloaders that just need authenticated HTTP. */
+  async request(): Promise<APIRequestContext> {
+    const ctx = await this.ensureContext();
+    return ctx.request;
+  }
+
+  /** Open a fresh page (caller is responsible for `page.close()`). */
+  async newPage(): Promise<Page> {
+    const ctx = await this.ensureContext();
+    return ctx.newPage();
+  }
+
+  /**
+   * How many cookies are in the live context. Used by the auth-prompt
+   * flow to detect "user closed the window without logging in" — must
+   * be called BEFORE `dispose()` since the context closes its own
+   * stores when shutting down.
+   */
+  async cookieCount(): Promise<number> {
+    if (!this.context) return 0;
+    try {
+      const cookies = await this.context.cookies();
+      return cookies.length;
+    } catch {
+      return 0;
+    }
+  }
+
+  /**
+   * Return the cookies stored in the persistent profile for a given
+   * URL/origin (or all cookies when omitted). Used by downloaders that
+   * call services with their own HTTP client (e.g. Node's built-in
+   * `fetch`) — they read the cookies once here and pass them via a
+   * `Cookie` header. Bypasses Playwright's APIRequestContext, which
+   * has a known cookie-parser bug on Google's same-origin redirects.
+   */
+  async cookieHeader(url: string): Promise<string> {
+    const ctx = await this.ensureContext();
+    const cookies = await ctx.cookies(url);
+    return cookies.map((c) => `${c.name}=${c.value}`).join("; ");
+  }
+
+  /**
+   * Resolve when the user is "done" with the headed browser session,
+   * detected as: the supplied page closes, OR its context closes, OR
+   * the underlying browser disconnects — whichever fires first. We
+   * can't rely on the browser-disconnect event alone: on macOS,
+   * closing the last window does NOT quit chromium (the app stays
+   * alive in the background), so the disconnect event never fires
+   * and the caller hangs forever. The page-close event is the only
+   * signal that's consistent across macOS, Linux, and Windows.
+   */
+  async waitForUserDone(page: Page): Promise<void> {
+    const ctx = page.context();
+    const browser = ctx.browser();
+    await new Promise<void>((resolve) => {
+      let done = false;
+      const finish = () => {
+        if (done) return;
+        done = true;
+        resolve();
+      };
+      page.on("close", finish);
+      ctx.on("close", finish);
+      browser?.on("disconnected", finish);
+      if (page.isClosed() || (browser && !browser.isConnected())) finish();
+    });
+  }
+
+  /** Close the context (which releases the userDataDir lock). Idempotent. */
+  async dispose(): Promise<void> {
+    try {
+      await this.context?.close();
+    } catch {
+      // best-effort
+    }
+    this.context = null;
+  }
+}
+
+/**
+ * Resolve `maybeRelative` against `base` and return a `URL`, or `null`
+ * if neither parses. Playwright's `APIResponse.url()` sometimes hands
+ * back a path-only string (`"/"`) instead of an absolute URL after a
+ * same-origin redirect — every downloader that wants to inspect the
+ * final URL goes through this helper so the relative-URL handling
+ * lives in one place. Login-redirect detection itself is each
+ * downloader's responsibility — it's the only code that knows which
+ * host its export endpoint redirects to when the session is missing.
+ */
+export function safeResolveUrl(maybeRelative: string, base: string): URL | null {
+  try {
+    return new URL(maybeRelative, base);
+  } catch {
+    return null;
+  }
+}
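A usage sketch: one pool per CLI invocation, shared across downloaders, disposed at the end to release Chromium's profile lock. The profile path is illustrative, not the package's real default:

```ts
import { BrowserPool } from "./browser.ts";

const pool = new BrowserPool({ userDataDir: "/tmp/membot-profile", headless: true });
try {
  // Read profile cookies once and hand them to a plain fetch(), which is
  // the workaround the cookieHeader doc comment describes.
  const cookieHeader = await pool.cookieHeader("https://docs.google.com/");
  await fetch("https://docs.google.com/some-export-endpoint", {
    headers: { Cookie: cookieHeader },
  });
} finally {
  await pool.dispose(); // idempotent; frees the single-instance lock
}
```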
package/src/ingest/downloaders/generic-web.ts
ADDED

@@ -0,0 +1,81 @@
+import { HelpfulError } from "../../errors.ts";
+import { sha256Hex } from "../local-reader.ts";
+import type { DownloadedRemote, Downloader } from "./index.ts";
+
+/**
+ * Catch-all downloader. Always matches HTTP/HTTPS URLs that no
+ * specific downloader claimed. Strategy:
+ * - Issue an authenticated GET via Playwright's request context
+ *   (cookies from `membot login` flow through automatically).
+ * - If the server returned `text/html`, the page is probably a SPA
+ *   or auth-gated render — open a real `page`, wait for
+ *   `networkidle`, and `page.pdf()` the visible result. The rendered
+ *   PDF goes through `convertPdf` so SPAs and login-walled docs
+ *   work uniformly.
+ * - Otherwise the response IS the file (markdown, JSON, PDF, image,
+ *   docx, …) — return its bytes verbatim and let the mime
+ *   dispatcher pick the right native converter.
+ *
+ * This is what gives "no specific downloader needed" coverage to any
+ * URL the user throws at `membot add`.
+ */
+export const genericWebDownloader: Downloader = {
+  name: "generic-web",
+  description:
+    "Catch-all for any URL no other downloader handled — HEAD/GET, then either page.pdf() the rendered HTML or stream the raw bytes through the mime converter.",
+  matches(url) {
+    return url.protocol === "http:" || url.protocol === "https:";
+  },
+  async download(url, ctx): Promise<DownloadedRemote> {
+    ctx.onProgress?.("fetching");
+    const request = await ctx.pool.request();
+    const response = await request.get(url.toString(), { timeout: 30_000 });
+    // As the catch-all we don't know which login page each unknown
+    // service redirects to. If the user lands on a rendered login
+    // page, it goes through the print-to-PDF path and they'll see
+    // an obviously-wrong "Sign in" PDF — the cue to run `membot login`.
+    // Specific downloaders own auth-redirect detection for the services
+    // they understand.
+    if (!response.ok() && response.status() !== 304) {
+      throw new HelpfulError({
+        kind: "network_error",
+        message: `HTTP ${response.status()} ${response.statusText()}: ${url.toString()}`,
+        hint: "Open the URL in your browser to verify it exists. For auth-gated content, run `membot login` first.",
+      });
+    }
+    const headers = response.headers();
+    const contentType =
+      (headers["content-type"] ?? "application/octet-stream").split(";")[0]?.trim() ?? "application/octet-stream";
+
+    if (contentType === "text/html" || contentType === "application/xhtml+xml") {
+      const page = await ctx.pool.newPage();
+      try {
+        ctx.onProgress?.("rendering page");
+        await page.goto(url.toString(), { waitUntil: "networkidle", timeout: 45_000 });
+        ctx.onProgress?.("printing to pdf");
+        const pdfBuf = await page.pdf({ format: "A4", printBackground: true, preferCSSPageSize: false });
+        const bytes = new Uint8Array(pdfBuf);
+        return {
+          bytes,
+          sha256: sha256Hex(pdfBuf),
+          mimeType: "application/pdf",
+          downloader: "generic-web",
+          downloaderArgs: { rendered: true, source_content_type: contentType },
+          sourceUrl: url.toString(),
+        };
+      } finally {
+        await page.close().catch(() => {});
+      }
+    }
+
+    const body = Buffer.from(await response.body());
+    return {
+      bytes: new Uint8Array(body),
+      sha256: sha256Hex(body),
+      mimeType: contentType,
+      downloader: "generic-web",
+      downloaderArgs: { rendered: false, source_content_type: contentType },
+      sourceUrl: url.toString(),
+    };
+  },
+};
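A sketch of the dispatch the `downloaders/index.ts` registry (added in this release but not shown in this diff) presumably performs: try each specific downloader's `matches()` first, and fall back to generic-web last, since it matches every http(s) URL:

```ts
import type { Downloader } from "./index.ts";
import { genericWebDownloader } from "./generic-web.ts";

// Hypothetical selection order, grounded in the "no other downloader
// handled" wording above: specific downloaders win, catch-all goes last.
function pickDownloader(specific: Downloader[], url: URL): Downloader | null {
  for (const d of specific) {
    if (d.matches(url)) return d;
  }
  return genericWebDownloader.matches(url) ? genericWebDownloader : null;
}
```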