vault-fetch 0.3.0 → 0.3.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.ja.md +4 -1
- package/README.md +4 -1
- package/dist/cli.js +7 -2
- package/dist/cli.js.map +1 -1
- package/dist/pdf-converter-FM6DCBO5.js +69 -0
- package/dist/pdf-converter-FM6DCBO5.js.map +1 -0
- package/package.json +1 -1
- package/dist/pdf-converter-U3SFA2HY.js +0 -42
- package/dist/pdf-converter-U3SFA2HY.js.map +0 -1
package/README.ja.md
CHANGED
|
@@ -63,7 +63,9 @@ vault-fetch fetch https://example.com/report.pdf --dest ~/Documents/Obsidian/Cli
|
|
|
63
63
|
|
|
64
64
|
サーバーが `Content-Type: application/pdf` を返す場合、vault-fetch は自動的に PDF をダウンロードし、[pdf2md](https://github.com/opendocsg/pdf2md) で Markdown に変換します。追加のフラグは不要です。
|
|
65
65
|
|
|
66
|
-
-
|
|
66
|
+
- タイトルは優先順位に従って抽出: PDF メタデータ(`dc:title` / `info.Title`)> Markdown の最初の `#` 見出し > URL のファイル名
|
|
67
|
+
- 著者・公開日も PDF メタデータから抽出(存在する場合)
|
|
68
|
+
- 自動抽出が不正確な場合は `--title` で手動指定可能
|
|
67
69
|
- `--selector` および `--raw` オプションは PDF URL と併用不可
|
|
68
70
|
- セッション機能により認証付き PDF のダウンロードにも対応
|
|
69
71
|
|
|
@@ -83,6 +85,7 @@ vault-fetch login https://note.com
|
|
|
83
85
|
| オプション | 説明 |
|
|
84
86
|
|---|---|
|
|
85
87
|
| `--dest <path>` | 保存先ディレクトリ(必須) |
|
|
88
|
+
| `--title <text>` | タイトル/ファイル名を手動指定 |
|
|
86
89
|
| `--headed` | ブラウザを表示して実行 |
|
|
87
90
|
| `--selector <css>` | CSS セレクタで要素を抽出 |
|
|
88
91
|
| `--timeout <sec>` | タイムアウト秒数(デフォルト: 30) |
|
package/README.md
CHANGED
|
@@ -63,7 +63,9 @@ vault-fetch fetch https://example.com/report.pdf --dest ~/Documents/Obsidian/Cli
|
|
|
63
63
|
|
|
64
64
|
When the server returns `Content-Type: application/pdf`, vault-fetch automatically downloads the PDF and converts it to Markdown using [pdf2md](https://github.com/opendocsg/pdf2md). No additional flags are needed.
|
|
65
65
|
|
|
66
|
-
- Title is extracted
|
|
66
|
+
- Title is extracted in priority order: PDF metadata (`dc:title` / `info.Title`) > first `#` heading in converted Markdown > URL filename
|
|
67
|
+
- Author and published date are also extracted from PDF metadata when available
|
|
68
|
+
- Use `--title` to manually override the title if automatic extraction is inaccurate
|
|
67
69
|
- `--selector` and `--raw` options cannot be used with PDF URLs
|
|
68
70
|
- Session support works with authenticated PDF downloads
|
|
69
71
|
|
|
@@ -83,6 +85,7 @@ Subsequent `fetch` commands will automatically use the saved session for that do
|
|
|
83
85
|
| Option | Description |
|
|
84
86
|
|---|---|
|
|
85
87
|
| `--dest <path>` | Destination directory (required) |
|
|
88
|
+
| `--title <text>` | Override the page title for the output filename |
|
|
86
89
|
| `--headed` | Run with browser visible |
|
|
87
90
|
| `--selector <css>` | Extract elements by CSS selector |
|
|
88
91
|
| `--timeout <sec>` | Timeout in seconds (default: 30) |
|
package/dist/cli.js
CHANGED
|
@@ -96,6 +96,7 @@ function resolveConfig(cliOptions, configPath) {
|
|
|
96
96
|
waitUntil,
|
|
97
97
|
headed: cliOptions.headed ?? false,
|
|
98
98
|
selector: cliOptions.selector ?? null,
|
|
99
|
+
title: cliOptions.title ?? null,
|
|
99
100
|
noSession: cliOptions.noSession ?? false,
|
|
100
101
|
dryRun: cliOptions.dryRun ?? false,
|
|
101
102
|
blockImages: cliOptions.blockImages ?? true,
|
|
@@ -405,7 +406,7 @@ program.command("fetch").description("Fetch a page and save as Markdown").argume
|
|
|
405
406
|
}, []).option(
|
|
406
407
|
"--wait-until <event>",
|
|
407
408
|
"Wait condition: load, domcontentloaded, networkidle"
|
|
408
|
-
).option("--skip-session", "Do not use saved session").option("--dry-run", "Output to stdout instead of saving").option("--no-block-images", "Do not block image requests").option("--no-block-fonts", "Do not block font requests").option("--no-block-media", "Do not block media requests").option("--raw", "Convert full page HTML without Readability extraction").action(async (url, options) => {
|
|
409
|
+
).option("--skip-session", "Do not use saved session").option("--dry-run", "Output to stdout instead of saving").option("--no-block-images", "Do not block image requests").option("--no-block-fonts", "Do not block font requests").option("--no-block-media", "Do not block media requests").option("--raw", "Convert full page HTML without Readability extraction").option("--title <text>", "Override the page title for the output filename").action(async (url, options) => {
|
|
409
410
|
try {
|
|
410
411
|
const configPath = existsSync2(CONFIG_PATH) ? CONFIG_PATH : void 0;
|
|
411
412
|
const config = resolveConfig(
|
|
@@ -416,6 +417,7 @@ program.command("fetch").description("Fetch a page and save as Markdown").argume
|
|
|
416
417
|
waitUntil: options.waitUntil,
|
|
417
418
|
headed: options.headed,
|
|
418
419
|
selector: options.selector,
|
|
420
|
+
title: options.title,
|
|
419
421
|
noSession: options.skipSession,
|
|
420
422
|
dryRun: options.dryRun,
|
|
421
423
|
blockImages: options.blockImages,
|
|
@@ -442,7 +444,7 @@ program.command("fetch").description("Fetch a page and save as Markdown").argume
|
|
|
442
444
|
if (config.raw) {
|
|
443
445
|
throw new Error("--raw cannot be used with PDF URLs.");
|
|
444
446
|
}
|
|
445
|
-
const { convertPdfToMarkdown } = await import("./pdf-converter-
|
|
447
|
+
const { convertPdfToMarkdown } = await import("./pdf-converter-FM6DCBO5.js");
|
|
446
448
|
const pdfResult = await convertPdfToMarkdown(
|
|
447
449
|
fetchResult.pdfBuffer,
|
|
448
450
|
fetchResult.finalUrl
|
|
@@ -460,6 +462,9 @@ program.command("fetch").description("Fetch a page and save as Markdown").argume
|
|
|
460
462
|
metadata = result.metadata;
|
|
461
463
|
markdown = convertToMarkdown(result.content);
|
|
462
464
|
}
|
|
465
|
+
if (config.title !== null) {
|
|
466
|
+
metadata = { ...metadata, title: config.title };
|
|
467
|
+
}
|
|
463
468
|
if (config.dryRun) {
|
|
464
469
|
const frontmatter = buildFrontmatter(metadata, config.tags);
|
|
465
470
|
process.stdout.write(`${frontmatter}
|
package/dist/cli.js.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"sources":["../src/cli.ts","../src/config.ts","../src/fetcher.ts","../src/session.ts","../src/extractor.ts","../src/converter.ts","../src/writer.ts"],"sourcesContent":["import { Command } from \"commander\";\nimport { once } from \"node:events\";\nimport { existsSync } from \"node:fs\";\nimport { homedir } from \"node:os\";\nimport { join } from \"node:path\";\nimport { resolveConfig } from \"./config.js\";\nimport { fetchPage } from \"./fetcher.js\";\nimport { extract, extractMetadata } from \"./extractor.js\";\nimport { convertToMarkdown } from \"./converter.js\";\nimport { writeMarkdownFile, buildFrontmatter } from \"./writer.js\";\nimport {\n getSessionDir,\n getSessionPath,\n ensureSessionDir,\n} from \"./session.js\";\nimport type { Metadata, WaitUntilOption } from \"./types.js\";\n\nconst CONFIG_PATH = join(homedir(), \".config\", \"vault-fetch\", \"config.yaml\");\n\nconst program = new Command();\n\nprogram\n .name(\"vault-fetch\")\n .description(\n \"Fetch JS-rendered web pages and save as Markdown to Obsidian Vault\",\n )\n .version(\"0.3.0\");\n\nprogram\n .command(\"fetch\")\n .description(\"Fetch a page and save as Markdown\")\n .argument(\"<url>\", \"URL to fetch\")\n .option(\"--dest <path>\", \"Destination directory\")\n .option(\"--headed\", \"Run browser in headed mode\")\n .option(\"--selector <css>\", \"CSS selector to extract\")\n .option(\"--timeout <seconds>\", \"Timeout in seconds\", parseInt)\n .option(\"--tag <name>\", \"Add tag (repeatable)\", (val: string, acc: string[]) => {\n acc.push(val);\n return acc;\n }, [] as string[])\n .option(\n \"--wait-until <event>\",\n \"Wait condition: load, domcontentloaded, networkidle\",\n )\n .option(\"--skip-session\", \"Do not use saved session\")\n .option(\"--dry-run\", \"Output to stdout instead of saving\")\n .option(\"--no-block-images\", \"Do not block image requests\")\n .option(\"--no-block-fonts\", \"Do not block font requests\")\n .option(\"--no-block-media\", \"Do not block media requests\")\n .option(\"--raw\", \"Convert full page HTML without Readability extraction\")\n .action(async (url: string, options: Record<string, unknown>) => {\n try {\n const configPath = existsSync(CONFIG_PATH) ? CONFIG_PATH : undefined;\n const config = resolveConfig(\n {\n dest: options.dest as string | undefined,\n tags: options.tag as string[] | undefined,\n timeout: options.timeout as number | undefined,\n waitUntil: options.waitUntil as WaitUntilOption | undefined,\n headed: options.headed as boolean | undefined,\n selector: options.selector as string | undefined,\n noSession: options.skipSession as boolean | undefined,\n dryRun: options.dryRun as boolean | undefined,\n blockImages: options.blockImages as boolean | undefined,\n blockFonts: options.blockFonts as boolean | undefined,\n blockMedia: options.blockMedia as boolean | undefined,\n raw: options.raw as boolean | undefined,\n },\n configPath,\n );\n\n if (config.raw && config.selector) {\n throw new Error(\"--raw and --selector cannot be used together.\");\n }\n\n // Validate dest directory exists\n if (!config.dryRun && !existsSync(config.dest)) {\n throw new Error(`Destination directory does not exist: ${config.dest}`);\n }\n\n const sessionsDir = getSessionDir();\n const fetchResult = await fetchPage(url, config, sessionsDir);\n\n let markdown: string;\n let metadata: Metadata;\n\n if (fetchResult.kind === \"pdf\") {\n if (config.selector) {\n throw new Error(\"--selector cannot be used with PDF URLs.\");\n }\n if (config.raw) {\n throw new Error(\"--raw cannot be used with PDF URLs.\");\n }\n const { convertPdfToMarkdown } = await import(\"./pdf-converter.js\");\n const pdfResult = await convertPdfToMarkdown(\n fetchResult.pdfBuffer,\n fetchResult.finalUrl,\n );\n markdown = pdfResult.markdown;\n metadata = pdfResult.metadata;\n } else if (config.selector) {\n // --selector mode: skip Readability, extract metadata from full page\n metadata = extractMetadata(fetchResult.fullHtml, fetchResult.finalUrl);\n markdown = convertToMarkdown(fetchResult.html);\n } else if (config.raw) {\n // --raw mode: skip Readability, convert full page HTML directly\n metadata = extractMetadata(fetchResult.fullHtml, fetchResult.finalUrl);\n markdown = convertToMarkdown(fetchResult.fullHtml);\n } else {\n const result = extract(fetchResult.html, fetchResult.finalUrl);\n metadata = result.metadata;\n markdown = convertToMarkdown(result.content);\n }\n\n if (config.dryRun) {\n const frontmatter = buildFrontmatter(metadata, config.tags);\n process.stdout.write(`${frontmatter}\\n\\n${markdown}\\n`);\n } else {\n const filePath = writeMarkdownFile(\n config.dest,\n metadata,\n markdown,\n config.tags,\n );\n console.error(`Saved: ${filePath}`);\n }\n } catch (error) {\n const message = error instanceof Error ? error.message : String(error);\n console.error(`Error: ${message}`);\n process.exit(1);\n }\n });\n\nprogram\n .command(\"login\")\n .description(\"Login to a site and save session\")\n .argument(\"<url>\", \"URL to login\")\n .option(\"--timeout <seconds>\", \"Login timeout in seconds\", parseInt)\n .action(async (url: string, options: Record<string, unknown>) => {\n const { chromium } = await import(\"playwright\");\n const sessionsDir = getSessionDir();\n ensureSessionDir(sessionsDir);\n\n const timeoutSec = (options.timeout as number | undefined) ?? 300;\n const browser = await chromium.launch({ headless: false });\n\n try {\n const context = await browser.newContext();\n const page = await context.newPage();\n\n await page.goto(url, { waitUntil: \"networkidle\", timeout: timeoutSec * 1000 });\n\n console.error(\"Browser opened. Log in manually, then press Enter here to save session.\");\n\n process.stdin.resume();\n await once(process.stdin, \"data\");\n process.stdin.pause();\n process.stdin.unref();\n\n const sessionPath = getSessionPath(url, sessionsDir);\n await context.storageState({ path: sessionPath });\n console.error(`Session saved: ${sessionPath}`);\n } catch (error) {\n const message = error instanceof Error ? error.message : String(error);\n console.error(`Error: ${message}`);\n process.exit(1);\n } finally {\n await browser.close();\n }\n });\n\nprogram.parse();\n","import { readFileSync } from \"node:fs\";\nimport { homedir } from \"node:os\";\nimport { resolve } from \"node:path\";\nimport yaml from \"js-yaml\";\nimport type { ResolvedConfig, WaitUntilOption } from \"./types.js\";\n\nconst DEFAULT_TIMEOUT = 30;\nconst DEFAULT_WAIT_UNTIL: WaitUntilOption = \"networkidle\";\nconst REQUIRED_TAG = \"clippings\";\n\ninterface FileConfig {\n dest?: string;\n tags?: string[];\n timeout?: number;\n waitUntil?: WaitUntilOption;\n}\n\ninterface CliOptions {\n dest?: string;\n tags?: string[];\n timeout?: number;\n waitUntil?: WaitUntilOption;\n headed?: boolean;\n selector?: string;\n noSession?: boolean;\n dryRun?: boolean;\n blockImages?: boolean;\n blockFonts?: boolean;\n blockMedia?: boolean;\n raw?: boolean;\n}\n\nfunction expandTilde(filePath: string): string {\n if (filePath.startsWith(\"~/\")) {\n return resolve(homedir(), filePath.slice(2));\n }\n return filePath;\n}\n\nconst VALID_WAIT_UNTIL: readonly string[] = [\"load\", \"domcontentloaded\", \"networkidle\"];\n\nfunction validateWaitUntil(value: string): WaitUntilOption {\n if (!VALID_WAIT_UNTIL.includes(value)) {\n throw new Error(\n `Invalid waitUntil value: \"${value}\". Must be one of: ${VALID_WAIT_UNTIL.join(\", \")}`,\n );\n }\n return value as WaitUntilOption;\n}\n\nfunction loadConfigFile(configPath: string): FileConfig {\n const content = readFileSync(configPath, \"utf-8\");\n const parsed = yaml.load(content);\n if (parsed === null || typeof parsed !== \"object\") {\n throw new Error(`Invalid config file: ${configPath}`);\n }\n const config = parsed as Record<string, unknown>;\n\n if (config.timeout !== undefined && typeof config.timeout !== \"number\") {\n throw new Error(`Invalid timeout in config file: expected number, got ${typeof config.timeout}`);\n }\n if (config.dest !== undefined && typeof config.dest !== \"string\") {\n throw new Error(`Invalid dest in config file: expected string, got ${typeof config.dest}`);\n }\n if (config.waitUntil !== undefined) {\n if (typeof config.waitUntil !== \"string\") {\n throw new Error(`Invalid waitUntil in config file: expected string, got ${typeof config.waitUntil}`);\n }\n validateWaitUntil(config.waitUntil);\n }\n if (config.tags !== undefined) {\n if (!Array.isArray(config.tags) || !config.tags.every((t: unknown) => typeof t === \"string\")) {\n throw new Error(\"Invalid tags in config file: expected array of strings\");\n }\n }\n\n return config as FileConfig;\n}\n\nexport function resolveConfig(\n cliOptions: CliOptions,\n configPath: string | undefined,\n): ResolvedConfig {\n // Layer 1: Config file\n let fileConfig: FileConfig = {};\n if (configPath) {\n fileConfig = loadConfigFile(configPath);\n }\n\n // Layer 2: Environment variables\n const envDest = process.env.VAULT_FETCH_DEST;\n const envTimeout = process.env.VAULT_FETCH_TIMEOUT;\n\n // Resolve each field: CLI > env > file > default\n const dest = cliOptions.dest ?? envDest ?? fileConfig.dest;\n if (dest === undefined) {\n throw new Error(\n \"dest is required. Set via --dest, VAULT_FETCH_DEST, or config file.\",\n );\n }\n\n let timeout: number;\n if (cliOptions.timeout !== undefined) {\n timeout = cliOptions.timeout;\n } else if (envTimeout !== undefined) {\n const parsed = Number(envTimeout);\n if (Number.isNaN(parsed)) {\n throw new Error(`Invalid VAULT_FETCH_TIMEOUT value: ${envTimeout}`);\n }\n timeout = parsed;\n } else {\n timeout = fileConfig.timeout ?? DEFAULT_TIMEOUT;\n }\n\n const rawWaitUntil = cliOptions.waitUntil ?? fileConfig.waitUntil ?? DEFAULT_WAIT_UNTIL;\n const waitUntil = validateWaitUntil(rawWaitUntil);\n\n // Merge tags: file tags + CLI tags + always clippings\n const allTags = [\n ...(fileConfig.tags ?? []),\n ...(cliOptions.tags ?? []),\n REQUIRED_TAG,\n ];\n const tags = [...new Set(allTags)];\n\n return {\n dest: expandTilde(dest),\n tags,\n timeout,\n waitUntil,\n headed: cliOptions.headed ?? false,\n selector: cliOptions.selector ?? null,\n noSession: cliOptions.noSession ?? false,\n dryRun: cliOptions.dryRun ?? false,\n blockImages: cliOptions.blockImages ?? true,\n blockFonts: cliOptions.blockFonts ?? true,\n blockMedia: cliOptions.blockMedia ?? true,\n raw: cliOptions.raw ?? false,\n };\n}\n","import { chromium, type BrowserContext } from \"playwright\";\nimport type { FetchResult, ResolvedConfig } from \"./types.js\";\nimport { getSessionPath, sessionExists } from \"./session.js\";\n\nexport const CHROME_USER_AGENT =\n \"Mozilla/5.0 (Windows NT 10.0; Win64; x64) \" +\n \"AppleWebKit/537.36 (KHTML, like Gecko) \" +\n \"Chrome/134.0.0.0 Safari/537.36\";\n\ninterface BlockingOptions {\n blockImages: boolean;\n blockFonts: boolean;\n blockMedia: boolean;\n}\n\nexport function isPdfContentType(contentType: string): boolean {\n return contentType.toLowerCase().includes(\"application/pdf\");\n}\n\nexport function buildBlockedResourceTypes(options: BlockingOptions): Set<string> {\n const blocked = new Set<string>();\n if (options.blockImages) blocked.add(\"image\");\n if (options.blockFonts) blocked.add(\"font\");\n if (options.blockMedia) blocked.add(\"media\");\n return blocked;\n}\n\nconst PDF_MAGIC_BYTES = \"%PDF\";\n\nexport function validatePdfBuffer(pdfBuffer: Buffer, sourceUrl: string): void {\n if (pdfBuffer.length === 0) {\n throw new Error(`Empty PDF response received from ${sourceUrl}`);\n }\n const header = pdfBuffer.subarray(0, PDF_MAGIC_BYTES.length).toString(\"ascii\");\n if (!header.startsWith(PDF_MAGIC_BYTES)) {\n throw new Error(\n `Response Content-Type is application/pdf but body is not valid PDF data from ${sourceUrl}`,\n );\n }\n}\n\nasync function downloadPdf(\n context: BrowserContext,\n url: string,\n timeoutMs: number,\n): Promise<{ pdfBuffer: Buffer; finalUrl: string }> {\n const apiResponse = await context.request.get(url, { timeout: timeoutMs });\n const status = apiResponse.status();\n if (status >= 400) {\n throw new Error(`HTTP ${status} received when downloading PDF from ${url}`);\n }\n const pdfBuffer = Buffer.from(await apiResponse.body());\n const finalUrl = apiResponse.url();\n validatePdfBuffer(pdfBuffer, finalUrl);\n return { pdfBuffer, finalUrl };\n}\n\nexport async function fetchPage(\n url: string,\n config: ResolvedConfig,\n sessionsDir: string,\n): Promise<FetchResult> {\n const browser = await chromium.launch({\n headless: !config.headed,\n });\n\n try {\n const contextOptions: Parameters<typeof browser.newContext>[0] = {\n userAgent: CHROME_USER_AGENT,\n };\n\n // Load session if available and not disabled\n if (!config.noSession && sessionExists(url, sessionsDir)) {\n const sessionPath = getSessionPath(url, sessionsDir);\n contextOptions.storageState = sessionPath;\n }\n\n const context: BrowserContext = await browser.newContext(contextOptions);\n const page = await context.newPage();\n\n // Block specified resource types for faster loading\n const blockedTypes = buildBlockedResourceTypes(config);\n if (blockedTypes.size > 0) {\n await page.route(\"**/*\", async (route) => {\n if (blockedTypes.has(route.request().resourceType())) {\n await route.abort();\n } else {\n await route.continue();\n }\n });\n }\n\n const timeoutMs = config.timeout * 1000;\n\n // page.goto throws \"Download is starting\" when the server returns\n // Content-Disposition: attachment (common for PDF downloads).\n // Catch this and download the PDF via the context's HTTP client.\n let response;\n try {\n response = await page.goto(url, {\n waitUntil: config.waitUntil,\n timeout: timeoutMs,\n });\n } catch (error) {\n if (\n error instanceof Error &&\n error.message.includes(\"Download is starting\")\n ) {\n const result = await downloadPdf(context, url, timeoutMs);\n await context.close();\n return { kind: \"pdf\", pdfBuffer: result.pdfBuffer, url, finalUrl: result.finalUrl };\n }\n throw error;\n }\n\n if (!response) {\n throw new Error(`No response received from ${url}`);\n }\n\n const status = response.status();\n if (status >= 400) {\n throw new Error(`HTTP ${status} received from ${response.url()}`);\n }\n\n const finalUrl = response.url();\n const contentType = response.headers()[\"content-type\"] ?? \"\";\n\n // Inline PDF (Content-Disposition: inline or absent).\n // Try response.body() first; fall back to context.request if the\n // browser returned its PDF viewer HTML instead of the actual bytes.\n if (isPdfContentType(contentType)) {\n const body = await response.body();\n try {\n validatePdfBuffer(body, finalUrl);\n await context.close();\n return { kind: \"pdf\", pdfBuffer: body, url, finalUrl };\n } catch {\n // response.body() returned PDF viewer HTML; re-download via API\n const result = await downloadPdf(context, finalUrl, timeoutMs);\n await context.close();\n return { kind: \"pdf\", pdfBuffer: result.pdfBuffer, url, finalUrl: result.finalUrl };\n }\n }\n\n const fullHtml = await page.content();\n let html: string;\n\n if (config.selector) {\n const element = await page.$(config.selector);\n if (!element) {\n throw new Error(`Selector not found: ${config.selector}`);\n }\n html = await element.innerHTML();\n } else {\n html = fullHtml;\n }\n\n await context.close();\n\n return { kind: \"html\", html, fullHtml, url, finalUrl };\n } finally {\n await browser.close();\n }\n}\n","import { existsSync, mkdirSync } from \"node:fs\";\nimport { join } from \"node:path\";\nimport { homedir } from \"node:os\";\n\nconst CONFIG_DIR = join(homedir(), \".config\", \"vault-fetch\");\nconst SESSIONS_DIR = join(CONFIG_DIR, \"sessions\");\n\nexport function getSessionDir(): string {\n return SESSIONS_DIR;\n}\n\nfunction extractDomain(url: string): string {\n const parsed = new URL(url);\n return parsed.hostname ?? \"\";\n}\n\nexport function getSessionPath(url: string, sessionsDir: string): string {\n const domain = extractDomain(url);\n return join(sessionsDir, `${domain}.json`);\n}\n\nexport function sessionExists(url: string, sessionsDir: string): boolean {\n const sessionPath = getSessionPath(url, sessionsDir);\n return existsSync(sessionPath);\n}\n\nexport function ensureSessionDir(sessionsDir: string): void {\n if (!existsSync(sessionsDir)) {\n mkdirSync(sessionsDir, { recursive: true });\n }\n}\n","import { Readability } from \"@mozilla/readability\";\nimport { JSDOM } from \"jsdom\";\nimport type { Metadata } from \"./types.js\";\n\nfunction getMetaContent(doc: Document, selector: string): string | null {\n const el = doc.querySelector(selector);\n return el?.getAttribute(\"content\") ?? null;\n}\n\nfunction formatAuthor(raw: string): string {\n return `[[${raw.trim()}]]`;\n}\n\nfunction extractPublishedDate(doc: Document): string | null {\n const published =\n getMetaContent(doc, 'meta[property=\"article:published_time\"]') ??\n getMetaContent(doc, 'meta[name=\"datePublished\"]');\n\n if (!published) {\n const jsonLd = doc.querySelector('script[type=\"application/ld+json\"]');\n if (jsonLd?.textContent) {\n try {\n const data = JSON.parse(jsonLd.textContent) as Record<string, unknown>;\n if (typeof data.datePublished === \"string\") {\n return data.datePublished.split(\"T\")[0];\n }\n } catch {\n // JSON-LD parse failed\n }\n }\n return null;\n }\n\n return published.split(\"T\")[0];\n}\n\nfunction extractAuthors(\n doc: Document,\n readabilityByline: string | null,\n): string[] {\n const articleAuthors = doc.querySelectorAll('meta[property=\"article:author\"]');\n if (articleAuthors.length > 0) {\n return Array.from(articleAuthors)\n .map((el) => el.getAttribute(\"content\"))\n .filter((v): v is string => v !== null)\n .map(formatAuthor);\n }\n\n const ogAuthor = getMetaContent(doc, 'meta[property=\"og:author\"]');\n if (ogAuthor) {\n return [formatAuthor(ogAuthor)];\n }\n\n if (readabilityByline) {\n return [formatAuthor(readabilityByline)];\n }\n\n return [];\n}\n\nexport interface ExtractResult {\n metadata: Metadata;\n content: string;\n}\n\ninterface ReadabilityArticle {\n title: string;\n byline: string | null;\n excerpt: string;\n content: string;\n}\n\nfunction buildMetadata(\n doc: Document,\n article: ReadabilityArticle | null,\n finalUrl: string,\n): Metadata {\n const title = article?.title ?? doc.title;\n const authors = extractAuthors(doc, article?.byline ?? null);\n const published = extractPublishedDate(doc);\n\n const description =\n getMetaContent(doc, 'meta[property=\"og:description\"]') ??\n getMetaContent(doc, 'meta[name=\"description\"]') ??\n (article?.excerpt ?? null);\n\n const today = new Date().toISOString().split(\"T\")[0];\n\n return {\n title,\n source: finalUrl,\n author: authors,\n published,\n created: today,\n description,\n };\n}\n\nfunction parseWithReadability(html: string, url: string): ReadabilityArticle | null {\n const dom = new JSDOM(html, { url });\n const reader = new Readability(dom.window.document);\n return reader.parse() as ReadabilityArticle | null;\n}\n\nexport function extract(html: string, finalUrl: string): ExtractResult {\n const metaDom = new JSDOM(html, { url: finalUrl });\n const doc = metaDom.window.document;\n\n const article = parseWithReadability(html, finalUrl);\n\n if (!article) {\n throw new Error(\n \"Readability failed to extract content from the page. \" +\n \"Try --raw to convert the full page, or --selector <css> to target specific content.\",\n );\n }\n\n if (!article.content) {\n throw new Error(\"Readability returned empty content for the page\");\n }\n\n return {\n metadata: buildMetadata(doc, article, finalUrl),\n content: article.content,\n };\n}\n\nexport function extractMetadata(html: string, finalUrl: string): Metadata {\n const metaDom = new JSDOM(html, { url: finalUrl });\n const doc = metaDom.window.document;\n\n const article = parseWithReadability(html, finalUrl);\n\n return buildMetadata(doc, article, finalUrl);\n}\n","import TurndownService from \"turndown\";\n\nexport function convertToMarkdown(html: string): string {\n const turndown = new TurndownService({\n headingStyle: \"atx\",\n codeBlockStyle: \"fenced\",\n bulletListMarker: \"-\",\n });\n\n return turndown.turndown(html);\n}\n","import { writeFileSync } from \"node:fs\";\nimport { join } from \"node:path\";\nimport yaml from \"js-yaml\";\nimport type { Metadata } from \"./types.js\";\n\nconst UNSAFE_CHARS = /[/\\\\:*?\"<>|]/g;\nconst CONTROL_CHARS = /[\\x00-\\x1f\\x7f]/g;\nconst MAX_FILENAME_LENGTH = 200;\n\nexport function sanitizeFilename(title: string): string {\n const sanitized = title\n .replace(CONTROL_CHARS, \"\")\n .replace(UNSAFE_CHARS, \"\")\n .replace(/\\s+/g, \" \")\n .trim();\n const base = sanitized.slice(0, MAX_FILENAME_LENGTH) || \"Untitled\";\n return `${base}.md`;\n}\n\nexport function buildFrontmatter(metadata: Metadata, tags: string[]): string {\n const data: Record<string, unknown> = {\n title: metadata.title,\n source: metadata.source,\n };\n\n if (metadata.author.length > 0) {\n data.author = metadata.author;\n }\n\n if (metadata.published) {\n data.published = metadata.published;\n }\n\n data.created = metadata.created;\n\n if (metadata.description) {\n data.description = metadata.description;\n }\n\n data.tags = tags;\n\n const yamlStr = yaml.dump(data, {\n quotingType: '\"',\n forceQuotes: false,\n lineWidth: -1,\n sortKeys: false,\n });\n\n return `---\\n${yamlStr}---`;\n}\n\nexport function writeMarkdownFile(\n dest: string,\n metadata: Metadata,\n markdownContent: string,\n tags: string[],\n): string {\n const filename = sanitizeFilename(metadata.title);\n const filePath = join(dest, filename);\n const frontmatter = buildFrontmatter(metadata, tags);\n const fullContent = `${frontmatter}\\n\\n${markdownContent}\\n`;\n\n writeFileSync(filePath, fullContent, \"utf-8\");\n\n return filePath;\n}\n"],"mappings":";;;AAAA,SAAS,eAAe;AACxB,SAAS,YAAY;AACrB,SAAS,cAAAA,mBAAkB;AAC3B,SAAS,WAAAC,gBAAe;AACxB,SAAS,QAAAC,aAAY;;;ACJrB,SAAS,oBAAoB;AAC7B,SAAS,eAAe;AACxB,SAAS,eAAe;AACxB,OAAO,UAAU;AAGjB,IAAM,kBAAkB;AACxB,IAAM,qBAAsC;AAC5C,IAAM,eAAe;AAwBrB,SAAS,YAAY,UAA0B;AAC7C,MAAI,SAAS,WAAW,IAAI,GAAG;AAC7B,WAAO,QAAQ,QAAQ,GAAG,SAAS,MAAM,CAAC,CAAC;AAAA,EAC7C;AACA,SAAO;AACT;AAEA,IAAM,mBAAsC,CAAC,QAAQ,oBAAoB,aAAa;AAEtF,SAAS,kBAAkB,OAAgC;AACzD,MAAI,CAAC,iBAAiB,SAAS,KAAK,GAAG;AACrC,UAAM,IAAI;AAAA,MACR,6BAA6B,KAAK,sBAAsB,iBAAiB,KAAK,IAAI,CAAC;AAAA,IACrF;AAAA,EACF;AACA,SAAO;AACT;AAEA,SAAS,eAAe,YAAgC;AACtD,QAAM,UAAU,aAAa,YAAY,OAAO;AAChD,QAAM,SAAS,KAAK,KAAK,OAAO;AAChC,MAAI,WAAW,QAAQ,OAAO,WAAW,UAAU;AACjD,UAAM,IAAI,MAAM,wBAAwB,UAAU,EAAE;AAAA,EACtD;AACA,QAAM,SAAS;AAEf,MAAI,OAAO,YAAY,UAAa,OAAO,OAAO,YAAY,UAAU;AACtE,UAAM,IAAI,MAAM,wDAAwD,OAAO,OAAO,OAAO,EAAE;AAAA,EACjG;AACA,MAAI,OAAO,SAAS,UAAa,OAAO,OAAO,SAAS,UAAU;AAChE,UAAM,IAAI,MAAM,qDAAqD,OAAO,OAAO,IAAI,EAAE;AAAA,EAC3F;AACA,MAAI,OAAO,cAAc,QAAW;AAClC,QAAI,OAAO,OAAO,cAAc,UAAU;AACxC,YAAM,IAAI,MAAM,0DAA0D,OAAO,OAAO,SAAS,EAAE;AAAA,IACrG;AACA,sBAAkB,OAAO,SAAS;AAAA,EACpC;AACA,MAAI,OAAO,SAAS,QAAW;AAC7B,QAAI,CAAC,MAAM,QAAQ,OAAO,IAAI,KAAK,CAAC,OAAO,KAAK,MAAM,CAAC,MAAe,OAAO,MAAM,QAAQ,GAAG;AAC5F,YAAM,IAAI,MAAM,wDAAwD;AAAA,IAC1E;AAAA,EACF;AAEA,SAAO;AACT;AAEO,SAAS,cACd,YACA,YACgB;AAEhB,MAAI,aAAyB,CAAC;AAC9B,MAAI,YAAY;AACd,iBAAa,eAAe,UAAU;AAAA,EACxC;AAGA,QAAM,UAAU,QAAQ,IAAI;AAC5B,QAAM,aAAa,QAAQ,IAAI;AAG/B,QAAM,OAAO,WAAW,QAAQ,WAAW,WAAW;AACtD,MAAI,SAAS,QAAW;AACtB,UAAM,IAAI;AAAA,MACR;AAAA,IACF;AAAA,EACF;AAEA,MAAI;AACJ,MAAI,WAAW,YAAY,QAAW;AACpC,cAAU,WAAW;AAAA,EACvB,WAAW,eAAe,QAAW;AACnC,UAAM,SAAS,OAAO,UAAU;AAChC,QAAI,OAAO,MAAM,MAAM,GAAG;AACxB,YAAM,IAAI,MAAM,sCAAsC,UAAU,EAAE;AAAA,IACpE;AACA,cAAU;AAAA,EACZ,OAAO;AACL,cAAU,WAAW,WAAW;AAAA,EAClC;AAEA,QAAM,eAAe,WAAW,aAAa,WAAW,aAAa;AACrE,QAAM,YAAY,kBAAkB,YAAY;AAGhD,QAAM,UAAU;AAAA,IACd,GAAI,WAAW,QAAQ,CAAC;AAAA,IACxB,GAAI,WAAW,QAAQ,CAAC;AAAA,IACxB;AAAA,EACF;AACA,QAAM,OAAO,CAAC,GAAG,IAAI,IAAI,OAAO,CAAC;AAEjC,SAAO;AAAA,IACL,MAAM,YAAY,IAAI;AAAA,IACtB;AAAA,IACA;AAAA,IACA;AAAA,IACA,QAAQ,WAAW,UAAU;AAAA,IAC7B,UAAU,WAAW,YAAY;AAAA,IACjC,WAAW,WAAW,aAAa;AAAA,IACnC,QAAQ,WAAW,UAAU;AAAA,IAC7B,aAAa,WAAW,eAAe;AAAA,IACvC,YAAY,WAAW,cAAc;AAAA,IACrC,YAAY,WAAW,cAAc;AAAA,IACrC,KAAK,WAAW,OAAO;AAAA,EACzB;AACF;;;AC3IA,SAAS,gBAAqC;;;ACA9C,SAAS,YAAY,iBAAiB;AACtC,SAAS,YAAY;AACrB,SAAS,WAAAC,gBAAe;AAExB,IAAM,aAAa,KAAKA,SAAQ,GAAG,WAAW,aAAa;AAC3D,IAAM,eAAe,KAAK,YAAY,UAAU;AAEzC,SAAS,gBAAwB;AACtC,SAAO;AACT;AAEA,SAAS,cAAc,KAAqB;AAC1C,QAAM,SAAS,IAAI,IAAI,GAAG;AAC1B,SAAO,OAAO,YAAY;AAC5B;AAEO,SAAS,eAAe,KAAa,aAA6B;AACvE,QAAM,SAAS,cAAc,GAAG;AAChC,SAAO,KAAK,aAAa,GAAG,MAAM,OAAO;AAC3C;AAEO,SAAS,cAAc,KAAa,aAA8B;AACvE,QAAM,cAAc,eAAe,KAAK,WAAW;AACnD,SAAO,WAAW,WAAW;AAC/B;AAEO,SAAS,iBAAiB,aAA2B;AAC1D,MAAI,CAAC,WAAW,WAAW,GAAG;AAC5B,cAAU,aAAa,EAAE,WAAW,KAAK,CAAC;AAAA,EAC5C;AACF;;;AD1BO,IAAM,oBACX;AAUK,SAAS,iBAAiB,aAA8B;AAC7D,SAAO,YAAY,YAAY,EAAE,SAAS,iBAAiB;AAC7D;AAEO,SAAS,0BAA0B,SAAuC;AAC/E,QAAM,UAAU,oBAAI,IAAY;AAChC,MAAI,QAAQ,YAAa,SAAQ,IAAI,OAAO;AAC5C,MAAI,QAAQ,WAAY,SAAQ,IAAI,MAAM;AAC1C,MAAI,QAAQ,WAAY,SAAQ,IAAI,OAAO;AAC3C,SAAO;AACT;AAEA,IAAM,kBAAkB;AAEjB,SAAS,kBAAkB,WAAmB,WAAyB;AAC5E,MAAI,UAAU,WAAW,GAAG;AAC1B,UAAM,IAAI,MAAM,oCAAoC,SAAS,EAAE;AAAA,EACjE;AACA,QAAM,SAAS,UAAU,SAAS,GAAG,gBAAgB,MAAM,EAAE,SAAS,OAAO;AAC7E,MAAI,CAAC,OAAO,WAAW,eAAe,GAAG;AACvC,UAAM,IAAI;AAAA,MACR,gFAAgF,SAAS;AAAA,IAC3F;AAAA,EACF;AACF;AAEA,eAAe,YACb,SACA,KACA,WACkD;AAClD,QAAM,cAAc,MAAM,QAAQ,QAAQ,IAAI,KAAK,EAAE,SAAS,UAAU,CAAC;AACzE,QAAM,SAAS,YAAY,OAAO;AAClC,MAAI,UAAU,KAAK;AACjB,UAAM,IAAI,MAAM,QAAQ,MAAM,uCAAuC,GAAG,EAAE;AAAA,EAC5E;AACA,QAAM,YAAY,OAAO,KAAK,MAAM,YAAY,KAAK,CAAC;AACtD,QAAM,WAAW,YAAY,IAAI;AACjC,oBAAkB,WAAW,QAAQ;AACrC,SAAO,EAAE,WAAW,SAAS;AAC/B;AAEA,eAAsB,UACpB,KACA,QACA,aACsB;AACtB,QAAM,UAAU,MAAM,SAAS,OAAO;AAAA,IACpC,UAAU,CAAC,OAAO;AAAA,EACpB,CAAC;AAED,MAAI;AACF,UAAM,iBAA2D;AAAA,MAC/D,WAAW;AAAA,IACb;AAGA,QAAI,CAAC,OAAO,aAAa,cAAc,KAAK,WAAW,GAAG;AACxD,YAAM,cAAc,eAAe,KAAK,WAAW;AACnD,qBAAe,eAAe;AAAA,IAChC;AAEA,UAAM,UAA0B,MAAM,QAAQ,WAAW,cAAc;AACvE,UAAM,OAAO,MAAM,QAAQ,QAAQ;AAGnC,UAAM,eAAe,0BAA0B,MAAM;AACrD,QAAI,aAAa,OAAO,GAAG;AACzB,YAAM,KAAK,MAAM,QAAQ,OAAO,UAAU;AACxC,YAAI,aAAa,IAAI,MAAM,QAAQ,EAAE,aAAa,CAAC,GAAG;AACpD,gBAAM,MAAM,MAAM;AAAA,QACpB,OAAO;AACL,gBAAM,MAAM,SAAS;AAAA,QACvB;AAAA,MACF,CAAC;AAAA,IACH;AAEA,UAAM,YAAY,OAAO,UAAU;AAKnC,QAAI;AACJ,QAAI;AACF,iBAAW,MAAM,KAAK,KAAK,KAAK;AAAA,QAC9B,WAAW,OAAO;AAAA,QAClB,SAAS;AAAA,MACX,CAAC;AAAA,IACH,SAAS,OAAO;AACd,UACE,iBAAiB,SACjB,MAAM,QAAQ,SAAS,sBAAsB,GAC7C;AACA,cAAM,SAAS,MAAM,YAAY,SAAS,KAAK,SAAS;AACxD,cAAM,QAAQ,MAAM;AACpB,eAAO,EAAE,MAAM,OAAO,WAAW,OAAO,WAAW,KAAK,UAAU,OAAO,SAAS;AAAA,MACpF;AACA,YAAM;AAAA,IACR;AAEA,QAAI,CAAC,UAAU;AACb,YAAM,IAAI,MAAM,6BAA6B,GAAG,EAAE;AAAA,IACpD;AAEA,UAAM,SAAS,SAAS,OAAO;AAC/B,QAAI,UAAU,KAAK;AACjB,YAAM,IAAI,MAAM,QAAQ,MAAM,kBAAkB,SAAS,IAAI,CAAC,EAAE;AAAA,IAClE;AAEA,UAAM,WAAW,SAAS,IAAI;AAC9B,UAAM,cAAc,SAAS,QAAQ,EAAE,cAAc,KAAK;AAK1D,QAAI,iBAAiB,WAAW,GAAG;AACjC,YAAM,OAAO,MAAM,SAAS,KAAK;AACjC,UAAI;AACF,0BAAkB,MAAM,QAAQ;AAChC,cAAM,QAAQ,MAAM;AACpB,eAAO,EAAE,MAAM,OAAO,WAAW,MAAM,KAAK,SAAS;AAAA,MACvD,QAAQ;AAEN,cAAM,SAAS,MAAM,YAAY,SAAS,UAAU,SAAS;AAC7D,cAAM,QAAQ,MAAM;AACpB,eAAO,EAAE,MAAM,OAAO,WAAW,OAAO,WAAW,KAAK,UAAU,OAAO,SAAS;AAAA,MACpF;AAAA,IACF;AAEA,UAAM,WAAW,MAAM,KAAK,QAAQ;AACpC,QAAI;AAEJ,QAAI,OAAO,UAAU;AACnB,YAAM,UAAU,MAAM,KAAK,EAAE,OAAO,QAAQ;AAC5C,UAAI,CAAC,SAAS;AACZ,cAAM,IAAI,MAAM,uBAAuB,OAAO,QAAQ,EAAE;AAAA,MAC1D;AACA,aAAO,MAAM,QAAQ,UAAU;AAAA,IACjC,OAAO;AACL,aAAO;AAAA,IACT;AAEA,UAAM,QAAQ,MAAM;AAEpB,WAAO,EAAE,MAAM,QAAQ,MAAM,UAAU,KAAK,SAAS;AAAA,EACvD,UAAE;AACA,UAAM,QAAQ,MAAM;AAAA,EACtB;AACF;;;AEnKA,SAAS,mBAAmB;AAC5B,SAAS,aAAa;AAGtB,SAAS,eAAe,KAAe,UAAiC;AACtE,QAAM,KAAK,IAAI,cAAc,QAAQ;AACrC,SAAO,IAAI,aAAa,SAAS,KAAK;AACxC;AAEA,SAAS,aAAa,KAAqB;AACzC,SAAO,KAAK,IAAI,KAAK,CAAC;AACxB;AAEA,SAAS,qBAAqB,KAA8B;AAC1D,QAAM,YACJ,eAAe,KAAK,yCAAyC,KAC7D,eAAe,KAAK,4BAA4B;AAElD,MAAI,CAAC,WAAW;AACd,UAAM,SAAS,IAAI,cAAc,oCAAoC;AACrE,QAAI,QAAQ,aAAa;AACvB,UAAI;AACF,cAAM,OAAO,KAAK,MAAM,OAAO,WAAW;AAC1C,YAAI,OAAO,KAAK,kBAAkB,UAAU;AAC1C,iBAAO,KAAK,cAAc,MAAM,GAAG,EAAE,CAAC;AAAA,QACxC;AAAA,MACF,QAAQ;AAAA,MAER;AAAA,IACF;AACA,WAAO;AAAA,EACT;AAEA,SAAO,UAAU,MAAM,GAAG,EAAE,CAAC;AAC/B;AAEA,SAAS,eACP,KACA,mBACU;AACV,QAAM,iBAAiB,IAAI,iBAAiB,iCAAiC;AAC7E,MAAI,eAAe,SAAS,GAAG;AAC7B,WAAO,MAAM,KAAK,cAAc,EAC7B,IAAI,CAAC,OAAO,GAAG,aAAa,SAAS,CAAC,EACtC,OAAO,CAAC,MAAmB,MAAM,IAAI,EACrC,IAAI,YAAY;AAAA,EACrB;AAEA,QAAM,WAAW,eAAe,KAAK,4BAA4B;AACjE,MAAI,UAAU;AACZ,WAAO,CAAC,aAAa,QAAQ,CAAC;AAAA,EAChC;AAEA,MAAI,mBAAmB;AACrB,WAAO,CAAC,aAAa,iBAAiB,CAAC;AAAA,EACzC;AAEA,SAAO,CAAC;AACV;AAcA,SAAS,cACP,KACA,SACA,UACU;AACV,QAAM,QAAQ,SAAS,SAAS,IAAI;AACpC,QAAM,UAAU,eAAe,KAAK,SAAS,UAAU,IAAI;AAC3D,QAAM,YAAY,qBAAqB,GAAG;AAE1C,QAAM,cACJ,eAAe,KAAK,iCAAiC,KACrD,eAAe,KAAK,0BAA0B,MAC7C,SAAS,WAAW;AAEvB,QAAM,SAAQ,oBAAI,KAAK,GAAE,YAAY,EAAE,MAAM,GAAG,EAAE,CAAC;AAEnD,SAAO;AAAA,IACL;AAAA,IACA,QAAQ;AAAA,IACR,QAAQ;AAAA,IACR;AAAA,IACA,SAAS;AAAA,IACT;AAAA,EACF;AACF;AAEA,SAAS,qBAAqB,MAAc,KAAwC;AAClF,QAAM,MAAM,IAAI,MAAM,MAAM,EAAE,IAAI,CAAC;AACnC,QAAM,SAAS,IAAI,YAAY,IAAI,OAAO,QAAQ;AAClD,SAAO,OAAO,MAAM;AACtB;AAEO,SAAS,QAAQ,MAAc,UAAiC;AACrE,QAAM,UAAU,IAAI,MAAM,MAAM,EAAE,KAAK,SAAS,CAAC;AACjD,QAAM,MAAM,QAAQ,OAAO;AAE3B,QAAM,UAAU,qBAAqB,MAAM,QAAQ;AAEnD,MAAI,CAAC,SAAS;AACZ,UAAM,IAAI;AAAA,MACR;AAAA,IAEF;AAAA,EACF;AAEA,MAAI,CAAC,QAAQ,SAAS;AACpB,UAAM,IAAI,MAAM,iDAAiD;AAAA,EACnE;AAEA,SAAO;AAAA,IACL,UAAU,cAAc,KAAK,SAAS,QAAQ;AAAA,IAC9C,SAAS,QAAQ;AAAA,EACnB;AACF;AAEO,SAAS,gBAAgB,MAAc,UAA4B;AACxE,QAAM,UAAU,IAAI,MAAM,MAAM,EAAE,KAAK,SAAS,CAAC;AACjD,QAAM,MAAM,QAAQ,OAAO;AAE3B,QAAM,UAAU,qBAAqB,MAAM,QAAQ;AAEnD,SAAO,cAAc,KAAK,SAAS,QAAQ;AAC7C;;;ACtIA,OAAO,qBAAqB;AAErB,SAAS,kBAAkB,MAAsB;AACtD,QAAM,WAAW,IAAI,gBAAgB;AAAA,IACnC,cAAc;AAAA,IACd,gBAAgB;AAAA,IAChB,kBAAkB;AAAA,EACpB,CAAC;AAED,SAAO,SAAS,SAAS,IAAI;AAC/B;;;ACVA,SAAS,qBAAqB;AAC9B,SAAS,QAAAC,aAAY;AACrB,OAAOC,WAAU;AAGjB,IAAM,eAAe;AACrB,IAAM,gBAAgB;AACtB,IAAM,sBAAsB;AAErB,SAAS,iBAAiB,OAAuB;AACtD,QAAM,YAAY,MACf,QAAQ,eAAe,EAAE,EACzB,QAAQ,cAAc,EAAE,EACxB,QAAQ,QAAQ,GAAG,EACnB,KAAK;AACR,QAAM,OAAO,UAAU,MAAM,GAAG,mBAAmB,KAAK;AACxD,SAAO,GAAG,IAAI;AAChB;AAEO,SAAS,iBAAiB,UAAoB,MAAwB;AAC3E,QAAM,OAAgC;AAAA,IACpC,OAAO,SAAS;AAAA,IAChB,QAAQ,SAAS;AAAA,EACnB;AAEA,MAAI,SAAS,OAAO,SAAS,GAAG;AAC9B,SAAK,SAAS,SAAS;AAAA,EACzB;AAEA,MAAI,SAAS,WAAW;AACtB,SAAK,YAAY,SAAS;AAAA,EAC5B;AAEA,OAAK,UAAU,SAAS;AAExB,MAAI,SAAS,aAAa;AACxB,SAAK,cAAc,SAAS;AAAA,EAC9B;AAEA,OAAK,OAAO;AAEZ,QAAM,UAAUA,MAAK,KAAK,MAAM;AAAA,IAC9B,aAAa;AAAA,IACb,aAAa;AAAA,IACb,WAAW;AAAA,IACX,UAAU;AAAA,EACZ,CAAC;AAED,SAAO;AAAA,EAAQ,OAAO;AACxB;AAEO,SAAS,kBACd,MACA,UACA,iBACA,MACQ;AACR,QAAM,WAAW,iBAAiB,SAAS,KAAK;AAChD,QAAM,WAAWD,MAAK,MAAM,QAAQ;AACpC,QAAM,cAAc,iBAAiB,UAAU,IAAI;AACnD,QAAM,cAAc,GAAG,WAAW;AAAA;AAAA,EAAO,eAAe;AAAA;AAExD,gBAAc,UAAU,aAAa,OAAO;AAE5C,SAAO;AACT;;;ANhDA,IAAM,cAAcE,MAAKC,SAAQ,GAAG,WAAW,eAAe,aAAa;AAE3E,IAAM,UAAU,IAAI,QAAQ;AAE5B,QACG,KAAK,aAAa,EAClB;AAAA,EACC;AACF,EACC,QAAQ,OAAO;AAElB,QACG,QAAQ,OAAO,EACf,YAAY,mCAAmC,EAC/C,SAAS,SAAS,cAAc,EAChC,OAAO,iBAAiB,uBAAuB,EAC/C,OAAO,YAAY,4BAA4B,EAC/C,OAAO,oBAAoB,yBAAyB,EACpD,OAAO,uBAAuB,sBAAsB,QAAQ,EAC5D,OAAO,gBAAgB,wBAAwB,CAAC,KAAa,QAAkB;AAC9E,MAAI,KAAK,GAAG;AACZ,SAAO;AACT,GAAG,CAAC,CAAa,EAChB;AAAA,EACC;AAAA,EACA;AACF,EACC,OAAO,kBAAkB,0BAA0B,EACnD,OAAO,aAAa,oCAAoC,EACxD,OAAO,qBAAqB,6BAA6B,EACzD,OAAO,oBAAoB,4BAA4B,EACvD,OAAO,oBAAoB,6BAA6B,EACxD,OAAO,SAAS,uDAAuD,EACvE,OAAO,OAAO,KAAa,YAAqC;AAC/D,MAAI;AACF,UAAM,aAAaC,YAAW,WAAW,IAAI,cAAc;AAC3D,UAAM,SAAS;AAAA,MACb;AAAA,QACE,MAAM,QAAQ;AAAA,QACd,MAAM,QAAQ;AAAA,QACd,SAAS,QAAQ;AAAA,QACjB,WAAW,QAAQ;AAAA,QACnB,QAAQ,QAAQ;AAAA,QAChB,UAAU,QAAQ;AAAA,QAClB,WAAW,QAAQ;AAAA,QACnB,QAAQ,QAAQ;AAAA,QAChB,aAAa,QAAQ;AAAA,QACrB,YAAY,QAAQ;AAAA,QACpB,YAAY,QAAQ;AAAA,QACpB,KAAK,QAAQ;AAAA,MACf;AAAA,MACA;AAAA,IACF;AAEA,QAAI,OAAO,OAAO,OAAO,UAAU;AACjC,YAAM,IAAI,MAAM,+CAA+C;AAAA,IACjE;AAGA,QAAI,CAAC,OAAO,UAAU,CAACA,YAAW,OAAO,IAAI,GAAG;AAC9C,YAAM,IAAI,MAAM,yCAAyC,OAAO,IAAI,EAAE;AAAA,IACxE;AAEA,UAAM,cAAc,cAAc;AAClC,UAAM,cAAc,MAAM,UAAU,KAAK,QAAQ,WAAW;AAE5D,QAAI;AACJ,QAAI;AAEJ,QAAI,YAAY,SAAS,OAAO;AAC9B,UAAI,OAAO,UAAU;AACnB,cAAM,IAAI,MAAM,0CAA0C;AAAA,MAC5D;AACA,UAAI,OAAO,KAAK;AACd,cAAM,IAAI,MAAM,qCAAqC;AAAA,MACvD;AACA,YAAM,EAAE,qBAAqB,IAAI,MAAM,OAAO,6BAAoB;AAClE,YAAM,YAAY,MAAM;AAAA,QACtB,YAAY;AAAA,QACZ,YAAY;AAAA,MACd;AACA,iBAAW,UAAU;AACrB,iBAAW,UAAU;AAAA,IACvB,WAAW,OAAO,UAAU;AAE1B,iBAAW,gBAAgB,YAAY,UAAU,YAAY,QAAQ;AACrE,iBAAW,kBAAkB,YAAY,IAAI;AAAA,IAC/C,WAAW,OAAO,KAAK;AAErB,iBAAW,gBAAgB,YAAY,UAAU,YAAY,QAAQ;AACrE,iBAAW,kBAAkB,YAAY,QAAQ;AAAA,IACnD,OAAO;AACL,YAAM,SAAS,QAAQ,YAAY,MAAM,YAAY,QAAQ;AAC7D,iBAAW,OAAO;AAClB,iBAAW,kBAAkB,OAAO,OAAO;AAAA,IAC7C;AAEA,QAAI,OAAO,QAAQ;AACjB,YAAM,cAAc,iBAAiB,UAAU,OAAO,IAAI;AAC1D,cAAQ,OAAO,MAAM,GAAG,WAAW;AAAA;AAAA,EAAO,QAAQ;AAAA,CAAI;AAAA,IACxD,OAAO;AACL,YAAM,WAAW;AAAA,QACf,OAAO;AAAA,QACP;AAAA,QACA;AAAA,QACA,OAAO;AAAA,MACT;AACA,cAAQ,MAAM,UAAU,QAAQ,EAAE;AAAA,IACpC;AAAA,EACF,SAAS,OAAO;AACd,UAAM,UAAU,iBAAiB,QAAQ,MAAM,UAAU,OAAO,KAAK;AACrE,YAAQ,MAAM,UAAU,OAAO,EAAE;AACjC,YAAQ,KAAK,CAAC;AAAA,EAChB;AACF,CAAC;AAEH,QACG,QAAQ,OAAO,EACf,YAAY,kCAAkC,EAC9C,SAAS,SAAS,cAAc,EAChC,OAAO,uBAAuB,4BAA4B,QAAQ,EAClE,OAAO,OAAO,KAAa,YAAqC;AAC/D,QAAM,EAAE,UAAAC,UAAS,IAAI,MAAM,OAAO,YAAY;AAC9C,QAAM,cAAc,cAAc;AAClC,mBAAiB,WAAW;AAE5B,QAAM,aAAc,QAAQ,WAAkC;AAC9D,QAAM,UAAU,MAAMA,UAAS,OAAO,EAAE,UAAU,MAAM,CAAC;AAEzD,MAAI;AACF,UAAM,UAAU,MAAM,QAAQ,WAAW;AACzC,UAAM,OAAO,MAAM,QAAQ,QAAQ;AAEnC,UAAM,KAAK,KAAK,KAAK,EAAE,WAAW,eAAe,SAAS,aAAa,IAAK,CAAC;AAE7E,YAAQ,MAAM,yEAAyE;AAEvF,YAAQ,MAAM,OAAO;AACrB,UAAM,KAAK,QAAQ,OAAO,MAAM;AAChC,YAAQ,MAAM,MAAM;AACpB,YAAQ,MAAM,MAAM;AAEpB,UAAM,cAAc,eAAe,KAAK,WAAW;AACnD,UAAM,QAAQ,aAAa,EAAE,MAAM,YAAY,CAAC;AAChD,YAAQ,MAAM,kBAAkB,WAAW,EAAE;AAAA,EAC/C,SAAS,OAAO;AACd,UAAM,UAAU,iBAAiB,QAAQ,MAAM,UAAU,OAAO,KAAK;AACrE,YAAQ,MAAM,UAAU,OAAO,EAAE;AACjC,YAAQ,KAAK,CAAC;AAAA,EAChB,UAAE;AACA,UAAM,QAAQ,MAAM;AAAA,EACtB;AACF,CAAC;AAEH,QAAQ,MAAM;","names":["existsSync","homedir","join","homedir","join","yaml","join","homedir","existsSync","chromium"]}
|
|
1
|
+
{"version":3,"sources":["../src/cli.ts","../src/config.ts","../src/fetcher.ts","../src/session.ts","../src/extractor.ts","../src/converter.ts","../src/writer.ts"],"sourcesContent":["import { Command } from \"commander\";\nimport { once } from \"node:events\";\nimport { existsSync } from \"node:fs\";\nimport { homedir } from \"node:os\";\nimport { join } from \"node:path\";\nimport { resolveConfig } from \"./config.js\";\nimport { fetchPage } from \"./fetcher.js\";\nimport { extract, extractMetadata } from \"./extractor.js\";\nimport { convertToMarkdown } from \"./converter.js\";\nimport { writeMarkdownFile, buildFrontmatter } from \"./writer.js\";\nimport {\n getSessionDir,\n getSessionPath,\n ensureSessionDir,\n} from \"./session.js\";\nimport type { Metadata, WaitUntilOption } from \"./types.js\";\n\nconst CONFIG_PATH = join(homedir(), \".config\", \"vault-fetch\", \"config.yaml\");\n\nconst program = new Command();\n\nprogram\n .name(\"vault-fetch\")\n .description(\n \"Fetch JS-rendered web pages and save as Markdown to Obsidian Vault\",\n )\n .version(\"0.3.0\");\n\nprogram\n .command(\"fetch\")\n .description(\"Fetch a page and save as Markdown\")\n .argument(\"<url>\", \"URL to fetch\")\n .option(\"--dest <path>\", \"Destination directory\")\n .option(\"--headed\", \"Run browser in headed mode\")\n .option(\"--selector <css>\", \"CSS selector to extract\")\n .option(\"--timeout <seconds>\", \"Timeout in seconds\", parseInt)\n .option(\"--tag <name>\", \"Add tag (repeatable)\", (val: string, acc: string[]) => {\n acc.push(val);\n return acc;\n }, [] as string[])\n .option(\n \"--wait-until <event>\",\n \"Wait condition: load, domcontentloaded, networkidle\",\n )\n .option(\"--skip-session\", \"Do not use saved session\")\n .option(\"--dry-run\", \"Output to stdout instead of saving\")\n .option(\"--no-block-images\", \"Do not block image requests\")\n .option(\"--no-block-fonts\", \"Do not block font requests\")\n .option(\"--no-block-media\", \"Do not block media requests\")\n .option(\"--raw\", \"Convert full page HTML without Readability extraction\")\n .option(\"--title <text>\", \"Override the page title for the output filename\")\n .action(async (url: string, options: Record<string, unknown>) => {\n try {\n const configPath = existsSync(CONFIG_PATH) ? CONFIG_PATH : undefined;\n const config = resolveConfig(\n {\n dest: options.dest as string | undefined,\n tags: options.tag as string[] | undefined,\n timeout: options.timeout as number | undefined,\n waitUntil: options.waitUntil as WaitUntilOption | undefined,\n headed: options.headed as boolean | undefined,\n selector: options.selector as string | undefined,\n title: options.title as string | undefined,\n noSession: options.skipSession as boolean | undefined,\n dryRun: options.dryRun as boolean | undefined,\n blockImages: options.blockImages as boolean | undefined,\n blockFonts: options.blockFonts as boolean | undefined,\n blockMedia: options.blockMedia as boolean | undefined,\n raw: options.raw as boolean | undefined,\n },\n configPath,\n );\n\n if (config.raw && config.selector) {\n throw new Error(\"--raw and --selector cannot be used together.\");\n }\n\n // Validate dest directory exists\n if (!config.dryRun && !existsSync(config.dest)) {\n throw new Error(`Destination directory does not exist: ${config.dest}`);\n }\n\n const sessionsDir = getSessionDir();\n const fetchResult = await fetchPage(url, config, sessionsDir);\n\n let markdown: string;\n let metadata: Metadata;\n\n if (fetchResult.kind === \"pdf\") {\n if (config.selector) {\n throw new Error(\"--selector cannot be used with PDF URLs.\");\n }\n if (config.raw) {\n throw new Error(\"--raw cannot be used with PDF URLs.\");\n }\n const { convertPdfToMarkdown } = await import(\"./pdf-converter.js\");\n const pdfResult = await convertPdfToMarkdown(\n fetchResult.pdfBuffer,\n fetchResult.finalUrl,\n );\n markdown = pdfResult.markdown;\n metadata = pdfResult.metadata;\n } else if (config.selector) {\n // --selector mode: skip Readability, extract metadata from full page\n metadata = extractMetadata(fetchResult.fullHtml, fetchResult.finalUrl);\n markdown = convertToMarkdown(fetchResult.html);\n } else if (config.raw) {\n // --raw mode: skip Readability, convert full page HTML directly\n metadata = extractMetadata(fetchResult.fullHtml, fetchResult.finalUrl);\n markdown = convertToMarkdown(fetchResult.fullHtml);\n } else {\n const result = extract(fetchResult.html, fetchResult.finalUrl);\n metadata = result.metadata;\n markdown = convertToMarkdown(result.content);\n }\n\n if (config.title !== null) {\n metadata = { ...metadata, title: config.title };\n }\n\n if (config.dryRun) {\n const frontmatter = buildFrontmatter(metadata, config.tags);\n process.stdout.write(`${frontmatter}\\n\\n${markdown}\\n`);\n } else {\n const filePath = writeMarkdownFile(\n config.dest,\n metadata,\n markdown,\n config.tags,\n );\n console.error(`Saved: ${filePath}`);\n }\n } catch (error) {\n const message = error instanceof Error ? error.message : String(error);\n console.error(`Error: ${message}`);\n process.exit(1);\n }\n });\n\nprogram\n .command(\"login\")\n .description(\"Login to a site and save session\")\n .argument(\"<url>\", \"URL to login\")\n .option(\"--timeout <seconds>\", \"Login timeout in seconds\", parseInt)\n .action(async (url: string, options: Record<string, unknown>) => {\n const { chromium } = await import(\"playwright\");\n const sessionsDir = getSessionDir();\n ensureSessionDir(sessionsDir);\n\n const timeoutSec = (options.timeout as number | undefined) ?? 300;\n const browser = await chromium.launch({ headless: false });\n\n try {\n const context = await browser.newContext();\n const page = await context.newPage();\n\n await page.goto(url, { waitUntil: \"networkidle\", timeout: timeoutSec * 1000 });\n\n console.error(\"Browser opened. Log in manually, then press Enter here to save session.\");\n\n process.stdin.resume();\n await once(process.stdin, \"data\");\n process.stdin.pause();\n process.stdin.unref();\n\n const sessionPath = getSessionPath(url, sessionsDir);\n await context.storageState({ path: sessionPath });\n console.error(`Session saved: ${sessionPath}`);\n } catch (error) {\n const message = error instanceof Error ? error.message : String(error);\n console.error(`Error: ${message}`);\n process.exit(1);\n } finally {\n await browser.close();\n }\n });\n\nprogram.parse();\n","import { readFileSync } from \"node:fs\";\nimport { homedir } from \"node:os\";\nimport { resolve } from \"node:path\";\nimport yaml from \"js-yaml\";\nimport type { ResolvedConfig, WaitUntilOption } from \"./types.js\";\n\nconst DEFAULT_TIMEOUT = 30;\nconst DEFAULT_WAIT_UNTIL: WaitUntilOption = \"networkidle\";\nconst REQUIRED_TAG = \"clippings\";\n\ninterface FileConfig {\n dest?: string;\n tags?: string[];\n timeout?: number;\n waitUntil?: WaitUntilOption;\n}\n\ninterface CliOptions {\n dest?: string;\n tags?: string[];\n timeout?: number;\n waitUntil?: WaitUntilOption;\n headed?: boolean;\n selector?: string;\n title?: string;\n noSession?: boolean;\n dryRun?: boolean;\n blockImages?: boolean;\n blockFonts?: boolean;\n blockMedia?: boolean;\n raw?: boolean;\n}\n\nfunction expandTilde(filePath: string): string {\n if (filePath.startsWith(\"~/\")) {\n return resolve(homedir(), filePath.slice(2));\n }\n return filePath;\n}\n\nconst VALID_WAIT_UNTIL: readonly string[] = [\"load\", \"domcontentloaded\", \"networkidle\"];\n\nfunction validateWaitUntil(value: string): WaitUntilOption {\n if (!VALID_WAIT_UNTIL.includes(value)) {\n throw new Error(\n `Invalid waitUntil value: \"${value}\". Must be one of: ${VALID_WAIT_UNTIL.join(\", \")}`,\n );\n }\n return value as WaitUntilOption;\n}\n\nfunction loadConfigFile(configPath: string): FileConfig {\n const content = readFileSync(configPath, \"utf-8\");\n const parsed = yaml.load(content);\n if (parsed === null || typeof parsed !== \"object\") {\n throw new Error(`Invalid config file: ${configPath}`);\n }\n const config = parsed as Record<string, unknown>;\n\n if (config.timeout !== undefined && typeof config.timeout !== \"number\") {\n throw new Error(`Invalid timeout in config file: expected number, got ${typeof config.timeout}`);\n }\n if (config.dest !== undefined && typeof config.dest !== \"string\") {\n throw new Error(`Invalid dest in config file: expected string, got ${typeof config.dest}`);\n }\n if (config.waitUntil !== undefined) {\n if (typeof config.waitUntil !== \"string\") {\n throw new Error(`Invalid waitUntil in config file: expected string, got ${typeof config.waitUntil}`);\n }\n validateWaitUntil(config.waitUntil);\n }\n if (config.tags !== undefined) {\n if (!Array.isArray(config.tags) || !config.tags.every((t: unknown) => typeof t === \"string\")) {\n throw new Error(\"Invalid tags in config file: expected array of strings\");\n }\n }\n\n return config as FileConfig;\n}\n\nexport function resolveConfig(\n cliOptions: CliOptions,\n configPath: string | undefined,\n): ResolvedConfig {\n // Layer 1: Config file\n let fileConfig: FileConfig = {};\n if (configPath) {\n fileConfig = loadConfigFile(configPath);\n }\n\n // Layer 2: Environment variables\n const envDest = process.env.VAULT_FETCH_DEST;\n const envTimeout = process.env.VAULT_FETCH_TIMEOUT;\n\n // Resolve each field: CLI > env > file > default\n const dest = cliOptions.dest ?? envDest ?? fileConfig.dest;\n if (dest === undefined) {\n throw new Error(\n \"dest is required. Set via --dest, VAULT_FETCH_DEST, or config file.\",\n );\n }\n\n let timeout: number;\n if (cliOptions.timeout !== undefined) {\n timeout = cliOptions.timeout;\n } else if (envTimeout !== undefined) {\n const parsed = Number(envTimeout);\n if (Number.isNaN(parsed)) {\n throw new Error(`Invalid VAULT_FETCH_TIMEOUT value: ${envTimeout}`);\n }\n timeout = parsed;\n } else {\n timeout = fileConfig.timeout ?? DEFAULT_TIMEOUT;\n }\n\n const rawWaitUntil = cliOptions.waitUntil ?? fileConfig.waitUntil ?? DEFAULT_WAIT_UNTIL;\n const waitUntil = validateWaitUntil(rawWaitUntil);\n\n // Merge tags: file tags + CLI tags + always clippings\n const allTags = [\n ...(fileConfig.tags ?? []),\n ...(cliOptions.tags ?? []),\n REQUIRED_TAG,\n ];\n const tags = [...new Set(allTags)];\n\n return {\n dest: expandTilde(dest),\n tags,\n timeout,\n waitUntil,\n headed: cliOptions.headed ?? false,\n selector: cliOptions.selector ?? null,\n title: cliOptions.title ?? null,\n noSession: cliOptions.noSession ?? false,\n dryRun: cliOptions.dryRun ?? false,\n blockImages: cliOptions.blockImages ?? true,\n blockFonts: cliOptions.blockFonts ?? true,\n blockMedia: cliOptions.blockMedia ?? true,\n raw: cliOptions.raw ?? false,\n };\n}\n","import { chromium, type BrowserContext } from \"playwright\";\nimport type { FetchResult, ResolvedConfig } from \"./types.js\";\nimport { getSessionPath, sessionExists } from \"./session.js\";\n\nexport const CHROME_USER_AGENT =\n \"Mozilla/5.0 (Windows NT 10.0; Win64; x64) \" +\n \"AppleWebKit/537.36 (KHTML, like Gecko) \" +\n \"Chrome/134.0.0.0 Safari/537.36\";\n\ninterface BlockingOptions {\n blockImages: boolean;\n blockFonts: boolean;\n blockMedia: boolean;\n}\n\nexport function isPdfContentType(contentType: string): boolean {\n return contentType.toLowerCase().includes(\"application/pdf\");\n}\n\nexport function buildBlockedResourceTypes(options: BlockingOptions): Set<string> {\n const blocked = new Set<string>();\n if (options.blockImages) blocked.add(\"image\");\n if (options.blockFonts) blocked.add(\"font\");\n if (options.blockMedia) blocked.add(\"media\");\n return blocked;\n}\n\nconst PDF_MAGIC_BYTES = \"%PDF\";\n\nexport function validatePdfBuffer(pdfBuffer: Buffer, sourceUrl: string): void {\n if (pdfBuffer.length === 0) {\n throw new Error(`Empty PDF response received from ${sourceUrl}`);\n }\n const header = pdfBuffer.subarray(0, PDF_MAGIC_BYTES.length).toString(\"ascii\");\n if (!header.startsWith(PDF_MAGIC_BYTES)) {\n throw new Error(\n `Response Content-Type is application/pdf but body is not valid PDF data from ${sourceUrl}`,\n );\n }\n}\n\nasync function downloadPdf(\n context: BrowserContext,\n url: string,\n timeoutMs: number,\n): Promise<{ pdfBuffer: Buffer; finalUrl: string }> {\n const apiResponse = await context.request.get(url, { timeout: timeoutMs });\n const status = apiResponse.status();\n if (status >= 400) {\n throw new Error(`HTTP ${status} received when downloading PDF from ${url}`);\n }\n const pdfBuffer = Buffer.from(await apiResponse.body());\n const finalUrl = apiResponse.url();\n validatePdfBuffer(pdfBuffer, finalUrl);\n return { pdfBuffer, finalUrl };\n}\n\nexport async function fetchPage(\n url: string,\n config: ResolvedConfig,\n sessionsDir: string,\n): Promise<FetchResult> {\n const browser = await chromium.launch({\n headless: !config.headed,\n });\n\n try {\n const contextOptions: Parameters<typeof browser.newContext>[0] = {\n userAgent: CHROME_USER_AGENT,\n };\n\n // Load session if available and not disabled\n if (!config.noSession && sessionExists(url, sessionsDir)) {\n const sessionPath = getSessionPath(url, sessionsDir);\n contextOptions.storageState = sessionPath;\n }\n\n const context: BrowserContext = await browser.newContext(contextOptions);\n const page = await context.newPage();\n\n // Block specified resource types for faster loading\n const blockedTypes = buildBlockedResourceTypes(config);\n if (blockedTypes.size > 0) {\n await page.route(\"**/*\", async (route) => {\n if (blockedTypes.has(route.request().resourceType())) {\n await route.abort();\n } else {\n await route.continue();\n }\n });\n }\n\n const timeoutMs = config.timeout * 1000;\n\n // page.goto throws \"Download is starting\" when the server returns\n // Content-Disposition: attachment (common for PDF downloads).\n // Catch this and download the PDF via the context's HTTP client.\n let response;\n try {\n response = await page.goto(url, {\n waitUntil: config.waitUntil,\n timeout: timeoutMs,\n });\n } catch (error) {\n if (\n error instanceof Error &&\n error.message.includes(\"Download is starting\")\n ) {\n const result = await downloadPdf(context, url, timeoutMs);\n await context.close();\n return { kind: \"pdf\", pdfBuffer: result.pdfBuffer, url, finalUrl: result.finalUrl };\n }\n throw error;\n }\n\n if (!response) {\n throw new Error(`No response received from ${url}`);\n }\n\n const status = response.status();\n if (status >= 400) {\n throw new Error(`HTTP ${status} received from ${response.url()}`);\n }\n\n const finalUrl = response.url();\n const contentType = response.headers()[\"content-type\"] ?? \"\";\n\n // Inline PDF (Content-Disposition: inline or absent).\n // Try response.body() first; fall back to context.request if the\n // browser returned its PDF viewer HTML instead of the actual bytes.\n if (isPdfContentType(contentType)) {\n const body = await response.body();\n try {\n validatePdfBuffer(body, finalUrl);\n await context.close();\n return { kind: \"pdf\", pdfBuffer: body, url, finalUrl };\n } catch {\n // response.body() returned PDF viewer HTML; re-download via API\n const result = await downloadPdf(context, finalUrl, timeoutMs);\n await context.close();\n return { kind: \"pdf\", pdfBuffer: result.pdfBuffer, url, finalUrl: result.finalUrl };\n }\n }\n\n const fullHtml = await page.content();\n let html: string;\n\n if (config.selector) {\n const element = await page.$(config.selector);\n if (!element) {\n throw new Error(`Selector not found: ${config.selector}`);\n }\n html = await element.innerHTML();\n } else {\n html = fullHtml;\n }\n\n await context.close();\n\n return { kind: \"html\", html, fullHtml, url, finalUrl };\n } finally {\n await browser.close();\n }\n}\n","import { existsSync, mkdirSync } from \"node:fs\";\nimport { join } from \"node:path\";\nimport { homedir } from \"node:os\";\n\nconst CONFIG_DIR = join(homedir(), \".config\", \"vault-fetch\");\nconst SESSIONS_DIR = join(CONFIG_DIR, \"sessions\");\n\nexport function getSessionDir(): string {\n return SESSIONS_DIR;\n}\n\nfunction extractDomain(url: string): string {\n const parsed = new URL(url);\n return parsed.hostname ?? \"\";\n}\n\nexport function getSessionPath(url: string, sessionsDir: string): string {\n const domain = extractDomain(url);\n return join(sessionsDir, `${domain}.json`);\n}\n\nexport function sessionExists(url: string, sessionsDir: string): boolean {\n const sessionPath = getSessionPath(url, sessionsDir);\n return existsSync(sessionPath);\n}\n\nexport function ensureSessionDir(sessionsDir: string): void {\n if (!existsSync(sessionsDir)) {\n mkdirSync(sessionsDir, { recursive: true });\n }\n}\n","import { Readability } from \"@mozilla/readability\";\nimport { JSDOM } from \"jsdom\";\nimport type { Metadata } from \"./types.js\";\n\nfunction getMetaContent(doc: Document, selector: string): string | null {\n const el = doc.querySelector(selector);\n return el?.getAttribute(\"content\") ?? null;\n}\n\nfunction formatAuthor(raw: string): string {\n return `[[${raw.trim()}]]`;\n}\n\nfunction extractPublishedDate(doc: Document): string | null {\n const published =\n getMetaContent(doc, 'meta[property=\"article:published_time\"]') ??\n getMetaContent(doc, 'meta[name=\"datePublished\"]');\n\n if (!published) {\n const jsonLd = doc.querySelector('script[type=\"application/ld+json\"]');\n if (jsonLd?.textContent) {\n try {\n const data = JSON.parse(jsonLd.textContent) as Record<string, unknown>;\n if (typeof data.datePublished === \"string\") {\n return data.datePublished.split(\"T\")[0];\n }\n } catch {\n // JSON-LD parse failed\n }\n }\n return null;\n }\n\n return published.split(\"T\")[0];\n}\n\nfunction extractAuthors(\n doc: Document,\n readabilityByline: string | null,\n): string[] {\n const articleAuthors = doc.querySelectorAll('meta[property=\"article:author\"]');\n if (articleAuthors.length > 0) {\n return Array.from(articleAuthors)\n .map((el) => el.getAttribute(\"content\"))\n .filter((v): v is string => v !== null)\n .map(formatAuthor);\n }\n\n const ogAuthor = getMetaContent(doc, 'meta[property=\"og:author\"]');\n if (ogAuthor) {\n return [formatAuthor(ogAuthor)];\n }\n\n if (readabilityByline) {\n return [formatAuthor(readabilityByline)];\n }\n\n return [];\n}\n\nexport interface ExtractResult {\n metadata: Metadata;\n content: string;\n}\n\ninterface ReadabilityArticle {\n title: string;\n byline: string | null;\n excerpt: string;\n content: string;\n}\n\nfunction buildMetadata(\n doc: Document,\n article: ReadabilityArticle | null,\n finalUrl: string,\n): Metadata {\n const title = article?.title ?? doc.title;\n const authors = extractAuthors(doc, article?.byline ?? null);\n const published = extractPublishedDate(doc);\n\n const description =\n getMetaContent(doc, 'meta[property=\"og:description\"]') ??\n getMetaContent(doc, 'meta[name=\"description\"]') ??\n (article?.excerpt ?? null);\n\n const today = new Date().toISOString().split(\"T\")[0];\n\n return {\n title,\n source: finalUrl,\n author: authors,\n published,\n created: today,\n description,\n };\n}\n\nfunction parseWithReadability(html: string, url: string): ReadabilityArticle | null {\n const dom = new JSDOM(html, { url });\n const reader = new Readability(dom.window.document);\n return reader.parse() as ReadabilityArticle | null;\n}\n\nexport function extract(html: string, finalUrl: string): ExtractResult {\n const metaDom = new JSDOM(html, { url: finalUrl });\n const doc = metaDom.window.document;\n\n const article = parseWithReadability(html, finalUrl);\n\n if (!article) {\n throw new Error(\n \"Readability failed to extract content from the page. \" +\n \"Try --raw to convert the full page, or --selector <css> to target specific content.\",\n );\n }\n\n if (!article.content) {\n throw new Error(\"Readability returned empty content for the page\");\n }\n\n return {\n metadata: buildMetadata(doc, article, finalUrl),\n content: article.content,\n };\n}\n\nexport function extractMetadata(html: string, finalUrl: string): Metadata {\n const metaDom = new JSDOM(html, { url: finalUrl });\n const doc = metaDom.window.document;\n\n const article = parseWithReadability(html, finalUrl);\n\n return buildMetadata(doc, article, finalUrl);\n}\n","import TurndownService from \"turndown\";\n\nexport function convertToMarkdown(html: string): string {\n const turndown = new TurndownService({\n headingStyle: \"atx\",\n codeBlockStyle: \"fenced\",\n bulletListMarker: \"-\",\n });\n\n return turndown.turndown(html);\n}\n","import { writeFileSync } from \"node:fs\";\nimport { join } from \"node:path\";\nimport yaml from \"js-yaml\";\nimport type { Metadata } from \"./types.js\";\n\nconst UNSAFE_CHARS = /[/\\\\:*?\"<>|]/g;\nconst CONTROL_CHARS = /[\\x00-\\x1f\\x7f]/g;\nconst MAX_FILENAME_LENGTH = 200;\n\nexport function sanitizeFilename(title: string): string {\n const sanitized = title\n .replace(CONTROL_CHARS, \"\")\n .replace(UNSAFE_CHARS, \"\")\n .replace(/\\s+/g, \" \")\n .trim();\n const base = sanitized.slice(0, MAX_FILENAME_LENGTH) || \"Untitled\";\n return `${base}.md`;\n}\n\nexport function buildFrontmatter(metadata: Metadata, tags: string[]): string {\n const data: Record<string, unknown> = {\n title: metadata.title,\n source: metadata.source,\n };\n\n if (metadata.author.length > 0) {\n data.author = metadata.author;\n }\n\n if (metadata.published) {\n data.published = metadata.published;\n }\n\n data.created = metadata.created;\n\n if (metadata.description) {\n data.description = metadata.description;\n }\n\n data.tags = tags;\n\n const yamlStr = yaml.dump(data, {\n quotingType: '\"',\n forceQuotes: false,\n lineWidth: -1,\n sortKeys: false,\n });\n\n return `---\\n${yamlStr}---`;\n}\n\nexport function writeMarkdownFile(\n dest: string,\n metadata: Metadata,\n markdownContent: string,\n tags: string[],\n): string {\n const filename = sanitizeFilename(metadata.title);\n const filePath = join(dest, filename);\n const frontmatter = buildFrontmatter(metadata, tags);\n const fullContent = `${frontmatter}\\n\\n${markdownContent}\\n`;\n\n writeFileSync(filePath, fullContent, \"utf-8\");\n\n return filePath;\n}\n"],"mappings":";;;AAAA,SAAS,eAAe;AACxB,SAAS,YAAY;AACrB,SAAS,cAAAA,mBAAkB;AAC3B,SAAS,WAAAC,gBAAe;AACxB,SAAS,QAAAC,aAAY;;;ACJrB,SAAS,oBAAoB;AAC7B,SAAS,eAAe;AACxB,SAAS,eAAe;AACxB,OAAO,UAAU;AAGjB,IAAM,kBAAkB;AACxB,IAAM,qBAAsC;AAC5C,IAAM,eAAe;AAyBrB,SAAS,YAAY,UAA0B;AAC7C,MAAI,SAAS,WAAW,IAAI,GAAG;AAC7B,WAAO,QAAQ,QAAQ,GAAG,SAAS,MAAM,CAAC,CAAC;AAAA,EAC7C;AACA,SAAO;AACT;AAEA,IAAM,mBAAsC,CAAC,QAAQ,oBAAoB,aAAa;AAEtF,SAAS,kBAAkB,OAAgC;AACzD,MAAI,CAAC,iBAAiB,SAAS,KAAK,GAAG;AACrC,UAAM,IAAI;AAAA,MACR,6BAA6B,KAAK,sBAAsB,iBAAiB,KAAK,IAAI,CAAC;AAAA,IACrF;AAAA,EACF;AACA,SAAO;AACT;AAEA,SAAS,eAAe,YAAgC;AACtD,QAAM,UAAU,aAAa,YAAY,OAAO;AAChD,QAAM,SAAS,KAAK,KAAK,OAAO;AAChC,MAAI,WAAW,QAAQ,OAAO,WAAW,UAAU;AACjD,UAAM,IAAI,MAAM,wBAAwB,UAAU,EAAE;AAAA,EACtD;AACA,QAAM,SAAS;AAEf,MAAI,OAAO,YAAY,UAAa,OAAO,OAAO,YAAY,UAAU;AACtE,UAAM,IAAI,MAAM,wDAAwD,OAAO,OAAO,OAAO,EAAE;AAAA,EACjG;AACA,MAAI,OAAO,SAAS,UAAa,OAAO,OAAO,SAAS,UAAU;AAChE,UAAM,IAAI,MAAM,qDAAqD,OAAO,OAAO,IAAI,EAAE;AAAA,EAC3F;AACA,MAAI,OAAO,cAAc,QAAW;AAClC,QAAI,OAAO,OAAO,cAAc,UAAU;AACxC,YAAM,IAAI,MAAM,0DAA0D,OAAO,OAAO,SAAS,EAAE;AAAA,IACrG;AACA,sBAAkB,OAAO,SAAS;AAAA,EACpC;AACA,MAAI,OAAO,SAAS,QAAW;AAC7B,QAAI,CAAC,MAAM,QAAQ,OAAO,IAAI,KAAK,CAAC,OAAO,KAAK,MAAM,CAAC,MAAe,OAAO,MAAM,QAAQ,GAAG;AAC5F,YAAM,IAAI,MAAM,wDAAwD;AAAA,IAC1E;AAAA,EACF;AAEA,SAAO;AACT;AAEO,SAAS,cACd,YACA,YACgB;AAEhB,MAAI,aAAyB,CAAC;AAC9B,MAAI,YAAY;AACd,iBAAa,eAAe,UAAU;AAAA,EACxC;AAGA,QAAM,UAAU,QAAQ,IAAI;AAC5B,QAAM,aAAa,QAAQ,IAAI;AAG/B,QAAM,OAAO,WAAW,QAAQ,WAAW,WAAW;AACtD,MAAI,SAAS,QAAW;AACtB,UAAM,IAAI;AAAA,MACR;AAAA,IACF;AAAA,EACF;AAEA,MAAI;AACJ,MAAI,WAAW,YAAY,QAAW;AACpC,cAAU,WAAW;AAAA,EACvB,WAAW,eAAe,QAAW;AACnC,UAAM,SAAS,OAAO,UAAU;AAChC,QAAI,OAAO,MAAM,MAAM,GAAG;AACxB,YAAM,IAAI,MAAM,sCAAsC,UAAU,EAAE;AAAA,IACpE;AACA,cAAU;AAAA,EACZ,OAAO;AACL,cAAU,WAAW,WAAW;AAAA,EAClC;AAEA,QAAM,eAAe,WAAW,aAAa,WAAW,aAAa;AACrE,QAAM,YAAY,kBAAkB,YAAY;AAGhD,QAAM,UAAU;AAAA,IACd,GAAI,WAAW,QAAQ,CAAC;AAAA,IACxB,GAAI,WAAW,QAAQ,CAAC;AAAA,IACxB;AAAA,EACF;AACA,QAAM,OAAO,CAAC,GAAG,IAAI,IAAI,OAAO,CAAC;AAEjC,SAAO;AAAA,IACL,MAAM,YAAY,IAAI;AAAA,IACtB;AAAA,IACA;AAAA,IACA;AAAA,IACA,QAAQ,WAAW,UAAU;AAAA,IAC7B,UAAU,WAAW,YAAY;AAAA,IACjC,OAAO,WAAW,SAAS;AAAA,IAC3B,WAAW,WAAW,aAAa;AAAA,IACnC,QAAQ,WAAW,UAAU;AAAA,IAC7B,aAAa,WAAW,eAAe;AAAA,IACvC,YAAY,WAAW,cAAc;AAAA,IACrC,YAAY,WAAW,cAAc;AAAA,IACrC,KAAK,WAAW,OAAO;AAAA,EACzB;AACF;;;AC7IA,SAAS,gBAAqC;;;ACA9C,SAAS,YAAY,iBAAiB;AACtC,SAAS,YAAY;AACrB,SAAS,WAAAC,gBAAe;AAExB,IAAM,aAAa,KAAKA,SAAQ,GAAG,WAAW,aAAa;AAC3D,IAAM,eAAe,KAAK,YAAY,UAAU;AAEzC,SAAS,gBAAwB;AACtC,SAAO;AACT;AAEA,SAAS,cAAc,KAAqB;AAC1C,QAAM,SAAS,IAAI,IAAI,GAAG;AAC1B,SAAO,OAAO,YAAY;AAC5B;AAEO,SAAS,eAAe,KAAa,aAA6B;AACvE,QAAM,SAAS,cAAc,GAAG;AAChC,SAAO,KAAK,aAAa,GAAG,MAAM,OAAO;AAC3C;AAEO,SAAS,cAAc,KAAa,aAA8B;AACvE,QAAM,cAAc,eAAe,KAAK,WAAW;AACnD,SAAO,WAAW,WAAW;AAC/B;AAEO,SAAS,iBAAiB,aAA2B;AAC1D,MAAI,CAAC,WAAW,WAAW,GAAG;AAC5B,cAAU,aAAa,EAAE,WAAW,KAAK,CAAC;AAAA,EAC5C;AACF;;;AD1BO,IAAM,oBACX;AAUK,SAAS,iBAAiB,aAA8B;AAC7D,SAAO,YAAY,YAAY,EAAE,SAAS,iBAAiB;AAC7D;AAEO,SAAS,0BAA0B,SAAuC;AAC/E,QAAM,UAAU,oBAAI,IAAY;AAChC,MAAI,QAAQ,YAAa,SAAQ,IAAI,OAAO;AAC5C,MAAI,QAAQ,WAAY,SAAQ,IAAI,MAAM;AAC1C,MAAI,QAAQ,WAAY,SAAQ,IAAI,OAAO;AAC3C,SAAO;AACT;AAEA,IAAM,kBAAkB;AAEjB,SAAS,kBAAkB,WAAmB,WAAyB;AAC5E,MAAI,UAAU,WAAW,GAAG;AAC1B,UAAM,IAAI,MAAM,oCAAoC,SAAS,EAAE;AAAA,EACjE;AACA,QAAM,SAAS,UAAU,SAAS,GAAG,gBAAgB,MAAM,EAAE,SAAS,OAAO;AAC7E,MAAI,CAAC,OAAO,WAAW,eAAe,GAAG;AACvC,UAAM,IAAI;AAAA,MACR,gFAAgF,SAAS;AAAA,IAC3F;AAAA,EACF;AACF;AAEA,eAAe,YACb,SACA,KACA,WACkD;AAClD,QAAM,cAAc,MAAM,QAAQ,QAAQ,IAAI,KAAK,EAAE,SAAS,UAAU,CAAC;AACzE,QAAM,SAAS,YAAY,OAAO;AAClC,MAAI,UAAU,KAAK;AACjB,UAAM,IAAI,MAAM,QAAQ,MAAM,uCAAuC,GAAG,EAAE;AAAA,EAC5E;AACA,QAAM,YAAY,OAAO,KAAK,MAAM,YAAY,KAAK,CAAC;AACtD,QAAM,WAAW,YAAY,IAAI;AACjC,oBAAkB,WAAW,QAAQ;AACrC,SAAO,EAAE,WAAW,SAAS;AAC/B;AAEA,eAAsB,UACpB,KACA,QACA,aACsB;AACtB,QAAM,UAAU,MAAM,SAAS,OAAO;AAAA,IACpC,UAAU,CAAC,OAAO;AAAA,EACpB,CAAC;AAED,MAAI;AACF,UAAM,iBAA2D;AAAA,MAC/D,WAAW;AAAA,IACb;AAGA,QAAI,CAAC,OAAO,aAAa,cAAc,KAAK,WAAW,GAAG;AACxD,YAAM,cAAc,eAAe,KAAK,WAAW;AACnD,qBAAe,eAAe;AAAA,IAChC;AAEA,UAAM,UAA0B,MAAM,QAAQ,WAAW,cAAc;AACvE,UAAM,OAAO,MAAM,QAAQ,QAAQ;AAGnC,UAAM,eAAe,0BAA0B,MAAM;AACrD,QAAI,aAAa,OAAO,GAAG;AACzB,YAAM,KAAK,MAAM,QAAQ,OAAO,UAAU;AACxC,YAAI,aAAa,IAAI,MAAM,QAAQ,EAAE,aAAa,CAAC,GAAG;AACpD,gBAAM,MAAM,MAAM;AAAA,QACpB,OAAO;AACL,gBAAM,MAAM,SAAS;AAAA,QACvB;AAAA,MACF,CAAC;AAAA,IACH;AAEA,UAAM,YAAY,OAAO,UAAU;AAKnC,QAAI;AACJ,QAAI;AACF,iBAAW,MAAM,KAAK,KAAK,KAAK;AAAA,QAC9B,WAAW,OAAO;AAAA,QAClB,SAAS;AAAA,MACX,CAAC;AAAA,IACH,SAAS,OAAO;AACd,UACE,iBAAiB,SACjB,MAAM,QAAQ,SAAS,sBAAsB,GAC7C;AACA,cAAM,SAAS,MAAM,YAAY,SAAS,KAAK,SAAS;AACxD,cAAM,QAAQ,MAAM;AACpB,eAAO,EAAE,MAAM,OAAO,WAAW,OAAO,WAAW,KAAK,UAAU,OAAO,SAAS;AAAA,MACpF;AACA,YAAM;AAAA,IACR;AAEA,QAAI,CAAC,UAAU;AACb,YAAM,IAAI,MAAM,6BAA6B,GAAG,EAAE;AAAA,IACpD;AAEA,UAAM,SAAS,SAAS,OAAO;AAC/B,QAAI,UAAU,KAAK;AACjB,YAAM,IAAI,MAAM,QAAQ,MAAM,kBAAkB,SAAS,IAAI,CAAC,EAAE;AAAA,IAClE;AAEA,UAAM,WAAW,SAAS,IAAI;AAC9B,UAAM,cAAc,SAAS,QAAQ,EAAE,cAAc,KAAK;AAK1D,QAAI,iBAAiB,WAAW,GAAG;AACjC,YAAM,OAAO,MAAM,SAAS,KAAK;AACjC,UAAI;AACF,0BAAkB,MAAM,QAAQ;AAChC,cAAM,QAAQ,MAAM;AACpB,eAAO,EAAE,MAAM,OAAO,WAAW,MAAM,KAAK,SAAS;AAAA,MACvD,QAAQ;AAEN,cAAM,SAAS,MAAM,YAAY,SAAS,UAAU,SAAS;AAC7D,cAAM,QAAQ,MAAM;AACpB,eAAO,EAAE,MAAM,OAAO,WAAW,OAAO,WAAW,KAAK,UAAU,OAAO,SAAS;AAAA,MACpF;AAAA,IACF;AAEA,UAAM,WAAW,MAAM,KAAK,QAAQ;AACpC,QAAI;AAEJ,QAAI,OAAO,UAAU;AACnB,YAAM,UAAU,MAAM,KAAK,EAAE,OAAO,QAAQ;AAC5C,UAAI,CAAC,SAAS;AACZ,cAAM,IAAI,MAAM,uBAAuB,OAAO,QAAQ,EAAE;AAAA,MAC1D;AACA,aAAO,MAAM,QAAQ,UAAU;AAAA,IACjC,OAAO;AACL,aAAO;AAAA,IACT;AAEA,UAAM,QAAQ,MAAM;AAEpB,WAAO,EAAE,MAAM,QAAQ,MAAM,UAAU,KAAK,SAAS;AAAA,EACvD,UAAE;AACA,UAAM,QAAQ,MAAM;AAAA,EACtB;AACF;;;AEnKA,SAAS,mBAAmB;AAC5B,SAAS,aAAa;AAGtB,SAAS,eAAe,KAAe,UAAiC;AACtE,QAAM,KAAK,IAAI,cAAc,QAAQ;AACrC,SAAO,IAAI,aAAa,SAAS,KAAK;AACxC;AAEA,SAAS,aAAa,KAAqB;AACzC,SAAO,KAAK,IAAI,KAAK,CAAC;AACxB;AAEA,SAAS,qBAAqB,KAA8B;AAC1D,QAAM,YACJ,eAAe,KAAK,yCAAyC,KAC7D,eAAe,KAAK,4BAA4B;AAElD,MAAI,CAAC,WAAW;AACd,UAAM,SAAS,IAAI,cAAc,oCAAoC;AACrE,QAAI,QAAQ,aAAa;AACvB,UAAI;AACF,cAAM,OAAO,KAAK,MAAM,OAAO,WAAW;AAC1C,YAAI,OAAO,KAAK,kBAAkB,UAAU;AAC1C,iBAAO,KAAK,cAAc,MAAM,GAAG,EAAE,CAAC;AAAA,QACxC;AAAA,MACF,QAAQ;AAAA,MAER;AAAA,IACF;AACA,WAAO;AAAA,EACT;AAEA,SAAO,UAAU,MAAM,GAAG,EAAE,CAAC;AAC/B;AAEA,SAAS,eACP,KACA,mBACU;AACV,QAAM,iBAAiB,IAAI,iBAAiB,iCAAiC;AAC7E,MAAI,eAAe,SAAS,GAAG;AAC7B,WAAO,MAAM,KAAK,cAAc,EAC7B,IAAI,CAAC,OAAO,GAAG,aAAa,SAAS,CAAC,EACtC,OAAO,CAAC,MAAmB,MAAM,IAAI,EACrC,IAAI,YAAY;AAAA,EACrB;AAEA,QAAM,WAAW,eAAe,KAAK,4BAA4B;AACjE,MAAI,UAAU;AACZ,WAAO,CAAC,aAAa,QAAQ,CAAC;AAAA,EAChC;AAEA,MAAI,mBAAmB;AACrB,WAAO,CAAC,aAAa,iBAAiB,CAAC;AAAA,EACzC;AAEA,SAAO,CAAC;AACV;AAcA,SAAS,cACP,KACA,SACA,UACU;AACV,QAAM,QAAQ,SAAS,SAAS,IAAI;AACpC,QAAM,UAAU,eAAe,KAAK,SAAS,UAAU,IAAI;AAC3D,QAAM,YAAY,qBAAqB,GAAG;AAE1C,QAAM,cACJ,eAAe,KAAK,iCAAiC,KACrD,eAAe,KAAK,0BAA0B,MAC7C,SAAS,WAAW;AAEvB,QAAM,SAAQ,oBAAI,KAAK,GAAE,YAAY,EAAE,MAAM,GAAG,EAAE,CAAC;AAEnD,SAAO;AAAA,IACL;AAAA,IACA,QAAQ;AAAA,IACR,QAAQ;AAAA,IACR;AAAA,IACA,SAAS;AAAA,IACT;AAAA,EACF;AACF;AAEA,SAAS,qBAAqB,MAAc,KAAwC;AAClF,QAAM,MAAM,IAAI,MAAM,MAAM,EAAE,IAAI,CAAC;AACnC,QAAM,SAAS,IAAI,YAAY,IAAI,OAAO,QAAQ;AAClD,SAAO,OAAO,MAAM;AACtB;AAEO,SAAS,QAAQ,MAAc,UAAiC;AACrE,QAAM,UAAU,IAAI,MAAM,MAAM,EAAE,KAAK,SAAS,CAAC;AACjD,QAAM,MAAM,QAAQ,OAAO;AAE3B,QAAM,UAAU,qBAAqB,MAAM,QAAQ;AAEnD,MAAI,CAAC,SAAS;AACZ,UAAM,IAAI;AAAA,MACR;AAAA,IAEF;AAAA,EACF;AAEA,MAAI,CAAC,QAAQ,SAAS;AACpB,UAAM,IAAI,MAAM,iDAAiD;AAAA,EACnE;AAEA,SAAO;AAAA,IACL,UAAU,cAAc,KAAK,SAAS,QAAQ;AAAA,IAC9C,SAAS,QAAQ;AAAA,EACnB;AACF;AAEO,SAAS,gBAAgB,MAAc,UAA4B;AACxE,QAAM,UAAU,IAAI,MAAM,MAAM,EAAE,KAAK,SAAS,CAAC;AACjD,QAAM,MAAM,QAAQ,OAAO;AAE3B,QAAM,UAAU,qBAAqB,MAAM,QAAQ;AAEnD,SAAO,cAAc,KAAK,SAAS,QAAQ;AAC7C;;;ACtIA,OAAO,qBAAqB;AAErB,SAAS,kBAAkB,MAAsB;AACtD,QAAM,WAAW,IAAI,gBAAgB;AAAA,IACnC,cAAc;AAAA,IACd,gBAAgB;AAAA,IAChB,kBAAkB;AAAA,EACpB,CAAC;AAED,SAAO,SAAS,SAAS,IAAI;AAC/B;;;ACVA,SAAS,qBAAqB;AAC9B,SAAS,QAAAC,aAAY;AACrB,OAAOC,WAAU;AAGjB,IAAM,eAAe;AACrB,IAAM,gBAAgB;AACtB,IAAM,sBAAsB;AAErB,SAAS,iBAAiB,OAAuB;AACtD,QAAM,YAAY,MACf,QAAQ,eAAe,EAAE,EACzB,QAAQ,cAAc,EAAE,EACxB,QAAQ,QAAQ,GAAG,EACnB,KAAK;AACR,QAAM,OAAO,UAAU,MAAM,GAAG,mBAAmB,KAAK;AACxD,SAAO,GAAG,IAAI;AAChB;AAEO,SAAS,iBAAiB,UAAoB,MAAwB;AAC3E,QAAM,OAAgC;AAAA,IACpC,OAAO,SAAS;AAAA,IAChB,QAAQ,SAAS;AAAA,EACnB;AAEA,MAAI,SAAS,OAAO,SAAS,GAAG;AAC9B,SAAK,SAAS,SAAS;AAAA,EACzB;AAEA,MAAI,SAAS,WAAW;AACtB,SAAK,YAAY,SAAS;AAAA,EAC5B;AAEA,OAAK,UAAU,SAAS;AAExB,MAAI,SAAS,aAAa;AACxB,SAAK,cAAc,SAAS;AAAA,EAC9B;AAEA,OAAK,OAAO;AAEZ,QAAM,UAAUA,MAAK,KAAK,MAAM;AAAA,IAC9B,aAAa;AAAA,IACb,aAAa;AAAA,IACb,WAAW;AAAA,IACX,UAAU;AAAA,EACZ,CAAC;AAED,SAAO;AAAA,EAAQ,OAAO;AACxB;AAEO,SAAS,kBACd,MACA,UACA,iBACA,MACQ;AACR,QAAM,WAAW,iBAAiB,SAAS,KAAK;AAChD,QAAM,WAAWD,MAAK,MAAM,QAAQ;AACpC,QAAM,cAAc,iBAAiB,UAAU,IAAI;AACnD,QAAM,cAAc,GAAG,WAAW;AAAA;AAAA,EAAO,eAAe;AAAA;AAExD,gBAAc,UAAU,aAAa,OAAO;AAE5C,SAAO;AACT;;;ANhDA,IAAM,cAAcE,MAAKC,SAAQ,GAAG,WAAW,eAAe,aAAa;AAE3E,IAAM,UAAU,IAAI,QAAQ;AAE5B,QACG,KAAK,aAAa,EAClB;AAAA,EACC;AACF,EACC,QAAQ,OAAO;AAElB,QACG,QAAQ,OAAO,EACf,YAAY,mCAAmC,EAC/C,SAAS,SAAS,cAAc,EAChC,OAAO,iBAAiB,uBAAuB,EAC/C,OAAO,YAAY,4BAA4B,EAC/C,OAAO,oBAAoB,yBAAyB,EACpD,OAAO,uBAAuB,sBAAsB,QAAQ,EAC5D,OAAO,gBAAgB,wBAAwB,CAAC,KAAa,QAAkB;AAC9E,MAAI,KAAK,GAAG;AACZ,SAAO;AACT,GAAG,CAAC,CAAa,EAChB;AAAA,EACC;AAAA,EACA;AACF,EACC,OAAO,kBAAkB,0BAA0B,EACnD,OAAO,aAAa,oCAAoC,EACxD,OAAO,qBAAqB,6BAA6B,EACzD,OAAO,oBAAoB,4BAA4B,EACvD,OAAO,oBAAoB,6BAA6B,EACxD,OAAO,SAAS,uDAAuD,EACvE,OAAO,kBAAkB,iDAAiD,EAC1E,OAAO,OAAO,KAAa,YAAqC;AAC/D,MAAI;AACF,UAAM,aAAaC,YAAW,WAAW,IAAI,cAAc;AAC3D,UAAM,SAAS;AAAA,MACb;AAAA,QACE,MAAM,QAAQ;AAAA,QACd,MAAM,QAAQ;AAAA,QACd,SAAS,QAAQ;AAAA,QACjB,WAAW,QAAQ;AAAA,QACnB,QAAQ,QAAQ;AAAA,QAChB,UAAU,QAAQ;AAAA,QAClB,OAAO,QAAQ;AAAA,QACf,WAAW,QAAQ;AAAA,QACnB,QAAQ,QAAQ;AAAA,QAChB,aAAa,QAAQ;AAAA,QACrB,YAAY,QAAQ;AAAA,QACpB,YAAY,QAAQ;AAAA,QACpB,KAAK,QAAQ;AAAA,MACf;AAAA,MACA;AAAA,IACF;AAEA,QAAI,OAAO,OAAO,OAAO,UAAU;AACjC,YAAM,IAAI,MAAM,+CAA+C;AAAA,IACjE;AAGA,QAAI,CAAC,OAAO,UAAU,CAACA,YAAW,OAAO,IAAI,GAAG;AAC9C,YAAM,IAAI,MAAM,yCAAyC,OAAO,IAAI,EAAE;AAAA,IACxE;AAEA,UAAM,cAAc,cAAc;AAClC,UAAM,cAAc,MAAM,UAAU,KAAK,QAAQ,WAAW;AAE5D,QAAI;AACJ,QAAI;AAEJ,QAAI,YAAY,SAAS,OAAO;AAC9B,UAAI,OAAO,UAAU;AACnB,cAAM,IAAI,MAAM,0CAA0C;AAAA,MAC5D;AACA,UAAI,OAAO,KAAK;AACd,cAAM,IAAI,MAAM,qCAAqC;AAAA,MACvD;AACA,YAAM,EAAE,qBAAqB,IAAI,MAAM,OAAO,6BAAoB;AAClE,YAAM,YAAY,MAAM;AAAA,QACtB,YAAY;AAAA,QACZ,YAAY;AAAA,MACd;AACA,iBAAW,UAAU;AACrB,iBAAW,UAAU;AAAA,IACvB,WAAW,OAAO,UAAU;AAE1B,iBAAW,gBAAgB,YAAY,UAAU,YAAY,QAAQ;AACrE,iBAAW,kBAAkB,YAAY,IAAI;AAAA,IAC/C,WAAW,OAAO,KAAK;AAErB,iBAAW,gBAAgB,YAAY,UAAU,YAAY,QAAQ;AACrE,iBAAW,kBAAkB,YAAY,QAAQ;AAAA,IACnD,OAAO;AACL,YAAM,SAAS,QAAQ,YAAY,MAAM,YAAY,QAAQ;AAC7D,iBAAW,OAAO;AAClB,iBAAW,kBAAkB,OAAO,OAAO;AAAA,IAC7C;AAEA,QAAI,OAAO,UAAU,MAAM;AACzB,iBAAW,EAAE,GAAG,UAAU,OAAO,OAAO,MAAM;AAAA,IAChD;AAEA,QAAI,OAAO,QAAQ;AACjB,YAAM,cAAc,iBAAiB,UAAU,OAAO,IAAI;AAC1D,cAAQ,OAAO,MAAM,GAAG,WAAW;AAAA;AAAA,EAAO,QAAQ;AAAA,CAAI;AAAA,IACxD,OAAO;AACL,YAAM,WAAW;AAAA,QACf,OAAO;AAAA,QACP;AAAA,QACA;AAAA,QACA,OAAO;AAAA,MACT;AACA,cAAQ,MAAM,UAAU,QAAQ,EAAE;AAAA,IACpC;AAAA,EACF,SAAS,OAAO;AACd,UAAM,UAAU,iBAAiB,QAAQ,MAAM,UAAU,OAAO,KAAK;AACrE,YAAQ,MAAM,UAAU,OAAO,EAAE;AACjC,YAAQ,KAAK,CAAC;AAAA,EAChB;AACF,CAAC;AAEH,QACG,QAAQ,OAAO,EACf,YAAY,kCAAkC,EAC9C,SAAS,SAAS,cAAc,EAChC,OAAO,uBAAuB,4BAA4B,QAAQ,EAClE,OAAO,OAAO,KAAa,YAAqC;AAC/D,QAAM,EAAE,UAAAC,UAAS,IAAI,MAAM,OAAO,YAAY;AAC9C,QAAM,cAAc,cAAc;AAClC,mBAAiB,WAAW;AAE5B,QAAM,aAAc,QAAQ,WAAkC;AAC9D,QAAM,UAAU,MAAMA,UAAS,OAAO,EAAE,UAAU,MAAM,CAAC;AAEzD,MAAI;AACF,UAAM,UAAU,MAAM,QAAQ,WAAW;AACzC,UAAM,OAAO,MAAM,QAAQ,QAAQ;AAEnC,UAAM,KAAK,KAAK,KAAK,EAAE,WAAW,eAAe,SAAS,aAAa,IAAK,CAAC;AAE7E,YAAQ,MAAM,yEAAyE;AAEvF,YAAQ,MAAM,OAAO;AACrB,UAAM,KAAK,QAAQ,OAAO,MAAM;AAChC,YAAQ,MAAM,MAAM;AACpB,YAAQ,MAAM,MAAM;AAEpB,UAAM,cAAc,eAAe,KAAK,WAAW;AACnD,UAAM,QAAQ,aAAa,EAAE,MAAM,YAAY,CAAC;AAChD,YAAQ,MAAM,kBAAkB,WAAW,EAAE;AAAA,EAC/C,SAAS,OAAO;AACd,UAAM,UAAU,iBAAiB,QAAQ,MAAM,UAAU,OAAO,KAAK;AACrE,YAAQ,MAAM,UAAU,OAAO,EAAE;AACjC,YAAQ,KAAK,CAAC;AAAA,EAChB,UAAE;AACA,UAAM,QAAQ,MAAM;AAAA,EACtB;AACF,CAAC;AAEH,QAAQ,MAAM;","names":["existsSync","homedir","join","homedir","join","yaml","join","homedir","existsSync","chromium"]}
|
|
@@ -0,0 +1,69 @@
|
|
|
1
|
+
#!/usr/bin/env node
|
|
2
|
+
|
|
3
|
+
// src/pdf-converter.ts
|
|
4
|
+
import pdf2md from "@opendocsg/pdf2md";
|
|
5
|
+
var PDF_DATE_PATTERN = /^D:(\d{4})(\d{2})(\d{2})/;
|
|
6
|
+
function parsePdfDate(dateStr) {
|
|
7
|
+
const match = dateStr.match(PDF_DATE_PATTERN);
|
|
8
|
+
if (!match) return null;
|
|
9
|
+
const [, year, month, day] = match;
|
|
10
|
+
return `${year}-${month}-${day}`;
|
|
11
|
+
}
|
|
12
|
+
var AUTHOR_SEPARATOR = /\s*(?:;|\s+and\s+|&)\s*/;
|
|
13
|
+
function formatAuthor(raw) {
|
|
14
|
+
const sanitized = raw.trim().replace(/[[\]]/g, "");
|
|
15
|
+
return `[[${sanitized}]]`;
|
|
16
|
+
}
|
|
17
|
+
function parseAuthors(raw) {
|
|
18
|
+
return raw.split(AUTHOR_SEPARATOR).map((s) => s.trim()).filter(Boolean).map(formatAuthor);
|
|
19
|
+
}
|
|
20
|
+
async function convertPdfToMarkdown(pdfBuffer, sourceUrl) {
|
|
21
|
+
let pdfMeta = null;
|
|
22
|
+
const markdown = await pdf2md(pdfBuffer, {
|
|
23
|
+
metadataParsed: (metadata) => {
|
|
24
|
+
pdfMeta = metadata;
|
|
25
|
+
}
|
|
26
|
+
});
|
|
27
|
+
if (!markdown.trim()) {
|
|
28
|
+
throw new Error("pdf2md returned empty content from the PDF");
|
|
29
|
+
}
|
|
30
|
+
const xmpTitle = pdfMeta?.metadata?.get("dc:title");
|
|
31
|
+
const infoTitle = pdfMeta?.info.Title;
|
|
32
|
+
const title = (xmpTitle && xmpTitle.trim() ? xmpTitle.trim() : null) ?? (infoTitle && infoTitle.trim() ? infoTitle.trim() : null) ?? extractTitleFromMarkdown(markdown) ?? extractTitleFromUrl(sourceUrl);
|
|
33
|
+
const rawAuthor = pdfMeta?.info.Author;
|
|
34
|
+
const author = rawAuthor && rawAuthor.trim() ? parseAuthors(rawAuthor) : [];
|
|
35
|
+
const rawDate = pdfMeta?.info.CreationDate;
|
|
36
|
+
const published = rawDate ? parsePdfDate(rawDate) : null;
|
|
37
|
+
const today = (/* @__PURE__ */ new Date()).toISOString().split("T")[0];
|
|
38
|
+
return {
|
|
39
|
+
markdown,
|
|
40
|
+
metadata: {
|
|
41
|
+
title,
|
|
42
|
+
source: sourceUrl,
|
|
43
|
+
author,
|
|
44
|
+
published,
|
|
45
|
+
created: today,
|
|
46
|
+
description: null
|
|
47
|
+
}
|
|
48
|
+
};
|
|
49
|
+
}
|
|
50
|
+
function extractTitleFromMarkdown(markdown) {
|
|
51
|
+
const match = markdown.match(/^#\s+(.+?)(?:\s+#+)?$/m);
|
|
52
|
+
if (!match) return null;
|
|
53
|
+
const trimmed = match[1].trim();
|
|
54
|
+
return trimmed || null;
|
|
55
|
+
}
|
|
56
|
+
function extractTitleFromUrl(url) {
|
|
57
|
+
const pathname = new URL(url).pathname;
|
|
58
|
+
const lastSegment = pathname.split("/").filter(Boolean).pop();
|
|
59
|
+
if (!lastSegment) {
|
|
60
|
+
throw new Error(`Cannot extract filename from URL: ${url}`);
|
|
61
|
+
}
|
|
62
|
+
return decodeURIComponent(lastSegment.replace(/\.pdf$/i, ""));
|
|
63
|
+
}
|
|
64
|
+
export {
|
|
65
|
+
convertPdfToMarkdown,
|
|
66
|
+
extractTitleFromUrl,
|
|
67
|
+
parsePdfDate
|
|
68
|
+
};
|
|
69
|
+
//# sourceMappingURL=pdf-converter-FM6DCBO5.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"sources":["../src/pdf-converter.ts"],"sourcesContent":["import pdf2md from \"@opendocsg/pdf2md\";\nimport type { Metadata } from \"./types.js\";\n\nexport interface PdfConvertResult {\n markdown: string;\n metadata: Metadata;\n}\n\ninterface PdfMetadataInfo {\n Title?: string;\n Author?: string;\n Creator?: string;\n CreationDate?: string;\n}\n\ninterface PdfRawMetadata {\n info: PdfMetadataInfo;\n metadata: { get: (name: string) => string | null } | null;\n}\n\nconst PDF_DATE_PATTERN = /^D:(\\d{4})(\\d{2})(\\d{2})/;\n\nexport function parsePdfDate(dateStr: string): string | null {\n const match = dateStr.match(PDF_DATE_PATTERN);\n if (!match) return null;\n const [, year, month, day] = match;\n return `${year}-${month}-${day}`;\n}\n\nconst AUTHOR_SEPARATOR = /\\s*(?:;|\\s+and\\s+|&)\\s*/;\n\nfunction formatAuthor(raw: string): string {\n const sanitized = raw.trim().replace(/[[\\]]/g, \"\");\n return `[[${sanitized}]]`;\n}\n\nfunction parseAuthors(raw: string): string[] {\n return raw\n .split(AUTHOR_SEPARATOR)\n .map((s) => s.trim())\n .filter(Boolean)\n .map(formatAuthor);\n}\n\nexport async function convertPdfToMarkdown(\n pdfBuffer: Buffer,\n sourceUrl: string,\n): Promise<PdfConvertResult> {\n // TypeScript narrows this to `null` without the assertion because\n // it cannot see that metadataParsed is called synchronously inside pdf2md.\n let pdfMeta = null as PdfRawMetadata | null;\n\n const markdown = await pdf2md(pdfBuffer, {\n metadataParsed: (metadata) => {\n pdfMeta = metadata as PdfRawMetadata;\n },\n });\n\n if (!markdown.trim()) {\n throw new Error(\"pdf2md returned empty content from the PDF\");\n }\n\n // Title priority: XMP dc:title > info.Title > Markdown H1 > URL segment\n const xmpTitle = pdfMeta?.metadata?.get(\"dc:title\");\n const infoTitle = pdfMeta?.info.Title;\n const title =\n (xmpTitle && xmpTitle.trim() ? xmpTitle.trim() : null) ??\n (infoTitle && infoTitle.trim() ? infoTitle.trim() : null) ??\n extractTitleFromMarkdown(markdown) ??\n extractTitleFromUrl(sourceUrl);\n\n // Author\n const rawAuthor = pdfMeta?.info.Author;\n const author = rawAuthor && rawAuthor.trim() ? parseAuthors(rawAuthor) : [];\n\n // Published\n const rawDate = pdfMeta?.info.CreationDate;\n const published = rawDate ? parsePdfDate(rawDate) : null;\n\n const today = new Date().toISOString().split(\"T\")[0];\n\n return {\n markdown,\n metadata: {\n title,\n source: sourceUrl,\n author,\n published,\n created: today,\n description: null,\n },\n };\n}\n\nfunction extractTitleFromMarkdown(markdown: string): string | null {\n const match = markdown.match(/^#\\s+(.+?)(?:\\s+#+)?$/m);\n if (!match) return null;\n const trimmed = match[1].trim();\n return trimmed || null;\n}\n\nexport function extractTitleFromUrl(url: string): string {\n const pathname = new URL(url).pathname;\n const lastSegment = pathname.split(\"/\").filter(Boolean).pop();\n if (!lastSegment) {\n throw new Error(`Cannot extract filename from URL: ${url}`);\n }\n return decodeURIComponent(lastSegment.replace(/\\.pdf$/i, \"\"));\n}\n"],"mappings":";;;AAAA,OAAO,YAAY;AAoBnB,IAAM,mBAAmB;AAElB,SAAS,aAAa,SAAgC;AAC3D,QAAM,QAAQ,QAAQ,MAAM,gBAAgB;AAC5C,MAAI,CAAC,MAAO,QAAO;AACnB,QAAM,CAAC,EAAE,MAAM,OAAO,GAAG,IAAI;AAC7B,SAAO,GAAG,IAAI,IAAI,KAAK,IAAI,GAAG;AAChC;AAEA,IAAM,mBAAmB;AAEzB,SAAS,aAAa,KAAqB;AACzC,QAAM,YAAY,IAAI,KAAK,EAAE,QAAQ,UAAU,EAAE;AACjD,SAAO,KAAK,SAAS;AACvB;AAEA,SAAS,aAAa,KAAuB;AAC3C,SAAO,IACJ,MAAM,gBAAgB,EACtB,IAAI,CAAC,MAAM,EAAE,KAAK,CAAC,EACnB,OAAO,OAAO,EACd,IAAI,YAAY;AACrB;AAEA,eAAsB,qBACpB,WACA,WAC2B;AAG3B,MAAI,UAAU;AAEd,QAAM,WAAW,MAAM,OAAO,WAAW;AAAA,IACvC,gBAAgB,CAAC,aAAa;AAC5B,gBAAU;AAAA,IACZ;AAAA,EACF,CAAC;AAED,MAAI,CAAC,SAAS,KAAK,GAAG;AACpB,UAAM,IAAI,MAAM,4CAA4C;AAAA,EAC9D;AAGA,QAAM,WAAW,SAAS,UAAU,IAAI,UAAU;AAClD,QAAM,YAAY,SAAS,KAAK;AAChC,QAAM,SACH,YAAY,SAAS,KAAK,IAAI,SAAS,KAAK,IAAI,UAChD,aAAa,UAAU,KAAK,IAAI,UAAU,KAAK,IAAI,SACpD,yBAAyB,QAAQ,KACjC,oBAAoB,SAAS;AAG/B,QAAM,YAAY,SAAS,KAAK;AAChC,QAAM,SAAS,aAAa,UAAU,KAAK,IAAI,aAAa,SAAS,IAAI,CAAC;AAG1E,QAAM,UAAU,SAAS,KAAK;AAC9B,QAAM,YAAY,UAAU,aAAa,OAAO,IAAI;AAEpD,QAAM,SAAQ,oBAAI,KAAK,GAAE,YAAY,EAAE,MAAM,GAAG,EAAE,CAAC;AAEnD,SAAO;AAAA,IACL;AAAA,IACA,UAAU;AAAA,MACR;AAAA,MACA,QAAQ;AAAA,MACR;AAAA,MACA;AAAA,MACA,SAAS;AAAA,MACT,aAAa;AAAA,IACf;AAAA,EACF;AACF;AAEA,SAAS,yBAAyB,UAAiC;AACjE,QAAM,QAAQ,SAAS,MAAM,wBAAwB;AACrD,MAAI,CAAC,MAAO,QAAO;AACnB,QAAM,UAAU,MAAM,CAAC,EAAE,KAAK;AAC9B,SAAO,WAAW;AACpB;AAEO,SAAS,oBAAoB,KAAqB;AACvD,QAAM,WAAW,IAAI,IAAI,GAAG,EAAE;AAC9B,QAAM,cAAc,SAAS,MAAM,GAAG,EAAE,OAAO,OAAO,EAAE,IAAI;AAC5D,MAAI,CAAC,aAAa;AAChB,UAAM,IAAI,MAAM,qCAAqC,GAAG,EAAE;AAAA,EAC5D;AACA,SAAO,mBAAmB,YAAY,QAAQ,WAAW,EAAE,CAAC;AAC9D;","names":[]}
|
package/package.json
CHANGED
|
@@ -1,42 +0,0 @@
|
|
|
1
|
-
#!/usr/bin/env node
|
|
2
|
-
|
|
3
|
-
// src/pdf-converter.ts
|
|
4
|
-
import pdf2md from "@opendocsg/pdf2md";
|
|
5
|
-
async function convertPdfToMarkdown(pdfBuffer, sourceUrl) {
|
|
6
|
-
const markdown = await pdf2md(pdfBuffer);
|
|
7
|
-
if (!markdown.trim()) {
|
|
8
|
-
throw new Error("pdf2md returned empty content from the PDF");
|
|
9
|
-
}
|
|
10
|
-
const title = extractTitleFromMarkdown(markdown) ?? extractTitleFromUrl(sourceUrl);
|
|
11
|
-
const today = (/* @__PURE__ */ new Date()).toISOString().split("T")[0];
|
|
12
|
-
return {
|
|
13
|
-
markdown,
|
|
14
|
-
metadata: {
|
|
15
|
-
title,
|
|
16
|
-
source: sourceUrl,
|
|
17
|
-
author: [],
|
|
18
|
-
published: null,
|
|
19
|
-
created: today,
|
|
20
|
-
description: null
|
|
21
|
-
}
|
|
22
|
-
};
|
|
23
|
-
}
|
|
24
|
-
function extractTitleFromMarkdown(markdown) {
|
|
25
|
-
const match = markdown.match(/^#\s+(.+?)(?:\s+#+)?$/m);
|
|
26
|
-
if (!match) return null;
|
|
27
|
-
const trimmed = match[1].trim();
|
|
28
|
-
return trimmed || null;
|
|
29
|
-
}
|
|
30
|
-
function extractTitleFromUrl(url) {
|
|
31
|
-
const pathname = new URL(url).pathname;
|
|
32
|
-
const lastSegment = pathname.split("/").filter(Boolean).pop();
|
|
33
|
-
if (!lastSegment) {
|
|
34
|
-
throw new Error(`Cannot extract filename from URL: ${url}`);
|
|
35
|
-
}
|
|
36
|
-
return decodeURIComponent(lastSegment.replace(/\.pdf$/i, ""));
|
|
37
|
-
}
|
|
38
|
-
export {
|
|
39
|
-
convertPdfToMarkdown,
|
|
40
|
-
extractTitleFromUrl
|
|
41
|
-
};
|
|
42
|
-
//# sourceMappingURL=pdf-converter-U3SFA2HY.js.map
|
|
@@ -1 +0,0 @@
|
|
|
1
|
-
{"version":3,"sources":["../src/pdf-converter.ts"],"sourcesContent":["import pdf2md from \"@opendocsg/pdf2md\";\nimport type { Metadata } from \"./types.js\";\n\nexport interface PdfConvertResult {\n markdown: string;\n metadata: Metadata;\n}\n\nexport async function convertPdfToMarkdown(\n pdfBuffer: Buffer,\n sourceUrl: string,\n): Promise<PdfConvertResult> {\n const markdown = await pdf2md(pdfBuffer);\n if (!markdown.trim()) {\n throw new Error(\"pdf2md returned empty content from the PDF\");\n }\n const title = extractTitleFromMarkdown(markdown) ?? extractTitleFromUrl(sourceUrl);\n const today = new Date().toISOString().split(\"T\")[0];\n\n return {\n markdown,\n metadata: {\n title,\n source: sourceUrl,\n author: [],\n published: null,\n created: today,\n description: null,\n },\n };\n}\n\nfunction extractTitleFromMarkdown(markdown: string): string | null {\n const match = markdown.match(/^#\\s+(.+?)(?:\\s+#+)?$/m);\n if (!match) return null;\n const trimmed = match[1].trim();\n return trimmed || null;\n}\n\nexport function extractTitleFromUrl(url: string): string {\n const pathname = new URL(url).pathname;\n const lastSegment = pathname.split(\"/\").filter(Boolean).pop();\n if (!lastSegment) {\n throw new Error(`Cannot extract filename from URL: ${url}`);\n }\n return decodeURIComponent(lastSegment.replace(/\\.pdf$/i, \"\"));\n}\n"],"mappings":";;;AAAA,OAAO,YAAY;AAQnB,eAAsB,qBACpB,WACA,WAC2B;AAC3B,QAAM,WAAW,MAAM,OAAO,SAAS;AACvC,MAAI,CAAC,SAAS,KAAK,GAAG;AACpB,UAAM,IAAI,MAAM,4CAA4C;AAAA,EAC9D;AACA,QAAM,QAAQ,yBAAyB,QAAQ,KAAK,oBAAoB,SAAS;AACjF,QAAM,SAAQ,oBAAI,KAAK,GAAE,YAAY,EAAE,MAAM,GAAG,EAAE,CAAC;AAEnD,SAAO;AAAA,IACL;AAAA,IACA,UAAU;AAAA,MACR;AAAA,MACA,QAAQ;AAAA,MACR,QAAQ,CAAC;AAAA,MACT,WAAW;AAAA,MACX,SAAS;AAAA,MACT,aAAa;AAAA,IACf;AAAA,EACF;AACF;AAEA,SAAS,yBAAyB,UAAiC;AACjE,QAAM,QAAQ,SAAS,MAAM,wBAAwB;AACrD,MAAI,CAAC,MAAO,QAAO;AACnB,QAAM,UAAU,MAAM,CAAC,EAAE,KAAK;AAC9B,SAAO,WAAW;AACpB;AAEO,SAAS,oBAAoB,KAAqB;AACvD,QAAM,WAAW,IAAI,IAAI,GAAG,EAAE;AAC9B,QAAM,cAAc,SAAS,MAAM,GAAG,EAAE,OAAO,OAAO,EAAE,IAAI;AAC5D,MAAI,CAAC,aAAa;AAChB,UAAM,IAAI,MAAM,qCAAqC,GAAG,EAAE;AAAA,EAC5D;AACA,SAAO,mBAAmB,YAAY,QAAQ,WAAW,EAAE,CAAC;AAC9D;","names":[]}
|