pi-memory-stone 0.1.4 → 0.1.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,268 @@
1
+ /** URL capture for memory vault source pages. */
2
+
3
+ import { createHash } from "node:crypto";
4
+ import { mkdirSync, readFileSync, writeFileSync } from "node:fs";
5
+ import { join, relative } from "node:path";
6
+ import { redactSecrets } from "../privacy/index.js";
7
+ import { initVault, getVaultStatus, type VaultRegistry, type VaultRegistryPage } from "./index.js";
8
+ import { sanitizeSlug } from "./markdown.js";
9
+ import { resolveSourcePacketPath, resolveVaultPath, type VaultScope } from "./paths.js";
10
+ import { extractArticle, type ExtractedArticle } from "./extract.js";
11
+ import { assessCaptureQuality, type CaptureQuality, type CaptureQualityReport } from "./quality.js";
12
+ import { fetchCandidate, type CaptureFetchAttempt, type CaptureFetchOptions, type FetchedCandidate } from "./fetch.js";
13
+ import { resolveCaptureTargets, type CaptureCandidate } from "./url-resolvers.js";
14
+
15
/** Maximum number of characters of extracted markdown kept for a single capture. */
const MAX_EXTRACTED_CHARS = 200_000;

/** Options for a URL capture; currently exactly the fetch options. */
export interface CaptureUrlOptions extends CaptureFetchOptions {}

/** Summary of a completed capture, as returned by `captureUrlToVault`. */
export interface CaptureUrlResult {
  /** Resolved vault directory the source page was written into. */
  vaultPath: string;
  /** Full path of the generated markdown source page. */
  pagePath: string;
  /** Directory holding the source packet (manifest, metadata, original, extracted text). */
  sourcePacketPath: string;
  /** Title chosen for the capture. */
  title: string;
  /** The original URL that was requested. */
  url: string;
  /** URL of the response that was actually captured. */
  finalUrl: string;
  /** True when this call had to initialize the vault first. */
  initialized: boolean;
  /** Quality bucket assigned to the selected extraction. */
  quality: CaptureQuality;
  /** Numeric quality score of the selected extraction. */
  qualityScore: number;
  /** Quality warnings reported for the selected extraction. */
  warnings: string[];
}
31
+
32
/**
 * Fetches `url`, extracts its content and stores the result in the vault.
 *
 * The best extraction among the resolved candidates is written in two places:
 * a source packet (`manifest.json`, `metadata.json`, the redacted original
 * response and `extracted.md`) and a markdown source page under `sources/`
 * inside the vault. The vault registry is then updated with the new page.
 *
 * Note that the vault is initialized (when needed) before anything is fetched,
 * so a failed fetch can still leave a freshly initialized vault behind.
 *
 * @param scope - Vault scope used to resolve the vault and packet paths.
 * @param projectId - Project identifier forwarded to the path resolvers.
 * @param cwd - Working directory forwarded to the path resolvers.
 * @param url - The URL to capture.
 * @param options - Fetch options (timeout, abort signal, size cap, retries).
 * @returns Paths, title, URLs and quality information for the stored capture.
 * @throws When no candidate could be fetched, or when a file write fails.
 */
export async function captureUrlToVault(
  scope: VaultScope,
  projectId: string | null,
  cwd: string,
  url: string,
  options: CaptureUrlOptions = {},
): Promise<CaptureUrlResult> {
  const targets = resolveCaptureTargets(url);
  const vaultPath = resolveVaultPath(scope, projectId, cwd);
  const wasInitialized = getVaultStatus(scope, projectId, cwd).initialized;
  if (!wasInitialized) {
    initVault(scope, projectId, cwd);
  }

  const selected = await fetchAndExtractBest(targets.candidates, options);
  const { fetched, extracted, quality } = selected;

  // A single timestamp feeds both the capture id and `captured_at`, so the two
  // can never disagree when a capture straddles midnight (UTC).
  const capturedAt = new Date().toISOString();
  // Short URL hash shared by the capture id and the page file name.
  const urlHash = sha256(targets.originalUrl).slice(0, 8);
  const canonicalUrl = extracted.canonicalUrl ?? fetched.finalUrl;

  const title = extracted.title || new URL(fetched.finalUrl).hostname;
  const slug = sanitizeSlug(title).slice(0, 70) || "captured-page";
  const captureId = `SRC-${capturedAt.slice(0, 10)}-${urlHash}`;
  const packetPath = resolveSourcePacketPath(scope, projectId, cwd, captureId);
  const packetRelPath = normalizePath(relative(vaultPath, packetPath));
  const sourcePageRelPath = join("sources", `${slug}-${urlHash}.md`);
  const sourcePagePath = join(vaultPath, sourcePageRelPath);

  mkdirSync(join(packetPath, "original"), { recursive: true, mode: 0o700 });
  mkdirSync(join(packetPath, "attachments"), { recursive: true, mode: 0o700 });
  mkdirSync(join(vaultPath, "sources"), { recursive: true, mode: 0o700 });

  const originalName = originalArtifactName(fetched.contentType, fetched.finalUrl, extracted.extractor);
  // Redact again after extraction, then restore markers that were backslash-escaped.
  const extractedMarkdown = unescapeRedactionMarkers(redactSecrets(extracted.markdown)).slice(0, MAX_EXTRACTED_CHARS);
  const redactedRaw = redactSecrets(fetched.raw);
  const contentHash = sha256(extractedMarkdown);

  const manifest = {
    id: captureId,
    url: targets.originalUrl,
    canonical_url: canonicalUrl,
    final_url: fetched.finalUrl,
    title,
    byline: extracted.byline,
    site_name: extracted.siteName,
    excerpt: extracted.excerpt,
    published_at: extracted.publishedAt,
    content_type: fetched.contentType,
    captured_at: capturedAt,
    original: `original/${originalName}`,
    extracted: "extracted.md",
    metadata: "metadata.json",
    attempts: selected.attempts,
    extraction: {
      extractor: extracted.extractor,
      strategy: fetched.candidate.strategy,
      candidate_kind: fetched.candidate.kind,
    },
    quality,
    content_hash: contentHash,
  };

  const metadata = {
    title,
    byline: extracted.byline,
    site_name: extracted.siteName,
    excerpt: extracted.excerpt,
    published_at: extracted.publishedAt,
    source_url: targets.originalUrl,
    canonical_url: canonicalUrl,
    final_url: fetched.finalUrl,
    content_hash: contentHash,
    extractor: extracted.extractor,
    fetch_strategy: fetched.candidate.strategy,
    quality,
  };

  writeFileSync(join(packetPath, "manifest.json"), JSON.stringify(manifest, null, 2) + "\n", { mode: 0o600 });
  writeFileSync(join(packetPath, "metadata.json"), JSON.stringify(metadata, null, 2) + "\n", { mode: 0o600 });
  writeFileSync(join(packetPath, "original", originalName), redactedRaw, { mode: 0o600 });
  writeFileSync(join(packetPath, "extracted.md"), extractedMarkdown, { mode: 0o600 });

  const pageMarkdown = renderSourcePage({
    title,
    url: targets.originalUrl,
    canonicalUrl,
    capturedAt,
    captureId,
    packetRelPath,
    extractedMarkdown,
    quality,
    warnings: quality.warnings,
  });
  writeFileSync(sourcePagePath, pageMarkdown, { mode: 0o600 });

  updateRegistry(vaultPath, {
    path: normalizePath(sourcePageRelPath),
    title,
    kind: "web_source",
    source_url: targets.originalUrl,
    source_packet: packetRelPath,
    content_hash: sha256(pageMarkdown),
    generated: true,
    created_at: capturedAt,
    updated_at: capturedAt,
  });

  return {
    vaultPath,
    pagePath: sourcePagePath,
    sourcePacketPath: packetPath,
    title,
    url: targets.originalUrl,
    finalUrl: fetched.finalUrl,
    initialized: !wasInitialized,
    quality: quality.quality,
    qualityScore: quality.score,
    warnings: quality.warnings,
  };
}
148
+
149
/** One fetched candidate together with its extraction and quality assessment. */
interface ExtractedCandidate {
  /** The fetched response for the candidate. */
  fetched: FetchedCandidate;
  /** Article extracted from the (redacted) response body. */
  extracted: ExtractedArticle;
  /** Quality report computed for the extraction. */
  quality: CaptureQualityReport;
  /** Fetch attempts recorded so far, across all candidates tried. */
  attempts: CaptureFetchAttempt[];
}
155
+
156
/**
 * Tries each candidate in order and returns the best extraction.
 *
 * A candidate rated "good" is returned immediately. Otherwise the candidate
 * with the highest quality score wins once all candidates have been tried.
 * Candidates that fail to fetch or extract are recorded and skipped.
 *
 * @throws When no candidate produced an extraction at all.
 */
async function fetchAndExtractBest(candidates: CaptureCandidate[], options: CaptureUrlOptions): Promise<ExtractedCandidate> {
  const attemptLog: CaptureFetchAttempt[] = [];
  const failures: string[] = [];
  let winner: ExtractedCandidate | null = null;

  for (const candidate of candidates) {
    try {
      const fetched = await fetchCandidate(candidate, options);
      attemptLog.push(...fetched.attempts);

      // Secrets are redacted before the body ever reaches the extractor.
      const extracted = extractArticle({
        raw: redactSecrets(fetched.raw),
        contentType: fetched.contentType,
        url: fetched.finalUrl,
        candidateKind: candidate.kind,
      });
      const quality = assessCaptureQuality({
        title: extracted.title,
        markdown: extracted.markdown,
        extractor: extracted.extractor,
      });
      const outcome: ExtractedCandidate = { fetched, extracted, quality, attempts: attemptLog.slice() };

      if (quality.quality === "good") return outcome;
      if (winner === null || quality.score > winner.quality.score) winner = outcome;
    } catch (error) {
      const failedAttempts = (error as Error & { attempts?: CaptureFetchAttempt[] }).attempts;
      if (failedAttempts) attemptLog.push(...failedAttempts);
      failures.push(`${candidate.strategy}: ${error instanceof Error ? error.message : String(error)}`);
    }
  }

  if (winner === null) {
    throw new Error(`Unable to fetch article. Attempts failed: ${failures.join("; ")}`);
  }
  // Report every attempt made, including those after the winner was found.
  return { ...winner, attempts: attemptLog };
}
190
+
191
/**
 * Renders the markdown source page: a frontmatter block, a summary header
 * (with optional warnings) and the extracted text.
 *
 * Frontmatter strings are written with `JSON.stringify`; the heading uses the
 * title with line breaks collapsed to spaces.
 */
function renderSourcePage(input: {
  title: string;
  url: string;
  canonicalUrl: string;
  capturedAt: string;
  captureId: string;
  packetRelPath: string;
  extractedMarkdown: string;
  quality: CaptureQualityReport;
  warnings: string[];
}): string {
  const { title, url, canonicalUrl, capturedAt, captureId, packetRelPath, quality } = input;

  const frontmatter = [
    "---",
    `title: ${JSON.stringify(title)}`,
    "kind: web_source",
    `source_url: ${JSON.stringify(url)}`,
    `canonical_url: ${JSON.stringify(canonicalUrl)}`,
    `source_packet: ${JSON.stringify(packetRelPath)}`,
    `captured_at: ${JSON.stringify(capturedAt)}`,
    `capture_id: ${JSON.stringify(captureId)}`,
    `quality: ${JSON.stringify(quality.quality)}`,
    `quality_score: ${quality.score}`,
    "generated: true",
    "source: pi-memory-stone",
    "---",
  ];

  const summary = [
    `# ${title.replace(/[\r\n]+/g, " ").trim()}`,
    "",
    `Source: ${url}`,
    `Canonical: ${canonicalUrl}`,
    `Captured: ${capturedAt}`,
    `Quality: ${quality.quality} (${quality.score})`,
    `Source packet: ${captureId} (stored outside vault: ${packetRelPath})`,
  ];
  if (input.warnings.length > 0) {
    summary.push("", "Warnings:");
    for (const warning of input.warnings) {
      summary.push(`- ${warning}`);
    }
  }

  const body = input.extractedMarkdown.trim() || "_No text extracted._";

  return [...frontmatter, "", ...summary, "", "## Extracted text", "", body, ""].join("\n");
}
236
+
237
/**
 * Inserts or replaces `page` in `meta/registry.json` inside the vault.
 *
 * Any existing entry with the same `path` is dropped, the page list is kept
 * sorted by path, and `generated_at` is refreshed before the file is rewritten.
 */
function updateRegistry(vaultPath: string, page: VaultRegistryPage & {
  source_url: string;
  source_packet: string;
}): void {
  const registryFile = join(vaultPath, "meta", "registry.json");
  const registry = JSON.parse(readFileSync(registryFile, "utf8")) as VaultRegistry;

  const otherPages = registry.pages.filter((entry) => entry.path !== page.path);
  registry.pages = [...otherPages, page].sort((left, right) => left.path.localeCompare(right.path));
  registry.generated_at = new Date().toISOString();

  const serialized = JSON.stringify(registry, null, 2) + "\n";
  writeFileSync(registryFile, serialized, { mode: 0o600 });
}
250
+
251
/**
 * Chooses the file name used to store the original response in the packet,
 * based on the content type, the final URL's extension and the extractor used.
 * HTML is checked first, then markdown, then PDF; everything else is plain text.
 */
function originalArtifactName(contentType: string, finalUrl: string, extractor: string): string {
  const lowerUrl = finalUrl.toLowerCase();

  const looksLikeHtml = contentType.includes("html") || extractor.startsWith("html");
  if (looksLikeHtml) {
    return "response.html";
  }

  const looksLikeMarkdown = contentType.includes("markdown")
    || /\.(md|markdown|mdx)(?:$|[?#])/.test(lowerUrl)
    || extractor === "markdown";
  if (looksLikeMarkdown) {
    return "response.md";
  }

  const looksLikePdf = contentType.includes("pdf") || /\.pdf(?:$|[?#])/.test(lowerUrl);
  return looksLikePdf ? "response.pdf.txt" : "response.txt";
}
257
+
258
/** Rewrites every run of `/` or `\` separators in `path` as a single forward slash. */
function normalizePath(path: string): string {
  return path.replace(/[\\/]+/g, "/");
}
261
+
262
/**
 * Restores redaction markers whose square brackets were backslash-escaped,
 * turning `\[REDACTED:kind\]` back into `[REDACTED:kind]`.
 */
function unescapeRedactionMarkers(markdown: string): string {
  const escapedMarker = /\\\[REDACTED:([a-z-]+)\\\]/g;
  return markdown.replace(escapedMarker, "[REDACTED:$1]");
}
265
+
266
/** Returns the SHA-256 digest of `content` as a lowercase hex string. */
function sha256(content: string): string {
  const hasher = createHash("sha256");
  hasher.update(content);
  return hasher.digest("hex");
}
@@ -0,0 +1,259 @@
1
+ /** Content-type aware article extraction for vault source capture. */
2
+
3
+ import { JSDOM } from "jsdom";
4
+ import { Readability } from "@mozilla/readability";
5
+ import TurndownService from "turndown";
6
+ import type { CaptureCandidateKind } from "./url-resolvers.js";
7
+
8
/**
 * Identifies which extraction path produced an article: Readability, the
 * `article`/`main`/`body` HTML fallback, markdown, plain text, or the PDF
 * placeholder.
 */
export type ExtractionKind = "html-readability" | "html-main" | "markdown" | "text" | "pdf-unsupported";

/** Result of extracting an article from a fetched response. */
export interface ExtractedArticle {
  /** Title chosen for the article. */
  title: string;
  /** Article content as markdown. */
  markdown: string;
  /** Author line, when one was found. */
  byline?: string;
  /** Name of the publishing site, when one was found. */
  siteName?: string;
  /** Short description or excerpt, when one was found. */
  excerpt?: string;
  /** Publication timestamp as found in the page metadata. */
  publishedAt?: string;
  /** Canonical URL of the article, when known. */
  canonicalUrl?: string;
  /** Extraction path that produced this result. */
  extractor: ExtractionKind;
}
20
+
21
/**
 * Extracts an article from a raw response body, picking the extractor from the
 * content type, URL and candidate kind. PDF is checked first (and yields only
 * a placeholder), then HTML, then markdown; anything else is treated as text.
 */
export function extractArticle(input: {
  raw: string;
  contentType: string;
  url: string;
  candidateKind: CaptureCandidateKind;
}): ExtractedArticle {
  const { raw, contentType, url, candidateKind } = input;

  if (isPdf(contentType, url, candidateKind)) {
    const placeholder: ExtractedArticle = {
      title: titleFromUrl(url),
      markdown: "_PDF capture is not yet supported. Raw response was stored for future extraction._",
      canonicalUrl: url,
      extractor: "pdf-unsupported",
    };
    return placeholder;
  }

  if (isHtml(raw, contentType, candidateKind)) {
    return extractHtml(raw, url);
  }

  return isMarkdown(contentType, url, candidateKind)
    ? extractMarkdown(raw, url)
    : extractText(raw, url);
}
46
+
47
/**
 * Extracts an article from an HTML document.
 *
 * Readability is tried first on a clone of the document. When it yields no
 * usable text, the first `<article>`, then `<main>`, then `<body>` element is
 * converted to markdown instead.
 */
function extractHtml(html: string, url: string): ExtractedArticle {
  const { document } = new JSDOM(html, { url }).window;
  const metadata = htmlMetadata(document, url);

  // Readability works on a clone so the original document stays available for the fallback.
  const article = new Readability(document.cloneNode(true) as Document).parse();
  const readableText = (article?.textContent ?? "").trim();

  if (article?.content && readableText.length > 0) {
    const markdown = htmlFragmentToMarkdown(article.content);
    return {
      title: normalizeWhitespace(article.title || metadata.title || titleFromUrl(url)),
      markdown,
      byline: article.byline || metadata.byline,
      siteName: article.siteName || metadata.siteName,
      excerpt: article.excerpt || metadata.excerpt,
      publishedAt: metadata.publishedAt,
      canonicalUrl: metadata.canonicalUrl,
      extractor: "html-readability",
    };
  }

  const container = document.querySelector("article") ?? document.querySelector("main") ?? document.body;
  const fallbackHtml = container?.innerHTML ?? html;
  return {
    title: metadata.title || titleFromUrl(url),
    markdown: htmlFragmentToMarkdown(fallbackHtml),
    byline: metadata.byline,
    siteName: metadata.siteName,
    excerpt: metadata.excerpt,
    publishedAt: metadata.publishedAt,
    canonicalUrl: metadata.canonicalUrl,
    extractor: "html-main",
  };
}
80
+
81
/**
 * Wraps a markdown document as an article. The title comes from the first
 * `# ` heading, else a `title:` line, else the URL.
 */
function extractMarkdown(markdown: string, url: string): ExtractedArticle {
  let rawTitle = markdown.match(/^#\s+(.+)$/m)?.[1];
  if (rawTitle === undefined) {
    rawTitle = markdown.match(/^title:\s*["']?(.+?)["']?\s*$/im)?.[1];
  }
  if (rawTitle === undefined) {
    rawTitle = titleFromUrl(url);
  }

  const cleanedTitle = normalizeWhitespace(stripMarkdown(rawTitle));
  return {
    title: cleanedTitle,
    markdown: markdown.trim(),
    canonicalUrl: url,
    extractor: "markdown",
  };
}
92
+
93
/**
 * Wraps plain text as an article. The first non-blank line becomes the title;
 * when there is none, the title is derived from the URL.
 */
function extractText(text: string, url: string): ExtractedArticle {
  const firstLine = text
    .split(/\r?\n/)
    .map((line) => line.trim())
    .find((line) => line.length > 0);

  return {
    title: normalizeWhitespace(firstLine || titleFromUrl(url)),
    markdown: text.trim(),
    canonicalUrl: url,
    extractor: "text",
  };
}
102
+
103
/**
 * Collects article metadata from `<meta>` tags, `<title>`/`<h1>`, the
 * canonical `<link>` and embedded JSON-LD, falling back to the URL where
 * nothing else is available.
 */
function htmlMetadata(document: Document, url: string): Omit<ExtractedArticle, "markdown" | "extractor"> {
  const jsonLd = parseJsonLd(document);
  const textOf = (selector: string) => document.querySelector(selector)?.textContent;

  // Title sources in priority order; only a missing (nullish) value falls through.
  const rawTitle = meta(document, "property", "og:title")
    ?? meta(document, "name", "twitter:title")
    ?? textOf("title")
    ?? textOf("h1")
    ?? jsonLd?.headline
    ?? titleFromUrl(url);

  const byline = meta(document, "name", "author") ?? jsonLd?.author;
  const siteName = meta(document, "property", "og:site_name");
  const excerpt = meta(document, "name", "description") ?? meta(document, "property", "og:description");
  const publishedAt = meta(document, "property", "article:published_time") ?? jsonLd?.datePublished;

  const canonicalLink = document.querySelector<HTMLLinkElement>('link[rel="canonical"]');
  const canonicalUrl = canonicalLink?.href ?? meta(document, "property", "og:url") ?? url;

  return {
    title: normalizeWhitespace(rawTitle),
    byline,
    siteName,
    excerpt,
    publishedAt,
    canonicalUrl,
  };
}
123
+
124
/**
 * Reads headline, author and publication date from embedded JSON-LD.
 *
 * Every `application/ld+json` script is scanned in document order, and the
 * first record that yields at least one of the three fields is returned.
 * Records that yield nothing (for example a site-level record ahead of the
 * article record) are skipped instead of ending the search, and records
 * nested in an `@graph` array are considered too. Malformed JSON is ignored.
 *
 * @returns The fields found, or `null` when no record provides any of them.
 */
function parseJsonLd(document: Document): { headline?: string; author?: string; datePublished?: string } | null {
  const scripts = Array.from(document.querySelectorAll<HTMLScriptElement>('script[type="application/ld+json"]'));
  for (const script of scripts) {
    try {
      const parsed = JSON.parse(script.textContent || "null") as unknown;
      for (const record of jsonLdRecords(parsed)) {
        const headline = stringValue(record.headline ?? record.name);
        const author = jsonLdAuthorName(record.author);
        const datePublished = stringValue(record.datePublished);
        if (headline !== undefined || author !== undefined || datePublished !== undefined) {
          return { headline, author, datePublished };
        }
      }
    } catch {
      // Ignore malformed embedded metadata.
    }
  }
  return null;
}

/** Flattens a parsed JSON-LD payload into its object records, including `@graph` members. */
function jsonLdRecords(parsed: unknown): Array<Record<string, unknown>> {
  const records: Array<Record<string, unknown>> = [];
  const entries: unknown[] = Array.isArray(parsed) ? parsed : [parsed];
  for (const entry of entries) {
    if (!entry || typeof entry !== "object") continue;
    const record = entry as Record<string, unknown>;
    records.push(record);
    const graph = record["@graph"];
    if (Array.isArray(graph)) {
      for (const node of graph as unknown[]) {
        if (node && typeof node === "object") records.push(node as Record<string, unknown>);
      }
    }
  }
  return records;
}

/** Resolves a JSON-LD `author` value (string, object or array of either) to a display name. */
function jsonLdAuthorName(author: unknown): string | undefined {
  const first: unknown = Array.isArray(author) ? author[0] : author;
  if (typeof first === "string") return stringValue(first);
  if (first && typeof first === "object") return stringValue((first as Record<string, unknown>).name);
  return undefined;
}
149
+
150
/**
 * Returns the whitespace-normalized `content` of the first `<meta>` element
 * whose `attr` equals `value`, or `undefined` when it is missing or blank.
 */
function meta(document: Document, attr: "name" | "property", value: string): string | undefined {
  const element = document.querySelector<HTMLMetaElement>(`meta[${attr}="${value}"]`);
  const content = normalizeWhitespace(element?.content ?? "");
  return content || undefined;
}
153
+
154
/**
 * Converts an HTML fragment to markdown with Turndown (ATX headings, fenced
 * code blocks, `-` bullets), dropping `script`, `style` and `noscript`
 * elements. Trailing spaces and tabs are stripped from every line and runs of
 * four or more newlines are collapsed to three before the Obsidian clean-up.
 */
function htmlFragmentToMarkdown(html: string): string {
  const converter = new TurndownService({
    headingStyle: "atx",
    codeBlockStyle: "fenced",
    bulletListMarker: "-",
  });
  converter.remove(["script", "style", "noscript"]);

  const converted = converter.turndown(html);
  const withoutTrailingWhitespace = converted
    .split(/\r?\n/)
    .map((line) => line.replace(/[ \t]+$/g, ""))
    .join("\n");
  const compacted = withoutTrailingWhitespace.replace(/\n{4,}/g, "\n\n\n").trim();

  return normalizeObsidianMarkdown(compacted);
}
169
+
170
/** Applies the linked-image clean-up first, then the multi-line block-link clean-up. */
function normalizeObsidianMarkdown(markdown: string): string {
  const withoutLinkedImages = normalizeLinkedImages(markdown);
  return normalizeBlockLinks(withoutLinkedImages);
}
173
+
174
/**
 * Unwraps images that are the sole content of a link.
 *
 * Turndown can emit `[![alt](src)](href)` for linked article images, and
 * Obsidian renders the multi-line variants with stray `[` / `](href)` text
 * around the image. The image is kept and the redundant outer link dropped.
 */
function normalizeLinkedImages(markdown: string): string {
  const linkedImage = /\[\s*(!\[[^\]]*\]\([^)]+\))\s*\]\([^)]+\)/g;
  return markdown.replace(linkedImage, "$1");
}
180
+
181
/**
 * Flattens links that Turndown spread over several lines.
 *
 * A line holding only `[` opens a block that ends at the next line of the form
 * `](href)`. The lines in between are emitted as-is (blank edges trimmed).
 * Unless the block is just a single image, a `[Link](href)` line is appended
 * so the target is not lost. An opener without a closing line is left alone.
 */
function normalizeBlockLinks(markdown: string): string {
  const closerPattern = /^\]\(([^)]+)\)$/;
  const lines = markdown.split("\n");
  const result: string[] = [];

  let cursor = 0;
  while (cursor < lines.length) {
    const line = lines[cursor] ?? "";
    if (line.trim() !== "[") {
      result.push(line);
      cursor += 1;
      continue;
    }

    // Look for the first closing `](href)` line after the opener.
    let closerIndex = -1;
    let href = "";
    for (let probe = cursor + 1; probe < lines.length; probe += 1) {
      const closer = closerPattern.exec((lines[probe] ?? "").trim());
      if (closer) {
        closerIndex = probe;
        href = closer[1] ?? "";
        break;
      }
    }
    if (closerIndex === -1) {
      result.push(line);
      cursor += 1;
      continue;
    }

    const inner = trimBlankLines(lines.slice(cursor + 1, closerIndex));
    result.push(...inner);
    if (href && !isImageOnlyBlock(inner)) {
      if (inner.length > 0) result.push("");
      result.push(`[Link](${href})`);
    }
    // Resume after the closing line.
    cursor = closerIndex + 1;
  }

  return result.join("\n").replace(/\n{4,}/g, "\n\n\n").trim();
}
209
+
210
/** Returns `lines` without its leading and trailing whitespace-only lines. */
function trimBlankLines(lines: string[]): string[] {
  const isBlank = (line: string): boolean => line.trim() === "";

  const first = lines.findIndex((line) => !isBlank(line));
  if (first === -1) {
    return [];
  }

  let last = lines.length - 1;
  while (last > first && isBlank(lines[last] ?? "")) {
    last -= 1;
  }
  return lines.slice(first, last + 1);
}
217
+
218
/** True when the only non-blank line in `lines` is a single markdown image. */
function isImageOnlyBlock(lines: string[]): boolean {
  const content = lines.map((line) => line.trim()).filter((line) => line !== "");
  if (content.length !== 1) {
    return false;
  }
  const [only = ""] = content;
  return /^!\[[^\]]*\]\([^)]+\)$/.test(only);
}
222
+
223
/**
 * True when the response should be treated as a PDF: the candidate kind says
 * so, the content type is `application/pdf`, or the URL path ends in `.pdf`.
 * The URL is only parsed when the first two checks fail.
 */
function isPdf(contentType: string, url: string, kind: CaptureCandidateKind): boolean {
  if (kind === "pdf") return true;
  if (/application\/pdf/i.test(contentType)) return true;
  const path = new URL(url).pathname.toLowerCase();
  return path.endsWith(".pdf");
}
226
+
227
/**
 * True when the response should be treated as HTML: by candidate kind, by
 * content type, or because the body starts with a doctype or `<html>` tag.
 */
function isHtml(raw: string, contentType: string, kind: CaptureCandidateKind): boolean {
  if (kind === "html") return true;
  if (/text\/html|application\/xhtml\+xml/i.test(contentType)) return true;
  if (/^\s*<!doctype html/i.test(raw)) return true;
  return /^\s*<html[\s>]/i.test(raw);
}
233
+
234
/**
 * True when the response should be treated as markdown: by candidate kind, by
 * content type, or because the URL path ends in `.md`, `.markdown` or `.mdx`.
 * The URL is always parsed, so an invalid URL throws.
 */
function isMarkdown(contentType: string, url: string, kind: CaptureCandidateKind): boolean {
  const path = new URL(url).pathname.toLowerCase();
  if (kind === "markdown") return true;
  if (/text\/(markdown|x-markdown)|application\/markdown/i.test(contentType)) return true;
  return [".md", ".markdown", ".mdx"].some((extension) => path.endsWith(extension));
}
242
+
243
/**
 * Derives a human-readable title from a URL.
 *
 * The last non-empty path segment is percent-decoded, its dashes and
 * underscores become spaces and whitespace is collapsed. The hostname is used
 * when the path has no segments or the result would be empty.
 *
 * A segment with a malformed percent sequence (such as a path ending in
 * `/100%`) is used undecoded rather than letting `decodeURIComponent` throw.
 *
 * @throws TypeError when `url` is not a valid absolute URL.
 */
function titleFromUrl(url: string): string {
  const parsed = new URL(url);
  const segment = parsed.pathname.split("/").filter(Boolean).pop() || parsed.hostname;

  let decoded: string;
  try {
    decoded = decodeURIComponent(segment);
  } catch {
    // Malformed escape: keep the raw segment instead of aborting the capture.
    decoded = segment;
  }

  // Separators and whitespace collapse to single spaces in one pass.
  const title = decoded.replace(/[-_\s]+/g, " ").trim();
  return title || parsed.hostname;
}
248
+
249
/**
 * Strips markdown decoration from a title: leading `#`, `>`, `*`, `-` and
 * whitespace first, then every backtick, `*`, `_`, `~`, bracket and parenthesis.
 */
function stripMarkdown(value: string): string {
  const withoutLeadingMarkers = value.replace(/^[#>*\-\s]+/, "");
  return withoutLeadingMarkers.replace(/[`*_~\[\]()]/g, "");
}
252
+
253
/** Returns `value` whitespace-normalized when it is a non-blank string; otherwise `undefined`. */
function stringValue(value: unknown): string | undefined {
  if (typeof value !== "string") {
    return undefined;
  }
  if (!value.trim()) {
    return undefined;
  }
  return normalizeWhitespace(value);
}
256
+
257
/** Collapses every run of whitespace to a single space and trims both ends. */
function normalizeWhitespace(value: string): string {
  return value
    .split(/\s+/)
    .filter((word) => word !== "")
    .join(" ");
}
@@ -0,0 +1,155 @@
1
+ /** Robust bounded fetch helpers for vault source capture. */
2
+
3
+ import type { CaptureCandidate } from "./url-resolvers.js";
4
+
5
/** Default cap on the size of a captured response body, in bytes (5 MiB). */
export const MAX_CAPTURE_BYTES = 5 * 1024 * 1024;
/** Default timeout for a single fetch, in milliseconds. */
export const DEFAULT_TIMEOUT_MS = 15_000;
/** Default number of retries after the first failed fetch. */
const DEFAULT_RETRIES = 2;
8
+
9
/** Tunables for fetching a capture candidate. */
export interface CaptureFetchOptions {
  /** Timeout per fetch in milliseconds; defaults to `DEFAULT_TIMEOUT_MS`. */
  timeoutMs?: number;
  /** Caller-supplied signal; aborting it aborts the in-flight fetch. */
  signal?: AbortSignal;
  /** Maximum accepted response size in bytes; defaults to `MAX_CAPTURE_BYTES`. */
  maxBytes?: number;
  /** Number of retries after the first failure; defaults to `DEFAULT_RETRIES`. */
  retries?: number;
}

/** Record of one fetch attempt, successful or not. */
export interface CaptureFetchAttempt {
  /** URL that was requested. */
  url: string;
  /** URL reported by the response. */
  final_url?: string;
  /** HTTP status code of the response. */
  status?: number;
  /** HTTP status text of the response. */
  status_text?: string;
  /** Content type of the response. */
  content_type?: string;
  /** Size of the response body in bytes. */
  bytes?: number;
  /** Strategy of the candidate that was fetched. */
  strategy: string;
  /** Error message when the attempt failed. */
  error?: string;
}

/** A successfully fetched candidate with its decoded body. */
export interface FetchedCandidate {
  /** The candidate that was fetched. */
  candidate: CaptureCandidate;
  /** URL reported by the response. */
  finalUrl: string;
  /** HTTP status code. */
  status: number;
  /** HTTP status text. */
  statusText: string;
  /** Content type; `text/plain` when the response did not declare one. */
  contentType: string;
  /** Response headers as a plain object. */
  headers: Record<string, string>;
  /** Response body decoded to text. */
  raw: string;
  /** Size of the response body in bytes. */
  bytes: number;
  /** Attempts made to obtain this response. */
  attempts: CaptureFetchAttempt[];
}
38
+
39
/**
 * Fetches a candidate, retrying retryable failures with exponential back-off
 * (150 ms, 300 ms, ...).
 *
 * Every attempt, failed or successful, is recorded. At least one attempt is
 * always made: a negative or non-finite `retries` option no longer skips the
 * loop or retries forever. Retrying stops once the caller's own signal has
 * aborted, since any further attempt would be aborted immediately.
 *
 * @param candidate - The candidate URL and strategy to fetch.
 * @param options - Timeout, abort signal, size cap and retry count.
 * @returns The fetched response together with all recorded attempts.
 * @throws An `Error` carrying an `attempts` array when every attempt failed.
 */
export async function fetchCandidate(candidate: CaptureCandidate, options: CaptureFetchOptions = {}): Promise<FetchedCandidate> {
  const attempts: CaptureFetchAttempt[] = [];
  const requestedRetries = options.retries ?? DEFAULT_RETRIES;
  const retries = Number.isFinite(requestedRetries) ? Math.max(0, Math.floor(requestedRetries)) : DEFAULT_RETRIES;
  let lastError: unknown;

  for (let attemptNumber = 0; attemptNumber <= retries; attemptNumber += 1) {
    try {
      const fetched = await fetchOnce(candidate, options);
      attempts.push(...fetched.attempts);
      return { ...fetched, attempts };
    } catch (error) {
      lastError = error;
      // Optional chaining keeps a thrown `null`/`undefined` from crashing the handler.
      const attemptFromError = (error as (Error & { attempt?: CaptureFetchAttempt }) | null | undefined)?.attempt;
      attempts.push({
        ...(attemptFromError ?? { url: candidate.url, strategy: candidate.strategy }),
        error: error instanceof Error ? error.message : String(error),
      });

      const callerAborted = options.signal?.aborted === true;
      if (attemptNumber >= retries || callerAborted || !isRetryableError(error)) break;
      await sleep(150 * 2 ** attemptNumber);
    }
  }

  const message = lastError instanceof Error ? lastError.message : String(lastError);
  const error = new Error(message || `Failed to fetch ${candidate.url}`) as Error & { attempts?: CaptureFetchAttempt[] };
  error.attempts = attempts;
  throw error;
}
67
+
68
/**
 * Performs a single GET for the candidate with a timeout and a size cap.
 *
 * The body is read incrementally and the read stops as soon as the cap is
 * exceeded, so a response without (or with a wrong) `Content-Length` cannot
 * be buffered without bound. Size-limit errors carry the attempt record so
 * the failure is logged with status and content type. An unsupported charset
 * label falls back to UTF-8 instead of throwing.
 *
 * @throws An `Error` with `attempt` (and `retryable` where known) on HTTP
 *   errors and oversized responses; network and abort errors propagate as-is.
 */
async function fetchOnce(candidate: CaptureCandidate, options: CaptureFetchOptions): Promise<FetchedCandidate> {
  const maxBytes = options.maxBytes ?? MAX_CAPTURE_BYTES;
  const controller = new AbortController();
  const timeout = setTimeout(() => controller.abort(), options.timeoutMs ?? DEFAULT_TIMEOUT_MS);
  const abortFromParent = () => controller.abort();
  if (options.signal) {
    if (options.signal.aborted) controller.abort();
    options.signal.addEventListener("abort", abortFromParent, { once: true });
  }

  try {
    const response = await fetch(candidate.url, {
      redirect: "follow",
      signal: controller.signal,
      headers: {
        "user-agent": "Mozilla/5.0 (compatible; pi-memory-stone/0.1; +https://github.com/nikolasp/pi-memory-stone)",
        "accept": "text/html, text/markdown, text/plain, application/xhtml+xml, application/pdf, */*;q=0.5",
      },
    });

    const contentType = response.headers.get("content-type") ?? "text/plain";
    const attempt: CaptureFetchAttempt = {
      url: candidate.url,
      final_url: response.url,
      status: response.status,
      status_text: response.statusText,
      content_type: contentType,
      strategy: candidate.strategy,
    };

    if (!response.ok) {
      throw httpError(response, attempt);
    }

    // Cheap early rejection when the server declares an oversized body.
    const contentLength = Number(response.headers.get("content-length") ?? "0");
    if (contentLength > maxBytes) {
      await response.body?.cancel().catch(() => undefined);
      throw sizeLimitError(`Response is too large (${contentLength} bytes; max ${maxBytes})`, attempt);
    }

    const body = await readBodyCapped(response, maxBytes);
    attempt.bytes = body.bytesRead;
    if (body.data === null) {
      throw sizeLimitError(`Response is too large (${body.bytesRead}+ bytes; max ${maxBytes})`, attempt);
    }

    const raw = decodeBody(body.data, contentType);
    const headers = Object.fromEntries(response.headers.entries());

    return {
      candidate,
      finalUrl: response.url,
      status: response.status,
      statusText: response.statusText,
      contentType,
      headers,
      raw,
      bytes: body.data.byteLength,
      attempts: [attempt],
    };
  } finally {
    clearTimeout(timeout);
    options.signal?.removeEventListener("abort", abortFromParent);
  }
}

/**
 * Reads the response body, giving up once more than `maxBytes` have arrived.
 * `data` is `null` when the cap was exceeded; `bytesRead` is what was read so far.
 */
async function readBodyCapped(response: Response, maxBytes: number): Promise<{ data: Uint8Array | null; bytesRead: number }> {
  if (!response.body) {
    // No stream available: fall back to buffering and check afterwards.
    const buffered = new Uint8Array(await response.arrayBuffer());
    return { data: buffered.byteLength > maxBytes ? null : buffered, bytesRead: buffered.byteLength };
  }

  const reader = response.body.getReader();
  const chunks: Uint8Array[] = [];
  let bytesRead = 0;
  for (;;) {
    const { done, value } = await reader.read();
    if (done) break;
    if (!value) continue;
    bytesRead += value.byteLength;
    if (bytesRead > maxBytes) {
      await reader.cancel().catch(() => undefined);
      return { data: null, bytesRead };
    }
    chunks.push(value);
  }

  const data = new Uint8Array(bytesRead);
  let offset = 0;
  for (const chunk of chunks) {
    data.set(chunk, offset);
    offset += chunk.byteLength;
  }
  return { data, bytesRead };
}

/** Builds a non-retryable size-limit error that carries the attempt record. */
function sizeLimitError(message: string, attempt: CaptureFetchAttempt): Error & { retryable?: boolean; attempt?: CaptureFetchAttempt } {
  const error = new Error(message) as Error & { retryable?: boolean; attempt?: CaptureFetchAttempt };
  error.retryable = false;
  error.attempt = attempt;
  return error;
}

/** Decodes the body with the declared charset, falling back to UTF-8 for unsupported labels. */
function decodeBody(data: Uint8Array, contentType: string): string {
  let decoder: TextDecoder;
  try {
    decoder = new TextDecoder(detectCharset(contentType));
  } catch {
    decoder = new TextDecoder("utf-8");
  }
  return decoder.decode(data);
}
132
+
133
/**
 * Builds the error for a non-OK HTTP response. Status 429 and 5xx responses
 * are flagged retryable, and the attempt record is attached for logging.
 */
function httpError(response: Response, attempt: CaptureFetchAttempt): Error & { retryable?: boolean; attempt?: CaptureFetchAttempt } {
  const { status, statusText } = response;
  const message = `HTTP ${status} ${statusText}`.trim();
  const retryable = status === 429 || status >= 500;
  const failure: Error & { retryable?: boolean; attempt?: CaptureFetchAttempt } = Object.assign(new Error(message), {
    retryable,
    attempt,
  });
  return failure;
}
139
+
140
/**
 * Decides whether a failed fetch is worth retrying. An explicit `retryable`
 * property on the error wins; otherwise the message is matched against
 * known transient network failures.
 */
function isRetryableError(error: unknown): boolean {
  if (typeof error === "object" && error !== null && "retryable" in error) {
    const flagged = error as { retryable?: boolean };
    return Boolean(flagged.retryable);
  }

  const text = error instanceof Error ? error.message : String(error);
  const transientPattern = /fetch failed|network|ECONNRESET|ETIMEDOUT|aborted/i;
  return transientPattern.test(text);
}
147
+
148
/**
 * Extracts a usable charset label from a `Content-Type` header value.
 *
 * Quoted values (`charset="utf-8"`) are unquoted, and whitespace around `=`
 * is tolerated. A label `TextDecoder` does not recognize is replaced by
 * `utf-8`, so callers can construct a decoder without a `RangeError`.
 *
 * @returns The declared label when it is supported, otherwise `"utf-8"`.
 */
function detectCharset(contentType: string): string {
  const declared = contentType.match(/charset\s*=\s*("[^"]*"|'[^']*'|[^;\s]+)/i)?.[1];
  const label = declared?.replace(/^["']|["']$/g, "").trim();
  if (!label) {
    return "utf-8";
  }

  try {
    // Constructing a decoder is how label support is checked.
    new TextDecoder(label);
    return label;
  } catch {
    return "utf-8";
  }
}
152
+
153
/** Returns a promise that resolves after `ms` milliseconds. */
function sleep(ms: number): Promise<void> {
  return new Promise<void>((done) => {
    setTimeout(done, ms);
  });
}