npm - personal-ai - Versions diffs - 0.2.1 → 0.2.4 - Mend

personal-ai 0.2.1 → 0.2.4

This diff shows the changes between publicly available versions of a package that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (6) hide show

package/dist/index.mjs CHANGED Viewed

@@ -292,11 +292,123 @@ function spinner(text) {
 //#endregion
 //#region src/scraper/index.ts
 /**
-* Scrape a URL and return title + markdown content.
-* Uses fetch + defuddle for content extraction.
-* Falls back to basic HTML extraction if defuddle is unavailable.
+* Web scraper: three-tier strategy for content extraction.
+*
+* 1. GitHub blob URLs → convert to raw.githubusercontent.com, fetch raw markdown (zero rendering)
+* 2. General URLs → Playwright headless browser + DOM preprocessing + Defuddle extraction
+* 3. Fallback → plain fetch + Defuddle (for when Playwright is unavailable)
+*
+* Reference: linkmind-master/src/scraper.ts
 */
-async function scrapeUrl(url, timeout = 3e4) {
+const CHROME_UA = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36";
+/** Minimum markdown length to consider a scrape successful */
+const MIN_CONTENT_LENGTH = 100;
+/** Check if a URL points to a file on GitHub (blob view) */
+function isGithubBlobUrl(url) {
+	try {
+		const u = new URL(url);
+		return u.hostname === "github.com" && /\/blob\//.test(u.pathname);
+	} catch {
+		return false;
+	}
+}
+/** Convert GitHub blob URL to raw.githubusercontent.com URL */
+function toRawGithubUrl(url) {
+	return url.replace("https://github.com/", "https://raw.githubusercontent.com/").replace("/blob/", "/");
+}
+/** Fast path: fetch raw content directly from GitHub (returns markdown/text as-is) */
+async function scrapeGithubRaw(url, timeout) {
+	const rawUrl = toRawGithubUrl(url);
+	const controller = new AbortController();
+	const timer = setTimeout(() => controller.abort(), timeout);
+	try {
+		const resp = await fetch(rawUrl, {
+			signal: controller.signal,
+			headers: { "User-Agent": CHROME_UA }
+		});
+		if (!resp.ok) throw new Error(`GitHub raw fetch failed: HTTP ${resp.status}`);
+		const markdown = await resp.text();
+		const titleMatch = markdown.match(/^#\s+(.+)$/m);
+		const pathParts = new URL(url).pathname.split("/");
+		const filename = pathParts[pathParts.length - 1] ?? "Untitled";
+		return {
+			url,
+			title: titleMatch?.[1]?.trim() ?? filename,
+			markdown
+		};
+	} finally {
+		clearTimeout(timer);
+	}
+}
+/**
+* DOM preprocessing script executed inside the browser.
+* Removes navigation, ads, cookie banners, and other non-content elements
+* before Defuddle extraction. (Borrowed from linkmind)
+*/
+const DOM_PREPROCESS_SCRIPT = `(() => {
+  // Remove script, style, stylesheet links
+  document.querySelectorAll("script, style, link[rel='stylesheet']").forEach(el => el.remove());
+  // Remove navigation elements
+  document.querySelectorAll("nav, footer, aside").forEach(el => el.remove());
+  // Remove headers not inside article/main
+  document.querySelectorAll("header").forEach(el => {
+    if (!el.closest("article") && !el.closest("main")) el.remove();
+  });
+  // Remove ARIA landmark roles
+  document.querySelectorAll('[role="navigation"], [role="banner"], [role="contentinfo"], [role="complementary"], [role="search"]').forEach(el => el.remove());
+  // Remove cookie/share/comment noise
+  document.querySelectorAll('[class*="cookie-banner"], [id*="cookie-banner"], [class*="cookie-consent"], [class*="share-buttons"], [class*="social-share"], [class*="comment-section"], [id*="comments"]').forEach(el => el.remove());
+  // Remove hidden elements
+  document.querySelectorAll('[hidden], [aria-hidden="true"]').forEach(el => el.remove());
+  return {
+    title: document.title,
+    html: document.documentElement.outerHTML,
+  };
+})()`;
+/** Scrape with Playwright headless browser + Defuddle */
+async function scrapeWithPlaywright(url, timeout) {
+	const pw = await import("playwright");
+	const { Defuddle } = await import("defuddle/node");
+	const browser = await pw.chromium.launch({
+		headless: true,
+		args: ["--disable-blink-features=AutomationControlled"]
+	});
+	try {
+		const page = await (await browser.newContext({
+			viewport: {
+				width: 1280,
+				height: 900
+			},
+			userAgent: CHROME_UA,
+			locale: "en-US"
+		})).newPage();
+		await page.goto(url, {
+			waitUntil: "domcontentloaded",
+			timeout
+		});
+		await page.waitForTimeout(2e3);
+		const { title: pageTitle, html } = await page.evaluate(DOM_PREPROCESS_SCRIPT);
+		await browser.close();
+		const origLog = globalThis.console.log;
+		globalThis.console.log = (msg, ...args) => {
+			if (typeof msg === "string" && msg.includes("Initial parse returned very little content")) return;
+			origLog(msg, ...args);
+		};
+		const result = await Defuddle(html, url);
+		globalThis.console.log = origLog;
+		return {
+			url,
+			title: result.title || pageTitle || "Untitled",
+			markdown: result.content ? htmlToSimpleMarkdown(result.content) : ""
+		};
+	} catch (err) {
+		await browser.close().catch(() => {});
+		throw err;
+	}
+}
+/** Lightweight scrape: plain HTTP fetch + Defuddle (no JS rendering) */
+async function scrapeWithFetch(url, timeout) {
 	const controller = new AbortController();
 	const timer = setTimeout(() => controller.abort(), timeout);
 	let html;
@@ -304,7 +416,7 @@ async function scrapeUrl(url, timeout = 3e4) {
 		const response = await fetch(url, {
 			signal: controller.signal,
 			headers: {
-				"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36",
+				"User-Agent": CHROME_UA,
 				Accept: "text/html,application/xhtml+xml"
 			}
 		});
@@ -314,16 +426,13 @@ async function scrapeUrl(url, timeout = 3e4) {
 		clearTimeout(timer);
 	}
 	try {
-		const defuddleMod = await import("defuddle/node");
-		const Defuddle = defuddleMod.Defuddle ?? defuddleMod.default;
-		if (Defuddle) {
-			const result = new Defuddle(html, { url }).parse();
-			return {
-				url,
-				title: result.title || extractTitleFromHtml(html),
-				markdown: result.content ? htmlToSimpleMarkdown(result.content) : extractTextFromHtml(html)
-			};
-		}
+		const { Defuddle } = await import("defuddle/node");
+		const result = await Defuddle(html, url);
+		return {
+			url,
+			title: result.title || extractTitleFromHtml(html),
+			markdown: result.content ? htmlToSimpleMarkdown(result.content) : extractTextFromHtml(html)
+		};
 	} catch {
 		warn("defuddle not available, using basic HTML extraction");
 	}
@@ -333,17 +442,77 @@ async function scrapeUrl(url, timeout = 3e4) {
 		markdown: extractTextFromHtml(html)
 	};
 }
+/**
+* Scrape a URL and return title + markdown content.
+*
+* Strategy:
+* 1. GitHub blob URL → raw.githubusercontent.com (instant, perfect fidelity)
+* 2. Playwright + Defuddle (handles JS-rendered pages)
+* 3. Fetch + Defuddle fallback (static pages, or when Playwright missing)
+*/
+async function scrapeUrl(url, timeout = 3e4) {
+	if (isGithubBlobUrl(url)) {
+		info("GitHub blob detected — fetching raw content directly");
+		return scrapeGithubRaw(url, timeout);
+	}
+	try {
+		const result = await scrapeWithPlaywright(url, timeout);
+		if (result.markdown.length < MIN_CONTENT_LENGTH) {
+			warn(`Playwright extracted only ${result.markdown.length} chars — trying fetch fallback`);
+			const fallback = await scrapeWithFetch(url, timeout);
+			return fallback.markdown.length > result.markdown.length ? fallback : result;
+		}
+		return result;
+	} catch (err) {
+		const msg = err instanceof Error ? err.message : String(err);
+		warn(`Playwright scrape failed (${msg}) — falling back to fetch`);
+	}
+	return scrapeWithFetch(url, timeout);
+}
 /** Extract <title> from HTML */
 function extractTitleFromHtml(html) {
 	return html.match(/<title[^>]*>([^<]+)<\/title>/i)?.[1]?.trim() ?? "Untitled";
 }
-/** Basic HTML to text extraction (fallback) */
+/** Basic HTML to text extraction (last-resort fallback) */
 function extractTextFromHtml(html) {
 	return html.replace(/<script[\s\S]*?<\/script>/gi, "").replace(/<style[\s\S]*?<\/style>/gi, "").replace(/<[^>]+>/g, " ").replace(/&amp;/g, "&").replace(/&lt;/g, "<").replace(/&gt;/g, ">").replace(/&quot;/g, "\"").replace(/&#39;/g, "'").replace(/&nbsp;/g, " ").replace(/\s+/g, " ").trim().slice(0, 1e4);
 }
-/** Convert simple HTML to markdown */
+/** Convert HTML fragment to simple Markdown (from linkmind, extended) */
 function htmlToSimpleMarkdown(html) {
-	return html.replace(/<h1[^>]*>([\s\S]*?)<\/h1>/gi, "# $1\n\n").replace(/<h2[^>]*>([\s\S]*?)<\/h2>/gi, "## $1\n\n").replace(/<h3[^>]*>([\s\S]*?)<\/h3>/gi, "### $1\n\n").replace(/<p[^>]*>([\s\S]*?)<\/p>/gi, "$1\n\n").replace(/<br\s*\/?>/gi, "\n").replace(/<strong[^>]*>([\s\S]*?)<\/strong>/gi, "**$1**").replace(/<em[^>]*>([\s\S]*?)<\/em>/gi, "*$1*").replace(/<a[^>]*href="([^"]*)"[^>]*>([\s\S]*?)<\/a>/gi, "[$2]($1)").replace(/<li[^>]*>([\s\S]*?)<\/li>/gi, "- $1\n").replace(/<code[^>]*>([\s\S]*?)<\/code>/gi, "`$1`").replace(/<pre[^>]*>([\s\S]*?)<\/pre>/gi, "```\n$1\n```\n").replace(/<[^>]+>/g, "").replace(/&amp;/g, "&").replace(/&lt;/g, "<").replace(/&gt;/g, ">").replace(/&quot;/g, "\"").replace(/&#39;/g, "'").replace(/&nbsp;/g, " ").replace(/\n{3,}/g, "\n\n").trim();
+	if (!html) return "";
+	let md = html;
+	md = md.replace(/<h1[^>]*>(.*?)<\/h1>/gi, "# $1\n\n");
+	md = md.replace(/<h2[^>]*>(.*?)<\/h2>/gi, "## $1\n\n");
+	md = md.replace(/<h3[^>]*>(.*?)<\/h3>/gi, "### $1\n\n");
+	md = md.replace(/<h4[^>]*>(.*?)<\/h4>/gi, "#### $1\n\n");
+	md = md.replace(/<h5[^>]*>(.*?)<\/h5>/gi, "##### $1\n\n");
+	md = md.replace(/<h6[^>]*>(.*?)<\/h6>/gi, "###### $1\n\n");
+	md = md.replace(/<p[^>]*>/gi, "\n\n");
+	md = md.replace(/<\/p>/gi, "");
+	md = md.replace(/<br\s*\/?>/gi, "\n");
+	md = md.replace(/<(strong|b)[^>]*>(.*?)<\/(strong|b)>/gi, "**$2**");
+	md = md.replace(/<(em|i)[^>]*>(.*?)<\/(em|i)>/gi, "*$2*");
+	md = md.replace(/<a[^>]*href="([^"]*)"[^>]*>(.*?)<\/a>/gi, "[$2]($1)");
+	md = md.replace(/<code[^>]*>(.*?)<\/code>/gi, "`$1`");
+	md = md.replace(/<pre[^>]*>(.*?)<\/pre>/gis, "\n```\n$1\n```\n");
+	md = md.replace(/<li[^>]*>/gi, "- ");
+	md = md.replace(/<\/li>/gi, "\n");
+	md = md.replace(/<\/?[uo]l[^>]*>/gi, "\n");
+	md = md.replace(/<blockquote[^>]*>(.*?)<\/blockquote>/gis, (_, content) => {
+		return content.split("\n").map((line) => `> ${line}`).join("\n");
+	});
+	md = md.replace(/<img[^>]*src="([^"]*)"[^>]*alt="([^"]*)"[^>]*\/?>/gi, "![$2]($1)");
+	md = md.replace(/<img[^>]*src="([^"]*)"[^>]*\/?>/gi, "![]($1)");
+	md = md.replace(/<[^>]+>/g, "");
+	md = md.replace(/&amp;/g, "&");
+	md = md.replace(/&lt;/g, "<");
+	md = md.replace(/&gt;/g, ">");
+	md = md.replace(/&quot;/g, "\"");
+	md = md.replace(/&#39;/g, "'");
+	md = md.replace(/&nbsp;/g, " ");
+	md = md.replace(/\n{3,}/g, "\n\n");
+	md = md.trim();
+	return md;
 }
 //#endregion
@@ -658,44 +827,95 @@ const VALID_TYPES = new Set([
 	"entity",
 	"event"
 ]);
-/** Extract PINData entries from raw/journal content via a single LLM call */
+/**
+* Budget for the total text sent to LLM.
+* ~10K chars ≈ ~2.5K tokens (English) / ~5K tokens (CJK).
+* PINData extraction only needs gist, not full content.
+*/
+const MAX_INPUT_CHARS = 1e4;
+/** Head portion gets the lion's share — title, intro, overview */
+const HEAD_CHARS = 4e3;
+/** Tail portion — conclusions, takeaways, resource lists */
+const TAIL_CHARS = 3e3;
+/** Remaining budget goes to random middle samples */
+const MIDDLE_BUDGET = MAX_INPUT_CHARS - HEAD_CHARS - TAIL_CHARS;
+/** Number of random middle samples to pick */
+const MIDDLE_SAMPLES = 2;
+/**
+* For content that fits the budget, return as-is.
+* For long content, sample: head + tail + random middle paragraphs.
+*
+* Rationale: PINData extraction asks "what does this mean to the USER",
+* not "summarize the entire document". The head (title/intro) and tail
+* (conclusions/resources) carry 80%+ of personal signal. Middle sections
+* of long articles (paper tables, code listings, repetitive data) are
+* mostly noise for personal knowledge extraction.
+*/
+function prepareContent(text) {
+	if (text.length <= MAX_INPUT_CHARS) return text;
+	const head = text.slice(0, HEAD_CHARS);
+	const tail = text.slice(-TAIL_CHARS);
+	const middleStart = HEAD_CHARS;
+	const middleEnd = text.length - TAIL_CHARS;
+	const paragraphs = text.slice(middleStart, middleEnd).split(/\n{2,}/).map((p) => p.trim()).filter((p) => p.length > 100);
+	const samples = [];
+	let sampledChars = 0;
+	const perSampleBudget = Math.floor(MIDDLE_BUDGET / MIDDLE_SAMPLES);
+	if (paragraphs.length > 0) {
+		const step = Math.max(1, Math.floor(paragraphs.length / MIDDLE_SAMPLES));
+		for (let i = 0; i < MIDDLE_SAMPLES && i * step < paragraphs.length; i++) {
+			const truncated = paragraphs[i * step].slice(0, perSampleBudget);
+			samples.push(truncated);
+			sampledChars += truncated.length;
+		}
+	}
+	const assembled = `${head}${samples.length > 0 ? `\n\n[... middle section sampled — ${(middleEnd - middleStart).toLocaleString()} chars total ...]\n\n${samples.join("\n\n---\n\n")}` : ""}\n\n[... end section ...]\n\n${tail}`;
+	info(`Content ${text.length.toLocaleString()} chars → sampled to ${assembled.length.toLocaleString()} chars (head:${HEAD_CHARS} + ${samples.length} mid-samples:${sampledChars} + tail:${TAIL_CHARS})`);
+	return assembled;
+}
+/** Parse one LLM JSON response into validated PINDataEntry[] */
+function parseExtractResponse(response) {
+	const cleaned = response.replace(/```json?\n?/g, "").replace(/```/g, "").trim();
+	const raw = JSON.parse(cleaned);
+	let rawEntries;
+	if (Array.isArray(raw)) rawEntries = raw;
+	else if (raw && typeof raw === "object" && "entries" in raw && Array.isArray(raw.entries)) rawEntries = raw.entries;
+	else return [];
+	const entries = [];
+	for (const item of rawEntries) {
+		if (!item || typeof item !== "object") continue;
+		const obj = item;
+		const type = obj.type;
+		const entryContent = obj.content;
+		const topic = obj.topic;
+		if (!type || !entryContent || !topic) continue;
+		if (!VALID_TYPES.has(type)) continue;
+		entries.push({
+			type,
+			content: entryContent.trim(),
+			topic: topic.trim(),
+			tags: Array.isArray(obj.tags) ? obj.tags.filter((t) => typeof t === "string") : void 0
+		});
+	}
+	return entries;
+}
+/**
+* Extract PINData entries from raw/journal content via a single LLM call.
+* Long content is sampled (head + tail + middle samples) to fit the budget.
+*/
 async function extractPinData(content, source) {
 	const system = extractSystemPrompt();
-	const response = await llmCall(extractUserPrompt(content, source), system);
+	const prompt = extractUserPrompt(prepareContent(content), source);
 	try {
-		const cleaned = response.replace(/```json?\n?/g, "").replace(/```/g, "").trim();
-		const raw = JSON.parse(cleaned);
-		let rawEntries;
-		if (Array.isArray(raw)) rawEntries = raw;
-		else if (raw && typeof raw === "object" && "entries" in raw && Array.isArray(raw.entries)) rawEntries = raw.entries;
-		else return {
-			entries: [],
-			summary: ""
-		};
-		const entries = [];
-		for (const item of rawEntries) {
-			if (!item || typeof item !== "object") continue;
-			const obj = item;
-			const type = obj.type;
-			const content = obj.content;
-			const topic = obj.topic;
-			if (!type || !content || !topic) continue;
-			if (!VALID_TYPES.has(type)) continue;
-			entries.push({
-				type,
-				content: content.trim(),
-				topic: topic.trim(),
-				tags: Array.isArray(obj.tags) ? obj.tags.filter((t) => typeof t === "string") : void 0
-			});
-		}
+		const entries = parseExtractResponse(await llmCall(prompt, system));
 		return {
 			entries,
 			summary: entries.length > 0 ? entries.slice(0, 3).map((e) => e.content).join("; ") : "no extractable signal"
 		};
-	} catch {
+	} catch (err) {
 		return {
 			entries: [],
-			summary: `Failed to parse extract response: ${response.slice(0, 100)}`
+			summary: `Failed to parse extract response: ${(err instanceof Error ? err.message : String(err)).slice(0, 100)}`
 		};
 	}
 }