personal-ai 0.2.1 → 0.2.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/SKILL.md +127 -268
- package/dist/entry.mjs +350 -51
- package/dist/entry.mjs.map +1 -1
- package/dist/index.mjs +267 -47
- package/dist/index.mjs.map +1 -1
- package/package.json +3 -1
package/dist/index.mjs
CHANGED
|
@@ -292,11 +292,123 @@ function spinner(text) {
|
|
|
292
292
|
//#endregion
|
|
293
293
|
//#region src/scraper/index.ts
|
|
294
294
|
/**
|
|
295
|
-
*
|
|
296
|
-
*
|
|
297
|
-
*
|
|
295
|
+
* Web scraper: three-tier strategy for content extraction.
|
|
296
|
+
*
|
|
297
|
+
* 1. GitHub blob URLs → convert to raw.githubusercontent.com, fetch raw markdown (zero rendering)
|
|
298
|
+
* 2. General URLs → Playwright headless browser + DOM preprocessing + Defuddle extraction
|
|
299
|
+
* 3. Fallback → plain fetch + Defuddle (for when Playwright is unavailable)
|
|
300
|
+
*
|
|
301
|
+
* Reference: linkmind-master/src/scraper.ts
|
|
298
302
|
*/
|
|
299
|
-
|
|
303
|
+
const CHROME_UA = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36";
|
|
304
|
+
/** Minimum markdown length to consider a scrape successful */
|
|
305
|
+
const MIN_CONTENT_LENGTH = 100;
|
|
306
|
+
/**
 * Check whether a URL points to a file on GitHub (the "blob" view).
 *
 * A blob URL has the path shape `/<owner>/<repo>/blob/<ref>/<path...>`.
 * The check is positional rather than a substring search, so paths that
 * merely contain "/blob/" somewhere else (an owner or directory named
 * "blob", e.g. `/blob/repo` or `/o/r/tree/main/blob/x`) are not mistaken
 * for file pages.
 *
 * @param {string} url - Candidate URL.
 * @returns {boolean} True only for https://github.com blob URLs; false for
 *   anything else, including strings that are not parseable URLs.
 */
function isGithubBlobUrl(url) {
  let parsed;
  try {
    parsed = new URL(url);
  } catch {
    return false;
  }
  // raw.githubusercontent.com is served over https, so only https pages qualify.
  if (parsed.protocol !== "https:" || parsed.hostname !== "github.com") return false;
  // Expected segments: owner, repo, "blob", ref, then at least one path part.
  const segments = parsed.pathname.split("/").filter(Boolean);
  return segments.length >= 5 && segments[2] === "blob";
}
|
|
315
|
+
/**
 * Convert a GitHub blob URL to its raw.githubusercontent.com equivalent.
 *
 * `https://github.com/<owner>/<repo>/blob/<ref>/<path>` becomes
 * `https://raw.githubusercontent.com/<owner>/<repo>/<ref>/<path>`.
 * The "blob" segment is removed by position (third path segment), so an
 * owner or repository that is itself called "blob" is left intact. The
 * result is always https, and any query string or fragment is preserved.
 *
 * @param {string} url - GitHub blob URL.
 * @returns {string} The raw-content URL. Input that does not parse as a
 *   github.com blob URL goes through the plain string rewrite
 *   (host prefix swap + first "/blob/" removed) for backward compatibility.
 */
function toRawGithubUrl(url) {
  let parsed = null;
  try {
    parsed = new URL(url);
  } catch {
    parsed = null;
  }
  // pathname.split("/") yields ["", owner, repo, "blob", ref, ...path].
  const segments = parsed ? parsed.pathname.split("/") : [];
  const isBlobShape = parsed !== null && parsed.hostname === "github.com" && segments.length >= 5 && segments[3] === "blob";
  if (!isBlobShape) {
    return url.replace("https://github.com/", "https://raw.githubusercontent.com/").replace("/blob/", "/");
  }
  segments.splice(3, 1);
  return `https://raw.githubusercontent.com${segments.join("/")}${parsed.search}${parsed.hash}`;
}
|
|
319
|
+
/**
 * Fast path: fetch a GitHub file's raw content and return it unmodified.
 *
 * The title is the first level-1 markdown heading (`# ...`) found in the
 * body; when there is none, the decoded file name from the URL is used,
 * and "Untitled" only if the URL has no file name at all.
 *
 * @param {string} url - GitHub blob URL (converted with toRawGithubUrl).
 * @param {number} timeout - Milliseconds before the request is aborted.
 * @returns {Promise<{url: string, title: string, markdown: string}>}
 *   The original URL, the derived title and the raw file text.
 * @throws {Error} When the response status is not OK, or when fetch
 *   rejects (including the abort triggered by the timeout).
 */
async function scrapeGithubRaw(url, timeout) {
  const rawUrl = toRawGithubUrl(url);
  const controller = new AbortController();
  const timer = setTimeout(() => controller.abort(), timeout);
  try {
    const resp = await fetch(rawUrl, {
      signal: controller.signal,
      headers: { "User-Agent": CHROME_UA }
    });
    if (!resp.ok) throw new Error(`GitHub raw fetch failed: HTTP ${resp.status}`);
    const markdown = await resp.text();
    const heading = markdown.match(/^#\s+(.+)$/m)?.[1]?.trim();
    // Last non-empty path segment, so a trailing slash does not yield "".
    const lastSegment = new URL(url).pathname.split("/").filter(Boolean).pop();
    let filename = "Untitled";
    if (lastSegment) {
      try {
        filename = decodeURIComponent(lastSegment);
      } catch {
        // Malformed percent-encoding: fall back to the segment as written.
        filename = lastSegment;
      }
    }
    return {
      url,
      // `||` rather than `??`: a heading that trims to "" must also fall back.
      title: heading || filename,
      markdown
    };
  } finally {
    clearTimeout(timer);
  }
}
|
|
343
|
+
/**
|
|
344
|
+
* DOM preprocessing script executed inside the browser.
|
|
345
|
+
* Removes navigation, ads, cookie banners, and other non-content elements
|
|
346
|
+
* before Defuddle extraction. (Borrowed from linkmind)
|
|
347
|
+
*/
|
|
348
|
+
const DOM_PREPROCESS_SCRIPT = `(() => {
|
|
349
|
+
// Remove script, style, stylesheet links
|
|
350
|
+
document.querySelectorAll("script, style, link[rel='stylesheet']").forEach(el => el.remove());
|
|
351
|
+
// Remove navigation elements
|
|
352
|
+
document.querySelectorAll("nav, footer, aside").forEach(el => el.remove());
|
|
353
|
+
// Remove headers not inside article/main
|
|
354
|
+
document.querySelectorAll("header").forEach(el => {
|
|
355
|
+
if (!el.closest("article") && !el.closest("main")) el.remove();
|
|
356
|
+
});
|
|
357
|
+
// Remove ARIA landmark roles
|
|
358
|
+
document.querySelectorAll('[role="navigation"], [role="banner"], [role="contentinfo"], [role="complementary"], [role="search"]').forEach(el => el.remove());
|
|
359
|
+
// Remove cookie/share/comment noise
|
|
360
|
+
document.querySelectorAll('[class*="cookie-banner"], [id*="cookie-banner"], [class*="cookie-consent"], [class*="share-buttons"], [class*="social-share"], [class*="comment-section"], [id*="comments"]').forEach(el => el.remove());
|
|
361
|
+
// Remove hidden elements
|
|
362
|
+
document.querySelectorAll('[hidden], [aria-hidden="true"]').forEach(el => el.remove());
|
|
363
|
+
|
|
364
|
+
return {
|
|
365
|
+
title: document.title,
|
|
366
|
+
html: document.documentElement.outerHTML,
|
|
367
|
+
};
|
|
368
|
+
})()`;
|
|
369
|
+
/**
 * Scrape a page with a Playwright headless Chromium browser, then extract
 * the main content with Defuddle and convert it to markdown.
 *
 * Steps: load the page (waiting for DOMContentLoaded plus a fixed 2 s
 * settle delay), run DOM_PREPROCESS_SCRIPT inside the page to strip
 * non-content elements, close the browser, and feed the remaining HTML to
 * Defuddle.
 *
 * @param {string} url - Page to scrape.
 * @param {number} timeout - Navigation timeout in milliseconds, passed to page.goto.
 * @returns {Promise<{url: string, title: string, markdown: string}>}
 *   Title falls back from Defuddle's title to document.title to "Untitled";
 *   markdown is "" when Defuddle returns no content.
 * @throws Rejects when playwright/defuddle cannot be imported, the browser
 *   cannot launch, navigation fails, or Defuddle fails.
 */
async function scrapeWithPlaywright(url, timeout) {
  const pw = await import("playwright");
  const { Defuddle } = await import("defuddle/node");
  const browser = await pw.chromium.launch({
    headless: true,
    // NOTE(review): presumably reduces automation fingerprinting — confirm.
    args: ["--disable-blink-features=AutomationControlled"]
  });
  let extracted;
  try {
    const context = await browser.newContext({
      viewport: {
        width: 1280,
        height: 900
      },
      userAgent: CHROME_UA,
      locale: "en-US"
    });
    const page = await context.newPage();
    await page.goto(url, {
      waitUntil: "domcontentloaded",
      timeout
    });
    await page.waitForTimeout(2e3);
    extracted = await page.evaluate(DOM_PREPROCESS_SCRIPT);
  } finally {
    // Always release the browser; a failed close must not mask the real
    // outcome (the page content or the original error).
    await browser.close().catch(() => {});
  }
  const { title: pageTitle, html } = extracted;
  // Temporarily filter one noisy Defuddle log line. The original logger is
  // restored in `finally` so a Defuddle failure cannot leave console.log
  // patched for the rest of the process.
  const origLog = globalThis.console.log;
  globalThis.console.log = (msg, ...args) => {
    if (typeof msg === "string" && msg.includes("Initial parse returned very little content")) return;
    origLog(msg, ...args);
  };
  let result;
  try {
    result = await Defuddle(html, url);
  } finally {
    globalThis.console.log = origLog;
  }
  return {
    url,
    title: result.title || pageTitle || "Untitled",
    markdown: result.content ? htmlToSimpleMarkdown(result.content) : ""
  };
}
|
|
410
|
+
/** Lightweight scrape: plain HTTP fetch + Defuddle (no JS rendering) */
|
|
411
|
+
async function scrapeWithFetch(url, timeout) {
|
|
300
412
|
const controller = new AbortController();
|
|
301
413
|
const timer = setTimeout(() => controller.abort(), timeout);
|
|
302
414
|
let html;
|
|
@@ -304,7 +416,7 @@ async function scrapeUrl(url, timeout = 3e4) {
|
|
|
304
416
|
const response = await fetch(url, {
|
|
305
417
|
signal: controller.signal,
|
|
306
418
|
headers: {
|
|
307
|
-
"User-Agent":
|
|
419
|
+
"User-Agent": CHROME_UA,
|
|
308
420
|
Accept: "text/html,application/xhtml+xml"
|
|
309
421
|
}
|
|
310
422
|
});
|
|
@@ -314,16 +426,13 @@ async function scrapeUrl(url, timeout = 3e4) {
|
|
|
314
426
|
clearTimeout(timer);
|
|
315
427
|
}
|
|
316
428
|
try {
|
|
317
|
-
const
|
|
318
|
-
const
|
|
319
|
-
|
|
320
|
-
|
|
321
|
-
|
|
322
|
-
|
|
323
|
-
|
|
324
|
-
markdown: result.content ? htmlToSimpleMarkdown(result.content) : extractTextFromHtml(html)
|
|
325
|
-
};
|
|
326
|
-
}
|
|
429
|
+
const { Defuddle } = await import("defuddle/node");
|
|
430
|
+
const result = await Defuddle(html, url);
|
|
431
|
+
return {
|
|
432
|
+
url,
|
|
433
|
+
title: result.title || extractTitleFromHtml(html),
|
|
434
|
+
markdown: result.content ? htmlToSimpleMarkdown(result.content) : extractTextFromHtml(html)
|
|
435
|
+
};
|
|
327
436
|
} catch {
|
|
328
437
|
warn("defuddle not available, using basic HTML extraction");
|
|
329
438
|
}
|
|
@@ -333,17 +442,77 @@ async function scrapeUrl(url, timeout = 3e4) {
|
|
|
333
442
|
markdown: extractTextFromHtml(html)
|
|
334
443
|
};
|
|
335
444
|
}
|
|
445
|
+
/**
 * Scrape a URL and return title + markdown content.
 *
 * Strategy:
 * 1. GitHub blob URL → raw.githubusercontent.com (no rendering involved).
 * 2. Playwright + Defuddle (handles JS-rendered pages).
 * 3. Fetch + Defuddle fallback, used when Playwright throws or when it
 *    extracts fewer than MIN_CONTENT_LENGTH characters.
 *
 * When Playwright produced a short result, the longer of the two results
 * is returned; if the fetch fallback itself fails, the Playwright result
 * is kept instead of being discarded.
 *
 * @param {string} url - Page to scrape.
 * @param {number} [timeout=3e4] - Per-attempt timeout in milliseconds.
 * @returns {Promise<{url: string, title: string, markdown: string}>}
 * @throws Rejects when the GitHub raw fetch fails, or when Playwright
 *   failed and the fetch fallback fails as well.
 */
async function scrapeUrl(url, timeout = 3e4) {
  if (isGithubBlobUrl(url)) {
    info("GitHub blob detected — fetching raw content directly");
    return scrapeGithubRaw(url, timeout);
  }
  let rendered;
  try {
    rendered = await scrapeWithPlaywright(url, timeout);
  } catch (err) {
    const msg = err instanceof Error ? err.message : String(err);
    warn(`Playwright scrape failed (${msg}) — falling back to fetch`);
    return scrapeWithFetch(url, timeout);
  }
  if (rendered.markdown.length >= MIN_CONTENT_LENGTH) return rendered;
  warn(`Playwright extracted only ${rendered.markdown.length} chars — trying fetch fallback`);
  try {
    const fallback = await scrapeWithFetch(url, timeout);
    return fallback.markdown.length > rendered.markdown.length ? fallback : rendered;
  } catch (err) {
    // Handled separately from the Playwright failure above: Playwright did
    // succeed here, so report the fetch error accurately, do not fetch a
    // second time, and keep the (short) rendered result.
    const msg = err instanceof Error ? err.message : String(err);
    warn(`Fetch fallback failed (${msg}) — keeping Playwright result`);
    return rendered;
  }
}
|
|
336
472
|
/**
 * Extract the text of the first <title> element from an HTML string.
 *
 * @param {string} html - HTML document source.
 * @returns {string} The trimmed title, or "Untitled" when there is no
 *   <title> element or its text is empty/whitespace-only.
 */
function extractTitleFromHtml(html) {
  const title = html.match(/<title[^>]*>([^<]+)<\/title>/i)?.[1]?.trim();
  // `||` rather than `??`: a title that trims to "" must also fall back.
  return title || "Untitled";
}
|
|
340
|
-
/** Basic HTML to text extraction (fallback) */
|
|
476
|
+
/**
 * Basic HTML to text extraction (last-resort fallback).
 *
 * Drops <script> and <style> elements with their contents, replaces every
 * remaining tag with a space, decodes a small set of common HTML entities,
 * collapses whitespace runs to single spaces and caps the result at
 * 10,000 characters.
 *
 * @param {string} html - HTML document source.
 * @returns {string} Plain text, at most 10,000 characters long.
 */
function extractTextFromHtml(html) {
  // Entity name (without the leading ampersand and trailing semicolon) → character.
  const entities = {
    amp: "&",
    lt: "<",
    gt: ">",
    quot: "\"",
    "#39": "'",
    nbsp: " "
  };
  return html
    .replace(/<script[\s\S]*?<\/script>/gi, "")
    .replace(/<style[\s\S]*?<\/style>/gi, "")
    .replace(/<[^>]+>/g, " ")
    // Single pass, so a double-encoded sequence is decoded exactly one level.
    .replace(/&(amp|lt|gt|quot|#39|nbsp);/g, (_, name) => entities[name])
    .replace(/\s+/g, " ")
    .trim()
    .slice(0, 1e4);
}
|
|
344
|
-
/** Convert
|
|
480
|
+
/**
 * Convert an HTML fragment to simple Markdown using regex rewrites.
 *
 * Handles headings (h1–h6), paragraphs, line breaks, bold/italic, links,
 * inline code, fenced code blocks (<pre>), list items, blockquotes and
 * images; every other tag is stripped. A small set of common HTML
 * entities is decoded after tag stripping, so escaped markup in the source
 * survives as literal text.
 *
 * Notes on ordering and patterns:
 * - Tag names are followed by `\b` so that `<p>` does not match `<pre>`,
 *   `<b>` does not match `<blockquote>`/`<br>`, and `<i>` does not match
 *   `<img>`.
 * - <pre> blocks are converted first and lose their inner <code> tags, so
 *   the inline-code rule cannot put backticks inside a fence.
 *
 * @param {string} html - HTML fragment; falsy input yields "".
 * @returns {string} Trimmed Markdown with runs of 3+ newlines collapsed to 2.
 */
function htmlToSimpleMarkdown(html) {
  if (!html) return "";
  // Entity name (without the leading ampersand and trailing semicolon) → character.
  const entities = {
    amp: "&",
    lt: "<",
    gt: ">",
    quot: "\"",
    "#39": "'",
    nbsp: " "
  };
  let md = html;
  // Fenced code blocks. A replacer function keeps "$" sequences in code literal.
  md = md.replace(/<pre\b[^>]*>(.*?)<\/pre>/gis, (_, code) => {
    const body = code.replace(/<\/?code\b[^>]*>/gi, "");
    return `\n\`\`\`\n${body}\n\`\`\`\n`;
  });
  for (let level = 1; level <= 6; level++) {
    const heading = new RegExp(`<h${level}\\b[^>]*>(.*?)<\\/h${level}>`, "gi");
    md = md.replace(heading, `${"#".repeat(level)} $1\n\n`);
  }
  md = md.replace(/<p\b[^>]*>/gi, "\n\n");
  md = md.replace(/<\/p>/gi, "");
  md = md.replace(/<br\s*\/?>/gi, "\n");
  md = md.replace(/<(strong|b)\b[^>]*>(.*?)<\/(strong|b)>/gi, "**$2**");
  md = md.replace(/<(em|i)\b[^>]*>(.*?)<\/(em|i)>/gi, "*$2*");
  md = md.replace(/<a\b[^>]*href="([^"]*)"[^>]*>(.*?)<\/a>/gi, "[$2]($1)");
  md = md.replace(/<code\b[^>]*>(.*?)<\/code>/gi, "`$1`");
  md = md.replace(/<li\b[^>]*>/gi, "- ");
  md = md.replace(/<\/li>/gi, "\n");
  md = md.replace(/<\/?[uo]l\b[^>]*>/gi, "\n");
  md = md.replace(/<blockquote\b[^>]*>(.*?)<\/blockquote>/gis, (_, content) => {
    return content.split("\n").map((line) => `> ${line}`).join("\n");
  });
  // Images: with alt text first (src must precede alt), then src-only.
  md = md.replace(/<img\b[^>]*src="([^"]*)"[^>]*alt="([^"]*)"[^>]*\/?>/gi, "![$2]($1)");
  md = md.replace(/<img\b[^>]*src="([^"]*)"[^>]*\/?>/gi, "![]($1)");
  md = md.replace(/<[^>]+>/g, "");
  // Single pass, so a double-encoded sequence is decoded exactly one level.
  md = md.replace(/&(amp|lt|gt|quot|#39|nbsp);/g, (_, name) => entities[name]);
  md = md.replace(/\n{3,}/g, "\n\n");
  return md.trim();
}
|
|
348
517
|
|
|
349
518
|
//#endregion
|
|
@@ -658,44 +827,95 @@ const VALID_TYPES = new Set([
|
|
|
658
827
|
"entity",
|
|
659
828
|
"event"
|
|
660
829
|
]);
|
|
661
|
-
/**
|
|
830
|
+
/**
|
|
831
|
+
* Budget for the total text sent to LLM.
|
|
832
|
+
* ~10K chars ≈ ~2.5K tokens (English) / ~5K tokens (CJK).
|
|
833
|
+
* PINData extraction only needs gist, not full content.
|
|
834
|
+
*/
|
|
835
|
+
const MAX_INPUT_CHARS = 1e4;
|
|
836
|
+
/** Head portion gets the lion's share — title, intro, overview */
|
|
837
|
+
const HEAD_CHARS = 4e3;
|
|
838
|
+
/** Tail portion — conclusions, takeaways, resource lists */
|
|
839
|
+
const TAIL_CHARS = 3e3;
|
|
840
|
+
/** Remaining budget goes to random middle samples */
|
|
841
|
+
const MIDDLE_BUDGET = MAX_INPUT_CHARS - HEAD_CHARS - TAIL_CHARS;
|
|
842
|
+
/** Number of random middle samples to pick */
|
|
843
|
+
const MIDDLE_SAMPLES = 2;
|
|
844
|
+
/**
 * Fit text into the LLM input budget.
 *
 * Text no longer than MAX_INPUT_CHARS is returned unchanged. Longer text
 * is reduced to: the first HEAD_CHARS characters, up to MIDDLE_SAMPLES
 * paragraphs taken from the middle, and the last TAIL_CHARS characters,
 * joined with bracketed marker lines.
 *
 * Middle paragraphs are the blank-line-separated chunks between head and
 * tail that are longer than 100 characters after trimming. They are picked
 * at an even stride (deterministically, not at random), starting with the
 * first one, and each is cut to MIDDLE_BUDGET / MIDDLE_SAMPLES characters.
 * A summary of the sampling is logged through info().
 *
 * @param {string} text - Full content.
 * @returns {string} The original text, or the sampled assembly.
 */
function prepareContent(text) {
  const totalLength = text.length;
  if (totalLength <= MAX_INPUT_CHARS) {
    return text;
  }
  const middleLength = totalLength - TAIL_CHARS - HEAD_CHARS;

  // Collect middle paragraphs that are substantial enough to be worth sampling.
  const candidates = [];
  for (const chunk of text.slice(HEAD_CHARS, totalLength - TAIL_CHARS).split(/\n{2,}/)) {
    const paragraph = chunk.trim();
    if (paragraph.length > 100) {
      candidates.push(paragraph);
    }
  }

  // Walk the candidates at an even stride, capping each pick at its share of the budget.
  const sampleCap = Math.floor(MIDDLE_BUDGET / MIDDLE_SAMPLES);
  const stride = Math.max(1, Math.floor(candidates.length / MIDDLE_SAMPLES));
  const picked = [];
  for (let index = 0, taken = 0; taken < MIDDLE_SAMPLES && index < candidates.length; index += stride, taken++) {
    picked.push(candidates[index].slice(0, sampleCap));
  }
  const pickedChars = picked.reduce((sum, sample) => sum + sample.length, 0);

  const sections = [text.slice(0, HEAD_CHARS)];
  if (picked.length > 0) {
    sections.push(`[... middle section sampled — ${middleLength.toLocaleString()} chars total ...]`);
    sections.push(picked.join("\n\n---\n\n"));
  }
  sections.push("[... end section ...]");
  sections.push(text.slice(-TAIL_CHARS));
  const assembled = sections.join("\n\n");

  info(`Content ${totalLength.toLocaleString()} chars → sampled to ${assembled.length.toLocaleString()} chars (head:${HEAD_CHARS} + ${picked.length} mid-samples:${pickedChars} + tail:${TAIL_CHARS})`);
  return assembled;
}
|
|
876
|
+
/**
 * Parse one LLM JSON response into a validated list of PINData entries.
 *
 * Markdown code fences are stripped before parsing. The JSON may be either
 * an array of entries or an object with an `entries` array; any other
 * shape yields an empty list.
 *
 * An entry is kept only when `type`, `content` and `topic` are strings,
 * `content` and `topic` are non-empty after trimming, and `type` is a
 * member of VALID_TYPES. Malformed entries are skipped individually, so one
 * bad item (e.g. a numeric `content`) does not discard the valid ones.
 *
 * @param {string} response - Raw LLM response text.
 * @returns {Array<{type: string, content: string, topic: string, tags: (string[]|undefined)}>}
 *   Validated entries; `tags` keeps only string items and is undefined
 *   when the entry has no `tags` array.
 * @throws {SyntaxError} When the cleaned response is not valid JSON.
 */
function parseExtractResponse(response) {
  const cleaned = response.replace(/```json?\n?/g, "").replace(/```/g, "").trim();
  const raw = JSON.parse(cleaned);
  let rawEntries;
  if (Array.isArray(raw)) rawEntries = raw;
  else if (raw && typeof raw === "object" && Array.isArray(raw.entries)) rawEntries = raw.entries;
  else return [];
  const entries = [];
  for (const item of rawEntries) {
    if (!item || typeof item !== "object") continue;
    const { type, content: entryContent, topic, tags } = item;
    // Type-check before calling .trim(): a truthy non-string would throw.
    if (typeof type !== "string" || typeof entryContent !== "string" || typeof topic !== "string") continue;
    const trimmedContent = entryContent.trim();
    const trimmedTopic = topic.trim();
    if (!trimmedContent || !trimmedTopic) continue;
    if (!VALID_TYPES.has(type)) continue;
    entries.push({
      type,
      content: trimmedContent,
      topic: trimmedTopic,
      tags: Array.isArray(tags) ? tags.filter((t) => typeof t === "string") : void 0
    });
  }
  return entries;
}
|
|
902
|
+
/**
|
|
903
|
+
* Extract PINData entries from raw/journal content via a single LLM call.
|
|
904
|
+
* Long content is sampled (head + tail + middle samples) to fit the budget.
|
|
905
|
+
*/
|
|
662
906
|
async function extractPinData(content, source) {
|
|
663
907
|
const system = extractSystemPrompt();
|
|
664
|
-
const
|
|
908
|
+
const prompt = extractUserPrompt(prepareContent(content), source);
|
|
665
909
|
try {
|
|
666
|
-
const
|
|
667
|
-
const raw = JSON.parse(cleaned);
|
|
668
|
-
let rawEntries;
|
|
669
|
-
if (Array.isArray(raw)) rawEntries = raw;
|
|
670
|
-
else if (raw && typeof raw === "object" && "entries" in raw && Array.isArray(raw.entries)) rawEntries = raw.entries;
|
|
671
|
-
else return {
|
|
672
|
-
entries: [],
|
|
673
|
-
summary: ""
|
|
674
|
-
};
|
|
675
|
-
const entries = [];
|
|
676
|
-
for (const item of rawEntries) {
|
|
677
|
-
if (!item || typeof item !== "object") continue;
|
|
678
|
-
const obj = item;
|
|
679
|
-
const type = obj.type;
|
|
680
|
-
const content = obj.content;
|
|
681
|
-
const topic = obj.topic;
|
|
682
|
-
if (!type || !content || !topic) continue;
|
|
683
|
-
if (!VALID_TYPES.has(type)) continue;
|
|
684
|
-
entries.push({
|
|
685
|
-
type,
|
|
686
|
-
content: content.trim(),
|
|
687
|
-
topic: topic.trim(),
|
|
688
|
-
tags: Array.isArray(obj.tags) ? obj.tags.filter((t) => typeof t === "string") : void 0
|
|
689
|
-
});
|
|
690
|
-
}
|
|
910
|
+
const entries = parseExtractResponse(await llmCall(prompt, system));
|
|
691
911
|
return {
|
|
692
912
|
entries,
|
|
693
913
|
summary: entries.length > 0 ? entries.slice(0, 3).map((e) => e.content).join("; ") : "no extractable signal"
|
|
694
914
|
};
|
|
695
|
-
} catch {
|
|
915
|
+
} catch (err) {
|
|
696
916
|
return {
|
|
697
917
|
entries: [],
|
|
698
|
-
summary: `Failed to parse extract response: ${
|
|
918
|
+
summary: `Failed to parse extract response: ${(err instanceof Error ? err.message : String(err)).slice(0, 100)}`
|
|
699
919
|
};
|
|
700
920
|
}
|
|
701
921
|
}
|