personal-ai 0.2.1 → 0.2.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/SKILL.md +127 -268
- package/dist/entry.mjs +350 -51
- package/dist/entry.mjs.map +1 -1
- package/dist/index.mjs +267 -47
- package/dist/index.mjs.map +1 -1
- package/package.json +3 -1
package/dist/entry.mjs
CHANGED
|
@@ -5,7 +5,7 @@ import fs from "node:fs/promises";
|
|
|
5
5
|
import path from "node:path";
|
|
6
6
|
import os from "node:os";
|
|
7
7
|
import http from "node:http";
|
|
8
|
-
import { URL } from "node:url";
|
|
8
|
+
import { URL as URL$1 } from "node:url";
|
|
9
9
|
import { google } from "googleapis";
|
|
10
10
|
import open from "open";
|
|
11
11
|
import chalk from "chalk";
|
|
@@ -282,7 +282,7 @@ var GoogleOAuth = class {
|
|
|
282
282
|
return new Promise((resolve, reject) => {
|
|
283
283
|
const server = http.createServer((req, res) => {
|
|
284
284
|
if (req.url?.startsWith("/callback")) {
|
|
285
|
-
const url = new URL(req.url ?? "", `http://${req.headers.host}`);
|
|
285
|
+
const url = new URL$1(req.url ?? "", `http://${req.headers.host}`);
|
|
286
286
|
const code = url.searchParams.get("code");
|
|
287
287
|
const state = url.searchParams.get("state");
|
|
288
288
|
if (code && state === expectedState) {
|
|
@@ -1582,6 +1582,75 @@ async function rebuildProfile(opts) {
|
|
|
1582
1582
|
|
|
1583
1583
|
//#endregion
|
|
1584
1584
|
//#region src/cli/register.init.ts
|
|
1585
|
+
/**
 * Return the lines that make up the body of one "## " section.
 *
 * The section starts after the first line that begins with "## " and
 * satisfies `isHeading`, and ends before the next line that begins with "## ".
 *
 * @param {string[]} lines - Document split into lines.
 * @param {(line: string) => boolean} isHeading - Predicate selecting the heading.
 * @returns {string[] | null} Body lines, or null when the heading is absent.
 */
function profileSectionLines(lines, isHeading) {
  const start = lines.findIndex((line) => line.startsWith("## ") && isHeading(line));
  if (start === -1) return null;
  const body = [];
  for (let i = start + 1; i < lines.length; i++) {
    if (lines[i].startsWith("## ")) break;
    body.push(lines[i]);
  }
  return body;
}

/**
 * Return the text of every "-" bullet in `lines`, with the marker removed.
 *
 * @param {string[]} lines - Section body lines.
 * @returns {string[]} Bullet texts in document order.
 */
function profileBulletItems(lines) {
  return lines
    .map((line) => line.trim())
    .filter((line) => line.startsWith("-"))
    .map((line) => line.replace(/^-\s*/, ""));
}

/**
 * Extract key highlights from profile.md for display after init.
 * Pulls: identity (first 8 non-blank lines), active repos (first 6, shown by
 * directory name), the "total commits" line, and the top 8 tool names.
 *
 * @param {string} content - Full text of profile.md.
 * @returns {string | null} Newline-joined summary, or null when nothing was found.
 */
function extractProfileHighlights(content) {
  // Accept both LF and CRLF so no stray "\r" ends up in the printed summary.
  const lines = content.split(/\r?\n/);
  const highlights = [];

  const identity = profileSectionLines(lines, (line) => /identity/i.test(line)) ?? [];
  highlights.push(...identity.filter((line) => line.trim()).slice(0, 8));

  const repos = profileSectionLines(lines, (line) => line.startsWith("## Active Git Repositories"));
  if (repos) {
    highlights.push("", "Recent Projects:");
    for (const repoPath of profileBulletItems(repos).slice(0, 6)) {
      // Split on both separators and drop empty segments so that
      // "/a/b/" and "C:\a\b" both resolve to "b".
      const segments = repoPath.split(/[\\/]/).filter(Boolean);
      const name = segments[segments.length - 1] ?? repoPath;
      highlights.push(` - ${name}`);
    }
  }

  const commits = profileSectionLines(lines, (line) => line.startsWith("## Recent Commit Activity")) ?? [];
  const totals = commits.map((line) => line.trim()).find((line) => /^total commits/i.test(line));
  if (totals) highlights.push("", `Recent Activity: ${totals}`);

  const toolLines = profileSectionLines(lines, (line) => line.startsWith("## Recently Used Tools")) ?? [];
  const tools = profileBulletItems(toolLines)
    .slice(0, 8)
    .map((item) => item.split(":")[0].trim());
  if (tools.length > 0) highlights.push(`Top Tools: ${tools.join(", ")}`);

  return highlights.length > 0 ? highlights.join("\n") : null;
}
|
|
1585
1654
|
/** Try to install QMD globally via npm */
|
|
1586
1655
|
async function installQmd() {
|
|
1587
1656
|
const spin = spinner("Installing QMD (search engine)...");
|
|
@@ -1787,6 +1856,16 @@ async function runInit(options = {}) {
|
|
|
1787
1856
|
else info(`[8/${TOTAL_STEPS}] Indexing for search... (skipped, QMD not available)`);
|
|
1788
1857
|
log("");
|
|
1789
1858
|
success("Setup complete!");
|
|
1859
|
+
log("");
|
|
1860
|
+
try {
|
|
1861
|
+
const highlights = extractProfileHighlights(await fs.readFile(path.join(paiHome, "profile.md"), "utf-8"));
|
|
1862
|
+
if (highlights) {
|
|
1863
|
+
log("── Profile Summary ──");
|
|
1864
|
+
log(highlights);
|
|
1865
|
+
log("─────────────────────");
|
|
1866
|
+
log("");
|
|
1867
|
+
}
|
|
1868
|
+
} catch {}
|
|
1790
1869
|
log(` Profile: ${paiHome}/profile.md (${profileLines} lines, ${profileSources} sources)`);
|
|
1791
1870
|
const rawCount = gmailCount + calendarCount;
|
|
1792
1871
|
if (rawCount > 0) log(` Raw data: ${rawCount} file(s) (${gmailCount} gmail, ${calendarCount} calendar)`);
|
|
@@ -1795,7 +1874,7 @@ async function runInit(options = {}) {
|
|
|
1795
1874
|
log(` Google: ${googleAuthed ? "connected" : "not connected"}`);
|
|
1796
1875
|
log("");
|
|
1797
1876
|
info("Next:");
|
|
1798
|
-
log(" pai profile # View
|
|
1877
|
+
log(" pai profile # View full profile");
|
|
1799
1878
|
log(" pai distribute # Deploy to Cursor/Claude");
|
|
1800
1879
|
log(" pai ask \"question\" # Ask about yourself");
|
|
1801
1880
|
}
|
|
@@ -1822,11 +1901,123 @@ function registerInitCommand(program) {
|
|
|
1822
1901
|
//#endregion
|
|
1823
1902
|
//#region src/scraper/index.ts
|
|
1824
1903
|
/**
|
|
1825
|
-
*
|
|
1826
|
-
*
|
|
1827
|
-
*
|
|
1904
|
+
* Web scraper: three-tier strategy for content extraction.
|
|
1905
|
+
*
|
|
1906
|
+
* 1. GitHub blob URLs → convert to raw.githubusercontent.com, fetch raw markdown (zero rendering)
|
|
1907
|
+
* 2. General URLs → Playwright headless browser + DOM preprocessing + Defuddle extraction
|
|
1908
|
+
* 3. Fallback → plain fetch + Defuddle (for when Playwright is unavailable)
|
|
1909
|
+
*
|
|
1910
|
+
* Reference: linkmind-master/src/scraper.ts
|
|
1828
1911
|
*/
|
|
1829
|
-
|
|
1912
|
+
const CHROME_UA = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36";
|
|
1913
|
+
/** Minimum markdown length to consider a scrape successful */
|
|
1914
|
+
const MIN_CONTENT_LENGTH = 100;
|
|
1915
|
+
/**
 * Check if a URL points to a file on GitHub (blob view).
 *
 * A blob URL has the shape `github.com/<owner>/<repo>/blob/<ref>/<path...>`.
 * The "blob" marker must be the third path segment, so an owner, repository
 * or directory that merely happens to be called "blob" is not a match.
 *
 * @param {string} url - Candidate URL.
 * @returns {boolean} True only for GitHub blob-view URLs; false for anything
 *   else, including strings that are not valid URLs.
 */
function isGithubBlobUrl(url) {
  let parsed;
  try {
    parsed = new URL(url);
  } catch {
    return false;
  }
  if (parsed.hostname !== "github.com") return false;
  // pathname "/owner/repo/blob/ref/file" splits to ["", owner, repo, "blob", ref, file, ...]
  const [, owner, repo, marker, ref, firstPathPart] = parsed.pathname.split("/");
  return Boolean(owner && repo && marker === "blob" && ref && firstPathPart);
}
|
|
1924
|
+
/**
 * Convert GitHub blob URL to raw.githubusercontent.com URL.
 *
 * `https://github.com/<owner>/<repo>/blob/<ref>/<path>` becomes
 * `https://raw.githubusercontent.com/<owner>/<repo>/<ref>/<path>`.
 * The URL is parsed rather than string-replaced so that:
 *  - `http://` inputs are converted too (the raw host is always https),
 *  - only the "blob" marker in third position is removed, even when an
 *    owner or directory is also named "blob",
 *  - the `#fragment` and `?query` of the blob view are dropped.
 *
 * @param {string} url - GitHub blob-view URL.
 * @returns {string} The raw-content URL. Inputs that do not parse as a GitHub
 *   blob URL fall back to the plain textual rewrite.
 */
function toRawGithubUrl(url) {
  let parsed;
  try {
    parsed = new URL(url);
  } catch {
    parsed = null;
  }
  const segments = parsed ? parsed.pathname.split("/") : [];
  if (!parsed || parsed.hostname !== "github.com" || segments[3] !== "blob") {
    // Not recognisable as a blob URL: keep the simple textual rewrite.
    return url.replace("https://github.com/", "https://raw.githubusercontent.com/").replace("/blob/", "/");
  }
  segments.splice(3, 1);
  return `https://raw.githubusercontent.com${segments.join("/")}`;
}
|
|
1928
|
+
/**
 * Fast path: fetch raw content directly from GitHub (returns markdown/text as-is).
 *
 * @param {string} url - GitHub blob-view URL of the file.
 * @param {number} timeout - Milliseconds before the request is aborted.
 * @returns {Promise<{url: string, title: string, markdown: string}>} The
 *   original URL, a title, and the untouched file body.
 * @throws {Error} When the response status is not OK, or the request is aborted.
 */
async function scrapeGithubRaw(url, timeout) {
  const rawUrl = toRawGithubUrl(url);
  const controller = new AbortController();
  const timer = setTimeout(() => controller.abort(), timeout);
  try {
    const resp = await fetch(rawUrl, {
      signal: controller.signal,
      headers: { "User-Agent": CHROME_UA }
    });
    if (!resp.ok) throw new Error(`GitHub raw fetch failed: HTTP ${resp.status}`);
    const markdown = await resp.text();

    // pathname is percent-encoded; decode it so the title reads naturally.
    const lastSegment = new URL(url).pathname.split("/").pop() ?? "";
    let filename = lastSegment;
    try {
      filename = decodeURIComponent(lastSegment);
    } catch {
      // Malformed escape sequence: keep the encoded form.
    }

    // Only markdown files have "# " headings; in a script the same pattern
    // would pick up the first comment line instead.
    const isMarkdown = /\.(md|mdx|markdown)$/i.test(filename) || /^readme$/i.test(filename);
    const heading = isMarkdown ? markdown.match(/^#\s+(.+)$/m)?.[1]?.trim() : void 0;

    return {
      url,
      // `||` rather than `??`: an empty heading or filename must also fall through.
      title: heading || filename || "Untitled",
      markdown
    };
  } finally {
    clearTimeout(timer);
  }
}
|
|
1952
|
+
/**
|
|
1953
|
+
* DOM preprocessing script executed inside the browser.
|
|
1954
|
+
* Removes navigation, ads, cookie banners, and other non-content elements
|
|
1955
|
+
* before Defuddle extraction. (Borrowed from linkmind)
|
|
1956
|
+
*/
|
|
1957
|
+
const DOM_PREPROCESS_SCRIPT = `(() => {
|
|
1958
|
+
// Remove script, style, stylesheet links
|
|
1959
|
+
document.querySelectorAll("script, style, link[rel='stylesheet']").forEach(el => el.remove());
|
|
1960
|
+
// Remove navigation elements
|
|
1961
|
+
document.querySelectorAll("nav, footer, aside").forEach(el => el.remove());
|
|
1962
|
+
// Remove headers not inside article/main
|
|
1963
|
+
document.querySelectorAll("header").forEach(el => {
|
|
1964
|
+
if (!el.closest("article") && !el.closest("main")) el.remove();
|
|
1965
|
+
});
|
|
1966
|
+
// Remove ARIA landmark roles
|
|
1967
|
+
document.querySelectorAll('[role="navigation"], [role="banner"], [role="contentinfo"], [role="complementary"], [role="search"]').forEach(el => el.remove());
|
|
1968
|
+
// Remove cookie/share/comment noise
|
|
1969
|
+
document.querySelectorAll('[class*="cookie-banner"], [id*="cookie-banner"], [class*="cookie-consent"], [class*="share-buttons"], [class*="social-share"], [class*="comment-section"], [id*="comments"]').forEach(el => el.remove());
|
|
1970
|
+
// Remove hidden elements
|
|
1971
|
+
document.querySelectorAll('[hidden], [aria-hidden="true"]').forEach(el => el.remove());
|
|
1972
|
+
|
|
1973
|
+
return {
|
|
1974
|
+
title: document.title,
|
|
1975
|
+
html: document.documentElement.outerHTML,
|
|
1976
|
+
};
|
|
1977
|
+
})()`;
|
|
1978
|
+
/**
 * Scrape with Playwright headless browser + Defuddle.
 *
 * Loads the page in headless Chromium, runs DOM_PREPROCESS_SCRIPT inside it,
 * then passes the resulting HTML to Defuddle and converts the extracted
 * content to markdown.
 *
 * @param {string} url - Page to load.
 * @param {number} timeout - Navigation timeout in milliseconds.
 * @returns {Promise<{url: string, title: string, markdown: string}>}
 * @throws Propagates import, launch, navigation and extraction errors; the
 *   browser is closed and console.log restored before they propagate.
 */
async function scrapeWithPlaywright(url, timeout) {
  const pw = await import("playwright");
  const { Defuddle } = await import("defuddle/node");
  const browser = await pw.chromium.launch({
    headless: true,
    args: ["--disable-blink-features=AutomationControlled"]
  });

  let pageTitle;
  let html;
  try {
    const context = await browser.newContext({
      viewport: {
        width: 1280,
        height: 900
      },
      userAgent: CHROME_UA,
      locale: "en-US"
    });
    const page = await context.newPage();
    await page.goto(url, {
      waitUntil: "domcontentloaded",
      timeout
    });
    // Fixed pause after DOMContentLoaded before the DOM is captured.
    await page.waitForTimeout(2e3);
    ({ title: pageTitle, html } = await page.evaluate(DOM_PREPROCESS_SCRIPT));
  } finally {
    // Always release the browser; a failure to close must not mask the real error.
    await browser.close().catch(() => {});
  }

  // Filter one specific Defuddle message out of console.log while it runs.
  // The patch is process-global, so it must be undone even if Defuddle throws.
  const origLog = globalThis.console.log;
  globalThis.console.log = (msg, ...args) => {
    if (typeof msg === "string" && msg.includes("Initial parse returned very little content")) return;
    origLog(msg, ...args);
  };
  let result;
  try {
    result = await Defuddle(html, url);
  } finally {
    globalThis.console.log = origLog;
  }

  return {
    url,
    title: result.title || pageTitle || "Untitled",
    markdown: result.content ? htmlToSimpleMarkdown(result.content) : ""
  };
}
|
|
2019
|
+
/** Lightweight scrape: plain HTTP fetch + Defuddle (no JS rendering) */
|
|
2020
|
+
async function scrapeWithFetch(url, timeout) {
|
|
1830
2021
|
const controller = new AbortController();
|
|
1831
2022
|
const timer = setTimeout(() => controller.abort(), timeout);
|
|
1832
2023
|
let html;
|
|
@@ -1834,7 +2025,7 @@ async function scrapeUrl(url, timeout = 3e4) {
|
|
|
1834
2025
|
const response = await fetch(url, {
|
|
1835
2026
|
signal: controller.signal,
|
|
1836
2027
|
headers: {
|
|
1837
|
-
"User-Agent":
|
|
2028
|
+
"User-Agent": CHROME_UA,
|
|
1838
2029
|
Accept: "text/html,application/xhtml+xml"
|
|
1839
2030
|
}
|
|
1840
2031
|
});
|
|
@@ -1844,16 +2035,13 @@ async function scrapeUrl(url, timeout = 3e4) {
|
|
|
1844
2035
|
clearTimeout(timer);
|
|
1845
2036
|
}
|
|
1846
2037
|
try {
|
|
1847
|
-
const
|
|
1848
|
-
const
|
|
1849
|
-
|
|
1850
|
-
|
|
1851
|
-
|
|
1852
|
-
|
|
1853
|
-
|
|
1854
|
-
markdown: result.content ? htmlToSimpleMarkdown(result.content) : extractTextFromHtml(html)
|
|
1855
|
-
};
|
|
1856
|
-
}
|
|
2038
|
+
const { Defuddle } = await import("defuddle/node");
|
|
2039
|
+
const result = await Defuddle(html, url);
|
|
2040
|
+
return {
|
|
2041
|
+
url,
|
|
2042
|
+
title: result.title || extractTitleFromHtml(html),
|
|
2043
|
+
markdown: result.content ? htmlToSimpleMarkdown(result.content) : extractTextFromHtml(html)
|
|
2044
|
+
};
|
|
1857
2045
|
} catch {
|
|
1858
2046
|
warn("defuddle not available, using basic HTML extraction");
|
|
1859
2047
|
}
|
|
@@ -1863,17 +2051,77 @@ async function scrapeUrl(url, timeout = 3e4) {
|
|
|
1863
2051
|
markdown: extractTextFromHtml(html)
|
|
1864
2052
|
};
|
|
1865
2053
|
}
|
|
2054
|
+
/**
 * Scrape a URL and return title + markdown content.
 *
 * Strategy:
 * 1. GitHub blob URL → raw.githubusercontent.com (instant, perfect fidelity)
 * 2. Playwright + Defuddle (handles JS-rendered pages)
 * 3. Fetch + Defuddle fallback (static pages, or when Playwright missing)
 *
 * When Playwright succeeds but yields fewer than MIN_CONTENT_LENGTH
 * characters, the fetch path is tried once and the longer result wins.
 * If that attempt fails, the short Playwright result is returned instead.
 *
 * @param {string} url - Page to scrape.
 * @param {number} [timeout=30000] - Per-strategy timeout in milliseconds.
 * @returns {Promise<{url: string, title: string, markdown: string}>}
 */
async function scrapeUrl(url, timeout = 3e4) {
  if (isGithubBlobUrl(url)) {
    info("GitHub blob detected — fetching raw content directly");
    return scrapeGithubRaw(url, timeout);
  }

  let rendered;
  try {
    rendered = await scrapeWithPlaywright(url, timeout);
  } catch (err) {
    const msg = err instanceof Error ? err.message : String(err);
    warn(`Playwright scrape failed (${msg}) — falling back to fetch`);
    return scrapeWithFetch(url, timeout);
  }

  if (rendered.markdown.length >= MIN_CONTENT_LENGTH) return rendered;

  // Kept outside the Playwright try block so a fetch failure is neither
  // reported as a Playwright failure nor followed by a second fetch.
  warn(`Playwright extracted only ${rendered.markdown.length} chars — trying fetch fallback`);
  try {
    const fallback = await scrapeWithFetch(url, timeout);
    return fallback.markdown.length > rendered.markdown.length ? fallback : rendered;
  } catch (err) {
    const msg = err instanceof Error ? err.message : String(err);
    warn(`Fetch fallback failed (${msg}) — keeping Playwright result`);
    return rendered;
  }
}
|
|
1866
2081
|
/**
 * Extract <title> from HTML.
 *
 * @param {string} html - Raw HTML document.
 * @returns {string} The trimmed text of the first <title> element, or
 *   "Untitled" when there is none or it is blank.
 */
function extractTitleFromHtml(html) {
  const title = html.match(/<title\b[^>]*>([^<]+)<\/title>/i)?.[1]?.trim();
  // `||` rather than `??`: a whitespace-only title trims to "" and must fall back.
  return title || "Untitled";
}
|
|
1870
|
-
/** Basic HTML to text extraction (fallback) */
|
|
2085
|
+
/**
 * Basic HTML to text extraction (last-resort fallback).
 *
 * Drops <script>/<style> blocks, replaces every remaining tag with a space,
 * decodes the common character entities, collapses whitespace, and caps the
 * result at 10,000 characters.
 *
 * @param {string} html - Raw HTML document.
 * @returns {string} Plain text, at most 10,000 characters long.
 */
function extractTextFromHtml(html) {
  const entities = {
    amp: "&",
    lt: "<",
    gt: ">",
    quot: "\"",
    "#39": "'",
    apos: "'",
    nbsp: " "
  };
  return html
    .replace(/<script[\s\S]*?<\/script>/gi, "")
    .replace(/<style[\s\S]*?<\/style>/gi, "")
    .replace(/<[^>]+>/g, " ")
    // Decode in a single pass so "&amp;lt;" yields the literal text "&lt;"
    // instead of being decoded twice into "<".
    .replace(/&(amp|lt|gt|quot|#39|apos|nbsp);/g, (_, name) => entities[name])
    .replace(/\s+/g, " ")
    .trim()
    .slice(0, 1e4);
}
|
|
1874
|
-
/** Convert
|
|
2089
|
+
/**
 * Convert HTML fragment to simple Markdown (from linkmind, extended).
 *
 * Regex-based and intentionally shallow: handles headings, paragraphs, line
 * breaks, bold/italic, links, inline code, fenced code, lists, blockquotes
 * and images, then strips any remaining tags and decodes common entities.
 *
 * @param {string} html - HTML fragment; a falsy value yields "".
 * @returns {string} Markdown text, trimmed, with blank-line runs collapsed.
 */
function htmlToSimpleMarkdown(html) {
  if (!html) return "";
  let md = html;

  // Fenced code goes first: otherwise the inline <code> rule would put
  // backticks inside the fence. An inner <code> wrapper is dropped.
  md = md.replace(/<pre\b[^>]*>([\s\S]*?)<\/pre>/gi, (_, code) => {
    return `\n\`\`\`\n${code.replace(/<\/?code\b[^>]*>/gi, "")}\n\`\`\`\n`;
  });

  // Headings h1-h6; text may span lines, so inner whitespace is collapsed.
  md = md.replace(/<h([1-6])\b[^>]*>([\s\S]*?)<\/h\1>/gi, (_, level, text) => {
    return `${"#".repeat(Number(level))} ${text.replace(/\s+/g, " ").trim()}\n\n`;
  });

  // The \b after each tag name keeps short names from matching longer tags
  // (<p> vs <pre>, <b> vs <blockquote>/<br>, <i> vs <img>, <a> vs <area>).
  md = md.replace(/<p\b[^>]*>/gi, "\n\n");
  md = md.replace(/<\/p>/gi, "");
  md = md.replace(/<br\s*\/?>/gi, "\n");
  md = md.replace(/<(strong|b)\b[^>]*>(.*?)<\/\1>/gi, "**$2**");
  md = md.replace(/<(em|i)\b[^>]*>(.*?)<\/\1>/gi, "*$2*");
  md = md.replace(/<a\b[^>]*href="([^"]*)"[^>]*>(.*?)<\/a>/gi, "[$2]($1)");
  md = md.replace(/<code\b[^>]*>(.*?)<\/code>/gi, "`$1`");
  md = md.replace(/<li\b[^>]*>/gi, "- ");
  md = md.replace(/<\/li>/gi, "\n");
  md = md.replace(/<\/?[uo]l\b[^>]*>/gi, "\n");

  md = md.replace(/<blockquote\b[^>]*>([\s\S]*?)<\/blockquote>/gi, (_, quoted) => {
    const body = quoted.trim().split("\n").map((line) => `> ${line}`).join("\n");
    return `\n\n${body}\n\n`;
  });

  // Images: keep src (and alt when present) in either attribute order.
  md = md.replace(/<img\b[^>]*>/gi, (tag) => {
    const src = tag.match(/\ssrc="([^"]*)"/i)?.[1];
    if (!src) return "";
    const alt = tag.match(/\salt="([^"]*)"/i)?.[1] ?? "";
    return `![${alt}](${src})`;
  });

  // Strip whatever markup is left BEFORE decoding entities, so a decoded
  // "<" is never mistaken for the start of a tag.
  md = md.replace(/<[^>]+>/g, "");

  const entities = {
    amp: "&",
    lt: "<",
    gt: ">",
    quot: "\"",
    "#39": "'",
    apos: "'",
    nbsp: " "
  };
  // Single pass, so "&amp;lt;" becomes the text "&lt;" and is not decoded twice.
  md = md.replace(/&(amp|lt|gt|quot|#39|apos|nbsp);/g, (_, name) => entities[name]);

  md = md.replace(/\n{3,}/g, "\n\n");
  return md.trim();
}
|
|
1878
2126
|
|
|
1879
2127
|
//#endregion
|
|
@@ -2060,44 +2308,95 @@ const VALID_TYPES = new Set([
|
|
|
2060
2308
|
"entity",
|
|
2061
2309
|
"event"
|
|
2062
2310
|
]);
|
|
2063
|
-
/**
|
|
2311
|
+
/**
|
|
2312
|
+
* Budget for the total text sent to LLM.
|
|
2313
|
+
* ~10K chars ≈ ~2.5K tokens (English) / ~5K tokens (CJK).
|
|
2314
|
+
* PINData extraction only needs gist, not full content.
|
|
2315
|
+
*/
|
|
2316
|
+
const MAX_INPUT_CHARS = 1e4;
|
|
2317
|
+
/** Head portion gets the lion's share — title, intro, overview */
|
|
2318
|
+
const HEAD_CHARS = 4e3;
|
|
2319
|
+
/** Tail portion — conclusions, takeaways, resource lists */
|
|
2320
|
+
const TAIL_CHARS = 3e3;
|
|
2321
|
+
/** Remaining budget goes to random middle samples */
|
|
2322
|
+
const MIDDLE_BUDGET = MAX_INPUT_CHARS - HEAD_CHARS - TAIL_CHARS;
|
|
2323
|
+
/** Number of random middle samples to pick */
|
|
2324
|
+
const MIDDLE_SAMPLES = 2;
|
|
2325
|
+
/**
 * Fit `text` into the LLM input budget.
 *
 * Text no longer than MAX_INPUT_CHARS is returned unchanged. Longer text is
 * reduced to: the first HEAD_CHARS characters, up to MIDDLE_SAMPLES
 * evenly spaced paragraphs from the middle (each cut to an equal share of
 * MIDDLE_BUDGET), and the last TAIL_CHARS characters, joined by bracketed
 * marker lines. Only middle paragraphs longer than 100 characters are
 * candidates. A summary of the reduction is logged through `info`.
 *
 * @param {string} text - Raw content.
 * @returns {string} The original text, or the sampled assembly.
 */
function prepareContent(text) {
  const totalLength = text.length;
  if (totalLength <= MAX_INPUT_CHARS) return text;

  const middleStart = HEAD_CHARS;
  const middleEnd = totalLength - TAIL_CHARS;
  const head = text.slice(0, middleStart);
  const tail = text.slice(-TAIL_CHARS);

  // Candidate paragraphs: blank-line separated, trimmed, longer than 100 chars.
  const candidates = [];
  for (const chunk of text.slice(middleStart, middleEnd).split(/\n{2,}/)) {
    const paragraph = chunk.trim();
    if (paragraph.length > 100) candidates.push(paragraph);
  }

  const perSampleBudget = Math.floor(MIDDLE_BUDGET / MIDDLE_SAMPLES);
  const samples = [];
  if (candidates.length > 0) {
    const stride = Math.max(1, Math.floor(candidates.length / MIDDLE_SAMPLES));
    let index = 0;
    while (samples.length < MIDDLE_SAMPLES && index < candidates.length) {
      samples.push(candidates[index].slice(0, perSampleBudget));
      index += stride;
    }
  }
  const sampledChars = samples.reduce((sum, sample) => sum + sample.length, 0);

  let middle = "";
  if (samples.length > 0) {
    middle = `\n\n[... middle section sampled — ${(middleEnd - middleStart).toLocaleString()} chars total ...]\n\n${samples.join("\n\n---\n\n")}`;
  }
  const assembled = `${head}${middle}\n\n[... end section ...]\n\n${tail}`;

  info(`Content ${text.length.toLocaleString()} chars → sampled to ${assembled.length.toLocaleString()} chars (head:${HEAD_CHARS} + ${samples.length} mid-samples:${sampledChars} + tail:${TAIL_CHARS})`);
  return assembled;
}
|
|
2357
|
+
/**
 * Parse one LLM JSON response into validated PINDataEntry[].
 *
 * Accepts either a bare JSON array or an object with an `entries` array,
 * optionally wrapped in a Markdown code fence. Items are dropped when they
 * are not objects, when `type`, `content` or `topic` is not a non-blank
 * string, or when `type` is not in VALID_TYPES. One malformed item never
 * discards the rest.
 *
 * @param {string} response - Raw LLM output.
 * @returns {Array<{type: string, content: string, topic: string, tags: (string[]|undefined)}>}
 *   Validated entries with `content` and `topic` trimmed.
 * @throws {SyntaxError} When the cleaned response is not valid JSON.
 */
function parseExtractResponse(response) {
  const cleaned = response.replace(/```json?\n?/g, "").replace(/```/g, "").trim();
  const raw = JSON.parse(cleaned);

  let rawEntries;
  if (Array.isArray(raw)) rawEntries = raw;
  else if (raw && typeof raw === "object" && Array.isArray(raw.entries)) rawEntries = raw.entries;
  else return [];

  const entries = [];
  for (const item of rawEntries) {
    if (!item || typeof item !== "object") continue;
    const { type, content, topic, tags } = item;
    // Check types before trimming: a numeric or object field would make
    // .trim() throw and abort the whole parse.
    if (typeof type !== "string" || typeof content !== "string" || typeof topic !== "string") continue;
    const trimmedContent = content.trim();
    const trimmedTopic = topic.trim();
    if (!type || !trimmedContent || !trimmedTopic) continue;
    if (!VALID_TYPES.has(type)) continue;
    entries.push({
      type,
      content: trimmedContent,
      topic: trimmedTopic,
      tags: Array.isArray(tags) ? tags.filter((t) => typeof t === "string") : void 0
    });
  }
  return entries;
}
|
|
2383
|
+
/**
|
|
2384
|
+
* Extract PINData entries from raw/journal content via a single LLM call.
|
|
2385
|
+
* Long content is sampled (head + tail + middle samples) to fit the budget.
|
|
2386
|
+
*/
|
|
2064
2387
|
async function extractPinData(content, source) {
|
|
2065
2388
|
const system = extractSystemPrompt();
|
|
2066
|
-
const
|
|
2389
|
+
const prompt = extractUserPrompt(prepareContent(content), source);
|
|
2067
2390
|
try {
|
|
2068
|
-
const
|
|
2069
|
-
const raw = JSON.parse(cleaned);
|
|
2070
|
-
let rawEntries;
|
|
2071
|
-
if (Array.isArray(raw)) rawEntries = raw;
|
|
2072
|
-
else if (raw && typeof raw === "object" && "entries" in raw && Array.isArray(raw.entries)) rawEntries = raw.entries;
|
|
2073
|
-
else return {
|
|
2074
|
-
entries: [],
|
|
2075
|
-
summary: ""
|
|
2076
|
-
};
|
|
2077
|
-
const entries = [];
|
|
2078
|
-
for (const item of rawEntries) {
|
|
2079
|
-
if (!item || typeof item !== "object") continue;
|
|
2080
|
-
const obj = item;
|
|
2081
|
-
const type = obj.type;
|
|
2082
|
-
const content = obj.content;
|
|
2083
|
-
const topic = obj.topic;
|
|
2084
|
-
if (!type || !content || !topic) continue;
|
|
2085
|
-
if (!VALID_TYPES.has(type)) continue;
|
|
2086
|
-
entries.push({
|
|
2087
|
-
type,
|
|
2088
|
-
content: content.trim(),
|
|
2089
|
-
topic: topic.trim(),
|
|
2090
|
-
tags: Array.isArray(obj.tags) ? obj.tags.filter((t) => typeof t === "string") : void 0
|
|
2091
|
-
});
|
|
2092
|
-
}
|
|
2391
|
+
const entries = parseExtractResponse(await llmCall(prompt, system));
|
|
2093
2392
|
return {
|
|
2094
2393
|
entries,
|
|
2095
2394
|
summary: entries.length > 0 ? entries.slice(0, 3).map((e) => e.content).join("; ") : "no extractable signal"
|
|
2096
2395
|
};
|
|
2097
|
-
} catch {
|
|
2396
|
+
} catch (err) {
|
|
2098
2397
|
return {
|
|
2099
2398
|
entries: [],
|
|
2100
|
-
summary: `Failed to parse extract response: ${
|
|
2399
|
+
summary: `Failed to parse extract response: ${(err instanceof Error ? err.message : String(err)).slice(0, 100)}`
|
|
2101
2400
|
};
|
|
2102
2401
|
}
|
|
2103
2402
|
}
|
|
@@ -4440,7 +4739,7 @@ function registerAllCommands(program) {
|
|
|
4440
4739
|
//#endregion
|
|
4441
4740
|
//#region src/cli/build-program.ts
|
|
4442
4741
|
function buildProgram() {
|
|
4443
|
-
const program = new Command().name("pai").description("Personal AI Identity Provider — local-first AI agent identity & memory system").version("0.2.
|
|
4742
|
+
const program = new Command().name("pai").description("Personal AI Identity Provider — local-first AI agent identity & memory system").version("0.2.4");
|
|
4444
4743
|
registerAllCommands(program);
|
|
4445
4744
|
return program;
|
|
4446
4745
|
}
|