personal-ai 0.2.1 → 0.2.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/entry.mjs CHANGED
@@ -5,7 +5,7 @@ import fs from "node:fs/promises";
5
5
  import path from "node:path";
6
6
  import os from "node:os";
7
7
  import http from "node:http";
8
- import { URL } from "node:url";
8
+ import { URL as URL$1 } from "node:url";
9
9
  import { google } from "googleapis";
10
10
  import open from "open";
11
11
  import chalk from "chalk";
@@ -282,7 +282,7 @@ var GoogleOAuth = class {
282
282
  return new Promise((resolve, reject) => {
283
283
  const server = http.createServer((req, res) => {
284
284
  if (req.url?.startsWith("/callback")) {
285
- const url = new URL(req.url ?? "", `http://${req.headers.host}`);
285
+ const url = new URL$1(req.url ?? "", `http://${req.headers.host}`);
286
286
  const code = url.searchParams.get("code");
287
287
  const state = url.searchParams.get("state");
288
288
  if (code && state === expectedState) {
@@ -1582,6 +1582,75 @@ async function rebuildProfile(opts) {
1582
1582
 
1583
1583
  //#endregion
1584
1584
  //#region src/cli/register.init.ts
1585
+ /**
1586
+ * Extract key highlights from profile.md for display after init.
1587
+ * Pulls: identity, active repos, recent commit stats, top tools.
1588
+ */
1589
+ function extractProfileHighlights(content) {
1590
+ const lines = content.split("\n");
1591
+ const highlights = [];
1592
+ let inIdentity = false;
1593
+ let identityCount = 0;
1594
+ for (const line of lines) {
1595
+ if (line.startsWith("## ") && /identity/i.test(line)) {
1596
+ inIdentity = true;
1597
+ continue;
1598
+ }
1599
+ if (inIdentity && line.startsWith("## ")) break;
1600
+ if (inIdentity && line.trim()) {
1601
+ highlights.push(line);
1602
+ identityCount++;
1603
+ if (identityCount >= 8) break;
1604
+ }
1605
+ }
1606
+ let inRepos = false;
1607
+ let repoCount = 0;
1608
+ for (const line of lines) {
1609
+ if (line.startsWith("## Active Git Repositories")) {
1610
+ inRepos = true;
1611
+ highlights.push("");
1612
+ highlights.push("Recent Projects:");
1613
+ continue;
1614
+ }
1615
+ if (inRepos && line.startsWith("## ")) break;
1616
+ if (inRepos && line.trim().startsWith("-")) {
1617
+ const repoPath = line.trim().replace(/^-\s*/, "");
1618
+ const name = repoPath.split("/").pop() || repoPath;
1619
+ highlights.push(` - ${name}`);
1620
+ repoCount++;
1621
+ if (repoCount >= 6) break;
1622
+ }
1623
+ }
1624
+ let inCommits = false;
1625
+ for (const line of lines) {
1626
+ if (line.startsWith("## Recent Commit Activity")) {
1627
+ inCommits = true;
1628
+ continue;
1629
+ }
1630
+ if (inCommits && line.startsWith("## ")) break;
1631
+ if (inCommits && /^total commits/i.test(line.trim())) {
1632
+ highlights.push("");
1633
+ highlights.push(`Recent Activity: ${line.trim()}`);
1634
+ break;
1635
+ }
1636
+ }
1637
+ let inTools = false;
1638
+ const tools = [];
1639
+ for (const line of lines) {
1640
+ if (line.startsWith("## Recently Used Tools")) {
1641
+ inTools = true;
1642
+ continue;
1643
+ }
1644
+ if (inTools && line.startsWith("## ")) break;
1645
+ if (inTools && line.trim().startsWith("-")) {
1646
+ const tool = line.trim().replace(/^-\s*/, "").split(":")[0].trim();
1647
+ tools.push(tool);
1648
+ if (tools.length >= 8) break;
1649
+ }
1650
+ }
1651
+ if (tools.length > 0) highlights.push(`Top Tools: ${tools.join(", ")}`);
1652
+ return highlights.length > 0 ? highlights.join("\n") : null;
1653
+ }
1585
1654
  /** Try to install QMD globally via npm */
1586
1655
  async function installQmd() {
1587
1656
  const spin = spinner("Installing QMD (search engine)...");
@@ -1787,6 +1856,16 @@ async function runInit(options = {}) {
1787
1856
  else info(`[8/${TOTAL_STEPS}] Indexing for search... (skipped, QMD not available)`);
1788
1857
  log("");
1789
1858
  success("Setup complete!");
1859
+ log("");
1860
+ try {
1861
+ const highlights = extractProfileHighlights(await fs.readFile(path.join(paiHome, "profile.md"), "utf-8"));
1862
+ if (highlights) {
1863
+ log("── Profile Summary ──");
1864
+ log(highlights);
1865
+ log("─────────────────────");
1866
+ log("");
1867
+ }
1868
+ } catch {}
1790
1869
  log(` Profile: ${paiHome}/profile.md (${profileLines} lines, ${profileSources} sources)`);
1791
1870
  const rawCount = gmailCount + calendarCount;
1792
1871
  if (rawCount > 0) log(` Raw data: ${rawCount} file(s) (${gmailCount} gmail, ${calendarCount} calendar)`);
@@ -1795,7 +1874,7 @@ async function runInit(options = {}) {
1795
1874
  log(` Google: ${googleAuthed ? "connected" : "not connected"}`);
1796
1875
  log("");
1797
1876
  info("Next:");
1798
- log(" pai profile # View your profile");
1877
+ log(" pai profile # View full profile");
1799
1878
  log(" pai distribute # Deploy to Cursor/Claude");
1800
1879
  log(" pai ask \"question\" # Ask about yourself");
1801
1880
  }
@@ -1822,11 +1901,123 @@ function registerInitCommand(program) {
1822
1901
  //#endregion
1823
1902
  //#region src/scraper/index.ts
1824
1903
  /**
1825
- * Scrape a URL and return title + markdown content.
1826
- * Uses fetch + defuddle for content extraction.
1827
- * Falls back to basic HTML extraction if defuddle is unavailable.
1904
+ * Web scraper: three-tier strategy for content extraction.
1905
+ *
1906
+ * 1. GitHub blob URLs → convert to raw.githubusercontent.com, fetch raw markdown (zero rendering)
1907
+ * 2. General URLs → Playwright headless browser + DOM preprocessing + Defuddle extraction
1908
+ * 3. Fallback → plain fetch + Defuddle (for when Playwright is unavailable)
1909
+ *
1910
+ * Reference: linkmind-master/src/scraper.ts
1828
1911
  */
1829
- async function scrapeUrl(url, timeout = 3e4) {
1912
+ const CHROME_UA = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36";
1913
+ /** Minimum markdown length to consider a scrape successful */
1914
+ const MIN_CONTENT_LENGTH = 100;
1915
+ /** Check if a URL points to a file on GitHub (blob view) */
1916
+ function isGithubBlobUrl(url) {
1917
+ try {
1918
+ const u = new URL(url);
1919
+ return u.hostname === "github.com" && /\/blob\//.test(u.pathname);
1920
+ } catch {
1921
+ return false;
1922
+ }
1923
+ }
1924
+ /** Convert GitHub blob URL to raw.githubusercontent.com URL */
1925
+ function toRawGithubUrl(url) {
1926
+ return url.replace("https://github.com/", "https://raw.githubusercontent.com/").replace("/blob/", "/");
1927
+ }
1928
+ /** Fast path: fetch raw content directly from GitHub (returns markdown/text as-is) */
1929
+ async function scrapeGithubRaw(url, timeout) {
1930
+ const rawUrl = toRawGithubUrl(url);
1931
+ const controller = new AbortController();
1932
+ const timer = setTimeout(() => controller.abort(), timeout);
1933
+ try {
1934
+ const resp = await fetch(rawUrl, {
1935
+ signal: controller.signal,
1936
+ headers: { "User-Agent": CHROME_UA }
1937
+ });
1938
+ if (!resp.ok) throw new Error(`GitHub raw fetch failed: HTTP ${resp.status}`);
1939
+ const markdown = await resp.text();
1940
+ const titleMatch = markdown.match(/^#\s+(.+)$/m);
1941
+ const pathParts = new URL(url).pathname.split("/");
1942
+ const filename = pathParts[pathParts.length - 1] ?? "Untitled";
1943
+ return {
1944
+ url,
1945
+ title: titleMatch?.[1]?.trim() ?? filename,
1946
+ markdown
1947
+ };
1948
+ } finally {
1949
+ clearTimeout(timer);
1950
+ }
1951
+ }
1952
+ /**
1953
+ * DOM preprocessing script executed inside the browser.
1954
+ * Removes navigation, ads, cookie banners, and other non-content elements
1955
+ * before Defuddle extraction. (Borrowed from linkmind)
1956
+ */
1957
+ const DOM_PREPROCESS_SCRIPT = `(() => {
1958
+ // Remove script, style, stylesheet links
1959
+ document.querySelectorAll("script, style, link[rel='stylesheet']").forEach(el => el.remove());
1960
+ // Remove navigation elements
1961
+ document.querySelectorAll("nav, footer, aside").forEach(el => el.remove());
1962
+ // Remove headers not inside article/main
1963
+ document.querySelectorAll("header").forEach(el => {
1964
+ if (!el.closest("article") && !el.closest("main")) el.remove();
1965
+ });
1966
+ // Remove ARIA landmark roles
1967
+ document.querySelectorAll('[role="navigation"], [role="banner"], [role="contentinfo"], [role="complementary"], [role="search"]').forEach(el => el.remove());
1968
+ // Remove cookie/share/comment noise
1969
+ document.querySelectorAll('[class*="cookie-banner"], [id*="cookie-banner"], [class*="cookie-consent"], [class*="share-buttons"], [class*="social-share"], [class*="comment-section"], [id*="comments"]').forEach(el => el.remove());
1970
+ // Remove hidden elements
1971
+ document.querySelectorAll('[hidden], [aria-hidden="true"]').forEach(el => el.remove());
1972
+
1973
+ return {
1974
+ title: document.title,
1975
+ html: document.documentElement.outerHTML,
1976
+ };
1977
+ })()`;
1978
+ /** Scrape with Playwright headless browser + Defuddle */
1979
+ async function scrapeWithPlaywright(url, timeout) {
1980
+ const pw = await import("playwright");
1981
+ const { Defuddle } = await import("defuddle/node");
1982
+ const browser = await pw.chromium.launch({
1983
+ headless: true,
1984
+ args: ["--disable-blink-features=AutomationControlled"]
1985
+ });
1986
+ try {
1987
+ const page = await (await browser.newContext({
1988
+ viewport: {
1989
+ width: 1280,
1990
+ height: 900
1991
+ },
1992
+ userAgent: CHROME_UA,
1993
+ locale: "en-US"
1994
+ })).newPage();
1995
+ await page.goto(url, {
1996
+ waitUntil: "domcontentloaded",
1997
+ timeout
1998
+ });
1999
+ await page.waitForTimeout(2e3);
2000
+ const { title: pageTitle, html } = await page.evaluate(DOM_PREPROCESS_SCRIPT);
2001
+ await browser.close();
2002
+ const origLog = globalThis.console.log;
2003
+ globalThis.console.log = (msg, ...args) => {
2004
+ if (typeof msg === "string" && msg.includes("Initial parse returned very little content")) return;
2005
+ origLog(msg, ...args);
2006
+ };
2007
+ const result = await Defuddle(html, url);
2008
+ globalThis.console.log = origLog;
2009
+ return {
2010
+ url,
2011
+ title: result.title || pageTitle || "Untitled",
2012
+ markdown: result.content ? htmlToSimpleMarkdown(result.content) : ""
2013
+ };
2014
+ } catch (err) {
2015
+ await browser.close().catch(() => {});
2016
+ throw err;
2017
+ }
2018
+ }
2019
+ /** Lightweight scrape: plain HTTP fetch + Defuddle (no JS rendering) */
2020
+ async function scrapeWithFetch(url, timeout) {
1830
2021
  const controller = new AbortController();
1831
2022
  const timer = setTimeout(() => controller.abort(), timeout);
1832
2023
  let html;
@@ -1834,7 +2025,7 @@ async function scrapeUrl(url, timeout = 3e4) {
1834
2025
  const response = await fetch(url, {
1835
2026
  signal: controller.signal,
1836
2027
  headers: {
1837
- "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36",
2028
+ "User-Agent": CHROME_UA,
1838
2029
  Accept: "text/html,application/xhtml+xml"
1839
2030
  }
1840
2031
  });
@@ -1844,16 +2035,13 @@ async function scrapeUrl(url, timeout = 3e4) {
1844
2035
  clearTimeout(timer);
1845
2036
  }
1846
2037
  try {
1847
- const defuddleMod = await import("defuddle/node");
1848
- const Defuddle = defuddleMod.Defuddle ?? defuddleMod.default;
1849
- if (Defuddle) {
1850
- const result = new Defuddle(html, { url }).parse();
1851
- return {
1852
- url,
1853
- title: result.title || extractTitleFromHtml(html),
1854
- markdown: result.content ? htmlToSimpleMarkdown(result.content) : extractTextFromHtml(html)
1855
- };
1856
- }
2038
+ const { Defuddle } = await import("defuddle/node");
2039
+ const result = await Defuddle(html, url);
2040
+ return {
2041
+ url,
2042
+ title: result.title || extractTitleFromHtml(html),
2043
+ markdown: result.content ? htmlToSimpleMarkdown(result.content) : extractTextFromHtml(html)
2044
+ };
1857
2045
  } catch {
1858
2046
  warn("defuddle not available, using basic HTML extraction");
1859
2047
  }
@@ -1863,17 +2051,77 @@ async function scrapeUrl(url, timeout = 3e4) {
1863
2051
  markdown: extractTextFromHtml(html)
1864
2052
  };
1865
2053
  }
2054
+ /**
2055
+ * Scrape a URL and return title + markdown content.
2056
+ *
2057
+ * Strategy:
2058
+ * 1. GitHub blob URL → raw.githubusercontent.com (instant, perfect fidelity)
2059
+ * 2. Playwright + Defuddle (handles JS-rendered pages)
2060
+ * 3. Fetch + Defuddle fallback (static pages, or when Playwright missing)
2061
+ */
2062
+ async function scrapeUrl(url, timeout = 3e4) {
2063
+ if (isGithubBlobUrl(url)) {
2064
+ info("GitHub blob detected — fetching raw content directly");
2065
+ return scrapeGithubRaw(url, timeout);
2066
+ }
2067
+ try {
2068
+ const result = await scrapeWithPlaywright(url, timeout);
2069
+ if (result.markdown.length < MIN_CONTENT_LENGTH) {
2070
+ warn(`Playwright extracted only ${result.markdown.length} chars — trying fetch fallback`);
2071
+ const fallback = await scrapeWithFetch(url, timeout);
2072
+ return fallback.markdown.length > result.markdown.length ? fallback : result;
2073
+ }
2074
+ return result;
2075
+ } catch (err) {
2076
+ const msg = err instanceof Error ? err.message : String(err);
2077
+ warn(`Playwright scrape failed (${msg}) — falling back to fetch`);
2078
+ }
2079
+ return scrapeWithFetch(url, timeout);
2080
+ }
1866
2081
  /** Extract <title> from HTML */
1867
2082
  function extractTitleFromHtml(html) {
1868
2083
  return html.match(/<title[^>]*>([^<]+)<\/title>/i)?.[1]?.trim() ?? "Untitled";
1869
2084
  }
1870
- /** Basic HTML to text extraction (fallback) */
2085
+ /** Basic HTML to text extraction (last-resort fallback) */
1871
2086
  function extractTextFromHtml(html) {
1872
2087
  return html.replace(/<script[\s\S]*?<\/script>/gi, "").replace(/<style[\s\S]*?<\/style>/gi, "").replace(/<[^>]+>/g, " ").replace(/&amp;/g, "&").replace(/&lt;/g, "<").replace(/&gt;/g, ">").replace(/&quot;/g, "\"").replace(/&#39;/g, "'").replace(/&nbsp;/g, " ").replace(/\s+/g, " ").trim().slice(0, 1e4);
1873
2088
  }
1874
- /** Convert simple HTML to markdown */
2089
+ /** Convert HTML fragment to simple Markdown (from linkmind, extended) */
1875
2090
  function htmlToSimpleMarkdown(html) {
1876
- return html.replace(/<h1[^>]*>([\s\S]*?)<\/h1>/gi, "# $1\n\n").replace(/<h2[^>]*>([\s\S]*?)<\/h2>/gi, "## $1\n\n").replace(/<h3[^>]*>([\s\S]*?)<\/h3>/gi, "### $1\n\n").replace(/<p[^>]*>([\s\S]*?)<\/p>/gi, "$1\n\n").replace(/<br\s*\/?>/gi, "\n").replace(/<strong[^>]*>([\s\S]*?)<\/strong>/gi, "**$1**").replace(/<em[^>]*>([\s\S]*?)<\/em>/gi, "*$1*").replace(/<a[^>]*href="([^"]*)"[^>]*>([\s\S]*?)<\/a>/gi, "[$2]($1)").replace(/<li[^>]*>([\s\S]*?)<\/li>/gi, "- $1\n").replace(/<code[^>]*>([\s\S]*?)<\/code>/gi, "`$1`").replace(/<pre[^>]*>([\s\S]*?)<\/pre>/gi, "```\n$1\n```\n").replace(/<[^>]+>/g, "").replace(/&amp;/g, "&").replace(/&lt;/g, "<").replace(/&gt;/g, ">").replace(/&quot;/g, "\"").replace(/&#39;/g, "'").replace(/&nbsp;/g, " ").replace(/\n{3,}/g, "\n\n").trim();
2091
+ if (!html) return "";
2092
+ let md = html;
2093
+ md = md.replace(/<h1[^>]*>(.*?)<\/h1>/gi, "# $1\n\n");
2094
+ md = md.replace(/<h2[^>]*>(.*?)<\/h2>/gi, "## $1\n\n");
2095
+ md = md.replace(/<h3[^>]*>(.*?)<\/h3>/gi, "### $1\n\n");
2096
+ md = md.replace(/<h4[^>]*>(.*?)<\/h4>/gi, "#### $1\n\n");
2097
+ md = md.replace(/<h5[^>]*>(.*?)<\/h5>/gi, "##### $1\n\n");
2098
+ md = md.replace(/<h6[^>]*>(.*?)<\/h6>/gi, "###### $1\n\n");
2099
+ md = md.replace(/<p[^>]*>/gi, "\n\n");
2100
+ md = md.replace(/<\/p>/gi, "");
2101
+ md = md.replace(/<br\s*\/?>/gi, "\n");
2102
+ md = md.replace(/<(strong|b)[^>]*>(.*?)<\/(strong|b)>/gi, "**$2**");
2103
+ md = md.replace(/<(em|i)[^>]*>(.*?)<\/(em|i)>/gi, "*$2*");
2104
+ md = md.replace(/<a[^>]*href="([^"]*)"[^>]*>(.*?)<\/a>/gi, "[$2]($1)");
2105
+ md = md.replace(/<code[^>]*>(.*?)<\/code>/gi, "`$1`");
2106
+ md = md.replace(/<pre[^>]*>(.*?)<\/pre>/gis, "\n```\n$1\n```\n");
2107
+ md = md.replace(/<li[^>]*>/gi, "- ");
2108
+ md = md.replace(/<\/li>/gi, "\n");
2109
+ md = md.replace(/<\/?[uo]l[^>]*>/gi, "\n");
2110
+ md = md.replace(/<blockquote[^>]*>(.*?)<\/blockquote>/gis, (_, content) => {
2111
+ return content.split("\n").map((line) => `> ${line}`).join("\n");
2112
+ });
2113
+ md = md.replace(/<img[^>]*src="([^"]*)"[^>]*alt="([^"]*)"[^>]*\/?>/gi, "![$2]($1)");
2114
+ md = md.replace(/<img[^>]*src="([^"]*)"[^>]*\/?>/gi, "![]($1)");
2115
+ md = md.replace(/<[^>]+>/g, "");
2116
+ md = md.replace(/&amp;/g, "&");
2117
+ md = md.replace(/&lt;/g, "<");
2118
+ md = md.replace(/&gt;/g, ">");
2119
+ md = md.replace(/&quot;/g, "\"");
2120
+ md = md.replace(/&#39;/g, "'");
2121
+ md = md.replace(/&nbsp;/g, " ");
2122
+ md = md.replace(/\n{3,}/g, "\n\n");
2123
+ md = md.trim();
2124
+ return md;
1877
2125
  }
1878
2126
 
1879
2127
  //#endregion
@@ -2060,44 +2308,95 @@ const VALID_TYPES = new Set([
2060
2308
  "entity",
2061
2309
  "event"
2062
2310
  ]);
2063
- /** Extract PINData entries from raw/journal content via a single LLM call */
2311
+ /**
2312
+ * Budget for the total text sent to LLM.
2313
+ * ~10K chars ≈ ~2.5K tokens (English) / ~5K tokens (CJK).
2314
+ * PINData extraction only needs gist, not full content.
2315
+ */
2316
+ const MAX_INPUT_CHARS = 1e4;
2317
+ /** Head portion gets the lion's share — title, intro, overview */
2318
+ const HEAD_CHARS = 4e3;
2319
+ /** Tail portion — conclusions, takeaways, resource lists */
2320
+ const TAIL_CHARS = 3e3;
2321
+ /** Remaining budget goes to evenly spaced (deterministic, not random) middle samples */
2322
+ const MIDDLE_BUDGET = MAX_INPUT_CHARS - HEAD_CHARS - TAIL_CHARS;
2323
+ /** Number of evenly spaced middle samples to pick */
2324
+ const MIDDLE_SAMPLES = 2;
2325
+ /**
2326
+ * For content that fits the budget, return as-is.
2327
+ * For long content, sample: head + tail + evenly spaced middle paragraphs.
2328
+ *
2329
+ * Rationale: PINData extraction asks "what does this mean to the USER",
2330
+ * not "summarize the entire document". The head (title/intro) and tail
2331
+ * (conclusions/resources) carry 80%+ of personal signal. Middle sections
2332
+ * of long articles (paper tables, code listings, repetitive data) are
2333
+ * mostly noise for personal knowledge extraction.
2334
+ */
2335
+ function prepareContent(text) {
2336
+ if (text.length <= MAX_INPUT_CHARS) return text;
2337
+ const head = text.slice(0, HEAD_CHARS);
2338
+ const tail = text.slice(-TAIL_CHARS);
2339
+ const middleStart = HEAD_CHARS;
2340
+ const middleEnd = text.length - TAIL_CHARS;
2341
+ const paragraphs = text.slice(middleStart, middleEnd).split(/\n{2,}/).map((p) => p.trim()).filter((p) => p.length > 100);
2342
+ const samples = [];
2343
+ let sampledChars = 0;
2344
+ const perSampleBudget = Math.floor(MIDDLE_BUDGET / MIDDLE_SAMPLES);
2345
+ if (paragraphs.length > 0) {
2346
+ const step = Math.max(1, Math.floor(paragraphs.length / MIDDLE_SAMPLES));
2347
+ for (let i = 0; i < MIDDLE_SAMPLES && i * step < paragraphs.length; i++) {
2348
+ const truncated = paragraphs[i * step].slice(0, perSampleBudget);
2349
+ samples.push(truncated);
2350
+ sampledChars += truncated.length;
2351
+ }
2352
+ }
2353
+ const assembled = `${head}${samples.length > 0 ? `\n\n[... middle section sampled — ${(middleEnd - middleStart).toLocaleString()} chars total ...]\n\n${samples.join("\n\n---\n\n")}` : ""}\n\n[... end section ...]\n\n${tail}`;
2354
+ info(`Content ${text.length.toLocaleString()} chars → sampled to ${assembled.length.toLocaleString()} chars (head:${HEAD_CHARS} + ${samples.length} mid-samples:${sampledChars} + tail:${TAIL_CHARS})`);
2355
+ return assembled;
2356
+ }
2357
+ /** Parse one LLM JSON response into validated PINDataEntry[] */
2358
+ function parseExtractResponse(response) {
2359
+ const cleaned = response.replace(/```json?\n?/g, "").replace(/```/g, "").trim();
2360
+ const raw = JSON.parse(cleaned);
2361
+ let rawEntries;
2362
+ if (Array.isArray(raw)) rawEntries = raw;
2363
+ else if (raw && typeof raw === "object" && "entries" in raw && Array.isArray(raw.entries)) rawEntries = raw.entries;
2364
+ else return [];
2365
+ const entries = [];
2366
+ for (const item of rawEntries) {
2367
+ if (!item || typeof item !== "object") continue;
2368
+ const obj = item;
2369
+ const type = obj.type;
2370
+ const entryContent = obj.content;
2371
+ const topic = obj.topic;
2372
+ if (!type || !entryContent || !topic) continue;
2373
+ if (!VALID_TYPES.has(type)) continue;
2374
+ entries.push({
2375
+ type,
2376
+ content: entryContent.trim(),
2377
+ topic: topic.trim(),
2378
+ tags: Array.isArray(obj.tags) ? obj.tags.filter((t) => typeof t === "string") : void 0
2379
+ });
2380
+ }
2381
+ return entries;
2382
+ }
2383
+ /**
2384
+ * Extract PINData entries from raw/journal content via a single LLM call.
2385
+ * Long content is sampled (head + tail + middle samples) to fit the budget.
2386
+ */
2064
2387
  async function extractPinData(content, source) {
2065
2388
  const system = extractSystemPrompt();
2066
- const response = await llmCall(extractUserPrompt(content, source), system);
2389
+ const prompt = extractUserPrompt(prepareContent(content), source);
2067
2390
  try {
2068
- const cleaned = response.replace(/```json?\n?/g, "").replace(/```/g, "").trim();
2069
- const raw = JSON.parse(cleaned);
2070
- let rawEntries;
2071
- if (Array.isArray(raw)) rawEntries = raw;
2072
- else if (raw && typeof raw === "object" && "entries" in raw && Array.isArray(raw.entries)) rawEntries = raw.entries;
2073
- else return {
2074
- entries: [],
2075
- summary: ""
2076
- };
2077
- const entries = [];
2078
- for (const item of rawEntries) {
2079
- if (!item || typeof item !== "object") continue;
2080
- const obj = item;
2081
- const type = obj.type;
2082
- const content = obj.content;
2083
- const topic = obj.topic;
2084
- if (!type || !content || !topic) continue;
2085
- if (!VALID_TYPES.has(type)) continue;
2086
- entries.push({
2087
- type,
2088
- content: content.trim(),
2089
- topic: topic.trim(),
2090
- tags: Array.isArray(obj.tags) ? obj.tags.filter((t) => typeof t === "string") : void 0
2091
- });
2092
- }
2391
+ const entries = parseExtractResponse(await llmCall(prompt, system));
2093
2392
  return {
2094
2393
  entries,
2095
2394
  summary: entries.length > 0 ? entries.slice(0, 3).map((e) => e.content).join("; ") : "no extractable signal"
2096
2395
  };
2097
- } catch {
2396
+ } catch (err) {
2098
2397
  return {
2099
2398
  entries: [],
2100
- summary: `Failed to parse extract response: ${response.slice(0, 100)}`
2399
+ summary: `Failed to parse extract response: ${(err instanceof Error ? err.message : String(err)).slice(0, 100)}`
2101
2400
  };
2102
2401
  }
2103
2402
  }
@@ -4440,7 +4739,7 @@ function registerAllCommands(program) {
4440
4739
  //#endregion
4441
4740
  //#region src/cli/build-program.ts
4442
4741
  function buildProgram() {
4443
- const program = new Command().name("pai").description("Personal AI Identity Provider — local-first AI agent identity & memory system").version("0.2.1");
4742
+ const program = new Command().name("pai").description("Personal AI Identity Provider — local-first AI agent identity & memory system").version("0.2.4");
4444
4743
  registerAllCommands(program);
4445
4744
  return program;
4446
4745
  }