pi-memory-stone 0.1.3 → 0.1.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/src/index.ts CHANGED
@@ -9,6 +9,7 @@
9
9
  * - Deterministic turn_summary and file_activity capture on agent_end
10
10
  * - FTS5 search
11
11
  * - /memory-status, /memory-search, /memory-open, /memory-inject, /memory-last commands
12
+ * - /memory-vault-* commands and natural-language URL capture
12
13
  * - memory_search, memory_open, memory_remember, memory_forget tools
13
14
  * - Conservative same-project before_agent_start injection
14
15
  */
@@ -21,6 +22,8 @@ import { retrieve, buildInjectionPacket, formatInjectionForLlm } from "./retriev
21
22
  import { getProjectId, getConfig, clearProjectCache } from "./config/index.js";
22
23
  import { closeDb, getRecord, insertInjection } from "./db/index.js";
23
24
  import { getMemorySessionState, manualRecordsToRankedResults } from "./session-state/index.js";
25
+ import { captureUrlToVault } from "./vault/capture.js";
26
+ import { parseVaultCaptureIntent } from "./vault/intent.js";
24
27
  import { createHash } from "node:crypto";
25
28
 
26
29
  // ─── Session-scoped state ───────────────────────────────────────────
@@ -31,6 +34,37 @@ const injectedRefsThisSession: Set<string> = new Set();
31
34
  /** Whether memory injection is temporarily disabled for this session */
32
35
  let sessionEnabled = true;
33
36
 
37
/**
 * Capture a URL into the memory vault when the prompt asks for it.
 *
 * @param prompt - Raw user prompt, inspected by `parseVaultCaptureIntent`.
 * @param projectId - Current project id, or `null` when there is none.
 * @param cwd - Working directory forwarded to the vault capture.
 * @param signal - Optional abort signal forwarded to the capture.
 * @returns `null` when the prompt carries no capture intent; otherwise a
 *   human-readable notice describing either the stored capture or the failure.
 *   Capture errors are logged and reported in the notice, never rethrown.
 */
async function maybeCaptureVaultUrl(
  prompt: string,
  projectId: string | null,
  cwd: string,
  signal?: AbortSignal,
): Promise<string | null> {
  const intent = parseVaultCaptureIntent(prompt);
  if (!intent) {
    return null;
  }

  try {
    const captured = await captureUrlToVault(intent.scope, projectId, cwd, intent.url, { signal });
    const initSuffix = captured.initialized ? " (initialized vault)" : "";

    const noticeLines = [
      `Captured web page into ${intent.scope} memory vault${initSuffix}: ${captured.title}`,
      `Quality: ${captured.quality} (${captured.qualityScore})`,
    ];
    // The warnings line only appears when the capture reported any.
    if (captured.warnings.length > 0) {
      noticeLines.push(`Warnings: ${captured.warnings.join("; ")}`);
    }
    noticeLines.push(`Page: ${captured.pagePath}`, `Source packet: ${captured.sourcePacketPath}`);

    return noticeLines.join("\n");
  } catch (err) {
    const message = err instanceof Error ? err.message : String(err);
    console.error("[pi-memory-stone] vault URL capture failed:", err);
    return `Memory vault URL capture failed: ${message}`;
  }
}
56
+
57
/**
 * Build the `before_agent_start` return value for a handled vault capture.
 *
 * The notice is surfaced twice: as a displayed custom message, and appended to
 * the system prompt inside a delimited section that tells the model the
 * request was already handled.
 *
 * @param systemPrompt - System prompt text to extend.
 * @param notice - Capture result (or failure) notice.
 */
function vaultCaptureReturn(systemPrompt: string, notice: string) {
  const captureSection = [
    "--- Memory Vault Capture ---",
    notice,
    "The user's vault capture request has already been handled by pi-memory-stone. Briefly confirm the result; do not fetch the same URL again unless the user asks.",
    "--- End Memory Vault Capture ---",
  ].join("\n");

  const message = {
    customType: "memory-vault-capture",
    content: notice,
    display: true,
  };

  return {
    message,
    // A blank line separates the existing prompt from the capture section.
    systemPrompt: `${systemPrompt}\n\n${captureSection}`,
  };
}
67
+
34
68
  // ─── Extension entry point ─────────────────────────────────────────
35
69
 
36
70
  export default function (pi: ExtensionAPI) {
@@ -69,18 +103,24 @@ export default function (pi: ExtensionAPI) {
69
103
 
70
104
  pi.on("before_agent_start", async (event, ctx) => {
71
105
  try {
106
+ const prompt = event.prompt || "";
107
+ const projectId = getProjectId(ctx.cwd);
108
+ const vaultCaptureNotice = await maybeCaptureVaultUrl(prompt, projectId, ctx.cwd, ctx.signal);
109
+
72
110
  // Check if memory is enabled
73
111
  const config = getConfig(ctx.cwd);
74
- if (!config.enabled) return;
112
+ if (!config.enabled) {
113
+ return vaultCaptureNotice ? vaultCaptureReturn(event.systemPrompt || "", vaultCaptureNotice) : undefined;
114
+ }
75
115
 
76
116
  const sessionState = getMemorySessionState(ctx.sessionManager.getBranch());
77
117
  sessionEnabled = sessionState.enabled;
78
118
 
79
- if (!sessionEnabled) return;
119
+ if (!sessionEnabled) {
120
+ return vaultCaptureNotice ? vaultCaptureReturn(event.systemPrompt || "", vaultCaptureNotice) : undefined;
121
+ }
80
122
 
81
- const prompt = event.prompt || "";
82
123
  const promptHash = createHash("sha256").update(prompt).digest("hex").slice(0, 12);
83
- const projectId = getProjectId(ctx.cwd);
84
124
  const injectionMode = sessionState.injectionMode ?? config.injectionMode;
85
125
 
86
126
  const manualRecords = sessionState.manualRefs
@@ -103,10 +143,11 @@ export default function (pi: ExtensionAPI) {
103
143
  }
104
144
 
105
145
  const selectedResults = [...manualResults, ...autoResults];
106
- if (selectedResults.length === 0) return;
107
-
108
- const packet = buildInjectionPacket(selectedResults);
109
- const formatted = formatInjectionForLlm(packet, config.maxInjectedTokens);
146
+ let formatted: string | null = null;
147
+ if (selectedResults.length > 0) {
148
+ const packet = buildInjectionPacket(selectedResults);
149
+ formatted = formatInjectionForLlm(packet, config.maxInjectedTokens);
150
+ }
110
151
 
111
152
  // Track only search-selected refs. Manually chosen refs are intentionally
112
153
  // injected on every turn until /memory-clear-injected is used.
@@ -114,27 +155,36 @@ export default function (pi: ExtensionAPI) {
114
155
  injectedRefsThisSession.add(r.record.id);
115
156
  }
116
157
 
117
- insertInjection({
118
- session_id: ctx.sessionManager.getSessionId(),
119
- turn_entry_id: ctx.sessionManager.getLeafId() ?? undefined,
120
- prompt_hash: promptHash,
121
- injected_refs: selectedResults.map((r) => r.record.id).join(","),
122
- packet: formatted,
123
- reasons: selectedResults.map((r) => r.reasons.join(";")).join(" | "),
124
- });
125
-
126
- // Inject as a non-context audit custom entry (separate from LLM context)
127
- // but also as a system prompt addition for the LLM
128
- const systemPromptAddition = [
129
- "",
130
- "--- Memory Stone Context ---",
131
- formatted,
132
- "--- End Memory Stone Context ---",
133
- ].join("\n");
134
-
135
- return {
136
- systemPrompt: (event.systemPrompt || "") + systemPromptAddition,
137
- };
158
+ if (formatted) {
159
+ insertInjection({
160
+ session_id: ctx.sessionManager.getSessionId(),
161
+ turn_entry_id: ctx.sessionManager.getLeafId() ?? undefined,
162
+ prompt_hash: promptHash,
163
+ injected_refs: selectedResults.map((r) => r.record.id).join(","),
164
+ packet: formatted,
165
+ reasons: selectedResults.map((r) => r.reasons.join(";")).join(" | "),
166
+ });
167
+ }
168
+
169
+ if (!formatted && !vaultCaptureNotice) return;
170
+
171
+ let systemPrompt = event.systemPrompt || "";
172
+ if (formatted) {
173
+ // Inject as a non-context audit custom entry (separate from LLM context)
174
+ // but also as a system prompt addition for the LLM
175
+ systemPrompt += [
176
+ "",
177
+ "--- Memory Stone Context ---",
178
+ formatted,
179
+ "--- End Memory Stone Context ---",
180
+ ].join("\n");
181
+ }
182
+
183
+ if (vaultCaptureNotice) {
184
+ return vaultCaptureReturn(systemPrompt, vaultCaptureNotice);
185
+ }
186
+
187
+ return { systemPrompt };
138
188
  } catch (err) {
139
189
  console.error("[pi-memory-stone] before_agent_start handler error:", err);
140
190
  }
@@ -2,10 +2,11 @@
2
2
  * Portable export/import/backup helpers for memory records.
3
3
  */
4
4
 
5
- import { copyFileSync, mkdirSync, readFileSync, writeFileSync } from "node:fs";
5
+ import { chmodSync, copyFileSync, mkdirSync, readFileSync, writeFileSync } from "node:fs";
6
6
  import { dirname, isAbsolute, resolve } from "node:path";
7
- import { getDb, getDbPath, listRecords, upsertRecord, type RecordRow } from "../db/index.js";
7
+ import { getDb, getDbPath, hardenDbFilePermissions, listRecords, upsertRecord, type RecordRow } from "../db/index.js";
8
8
  import { SCHEMA_VERSION, RECORD_KINDS, RECORD_SCOPES, RECORD_STATUSES, type RecordKind, type RecordScope, type RecordStatus } from "../db/schema.js";
9
+ import { isSensitiveForGlobalMemory } from "../privacy/index.js";
9
10
 
10
11
  export type ExportFormat = "json" | "md";
11
12
 
@@ -68,8 +69,8 @@ export function exportMemory(format: ExportFormat, includeInactive = false): str
68
69
  export function writeMemoryExport(path: string, format: ExportFormat, includeInactive = false): number {
69
70
  const payload = buildMemoryExport(includeInactive);
70
71
  const content = format === "json" ? JSON.stringify(payload, null, 2) + "\n" : exportMarkdown(payload);
71
- mkdirSync(dirname(path), { recursive: true });
72
- writeFileSync(path, content, "utf8");
72
+ mkdirSync(dirname(path), { recursive: true, mode: 0o700 });
73
+ writeFileSync(path, content, { encoding: "utf8", mode: 0o600 });
73
74
  return payload.records.length;
74
75
  }
75
76
 
@@ -94,6 +95,14 @@ export function importMemoryJson(raw: string, options: ImportOptions = {}): Impo
94
95
 
95
96
  const scope = options.scopeOverride ?? record.scope;
96
97
  const projectId = scope === "global" ? null : (options.projectId !== undefined ? options.projectId : record.project_id);
98
+ if (scope === "project" && !projectId) {
99
+ result.skipped += 1;
100
+ continue;
101
+ }
102
+ if (scope === "global" && isSensitiveForGlobalMemory(`${record.text}\n${record.tags ?? ""}`)) {
103
+ result.skipped += 1;
104
+ continue;
105
+ }
97
106
  const id = upsertRecord({
98
107
  kind: record.kind,
99
108
  scope,
@@ -117,9 +126,13 @@ export function importMemoryJson(raw: string, options: ImportOptions = {}): Impo
117
126
  }
118
127
 
119
128
  export function backupMemoryDatabase(path: string): void {
120
- mkdirSync(dirname(path), { recursive: true });
129
+ mkdirSync(dirname(path), { recursive: true, mode: 0o700 });
121
130
  getDb().exec("PRAGMA wal_checkpoint(TRUNCATE)");
131
+ hardenDbFilePermissions();
122
132
  copyFileSync(getDbPath(), path);
133
+ try {
134
+ chmodSync(path, 0o600);
135
+ } catch {}
123
136
  }
124
137
 
125
138
  export function defaultPortablePath(cwd: string, prefix: string, extension: string): string {
@@ -27,7 +27,7 @@ const SECRET_PATTERNS: { name: string; regex: RegExp; replacement: SecretReplace
27
27
  },
28
28
  {
29
29
  name: "aws-secret",
30
- regex: /(?<=SecretAccessKey[=:]\s*)[A-Za-z0-9/+]{40,}/g,
30
+ regex: /\b(?:aws[_-]?)?secret[_-]?access[_-]?key\b\s*[=:]\s*['"]?[A-Za-z0-9/+=]{40,}['"]?/gi,
31
31
  replacement: "[REDACTED:aws-secret]",
32
32
  },
33
33
  {
@@ -37,12 +37,12 @@ const SECRET_PATTERNS: { name: string; regex: RegExp; replacement: SecretReplace
37
37
  },
38
38
  {
39
39
  name: "generic-api-key",
40
- regex: /(?:api[_-]?key|apikey|api[_-]?secret|secret[_-]?key)[=:]\s*['"]?[A-Za-z0-9_\-.]{16,}['"]?/gi,
40
+ regex: /\b(?:api[_-]?key|apikey|api[_-]?secret|secret[_-]?key|client[_-]?secret|private[_-]?key|access[_-]?key|auth[_-]?key)\b\s*[=:]\s*['"]?[A-Za-z0-9_\-./+=]{16,}['"]?/gi,
41
41
  replacement: "[REDACTED:api-key]",
42
42
  },
43
43
  {
44
44
  name: "secret-assignment",
45
- regex: /\b(?:secret|secret[_-]?key)\b\s*[=:]\s*(?:['"][^'"]+['"]|[^\s'"`]+)/gi,
45
+ regex: /\b(?:secret|secret[_-]?key|client[_-]?secret|app[_-]?secret|webhook[_-]?secret|signing[_-]?secret)\b\s*[=:]\s*(?:['"][^'"]+['"]|[^\s'"`]+)/gi,
46
46
  replacement: "[REDACTED:secret]",
47
47
  },
48
48
  {
@@ -126,6 +126,22 @@ export function redactSecrets(text: string): string {
126
126
  return result;
127
127
  }
128
128
 
129
/**
 * Decide whether text is too sensitive or too project-specific to be stored
 * in global memory.
 *
 * Text is flagged when `redactSecrets` would change it, or when it matches any
 * of the path, file-name, host or implementation-detail patterns below.
 *
 * @param text - Candidate text to check.
 * @returns `true` when the text should stay out of global memory.
 */
export function isSensitiveForGlobalMemory(text: string): boolean {
  const redacted = redactSecrets(text);
  if (redacted !== text) {
    return true;
  }

  const projectLocalMarkers: RegExp[] = [
    // Local/absolute/relative filesystem paths and common repo paths.
    /(?:^|\s)(?:~|\.|\.\.|[A-Za-z]:)?[/\\][^\s]+/,
    /\b(?:src|lib|test|tests|packages|apps|docs|config)\/[\w./-]+\b/i,
    /\b[\w.-]+\.(?:ts|tsx|js|jsx|mjs|cjs|json|yaml|yml|toml|env|db|sqlite|pem|key|crt)\b/i,
    // Hostnames and network endpoints.
    /\b(?:localhost|127\.0\.0\.1|0\.0\.0\.0|::1)\b/i,
    /\b[a-z0-9](?:[a-z0-9-]{0,61}[a-z0-9])?(?:\.[a-z0-9](?:[a-z0-9-]{0,61}[a-z0-9])?)+(?::\d{2,5})?\b/i,
    // Implementation/internal detail markers that should stay project-local.
    /\b(?:internal|private|implementation detail|class|function|method|module|endpoint|schema|table|column)\b/i,
  ];

  for (const marker of projectLocalMarkers) {
    if (marker.test(text)) {
      return true;
    }
  }
  return false;
}
144
+
129
145
  export function isSensitivePath(path: string, extraPatterns: RegExp[] = []): boolean {
130
146
  const allPatterns = [...DEFAULT_SENSITIVE_PATHS, ...extraPatterns];
131
147
 
@@ -23,6 +23,8 @@ const KIND_BOOST: Record<string, number> = {
23
23
  // ─── Recency decay ──────────────────────────────────────────────────
24
24
 
25
25
  const RECENCY_HALF_LIFE_MS = 7 * 24 * 60 * 60 * 1000; // 7 days
26
+ export const MAX_RETRIEVAL_LIMIT = 20;
27
+ const MAX_CANDIDATE_LIMIT = MAX_RETRIEVAL_LIMIT * 10;
26
28
 
27
29
  function recencyDecay(createdAt: number): number {
28
30
  const age = Date.now() - createdAt;
@@ -72,7 +74,7 @@ export function rankAndFilter(
72
74
  // and require explicit cross-project retrieval.
73
75
  if (rec.scope === "global") {
74
76
  if (!crossProjectEnabled) continue;
75
- } else if (rec.project_id && currentProjectId && rec.project_id !== currentProjectId) {
77
+ } else if (!rec.project_id || !currentProjectId || rec.project_id !== currentProjectId) {
76
78
  continue;
77
79
  }
78
80
 
@@ -121,6 +123,11 @@ export function rankAndFilter(
121
123
 
122
124
  // ─── Full retrieval pipeline ────────────────────────────────────────
123
125
 
126
/**
 * Coerce a caller-supplied retrieval limit into an integer between 1 and
 * `MAX_RETRIEVAL_LIMIT`.
 *
 * @param value - Requested limit; anything that is not a finite number is ignored.
 * @param fallback - Limit to use when `value` is unusable (e.g. a configured default).
 * @returns An integer in `[1, MAX_RETRIEVAL_LIMIT]`.
 */
export function normalizeRetrievalLimit(value: unknown, fallback: number): number {
  const requested = typeof value === "number" && Number.isFinite(value) ? value : fallback;
  // The fallback gets the same treatment as the requested value: a fractional
  // default must not leak through as a fractional limit, and NaN would
  // otherwise survive Math.min/Math.max and become the limit itself.
  const whole = Number.isNaN(requested) ? 1 : Math.floor(requested);
  return Math.max(1, Math.min(MAX_RETRIEVAL_LIMIT, whole));
}
130
+
124
131
  export function retrieve(
125
132
  userPrompt: string,
126
133
  currentProjectId: string | null,
@@ -133,13 +140,14 @@ export function retrieve(
133
140
  },
134
141
  ): RankedResult[] {
135
142
  const config = getConfig();
136
- const limit = opts?.limit ?? config.maxInjectedRecords;
143
+ const limit = normalizeRetrievalLimit(opts?.limit, config.maxInjectedRecords);
137
144
  const crossProject = opts?.crossProjectEnabled ?? config.crossProjectEnabled;
138
145
 
139
146
  const query = buildSearchQuery(userPrompt, recentFiles);
140
147
 
141
- // Get more candidates than needed (ranking will filter)
142
- const candidates = searchRecordsFts(query, limit * 10, opts?.kindFilter, opts?.scopeFilter);
148
+ // Get more candidates than needed (ranking will filter), but keep local work bounded.
149
+ const candidateLimit = Math.min(MAX_CANDIDATE_LIMIT, limit * 10);
150
+ const candidates = searchRecordsFts(query, candidateLimit, opts?.kindFilter, opts?.scopeFilter);
143
151
 
144
152
  const ranked = rankAndFilter(candidates, currentProjectId, crossProject);
145
153
 
@@ -67,7 +67,8 @@ export function parseRefArgs(args: string): string[] {
67
67
  }
68
68
 
69
69
  export function isRecordVisibleInProject(record: RecordRow, currentProjectId: string | null): boolean {
70
- return record.scope === "global" || record.project_id === null || record.project_id === currentProjectId;
70
+ if (record.scope === "global") return true;
71
+ return Boolean(currentProjectId && record.project_id && record.project_id === currentProjectId);
71
72
  }
72
73
 
73
74
  export function manualRecordsToRankedResults(
@@ -6,8 +6,10 @@ import type { ExtensionAPI } from "@earendil-works/pi-coding-agent";
6
6
  import { Type } from "typebox";
7
7
  import { StringEnum } from "@earendil-works/pi-ai";
8
8
  import { getRecord, softForgetRecord, upsertRecord } from "../db/index.js";
9
- import { retrieve, buildInjectionPacket, formatInjectionForLlm } from "../retrieval/index.js";
9
+ import { retrieve, buildInjectionPacket, formatInjectionForLlm, normalizeRetrievalLimit } from "../retrieval/index.js";
10
10
  import { getProjectId, getConfig } from "../config/index.js";
11
+ import { isSensitiveForGlobalMemory } from "../privacy/index.js";
12
+ import { isRecordVisibleInProject } from "../session-state/index.js";
11
13
  import type { RecordKind, RecordScope } from "../db/schema.js";
12
14
 
13
15
  export function registerTools(pi: ExtensionAPI): void {
@@ -37,12 +39,12 @@ export function registerTools(pi: ExtensionAPI): void {
37
39
  ] as const),
38
40
  ),
39
41
  scope: Type.Optional(StringEnum(["project", "global"] as const)),
40
- limit: Type.Optional(Type.Number({ description: "Max results (default 5)" })),
42
+ limit: Type.Optional(Type.Number({ description: "Max results (default 5, max 20)", minimum: 1, maximum: 20 })),
41
43
  }),
42
44
  async execute(toolCallId, params, _signal, _onUpdate, ctx) {
43
45
  const projectId = getProjectId(ctx.cwd);
44
46
  const config = getConfig(ctx.cwd);
45
- const limit = params.limit ?? 5;
47
+ const limit = normalizeRetrievalLimit(params.limit, 5);
46
48
 
47
49
  const results = retrieve(params.query, projectId, [], {
48
50
  limit,
@@ -102,10 +104,8 @@ export function registerTools(pi: ExtensionAPI): void {
102
104
  }
103
105
 
104
106
  const currentProjectId = getProjectId(ctx.cwd);
105
- const visibleInCurrentProject =
106
- record.scope === "global" || record.project_id === null || record.project_id === currentProjectId;
107
107
 
108
- if (record.status !== "active" || !visibleInCurrentProject) {
108
+ if (record.status !== "active" || !isRecordVisibleInProject(record, currentProjectId)) {
109
109
  return {
110
110
  content: [{ type: "text", text: `Memory record ${params.ref} is not available.` }],
111
111
  details: { ref: params.ref, found: false, unavailable: true },
@@ -168,10 +168,7 @@ export function registerTools(pi: ExtensionAPI): void {
168
168
  let scope = params.scope ?? "project";
169
169
 
170
170
  // Safety: never allow global for implementation details, paths, etc.
171
- const isSensitiveForGlobal =
172
- /\b(?:password|secret|token|key|\.env|localhost|127\.0\.0\.1|internal|private)\b/i.test(
173
- params.text,
174
- );
171
+ const isSensitiveForGlobal = isSensitiveForGlobalMemory(`${params.text}\n${params.tags ?? ""}`);
175
172
 
176
173
  const downgradedToProject = isSensitiveForGlobal && scope === "global";
177
174
  if (downgradedToProject) {
@@ -228,14 +225,22 @@ export function registerTools(pi: ExtensionAPI): void {
228
225
  };
229
226
  }
230
227
 
228
+ const currentProjectId = getProjectId(ctx.cwd);
229
+ if (record.status !== "active" || !isRecordVisibleInProject(record, currentProjectId)) {
230
+ return {
231
+ content: [{ type: "text", text: `Memory record ${params.ref} is not available.` }],
232
+ details: { ref: params.ref, found: false, unavailable: true },
233
+ };
234
+ }
235
+
231
236
  if (params.hard) {
232
237
  // For hard delete via tool, we require the user to explicitly confirm
233
- // The tool should note this requires user interaction
238
+ // The tool should note this requires user interaction without leaking record contents.
234
239
  return {
235
240
  content: [
236
241
  {
237
242
  type: "text",
238
- text: `Permanent deletion requires explicit confirmation. Please use /memory-forget ${params.ref} --hard to permanently delete this record.\n\nRecord: [${record.kind}] ${record.text.slice(0, 200)}`,
243
+ text: `Permanent deletion requires explicit confirmation. Please use /memory-forget ${params.ref} --hard to permanently delete this record.`,
239
244
  },
240
245
  ],
241
246
  details: { ref: params.ref, requiresConfirmation: true },
@@ -0,0 +1,268 @@
1
+ /** URL capture for memory vault source pages. */
2
+
3
+ import { createHash } from "node:crypto";
4
+ import { mkdirSync, readFileSync, writeFileSync } from "node:fs";
5
+ import { join, relative } from "node:path";
6
+ import { redactSecrets } from "../privacy/index.js";
7
+ import { initVault, getVaultStatus, type VaultRegistry, type VaultRegistryPage } from "./index.js";
8
+ import { sanitizeSlug } from "./markdown.js";
9
+ import { resolveSourcePacketPath, resolveVaultPath, type VaultScope } from "./paths.js";
10
+ import { extractArticle, type ExtractedArticle } from "./extract.js";
11
+ import { assessCaptureQuality, type CaptureQuality, type CaptureQualityReport } from "./quality.js";
12
+ import { fetchCandidate, type CaptureFetchAttempt, type CaptureFetchOptions, type FetchedCandidate } from "./fetch.js";
13
+ import { resolveCaptureTargets, type CaptureCandidate } from "./url-resolvers.js";
14
+
15
+ const MAX_EXTRACTED_CHARS = 200_000;
16
+
17
+ export interface CaptureUrlOptions extends CaptureFetchOptions {}
18
+
19
/** Outcome of a successful `captureUrlToVault` call. */
export interface CaptureUrlResult {
  /** Resolved path of the vault the capture was written into. */
  vaultPath: string;
  /** Path of the generated source page inside the vault's `sources` directory. */
  pagePath: string;
  /** Path of the source packet directory holding the manifest, metadata and artifacts. */
  sourcePacketPath: string;
  /** Extracted title, or the final URL's hostname when extraction found none. */
  title: string;
  /** Original URL as resolved from the request. */
  url: string;
  /** URL reported by the fetch that produced the stored content. */
  finalUrl: string;
  /** `true` when the vault did not exist yet and was initialized by this capture. */
  initialized: boolean;
  /** Quality label assigned to the selected extraction. */
  quality: CaptureQuality;
  /** Numeric score behind `quality`. */
  qualityScore: number;
  /** Warnings produced by the quality assessment. */
  warnings: string[];
}
31
+
32
/**
 * Fetch a URL, extract its article text and store it in the memory vault.
 *
 * The vault is initialized first when `getVaultStatus` reports it as missing.
 * The best candidate from `fetchAndExtractBest` is then written out twice:
 * as a source packet (manifest, metadata, redacted original response and
 * extracted markdown) and as a generated source page registered in the vault
 * registry. Directories are created with mode 0o700 and files with 0o600.
 *
 * @param scope - Vault scope to capture into.
 * @param projectId - Current project id, or `null`.
 * @param cwd - Working directory used to resolve vault paths.
 * @param url - URL requested for capture.
 * @param options - Fetch options forwarded to candidate fetching.
 * @returns Paths, title and quality information for the stored capture.
 * @throws Propagates failures from fetching/extraction and from the file writes.
 */
export async function captureUrlToVault(
  scope: VaultScope,
  projectId: string | null,
  cwd: string,
  url: string,
  options: CaptureUrlOptions = {},
): Promise<CaptureUrlResult> {
  const targets = resolveCaptureTargets(url);
  const vaultPath = resolveVaultPath(scope, projectId, cwd);
  const { initialized: wasInitialized } = getVaultStatus(scope, projectId, cwd);
  if (!wasInitialized) {
    initVault(scope, projectId, cwd);
  }

  const selected = await fetchAndExtractBest(targets.candidates, options);
  const { fetched, extracted: article, quality: qualityReport } = selected;
  const originalUrl = targets.originalUrl;

  const title = article.title || new URL(fetched.finalUrl).hostname;
  const slug = sanitizeSlug(title).slice(0, 70) || "captured-page";
  const captureDay = new Date().toISOString().slice(0, 10);
  // The same short URL digest ties the packet id and the page file name together.
  const urlDigest = sha256(originalUrl).slice(0, 8);
  const captureId = `SRC-${captureDay}-${urlDigest}`;
  const packetPath = resolveSourcePacketPath(scope, projectId, cwd, captureId);
  const packetRelPath = normalizePath(relative(vaultPath, packetPath));
  const sourcePageRelPath = join("sources", `${slug}-${urlDigest}.md`);
  const sourcePagePath = join(vaultPath, sourcePageRelPath);

  const privateDirs = [
    join(packetPath, "original"),
    join(packetPath, "attachments"),
    join(vaultPath, "sources"),
  ];
  for (const dir of privateDirs) {
    mkdirSync(dir, { recursive: true, mode: 0o700 });
  }

  const capturedAt = new Date().toISOString();
  const originalName = originalArtifactName(fetched.contentType, fetched.finalUrl, article.extractor);
  const extractedMarkdown = unescapeRedactionMarkers(redactSecrets(article.markdown)).slice(0, MAX_EXTRACTED_CHARS);
  const redactedRaw = redactSecrets(fetched.raw);
  const contentHash = sha256(extractedMarkdown);
  const canonicalUrl = article.canonicalUrl ?? fetched.finalUrl;

  const manifest = {
    id: captureId,
    url: originalUrl,
    canonical_url: canonicalUrl,
    final_url: fetched.finalUrl,
    title,
    byline: article.byline,
    site_name: article.siteName,
    excerpt: article.excerpt,
    published_at: article.publishedAt,
    content_type: fetched.contentType,
    captured_at: capturedAt,
    original: `original/${originalName}`,
    extracted: "extracted.md",
    metadata: "metadata.json",
    attempts: selected.attempts,
    extraction: {
      extractor: article.extractor,
      strategy: fetched.candidate.strategy,
      candidate_kind: fetched.candidate.kind,
    },
    quality: qualityReport,
    content_hash: contentHash,
  };

  const metadata = {
    title,
    byline: article.byline,
    site_name: article.siteName,
    excerpt: article.excerpt,
    published_at: article.publishedAt,
    source_url: originalUrl,
    canonical_url: canonicalUrl,
    final_url: fetched.finalUrl,
    content_hash: contentHash,
    extractor: article.extractor,
    fetch_strategy: fetched.candidate.strategy,
    quality: qualityReport,
  };

  const toJson = (value: unknown): string => JSON.stringify(value, null, 2) + "\n";
  const writePrivate = (file: string, content: string): void => {
    writeFileSync(file, content, { mode: 0o600 });
  };

  writePrivate(join(packetPath, "manifest.json"), toJson(manifest));
  writePrivate(join(packetPath, "metadata.json"), toJson(metadata));
  writePrivate(join(packetPath, "original", originalName), redactedRaw);
  writePrivate(join(packetPath, "extracted.md"), extractedMarkdown);

  const pageMarkdown = renderSourcePage({
    title,
    url: originalUrl,
    canonicalUrl,
    capturedAt,
    captureId,
    packetRelPath,
    extractedMarkdown,
    quality: qualityReport,
    warnings: qualityReport.warnings,
  });
  writePrivate(sourcePagePath, pageMarkdown);

  updateRegistry(vaultPath, {
    path: normalizePath(sourcePageRelPath),
    title,
    kind: "web_source",
    source_url: originalUrl,
    source_packet: packetRelPath,
    content_hash: sha256(pageMarkdown),
    generated: true,
    created_at: capturedAt,
    updated_at: capturedAt,
  });

  return {
    vaultPath,
    pagePath: sourcePagePath,
    sourcePacketPath: packetPath,
    title,
    url: originalUrl,
    finalUrl: fetched.finalUrl,
    initialized: !wasInitialized,
    quality: qualityReport.quality,
    qualityScore: qualityReport.score,
    warnings: qualityReport.warnings,
  };
}
148
+
149
/** One fetched candidate together with its extraction and quality assessment. */
interface ExtractedCandidate {
  /** Result returned by `fetchCandidate` for this candidate. */
  fetched: FetchedCandidate;
  /** Article produced by `extractArticle` from the redacted response. */
  extracted: ExtractedArticle;
  /** Quality report produced by `assessCaptureQuality` for the extraction. */
  quality: CaptureQualityReport;
  /** Fetch attempts recorded so far, across every candidate tried. */
  attempts: CaptureFetchAttempt[];
}
155
+
156
/**
 * Try each capture candidate in order and return the best extraction.
 *
 * The first candidate assessed as `"good"` is returned immediately. Otherwise
 * the highest-scoring extraction wins once every candidate has been tried.
 * Fetch attempts are accumulated across candidates, including attempts
 * attached to thrown errors.
 *
 * @param candidates - Candidates to try, in priority order.
 * @param options - Fetch options forwarded to `fetchCandidate`.
 * @returns The selected candidate with the attempts recorded up to that point.
 * @throws Error when no candidate could be fetched and extracted.
 */
async function fetchAndExtractBest(candidates: CaptureCandidate[], options: CaptureUrlOptions): Promise<ExtractedCandidate> {
  const allAttempts: CaptureFetchAttempt[] = [];
  let best: ExtractedCandidate | null = null;
  const errors: string[] = [];

  for (const candidate of candidates) {
    try {
      const fetched = await fetchCandidate(candidate, options);
      allAttempts.push(...fetched.attempts);
      // Redact before extraction so secrets never reach the extracted article.
      const redactedRaw = redactSecrets(fetched.raw);
      const extracted = extractArticle({
        raw: redactedRaw,
        contentType: fetched.contentType,
        url: fetched.finalUrl,
        candidateKind: candidate.kind,
      });
      const quality = assessCaptureQuality({
        title: extracted.title,
        markdown: extracted.markdown,
        extractor: extracted.extractor,
      });
      const current: ExtractedCandidate = { fetched, extracted, quality, attempts: [...allAttempts] };
      if (!best || current.quality.score > best.quality.score) best = current;
      if (quality.quality === "good") return current;
    } catch (error: unknown) {
      // A thrown value can be anything, including null/undefined. Only read
      // `attempts` from real objects and only spread it when it is an array,
      // so the error handler cannot itself throw.
      if (typeof error === "object" && error !== null) {
        const attempts = (error as { attempts?: unknown }).attempts;
        if (Array.isArray(attempts)) allAttempts.push(...(attempts as CaptureFetchAttempt[]));
      }
      errors.push(`${candidate.strategy}: ${error instanceof Error ? error.message : String(error)}`);
    }
  }

  if (best) return { ...best, attempts: allAttempts };
  // `errors` is only empty here when there were no candidates at all.
  const detail = errors.length > 0 ? errors.join("; ") : "no capture candidates were available";
  throw new Error(`Unable to fetch article. Attempts failed: ${detail}`);
}
190
+
191
/**
 * Render the generated vault page for a captured source.
 *
 * The page has a front-matter block (string values JSON-quoted), a
 * single-line heading, a short provenance summary, an optional warnings list
 * and the extracted text. A placeholder is used when no text was extracted.
 *
 * @param input - Capture details to render.
 * @returns The page markdown, ending with a newline.
 */
function renderSourcePage(input: {
  title: string;
  url: string;
  canonicalUrl: string;
  capturedAt: string;
  captureId: string;
  packetRelPath: string;
  extractedMarkdown: string;
  quality: CaptureQualityReport;
  warnings: string[];
}): string {
  const { quality } = input;
  const quoted = (value: string): string => JSON.stringify(value);

  const frontMatter = [
    "---",
    `title: ${quoted(input.title)}`,
    "kind: web_source",
    `source_url: ${quoted(input.url)}`,
    `canonical_url: ${quoted(input.canonicalUrl)}`,
    `source_packet: ${quoted(input.packetRelPath)}`,
    `captured_at: ${quoted(input.capturedAt)}`,
    `capture_id: ${quoted(input.captureId)}`,
    `quality: ${quoted(quality.quality)}`,
    `quality_score: ${quality.score}`,
    "generated: true",
    "source: pi-memory-stone",
    "---",
  ];

  // Headings must stay on one line, so collapse any line breaks in the title.
  const heading = `# ${input.title.replace(/[\r\n]+/g, " ").trim()}`;

  const summary = [
    `Source: ${input.url}`,
    `Canonical: ${input.canonicalUrl}`,
    `Captured: ${input.capturedAt}`,
    `Quality: ${quality.quality} (${quality.score})`,
    `Source packet: ${input.captureId} (stored outside vault: ${input.packetRelPath})`,
  ];

  const warningBlock: string[] = [];
  if (input.warnings.length > 0) {
    warningBlock.push("", "Warnings:");
    for (const warning of input.warnings) {
      warningBlock.push(`- ${warning}`);
    }
  }

  const text = input.extractedMarkdown.trim() || "_No text extracted._";

  const page = [
    ...frontMatter,
    "",
    heading,
    "",
    ...summary,
    ...warningBlock,
    "",
    "## Extracted text",
    "",
    text,
    "",
  ];
  return page.join("\n");
}
236
+
237
/**
 * Insert or replace a page entry in the vault's `meta/registry.json`.
 *
 * An existing entry with the same `path` is replaced, entries are kept sorted
 * by path, and `generated_at` is refreshed. Other registry fields are left
 * untouched. The file is written with mode 0o600.
 *
 * @param vaultPath - Root directory of the vault.
 * @param page - Registry entry for the captured page.
 * @throws Error when the registry file is missing, is not valid JSON, or does
 *   not contain a JSON object.
 */
function updateRegistry(vaultPath: string, page: VaultRegistryPage & {
  source_url: string;
  source_packet: string;
}): void {
  const registryPath = join(vaultPath, "meta", "registry.json");
  const parsed: unknown = JSON.parse(readFileSync(registryPath, "utf8"));
  // Validate the shape before trusting it: the registry is a file on disk that
  // may have been edited or truncated, and an unchecked cast would otherwise
  // fail later with an opaque TypeError.
  if (typeof parsed !== "object" || parsed === null || Array.isArray(parsed)) {
    throw new Error(`Invalid vault registry at ${registryPath}: expected a JSON object`);
  }
  const registry = parsed as VaultRegistry;
  // A registry without a usable page list is treated as having no pages yet.
  const existingPages = Array.isArray(registry.pages) ? registry.pages : [];
  const pages = existingPages.filter((existing) => existing.path !== page.path);
  pages.push(page);
  pages.sort((a, b) => a.path.localeCompare(b.path));
  registry.pages = pages;
  registry.generated_at = new Date().toISOString();
  writeFileSync(registryPath, JSON.stringify(registry, null, 2) + "\n", { mode: 0o600 });
}
250
+
251
/**
 * Choose the file name under which the original response is stored.
 *
 * The decision is based on the response content type, the final URL's file
 * extension and the extractor that handled the content, checked in the order
 * HTML, markdown, PDF. Anything else is stored as plain text.
 *
 * @param contentType - Content type of the response, matched case-insensitively.
 * @param finalUrl - URL the content was finally fetched from.
 * @param extractor - Name of the extractor that processed the content.
 * @returns One of `response.html`, `response.md`, `response.pdf.txt` or `response.txt`.
 */
function originalArtifactName(contentType: string, finalUrl: string, extractor: string): string {
  // Media types are case-insensitive, so normalize before matching; the URL is
  // lower-cased so that extensions such as `.MD` or `.PDF` are recognized too.
  const mediaType = contentType.toLowerCase();
  const target = finalUrl.toLowerCase();

  if (mediaType.includes("html") || extractor.startsWith("html")) return "response.html";
  if (mediaType.includes("markdown") || /\.(md|markdown|mdx)(?:$|[?#])/.test(target) || extractor === "markdown") {
    return "response.md";
  }
  // PDF content is stored as text, hence the `.pdf.txt` suffix.
  if (mediaType.includes("pdf") || /\.pdf(?:$|[?#])/.test(target)) return "response.pdf.txt";
  return "response.txt";
}
257
+
258
/** Convert a path to forward-slash form, collapsing each run of `/` or `\` into a single `/`. */
function normalizePath(path: string): string {
  return path.replace(/[\\/]+/g, "/");
}
261
+
262
/**
 * Restore redaction markers whose brackets were backslash-escaped, turning
 * `\[REDACTED:name\]` back into `[REDACTED:name]`. Only lower-case,
 * hyphenated marker names are recognized.
 */
function unescapeRedactionMarkers(markdown: string): string {
  const escapedMarker = /\\\[REDACTED:([a-z-]+)\\\]/g;
  return markdown.replace(escapedMarker, (_whole: string, label: string) => `[REDACTED:${label}]`);
}
265
+
266
/** Hex-encoded SHA-256 digest of a string. */
function sha256(content: string): string {
  const hasher = createHash("sha256");
  hasher.update(content);
  return hasher.digest("hex");
}