chapterhouse 0.7.0 → 0.8.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/agents/korg.agent.md +65 -0
- package/dist/api/korg.js +34 -0
- package/dist/api/korg.test.js +42 -0
- package/dist/api/server.js +238 -2
- package/dist/api/server.test.js +199 -0
- package/dist/config.js +28 -0
- package/dist/config.test.js +20 -0
- package/dist/copilot/agents.js +3 -4
- package/dist/copilot/agents.test.js +12 -1
- package/dist/copilot/orchestrator.js +12 -1
- package/dist/copilot/orchestrator.test.js +3 -7
- package/dist/copilot/system-message.js +12 -10
- package/dist/copilot/system-message.test.js +6 -1
- package/dist/copilot/tools.js +193 -375
- package/dist/copilot/tools.memory.test.js +32 -0
- package/dist/copilot/tools.wiki.test.js +80 -59
- package/dist/copilot/turn-event-log-env.test.js +11 -15
- package/dist/daemon.js +19 -0
- package/dist/memory/decisions.js +6 -5
- package/dist/memory/entities.js +20 -9
- package/dist/memory/eot.js +30 -8
- package/dist/memory/eot.test.js +220 -6
- package/dist/memory/hooks.js +151 -0
- package/dist/memory/hooks.test.js +325 -0
- package/dist/memory/hot-tier.js +37 -0
- package/dist/memory/hot-tier.test.js +30 -0
- package/dist/memory/housekeeping-scheduler.js +35 -0
- package/dist/memory/housekeeping-scheduler.test.js +50 -0
- package/dist/memory/inbox.js +10 -0
- package/dist/memory/index.js +3 -1
- package/dist/memory/migration.js +244 -0
- package/dist/memory/migration.test.js +108 -0
- package/dist/memory/reflect.js +273 -0
- package/dist/memory/reflect.test.js +254 -0
- package/dist/paths.js +31 -11
- package/dist/store/db.js +187 -4
- package/dist/store/db.test.js +66 -2
- package/dist/test/helpers/reset-singletons.js +8 -0
- package/dist/test/helpers/reset-singletons.test.js +37 -0
- package/dist/test/setup-env.js +9 -1
- package/dist/wiki/consolidation.js +641 -0
- package/dist/wiki/consolidation.test.js +143 -0
- package/dist/wiki/frontmatter.js +48 -0
- package/dist/wiki/frontmatter.test.js +42 -0
- package/dist/wiki/fs.js +22 -13
- package/dist/wiki/index-manager.js +305 -330
- package/dist/wiki/index-manager.test.js +265 -144
- package/dist/wiki/ingest.js +347 -0
- package/dist/wiki/ingest.test.js +111 -0
- package/dist/wiki/links.js +151 -0
- package/dist/wiki/links.test.js +176 -0
- package/dist/wiki/log-manager.js +8 -5
- package/dist/wiki/log-manager.test.js +4 -0
- package/dist/wiki/migrate-topics.test.js +16 -6
- package/dist/wiki/scheduler.js +118 -0
- package/dist/wiki/scheduler.test.js +64 -0
- package/dist/wiki/timeline.js +51 -0
- package/dist/wiki/timeline.test.js +65 -0
- package/dist/wiki/topic-structure.js +1 -1
- package/package.json +1 -1
- package/skills/pkb-ideas/SKILL.md +78 -0
- package/skills/pkb-ideas/_meta.json +4 -0
- package/skills/pkb-org/SKILL.md +82 -0
- package/skills/pkb-org/_meta.json +4 -0
- package/skills/pkb-people/SKILL.md +74 -0
- package/skills/pkb-people/_meta.json +4 -0
- package/skills/pkb-research/SKILL.md +83 -0
- package/skills/pkb-research/_meta.json +4 -0
- package/skills/pkb-source/SKILL.md +38 -0
- package/skills/pkb-source/_meta.json +4 -0
- package/skills/wiki-conventions/SKILL.md +5 -5
- package/web/dist/assets/{index-DuKYxMIR.css → index-5kz9aRU9.css} +1 -1
- package/web/dist/assets/{index-DytB69KC.js → index-BbX9RKf3.js} +91 -89
- package/web/dist/assets/index-BbX9RKf3.js.map +1 -0
- package/web/dist/index.html +2 -2
- package/dist/wiki/context.js +0 -138
- package/dist/wiki/fix.js +0 -335
- package/dist/wiki/fix.test.js +0 -350
- package/dist/wiki/lint.js +0 -451
- package/dist/wiki/lint.test.js +0 -329
- package/web/dist/assets/index-DytB69KC.js.map +0 -1
|
@@ -0,0 +1,347 @@
|
|
|
1
|
+
// ---------------------------------------------------------------------------
|
|
2
|
+
// PKB ingestion pipeline — parse, extract entities, write wiki pages
|
|
3
|
+
// ---------------------------------------------------------------------------
|
|
4
|
+
import { createHash } from "node:crypto";
|
|
5
|
+
import { exec } from "node:child_process";
|
|
6
|
+
import { mkdirSync, readFileSync, existsSync, rmSync } from "node:fs";
|
|
7
|
+
import { join } from "node:path";
|
|
8
|
+
import { promisify } from "node:util";
|
|
9
|
+
import { getDb } from "../store/db.js";
|
|
10
|
+
import { ensureWikiStructure, writeRawSource, assertPagePath } from "./fs.js";
|
|
11
|
+
import { appendTimeline } from "./timeline.js";
|
|
12
|
+
import { validateAndBackfillFrontmatter } from "./frontmatter.js";
|
|
13
|
+
import { writePage, readPage } from "./fs.js";
|
|
14
|
+
import { childLogger } from "../util/logger.js";
|
|
15
|
+
const log = childLogger("ingest");
|
|
16
|
+
const execAsync = promisify(exec);
|
|
17
|
+
// ---------------------------------------------------------------------------
|
|
18
|
+
// Source ID
|
|
19
|
+
// ---------------------------------------------------------------------------
|
|
20
|
+
/**
 * Derive the identifier under which a source is recorded.
 *
 * The id is the hex-encoded SHA-256 digest of `sourceType` immediately
 * followed by `origin` (no separator), so identical inputs always map to the
 * same id and the type participates in the hash.
 *
 * @param {string} sourceType - Source kind, e.g. "text" or "url".
 * @param {string} origin - Text that identifies the source within its kind.
 * @returns {string} 64-character lowercase hex digest.
 */
export function computeSourceId(sourceType, origin) {
    const hasher = createHash("sha256");
    hasher.update(sourceType + origin);
    return hasher.digest("hex");
}
|
|
23
|
+
// ---------------------------------------------------------------------------
|
|
24
|
+
// Content parsers
|
|
25
|
+
// ---------------------------------------------------------------------------
|
|
26
|
+
/**
 * Parse `url` and reject it unless it is an http(s) URL whose host is not an
 * obviously local or private address.
 *
 * Blocked hosts: `localhost` (also with a trailing dot or as a parent domain),
 * `metadata.google.internal`, anything starting with `169.254.`, IPv4
 * literals in 0/8, 10/8, 127/8, 169.254/16, 172.16/12 and 192.168/16, and the
 * IPv6 literals `::`, `::1`, fc00::/7, fe80::/10 and IPv4-mapped forms of the
 * IPv4 ranges above.
 *
 * This is a literal-host check only: it does not resolve DNS, so a public
 * name that resolves to a private address is not detected here.
 *
 * @param {string} url - Candidate URL.
 * @returns {URL} The parsed URL when it is acceptable.
 * @throws {TypeError} When `url` cannot be parsed.
 * @throws {Error} When the protocol is not http/https or the host is internal.
 */
export function assertSafeRemoteUrl(url) {
    const parsedUrl = new URL(url);
    if (!["http:", "https:"].includes(parsedUrl.protocol)) {
        throw new Error(`Only http/https URLs supported, got: ${parsedUrl.protocol}`);
    }
    const isInternalIpv4 = ([first, second]) => first === 0
        || first === 10
        || first === 127
        || (first === 169 && second === 254)
        || (first === 172 && second >= 16 && second <= 31)
        || (first === 192 && second === 168);
    // A fully-qualified name may carry a trailing dot ("localhost.").
    const host = parsedUrl.hostname.toLowerCase().replace(/\.$/, "");
    let blocked = host === "localhost"
        || host.endsWith(".localhost")
        || host === "metadata.google.internal"
        || host.startsWith("169.254.");
    if (host.startsWith("[") && host.endsWith("]")) {
        // URL.hostname keeps the brackets around IPv6 literals, so a plain
        // comparison with "::1" can never match; strip them first.
        const v6 = host.slice(1, -1);
        const mapped = /^::ffff:([0-9a-f]{1,4}):([0-9a-f]{1,4})$/.exec(v6);
        if (mapped) {
            // IPv4-mapped addresses are serialized as two hex groups.
            const high = Number.parseInt(mapped[1], 16);
            const low = Number.parseInt(mapped[2], 16);
            blocked = blocked || isInternalIpv4([high >> 8, high & 0xff, low >> 8, low & 0xff]);
        }
        else {
            blocked = blocked
                || v6 === "::"
                || v6 === "::1"
                || /^f[cd][0-9a-f]{2}:/.test(v6) // unique local, fc00::/7
                || /^fe[89ab][0-9a-f]:/.test(v6); // link local, fe80::/10
        }
    }
    else {
        const labels = host.split(".");
        if (labels.length === 4 && labels.every((label) => /^\d{1,3}$/.test(label))) {
            const octets = labels.map(Number);
            blocked = blocked || (octets.every((octet) => octet <= 255) && isInternalIpv4(octets));
        }
    }
    if (blocked) {
        throw new Error("Cannot fetch internal/private URLs.");
    }
    return parsedUrl;
}
|
|
44
|
+
/**
 * Render the initial markdown for a new entity page: a frontmatter block
 * (title, summary, updated, empty tags, metadata.entity_type) followed by a
 * heading, a "Summary" section and an empty "Timeline" section.
 *
 * Line breaks in the title and in the frontmatter summary are collapsed to a
 * single space, because a raw newline there would start a new frontmatter
 * line (or close the block early with `---`).
 *
 * @param {object} fields
 * @param {string} fields.pageTitle - Page title; also used as the H1.
 * @param {string} fields.pageSummary - Summary for frontmatter and body.
 * @param {string} fields.entityType - Value written to metadata.entity_type.
 * @param {string} fields.updatedAt - Value written to `updated`.
 * @returns {string} The page content.
 */
export function createEntityPageContent({ pageTitle, pageSummary, entityType, updatedAt, }) {
    const toSingleLine = (value) => String(value).replace(/[\r\n]+/g, " ");
    const titleLine = toSingleLine(pageTitle);
    const summaryLine = toSingleLine(pageSummary);
    return `---\ntitle: ${titleLine}\nsummary: ${summaryLine}\nupdated: ${updatedAt}\ntags: []\nmetadata:\n entity_type: ${entityType}\n---\n\n# ${titleLine}\n\n## Summary\n\n${pageSummary}\n\n## Timeline\n`;
}
|
|
47
|
+
/**
 * Fetch a web page and reduce it to plain text plus a title.
 *
 * Redirects are followed manually (at most MAX_REDIRECTS hops) so that every
 * hop is re-validated with assertSafeRemoteUrl; letting fetch follow them
 * automatically would allow an external URL to bounce to an internal host.
 * Each request is bounded by a timeout.
 *
 * Text extraction prefers @mozilla/readability + jsdom when they can be
 * imported and produce an article; otherwise it falls back to stripHtml and
 * the document's <title>. The returned text is capped at 50,000 characters.
 *
 * @param {string} url - http(s) URL to fetch.
 * @returns {Promise<{ text: string, title: string }>}
 * @throws {Error} On unsafe URLs, too many redirects, timeouts, or non-2xx responses.
 */
async function parseUrl(url) {
    const MAX_REDIRECTS = 5;
    const FETCH_TIMEOUT_MS = 30_000;
    const MAX_TEXT_CHARS = 50_000;
    const parsedUrl = assertSafeRemoteUrl(url);
    let target = parsedUrl;
    let res;
    for (let hop = 0;; hop += 1) {
        res = await fetch(target, {
            headers: { "User-Agent": "Chapterhouse/1.0 PKB-Ingest" },
            redirect: "manual",
            signal: AbortSignal.timeout(FETCH_TIMEOUT_MS),
        });
        const location = res.status >= 300 && res.status < 400 ? res.headers.get("location") : null;
        if (!location)
            break;
        // Release the redirect response before issuing the next request.
        await res.body?.cancel();
        if (hop >= MAX_REDIRECTS) {
            throw new Error(`Too many redirects fetching ${url}`);
        }
        target = assertSafeRemoteUrl(new URL(location, target).href);
    }
    if (!res.ok)
        throw new Error(`HTTP ${res.status} ${res.statusText} fetching ${url}`);
    const html = await res.text();
    let text;
    let title = parsedUrl.hostname;
    // Try @mozilla/readability if available
    try {
        const { Readability } = await import("@mozilla/readability");
        const { JSDOM } = await import("jsdom");
        const dom = new JSDOM(html, { url: target.href });
        const article = new Readability(dom.window.document).parse();
        if (typeof article?.textContent === "string" && article.textContent.length > 0) {
            text = article.textContent;
            title = article.title || title;
        }
    }
    catch (err) {
        // Optional dependency missing or the document could not be parsed;
        // the plain-HTML fallback below handles it.
        log.debug({ err: err instanceof Error ? err.message : err }, "Readability extraction unavailable, using HTML fallback");
    }
    if (text === undefined) {
        text = stripHtml(html);
        // Try to extract title from <title> tag
        const titleMatch = html.match(/<title[^>]*>([^<]+)<\/title>/i);
        if (titleMatch)
            title = titleMatch[1].trim() || title;
    }
    if (text.length > MAX_TEXT_CHARS)
        text = text.slice(0, MAX_TEXT_CHARS);
    return { text, title };
}
|
|
81
|
+
/**
 * Crude HTML-to-text conversion used when no DOM parser is available.
 *
 * Applies, in order: drop <script> elements, drop <style> elements, turn every
 * remaining tag into a space, collapse whitespace runs of two or more into one
 * space. The result is trimmed and capped at 10,000 characters.
 *
 * @param {string} html - Raw markup.
 * @returns {string} Plain text.
 */
function stripHtml(html) {
    const passes = [
        [/<script[\s\S]*?<\/script>/gi, ""],
        [/<style[\s\S]*?<\/style>/gi, ""],
        [/<[^>]+>/g, " "],
        [/\s{2,}/g, " "],
    ];
    let plain = html;
    for (const [pattern, substitute] of passes) {
        plain = plain.replace(pattern, substitute);
    }
    const tidy = plain.trim();
    return tidy.slice(0, 10_000);
}
|
|
90
|
+
/**
 * Extract text from a PDF given either a local file path or an http(s) URL.
 *
 * The three failure modes are reported separately instead of all being
 * presented as a missing dependency: pdf-parse cannot be imported, the PDF
 * cannot be read or downloaded, or pdf-parse rejects the document.
 *
 * Remote PDFs are validated with assertSafeRemoteUrl and fetched with
 * redirects disabled, so a vetted URL cannot bounce to another host.
 *
 * @param {string} filePath - Local path or http(s) URL of the PDF.
 * @returns {Promise<{ text: string, title: string }>} Text (capped at 50,000
 *   characters) and a title derived from the file name without ".pdf".
 * @throws {Error} When pdf-parse is unavailable, the source cannot be read,
 *   or parsing fails.
 */
async function parsePdf(filePath) {
    let pdfParse;
    try {
        ({ default: pdfParse } = await import("pdf-parse"));
    }
    catch (err) {
        throw new Error("PDF ingestion requires pdf-parse: npm install pdf-parse", { cause: err });
    }
    const isRemote = /^https?:\/\//i.test(filePath);
    let buf;
    let fileName;
    if (isRemote) {
        const remoteUrl = assertSafeRemoteUrl(filePath);
        const res = await fetch(remoteUrl, {
            headers: { "User-Agent": "Chapterhouse/1.0 PKB-Ingest" },
            redirect: "error",
            signal: AbortSignal.timeout(30_000),
        });
        if (!res.ok)
            throw new Error(`HTTP ${res.status} ${res.statusText} fetching ${filePath}`);
        buf = Buffer.from(await res.arrayBuffer());
        // Use the path only, so a query string does not end up in the title.
        fileName = remoteUrl.pathname.replace(/.*\//, "");
    }
    else {
        buf = readFileSync(filePath);
        fileName = filePath.replace(/.*\//, "");
    }
    let data;
    try {
        data = await pdfParse(buf);
    }
    catch (err) {
        throw new Error(`Failed to parse PDF ${filePath}: ${err instanceof Error ? err.message : err}`, { cause: err });
    }
    return { text: (data.text ?? "").slice(0, 50_000), title: fileName.replace(/\.pdf$/i, "") };
}
|
|
101
|
+
/**
 * Shallow-clone a git repository into a temporary directory and summarise it
 * as text: the first README found (up to 10,000 characters), any of
 * package.json / Cargo.toml / go.mod (up to 2,000 characters each) and an
 * `ls -la` listing of the repository root.
 *
 * git and ls are started with execFile (argument vector, no shell), so
 * characters such as `$(...)` or backticks in `repoUrl` are never interpreted
 * by a shell. `--` stops a URL that begins with "-" from being read as an
 * option, and the `ext::` transport, which runs arbitrary commands, is
 * disabled. The clone goes into a unique directory under the OS temp dir,
 * which is removed afterwards on a best-effort basis.
 *
 * @param {string} repoUrl - Repository URL to clone.
 * @returns {Promise<{ text: string, title: string }>} Summary text and a title
 *   taken from the last path segment of the URL without ".git".
 * @throws {Error} When the clone or listing fails or exceeds its timeout.
 */
async function parseRepo(repoUrl) {
    // Imported locally so this function brings its own dependencies.
    const [{ execFile }, { mkdtempSync }, { tmpdir }] = await Promise.all([
        import("node:child_process"),
        import("node:fs"),
        import("node:os"),
    ]);
    const execFileAsync = promisify(execFile);
    const tmpDir = mkdtempSync(join(tmpdir(), "chapterhouse-repo-"));
    try {
        await execFileAsync("git", ["-c", "protocol.ext.allow=never", "clone", "--depth", "1", "--", repoUrl, tmpDir], {
            timeout: 60_000,
            // Fail instead of waiting on an interactive credential prompt.
            env: { ...process.env, GIT_TERMINAL_PROMPT: "0" },
        });
        const parts = [];
        // README
        for (const name of ["README.md", "README.rst", "README.txt", "README"]) {
            const p = join(tmpDir, name);
            if (existsSync(p)) {
                parts.push(`## README\n\n${readFileSync(p, "utf-8").slice(0, 10_000)}`);
                break;
            }
        }
        // package.json / Cargo.toml / go.mod
        for (const name of ["package.json", "Cargo.toml", "go.mod"]) {
            const p = join(tmpDir, name);
            if (existsSync(p)) {
                parts.push(`## ${name}\n\n\`\`\`\n${readFileSync(p, "utf-8").slice(0, 2_000)}\n\`\`\``);
            }
        }
        const { stdout } = await execFileAsync("ls", ["-la", tmpDir]);
        parts.push(`## Directory listing\n\n\`\`\`\n${stdout}\n\`\`\``);
        // Drop trailing slashes first so ".../repo/" still yields "repo".
        const title = repoUrl.replace(/\/+$/, "").replace(/.*\//, "").replace(/\.git$/, "") || repoUrl;
        return { text: parts.join("\n\n"), title };
    }
    finally {
        try {
            rmSync(tmpDir, { recursive: true, force: true });
        }
        catch { /* best-effort */ }
    }
}
|
|
134
|
+
// ---------------------------------------------------------------------------
|
|
135
|
+
// Entity extraction via LLM
|
|
136
|
+
// ---------------------------------------------------------------------------
|
|
137
|
+
/**
 * Ask the LLM for the entities and relationships mentioned in `text`.
 *
 * Returns empty lists, never throws, when no auth token is configured, when
 * the call fails or times out, or when the reply contains no parsable JSON
 * object. Only the first 8,000 characters of `text` are sent.
 *
 * @param {string} text - Content to analyse.
 * @param {string} [topic] - Optional hint appended to the prompt.
 * @returns {Promise<{ entities: Array<{ name: string, type: string, description?: string }>, relationships: object[] }>}
 *   Entities are limited to objects with string `name` and `type`;
 *   relationships to non-null objects.
 */
async function extractEntities(text, topic) {
    const emptyResult = () => ({ entities: [], relationships: [] });
    // Skip entity extraction if no auth token is configured
    const { config } = await import("../config.js");
    const token = config.copilotAuthToken || process.env.COPILOT_TOKEN || process.env.GITHUB_TOKEN;
    if (!token) {
        log.debug("No Copilot auth token configured, skipping entity extraction");
        return emptyResult();
    }
    const topicHint = topic ? ` Focus especially on entities related to: ${topic}.` : "";
    const systemPrompt = "Extract entities and relationships from this content. Return JSON only, no other text: " +
        `{ "entities": [{"name": string, "type": string, "description": string}], ` +
        `"relationships": [{"from": string, "to": string, "type": string}] }`;
    const userMessage = `${systemPrompt}${topicHint}\n\n---\n\n${text.slice(0, 8_000)}`;
    try {
        const { CopilotClient, approveAll } = await import("@github/copilot-sdk");
        // Use a one-shot client (autoRestart: false) so it doesn't keep the process alive
        const client = new CopilotClient({
            autoStart: true,
            autoRestart: false,
            gitHubToken: token,
        });
        await client.start();
        try {
            // NOTE(review): approveAll grants every permission request; with
            // `tools: []` this is presumably inert, but the prompt contains
            // untrusted content — confirm nothing can be approved here.
            const session = await client.createSession({
                model: "claude-haiku-4.5",
                tools: [],
                onPermissionRequest: approveAll,
            });
            try {
                const result = await session.sendAndWait({ prompt: userMessage }, 30_000);
                // NOTE(review): assumes a non-string result carries the reply
                // text in `data.content` — confirm against @github/copilot-sdk.
                // Stringifying the whole envelope would make the greedy match
                // below parse the envelope instead of the model's JSON.
                let rawText;
                if (typeof result === "string") {
                    rawText = result;
                }
                else if (typeof result?.data?.content === "string") {
                    rawText = result.data.content;
                }
                else {
                    // JSON.stringify(undefined) is undefined, e.g. when no reply arrived.
                    rawText = JSON.stringify(result) ?? "";
                }
                // Extract JSON from the response (may be wrapped in markdown code blocks)
                const jsonMatch = rawText.match(/\{[\s\S]*\}/);
                if (!jsonMatch)
                    return emptyResult();
                const parsed = JSON.parse(jsonMatch[0]);
                const isObject = (value) => value !== null && typeof value === "object";
                // Callers slugify `name` and lower-case `type`, so both must be strings.
                const entities = Array.isArray(parsed?.entities)
                    ? parsed.entities.filter((entity) => isObject(entity)
                        && typeof entity.name === "string"
                        && typeof entity.type === "string")
                    : [];
                const relationships = Array.isArray(parsed?.relationships)
                    ? parsed.relationships.filter(isObject)
                    : [];
                return { entities, relationships };
            }
            finally {
                try {
                    // Awaited so that an asynchronous teardown failure is caught here.
                    await session.destroy();
                }
                catch { /* best-effort */ }
            }
        }
        finally {
            try {
                await client.stop();
            }
            catch { /* best-effort */ }
        }
    }
    catch (err) {
        log.warn({ err: err instanceof Error ? err.message : err }, "Entity extraction LLM call failed, skipping");
        return emptyResult();
    }
}
|
|
197
|
+
// ---------------------------------------------------------------------------
|
|
198
|
+
// Slug helper
|
|
199
|
+
// ---------------------------------------------------------------------------
|
|
200
|
+
/**
 * Turn a display name into a path-safe slug.
 *
 * Lower-cases the name, replaces every run of characters outside [a-z0-9]
 * with a single "-", strips leading and trailing dashes, then keeps at most
 * the first 64 characters. An empty result becomes "unknown".
 *
 * @param {string} name - Display name.
 * @returns {string} Slug of 1 to 64 characters.
 */
function slugify(name) {
    const dashed = name.toLowerCase().replace(/[^a-z0-9]+/g, "-");
    const stripped = dashed.replace(/^-+|-+$/g, "");
    const capped = stripped.slice(0, 64);
    return capped === "" ? "unknown" : capped;
}
|
|
207
|
+
// ---------------------------------------------------------------------------
|
|
208
|
+
// Main ingest function
|
|
209
|
+
// ---------------------------------------------------------------------------
|
|
210
|
+
/**
 * Ingest a source into the wiki: parse it, archive the raw text, record it in
 * `wiki_sources`, extract entities, and create or update one page per entity.
 *
 * Ingestion is idempotent per (type, full source): a source whose id is
 * already recorded is not processed again and the stored page list is
 * returned with `already_existed: true`.
 *
 * The id hashes the complete `source`. For text, the stored `origin` is still
 * the first 200 characters, but hashing only that prefix made two different
 * documents with the same opening collide and the second be skipped. Ids of
 * texts of 200 characters or fewer, and of all non-text sources, are
 * unaffected.
 *
 * @param {string} source - URL, file path, repository URL, or raw text.
 * @param {string} type - "url", "pdf", "repo" or "text"; anything else is handled as text.
 * @param {string} [topic] - Hint for entity extraction; also the title of text sources.
 * @param {{ sessionId?: string, sessionName?: string }} [session] - Optional
 *   session metadata stored with the source record.
 * @returns {Promise<{ source_id: string, pages_created: string[], pages_updated: string[],
 *   entities: Array<{ name: string, type: string, path: string }>, already_existed: boolean }>}
 * @throws {Error} When parsing the source or writing a page fails.
 */
export async function ingestSource(source, type, topic, session) {
    ensureWikiStructure();
    const db = getDb();
    const origin = type === "text" ? source.slice(0, 200) : source;
    const sourceId = computeSourceId(type, source);
    // Idempotency check
    const existing = db.prepare(`SELECT id, pages_updated FROM wiki_sources WHERE id = ?`).get(sourceId);
    if (existing) {
        log.info({ sourceId, type, origin }, "Source already ingested, skipping");
        const pagesUpdated = JSON.parse(existing.pages_updated || "[]");
        return {
            source_id: sourceId,
            pages_created: [],
            pages_updated: pagesUpdated,
            entities: [],
            already_existed: true,
        };
    }
    // Parse content
    let parsedText;
    let title;
    switch (type) {
        case "url": {
            const r = await parseUrl(source);
            parsedText = r.text;
            title = r.title;
            break;
        }
        case "pdf": {
            const r = await parsePdf(source);
            parsedText = r.text;
            title = r.title;
            break;
        }
        case "repo": {
            const r = await parseRepo(source);
            parsedText = r.text;
            title = r.title;
            break;
        }
        case "text":
        default:
            parsedText = source;
            title = topic ?? `text-${sourceId.slice(0, 8)}`;
            break;
    }
    // Save raw source archive
    const rawFileName = `${sourceId.slice(0, 16)}.md`;
    writeRawSource(rawFileName, parsedText);
    // Persist to wiki_sources
    const ingestedAt = new Date().toISOString();
    db.prepare(`
INSERT INTO wiki_sources (id, source_type, origin, title, ingested_at, raw_path, parsed_content, pages_updated, session_id, session_name)
VALUES (?, ?, ?, ?, ?, ?, ?, '[]', ?, ?)
`).run(sourceId, type, origin, title, ingestedAt, `sources/${rawFileName}`, parsedText.slice(0, 100_000), session?.sessionId ?? null, session?.sessionName ?? null);
    // First candidate that passes assertPagePath: the type-specific location,
    // then the generic topics location. null when neither is acceptable.
    const resolvePagePath = (typeSegment, slug) => {
        const candidates = [
            `pages/${typeSegment}/${slug}/index.md`,
            `pages/topics/${slug}/index.md`,
        ];
        for (const candidate of candidates) {
            try {
                assertPagePath(candidate);
                return candidate;
            }
            catch { /* try the next candidate */ }
        }
        return null;
    };
    const pagesCreated = [];
    const pagesUpdated = [];
    const entitySummaries = [];
    const touched = new Set();
    try {
        // Extract entities
        const extraction = await extractEntities(parsedText, topic);
        for (const entity of extraction.entities) {
            // Both fields are used as strings below; skip malformed LLM output.
            if (typeof entity?.name !== "string" || typeof entity?.type !== "string")
                continue;
            if (!entity.name || !entity.type)
                continue;
            const entitySlug = slugify(entity.name);
            const typeLower = entity.type.toLowerCase().replace(/[^a-z0-9-]/g, "-");
            const safePagePath = resolvePagePath(typeLower, entitySlug);
            if (safePagePath === null)
                continue;
            const existed = readPage(safePagePath) !== undefined;
            const timelineEntry = `Source ingested: ${title}\n\n${entity.description || entity.name}`;
            if (!existed) {
                const pageTitle = entity.name;
                const pageSummary = (entity.description || entity.name).slice(0, 180).replace(/\n/g, " ");
                const pageContent = createEntityPageContent({
                    pageTitle,
                    pageSummary,
                    entityType: typeLower,
                    updatedAt: ingestedAt.slice(0, 10),
                });
                const { content: backfilled } = validateAndBackfillFrontmatter(safePagePath, pageContent);
                writePage(safePagePath, backfilled);
                pagesCreated.push(safePagePath);
            }
            else if (!touched.has(safePagePath)) {
                // A page created or already counted in this run is listed once.
                pagesUpdated.push(safePagePath);
            }
            touched.add(safePagePath);
            appendTimeline(safePagePath, timelineEntry);
            entitySummaries.push({ name: entity.name, type: entity.type, path: safePagePath });
        }
    }
    finally {
        // Runs even when a page write throws, so the source record (which
        // blocks re-ingestion) lists the pages that were actually touched.
        const allPages = [...new Set([...pagesCreated, ...pagesUpdated])];
        db.prepare(`UPDATE wiki_sources SET pages_updated = ? WHERE id = ?`).run(JSON.stringify(allPages), sourceId);
    }
    return {
        source_id: sourceId,
        pages_created: pagesCreated,
        pages_updated: pagesUpdated,
        entities: entitySummaries,
        already_existed: false,
    };
}
|
|
331
|
+
// ---------------------------------------------------------------------------
|
|
332
|
+
// Type auto-detection
|
|
333
|
+
// ---------------------------------------------------------------------------
|
|
334
|
+
/**
 * Guess how a source string should be ingested.
 *
 * For http(s) URLs (scheme matched case-insensitively) the decision is made
 * on the parsed URL, not on substrings of the whole string:
 *  - path ending in ".pdf" (any case)                     -> "pdf"
 *  - path ending in ".git"                                -> "repo"
 *  - github.com with exactly an owner and a repo segment  -> "repo"
 *  - gitlab.com with a group/project path and no "-" page segment -> "repo"
 *  - anything else, including URLs that fail to parse     -> "url"
 * Other input is "pdf" when it ends in ".pdf" and contains no space, and
 * "text" otherwise.
 *
 * @param {string} source - Raw user input.
 * @returns {"url" | "pdf" | "repo" | "text"}
 */
export function detectSourceType(source) {
    const trimmed = source.trim();
    if (/^https?:\/\//i.test(trimmed)) {
        let parsed;
        try {
            parsed = new URL(trimmed);
        }
        catch {
            return "url";
        }
        const path = parsed.pathname.toLowerCase();
        if (path.endsWith(".pdf"))
            return "pdf";
        if (path.endsWith(".git"))
            return "repo";
        const host = parsed.hostname.toLowerCase().replace(/^www\./, "");
        const segments = parsed.pathname.split("/").filter(Boolean);
        // Issue, blob and tree pages live deeper than owner/repo and cannot be cloned.
        if (host === "github.com" && segments.length === 2)
            return "repo";
        // GitLab allows nested groups; "/-/" marks a page inside a project.
        if (host === "gitlab.com" && segments.length >= 2 && !segments.includes("-"))
            return "repo";
        return "url";
    }
    if (trimmed.toLowerCase().endsWith(".pdf") && !trimmed.includes(" "))
        return "pdf";
    return "text";
}
|
|
347
|
+
//# sourceMappingURL=ingest.js.map
|
|
@@ -0,0 +1,111 @@
|
|
|
1
|
+
// ---------------------------------------------------------------------------
|
|
2
|
+
// Ingestion pipeline tests — ingestSource
|
|
3
|
+
// Sandbox: single CHAPTERHOUSE_HOME per file to avoid module-singleton confusion
|
|
4
|
+
// ---------------------------------------------------------------------------
|
|
5
|
+
import assert from "node:assert/strict";
|
|
6
|
+
import { mkdirSync, mkdtempSync, rmSync } from "node:fs";
|
|
7
|
+
import { join } from "node:path";
|
|
8
|
+
import test from "node:test";
|
|
9
|
+
// Single sandbox shared across all tests in this file
|
|
10
|
+
let SANDBOX;
|
|
11
|
+
let mods;
|
|
12
|
+
test.before(async () => {
|
|
13
|
+
mkdirSync(join(process.cwd(), ".test-work"), { recursive: true });
|
|
14
|
+
SANDBOX = mkdtempSync(join(process.cwd(), ".test-work", "ingest-"));
|
|
15
|
+
process.env.CHAPTERHOUSE_HOME = SANDBOX;
|
|
16
|
+
const nonce = `${Date.now()}-${Math.random()}`;
|
|
17
|
+
const ingestMod = await import(new URL(`./ingest.js?c=${nonce}`, import.meta.url).href);
|
|
18
|
+
const wikiFs = await import(new URL(`./fs.js?c=${nonce}`, import.meta.url).href);
|
|
19
|
+
const dbMod = await import(new URL(`../store/db.js?c=${nonce}`, import.meta.url).href);
|
|
20
|
+
mods = { ingestMod, wikiFs, dbMod };
|
|
21
|
+
mods.wikiFs.ensureWikiStructure();
|
|
22
|
+
});
|
|
23
|
+
test.after(() => {
|
|
24
|
+
try {
|
|
25
|
+
rmSync(SANDBOX, { recursive: true, force: true });
|
|
26
|
+
}
|
|
27
|
+
catch { /* best-effort */ }
|
|
28
|
+
});
|
|
29
|
+
test("ingestSource(text) creates wiki_sources record", async () => {
|
|
30
|
+
const text = `Source-A: Alice is a senior engineer at Acme Corp. timestamp=${Date.now()}`;
|
|
31
|
+
const result = await mods.ingestMod.ingestSource(text, "text", "people");
|
|
32
|
+
assert.ok(result.source_id, "Should return a source_id");
|
|
33
|
+
assert.equal(result.already_existed, false, "Should not already exist on first call");
|
|
34
|
+
const db = mods.dbMod.getDb();
|
|
35
|
+
const row = db.prepare(`SELECT * FROM wiki_sources WHERE id = ?`).get(result.source_id);
|
|
36
|
+
assert.ok(row, "Should be persisted in wiki_sources");
|
|
37
|
+
assert.equal(row.source_type, "text");
|
|
38
|
+
});
|
|
39
|
+
test("ingestSource(text) saves raw source file", async () => {
|
|
40
|
+
const text = `Source-B: Bob leads the platform engineering team at TechCo. timestamp=${Date.now()}`;
|
|
41
|
+
const result = await mods.ingestMod.ingestSource(text, "text");
|
|
42
|
+
const sources = mods.wikiFs.listSources();
|
|
43
|
+
assert.ok(sources.length > 0, "Should have saved a raw source file");
|
|
44
|
+
assert.ok(sources.some((s) => s.startsWith(result.source_id.slice(0, 16))), "Source file should be named with source_id prefix");
|
|
45
|
+
});
|
|
46
|
+
test("ingestSource duplicate ingestion returns already_existed=true", async () => {
|
|
47
|
+
const text = `Source-C: Carol is a product manager at StartupXYZ. timestamp=${Date.now()}`;
|
|
48
|
+
const first = await mods.ingestMod.ingestSource(text, "text");
|
|
49
|
+
assert.equal(first.already_existed, false, "First call should not be a duplicate");
|
|
50
|
+
const second = await mods.ingestMod.ingestSource(text, "text");
|
|
51
|
+
assert.equal(second.already_existed, true, "Second ingestion should be idempotent");
|
|
52
|
+
assert.equal(second.source_id, first.source_id, "Should return same source_id");
|
|
53
|
+
const db = mods.dbMod.getDb();
|
|
54
|
+
const count = db.prepare(`SELECT COUNT(*) as c FROM wiki_sources WHERE id = ?`).get(first.source_id).c;
|
|
55
|
+
assert.equal(count, 1, "Should only have one row in wiki_sources");
|
|
56
|
+
});
|
|
57
|
+
test("computeSourceId is deterministic and type-scoped", () => {
|
|
58
|
+
const id1 = mods.ingestMod.computeSourceId("text", "hello world");
|
|
59
|
+
const id2 = mods.ingestMod.computeSourceId("text", "hello world");
|
|
60
|
+
assert.equal(id1, id2, "Same input should give same id");
|
|
61
|
+
const id3 = mods.ingestMod.computeSourceId("url", "hello world");
|
|
62
|
+
assert.notEqual(id1, id3, "Different types should give different ids");
|
|
63
|
+
});
|
|
64
|
+
test("detectSourceType identifies URLs, repos, and text", () => {
|
|
65
|
+
assert.equal(mods.ingestMod.detectSourceType("https://tokio.rs"), "url");
|
|
66
|
+
assert.equal(mods.ingestMod.detectSourceType("http://example.com"), "url");
|
|
67
|
+
assert.equal(mods.ingestMod.detectSourceType("https://github.com/user/repo"), "repo");
|
|
68
|
+
assert.equal(mods.ingestMod.detectSourceType("some plain text content"), "text");
|
|
69
|
+
assert.equal(mods.ingestMod.detectSourceType(""), "text");
|
|
70
|
+
});
|
|
71
|
+
test("assertSafeRemoteUrl blocks all RFC 1918 private ranges", () => {
|
|
72
|
+
assert.throws(() => mods.ingestMod.assertSafeRemoteUrl("http://10.1.2.3"), /Cannot fetch internal\/private URLs\./);
|
|
73
|
+
assert.throws(() => mods.ingestMod.assertSafeRemoteUrl("http://172.16.5.4"), /Cannot fetch internal\/private URLs\./);
|
|
74
|
+
assert.throws(() => mods.ingestMod.assertSafeRemoteUrl("http://172.31.255.255"), /Cannot fetch internal\/private URLs\./);
|
|
75
|
+
assert.throws(() => mods.ingestMod.assertSafeRemoteUrl("http://192.168.1.9"), /Cannot fetch internal\/private URLs\./);
|
|
76
|
+
});
|
|
77
|
+
test("createEntityPageContent uses the Summary and Timeline headings", () => {
|
|
78
|
+
const content = mods.ingestMod.createEntityPageContent({
|
|
79
|
+
pageTitle: "Alice Example",
|
|
80
|
+
pageSummary: "Senior engineer at Acme.",
|
|
81
|
+
entityType: "people",
|
|
82
|
+
updatedAt: "2026-05-15",
|
|
83
|
+
});
|
|
84
|
+
assert.match(content, /## Summary/);
|
|
85
|
+
assert.match(content, /## Timeline/);
|
|
86
|
+
assert.doesNotMatch(content, /## Compiled Truth/);
|
|
87
|
+
});
|
|
88
|
+
test("ingestSource stores optional research session metadata in wiki_sources", async () => {
|
|
89
|
+
const text = `Source-D: Research session metadata should persist. timestamp=${Date.now()}`;
|
|
90
|
+
const result = await mods.ingestMod.ingestSource(text, "text", "topics", {
|
|
91
|
+
sessionId: "compiler-research",
|
|
92
|
+
sessionName: "Compiler research",
|
|
93
|
+
});
|
|
94
|
+
const db = mods.dbMod.getDb();
|
|
95
|
+
const row = db.prepare(`SELECT session_id, session_name FROM wiki_sources WHERE id = ?`).get(result.source_id);
|
|
96
|
+
assert.deepEqual(row, {
|
|
97
|
+
session_id: "compiler-research",
|
|
98
|
+
session_name: "Compiler research",
|
|
99
|
+
});
|
|
100
|
+
});
|
|
101
|
+
// URL/PDF/repo tests are skipped if connectivity or dependencies are unavailable
|
|
102
|
+
test.skip("ingestSource(url) fetches and parses content — requires network", async () => {
|
|
103
|
+
// Integration test: run manually with network access
|
|
104
|
+
});
|
|
105
|
+
test.skip("ingestSource(pdf) parses PDF — requires pdf-parse", async () => {
|
|
106
|
+
// Integration test: run manually with pdf-parse installed
|
|
107
|
+
});
|
|
108
|
+
test.skip("ingestSource(repo) clones and summarises repo — requires network + git", async () => {
|
|
109
|
+
// Integration test: run manually
|
|
110
|
+
});
|
|
111
|
+
//# sourceMappingURL=ingest.test.js.map
|
|
@@ -0,0 +1,151 @@
|
|
|
1
|
+
// ---------------------------------------------------------------------------
|
|
2
|
+
// Wiki entity graph — link extraction and graph traversal
|
|
3
|
+
// ---------------------------------------------------------------------------
|
|
4
|
+
import { getDb } from "../store/db.js";
|
|
5
|
+
import { readPage, pageExists } from "./fs.js";
|
|
6
|
+
import { parseWikiFrontmatter } from "./frontmatter.js";
|
|
7
|
+
import { normalizeWikiPath } from "./path-utils.js";
|
|
8
|
+
// Relationship phrases recognised in page bodies, paired with the link type
// they produce. Every phrase is compiled into a global, case-insensitive
// pattern that captures the text after it: one alphanumeric character followed
// by 1-60 characters that are not a newline or one of . , ; : ! ?
const RELATIONSHIP_PATTERNS = [
    [String.raw`implements`, "implements"],
    [String.raw`supersedes`, "supersedes"],
    [String.raw`member\s+of`, "member_of"],
    [String.raw`works?\s+at`, "member_of"],
    [String.raw`works\s+on`, "works_on"],
    [String.raw`decided\s+by`, "decided_by"],
    [String.raw`depends\s+on`, "depends_on"],
].map(([phrase, linkType]) => ({
    regex: new RegExp(String.raw`\b${phrase}\s+([A-Za-z0-9][^\n.,;:!?]{1,60})`, "gi"),
    linkType,
}));
|
|
17
|
+
/**
 * Convert a display name to a slug: lower-case, trim, turn each whitespace
 * run into "-", then delete every character outside [a-z0-9-].
 *
 * @param {string} name
 * @returns {string} Slug; may be empty.
 */
function nameToSlug(name) {
    const lowered = name.toLowerCase().trim();
    const dashed = lowered.replace(/\s+/g, "-");
    return dashed.replace(/[^a-z0-9-]/g, "");
}
/**
 * Page path targeted by a `[[Name]]` wiki link.
 *
 * @param {string} name - Text between the double brackets.
 * @returns {string} `pages/<slug>/index.md`
 */
function wikiLinkToPath(name) {
    const slug = nameToSlug(name);
    return ["pages", slug, "index.md"].join("/");
}
/**
 * Topic page path that corresponds to a frontmatter tag.
 *
 * @param {string} tag
 * @returns {string} `pages/topics/<slug>/index.md`
 */
function tagToTopicPath(tag) {
    const slug = nameToSlug(tag);
    return ["pages", "topics", slug, "index.md"].join("/");
}
|
|
30
|
+
/**
 * Extract typed links from a page. Returns a deduplicated WikiLink array.
 *
 * Links come from four places, in this order:
 *  1. `[[Page Name]]` wiki links in the body ("references"). Only the part
 *     before a `|` label or `#` anchor is used as the target name.
 *  2. the frontmatter `related` array ("references").
 *  3. frontmatter `tags` whose topic page exists on disk ("references").
 *  4. relationship phrases from RELATIONSHIP_PATTERNS (their own link type).
 * Self-links, targets that normalise to nothing, and repeated
 * (target, type) pairs are dropped.
 *
 * @param {string} pagePath - Wiki path of the page to scan.
 * @returns {Array<{ from_page: string, to_page: string, link_type: string, extracted_at: string }>}
 *   Empty when the page has no readable content.
 */
export function extractLinks(pagePath) {
    const normalizedPath = normalizeWikiPath(pagePath);
    const content = readPage(normalizedPath);
    if (!content)
        return [];
    const { parsed: fm, body } = parseWikiFrontmatter(content);
    const links = [];
    const seen = new Set();
    const extractedAt = new Date().toISOString();
    function addLink(toPage, linkType) {
        const normalized = normalizeWikiPath(toPage);
        if (!normalized || normalized === normalizedPath)
            return;
        const key = `${normalized}:${linkType}`;
        if (seen.has(key))
            return;
        seen.add(key);
        links.push({ from_page: normalizedPath, to_page: normalized, link_type: linkType, extracted_at: extractedAt });
    }
    // A scalar in a list-valued frontmatter field would otherwise be iterated
    // character by character.
    const asList = (value) => (Array.isArray(value) ? value : []);
    // 1. [[Page Name]] wiki links
    for (const match of body.matchAll(/\[\[([^\]]+)\]\]/g)) {
        const targetName = match[1].split("|")[0].split("#")[0].trim();
        // Names with no slug characters would produce "pages//index.md".
        if (nameToSlug(targetName)) {
            addLink(wikiLinkToPath(targetName), "references");
        }
    }
    // 2. Frontmatter `related` array
    for (const rel of asList(fm?.related)) {
        if (typeof rel === "string" && rel.trim()) {
            addLink(normalizeWikiPath(rel.trim()), "references");
        }
    }
    // 3. Frontmatter `tags` → topic pages (only if target page exists on disk)
    for (const tag of asList(fm?.tags)) {
        if (typeof tag === "string" && tag.trim()) {
            const target = tagToTopicPath(tag.trim());
            if (pageExists(target)) {
                addLink(target, "references");
            }
        }
    }
    // 4. Relationship statements in body text
    for (const { regex, linkType } of RELATIONSHIP_PATTERNS) {
        // matchAll works on a copy of the pattern, so the shared module-level
        // regex keeps its lastIndex untouched.
        for (const match of body.matchAll(regex)) {
            const rawTarget = nameToSlug(match[1].trim());
            if (rawTarget) {
                addLink(`pages/${rawTarget}/index.md`, linkType);
            }
        }
    }
    return links;
}
|
|
84
|
+
/**
 * Re-extract links for a page and sync them to the wiki_links table.
 *
 * Counts the rows currently stored for the page, then, inside one
 * transaction, deletes them and inserts the freshly extracted links.
 *
 * @param {string} pagePath - Wiki path of the page.
 * @returns {{ added: number, removed: number }} `added` is the number of
 *   extracted links; `removed` is the number of rows stored before the sync.
 */
export function updateLinks(pagePath) {
    const fromPage = normalizeWikiPath(pagePath);
    const db = getDb();
    const { c: removed } = db.prepare(`SELECT COUNT(*) as c FROM wiki_links WHERE from_page = ?`).get(fromPage);
    const freshLinks = extractLinks(fromPage);
    const replaceAll = db.transaction(() => {
        db.prepare(`DELETE FROM wiki_links WHERE from_page = ?`).run(fromPage);
        const insertStmt = db.prepare(`
INSERT OR IGNORE INTO wiki_links (from_page, to_page, link_type, extracted_at)
VALUES (?, ?, ?, ?)
`);
        freshLinks.forEach(({ from_page, to_page, link_type, extracted_at }) => {
            insertStmt.run(from_page, to_page, link_type, extracted_at);
        });
    });
    replaceAll();
    return { added: freshLinks.length, removed };
}
|
|
103
|
+
/**
 * Walk the entity graph breadth-first from a starting page.
 *
 * Both directions are followed: for every visited page its outbound links are
 * examined first, then its inbound links. A page is reported once, with the
 * link through which it was first reached. Depth is clamped to 1..3
 * (default 1).
 *
 * @param {string} pagePath - Starting page.
 * @param {string} [linkType] - When truthy, only links of this type are followed.
 * @param {number} [depth=1] - Maximum number of hops.
 * @returns {Array<{ page: string, link_type: string, direction: "outbound" | "inbound", depth: number }>}
 *   Flat list sorted by depth, then page.
 */
export function traverse(pagePath, linkType, depth = 1) {
    const MAX_DEPTH = 3;
    const effectiveDepth = Math.min(Math.max(depth, 1), MAX_DEPTH);
    const startPage = normalizeWikiPath(pagePath);
    const db = getDb();
    // The statements do not depend on the page being visited; prepare them once.
    const outboundStmt = linkType
        ? db.prepare(`SELECT to_page, link_type FROM wiki_links WHERE from_page = ? AND link_type = ?`)
        : db.prepare(`SELECT to_page, link_type FROM wiki_links WHERE from_page = ?`);
    const inboundStmt = linkType
        ? db.prepare(`SELECT from_page, link_type FROM wiki_links WHERE to_page = ? AND link_type = ?`)
        : db.prepare(`SELECT from_page, link_type FROM wiki_links WHERE to_page = ?`);
    const bind = (page) => (linkType ? [page, linkType] : [page]);
    const found = [];
    const seen = new Set([startPage]);
    const frontier = [{ page: startPage, depth: 0 }];
    // Record a newly reached page and schedule it when further hops remain.
    const visit = (page, link_type, direction, reachedAt) => {
        if (seen.has(page))
            return;
        seen.add(page);
        found.push({ page, link_type, direction, depth: reachedAt });
        if (reachedAt < effectiveDepth) {
            frontier.push({ page, depth: reachedAt });
        }
    };
    // The frontier only grows, so a cursor visits entries in FIFO order.
    for (let cursor = 0; cursor < frontier.length; cursor += 1) {
        const { page: currentPage, depth: currentDepth } = frontier[cursor];
        if (currentDepth >= effectiveDepth)
            continue;
        const nextDepth = currentDepth + 1;
        for (const row of outboundStmt.all(...bind(currentPage))) {
            visit(row.to_page, row.link_type, "outbound", nextDepth);
        }
        for (const row of inboundStmt.all(...bind(currentPage))) {
            visit(row.from_page, row.link_type, "inbound", nextDepth);
        }
    }
    found.sort((left, right) => (left.depth !== right.depth
        ? left.depth - right.depth
        : left.page.localeCompare(right.page)));
    return found;
}
|
|
151
|
+
//# sourceMappingURL=links.js.map
|