chapterhouse 0.6.0 → 0.8.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/agents/korg.agent.md +65 -0
- package/dist/api/agent-edit-access.js +11 -0
- package/dist/api/agents.api.test.js +48 -0
- package/dist/api/korg.js +34 -0
- package/dist/api/korg.test.js +42 -0
- package/dist/api/server.js +420 -13
- package/dist/api/server.test.js +533 -3
- package/dist/config.js +28 -0
- package/dist/config.test.js +20 -0
- package/dist/copilot/agent-event-bus.js +1 -0
- package/dist/copilot/agents.js +117 -50
- package/dist/copilot/agents.mcp-servers.test.js +87 -0
- package/dist/copilot/agents.parse.test.js +69 -0
- package/dist/copilot/agents.test.js +137 -2
- package/dist/copilot/orchestrator.js +62 -13
- package/dist/copilot/orchestrator.test.js +130 -8
- package/dist/copilot/session-manager.js +34 -0
- package/dist/copilot/system-message.js +11 -10
- package/dist/copilot/system-message.test.js +6 -1
- package/dist/copilot/tools.js +184 -376
- package/dist/copilot/tools.memory.test.js +32 -0
- package/dist/copilot/tools.wiki.test.js +53 -59
- package/dist/daemon.js +9 -0
- package/dist/memory/decisions.js +6 -5
- package/dist/memory/entities.js +20 -9
- package/dist/memory/hooks.js +151 -0
- package/dist/memory/hooks.test.js +325 -0
- package/dist/memory/hot-tier.js +37 -0
- package/dist/memory/hot-tier.test.js +30 -0
- package/dist/memory/housekeeping-scheduler.js +35 -0
- package/dist/memory/housekeeping-scheduler.test.js +50 -0
- package/dist/memory/inbox.js +10 -0
- package/dist/memory/index.js +3 -1
- package/dist/memory/migration.js +244 -0
- package/dist/memory/migration.test.js +100 -0
- package/dist/memory/reflect.js +273 -0
- package/dist/memory/reflect.test.js +254 -0
- package/dist/store/db.js +119 -4
- package/dist/store/db.test.js +19 -1
- package/dist/test/setup-env.js +3 -1
- package/dist/test/setup-env.test.js +8 -1
- package/dist/wiki/consolidation.js +641 -0
- package/dist/wiki/consolidation.test.js +140 -0
- package/dist/wiki/frontmatter.js +48 -0
- package/dist/wiki/frontmatter.test.js +42 -0
- package/dist/wiki/index-manager.js +246 -330
- package/dist/wiki/index-manager.test.js +138 -145
- package/dist/wiki/ingest.js +347 -0
- package/dist/wiki/ingest.test.js +111 -0
- package/dist/wiki/links.js +151 -0
- package/dist/wiki/links.test.js +176 -0
- package/dist/wiki/migrate-topics.test.js +16 -6
- package/dist/wiki/scheduler.js +118 -0
- package/dist/wiki/scheduler.test.js +64 -0
- package/dist/wiki/timeline.js +51 -0
- package/dist/wiki/timeline.test.js +65 -0
- package/dist/wiki/topic-structure.js +1 -1
- package/package.json +3 -1
- package/skills/pkb-ideas/SKILL.md +78 -0
- package/skills/pkb-ideas/_meta.json +4 -0
- package/skills/pkb-org/SKILL.md +82 -0
- package/skills/pkb-org/_meta.json +4 -0
- package/skills/pkb-people/SKILL.md +74 -0
- package/skills/pkb-people/_meta.json +4 -0
- package/skills/pkb-research/SKILL.md +83 -0
- package/skills/pkb-research/_meta.json +4 -0
- package/skills/pkb-source/SKILL.md +38 -0
- package/skills/pkb-source/_meta.json +4 -0
- package/skills/wiki-conventions/SKILL.md +5 -5
- package/web/dist/assets/index-5kz9aRU9.css +10 -0
- package/web/dist/assets/{index-B5oDsQ5y.js → index-BbX9RKf3.js} +101 -99
- package/web/dist/assets/index-BbX9RKf3.js.map +1 -0
- package/web/dist/index.html +2 -2
- package/dist/wiki/context.js +0 -138
- package/dist/wiki/fix.js +0 -335
- package/dist/wiki/fix.test.js +0 -350
- package/dist/wiki/lint.js +0 -451
- package/dist/wiki/lint.test.js +0 -329
- package/web/dist/assets/index-B5oDsQ5y.js.map +0 -1
- package/web/dist/assets/index-DknKAtDS.css +0 -10
|
@@ -1,160 +1,153 @@
|
|
|
1
1
|
import assert from "node:assert/strict";
|
|
2
|
-
import { mkdirSync, rmSync } from "node:fs";
|
|
2
|
+
import { mkdirSync, mkdtempSync, rmSync } from "node:fs";
|
|
3
3
|
import { join } from "node:path";
|
|
4
4
|
import test from "node:test";
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
process.
|
|
8
|
-
|
|
5
|
+
// Sandbox: every test gets a fresh CHAPTERHOUSE_HOME
|
|
6
|
+
/**
 * Create a fresh sandbox directory for one test and point CHAPTERHOUSE_HOME at it.
 *
 * The parent `.test-work` directory is created on demand: mkdtempSync throws
 * ENOENT when the parent of the prefix does not exist, so the helper must not
 * rely on a hook having created it first.
 *
 * @returns {string} Path of the newly created sandbox directory.
 */
function makeSandbox() {
    const workRoot = join(process.cwd(), ".test-work");
    mkdirSync(workRoot, { recursive: true });
    const dir = mkdtempSync(join(workRoot, "wiki-idx-"));
    // Side effect: later module loads read this variable from the process environment.
    process.env.CHAPTERHOUSE_HOME = dir;
    return dir;
}
|
|
11
|
+
/**
 * Import the index manager and wiki fs modules under a unique specifier.
 *
 * Both imports share one cache-busting query value, so every call yields
 * module instances distinct from those of earlier calls.
 *
 * @param {string} sandbox - Sandbox directory of the calling test (not read here).
 * @returns {Promise<{indexManager: object, wikiFs: object}>} The imported module namespaces.
 */
async function loadModules(sandbox) {
    const cacheBuster = `${Date.now()}-${Math.random()}`;
    const freshImport = (file) => import(new URL(`${file}?c=${cacheBuster}`, import.meta.url).href);
    // Sequential on purpose: index-manager is loaded before fs, as before.
    const indexManager = await freshImport("./index-manager.js");
    const wikiFs = await freshImport("./fs.js");
    return { indexManager, wikiFs };
}
|
|
14
|
-
|
|
15
|
-
mkdirSync(join(
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
17
|
+
// Make sure the shared parent directory for all sandboxes exists before any test runs.
test.before(function ensureWorkRoot() {
    const workRoot = join(process.cwd(), ".test-work");
    mkdirSync(workRoot, { recursive: true });
});
|
|
20
|
+
// A multi-term query must return the page whose indexed fields contain those terms.
test("wikiSearch returns FTS5 results for matching query", async () => {
    const sandbox = makeSandbox();
    try {
        const { indexManager, wikiFs } = await loadModules(sandbox);
        const fixtures = [
            ["pages/topics/rust/index.md", "---\ntitle: Rust Programming\nsummary: Systems programming with async support\ntags: [rust, async]\nupdated: 2026-05-01\n---\n\n# Rust\n\nSystems language.\n"],
            ["pages/topics/typescript/index.md", "---\ntitle: TypeScript\nsummary: Typed JavaScript for large projects\ntags: [ts, web]\nupdated: 2026-05-02\n---\n\n# TypeScript\n\nJS with types.\n"],
        ];
        for (const [pagePath, markdown] of fixtures) {
            wikiFs.writePage(pagePath, markdown);
        }
        indexManager.rebuildWikiIndex();
        const hits = indexManager.wikiSearch("rust async");
        const isRustPage = (hit) => hit.path === "pages/topics/rust/index.md";
        assert.ok(hits.length > 0, "Should return results for 'rust async'");
        assert.ok(hits.some(isRustPage), "Should include rust page");
    }
    finally {
        rmSync(sandbox, { recursive: true, force: true });
    }
});
|
|
21
|
-
test
|
|
22
|
-
|
|
35
|
+
// An empty query lists pages ordered by their `updated` date, newest first.
test("wikiSearch empty query returns most recently updated pages", async () => {
    const sandbox = makeSandbox();
    try {
        const { indexManager, wikiFs } = await loadModules(sandbox);
        wikiFs.writePage("pages/topics/alpha/index.md", "---\ntitle: Alpha\nsummary: First topic\nupdated: 2026-01-01\n---\n\n# Alpha\n");
        wikiFs.writePage("pages/topics/beta/index.md", "---\ntitle: Beta\nsummary: Second topic\nupdated: 2026-05-14\n---\n\n# Beta\n");
        indexManager.rebuildWikiIndex();
        const results = indexManager.wikiSearch("", 10);
        assert.ok(results.length >= 2, "Should return pages for empty query");
        const betaIdx = results.findIndex((r) => r.path === "pages/topics/beta/index.md");
        const alphaIdx = results.findIndex((r) => r.path === "pages/topics/alpha/index.md");
        // findIndex yields -1 for a missing page, which would satisfy the ordering
        // check below vacuously, so presence is asserted first.
        assert.notEqual(betaIdx, -1, "Beta page should be returned");
        assert.notEqual(alphaIdx, -1, "Alpha page should be returned");
        // Most recent first
        assert.ok(betaIdx < alphaIdx, "More recently updated page should come first");
    }
    finally {
        rmSync(sandbox, { recursive: true, force: true });
    }
});
|
|
24
|
-
test("
|
|
25
|
-
const
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
summary: "Shared priorities",
|
|
40
|
-
section: "Projects",
|
|
41
|
-
tags: undefined,
|
|
42
|
-
updated: undefined,
|
|
43
|
-
},
|
|
44
|
-
]);
|
|
53
|
+
// Pages written to disk must show up in the index after a rebuild.
test("rebuildWikiIndex populates wiki_pages from filesystem", async () => {
    const sandbox = makeSandbox();
    try {
        const { indexManager, wikiFs } = await loadModules(sandbox);
        const indexPage = "pages/projects/chapterhouse/index.md";
        const decisionsPage = "pages/projects/chapterhouse/decisions.md";
        wikiFs.writePage(indexPage, "---\ntitle: Chapterhouse\nsummary: AI orchestrator\ntags: [ai, orchestration]\nupdated: 2026-05-10\n---\n\n# Chapterhouse\n");
        wikiFs.writePage(decisionsPage, "---\ntitle: Decisions\nsummary: Architectural decisions\nupdated: 2026-05-09\n---\n\n# Decisions\n");
        indexManager.rebuildWikiIndex();
        const indexedPaths = new Set(indexManager.parseIndex().map((entry) => entry.path));
        assert.ok(indexedPaths.has(indexPage), "Should include index.md");
        assert.ok(indexedPaths.has(decisionsPage), "Should include decisions.md");
    }
    finally {
        rmSync(sandbox, { recursive: true, force: true });
    }
});
|
|
46
|
-
test("
|
|
47
|
-
const
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
title
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
updated
|
|
57
|
-
}
|
|
69
|
+
// Upserting the same path twice must leave a single, searchable row.
test("upsertWikiPage inserts and updates correctly", async () => {
    const sandbox = makeSandbox();
    try {
        const { indexManager } = await loadModules(sandbox);
        const adaPath = "pages/people/ada/index.md";
        indexManager.upsertWikiPage(adaPath, { title: "Ada Lovelace", summary: "Mathematician", tags: ["math"], updated: "2026-05-01", metadata: {} }, "First programmer");
        const results = indexManager.wikiSearch("Ada");
        assert.ok(results.some((r) => r.title === "Ada Lovelace"), "Should find Ada");
        // Update
        indexManager.upsertWikiPage(adaPath, { title: "Ada Lovelace", summary: "Mathematician and programmer", tags: ["math", "history"], updated: "2026-05-02", metadata: {} }, "First programmer and mathematician");
        const updated = indexManager.wikiSearch("programmer");
        assert.ok(updated.length > 0, "Should find updated page");
        // "programmer" already matched the first insert, so the check above cannot tell
        // an update from a no-op; additionally require that no duplicate row was added.
        const adaRows = updated.filter((r) => r.path === adaPath);
        assert.equal(adaRows.length, 1, "Upsert should replace the existing row, not add a duplicate");
    }
    finally {
        rmSync(sandbox, { recursive: true, force: true });
    }
});
|
|
59
|
-
test("
|
|
60
|
-
const
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
summary: "Shared direction for the team.",
|
|
78
|
-
section: "Knowledge",
|
|
79
|
-
tags: undefined,
|
|
80
|
-
updated: undefined,
|
|
81
|
-
},
|
|
82
|
-
]);
|
|
83
|
-
assert.match(wikiFs.readIndexFile(), /\[Vision\]\(pages\/team\/vision\.md\)/);
|
|
85
|
+
// Smoke-level latency check for a search over a small index.
test("FTS search returns results under 50ms", async () => {
    const sandbox = makeSandbox();
    try {
        const { indexManager, wikiFs } = await loadModules(sandbox);
        // Populate with 20 pages
        for (let i = 0; i < 20; i++) {
            wikiFs.writePage(`pages/topics/topic-${i}/index.md`, `---\ntitle: Topic ${i}\nsummary: Description for topic ${i} covering various subjects\ntags: [topic${i}]\nupdated: 2026-05-01\n---\n\n# Topic ${i}\n\nContent.\n`);
        }
        indexManager.rebuildWikiIndex();
        // performance.now() is monotonic and sub-millisecond; Date.now() can jump with
        // wall-clock adjustments and only has millisecond resolution.
        const startedAt = performance.now();
        const results = indexManager.wikiSearch("topic description");
        const elapsedMs = performance.now() - startedAt;
        assert.ok(results.length > 0, "Should return results");
        assert.ok(elapsedMs < 50, `FTS search should complete in <50ms, took ${elapsedMs.toFixed(1)}ms`);
    }
    finally {
        rmSync(sandbox, { recursive: true, force: true });
    }
});
|
|
85
|
-
test("
|
|
86
|
-
const
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
"pages/projects/chapterhouse/decisions.md",
|
|
101
|
-
"pages/projects/chapterhouse/index.md",
|
|
102
|
-
]);
|
|
104
|
+
// Removing an indexed path reports success and makes the page unsearchable.
test("removeFromIndex removes from wiki_pages", async () => {
    const sandbox = makeSandbox();
    try {
        const { indexManager } = await loadModules(sandbox);
        const personPath = "pages/people/test/index.md";
        const query = "Test Person";
        indexManager.upsertWikiPage(personPath, { title: "Test Person", summary: "A test", tags: [], updated: "2026-05-01", metadata: {} }, "A test");
        const hitsBefore = indexManager.wikiSearch(query);
        assert.ok(hitsBefore.length > 0, "Should exist before removal");
        const wasRemoved = indexManager.removeFromIndex(personPath);
        assert.equal(wasRemoved, true);
        const hitsAfter = indexManager.wikiSearch(query);
        assert.equal(hitsAfter.length, 0, "Should not exist after removal");
    }
    finally {
        rmSync(sandbox, { recursive: true, force: true });
    }
});
|
|
104
|
-
test("searchIndex
|
|
105
|
-
const
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
}
|
|
117
|
-
|
|
118
|
-
path: "pages/team/ops.md",
|
|
119
|
-
title: "Operations",
|
|
120
|
-
summary: "Runbooks and incident work",
|
|
121
|
-
section: "Team",
|
|
122
|
-
},
|
|
123
|
-
]);
|
|
124
|
-
const metadataHit = indexManager.searchIndex("api", 1);
|
|
125
|
-
const bodyFallback = indexManager.searchIndex("telemetry", 1);
|
|
126
|
-
assert.deepEqual(metadataHit.map((entry) => entry.path), ["pages/team/api.md"]);
|
|
127
|
-
assert.deepEqual(bodyFallback.map((entry) => entry.path), ["pages/team/api.md"]);
|
|
120
|
+
// searchIndex results must expose the `section` and `title` fields.
test("searchIndex delegates to wikiSearch and returns IndexEntry shape", async () => {
    const sandbox = makeSandbox();
    try {
        const { indexManager, wikiFs } = await loadModules(sandbox);
        wikiFs.writePage("pages/team/api.md", "---\ntitle: API Docs\nsummary: API documentation\ntags: [api]\nupdated: 2026-05-01\n---\n\n# API\n");
        indexManager.rebuildWikiIndex();
        const entries = indexManager.searchIndex("api");
        assert.ok(entries.length > 0);
        const [firstEntry] = entries;
        assert.ok("section" in firstEntry, "Should have section field");
        assert.ok("title" in firstEntry, "Should have title field");
    }
    finally {
        rmSync(sandbox, { recursive: true, force: true });
    }
});
|
|
129
|
-
test("
|
|
130
|
-
const
|
|
131
|
-
|
|
132
|
-
|
|
133
|
-
|
|
134
|
-
|
|
135
|
-
summary: "
|
|
136
|
-
|
|
137
|
-
|
|
138
|
-
|
|
139
|
-
|
|
140
|
-
|
|
141
|
-
|
|
142
|
-
|
|
143
|
-
|
|
144
|
-
|
|
145
|
-
}
|
|
146
|
-
indexManager.addToIndex({
|
|
147
|
-
path: "pages/people/ada.md",
|
|
148
|
-
title: "Ada Lovelace",
|
|
149
|
-
summary: "Owns regression coverage",
|
|
150
|
-
section: "People",
|
|
151
|
-
tags: ["qa", "testing"],
|
|
152
|
-
updated: "2026-05-06",
|
|
153
|
-
});
|
|
154
|
-
assert.equal(indexManager.removeFromIndex("pages/projects/launch.md"), true);
|
|
155
|
-
assert.equal(indexManager.removeFromIndex("pages/projects/missing.md"), false);
|
|
156
|
-
const summary = indexManager.getIndexSummary();
|
|
157
|
-
assert.match(summary, /\*\*People\*\*: Ada Lovelace: Owns regression coverage \[qa, testing\] \(2026-05-06\)/);
|
|
158
|
-
assert.match(summary, new RegExp(`\\*\\*Index\\*\\*: Wiki: Index of all wiki pages\\. \\(${today}\\)`));
|
|
135
|
+
// A rebuild must drop index rows whose page no longer exists on disk.
test("rebuildWikiIndex removes stale entries not on disk", async () => {
    const sandbox = makeSandbox();
    try {
        const { indexManager, wikiFs } = await loadModules(sandbox);
        const keptPath = "pages/topics/keep/index.md";
        const stalePath = "pages/topics/stale/index.md";
        wikiFs.writePage(keptPath, "---\ntitle: Keep\nsummary: Keep this\nupdated: 2026-05-01\n---\n\n# Keep\n");
        // Index-only row with no backing file
        indexManager.upsertWikiPage(stalePath, { title: "Stale", summary: "Should be removed", tags: [], updated: "2026-01-01", metadata: {} }, "Stale");
        // Rebuild reconciles the index with what is on disk
        indexManager.rebuildWikiIndex();
        const indexedPaths = indexManager.parseIndex().map((entry) => entry.path);
        const staleStillIndexed = indexedPaths.includes(stalePath);
        assert.ok(indexedPaths.includes(keptPath), "Should keep on-disk page");
        assert.ok(!staleStillIndexed, "Should remove stale entry");
    }
    finally {
        rmSync(sandbox, { recursive: true, force: true });
    }
});
|
|
160
153
|
//# sourceMappingURL=index-manager.test.js.map
|
|
@@ -0,0 +1,347 @@
|
|
|
1
|
+
// ---------------------------------------------------------------------------
|
|
2
|
+
// PKB ingestion pipeline — parse, extract entities, write wiki pages
|
|
3
|
+
// ---------------------------------------------------------------------------
|
|
4
|
+
import { createHash } from "node:crypto";
|
|
5
|
+
import { exec } from "node:child_process";
|
|
6
|
+
import { mkdirSync, readFileSync, existsSync, rmSync } from "node:fs";
|
|
7
|
+
import { join } from "node:path";
|
|
8
|
+
import { promisify } from "node:util";
|
|
9
|
+
import { getDb } from "../store/db.js";
|
|
10
|
+
import { ensureWikiStructure, writeRawSource, assertPagePath } from "./fs.js";
|
|
11
|
+
import { appendTimeline } from "./timeline.js";
|
|
12
|
+
import { validateAndBackfillFrontmatter } from "./frontmatter.js";
|
|
13
|
+
import { writePage, readPage } from "./fs.js";
|
|
14
|
+
import { childLogger } from "../util/logger.js";
|
|
15
|
+
const log = childLogger("ingest");
|
|
16
|
+
const execAsync = promisify(exec);
|
|
17
|
+
// ---------------------------------------------------------------------------
|
|
18
|
+
// Source ID
|
|
19
|
+
// ---------------------------------------------------------------------------
|
|
20
|
+
/**
 * Derive the identifier under which an ingested source is recorded.
 *
 * @param {string} sourceType - Kind of source (the callers in this file pass "url", "pdf", "repo" or "text").
 * @param {string} origin - Origin string of the source.
 * @returns {string} Hex-encoded SHA-256 digest of `sourceType` concatenated with `origin`.
 */
export function computeSourceId(sourceType, origin) {
    const fingerprint = sourceType + origin;
    const hasher = createHash("sha256");
    hasher.update(fingerprint);
    return hasher.digest("hex");
}
|
|
23
|
+
// ---------------------------------------------------------------------------
|
|
24
|
+
// Content parsers
|
|
25
|
+
// ---------------------------------------------------------------------------
|
|
26
|
+
/** True when the IPv4 octets are in a "this network", loopback, private or link-local range. */
function isBlockedIpv4(octets) {
    const [first, second] = octets;
    return first === 0 // 0.0.0.0/8 ("this network", commonly routed to localhost)
        || first === 10 // 10.0.0.0/8
        || first === 127 // 127.0.0.0/8 loopback
        || (first === 169 && second === 254) // 169.254.0.0/16 link-local (cloud metadata)
        || (first === 172 && second >= 16 && second <= 31) // 172.16.0.0/12
        || (first === 192 && second === 168); // 192.168.0.0/16
}
/** True when the bracket-less, lowercase IPv6 literal is unspecified, loopback, ULA, link-local or maps a blocked IPv4 address. */
function isBlockedIpv6(address) {
    if (address === "::" || address === "::1") {
        return true;
    }
    if (/^f[cd][0-9a-f]{2}:/.test(address) || /^fe[89ab][0-9a-f]:/.test(address)) {
        return true; // fc00::/7 unique-local, fe80::/10 link-local
    }
    // URL serializes ::ffff:a.b.c.d as ::ffff:HHHH:HHHH; decode it back to octets.
    const mapped = /^::ffff:([0-9a-f]{1,4}):([0-9a-f]{1,4})$/.exec(address);
    if (mapped) {
        const high = Number.parseInt(mapped[1], 16);
        const low = Number.parseInt(mapped[2], 16);
        return isBlockedIpv4([high >> 8, high & 0xff, low >> 8, low & 0xff]);
    }
    return false;
}
/**
 * Validate that a URL is an http(s) URL that does not point at a loopback,
 * private, link-local or cloud-metadata host.
 *
 * Only the literal host is inspected; DNS names that resolve to internal
 * addresses are not detected here.
 *
 * @param {string} url - URL to validate.
 * @returns {URL} The parsed URL when it is acceptable.
 * @throws {TypeError} When `url` cannot be parsed.
 * @throws {Error} When the scheme is not http/https or the host is internal.
 */
export function assertSafeRemoteUrl(url) {
    const parsedUrl = new URL(url);
    if (!["http:", "https:"].includes(parsedUrl.protocol)) {
        throw new Error(`Only http/https URLs supported, got: ${parsedUrl.protocol}`);
    }
    let host = parsedUrl.hostname.toLowerCase();
    if (host.startsWith("[") && host.endsWith("]")) {
        // URL.hostname keeps the brackets around IPv6 literals; strip them so
        // comparisons such as "::1" can actually match.
        host = host.slice(1, -1);
    }
    else {
        // "localhost." is the same host as "localhost".
        host = host.replace(/\.$/, "");
    }
    const labels = host.split(".");
    const isIpv4 = labels.length === 4
        && labels.every((label) => /^\d{1,3}$/.test(label) && Number(label) <= 255);
    const isInternal = host === "localhost"
        || host.endsWith(".localhost")
        || host === "metadata.google.internal"
        || (isIpv4 && isBlockedIpv4(labels.map(Number)))
        || (host.includes(":") && isBlockedIpv6(host));
    if (isInternal) {
        throw new Error("Cannot fetch internal/private URLs.");
    }
    return parsedUrl;
}
|
|
44
|
+
/**
 * Render the initial markdown for a new entity page: frontmatter, an H1 title,
 * a Summary section and an empty Timeline section.
 *
 * Line breaks in the title and in the frontmatter summary are collapsed to a
 * single space so a multi-line value cannot add or override frontmatter keys.
 * Values without line breaks render exactly as before.
 *
 * @param {object} params
 * @param {string} params.pageTitle - Title for the frontmatter and the H1 heading.
 * @param {string} params.pageSummary - Summary for the frontmatter and the Summary section.
 * @param {string} params.entityType - Value written to `metadata.entity_type`.
 * @param {string} params.updatedAt - Value written to the `updated` field.
 * @returns {string} The page content.
 */
export function createEntityPageContent({ pageTitle, pageSummary, entityType, updatedAt, }) {
    const toSingleLine = (value) => String(value).replace(/[\r\n]+/g, " ");
    const titleLine = toSingleLine(pageTitle);
    const summaryLine = toSingleLine(pageSummary);
    return `---\ntitle: ${titleLine}\nsummary: ${summaryLine}\nupdated: ${updatedAt}\ntags: []\nmetadata:\n entity_type: ${entityType}\n---\n\n# ${titleLine}\n\n## Summary\n\n${pageSummary}\n\n## Timeline\n`;
}
|
|
47
|
+
/**
 * Fetch a web page and reduce it to plain text plus a title.
 *
 * Redirects are followed manually so that every hop is re-validated with
 * assertSafeRemoteUrl; following them automatically would let a public URL
 * redirect the fetch to an internal address. Each request is time-limited.
 *
 * Text extraction prefers @mozilla/readability + jsdom when both can be
 * imported and falls back to tag stripping otherwise.
 *
 * @param {string} url - Page URL (http/https only).
 * @returns {Promise<{text: string, title: string}>} Extracted text (at most 50,000 chars) and title.
 * @throws {Error} On an unsafe URL, too many redirects, a timeout or a non-2xx response.
 */
async function parseUrl(url) {
    const maxRedirects = 5;
    const requestTimeoutMs = 30_000;
    const requestedUrl = assertSafeRemoteUrl(url);
    let currentUrl = requestedUrl;
    let res;
    for (let hop = 0;; hop += 1) {
        res = await fetch(currentUrl, {
            headers: { "User-Agent": "Chapterhouse/1.0 PKB-Ingest" },
            redirect: "manual",
            signal: AbortSignal.timeout(requestTimeoutMs),
        });
        const isRedirect = res.status >= 300 && res.status < 400;
        const location = isRedirect ? res.headers.get("location") : null;
        if (!location) {
            break;
        }
        // Release the connection of the intermediate response before moving on.
        await res.body?.cancel();
        if (hop >= maxRedirects) {
            throw new Error(`Too many redirects fetching ${url}`);
        }
        currentUrl = assertSafeRemoteUrl(new URL(location, currentUrl).href);
    }
    if (!res.ok) {
        throw new Error(`HTTP ${res.status} ${res.statusText} fetching ${url}`);
    }
    const html = await res.text();
    // Try @mozilla/readability if available
    let text;
    let title = requestedUrl.hostname;
    try {
        const { Readability } = await import("@mozilla/readability");
        const { JSDOM } = await import("jsdom");
        const dom = new JSDOM(html, { url: currentUrl.href });
        const article = new Readability(dom.window.document).parse();
        if (article?.textContent) {
            text = article.textContent;
            title = article.title || title;
        }
        else {
            // Readability found no article body; fall back to plain tag stripping.
            text = stripHtml(html);
        }
    }
    catch {
        // Optional dependencies missing or the DOM could not be built.
        text = stripHtml(html);
        // Try to extract title from <title> tag
        const titleMatch = html.match(/<title[^>]*>([^<]+)<\/title>/i);
        if (titleMatch) {
            title = titleMatch[1].trim();
        }
    }
    if (text.length > 50_000) {
        text = text.slice(0, 50_000);
    }
    return { text, title };
}
|
|
81
|
+
/**
 * Crude HTML-to-text conversion: drops script and style elements, replaces
 * remaining tags with spaces, collapses whitespace runs and caps the result
 * at 10,000 characters.
 *
 * @param {string} html - Raw HTML markup.
 * @returns {string} Plain text, trimmed, at most 10,000 characters long.
 */
function stripHtml(html) {
    // Applied in order; scripts and styles go first so their contents do not leak into the text.
    const passes = [
        [/<script[\s\S]*?<\/script>/gi, ""],
        [/<style[\s\S]*?<\/style>/gi, ""],
        [/<[^>]+>/g, " "],
        [/\s{2,}/g, " "],
    ];
    let plain = html;
    for (const [pattern, replacement] of passes) {
        plain = plain.replace(pattern, replacement);
    }
    return plain.trim().slice(0, 10_000);
}
|
|
90
|
+
/**
 * Extract text from a PDF file on disk using the optional `pdf-parse` package.
 *
 * A missing dependency and a failing parse are reported separately; a blanket
 * catch would label every failure (missing file, corrupt PDF) as a missing
 * `pdf-parse` install.
 *
 * @param {string} filePath - Path of the PDF file to read.
 * @returns {Promise<{text: string, title: string}>} Text (at most 50,000 chars) and the file name without directory or `.pdf` extension.
 * @throws {Error} When `pdf-parse` cannot be imported, or when reading/parsing fails (original error attached as `cause`).
 */
async function parsePdf(filePath) {
    let pdfParse;
    try {
        pdfParse = await import("pdf-parse");
    }
    catch (err) {
        throw new Error("PDF ingestion requires pdf-parse: npm install pdf-parse", { cause: err });
    }
    try {
        const buf = readFileSync(filePath);
        const data = await pdfParse.default(buf);
        // Accept both "/" and "\" as directory separators when deriving the title.
        const title = filePath.replace(/.*[\\/]/, "").replace(/\.pdf$/i, "");
        return { text: data.text.slice(0, 50_000), title };
    }
    catch (err) {
        const reason = err instanceof Error ? err.message : String(err);
        throw new Error(`Failed to parse PDF ${filePath}: ${reason}`, { cause: err });
    }
}
|
|
101
|
+
/**
 * Shallow-clone a git repository into a temporary directory and summarize it
 * as text: README excerpt, manifest excerpts and a top-level directory listing.
 *
 * Commands are run with execFile and an argument array, so `repoUrl` never
 * passes through a shell (JSON.stringify quoting does not stop `$(...)` or
 * backtick expansion). The `--` separator keeps a value starting with `-`
 * from being read as a git option.
 *
 * @param {string} repoUrl - Repository location handed to `git clone`.
 * @returns {Promise<{text: string, title: string}>} Summary text and the repository name taken from the last URL segment.
 * @throws {Error} When the clone or the directory listing fails.
 */
async function parseRepo(repoUrl) {
    // Imported locally so this function does not depend on the module's import list.
    const { execFile } = await import("node:child_process");
    const execFileAsync = promisify(execFile);
    // pid + random suffix: concurrent ingests in the same millisecond must not share a directory.
    const uniqueSuffix = `${Date.now()}-${process.pid}-${Math.random().toString(36).slice(2, 10)}`;
    const tmpDir = join(process.cwd(), ".test-work", `repo-${uniqueSuffix}`);
    mkdirSync(tmpDir, { recursive: true });
    try {
        await execFileAsync("git", ["clone", "--depth", "1", "--", repoUrl, tmpDir], { timeout: 60_000 });
        const parts = [];
        // README: first variant that exists
        for (const name of ["README.md", "README.rst", "README.txt", "README"]) {
            const candidate = join(tmpDir, name);
            if (existsSync(candidate)) {
                parts.push(`## README\n\n${readFileSync(candidate, "utf-8").slice(0, 10_000)}`);
                break;
            }
        }
        // package.json / Cargo.toml / go.mod: every manifest that exists
        for (const name of ["package.json", "Cargo.toml", "go.mod"]) {
            const candidate = join(tmpDir, name);
            if (existsSync(candidate)) {
                parts.push(`## ${name}\n\n\`\`\`\n${readFileSync(candidate, "utf-8").slice(0, 2_000)}\n\`\`\``);
            }
        }
        const { stdout } = await execFileAsync("ls", ["-la", tmpDir]);
        parts.push(`## Directory listing\n\n\`\`\`\n${stdout}\n\`\`\``);
        const title = repoUrl.replace(/.*\//, "").replace(/\.git$/, "");
        return { text: parts.join("\n\n"), title };
    }
    finally {
        try {
            rmSync(tmpDir, { recursive: true, force: true });
        }
        catch { /* best-effort */ }
    }
}
|
|
134
|
+
// ---------------------------------------------------------------------------
|
|
135
|
+
// Entity extraction via LLM
|
|
136
|
+
// ---------------------------------------------------------------------------
|
|
137
|
+
/**
 * Ask an LLM (via the Copilot SDK) to extract entities and relationships from text.
 *
 * Extraction is best-effort: without an auth token, or when the SDK call or
 * the JSON parsing fails, an empty result is returned and the failure is logged.
 *
 * @param {string} text - Source text; only the first 8,000 characters are sent.
 * @param {string} [topic] - Optional topic the extraction should focus on.
 * @returns {Promise<{entities: Array, relationships: Array}>} Extracted items, possibly empty.
 */
async function extractEntities(text, topic) {
    // Skip entity extraction if no auth token is configured
    const { config } = await import("../config.js");
    const token = config.copilotAuthToken || process.env.COPILOT_TOKEN || process.env.GITHUB_TOKEN;
    if (!token) {
        log.debug("No Copilot auth token configured, skipping entity extraction");
        return { entities: [], relationships: [] };
    }
    const topicHint = topic ? ` Focus especially on entities related to: ${topic}.` : "";
    const systemPrompt = "Extract entities and relationships from this content. Return JSON only, no other text: " +
        `{ "entities": [{"name": string, "type": string, "description": string}], ` +
        `"relationships": [{"from": string, "to": string, "type": string}] }`;
    const userMessage = `${systemPrompt}${topicHint}\n\n---\n\n${text.slice(0, 8_000)}`;
    try {
        const { CopilotClient, approveAll } = await import("@github/copilot-sdk");
        // Use a one-shot client (autoRestart: false) so it doesn't keep the process alive
        const client = new CopilotClient({
            autoStart: true,
            autoRestart: false,
            gitHubToken: token,
        });
        await client.start();
        try {
            const session = await client.createSession({
                model: "claude-haiku-4.5",
                tools: [],
                onPermissionRequest: approveAll,
            });
            try {
                const result = await session.sendAndWait({ prompt: userMessage }, 30_000);
                // NOTE(review): the SDK appears to return a message event whose text is in
                // `data.content` — confirm against the SDK version in use. Stringifying the
                // whole event would make the regex below capture the event envelope, which
                // has no `entities` key, so every extraction would come back empty.
                let rawText;
                if (typeof result === "string") {
                    rawText = result;
                }
                else if (typeof result?.data?.content === "string") {
                    rawText = result.data.content;
                }
                else {
                    // `?? null` keeps JSON.stringify from returning undefined for a missing result.
                    rawText = JSON.stringify(result ?? null);
                }
                // Extract JSON from the response (may be wrapped in markdown code blocks)
                const jsonMatch = rawText.match(/\{[\s\S]*\}/);
                if (!jsonMatch) {
                    return { entities: [], relationships: [] };
                }
                const parsed = JSON.parse(jsonMatch[0]);
                return {
                    entities: Array.isArray(parsed.entities) ? parsed.entities : [],
                    relationships: Array.isArray(parsed.relationships) ? parsed.relationships : [],
                };
            }
            finally {
                try {
                    // Awaited so a rejected promise is caught here instead of surfacing
                    // as an unhandled rejection.
                    await session.destroy();
                }
                catch { /* best-effort */ }
            }
        }
        finally {
            try {
                await client.stop();
            }
            catch { /* best-effort */ }
        }
    }
    catch (err) {
        log.warn({ err: err instanceof Error ? err.message : err }, "Entity extraction LLM call failed, skipping");
        return { entities: [], relationships: [] };
    }
}
|
|
197
|
+
// ---------------------------------------------------------------------------
|
|
198
|
+
// Slug helper
|
|
199
|
+
// ---------------------------------------------------------------------------
|
|
200
|
+
/**
 * Turn a display name into a path-safe slug: lowercase, runs of characters
 * outside a-z/0-9 become single hyphens, edge hyphens are removed and the
 * result is cut to 64 characters.
 *
 * @param {string} name - Name to convert.
 * @returns {string} The slug, or "unknown" when nothing usable remains.
 */
function slugify(name) {
    const hyphenated = name.toLowerCase().replace(/[^a-z0-9]+/g, "-");
    const withoutEdgeHyphens = hyphenated.replace(/^-+|-+$/g, "");
    const capped = withoutEdgeHyphens.slice(0, 64);
    return capped === "" ? "unknown" : capped;
}
|
|
207
|
+
// ---------------------------------------------------------------------------
|
|
208
|
+
// Main ingest function
|
|
209
|
+
// ---------------------------------------------------------------------------
|
|
210
|
+
/** Load a source's text and title using the parser that matches its type; unknown types are treated as inline text. */
async function loadSourceContent(source, type, topic, sourceId) {
    switch (type) {
        case "url":
            return parseUrl(source);
        case "pdf":
            return parsePdf(source);
        case "repo":
            return parseRepo(source);
        case "text":
        default:
            return { text: source, title: topic ?? `text-${sourceId.slice(0, 8)}` };
    }
}
/** Return the first valid page path for an entity (typed section, then the topics fallback), or undefined when neither validates. */
function resolveEntityPagePath(typeLower, entitySlug) {
    const candidates = [
        `pages/${typeLower}/${entitySlug}/index.md`,
        `pages/topics/${entitySlug}/index.md`,
    ];
    for (const candidate of candidates) {
        try {
            assertPagePath(candidate);
            return candidate;
        }
        catch { /* rejected by the path validator; try the next candidate */ }
    }
    return undefined;
}
/**
 * Ingest one source: parse it, archive the raw text, record it in
 * `wiki_sources`, extract entities, and create or update one wiki page per
 * entity with a timeline entry.
 *
 * A source whose ID is already recorded is not processed again. For inline
 * text the ID is derived from the full text (the stored `origin` stays
 * truncated to 200 chars), so two different texts sharing their first 200
 * characters are no longer mistaken for the same source.
 *
 * @param {string} source - URL, file path, repository URL or inline text.
 * @param {string} type - "url", "pdf", "repo" or "text"; other values are handled like "text".
 * @param {string} [topic] - Optional topic hint; also used as the title for inline text.
 * @param {{sessionId?: string, sessionName?: string}} [session] - Optional session recorded with the source.
 * @returns {Promise<{source_id: string, pages_created: string[], pages_updated: string[], entities: Array<{name: string, type: string, path: string}>, already_existed: boolean}>}
 * @throws {Error} When parsing the source or writing pages fails.
 */
export async function ingestSource(source, type, topic, session) {
    ensureWikiStructure();
    const db = getDb();
    const origin = type === "text" ? source.slice(0, 200) : source;
    // Hash the full source: for non-text types it equals `origin`; for text it avoids prefix collisions.
    const sourceId = computeSourceId(type, source);
    // Idempotency check
    const existing = db.prepare(`SELECT id, pages_updated FROM wiki_sources WHERE id = ?`).get(sourceId);
    if (existing) {
        log.info({ sourceId, type, origin }, "Source already ingested, skipping");
        return {
            source_id: sourceId,
            pages_created: [],
            pages_updated: JSON.parse(existing.pages_updated || "[]"),
            entities: [],
            already_existed: true,
        };
    }
    // Parse content
    const { text: parsedText, title } = await loadSourceContent(source, type, topic, sourceId);
    // Save raw source archive
    const rawFileName = `${sourceId.slice(0, 16)}.md`;
    writeRawSource(rawFileName, parsedText);
    // Persist to wiki_sources
    const ingestedAt = new Date().toISOString();
    db.prepare(`
    INSERT INTO wiki_sources (id, source_type, origin, title, ingested_at, raw_path, parsed_content, pages_updated, session_id, session_name)
    VALUES (?, ?, ?, ?, ?, ?, ?, '[]', ?, ?)
  `).run(sourceId, type, origin, title, ingestedAt, `sources/${rawFileName}`, parsedText.slice(0, 100_000), session?.sessionId ?? null, session?.sessionName ?? null);
    // Extract entities
    const extraction = await extractEntities(parsedText, topic);
    const pagesCreated = [];
    const pagesUpdated = [];
    const entitySummaries = [];
    try {
        for (const entity of extraction.entities) {
            // The extraction is LLM output: skip items whose name/type are missing or not strings
            // instead of crashing on `.toLowerCase()` halfway through the ingest.
            if (typeof entity?.name !== "string" || typeof entity?.type !== "string") {
                continue;
            }
            if (!entity.name || !entity.type) {
                continue;
            }
            const entitySlug = slugify(entity.name);
            const typeLower = entity.type.toLowerCase().replace(/[^a-z0-9-]/g, "-");
            const safePagePath = resolveEntityPagePath(typeLower, entitySlug);
            if (safePagePath === undefined) {
                continue;
            }
            const description = typeof entity.description === "string" && entity.description
                ? entity.description
                : entity.name;
            const existed = readPage(safePagePath) !== undefined;
            const timelineEntry = `Source ingested: ${title}\n\n${description}`;
            if (!existed) {
                const pageContent = createEntityPageContent({
                    pageTitle: entity.name,
                    pageSummary: description.slice(0, 180).replace(/\n/g, " "),
                    entityType: typeLower,
                    updatedAt: ingestedAt.slice(0, 10),
                });
                const { content: backfilled } = validateAndBackfillFrontmatter(safePagePath, pageContent);
                writePage(safePagePath, backfilled);
                pagesCreated.push(safePagePath);
            }
            else {
                pagesUpdated.push(safePagePath);
            }
            appendTimeline(safePagePath, timelineEntry);
            entitySummaries.push({ name: entity.name, type: entity.type, path: safePagePath });
        }
    }
    finally {
        // Record the touched pages even when a later entity failed, so the source row
        // reflects what was actually written before the error propagates.
        const allPages = [...new Set([...pagesCreated, ...pagesUpdated])];
        db.prepare(`UPDATE wiki_sources SET pages_updated = ? WHERE id = ?`).run(JSON.stringify(allPages), sourceId);
    }
    return {
        source_id: sourceId,
        pages_created: pagesCreated,
        pages_updated: pagesUpdated,
        entities: entitySummaries,
        already_existed: false,
    };
}
|
|
331
|
+
// ---------------------------------------------------------------------------
|
|
332
|
+
// Type auto-detection
|
|
333
|
+
// ---------------------------------------------------------------------------
|
|
334
|
+
/**
 * Guess the ingest type of a user-supplied source string.
 *
 * For http(s) URLs the decision uses the parsed host and path rather than
 * substring matching, so a URL that merely mentions "github.com" (a query
 * parameter, an issue page, a docs sub-domain) is not treated as a clonable
 * repository, and `.pdf` is recognized regardless of case or query string.
 *
 * @param {string} source - URL, file path or free text.
 * @returns {"pdf" | "repo" | "url" | "text"} The detected type.
 */
export function detectSourceType(source) {
    const trimmed = source.trim();
    if (/^https?:\/\//i.test(trimmed)) {
        let host = "";
        let path = trimmed;
        try {
            const parsed = new URL(trimmed);
            host = parsed.hostname.toLowerCase().replace(/^www\./, "");
            path = parsed.pathname;
        }
        catch { /* unparseable URL: fall back to suffix checks on the raw string */ }
        if (/\.pdf$/i.test(path)) {
            return "pdf";
        }
        if (path.endsWith(".git")) {
            return "repo";
        }
        const segments = path.split("/").filter(Boolean);
        // GitHub repositories are exactly owner/name; GitLab allows nested groups, and
        // its non-repository pages (issues, merge requests, ...) live under a "-" segment.
        const isRepoRoot = (host === "github.com" && segments.length === 2)
            || (host === "gitlab.com" && segments.length >= 2 && !segments.includes("-"));
        return isRepoRoot ? "repo" : "url";
    }
    // A bare path ending in .pdf; text containing spaces is treated as prose, not a path.
    if (trimmed.endsWith(".pdf") && !trimmed.includes(" ")) {
        return "pdf";
    }
    return "text";
}
|
|
347
|
+
//# sourceMappingURL=ingest.js.map
|