@nzpr/kb 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.env.example +5 -0
- package/LICENSE +21 -0
- package/README.md +185 -0
- package/bin/kb-admin.js +5 -0
- package/bin/kb.js +5 -0
- package/docker-compose.pgvector.yml +19 -0
- package/lib/admin-cli.js +203 -0
- package/lib/chunking.js +16 -0
- package/lib/cli-common.js +73 -0
- package/lib/cli.js +391 -0
- package/lib/config.js +109 -0
- package/lib/db.js +81 -0
- package/lib/embeddings.js +94 -0
- package/lib/frontmatter.js +66 -0
- package/lib/index.js +140 -0
- package/lib/kb-proposals.js +188 -0
- package/lib/migrations.js +149 -0
- package/lib/repo-init.js +438 -0
- package/lib/search.js +206 -0
- package/migrations/0001_initial.sql +77 -0
- package/migrations/0002_relax_embedding_dimension.sql +9 -0
- package/migrations/0003_simplify_documents_table.sql +64 -0
- package/package.json +58 -0
package/lib/cli.js
ADDED
|
@@ -0,0 +1,391 @@
|
|
|
1
|
+
import path from "node:path";
|
|
2
|
+
import {
|
|
3
|
+
resolveDocsRoot,
|
|
4
|
+
resolveEmbeddingProfile,
|
|
5
|
+
resolveDatabaseUrl,
|
|
6
|
+
resolveGitHubApiBaseUrl,
|
|
7
|
+
resolveGitHubRepository,
|
|
8
|
+
tryResolveKnowledgeRoot
|
|
9
|
+
} from "./config.js";
|
|
10
|
+
import { connect, initDb } from "./db.js";
|
|
11
|
+
import { ingestDocuments } from "./index.js";
|
|
12
|
+
import { createGitHubIssueFromText } from "./kb-proposals.js";
|
|
13
|
+
import { bootstrapKnowledgeRepo } from "./repo-init.js";
|
|
14
|
+
import { askIndex, doctor, knowledgeCatalog, listDocuments, searchIndex, snippet } from "./search.js";
|
|
15
|
+
import {
|
|
16
|
+
databaseHelp,
|
|
17
|
+
formatCliError,
|
|
18
|
+
githubCreationHelp,
|
|
19
|
+
maskConnection,
|
|
20
|
+
parseFlags,
|
|
21
|
+
publishHelp
|
|
22
|
+
} from "./cli-common.js";
|
|
23
|
+
|
|
24
|
+
/**
 * Entry point for the `kb` command-line interface.
 *
 * The first element of `argv` selects the command; the remaining elements are
 * parsed into flags and positional arguments by `parseFlags`.
 *
 * @param {string[]} argv - Command-line arguments (command first).
 * @returns {Promise<number>} Exit code: 0 on success, 1 on a runtime failure
 *   (or when `search`/`ask` find nothing), 2 on a usage error.
 */
export async function main(argv) {
  try {
    const [command, ...rest] = argv;
    if (!command || command === "--help" || command === "-h") {
      printHelp();
      return 0;
    }
    if (rest.includes("--help") || rest.includes("-h")) {
      printCommandHelp(command);
      return 0;
    }

    const { flags, positional } = parseFlags(rest);

    switch (command) {
      case "create": {
        const title = String(flags.title ?? "").trim();
        const text = String(flags.text ?? "").trim();
        const repo = flags.repo ?? resolveGitHubRepository();
        if (!title) {
          console.error("--title is required");
          return 2;
        }
        if (!text) {
          console.error("--text is required");
          return 2;
        }
        const issue = await createGitHubIssueFromText({
          title,
          text,
          relativePath: flags.path ?? null,
          repo,
          token: process.env.GITHUB_TOKEN ?? null,
          apiBaseUrl: resolveGitHubApiBaseUrl()
        });
        console.log(`created KB proposal issue #${issue.number}: ${issue.html_url}`);
        return 0;
      }
      case "init-repo": {
        // Explicit flags win over environment variables; missing values are passed as null.
        const result = await bootstrapKnowledgeRepo({
          targetDir: flags.dir ?? process.cwd(),
          repo: flags.repo ?? resolveGitHubRepository(),
          githubToken: process.env.GITHUB_TOKEN ?? null,
          databaseUrl: flags["database-url"] ?? process.env.KB_DATABASE_URL ?? null,
          embeddingMode: flags["embedding-mode"] ?? process.env.KB_EMBEDDING_MODE ?? null,
          embeddingApiUrl: flags["embedding-api-url"] ?? process.env.KB_EMBEDDING_API_URL ?? null,
          embeddingModel: flags["embedding-model"] ?? process.env.KB_EMBEDDING_MODEL ?? null,
          embeddingApiKey: flags["embedding-api-key"] ?? process.env.KB_EMBEDDING_API_KEY ?? null,
          dbConnectTimeoutMs:
            flags["db-connect-timeout-ms"] ?? process.env.KB_DB_CONNECT_TIMEOUT_MS ?? null,
          repoAutomationToken:
            flags["repo-automation-token"] ?? process.env.KB_REPO_AUTOMATION_TOKEN ?? null
        });
        console.log(`initialized knowledge repo scaffold in ${result.root}`);
        for (const relativePath of result.created) {
          console.log(`created ${relativePath}`);
        }
        for (const relativePath of result.skipped) {
          console.log(`kept existing ${relativePath}`);
        }
        if (!result.created.length) {
          console.log("no files created");
        }
        printInitRepoStatus(result);
        printRepoConfiguration(result.configuration);
        printInitRepoNextStep(result);
        return result.ok ? 0 : 1;
      }
      case "search": {
        const databaseUrl = requireDatabaseUrl();
        const embeddingProfile = resolveEmbeddingProfile();
        const query = positional.join(" ").trim();
        if (!query) {
          console.error("search query is required");
          return 2;
        }
        const limit = parseLimitFlag(flags.limit, 5);
        if (limit === null) {
          console.error("--limit must be a positive integer");
          return 2;
        }
        const results = await searchIndex({
          databaseUrl,
          embeddingProfile,
          query,
          limit
        });
        for (const result of results) {
          console.log(result.title);
          console.log(` ${snippet(result.content)}`);
          console.log(` Source: ${result.path} | score=${result.finalScore.toFixed(3)}`);
        }
        // An empty result set is reported through the exit code.
        return results.length ? 0 : 1;
      }
      case "ask": {
        const databaseUrl = requireDatabaseUrl();
        const embeddingProfile = resolveEmbeddingProfile();
        const query = positional.join(" ").trim();
        if (!query) {
          console.error("question is required");
          return 2;
        }
        const limit = parseLimitFlag(flags.limit, 3);
        if (limit === null) {
          console.error("--limit must be a positive integer");
          return 2;
        }
        const { answer, results } = await askIndex({
          databaseUrl,
          embeddingProfile,
          query,
          limit
        });
        console.log(answer);
        return results.length ? 0 : 1;
      }
      case "list": {
        const databaseUrl = requireDatabaseUrl();
        const rows = await listDocuments({ databaseUrl });
        for (const row of rows) {
          console.log(`${row.doc_id}\t${row.path}\t${row.title}`);
        }
        return 0;
      }
      case "catalog": {
        const databaseUrl = requireDatabaseUrl();
        const catalog = await knowledgeCatalog({ databaseUrl });
        if (flags.json) {
          console.log(JSON.stringify(catalog, null, 2));
          return 0;
        }
        console.log("Documents:");
        for (const doc of catalog.documents) {
          console.log(`- ${doc.doc_id} | ${doc.path} | ${doc.title}`);
        }
        return 0;
      }
      case "publish": {
        const repo = flags.repo ?? resolveGitHubRepository();
        const apiBaseUrl = resolveGitHubApiBaseUrl();
        const token = process.env.GITHUB_TOKEN ?? null;
        await requirePublishAccess({
          repo,
          token,
          apiBaseUrl
        });
        const knowledgeRoot = flags["knowledge-root"]
          ? path.resolve(flags["knowledge-root"])
          : tryResolveKnowledgeRoot();
        const docsRoot = resolveDocsRoot({
          docsRoot: flags["docs-root"] ?? null,
          knowledgeRoot
        });
        if (!docsRoot) {
          console.error("--docs-root PATH is required when no local docs directory is present");
          return 2;
        }
        const databaseUrl = requireDatabaseUrl();
        const embeddingProfile = resolveEmbeddingProfile();
        // Run migrations on a short-lived connection before ingesting documents.
        const client = await connect(databaseUrl);
        try {
          await initDb(client);
        } finally {
          await client.end();
        }
        const result = await ingestDocuments({ databaseUrl, docsRoot, embeddingProfile });
        console.log(
          `published ${result.documents} documents, wrote ${result.vectors} vectors using ${result.embeddingMode}${result.embeddingModel ? `/${result.embeddingModel}` : ""} embeddings`
        );
        return 0;
      }
      case "doctor": {
        const knowledgeRoot = flags["knowledge-root"]
          ? path.resolve(flags["knowledge-root"])
          : tryResolveKnowledgeRoot();
        const docsRoot = resolveDocsRoot({
          docsRoot: flags["docs-root"] ?? null,
          knowledgeRoot
        });
        const databaseUrl = requireDatabaseUrl();
        const embeddingProfile = resolveEmbeddingProfile();
        const info = await doctor({ databaseUrl, embeddingProfile });
        console.log(`database: ${maskConnection(databaseUrl)}`);
        if (knowledgeRoot) {
          console.log(`knowledge root: ${knowledgeRoot}`);
        }
        if (docsRoot) {
          console.log(`docs root: ${docsRoot}`);
        }
        console.log(`database reachable: ${info.ok ? "yes" : "no"}`);
        if (!info.ok) {
          console.log(`error: ${info.error}`);
          return 1;
        }
        console.log(`schema current: ${info.schemaCurrent}`);
        console.log(`schema latest: ${info.schemaLatest}`);
        console.log(`schema pending: ${info.schemaPending}`);
        console.log(`documents: ${info.documents}`);
        console.log(`vectors: ${info.vectors}`);
        console.log(`embedding mode: ${info.embeddingMode}`);
        if (info.embeddingModel) {
          console.log(`embedding model: ${info.embeddingModel}`);
        }
        console.log(`embedding dimensions: ${info.embeddingDimensions}`);
        return 0;
      }
      default:
        console.error(`unknown command: ${command}`);
        return 2;
    }
  } catch (error) {
    console.error(formatCliError(error));
    return 1;
  }
}

/**
 * Interpret the value of a `--limit` flag.
 *
 * @param {unknown} rawLimit - Flag value as produced by the flag parser.
 * @param {number} fallback - Limit to use when the flag was not supplied.
 * @returns {number|null} A positive integer, or null when the supplied value
 *   is not a positive integer (including a flag given without a value).
 */
function parseLimitFlag(rawLimit, fallback) {
  if (rawLimit === undefined || rawLimit === null) {
    return fallback;
  }
  // A boolean would otherwise coerce to 0/1 and slip through as a "valid" number.
  if (typeof rawLimit === "boolean") {
    return null;
  }
  const limit = Number(String(rawLimit).trim());
  return Number.isInteger(limit) && limit > 0 ? limit : null;
}
|
|
229
|
+
|
|
230
|
+
/**
 * Print the top-level usage text: every available command followed by the
 * shared database, GitHub and publish help sections.
 */
function printHelp() {
  console.log(`usage: kb <command> [options]

commands:
init-repo [--dir PATH] [--repo OWNER/REPO]
create --title TEXT --text TEXT [--path RELATIVE_PATH]
search <query>
ask <question>
list
catalog [--json]
publish [--docs-root PATH] [--repo OWNER/REPO]
doctor [--knowledge-root PATH] [--docs-root PATH]

${databaseHelp()}

${githubCreationHelp()}

${publishHelp()}

optional path hint:
--knowledge-root PATH

search options:
--limit N
`);
}
|
|
255
|
+
|
|
256
|
+
/**
 * Print the usage text for a single command.
 *
 * @param {string} command - Command name as typed by the user.
 */
function printCommandHelp(command) {
  const commandHelp = {
    "init-repo":
      "usage: kb init-repo [--dir PATH] [--repo OWNER/REPO] [--database-url URL] [--embedding-mode MODE] [--embedding-api-url URL] [--embedding-model NAME] [--embedding-api-key KEY] [--db-connect-timeout-ms N] [--repo-automation-token TOKEN]\n\nScaffold a knowledge-authority repository, optionally configure its GitHub settings, and preflight the target database.",
    create: `usage: kb create --title TEXT --text TEXT [--path RELATIVE_PATH] [--repo OWNER/REPO]\n\n${githubCreationHelp()}`,
    search: `usage: kb search <query> [options]\n\n${databaseHelp()}\n --limit N`,
    ask: `usage: kb ask <question> [options]\n\n${databaseHelp()}\n --limit N`,
    list: `usage: kb list\n\n${databaseHelp()}`,
    catalog: `usage: kb catalog [--json]\n\n${databaseHelp()}`,
    publish: `usage: kb publish [--docs-root PATH] [--knowledge-root PATH] [--repo OWNER/REPO]\n\n${databaseHelp()}\n\n${publishHelp()}`,
    doctor: `usage: kb doctor [--knowledge-root PATH] [--docs-root PATH]\n\n${databaseHelp()}`
  };
  // Own-property check: names such as "toString" or "constructor" must not
  // resolve to members inherited from Object.prototype.
  const helpText = Object.hasOwn(commandHelp, command)
    ? commandHelp[command]
    : `unknown command: ${command}`;
  console.log(helpText);
}
|
|
270
|
+
|
|
271
|
+
/**
 * Return the configured database URL.
 *
 * @returns {string} The value produced by `resolveDatabaseUrl`.
 * @throws {Error} When no database URL is configured.
 */
function requireDatabaseUrl() {
  const url = resolveDatabaseUrl();
  if (url) {
    return url;
  }
  throw new Error("KB_DATABASE_URL is not set");
}
|
|
278
|
+
|
|
279
|
+
/**
 * Verify that the given token may publish to the given GitHub repository.
 *
 * Fetches `/repos/{repo}` from the API and inspects the `permissions` object
 * in the response. When the response carries no `permissions` object the
 * check passes.
 *
 * @param {object} options
 * @param {string|null} options.repo - Repository in OWNER/REPO form.
 * @param {string|null} options.token - Token sent as a Bearer credential.
 * @param {string} options.apiBaseUrl - API base URL; one trailing slash is tolerated.
 * @returns {Promise<void>}
 * @throws {Error} When an input is missing, the request fails, or the
 *   reported permissions include none of admin, maintain or push.
 */
async function requirePublishAccess({ repo, token, apiBaseUrl }) {
  if (!repo) {
    throw new Error("KB_GITHUB_REPO or --repo OWNER/REPO is required to publish knowledge");
  }
  if (!token) {
    throw new Error("GITHUB_TOKEN is required to publish knowledge");
  }

  const baseUrl = apiBaseUrl.replace(/\/$/, "");
  const requestHeaders = {
    accept: "application/vnd.github+json",
    authorization: `Bearer ${token}`,
    "user-agent": "@nzpr/kb"
  };
  const response = await fetch(`${baseUrl}/repos/${repo}`, { headers: requestHeaders });

  if (!response.ok) {
    // Best effort: the body is only extra detail for the error message.
    const errorText = await response.text().catch(() => "");
    const detail = errorText ? ` - ${errorText}` : "";
    throw new Error(
      `failed to verify GitHub repo access: ${response.status} ${response.statusText}${detail}`
    );
  }

  const payload = await response.json();
  const permissions = payload?.permissions ?? null;
  const mayWrite =
    !permissions || permissions.admin || permissions.maintain || permissions.push;
  if (!mayWrite) {
    throw new Error(`GITHUB_TOKEN does not have write access to ${repo}`);
  }
}
|
|
312
|
+
|
|
313
|
+
/**
 * Print the GitHub secrets and variables the knowledge repo needs, followed
 * by a note about the publish workflow's authentication.
 *
 * @param {object} configuration - Object with `requiredSecrets`,
 *   `optionalSecrets` and `optionalVariables` iterables; secret entries are
 *   read for `name`/`purpose`, variable entries additionally for `value`.
 */
function printRepoConfiguration(configuration) {
  const describeSecret = (entry) => ` ${entry.name} - ${entry.purpose}`;
  const describeVariable = (entry) => ` ${entry.name}=${entry.value} - ${entry.purpose}`;
  // Entries are read lazily so each list is only touched when its section prints.
  const sections = [
    ["required secrets:", () => configuration.requiredSecrets, describeSecret],
    ["optional secrets:", () => configuration.optionalSecrets, describeSecret],
    ["optional variables:", () => configuration.optionalVariables, describeVariable]
  ];

  console.log("");
  console.log("configure the knowledge repo with these GitHub settings:");
  for (const [heading, readEntries, describe] of sections) {
    console.log("");
    console.log(heading);
    for (const entry of readEntries()) {
      console.log(describe(entry));
    }
  }
  console.log("");
  console.log("publish workflow auth:");
  console.log(" KB_GITHUB_REPO is set automatically to github.repository in the scaffolded workflow");
  console.log(" GITHUB_TOKEN is provided automatically by GitHub Actions and must have contents:write");
}
|
|
336
|
+
|
|
337
|
+
/**
 * Print the "bootstrap status" section: a heading, then the database status
 * and the GitHub status of an init-repo result.
 *
 * @param {object} result - Result object exposing `database` and `github`.
 */
function printInitRepoStatus(result) {
  ["", "bootstrap status:"].forEach((line) => console.log(line));
  printInitRepoDatabaseStatus(result.database);
  printInitRepoGitHubStatus(result.github);
}
|
|
343
|
+
|
|
344
|
+
/**
 * Print a one-line summary of the database preflight.
 *
 * @param {object} database - Status object; `status` selects the line:
 *   "verified" prints database name, current version and applied count,
 *   "failed" prints the formatted error, anything else prints as pending
 *   together with `message`.
 */
function printInitRepoDatabaseStatus(database) {
  switch (database.status) {
    case "verified":
      console.log(
        ` database: verified ${database.database} current=${database.currentVersion} applied=${database.appliedCount}`
      );
      break;
    case "failed":
      console.log(` database: failed - ${formatCliError(new Error(database.error))}`);
      break;
    default:
      console.log(` database: pending - ${database.message}`);
  }
}
|
|
357
|
+
|
|
358
|
+
/**
 * Print a summary of the GitHub configuration step.
 *
 * @param {object} github - Status object. For "configured" the repo and
 *   labels are printed, plus secrets and variables when those lists are
 *   non-empty; for "failed" the error is printed; any other status is printed
 *   verbatim together with `message`.
 */
function printInitRepoGitHubStatus(github) {
  switch (github.status) {
    case "configured": {
      console.log(` github: configured ${github.repo}`);
      console.log(` labels: ${github.labels.join(", ")}`);
      if (github.secrets.length) {
        console.log(` secrets: ${github.secrets.join(", ")}`);
      }
      if (github.variables.length) {
        console.log(` variables: ${github.variables.join(", ")}`);
      }
      break;
    }
    case "failed":
      console.log(` github: failed - ${github.error}`);
      break;
    default:
      console.log(` github: ${github.status} - ${github.message}`);
  }
}
|
|
376
|
+
|
|
377
|
+
/**
 * Print a single "next:" hint chosen from the outcome of init-repo.
 *
 * @param {object} result - Result object; `ok`, `database.status` and
 *   `github.status` decide which hint is shown.
 */
function printInitRepoNextStep(result) {
  let nextStep;
  if (!result.ok) {
    nextStep =
      "next: fix the failed bootstrap step and rerun kb init-repo; rerunning is safe because the scaffold is idempotent";
  } else if (result.database.status === "verified" && result.github.status === "configured") {
    nextStep =
      "next: commit the scaffold in the knowledge repo, push it, and let that repo own KB publishing";
  } else {
    nextStep =
      "next: rerun kb init-repo with the missing repo or database inputs when you are ready, or commit the scaffold now and finish remote setup later";
  }
  console.log(nextStep);
}
|
package/lib/config.js
ADDED
|
@@ -0,0 +1,109 @@
|
|
|
1
|
+
import fs from "node:fs";
|
|
2
|
+
import path from "node:path";
|
|
3
|
+
/**
 * Embedding profile used when KB_EMBEDDING_MODE is unset or "local-hash".
 * Frozen so callers must copy it before changing any field.
 */
const LOCAL_HASH_PROFILE = Object.freeze({
  mode: "local-hash",
  // No model name, endpoint or API key applies to this profile.
  model: null,
  dimensions: 256,
  apiUrl: null,
  apiKey: null,
  timeoutMs: 0,
});
|
|
11
|
+
|
|
12
|
+
/**
 * Walk upward from `start` looking for a knowledge root.
 *
 * At each directory, in order: `kb/docs` yields `<dir>/kb`,
 * `knowledge/docs` yields `<dir>/knowledge`, and a directory holding both
 * `docs` and `package.json` yields the directory itself.
 *
 * @param {string} [start=process.cwd()] - Directory to start searching from.
 * @returns {string|null} The resolved root, or null once the filesystem root
 *   has been checked without a match.
 */
export function tryResolveKnowledgeRoot(start = process.cwd()) {
  const exists = (...segments) => fs.existsSync(path.join(...segments));
  let dir = path.resolve(start);
  let previous = null;
  // path.dirname() returns its input at the filesystem root, which ends the walk.
  while (dir !== previous) {
    if (exists(dir, "kb", "docs")) {
      return path.join(dir, "kb");
    }
    if (exists(dir, "knowledge", "docs")) {
      return path.join(dir, "knowledge");
    }
    if (exists(dir, "docs") && exists(dir, "package.json")) {
      return dir;
    }
    previous = dir;
    dir = path.dirname(dir);
  }
  return null;
}
|
|
34
|
+
|
|
35
|
+
/**
 * Determine the directory that holds the documents.
 *
 * Precedence: an explicit `docsRoot`, then `<knowledgeRoot>/docs`, then an
 * existing `<cwd>/kb/docs`, then an existing `<cwd>/docs`.
 *
 * @param {object} [options]
 * @param {string|null} [options.docsRoot] - Explicit path; resolved to absolute.
 * @param {string|null} [options.knowledgeRoot] - Knowledge root containing `docs`.
 * @param {string} [options.cwd=process.cwd()] - Base for the fallback lookups.
 * @returns {string|null} The docs directory, or null when nothing matches.
 */
export function resolveDocsRoot({ docsRoot = null, knowledgeRoot = null, cwd = process.cwd() } = {}) {
  if (docsRoot) {
    return path.resolve(docsRoot);
  }
  if (knowledgeRoot) {
    return path.join(knowledgeRoot, "docs");
  }
  const fallbacks = [path.resolve(cwd, "kb", "docs"), path.resolve(cwd, "docs")];
  return fallbacks.find((candidate) => fs.existsSync(candidate)) ?? null;
}
|
|
52
|
+
|
|
53
|
+
/**
 * Read the database connection URL from the environment.
 *
 * @param {object} [env=process.env] - Environment to read from; injectable
 *   like the other resolvers in this module.
 * @returns {string|null} The value of KB_DATABASE_URL, or null when unset.
 */
export function resolveDatabaseUrl(env = process.env) {
  return env.KB_DATABASE_URL ?? null;
}
|
|
56
|
+
|
|
57
|
+
/**
 * Pick the GitHub repository from the environment.
 *
 * @param {object} [env=process.env] - Environment to read from.
 * @returns {string|null} KB_GITHUB_REPO when set, otherwise
 *   GITHUB_REPOSITORY, otherwise null.
 */
export function resolveGitHubRepository(env = process.env) {
  const explicitRepo = env.KB_GITHUB_REPO;
  if (explicitRepo !== undefined && explicitRepo !== null) {
    return explicitRepo;
  }
  return env.GITHUB_REPOSITORY ?? null;
}
|
|
60
|
+
|
|
61
|
+
/**
 * Resolve the base URL of the GitHub REST API.
 *
 * @param {object} [env=process.env] - Environment to read from.
 * @returns {string} GITHUB_API_URL when set, otherwise "https://api.github.com".
 */
export function resolveGitHubApiBaseUrl(env = process.env) {
  const { GITHUB_API_URL: configuredUrl } = env;
  return configuredUrl ?? "https://api.github.com";
}
|
|
64
|
+
|
|
65
|
+
/**
 * Build the embedding profile described by the environment.
 *
 * KB_EMBEDDING_MODE defaults to the local-hash mode, for which a copy of
 * LOCAL_HASH_PROFILE is returned. The only other accepted mode (after
 * normalization) is "bge-m3-openai", which requires KB_EMBEDDING_API_URL.
 *
 * @param {object} [env=process.env] - Environment to read from.
 * @returns {{mode: string, model: string|null, dimensions: number,
 *   apiUrl: string|null, apiKey: string|null, timeoutMs: number}}
 * @throws {Error} On an unsupported mode, a missing API URL, or a
 *   KB_EMBEDDING_TIMEOUT_MS that is not a positive finite number.
 */
export function resolveEmbeddingProfile(env = process.env) {
  const requestedMode = env.KB_EMBEDDING_MODE ?? LOCAL_HASH_PROFILE.mode;
  const mode = normalizeEmbeddingMode(requestedMode);

  switch (mode) {
    case LOCAL_HASH_PROFILE.mode:
      // Hand out a copy; the shared profile object is frozen.
      return { ...LOCAL_HASH_PROFILE };
    case "bge-m3-openai":
      break;
    default:
      throw new Error(`unsupported KB_EMBEDDING_MODE: ${mode}`);
  }

  const rawApiUrl = env.KB_EMBEDDING_API_URL ?? null;
  if (!rawApiUrl) {
    throw new Error("KB_EMBEDDING_API_URL is required when KB_EMBEDDING_MODE=bge-m3-openai");
  }

  const timeoutMs = Number(env.KB_EMBEDDING_TIMEOUT_MS ?? 30000);
  const timeoutIsUsable = Number.isFinite(timeoutMs) && timeoutMs > 0;
  if (!timeoutIsUsable) {
    throw new Error("KB_EMBEDDING_TIMEOUT_MS must be a positive number");
  }

  const model = env.KB_EMBEDDING_MODEL ?? "BAAI/bge-m3";
  const apiUrl = normalizeEmbeddingApiUrl(rawApiUrl);
  const apiKey = env.KB_EMBEDDING_API_KEY ?? null;
  return { mode, model, dimensions: 1024, apiUrl, apiKey, timeoutMs };
}
|
|
93
|
+
|
|
94
|
+
/**
 * Map the "openai" alias onto the canonical "bge-m3-openai" mode name.
 *
 * @param {string} mode - Mode name as configured.
 * @returns {string} The canonical name; any other value is returned unchanged.
 */
function normalizeEmbeddingMode(mode) {
  return mode === "openai" ? "bge-m3-openai" : mode;
}
|
|
100
|
+
|
|
101
|
+
/**
 * Expand a configured embedding API URL to the embeddings endpoint.
 *
 * A URL with no path gets `/v1/embeddings`; a URL whose path ends in `/v1`
 * gets `/embeddings` appended. Trailing slashes are ignored when making that
 * decision, so `https://host/v1/` is treated like `https://host/v1`. Any
 * other path is left as given.
 *
 * @param {string} value - URL as configured.
 * @returns {string} The serialized, normalized URL.
 * @throws {TypeError} When `value` is not a valid absolute URL.
 */
function normalizeEmbeddingApiUrl(value) {
  const url = new URL(value);
  const trimmedPath = url.pathname.replace(/\/+$/, "");
  if (trimmedPath === "") {
    url.pathname = "/v1/embeddings";
  } else if (trimmedPath.endsWith("/v1")) {
    url.pathname = `${trimmedPath}/embeddings`;
  }
  return url.toString();
}
|
package/lib/db.js
ADDED
|
@@ -0,0 +1,81 @@
|
|
|
1
|
+
import { Client } from "pg";
|
|
2
|
+
import { migrateDatabase, migrationStatus } from "./migrations.js";
|
|
3
|
+
|
|
4
|
+
/**
 * Open a database connection.
 *
 * The connection timeout is read from KB_DB_CONNECT_TIMEOUT_MS (milliseconds,
 * default 5000) and validated before use, so a malformed value is reported
 * instead of being passed to the driver as NaN.
 *
 * @param {string} databaseUrl - Connection string passed to the client.
 * @returns {Promise<Client>} A connected client; the caller must end it.
 * @throws {Error} When KB_DB_CONNECT_TIMEOUT_MS is not a non-negative finite
 *   number, or when the connection attempt fails.
 */
export async function connect(databaseUrl) {
  const connectionTimeoutMillis = Number(process.env.KB_DB_CONNECT_TIMEOUT_MS ?? 5000);
  if (!Number.isFinite(connectionTimeoutMillis) || connectionTimeoutMillis < 0) {
    throw new Error("KB_DB_CONNECT_TIMEOUT_MS must be a non-negative number");
  }
  const client = new Client({
    connectionString: databaseUrl,
    connectionTimeoutMillis
  });
  await client.connect();
  return client;
}
|
|
12
|
+
|
|
13
|
+
/**
 * Bring the database schema up to date by delegating to `migrateDatabase`.
 *
 * @param {object} client - Connected database client.
 * @returns {Promise<*>} Whatever `migrateDatabase` resolves to.
 */
export async function initDb(client) {
  const migrationResult = await migrateDatabase(client);
  return migrationResult;
}
|
|
16
|
+
|
|
17
|
+
/**
 * Report the schema migration status by delegating to `migrationStatus`.
 *
 * @param {object} client - Connected database client.
 * @returns {Promise<*>} Whatever `migrationStatus` resolves to.
 */
export async function schemaStatus(client) {
  const status = await migrationStatus(client);
  return status;
}
|
|
20
|
+
|
|
21
|
+
/**
 * Check that the database can be used with the given embedding profile.
 *
 * Fails when migrations are pending. When a profile is supplied and the
 * database has embedding metadata recorded, the stored dimension, mode and
 * model must match the profile.
 *
 * @param {object} client - Connected database client.
 * @param {object|null} embeddingProfile - Profile to compare against; a falsy
 *   value limits the check to the schema version.
 * @returns {Promise<void>}
 * @throws {Error} On pending migrations or on any metadata mismatch.
 */
export async function ensureCompatibility(client, embeddingProfile) {
  const { pendingCount, currentVersion, latestVersion } = await migrationStatus(client);
  if (pendingCount > 0) {
    throw new Error(
      `database schema is behind: current=${currentVersion} latest=${latestVersion}; run kb publish`
    );
  }
  if (!embeddingProfile) {
    return;
  }

  const stored = await readEmbeddingMetadata(client);
  const hasStoredProfile = Boolean(
    stored.embedding_mode || stored.embedding_dim || stored.embedding_model
  );
  // No metadata recorded yet: there is nothing to compare against.
  if (!hasStoredProfile) {
    return;
  }

  const { embedding_dim: storedDim, embedding_mode: storedMode } = stored;
  // Metadata values are stored as text, so the dimension is compared as a string.
  if (storedDim && storedDim !== String(embeddingProfile.dimensions)) {
    throw new Error(
      `embedding dimension mismatch: database=${storedDim} current=${embeddingProfile.dimensions}`
    );
  }
  if (storedMode && storedMode !== embeddingProfile.mode) {
    throw new Error(
      `embedding mode mismatch: database=${storedMode} current=${embeddingProfile.mode}`
    );
  }

  const storedModel = stored.embedding_model ?? "";
  const currentModel = normalizeEmbeddingModel(embeddingProfile.model);
  if (storedModel !== currentModel) {
    throw new Error(`embedding model mismatch: database=${storedModel} current=${currentModel}`);
  }
}
|
|
53
|
+
|
|
54
|
+
/**
 * Load the embedding-related rows of `kb_metadata`.
 *
 * @param {object} client - Database client exposing `query`.
 * @returns {Promise<Object<string, string>>} Object keyed by metadata key
 *   (`embedding_mode`, `embedding_dim`, `embedding_model`); keys without a
 *   row are absent.
 */
export async function readEmbeddingMetadata(client) {
  const { rows } = await client.query(
    "SELECT key, value FROM kb_metadata WHERE key IN ('embedding_mode', 'embedding_dim', 'embedding_model')"
  );
  const pairs = rows.map(({ key, value }) => [key, value]);
  return Object.fromEntries(pairs);
}
|
|
60
|
+
|
|
61
|
+
/**
 * Record the embedding profile in `kb_metadata`.
 *
 * All three keys are upserted by one statement, so the stored mode, dimension
 * and model are updated together rather than row by row.
 *
 * @param {object} client - Database client exposing `query`.
 * @param {object} embeddingProfile - Profile providing `mode`, `dimensions`
 *   and `model`.
 * @returns {Promise<void>}
 */
export async function writeEmbeddingMetadata(client, embeddingProfile) {
  const entries = [
    ["embedding_mode", embeddingProfile.mode],
    // Metadata values are stored as text.
    ["embedding_dim", String(embeddingProfile.dimensions)],
    ["embedding_model", normalizeEmbeddingModel(embeddingProfile.model)]
  ];
  await client.query(
    `
      INSERT INTO kb_metadata (key, value)
      VALUES ($1, $2), ($3, $4), ($5, $6)
      ON CONFLICT (key) DO UPDATE SET value = EXCLUDED.value
    `,
    entries.flat()
  );
}
|
|
78
|
+
|
|
79
|
+
/**
 * Convert a missing model name to the empty string.
 *
 * @param {string|null|undefined} model - Model name from a profile.
 * @returns {string} `model`, or "" when it is null or undefined.
 */
function normalizeEmbeddingModel(model) {
  if (model === null || model === undefined) {
    return "";
  }
  return model;
}
|
|
@@ -0,0 +1,94 @@
|
|
|
1
|
+
// Matches runs of ASCII letters, digits and underscores; used to split text into tokens.
const TOKEN_RE = /[a-zA-Z0-9_]+/g;
|
|
2
|
+
// Length of the vectors produced by the local hashing embedder.
export const EMBEDDING_DIM = 256;
|
|
3
|
+
|
|
4
|
+
/**
 * Embed a piece of text with the backend selected by the profile.
 *
 * @param {string} text - Text to embed.
 * @param {object|null} [embeddingProfile=null] - Profile whose `mode` selects
 *   the backend; a missing profile selects the local embedder.
 * @returns {Promise<number[]>} The embedding vector.
 * @throws {Error} When the profile names a mode other than "local-hash" or
 *   "bge-m3-openai".
 */
export async function embedText(text, embeddingProfile = null) {
  if (!embeddingProfile) {
    return embedTextLocal(text);
  }
  switch (embeddingProfile.mode) {
    case "local-hash":
      return embedTextLocal(text);
    case "bge-m3-openai":
      return embedTextRemote(text, embeddingProfile);
    default:
      throw new Error(`unsupported embedding mode: ${embeddingProfile.mode}`);
  }
}
|
|
13
|
+
|
|
14
|
+
/**
 * Embed text locally by hashing its tokens into a fixed-size vector.
 *
 * Every lower-cased token adds one to the bucket chosen by its hash; the
 * vector is then scaled to unit length. Text without tokens yields the
 * all-zero vector.
 *
 * @param {string} text - Text to embed.
 * @returns {number[]} Vector of length EMBEDDING_DIM.
 */
export function embedTextLocal(text) {
  const buckets = new Array(EMBEDDING_DIM).fill(0);
  for (const rawToken of text.match(TOKEN_RE) ?? []) {
    const slot = hashToken(rawToken.toLowerCase()) % EMBEDDING_DIM;
    buckets[slot] += 1;
  }

  let squaredSum = 0;
  for (const weight of buckets) {
    squaredSum += weight * weight;
  }
  const norm = Math.sqrt(squaredSum);
  // A zero norm means no tokens were found; skip the division.
  return norm ? buckets.map((weight) => weight / norm) : buckets;
}
|
|
31
|
+
|
|
32
|
+
/**
 * Format numbers as a bracketed, comma-separated vector literal.
 *
 * @param {Array<number|string>} values - Components; each is coerced with
 *   Number() and written with eight decimal places.
 * @returns {string} For example "[1.00000000,0.50000000]".
 */
export function vectorLiteral(values) {
  const components = values.map((value) => Number(value).toFixed(8));
  return "[" + components.join(",") + "]";
}
|
|
35
|
+
|
|
36
|
+
/**
 * Fraction of the query's distinct tokens that also occur in the text.
 *
 * Tokens are compared case-insensitively.
 *
 * @param {string} query - Query string.
 * @param {string} text - Text to compare against.
 * @returns {number} Value between 0 and 1; 0 when the query has no tokens.
 */
export function tokenOverlap(query, text) {
  const toTokenSet = (input) =>
    new Set((input.match(TOKEN_RE) ?? []).map((token) => token.toLowerCase()));
  const wanted = toTokenSet(query);
  const available = toTokenSet(text);
  if (wanted.size === 0) {
    return 0;
  }
  const shared = [...wanted].filter((token) => available.has(token)).length;
  return shared / wanted.size;
}
|
|
50
|
+
|
|
51
|
+
/**
 * Hash a string to an unsigned 32-bit integer.
 *
 * Starting from 0x811c9dc5, each UTF-16 code unit is XORed into the state,
 * which is then multiplied by 0x01000193 using 32-bit arithmetic.
 *
 * @param {string} input - String to hash.
 * @returns {number} Integer in the range 0 to 2**32 - 1.
 */
function hashToken(input) {
  let state = 0x811c9dc5;
  for (let position = 0; position < input.length; position += 1) {
    state = Math.imul(state ^ input.charCodeAt(position), 0x01000193);
  }
  // The unsigned shift reinterprets the 32-bit state as a non-negative number.
  return state >>> 0;
}
|
|
59
|
+
|
|
60
|
+
/**
 * Request an embedding from the HTTP endpoint named in the profile.
 *
 * Posts `{ input, model }` as JSON and reads the vector from
 * `data[0].embedding` in the response.
 *
 * @param {string} text - Text to embed.
 * @param {object} embeddingProfile - Provides `apiUrl`, `apiKey` (optional),
 *   `model`, `dimensions` and `timeoutMs`.
 * @returns {Promise<number[]>} The embedding with every component coerced to
 *   a number.
 * @throws {Error} On a non-2xx response, a response without an embedding
 *   array, or an embedding whose length differs from `dimensions`.
 */
async function embedTextRemote(text, embeddingProfile) {
  const { apiUrl, apiKey, model, dimensions, timeoutMs } = embeddingProfile;

  const headers = { "content-type": "application/json" };
  if (apiKey) {
    headers.authorization = `Bearer ${apiKey}`;
  }
  const body = JSON.stringify({ input: text, model });
  const response = await fetch(apiUrl, {
    method: "POST",
    headers,
    body,
    signal: AbortSignal.timeout(timeoutMs)
  });

  if (!response.ok) {
    // Best effort: the body is only extra detail for the error message.
    const responseBody = await response.text().catch(() => "");
    const detail = responseBody ? ` - ${responseBody}` : "";
    throw new Error(`embedding request failed: ${response.status} ${response.statusText}${detail}`);
  }

  const payload = await response.json();
  const embedding = payload?.data?.[0]?.embedding;
  if (!Array.isArray(embedding)) {
    throw new Error("embedding response did not include data[0].embedding");
  }
  if (embedding.length !== dimensions) {
    throw new Error(
      `embedding dimension mismatch from provider: expected=${dimensions} actual=${embedding.length}`
    );
  }
  return embedding.map((component) => Number(component));
}
|