ralph-hero-knowledge-index 0.1.21 → 0.1.23
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.claude-plugin/plugin.json +1 -1
- package/.mcp.json +1 -1
- package/README.md +109 -0
- package/dist/config.d.ts +32 -0
- package/dist/config.js +75 -0
- package/dist/config.js.map +1 -0
- package/dist/db.d.ts +7 -0
- package/dist/db.js +17 -0
- package/dist/db.js.map +1 -1
- package/dist/file-scanner.d.ts +13 -1
- package/dist/file-scanner.js +30 -3
- package/dist/file-scanner.js.map +1 -1
- package/dist/hybrid-search.d.ts +12 -0
- package/dist/hybrid-search.js +74 -5
- package/dist/hybrid-search.js.map +1 -1
- package/dist/ignore.d.ts +29 -0
- package/dist/ignore.js +65 -0
- package/dist/ignore.js.map +1 -0
- package/dist/index.d.ts +9 -1
- package/dist/index.js +166 -6
- package/dist/index.js.map +1 -1
- package/dist/llm-client.d.ts +41 -0
- package/dist/llm-client.js +98 -0
- package/dist/llm-client.js.map +1 -0
- package/dist/reindex.d.ts +22 -3
- package/dist/reindex.js +60 -8
- package/dist/reindex.js.map +1 -1
- package/dist/search.d.ts +12 -0
- package/dist/search.js +15 -1
- package/dist/search.js.map +1 -1
- package/package.json +2 -1
- package/src/__tests__/config.test.ts +173 -0
- package/src/__tests__/file-scanner.test.ts +88 -0
- package/src/__tests__/hybrid-search.test.ts +107 -0
- package/src/__tests__/ignore.test.ts +86 -0
- package/src/__tests__/index.test.ts +450 -0
- package/src/__tests__/llm-client.test.ts +349 -0
- package/src/__tests__/memory-stats.test.ts +204 -0
- package/src/__tests__/reindex.test.ts +148 -2
- package/src/__tests__/search.test.ts +37 -0
- package/src/config.ts +105 -0
- package/src/db.ts +17 -0
- package/src/file-scanner.ts +28 -3
- package/src/hybrid-search.ts +88 -5
- package/src/ignore.ts +82 -0
- package/src/index.ts +202 -7
- package/src/llm-client.ts +136 -0
- package/src/reindex.ts +80 -9
- package/src/search.ts +27 -1
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
import { describe, it, expect, vi, beforeEach } from "vitest";
|
|
1
|
+
import { describe, it, expect, vi, beforeEach, afterEach } from "vitest";
|
|
2
2
|
import { mkdtempSync, writeFileSync, mkdirSync, unlinkSync, utimesSync } from "node:fs";
|
|
3
3
|
import { join, resolve } from "node:path";
|
|
4
4
|
import { tmpdir } from "node:os";
|
|
@@ -15,7 +15,7 @@ vi.mock("../embedder.js", () => ({
|
|
|
15
15
|
}));
|
|
16
16
|
|
|
17
17
|
import { embed } from "../embedder.js";
|
|
18
|
-
import { reindex } from "../reindex.js";
|
|
18
|
+
import { reindex, resolveDirs } from "../reindex.js";
|
|
19
19
|
import { KnowledgeDB } from "../db.js";
|
|
20
20
|
|
|
21
21
|
const mockedEmbed = vi.mocked(embed);
|
|
@@ -353,4 +353,150 @@ describe("incremental reindex", () => {
|
|
|
353
353
|
expect(results.some(r => r.id === "fresh-doc")).toBe(true);
|
|
354
354
|
db1.close();
|
|
355
355
|
});
|
|
356
|
+
|
|
357
|
+
it("scenario 13: reindex honors .ralphignore for file discovery", async () => {
|
|
358
|
+
writeFileSync(join(dir, "kept.md"), makeDoc("Kept"));
|
|
359
|
+
writeFileSync(join(dir, "skipped.md"), makeDoc("Skipped"));
|
|
360
|
+
writeFileSync(join(dir, ".ralphignore"), "skipped.md\n");
|
|
361
|
+
|
|
362
|
+
await reindex([dir], dbPath);
|
|
363
|
+
expect(mockedEmbed).toHaveBeenCalledTimes(1);
|
|
364
|
+
|
|
365
|
+
const db = new KnowledgeDB(dbPath);
|
|
366
|
+
expect(db.getDocument("kept")).toBeTruthy();
|
|
367
|
+
expect(db.getDocument("skipped")).toBeUndefined();
|
|
368
|
+
db.close();
|
|
369
|
+
});
|
|
370
|
+
|
|
371
|
+
it("scenario 14: reindex honors caller-supplied ignorePatterns arg", async () => {
|
|
372
|
+
writeFileSync(join(dir, "kept.md"), makeDoc("Kept"));
|
|
373
|
+
mkdirSync(join(dir, "drafts"));
|
|
374
|
+
writeFileSync(join(dir, "drafts", "wip.md"), makeDoc("WIP"));
|
|
375
|
+
|
|
376
|
+
await reindex([dir], dbPath, false, ["drafts/**"]);
|
|
377
|
+
// Only kept.md should have been embedded.
|
|
378
|
+
expect(mockedEmbed).toHaveBeenCalledTimes(1);
|
|
379
|
+
|
|
380
|
+
const db = new KnowledgeDB(dbPath);
|
|
381
|
+
expect(db.getDocument("kept")).toBeTruthy();
|
|
382
|
+
expect(db.getDocument("wip")).toBeUndefined();
|
|
383
|
+
db.close();
|
|
384
|
+
});
|
|
385
|
+
});
|
|
386
|
+
|
|
387
|
+
// Verifies the precedence resolveDirs() applies when choosing index roots and
// the database path: CLI positional args, then environment variables, then
// the JSON config file, then the built-in fallback.
describe("resolveDirs precedence", () => {
  const ORIGINAL_ARGV = process.argv;
  const ORIGINAL_ENV = {
    RALPH_KNOWLEDGE_DIRS: process.env.RALPH_KNOWLEDGE_DIRS,
    RALPH_KNOWLEDGE_DB: process.env.RALPH_KNOWLEDGE_DB,
    RALPH_KNOWLEDGE_CONFIG: process.env.RALPH_KNOWLEDGE_CONFIG,
  };
  let configDir: string;

  // Path of the per-test config file; RALPH_KNOWLEDGE_CONFIG points here.
  const configFile = (): string => join(configDir, "knowledge.config.json");

  // Serialize `config` into the per-test config file.
  const writeConfig = (config: Record<string, unknown>): void => {
    writeFileSync(configFile(), JSON.stringify(config));
  };

  beforeEach(() => {
    // Start every test from a clean slate: no CLI args, no env overrides,
    // and a config path inside a fresh temp dir (file not yet written).
    process.argv = ["node", "reindex.js"];
    delete process.env.RALPH_KNOWLEDGE_DIRS;
    delete process.env.RALPH_KNOWLEDGE_DB;
    configDir = mkdtempSync(join(tmpdir(), "resolve-dirs-"));
    process.env.RALPH_KNOWLEDGE_CONFIG = configFile();
  });

  afterEach(() => {
    process.argv = ORIGINAL_ARGV;
    // Restore each captured env var, deleting those that were unset originally
    // (assigning `undefined` would store the string "undefined").
    for (const key of Object.keys(ORIGINAL_ENV) as (keyof typeof ORIGINAL_ENV)[]) {
      const orig = ORIGINAL_ENV[key];
      if (orig === undefined) {
        delete process.env[key];
      } else {
        process.env[key] = orig;
      }
    }
  });

  it("CLI positional args beat env var even when both are set", () => {
    writeConfig({ roots: ["/from/config"] });
    process.argv = ["node", "reindex.js", "/from/cli"];
    process.env.RALPH_KNOWLEDGE_DIRS = "/from/env";
    const r = resolveDirs();
    expect(r.source).toBe("cli");
    expect(r.dirs).toEqual(["/from/cli"]);
  });

  it("env var beats config file roots when CLI is empty", () => {
    writeConfig({ roots: ["/from/config"] });
    process.env.RALPH_KNOWLEDGE_DIRS = "/from/env-a,/from/env-b";
    const r = resolveDirs();
    expect(r.source).toBe("env");
    expect(r.dirs).toEqual(["/from/env-a", "/from/env-b"]);
  });

  it("config file roots beat fallback when CLI and env are absent", () => {
    writeConfig({ roots: ["/from/config-a", "/from/config-b"] });
    const r = resolveDirs();
    expect(r.source).toBe("config");
    expect(r.dirs).toEqual(["/from/config-a", "/from/config-b"]);
  });

  it("falls back to ../../thoughts when no source is configured", () => {
    // Point env var at a nonexistent config path so loadConfig returns {}.
    process.env.RALPH_KNOWLEDGE_CONFIG = join(configDir, "missing.json");
    const r = resolveDirs();
    expect(r.source).toBe("fallback");
    expect(r.dirs).toEqual(["../../thoughts"]);
  });

  // NOTE(review): the title lists a "default" arm, but only the CLI, env and
  // config arms are asserted below.
  it("dbPath precedence: CLI arg > env var > config > default", () => {
    writeConfig({ roots: ["/x"], dbPath: "/from/config.db" });

    // CLI wins
    process.argv = ["node", "reindex.js", "/cli/root", "/cli/override.db"];
    process.env.RALPH_KNOWLEDGE_DB = "/from/env.db";
    expect(resolveDirs().dbPath).toBe("/cli/override.db");

    // Env wins over config when CLI is absent
    process.argv = ["node", "reindex.js"];
    process.env.RALPH_KNOWLEDGE_DIRS = "/env/root";
    process.env.RALPH_KNOWLEDGE_DB = "/from/env.db";
    expect(resolveDirs().dbPath).toBe("/from/env.db");

    // Config wins when neither CLI nor env set dbPath
    delete process.env.RALPH_KNOWLEDGE_DB;
    expect(resolveDirs().dbPath).toBe("/from/config.db");
  });

  it("forwards config.ignorePatterns on the returned config object", () => {
    writeConfig({
      roots: ["/r1"],
      ignorePatterns: ["draft/**", "*.bak"],
    });
    const r = resolveDirs();
    expect(r.config.ignorePatterns).toEqual(["draft/**", "*.bak"]);
  });

  it("treats an empty RALPH_KNOWLEDGE_DIRS as unset and falls through", () => {
    writeConfig({ roots: ["/from/config"] });
    // Only separators and whitespace: no usable directory entries.
    process.env.RALPH_KNOWLEDGE_DIRS = " , ";
    const r = resolveDirs();
    expect(r.source).toBe("config");
    expect(r.dirs).toEqual(["/from/config"]);
  });
});
|
|
@@ -205,6 +205,43 @@ describe("FtsSearch", () => {
|
|
|
205
205
|
});
|
|
206
206
|
});
|
|
207
207
|
|
|
208
|
+
// Exercises the optional `memoryTier` option accepted by `fts.search`.
describe("memory_tier filter", () => {
  it("filters by memory_tier when schema has the column", () => {
    db.db
      .prepare("UPDATE documents SET memory_tier = ? WHERE id = ?")
      .run("reflection", "auth-doc");
    fts.rebuildIndex();

    // auth-doc is "reflection", so search for terms in auth-doc should hit.
    const reflectionHits = fts.search("authentication", { memoryTier: "reflection" });
    const ids = reflectionHits.map((r) => r.id);
    expect(ids).toContain("auth-doc");

    // A "doc" filter should omit the reflection-tagged doc.
    const docHits = fts.search("authentication", { memoryTier: "doc" });
    expect(docHits.some((r) => r.id === "auth-doc")).toBe(false);
  });

  it("ignores memory_tier silently when column is absent (v2 schema)", () => {
    // NOTE(review): the sibling tests in this describe run
    // `UPDATE documents SET memory_tier = ...` against the same fixture, which
    // suggests the column DOES exist here. Confirm that this test really
    // exercises the column-absent (v2) path; as written it only checks that
    // the call does not throw and returns an array.
    const results = fts.search("cache", { memoryTier: "reflection" });
    expect(Array.isArray(results)).toBe(true);
  });

  it("returns all tiers when memoryTier='any'", () => {
    db.db
      .prepare("UPDATE documents SET memory_tier = ? WHERE id = ?")
      .run("reflection", "auth-doc");
    fts.rebuildIndex();

    // "any" must not exclude the reflection-tagged doc...
    const authHits = fts.search("authentication", { memoryTier: "any" });
    expect(authHits.some((r) => r.id === "auth-doc")).toBe(true);
    // ...nor the doc whose tier was left untouched.
    const cacheHits = fts.search("cache", { memoryTier: "any" });
    expect(cacheHits.some((r) => r.id === "cache-doc")).toBe(true);
  });
});
|
|
244
|
+
|
|
208
245
|
describe("ensureTable", () => {
|
|
209
246
|
it("creates FTS table if it does not exist", () => {
|
|
210
247
|
// Create a fresh DB without FTS table
|
package/src/config.ts
ADDED
|
@@ -0,0 +1,105 @@
|
|
|
1
|
+
import { readFileSync, existsSync } from "node:fs";
|
|
2
|
+
import { join } from "node:path";
|
|
3
|
+
import { homedir } from "node:os";
|
|
4
|
+
|
|
5
|
+
/**
 * Shape of the optional `~/.ralph/knowledge.config.json` file.
 *
 * All fields are optional. `loadConfig` copies only the fields declared here
 * into its result, so unrecognized keys in the file are ignored (not
 * preserved and not rejected); this keeps the file format forward-compatible.
 */
export interface KnowledgeConfig {
  /** Directories to index. A leading `~` is expanded by `loadConfig`. */
  roots?: string[];
  /** Extra gitignore-syntax patterns layered on top of per-root `.ralphignore`. */
  ignorePatterns?: string[];
  /** Override for the SQLite database path. A leading `~` is expanded by `loadConfig`. */
  dbPath?: string;
}
|
|
20
|
+
|
|
21
|
+
/**
 * Replace a leading home-directory shorthand with the current user's home.
 *
 * - `"~"` alone becomes the home directory.
 * - `"~/rest"` or `"~\rest"` becomes `<home>/rest` (joined via `path.join`).
 * - Anything else, including the empty string and `~user/...` forms, is
 *   returned unchanged.
 */
export function expandHome(p: string): string {
  // Falsy input and anything not starting with a tilde passes straight through.
  if (!p || p.charAt(0) !== "~") return p;
  if (p.length === 1) return homedir();

  // Only a tilde immediately followed by a path separator is expanded.
  const separator = p.charAt(1);
  const isHomeRelative = separator === "/" || separator === "\\";
  return isHomeRelative ? join(homedir(), p.slice(2)) : p;
}
|
|
33
|
+
|
|
34
|
+
/**
 * Resolve the knowledge config file path. Precedence:
 *   1. `process.env.RALPH_KNOWLEDGE_CONFIG` (surrounding whitespace trimmed,
 *      leading `~` expanded); ignored when empty or whitespace-only.
 *   2. `~/.ralph/knowledge.config.json`
 *
 * @returns the path to read; the file is not required to exist.
 */
export function resolveConfigPath(): string {
  // Trim once and use the trimmed value for both the emptiness check and the
  // returned path, so a padded env value does not yield a path with stray
  // leading/trailing whitespace.
  const envPath = process.env.RALPH_KNOWLEDGE_CONFIG?.trim();
  if (envPath) {
    return expandHome(envPath);
  }
  return join(homedir(), ".ralph", "knowledge.config.json");
}
|
|
46
|
+
|
|
47
|
+
/** Best-effort message for a caught value that may not be an `Error`. */
function describeConfigError(e: unknown): string {
  return e instanceof Error ? e.message : String(e);
}

/**
 * Keep only the non-empty string entries of `value`.
 * Returns `undefined` when `value` is not an array at all.
 */
function nonEmptyStrings(value: unknown): string[] | undefined {
  if (!Array.isArray(value)) return undefined;
  return value.filter(
    (entry): entry is string => typeof entry === "string" && entry.length > 0,
  );
}

/**
 * Load the optional `knowledge.config.json` file.
 *
 * Returns an empty object when the file is missing. Also returns an empty
 * object, after logging a `console.warn`, when the file cannot be read,
 * is not valid JSON, or its top-level value is not a plain JSON object.
 *
 * Only `roots`, `ignorePatterns` and `dbPath` are copied to the result;
 * non-string or empty entries are dropped. A leading `~` in `roots` entries
 * and in `dbPath` is expanded via `expandHome`. Other paths are passed through
 * exactly as written; relative paths are NOT resolved to absolute ones.
 */
export function loadConfig(): KnowledgeConfig {
  const configPath = resolveConfigPath();
  if (!existsSync(configPath)) {
    return {};
  }

  let raw: string;
  try {
    raw = readFileSync(configPath, "utf-8");
  } catch (e) {
    console.warn(
      `Failed to read knowledge config at ${configPath}: ${describeConfigError(e)}`,
    );
    return {};
  }

  let parsed: unknown;
  try {
    parsed = JSON.parse(raw);
  } catch (e) {
    console.warn(
      `Malformed JSON in knowledge config at ${configPath}: ${describeConfigError(e)}`,
    );
    return {};
  }

  // JSON allows any top-level value; only a non-null, non-array object is usable.
  if (!parsed || typeof parsed !== "object" || Array.isArray(parsed)) {
    console.warn(
      `Knowledge config at ${configPath} is not a JSON object; ignoring.`,
    );
    return {};
  }

  const obj = parsed as Record<string, unknown>;
  const out: KnowledgeConfig = {};

  const roots = nonEmptyStrings(obj.roots);
  if (roots) {
    out.roots = roots.map((root) => expandHome(root));
  }

  // Ignore patterns are gitignore-style globs, not paths, so no `~` expansion.
  const ignorePatterns = nonEmptyStrings(obj.ignorePatterns);
  if (ignorePatterns) {
    out.ignorePatterns = ignorePatterns;
  }

  if (typeof obj.dbPath === "string" && obj.dbPath.length > 0) {
    out.dbPath = expandHome(obj.dbPath);
  }

  return out;
}
|
package/src/db.ts
CHANGED
|
@@ -468,6 +468,23 @@ export class KnowledgeDB {
|
|
|
468
468
|
return row !== undefined;
|
|
469
469
|
}
|
|
470
470
|
|
|
471
|
+
/**
 * Returns the `memory_tier` stored for the given document id.
 *
 * Returns `undefined` when the `documents` table has no `memory_tier`
 * column, when no row matches `id`, or when the stored value is SQL NULL.
 * Callers such as the hybrid-search tier post-filter use this to decide
 * whether a result belongs to the requested tier.
 *
 * @param id document id to look up
 * @returns the tier string, or `undefined` if unavailable
 */
getMemoryTier(id: string): string | undefined {
  // Probe the live schema first so the SELECT below cannot fail with
  // "no such column" on databases that lack `memory_tier`.
  const columns = this.db
    .prepare("PRAGMA table_info(documents)")
    .all() as Array<{ name: string }>;
  if (!columns.some((c) => c.name === "memory_tier")) return undefined;

  // NOTE(review): the column's NOT NULL-ness is not visible from here, so the
  // row type allows null and it is normalized to undefined to honor the
  // declared return type.
  const row = this.db
    .prepare("SELECT memory_tier AS memoryTier FROM documents WHERE id = ?")
    .get(id) as { memoryTier: string | null } | undefined;
  return row?.memoryTier ?? undefined;
}
|
|
487
|
+
|
|
471
488
|
deleteDocument(id: string): void {
|
|
472
489
|
this.db.prepare("DELETE FROM documents WHERE id = ?").run(id);
|
|
473
490
|
}
|
package/src/file-scanner.ts
CHANGED
|
@@ -1,14 +1,39 @@
|
|
|
1
1
|
import { readdirSync } from "node:fs";
|
|
2
|
-
import { join } from "node:path";
|
|
2
|
+
import { join, relative } from "node:path";
|
|
3
|
+
import type { IgnoreMatcher } from "./ignore.js";
|
|
3
4
|
|
|
4
|
-
|
|
5
|
+
/**
|
|
6
|
+
* Recursively find all `.md` files under `dir`.
|
|
7
|
+
*
|
|
8
|
+
* Directory names beginning with `.` or `_` and file names beginning with `_`
|
|
9
|
+
* are always skipped (fast-path). When an {@link IgnoreMatcher} is supplied,
|
|
10
|
+
* each remaining path is additionally tested against it via its root-relative
|
|
11
|
+
* form; matches are skipped.
|
|
12
|
+
*
|
|
13
|
+
* @param dir root directory to walk
|
|
14
|
+
* @param matcher optional matcher built via `loadIgnoreForRoot(dir, …)`
|
|
15
|
+
*/
|
|
16
|
+
export function findMarkdownFiles(dir: string, matcher?: IgnoreMatcher): string[] {
|
|
5
17
|
const results: string[] = [];
|
|
6
18
|
function walk(d: string) {
|
|
7
19
|
for (const entry of readdirSync(d, { withFileTypes: true })) {
|
|
8
20
|
const fullPath = join(d, entry.name);
|
|
9
|
-
if (entry.isDirectory()
|
|
21
|
+
if (entry.isDirectory()) {
|
|
22
|
+
// Fast-path: hidden/underscored directories are always skipped.
|
|
23
|
+
if (entry.name.startsWith(".") || entry.name.startsWith("_")) continue;
|
|
24
|
+
if (matcher) {
|
|
25
|
+
// Test both bare and trailing-slash forms so gitignore-style
|
|
26
|
+
// directory-only patterns (e.g., `dist/`) match even when the
|
|
27
|
+
// directory itself has not yet been descended.
|
|
28
|
+
const rel = relative(dir, fullPath);
|
|
29
|
+
if (matcher.isIgnored(rel) || matcher.isIgnored(`${rel}/`)) continue;
|
|
30
|
+
}
|
|
10
31
|
walk(fullPath);
|
|
11
32
|
} else if (entry.isFile() && entry.name.endsWith(".md") && !entry.name.startsWith("_")) {
|
|
33
|
+
if (matcher) {
|
|
34
|
+
const rel = relative(dir, fullPath);
|
|
35
|
+
if (matcher.isIgnored(rel)) continue;
|
|
36
|
+
}
|
|
12
37
|
results.push(fullPath);
|
|
13
38
|
}
|
|
14
39
|
}
|
package/src/hybrid-search.ts
CHANGED
|
@@ -4,6 +4,16 @@ import type { VectorSearch } from "./vector-search.js";
|
|
|
4
4
|
|
|
5
5
|
export type EmbedFn = (text: string) => Promise<Float32Array>;
|
|
6
6
|
|
|
7
|
+
/**
 * Row shape read from the `chunks` table; the fields mirror the column list
 * selected in `fetchChunk`.
 */
interface ChunkRow {
  /** Chunk id (vector ids for chunks look like `{doc_id}#c{index}`). */
  id: string;
  /** Id of the parent document this chunk belongs to. */
  document_id: string;
  chunk_index: number;
  // presumably character offsets of the chunk within its document — TODO confirm
  char_start: number;
  char_end: number;
  context_prefix: string;
  content: string;
}
|
|
16
|
+
|
|
7
17
|
export class HybridSearch {
|
|
8
18
|
private static readonly RRF_K = 60;
|
|
9
19
|
|
|
@@ -14,23 +24,64 @@ export class HybridSearch {
|
|
|
14
24
|
private readonly embedFn: EmbedFn,
|
|
15
25
|
) {}
|
|
16
26
|
|
|
27
|
+
/**
 * Reports whether a table named `chunks` is present in the database
 * (schema v3+). When it is absent, vector ids are treated as plain
 * document ids, matching the pre-chunking behavior.
 */
private chunksTableExists(): boolean {
  // sqlite_master lists every schema object; a hit means the table exists.
  const probe = this.db.db.prepare(
    "SELECT name FROM sqlite_master WHERE type='table' AND name='chunks'",
  );
  const found = probe.get();
  return found !== undefined;
}
|
|
39
|
+
|
|
40
|
+
/**
 * Map a vector-search id to the id of the document it belongs to.
 *
 * Chunk ids have the form `{doc_id}#c{index}` where `{index}` is one or more
 * decimal digits; for those the `{doc_id}` part is returned. Any id that
 * does not end in such a suffix (legacy whole-document ids) is returned
 * unchanged.
 */
private docIdFromVecId(vecId: string): string {
  // Greedy prefix + end anchor: only a trailing `#c<digits>` is stripped, and
  // since digits cannot contain `#`, that is always the last `#c` in the id.
  const chunkSuffix = /^([\s\S]*)#c\d+$/.exec(vecId);
  return chunkSuffix?.[1] ?? vecId;
}
|
|
52
|
+
|
|
53
|
+
/**
 * Load a single row from the `chunks` table by chunk id.
 *
 * @returns the row, or `undefined` when the `chunks` table does not exist
 *          or no row has that id.
 */
private fetchChunk(chunkId: string): ChunkRow | undefined {
  if (this.chunksTableExists()) {
    const lookup = this.db.db.prepare(
      `SELECT id, document_id, chunk_index, char_start, char_end, context_prefix, content
       FROM chunks WHERE id = ?`,
    );
    return lookup.get(chunkId) as ChunkRow | undefined;
  }
  // No chunks table: nothing to enrich results with.
  return undefined;
}
|
|
62
|
+
|
|
17
63
|
async search(
|
|
18
64
|
query: string,
|
|
19
65
|
options: SearchOptions = {},
|
|
20
66
|
): Promise<SearchResult[]> {
|
|
21
|
-
const { type, tags, includeSuperseded = false, limit = 20 } = options;
|
|
67
|
+
const { type, tags, includeSuperseded = false, limit = 20, memoryTier } = options;
|
|
22
68
|
|
|
23
|
-
// Run FTS and vector search
|
|
69
|
+
// Run FTS and vector search (FTS already applies memoryTier filter in SQL
|
|
70
|
+
// when the schema supports it).
|
|
24
71
|
const ftsResults = this.fts.search(query, {
|
|
25
72
|
includeSuperseded: true,
|
|
26
73
|
limit: limit * 2,
|
|
74
|
+
memoryTier,
|
|
27
75
|
});
|
|
28
76
|
|
|
29
77
|
const queryEmbedding = await this.embedFn(query);
|
|
30
78
|
const vecResults = this.vec.search(queryEmbedding, limit * 2);
|
|
31
79
|
|
|
32
|
-
// Build RRF score map
|
|
80
|
+
// Build RRF score map, keyed by document_id. When vec ids are chunk ids
|
|
81
|
+
// like `{doc}#c{n}`, we collapse to the parent doc for scoring but
|
|
82
|
+
// remember the best-scoring chunk id per doc for later meta enrichment.
|
|
33
83
|
const scores = new Map<string, number>();
|
|
84
|
+
const bestChunkByDoc = new Map<string, { chunkId: string; rank: number }>();
|
|
34
85
|
|
|
35
86
|
for (let i = 0; i < ftsResults.length; i++) {
|
|
36
87
|
const id = ftsResults[i].id;
|
|
@@ -39,9 +90,16 @@ export class HybridSearch {
|
|
|
39
90
|
}
|
|
40
91
|
|
|
41
92
|
for (let i = 0; i < vecResults.length; i++) {
|
|
42
|
-
const
|
|
93
|
+
const vecId = vecResults[i].id;
|
|
94
|
+
const docId = this.docIdFromVecId(vecId);
|
|
43
95
|
const rrfScore = 1 / (HybridSearch.RRF_K + i + 1);
|
|
44
|
-
scores.set(
|
|
96
|
+
scores.set(docId, (scores.get(docId) ?? 0) + rrfScore);
|
|
97
|
+
if (vecId !== docId) {
|
|
98
|
+
const existing = bestChunkByDoc.get(docId);
|
|
99
|
+
if (!existing || i < existing.rank) {
|
|
100
|
+
bestChunkByDoc.set(docId, { chunkId: vecId, rank: i });
|
|
101
|
+
}
|
|
102
|
+
}
|
|
45
103
|
}
|
|
46
104
|
|
|
47
105
|
// Build a lookup of FTS results by id for quick access
|
|
@@ -98,6 +156,31 @@ export class HybridSearch {
|
|
|
98
156
|
});
|
|
99
157
|
}
|
|
100
158
|
|
|
159
|
+
// Post-filter: memory_tier for vector-only hits that bypassed the FTS
|
|
160
|
+
// SQL filter. Also covers the case where the FTS stage returned 0 rows
|
|
161
|
+
// but vec returned chunks from a doc in another tier.
|
|
162
|
+
if (memoryTier && memoryTier !== "any") {
|
|
163
|
+
filtered = filtered.filter((r) => {
|
|
164
|
+
const tier = this.db.getMemoryTier(r.id);
|
|
165
|
+
// When column absent (v2 schema) treat as "doc"
|
|
166
|
+
return (tier ?? "doc") === memoryTier;
|
|
167
|
+
});
|
|
168
|
+
}
|
|
169
|
+
|
|
170
|
+
// Enrich with chunk meta when chunk data is available (best-scoring
|
|
171
|
+
// chunk per doc).
|
|
172
|
+
for (const r of filtered) {
|
|
173
|
+
const best = bestChunkByDoc.get(r.id);
|
|
174
|
+
if (!best) continue;
|
|
175
|
+
const chunk = this.fetchChunk(best.chunkId);
|
|
176
|
+
if (!chunk) continue;
|
|
177
|
+
r.bestChunkId = chunk.id;
|
|
178
|
+
r.chunkIndex = chunk.chunk_index;
|
|
179
|
+
r.charStart = chunk.char_start;
|
|
180
|
+
r.charEnd = chunk.char_end;
|
|
181
|
+
r.contextPrefix = chunk.context_prefix;
|
|
182
|
+
}
|
|
183
|
+
|
|
101
184
|
return filtered.slice(0, limit);
|
|
102
185
|
}
|
|
103
186
|
}
|
package/src/ignore.ts
ADDED
|
@@ -0,0 +1,82 @@
|
|
|
1
|
+
import { readFileSync, existsSync } from "node:fs";
|
|
2
|
+
import { join } from "node:path";
|
|
3
|
+
import ignorePkg, { type Ignore } from "ignore";
|
|
4
|
+
|
|
5
|
+
/** Signature of the factory function exported by the `ignore` package. */
type IgnoreFactory = (options?: { ignorecase?: boolean }) => Ignore;

// `ignore` is a CJS module: `module.exports = factory`, with
// `factory.default = factory` attached as well. Under `NodeNext` + ESM the
// default import may therefore arrive either as the factory itself or as the
// namespace object wrapping it, depending on the interop mode. Check at
// runtime which one we got and use the callable.
const ignore: IgnoreFactory =
  typeof (ignorePkg as unknown) === "function"
    ? (ignorePkg as unknown as IgnoreFactory)
    : (ignorePkg as unknown as { default: IgnoreFactory }).default;
|
|
14
|
+
|
|
15
|
+
/**
 * Baseline gitignore-syntax patterns that `loadIgnoreForRoot` adds to every
 * matcher, regardless of whether a `.ralphignore` file or caller-supplied
 * patterns are present. They cover tooling/build directories and log files
 * that should virtually never be indexed.
 */
export const DEFAULT_IGNORE_PATTERNS: string[] = [
  // Directories (trailing slash = directory-only pattern).
  ".claude/",
  "node_modules/",
  "dist/",
  ".git/",
  // Files.
  "*.log",
];
|
|
27
|
+
|
|
28
|
+
/**
 * Opaque path filter produced by {@link loadIgnoreForRoot}.
 *
 * Paths passed to the matcher must be relative to the root directory the
 * matcher was built for.
 */
export interface IgnoreMatcher {
  /**
   * @param relativePath path relative to the matcher's root
   * @returns `true` when the scanner should skip this path
   */
  isIgnored(relativePath: string): boolean;
}
|
|
36
|
+
|
|
37
|
+
/**
 * Create the {@link IgnoreMatcher} used while scanning `rootDir`.
 *
 * Patterns are added to the underlying `ignore` instance in this order:
 *   1. {@link DEFAULT_IGNORE_PATTERNS} (always).
 *   2. `globalPatterns`, when non-empty (typically the `ignorePatterns`
 *      from `knowledge.config.json`).
 *   3. The text of `<rootDir>/.ralphignore`, when that file exists. A read
 *      failure is logged with `console.warn` and otherwise ignored.
 *
 * All patterns use gitignore syntax as implemented by the `ignore` package.
 *
 * @param rootDir        root directory being scanned; `.ralphignore` is looked
 *                       up directly inside it
 * @param globalPatterns optional extra patterns applied before the per-root
 *                       `.ralphignore` file
 * @returns a matcher whose `isIgnored` expects paths relative to `rootDir`
 */
export function loadIgnoreForRoot(
  rootDir: string,
  globalPatterns?: string[],
): IgnoreMatcher {
  const rules: Ignore = ignore();
  rules.add(DEFAULT_IGNORE_PATTERNS);
  if (globalPatterns?.length) {
    rules.add(globalPatterns);
  }

  const ralphIgnorePath = join(rootDir, ".ralphignore");
  if (existsSync(ralphIgnorePath)) {
    try {
      rules.add(readFileSync(ralphIgnorePath, "utf-8"));
    } catch (e) {
      console.warn(
        `Failed to read .ralphignore at ${ralphIgnorePath}: ${(e as Error).message}`,
      );
    }
  }

  const isIgnored = (relativePath: string): boolean => {
    if (!relativePath) return false;
    // `ignore` package requires forward-slash paths with no leading slash.
    const normalized = relativePath.replace(/\\/g, "/").replace(/^\/+/, "");
    // A path consisting only of slashes normalizes to "" and is never ignored.
    return normalized ? rules.ignores(normalized) : false;
  };

  return { isIgnored };
}
|