@pella-labs/pinakes 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +208 -0
- package/dist/cli/audit.d.ts +30 -0
- package/dist/cli/audit.d.ts.map +1 -0
- package/dist/cli/audit.js +49 -0
- package/dist/cli/audit.js.map +1 -0
- package/dist/cli/export.d.ts +32 -0
- package/dist/cli/export.d.ts.map +1 -0
- package/dist/cli/export.js +73 -0
- package/dist/cli/export.js.map +1 -0
- package/dist/cli/import.d.ts +24 -0
- package/dist/cli/import.d.ts.map +1 -0
- package/dist/cli/import.js +96 -0
- package/dist/cli/import.js.map +1 -0
- package/dist/cli/index.d.ts +3 -0
- package/dist/cli/index.d.ts.map +1 -0
- package/dist/cli/index.js +172 -0
- package/dist/cli/index.js.map +1 -0
- package/dist/cli/purge.d.ts +23 -0
- package/dist/cli/purge.d.ts.map +1 -0
- package/dist/cli/purge.js +57 -0
- package/dist/cli/purge.js.map +1 -0
- package/dist/cli/rebuild.d.ts +54 -0
- package/dist/cli/rebuild.d.ts.map +1 -0
- package/dist/cli/rebuild.js +113 -0
- package/dist/cli/rebuild.js.map +1 -0
- package/dist/cli/serve.d.ts +49 -0
- package/dist/cli/serve.d.ts.map +1 -0
- package/dist/cli/serve.js +296 -0
- package/dist/cli/serve.js.map +1 -0
- package/dist/cli/status.d.ts +39 -0
- package/dist/cli/status.d.ts.map +1 -0
- package/dist/cli/status.js +108 -0
- package/dist/cli/status.js.map +1 -0
- package/dist/db/client.d.ts +109 -0
- package/dist/db/client.d.ts.map +1 -0
- package/dist/db/client.js +175 -0
- package/dist/db/client.js.map +1 -0
- package/dist/db/repository.d.ts +82 -0
- package/dist/db/repository.d.ts.map +1 -0
- package/dist/db/repository.js +173 -0
- package/dist/db/repository.js.map +1 -0
- package/dist/db/schema.d.ts +990 -0
- package/dist/db/schema.d.ts.map +1 -0
- package/dist/db/schema.js +259 -0
- package/dist/db/schema.js.map +1 -0
- package/dist/db/types.d.ts +28 -0
- package/dist/db/types.d.ts.map +1 -0
- package/dist/db/types.js +11 -0
- package/dist/db/types.js.map +1 -0
- package/dist/gaps/detector.d.ts +67 -0
- package/dist/gaps/detector.d.ts.map +1 -0
- package/dist/gaps/detector.js +160 -0
- package/dist/gaps/detector.js.map +1 -0
- package/dist/gate/budget.d.ts +90 -0
- package/dist/gate/budget.d.ts.map +1 -0
- package/dist/gate/budget.js +145 -0
- package/dist/gate/budget.js.map +1 -0
- package/dist/ingest/chokidar.d.ts +33 -0
- package/dist/ingest/chokidar.d.ts.map +1 -0
- package/dist/ingest/chokidar.js +152 -0
- package/dist/ingest/chokidar.js.map +1 -0
- package/dist/ingest/ingester.d.ts +117 -0
- package/dist/ingest/ingester.d.ts.map +1 -0
- package/dist/ingest/ingester.js +312 -0
- package/dist/ingest/ingester.js.map +1 -0
- package/dist/ingest/manifest.d.ts +87 -0
- package/dist/ingest/manifest.d.ts.map +1 -0
- package/dist/ingest/manifest.js +223 -0
- package/dist/ingest/manifest.js.map +1 -0
- package/dist/ingest/memory-store.d.ts +55 -0
- package/dist/ingest/memory-store.d.ts.map +1 -0
- package/dist/ingest/memory-store.js +94 -0
- package/dist/ingest/memory-store.js.map +1 -0
- package/dist/ingest/parse/chunk.d.ts +15 -0
- package/dist/ingest/parse/chunk.d.ts.map +1 -0
- package/dist/ingest/parse/chunk.js +88 -0
- package/dist/ingest/parse/chunk.js.map +1 -0
- package/dist/ingest/parse/markdown.d.ts +64 -0
- package/dist/ingest/parse/markdown.d.ts.map +1 -0
- package/dist/ingest/parse/markdown.js +152 -0
- package/dist/ingest/parse/markdown.js.map +1 -0
- package/dist/ingest/queue.d.ts +21 -0
- package/dist/ingest/queue.d.ts.map +1 -0
- package/dist/ingest/queue.js +24 -0
- package/dist/ingest/queue.js.map +1 -0
- package/dist/ingest/source.d.ts +42 -0
- package/dist/ingest/source.d.ts.map +1 -0
- package/dist/ingest/source.js +19 -0
- package/dist/ingest/source.js.map +1 -0
- package/dist/mcp/envelope.d.ts +73 -0
- package/dist/mcp/envelope.d.ts.map +1 -0
- package/dist/mcp/envelope.js +46 -0
- package/dist/mcp/envelope.js.map +1 -0
- package/dist/mcp/tools/execute.d.ts +55 -0
- package/dist/mcp/tools/execute.d.ts.map +1 -0
- package/dist/mcp/tools/execute.js +232 -0
- package/dist/mcp/tools/execute.js.map +1 -0
- package/dist/mcp/tools/search.d.ts +53 -0
- package/dist/mcp/tools/search.d.ts.map +1 -0
- package/dist/mcp/tools/search.js +114 -0
- package/dist/mcp/tools/search.js.map +1 -0
- package/dist/observability/audit.d.ts +25 -0
- package/dist/observability/audit.d.ts.map +1 -0
- package/dist/observability/audit.js +38 -0
- package/dist/observability/audit.js.map +1 -0
- package/dist/observability/logger.d.ts +4 -0
- package/dist/observability/logger.d.ts.map +1 -0
- package/dist/observability/logger.js +56 -0
- package/dist/observability/logger.js.map +1 -0
- package/dist/observability/metrics.d.ts +38 -0
- package/dist/observability/metrics.d.ts.map +1 -0
- package/dist/observability/metrics.js +64 -0
- package/dist/observability/metrics.js.map +1 -0
- package/dist/retrieval/embedder.d.ts +130 -0
- package/dist/retrieval/embedder.d.ts.map +1 -0
- package/dist/retrieval/embedder.js +278 -0
- package/dist/retrieval/embedder.js.map +1 -0
- package/dist/retrieval/fts.d.ts +42 -0
- package/dist/retrieval/fts.d.ts.map +1 -0
- package/dist/retrieval/fts.js +46 -0
- package/dist/retrieval/fts.js.map +1 -0
- package/dist/retrieval/hybrid.d.ts +43 -0
- package/dist/retrieval/hybrid.d.ts.map +1 -0
- package/dist/retrieval/hybrid.js +120 -0
- package/dist/retrieval/hybrid.js.map +1 -0
- package/dist/retrieval/vec.d.ts +39 -0
- package/dist/retrieval/vec.d.ts.map +1 -0
- package/dist/retrieval/vec.js +50 -0
- package/dist/retrieval/vec.js.map +1 -0
- package/dist/sandbox/bindings/budget.d.ts +10 -0
- package/dist/sandbox/bindings/budget.d.ts.map +1 -0
- package/dist/sandbox/bindings/budget.js +44 -0
- package/dist/sandbox/bindings/budget.js.map +1 -0
- package/dist/sandbox/bindings/install.d.ts +23 -0
- package/dist/sandbox/bindings/install.d.ts.map +1 -0
- package/dist/sandbox/bindings/install.js +15 -0
- package/dist/sandbox/bindings/install.js.map +1 -0
- package/dist/sandbox/bindings/kg.d.ts +29 -0
- package/dist/sandbox/bindings/kg.d.ts.map +1 -0
- package/dist/sandbox/bindings/kg.js +323 -0
- package/dist/sandbox/bindings/kg.js.map +1 -0
- package/dist/sandbox/bindings/logger.d.ts +11 -0
- package/dist/sandbox/bindings/logger.d.ts.map +1 -0
- package/dist/sandbox/bindings/logger.js +33 -0
- package/dist/sandbox/bindings/logger.js.map +1 -0
- package/dist/sandbox/bindings/write.d.ts +34 -0
- package/dist/sandbox/bindings/write.d.ts.map +1 -0
- package/dist/sandbox/bindings/write.js +195 -0
- package/dist/sandbox/bindings/write.js.map +1 -0
- package/dist/sandbox/executor.d.ts +68 -0
- package/dist/sandbox/executor.d.ts.map +1 -0
- package/dist/sandbox/executor.js +280 -0
- package/dist/sandbox/executor.js.map +1 -0
- package/dist/sandbox/helpers.d.ts +26 -0
- package/dist/sandbox/helpers.d.ts.map +1 -0
- package/dist/sandbox/helpers.js +131 -0
- package/dist/sandbox/helpers.js.map +1 -0
- package/dist/sandbox/pool.d.ts +63 -0
- package/dist/sandbox/pool.d.ts.map +1 -0
- package/dist/sandbox/pool.js +98 -0
- package/dist/sandbox/pool.js.map +1 -0
- package/dist/sandbox/vendored-codemode.d.ts +99 -0
- package/dist/sandbox/vendored-codemode.d.ts.map +1 -0
- package/dist/sandbox/vendored-codemode.js +471 -0
- package/dist/sandbox/vendored-codemode.js.map +1 -0
- package/dist/server.d.ts +3 -0
- package/dist/server.d.ts.map +1 -0
- package/dist/server.js +74 -0
- package/dist/server.js.map +1 -0
- package/dist/spike.d.ts +15 -0
- package/dist/spike.d.ts.map +1 -0
- package/dist/spike.js +90 -0
- package/dist/spike.js.map +1 -0
- package/package.json +60 -0
|
@@ -0,0 +1,55 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Phase 1 in-memory document store.
|
|
3
|
+
*
|
|
4
|
+
* This is the spike's substitute for the real SQLite + FTS + vector stack
|
|
5
|
+
* that Phase 2-4 will land. It loads every `*.md` file under a given root
|
|
6
|
+
* directory once at startup, splits each file on blank lines into paragraph
|
|
7
|
+
* chunks, and exposes a substring-based `search()` + id-based `get()`.
|
|
8
|
+
*
|
|
9
|
+
* The store is intentionally dumb — no ranking, no stemming, no tokenizing,
|
|
10
|
+
* no caching beyond the initial load. The whole point of Phase 1 is to
|
|
11
|
+
* prove that the markdown → chunks → tool response round-trip works without
|
|
12
|
+
* getting distracted by retrieval quality. Phase 2 replaces this with the
|
|
13
|
+
* drizzle schema + FTS5 + sqlite-vec layered stack.
|
|
14
|
+
*
|
|
15
|
+
* The only non-trivial contract: chunk ids must be deterministic. Given the
|
|
16
|
+
* same root directory contents, loading the store twice must produce
|
|
17
|
+
* identical ids. Phase 2's ingester relies on this for idempotent upsert.
|
|
18
|
+
*/
|
|
19
|
+
/** One paragraph-sized chunk of a markdown file. */
export interface Chunk {
    /** Deterministic sha1(`relative_path:index`) — stable across reloads. */
    id: string;
    /** Paragraph text, trimmed. Never empty. */
    text: string;
    /** `file://` URL of the source file. */
    source_uri: string;
    /** Position of this chunk within its source file, 0-based. */
    chunk_index: number;
}
/**
 * Phase 1 in-memory document store: loads every `*.md` file under a root
 * directory once, splits each file on blank lines into paragraph chunks,
 * and serves substring `search()` + id-based `get()`.
 */
export declare class MemoryStore {
    /** Resolved absolute root directory the store was loaded from. */
    private readonly rootDir;
    /** Flat chunk list in file order, then chunk order within file. */
    private chunks;
    /** id → chunk map backing O(1) `get()`. */
    private byId;
    /** Private — construct via `MemoryStore.load()`. */
    private constructor();
    /**
     * Construct and populate a store from the given wiki root directory.
     * Reads every `*.md` file recursively, splits on blank lines, and
     * indexes into a flat chunks array + id lookup map.
     */
    static load(rootDir: string): Promise<MemoryStore>;
    /**
     * Case-insensitive substring filter over every chunk's text. Returns all
     * matches in insertion order (file order, then chunk order within file).
     *
     * Phase 1 does no ranking — the spike only needs to prove the bindings
     * pipeline. Phase 4 replaces this with FTS5 + RRF.
     */
    search(query: string): Chunk[];
    /** Exact lookup by id. Returns `null` if not found (never throws). */
    get(id: string): Chunk | null;
    /** Total number of indexed chunks. */
    size(): number;
    /** The resolved absolute path this store was loaded from. */
    root(): string;
}
|
|
55
|
+
//# sourceMappingURL=memory-store.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"memory-store.d.ts","sourceRoot":"","sources":["../../src/ingest/memory-store.ts"],"names":[],"mappings":"AAKA;;;;;;;;;;;;;;;;;GAiBG;AAEH,MAAM,WAAW,KAAK;IACpB,yEAAyE;IACzE,EAAE,EAAE,MAAM,CAAC;IACX,4CAA4C;IAC5C,IAAI,EAAE,MAAM,CAAC;IACb,wCAAwC;IACxC,UAAU,EAAE,MAAM,CAAC;IACnB,8DAA8D;IAC9D,WAAW,EAAE,MAAM,CAAC;CACrB;AAED,qBAAa,WAAW;IAIF,OAAO,CAAC,QAAQ,CAAC,OAAO;IAH5C,OAAO,CAAC,MAAM,CAAe;IAC7B,OAAO,CAAC,IAAI,CAA4B;IAExC,OAAO;IAEP;;;;OAIG;WACU,IAAI,CAAC,OAAO,EAAE,MAAM,GAAG,OAAO,CAAC,WAAW,CAAC;IAuBxD;;;;;;OAMG;IACH,MAAM,CAAC,KAAK,EAAE,MAAM,GAAG,KAAK,EAAE;IAM9B,sEAAsE;IACtE,GAAG,CAAC,EAAE,EAAE,MAAM,GAAG,KAAK,GAAG,IAAI;IAI7B,sCAAsC;IACtC,IAAI,IAAI,MAAM;IAId,6DAA6D;IAC7D,IAAI,IAAI,MAAM;CAGf"}
|
|
@@ -0,0 +1,94 @@
|
|
|
1
|
+
import { createHash } from 'node:crypto';
|
|
2
|
+
import { readdir, readFile } from 'node:fs/promises';
|
|
3
|
+
import { resolve, relative } from 'node:path';
|
|
4
|
+
import { pathToFileURL } from 'node:url';
|
|
5
|
+
export class MemoryStore {
|
|
6
|
+
rootDir;
|
|
7
|
+
chunks = [];
|
|
8
|
+
byId = new Map();
|
|
9
|
+
constructor(rootDir) {
|
|
10
|
+
this.rootDir = rootDir;
|
|
11
|
+
}
|
|
12
|
+
/**
|
|
13
|
+
* Construct and populate a store from the given wiki root directory.
|
|
14
|
+
* Reads every `*.md` file recursively, splits on blank lines, and
|
|
15
|
+
* indexes into a flat chunks array + id lookup map.
|
|
16
|
+
*/
|
|
17
|
+
static async load(rootDir) {
|
|
18
|
+
const abs = resolve(rootDir);
|
|
19
|
+
const store = new MemoryStore(abs);
|
|
20
|
+
const files = await collectMarkdownFiles(abs);
|
|
21
|
+
for (const file of files) {
|
|
22
|
+
const text = await readFile(file, 'utf8');
|
|
23
|
+
const rel = relative(abs, file);
|
|
24
|
+
const uri = pathToFileURL(file).href;
|
|
25
|
+
const paragraphs = splitParagraphs(text);
|
|
26
|
+
for (let i = 0; i < paragraphs.length; i++) {
|
|
27
|
+
const chunk = {
|
|
28
|
+
id: sha1(`${rel}:${i}`),
|
|
29
|
+
text: paragraphs[i],
|
|
30
|
+
source_uri: uri,
|
|
31
|
+
chunk_index: i,
|
|
32
|
+
};
|
|
33
|
+
store.chunks.push(chunk);
|
|
34
|
+
store.byId.set(chunk.id, chunk);
|
|
35
|
+
}
|
|
36
|
+
}
|
|
37
|
+
return store;
|
|
38
|
+
}
|
|
39
|
+
/**
|
|
40
|
+
* Case-insensitive substring filter over every chunk's text. Returns all
|
|
41
|
+
* matches in insertion order (file order, then chunk order within file).
|
|
42
|
+
*
|
|
43
|
+
* Phase 1 does no ranking — the spike only needs to prove the bindings
|
|
44
|
+
* pipeline. Phase 4 replaces this with FTS5 + RRF.
|
|
45
|
+
*/
|
|
46
|
+
search(query) {
|
|
47
|
+
const q = query.toLowerCase();
|
|
48
|
+
if (!q)
|
|
49
|
+
return [];
|
|
50
|
+
return this.chunks.filter((c) => c.text.toLowerCase().includes(q));
|
|
51
|
+
}
|
|
52
|
+
/** Exact lookup by id. Returns `null` if not found (never throws). */
|
|
53
|
+
get(id) {
|
|
54
|
+
return this.byId.get(id) ?? null;
|
|
55
|
+
}
|
|
56
|
+
/** Total number of indexed chunks. */
|
|
57
|
+
size() {
|
|
58
|
+
return this.chunks.length;
|
|
59
|
+
}
|
|
60
|
+
/** The resolved absolute path this store was loaded from. */
|
|
61
|
+
root() {
|
|
62
|
+
return this.rootDir;
|
|
63
|
+
}
|
|
64
|
+
}
|
|
65
|
+
// ============================================================================
|
|
66
|
+
// helpers
|
|
67
|
+
// ============================================================================
|
|
68
|
+
/**
 * Recursively gather the absolute path of every `*.md` file (case-insensitive
 * extension) below `root`. Paths are sorted so the load order — and therefore
 * chunk ordering — is deterministic regardless of filesystem enumeration order.
 */
async function collectMarkdownFiles(root) {
    const dirents = await readdir(root, { withFileTypes: true, recursive: true });
    const paths = dirents
        .filter((d) => d.isFile() && d.name.toLowerCase().endsWith('.md'))
        // Node 20+ recursive readdir exposes parentPath on each dirent
        // (older builds used the deprecated `path` field).
        .map((d) => resolve(d.parentPath ?? d.path ?? root, d.name));
    paths.sort(); // deterministic order across filesystems
    return paths;
}
|
|
85
|
+
/**
 * Break markdown source into paragraph strings: split on one-or-more blank
 * lines (CRLF tolerant), trim each piece, and drop anything left empty.
 */
function splitParagraphs(source) {
    const paragraphs = [];
    for (const piece of source.split(/\r?\n\r?\n+/)) {
        const trimmed = piece.trim();
        if (trimmed !== '') {
            paragraphs.push(trimmed);
        }
    }
    return paragraphs;
}
|
|
91
|
+
/** Hex-encoded SHA-1 digest of `input` — the basis of deterministic chunk ids. */
function sha1(input) {
    const hasher = createHash('sha1');
    hasher.update(input);
    return hasher.digest('hex');
}
|
|
94
|
+
//# sourceMappingURL=memory-store.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"memory-store.js","sourceRoot":"","sources":["../../src/ingest/memory-store.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,UAAU,EAAE,MAAM,aAAa,CAAC;AACzC,OAAO,EAAE,OAAO,EAAE,QAAQ,EAAE,MAAM,kBAAkB,CAAC;AACrD,OAAO,EAAE,OAAO,EAAE,QAAQ,EAAE,MAAM,WAAW,CAAC;AAC9C,OAAO,EAAE,aAAa,EAAE,MAAM,UAAU,CAAC;AAgCzC,MAAM,OAAO,WAAW;IAIe;IAH7B,MAAM,GAAY,EAAE,CAAC;IACrB,IAAI,GAAG,IAAI,GAAG,EAAiB,CAAC;IAExC,YAAqC,OAAe;QAAf,YAAO,GAAP,OAAO,CAAQ;IAAG,CAAC;IAExD;;;;OAIG;IACH,MAAM,CAAC,KAAK,CAAC,IAAI,CAAC,OAAe;QAC/B,MAAM,GAAG,GAAG,OAAO,CAAC,OAAO,CAAC,CAAC;QAC7B,MAAM,KAAK,GAAG,IAAI,WAAW,CAAC,GAAG,CAAC,CAAC;QACnC,MAAM,KAAK,GAAG,MAAM,oBAAoB,CAAC,GAAG,CAAC,CAAC;QAC9C,KAAK,MAAM,IAAI,IAAI,KAAK,EAAE,CAAC;YACzB,MAAM,IAAI,GAAG,MAAM,QAAQ,CAAC,IAAI,EAAE,MAAM,CAAC,CAAC;YAC1C,MAAM,GAAG,GAAG,QAAQ,CAAC,GAAG,EAAE,IAAI,CAAC,CAAC;YAChC,MAAM,GAAG,GAAG,aAAa,CAAC,IAAI,CAAC,CAAC,IAAI,CAAC;YACrC,MAAM,UAAU,GAAG,eAAe,CAAC,IAAI,CAAC,CAAC;YACzC,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,UAAU,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;gBAC3C,MAAM,KAAK,GAAU;oBACnB,EAAE,EAAE,IAAI,CAAC,GAAG,GAAG,IAAI,CAAC,EAAE,CAAC;oBACvB,IAAI,EAAE,UAAU,CAAC,CAAC,CAAE;oBACpB,UAAU,EAAE,GAAG;oBACf,WAAW,EAAE,CAAC;iBACf,CAAC;gBACF,KAAK,CAAC,MAAM,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC;gBACzB,KAAK,CAAC,IAAI,CAAC,GAAG,CAAC,KAAK,CAAC,EAAE,EAAE,KAAK,CAAC,CAAC;YAClC,CAAC;QACH,CAAC;QACD,OAAO,KAAK,CAAC;IACf,CAAC;IAED;;;;;;OAMG;IACH,MAAM,CAAC,KAAa;QAClB,MAAM,CAAC,GAAG,KAAK,CAAC,WAAW,EAAE,CAAC;QAC9B,IAAI,CAAC,CAAC;YAAE,OAAO,EAAE,CAAC;QAClB,OAAO,IAAI,CAAC,MAAM,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,IAAI,CAAC,WAAW,EAAE,CAAC,QAAQ,CAAC,CAAC,CAAC,CAAC,CAAC;IACrE,CAAC;IAED,sEAAsE;IACtE,GAAG,CAAC,EAAU;QACZ,OAAO,IAAI,CAAC,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,IAAI,IAAI,CAAC;IACnC,CAAC;IAED,sCAAsC;IACtC,IAAI;QACF,OAAO,IAAI,CAAC,MAAM,CAAC,MAAM,CAAC;IAC5B,CAAC;IAED,6DAA6D;IAC7D,IAAI;QACF,OAAO,IAAI,CAAC,OAAO,CAAC;IACtB,CAAC;CACF;AAED,+EAA+E;AAC/E,UAAU;AACV,+EAA+E;AAE/E,KAAK,UAAU,oBAAoB,CAAC,IAAY;IAC9C,MAAM,GAAG,GAAa,EAAE,CAAC;IACzB,MAAM,OAAO,GAAG,MAAM,OAAO,CAAC,IAAI
,EAAE,EAAE,aAAa,EAAE,IAAI,EAAE,SAAS,EAAE,IAAI,EAAE,CAAC,CAAC;IAC9E,KAAK,MAAM,KAAK,IAAI,OAAO,EAAE,CAAC;QAC5B,IAAI,CAAC,KAAK,CAAC,MAAM,EAAE;YAAE,SAAS;QAC9B,IAAI,CAAC,KAAK,CAAC,IAAI,CAAC,WAAW,EAAE,CAAC,QAAQ,CAAC,KAAK,CAAC;YAAE,SAAS;QACxD,+DAA+D;QAC/D,MAAM,MAAM,GAAI,KAA2D,CAAC,UAAU;eAChF,KAAsC,CAAC,IAAI;eAC5C,IAAI,CAAC;QACV,GAAG,CAAC,IAAI,CAAC,OAAO,CAAC,MAAM,EAAE,KAAK,CAAC,IAAI,CAAC,CAAC,CAAC;IACxC,CAAC;IACD,GAAG,CAAC,IAAI,EAAE,CAAC,CAAC,yCAAyC;IACrD,OAAO,GAAG,CAAC;AACb,CAAC;AAED,SAAS,eAAe,CAAC,MAAc;IACrC,OAAO,MAAM;SACV,KAAK,CAAC,aAAa,CAAC;SACpB,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC;SACpB,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC;AACjC,CAAC;AAED,SAAS,IAAI,CAAC,KAAa;IACzB,OAAO,UAAU,CAAC,MAAM,CAAC,CAAC,MAAM,CAAC,KAAK,CAAC,CAAC,MAAM,CAAC,KAAK,CAAC,CAAC;AACxD,CAAC"}
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
export interface Chunk {
    /** Chunk text — paragraphs joined by blank lines, original whitespace preserved */
    text: string;
    /** Cached token count — exposed so the ingester doesn't need to recount */
    token_count: number;
}
/**
 * Split a section's content into ~target_tokens-sized chunks on paragraph
 * boundaries. Returns an empty array for input with no non-whitespace content.
 *
 * A single paragraph that alone exceeds `targetTokens` is emitted as its own
 * oversize chunk rather than being split mid-sentence.
 *
 * `targetTokens` defaults to 500. Pass a smaller value in tests if you want
 * to force a section to chunk at a predictable boundary.
 */
export declare function chunkSection(content: string, targetTokens?: number): Chunk[];
|
|
15
|
+
//# sourceMappingURL=chunk.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"chunk.d.ts","sourceRoot":"","sources":["../../../src/ingest/parse/chunk.ts"],"names":[],"mappings":"AAoCA,MAAM,WAAW,KAAK;IACpB,mFAAmF;IACnF,IAAI,EAAE,MAAM,CAAC;IACb,2EAA2E;IAC3E,WAAW,EAAE,MAAM,CAAC;CACrB;AAED;;;;;;GAMG;AACH,wBAAgB,YAAY,CAAC,OAAO,EAAE,MAAM,EAAE,YAAY,GAAE,MAA8B,GAAG,KAAK,EAAE,CAyCnG"}
|
|
@@ -0,0 +1,88 @@
|
|
|
1
|
+
import { countTokens } from '../../gate/budget.js';
|
|
2
|
+
/**
|
|
3
|
+
* Paragraph-aware chunker for KG-MCP Phase 2.
|
|
4
|
+
*
|
|
5
|
+
* Splits a section's body into chunks of approximately `targetTokens` tokens,
|
|
6
|
+
* never breaking a paragraph in half. Tokens are counted via the existing
|
|
7
|
+
* `countTokens()` from `gate/budget.ts`, which uses `js-tiktoken p50k_base`
|
|
8
|
+
* with the long-string fast path (D32 — see CLAUDE.md §API Rules #6 budget math).
|
|
9
|
+
*
|
|
10
|
+
* **Algorithm**:
|
|
11
|
+
* 1. Split the input on blank lines (`\n\n+`) into paragraphs
|
|
12
|
+
* 2. Iterate paragraphs left to right, accumulating into a current chunk
|
|
13
|
+
* 3. If adding the next paragraph would push the chunk past `targetTokens`,
|
|
14
|
+
* flush the current chunk and start a new one with that paragraph
|
|
15
|
+
* 4. A single oversize paragraph that ALONE exceeds `targetTokens` gets
|
|
16
|
+
* its own chunk (rather than being split mid-sentence — the LLM can
|
|
17
|
+
* still query it via FTS5, just slower)
|
|
18
|
+
*
|
|
19
|
+
* **Determinism**: same input → same output. The chunker is pure: no random
|
|
20
|
+
* tie-breaking, no time-based decisions, no environment lookups. The
|
|
21
|
+
* downstream `chunk_sha = sha1(chunk_text)` therefore stays stable across
|
|
22
|
+
* runs, which is the load-bearing assumption for the per-chunk skip-unchanged
|
|
23
|
+
* optimization (CLAUDE.md §Database Rules #3, Loop 6.5 A4).
|
|
24
|
+
*
|
|
25
|
+
* **Why ~500 tokens?** Empirical sweet spot for retrieval: small enough that
|
|
26
|
+
* each chunk is a focused topic, large enough that 1-2 chunks usually answer
|
|
27
|
+
* a query without needing to fetch a whole node. Phase 4's RRF + budget gate
|
|
28
|
+
* tunes around this size; deviating significantly will affect retrieval
|
|
29
|
+
* quality. The actual chunk sizes will fluctuate around this target since
|
|
30
|
+
* we won't break a paragraph — chunks may be smaller (single short paragraph)
|
|
31
|
+
* or larger (single long paragraph).
|
|
32
|
+
*/
|
|
33
|
+
// Default chunk-size target in tokens; see the module comment above for
// why ~500 is the retrieval sweet spot.
const DEFAULT_TARGET_TOKENS = 500;
|
|
34
|
+
/**
|
|
35
|
+
* Split a section's content into ~target_tokens-sized chunks on paragraph
|
|
36
|
+
* boundaries. Returns an empty array for input with no non-whitespace content.
|
|
37
|
+
*
|
|
38
|
+
* `targetTokens` defaults to 500. Pass a smaller value in tests if you want
|
|
39
|
+
* to force a section to chunk at a predictable boundary.
|
|
40
|
+
*/
|
|
41
|
+
/**
 * Split a section's content into ~target_tokens-sized chunks on paragraph
 * boundaries. Returns an empty array for input with no non-whitespace content.
 *
 * Paragraphs are accumulated greedily: once adding the next paragraph would
 * push past `targetTokens`, the accumulator is flushed as one chunk. A lone
 * paragraph that by itself exceeds `targetTokens` becomes its own oversize
 * chunk — we never split mid-sentence. Pure and deterministic, so downstream
 * `chunk_sha = sha1(chunk_text)` stays stable across runs.
 *
 * `targetTokens` defaults to 500. Pass a smaller value in tests if you want
 * to force a section to chunk at a predictable boundary.
 */
export function chunkSection(content, targetTokens = DEFAULT_TARGET_TOKENS) {
    const paragraphs = splitParagraphs(content);
    if (paragraphs.length === 0) {
        return [];
    }
    const out = [];
    let pending = [];
    let pendingTokens = 0;
    // Emit the accumulated paragraphs (if any) as a single chunk and reset.
    const flush = () => {
        if (pending.length === 0) {
            return;
        }
        out.push({ text: pending.join('\n\n'), token_count: pendingTokens });
        pending = [];
        pendingTokens = 0;
    };
    for (const paragraph of paragraphs) {
        const tokens = countTokens(paragraph);
        if (tokens > targetTokens) {
            // Oversize paragraph: flush whatever came before, then give the
            // paragraph a chunk of its own rather than breaking it apart.
            flush();
            out.push({ text: paragraph, token_count: tokens });
        }
        else {
            // Would this paragraph overflow the current chunk? Flush first.
            if (pending.length > 0 && pendingTokens + tokens > targetTokens) {
                flush();
            }
            pending.push(paragraph);
            pendingTokens += tokens;
        }
    }
    // Emit the trailing partial chunk.
    flush();
    return out;
}
|
|
77
|
+
/**
 * Split markdown on blank lines into trimmed, non-empty paragraph strings.
 * Deliberately byte-compatible with the Phase 1 splitter in memory-store.ts
 * so chunk boundaries survive the Phase 1 → Phase 2 swap.
 */
function splitParagraphs(source) {
    const result = [];
    for (const raw of source.split(/\r?\n\r?\n+/)) {
        const paragraph = raw.trim();
        if (paragraph.length > 0) {
            result.push(paragraph);
        }
    }
    return result;
}
|
|
88
|
+
//# sourceMappingURL=chunk.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"chunk.js","sourceRoot":"","sources":["../../../src/ingest/parse/chunk.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,WAAW,EAAE,MAAM,sBAAsB,CAAC;AAEnD;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;GA8BG;AAEH,MAAM,qBAAqB,GAAG,GAAG,CAAC;AASlC;;;;;;GAMG;AACH,MAAM,UAAU,YAAY,CAAC,OAAe,EAAE,eAAuB,qBAAqB;IACxF,MAAM,UAAU,GAAG,eAAe,CAAC,OAAO,CAAC,CAAC;IAC5C,IAAI,UAAU,CAAC,MAAM,KAAK,CAAC;QAAE,OAAO,EAAE,CAAC;IAEvC,MAAM,MAAM,GAAY,EAAE,CAAC;IAC3B,IAAI,OAAO,GAAa,EAAE,CAAC;IAC3B,IAAI,aAAa,GAAG,CAAC,CAAC;IAEtB,KAAK,MAAM,IAAI,IAAI,UAAU,EAAE,CAAC;QAC9B,MAAM,UAAU,GAAG,WAAW,CAAC,IAAI,CAAC,CAAC;QAErC,uEAAuE;QACvE,sEAAsE;QACtE,sCAAsC;QACtC,IAAI,UAAU,GAAG,YAAY,EAAE,CAAC;YAC9B,IAAI,OAAO,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;gBACvB,MAAM,CAAC,IAAI,CAAC,EAAE,IAAI,EAAE,OAAO,CAAC,IAAI,CAAC,MAAM,CAAC,EAAE,WAAW,EAAE,aAAa,EAAE,CAAC,CAAC;gBACxE,OAAO,GAAG,EAAE,CAAC;gBACb,aAAa,GAAG,CAAC,CAAC;YACpB,CAAC;YACD,MAAM,CAAC,IAAI,CAAC,EAAE,IAAI,EAAE,IAAI,EAAE,WAAW,EAAE,UAAU,EAAE,CAAC,CAAC;YACrD,SAAS;QACX,CAAC;QAED,+EAA+E;QAC/E,IAAI,aAAa,GAAG,UAAU,GAAG,YAAY,IAAI,OAAO,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;YACpE,MAAM,CAAC,IAAI,CAAC,EAAE,IAAI,EAAE,OAAO,CAAC,IAAI,CAAC,MAAM,CAAC,EAAE,WAAW,EAAE,aAAa,EAAE,CAAC,CAAC;YACxE,OAAO,GAAG,EAAE,CAAC;YACb,aAAa,GAAG,CAAC,CAAC;QACpB,CAAC;QAED,OAAO,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;QACnB,aAAa,IAAI,UAAU,CAAC;IAC9B,CAAC;IAED,4BAA4B;IAC5B,IAAI,OAAO,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;QACvB,MAAM,CAAC,IAAI,CAAC,EAAE,IAAI,EAAE,OAAO,CAAC,IAAI,CAAC,MAAM,CAAC,EAAE,WAAW,EAAE,aAAa,EAAE,CAAC,CAAC;IAC1E,CAAC;IAED,OAAO,MAAM,CAAC;AAChB,CAAC;AAED;;;;GAIG;AACH,SAAS,eAAe,CAAC,MAAc;IACrC,OAAO,MAAM;SACV,KAAK,CAAC,aAAa,CAAC;SACpB,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC;SACpB,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC;AACjC,CAAC"}
|
|
@@ -0,0 +1,64 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Markdown → section parser for KG-MCP Phase 2.
|
|
3
|
+
*
|
|
4
|
+
* Parses a markdown file into a flat array of `SectionNode` objects, one
|
|
5
|
+
* per ATX heading (`#`, `##`, …) plus an optional pre-heading section for
|
|
6
|
+
* any content above the first heading. The chunker (chunk.ts) then splits
|
|
7
|
+
* each section's `content` into ~500-token chunks.
|
|
8
|
+
*
|
|
9
|
+
* **Why mdast?** Phase 1 used a `\n\n` regex split that lost all heading
|
|
10
|
+
* structure. The Karpathy two-level wiki has nested H1/H2/H3 sections, and
|
|
11
|
+
* the LLM querying via `kg.search` benefits from being able to identify
|
|
12
|
+
* "this chunk lives under H2 'Login flow' which lives under H1 'Authentication'".
|
|
13
|
+
* The `section_path` field captures that hierarchy.
|
|
14
|
+
*
|
|
15
|
+
* **Section content slicing**: we use mdast `position.start.offset` to slice
|
|
16
|
+
* the original source for each section, so the stored content includes the
|
|
17
|
+
* exact original markdown (whitespace, formatting, code fences) — not a
|
|
18
|
+
* re-rendered approximation. This makes round-trip tests trivial: rebuild a
|
|
19
|
+
* file by joining all section contents and you should get back something
|
|
20
|
+
* structurally identical to the input.
|
|
21
|
+
*
|
|
22
|
+
* **Determinism**: same input → same output. Pinned `mdast-util-from-markdown@^2.0.0`
|
|
23
|
+
* + no plugins = stable mdast tree, stable section list, stable downstream
|
|
24
|
+
* chunk ids. Tests verify this by parsing twice and deep-equal-ing the result.
|
|
25
|
+
*/
|
|
26
|
+
/**
|
|
27
|
+
* One section of a markdown file. Sections are derived from ATX headings;
|
|
28
|
+
* a `SectionNode { depth: 0 }` is the optional pre-heading content above
|
|
29
|
+
* the first heading.
|
|
30
|
+
*/
|
|
31
|
+
/**
 * One section of a markdown file. Sections are derived from ATX headings;
 * a `SectionNode { depth: 0 }` is the optional pre-heading content above
 * the first heading.
 */
export interface SectionNode {
    /** ATX heading hierarchy joined by ` / ` (e.g. `"Authentication / Login flow"`); empty for pre-heading content */
    section_path: string;
    /** The heading text itself (or empty string for pre-heading content) */
    title: string;
    /** Original markdown source for this section (heading + body), preserving whitespace */
    content: string;
    /** Always `'section'` for Phase 2 — Phase 4 may add other kinds (entity, decision) */
    kind: 'section';
    /** ATX depth: 0 = pre-heading, 1 = `#`, 2 = `##`, …, 6 = `######` */
    depth: number;
}
/** Confidence level for provenance tracking (Phase 6). */
export type Confidence = 'extracted' | 'inferred' | 'ambiguous';
/**
 * Detect confidence from YAML frontmatter in a markdown file.
 *
 * Rules, first match wins (an explicit `confidence` field takes precedence):
 * - `confidence: inferred` → `'inferred'`; `confidence: ambiguous` → `'ambiguous'`
 * - `source: haiku` or `source: ai` or `source: ai-generated` → `'inferred'`
 * - `status: ambiguous` or `status: needs-review` → `'ambiguous'`
 * - Otherwise (including no frontmatter) → `'extracted'`
 */
export declare function detectConfidence(source: string): Confidence;
/**
 * Parse a markdown source string into a flat list of sections.
 *
 * Sections are emitted in source order (top-to-bottom). Empty pre-heading
 * content (whitespace only) is skipped. Sections with no body (just a
 * heading and nothing after) are still emitted — the chunker handles them
 * by producing zero chunks for that section.
 */
export declare function parseMarkdown(source: string): SectionNode[];
|
|
64
|
+
//# sourceMappingURL=markdown.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"markdown.d.ts","sourceRoot":"","sources":["../../../src/ingest/parse/markdown.ts"],"names":[],"mappings":"AAiBA;;;;;;;;;;;;;;;;;;;;;;;;GAwBG;AAEH;;;;GAIG;AACH,MAAM,WAAW,WAAW;IAC1B,kHAAkH;IAClH,YAAY,EAAE,MAAM,CAAC;IACrB,wEAAwE;IACxE,KAAK,EAAE,MAAM,CAAC;IACd,wFAAwF;IACxF,OAAO,EAAE,MAAM,CAAC;IAChB,sFAAsF;IACtF,IAAI,EAAE,SAAS,CAAC;IAChB,qEAAqE;IACrE,KAAK,EAAE,MAAM,CAAC;CACf;AAED,0DAA0D;AAC1D,MAAM,MAAM,UAAU,GAAG,WAAW,GAAG,UAAU,GAAG,WAAW,CAAC;AAEhE;;;;;;;;GAQG;AACH,wBAAgB,gBAAgB,CAAC,MAAM,EAAE,MAAM,GAAG,UAAU,CAiB3D;AAuBD;;;;;;;GAOG;AACH,wBAAgB,aAAa,CAAC,MAAM,EAAE,MAAM,GAAG,WAAW,EAAE,CAuF3D"}
|
|
@@ -0,0 +1,152 @@
|
|
|
1
|
+
import { fromMarkdown } from 'mdast-util-from-markdown';
|
|
2
|
+
/**
 * Derive a provenance confidence level from a file's YAML frontmatter.
 *
 * Precedence (first match wins):
 * 1. explicit `confidence: inferred` / `confidence: ambiguous`
 * 2. `source: haiku` | `ai` | `ai-generated` → `'inferred'`
 * 3. `status: ambiguous` | `needs-review`   → `'ambiguous'`
 * 4. anything else, or no frontmatter       → `'extracted'`
 */
export function detectConfidence(source) {
    const frontmatter = parseFrontmatter(source);
    if (!frontmatter) {
        return 'extracted';
    }
    // An explicit confidence field overrides the source/status heuristics.
    switch (frontmatter.confidence) {
        case 'inferred':
            return 'inferred';
        case 'ambiguous':
            return 'ambiguous';
    }
    // AI-authored sources are only ever "inferred", never "extracted".
    if (typeof frontmatter.source === 'string'
        && ['haiku', 'ai', 'ai-generated'].includes(frontmatter.source.toLowerCase())) {
        return 'inferred';
    }
    // Review-flagged documents are treated as ambiguous.
    if (typeof frontmatter.status === 'string'
        && ['ambiguous', 'needs-review'].includes(frontmatter.status.toLowerCase())) {
        return 'ambiguous';
    }
    return 'extracted';
}
|
|
30
|
+
/**
 * Minimal YAML frontmatter parser. Extracts top-level `key: value` pairs
 * from a `---` … `---` block at the very start of the file; no dependency
 * needed. Returns `null` when the file does not open with a well-formed
 * frontmatter block.
 *
 * Values keep their raw string form except that one pair of matching
 * surrounding quotes is stripped, so `source: "haiku"` and `source: haiku`
 * read identically — detectConfidence() does exact string comparisons and
 * would otherwise silently miss quoted values.
 */
function parseFrontmatter(source) {
    // The opening fence must be exactly `---` alone on the first line.
    // A bare `---` prefix (e.g. `----` or `--- title`) is a thematic break
    // or plain text, not frontmatter.
    const open = /^---\r?\n/.exec(source);
    if (!open)
        return null;
    // Search for the closing fence starting at the newline that ends line 1.
    const endIdx = source.indexOf('\n---', open[0].length - 1);
    if (endIdx === -1)
        return null;
    const block = source.slice(open[0].length, endIdx);
    const result = {};
    for (const line of block.split('\n')) {
        const colon = line.indexOf(':');
        if (colon === -1)
            continue;
        const key = line.slice(0, colon).trim();
        let val = line.slice(colon + 1).trim();
        // Strip one pair of matching surrounding quotes: `"haiku"` → `haiku`.
        if (val.length >= 2
            && (val[0] === '"' || val[0] === "'")
            && val[val.length - 1] === val[0]) {
            val = val.slice(1, -1);
        }
        if (key)
            result[key] = val;
    }
    return result;
}
|
|
53
|
+
/**
 * Parse a markdown source string into a flat list of sections.
 *
 * Sections are emitted in source order (top-to-bottom). Whitespace-only
 * pre-heading content is skipped. A heading with no body (nothing after it)
 * still yields a section — the chunker handles that case by producing zero
 * chunks for it.
 */
export function parseMarkdown(source) {
    const tree = fromMarkdown(source);
    // Collect root-level headings only, each with its source offset. Headings
    // nested inside blockquotes or lists are unusual and are treated as part
    // of the surrounding section's body. Offsets should always be present on
    // mdast nodes; the typeof check is purely defensive.
    const headings = (tree.children ?? [])
        .filter((node) => node.type === 'heading' && typeof node.position?.start.offset === 'number')
        .map((node) => ({
            depth: node.depth ?? 1,
            title: mdastNodeText(node).trim(),
            startOffset: node.position.start.offset,
        }));
    const sections = [];
    // Shared helper: emit an untitled depth-0 section for pre-heading text,
    // skipping whitespace-only content.
    const pushPreamble = (content) => {
        if (content.trim().length > 0) {
            sections.push({
                section_path: '',
                title: '',
                content,
                kind: 'section',
                depth: 0,
            });
        }
    };
    // Edge case: no headings at all → one big pre-heading section (if non-empty).
    if (headings.length === 0) {
        pushPreamble(source);
        return sections;
    }
    // Anything before the first heading is preamble.
    pushPreamble(source.slice(0, headings[0].startOffset));
    // Walk the heading list, building section_path via a depth stack and
    // slicing content from each heading's offset to the next heading's offset
    // (or EOF for the final section).
    const ancestors = [];
    headings.forEach((h, i) => {
        // Pop until the stack top is a strict ancestor of `h`. This handles
        // unusual nesting like H1 → H3 → H2 (the H2 pops the H3 but keeps
        // the H1 ancestor).
        while (ancestors.length > 0 && ancestors[ancestors.length - 1].depth >= h.depth) {
            ancestors.pop();
        }
        ancestors.push({ depth: h.depth, title: h.title });
        const nextOffset = i + 1 < headings.length ? headings[i + 1].startOffset : source.length;
        sections.push({
            section_path: ancestors.map((a) => a.title).join(' / '),
            title: h.title,
            content: source.slice(h.startOffset, nextOffset),
            kind: 'section',
            depth: h.depth,
        });
    });
    return sections;
}
|
|
136
|
+
/**
 * Recursively concatenate the text content of an mdast node, ignoring all
 * formatting. Equivalent to mdast-util-to-string but inlined to avoid the
 * transitive-dep import issue.
 */
function mdastNodeText(node) {
    // Leaf nodes (text, inlineCode, …) carry their text in `value`.
    if (typeof node.value === 'string')
        return node.value;
    // Container nodes: join the text of every child; childless nodes
    // (e.g. thematicBreak) contribute nothing.
    const kids = node.children ?? [];
    return kids.map((child) => mdastNodeText(child)).join('');
}
|
|
152
|
+
//# sourceMappingURL=markdown.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"markdown.js","sourceRoot":"","sources":["../../../src/ingest/parse/markdown.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,YAAY,EAAE,MAAM,0BAA0B,CAAC;AAgExD;;;;;;;;GAQG;AACH,MAAM,UAAU,gBAAgB,CAAC,MAAc;IAC7C,MAAM,EAAE,GAAG,gBAAgB,CAAC,MAAM,CAAC,CAAC;IACpC,IAAI,CAAC,EAAE;QAAE,OAAO,WAAW,CAAC;IAE5B,wCAAwC;IACxC,IAAI,EAAE,CAAC,UAAU,KAAK,UAAU;QAAE,OAAO,UAAU,CAAC;IACpD,IAAI,EAAE,CAAC,UAAU,KAAK,WAAW;QAAE,OAAO,WAAW,CAAC;IAEtD,qBAAqB;IACrB,MAAM,GAAG,GAAG,OAAO,EAAE,CAAC,MAAM,KAAK,QAAQ,CAAC,CAAC,CAAC,EAAE,CAAC,MAAM,CAAC,WAAW,EAAE,CAAC,CAAC,CAAC,EAAE,CAAC;IACzE,IAAI,GAAG,KAAK,OAAO,IAAI,GAAG,KAAK,IAAI,IAAI,GAAG,KAAK,cAAc;QAAE,OAAO,UAAU,CAAC;IAEjF,qBAAqB;IACrB,MAAM,MAAM,GAAG,OAAO,EAAE,CAAC,MAAM,KAAK,QAAQ,CAAC,CAAC,CAAC,EAAE,CAAC,MAAM,CAAC,WAAW,EAAE,CAAC,CAAC,CAAC,EAAE,CAAC;IAC5E,IAAI,MAAM,KAAK,WAAW,IAAI,MAAM,KAAK,cAAc;QAAE,OAAO,WAAW,CAAC;IAE5E,OAAO,WAAW,CAAC;AACrB,CAAC;AAED;;;GAGG;AACH,SAAS,gBAAgB,CAAC,MAAc;IACtC,IAAI,CAAC,MAAM,CAAC,UAAU,CAAC,KAAK,CAAC;QAAE,OAAO,IAAI,CAAC;IAC3C,MAAM,MAAM,GAAG,MAAM,CAAC,OAAO,CAAC,OAAO,EAAE,CAAC,CAAC,CAAC;IAC1C,IAAI,MAAM,KAAK,CAAC,CAAC;QAAE,OAAO,IAAI,CAAC;IAE/B,MAAM,KAAK,GAAG,MAAM,CAAC,KAAK,CAAC,CAAC,EAAE,MAAM,CAAC,CAAC;IACtC,MAAM,MAAM,GAA2B,EAAE,CAAC;IAC1C,KAAK,MAAM,IAAI,IAAI,KAAK,CAAC,KAAK,CAAC,IAAI,CAAC,EAAE,CAAC;QACrC,MAAM,KAAK,GAAG,IAAI,CAAC,OAAO,CAAC,GAAG,CAAC,CAAC;QAChC,IAAI,KAAK,KAAK,CAAC,CAAC;YAAE,SAAS;QAC3B,MAAM,GAAG,GAAG,IAAI,CAAC,KAAK,CAAC,CAAC,EAAE,KAAK,CAAC,CAAC,IAAI,EAAE,CAAC;QACxC,MAAM,GAAG,GAAG,IAAI,CAAC,KAAK,CAAC,KAAK,GAAG,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC;QACzC,IAAI,GAAG;YAAE,MAAM,CAAC,GAAG,CAAC,GAAG,GAAG,CAAC;IAC7B,CAAC;IACD,OAAO,MAAM,CAAC;AAChB,CAAC;AAED;;;;;;;GAOG;AACH,MAAM,UAAU,aAAa,CAAC,MAAc;IAC1C,MAAM,IAAI,GAAG,YAAY,CAAC,MAAM,CAAyB,CAAC;IAC1D,MAAM,QAAQ,GAAkB,EAAE,CAAC;IACnC,MAAM,YAAY,GAAG,IAAI,CAAC,QAAQ,IAAI,EAAE,CAAC;IAEzC,kEAAkE;IAClE,qEAAqE;IACrE,qEAAqE;IACrE,6CAA6C;IAC7C,MAAM,QAAQ,GAIT,EAAE,CAAC;IAER,KAAK,MAAM,KAAK,IAAI,YAAY,EAAE,CAAC;QACjC,IAAI,KAAK,CAAC,IAAI,KAAK,SAAS;YAAE,SAAS;QACvC,MAAM,KAAK,GAAG,KAAK,CA
AC,QAAQ,EAAE,KAAK,CAAC,MAAM,CAAC;QAC3C,IAAI,OAAO,KAAK,KAAK,QAAQ;YAAE,SAAS,CAAC,sCAAsC;QAC/E,QAAQ,CAAC,IAAI,CAAC;YACZ,KAAK,EAAE,KAAK,CAAC,KAAK,IAAI,CAAC;YACvB,KAAK,EAAE,aAAa,CAAC,KAAK,CAAC,CAAC,IAAI,EAAE;YAClC,WAAW,EAAE,KAAK;SACnB,CAAC,CAAC;IACL,CAAC;IAED,8EAA8E;IAC9E,IAAI,QAAQ,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;QAC1B,IAAI,MAAM,CAAC,IAAI,EAAE,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;YAC7B,QAAQ,CAAC,IAAI,CAAC;gBACZ,YAAY,EAAE,EAAE;gBAChB,KAAK,EAAE,EAAE;gBACT,OAAO,EAAE,MAAM;gBACf,IAAI,EAAE,SAAS;gBACf,KAAK,EAAE,CAAC;aACT,CAAC,CAAC;QACL,CAAC;QACD,OAAO,QAAQ,CAAC;IAClB,CAAC;IAED,0DAA0D;IAC1D,MAAM,UAAU,GAAG,QAAQ,CAAC,CAAC,CAAE,CAAC,WAAW,CAAC;IAC5C,IAAI,UAAU,GAAG,CAAC,EAAE,CAAC;QACnB,MAAM,UAAU,GAAG,MAAM,CAAC,KAAK,CAAC,CAAC,EAAE,UAAU,CAAC,CAAC;QAC/C,IAAI,UAAU,CAAC,IAAI,EAAE,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;YACjC,QAAQ,CAAC,IAAI,CAAC;gBACZ,YAAY,EAAE,EAAE;gBAChB,KAAK,EAAE,EAAE;gBACT,OAAO,EAAE,UAAU;gBACnB,IAAI,EAAE,SAAS;gBACf,KAAK,EAAE,CAAC;aACT,CAAC,CAAC;QACL,CAAC;IACH,CAAC;IAED,yEAAyE;IACzE,uEAAuE;IACvE,yCAAyC;IACzC,MAAM,KAAK,GAA4C,EAAE,CAAC;IAE1D,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,QAAQ,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;QACzC,MAAM,CAAC,GAAG,QAAQ,CAAC,CAAC,CAAE,CAAC;QAEvB,2DAA2D;QAC3D,mEAAmE;QACnE,8BAA8B;QAC9B,OAAO,KAAK,CAAC,MAAM,GAAG,CAAC,IAAI,KAAK,CAAC,KAAK,CAAC,MAAM,GAAG,CAAC,CAAE,CAAC,KAAK,IAAI,CAAC,CAAC,KAAK,EAAE,CAAC;YACrE,KAAK,CAAC,GAAG,EAAE,CAAC;QACd,CAAC;QACD,KAAK,CAAC,IAAI,CAAC,EAAE,KAAK,EAAE,CAAC,CAAC,KAAK,EAAE,KAAK,EAAE,CAAC,CAAC,KAAK,EAAE,CAAC,CAAC;QAE/C,MAAM,WAAW,GAAG,KAAK,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,KAAK,CAAC,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC;QAE1D,MAAM,WAAW,GAAG,CAAC,CAAC,WAAW,CAAC;QAClC,MAAM,SAAS,GAAG,CAAC,GAAG,CAAC,GAAG,QAAQ,CAAC,MAAM,CAAC,CAAC,CAAC,QAAQ,CAAC,CAAC,GAAG,CAAC,CAAE,CAAC,WAAW,CAAC,CAAC,CAAC,MAAM,CAAC,MAAM,CAAC;QACzF,MAAM,OAAO,GAAG,MAAM,CAAC,KAAK,CAAC,WAAW,EAAE,SAAS,CAAC,CAAC;QAErD,QAAQ,CAAC,IAAI,CAAC;YACZ,YAAY,EAAE,WAAW;YACzB,KAAK,EAAE,CAAC,CAAC,KAAK;YACd,OAAO;YACP,IAAI,EAAE,SAAS;YACf,KAAK,EAAE,CAAC,CAAC,KAAK;SACf,CAAC,CAAC;IACL,CAAC;IAED,OAAO,
QAAQ,CAAC;AAClB,CAAC;AAED;;;;GAIG;AACH,SAAS,aAAa,CAAC,IAAe;IACpC,IAAI,OAAO,IAAI,CAAC,KAAK,KAAK,QAAQ;QAAE,OAAO,IAAI,CAAC,KAAK,CAAC;IACtD,IAAI,CAAC,IAAI,CAAC,QAAQ;QAAE,OAAO,EAAE,CAAC;IAC9B,IAAI,GAAG,GAAG,EAAE,CAAC;IACb,KAAK,MAAM,KAAK,IAAI,IAAI,CAAC,QAAQ,EAAE,CAAC;QAClC,GAAG,IAAI,aAAa,CAAC,KAAK,CAAC,CAAC;IAC9B,CAAC;IACD,OAAO,GAAG,CAAC;AACb,CAAC"}
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
import type { IngestEvent, IngestSource } from './source.js';
|
|
2
|
+
/**
|
|
3
|
+
* `QueueSubscriber` — placeholder for the Phase 5+ orchestrator contract.
|
|
4
|
+
*
|
|
5
|
+
* The orchestration engineer (per the project memory) is separately
|
|
6
|
+
* building a message-queue contract that will feed events into KG-MCP.
|
|
7
|
+
* When that lands, this class becomes a real subscriber. Until then it
|
|
8
|
+
* exists so:
|
|
9
|
+
* 1. The `IngestSource` interface has both implementations the codebase
|
|
10
|
+
* expects, exercising the contract in type-check
|
|
11
|
+
* 2. `serve.ts` can document the swap-in seam (commented `// const source = new QueueSubscriber(...)`)
|
|
12
|
+
* 3. Phase 5 doesn't have to add a new file — only fill this one in
|
|
13
|
+
*
|
|
14
|
+
* Both methods throw `not implemented` — calling them is a programming
|
|
15
|
+
* error in Phase 2.
|
|
16
|
+
*/
|
|
17
|
+
export declare class QueueSubscriber implements IngestSource {
    /**
     * Begin consuming queue events, forwarding each to `_onEvent`.
     * Phase 2: always rejects with a `not implemented` error.
     */
    start(_onEvent: (ev: IngestEvent) => Promise<void>): Promise<void>;
    /**
     * Stop consuming and release resources.
     * Phase 2: always rejects with a `not implemented` error.
     */
    stop(): Promise<void>;
}
|
|
21
|
+
//# sourceMappingURL=queue.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"queue.d.ts","sourceRoot":"","sources":["../../src/ingest/queue.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,WAAW,EAAE,YAAY,EAAE,MAAM,aAAa,CAAC;AAE7D;;;;;;;;;;;;;;GAcG;AACH,qBAAa,eAAgB,YAAW,YAAY;IAC5C,KAAK,CAAC,QAAQ,EAAE,CAAC,EAAE,EAAE,WAAW,KAAK,OAAO,CAAC,IAAI,CAAC,GAAG,OAAO,CAAC,IAAI,CAAC;IAMlE,IAAI,IAAI,OAAO,CAAC,IAAI,CAAC;CAK5B"}
|
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
/**
 * `QueueSubscriber` — placeholder for the Phase 5+ orchestrator contract.
 *
 * The orchestration engineer (per the project memory) is separately
 * building a message-queue contract that will feed events into KG-MCP.
 * When that lands, this class becomes a real subscriber. Until then it
 * exists so:
 *  1. The `IngestSource` interface has both implementations the codebase
 *     expects, exercising the contract in type-check
 *  2. `serve.ts` can document the swap-in seam (commented `// const source = new QueueSubscriber(...)`)
 *  3. Phase 5 doesn't have to add a new file — only fill this one in
 *
 * Both methods throw `not implemented` — calling them is a programming
 * error in Phase 2.
 */
export class QueueSubscriber {
    /** Phase 2 stub: always rejects. */
    async start(_onEvent) {
        this.#notImplemented('start');
    }
    /** Phase 2 stub: always rejects. */
    async stop() {
        this.#notImplemented('stop');
    }
    /** Throw the Phase 2 placeholder error for the named method. */
    #notImplemented(method) {
        throw new Error(`QueueSubscriber.${method}() not implemented — pending orchestrator message-queue contract`);
    }
}
|
|
24
|
+
//# sourceMappingURL=queue.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"queue.js","sourceRoot":"","sources":["../../src/ingest/queue.ts"],"names":[],"mappings":"AAEA;;;;;;;;;;;;;;GAcG;AACH,MAAM,OAAO,eAAe;IAC1B,KAAK,CAAC,KAAK,CAAC,QAA4C;QACtD,MAAM,IAAI,KAAK,CACb,uFAAuF,CACxF,CAAC;IACJ,CAAC;IAED,KAAK,CAAC,IAAI;QACR,MAAM,IAAI,KAAK,CACb,sFAAsF,CACvF,CAAC;IACJ,CAAC;CACF"}
|
|
@@ -0,0 +1,42 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* `IngestSource` interface for KG-MCP Phase 2.
|
|
3
|
+
*
|
|
4
|
+
* The data-plane seam between the file-system watcher (chokidar today,
|
|
5
|
+
* orchestrator message queue tomorrow) and the `IngesterService`.
|
|
6
|
+
*
|
|
7
|
+
* Phase 2 ships two implementations:
|
|
8
|
+
* - `ChokidarWatcher` (src/ingest/chokidar.ts) — watches a directory,
|
|
9
|
+
* emits events on file changes, with the mandatory 2-second debounce
|
|
10
|
+
* and per-path drop-oldest queue
|
|
11
|
+
* - `QueueSubscriber` (src/ingest/queue.ts) — stub that throws
|
|
12
|
+
* `not implemented`. Phase 5+ wires it to the orchestrator contract
|
|
13
|
+
* when that lands.
|
|
14
|
+
*
|
|
15
|
+
* The `ChokidarWatcher` ↔ `QueueSubscriber` swap is one line in
|
|
16
|
+
* `src/cli/serve.ts`. Per presearch.md §2.5 / D19.
|
|
17
|
+
*/
|
|
18
|
+
/** Kinds of file-system change the ingester reacts to. */
export type IngestEventKind = 'file:added' | 'file:changed' | 'file:removed';
/** Which knowledge graph a file (and its events) belongs to. */
export type Scope = 'project' | 'personal';
/** A single event delivered by an `IngestSource` to its `onEvent` callback. */
export interface IngestEvent {
    /** What happened */
    kind: IngestEventKind;
    /** Absolute path to the markdown file */
    path: string;
    /** Which KG this event belongs to */
    scope: Scope;
}
/**
 * The interface that `serve.ts` programs against. The watcher (or queue
 * subscriber) calls `start(onEvent)` and pumps events into the callback;
 * `stop()` cleans up.
 *
 * Errors thrown by the `onEvent` callback are caught by the source and
 * logged — the source MUST NOT crash on a single failing event, since
 * a single corrupt file shouldn't bring down the watcher for an entire
 * wiki dir.
 */
export interface IngestSource {
    /** Begin pumping events into `onEvent`. */
    start(onEvent: (ev: IngestEvent) => Promise<void>): Promise<void>;
    /** Stop pumping events and release resources. */
    stop(): Promise<void>;
}
|
|
42
|
+
//# sourceMappingURL=source.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"source.d.ts","sourceRoot":"","sources":["../../src/ingest/source.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;GAgBG;AAEH,MAAM,MAAM,eAAe,GAAG,YAAY,GAAG,cAAc,GAAG,cAAc,CAAC;AAE7E,MAAM,MAAM,KAAK,GAAG,SAAS,GAAG,UAAU,CAAC;AAE3C,MAAM,WAAW,WAAW;IAC1B,oBAAoB;IACpB,IAAI,EAAE,eAAe,CAAC;IACtB,yCAAyC;IACzC,IAAI,EAAE,MAAM,CAAC;IACb,qCAAqC;IACrC,KAAK,EAAE,KAAK,CAAC;CACd;AAED;;;;;;;;;GASG;AACH,MAAM,WAAW,YAAY;IAC3B,KAAK,CAAC,OAAO,EAAE,CAAC,EAAE,EAAE,WAAW,KAAK,OAAO,CAAC,IAAI,CAAC,GAAG,OAAO,CAAC,IAAI,CAAC,CAAC;IAClE,IAAI,IAAI,OAAO,CAAC,IAAI,CAAC,CAAC;CACvB"}
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* `IngestSource` interface for KG-MCP Phase 2.
|
|
3
|
+
*
|
|
4
|
+
* The data-plane seam between the file-system watcher (chokidar today,
|
|
5
|
+
* orchestrator message queue tomorrow) and the `IngesterService`.
|
|
6
|
+
*
|
|
7
|
+
* Phase 2 ships two implementations:
|
|
8
|
+
* - `ChokidarWatcher` (src/ingest/chokidar.ts) — watches a directory,
|
|
9
|
+
* emits events on file changes, with the mandatory 2-second debounce
|
|
10
|
+
* and per-path drop-oldest queue
|
|
11
|
+
* - `QueueSubscriber` (src/ingest/queue.ts) — stub that throws
|
|
12
|
+
* `not implemented`. Phase 5+ wires it to the orchestrator contract
|
|
13
|
+
* when that lands.
|
|
14
|
+
*
|
|
15
|
+
* The `ChokidarWatcher` ↔ `QueueSubscriber` swap is one line in
|
|
16
|
+
* `src/cli/serve.ts`. Per presearch.md §2.5 / D19.
|
|
17
|
+
*/
|
|
18
|
+
export {};
|
|
19
|
+
//# sourceMappingURL=source.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"source.js","sourceRoot":"","sources":["../../src/ingest/source.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;GAgBG"}
|