inkdex 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.claude/settings.local.json +15 -0
- package/.github/workflows/ci.yml +73 -0
- package/.github/workflows/release.yml +65 -0
- package/AGENTS.md +32 -0
- package/LICENSE +190 -0
- package/README.md +40 -0
- package/biome.json +43 -0
- package/dist/cli.d.ts +2 -0
- package/dist/cli.js +38 -0
- package/dist/embedder/embedder.d.ts +9 -0
- package/dist/embedder/embedder.js +39 -0
- package/dist/ingest/chunker.d.ts +7 -0
- package/dist/ingest/chunker.js +114 -0
- package/dist/ingest/index-docs.d.ts +2 -0
- package/dist/ingest/index-docs.js +78 -0
- package/dist/logger.d.ts +6 -0
- package/dist/logger.js +28 -0
- package/dist/search/search.d.ts +7 -0
- package/dist/search/search.js +70 -0
- package/dist/server.d.ts +2 -0
- package/dist/server.js +66 -0
- package/dist/store/db.d.ts +13 -0
- package/dist/store/db.js +149 -0
- package/dist/types.d.ts +14 -0
- package/dist/types.js +1 -0
- package/dist/version.d.ts +1 -0
- package/dist/version.js +13 -0
- package/inkdex-0.0.1.tgz +0 -0
- package/package.json +46 -0
- package/release.sh +33 -0
- package/src/cli.ts +45 -0
- package/src/embedder/embedder.ts +52 -0
- package/src/ingest/chunker.ts +158 -0
- package/src/ingest/index-docs.ts +120 -0
- package/src/logger.ts +39 -0
- package/src/search/search.ts +93 -0
- package/src/server.ts +96 -0
- package/src/store/db.ts +217 -0
- package/src/types.ts +16 -0
- package/src/version.ts +16 -0
- package/test/fixtures/docs/api.md +26 -0
- package/test/fixtures/docs/getting-started.md +13 -0
- package/test/helpers/index.ts +14 -0
- package/test/integration/embedder.test.ts +52 -0
- package/test/integration/server.test.ts +125 -0
- package/test/unit/chunker.test.ts +193 -0
- package/test/unit/db.test.ts +190 -0
- package/test/unit/index-docs.test.ts +120 -0
- package/test/unit/logger.test.ts +11 -0
- package/test/unit/search.test.ts +93 -0
- package/test/unit/version.test.ts +16 -0
- package/test-docs/api-reference.md +76 -0
- package/test-docs/deployment.md +55 -0
- package/test-docs/getting-started.md +52 -0
- package/tsconfig.json +18 -0
package/src/store/db.ts
ADDED
|
@@ -0,0 +1,217 @@
|
|
|
1
|
+
import { createHash } from "node:crypto";
|
|
2
|
+
import { mkdirSync } from "node:fs";
|
|
3
|
+
import { homedir } from "node:os";
|
|
4
|
+
import { join } from "node:path";
|
|
5
|
+
import type { StatementSync } from "node:sqlite";
|
|
6
|
+
import { DatabaseSync } from "node:sqlite";
|
|
7
|
+
import type { ChunkRow } from "../types.js";
|
|
8
|
+
|
|
9
|
+
const STORE_DIR = join(homedir(), ".inkdex");
|
|
10
|
+
|
|
11
|
+
/** @package */
|
|
12
|
+
export function dbPath(docsPath: string): string {
|
|
13
|
+
const hash = createHash("sha256").update(docsPath).digest("hex").slice(0, 12);
|
|
14
|
+
return join(STORE_DIR, `${hash}.db`);
|
|
15
|
+
}
|
|
16
|
+
// Bump when the table layout changes; a mismatched DB is rebuilt on open.
const SCHEMA_VERSION = 2;

// Single column list shared by SELECTs so they stay in sync with RawChunkRow.
const CHUNK_COLUMNS =
  "id, document_path, file_heading, heading, text, metadata, embedding";

// Module-level singleton connection; assigned by openDb(), closed by closeDb().
let db: DatabaseSync;

// Prepared statements, rebuilt by prepareStatements() on every openDb() call
// (prepared statements are bound to a specific DatabaseSync instance).
let stmts: {
  getAllDocs: StatementSync;
  upsertDoc: StatementSync;
  deleteChunksByDoc: StatementSync;
  deleteDoc: StatementSync;
  insertChunk: StatementSync;
  getAllChunks: StatementSync;
  countChunks: StatementSync;
  searchFts: StatementSync;
};
|
|
33
|
+
|
|
34
|
+
/**
 * Create all tables, the FTS5 index, and its sync triggers, then stamp
 * the schema version into PRAGMA user_version.
 *
 * Every statement uses IF NOT EXISTS, so this is idempotent and safe to
 * run on every open. chunks_fts is an external-content FTS5 table backed
 * by chunks; the two triggers keep it in sync on insert and delete
 * (this module never UPDATEs chunk rows, so no update trigger exists).
 */
function createSchema(): void {
  db.exec(`
    CREATE TABLE IF NOT EXISTS documents (
      path TEXT PRIMARY KEY,
      hash TEXT NOT NULL
    );
    CREATE TABLE IF NOT EXISTS chunks (
      id INTEGER PRIMARY KEY,
      document_path TEXT NOT NULL REFERENCES documents(path),
      file_heading TEXT NOT NULL,
      heading TEXT NOT NULL,
      text TEXT NOT NULL,
      metadata TEXT NOT NULL,
      embedding BLOB NOT NULL
    );
    CREATE INDEX IF NOT EXISTS idx_chunks_document ON chunks(document_path);

    CREATE VIRTUAL TABLE IF NOT EXISTS chunks_fts
    USING fts5(text, content=chunks, content_rowid=id);

    CREATE TRIGGER IF NOT EXISTS chunks_fts_insert AFTER INSERT ON chunks BEGIN
      INSERT INTO chunks_fts(rowid, text) VALUES (new.id, new.text);
    END;

    CREATE TRIGGER IF NOT EXISTS chunks_fts_delete AFTER DELETE ON chunks BEGIN
      INSERT INTO chunks_fts(chunks_fts, rowid, text)
      VALUES('delete', old.id, old.text);
    END;
  `);
  // Record the version this schema was built with, for openDb's check.
  db.exec(`PRAGMA user_version = ${SCHEMA_VERSION}`);
}
|
|
65
|
+
|
|
66
|
+
/**
 * Prepare every statement this module uses against the current connection.
 *
 * Must run after the schema exists, and again after each openDb(), since
 * prepared statements belong to one DatabaseSync instance.
 */
function prepareStatements(): void {
  stmts = {
    getAllDocs: db.prepare("SELECT path, hash FROM documents"),
    // INSERT OR REPLACE makes this an upsert keyed on the path primary key.
    upsertDoc: db.prepare(
      "INSERT OR REPLACE INTO documents (path, hash) VALUES (?, ?)",
    ),
    deleteChunksByDoc: db.prepare("DELETE FROM chunks WHERE document_path = ?"),
    deleteDoc: db.prepare("DELETE FROM documents WHERE path = ?"),
    insertChunk: db.prepare(
      "INSERT INTO chunks (document_path, file_heading, heading, text, metadata, embedding) VALUES (?, ?, ?, ?, ?, ?)",
    ),
    getAllChunks: db.prepare(`SELECT ${CHUNK_COLUMNS} FROM chunks`),
    countChunks: db.prepare("SELECT COUNT(*) as count FROM chunks"),
    // bm25() ranks best matches first (lower scores are better in FTS5).
    searchFts: db.prepare(
      "SELECT rowid FROM chunks_fts WHERE chunks_fts MATCH ? ORDER BY bm25(chunks_fts) LIMIT ?",
    ),
  };
}
|
|
84
|
+
|
|
85
|
+
export function openDb(docsPath: string): void {
|
|
86
|
+
mkdirSync(STORE_DIR, { recursive: true });
|
|
87
|
+
db = new DatabaseSync(dbPath(docsPath));
|
|
88
|
+
db.exec("PRAGMA journal_mode = WAL");
|
|
89
|
+
db.exec("PRAGMA foreign_keys = ON");
|
|
90
|
+
|
|
91
|
+
const { user_version } = db.prepare("PRAGMA user_version").get() as {
|
|
92
|
+
user_version: number;
|
|
93
|
+
};
|
|
94
|
+
|
|
95
|
+
if (user_version !== SCHEMA_VERSION) {
|
|
96
|
+
db.exec("DROP TABLE IF EXISTS chunks");
|
|
97
|
+
db.exec("DROP TABLE IF EXISTS documents");
|
|
98
|
+
createSchema();
|
|
99
|
+
} else {
|
|
100
|
+
createSchema();
|
|
101
|
+
}
|
|
102
|
+
|
|
103
|
+
prepareStatements();
|
|
104
|
+
}
|
|
105
|
+
|
|
106
|
+
export function closeDb(): void {
|
|
107
|
+
db?.close();
|
|
108
|
+
}
|
|
109
|
+
|
|
110
|
+
export function getAllDocumentHashes(): Record<string, string> {
|
|
111
|
+
const rows = stmts.getAllDocs.all() as {
|
|
112
|
+
path: string;
|
|
113
|
+
hash: string;
|
|
114
|
+
}[];
|
|
115
|
+
const result: Record<string, string> = {};
|
|
116
|
+
for (const row of rows) {
|
|
117
|
+
result[row.path] = row.hash;
|
|
118
|
+
}
|
|
119
|
+
return result;
|
|
120
|
+
}
|
|
121
|
+
|
|
122
|
+
/** Record (insert or overwrite) the content hash for a document path. */
export function setDocumentHash(path: string, hash: string): void {
  stmts.upsertDoc.run(path, hash);
}
|
|
125
|
+
|
|
126
|
+
/**
 * Delete a document and all of its chunks.
 *
 * Chunks go first: chunks.document_path references documents(path) and
 * foreign keys are enabled, so deleting the document row first would fail.
 */
export function removeDocument(path: string): void {
  stmts.deleteChunksByDoc.run(path);
  stmts.deleteDoc.run(path);
}
|
|
130
|
+
|
|
131
|
+
function embeddingToBlob(embedding: number[]): Buffer {
|
|
132
|
+
return Buffer.from(new Float32Array(embedding).buffer);
|
|
133
|
+
}
|
|
134
|
+
|
|
135
|
+
function blobToEmbedding(blob: Uint8Array): number[] {
|
|
136
|
+
return Array.from(new Float32Array(new Uint8Array(blob).buffer));
|
|
137
|
+
}
|
|
138
|
+
|
|
139
|
+
interface RawChunkRow {
|
|
140
|
+
id: number;
|
|
141
|
+
document_path: string;
|
|
142
|
+
file_heading: string;
|
|
143
|
+
heading: string;
|
|
144
|
+
text: string;
|
|
145
|
+
metadata: string;
|
|
146
|
+
embedding: Uint8Array;
|
|
147
|
+
}
|
|
148
|
+
|
|
149
|
+
function toChunkRow(row: RawChunkRow): ChunkRow {
|
|
150
|
+
return {
|
|
151
|
+
id: row.id,
|
|
152
|
+
path: row.document_path,
|
|
153
|
+
fileHeading: row.file_heading,
|
|
154
|
+
heading: row.heading,
|
|
155
|
+
text: row.text,
|
|
156
|
+
metadata: JSON.parse(row.metadata),
|
|
157
|
+
embedding: blobToEmbedding(row.embedding),
|
|
158
|
+
};
|
|
159
|
+
}
|
|
160
|
+
|
|
161
|
+
export function insertChunk(
|
|
162
|
+
documentPath: string,
|
|
163
|
+
fileHeading: string,
|
|
164
|
+
heading: string,
|
|
165
|
+
text: string,
|
|
166
|
+
metadata: Record<string, unknown>,
|
|
167
|
+
embedding: number[],
|
|
168
|
+
): void {
|
|
169
|
+
stmts.insertChunk.run(
|
|
170
|
+
documentPath,
|
|
171
|
+
fileHeading,
|
|
172
|
+
heading,
|
|
173
|
+
text,
|
|
174
|
+
JSON.stringify(metadata),
|
|
175
|
+
embeddingToBlob(embedding),
|
|
176
|
+
);
|
|
177
|
+
}
|
|
178
|
+
|
|
179
|
+
export function getAllChunks(): ChunkRow[] {
|
|
180
|
+
const rows = stmts.getAllChunks.all() as unknown as RawChunkRow[];
|
|
181
|
+
return rows.map(toChunkRow);
|
|
182
|
+
}
|
|
183
|
+
|
|
184
|
+
export function getChunkCount(): number {
|
|
185
|
+
const row = stmts.countChunks.get() as {
|
|
186
|
+
count: number;
|
|
187
|
+
};
|
|
188
|
+
return row.count;
|
|
189
|
+
}
|
|
190
|
+
|
|
191
|
+
export function searchFts(query: string, limit: number): number[] {
|
|
192
|
+
// Sanitize: split into words, quote each to escape FTS5 operators
|
|
193
|
+
const terms = query
|
|
194
|
+
.split(/\s+/)
|
|
195
|
+
.filter((t) => t.length > 0)
|
|
196
|
+
.map((t) => `"${t.replace(/"/g, '""')}"`)
|
|
197
|
+
.join(" ");
|
|
198
|
+
if (terms.length === 0) return [];
|
|
199
|
+
try {
|
|
200
|
+
const rows = stmts.searchFts.all(terms, limit) as { rowid: number }[];
|
|
201
|
+
return rows.map((r) => r.rowid);
|
|
202
|
+
} catch {
|
|
203
|
+
// FTS5 MATCH can fail on edge-case inputs; fall back to empty
|
|
204
|
+
return [];
|
|
205
|
+
}
|
|
206
|
+
}
|
|
207
|
+
|
|
208
|
+
export function runInTransaction(fn: () => void): void {
|
|
209
|
+
db.exec("BEGIN");
|
|
210
|
+
try {
|
|
211
|
+
fn();
|
|
212
|
+
db.exec("COMMIT");
|
|
213
|
+
} catch (error) {
|
|
214
|
+
db.exec("ROLLBACK");
|
|
215
|
+
throw error;
|
|
216
|
+
}
|
|
217
|
+
}
|
package/src/types.ts
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
/** One chunk of a markdown document as produced by the ingest pipeline. */
export interface BaseChunk {
  /** Source document path. */
  path: string;
  /** File-level heading (the H1, or a filename-derived fallback). */
  fileHeading: string;
  /** Heading of the section this chunk was cut from. */
  heading: string;
  /** Chunk body text. */
  text: string;
  /** Parsed frontmatter key/value pairs. */
  metadata: Record<string, unknown>;
}

/** A chunk as stored in / loaded from the database. */
export interface ChunkRow extends BaseChunk {
  /** SQLite rowid of the chunks row. */
  id: number;
  /** Embedding vector decoded from the stored float32 BLOB. */
  embedding: number[];
}

/** A chunk returned from search, with its relevance score attached. */
export interface SearchResult extends Omit<ChunkRow, "id" | "embedding"> {
  score: number;
}
|
package/src/version.ts
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
import { readFileSync } from "node:fs";
|
|
2
|
+
import { dirname, join } from "node:path";
|
|
3
|
+
import { fileURLToPath } from "node:url";
|
|
4
|
+
|
|
5
|
+
const __dirname = dirname(fileURLToPath(import.meta.url));
|
|
6
|
+
|
|
7
|
+
export function getVersion(): string {
|
|
8
|
+
try {
|
|
9
|
+
const packageJson = JSON.parse(
|
|
10
|
+
readFileSync(join(__dirname, "..", "package.json"), "utf-8"),
|
|
11
|
+
);
|
|
12
|
+
return packageJson.version || "unknown";
|
|
13
|
+
} catch {
|
|
14
|
+
return "unknown";
|
|
15
|
+
}
|
|
16
|
+
}
|
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
---
|
|
2
|
+
title: API Documentation
|
|
3
|
+
version: "2.0"
|
|
4
|
+
---
|
|
5
|
+
|
|
6
|
+
# API Documentation
|
|
7
|
+
|
|
8
|
+
## Authentication
|
|
9
|
+
|
|
10
|
+
All API requests require a valid JWT token in the Authorization header.
|
|
11
|
+
To obtain a token, send a POST request to `/auth/login` with your credentials.
|
|
12
|
+
Tokens expire after 24 hours and must be refreshed using the `/auth/refresh` endpoint.
|
|
13
|
+
|
|
14
|
+
## Rate Limiting
|
|
15
|
+
|
|
16
|
+
The API enforces rate limiting to ensure fair usage across all clients.
|
|
17
|
+
Default limits are 100 requests per minute for authenticated users
|
|
18
|
+
and 10 requests per minute for unauthenticated access.
|
|
19
|
+
Exceeding the limit returns a 429 Too Many Requests response.
|
|
20
|
+
|
|
21
|
+
## Error Handling
|
|
22
|
+
|
|
23
|
+
All errors follow a consistent JSON format with a `code` field and a `message` field.
|
|
24
|
+
Common error codes include 400 for bad requests, 401 for unauthorized access,
|
|
25
|
+
403 for forbidden resources, and 500 for internal server errors.
|
|
26
|
+
Each error response includes a unique `request_id` for debugging purposes.
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
# Getting Started
|
|
2
|
+
|
|
3
|
+
## Installation
|
|
4
|
+
|
|
5
|
+
Install the package using npm. Run `npm install mylib` in your project directory.
|
|
6
|
+
The package requires Node.js version 18 or higher and works on all major platforms
|
|
7
|
+
including Windows, macOS, and Linux.
|
|
8
|
+
|
|
9
|
+
## Configuration
|
|
10
|
+
|
|
11
|
+
Create a configuration file at the root of your project named `mylib.config.json`.
|
|
12
|
+
The minimum required configuration includes the `apiKey` and `endpoint` fields.
|
|
13
|
+
You can also set environment variables prefixed with `MYLIB_` to override config values.
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
export function delay(ms: number): Promise<void> {
|
|
2
|
+
return new Promise((resolve) => setTimeout(resolve, ms));
|
|
3
|
+
}
|
|
4
|
+
|
|
5
|
+
export function createMockLogger() {
|
|
6
|
+
return {
|
|
7
|
+
trace: () => {},
|
|
8
|
+
debug: () => {},
|
|
9
|
+
info: () => {},
|
|
10
|
+
warn: () => {},
|
|
11
|
+
error: () => {},
|
|
12
|
+
fatal: () => {},
|
|
13
|
+
};
|
|
14
|
+
}
|
|
@@ -0,0 +1,52 @@
|
|
|
1
|
+
import assert from "node:assert";
|
|
2
|
+
import { describe, it } from "node:test";
|
|
3
|
+
import { Embedder } from "../../src/embedder/embedder.js";
|
|
4
|
+
|
|
5
|
+
describe("embedder", () => {
|
|
6
|
+
let embedder: Embedder;
|
|
7
|
+
|
|
8
|
+
it("should load the model", async () => {
|
|
9
|
+
embedder = await Embedder.load();
|
|
10
|
+
assert.ok(embedder);
|
|
11
|
+
assert.ok(embedder.maxTokens > 0);
|
|
12
|
+
});
|
|
13
|
+
|
|
14
|
+
it("should tokenize text", () => {
|
|
15
|
+
const tokens = embedder.tokenize("hello world");
|
|
16
|
+
assert.ok(Array.isArray(tokens));
|
|
17
|
+
assert.ok(tokens.length > 0);
|
|
18
|
+
});
|
|
19
|
+
|
|
20
|
+
it("should embed a single text", async () => {
|
|
21
|
+
const embedding = await embedder.embed("hello world");
|
|
22
|
+
assert.ok(Array.isArray(embedding));
|
|
23
|
+
assert.ok(embedding.length > 0);
|
|
24
|
+
assert.strictEqual(typeof embedding[0], "number");
|
|
25
|
+
});
|
|
26
|
+
|
|
27
|
+
it("should return normalized embeddings", async () => {
|
|
28
|
+
const embedding = await embedder.embed("test text");
|
|
29
|
+
const magnitude = Math.sqrt(embedding.reduce((sum, v) => sum + v * v, 0));
|
|
30
|
+
assert.ok(Math.abs(magnitude - 1.0) < 0.01);
|
|
31
|
+
});
|
|
32
|
+
|
|
33
|
+
it("should embed a batch of texts", async () => {
|
|
34
|
+
const embeddings = await embedder.embedBatch(["hello", "world"]);
|
|
35
|
+
assert.strictEqual(embeddings.length, 2);
|
|
36
|
+
assert.strictEqual(embeddings[0].length, embeddings[1].length);
|
|
37
|
+
});
|
|
38
|
+
|
|
39
|
+
it("should return empty array for empty batch", async () => {
|
|
40
|
+
const embeddings = await embedder.embedBatch([]);
|
|
41
|
+
assert.deepStrictEqual(embeddings, []);
|
|
42
|
+
});
|
|
43
|
+
|
|
44
|
+
it("should produce different embeddings for different texts", async () => {
|
|
45
|
+
const [a, b] = await embedder.embedBatch([
|
|
46
|
+
"javascript programming",
|
|
47
|
+
"cooking recipes",
|
|
48
|
+
]);
|
|
49
|
+
const same = a.every((v, i) => v === b[i]);
|
|
50
|
+
assert.ok(!same);
|
|
51
|
+
});
|
|
52
|
+
});
|
|
@@ -0,0 +1,125 @@
|
|
|
1
|
+
import assert from "node:assert";
|
|
2
|
+
import { resolve } from "node:path";
|
|
3
|
+
import { describe, it } from "node:test";
|
|
4
|
+
import { Client } from "@modelcontextprotocol/sdk/client/index.js";
|
|
5
|
+
import { StdioClientTransport } from "@modelcontextprotocol/sdk/client/stdio.js";
|
|
6
|
+
|
|
7
|
+
const DOCS_PATH = resolve("test/fixtures/docs");
|
|
8
|
+
|
|
9
|
+
function createTransport(): StdioClientTransport {
|
|
10
|
+
return new StdioClientTransport({
|
|
11
|
+
command: "tsx",
|
|
12
|
+
args: ["src/cli.ts"],
|
|
13
|
+
env: { ...process.env, DOCS_PATH, LOG_LEVEL: "silent" },
|
|
14
|
+
});
|
|
15
|
+
}
|
|
16
|
+
|
|
17
|
+
function createClient(): Client {
|
|
18
|
+
return new Client(
|
|
19
|
+
{ name: "test-client", version: "1.0.0" },
|
|
20
|
+
{ capabilities: {} },
|
|
21
|
+
);
|
|
22
|
+
}
|
|
23
|
+
|
|
24
|
+
describe("server integration", () => {
|
|
25
|
+
it("should list search_docs tool", async () => {
|
|
26
|
+
const client = createClient();
|
|
27
|
+
const transport = createTransport();
|
|
28
|
+
|
|
29
|
+
await client.connect(transport);
|
|
30
|
+
const { tools } = await client.listTools();
|
|
31
|
+
|
|
32
|
+
assert.ok(tools.length > 0);
|
|
33
|
+
assert.strictEqual(tools[0].name, "search_docs");
|
|
34
|
+
|
|
35
|
+
await client.close();
|
|
36
|
+
});
|
|
37
|
+
|
|
38
|
+
it("should search docs and return results", async () => {
|
|
39
|
+
const client = createClient();
|
|
40
|
+
const transport = createTransport();
|
|
41
|
+
|
|
42
|
+
await client.connect(transport);
|
|
43
|
+
|
|
44
|
+
const result = await client.callTool({
|
|
45
|
+
name: "search_docs",
|
|
46
|
+
arguments: { query: "how to authenticate", limit: 3 },
|
|
47
|
+
});
|
|
48
|
+
|
|
49
|
+
assert.ok(result.content);
|
|
50
|
+
assert.ok(Array.isArray(result.content));
|
|
51
|
+
assert.strictEqual(result.content[0].type, "text");
|
|
52
|
+
assert.ok(result.content[0].text.includes("Authentication"));
|
|
53
|
+
|
|
54
|
+
await client.close();
|
|
55
|
+
});
|
|
56
|
+
|
|
57
|
+
it("should clamp limit to maximum of 20", async () => {
|
|
58
|
+
const client = createClient();
|
|
59
|
+
const transport = createTransport();
|
|
60
|
+
|
|
61
|
+
await client.connect(transport);
|
|
62
|
+
|
|
63
|
+
const result = await client.callTool({
|
|
64
|
+
name: "search_docs",
|
|
65
|
+
arguments: { query: "test", limit: 999 },
|
|
66
|
+
});
|
|
67
|
+
|
|
68
|
+
assert.ok(result.content);
|
|
69
|
+
assert.ok(Array.isArray(result.content));
|
|
70
|
+
|
|
71
|
+
await client.close();
|
|
72
|
+
});
|
|
73
|
+
|
|
74
|
+
it("should clamp limit to minimum of 1", async () => {
|
|
75
|
+
const client = createClient();
|
|
76
|
+
const transport = createTransport();
|
|
77
|
+
|
|
78
|
+
await client.connect(transport);
|
|
79
|
+
|
|
80
|
+
const result = await client.callTool({
|
|
81
|
+
name: "search_docs",
|
|
82
|
+
arguments: { query: "test", limit: -5 },
|
|
83
|
+
});
|
|
84
|
+
|
|
85
|
+
assert.ok(result.content);
|
|
86
|
+
assert.ok(Array.isArray(result.content));
|
|
87
|
+
|
|
88
|
+
await client.close();
|
|
89
|
+
});
|
|
90
|
+
|
|
91
|
+
it("should use default limit when not specified", async () => {
|
|
92
|
+
const client = createClient();
|
|
93
|
+
const transport = createTransport();
|
|
94
|
+
|
|
95
|
+
await client.connect(transport);
|
|
96
|
+
|
|
97
|
+
const result = await client.callTool({
|
|
98
|
+
name: "search_docs",
|
|
99
|
+
arguments: { query: "install" },
|
|
100
|
+
});
|
|
101
|
+
|
|
102
|
+
assert.ok(result.content);
|
|
103
|
+
assert.ok(Array.isArray(result.content));
|
|
104
|
+
assert.strictEqual(result.content[0].type, "text");
|
|
105
|
+
|
|
106
|
+
await client.close();
|
|
107
|
+
});
|
|
108
|
+
|
|
109
|
+
it("should return no results message for unrelated query", async () => {
|
|
110
|
+
const client = createClient();
|
|
111
|
+
const transport = createTransport();
|
|
112
|
+
|
|
113
|
+
await client.connect(transport);
|
|
114
|
+
|
|
115
|
+
const result = await client.callTool({
|
|
116
|
+
name: "search_docs",
|
|
117
|
+
arguments: { query: "xyzzy", limit: 1 },
|
|
118
|
+
});
|
|
119
|
+
|
|
120
|
+
assert.ok(result.content);
|
|
121
|
+
assert.ok(Array.isArray(result.content));
|
|
122
|
+
|
|
123
|
+
await client.close();
|
|
124
|
+
});
|
|
125
|
+
});
|
|
@@ -0,0 +1,193 @@
|
|
|
1
|
+
import assert from "node:assert";
|
|
2
|
+
import { describe, it } from "node:test";
|
|
3
|
+
import type { ChunkOptions } from "../../src/ingest/chunker.js";
|
|
4
|
+
import { chunkMarkdown } from "../../src/ingest/chunker.js";
|
|
5
|
+
|
|
6
|
+
// Use character length as a stand-in for token counting in unit tests
// (keeps these tests independent of the real tokenizer/model).
const defaultOptions: ChunkOptions = {
  maxTokens: 2000,
  countTokens: (text: string) => text.length,
};
|
|
11
|
+
|
|
12
|
+
describe("chunker", () => {
|
|
13
|
+
it("should split on H2 headings", () => {
|
|
14
|
+
const md = `# My Doc
|
|
15
|
+
|
|
16
|
+
## First Section
|
|
17
|
+
|
|
18
|
+
This is the first section with enough content to pass the minimum length filter for chunks.
|
|
19
|
+
|
|
20
|
+
## Second Section
|
|
21
|
+
|
|
22
|
+
This is the second section with enough content to pass the minimum length filter for chunks.
|
|
23
|
+
`;
|
|
24
|
+
const chunks = chunkMarkdown(md, "test.md", defaultOptions);
|
|
25
|
+
assert.strictEqual(chunks.length, 2);
|
|
26
|
+
assert.strictEqual(chunks[0].heading, "First Section");
|
|
27
|
+
assert.strictEqual(chunks[1].heading, "Second Section");
|
|
28
|
+
});
|
|
29
|
+
|
|
30
|
+
it("should extract H1 as fileHeading", () => {
|
|
31
|
+
const md = `# API Reference
|
|
32
|
+
|
|
33
|
+
## Methods
|
|
34
|
+
|
|
35
|
+
Here are the available methods with detailed descriptions and usage examples for each one.
|
|
36
|
+
`;
|
|
37
|
+
const chunks = chunkMarkdown(md, "api.md", defaultOptions);
|
|
38
|
+
assert.strictEqual(chunks[0].fileHeading, "API Reference");
|
|
39
|
+
});
|
|
40
|
+
|
|
41
|
+
it("should fall back to filename when no H1", () => {
|
|
42
|
+
const md = `## Only Section
|
|
43
|
+
|
|
44
|
+
Content here with enough text to pass the minimum length requirement for the chunk filter.
|
|
45
|
+
`;
|
|
46
|
+
const chunks = chunkMarkdown(md, "guide.md", defaultOptions);
|
|
47
|
+
assert.strictEqual(chunks[0].fileHeading, "guide");
|
|
48
|
+
});
|
|
49
|
+
|
|
50
|
+
it("should parse frontmatter", () => {
|
|
51
|
+
const md = `---
|
|
52
|
+
title: My Guide
|
|
53
|
+
version: "1.0"
|
|
54
|
+
---
|
|
55
|
+
|
|
56
|
+
# Guide
|
|
57
|
+
|
|
58
|
+
## Section One
|
|
59
|
+
|
|
60
|
+
This section has enough content to be included as a chunk in the final output results.
|
|
61
|
+
`;
|
|
62
|
+
const chunks = chunkMarkdown(md, "guide.md", defaultOptions);
|
|
63
|
+
assert.strictEqual(chunks[0].metadata.title, "My Guide");
|
|
64
|
+
assert.strictEqual(chunks[0].metadata.version, "1.0");
|
|
65
|
+
});
|
|
66
|
+
|
|
67
|
+
it("should skip empty sections", () => {
|
|
68
|
+
const md = `# Doc
|
|
69
|
+
|
|
70
|
+
## Empty
|
|
71
|
+
|
|
72
|
+
## Has Content
|
|
73
|
+
|
|
74
|
+
This section has actual content that should be included in the output.
|
|
75
|
+
`;
|
|
76
|
+
const chunks = chunkMarkdown(md, "test.md", defaultOptions);
|
|
77
|
+
assert.strictEqual(chunks.length, 1);
|
|
78
|
+
assert.strictEqual(chunks[0].heading, "Has Content");
|
|
79
|
+
});
|
|
80
|
+
|
|
81
|
+
it("should strip HTML comments", () => {
|
|
82
|
+
const md = `# Doc
|
|
83
|
+
|
|
84
|
+
## Section
|
|
85
|
+
|
|
86
|
+
<!-- This is a comment that should be removed -->
|
|
87
|
+
The actual content remains here and has enough length to pass the minimum filter requirement.
|
|
88
|
+
`;
|
|
89
|
+
const chunks = chunkMarkdown(md, "test.md", defaultOptions);
|
|
90
|
+
assert.ok(!chunks[0].text.includes("<!--"));
|
|
91
|
+
assert.ok(chunks[0].text.includes("actual content"));
|
|
92
|
+
});
|
|
93
|
+
|
|
94
|
+
it("should sub-split oversized sections", () => {
|
|
95
|
+
const paragraph =
|
|
96
|
+
"This is a paragraph with enough words to take up space. ".repeat(30);
|
|
97
|
+
const md = `# Doc
|
|
98
|
+
|
|
99
|
+
## Big Section
|
|
100
|
+
|
|
101
|
+
${paragraph}
|
|
102
|
+
|
|
103
|
+
### Sub Heading A
|
|
104
|
+
|
|
105
|
+
${paragraph}
|
|
106
|
+
|
|
107
|
+
### Sub Heading B
|
|
108
|
+
|
|
109
|
+
${paragraph}
|
|
110
|
+
`;
|
|
111
|
+
const chunks = chunkMarkdown(md, "test.md", defaultOptions);
|
|
112
|
+
assert.ok(
|
|
113
|
+
chunks.length > 1,
|
|
114
|
+
`Expected multiple chunks, got ${chunks.length}`,
|
|
115
|
+
);
|
|
116
|
+
for (const chunk of chunks) {
|
|
117
|
+
assert.ok(
|
|
118
|
+
chunk.text.length <= 2200,
|
|
119
|
+
`Chunk too large: ${chunk.text.length} chars`,
|
|
120
|
+
);
|
|
121
|
+
assert.strictEqual(chunk.heading, "Big Section");
|
|
122
|
+
}
|
|
123
|
+
});
|
|
124
|
+
|
|
125
|
+
it("should keep small sections as single chunks", () => {
|
|
126
|
+
const md = `# Doc
|
|
127
|
+
|
|
128
|
+
## Small Section
|
|
129
|
+
|
|
130
|
+
This section is well under the limit and should remain as a single chunk without splitting.
|
|
131
|
+
`;
|
|
132
|
+
const chunks = chunkMarkdown(md, "test.md", defaultOptions);
|
|
133
|
+
assert.strictEqual(chunks.length, 1);
|
|
134
|
+
});
|
|
135
|
+
|
|
136
|
+
it("should preserve heading for all sub-chunks", () => {
|
|
137
|
+
const longText = "Sentence one with content. ".repeat(200);
|
|
138
|
+
const md = `# Doc
|
|
139
|
+
|
|
140
|
+
## API Reference
|
|
141
|
+
|
|
142
|
+
${longText}
|
|
143
|
+
`;
|
|
144
|
+
const chunks = chunkMarkdown(md, "test.md", defaultOptions);
|
|
145
|
+
assert.ok(chunks.length > 1);
|
|
146
|
+
for (const chunk of chunks) {
|
|
147
|
+
assert.strictEqual(chunk.heading, "API Reference");
|
|
148
|
+
assert.strictEqual(chunk.fileHeading, "Doc");
|
|
149
|
+
}
|
|
150
|
+
});
|
|
151
|
+
|
|
152
|
+
it("should respect custom measure function for splitting", () => {
|
|
153
|
+
const md = `# Doc
|
|
154
|
+
|
|
155
|
+
## Section
|
|
156
|
+
|
|
157
|
+
${"word ".repeat(100)}
|
|
158
|
+
`;
|
|
159
|
+
// Count by words, max 20 words per chunk
|
|
160
|
+
const wordOptions: ChunkOptions = {
|
|
161
|
+
maxTokens: 20,
|
|
162
|
+
countTokens: (text: string) => text.split(/\s+/).filter(Boolean).length,
|
|
163
|
+
};
|
|
164
|
+
const chunks = chunkMarkdown(md, "test.md", wordOptions);
|
|
165
|
+
assert.ok(
|
|
166
|
+
chunks.length > 1,
|
|
167
|
+
`Expected multiple chunks, got ${chunks.length}`,
|
|
168
|
+
);
|
|
169
|
+
for (const chunk of chunks) {
|
|
170
|
+
const wordCount = chunk.text.split(/\s+/).filter(Boolean).length;
|
|
171
|
+
assert.ok(
|
|
172
|
+
wordCount <= 25,
|
|
173
|
+
`Chunk has ${wordCount} words, expected <= 25 (with overlap)`,
|
|
174
|
+
);
|
|
175
|
+
}
|
|
176
|
+
});
|
|
177
|
+
|
|
178
|
+
it("should include pre-H2 content with fileHeading", () => {
|
|
179
|
+
const md = `# My Doc
|
|
180
|
+
|
|
181
|
+
This is intro content before any H2 heading.
|
|
182
|
+
|
|
183
|
+
## First Section
|
|
184
|
+
|
|
185
|
+
Section content here.
|
|
186
|
+
`;
|
|
187
|
+
const chunks = chunkMarkdown(md, "test.md", defaultOptions);
|
|
188
|
+
assert.strictEqual(chunks.length, 2);
|
|
189
|
+
assert.strictEqual(chunks[0].heading, "My Doc");
|
|
190
|
+
assert.ok(chunks[0].text.includes("intro content"));
|
|
191
|
+
assert.strictEqual(chunks[1].heading, "First Section");
|
|
192
|
+
});
|
|
193
|
+
});
|