inkdex 0.0.1 → 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +1 -1
- package/dist/store/db.js +1 -1
- package/package.json +13 -3
- package/.claude/settings.local.json +0 -15
- package/.github/workflows/ci.yml +0 -73
- package/.github/workflows/release.yml +0 -65
- package/AGENTS.md +0 -32
- package/biome.json +0 -43
- package/inkdex-0.0.1.tgz +0 -0
- package/release.sh +0 -33
- package/src/cli.ts +0 -45
- package/src/embedder/embedder.ts +0 -52
- package/src/ingest/chunker.ts +0 -158
- package/src/ingest/index-docs.ts +0 -120
- package/src/logger.ts +0 -39
- package/src/search/search.ts +0 -93
- package/src/server.ts +0 -96
- package/src/store/db.ts +0 -217
- package/src/types.ts +0 -16
- package/src/version.ts +0 -16
- package/test/fixtures/docs/api.md +0 -26
- package/test/fixtures/docs/getting-started.md +0 -13
- package/test/helpers/index.ts +0 -14
- package/test/integration/embedder.test.ts +0 -52
- package/test/integration/server.test.ts +0 -125
- package/test/unit/chunker.test.ts +0 -193
- package/test/unit/db.test.ts +0 -190
- package/test/unit/index-docs.test.ts +0 -120
- package/test/unit/logger.test.ts +0 -11
- package/test/unit/search.test.ts +0 -93
- package/test/unit/version.test.ts +0 -16
- package/test-docs/api-reference.md +0 -76
- package/test-docs/deployment.md +0 -55
- package/test-docs/getting-started.md +0 -52
- package/tsconfig.json +0 -18
package/LICENSE
CHANGED
package/dist/store/db.js
CHANGED
|
@@ -9,7 +9,7 @@ export function dbPath(docsPath) {
|
|
|
9
9
|
const hash = createHash("sha256").update(docsPath).digest("hex").slice(0, 12);
|
|
10
10
|
return join(STORE_DIR, `${hash}.db`);
|
|
11
11
|
}
|
|
12
|
-
const SCHEMA_VERSION =
|
|
12
|
+
const SCHEMA_VERSION = 1;
|
|
13
13
|
const CHUNK_COLUMNS = "id, document_path, file_heading, heading, text, metadata, embedding";
|
|
14
14
|
let db;
|
|
15
15
|
let stmts;
|
package/package.json
CHANGED
|
@@ -1,7 +1,19 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "inkdex",
|
|
3
|
-
"version": "0.0.1",
|
|
3
|
+
"version": "0.1.0",
|
|
4
4
|
"description": "MCP server that makes your markdown docs searchable",
|
|
5
|
+
"license": "Apache-2.0",
|
|
6
|
+
"author": "Anton Lundén",
|
|
7
|
+
"repository": {
|
|
8
|
+
"type": "git",
|
|
9
|
+
"url": "https://github.com/kandobyte/inkdex"
|
|
10
|
+
},
|
|
11
|
+
"bugs": {
|
|
12
|
+
"url": "https://github.com/kandobyte/inkdex/issues"
|
|
13
|
+
},
|
|
14
|
+
"files": [
|
|
15
|
+
"dist"
|
|
16
|
+
],
|
|
5
17
|
"type": "module",
|
|
6
18
|
"main": "dist/cli.js",
|
|
7
19
|
"bin": {
|
|
@@ -30,8 +42,6 @@
|
|
|
30
42
|
"markdown",
|
|
31
43
|
"rag"
|
|
32
44
|
],
|
|
33
|
-
"author": "Anton Lundén",
|
|
34
|
-
"license": "Apache-2.0",
|
|
35
45
|
"dependencies": {
|
|
36
46
|
"@huggingface/transformers": "^3.8.1",
|
|
37
47
|
"@modelcontextprotocol/sdk": "^1.25.3",
|
|
package/.claude/settings.local.json
DELETED
|
@@ -1,15 +0,0 @@
|
|
|
1
|
-
{
|
|
2
|
-
"permissions": {
|
|
3
|
-
"allow": [
|
|
4
|
-
"WebFetch(domain:github.com)",
|
|
5
|
-
"mcp__acp__Bash",
|
|
6
|
-
"mcp__acp__Write",
|
|
7
|
-
"mcp__acp__Edit",
|
|
8
|
-
"WebFetch(domain:raw.githubusercontent.com)",
|
|
9
|
-
"WebFetch(domain:api.github.com)",
|
|
10
|
-
"WebFetch(domain:www.firecrawl.dev)",
|
|
11
|
-
"WebFetch(domain:unstructured.io)",
|
|
12
|
-
"WebFetch(domain:www.npmjs.com)"
|
|
13
|
-
]
|
|
14
|
-
}
|
|
15
|
-
}
|
package/.github/workflows/ci.yml
DELETED
|
@@ -1,73 +0,0 @@
|
|
|
1
|
-
name: CI
|
|
2
|
-
|
|
3
|
-
on:
|
|
4
|
-
push:
|
|
5
|
-
branches: [main]
|
|
6
|
-
paths:
|
|
7
|
-
- "src/**"
|
|
8
|
-
- "test/**"
|
|
9
|
-
- "package.json"
|
|
10
|
-
- "package-lock.json"
|
|
11
|
-
- "tsconfig.json"
|
|
12
|
-
- ".github/workflows/**"
|
|
13
|
-
pull_request:
|
|
14
|
-
branches: [main]
|
|
15
|
-
paths:
|
|
16
|
-
- "src/**"
|
|
17
|
-
- "test/**"
|
|
18
|
-
- "package.json"
|
|
19
|
-
- "package-lock.json"
|
|
20
|
-
- "tsconfig.json"
|
|
21
|
-
- ".github/workflows/**"
|
|
22
|
-
|
|
23
|
-
jobs:
|
|
24
|
-
audit:
|
|
25
|
-
name: Security Audit
|
|
26
|
-
runs-on: ubuntu-latest
|
|
27
|
-
steps:
|
|
28
|
-
- uses: actions/checkout@v5
|
|
29
|
-
|
|
30
|
-
- uses: actions/setup-node@v6
|
|
31
|
-
with:
|
|
32
|
-
node-version: "22"
|
|
33
|
-
|
|
34
|
-
- run: npm audit --audit-level=critical
|
|
35
|
-
|
|
36
|
-
check:
|
|
37
|
-
name: Lint & Format
|
|
38
|
-
runs-on: ubuntu-latest
|
|
39
|
-
steps:
|
|
40
|
-
- uses: actions/checkout@v5
|
|
41
|
-
|
|
42
|
-
- uses: actions/setup-node@v6
|
|
43
|
-
with:
|
|
44
|
-
node-version: "22"
|
|
45
|
-
|
|
46
|
-
- run: npm ci
|
|
47
|
-
- run: npm run check
|
|
48
|
-
|
|
49
|
-
test-unit:
|
|
50
|
-
name: Unit Tests
|
|
51
|
-
runs-on: ubuntu-latest
|
|
52
|
-
steps:
|
|
53
|
-
- uses: actions/checkout@v5
|
|
54
|
-
|
|
55
|
-
- uses: actions/setup-node@v6
|
|
56
|
-
with:
|
|
57
|
-
node-version: "22"
|
|
58
|
-
|
|
59
|
-
- run: npm ci
|
|
60
|
-
- run: npm run test:unit
|
|
61
|
-
|
|
62
|
-
test-integration:
|
|
63
|
-
name: Integration Tests
|
|
64
|
-
runs-on: ubuntu-latest
|
|
65
|
-
steps:
|
|
66
|
-
- uses: actions/checkout@v5
|
|
67
|
-
|
|
68
|
-
- uses: actions/setup-node@v6
|
|
69
|
-
with:
|
|
70
|
-
node-version: "22"
|
|
71
|
-
|
|
72
|
-
- run: npm ci
|
|
73
|
-
- run: npm run test:integration
|
|
package/.github/workflows/release.yml
DELETED
|
@@ -1,65 +0,0 @@
|
|
|
1
|
-
name: Release
|
|
2
|
-
|
|
3
|
-
on:
|
|
4
|
-
push:
|
|
5
|
-
tags:
|
|
6
|
-
- "v*"
|
|
7
|
-
|
|
8
|
-
permissions:
|
|
9
|
-
contents: write
|
|
10
|
-
id-token: write
|
|
11
|
-
|
|
12
|
-
jobs:
|
|
13
|
-
check:
|
|
14
|
-
name: Check
|
|
15
|
-
runs-on: ubuntu-latest
|
|
16
|
-
steps:
|
|
17
|
-
- uses: actions/checkout@v5
|
|
18
|
-
|
|
19
|
-
- uses: actions/setup-node@v6
|
|
20
|
-
with:
|
|
21
|
-
node-version: "22"
|
|
22
|
-
|
|
23
|
-
- run: npm ci
|
|
24
|
-
- run: npm run check
|
|
25
|
-
|
|
26
|
-
test:
|
|
27
|
-
name: Test
|
|
28
|
-
runs-on: ubuntu-latest
|
|
29
|
-
steps:
|
|
30
|
-
- uses: actions/checkout@v5
|
|
31
|
-
|
|
32
|
-
- uses: actions/setup-node@v6
|
|
33
|
-
with:
|
|
34
|
-
node-version: "22"
|
|
35
|
-
|
|
36
|
-
- run: npm ci
|
|
37
|
-
- run: npm run test
|
|
38
|
-
|
|
39
|
-
npm:
|
|
40
|
-
name: npm
|
|
41
|
-
needs: [check, test]
|
|
42
|
-
runs-on: ubuntu-latest
|
|
43
|
-
steps:
|
|
44
|
-
- uses: actions/checkout@v5
|
|
45
|
-
|
|
46
|
-
- uses: actions/setup-node@v6
|
|
47
|
-
with:
|
|
48
|
-
node-version: "22"
|
|
49
|
-
registry-url: "https://registry.npmjs.org"
|
|
50
|
-
|
|
51
|
-
- run: npm install -g npm@latest
|
|
52
|
-
- run: npm ci
|
|
53
|
-
- run: npm run build
|
|
54
|
-
- run: npm publish --access public --provenance
|
|
55
|
-
|
|
56
|
-
release:
|
|
57
|
-
name: Release
|
|
58
|
-
needs: [npm]
|
|
59
|
-
runs-on: ubuntu-latest
|
|
60
|
-
steps:
|
|
61
|
-
- uses: actions/checkout@v5
|
|
62
|
-
|
|
63
|
-
- uses: softprops/action-gh-release@v2
|
|
64
|
-
with:
|
|
65
|
-
generate_release_notes: true
|
package/AGENTS.md
DELETED
|
@@ -1,32 +0,0 @@
|
|
|
1
|
-
# AGENTS.md
|
|
2
|
-
|
|
3
|
-
- Use [Conventional Commits](https://www.conventionalcommits.org/) (`feat:`, `fix:`, `chore:`, `docs:`)
|
|
4
|
-
|
|
5
|
-
## General Coding Guidelines
|
|
6
|
-
|
|
7
|
-
- Maintain consistency with existing patterns and style in the codebase
|
|
8
|
-
- Use TypeScript strictly: enable `strict: true`, prefer `unknown` over `any`, avoid type assertions unless necessary
|
|
9
|
-
- Write comments that explain *why*, not *what*—update or remove stale comments when modifying code
|
|
10
|
-
- Prefer renaming over commenting: if code needs a comment to explain what it does, rename instead
|
|
11
|
-
- Use JSDoc (`/** */`) only for exported functions/types; use `//` for implementation notes
|
|
12
|
-
- Use `@package` on exports internal to their feature package
|
|
13
|
-
- Include `@example` in JSDoc when input/output isn't obvious from the signature
|
|
14
|
-
- No commented-out code, no TODO/FIXME without a linked issue
|
|
15
|
-
- Naming: camelCase functions/variables, PascalCase types/classes, UPPER_SNAKE_CASE constants; prefix booleans with `is`/`has`/`should`
|
|
16
|
-
- Keep functions focused and single-responsibility; favor immutable patterns (`readonly`, no mutation)
|
|
17
|
-
- Handle errors consistently: prefer typed errors or Result patterns, handle promise rejections explicitly
|
|
18
|
-
- Use modern syntax: optional chaining (`?.`), nullish coalescing (`??`), `satisfies`, ES modules
|
|
19
|
-
- After refactoring, run `npm run test` to verify tests pass and coverage requirements are met
|
|
20
|
-
- Write tests covering happy path, edge cases, and error conditions with descriptive names
|
|
21
|
-
- Test should validate observable behavior not implementation details
|
|
22
|
-
|
|
23
|
-
## Development
|
|
24
|
-
|
|
25
|
-
```bash
|
|
26
|
-
npm install
|
|
27
|
-
npm run build # TypeScript compilation
|
|
28
|
-
npm run dev # Run via tsx
|
|
29
|
-
npm run check # Biome lint
|
|
30
|
-
npm run format # Biome format
|
|
31
|
-
npm test # Unit + integration
|
|
32
|
-
```
|
package/biome.json
DELETED
|
@@ -1,43 +0,0 @@
|
|
|
1
|
-
{
|
|
2
|
-
"$schema": "https://biomejs.dev/schemas/2.3.14/schema.json",
|
|
3
|
-
"vcs": {
|
|
4
|
-
"enabled": true,
|
|
5
|
-
"clientKind": "git",
|
|
6
|
-
"useIgnoreFile": true
|
|
7
|
-
},
|
|
8
|
-
"assist": { "actions": { "source": { "organizeImports": "on" } } },
|
|
9
|
-
"formatter": {
|
|
10
|
-
"indentStyle": "space",
|
|
11
|
-
"indentWidth": 2
|
|
12
|
-
},
|
|
13
|
-
"linter": {
|
|
14
|
-
"enabled": true,
|
|
15
|
-
"rules": {
|
|
16
|
-
"recommended": true,
|
|
17
|
-
"suspicious": {
|
|
18
|
-
"noExplicitAny": "error"
|
|
19
|
-
},
|
|
20
|
-
"performance": {
|
|
21
|
-
"noDelete": "off"
|
|
22
|
-
},
|
|
23
|
-
"correctness": {
|
|
24
|
-
"noPrivateImports": "error"
|
|
25
|
-
}
|
|
26
|
-
}
|
|
27
|
-
},
|
|
28
|
-
"overrides": [
|
|
29
|
-
{
|
|
30
|
-
"includes": ["test/**"],
|
|
31
|
-
"linter": {
|
|
32
|
-
"rules": {
|
|
33
|
-
"correctness": {
|
|
34
|
-
"noPrivateImports": "off"
|
|
35
|
-
}
|
|
36
|
-
}
|
|
37
|
-
}
|
|
38
|
-
}
|
|
39
|
-
],
|
|
40
|
-
"files": {
|
|
41
|
-
"includes": ["**", "!**/dist", "!**/node_modules"]
|
|
42
|
-
}
|
|
43
|
-
}
|
package/inkdex-0.0.1.tgz
DELETED
|
Binary file
|
package/release.sh
DELETED
|
@@ -1,33 +0,0 @@
|
|
|
1
|
-
#!/usr/bin/env bash
|
|
2
|
-
set -euo pipefail
|
|
3
|
-
|
|
4
|
-
if [[ "${1:-}" =~ ^(-h|--help)$ ]] || [[ -z "${1:-}" ]]; then
|
|
5
|
-
echo "Usage: ./release.sh <version>"
|
|
6
|
-
echo " version: X.Y.Z (e.g., 0.1.0)"
|
|
7
|
-
exit 0
|
|
8
|
-
fi
|
|
9
|
-
|
|
10
|
-
VERSION="$1"
|
|
11
|
-
|
|
12
|
-
if ! [[ "$VERSION" =~ ^[0-9]+\.[0-9]+\.[0-9]+$ ]]; then
|
|
13
|
-
echo "Error: Version must be in format X.Y.Z (e.g., 0.1.0)"
|
|
14
|
-
exit 1
|
|
15
|
-
fi
|
|
16
|
-
|
|
17
|
-
echo "Releasing v$VERSION"
|
|
18
|
-
|
|
19
|
-
npm version "$VERSION" --no-git-tag-version
|
|
20
|
-
npm install
|
|
21
|
-
|
|
22
|
-
npm audit --audit-level=critical
|
|
23
|
-
npm run check
|
|
24
|
-
npm run test:unit
|
|
25
|
-
npm run test:integration
|
|
26
|
-
|
|
27
|
-
npm run build
|
|
28
|
-
|
|
29
|
-
git add package.json package-lock.json
|
|
30
|
-
git commit -m "v$VERSION"
|
|
31
|
-
git tag "v$VERSION"
|
|
32
|
-
|
|
33
|
-
echo "Done. Push with: git push origin main v$VERSION"
|
package/src/cli.ts
DELETED
|
@@ -1,45 +0,0 @@
|
|
|
1
|
-
#!/usr/bin/env node
|
|
2
|
-
|
|
3
|
-
import { stat } from "node:fs/promises";
|
|
4
|
-
import { resolve } from "node:path";
|
|
5
|
-
import { Embedder } from "./embedder/embedder.js";
|
|
6
|
-
import { indexDocs } from "./ingest/index-docs.js";
|
|
7
|
-
import { logger } from "./logger.js";
|
|
8
|
-
import { startServer } from "./server.js";
|
|
9
|
-
import { closeDb, openDb } from "./store/db.js";
|
|
10
|
-
|
|
11
|
-
process.on("uncaughtException", (error) => {
|
|
12
|
-
logger.error({ error }, "Uncaught exception");
|
|
13
|
-
process.exit(1);
|
|
14
|
-
});
|
|
15
|
-
|
|
16
|
-
process.on("unhandledRejection", (reason) => {
|
|
17
|
-
logger.error({ reason }, "Unhandled rejection");
|
|
18
|
-
process.exit(1);
|
|
19
|
-
});
|
|
20
|
-
|
|
21
|
-
async function main(): Promise<void> {
|
|
22
|
-
const docsPath = process.env.DOCS_PATH;
|
|
23
|
-
if (!docsPath) {
|
|
24
|
-
logger.error("DOCS_PATH environment variable is required");
|
|
25
|
-
process.exit(1);
|
|
26
|
-
}
|
|
27
|
-
|
|
28
|
-
const resolved = resolve(docsPath);
|
|
29
|
-
const info = await stat(resolved).catch(() => null);
|
|
30
|
-
if (!info?.isDirectory()) {
|
|
31
|
-
logger.error({ path: resolved }, "DOCS_PATH is not a directory");
|
|
32
|
-
process.exit(1);
|
|
33
|
-
}
|
|
34
|
-
|
|
35
|
-
const embedder = await Embedder.load();
|
|
36
|
-
openDb(resolved);
|
|
37
|
-
await indexDocs(embedder, resolved);
|
|
38
|
-
await startServer(embedder);
|
|
39
|
-
}
|
|
40
|
-
|
|
41
|
-
main().catch((error) => {
|
|
42
|
-
closeDb();
|
|
43
|
-
logger.error({ error }, "Failed to start server");
|
|
44
|
-
process.exit(1);
|
|
45
|
-
});
|
package/src/embedder/embedder.ts
DELETED
|
@@ -1,52 +0,0 @@
|
|
|
1
|
-
import type { FeatureExtractionPipeline } from "@huggingface/transformers";
|
|
2
|
-
import { pipeline } from "@huggingface/transformers";
|
|
3
|
-
|
|
4
|
-
const MODEL = "Xenova/all-MiniLM-L6-v2";
|
|
5
|
-
const BATCH_SIZE = 32;
|
|
6
|
-
|
|
7
|
-
export class Embedder {
|
|
8
|
-
readonly maxTokens: number;
|
|
9
|
-
private readonly pipeline: FeatureExtractionPipeline;
|
|
10
|
-
|
|
11
|
-
private constructor(pipe: FeatureExtractionPipeline) {
|
|
12
|
-
this.pipeline = pipe;
|
|
13
|
-
this.maxTokens = (pipe.tokenizer.model_max_length as number) ?? 256;
|
|
14
|
-
}
|
|
15
|
-
|
|
16
|
-
static async load(): Promise<Embedder> {
|
|
17
|
-
const pipe = await pipeline<"feature-extraction">(
|
|
18
|
-
"feature-extraction",
|
|
19
|
-
MODEL,
|
|
20
|
-
);
|
|
21
|
-
return new Embedder(pipe);
|
|
22
|
-
}
|
|
23
|
-
|
|
24
|
-
tokenize(text: string): number[] {
|
|
25
|
-
return this.pipeline.tokenizer.encode(text);
|
|
26
|
-
}
|
|
27
|
-
|
|
28
|
-
async embed(text: string): Promise<number[]> {
|
|
29
|
-
const result = await this.pipeline(text, {
|
|
30
|
-
pooling: "mean",
|
|
31
|
-
normalize: true,
|
|
32
|
-
});
|
|
33
|
-
return (result.tolist() as number[][])[0];
|
|
34
|
-
}
|
|
35
|
-
|
|
36
|
-
async embedBatch(texts: string[]): Promise<number[][]> {
|
|
37
|
-
if (texts.length === 0) return [];
|
|
38
|
-
|
|
39
|
-
const results: number[][] = [];
|
|
40
|
-
|
|
41
|
-
for (let i = 0; i < texts.length; i += BATCH_SIZE) {
|
|
42
|
-
const batch = texts.slice(i, i + BATCH_SIZE);
|
|
43
|
-
const result = await this.pipeline(batch, {
|
|
44
|
-
pooling: "mean",
|
|
45
|
-
normalize: true,
|
|
46
|
-
});
|
|
47
|
-
results.push(...(result.tolist() as number[][]));
|
|
48
|
-
}
|
|
49
|
-
|
|
50
|
-
return results;
|
|
51
|
-
}
|
|
52
|
-
}
|
package/src/ingest/chunker.ts
DELETED
|
@@ -1,158 +0,0 @@
|
|
|
1
|
-
import { basename } from "node:path";
|
|
2
|
-
import matter from "gray-matter";
|
|
3
|
-
import type { BaseChunk } from "../types.js";
|
|
4
|
-
|
|
5
|
-
const OVERLAP_RATIO = 0.1;
|
|
6
|
-
const SUB_SEPARATORS = [/^### /m, /\n\n/, /\. /];
|
|
7
|
-
|
|
8
|
-
export interface ChunkOptions {
|
|
9
|
-
readonly maxTokens: number;
|
|
10
|
-
readonly countTokens: (text: string) => number;
|
|
11
|
-
}
|
|
12
|
-
|
|
13
|
-
function extractH1(body: string): string | null {
|
|
14
|
-
const match = body.match(/^# (.+)$/m);
|
|
15
|
-
return match ? match[1].trim() : null;
|
|
16
|
-
}
|
|
17
|
-
|
|
18
|
-
function clean(text: string): string {
|
|
19
|
-
return text
|
|
20
|
-
.replace(/<!--.*?-->/gs, "")
|
|
21
|
-
.replace(/\n{3,}/g, "\n\n")
|
|
22
|
-
.trim();
|
|
23
|
-
}
|
|
24
|
-
|
|
25
|
-
function splitWithOverlap(
|
|
26
|
-
text: string,
|
|
27
|
-
separators: RegExp[],
|
|
28
|
-
maxTokens: number,
|
|
29
|
-
overlap: number,
|
|
30
|
-
countTokens: (text: string) => number,
|
|
31
|
-
): string[] {
|
|
32
|
-
if (countTokens(text) <= maxTokens) return [text];
|
|
33
|
-
|
|
34
|
-
const separator = separators[0];
|
|
35
|
-
const remaining = separators.slice(1);
|
|
36
|
-
|
|
37
|
-
const parts = text.split(separator).filter((p) => p.trim());
|
|
38
|
-
if (parts.length <= 1) {
|
|
39
|
-
// Separator didn't help — try the next one
|
|
40
|
-
if (remaining.length > 0) {
|
|
41
|
-
return splitWithOverlap(text, remaining, maxTokens, overlap, countTokens);
|
|
42
|
-
}
|
|
43
|
-
// Last resort: hard split
|
|
44
|
-
return hardSplit(text, maxTokens, overlap, countTokens);
|
|
45
|
-
}
|
|
46
|
-
|
|
47
|
-
const chunks: string[] = [];
|
|
48
|
-
let current = "";
|
|
49
|
-
|
|
50
|
-
for (const part of parts) {
|
|
51
|
-
const combined = current ? `${current}\n\n${part}` : part;
|
|
52
|
-
if (current && countTokens(combined) > maxTokens) {
|
|
53
|
-
chunks.push(current.trim());
|
|
54
|
-
// Start next chunk with overlap from the end of the previous
|
|
55
|
-
const overlapText = current.slice(-overlap);
|
|
56
|
-
current = overlapText + part;
|
|
57
|
-
} else {
|
|
58
|
-
current = combined;
|
|
59
|
-
}
|
|
60
|
-
}
|
|
61
|
-
if (current.trim()) chunks.push(current.trim());
|
|
62
|
-
|
|
63
|
-
// Recursively split any chunks that are still too large
|
|
64
|
-
return chunks.flatMap((chunk) => {
|
|
65
|
-
if (countTokens(chunk) <= maxTokens) return [chunk];
|
|
66
|
-
if (remaining.length > 0) {
|
|
67
|
-
return splitWithOverlap(
|
|
68
|
-
chunk,
|
|
69
|
-
remaining,
|
|
70
|
-
maxTokens,
|
|
71
|
-
overlap,
|
|
72
|
-
countTokens,
|
|
73
|
-
);
|
|
74
|
-
}
|
|
75
|
-
return hardSplit(chunk, maxTokens, overlap, countTokens);
|
|
76
|
-
});
|
|
77
|
-
}
|
|
78
|
-
|
|
79
|
-
function hardSplit(
|
|
80
|
-
text: string,
|
|
81
|
-
maxTokens: number,
|
|
82
|
-
overlap: number,
|
|
83
|
-
countTokens: (text: string) => number,
|
|
84
|
-
): string[] {
|
|
85
|
-
const chunks: string[] = [];
|
|
86
|
-
const words = text.split(/\s+/);
|
|
87
|
-
let current = "";
|
|
88
|
-
|
|
89
|
-
for (const word of words) {
|
|
90
|
-
const next = current ? `${current} ${word}` : word;
|
|
91
|
-
if (countTokens(next) > maxTokens && current) {
|
|
92
|
-
chunks.push(current.trim());
|
|
93
|
-
// Keep overlap from end of current chunk
|
|
94
|
-
const overlapText = current.slice(-overlap);
|
|
95
|
-
current = overlapText + word;
|
|
96
|
-
} else {
|
|
97
|
-
current = next;
|
|
98
|
-
}
|
|
99
|
-
}
|
|
100
|
-
if (current.trim()) chunks.push(current.trim());
|
|
101
|
-
|
|
102
|
-
return chunks;
|
|
103
|
-
}
|
|
104
|
-
|
|
105
|
-
/** @package */
|
|
106
|
-
export function chunkMarkdown(
|
|
107
|
-
content: string,
|
|
108
|
-
path: string,
|
|
109
|
-
options: ChunkOptions,
|
|
110
|
-
): BaseChunk[] {
|
|
111
|
-
const { maxTokens, countTokens } = options;
|
|
112
|
-
const overlap = Math.floor(maxTokens * OVERLAP_RATIO);
|
|
113
|
-
const { data: metadata, content: body } = matter(content);
|
|
114
|
-
const fileHeading = extractH1(body) || basename(path, ".md");
|
|
115
|
-
const sections = body.split(/^## /m);
|
|
116
|
-
const chunks: BaseChunk[] = [];
|
|
117
|
-
|
|
118
|
-
for (let i = 0; i < sections.length; i++) {
|
|
119
|
-
const section = sections[i];
|
|
120
|
-
if (!section.trim()) continue;
|
|
121
|
-
|
|
122
|
-
let heading: string;
|
|
123
|
-
let text: string;
|
|
124
|
-
|
|
125
|
-
if (i === 0) {
|
|
126
|
-
// Content before the first ## — strip the H1 line and use fileHeading
|
|
127
|
-
heading = fileHeading;
|
|
128
|
-
const withoutH1 = section.replace(/^# .+$/m, "");
|
|
129
|
-
text = clean(withoutH1);
|
|
130
|
-
} else {
|
|
131
|
-
const [headingLine, ...rest] = section.split("\n");
|
|
132
|
-
heading = headingLine.trim();
|
|
133
|
-
text = clean(rest.join("\n"));
|
|
134
|
-
}
|
|
135
|
-
|
|
136
|
-
if (!text) continue;
|
|
137
|
-
|
|
138
|
-
const subChunks = splitWithOverlap(
|
|
139
|
-
text,
|
|
140
|
-
SUB_SEPARATORS,
|
|
141
|
-
maxTokens,
|
|
142
|
-
overlap,
|
|
143
|
-
countTokens,
|
|
144
|
-
);
|
|
145
|
-
|
|
146
|
-
for (const sub of subChunks) {
|
|
147
|
-
chunks.push({
|
|
148
|
-
path,
|
|
149
|
-
fileHeading,
|
|
150
|
-
heading,
|
|
151
|
-
text: sub,
|
|
152
|
-
metadata,
|
|
153
|
-
});
|
|
154
|
-
}
|
|
155
|
-
}
|
|
156
|
-
|
|
157
|
-
return chunks;
|
|
158
|
-
}
|
package/src/ingest/index-docs.ts
DELETED
|
@@ -1,120 +0,0 @@
|
|
|
1
|
-
import { createHash } from "node:crypto";
|
|
2
|
-
import { glob, readFile } from "node:fs/promises";
|
|
3
|
-
import { relative } from "node:path";
|
|
4
|
-
import type { Embedder } from "../embedder/embedder.js";
|
|
5
|
-
import { logger } from "../logger.js";
|
|
6
|
-
import {
|
|
7
|
-
getAllDocumentHashes,
|
|
8
|
-
insertChunk,
|
|
9
|
-
removeDocument,
|
|
10
|
-
runInTransaction,
|
|
11
|
-
setDocumentHash,
|
|
12
|
-
} from "../store/db.js";
|
|
13
|
-
import { chunkMarkdown } from "./chunker.js";
|
|
14
|
-
|
|
15
|
-
const MAX_CHUNK_FILL = 0.8;
|
|
16
|
-
|
|
17
|
-
async function findMarkdownFiles(docsPath: string): Promise<string[]> {
|
|
18
|
-
const files: string[] = [];
|
|
19
|
-
for await (const entry of glob("**/*.md", { cwd: docsPath })) {
|
|
20
|
-
files.push(`${docsPath}/${entry}`);
|
|
21
|
-
}
|
|
22
|
-
return files.sort();
|
|
23
|
-
}
|
|
24
|
-
|
|
25
|
-
function hashContent(content: string): string {
|
|
26
|
-
return createHash("sha256").update(content).digest("hex");
|
|
27
|
-
}
|
|
28
|
-
|
|
29
|
-
export async function indexDocs(
|
|
30
|
-
embedder: Embedder,
|
|
31
|
-
docsPath: string,
|
|
32
|
-
): Promise<void> {
|
|
33
|
-
const files = await findMarkdownFiles(docsPath);
|
|
34
|
-
|
|
35
|
-
if (files.length === 0) {
|
|
36
|
-
logger.warn({ path: docsPath }, "No markdown files found");
|
|
37
|
-
return;
|
|
38
|
-
}
|
|
39
|
-
|
|
40
|
-
const fileContents = new Map<string, string>();
|
|
41
|
-
for (const file of files) {
|
|
42
|
-
const key = relative(docsPath, file);
|
|
43
|
-
const content = await readFile(file, "utf-8");
|
|
44
|
-
fileContents.set(key, content);
|
|
45
|
-
}
|
|
46
|
-
|
|
47
|
-
const storedHashes = getAllDocumentHashes();
|
|
48
|
-
|
|
49
|
-
const changedKeys: string[] = [];
|
|
50
|
-
for (const [key, content] of fileContents) {
|
|
51
|
-
if (storedHashes[key] !== hashContent(content)) {
|
|
52
|
-
changedKeys.push(key);
|
|
53
|
-
}
|
|
54
|
-
}
|
|
55
|
-
|
|
56
|
-
const removedKeys: string[] = [];
|
|
57
|
-
for (const key of Object.keys(storedHashes)) {
|
|
58
|
-
if (!fileContents.has(key)) {
|
|
59
|
-
removedKeys.push(key);
|
|
60
|
-
}
|
|
61
|
-
}
|
|
62
|
-
|
|
63
|
-
if (changedKeys.length === 0 && removedKeys.length === 0) {
|
|
64
|
-
logger.info({ files: files.length }, "Index up to date");
|
|
65
|
-
return;
|
|
66
|
-
}
|
|
67
|
-
|
|
68
|
-
const start = performance.now();
|
|
69
|
-
|
|
70
|
-
logger.info(
|
|
71
|
-
{ changed: changedKeys.length, removed: removedKeys.length },
|
|
72
|
-
"Indexing changed files",
|
|
73
|
-
);
|
|
74
|
-
|
|
75
|
-
if (removedKeys.length > 0) {
|
|
76
|
-
runInTransaction(() => {
|
|
77
|
-
for (const key of removedKeys) {
|
|
78
|
-
removeDocument(key);
|
|
79
|
-
}
|
|
80
|
-
});
|
|
81
|
-
}
|
|
82
|
-
|
|
83
|
-
const chunkOptions = {
|
|
84
|
-
maxTokens: Math.floor(embedder.maxTokens * MAX_CHUNK_FILL),
|
|
85
|
-
countTokens: (text: string) => embedder.tokenize(text).length,
|
|
86
|
-
};
|
|
87
|
-
|
|
88
|
-
let totalChunks = 0;
|
|
89
|
-
for (const key of changedKeys) {
|
|
90
|
-
const content = fileContents.get(key) as string;
|
|
91
|
-
const chunks = chunkMarkdown(content, key, chunkOptions);
|
|
92
|
-
|
|
93
|
-
logger.debug({ path: key, chunks: chunks.length }, "Embedding chunks");
|
|
94
|
-
const embeddings = await embedder.embedBatch(chunks.map((c) => c.text));
|
|
95
|
-
|
|
96
|
-
runInTransaction(() => {
|
|
97
|
-
removeDocument(key);
|
|
98
|
-
setDocumentHash(key, hashContent(content));
|
|
99
|
-
for (let i = 0; i < chunks.length; i++) {
|
|
100
|
-
const chunk = chunks[i];
|
|
101
|
-
insertChunk(
|
|
102
|
-
chunk.path,
|
|
103
|
-
chunk.fileHeading,
|
|
104
|
-
chunk.heading,
|
|
105
|
-
chunk.text,
|
|
106
|
-
chunk.metadata,
|
|
107
|
-
embeddings[i],
|
|
108
|
-
);
|
|
109
|
-
}
|
|
110
|
-
});
|
|
111
|
-
|
|
112
|
-
totalChunks += chunks.length;
|
|
113
|
-
}
|
|
114
|
-
|
|
115
|
-
const duration = ((performance.now() - start) / 1000).toFixed(1);
|
|
116
|
-
logger.info(
|
|
117
|
-
{ duration: `${duration}s`, chunks: totalChunks },
|
|
118
|
-
"Indexing complete",
|
|
119
|
-
);
|
|
120
|
-
}
|