@tideshift/sextant 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.env.example +25 -0
- package/LICENSE +21 -0
- package/README.md +123 -0
- package/package.json +53 -0
- package/src/config.ts +50 -0
- package/src/ignore-files.ts +31 -0
- package/src/index.ts +69 -0
- package/src/indexer/chunker.ts +245 -0
- package/src/indexer/embedder.ts +79 -0
- package/src/indexer/freshness.ts +74 -0
- package/src/indexer/pipeline.ts +161 -0
- package/src/indexer/state.ts +70 -0
- package/src/indexer/types.ts +34 -0
- package/src/scripts/diag.ts +154 -0
- package/src/server.ts +98 -0
- package/src/store/metadata-db.ts +109 -0
- package/src/store/orama-store.ts +172 -0
- package/src/store/persistence.ts +99 -0
- package/src/tools/get.ts +22 -0
- package/src/tools/list.ts +31 -0
- package/src/tools/reindex.ts +29 -0
- package/src/tools/search.ts +93 -0
- package/src/tools/status.ts +53 -0
package/.env.example
ADDED
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
# Path to the docs folder to index (absolute or relative to project root)
|
|
2
|
+
DOCS_PATH=./docs
|
|
3
|
+
|
|
4
|
+
# Ollama endpoint
|
|
5
|
+
OLLAMA_URL=http://localhost:11434
|
|
6
|
+
|
|
7
|
+
# Embedding model
|
|
8
|
+
EMBEDDING_MODEL=nomic-embed-text
|
|
9
|
+
|
|
10
|
+
# Embedding dimensions (must match model; nomic-embed-text = 768, mxbai-embed-large = 1024)
|
|
11
|
+
EMBEDDING_DIMS=768
|
|
12
|
+
|
|
13
|
+
# Data storage path (for Orama index + SQLite metadata)
|
|
14
|
+
DATA_PATH=.sextant
|
|
15
|
+
|
|
16
|
+
# Chunk size limits
|
|
17
|
+
MAX_CHUNK_TOKENS=512
|
|
18
|
+
CHUNK_OVERLAP_LINES=2
|
|
19
|
+
|
|
20
|
+
# Search defaults
|
|
21
|
+
DEFAULT_TOP_K=10
|
|
22
|
+
|
|
23
|
+
# Hybrid search weights (0.0-1.0, must sum to 1.0)
|
|
24
|
+
HYBRID_WEIGHT_TEXT=0.5
|
|
25
|
+
HYBRID_WEIGHT_VECTOR=0.5
|
package/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Tideshift Labs
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
package/README.md
ADDED
|
@@ -0,0 +1,123 @@
|
|
|
1
|
+
<p align="center">
|
|
2
|
+
<img src="docs/sextant-logo.png" alt="Sextant" width="300">
|
|
3
|
+
</p>
|
|
4
|
+
|
|
5
|
+
<p align="center">
|
|
6
|
+
<a href="LICENSE"><img src="https://img.shields.io/badge/license-MIT-blue.svg" alt="License: MIT"></a>
|
|
7
|
+
<img src="https://img.shields.io/badge/runtime-Bun-f9f1e1?logo=bun" alt="Bun">
|
|
8
|
+
<img src="https://img.shields.io/badge/protocol-MCP-7c3aed" alt="MCP">
|
|
9
|
+
</p>
|
|
10
|
+
|
|
11
|
+
# Sextant
|
|
12
|
+
|
|
13
|
+
Sextant is an MCP server that provides hybrid semantic and keyword search over a local folder of markdown files. It indexes your docs, chunks them by heading, embeds them through a local Ollama model, and exposes search and retrieval tools over the Model Context Protocol (MCP).
|
|
14
|
+
|
|
15
|
+
Claude Code (or any MCP client) can then search your documentation using natural language queries, exact keyword lookups, or a combination of both.
|
|
16
|
+
|
|
17
|
+
## How it works
|
|
18
|
+
|
|
19
|
+
1. Sextant scans a folder of markdown files and splits them into chunks at each heading boundary.
|
|
20
|
+
2. Each chunk is embedded locally using Ollama (nomic-embed-text).
|
|
21
|
+
3. Chunks and their embeddings are stored in Orama, which handles full-text, vector, and hybrid search in a single library.
|
|
22
|
+
4. File metadata is tracked in SQLite (via bun:sqlite) so unchanged files are skipped on restart.
|
|
23
|
+
5. Each search request checks for stale files and triggers a background reindex if needed.
|
|
24
|
+
6. The MCP server exposes five tools over stdio: `search_docs`, `list_docs`, `get_doc`, `reindex_docs`, and `sextant_status`.
|
|
25
|
+
|
|
26
|
+
The entire stack runs locally with no external services. There are no native modules; everything is pure JavaScript/TypeScript.
|
|
27
|
+
|
|
28
|
+
### Index freshness
|
|
29
|
+
|
|
30
|
+
Sextant keeps its index up to date automatically:
|
|
31
|
+
|
|
32
|
+
- On startup, Sextant compares every file's modification time against what was previously indexed. New and changed files are re-indexed; deleted files are removed from the index. This covers cases where docs were updated while the server was not running (for example, after a `git pull`).
|
|
33
|
+
- On each search, a freshness check detects new, modified, or deleted files and triggers a background reindex without blocking the query. If files have changed, the response includes a note and results update on the next search.
|
|
34
|
+
- If another Sextant process has persisted a newer index to disk, it is picked up automatically before searching.
|
|
35
|
+
|
|
36
|
+
## Prerequisites
|
|
37
|
+
|
|
38
|
+
### Bun
|
|
39
|
+
|
|
40
|
+
Sextant runs on [Bun](https://bun.sh), which is used as the runtime, package manager, and bundler.
|
|
41
|
+
|
|
42
|
+
**Windows (PowerShell):**
|
|
43
|
+
|
|
44
|
+
```powershell
|
|
45
|
+
powershell -c "irm bun.sh/install.ps1 | iex"
|
|
46
|
+
```
|
|
47
|
+
|
|
48
|
+
**macOS / Linux:**
|
|
49
|
+
|
|
50
|
+
```bash
|
|
51
|
+
curl -fsSL https://bun.sh/install | bash
|
|
52
|
+
```
|
|
53
|
+
|
|
54
|
+
Verify the installation with `bun --version`.
|
|
55
|
+
|
|
56
|
+
### Ollama
|
|
57
|
+
|
|
58
|
+
[Ollama](https://ollama.com) must be installed and running locally. Pull the embedding model before first use:
|
|
59
|
+
|
|
60
|
+
```bash
|
|
61
|
+
ollama pull nomic-embed-text
|
|
62
|
+
```
|
|
63
|
+
|
|
64
|
+
Ollama runs on `http://localhost:11434` by default. If Ollama is not running, keyword search will still work, but semantic and hybrid search will be unavailable.
|
|
65
|
+
|
|
66
|
+
## Usage with Claude Code
|
|
67
|
+
|
|
68
|
+
Add the following to your Claude Code MCP configuration (`.claude/mcp.json` in your project):
|
|
69
|
+
|
|
70
|
+
```json
|
|
71
|
+
{
|
|
72
|
+
"mcpServers": {
|
|
73
|
+
"sextant": {
|
|
74
|
+
"type": "stdio",
|
|
75
|
+
"command": "bunx",
|
|
76
|
+
"args": ["@tideshift/sextant"],
|
|
77
|
+
"env": {
|
|
78
|
+
"DOCS_PATH": "/absolute/path/to/your/docs"
|
|
79
|
+
}
|
|
80
|
+
}
|
|
81
|
+
}
|
|
82
|
+
}
|
|
83
|
+
```
|
|
84
|
+
|
|
85
|
+
Set `DOCS_PATH` to the absolute path of the markdown folder you want to index.
|
|
86
|
+
|
|
87
|
+
Once configured, Claude Code will have access to the following tools:
|
|
88
|
+
|
|
89
|
+
- **search_docs** - Search documentation using hybrid semantic + keyword search. Supports three modes: `hybrid` (default, combines both), `semantic` (vector similarity only), and `keyword` (exact text matching only). Accepts an optional category filter.
|
|
90
|
+
- **list_docs** - List all indexed documents with their categories, titles, and chunk counts.
|
|
91
|
+
- **get_doc** - Retrieve the full content of a specific document by file path.
|
|
92
|
+
- **reindex_docs** - Force a full re-index of all documents.
|
|
93
|
+
- **sextant_status** - Check server health, indexing progress, Ollama connectivity, and index statistics.
|
|
94
|
+
|
|
95
|
+
## Configuration
|
|
96
|
+
|
|
97
|
+
All settings can be configured through environment variables or a `.env` file:
|
|
98
|
+
|
|
99
|
+
| Variable | Default | Description |
|
|
100
|
+
|---|---|---|
|
|
101
|
+
| `DOCS_PATH` | `./docs` | Path to the markdown folder to index |
|
|
102
|
+
| `OLLAMA_URL` | `http://localhost:11434` | Ollama API endpoint |
|
|
103
|
+
| `EMBEDDING_MODEL` | `nomic-embed-text` | Ollama embedding model name |
|
|
104
|
+
| `EMBEDDING_DIMS` | `768` | Embedding vector dimensions (must match model) |
|
|
105
|
+
| `DATA_PATH` | `.sextant` | Where to store the index and metadata |
|
|
106
|
+
| `MAX_CHUNK_TOKENS` | `512` | Maximum chunk size (estimated as chars / 4) |
| `CHUNK_OVERLAP_LINES` | `2` | Lines of overlap carried between adjacent split chunks |
|
|
107
|
+
| `DEFAULT_TOP_K` | `10` | Default number of search results |
|
|
108
|
+
| `HYBRID_WEIGHT_TEXT` | `0.5` | Weight for keyword matching in hybrid mode |
|
|
109
|
+
| `HYBRID_WEIGHT_VECTOR` | `0.5` | Weight for semantic matching in hybrid mode |
|
|
110
|
+
|
|
111
|
+
## Contributing
|
|
112
|
+
|
|
113
|
+
See [docs/contributing.md](docs/contributing.md) for local development setup, building, and diagnostics.
|
|
114
|
+
|
|
115
|
+
## Technical details
|
|
116
|
+
|
|
117
|
+
See [docs/technical_details.md](docs/technical_details.md) for architecture, project structure, chunking strategy, persistence model, index freshness logic, and edge cases.
|
|
118
|
+
|
|
119
|
+
## License
|
|
120
|
+
|
|
121
|
+
Made with ❤️ in Vancouver, BC by Tideshift Labs, developed using [Claude Code](https://claude.ai/code).
|
|
122
|
+
|
|
123
|
+
[MIT](LICENSE)
|
package/package.json
ADDED
|
@@ -0,0 +1,53 @@
|
|
|
1
|
+
{
|
|
2
|
+
"name": "@tideshift/sextant",
|
|
3
|
+
"version": "0.1.0",
|
|
4
|
+
"description": "MCP server for hybrid semantic + keyword search over local markdown docs",
|
|
5
|
+
"license": "MIT",
|
|
6
|
+
"repository": {
|
|
7
|
+
"type": "git",
|
|
8
|
+
"url": "https://github.com/Tideshift-Labs/sextant.git"
|
|
9
|
+
},
|
|
10
|
+
"homepage": "https://github.com/Tideshift-Labs/sextant",
|
|
11
|
+
"keywords": [
|
|
12
|
+
"mcp",
|
|
13
|
+
"search",
|
|
14
|
+
"documentation",
|
|
15
|
+
"semantic-search",
|
|
16
|
+
"ollama",
|
|
17
|
+
"embeddings",
|
|
18
|
+
"orama",
|
|
19
|
+
"markdown",
|
|
20
|
+
"claude",
|
|
21
|
+
"model-context-protocol"
|
|
22
|
+
],
|
|
23
|
+
"module": "src/index.ts",
|
|
24
|
+
"type": "module",
|
|
25
|
+
"bin": {
|
|
26
|
+
"sextant": "src/index.ts"
|
|
27
|
+
},
|
|
28
|
+
"files": [
|
|
29
|
+
"src/**/*",
|
|
30
|
+
".env.example"
|
|
31
|
+
],
|
|
32
|
+
"scripts": {
|
|
33
|
+
"dev": "bun run src/index.ts",
|
|
34
|
+
"start": "bun run src/index.ts",
|
|
35
|
+
"build": "bun build --compile --minify src/index.ts --outfile dist/docs-mcp-server",
|
|
36
|
+
"typecheck": "bunx tsc --noEmit",
|
|
37
|
+
"test": "bun test",
|
|
38
|
+
"reindex": "bun run src/scripts/reindex.ts"
|
|
39
|
+
},
|
|
40
|
+
"devDependencies": {
|
|
41
|
+
"@types/bun": "latest",
|
|
42
|
+
"@types/node": "^25.5.0",
|
|
43
|
+
"bun-types": "^1.3.10"
|
|
44
|
+
},
|
|
45
|
+
"peerDependencies": {
|
|
46
|
+
"typescript": "^5"
|
|
47
|
+
},
|
|
48
|
+
"dependencies": {
|
|
49
|
+
"@modelcontextprotocol/sdk": "^1.27.1",
|
|
50
|
+
"@orama/orama": "^3.1.18",
|
|
51
|
+
"dotenv": "^17.3.1"
|
|
52
|
+
}
|
|
53
|
+
}
|
package/src/config.ts
ADDED
|
@@ -0,0 +1,50 @@
|
|
|
1
|
+
import dotenv from 'dotenv';
|
|
2
|
+
import path from 'path';
|
|
3
|
+
|
|
4
|
+
dotenv.config();
|
|
5
|
+
|
|
6
|
+
function envStr(key: string, fallback: string): string {
|
|
7
|
+
return process.env[key] ?? fallback;
|
|
8
|
+
}
|
|
9
|
+
|
|
10
|
+
function envInt(key: string, fallback: number): number {
|
|
11
|
+
const v = process.env[key];
|
|
12
|
+
return v ? parseInt(v, 10) : fallback;
|
|
13
|
+
}
|
|
14
|
+
|
|
15
|
+
function envFloat(key: string, fallback: number): number {
|
|
16
|
+
const v = process.env[key];
|
|
17
|
+
return v ? parseFloat(v) : fallback;
|
|
18
|
+
}
|
|
19
|
+
|
|
20
|
+
function envBool(key: string, fallback: boolean): boolean {
|
|
21
|
+
const v = process.env[key];
|
|
22
|
+
if (!v) return fallback;
|
|
23
|
+
return v.toLowerCase() === 'true' || v === '1';
|
|
24
|
+
}
|
|
25
|
+
|
|
26
|
+
// Central runtime configuration, resolved once at module load.
// Values come from the environment (dotenv has already populated
// process.env above); paths are resolved to absolute paths relative to
// the current working directory.
export const config = {
  docsPath: path.resolve(envStr('DOCS_PATH', './docs')),
  ollamaUrl: envStr('OLLAMA_URL', 'http://localhost:11434'),
  embeddingModel: envStr('EMBEDDING_MODEL', 'nomic-embed-text'),
  // Must match the embedding model's output size (nomic-embed-text = 768).
  embeddingDims: envInt('EMBEDDING_DIMS', 768),
  dataPath: path.resolve(envStr('DATA_PATH', '.sextant')),
  // Chunk budget in tokens; chunker converts this to chars (~4 chars/token).
  maxChunkTokens: envInt('MAX_CHUNK_TOKENS', 512),
  chunkOverlapLines: envInt('CHUNK_OVERLAP_LINES', 2),
  defaultTopK: envInt('DEFAULT_TOP_K', 10),
  // Hybrid search weights; per the README these are expected to sum to 1.0
  // (not enforced here).
  hybridWeightText: envFloat('HYBRID_WEIGHT_TEXT', 0.5),
  hybridWeightVector: envFloat('HYBRID_WEIGHT_VECTOR', 0.5),
  similarityThreshold: envFloat('SIMILARITY_THRESHOLD', 0.5),

  // Embedding instructions (nomic-embed-text uses "search_document: " / "search_query: " prefixes)
  indexInstruction: envStr('INDEX_INSTRUCTION', 'search_document: '),
  queryInstruction: envStr('QUERY_INSTRUCTION', 'search_query: '),

  // Persistence debounce
  persistDebounceMs: envInt('PERSIST_DEBOUNCE_MS', 5000),

  // Embedding batch size
  embeddingBatchSize: envInt('EMBEDDING_BATCH_SIZE', 32),
} as const;

// Read-only shape of the resolved configuration object.
export type Config = typeof config;
|
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
import { existsSync, readFileSync, appendFileSync } from 'fs';
|
|
2
|
+
import path from 'path';
|
|
3
|
+
|
|
4
|
+
const SEXTANT_ENTRY = '.sextant/';
|
|
5
|
+
const IGNORE_FILES = ['.gitignore', '.dvignore', '.p4ignore', '.hgignore'];
|
|
6
|
+
|
|
7
|
+
/**
|
|
8
|
+
* For each known ignore file that exists in the project root,
|
|
9
|
+
* append `.sextant/` if it isn't already listed.
|
|
10
|
+
*/
|
|
11
|
+
export function ensureIgnored(projectRoot: string): void {
|
|
12
|
+
for (const file of IGNORE_FILES) {
|
|
13
|
+
const filePath = path.join(projectRoot, file);
|
|
14
|
+
if (!existsSync(filePath)) continue;
|
|
15
|
+
|
|
16
|
+
try {
|
|
17
|
+
const content = readFileSync(filePath, 'utf-8');
|
|
18
|
+
// Check if already ignored (exact line match)
|
|
19
|
+
const lines = content.split(/\r?\n/);
|
|
20
|
+
if (lines.some((line) => line.trim() === SEXTANT_ENTRY || line.trim() === '.sextant')) {
|
|
21
|
+
continue;
|
|
22
|
+
}
|
|
23
|
+
|
|
24
|
+
const suffix = content.endsWith('\n') ? '' : '\n';
|
|
25
|
+
appendFileSync(filePath, `${suffix}${SEXTANT_ENTRY}\n`);
|
|
26
|
+
console.error(`[ignore] Added ${SEXTANT_ENTRY} to ${file}`);
|
|
27
|
+
} catch (err) {
|
|
28
|
+
console.error(`[ignore] Could not update ${file}:`, err);
|
|
29
|
+
}
|
|
30
|
+
}
|
|
31
|
+
}
|
package/src/index.ts
ADDED
|
@@ -0,0 +1,69 @@
|
|
|
1
|
+
#!/usr/bin/env bun
|
|
2
|
+
import { mkdirSync } from 'fs';
|
|
3
|
+
import { StdioServerTransport } from '@modelcontextprotocol/sdk/server/stdio.js';
|
|
4
|
+
import { config } from './config.ts';
|
|
5
|
+
import { initMetadataDb, closeMetadataDb } from './store/metadata-db.ts';
|
|
6
|
+
import { initStore } from './store/orama-store.ts';
|
|
7
|
+
import { loadFromDisk, flushPersist } from './store/persistence.ts';
|
|
8
|
+
import { indexAll } from './indexer/pipeline.ts';
|
|
9
|
+
import { requestCancel } from './indexer/state.ts';
|
|
10
|
+
import { createMcpServer } from './server.ts';
|
|
11
|
+
import { ensureIgnored } from './ignore-files.ts';
|
|
12
|
+
|
|
13
|
+
/**
 * Boot sequence: prepare directories, open the metadata DB, load (or
 * create) the Orama index, start the MCP stdio transport, then kick off
 * an incremental reindex in the background.
 *
 * All logging goes to stderr (console.error) because stdout is owned by
 * the MCP stdio transport.
 */
async function main() {
  console.error('[startup] sextant starting...');
  console.error(`[startup] Docs path: ${config.docsPath}`);
  console.error(`[startup] Data path: ${config.dataPath}`);

  // Ensure directories exist
  mkdirSync(config.dataPath, { recursive: true });
  mkdirSync(config.docsPath, { recursive: true });

  // Add .sextant/ to ignore files if they exist in cwd
  ensureIgnored(process.cwd());

  // Initialize metadata DB
  initMetadataDb();
  console.error('[startup] Metadata DB initialized');

  // Load persisted index from disk, or create fresh
  const loaded = await loadFromDisk();
  if (!loaded) {
    await initStore();
    console.error('[startup] Created fresh Orama index');
  }

  // Start MCP server immediately so the connection is available while indexing
  const server = createMcpServer();
  const transport = new StdioServerTransport();
  await server.connect(transport);
  console.error('[startup] MCP server running on stdio');

  // Run incremental reindex in background (only re-embeds changed files)
  // Indexing failures are non-fatal: the server keeps serving whatever
  // index was loaded from disk.
  const indexingPromise = indexAll(config.docsPath)
    .then((stats) => {
      console.error(`[startup] Indexing complete: ${stats.filesProcessed} files, ${stats.chunksCreated} chunks in ${stats.duration}ms`);
    })
    .catch((err) => {
      console.error('[startup] Initial indexing failed (server keeps running):', err);
    });

  // Graceful shutdown
  const shutdown = async () => {
    console.error('[shutdown] Shutting down...');
    // Ask the indexer to stop at the next safe point.
    requestCancel();
    // Give indexing up to 2s to finish gracefully
    await Promise.race([indexingPromise, Bun.sleep(2000)]);
    // Flush any pending index persistence before closing the DB.
    await flushPersist();
    closeMetadataDb();
    process.exit(0);
  };

  process.on('SIGINT', shutdown);
  process.on('SIGTERM', shutdown);
}

main().catch((err) => {
  console.error('Fatal error:', err);
  process.exit(1);
});
|
|
@@ -0,0 +1,245 @@
|
|
|
1
|
+
import path from 'path';
|
|
2
|
+
import { createHash } from 'crypto';
|
|
3
|
+
import { config } from '../config.ts';
|
|
4
|
+
import type { DocChunk, ChunkMetadata } from './types.ts';
|
|
5
|
+
|
|
6
|
+
const MAX_CHUNK_CHARS = config.maxChunkTokens * 4; // ~1 token ≈ 4 chars
|
|
7
|
+
|
|
8
|
+
interface FrontmatterResult {
|
|
9
|
+
metadata: ChunkMetadata;
|
|
10
|
+
bodyStartIndex: number;
|
|
11
|
+
}
|
|
12
|
+
|
|
13
|
+
function parseFrontmatter(lines: string[]): FrontmatterResult {
|
|
14
|
+
const metadata: ChunkMetadata = {};
|
|
15
|
+
if (lines[0]?.trim() !== '---') {
|
|
16
|
+
return { metadata, bodyStartIndex: 0 };
|
|
17
|
+
}
|
|
18
|
+
|
|
19
|
+
for (let i = 1; i < lines.length; i++) {
|
|
20
|
+
if (lines[i]!.trim() === '---') {
|
|
21
|
+
// Parse YAML-like frontmatter between the --- delimiters
|
|
22
|
+
for (let j = 1; j < i; j++) {
|
|
23
|
+
const line = lines[j]!;
|
|
24
|
+
const match = line.match(/^(\w+)\s*:\s*(.+)$/);
|
|
25
|
+
if (match) {
|
|
26
|
+
const [, key, value] = match;
|
|
27
|
+
if (key === 'title') metadata.title = value!.replace(/^["']|["']$/g, '');
|
|
28
|
+
if (key === 'category') metadata.category = value!.replace(/^["']|["']$/g, '');
|
|
29
|
+
if (key === 'tags') {
|
|
30
|
+
metadata.tags = value!
|
|
31
|
+
.replace(/^\[|\]$/g, '')
|
|
32
|
+
.split(',')
|
|
33
|
+
.map((t) => t.trim().replace(/^["']|["']$/g, ''));
|
|
34
|
+
}
|
|
35
|
+
}
|
|
36
|
+
}
|
|
37
|
+
return { metadata, bodyStartIndex: i + 1 };
|
|
38
|
+
}
|
|
39
|
+
}
|
|
40
|
+
return { metadata, bodyStartIndex: 0 };
|
|
41
|
+
}
|
|
42
|
+
|
|
43
|
+
function makeChunkId(filePath: string, content: string): string {
|
|
44
|
+
const input = `${filePath}::${content}`;
|
|
45
|
+
return createHash('sha256').update(input).digest('hex').slice(0, 16);
|
|
46
|
+
}
|
|
47
|
+
|
|
48
|
+
function getHeadingLevel(line: string): number {
|
|
49
|
+
const match = line.match(/^(#{1,6})\s/);
|
|
50
|
+
return match ? match[1]!.length : 0;
|
|
51
|
+
}
|
|
52
|
+
|
|
53
|
+
function getHeadingText(line: string): string {
|
|
54
|
+
return line.replace(/^#{1,6}\s+/, '').trim();
|
|
55
|
+
}
|
|
56
|
+
|
|
57
|
+
function isInsideCodeBlock(lines: string[], index: number): boolean {
|
|
58
|
+
let fenceCount = 0;
|
|
59
|
+
for (let i = 0; i < index; i++) {
|
|
60
|
+
if (lines[i]!.trimStart().startsWith('```')) {
|
|
61
|
+
fenceCount++;
|
|
62
|
+
}
|
|
63
|
+
}
|
|
64
|
+
return fenceCount % 2 === 1;
|
|
65
|
+
}
|
|
66
|
+
|
|
67
|
+
interface HeadingSection {
|
|
68
|
+
headingHierarchy: string[];
|
|
69
|
+
lines: string[];
|
|
70
|
+
}
|
|
71
|
+
|
|
72
|
+
function splitByHeadings(lines: string[]): HeadingSection[] {
|
|
73
|
+
const sections: HeadingSection[] = [];
|
|
74
|
+
const hierarchyStack: { level: number; text: string }[] = [];
|
|
75
|
+
let currentLines: string[] = [];
|
|
76
|
+
let hasHeading = false;
|
|
77
|
+
|
|
78
|
+
for (let i = 0; i < lines.length; i++) {
|
|
79
|
+
const line = lines[i]!;
|
|
80
|
+
const level = getHeadingLevel(line);
|
|
81
|
+
|
|
82
|
+
if (level > 0 && !isInsideCodeBlock(lines, i)) {
|
|
83
|
+
// Save previous section
|
|
84
|
+
if (currentLines.length > 0 || hasHeading) {
|
|
85
|
+
sections.push({
|
|
86
|
+
headingHierarchy: hierarchyStack.map((h) => h.text),
|
|
87
|
+
lines: currentLines,
|
|
88
|
+
});
|
|
89
|
+
}
|
|
90
|
+
|
|
91
|
+
// Update heading hierarchy
|
|
92
|
+
while (hierarchyStack.length > 0 && hierarchyStack[hierarchyStack.length - 1]!.level >= level) {
|
|
93
|
+
hierarchyStack.pop();
|
|
94
|
+
}
|
|
95
|
+
hierarchyStack.push({ level, text: getHeadingText(line) });
|
|
96
|
+
|
|
97
|
+
currentLines = [line];
|
|
98
|
+
hasHeading = true;
|
|
99
|
+
} else {
|
|
100
|
+
currentLines.push(line);
|
|
101
|
+
}
|
|
102
|
+
}
|
|
103
|
+
|
|
104
|
+
// Push final section
|
|
105
|
+
if (currentLines.length > 0) {
|
|
106
|
+
sections.push({
|
|
107
|
+
headingHierarchy: hasHeading ? hierarchyStack.map((h) => h.text) : [],
|
|
108
|
+
lines: currentLines,
|
|
109
|
+
});
|
|
110
|
+
}
|
|
111
|
+
|
|
112
|
+
return sections;
|
|
113
|
+
}
|
|
114
|
+
|
|
115
|
+
function splitOversizedParagraph(para: string): string[] {
|
|
116
|
+
// Hard-split a single paragraph that exceeds MAX_CHUNK_CHARS by lines
|
|
117
|
+
const lines = para.split('\n');
|
|
118
|
+
const pieces: string[] = [];
|
|
119
|
+
let current = '';
|
|
120
|
+
|
|
121
|
+
for (const line of lines) {
|
|
122
|
+
// If a single line exceeds the limit, truncate it
|
|
123
|
+
const safeLine = line.length > MAX_CHUNK_CHARS ? line.slice(0, MAX_CHUNK_CHARS) : line;
|
|
124
|
+
|
|
125
|
+
if (current.length + safeLine.length + 1 > MAX_CHUNK_CHARS && current.length > 0) {
|
|
126
|
+
pieces.push(current.trim());
|
|
127
|
+
current = safeLine;
|
|
128
|
+
} else {
|
|
129
|
+
current = current ? current + '\n' + safeLine : safeLine;
|
|
130
|
+
}
|
|
131
|
+
}
|
|
132
|
+
|
|
133
|
+
if (current.trim()) {
|
|
134
|
+
pieces.push(current.trim());
|
|
135
|
+
}
|
|
136
|
+
|
|
137
|
+
return pieces.length > 0 ? pieces : [para];
|
|
138
|
+
}
|
|
139
|
+
|
|
140
|
+
function splitLargeChunk(content: string, overlapLines: number): string[] {
|
|
141
|
+
if (content.length <= MAX_CHUNK_CHARS) return [content];
|
|
142
|
+
|
|
143
|
+
const paragraphs = content.split(/\n\n+/);
|
|
144
|
+
const subChunks: string[] = [];
|
|
145
|
+
let current = '';
|
|
146
|
+
|
|
147
|
+
for (const para of paragraphs) {
|
|
148
|
+
// If a single paragraph exceeds the limit, hard-split it by lines
|
|
149
|
+
if (para.length > MAX_CHUNK_CHARS) {
|
|
150
|
+
if (current.trim()) {
|
|
151
|
+
subChunks.push(current.trim());
|
|
152
|
+
current = '';
|
|
153
|
+
}
|
|
154
|
+
subChunks.push(...splitOversizedParagraph(para));
|
|
155
|
+
continue;
|
|
156
|
+
}
|
|
157
|
+
|
|
158
|
+
if (current.length + para.length + 2 > MAX_CHUNK_CHARS && current.length > 0) {
|
|
159
|
+
subChunks.push(current.trim());
|
|
160
|
+
// Add overlap: last N lines of previous chunk
|
|
161
|
+
const prevLines = current.trimEnd().split('\n');
|
|
162
|
+
const overlap = prevLines.slice(-overlapLines).join('\n');
|
|
163
|
+
current = overlap ? overlap + '\n\n' + para : para;
|
|
164
|
+
} else {
|
|
165
|
+
current = current ? current + '\n\n' + para : para;
|
|
166
|
+
}
|
|
167
|
+
}
|
|
168
|
+
|
|
169
|
+
if (current.trim()) {
|
|
170
|
+
subChunks.push(current.trim());
|
|
171
|
+
}
|
|
172
|
+
|
|
173
|
+
return subChunks.length > 0 ? subChunks : [content];
|
|
174
|
+
}
|
|
175
|
+
|
|
176
|
+
export function chunkMarkdown(
|
|
177
|
+
filePath: string,
|
|
178
|
+
rawContent: string,
|
|
179
|
+
lastModified: number,
|
|
180
|
+
docsPath: string
|
|
181
|
+
): { chunks: DocChunk[]; metadata: ChunkMetadata } {
|
|
182
|
+
const relativePath = path.relative(docsPath, filePath).replace(/\\/g, '/');
|
|
183
|
+
const fileName = path.basename(filePath);
|
|
184
|
+
const parts = relativePath.split('/');
|
|
185
|
+
const category = parts.length > 1 ? parts[0]! : 'root';
|
|
186
|
+
|
|
187
|
+
const lines = rawContent.split('\n');
|
|
188
|
+
const { metadata, bodyStartIndex } = parseFrontmatter(lines);
|
|
189
|
+
const bodyLines = lines.slice(bodyStartIndex);
|
|
190
|
+
const fileCategory = metadata.category ?? category;
|
|
191
|
+
|
|
192
|
+
const sections = splitByHeadings(bodyLines);
|
|
193
|
+
const chunks: DocChunk[] = [];
|
|
194
|
+
|
|
195
|
+
// If no headings found, treat entire file as one chunk
|
|
196
|
+
if (sections.length === 0 || (sections.length === 1 && sections[0]!.headingHierarchy.length === 0)) {
|
|
197
|
+
const content = bodyLines.join('\n').trim();
|
|
198
|
+
if (content) {
|
|
199
|
+
const hierarchy = [metadata.title ?? fileName.replace(/\.md$/, '')];
|
|
200
|
+
const subChunks = splitLargeChunk(content, config.chunkOverlapLines);
|
|
201
|
+
for (let i = 0; i < subChunks.length; i++) {
|
|
202
|
+
chunks.push({
|
|
203
|
+
id: makeChunkId(relativePath, subChunks[i]!),
|
|
204
|
+
filePath: relativePath,
|
|
205
|
+
fileName,
|
|
206
|
+
category: fileCategory,
|
|
207
|
+
headingHierarchy: hierarchy,
|
|
208
|
+
headingSlug: hierarchy.join(' > '),
|
|
209
|
+
chunkIndex: i,
|
|
210
|
+
content: subChunks[i]!,
|
|
211
|
+
charCount: subChunks[i]!.length,
|
|
212
|
+
lastModified,
|
|
213
|
+
});
|
|
214
|
+
}
|
|
215
|
+
}
|
|
216
|
+
return { chunks, metadata };
|
|
217
|
+
}
|
|
218
|
+
|
|
219
|
+
for (const section of sections) {
|
|
220
|
+
const content = section.lines.join('\n').trim();
|
|
221
|
+
if (!content) continue;
|
|
222
|
+
|
|
223
|
+
const hierarchy = section.headingHierarchy.length > 0
|
|
224
|
+
? section.headingHierarchy
|
|
225
|
+
: [metadata.title ?? fileName.replace(/\.md$/, '')];
|
|
226
|
+
|
|
227
|
+
const subChunks = splitLargeChunk(content, config.chunkOverlapLines);
|
|
228
|
+
for (let i = 0; i < subChunks.length; i++) {
|
|
229
|
+
chunks.push({
|
|
230
|
+
id: makeChunkId(relativePath, subChunks[i]!),
|
|
231
|
+
filePath: relativePath,
|
|
232
|
+
fileName,
|
|
233
|
+
category: fileCategory,
|
|
234
|
+
headingHierarchy: hierarchy,
|
|
235
|
+
headingSlug: hierarchy.join(' > '),
|
|
236
|
+
chunkIndex: i,
|
|
237
|
+
content: subChunks[i]!,
|
|
238
|
+
charCount: subChunks[i]!.length,
|
|
239
|
+
lastModified,
|
|
240
|
+
});
|
|
241
|
+
}
|
|
242
|
+
}
|
|
243
|
+
|
|
244
|
+
return { chunks, metadata };
|
|
245
|
+
}
|