@tideshift/sextant 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/.env.example ADDED
@@ -0,0 +1,25 @@
1
+ # Path to the docs folder to index (absolute or relative to project root)
2
+ DOCS_PATH=./docs
3
+
4
+ # Ollama endpoint
5
+ OLLAMA_URL=http://localhost:11434
6
+
7
+ # Embedding model
8
+ EMBEDDING_MODEL=nomic-embed-text
9
+
10
+ # Embedding dimensions (must match model; nomic-embed-text = 768, mxbai-embed-large = 1024)
11
+ EMBEDDING_DIMS=768
12
+
13
+ # Data storage path (for Orama index + SQLite metadata)
14
+ DATA_PATH=.sextant
15
+
16
+ # Chunk size limits
17
+ MAX_CHUNK_TOKENS=512
18
+ CHUNK_OVERLAP_LINES=2
19
+
20
+ # Search defaults
21
+ DEFAULT_TOP_K=10
22
+
23
+ # Hybrid search weights (0.0-1.0, must sum to 1.0)
24
+ HYBRID_WEIGHT_TEXT=0.5
25
+ HYBRID_WEIGHT_VECTOR=0.5
package/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Tideshift Labs
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
package/README.md ADDED
@@ -0,0 +1,123 @@
1
+ <p align="center">
2
+ <img src="docs/sextant-logo.png" alt="Sextant" width="300">
3
+ </p>
4
+
5
+ <p align="center">
6
+ <a href="LICENSE"><img src="https://img.shields.io/badge/license-MIT-blue.svg" alt="License: MIT"></a>
7
+ <img src="https://img.shields.io/badge/runtime-Bun-f9f1e1?logo=bun" alt="Bun">
8
+ <img src="https://img.shields.io/badge/protocol-MCP-7c3aed" alt="MCP">
9
+ </p>
10
+
11
+ # Sextant
12
+
13
+ Sextant is an MCP server that provides hybrid semantic and keyword search over a local folder of markdown files. It indexes your docs, chunks them by heading, embeds them through a local Ollama model, and exposes search and retrieval tools over the Model Context Protocol (MCP).
14
+
15
+ Claude Code (or any MCP client) can then search your documentation using natural language queries, exact keyword lookups, or a combination of both.
16
+
17
+ ## How it works
18
+
19
+ 1. Sextant scans a folder of markdown files and splits them into chunks at each heading boundary.
20
+ 2. Each chunk is embedded locally using Ollama (nomic-embed-text).
21
+ 3. Chunks and their embeddings are stored in Orama, which handles full-text, vector, and hybrid search in a single library.
22
+ 4. File metadata is tracked in SQLite (via bun:sqlite) so unchanged files are skipped on restart.
23
+ 5. Each search request checks for stale files and triggers a background reindex if needed.
24
+ 6. The MCP server exposes five tools over stdio: `search_docs`, `list_docs`, `get_doc`, `reindex_docs`, and `sextant_status`.
25
+
26
+ The entire stack runs locally with no external services. There are no native modules; everything is pure JavaScript/TypeScript.
27
+
28
+ ### Index freshness
29
+
30
+ Sextant keeps its index up to date automatically:
31
+
32
+ - On startup, Sextant compares every file's modification time against what was previously indexed. New and changed files are re-indexed; deleted files are removed from the index. This covers cases where docs were updated while the server was not running (for example, after a `git pull`).
33
+ - On each search, a freshness check detects new, modified, or deleted files and triggers a background reindex without blocking the query. If files have changed, the response includes a note and results update on the next search.
34
+ - If another Sextant process has persisted a newer index to disk, it is picked up automatically before searching.
35
+
36
+ ## Prerequisites
37
+
38
+ ### Bun
39
+
40
+ Sextant runs on [Bun](https://bun.sh), which is used as the runtime, package manager, and bundler.
41
+
42
+ **Windows (PowerShell):**
43
+
44
+ ```powershell
45
+ powershell -c "irm bun.sh/install.ps1 | iex"
46
+ ```
47
+
48
+ **macOS / Linux:**
49
+
50
+ ```bash
51
+ curl -fsSL https://bun.sh/install | bash
52
+ ```
53
+
54
+ Verify the installation with `bun --version`.
55
+
56
+ ### Ollama
57
+
58
+ [Ollama](https://ollama.com) must be installed and running locally. Pull the embedding model before first use:
59
+
60
+ ```bash
61
+ ollama pull nomic-embed-text
62
+ ```
63
+
64
+ Ollama runs on `http://localhost:11434` by default. If Ollama is not running, keyword search will still work, but semantic and hybrid search will be unavailable.
65
+
66
+ ## Usage with Claude Code
67
+
68
+ Add the following to your Claude Code MCP configuration (`.claude/mcp.json` in your project):
69
+
70
+ ```json
71
+ {
72
+ "mcpServers": {
73
+ "sextant": {
74
+ "type": "stdio",
75
+ "command": "bunx",
76
+ "args": ["@tideshift/sextant"],
77
+ "env": {
78
+ "DOCS_PATH": "/absolute/path/to/your/docs"
79
+ }
80
+ }
81
+ }
82
+ }
83
+ ```
84
+
85
+ Set `DOCS_PATH` to the absolute path of the markdown folder you want to index.
86
+
87
+ Once configured, Claude Code will have access to the following tools:
88
+
89
+ - **search_docs** - Search documentation using hybrid semantic + keyword search. Supports three modes: `hybrid` (default, combines both), `semantic` (vector similarity only), and `keyword` (exact text matching only). Accepts an optional category filter.
90
+ - **list_docs** - List all indexed documents with their categories, titles, and chunk counts.
91
+ - **get_doc** - Retrieve the full content of a specific document by file path.
92
+ - **reindex_docs** - Force a full re-index of all documents.
93
+ - **sextant_status** - Check server health, indexing progress, Ollama connectivity, and index statistics.
94
+
95
+ ## Configuration
96
+
97
+ All settings can be configured through environment variables or a `.env` file:
98
+
99
+ | Variable | Default | Description |
100
+ |---|---|---|
101
+ | `DOCS_PATH` | `./docs` | Path to the markdown folder to index |
102
+ | `OLLAMA_URL` | `http://localhost:11434` | Ollama API endpoint |
103
+ | `EMBEDDING_MODEL` | `nomic-embed-text` | Ollama embedding model name |
104
+ | `EMBEDDING_DIMS` | `768` | Embedding vector dimensions (must match model) |
105
+ | `DATA_PATH` | `.sextant` | Where to store the index and metadata |
106
+ | `MAX_CHUNK_TOKENS` | `512` | Maximum chunk size (estimated as chars / 4) |
+ | `CHUNK_OVERLAP_LINES` | `2` | Lines of overlap carried between chunks when an oversized section is split |
107
+ | `DEFAULT_TOP_K` | `10` | Default number of search results |
108
+ | `HYBRID_WEIGHT_TEXT` | `0.5` | Weight for keyword matching in hybrid mode |
109
+ | `HYBRID_WEIGHT_VECTOR` | `0.5` | Weight for semantic matching in hybrid mode |
110
+
111
+ ## Contributing
112
+
113
+ See [docs/contributing.md](docs/contributing.md) for local development setup, building, and diagnostics.
114
+
115
+ ## Technical details
116
+
117
+ See [docs/technical_details.md](docs/technical_details.md) for architecture, project structure, chunking strategy, persistence model, index freshness logic, and edge cases.
118
+
119
+ ## License
120
+
121
+ Made with ❤️ in Vancouver, BC by Tideshift Labs, developed using [Claude Code](https://claude.ai/code).
122
+
123
+ [MIT](LICENSE)
package/package.json ADDED
@@ -0,0 +1,53 @@
1
+ {
2
+ "name": "@tideshift/sextant",
3
+ "version": "0.1.0",
4
+ "description": "MCP server for hybrid semantic + keyword search over local markdown docs",
5
+ "license": "MIT",
6
+ "repository": {
7
+ "type": "git",
8
+ "url": "https://github.com/Tideshift-Labs/sextant.git"
9
+ },
10
+ "homepage": "https://github.com/Tideshift-Labs/sextant",
11
+ "keywords": [
12
+ "mcp",
13
+ "search",
14
+ "documentation",
15
+ "semantic-search",
16
+ "ollama",
17
+ "embeddings",
18
+ "orama",
19
+ "markdown",
20
+ "claude",
21
+ "model-context-protocol"
22
+ ],
23
+ "module": "src/index.ts",
24
+ "type": "module",
25
+ "bin": {
26
+ "sextant": "src/index.ts"
27
+ },
28
+ "files": [
29
+ "src/**/*",
30
+ ".env.example"
31
+ ],
32
+ "scripts": {
33
+ "dev": "bun run src/index.ts",
34
+ "start": "bun run src/index.ts",
35
+ "build": "bun build --compile --minify src/index.ts --outfile dist/sextant",
36
+ "typecheck": "bunx tsc --noEmit",
37
+ "test": "bun test",
38
+ "reindex": "bun run src/scripts/reindex.ts"
39
+ },
40
+ "devDependencies": {
41
+ "@types/bun": "latest",
42
+ "@types/node": "^25.5.0",
43
+ "bun-types": "^1.3.10"
44
+ },
45
+ "peerDependencies": {
46
+ "typescript": "^5"
47
+ },
48
+ "dependencies": {
49
+ "@modelcontextprotocol/sdk": "^1.27.1",
50
+ "@orama/orama": "^3.1.18",
51
+ "dotenv": "^17.3.1"
52
+ }
53
+ }
package/src/config.ts ADDED
@@ -0,0 +1,50 @@
1
+ import dotenv from 'dotenv';
2
+ import path from 'path';
3
+
4
// Load variables from a .env file (if present) into process.env before the
// config object below is built.
dotenv.config();
5
+
6
+ function envStr(key: string, fallback: string): string {
7
+ return process.env[key] ?? fallback;
8
+ }
9
+
10
+ function envInt(key: string, fallback: number): number {
11
+ const v = process.env[key];
12
+ return v ? parseInt(v, 10) : fallback;
13
+ }
14
+
15
+ function envFloat(key: string, fallback: number): number {
16
+ const v = process.env[key];
17
+ return v ? parseFloat(v) : fallback;
18
+ }
19
+
20
+ function envBool(key: string, fallback: boolean): boolean {
21
+ const v = process.env[key];
22
+ if (!v) return fallback;
23
+ return v.toLowerCase() === 'true' || v === '1';
24
+ }
25
+
26
/**
 * Application configuration, resolved once at module load from environment
 * variables (including any values loaded from .env above), with defaults.
 * Declared `as const` so the shape is inferred as deeply readonly.
 */
export const config = {
  // Both paths resolve against process.cwd() — assumes the server is
  // launched from the project root.
  docsPath: path.resolve(envStr('DOCS_PATH', './docs')),
  ollamaUrl: envStr('OLLAMA_URL', 'http://localhost:11434'),
  embeddingModel: envStr('EMBEDDING_MODEL', 'nomic-embed-text'),
  // Must match the embedding model's output size (nomic-embed-text = 768).
  embeddingDims: envInt('EMBEDDING_DIMS', 768),
  dataPath: path.resolve(envStr('DATA_PATH', '.sextant')),
  // Chunking limits; the chunker estimates tokens as chars / 4.
  maxChunkTokens: envInt('MAX_CHUNK_TOKENS', 512),
  chunkOverlapLines: envInt('CHUNK_OVERLAP_LINES', 2),
  defaultTopK: envInt('DEFAULT_TOP_K', 10),
  // Hybrid search weights. The README documents that these should sum to
  // 1.0, but no validation is performed here.
  hybridWeightText: envFloat('HYBRID_WEIGHT_TEXT', 0.5),
  hybridWeightVector: envFloat('HYBRID_WEIGHT_VECTOR', 0.5),
  similarityThreshold: envFloat('SIMILARITY_THRESHOLD', 0.5),

  // Embedding instructions (nomic-embed-text uses "search_document: " / "search_query: " prefixes)
  indexInstruction: envStr('INDEX_INSTRUCTION', 'search_document: '),
  queryInstruction: envStr('QUERY_INSTRUCTION', 'search_query: '),

  // Persistence debounce
  persistDebounceMs: envInt('PERSIST_DEBOUNCE_MS', 5000),

  // Embedding batch size
  embeddingBatchSize: envInt('EMBEDDING_BATCH_SIZE', 32),
} as const;

// Convenience alias for the inferred (readonly) configuration shape.
export type Config = typeof config;
package/src/ignore-files.ts ADDED
@@ -0,0 +1,31 @@
1
+ import { existsSync, readFileSync, appendFileSync } from 'fs';
2
+ import path from 'path';
3
+
4
+ const SEXTANT_ENTRY = '.sextant/';
5
+ const IGNORE_FILES = ['.gitignore', '.dvignore', '.p4ignore', '.hgignore'];
6
+
7
+ /**
8
+ * For each known ignore file that exists in the project root,
9
+ * append `.sextant/` if it isn't already listed.
10
+ */
11
+ export function ensureIgnored(projectRoot: string): void {
12
+ for (const file of IGNORE_FILES) {
13
+ const filePath = path.join(projectRoot, file);
14
+ if (!existsSync(filePath)) continue;
15
+
16
+ try {
17
+ const content = readFileSync(filePath, 'utf-8');
18
+ // Check if already ignored (exact line match)
19
+ const lines = content.split(/\r?\n/);
20
+ if (lines.some((line) => line.trim() === SEXTANT_ENTRY || line.trim() === '.sextant')) {
21
+ continue;
22
+ }
23
+
24
+ const suffix = content.endsWith('\n') ? '' : '\n';
25
+ appendFileSync(filePath, `${suffix}${SEXTANT_ENTRY}\n`);
26
+ console.error(`[ignore] Added ${SEXTANT_ENTRY} to ${file}`);
27
+ } catch (err) {
28
+ console.error(`[ignore] Could not update ${file}:`, err);
29
+ }
30
+ }
31
+ }
package/src/index.ts ADDED
@@ -0,0 +1,69 @@
1
+ #!/usr/bin/env bun
2
+ import { mkdirSync } from 'fs';
3
+ import { StdioServerTransport } from '@modelcontextprotocol/sdk/server/stdio.js';
4
+ import { config } from './config.ts';
5
+ import { initMetadataDb, closeMetadataDb } from './store/metadata-db.ts';
6
+ import { initStore } from './store/orama-store.ts';
7
+ import { loadFromDisk, flushPersist } from './store/persistence.ts';
8
+ import { indexAll } from './indexer/pipeline.ts';
9
+ import { requestCancel } from './indexer/state.ts';
10
+ import { createMcpServer } from './server.ts';
11
+ import { ensureIgnored } from './ignore-files.ts';
12
+
13
/**
 * Entry point: prepare directories and stores, start the MCP server over
 * stdio, kick off a background reindex, and install shutdown handlers.
 *
 * All logging goes to stderr (console.error) because stdout carries the
 * MCP stdio protocol and must stay clean.
 */
async function main() {
  console.error('[startup] sextant starting...');
  console.error(`[startup] Docs path: ${config.docsPath}`);
  console.error(`[startup] Data path: ${config.dataPath}`);

  // Ensure directories exist (both may be missing on a first run)
  mkdirSync(config.dataPath, { recursive: true });
  mkdirSync(config.docsPath, { recursive: true });

  // Add .sextant/ to ignore files if they exist in cwd
  ensureIgnored(process.cwd());

  // Initialize metadata DB
  initMetadataDb();
  console.error('[startup] Metadata DB initialized');

  // Load persisted index from disk if a previous run saved one; otherwise
  // start with a fresh in-memory index.
  const loaded = await loadFromDisk();
  if (!loaded) {
    await initStore();
    console.error('[startup] Created fresh Orama index');
  }

  // Start MCP server immediately so the connection is available while indexing
  const server = createMcpServer();
  const transport = new StdioServerTransport();
  await server.connect(transport);
  console.error('[startup] MCP server running on stdio');

  // Run incremental reindex in background (only re-embeds changed files).
  // Errors are caught here so a failed initial index never kills the server.
  const indexingPromise = indexAll(config.docsPath)
    .then((stats) => {
      console.error(`[startup] Indexing complete: ${stats.filesProcessed} files, ${stats.chunksCreated} chunks in ${stats.duration}ms`);
    })
    .catch((err) => {
      console.error('[startup] Initial indexing failed (server keeps running):', err);
    });

  // Graceful shutdown: cancel in-flight indexing, flush the index to disk,
  // then close the metadata DB before exiting.
  const shutdown = async () => {
    console.error('[shutdown] Shutting down...');
    requestCancel();
    // Give indexing up to 2s to finish gracefully
    await Promise.race([indexingPromise, Bun.sleep(2000)]);
    await flushPersist();
    closeMetadataDb();
    process.exit(0);
  };

  process.on('SIGINT', shutdown);
  process.on('SIGTERM', shutdown);
}

// Top-level failure (before/around main) is fatal; exit non-zero so the
// MCP client sees the server die rather than hang.
main().catch((err) => {
  console.error('Fatal error:', err);
  process.exit(1);
});
@@ -0,0 +1,245 @@
1
+ import path from 'path';
2
+ import { createHash } from 'crypto';
3
+ import { config } from '../config.ts';
4
+ import type { DocChunk, ChunkMetadata } from './types.ts';
5
+
6
// Character budget per chunk, derived from the configured token limit using
// the rough heuristic of ~4 characters per token.
const MAX_CHUNK_CHARS = config.maxChunkTokens * 4; // ~1 token ≈ 4 chars

// Result of frontmatter parsing: the extracted metadata plus the index of
// the first body line (0 when no valid frontmatter block was found).
interface FrontmatterResult {
  metadata: ChunkMetadata;
  bodyStartIndex: number;
}
12
+
13
+ function parseFrontmatter(lines: string[]): FrontmatterResult {
14
+ const metadata: ChunkMetadata = {};
15
+ if (lines[0]?.trim() !== '---') {
16
+ return { metadata, bodyStartIndex: 0 };
17
+ }
18
+
19
+ for (let i = 1; i < lines.length; i++) {
20
+ if (lines[i]!.trim() === '---') {
21
+ // Parse YAML-like frontmatter between the --- delimiters
22
+ for (let j = 1; j < i; j++) {
23
+ const line = lines[j]!;
24
+ const match = line.match(/^(\w+)\s*:\s*(.+)$/);
25
+ if (match) {
26
+ const [, key, value] = match;
27
+ if (key === 'title') metadata.title = value!.replace(/^["']|["']$/g, '');
28
+ if (key === 'category') metadata.category = value!.replace(/^["']|["']$/g, '');
29
+ if (key === 'tags') {
30
+ metadata.tags = value!
31
+ .replace(/^\[|\]$/g, '')
32
+ .split(',')
33
+ .map((t) => t.trim().replace(/^["']|["']$/g, ''));
34
+ }
35
+ }
36
+ }
37
+ return { metadata, bodyStartIndex: i + 1 };
38
+ }
39
+ }
40
+ return { metadata, bodyStartIndex: 0 };
41
+ }
42
+
43
+ function makeChunkId(filePath: string, content: string): string {
44
+ const input = `${filePath}::${content}`;
45
+ return createHash('sha256').update(input).digest('hex').slice(0, 16);
46
+ }
47
+
48
+ function getHeadingLevel(line: string): number {
49
+ const match = line.match(/^(#{1,6})\s/);
50
+ return match ? match[1]!.length : 0;
51
+ }
52
+
53
+ function getHeadingText(line: string): string {
54
+ return line.replace(/^#{1,6}\s+/, '').trim();
55
+ }
56
+
57
+ function isInsideCodeBlock(lines: string[], index: number): boolean {
58
+ let fenceCount = 0;
59
+ for (let i = 0; i < index; i++) {
60
+ if (lines[i]!.trimStart().startsWith('```')) {
61
+ fenceCount++;
62
+ }
63
+ }
64
+ return fenceCount % 2 === 1;
65
+ }
66
+
67
+ interface HeadingSection {
68
+ headingHierarchy: string[];
69
+ lines: string[];
70
+ }
71
+
72
+ function splitByHeadings(lines: string[]): HeadingSection[] {
73
+ const sections: HeadingSection[] = [];
74
+ const hierarchyStack: { level: number; text: string }[] = [];
75
+ let currentLines: string[] = [];
76
+ let hasHeading = false;
77
+
78
+ for (let i = 0; i < lines.length; i++) {
79
+ const line = lines[i]!;
80
+ const level = getHeadingLevel(line);
81
+
82
+ if (level > 0 && !isInsideCodeBlock(lines, i)) {
83
+ // Save previous section
84
+ if (currentLines.length > 0 || hasHeading) {
85
+ sections.push({
86
+ headingHierarchy: hierarchyStack.map((h) => h.text),
87
+ lines: currentLines,
88
+ });
89
+ }
90
+
91
+ // Update heading hierarchy
92
+ while (hierarchyStack.length > 0 && hierarchyStack[hierarchyStack.length - 1]!.level >= level) {
93
+ hierarchyStack.pop();
94
+ }
95
+ hierarchyStack.push({ level, text: getHeadingText(line) });
96
+
97
+ currentLines = [line];
98
+ hasHeading = true;
99
+ } else {
100
+ currentLines.push(line);
101
+ }
102
+ }
103
+
104
+ // Push final section
105
+ if (currentLines.length > 0) {
106
+ sections.push({
107
+ headingHierarchy: hasHeading ? hierarchyStack.map((h) => h.text) : [],
108
+ lines: currentLines,
109
+ });
110
+ }
111
+
112
+ return sections;
113
+ }
114
+
115
+ function splitOversizedParagraph(para: string): string[] {
116
+ // Hard-split a single paragraph that exceeds MAX_CHUNK_CHARS by lines
117
+ const lines = para.split('\n');
118
+ const pieces: string[] = [];
119
+ let current = '';
120
+
121
+ for (const line of lines) {
122
+ // If a single line exceeds the limit, truncate it
123
+ const safeLine = line.length > MAX_CHUNK_CHARS ? line.slice(0, MAX_CHUNK_CHARS) : line;
124
+
125
+ if (current.length + safeLine.length + 1 > MAX_CHUNK_CHARS && current.length > 0) {
126
+ pieces.push(current.trim());
127
+ current = safeLine;
128
+ } else {
129
+ current = current ? current + '\n' + safeLine : safeLine;
130
+ }
131
+ }
132
+
133
+ if (current.trim()) {
134
+ pieces.push(current.trim());
135
+ }
136
+
137
+ return pieces.length > 0 ? pieces : [para];
138
+ }
139
+
140
+ function splitLargeChunk(content: string, overlapLines: number): string[] {
141
+ if (content.length <= MAX_CHUNK_CHARS) return [content];
142
+
143
+ const paragraphs = content.split(/\n\n+/);
144
+ const subChunks: string[] = [];
145
+ let current = '';
146
+
147
+ for (const para of paragraphs) {
148
+ // If a single paragraph exceeds the limit, hard-split it by lines
149
+ if (para.length > MAX_CHUNK_CHARS) {
150
+ if (current.trim()) {
151
+ subChunks.push(current.trim());
152
+ current = '';
153
+ }
154
+ subChunks.push(...splitOversizedParagraph(para));
155
+ continue;
156
+ }
157
+
158
+ if (current.length + para.length + 2 > MAX_CHUNK_CHARS && current.length > 0) {
159
+ subChunks.push(current.trim());
160
+ // Add overlap: last N lines of previous chunk
161
+ const prevLines = current.trimEnd().split('\n');
162
+ const overlap = prevLines.slice(-overlapLines).join('\n');
163
+ current = overlap ? overlap + '\n\n' + para : para;
164
+ } else {
165
+ current = current ? current + '\n\n' + para : para;
166
+ }
167
+ }
168
+
169
+ if (current.trim()) {
170
+ subChunks.push(current.trim());
171
+ }
172
+
173
+ return subChunks.length > 0 ? subChunks : [content];
174
+ }
175
+
176
+ export function chunkMarkdown(
177
+ filePath: string,
178
+ rawContent: string,
179
+ lastModified: number,
180
+ docsPath: string
181
+ ): { chunks: DocChunk[]; metadata: ChunkMetadata } {
182
+ const relativePath = path.relative(docsPath, filePath).replace(/\\/g, '/');
183
+ const fileName = path.basename(filePath);
184
+ const parts = relativePath.split('/');
185
+ const category = parts.length > 1 ? parts[0]! : 'root';
186
+
187
+ const lines = rawContent.split('\n');
188
+ const { metadata, bodyStartIndex } = parseFrontmatter(lines);
189
+ const bodyLines = lines.slice(bodyStartIndex);
190
+ const fileCategory = metadata.category ?? category;
191
+
192
+ const sections = splitByHeadings(bodyLines);
193
+ const chunks: DocChunk[] = [];
194
+
195
+ // If no headings found, treat entire file as one chunk
196
+ if (sections.length === 0 || (sections.length === 1 && sections[0]!.headingHierarchy.length === 0)) {
197
+ const content = bodyLines.join('\n').trim();
198
+ if (content) {
199
+ const hierarchy = [metadata.title ?? fileName.replace(/\.md$/, '')];
200
+ const subChunks = splitLargeChunk(content, config.chunkOverlapLines);
201
+ for (let i = 0; i < subChunks.length; i++) {
202
+ chunks.push({
203
+ id: makeChunkId(relativePath, subChunks[i]!),
204
+ filePath: relativePath,
205
+ fileName,
206
+ category: fileCategory,
207
+ headingHierarchy: hierarchy,
208
+ headingSlug: hierarchy.join(' > '),
209
+ chunkIndex: i,
210
+ content: subChunks[i]!,
211
+ charCount: subChunks[i]!.length,
212
+ lastModified,
213
+ });
214
+ }
215
+ }
216
+ return { chunks, metadata };
217
+ }
218
+
219
+ for (const section of sections) {
220
+ const content = section.lines.join('\n').trim();
221
+ if (!content) continue;
222
+
223
+ const hierarchy = section.headingHierarchy.length > 0
224
+ ? section.headingHierarchy
225
+ : [metadata.title ?? fileName.replace(/\.md$/, '')];
226
+
227
+ const subChunks = splitLargeChunk(content, config.chunkOverlapLines);
228
+ for (let i = 0; i < subChunks.length; i++) {
229
+ chunks.push({
230
+ id: makeChunkId(relativePath, subChunks[i]!),
231
+ filePath: relativePath,
232
+ fileName,
233
+ category: fileCategory,
234
+ headingHierarchy: hierarchy,
235
+ headingSlug: hierarchy.join(' > '),
236
+ chunkIndex: i,
237
+ content: subChunks[i]!,
238
+ charCount: subChunks[i]!.length,
239
+ lastModified,
240
+ });
241
+ }
242
+ }
243
+
244
+ return { chunks, metadata };
245
+ }