gitmem-mcp 1.4.3 → 1.5.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +25 -0
- package/bin/gitmem.js +2 -1
- package/dist/hooks/format-utils.js +4 -0
- package/dist/schemas/session-close.d.ts +15 -15
- package/dist/schemas/session-close.js +3 -3
- package/dist/server.js +13 -0
- package/dist/services/doc-chunker.d.ts +45 -0
- package/dist/services/doc-chunker.js +208 -0
- package/dist/services/doc-index.d.ts +88 -0
- package/dist/services/doc-index.js +328 -0
- package/dist/services/enforcement.js +0 -1
- package/dist/tools/definitions.d.ts +688 -0
- package/dist/tools/definitions.js +87 -0
- package/dist/tools/index-docs.d.ts +30 -0
- package/dist/tools/index-docs.js +163 -0
- package/dist/tools/prepare-context.js +7 -0
- package/dist/tools/recall.js +10 -1
- package/dist/tools/search-docs.d.ts +38 -0
- package/dist/tools/search-docs.js +94 -0
- package/dist/tools/search.js +11 -1
- package/dist/tools/session-close.js +45 -2
- package/dist/tools/session-start.js +26 -1
- package/package.json +1 -1
package/CHANGELOG.md
CHANGED
|
@@ -7,6 +7,31 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
|
|
|
7
7
|
|
|
8
8
|
## [Unreleased]
|
|
9
9
|
|
|
10
|
+
## [1.5.1] - 2026-05-13
|
|
11
|
+
|
|
12
|
+
### Fixed
|
|
13
|
+
- **CI smoke test tool count**: Updated `EXPECTED_TOOL_COUNTS` to reflect `index_docs` and `search_docs` additions (+2 per tier). The 1.5.0 release failed to publish because the smoke test expected 23 free-tier tools but found 25.
|
|
14
|
+
|
|
15
|
+
## [1.5.0] - 2026-05-11
|
|
16
|
+
|
|
17
|
+
### Added
|
|
18
|
+
- **`index_docs` tool**: Scan a directory of markdown files, chunk them, and store in a local doc index for semantic search. Supports incremental indexing (only re-processes changed files), force re-index, and project-scoped indexes. Aliases: `gitmem-idx`.
|
|
19
|
+
- **`search_docs` tool**: Search indexed repository documentation using semantic similarity (pro tier) or BM25 keyword search (free tier). Returns relevant chunks with file paths for targeted reading. Aliases: `gitmem-sd`.
|
|
20
|
+
- **Citation protocol**: `recall`, `search`, and `prepare_context` now include a citation rule instructing agents to cite record IDs when referencing facts from institutional memory.
|
|
21
|
+
- **Low confidence tagging**: Recall and search results with similarity below 0.55 are tagged `[low confidence]` — these matches have a 66% N/A rate historically.
|
|
22
|
+
- **Session duration on resume**: `session_start` now shows elapsed session time and loaded scar count when resuming or refreshing an existing session.
|
|
23
|
+
|
|
24
|
+
### Changed
|
|
25
|
+
- **Quick close hard gate**: `session_close` with `close_type: "quick"` now rejects sessions over 30 minutes, requiring standard close instead.
|
|
26
|
+
- **Standard close recall gate**: `session_close` with `close_type: "standard"` now requires at least one `recall()` call during the session (exemptions: quick close, autonomous agents, sessions with inline reflection).
|
|
27
|
+
|
|
28
|
+
## [1.4.4] - 2026-03-31
|
|
29
|
+
|
|
30
|
+
### Fixed
|
|
31
|
+
- **Project drift on session resume eliminated**: When resuming an existing session (same hostname+PID), the stored project now overrides whatever the agent passes. Previously, context compaction could cause agents to send the wrong project (e.g., `orchestra_dev` instead of `weekend_warrior`), creating a session under the wrong project with wrong threads and decisions. The active-sessions registry already stored the correct project — it just wasn't used on resume.
|
|
32
|
+
- **`closing_reflection` array coercion**: Values passed as arrays in `closing_reflection` are now coerced to strings, preventing schema validation errors on session close.
|
|
33
|
+
- **`create_thread` no longer triggers false enforcement warnings**: Removed from `CONSEQUENTIAL_TOOLS` list — creating threads is lightweight and shouldn't require prior recall.
|
|
34
|
+
|
|
10
35
|
## [1.4.3] - 2026-02-24
|
|
11
36
|
|
|
12
37
|
### Fixed
|
package/bin/gitmem.js
CHANGED
|
@@ -154,9 +154,10 @@ async function cmdInit() {
|
|
|
154
154
|
// Merge: skip scars that already exist by id
|
|
155
155
|
const existingIds = new Set(existing.map((s) => s.id));
|
|
156
156
|
let added = 0;
|
|
157
|
+
const now = new Date().toISOString();
|
|
157
158
|
for (const scar of starterScars) {
|
|
158
159
|
if (!existingIds.has(scar.id)) {
|
|
159
|
-
existing.push(scar);
|
|
160
|
+
existing.push({ ...scar, created_at: now, source_date: now.slice(0, 10) });
|
|
160
161
|
added++;
|
|
161
162
|
console.log(` + ${scar.title}`);
|
|
162
163
|
} else {
|
|
@@ -57,6 +57,10 @@ export function formatCompact(scars, plan, maxTokens) {
|
|
|
57
57
|
lines.push(line);
|
|
58
58
|
included++;
|
|
59
59
|
}
|
|
60
|
+
// Citation reminder for sub-agent context (compact — one line)
|
|
61
|
+
if (included > 0) {
|
|
62
|
+
lines.push("Cite record IDs for any factual claims from these scars.");
|
|
63
|
+
}
|
|
60
64
|
return { payload: lines.join("\n"), included };
|
|
61
65
|
}
|
|
62
66
|
/**
|
|
@@ -13,11 +13,11 @@ export declare const ClosingReflectionSchema: z.ZodObject<{
|
|
|
13
13
|
wrong_assumption: z.ZodString;
|
|
14
14
|
scars_applied: z.ZodUnion<[z.ZodString, z.ZodArray<z.ZodString, "many">]>;
|
|
15
15
|
/** Q7: What from this session should be captured as institutional memory? */
|
|
16
|
-
institutional_memory_items: z.ZodOptional<z.ZodString
|
|
16
|
+
institutional_memory_items: z.ZodOptional<z.ZodUnion<[z.ZodString, z.ZodEffects<z.ZodArray<z.ZodString, "many">, string, string[]>]>>;
|
|
17
17
|
/** Q8: How did the human prefer to work this session? */
|
|
18
|
-
collaborative_dynamic: z.ZodOptional<z.ZodString
|
|
18
|
+
collaborative_dynamic: z.ZodOptional<z.ZodUnion<[z.ZodString, z.ZodEffects<z.ZodArray<z.ZodString, "many">, string, string[]>]>>;
|
|
19
19
|
/** Q9: What collaborative dynamic worked or didn't work? */
|
|
20
|
-
rapport_notes: z.ZodOptional<z.ZodString
|
|
20
|
+
rapport_notes: z.ZodOptional<z.ZodUnion<[z.ZodString, z.ZodEffects<z.ZodArray<z.ZodString, "many">, string, string[]>]>>;
|
|
21
21
|
}, "strip", z.ZodTypeAny, {
|
|
22
22
|
what_broke: string;
|
|
23
23
|
what_took_longer: string;
|
|
@@ -35,9 +35,9 @@ export declare const ClosingReflectionSchema: z.ZodObject<{
|
|
|
35
35
|
what_worked: string;
|
|
36
36
|
wrong_assumption: string;
|
|
37
37
|
scars_applied: string | string[];
|
|
38
|
-
institutional_memory_items?: string | undefined;
|
|
39
|
-
collaborative_dynamic?: string | undefined;
|
|
40
|
-
rapport_notes?: string | undefined;
|
|
38
|
+
institutional_memory_items?: string | string[] | undefined;
|
|
39
|
+
collaborative_dynamic?: string | string[] | undefined;
|
|
40
|
+
rapport_notes?: string | string[] | undefined;
|
|
41
41
|
}>;
|
|
42
42
|
export type ClosingReflection = z.infer<typeof ClosingReflectionSchema>;
|
|
43
43
|
/**
|
|
@@ -154,11 +154,11 @@ export declare const SessionCloseParamsSchema: z.ZodObject<{
|
|
|
154
154
|
wrong_assumption: z.ZodString;
|
|
155
155
|
scars_applied: z.ZodUnion<[z.ZodString, z.ZodArray<z.ZodString, "many">]>;
|
|
156
156
|
/** Q7: What from this session should be captured as institutional memory? */
|
|
157
|
-
institutional_memory_items: z.ZodOptional<z.ZodString
|
|
157
|
+
institutional_memory_items: z.ZodOptional<z.ZodUnion<[z.ZodString, z.ZodEffects<z.ZodArray<z.ZodString, "many">, string, string[]>]>>;
|
|
158
158
|
/** Q8: How did the human prefer to work this session? */
|
|
159
|
-
collaborative_dynamic: z.ZodOptional<z.ZodString
|
|
159
|
+
collaborative_dynamic: z.ZodOptional<z.ZodUnion<[z.ZodString, z.ZodEffects<z.ZodArray<z.ZodString, "many">, string, string[]>]>>;
|
|
160
160
|
/** Q9: What collaborative dynamic worked or didn't work? */
|
|
161
|
-
rapport_notes: z.ZodOptional<z.ZodString
|
|
161
|
+
rapport_notes: z.ZodOptional<z.ZodUnion<[z.ZodString, z.ZodEffects<z.ZodArray<z.ZodString, "many">, string, string[]>]>>;
|
|
162
162
|
}, "strip", z.ZodTypeAny, {
|
|
163
163
|
what_broke: string;
|
|
164
164
|
what_took_longer: string;
|
|
@@ -176,9 +176,9 @@ export declare const SessionCloseParamsSchema: z.ZodObject<{
|
|
|
176
176
|
what_worked: string;
|
|
177
177
|
wrong_assumption: string;
|
|
178
178
|
scars_applied: string | string[];
|
|
179
|
-
institutional_memory_items?: string | undefined;
|
|
180
|
-
collaborative_dynamic?: string | undefined;
|
|
181
|
-
rapport_notes?: string | undefined;
|
|
179
|
+
institutional_memory_items?: string | string[] | undefined;
|
|
180
|
+
collaborative_dynamic?: string | string[] | undefined;
|
|
181
|
+
rapport_notes?: string | string[] | undefined;
|
|
182
182
|
}>>;
|
|
183
183
|
human_corrections: z.ZodOptional<z.ZodString>;
|
|
184
184
|
decisions: z.ZodOptional<z.ZodArray<z.ZodObject<{
|
|
@@ -343,9 +343,9 @@ export declare const SessionCloseParamsSchema: z.ZodObject<{
|
|
|
343
343
|
what_worked: string;
|
|
344
344
|
wrong_assumption: string;
|
|
345
345
|
scars_applied: string | string[];
|
|
346
|
-
institutional_memory_items?: string | undefined;
|
|
347
|
-
collaborative_dynamic?: string | undefined;
|
|
348
|
-
rapport_notes?: string | undefined;
|
|
346
|
+
institutional_memory_items?: string | string[] | undefined;
|
|
347
|
+
collaborative_dynamic?: string | string[] | undefined;
|
|
348
|
+
rapport_notes?: string | string[] | undefined;
|
|
349
349
|
} | undefined;
|
|
350
350
|
human_corrections?: string | undefined;
|
|
351
351
|
scars_to_record?: {
|
|
@@ -15,11 +15,11 @@ export const ClosingReflectionSchema = z.object({
|
|
|
15
15
|
wrong_assumption: z.string(),
|
|
16
16
|
scars_applied: z.union([z.string(), z.array(z.string())]),
|
|
17
17
|
/** Q7: What from this session should be captured as institutional memory? */
|
|
18
|
-
institutional_memory_items: z.string().optional(),
|
|
18
|
+
institutional_memory_items: z.union([z.string(), z.array(z.string()).transform((arr) => arr.join(". "))]).optional(),
|
|
19
19
|
/** Q8: How did the human prefer to work this session? */
|
|
20
|
-
collaborative_dynamic: z.string().optional(),
|
|
20
|
+
collaborative_dynamic: z.union([z.string(), z.array(z.string()).transform((arr) => arr.join(". "))]).optional(),
|
|
21
21
|
/** Q9: What collaborative dynamic worked or didn't work? */
|
|
22
|
-
rapport_notes: z.string().optional(),
|
|
22
|
+
rapport_notes: z.union([z.string(), z.array(z.string()).transform((arr) => arr.join(". "))]).optional(),
|
|
23
23
|
});
|
|
24
24
|
/**
|
|
25
25
|
* Task completion proof schema
|
package/dist/server.js
CHANGED
|
@@ -36,6 +36,8 @@ import { dismissSuggestion } from "./tools/dismiss-suggestion.js";
|
|
|
36
36
|
import { cleanupThreads } from "./tools/cleanup-threads.js";
|
|
37
37
|
import { archiveLearning } from "./tools/archive-learning.js";
|
|
38
38
|
import { contributeFeedback } from "./tools/contribute-feedback.js";
|
|
39
|
+
import { indexDocs } from "./tools/index-docs.js";
|
|
40
|
+
import { searchDocsHandler } from "./tools/search-docs.js";
|
|
39
41
|
import { getCacheStatus, checkCacheHealth, flushCache, startBackgroundInit, } from "./services/startup.js";
|
|
40
42
|
import { getEffectTracker } from "./services/effect-tracker.js";
|
|
41
43
|
import { RIPPLE, ANSI } from "./services/display-protocol.js";
|
|
@@ -246,6 +248,8 @@ export function createServer() {
|
|
|
246
248
|
{ alias: "gitmem-al", full: "archive_learning", description: "Archive a scar/win/pattern (is_active=false)" },
|
|
247
249
|
{ alias: "gitmem-graph", full: "graph_traverse", description: "Traverse knowledge graph over institutional memory" },
|
|
248
250
|
{ alias: "gitmem-fb", full: "contribute_feedback", description: "Submit feedback about gitmem (10/session limit)" },
|
|
251
|
+
{ alias: "gitmem-idx", full: "index_docs", description: "Index markdown docs for semantic search" },
|
|
252
|
+
{ alias: "gitmem-sd", full: "search_docs", description: "Search indexed repository docs" },
|
|
249
253
|
];
|
|
250
254
|
if (hasBatchOperations()) {
|
|
251
255
|
commands.push({ alias: "gitmem-rsb", full: "record_scar_usage_batch", description: "Track multiple scars (batch)" });
|
|
@@ -315,6 +319,15 @@ export function createServer() {
|
|
|
315
319
|
case "gm-cache-f":
|
|
316
320
|
result = await flushCache(toolArgs.project || getProject() || "default");
|
|
317
321
|
break;
|
|
322
|
+
// Doc indexing and search
|
|
323
|
+
case "index_docs":
|
|
324
|
+
case "gitmem-idx":
|
|
325
|
+
result = await indexDocs(toolArgs);
|
|
326
|
+
break;
|
|
327
|
+
case "search_docs":
|
|
328
|
+
case "gitmem-sd":
|
|
329
|
+
result = await searchDocsHandler(toolArgs);
|
|
330
|
+
break;
|
|
318
331
|
default:
|
|
319
332
|
throw new Error(`Unknown tool: ${name}`);
|
|
320
333
|
}
|
|
@@ -0,0 +1,45 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Document Chunker — Split markdown files into searchable chunks
|
|
3
|
+
*
|
|
4
|
+
* Strategy:
|
|
5
|
+
* 1. Split on H2 headers first (natural semantic boundaries)
|
|
6
|
+
* 2. If a section exceeds target size, split on paragraph boundaries
|
|
7
|
+
* 3. Each chunk carries metadata: file path, title, category, chunk index
|
|
8
|
+
*
|
|
9
|
+
* Chunk sizing: sections above ~900 tokens (3600 chars) are re-split toward a ~600-token (2400-char) target; pieces under 200 chars are not emitted
|
|
10
|
+
*/
|
|
11
|
+
/**
 * One searchable segment of a markdown file, as produced by `chunkDocument`.
 *
 * - `file_path`: the source document's `relative_path`
 * - `chunk_index`: zero-based position of the chunk within its file
 * - `title`: document title (first H1, or derived from the file name)
 * - `section_title`: text of the enclosing H2 heading; empty when there is none
 * - `category`: first directory segment of the relative path, or "root"
 * - `content`: the chunk text
 * - `file_hash`: hash of the whole source file (copied from `DocFile.hash`)
 */
export interface DocChunk {
|
|
12
|
+
file_path: string;
|
|
13
|
+
chunk_index: number;
|
|
14
|
+
title: string;
|
|
15
|
+
section_title: string;
|
|
16
|
+
category: string;
|
|
17
|
+
content: string;
|
|
18
|
+
file_hash: string;
|
|
19
|
+
}
|
|
20
|
+
/**
 * A markdown file read from disk by `scanDirectory`.
 *
 * - `absolute_path`: the scan root joined with the entry's path; despite the
 *   name it is only absolute when the scan root itself is absolute
 * - `relative_path`: path of the file relative to the scanned directory
 * - `content`: file text decoded as UTF-8
 * - `hash`: SHA-256 hex digest of `content`
 */
export interface DocFile {
|
|
21
|
+
absolute_path: string;
|
|
22
|
+
relative_path: string;
|
|
23
|
+
content: string;
|
|
24
|
+
hash: string;
|
|
25
|
+
}
|
|
26
|
+
/**
|
|
27
|
+
* Chunk a single markdown file into searchable segments
|
|
28
|
+
*/
|
|
29
|
+
export declare function chunkDocument(doc: DocFile): DocChunk[];
|
|
30
|
+
/**
|
|
31
|
+
* Scan a directory for markdown files
|
|
32
|
+
*/
|
|
33
|
+
export declare function scanDirectory(dirPath: string, options?: {
|
|
34
|
+
exclude?: string[];
|
|
35
|
+
}): DocFile[];
|
|
36
|
+
/**
|
|
37
|
+
* Chunk all markdown files in a directory
|
|
38
|
+
*/
|
|
39
|
+
export declare function chunkDirectory(dirPath: string, options?: {
|
|
40
|
+
exclude?: string[];
|
|
41
|
+
}): {
|
|
42
|
+
files: DocFile[];
|
|
43
|
+
chunks: DocChunk[];
|
|
44
|
+
};
|
|
45
|
+
//# sourceMappingURL=doc-chunker.d.ts.map
|
|
@@ -0,0 +1,208 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Document Chunker — Split markdown files into searchable chunks
|
|
3
|
+
*
|
|
4
|
+
* Strategy:
|
|
5
|
+
* 1. Split on H2 headers first (natural semantic boundaries)
|
|
6
|
+
* 2. If a section exceeds target size, split on paragraph boundaries
|
|
7
|
+
* 3. Each chunk carries metadata: file path, title, category, chunk index
|
|
8
|
+
*
|
|
9
|
+
* Chunk sizing: sections above ~900 tokens (3600 chars) are re-split toward a ~600-token (2400-char) target; pieces under 200 chars are not emitted
|
|
10
|
+
*/
|
|
11
|
+
import * as fs from "fs";
|
|
12
|
+
import * as path from "path";
|
|
13
|
+
import * as crypto from "crypto";
|
|
14
|
+
// Size passed to splitByParagraphs when an oversized section is re-split.
const TARGET_CHUNK_CHARS = 2400; // ~600 tokens
|
|
15
|
+
// Sections longer than this are re-split on paragraph boundaries; shorter ones are kept whole.
const MAX_CHUNK_CHARS = 3600; // ~900 tokens hard limit
|
|
16
|
+
// Sections and sub-chunks shorter than this are not emitted as chunks.
const MIN_CHUNK_CHARS = 200; // Don't create tiny chunks
|
|
17
|
+
/**
 * Derive a display title for a markdown document.
 *
 * Returns the text of the first ATX H1 heading (`# Title`) that sits outside
 * any fenced code block. Lines such as `# install deps` inside a ``` or ~~~
 * fence are shell/YAML comments, not headings, and are ignored. When no usable
 * heading exists, the file name is used with its `.md` extension removed and
 * `-` / `_` replaced by spaces.
 *
 * @param content  Raw markdown text; LF and CRLF line endings are both accepted.
 * @param filePath Path of the file; only its base name is used for the fallback.
 * @returns The trimmed heading text, or the humanized file name.
 */
function extractTitle(content, filePath) {
    const fencePattern = /^ {0,3}(`{3,}|~{3,})(.*)$/;
    const h1Pattern = /^#[ \t]+(.+)$/;
    // Marker of the fence we are inside ("" while outside any fence).
    let openFence = "";
    for (const line of content.split(/\r?\n/)) {
        const fence = fencePattern.exec(line);
        if (fence) {
            const marker = fence[1];
            if (openFence === "") {
                openFence = marker;
            }
            else if (marker[0] === openFence[0] &&
                marker.length >= openFence.length &&
                fence[2].trim() === "") {
                // A closing fence uses the same character, is at least as long
                // as the opener, and carries no info string.
                openFence = "";
            }
            continue;
        }
        if (openFence !== "")
            continue;
        const h1 = h1Pattern.exec(line);
        if (h1) {
            const heading = h1[1].trim();
            // Skip a heading that is only whitespace rather than returning "".
            if (heading.length > 0)
                return heading;
        }
    }
    // Fall back to filename without extension
    return path.basename(filePath, ".md").replace(/[-_]/g, " ");
}
|
|
27
|
+
/**
 * Derive a category label from a file's location.
 *
 * The category is everything before the first platform path separator in
 * `relativePath`; a path with no separator (a file directly in the scan root)
 * is categorised as "root".
 *
 * @param relativePath Path relative to the scanned directory, using `path.sep`.
 * @returns The leading path segment, or "root".
 */
function extractCategory(relativePath) {
    const firstSeparator = relativePath.indexOf(path.sep);
    if (firstSeparator === -1) {
        return "root";
    }
    return relativePath.slice(0, firstSeparator);
}
|
|
36
|
+
/**
 * Split markdown into sections at H2 (`## Heading`) lines.
 *
 * Text before the first H2 becomes a section with an empty title. The heading
 * line itself is not part of the section content; its text is returned as the
 * section title. Sections whose trimmed content is empty are omitted.
 *
 * Lines are split on LF or CRLF, so Windows-authored files are sectioned the
 * same way as Unix ones (content is re-joined with "\n"). `## ` lines inside a
 * fenced code block (``` or ~~~) are treated as code, not as headings.
 *
 * @param content Raw markdown text.
 * @returns Sections in document order as `{ title, content }` objects.
 */
function splitByH2(content) {
    const fencePattern = /^ {0,3}(`{3,}|~{3,})(.*)$/;
    const h2Pattern = /^##\s+(.+)$/;
    const sections = [];
    let currentTitle = "";
    let currentLines = [];
    // Marker of the fence we are inside ("" while outside any fence).
    let openFence = "";
    const flush = () => {
        const text = currentLines.join("\n").trim();
        if (text.length > 0) {
            sections.push({ title: currentTitle, content: text });
        }
    };
    for (const line of content.split(/\r?\n/)) {
        const fence = fencePattern.exec(line);
        if (fence) {
            const marker = fence[1];
            if (openFence === "") {
                openFence = marker;
            }
            else if (marker[0] === openFence[0] &&
                marker.length >= openFence.length &&
                fence[2].trim() === "") {
                // Closing fence: same character, at least as long, no info string.
                openFence = "";
            }
            currentLines.push(line);
            continue;
        }
        const h2 = openFence === "" ? h2Pattern.exec(line) : null;
        if (h2) {
            // Close the section collected so far and start a new one.
            flush();
            currentTitle = h2[1].trim();
            currentLines = [];
        }
        else {
            currentLines.push(line);
        }
    }
    // Don't forget the last section
    flush();
    return sections;
}
|
|
70
|
+
/**
 * Split text into pieces of at most `maxChars` characters, preferring
 * paragraph boundaries.
 *
 * Text that already fits is returned unchanged as a single element. Otherwise
 * paragraphs (separated by one or more blank lines, LF or CRLF) are packed
 * greedily, joined by "\n\n". A single paragraph longer than `maxChars` is
 * broken further: first at line breaks, then, for a line that is still too
 * long, at `maxChars` boundaries (never between the two halves of a surrogate
 * pair). Every returned piece is trimmed and non-empty.
 *
 * @param text     Text to split.
 * @param maxChars Upper bound on piece length. Values below 1 disable the
 *                 forced splitting of oversized paragraphs.
 * @returns Pieces in original order.
 */
function splitByParagraphs(text, maxChars) {
    if (text.length <= maxChars)
        return [text];
    const chunks = [];
    let current = "";
    // Emit the paragraphs accumulated so far (if any) and start afresh.
    const flush = () => {
        const trimmed = current.trim();
        if (trimmed.length > 0) {
            chunks.push(trimmed);
        }
        current = "";
    };
    // Break one oversized paragraph into pieces that each fit in maxChars.
    const breakOversized = (para) => {
        const pieces = [];
        let buffer = "";
        for (const line of para.split("\n")) {
            if (buffer.length > 0 && buffer.length + 1 + line.length <= maxChars) {
                buffer += "\n" + line;
                continue;
            }
            if (buffer.length > 0) {
                pieces.push(buffer);
            }
            let rest = line;
            while (rest.length > maxChars) {
                let cut = maxChars;
                const lastUnit = rest.charCodeAt(cut - 1);
                // Do not cut between a high surrogate and its low surrogate.
                if (cut > 1 && lastUnit >= 0xd800 && lastUnit <= 0xdbff) {
                    cut -= 1;
                }
                pieces.push(rest.slice(0, cut));
                rest = rest.slice(cut);
            }
            buffer = rest;
        }
        if (buffer.length > 0) {
            pieces.push(buffer);
        }
        return pieces;
    };
    for (const para of text.split(/(?:\r?\n){2,}/)) {
        if (maxChars >= 1 && para.length > maxChars) {
            flush();
            const pieces = breakOversized(para);
            // Keep the tail open so following short paragraphs can join it.
            current = pieces.pop() || "";
            for (const piece of pieces) {
                const trimmed = piece.trim();
                if (trimmed.length > 0) {
                    chunks.push(trimmed);
                }
            }
            continue;
        }
        if (current.length > 0 && current.length + para.length + 2 > maxChars) {
            flush();
            current = para;
        }
        else {
            current = current ? current + "\n\n" + para : para;
        }
    }
    flush();
    return chunks;
}
|
|
93
|
+
/**
 * Fingerprint a piece of content.
 *
 * @param content Data to hash (string or buffer accepted by `Hash.update`).
 * @returns The SHA-256 digest as a lowercase hex string.
 */
function hashContent(content) {
    const hasher = crypto.createHash("sha256");
    hasher.update(content);
    return hasher.digest("hex");
}
|
|
99
|
+
/**
 * Turn one markdown file into an ordered list of searchable chunks.
 *
 * The document is divided at H2 headings. A section no longer than
 * MAX_CHUNK_CHARS is used as-is; a longer one is re-split on paragraph
 * boundaries using TARGET_CHUNK_CHARS. Any section or piece shorter than
 * MIN_CHUNK_CHARS is left out. Chunk indexes are assigned consecutively from 0
 * in emission order.
 *
 * If that produces no chunks at all but the trimmed document is at least
 * MIN_CHUNK_CHARS long (every section was individually too small), a single
 * chunk holding the first MAX_CHUNK_CHARS characters of the trimmed document
 * is emitted with an empty section title.
 *
 * @param doc File descriptor providing `content`, `relative_path` and `hash`.
 * @returns Chunks for the file; empty when the file has too little content.
 */
export function chunkDocument(doc) {
    const docTitle = extractTitle(doc.content, doc.relative_path);
    const docCategory = extractCategory(doc.relative_path);
    const result = [];
    // Append a chunk unless the text is below the minimum size. The next
    // index always equals the number of chunks emitted so far.
    const addChunk = (sectionTitle, body) => {
        if (body.length < MIN_CHUNK_CHARS) {
            return;
        }
        result.push({
            file_path: doc.relative_path,
            chunk_index: result.length,
            title: docTitle,
            section_title: sectionTitle,
            category: docCategory,
            content: body,
            file_hash: doc.hash,
        });
    };
    for (const section of splitByH2(doc.content)) {
        const pieces = section.content.length > MAX_CHUNK_CHARS
            ? splitByParagraphs(section.content, TARGET_CHUNK_CHARS)
            : [section.content];
        for (const piece of pieces) {
            addChunk(section.title, piece);
        }
    }
    // Fallback: nothing was large enough on its own, but the file as a whole is.
    if (result.length === 0) {
        const whole = doc.content.trim();
        if (whole.length >= MIN_CHUNK_CHARS) {
            result.push({
                file_path: doc.relative_path,
                chunk_index: 0,
                title: docTitle,
                section_title: "",
                category: docCategory,
                content: whole.slice(0, MAX_CHUNK_CHARS),
                file_hash: doc.hash,
            });
        }
    }
    return result;
}
|
|
156
|
+
/**
 * Recursively collect the markdown files under a directory.
 *
 * Directories whose name appears in the exclude list are not entered; the
 * default list is `_archive`, `node_modules` and `.git`. Only regular files
 * whose name ends in `.md` are read. Directories that cannot be listed and
 * files that cannot be read are skipped silently, so the scan is best-effort.
 *
 * @param dirPath Root directory to scan.
 * @param options Optional settings; `exclude` replaces the default list of
 *                directory names to skip.
 * @returns One descriptor per file with its joined path, path relative to
 *          `dirPath`, UTF-8 content and content hash.
 */
export function scanDirectory(dirPath, options = {}) {
    const skippedDirs = options.exclude || ["_archive", "node_modules", ".git"];
    const found = [];
    const visit = (dir) => {
        let dirents;
        try {
            dirents = fs.readdirSync(dir, { withFileTypes: true });
        }
        catch {
            // Permission denied or inaccessible
            return;
        }
        for (const dirent of dirents) {
            const entryPath = path.join(dir, dirent.name);
            if (dirent.isDirectory()) {
                if (!skippedDirs.includes(dirent.name)) {
                    visit(entryPath);
                }
                continue;
            }
            if (!(dirent.isFile() && dirent.name.endsWith(".md"))) {
                continue;
            }
            try {
                const text = fs.readFileSync(entryPath, "utf-8");
                found.push({
                    absolute_path: entryPath,
                    relative_path: path.relative(dirPath, entryPath),
                    content: text,
                    hash: hashContent(text),
                });
            }
            catch {
                // Skip unreadable files
            }
        }
    };
    visit(dirPath);
    return found;
}
|
|
197
|
+
/**
 * Scan a directory for markdown files and chunk every file found.
 *
 * @param dirPath Root directory to scan.
 * @param options Passed through to `scanDirectory` (e.g. `exclude`).
 * @returns The scanned files and, in the same file order, all of their chunks.
 */
export function chunkDirectory(dirPath, options = {}) {
    const files = scanDirectory(dirPath, options);
    const chunks = files.flatMap((file) => chunkDocument(file));
    return { files, chunks };
}
|
|
208
|
+
//# sourceMappingURL=doc-chunker.js.map
|
|
@@ -0,0 +1,88 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Document Index — Storage and search for indexed doc chunks
|
|
3
|
+
*
|
|
4
|
+
* Supports two backends:
|
|
5
|
+
* - Free tier: Local JSON file with BM25 keyword search
|
|
6
|
+
* - Pro/dev tier: In-memory vector index with embeddings
|
|
7
|
+
*
|
|
8
|
+
* Follows the same patterns as local-vector-search.ts and local-file-storage.ts
|
|
9
|
+
*/
|
|
10
|
+
import type { DocChunk } from "./doc-chunker.js";
|
|
11
|
+
/**
 * A doc chunk as held in the index: the `DocChunk` fields plus an `id`, the
 * owning `project`, an optional `embedding` vector and an `indexed_at` string.
 *
 * NOTE(review): the id format and the `indexed_at` format are set in
 * doc-index.js, which is not visible here — confirm before relying on them.
 */
export interface IndexedDocChunk {
|
|
12
|
+
id: string;
|
|
13
|
+
file_path: string;
|
|
14
|
+
chunk_index: number;
|
|
15
|
+
title: string;
|
|
16
|
+
section_title: string;
|
|
17
|
+
category: string;
|
|
18
|
+
content: string;
|
|
19
|
+
file_hash: string;
|
|
20
|
+
project: string;
|
|
21
|
+
embedding?: number[];
|
|
22
|
+
indexed_at: string;
|
|
23
|
+
}
|
|
24
|
+
/**
 * One hit returned by `searchDocs`: the chunk's metadata and content plus a
 * `similarity` score.
 *
 * NOTE(review): the scale of `similarity` presumably differs between the
 * semantic and BM25 backends — confirm in doc-index.js.
 */
export interface DocSearchResult {
|
|
25
|
+
id: string;
|
|
26
|
+
file_path: string;
|
|
27
|
+
chunk_index: number;
|
|
28
|
+
title: string;
|
|
29
|
+
section_title: string;
|
|
30
|
+
category: string;
|
|
31
|
+
content: string;
|
|
32
|
+
similarity: number;
|
|
33
|
+
project: string;
|
|
34
|
+
}
|
|
35
|
+
/**
 * Summary of the doc index, as returned by `getIndexStats`.
 *
 * NOTE(review): `categories` presumably maps a category name to its chunk
 * count, and `files_indexed` presumably lists file paths — confirm in
 * doc-index.js.
 */
export interface IndexStats {
|
|
36
|
+
total_chunks: number;
|
|
37
|
+
total_files: number;
|
|
38
|
+
files_indexed: string[];
|
|
39
|
+
categories: Record<string, number>;
|
|
40
|
+
project: string;
|
|
41
|
+
has_embeddings: boolean;
|
|
42
|
+
}
|
|
43
|
+
/**
|
|
44
|
+
* Index doc chunks into storage.
|
|
45
|
+
*
|
|
46
|
+
* - Removes old chunks for the same project + file_path
|
|
47
|
+
* - Generates embeddings if available (pro/dev tier)
|
|
48
|
+
* - Stores to local JSON file
|
|
49
|
+
* - Loads into in-memory vector index if embeddings present
|
|
50
|
+
*
|
|
51
|
+
* Returns count of chunks indexed.
|
|
52
|
+
*/
|
|
53
|
+
export declare function indexChunks(chunks: DocChunk[], project: string, options?: {
|
|
54
|
+
batchSize?: number;
|
|
55
|
+
}): Promise<{
|
|
56
|
+
indexed: number;
|
|
57
|
+
embedded: number;
|
|
58
|
+
errors: number;
|
|
59
|
+
}>;
|
|
60
|
+
/**
|
|
61
|
+
* Search indexed docs using semantic similarity (pro/dev) or BM25 (free)
|
|
62
|
+
*/
|
|
63
|
+
export declare function searchDocs(query: string, options?: {
|
|
64
|
+
project?: string;
|
|
65
|
+
category?: string;
|
|
66
|
+
match_count?: number;
|
|
67
|
+
}): Promise<DocSearchResult[]>;
|
|
68
|
+
/**
|
|
69
|
+
* Get index statistics
|
|
70
|
+
*/
|
|
71
|
+
export declare function getIndexStats(project?: string): IndexStats;
|
|
72
|
+
/**
|
|
73
|
+
* Check which files have changed since last index (by hash)
|
|
74
|
+
*/
|
|
75
|
+
export declare function getChangedFiles(fileHashes: Map<string, string>, project: string): {
|
|
76
|
+
changed: string[];
|
|
77
|
+
unchanged: string[];
|
|
78
|
+
new_files: string[];
|
|
79
|
+
};
|
|
80
|
+
/**
|
|
81
|
+
* Initialize vector index from local storage on startup
|
|
82
|
+
*/
|
|
83
|
+
export declare function initDocVectorIndex(): void;
|
|
84
|
+
/**
|
|
85
|
+
* Clear the doc index for a project (or all)
|
|
86
|
+
*/
|
|
87
|
+
export declare function clearDocIndex(project?: string): number;
|
|
88
|
+
//# sourceMappingURL=doc-index.d.ts.map
|