@grec0/memory-bank-mcp 0.0.2 → 0.0.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +74 -5
- package/dist/common/chunker.js +168 -24
- package/dist/common/fileScanner.js +94 -10
- package/dist/common/indexManager.js +97 -25
- package/dist/common/logger.js +54 -0
- package/dist/common/projectKnowledgeService.js +627 -0
- package/dist/common/vectorStore.js +77 -21
- package/dist/index.js +76 -8
- package/dist/tools/analyzeCoverage.js +1 -1
- package/dist/tools/generateProjectDocs.js +133 -0
- package/dist/tools/getProjectDocs.js +126 -0
- package/dist/tools/index.js +3 -0
- package/dist/tools/searchMemory.js +2 -2
- package/package.json +2 -1
package/README.md
CHANGED
|
@@ -28,14 +28,31 @@ Con Memory Bank, las IAs:
|
|
|
28
28
|
|
|
29
29
|
## 🚀 Características
|
|
30
30
|
|
|
31
|
+
### Core Memory Bank (Búsqueda Precisa)
|
|
31
32
|
- **🔍 Búsqueda Semántica**: Pregunta "¿cómo funciona la autenticación?" y obtén código relevante
|
|
32
|
-
- **🧩 Chunking Inteligente**: AST parsing para
|
|
33
|
+
- **🧩 Chunking Inteligente**: AST parsing para TS/JS/Python con límites de tokens (8192 máx)
|
|
33
34
|
- **⚡ Actualización Incremental**: Solo reindexa archivos modificados (detección por hash)
|
|
34
35
|
- **💾 Cache de Embeddings**: Evita regenerar embeddings innecesariamente
|
|
35
36
|
- **🎯 Filtros Avanzados**: Por archivo, lenguaje, tipo de chunk
|
|
36
37
|
- **📊 Estadísticas Detalladas**: Conoce el estado de tu índice en todo momento
|
|
37
38
|
- **🔒 Privacidad**: Vector store local, respeta .gitignore y .memoryignore
|
|
38
39
|
|
|
40
|
+
### Project Knowledge Layer (Conocimiento Global) 🆕
|
|
41
|
+
- **📄 Documentación Automática**: Genera 6 documentos markdown estructurados del proyecto
|
|
42
|
+
- **🧠 IA con Razonamiento**: Usa OpenAI Responses API con modelos de razonamiento (gpt-5-mini)
|
|
43
|
+
- **🔄 Actualización Inteligente**: Solo regenera documentos afectados por cambios
|
|
44
|
+
- **📚 Contexto Global**: Complementa búsqueda precisa con visión de alto nivel
|
|
45
|
+
|
|
46
|
+
Los documentos generados incluyen:
|
|
47
|
+
| Documento | Propósito |
|
|
48
|
+
|-----------|-----------|
|
|
49
|
+
| `projectBrief.md` | Descripción general del proyecto |
|
|
50
|
+
| `productContext.md` | Perspectiva de negocio y usuarios |
|
|
51
|
+
| `systemPatterns.md` | Patrones de arquitectura y diseño |
|
|
52
|
+
| `techContext.md` | Stack tecnológico y dependencias |
|
|
53
|
+
| `activeContext.md` | Estado actual de desarrollo |
|
|
54
|
+
| `progress.md` | Seguimiento de cambios |
|
|
55
|
+
|
|
39
56
|
## 📋 Requisitos
|
|
40
57
|
|
|
41
58
|
- **Node.js** >= 18.0.0
|
|
@@ -81,13 +98,18 @@ Crea un archivo `.env` en la raíz de tu workspace (o configúralas en tu client
|
|
|
81
98
|
# REQUERIDO: Tu API key de OpenAI
|
|
82
99
|
OPENAI_API_KEY=sk-your-api-key-here
|
|
83
100
|
|
|
84
|
-
# OPCIONAL: Configuración
|
|
101
|
+
# OPCIONAL: Configuración de indexación
|
|
85
102
|
MEMORYBANK_STORAGE_PATH=.memorybank # Dónde almacenar el índice
|
|
86
103
|
MEMORYBANK_EMBEDDING_MODEL=text-embedding-3-small # Modelo de OpenAI
|
|
87
104
|
MEMORYBANK_EMBEDDING_DIMENSIONS=1536 # Dimensiones (1536 o 512)
|
|
88
|
-
|
|
89
|
-
|
|
105
|
+
MEMORYBANK_MAX_TOKENS=7500 # Tokens máx por chunk (límite: 8192)
|
|
106
|
+
MEMORYBANK_CHUNK_OVERLAP_TOKENS=200 # Overlap en tokens entre chunks
|
|
90
107
|
MEMORYBANK_WORKSPACE_ROOT=/path/to/project # Raíz del workspace
|
|
108
|
+
|
|
109
|
+
# OPCIONAL: Project Knowledge Layer (documentación con IA)
|
|
110
|
+
MEMORYBANK_REASONING_MODEL=gpt-5-mini # Modelo de razonamiento
|
|
111
|
+
MEMORYBANK_REASONING_EFFORT=medium # low/medium/high
|
|
112
|
+
MEMORYBANK_AUTO_UPDATE_DOCS=false # Auto-actualizar docs al indexar
|
|
91
113
|
```
|
|
92
114
|
|
|
93
115
|
### Configuración en Claude Desktop
|
|
@@ -152,7 +174,7 @@ Busca código por similitud semántica.
|
|
|
152
174
|
**Parámetros:**
|
|
153
175
|
- `query` (requerido): Consulta en lenguaje natural
|
|
154
176
|
- `topK` (opcional): Número de resultados (default: 10)
|
|
155
|
-
- `minScore` (opcional): Score mínimo 0-1 (default: 0.
|
|
177
|
+
- `minScore` (opcional): Score mínimo 0-1 (default: 0.4)
|
|
156
178
|
- `filterByFile` (opcional): Filtrar por patrón de archivo
|
|
157
179
|
- `filterByLanguage` (opcional): Filtrar por lenguaje
|
|
158
180
|
|
|
@@ -205,6 +227,53 @@ Obtiene estadísticas del Memory Bank.
|
|
|
205
227
|
memorybank_get_stats({})
|
|
206
228
|
```
|
|
207
229
|
|
|
230
|
+
### `memorybank_analyze_coverage`
|
|
231
|
+
|
|
232
|
+
Analiza la cobertura de indexación del proyecto.
|
|
233
|
+
|
|
234
|
+
**Ejemplo:**
|
|
235
|
+
```
|
|
236
|
+
memorybank_analyze_coverage({})
|
|
237
|
+
```
|
|
238
|
+
|
|
239
|
+
### `memorybank_generate_project_docs` 🆕
|
|
240
|
+
|
|
241
|
+
Genera documentación estructurada del proyecto usando IA con razonamiento (gpt-5-mini).
|
|
242
|
+
|
|
243
|
+
**Parámetros:**
|
|
244
|
+
- `projectId` (opcional): ID del proyecto
|
|
245
|
+
- `force` (opcional): Forzar regeneración (default: false)
|
|
246
|
+
|
|
247
|
+
**Ejemplo:**
|
|
248
|
+
```
|
|
249
|
+
memorybank_generate_project_docs({ force: true })
|
|
250
|
+
```
|
|
251
|
+
|
|
252
|
+
Genera 6 documentos markdown:
|
|
253
|
+
- `projectBrief.md`: Descripción general
|
|
254
|
+
- `productContext.md`: Perspectiva de negocio
|
|
255
|
+
- `systemPatterns.md`: Patrones de arquitectura
|
|
256
|
+
- `techContext.md`: Stack tecnológico
|
|
257
|
+
- `activeContext.md`: Estado actual
|
|
258
|
+
- `progress.md`: Seguimiento
|
|
259
|
+
|
|
260
|
+
### `memorybank_get_project_docs` 🆕
|
|
261
|
+
|
|
262
|
+
Lee la documentación del proyecto generada por IA.
|
|
263
|
+
|
|
264
|
+
**Parámetros:**
|
|
265
|
+
- `document` (opcional): Documento específico o "all"/"summary" (default: "summary")
|
|
266
|
+
- `format` (opcional): "full" o "summary" (default: "full")
|
|
267
|
+
|
|
268
|
+
**Ejemplo:**
|
|
269
|
+
```
|
|
270
|
+
// Obtener resumen de todos los docs
|
|
271
|
+
memorybank_get_project_docs({ document: "summary" })
|
|
272
|
+
|
|
273
|
+
// Obtener documento específico
|
|
274
|
+
memorybank_get_project_docs({ document: "systemPatterns" })
|
|
275
|
+
```
|
|
276
|
+
|
|
208
277
|
## 🎯 Casos de Uso
|
|
209
278
|
|
|
210
279
|
### 1. Primera Indexación
|
package/dist/common/chunker.js
CHANGED
|
@@ -1,13 +1,31 @@
|
|
|
1
1
|
/**
|
|
2
2
|
* @fileoverview Intelligent code chunker for Memory Bank
|
|
3
3
|
* Fragments code intelligently using AST parsing when possible
|
|
4
|
+
* Uses token counting to respect embedding model limits
|
|
4
5
|
*/
|
|
5
6
|
import * as fs from "fs";
|
|
6
7
|
import { parse } from "@babel/parser";
|
|
7
8
|
import traverseLib from "@babel/traverse";
|
|
8
9
|
import * as crypto from "crypto";
|
|
10
|
+
import { encode } from "gpt-tokenizer";
|
|
9
11
|
// Handle traverse library export
|
|
10
12
|
const traverse = typeof traverseLib === 'function' ? traverseLib : traverseLib.default;
|
|
13
|
+
// Token budgets used when chunking content for embedding.
// The embedding model (text-embedding-3-small) accepts at most 8192 tokens per
// input, so chunks are capped at 7500 to leave a safety margin.
const MAX_TOKENS_PER_CHUNK = 7500;
// Default number of tokens of trailing content repeated at the start of the
// next chunk when a caller does not specify an overlap.
const DEFAULT_CHUNK_OVERLAP_TOKENS = 200;
|
|
17
|
+
/**
 * Returns the number of tokens in `text`.
 *
 * The count comes from the imported `encode` tokenizer. If that call throws
 * for any reason, the function falls back to a rough estimate of one token
 * per four characters (rounded up).
 *
 * @param {string} text - Text to measure.
 * @returns {number} Token count, or the character-based estimate on failure.
 */
export function countTokens(text) {
    let total;
    try {
        total = encode(text).length;
    }
    catch {
        // Best-effort estimate: roughly 4 characters per token for code
        total = Math.ceil(text.length / 4);
    }
    return total;
}
|
|
11
29
|
/**
|
|
12
30
|
* Generates unique ID for a chunk based on content and metadata
|
|
13
31
|
*/
|
|
@@ -59,6 +77,94 @@ function extractContext(content, language) {
|
|
|
59
77
|
}
|
|
60
78
|
return contextLines.join("\n");
|
|
61
79
|
}
|
|
80
|
+
/**
 * Splits a chunk that exceeds the token limit into smaller, line-aligned
 * sub-chunks.
 *
 * A chunk whose content is within `maxTokens` is returned unchanged (as a
 * single-element array) with its `tokenCount` filled in. Otherwise the content
 * is cut on line boundaries. Each sub-chunk after the first starts with a few
 * trailing lines of the previous one (about `overlapTokens` worth) so context
 * is preserved across the cut.
 *
 * The overlap is capped so that the carried-over lines plus the next line fit
 * within `maxTokens`. Without that cap, an overlap budget at or above the size
 * of the previous sub-chunk would re-include the whole previous sub-chunk,
 * producing sub-chunks that grow past the limit this function enforces.
 *
 * A single line that is larger than `maxTokens` on its own cannot be split
 * here and is emitted as an oversized sub-chunk.
 *
 * @param {object} chunk - Chunk to split; reads `content`, `filePath`,
 *   `startLine`, `endLine`, `chunkType`, `name`, `language` and `context`.
 * @param {number} maxTokens - Maximum tokens allowed per sub-chunk.
 * @param {number} overlapTokens - Target number of tokens to repeat from the
 *   end of one sub-chunk at the start of the next.
 * @returns {object[]} The original chunk (with `tokenCount`) or its sub-chunks.
 */
function splitLargeChunk(chunk, maxTokens, overlapTokens) {
    const tokenCount = countTokens(chunk.content);
    // If under limit, return as-is
    if (tokenCount <= maxTokens) {
        return [{ ...chunk, tokenCount }];
    }
    console.error(`Splitting large chunk: ${chunk.filePath} (${chunk.name || 'unnamed'}) - ${tokenCount} tokens exceeds ${maxTokens} limit`);
    const lines = chunk.content.split("\n");
    // Per-line cost includes the newline that joins it to the next line.
    const lineTokenCounts = lines.map((line) => countTokens(line + "\n"));
    const subChunks = [];
    // Emits the sub-chunk covering lines[from, to); `from` is an offset
    // relative to the first line of the original chunk.
    const pushSubChunk = (from, to, endLine) => {
        const content = lines.slice(from, to).join("\n");
        const startLine = chunk.startLine + from;
        subChunks.push({
            id: generateChunkId(chunk.filePath, content, startLine),
            filePath: chunk.filePath,
            content,
            startLine,
            endLine,
            chunkType: chunk.chunkType,
            name: chunk.name ? `${chunk.name}_part${subChunks.length + 1}` : undefined,
            language: chunk.language,
            context: chunk.context,
            // Re-measured on the joined text rather than summed per line.
            tokenCount: countTokens(content),
        });
    };
    // The pending sub-chunk is always the contiguous window lines[windowStart, i).
    let windowStart = 0;
    let windowTokens = 0;
    for (let i = 0; i < lines.length; i++) {
        const lineTokens = lineTokenCounts[i];
        const windowHasLines = i > windowStart;
        if (windowHasLines && windowTokens + lineTokens > maxTokens) {
            // Adding this line would overflow: flush what we have so far.
            pushSubChunk(windowStart, i, chunk.startLine + i - 1);
            // Walk backwards collecting overlap lines until the overlap budget
            // is reached, but never take so much that overlap + this line
            // would itself exceed maxTokens.
            let overlapStart = i;
            let overlapTokenCount = 0;
            while (overlapStart > windowStart && overlapTokenCount < overlapTokens) {
                const candidateTokens = lineTokenCounts[overlapStart - 1];
                if (overlapTokenCount + candidateTokens + lineTokens > maxTokens) {
                    break;
                }
                overlapTokenCount += candidateTokens;
                overlapStart--;
            }
            windowStart = overlapStart;
            windowTokens = overlapTokenCount;
        }
        // An oversized single line lands here with an empty window and is
        // kept as-is; it is flushed as soon as the next line arrives.
        windowTokens += lineTokens;
    }
    // Save final sub-chunk (the window always holds at least the last line)
    pushSubChunk(windowStart, lines.length, chunk.endLine);
    console.error(`  Split into ${subChunks.length} sub-chunks`);
    return subChunks;
}
|
|
157
|
+
/**
 * Runs every chunk through `splitLargeChunk` and returns the flattened list,
 * so that oversized chunks are replaced by their sub-chunks in place.
 *
 * @param {object[]} chunks - Chunks to check.
 * @param {number} maxTokens - Token limit passed through to the splitter.
 * @param {number} overlapTokens - Overlap budget passed through to the splitter.
 * @returns {object[]} All resulting chunks, in the original order.
 */
function enforceTokenLimits(chunks, maxTokens, overlapTokens) {
    return chunks.flatMap((candidate) => splitLargeChunk(candidate, maxTokens, overlapTokens));
}
|
|
62
168
|
/**
|
|
63
169
|
* Chunks TypeScript/JavaScript code using AST parsing
|
|
64
170
|
*/
|
|
@@ -170,8 +276,9 @@ function chunkTypeScriptJavaScript(options) {
|
|
|
170
276
|
}
|
|
171
277
|
},
|
|
172
278
|
});
|
|
173
|
-
// If no chunks were extracted
|
|
174
|
-
if (chunks.length === 0
|
|
279
|
+
// If no chunks were extracted, treat as single chunk
|
|
280
|
+
if (chunks.length === 0) {
|
|
281
|
+
const tokenCount = countTokens(options.content);
|
|
175
282
|
chunks.push({
|
|
176
283
|
id: generateChunkId(options.filePath, options.content, 1),
|
|
177
284
|
filePath: options.filePath,
|
|
@@ -181,15 +288,17 @@ function chunkTypeScriptJavaScript(options) {
|
|
|
181
288
|
chunkType: "file",
|
|
182
289
|
language: options.language,
|
|
183
290
|
context,
|
|
291
|
+
tokenCount,
|
|
184
292
|
});
|
|
185
293
|
}
|
|
186
294
|
}
|
|
187
295
|
catch (error) {
|
|
188
296
|
console.error(`AST parsing failed for ${options.filePath}, falling back to fixed chunking: ${error}`);
|
|
189
297
|
// Fallback to fixed chunking if AST parsing fails
|
|
190
|
-
return
|
|
298
|
+
return chunkByTokens(options);
|
|
191
299
|
}
|
|
192
|
-
|
|
300
|
+
// Enforce token limits on all chunks
|
|
301
|
+
return enforceTokenLimits(chunks, options.maxTokens, options.chunkOverlapTokens);
|
|
193
302
|
}
|
|
194
303
|
/**
|
|
195
304
|
* Chunks Python code using simple pattern matching
|
|
@@ -208,7 +317,7 @@ function chunkPython(options) {
|
|
|
208
317
|
for (let i = 0; i < lines.length; i++) {
|
|
209
318
|
const line = lines[i];
|
|
210
319
|
const trimmed = line.trim();
|
|
211
|
-
const indent = line.length - line.
|
|
320
|
+
const indent = line.length - line.trimStart().length;
|
|
212
321
|
// Detect function definition
|
|
213
322
|
if (trimmed.startsWith("def ")) {
|
|
214
323
|
// Save previous chunk if exists
|
|
@@ -317,45 +426,57 @@ function chunkPython(options) {
|
|
|
317
426
|
context,
|
|
318
427
|
});
|
|
319
428
|
}
|
|
320
|
-
|
|
429
|
+
// Enforce token limits on all chunks
|
|
430
|
+
return enforceTokenLimits(chunks, options.maxTokens, options.chunkOverlapTokens);
|
|
321
431
|
}
|
|
322
432
|
/**
|
|
323
|
-
* Chunks code by
|
|
433
|
+
* Chunks code by token count with overlap (replacement for chunkByFixedSize)
|
|
324
434
|
*/
|
|
325
|
-
function
|
|
435
|
+
function chunkByTokens(options) {
|
|
326
436
|
const chunks = [];
|
|
327
437
|
const lines = options.content.split("\n");
|
|
328
438
|
const context = extractContext(options.content, options.language);
|
|
329
439
|
let currentLines = [];
|
|
330
|
-
let
|
|
440
|
+
let currentTokens = 0;
|
|
331
441
|
let chunkStartLine = 1;
|
|
332
442
|
for (let i = 0; i < lines.length; i++) {
|
|
333
443
|
const line = lines[i];
|
|
334
|
-
|
|
335
|
-
|
|
336
|
-
|
|
337
|
-
if (currentSize >= options.maxChunkSize) {
|
|
444
|
+
const lineTokens = countTokens(line + "\n");
|
|
445
|
+
// If we've reached max tokens
|
|
446
|
+
if (currentTokens + lineTokens > options.maxTokens && currentLines.length > 0) {
|
|
338
447
|
const content = currentLines.join("\n");
|
|
448
|
+
const actualTokens = countTokens(content);
|
|
339
449
|
chunks.push({
|
|
340
450
|
id: generateChunkId(options.filePath, content, chunkStartLine),
|
|
341
451
|
filePath: options.filePath,
|
|
342
452
|
content,
|
|
343
453
|
startLine: chunkStartLine,
|
|
344
|
-
endLine: i
|
|
454
|
+
endLine: i,
|
|
345
455
|
chunkType: "block",
|
|
346
456
|
language: options.language,
|
|
347
457
|
context,
|
|
458
|
+
tokenCount: actualTokens,
|
|
348
459
|
});
|
|
349
|
-
// Calculate overlap
|
|
350
|
-
|
|
351
|
-
|
|
352
|
-
|
|
353
|
-
|
|
460
|
+
// Calculate overlap in lines (approximate)
|
|
461
|
+
let overlapLines = [];
|
|
462
|
+
let overlapTokenCount = 0;
|
|
463
|
+
for (let j = currentLines.length - 1; j >= 0 && overlapTokenCount < options.chunkOverlapTokens; j--) {
|
|
464
|
+
overlapLines.unshift(currentLines[j]);
|
|
465
|
+
overlapTokenCount += countTokens(currentLines[j] + "\n");
|
|
466
|
+
}
|
|
467
|
+
currentLines = [...overlapLines, line];
|
|
468
|
+
currentTokens = overlapTokenCount + lineTokens;
|
|
469
|
+
chunkStartLine = i + 1 - overlapLines.length;
|
|
470
|
+
}
|
|
471
|
+
else {
|
|
472
|
+
currentLines.push(line);
|
|
473
|
+
currentTokens += lineTokens;
|
|
354
474
|
}
|
|
355
475
|
}
|
|
356
476
|
// Add remaining content as final chunk
|
|
357
477
|
if (currentLines.length > 0) {
|
|
358
478
|
const content = currentLines.join("\n");
|
|
479
|
+
const actualTokens = countTokens(content);
|
|
359
480
|
chunks.push({
|
|
360
481
|
id: generateChunkId(options.filePath, content, chunkStartLine),
|
|
361
482
|
filePath: options.filePath,
|
|
@@ -365,10 +486,18 @@ function chunkByFixedSize(options) {
|
|
|
365
486
|
chunkType: "block",
|
|
366
487
|
language: options.language,
|
|
367
488
|
context,
|
|
489
|
+
tokenCount: actualTokens,
|
|
368
490
|
});
|
|
369
491
|
}
|
|
370
492
|
return chunks;
|
|
371
493
|
}
|
|
494
|
+
/**
 * Backwards-compatibility alias kept for callers of the old fixed-size
 * chunker; it simply forwards to the token-based implementation.
 *
 * @deprecated Use chunkByTokens instead
 * @param {object} options - Chunking options, passed through untouched.
 * @returns {object[]} Whatever chunkByTokens returns for these options.
 */
function chunkByFixedSize(options) {
    const chunks = chunkByTokens(options);
    return chunks;
}
|
|
372
501
|
/**
|
|
373
502
|
* Main chunking function - routes to appropriate strategy based on language
|
|
374
503
|
*/
|
|
@@ -377,6 +506,9 @@ export function chunkCode(options) {
|
|
|
377
506
|
filePath: options.filePath,
|
|
378
507
|
content: options.content,
|
|
379
508
|
language: options.language,
|
|
509
|
+
maxTokens: options.maxTokens || MAX_TOKENS_PER_CHUNK,
|
|
510
|
+
chunkOverlapTokens: options.chunkOverlapTokens || DEFAULT_CHUNK_OVERLAP_TOKENS,
|
|
511
|
+
// Legacy options mapping
|
|
380
512
|
maxChunkSize: options.maxChunkSize || 1000,
|
|
381
513
|
chunkOverlap: options.chunkOverlap || 200,
|
|
382
514
|
};
|
|
@@ -388,20 +520,32 @@ export function chunkCode(options) {
|
|
|
388
520
|
return chunkPython(fullOptions);
|
|
389
521
|
}
|
|
390
522
|
else {
|
|
391
|
-
// For other languages, use
|
|
392
|
-
return
|
|
523
|
+
// For other languages, use token-based chunking
|
|
524
|
+
return chunkByTokens(fullOptions);
|
|
393
525
|
}
|
|
394
526
|
}
|
|
395
527
|
/**
|
|
396
528
|
* Chunks a file by reading it from disk
|
|
397
529
|
*/
|
|
398
|
-
export function chunkFile(filePath, language,
|
|
530
|
+
export function chunkFile(filePath, language, maxTokens, chunkOverlapTokens) {
|
|
399
531
|
const content = fs.readFileSync(filePath, "utf-8");
|
|
400
532
|
return chunkCode({
|
|
401
533
|
filePath,
|
|
402
534
|
content,
|
|
403
535
|
language,
|
|
404
|
-
|
|
405
|
-
|
|
536
|
+
maxTokens,
|
|
537
|
+
chunkOverlapTokens,
|
|
406
538
|
});
|
|
407
539
|
}
|
|
540
|
+
/**
 * Reports whether `content` is small enough to be embedded as one chunk.
 *
 * @param {string} content - Text to measure.
 * @param {number} [maxTokens=MAX_TOKENS_PER_CHUNK] - Token limit to compare against.
 * @returns {boolean} True when the token count does not exceed `maxTokens`.
 */
export function wouldFitInSingleEmbedding(content, maxTokens = MAX_TOKENS_PER_CHUNK) {
    const usedTokens = countTokens(content);
    return usedTokens <= maxTokens;
}
|
|
546
|
+
/**
 * Exposes the module's per-chunk token ceiling to callers.
 *
 * @returns {number} The value of MAX_TOKENS_PER_CHUNK.
 */
export function getMaxTokensPerChunk() {
    const limit = MAX_TOKENS_PER_CHUNK;
    return limit;
}
|
|
@@ -10,48 +10,116 @@ import ignoreLib from "ignore";
|
|
|
10
10
|
const ignore = typeof ignoreLib === 'function' ? ignoreLib : ignoreLib.default;
|
|
11
11
|
// Language detection by file extension
|
|
12
12
|
const LANGUAGE_MAP = {
|
|
13
|
+
// TypeScript/JavaScript
|
|
13
14
|
".ts": "typescript",
|
|
14
15
|
".tsx": "typescript",
|
|
15
16
|
".js": "javascript",
|
|
16
17
|
".jsx": "javascript",
|
|
17
18
|
".mjs": "javascript",
|
|
18
19
|
".cjs": "javascript",
|
|
20
|
+
// Python
|
|
19
21
|
".py": "python",
|
|
22
|
+
".pyi": "python",
|
|
23
|
+
".pyw": "python",
|
|
24
|
+
// JVM Languages
|
|
20
25
|
".java": "java",
|
|
26
|
+
".kt": "kotlin",
|
|
27
|
+
".kts": "kotlin",
|
|
28
|
+
".scala": "scala",
|
|
29
|
+
".groovy": "groovy",
|
|
30
|
+
".gradle": "groovy",
|
|
31
|
+
// C/C++
|
|
21
32
|
".c": "c",
|
|
22
33
|
".cpp": "cpp",
|
|
23
34
|
".cc": "cpp",
|
|
24
35
|
".cxx": "cpp",
|
|
25
36
|
".h": "c",
|
|
26
37
|
".hpp": "cpp",
|
|
38
|
+
".hxx": "cpp",
|
|
39
|
+
// .NET
|
|
27
40
|
".cs": "csharp",
|
|
41
|
+
".fs": "fsharp",
|
|
42
|
+
".vb": "vb",
|
|
43
|
+
// Systems Languages
|
|
28
44
|
".go": "go",
|
|
29
45
|
".rs": "rust",
|
|
46
|
+
// Scripting Languages
|
|
30
47
|
".rb": "ruby",
|
|
31
48
|
".php": "php",
|
|
32
|
-
".
|
|
33
|
-
".
|
|
34
|
-
".
|
|
35
|
-
".scala": "scala",
|
|
49
|
+
".pl": "perl",
|
|
50
|
+
".pm": "perl",
|
|
51
|
+
".lua": "lua",
|
|
36
52
|
".r": "r",
|
|
37
53
|
".R": "r",
|
|
38
|
-
|
|
54
|
+
// Mobile
|
|
55
|
+
".swift": "swift",
|
|
56
|
+
".m": "objectivec",
|
|
57
|
+
".mm": "objectivec",
|
|
58
|
+
// Shell
|
|
39
59
|
".sh": "shell",
|
|
40
60
|
".bash": "shell",
|
|
41
61
|
".zsh": "shell",
|
|
42
62
|
".fish": "shell",
|
|
43
|
-
".
|
|
44
|
-
".
|
|
45
|
-
".
|
|
46
|
-
".
|
|
47
|
-
|
|
63
|
+
".ps1": "powershell",
|
|
64
|
+
".psm1": "powershell",
|
|
65
|
+
".bat": "batch",
|
|
66
|
+
".cmd": "batch",
|
|
67
|
+
// Web
|
|
48
68
|
".html": "html",
|
|
49
69
|
".htm": "html",
|
|
50
70
|
".css": "css",
|
|
51
71
|
".scss": "scss",
|
|
52
72
|
".sass": "sass",
|
|
73
|
+
".less": "less",
|
|
53
74
|
".vue": "vue",
|
|
54
75
|
".svelte": "svelte",
|
|
76
|
+
".astro": "astro",
|
|
77
|
+
// Data/Config
|
|
78
|
+
".json": "json",
|
|
79
|
+
".jsonc": "json",
|
|
80
|
+
".json5": "json",
|
|
81
|
+
".yaml": "yaml",
|
|
82
|
+
".yml": "yaml",
|
|
83
|
+
".toml": "toml",
|
|
84
|
+
".xml": "xml",
|
|
85
|
+
".ini": "ini",
|
|
86
|
+
".cfg": "ini",
|
|
87
|
+
".conf": "ini",
|
|
88
|
+
".properties": "properties",
|
|
89
|
+
".env": "dotenv",
|
|
90
|
+
".env.local": "dotenv",
|
|
91
|
+
".env.example": "dotenv",
|
|
92
|
+
// Documentation
|
|
93
|
+
".md": "markdown",
|
|
94
|
+
".mdx": "markdown",
|
|
95
|
+
".rst": "rst",
|
|
96
|
+
".txt": "text",
|
|
97
|
+
// Database
|
|
98
|
+
".sql": "sql",
|
|
99
|
+
".prisma": "prisma",
|
|
100
|
+
".graphql": "graphql",
|
|
101
|
+
".gql": "graphql",
|
|
102
|
+
// Other
|
|
103
|
+
".dockerfile": "dockerfile",
|
|
104
|
+
".tf": "terraform",
|
|
105
|
+
".hcl": "hcl",
|
|
106
|
+
".proto": "protobuf",
|
|
107
|
+
".sol": "solidity",
|
|
108
|
+
".zig": "zig",
|
|
109
|
+
".nim": "nim",
|
|
110
|
+
".ex": "elixir",
|
|
111
|
+
".exs": "elixir",
|
|
112
|
+
".erl": "erlang",
|
|
113
|
+
".hrl": "erlang",
|
|
114
|
+
".clj": "clojure",
|
|
115
|
+
".cljs": "clojure",
|
|
116
|
+
".cljc": "clojure",
|
|
117
|
+
".hs": "haskell",
|
|
118
|
+
".elm": "elm",
|
|
119
|
+
".dart": "dart",
|
|
120
|
+
".v": "v",
|
|
121
|
+
".asm": "assembly",
|
|
122
|
+
".s": "assembly",
|
|
55
123
|
};
|
|
56
124
|
// Binary file extensions to skip
|
|
57
125
|
const BINARY_EXTENSIONS = new Set([
|
|
@@ -132,8 +200,24 @@ export function isCodeFile(filePath) {
|
|
|
132
200
|
// Additional checks for files without extension or special cases
|
|
133
201
|
const basename = path.basename(filePath);
|
|
134
202
|
const codeFileNames = new Set([
|
|
203
|
+
// Build/DevOps
|
|
135
204
|
"Makefile", "Dockerfile", "Jenkinsfile", "Vagrantfile",
|
|
136
205
|
"Rakefile", "Gemfile", "Podfile", "Fastfile",
|
|
206
|
+
"CMakeLists.txt", "meson.build", "BUILD", "WORKSPACE",
|
|
207
|
+
// Config files
|
|
208
|
+
".gitignore", ".gitattributes", ".dockerignore",
|
|
209
|
+
".editorconfig", ".prettierrc", ".eslintrc",
|
|
210
|
+
".babelrc", ".browserslistrc",
|
|
211
|
+
"tsconfig.json", "jsconfig.json", "package.json",
|
|
212
|
+
"angular.json", "nest-cli.json", "nx.json",
|
|
213
|
+
"webpack.config.js", "vite.config.js", "rollup.config.js",
|
|
214
|
+
// CI/CD
|
|
215
|
+
".gitlab-ci.yml", ".travis.yml", "azure-pipelines.yml",
|
|
216
|
+
"bitbucket-pipelines.yml", "cloudbuild.yaml",
|
|
217
|
+
// K8s/Helm
|
|
218
|
+
"Chart.yaml", "values.yaml", "kustomization.yaml",
|
|
219
|
+
// Lock files (optional - might want to skip these)
|
|
220
|
+
// "package-lock.json", "yarn.lock", "pnpm-lock.yaml",
|
|
137
221
|
]);
|
|
138
222
|
return codeFileNames.has(basename);
|
|
139
223
|
}
|