gitnexus 1.6.3-rc.21 → 1.6.3-rc.23
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cli/ai-context.js +1 -1
- package/dist/cli/group.js +73 -0
- package/dist/core/embeddings/chunker.js +30 -25
- package/dist/core/embeddings/embedding-pipeline.d.ts +6 -0
- package/dist/core/embeddings/embedding-pipeline.js +15 -6
- package/dist/core/embeddings/text-generator.d.ts +1 -1
- package/dist/core/embeddings/text-generator.js +33 -24
- package/dist/core/embeddings/types.d.ts +43 -1
- package/dist/core/embeddings/types.js +101 -29
- package/dist/core/group/cross-impact.d.ts +41 -0
- package/dist/core/group/cross-impact.js +454 -0
- package/dist/core/group/group-path-utils.d.ts +17 -0
- package/dist/core/group/group-path-utils.js +40 -0
- package/dist/core/group/resolve-at-member.d.ts +10 -0
- package/dist/core/group/resolve-at-member.js +31 -0
- package/dist/core/group/service.d.ts +9 -0
- package/dist/core/group/service.js +219 -20
- package/dist/core/group/types.d.ts +30 -0
- package/dist/core/lbug/lbug-adapter.d.ts +12 -0
- package/dist/core/lbug/lbug-adapter.js +30 -0
- package/dist/core/run-analyze.js +7 -12
- package/dist/core/search/bm25-index.d.ts +6 -0
- package/dist/core/search/bm25-index.js +54 -2
- package/dist/mcp/local/local-backend.d.ts +18 -3
- package/dist/mcp/local/local-backend.js +141 -15
- package/dist/mcp/resources.d.ts +31 -0
- package/dist/mcp/resources.js +100 -17
- package/dist/mcp/tools.d.ts +4 -1
- package/dist/mcp/tools.js +75 -54
- package/package.json +1 -1
package/dist/cli/ai-context.js
CHANGED
|
@@ -89,7 +89,7 @@ This project is indexed by GitNexus as **${projectName}**${noStats ? '' : ` (${s
|
|
|
89
89
|
${groupNames && groupNames.length > 0
|
|
90
90
|
? `## Cross-Repo Groups
|
|
91
91
|
|
|
92
|
-
This repository is listed under GitNexus **group(s): ${groupNames.join(', ')}** (see \`~/.gitnexus/groups/\`). For
|
|
92
|
+
This repository is listed under GitNexus **group(s): ${groupNames.join(', ')}** (see \`~/.gitnexus/groups/\`). For cross-repo analysis, use MCP tools \`impact\`, \`query\`, and \`context\` with \`repo\` set to \`@<groupName>\` or \`@<groupName>/<memberPath>\` (paths match keys in that group’s \`group.yaml\`). Use \`group_list\` / \`group_sync\` for membership and sync. From the terminal: \`npx gitnexus group list\`, \`npx gitnexus group sync <name>\`, \`npx gitnexus group impact <name> --target <symbol> --repo <group-path>\`.
|
|
93
93
|
|
|
94
94
|
`
|
|
95
95
|
: ''}## CLI
|
package/dist/cli/group.js
CHANGED
|
@@ -149,6 +149,79 @@ export function registerGroupCommands(program) {
|
|
|
149
149
|
console.log(`\nWrote contracts.json (${result.contracts.length} contracts, ${result.crossLinks.length} cross-links)`);
|
|
150
150
|
}
|
|
151
151
|
});
|
|
152
|
+
group
|
|
153
|
+
.command('impact <name>')
|
|
154
|
+
.description('Cross-repo impact for a symbol in one member repo of a group')
|
|
155
|
+
.requiredOption('--target <symbol>', 'Symbol or file name to analyze')
|
|
156
|
+
.requiredOption('--repo <groupPath>', 'Member path from group.yaml (e.g. app/backend), not the indexed repo name')
|
|
157
|
+
.option('--direction <dir>', 'upstream or downstream', 'upstream')
|
|
158
|
+
.option('--service <path>', 'Optional monorepo service directory prefix (path filter)')
|
|
159
|
+
.option('--subgroup <path>', 'Optional prefix limiting which group repos participate in cross fan-out')
|
|
160
|
+
.option('--max-depth <n>', 'Max graph traversal depth')
|
|
161
|
+
.option('--cross-depth <n>', 'Cross-repository hop depth')
|
|
162
|
+
.option('--min-confidence <n>', 'Minimum relation confidence (0–1)')
|
|
163
|
+
.option('--include-tests', 'Include test files in traversal', false)
|
|
164
|
+
.option('--timeout-ms <n>', 'Phase-1 local impact wall time in milliseconds')
|
|
165
|
+
.option('--json', 'JSON output')
|
|
166
|
+
.action(async (name, opts) => {
|
|
167
|
+
const { LocalBackend } = await import('../mcp/local/local-backend.js');
|
|
168
|
+
const backend = new LocalBackend();
|
|
169
|
+
try {
|
|
170
|
+
await backend.init();
|
|
171
|
+
const payload = {
|
|
172
|
+
name,
|
|
173
|
+
repo: opts.repo,
|
|
174
|
+
target: opts.target,
|
|
175
|
+
direction: opts.direction || 'upstream',
|
|
176
|
+
};
|
|
177
|
+
if (opts.service)
|
|
178
|
+
payload.service = opts.service;
|
|
179
|
+
if (opts.subgroup)
|
|
180
|
+
payload.subgroup = opts.subgroup;
|
|
181
|
+
if (opts.maxDepth !== undefined && opts.maxDepth !== '') {
|
|
182
|
+
const n = parseInt(String(opts.maxDepth), 10);
|
|
183
|
+
if (!Number.isNaN(n))
|
|
184
|
+
payload.maxDepth = n;
|
|
185
|
+
}
|
|
186
|
+
if (opts.crossDepth !== undefined && opts.crossDepth !== '') {
|
|
187
|
+
const n = parseInt(String(opts.crossDepth), 10);
|
|
188
|
+
if (!Number.isNaN(n))
|
|
189
|
+
payload.crossDepth = n;
|
|
190
|
+
}
|
|
191
|
+
if (opts.minConfidence !== undefined && opts.minConfidence !== '') {
|
|
192
|
+
const n = parseFloat(String(opts.minConfidence));
|
|
193
|
+
if (!Number.isNaN(n))
|
|
194
|
+
payload.minConfidence = n;
|
|
195
|
+
}
|
|
196
|
+
if (opts.timeoutMs !== undefined && opts.timeoutMs !== '') {
|
|
197
|
+
const n = parseInt(String(opts.timeoutMs), 10);
|
|
198
|
+
if (!Number.isNaN(n))
|
|
199
|
+
payload.timeoutMs = n;
|
|
200
|
+
}
|
|
201
|
+
if (opts.includeTests)
|
|
202
|
+
payload.includeTests = true;
|
|
203
|
+
const raw = await backend.getGroupService().groupImpact(payload);
|
|
204
|
+
if (raw && typeof raw === 'object' && 'error' in raw) {
|
|
205
|
+
console.error(String(raw.error));
|
|
206
|
+
process.exitCode = 1;
|
|
207
|
+
return;
|
|
208
|
+
}
|
|
209
|
+
if (opts.json) {
|
|
210
|
+
console.log(JSON.stringify(raw, null, 2));
|
|
211
|
+
}
|
|
212
|
+
else {
|
|
213
|
+
const summary = raw?.summary;
|
|
214
|
+
const risk = raw?.risk;
|
|
215
|
+
console.log(`Group impact for "${name}" (${String(opts.repo)}): risk=${risk ?? '?'}`);
|
|
216
|
+
if (summary) {
|
|
217
|
+
console.log(` direct=${summary.direct ?? 0} processes=${summary.processes_affected ?? 0} cross=${summary.cross_repo_hits ?? 0}`);
|
|
218
|
+
}
|
|
219
|
+
}
|
|
220
|
+
}
|
|
221
|
+
finally {
|
|
222
|
+
await backend.dispose().catch(() => { });
|
|
223
|
+
}
|
|
224
|
+
});
|
|
152
225
|
group
|
|
153
226
|
.command('query <name> <query>')
|
|
154
227
|
.description('Search execution flows across all repos in a group')
|
|
package/dist/core/embeddings/chunker.js
CHANGED
|
@@ -10,6 +10,7 @@ export { characterChunk } from './character-chunk.js';
|
|
|
10
10
|
import { characterChunk } from './character-chunk.js';
|
|
11
11
|
import { ensureAndParse, findDeclarationNode, findFunctionNode } from './ast-utils.js';
|
|
12
12
|
import { buildLineIndex, resolveChunkLines } from './line-index.js';
|
|
13
|
+
import { CHUNKING_RULES, CHUNK_MODE_AST_DECLARATION, CHUNK_MODE_AST_FUNCTION, } from './types.js';
|
|
13
14
|
/**
|
|
14
15
|
* Main chunkNode function: dispatches by label
|
|
15
16
|
*/
|
|
@@ -27,26 +28,24 @@ export const chunkNode = async (label, content, filePath, startLine, endLine, ch
|
|
|
27
28
|
},
|
|
28
29
|
];
|
|
29
30
|
}
|
|
30
|
-
|
|
31
|
-
if (
|
|
32
|
-
|
|
33
|
-
|
|
31
|
+
const rule = CHUNKING_RULES[label];
|
|
32
|
+
if (!rule) {
|
|
33
|
+
return characterChunk(content, startLine, endLine, chunkSize, overlap);
|
|
34
|
+
}
|
|
35
|
+
try {
|
|
36
|
+
if (rule.mode === CHUNK_MODE_AST_FUNCTION) {
|
|
37
|
+
const astChunks = await astChunk(content, filePath, startLine, endLine, chunkSize, overlap, rule);
|
|
34
38
|
if (astChunks.length > 0)
|
|
35
39
|
return astChunks;
|
|
36
40
|
}
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
}
|
|
40
|
-
}
|
|
41
|
-
if (label === 'Class' || label === 'Interface') {
|
|
42
|
-
try {
|
|
43
|
-
const declarationChunks = await declarationChunk(label, content, filePath, startLine, endLine, chunkSize, overlap);
|
|
41
|
+
if (rule.mode === CHUNK_MODE_AST_DECLARATION) {
|
|
42
|
+
const declarationChunks = await declarationChunk(content, filePath, startLine, endLine, chunkSize, overlap, rule);
|
|
44
43
|
if (declarationChunks.length > 0)
|
|
45
44
|
return declarationChunks;
|
|
46
45
|
}
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
46
|
+
}
|
|
47
|
+
catch {
|
|
48
|
+
// AST parsing failed — fall through to character fallback
|
|
50
49
|
}
|
|
51
50
|
// Character-based fallback for everything else
|
|
52
51
|
return characterChunk(content, startLine, endLine, chunkSize, overlap);
|
|
@@ -56,7 +55,7 @@ export const chunkNode = async (label, content, filePath, startLine, endLine, ch
|
|
|
56
55
|
* Parse snippet content, locate the function declaration node,
|
|
57
56
|
* split body by statement boundaries.
|
|
58
57
|
*/
|
|
59
|
-
const astChunk = async (content, filePath, startLine, endLine, chunkSize, overlap) => {
|
|
58
|
+
const astChunk = async (content, filePath, startLine, endLine, chunkSize, overlap, rule) => {
|
|
60
59
|
const tree = await ensureAndParse(content, filePath);
|
|
61
60
|
if (!tree)
|
|
62
61
|
return [];
|
|
@@ -84,7 +83,7 @@ const astChunk = async (content, filePath, startLine, endLine, chunkSize, overla
|
|
|
84
83
|
}
|
|
85
84
|
if (statements.length === 0)
|
|
86
85
|
return [];
|
|
87
|
-
return chunkByUnits(content, lineOffsets, startLine, chunkSize, overlap, statements, targetNode.startIndex, targetNode.endIndex,
|
|
86
|
+
return chunkByUnits(content, lineOffsets, startLine, chunkSize, overlap, statements, targetNode.startIndex, targetNode.endIndex, rule.includePrefix, rule.includeSuffix);
|
|
88
87
|
};
|
|
89
88
|
const DECLARATION_BODY_NODE_TYPES = new Set([
|
|
90
89
|
'class_body',
|
|
@@ -102,7 +101,7 @@ const FIELD_LIKE_MEMBER_TYPES = new Set([
|
|
|
102
101
|
'pair',
|
|
103
102
|
'enum_assignment',
|
|
104
103
|
]);
|
|
105
|
-
const declarationChunk = async (
|
|
104
|
+
const declarationChunk = async (content, filePath, startLine, endLine, chunkSize, overlap, rule) => {
|
|
106
105
|
const tree = await ensureAndParse(content, filePath);
|
|
107
106
|
if (!tree)
|
|
108
107
|
return [];
|
|
@@ -112,10 +111,10 @@ const declarationChunk = async (label, content, filePath, startLine, endLine, ch
|
|
|
112
111
|
const bodyNode = getDeclarationBodyNode(targetNode);
|
|
113
112
|
if (!bodyNode)
|
|
114
113
|
return [];
|
|
115
|
-
const members = collectDeclarationUnits(bodyNode,
|
|
114
|
+
const members = collectDeclarationUnits(bodyNode, rule.groupFields);
|
|
116
115
|
if (members.length === 0)
|
|
117
116
|
return [];
|
|
118
|
-
return chunkByUnits(content, buildLineIndex(content), startLine, chunkSize, overlap, members, targetNode.startIndex, targetNode.endIndex,
|
|
117
|
+
return chunkByUnits(content, buildLineIndex(content), startLine, chunkSize, overlap, members, targetNode.startIndex, targetNode.endIndex, rule.includePrefix, rule.includeSuffix);
|
|
119
118
|
};
|
|
120
119
|
const buildChunk = (content, lineOffsets, chunkIndex, startOffset, endOffset, baseStartLine) => {
|
|
121
120
|
const lineRange = resolveChunkLines(lineOffsets, startOffset, endOffset, baseStartLine);
|
|
@@ -150,12 +149,18 @@ const chunkByUnits = (content, lineOffsets, baseStartLine, chunkSize, overlap, u
|
|
|
150
149
|
}
|
|
151
150
|
if (candidateEndOffset - chunkStartOffset > chunkSize) {
|
|
152
151
|
const oversizedUnit = units[chunkStartUnitIdx];
|
|
153
|
-
const
|
|
154
|
-
|
|
152
|
+
const oversizedStartOffset = chunkStartUnitIdx === 0 && includeContainerPrefixOnFirstChunk
|
|
153
|
+
? containerStartOffset
|
|
154
|
+
: oversizedUnit.startIndex;
|
|
155
|
+
const oversizedEndOffset = chunkStartUnitIdx === units.length - 1 && includeContainerSuffixOnLastChunk
|
|
156
|
+
? containerEndOffset
|
|
157
|
+
: oversizedUnit.endIndex;
|
|
158
|
+
const oversizedLineRange = resolveChunkLines(lineOffsets, oversizedStartOffset, oversizedEndOffset, baseStartLine);
|
|
159
|
+
const oversizedChunks = characterChunk(content.slice(oversizedStartOffset, oversizedEndOffset), oversizedLineRange.startLine, oversizedLineRange.endLine, chunkSize, overlap).map((chunk, offsetIdx) => ({
|
|
155
160
|
...chunk,
|
|
156
161
|
chunkIndex: chunks.length + offsetIdx,
|
|
157
|
-
startOffset: chunk.startOffset +
|
|
158
|
-
endOffset: chunk.endOffset +
|
|
162
|
+
startOffset: chunk.startOffset + oversizedStartOffset,
|
|
163
|
+
endOffset: chunk.endOffset + oversizedStartOffset,
|
|
159
164
|
}));
|
|
160
165
|
chunks.push(...oversizedChunks);
|
|
161
166
|
chunkStartUnitIdx += 1;
|
|
@@ -200,7 +205,7 @@ const getDeclarationBodyNode = (node) => {
|
|
|
200
205
|
}
|
|
201
206
|
return null;
|
|
202
207
|
};
|
|
203
|
-
const collectDeclarationUnits = (bodyNode,
|
|
208
|
+
const collectDeclarationUnits = (bodyNode, groupFields) => {
|
|
204
209
|
const members = [];
|
|
205
210
|
for (let i = 0; i < bodyNode.namedChildCount; i++) {
|
|
206
211
|
const child = bodyNode.namedChild(i);
|
|
@@ -209,7 +214,7 @@ const collectDeclarationUnits = (bodyNode, label) => {
|
|
|
209
214
|
members.push({
|
|
210
215
|
startIndex: child.startIndex,
|
|
211
216
|
endIndex: child.endIndex,
|
|
212
|
-
groupable:
|
|
217
|
+
groupable: groupFields && FIELD_LIKE_MEMBER_TYPES.has(child.type),
|
|
213
218
|
});
|
|
214
219
|
}
|
|
215
220
|
if (members.length === 0)
|
|
package/dist/core/embeddings/embedding-pipeline.d.ts
CHANGED
|
@@ -9,6 +9,12 @@
|
|
|
9
9
|
* 5. Create vector index for semantic search
|
|
10
10
|
*/
|
|
11
11
|
import { type EmbeddingProgress, type EmbeddingConfig, type EmbeddableNode, type SemanticSearchResult, type EmbeddingContext } from './types.js';
|
|
12
|
+
/**
|
|
13
|
+
* Bump this when the embedding text template changes in a way that should
|
|
14
|
+
* invalidate existing vectors, such as metadata/header shape changes,
|
|
15
|
+
* structural container context changes, or preceding-context formatting rules.
|
|
16
|
+
*/
|
|
17
|
+
export declare const EMBEDDING_TEXT_VERSION = "v2";
|
|
12
18
|
/**
|
|
13
19
|
* Compute a stable content fingerprint for an embeddable node.
|
|
14
20
|
* Used to detect when the underlying text has changed so stale vectors
|
|
package/dist/core/embeddings/embedding-pipeline.js
CHANGED
|
@@ -13,10 +13,16 @@ import { initEmbedder, embedBatch, embedText, embeddingToArray, isEmbedderReady,
|
|
|
13
13
|
import { generateEmbeddingText } from './text-generator.js';
|
|
14
14
|
import { chunkNode, characterChunk } from './chunker.js';
|
|
15
15
|
import { extractStructuralNames } from './structural-extractor.js';
|
|
16
|
-
import { DEFAULT_EMBEDDING_CONFIG, EMBEDDABLE_LABELS, isShortLabel, LABELS_WITH_EXPORTED, STRUCTURAL_LABELS, collectBestChunks, } from './types.js';
|
|
16
|
+
import { DEFAULT_EMBEDDING_CONFIG, EMBEDDABLE_LABELS, isShortLabel, LABEL_METHOD, LABELS_WITH_EXPORTED, STRUCTURAL_LABELS, collectBestChunks, } from './types.js';
|
|
17
17
|
import { EMBEDDING_TABLE_NAME, EMBEDDING_INDEX_NAME, CREATE_VECTOR_INDEX_QUERY, STALE_HASH_SENTINEL, } from '../lbug/schema.js';
|
|
18
18
|
import { loadVectorExtension } from '../lbug/lbug-adapter.js';
|
|
19
19
|
const isDev = process.env.NODE_ENV === 'development';
|
|
20
|
+
/**
|
|
21
|
+
* Bump this when the embedding text template changes in a way that should
|
|
22
|
+
* invalidate existing vectors, such as metadata/header shape changes,
|
|
23
|
+
* structural container context changes, or preceding-context formatting rules.
|
|
24
|
+
*/
|
|
25
|
+
export const EMBEDDING_TEXT_VERSION = 'v2';
|
|
20
26
|
/**
|
|
21
27
|
* Compute a stable content fingerprint for an embeddable node.
|
|
22
28
|
* Used to detect when the underlying text has changed so stale vectors
|
|
@@ -27,8 +33,9 @@ export const contentHashForNode = (node, config = {}) => {
|
|
|
27
33
|
// Hash must be deterministic across runs, so exclude methodNames/fieldNames
|
|
28
34
|
// which are populated during the batch loop via AST extraction.
|
|
29
35
|
// Using only node.content ensures the hash stays stable.
|
|
36
|
+
// NOTE: A change to extractStructuralNames behavior requires bumping EMBEDDING_TEXT_VERSION.
|
|
30
37
|
const text = generateEmbeddingText({ ...node, methodNames: undefined, fieldNames: undefined }, node.content, config);
|
|
31
|
-
return createHash('sha1').update(text).digest('hex');
|
|
38
|
+
return createHash('sha1').update(EMBEDDING_TEXT_VERSION).update('\n').update(text).digest('hex');
|
|
32
39
|
};
|
|
33
40
|
/**
|
|
34
41
|
* Query all embeddable nodes from LadybugDB
|
|
@@ -39,7 +46,7 @@ const queryEmbeddableNodes = async (executeQuery) => {
|
|
|
39
46
|
for (const label of EMBEDDABLE_LABELS) {
|
|
40
47
|
try {
|
|
41
48
|
let query;
|
|
42
|
-
if (label ===
|
|
49
|
+
if (label === LABEL_METHOD) {
|
|
43
50
|
// Method has parameterCount and returnType
|
|
44
51
|
query = `
|
|
45
52
|
MATCH (n:Method)
|
|
@@ -72,7 +79,7 @@ const queryEmbeddableNodes = async (executeQuery) => {
|
|
|
72
79
|
}
|
|
73
80
|
const rows = await executeQuery(query);
|
|
74
81
|
for (const row of rows) {
|
|
75
|
-
const hasExportedColumn = label ===
|
|
82
|
+
const hasExportedColumn = label === LABEL_METHOD || LABELS_WITH_EXPORTED.has(label);
|
|
76
83
|
allNodes.push({
|
|
77
84
|
id: row.id ?? row[0],
|
|
78
85
|
name: row.name ?? row[1],
|
|
@@ -83,7 +90,7 @@ const queryEmbeddableNodes = async (executeQuery) => {
|
|
|
83
90
|
endLine: row.endLine ?? row[6],
|
|
84
91
|
isExported: hasExportedColumn ? (row.isExported ?? row[7]) : undefined,
|
|
85
92
|
description: row.description ?? (hasExportedColumn ? row[8] : row[7]),
|
|
86
|
-
...(label ===
|
|
93
|
+
...(label === LABEL_METHOD
|
|
87
94
|
? {
|
|
88
95
|
parameterCount: row.parameterCount ?? row[9],
|
|
89
96
|
returnType: row.returnType ?? row[10],
|
|
@@ -301,8 +308,9 @@ export const runEmbeddingPipeline = async (executeQuery, executeWithReusedStatem
|
|
|
301
308
|
chunks = characterChunk(node.content, startLine, endLine, chunkSize, overlap);
|
|
302
309
|
}
|
|
303
310
|
}
|
|
311
|
+
let prevTail = '';
|
|
304
312
|
for (const chunk of chunks) {
|
|
305
|
-
const text = generateEmbeddingText(node, chunk.text, finalConfig);
|
|
313
|
+
const text = generateEmbeddingText(node, chunk.text, finalConfig, chunk.chunkIndex, prevTail);
|
|
306
314
|
allTexts.push(text);
|
|
307
315
|
allUpdates.push({
|
|
308
316
|
nodeId: node.id,
|
|
@@ -311,6 +319,7 @@ export const runEmbeddingPipeline = async (executeQuery, executeWithReusedStatem
|
|
|
311
319
|
endLine: chunk.endLine,
|
|
312
320
|
contentHash: hash,
|
|
313
321
|
});
|
|
322
|
+
prevTail = overlap > 0 ? chunk.text.slice(-overlap) : '';
|
|
314
323
|
}
|
|
315
324
|
}
|
|
316
325
|
// Embed chunk texts in sub-batches to control memory
|
|
package/dist/core/embeddings/text-generator.d.ts
CHANGED
|
@@ -24,7 +24,7 @@ export declare const extractDeclarationOnly: (content: string) => string;
|
|
|
24
24
|
* Generate embedding text for any embeddable node
|
|
25
25
|
* Dispatches to the appropriate generator based on node label
|
|
26
26
|
*/
|
|
27
|
-
export declare const generateEmbeddingText: (node: EmbeddableNode, codeBody: string, config?: Partial<EmbeddingConfig
|
|
27
|
+
export declare const generateEmbeddingText: (node: EmbeddableNode, codeBody: string, config?: Partial<EmbeddingConfig>, chunkIndex?: number, prevTail?: string) => string;
|
|
28
28
|
/**
|
|
29
29
|
* Export truncation helper for testing
|
|
30
30
|
*/
|
|
package/dist/core/embeddings/text-generator.js
CHANGED
|
@@ -8,7 +8,7 @@
|
|
|
8
8
|
* Method/field names for Class nodes are extracted by the ingestion
|
|
9
9
|
* pipeline's AST extractors and passed via node.methodNames/node.fieldNames.
|
|
10
10
|
*/
|
|
11
|
-
import { DEFAULT_EMBEDDING_CONFIG, isShortLabel } from './types.js';
|
|
11
|
+
import { CHUNKING_RULES, DEFAULT_EMBEDDING_CONFIG, STRUCTURAL_TEXT_MODE_DECLARATION, isShortLabel, } from './types.js';
|
|
12
12
|
/**
|
|
13
13
|
* Truncate description to max length at sentence/word boundary
|
|
14
14
|
*/
|
|
@@ -71,34 +71,45 @@ const buildMetadataHeader = (node, config) => {
|
|
|
71
71
|
}
|
|
72
72
|
return parts.join('\n');
|
|
73
73
|
};
|
|
74
|
-
const generateCodeBodyText = (node, codeBody, config) => {
|
|
74
|
+
const generateCodeBodyText = (node, codeBody, config, prevTail) => {
|
|
75
75
|
const header = buildMetadataHeader(node, config);
|
|
76
|
-
const
|
|
77
|
-
|
|
76
|
+
const parts = [header];
|
|
77
|
+
if (prevTail) {
|
|
78
|
+
parts.push(`[preceding context]: ...${cleanContent(prevTail)}`);
|
|
79
|
+
}
|
|
80
|
+
parts.push('', cleanContent(codeBody));
|
|
81
|
+
return parts.join('\n');
|
|
78
82
|
};
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
const generateClassText = (node, codeBody, config) => {
|
|
85
|
-
return generateStructuralTypeText(node, codeBody, config);
|
|
83
|
+
const getCompactContainerContext = (cleanedContent, declarationOnly) => {
|
|
84
|
+
const source = declarationOnly || cleanedContent;
|
|
85
|
+
const nlIdx = source.indexOf('\n');
|
|
86
|
+
const firstLine = (nlIdx === -1 ? source : source.substring(0, nlIdx)).trim();
|
|
87
|
+
return firstLine ? `Container: ${firstLine}` : undefined;
|
|
86
88
|
};
|
|
87
|
-
const generateStructuralTypeText = (node, codeBody, config) => {
|
|
89
|
+
const generateStructuralTypeText = (node, codeBody, config, chunkIndex, prevTail) => {
|
|
88
90
|
const header = buildMetadataHeader(node, config);
|
|
89
91
|
const parts = [header];
|
|
90
|
-
|
|
92
|
+
const isFirstChunk = chunkIndex === undefined || chunkIndex === 0;
|
|
93
|
+
const cleanedContent = cleanContent(node.content);
|
|
94
|
+
const declarationOnly = extractDeclarationOnly(cleanedContent);
|
|
95
|
+
const compactContainerContext = getCompactContainerContext(cleanedContent, declarationOnly);
|
|
96
|
+
if (compactContainerContext) {
|
|
97
|
+
parts.push(compactContainerContext);
|
|
98
|
+
}
|
|
99
|
+
if (prevTail) {
|
|
100
|
+
parts.push(`[preceding context]: ...${cleanContent(prevTail)}`);
|
|
101
|
+
}
|
|
102
|
+
if (isFirstChunk && node.methodNames?.length) {
|
|
91
103
|
parts.push(`Methods: ${node.methodNames.join(', ')}`);
|
|
92
104
|
}
|
|
93
|
-
if (node.fieldNames?.length) {
|
|
105
|
+
if (isFirstChunk && node.fieldNames?.length) {
|
|
94
106
|
parts.push(`Properties: ${node.fieldNames.join(', ')}`);
|
|
95
107
|
}
|
|
96
|
-
|
|
97
|
-
if (declarationOnly) {
|
|
108
|
+
if (isFirstChunk && declarationOnly) {
|
|
98
109
|
parts.push('', declarationOnly);
|
|
99
110
|
}
|
|
100
111
|
const cleanedChunk = cleanContent(codeBody);
|
|
101
|
-
if (cleanedChunk && cleanedChunk !==
|
|
112
|
+
if (cleanedChunk && cleanedChunk !== cleanedContent) {
|
|
102
113
|
parts.push('', cleanedChunk);
|
|
103
114
|
}
|
|
104
115
|
return parts.join('\n');
|
|
@@ -179,19 +190,17 @@ export const extractDeclarationOnly = (content) => {
|
|
|
179
190
|
* Generate embedding text for any embeddable node
|
|
180
191
|
* Dispatches to the appropriate generator based on node label
|
|
181
192
|
*/
|
|
182
|
-
export const generateEmbeddingText = (node, codeBody, config = {}) => {
|
|
193
|
+
export const generateEmbeddingText = (node, codeBody, config = {}, chunkIndex, prevTail) => {
|
|
183
194
|
if (isShortLabel(node.label)) {
|
|
184
195
|
const header = buildMetadataHeader(node, config);
|
|
185
196
|
const cleaned = cleanContent(node.content);
|
|
186
197
|
return `${header}\n\n${cleaned}`;
|
|
187
198
|
}
|
|
188
|
-
|
|
189
|
-
|
|
190
|
-
|
|
191
|
-
if (node.label === 'Interface') {
|
|
192
|
-
return generateStructuralTypeText(node, codeBody, config);
|
|
199
|
+
const chunkingRule = CHUNKING_RULES[node.label];
|
|
200
|
+
if (chunkingRule?.structuralTextMode === STRUCTURAL_TEXT_MODE_DECLARATION) {
|
|
201
|
+
return generateStructuralTypeText(node, codeBody, config, chunkIndex, prevTail);
|
|
193
202
|
}
|
|
194
|
-
return generateCodeBodyText(node, codeBody, config);
|
|
203
|
+
return generateCodeBodyText(node, codeBody, config, prevTail);
|
|
195
204
|
};
|
|
196
205
|
/**
|
|
197
206
|
* Export truncation helper for testing
|
|
package/dist/core/embeddings/types.d.ts
CHANGED
|
@@ -3,6 +3,38 @@
|
|
|
3
3
|
*
|
|
4
4
|
* Type definitions for the embedding generation and semantic search system.
|
|
5
5
|
*/
|
|
6
|
+
export declare const LABEL_FUNCTION: "Function";
|
|
7
|
+
export declare const LABEL_METHOD: "Method";
|
|
8
|
+
export declare const LABEL_CONSTRUCTOR: "Constructor";
|
|
9
|
+
export declare const LABEL_CLASS: "Class";
|
|
10
|
+
export declare const LABEL_INTERFACE: "Interface";
|
|
11
|
+
export declare const LABEL_STRUCT: "Struct";
|
|
12
|
+
export declare const LABEL_ENUM: "Enum";
|
|
13
|
+
export declare const LABEL_TRAIT: "Trait";
|
|
14
|
+
export declare const LABEL_IMPL: "Impl";
|
|
15
|
+
export declare const LABEL_MACRO: "Macro";
|
|
16
|
+
export declare const LABEL_NAMESPACE: "Namespace";
|
|
17
|
+
export declare const LABEL_TYPE_ALIAS: "TypeAlias";
|
|
18
|
+
export declare const LABEL_TYPEDEF: "Typedef";
|
|
19
|
+
export declare const LABEL_CONST: "Const";
|
|
20
|
+
export declare const LABEL_PROPERTY: "Property";
|
|
21
|
+
export declare const LABEL_RECORD: "Record";
|
|
22
|
+
export declare const LABEL_UNION: "Union";
|
|
23
|
+
export declare const LABEL_STATIC: "Static";
|
|
24
|
+
export declare const LABEL_VARIABLE: "Variable";
|
|
25
|
+
export declare const LABEL_CODE_ELEMENT: "CodeElement";
|
|
26
|
+
export declare const CHUNK_MODE_AST_FUNCTION: "ast-function";
|
|
27
|
+
export declare const CHUNK_MODE_AST_DECLARATION: "ast-declaration";
|
|
28
|
+
export declare const CHUNK_MODE_CHARACTER: "character";
|
|
29
|
+
export declare const STRUCTURAL_TEXT_MODE_NONE: "none";
|
|
30
|
+
export declare const STRUCTURAL_TEXT_MODE_DECLARATION: "declaration";
|
|
31
|
+
export interface ChunkingRule {
|
|
32
|
+
mode: typeof CHUNK_MODE_AST_FUNCTION | typeof CHUNK_MODE_AST_DECLARATION | typeof CHUNK_MODE_CHARACTER;
|
|
33
|
+
includePrefix: boolean;
|
|
34
|
+
includeSuffix: boolean;
|
|
35
|
+
groupFields: boolean;
|
|
36
|
+
structuralTextMode: typeof STRUCTURAL_TEXT_MODE_NONE | typeof STRUCTURAL_TEXT_MODE_DECLARATION;
|
|
37
|
+
}
|
|
6
38
|
/**
|
|
7
39
|
* Node labels that need chunking (have code body, potentially long)
|
|
8
40
|
*/
|
|
@@ -29,13 +61,22 @@ export declare const isChunkableLabel: (label: string) => boolean;
|
|
|
29
61
|
*/
|
|
30
62
|
export declare const isShortLabel: (label: string) => boolean;
|
|
31
63
|
/**
|
|
32
|
-
* Node labels that have structural names (methods/fields) extractable via AST
|
|
64
|
+
* Node labels that have structural names (methods/fields) extractable via AST.
|
|
65
|
+
* Only labels that consume methodNames/fieldNames in their embedding text should
|
|
66
|
+
* be listed here — extra entries trigger wasted AST parses with no effect on output.
|
|
33
67
|
*/
|
|
34
68
|
export declare const STRUCTURAL_LABELS: ReadonlySet<string>;
|
|
35
69
|
/**
|
|
36
70
|
* Node labels that have isExported column in their schema
|
|
37
71
|
*/
|
|
38
72
|
export declare const LABELS_WITH_EXPORTED: ReadonlySet<string>;
|
|
73
|
+
/**
|
|
74
|
+
* Labels that need special chunking and/or structural text semantics.
|
|
75
|
+
* Any chunkable label omitted here intentionally falls back to characterChunk
|
|
76
|
+
* plus generateCodeBodyText (for example Enum/Trait/Impl/Macro/Namespace).
|
|
77
|
+
*/
|
|
78
|
+
type ChunkableLabel = (typeof CHUNKABLE_LABELS)[number];
|
|
79
|
+
export declare const CHUNKING_RULES: Readonly<Partial<Record<ChunkableLabel, ChunkingRule>>>;
|
|
39
80
|
/**
|
|
40
81
|
* Embedding pipeline phases
|
|
41
82
|
*/
|
|
@@ -163,3 +204,4 @@ export declare const dedupBestChunks: (rows: ChunkSearchRow[], limit?: number) =
|
|
|
163
204
|
* or can tell the result set is exhausted.
|
|
164
205
|
*/
|
|
165
206
|
export declare const collectBestChunks: (limit: number, fetchRows: (fetchLimit: number) => Promise<ChunkSearchRow[]>, maxFetch?: number) => Promise<Map<string, BestChunkMatch>>;
|
|
207
|
+
export {};
|
|
package/dist/core/embeddings/types.js
CHANGED
|
@@ -3,34 +3,61 @@
|
|
|
3
3
|
*
|
|
4
4
|
* Type definitions for the embedding generation and semantic search system.
|
|
5
5
|
*/
|
|
6
|
+
export const LABEL_FUNCTION = 'Function';
|
|
7
|
+
export const LABEL_METHOD = 'Method';
|
|
8
|
+
export const LABEL_CONSTRUCTOR = 'Constructor';
|
|
9
|
+
export const LABEL_CLASS = 'Class';
|
|
10
|
+
export const LABEL_INTERFACE = 'Interface';
|
|
11
|
+
export const LABEL_STRUCT = 'Struct';
|
|
12
|
+
export const LABEL_ENUM = 'Enum';
|
|
13
|
+
export const LABEL_TRAIT = 'Trait';
|
|
14
|
+
export const LABEL_IMPL = 'Impl';
|
|
15
|
+
export const LABEL_MACRO = 'Macro';
|
|
16
|
+
export const LABEL_NAMESPACE = 'Namespace';
|
|
17
|
+
export const LABEL_TYPE_ALIAS = 'TypeAlias';
|
|
18
|
+
export const LABEL_TYPEDEF = 'Typedef';
|
|
19
|
+
export const LABEL_CONST = 'Const';
|
|
20
|
+
export const LABEL_PROPERTY = 'Property';
|
|
21
|
+
export const LABEL_RECORD = 'Record';
|
|
22
|
+
export const LABEL_UNION = 'Union';
|
|
23
|
+
export const LABEL_STATIC = 'Static';
|
|
24
|
+
export const LABEL_VARIABLE = 'Variable';
|
|
25
|
+
export const LABEL_CODE_ELEMENT = 'CodeElement';
|
|
26
|
+
export const CHUNK_MODE_AST_FUNCTION = 'ast-function';
|
|
27
|
+
export const CHUNK_MODE_AST_DECLARATION = 'ast-declaration';
|
|
28
|
+
// CHUNK_MODE_CHARACTER exists for type completeness but is a no-op in CHUNKING_RULES —
|
|
29
|
+
// omit the entry entirely to get character fallback via chunker.ts dispatch.
|
|
30
|
+
export const CHUNK_MODE_CHARACTER = 'character';
|
|
31
|
+
export const STRUCTURAL_TEXT_MODE_NONE = 'none';
|
|
32
|
+
export const STRUCTURAL_TEXT_MODE_DECLARATION = 'declaration';
|
|
6
33
|
/**
|
|
7
34
|
* Node labels that need chunking (have code body, potentially long)
|
|
8
35
|
*/
|
|
9
36
|
export const CHUNKABLE_LABELS = [
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
37
|
+
LABEL_FUNCTION,
|
|
38
|
+
LABEL_METHOD,
|
|
39
|
+
LABEL_CONSTRUCTOR,
|
|
40
|
+
LABEL_CLASS,
|
|
41
|
+
LABEL_INTERFACE,
|
|
42
|
+
LABEL_STRUCT,
|
|
43
|
+
LABEL_ENUM,
|
|
44
|
+
LABEL_TRAIT,
|
|
45
|
+
LABEL_IMPL,
|
|
46
|
+
LABEL_MACRO,
|
|
47
|
+
LABEL_NAMESPACE,
|
|
21
48
|
];
|
|
22
49
|
/**
|
|
23
50
|
* Node labels that are short (no chunking needed, embed directly)
|
|
24
51
|
*/
|
|
25
52
|
export const SHORT_LABELS = [
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
53
|
+
LABEL_TYPE_ALIAS,
|
|
54
|
+
LABEL_TYPEDEF,
|
|
55
|
+
LABEL_CONST,
|
|
56
|
+
LABEL_PROPERTY,
|
|
57
|
+
LABEL_RECORD,
|
|
58
|
+
LABEL_UNION,
|
|
59
|
+
LABEL_STATIC,
|
|
60
|
+
LABEL_VARIABLE,
|
|
34
61
|
];
|
|
35
62
|
/**
|
|
36
63
|
* All embeddable labels (union of CHUNKABLE + SHORT)
|
|
@@ -49,24 +76,69 @@ export const isChunkableLabel = (label) => CHUNKABLE_LABELS.includes(label);
|
|
|
49
76
|
*/
|
|
50
77
|
export const isShortLabel = (label) => SHORT_LABELS.includes(label);
|
|
51
78
|
/**
|
|
52
|
-
* Node labels that have structural names (methods/fields) extractable via AST
|
|
79
|
+
* Node labels that have structural names (methods/fields) extractable via AST.
|
|
80
|
+
* Only labels that consume methodNames/fieldNames in their embedding text should
|
|
81
|
+
* be listed here — extra entries trigger wasted AST parses with no effect on output.
|
|
53
82
|
*/
|
|
54
83
|
export const STRUCTURAL_LABELS = new Set([
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
'Enum',
|
|
84
|
+
LABEL_CLASS,
|
|
85
|
+
LABEL_STRUCT,
|
|
86
|
+
LABEL_INTERFACE,
|
|
59
87
|
]);
|
|
60
88
|
/**
|
|
61
89
|
* Node labels that have isExported column in their schema
|
|
62
90
|
*/
|
|
63
91
|
export const LABELS_WITH_EXPORTED = new Set([
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
92
|
+
LABEL_FUNCTION,
|
|
93
|
+
LABEL_CLASS,
|
|
94
|
+
LABEL_INTERFACE,
|
|
95
|
+
LABEL_METHOD,
|
|
96
|
+
LABEL_CODE_ELEMENT,
|
|
69
97
|
]);
|
|
98
|
+
export const CHUNKING_RULES = {
|
|
99
|
+
[LABEL_FUNCTION]: {
|
|
100
|
+
mode: CHUNK_MODE_AST_FUNCTION,
|
|
101
|
+
includePrefix: true,
|
|
102
|
+
includeSuffix: true,
|
|
103
|
+
groupFields: false,
|
|
104
|
+
structuralTextMode: STRUCTURAL_TEXT_MODE_NONE,
|
|
105
|
+
},
|
|
106
|
+
[LABEL_METHOD]: {
|
|
107
|
+
mode: CHUNK_MODE_AST_FUNCTION,
|
|
108
|
+
includePrefix: true,
|
|
109
|
+
includeSuffix: true,
|
|
110
|
+
groupFields: false,
|
|
111
|
+
structuralTextMode: STRUCTURAL_TEXT_MODE_NONE,
|
|
112
|
+
},
|
|
113
|
+
[LABEL_CONSTRUCTOR]: {
|
|
114
|
+
mode: CHUNK_MODE_AST_FUNCTION,
|
|
115
|
+
includePrefix: true,
|
|
116
|
+
includeSuffix: true,
|
|
117
|
+
groupFields: false,
|
|
118
|
+
structuralTextMode: STRUCTURAL_TEXT_MODE_NONE,
|
|
119
|
+
},
|
|
120
|
+
[LABEL_CLASS]: {
|
|
121
|
+
mode: CHUNK_MODE_AST_DECLARATION,
|
|
122
|
+
includePrefix: true,
|
|
123
|
+
includeSuffix: false,
|
|
124
|
+
groupFields: true,
|
|
125
|
+
structuralTextMode: STRUCTURAL_TEXT_MODE_DECLARATION,
|
|
126
|
+
},
|
|
127
|
+
[LABEL_INTERFACE]: {
|
|
128
|
+
mode: CHUNK_MODE_AST_DECLARATION,
|
|
129
|
+
includePrefix: true,
|
|
130
|
+
includeSuffix: false,
|
|
131
|
+
groupFields: false,
|
|
132
|
+
structuralTextMode: STRUCTURAL_TEXT_MODE_DECLARATION,
|
|
133
|
+
},
|
|
134
|
+
[LABEL_STRUCT]: {
|
|
135
|
+
mode: CHUNK_MODE_AST_DECLARATION,
|
|
136
|
+
includePrefix: true,
|
|
137
|
+
includeSuffix: false,
|
|
138
|
+
groupFields: true,
|
|
139
|
+
structuralTextMode: STRUCTURAL_TEXT_MODE_DECLARATION,
|
|
140
|
+
},
|
|
141
|
+
};
|
|
70
142
|
/**
|
|
71
143
|
* Default embedding configuration
|
|
72
144
|
* Uses snowflake-arctic-embed-xs for browser efficiency
|