@dreb/semantic-search 1.18.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.claude-plugin/plugin.json +17 -0
- package/.mcp.json +8 -0
- package/README.md +97 -0
- package/bin/server.js +14 -0
- package/dist/chunker.d.ts +21 -0
- package/dist/chunker.d.ts.map +1 -0
- package/dist/chunker.js +51 -0
- package/dist/chunker.js.map +1 -0
- package/dist/db.d.ts +89 -0
- package/dist/db.d.ts.map +1 -0
- package/dist/db.js +406 -0
- package/dist/db.js.map +1 -0
- package/dist/embedder.d.ts +52 -0
- package/dist/embedder.d.ts.map +1 -0
- package/dist/embedder.js +158 -0
- package/dist/embedder.js.map +1 -0
- package/dist/format.d.ts +4 -0
- package/dist/format.d.ts.map +1 -0
- package/dist/format.js +37 -0
- package/dist/format.js.map +1 -0
- package/dist/index-manager.d.ts +55 -0
- package/dist/index-manager.d.ts.map +1 -0
- package/dist/index-manager.js +311 -0
- package/dist/index-manager.js.map +1 -0
- package/dist/index.d.ts +18 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +21 -0
- package/dist/index.js.map +1 -0
- package/dist/mcp-server.d.ts +25 -0
- package/dist/mcp-server.d.ts.map +1 -0
- package/dist/mcp-server.js +149 -0
- package/dist/mcp-server.js.map +1 -0
- package/dist/metrics/bm25.d.ts +10 -0
- package/dist/metrics/bm25.d.ts.map +1 -0
- package/dist/metrics/bm25.js +32 -0
- package/dist/metrics/bm25.js.map +1 -0
- package/dist/metrics/git-recency.d.ts +14 -0
- package/dist/metrics/git-recency.d.ts.map +1 -0
- package/dist/metrics/git-recency.js +123 -0
- package/dist/metrics/git-recency.js.map +1 -0
- package/dist/metrics/import-graph.d.ts +15 -0
- package/dist/metrics/import-graph.d.ts.map +1 -0
- package/dist/metrics/import-graph.js +115 -0
- package/dist/metrics/import-graph.js.map +1 -0
- package/dist/metrics/path-match.d.ts +13 -0
- package/dist/metrics/path-match.d.ts.map +1 -0
- package/dist/metrics/path-match.js +54 -0
- package/dist/metrics/path-match.js.map +1 -0
- package/dist/metrics/symbol-match.d.ts +12 -0
- package/dist/metrics/symbol-match.d.ts.map +1 -0
- package/dist/metrics/symbol-match.js +62 -0
- package/dist/metrics/symbol-match.js.map +1 -0
- package/dist/metrics/tokenize.d.ts +12 -0
- package/dist/metrics/tokenize.d.ts.map +1 -0
- package/dist/metrics/tokenize.js +29 -0
- package/dist/metrics/tokenize.js.map +1 -0
- package/dist/poem.d.ts +38 -0
- package/dist/poem.d.ts.map +1 -0
- package/dist/poem.js +214 -0
- package/dist/poem.js.map +1 -0
- package/dist/query-classifier.d.ts +17 -0
- package/dist/query-classifier.d.ts.map +1 -0
- package/dist/query-classifier.js +54 -0
- package/dist/query-classifier.js.map +1 -0
- package/dist/scanner.d.ts +30 -0
- package/dist/scanner.d.ts.map +1 -0
- package/dist/scanner.js +343 -0
- package/dist/scanner.js.map +1 -0
- package/dist/search.d.ts +63 -0
- package/dist/search.d.ts.map +1 -0
- package/dist/search.js +400 -0
- package/dist/search.js.map +1 -0
- package/dist/text-chunker.d.ts +15 -0
- package/dist/text-chunker.d.ts.map +1 -0
- package/dist/text-chunker.js +580 -0
- package/dist/text-chunker.js.map +1 -0
- package/dist/tree-sitter-chunker.d.ts +25 -0
- package/dist/tree-sitter-chunker.d.ts.map +1 -0
- package/dist/tree-sitter-chunker.js +357 -0
- package/dist/tree-sitter-chunker.js.map +1 -0
- package/dist/types.d.ts +98 -0
- package/dist/types.d.ts.map +1 -0
- package/dist/types.js +6 -0
- package/dist/types.js.map +1 -0
- package/dist/vector-store.d.ts +43 -0
- package/dist/vector-store.d.ts.map +1 -0
- package/dist/vector-store.js +73 -0
- package/dist/vector-store.js.map +1 -0
- package/package.json +71 -0
- package/skills/search/SKILL.md +56 -0
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Vector operations for semantic search.
|
|
3
|
+
*
|
|
4
|
+
* Pure JS implementations — no native dependencies or SQLite UDFs needed.
|
|
5
|
+
* Vectors are computed and compared in JS, stored as BLOBs in SQLite.
|
|
6
|
+
*/
|
|
7
|
+
/**
|
|
8
|
+
* Compute cosine similarity between two normalized vectors.
|
|
9
|
+
*
|
|
10
|
+
* For normalized vectors, cosine similarity is simply the dot product:
|
|
11
|
+
* cos(a, b) = Σ a[i] * b[i]
|
|
12
|
+
*
|
|
13
|
+
* Returns a value in [-1, 1] where 1 = identical, 0 = orthogonal, -1 = opposite.
|
|
14
|
+
*/
|
|
15
|
+
export declare function cosineSimilarity(a: Float32Array, b: Float32Array): number;
|
|
16
|
+
/**
|
|
17
|
+
* Pack a Float32Array into a Buffer for SQLite BLOB storage.
|
|
18
|
+
*
|
|
19
|
+
* Creates a copy to ensure the buffer isn't shared with other typed arrays.
|
|
20
|
+
*/
|
|
21
|
+
export declare function packVector(vector: Float32Array): Buffer;
|
|
22
|
+
/**
|
|
23
|
+
* Unpack a BLOB (Uint8Array from node:sqlite) back to a Float32Array.
|
|
24
|
+
*
|
|
25
|
+
* The returned array shares the underlying buffer with the input for
|
|
26
|
+
* zero-copy performance. Callers should not mutate the input after calling.
|
|
27
|
+
*/
|
|
28
|
+
export declare function unpackVector(blob: Uint8Array): Float32Array;
|
|
29
|
+
/**
|
|
30
|
+
* Find the top-K most similar vectors from a set.
|
|
31
|
+
*
|
|
32
|
+
* Computes cosine similarity between the query vector and every candidate,
|
|
33
|
+
* then returns the K highest-scoring results sorted by descending score.
|
|
34
|
+
*
|
|
35
|
+
* Uses a simple full scan — suitable for the index sizes we expect in a
|
|
36
|
+
* single-project codebase (typically <100K chunks). For millions of vectors,
|
|
37
|
+
* an approximate nearest neighbor index (HNSW, IVF) would be needed.
|
|
38
|
+
*/
|
|
39
|
+
export declare function topKSimilar(query: Float32Array, vectors: Map<number, Float32Array>, k: number): Array<{
|
|
40
|
+
id: number;
|
|
41
|
+
score: number;
|
|
42
|
+
}>;
|
|
43
|
+
//# sourceMappingURL=vector-store.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"vector-store.d.ts","sourceRoot":"","sources":["../src/vector-store.ts"],"names":[],"mappings":"AAAA;;;;;GAKG;AAMH;;;;;;;GAOG;AACH,wBAAgB,gBAAgB,CAAC,CAAC,EAAE,YAAY,EAAE,CAAC,EAAE,YAAY,GAAG,MAAM,CAOzE;AAMD;;;;GAIG;AACH,wBAAgB,UAAU,CAAC,MAAM,EAAE,YAAY,GAAG,MAAM,CAEvD;AAED;;;;;GAKG;AACH,wBAAgB,YAAY,CAAC,IAAI,EAAE,UAAU,GAAG,YAAY,CAE3D;AAMD;;;;;;;;;GASG;AACH,wBAAgB,WAAW,CAC1B,KAAK,EAAE,YAAY,EACnB,OAAO,EAAE,GAAG,CAAC,MAAM,EAAE,YAAY,CAAC,EAClC,CAAC,EAAE,MAAM,GACP,KAAK,CAAC;IAAE,EAAE,EAAE,MAAM,CAAC;IAAC,KAAK,EAAE,MAAM,CAAA;CAAE,CAAC,CAgBtC","sourcesContent":["/**\n * Vector operations for semantic search.\n *\n * Pure JS implementations — no native dependencies or SQLite UDFs needed.\n * Vectors are computed and compared in JS, stored as BLOBs in SQLite.\n */\n\n// ============================================================================\n// Similarity\n// ============================================================================\n\n/**\n * Compute cosine similarity between two normalized vectors.\n *\n * For normalized vectors, cosine similarity is simply the dot product:\n * cos(a, b) = Σ a[i] * b[i]\n *\n * Returns a value in [-1, 1] where 1 = identical, 0 = orthogonal, -1 = opposite.\n */\nexport function cosineSimilarity(a: Float32Array, b: Float32Array): number {\n\tconst len = a.length;\n\tlet dot = 0;\n\tfor (let i = 0; i < len; i++) {\n\t\tdot += a[i] * b[i];\n\t}\n\treturn dot;\n}\n\n// ============================================================================\n// Serialization\n// ============================================================================\n\n/**\n * Pack a Float32Array into a Buffer for SQLite BLOB storage.\n *\n * Creates a copy to ensure the buffer isn't shared with other typed arrays.\n */\nexport function packVector(vector: Float32Array): Buffer {\n\treturn Buffer.from(vector.buffer, vector.byteOffset, vector.byteLength);\n}\n\n/**\n * Unpack a BLOB (Uint8Array from node:sqlite) back to a Float32Array.\n *\n * 
The returned array shares the underlying buffer with the input for\n * zero-copy performance. Callers should not mutate the input after calling.\n */\nexport function unpackVector(blob: Uint8Array): Float32Array {\n\treturn new Float32Array(blob.buffer, blob.byteOffset, blob.byteLength / 4);\n}\n\n// ============================================================================\n// Top-K Search\n// ============================================================================\n\n/**\n * Find the top-K most similar vectors from a set.\n *\n * Computes cosine similarity between the query vector and every candidate,\n * then returns the K highest-scoring results sorted by descending score.\n *\n * Uses a simple full scan — suitable for the index sizes we expect in a\n * single-project codebase (typically <100K chunks). For millions of vectors,\n * an approximate nearest neighbor index (HNSW, IVF) would be needed.\n */\nexport function topKSimilar(\n\tquery: Float32Array,\n\tvectors: Map<number, Float32Array>,\n\tk: number,\n): Array<{ id: number; score: number }> {\n\tif (k <= 0 || vectors.size === 0) return [];\n\n\t// For small k relative to n, a min-heap would be more efficient.\n\t// For typical codebase sizes (<100K vectors) the difference is negligible,\n\t// and a sorted array is simpler and correct.\n\tconst scored: Array<{ id: number; score: number }> = [];\n\n\tfor (const [id, vector] of vectors) {\n\t\tscored.push({ id, score: cosineSimilarity(query, vector) });\n\t}\n\n\t// Partial sort: only need top-k, but full sort is fine for expected sizes\n\tscored.sort((a, b) => b.score - a.score);\n\n\treturn scored.slice(0, k);\n}\n"]}
|
|
@@ -0,0 +1,73 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Vector operations for semantic search.
|
|
3
|
+
*
|
|
4
|
+
* Pure JS implementations — no native dependencies or SQLite UDFs needed.
|
|
5
|
+
* Vectors are computed and compared in JS, stored as BLOBs in SQLite.
|
|
6
|
+
*/
|
|
7
|
+
// ============================================================================
|
|
8
|
+
// Similarity
|
|
9
|
+
// ============================================================================
|
|
10
|
+
/**
 * Dot product of two embedding vectors, used as their cosine similarity.
 *
 * No normalization happens here: the inputs are expected to be unit-length
 * already, in which case Σ a[i] * b[i] equals cos(a, b) and lies in [-1, 1]
 * (1 = same direction, 0 = orthogonal, -1 = opposite).
 *
 * The sum runs over the indices of `a`; `b` is read at those same indices.
 *
 * @param a - First vector.
 * @param b - Second vector.
 * @returns The accumulated dot product (0 when `a` is empty).
 */
export function cosineSimilarity(a, b) {
    // Left-to-right fold starting from 0, one product per component of `a`.
    return a.reduce((sum, component, index) => sum + component * b[index], 0);
}
|
|
26
|
+
// ============================================================================
|
|
27
|
+
// Serialization
|
|
28
|
+
// ============================================================================
|
|
29
|
+
/**
 * Pack a Float32Array into a Buffer for SQLite BLOB storage.
 *
 * Returns an independent copy of exactly the bytes the vector covers
 * (honoring its byteOffset and byteLength), so later writes to the vector,
 * or to any other typed array over the same ArrayBuffer, cannot change the
 * packed bytes.
 *
 * @param vector - Vector to serialize.
 * @returns A new Buffer of `vector.byteLength` bytes backed by its own memory.
 */
export function packVector(vector) {
    // Buffer.from(arrayBuffer, byteOffset, length) would only create a view
    // over the same memory. ArrayBuffer.prototype.slice allocates fresh storage
    // containing just the vector's byte range, which the Buffer then wraps.
    const start = vector.byteOffset;
    const bytes = vector.buffer.slice(start, start + vector.byteLength);
    return Buffer.from(bytes);
}
|
|
37
|
+
/**
 * Unpack a BLOB (Uint8Array from node:sqlite) back to a Float32Array.
 *
 * When the blob starts on a 4-byte boundary of its underlying buffer, the
 * returned array is a zero-copy view that shares memory with the input, so
 * callers should not mutate the input afterwards. When the blob is not
 * 4-byte aligned (e.g. a subarray of a larger buffer), a Float32Array view
 * cannot be created, so the bytes are copied into fresh storage instead.
 *
 * Trailing bytes that do not fill a whole 4-byte float are ignored.
 *
 * @param blob - Raw bytes previously produced from a Float32Array.
 * @returns The decoded vector with `floor(blob.byteLength / 4)` elements.
 */
export function unpackVector(blob) {
    const width = Float32Array.BYTES_PER_ELEMENT;
    const count = Math.floor(blob.byteLength / width);
    const start = blob.byteOffset;
    if (start % width === 0) {
        // Aligned: view the existing memory directly.
        return new Float32Array(blob.buffer, start, count);
    }
    // Misaligned: the typed-array constructor would throw a RangeError, so
    // copy the relevant byte range into a new, offset-0 ArrayBuffer.
    return new Float32Array(blob.buffer.slice(start, start + count * width));
}
|
|
46
|
+
// ============================================================================
|
|
47
|
+
// Top-K Search
|
|
48
|
+
// ============================================================================
|
|
49
|
+
/**
 * Rank every candidate vector against the query and keep the best K.
 *
 * Each entry of `vectors` is scored with cosineSimilarity(query, candidate);
 * the scored list is ordered from highest to lowest score and truncated to
 * at most `k` items.
 *
 * This is an exhaustive scan followed by a full sort. That is adequate for
 * the index sizes expected in a single-project codebase (typically <100K
 * chunks); for millions of vectors an approximate nearest neighbor index
 * (HNSW, IVF) would be needed.
 *
 * @param query - Query embedding.
 * @param vectors - Candidate embeddings keyed by numeric id.
 * @param k - Maximum number of results to return.
 * @returns Up to `k` `{ id, score }` pairs in descending score order; an
 *   empty array when `k <= 0` or there are no candidates.
 */
export function topKSimilar(query, vectors, k) {
    if (k <= 0 || vectors.size === 0) {
        return [];
    }
    // A min-heap would avoid sorting everything when k is much smaller than
    // the candidate count, but at the expected sizes a full sort is simpler
    // and the cost difference is negligible.
    const ranked = Array.from(vectors, ([id, candidate]) => ({
        id,
        score: cosineSimilarity(query, candidate),
    }));
    ranked.sort((left, right) => right.score - left.score);
    return ranked.slice(0, k);
}
|
|
73
|
+
//# sourceMappingURL=vector-store.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"vector-store.js","sourceRoot":"","sources":["../src/vector-store.ts"],"names":[],"mappings":"AAAA;;;;;GAKG;AAEH,+EAA+E;AAC/E,aAAa;AACb,+EAA+E;AAE/E;;;;;;;GAOG;AACH,MAAM,UAAU,gBAAgB,CAAC,CAAe,EAAE,CAAe,EAAU;IAC1E,MAAM,GAAG,GAAG,CAAC,CAAC,MAAM,CAAC;IACrB,IAAI,GAAG,GAAG,CAAC,CAAC;IACZ,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,GAAG,EAAE,CAAC,EAAE,EAAE,CAAC;QAC9B,GAAG,IAAI,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC;IACpB,CAAC;IACD,OAAO,GAAG,CAAC;AAAA,CACX;AAED,+EAA+E;AAC/E,gBAAgB;AAChB,+EAA+E;AAE/E;;;;GAIG;AACH,MAAM,UAAU,UAAU,CAAC,MAAoB,EAAU;IACxD,OAAO,MAAM,CAAC,IAAI,CAAC,MAAM,CAAC,MAAM,EAAE,MAAM,CAAC,UAAU,EAAE,MAAM,CAAC,UAAU,CAAC,CAAC;AAAA,CACxE;AAED;;;;;GAKG;AACH,MAAM,UAAU,YAAY,CAAC,IAAgB,EAAgB;IAC5D,OAAO,IAAI,YAAY,CAAC,IAAI,CAAC,MAAM,EAAE,IAAI,CAAC,UAAU,EAAE,IAAI,CAAC,UAAU,GAAG,CAAC,CAAC,CAAC;AAAA,CAC3E;AAED,+EAA+E;AAC/E,eAAe;AACf,+EAA+E;AAE/E;;;;;;;;;GASG;AACH,MAAM,UAAU,WAAW,CAC1B,KAAmB,EACnB,OAAkC,EAClC,CAAS,EAC8B;IACvC,IAAI,CAAC,IAAI,CAAC,IAAI,OAAO,CAAC,IAAI,KAAK,CAAC;QAAE,OAAO,EAAE,CAAC;IAE5C,iEAAiE;IACjE,2EAA2E;IAC3E,6CAA6C;IAC7C,MAAM,MAAM,GAAyC,EAAE,CAAC;IAExD,KAAK,MAAM,CAAC,EAAE,EAAE,MAAM,CAAC,IAAI,OAAO,EAAE,CAAC;QACpC,MAAM,CAAC,IAAI,CAAC,EAAE,EAAE,EAAE,KAAK,EAAE,gBAAgB,CAAC,KAAK,EAAE,MAAM,CAAC,EAAE,CAAC,CAAC;IAC7D,CAAC;IAED,0EAA0E;IAC1E,MAAM,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,KAAK,GAAG,CAAC,CAAC,KAAK,CAAC,CAAC;IAEzC,OAAO,MAAM,CAAC,KAAK,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC;AAAA,CAC1B","sourcesContent":["/**\n * Vector operations for semantic search.\n *\n * Pure JS implementations — no native dependencies or SQLite UDFs needed.\n * Vectors are computed and compared in JS, stored as BLOBs in SQLite.\n */\n\n// ============================================================================\n// Similarity\n// ============================================================================\n\n/**\n * Compute cosine similarity between two normalized vectors.\n *\n * For normalized vectors, cosine similarity is simply the dot 
product:\n * cos(a, b) = Σ a[i] * b[i]\n *\n * Returns a value in [-1, 1] where 1 = identical, 0 = orthogonal, -1 = opposite.\n */\nexport function cosineSimilarity(a: Float32Array, b: Float32Array): number {\n\tconst len = a.length;\n\tlet dot = 0;\n\tfor (let i = 0; i < len; i++) {\n\t\tdot += a[i] * b[i];\n\t}\n\treturn dot;\n}\n\n// ============================================================================\n// Serialization\n// ============================================================================\n\n/**\n * Pack a Float32Array into a Buffer for SQLite BLOB storage.\n *\n * Creates a copy to ensure the buffer isn't shared with other typed arrays.\n */\nexport function packVector(vector: Float32Array): Buffer {\n\treturn Buffer.from(vector.buffer, vector.byteOffset, vector.byteLength);\n}\n\n/**\n * Unpack a BLOB (Uint8Array from node:sqlite) back to a Float32Array.\n *\n * The returned array shares the underlying buffer with the input for\n * zero-copy performance. Callers should not mutate the input after calling.\n */\nexport function unpackVector(blob: Uint8Array): Float32Array {\n\treturn new Float32Array(blob.buffer, blob.byteOffset, blob.byteLength / 4);\n}\n\n// ============================================================================\n// Top-K Search\n// ============================================================================\n\n/**\n * Find the top-K most similar vectors from a set.\n *\n * Computes cosine similarity between the query vector and every candidate,\n * then returns the K highest-scoring results sorted by descending score.\n *\n * Uses a simple full scan — suitable for the index sizes we expect in a\n * single-project codebase (typically <100K chunks). 
For millions of vectors,\n * an approximate nearest neighbor index (HNSW, IVF) would be needed.\n */\nexport function topKSimilar(\n\tquery: Float32Array,\n\tvectors: Map<number, Float32Array>,\n\tk: number,\n): Array<{ id: number; score: number }> {\n\tif (k <= 0 || vectors.size === 0) return [];\n\n\t// For small k relative to n, a min-heap would be more efficient.\n\t// For typical codebase sizes (<100K vectors) the difference is negligible,\n\t// and a sorted array is simpler and correct.\n\tconst scored: Array<{ id: number; score: number }> = [];\n\n\tfor (const [id, vector] of vectors) {\n\t\tscored.push({ id, score: cosineSimilarity(query, vector) });\n\t}\n\n\t// Partial sort: only need top-k, but full sort is fine for expected sizes\n\tscored.sort((a, b) => b.score - a.score);\n\n\treturn scored.slice(0, k);\n}\n"]}
|
package/package.json
ADDED
|
@@ -0,0 +1,71 @@
|
|
|
1
|
+
{
|
|
2
|
+
"name": "@dreb/semantic-search",
|
|
3
|
+
"version": "1.18.0",
|
|
4
|
+
"description": "Semantic codebase search engine with embedding-based ranking and MCP server",
|
|
5
|
+
"publishConfig": {
|
|
6
|
+
"access": "public"
|
|
7
|
+
},
|
|
8
|
+
"type": "module",
|
|
9
|
+
"main": "./dist/index.js",
|
|
10
|
+
"types": "./dist/index.d.ts",
|
|
11
|
+
"exports": {
|
|
12
|
+
".": {
|
|
13
|
+
"types": "./dist/index.d.ts",
|
|
14
|
+
"import": "./dist/index.js"
|
|
15
|
+
}
|
|
16
|
+
},
|
|
17
|
+
"bin": {
|
|
18
|
+
"semantic-search-mcp": "./bin/server.js"
|
|
19
|
+
},
|
|
20
|
+
"files": [
|
|
21
|
+
"dist",
|
|
22
|
+
"bin",
|
|
23
|
+
".claude-plugin",
|
|
24
|
+
".mcp.json",
|
|
25
|
+
"skills",
|
|
26
|
+
"README.md"
|
|
27
|
+
],
|
|
28
|
+
"scripts": {
|
|
29
|
+
"clean": "shx rm -rf dist",
|
|
30
|
+
"build": "tsgo -p tsconfig.build.json",
|
|
31
|
+
"dev": "tsgo -p tsconfig.build.json --watch --preserveWatchOutput",
|
|
32
|
+
"test": "vitest --run",
|
|
33
|
+
"prepublishOnly": "npm run clean && npm run build"
|
|
34
|
+
},
|
|
35
|
+
"dependencies": {
|
|
36
|
+
"@huggingface/transformers": "^4.0.1",
|
|
37
|
+
"@modelcontextprotocol/sdk": "^1.29.0",
|
|
38
|
+
"ignore": "^7.0.5",
|
|
39
|
+
"tree-sitter-c": "^0.24.1",
|
|
40
|
+
"tree-sitter-cpp": "^0.23.4",
|
|
41
|
+
"tree-sitter-go": "^0.25.0",
|
|
42
|
+
"tree-sitter-java": "^0.23.5",
|
|
43
|
+
"tree-sitter-javascript": "^0.25.0",
|
|
44
|
+
"tree-sitter-python": "^0.25.0",
|
|
45
|
+
"tree-sitter-rust": "^0.24.0",
|
|
46
|
+
"tree-sitter-typescript": "^0.23.2",
|
|
47
|
+
"web-tree-sitter": "^0.26.8"
|
|
48
|
+
},
|
|
49
|
+
"devDependencies": {
|
|
50
|
+
"@types/node": "^24.3.0",
|
|
51
|
+
"shx": "^0.4.0",
|
|
52
|
+
"typescript": "^5.9.2",
|
|
53
|
+
"vitest": "^3.2.4"
|
|
54
|
+
},
|
|
55
|
+
"keywords": [
|
|
56
|
+
"semantic-search",
|
|
57
|
+
"codebase",
|
|
58
|
+
"embeddings",
|
|
59
|
+
"mcp"
|
|
60
|
+
],
|
|
61
|
+
"author": "Drew Brereton",
|
|
62
|
+
"license": "MIT",
|
|
63
|
+
"repository": {
|
|
64
|
+
"type": "git",
|
|
65
|
+
"url": "https://github.com/aebrer/dreb.git",
|
|
66
|
+
"directory": "packages/semantic-search"
|
|
67
|
+
},
|
|
68
|
+
"engines": {
|
|
69
|
+
"node": ">=22.0.0"
|
|
70
|
+
}
|
|
71
|
+
}
|
|
@@ -0,0 +1,56 @@
|
|
|
1
|
+
# Semantic Codebase Search
|
|
2
|
+
|
|
3
|
+
Use `search` as your **default exploration tool** for understanding code, finding implementations, and answering questions about the codebase. Use `grep` when you already know the exact text or pattern you're looking for.
|
|
4
|
+
|
|
5
|
+
## Indexing
|
|
6
|
+
|
|
7
|
+
The first query builds the index — this may take 10–60 seconds depending on project size. Subsequent queries are fast because the index is incrementally updated (only changed files are re-processed).
|
|
8
|
+
|
|
9
|
+
## Query Types
|
|
10
|
+
|
|
11
|
+
The search tool supports three kinds of queries, automatically classified:
|
|
12
|
+
|
|
13
|
+
- **Identifier queries** — e.g. `AuthMiddleware`, `handleRequest`, `SearchEngine` — finds definitions, usages, and related code for a specific symbol
|
|
14
|
+
- **Natural language queries** — e.g. `where is rate limiting handled`, `how does authentication work` — semantic search across code and documentation
|
|
15
|
+
- **Path queries** — e.g. `src/auth/`, `packages/ai` — finds code within a directory structure
|
|
16
|
+
|
|
17
|
+
## Parameters
|
|
18
|
+
|
|
19
|
+
| Parameter | Required | Description |
|
|
20
|
+
| ------------ | -------- | --------------------------------------------------------------------------- |
|
|
21
|
+
| `query` | Yes | Search query — natural language, identifier, or path |
|
|
22
|
+
| `projectDir` | Yes | Absolute path to the project directory. Set this to your current working directory |
|
|
23
|
+
| `path` | No | Restrict search to files under this subdirectory (relative to project root) |
|
|
24
|
+
| `limit` | No | Maximum number of results to return (default: 20) |
|
|
25
|
+
| `rebuild` | No | Force a clean index rebuild — use when files have changed significantly |
|
|
26
|
+
|
|
27
|
+
## Ranking
|
|
28
|
+
|
|
29
|
+
Results are ranked using 6 signals fused via **POEM** (Pareto-Optimal Embedded Modeling):
|
|
30
|
+
|
|
31
|
+
1. **BM25** — keyword matching via FTS5 full-text search
|
|
32
|
+
2. **Cosine similarity** — embedding-based semantic similarity (all-MiniLM-L6-v2)
|
|
33
|
+
3. **Path match** — query terms appearing in the file path
|
|
34
|
+
4. **Symbol match** — query terms matching function/class/type names
|
|
35
|
+
5. **Import graph proximity** — files imported by or importing high-scoring files
|
|
36
|
+
6. **Git recency** — recently modified files ranked higher
|
|
37
|
+
|
|
38
|
+
The weight given to each signal varies by query type. Identifier queries emphasize symbol match and BM25; natural language queries emphasize cosine similarity; path queries emphasize path match.
|
|
39
|
+
|
|
40
|
+
## Results
|
|
41
|
+
|
|
42
|
+
Each result includes:
|
|
43
|
+
|
|
44
|
+
- **File path** and **line range** (start–end)
|
|
45
|
+
- **Chunk kind** (function, class, method, interface, heading_section, etc.) and **name**
|
|
46
|
+
- **Metric scores** for each of the 6 signals
|
|
47
|
+
- **Content preview** of the matching code or text
|
|
48
|
+
|
|
49
|
+
## Tips
|
|
50
|
+
|
|
51
|
+
- Start broad, then narrow with `path` if you get too many results from different areas
|
|
52
|
+
- Use `limit` to get more results when exploring a broad topic (e.g. `limit: 50`)
|
|
53
|
+
- Use `rebuild: true` after major refactors, branch switches, or large file changes
|
|
54
|
+
- Identifier queries work best for finding where something is defined or used
|
|
55
|
+
- Natural language queries work best for understanding how a feature or concept is implemented
|
|
56
|
+
- Combine search with `grep` for a powerful workflow: search to find the right files, then grep for exact patterns within them
|