@dreb/coding-agent 1.18.0 → 2.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/core/tools/search.d.ts.map +1 -1
- package/dist/core/tools/search.js +14 -36
- package/dist/core/tools/search.js.map +1 -1
- package/package.json +2 -1
- package/dist/core/search/chunker.d.ts +0 -21
- package/dist/core/search/chunker.d.ts.map +0 -1
- package/dist/core/search/chunker.js +0 -51
- package/dist/core/search/chunker.js.map +0 -1
- package/dist/core/search/db.d.ts +0 -89
- package/dist/core/search/db.d.ts.map +0 -1
- package/dist/core/search/db.js +0 -406
- package/dist/core/search/db.js.map +0 -1
- package/dist/core/search/embedder.d.ts +0 -52
- package/dist/core/search/embedder.d.ts.map +0 -1
- package/dist/core/search/embedder.js +0 -158
- package/dist/core/search/embedder.js.map +0 -1
- package/dist/core/search/index-manager.d.ts +0 -55
- package/dist/core/search/index-manager.d.ts.map +0 -1
- package/dist/core/search/index-manager.js +0 -311
- package/dist/core/search/index-manager.js.map +0 -1
- package/dist/core/search/metrics/bm25.d.ts +0 -10
- package/dist/core/search/metrics/bm25.d.ts.map +0 -1
- package/dist/core/search/metrics/bm25.js +0 -32
- package/dist/core/search/metrics/bm25.js.map +0 -1
- package/dist/core/search/metrics/git-recency.d.ts +0 -14
- package/dist/core/search/metrics/git-recency.d.ts.map +0 -1
- package/dist/core/search/metrics/git-recency.js +0 -123
- package/dist/core/search/metrics/git-recency.js.map +0 -1
- package/dist/core/search/metrics/import-graph.d.ts +0 -15
- package/dist/core/search/metrics/import-graph.d.ts.map +0 -1
- package/dist/core/search/metrics/import-graph.js +0 -115
- package/dist/core/search/metrics/import-graph.js.map +0 -1
- package/dist/core/search/metrics/path-match.d.ts +0 -13
- package/dist/core/search/metrics/path-match.d.ts.map +0 -1
- package/dist/core/search/metrics/path-match.js +0 -54
- package/dist/core/search/metrics/path-match.js.map +0 -1
- package/dist/core/search/metrics/symbol-match.d.ts +0 -12
- package/dist/core/search/metrics/symbol-match.d.ts.map +0 -1
- package/dist/core/search/metrics/symbol-match.js +0 -62
- package/dist/core/search/metrics/symbol-match.js.map +0 -1
- package/dist/core/search/metrics/tokenize.d.ts +0 -12
- package/dist/core/search/metrics/tokenize.d.ts.map +0 -1
- package/dist/core/search/metrics/tokenize.js +0 -29
- package/dist/core/search/metrics/tokenize.js.map +0 -1
- package/dist/core/search/poem.d.ts +0 -38
- package/dist/core/search/poem.d.ts.map +0 -1
- package/dist/core/search/poem.js +0 -214
- package/dist/core/search/poem.js.map +0 -1
- package/dist/core/search/query-classifier.d.ts +0 -17
- package/dist/core/search/query-classifier.d.ts.map +0 -1
- package/dist/core/search/query-classifier.js +0 -54
- package/dist/core/search/query-classifier.js.map +0 -1
- package/dist/core/search/scanner.d.ts +0 -30
- package/dist/core/search/scanner.d.ts.map +0 -1
- package/dist/core/search/scanner.js +0 -344
- package/dist/core/search/scanner.js.map +0 -1
- package/dist/core/search/search.d.ts +0 -51
- package/dist/core/search/search.d.ts.map +0 -1
- package/dist/core/search/search.js +0 -381
- package/dist/core/search/search.js.map +0 -1
- package/dist/core/search/text-chunker.d.ts +0 -15
- package/dist/core/search/text-chunker.d.ts.map +0 -1
- package/dist/core/search/text-chunker.js +0 -580
- package/dist/core/search/text-chunker.js.map +0 -1
- package/dist/core/search/tree-sitter-chunker.d.ts +0 -25
- package/dist/core/search/tree-sitter-chunker.d.ts.map +0 -1
- package/dist/core/search/tree-sitter-chunker.js +0 -357
- package/dist/core/search/tree-sitter-chunker.js.map +0 -1
- package/dist/core/search/types.d.ts +0 -96
- package/dist/core/search/types.d.ts.map +0 -1
- package/dist/core/search/types.js +0 -6
- package/dist/core/search/types.js.map +0 -1
- package/dist/core/search/vector-store.d.ts +0 -43
- package/dist/core/search/vector-store.d.ts.map +0 -1
- package/dist/core/search/vector-store.js +0 -73
- package/dist/core/search/vector-store.js.map +0 -1
|
@@ -1,38 +0,0 @@
|
|
|
1
|
-
/**
|
|
2
|
-
* POEM — Pareto-Optimal Embedded Modeling, multi-metric ranking.
|
|
3
|
-
*
|
|
4
|
-
* Ranks search candidates across multiple relevance metrics without requiring
|
|
5
|
-
* hand-tuned weights. Uses the TFPR (Top-Fraction Pareto Ranker) approach:
|
|
6
|
-
* vectorized dominance matrix computation with column duplication for
|
|
7
|
-
* query-type-dependent metric weighting.
|
|
8
|
-
*
|
|
9
|
-
* Algorithm:
|
|
10
|
-
* 1. Prune: per-metric top-K → union of survivors
|
|
11
|
-
* 2. Build objectives matrix with column duplication for query-type weighting
|
|
12
|
-
* 3. For each objective column, sort candidates and accumulate pairwise
|
|
13
|
-
* dominance counts (duplicate columns contribute via weight multiplier)
|
|
14
|
-
* 4. Compute fitness: meanDominance × (numDominating + ε) / (numSubmitting + ε)
|
|
15
|
-
* 5. Sort by fitness, assign ranks
|
|
16
|
-
*
|
|
17
|
-
* References:
|
|
18
|
-
* - POEM paper: https://iopscience.iop.org/article/10.1088/2632-2153/ab891b
|
|
19
|
-
* - TFPR: https://github.com/merckgroup/aidd_tfpr
|
|
20
|
-
* - colourdle: https://github.com/aebrer/colourdle
|
|
21
|
-
*/
|
|
22
|
-
import type { QueryType } from "./query-classifier.js";
|
|
23
|
-
import { type MetricScores } from "./types.js";
|
|
24
|
-
/** One entry of the ordered result list returned by `poemRank`. */
export interface RankedCandidate {
    /** Candidate identifier (the key the candidate had in the input map). */
    id: number;
    /** Metric scores carried along with the candidate. */
    scores: MetricScores;
    /** Position in the final ordering; 0 is the best candidate. */
    rank: number;
}
|
|
29
|
-
/**
 * Order search candidates with the POEM / TFPR multi-metric ranker.
 *
 * @param candidates Candidate id → metric scores (every score in 0–1)
 * @param queryType  Query type that selects the column-duplication weighting
 * @param topK       Per-metric pruning limit (1000 when omitted)
 * @returns The candidates best-first, each carrying its rank (0 = best)
 */
export declare function poemRank(
    candidates: Map<number, MetricScores>,
    queryType: QueryType,
    topK?: number,
): RankedCandidate[];
|
|
38
|
-
//# sourceMappingURL=poem.d.ts.map
|
|
@@ -1 +0,0 @@
|
|
|
1
|
-
{"version":3,"file":"poem.d.ts","sourceRoot":"","sources":["../../../src/core/search/poem.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;;GAoBG;AAEH,OAAO,KAAK,EAAE,SAAS,EAAE,MAAM,uBAAuB,CAAC;AACvD,OAAO,EAAiC,KAAK,YAAY,EAAE,MAAM,YAAY,CAAC;AAM9E,MAAM,WAAW,eAAe;IAC/B,EAAE,EAAE,MAAM,CAAC;IACX,MAAM,EAAE,YAAY,CAAC;IACrB,IAAI,EAAE,MAAM,CAAC;CACb;AA4KD;;;;;;;GAOG;AACH,wBAAgB,QAAQ,CAAC,UAAU,EAAE,GAAG,CAAC,MAAM,EAAE,YAAY,CAAC,EAAE,SAAS,EAAE,SAAS,EAAE,IAAI,SAAO,GAAG,eAAe,EAAE,CA+CpH","sourcesContent":["/**\n * POEM — Pareto-Optimal Embedded Modeling, multi-metric ranking.\n *\n * Ranks search candidates across multiple relevance metrics without requiring\n * hand-tuned weights. Uses the TFPR (Top-Fraction Pareto Ranker) approach:\n * vectorized dominance matrix computation with column duplication for\n * query-type-dependent metric weighting.\n *\n * Algorithm:\n * 1. Prune: per-metric top-K → union of survivors\n * 2. Build objectives matrix with column duplication for query-type weighting\n * 3. For each objective column, sort candidates and accumulate pairwise\n * dominance counts (duplicate columns contribute via weight multiplier)\n * 4. Compute fitness: meanDominance × (numDominating + ε) / (numSubmitting + ε)\n * 5. 
Sort by fitness, assign ranks\n *\n * References:\n * - POEM paper: https://iopscience.iop.org/article/10.1088/2632-2153/ab891b\n * - TFPR: https://github.com/merckgroup/aidd_tfpr\n * - colourdle: https://github.com/aebrer/colourdle\n */\n\nimport type { QueryType } from \"./query-classifier.js\";\nimport { METRIC_NAMES, type MetricName, type MetricScores } from \"./types.js\";\n\n// ============================================================================\n// Public types\n// ============================================================================\n\nexport interface RankedCandidate {\n\tid: number;\n\tscores: MetricScores;\n\trank: number;\n}\n\n// ============================================================================\n// Column duplication config per query type\n// ============================================================================\n\n/** How many times each metric column appears in the objectives matrix. */\ntype ColumnWeights = Record<MetricName, number>;\n\nconst COLUMN_WEIGHTS: Record<QueryType, ColumnWeights> = {\n\tidentifier: {\n\t\tbm25: 2,\n\t\tcosine: 1,\n\t\tpathMatch: 1,\n\t\tsymbolMatch: 2,\n\t\timportGraph: 1,\n\t\tgitRecency: 1,\n\t},\n\tnatural_language: {\n\t\tbm25: 1,\n\t\tcosine: 2,\n\t\tpathMatch: 1,\n\t\tsymbolMatch: 1,\n\t\timportGraph: 1,\n\t\tgitRecency: 1,\n\t},\n\tpath_like: {\n\t\tbm25: 1,\n\t\tcosine: 1,\n\t\tpathMatch: 3,\n\t\tsymbolMatch: 1,\n\t\timportGraph: 1,\n\t\tgitRecency: 1,\n\t},\n};\n\n// ============================================================================\n// Fitness smoothing constant (avoids division by zero)\n// ============================================================================\n\nconst EPSILON = 0.05;\n\n// ============================================================================\n// Pruning\n// ============================================================================\n\n/**\n * Per-metric top-K pruning → union of surviving candidate IDs.\n */\nfunction 
pruneTopK(candidates: Map<number, MetricScores>, topK: number): Set<number> {\n\tif (candidates.size <= topK) {\n\t\treturn new Set(candidates.keys());\n\t}\n\n\tconst union = new Set<number>();\n\n\tfor (const metric of METRIC_NAMES) {\n\t\tconst pairs: Array<[number, number]> = [];\n\t\tfor (const [id, scores] of candidates) {\n\t\t\tpairs.push([id, scores[metric] ?? 0]);\n\t\t}\n\t\tpairs.sort((a, b) => b[1] - a[1]);\n\t\tconst limit = Math.min(topK, pairs.length);\n\t\tfor (let i = 0; i < limit; i++) {\n\t\t\tunion.add(pairs[i][0]);\n\t\t}\n\t}\n\n\treturn union;\n}\n\n// ============================================================================\n// Dominance matrix computation\n// ============================================================================\n\n/**\n * Build the dominance count matrix using the TFPR approach.\n *\n * For each objective (metric), sorts candidates and accumulates pairwise\n * dominance: if candidate i ranks above candidate j on an objective,\n * dominanceCounts[i][j] increases by the column weight.\n *\n * Duplicate columns (from column duplication) are handled by multiplying\n * the contribution by the weight rather than re-sorting — same result,\n * no redundant work.\n *\n * Uses Uint16Array to keep memory compact (max possible value per cell\n * is the sum of all weights, which is ≤ 10).\n *\n * @param scores Dense array of MetricScores, indexed 0..n-1\n * @param weights Column weights from query type\n * @param topK Only top-K per objective contribute to pairwise dominance\n * @returns Flat dominance count matrix [n × n] and the total weight sum\n */\nfunction buildDominanceCounts(\n\tscores: MetricScores[],\n\tweights: ColumnWeights,\n\ttopK: number,\n): [counts: Uint16Array, totalWeight: number] {\n\tconst n = scores.length;\n\tconst counts = new Uint16Array(n * n);\n\tlet totalWeight = 0;\n\n\t// Reusable index array for sorting\n\tconst indices = new Array<number>(n);\n\tfor (let i = 0; i < n; i++) indices[i] = 
i;\n\n\tfor (const metric of METRIC_NAMES) {\n\t\tconst weight = weights[metric];\n\t\tif (weight === 0) continue;\n\t\ttotalWeight += weight;\n\n\t\t// Sort candidates by this metric (descending)\n\t\tconst sorted = indices.slice();\n\t\tsorted.sort((a, b) => (scores[b][metric] ?? 0) - (scores[a][metric] ?? 0));\n\n\t\t// Only consider top-K for pairwise dominance\n\t\tconst k = Math.min(topK, n);\n\n\t\t// For each pair in the top-K where i ranks above j:\n\t\t// i dominates j on this objective → add weight to counts[i, j]\n\t\tfor (let ri = 0; ri < k; ri++) {\n\t\t\tconst i = sorted[ri];\n\t\t\tconst iBase = i * n;\n\t\t\tfor (let rj = ri + 1; rj < k; rj++) {\n\t\t\t\tcounts[iBase + sorted[rj]] += weight;\n\t\t\t}\n\t\t}\n\t}\n\n\treturn [counts, totalWeight];\n}\n\n/**\n * Compute fitness scores from the dominance count matrix.\n *\n * Fitness = meanDominance × (numDominating + ε) / (numSubmitting + ε)\n *\n * Where:\n * - meanDominance = average normalized dominance across all other candidates\n * - numDominating = count of candidates this one dominates (>50% of objectives)\n * - numSubmitting = count of candidates this one fails to dominate (<50%)\n */\nfunction computeFitness(counts: Uint16Array, n: number, totalWeight: number): Float64Array {\n\tconst fitness = new Float64Array(n);\n\tconst threshold = totalWeight * 0.5;\n\n\tfor (let i = 0; i < n; i++) {\n\t\tlet sumDom = 0;\n\t\tlet numDominating = 0;\n\t\tlet numSubmitting = 0;\n\t\tconst iBase = i * n;\n\n\t\tfor (let j = 0; j < n; j++) {\n\t\t\tif (i === j) continue;\n\t\t\tconst count = counts[iBase + j];\n\t\t\tsumDom += count;\n\t\t\tif (count > threshold) numDominating++;\n\t\t\tif (count < threshold) numSubmitting++;\n\t\t}\n\n\t\tconst meanDom = n > 1 ? 
sumDom / ((n - 1) * totalWeight) : 0;\n\t\tfitness[i] = (meanDom * (numDominating + EPSILON)) / (numSubmitting + EPSILON);\n\t}\n\n\treturn fitness;\n}\n\n// ============================================================================\n// Public API\n// ============================================================================\n\n/**\n * Rank candidates using POEM / TFPR.\n *\n * @param candidates Map of candidateId → MetricScores (all values 0–1)\n * @param queryType Query type for column duplication weighting\n * @param topK Per-metric pruning limit (default: 1000)\n * @returns Candidates ordered best-first with assigned ranks (0 = best)\n */\nexport function poemRank(candidates: Map<number, MetricScores>, queryType: QueryType, topK = 1000): RankedCandidate[] {\n\tif (candidates.size === 0) return [];\n\n\t// 1. Prune: per-metric top-K → union\n\tconst surviving = pruneTopK(candidates, topK);\n\n\t// 2. Build dense arrays\n\tconst ids: number[] = [];\n\tconst scores: MetricScores[] = [];\n\n\tfor (const id of surviving) {\n\t\tconst s = candidates.get(id)!;\n\t\tids.push(id);\n\t\tscores.push({\n\t\t\tbm25: s.bm25 ?? 0,\n\t\t\tcosine: s.cosine ?? 0,\n\t\t\tpathMatch: s.pathMatch ?? 0,\n\t\t\tsymbolMatch: s.symbolMatch ?? 0,\n\t\t\timportGraph: s.importGraph ?? 0,\n\t\t\tgitRecency: s.gitRecency ?? 0,\n\t\t});\n\t}\n\n\tconst n = ids.length;\n\n\t// Single candidate → rank 0\n\tif (n === 1) {\n\t\treturn [{ id: ids[0], scores: scores[0], rank: 0 }];\n\t}\n\n\t// 3. Compute dominance matrix with column duplication\n\tconst weights = COLUMN_WEIGHTS[queryType];\n\tconst [counts, totalWeight] = buildDominanceCounts(scores, weights, topK);\n\n\t// 4. Compute fitness scores\n\tconst fitness = computeFitness(counts, n, totalWeight);\n\n\t// 5. 
Sort by fitness (descending) and assign ranks\n\tconst order = new Array<number>(n);\n\tfor (let i = 0; i < n; i++) order[i] = i;\n\torder.sort((a, b) => fitness[b] - fitness[a]);\n\n\treturn order.map((idx, rank) => ({\n\t\tid: ids[idx],\n\t\tscores: scores[idx],\n\t\trank,\n\t}));\n}\n"]}
|
package/dist/core/search/poem.js
DELETED
|
@@ -1,214 +0,0 @@
|
|
|
1
|
-
/**
|
|
2
|
-
* POEM — Pareto-Optimal Embedded Modeling, multi-metric ranking.
|
|
3
|
-
*
|
|
4
|
-
* Ranks search candidates across multiple relevance metrics without requiring
|
|
5
|
-
* hand-tuned weights. Uses the TFPR (Top-Fraction Pareto Ranker) approach:
|
|
6
|
-
* vectorized dominance matrix computation with column duplication for
|
|
7
|
-
* query-type-dependent metric weighting.
|
|
8
|
-
*
|
|
9
|
-
* Algorithm:
|
|
10
|
-
* 1. Prune: per-metric top-K → union of survivors
|
|
11
|
-
* 2. Build objectives matrix with column duplication for query-type weighting
|
|
12
|
-
* 3. For each objective column, sort candidates and accumulate pairwise
|
|
13
|
-
* dominance counts (duplicate columns contribute via weight multiplier)
|
|
14
|
-
* 4. Compute fitness: meanDominance × (numDominating + ε) / (numSubmitting + ε)
|
|
15
|
-
* 5. Sort by fitness, assign ranks
|
|
16
|
-
*
|
|
17
|
-
* References:
|
|
18
|
-
* - POEM paper: https://iopscience.iop.org/article/10.1088/2632-2153/ab891b
|
|
19
|
-
* - TFPR: https://github.com/merckgroup/aidd_tfpr
|
|
20
|
-
* - colourdle: https://github.com/aebrer/colourdle
|
|
21
|
-
*/
|
|
22
|
-
import { METRIC_NAMES } from "./types.js";
|
|
23
|
-
// Baseline: every metric column appears exactly once in the objectives matrix.
const BASE_COLUMN_WEIGHTS = {
    bm25: 1,
    cosine: 1,
    pathMatch: 1,
    symbolMatch: 1,
    importGraph: 1,
    gitRecency: 1,
};

// Column duplication per query type: how many times each metric column is
// counted. Each entry overrides the baseline only for the emphasised metrics.
const COLUMN_WEIGHTS = {
    identifier: { ...BASE_COLUMN_WEIGHTS, bm25: 2, symbolMatch: 2 },
    natural_language: { ...BASE_COLUMN_WEIGHTS, cosine: 2 },
    path_like: { ...BASE_COLUMN_WEIGHTS, pathMatch: 3 },
};

// ============================================================================
// Fitness smoothing constant (avoids division by zero)
// ============================================================================
const EPSILON = 0.05;
|
|
53
|
-
// ============================================================================
|
|
54
|
-
// Pruning
|
|
55
|
-
// ============================================================================
|
|
56
|
-
/**
 * Prune the candidate pool: keep every candidate that is among the `topK`
 * best on at least one metric (union of the per-metric top-K lists).
 *
 * When the pool already holds `topK` candidates or fewer, all ids are kept.
 * A metric missing from a candidate's scores counts as 0.
 *
 * @param candidates Map of candidate id → metric scores
 * @param topK Maximum number of candidates kept per metric
 * @returns Set of surviving candidate ids
 */
function pruneTopK(candidates, topK) {
    if (candidates.size <= topK) {
        return new Set(candidates.keys());
    }
    const survivors = new Set();
    const entries = Array.from(candidates);
    for (const metric of METRIC_NAMES) {
        // [id, score] pairs for this metric, best score first.
        const ranked = entries
            .map(([id, scores]) => [id, scores[metric] ?? 0])
            .sort((left, right) => right[1] - left[1]);
        const limit = Math.min(topK, ranked.length);
        for (let pos = 0; pos < limit; pos++) {
            survivors.add(ranked[pos][0]);
        }
    }
    return survivors;
}
|
|
77
|
-
// ============================================================================
|
|
78
|
-
// Dominance matrix computation
|
|
79
|
-
// ============================================================================
|
|
80
|
-
/**
 * Build the pairwise dominance count matrix using the TFPR approach.
 *
 * For every metric with a non-zero weight, candidates are ordered by that
 * metric (best first). Within the first `min(topK, n)` positions, candidate
 * i is credited `weight` against candidate j when i scores strictly higher
 * than j on that metric.
 *
 * Tied scores credit neither side. Crediting ties by sort position would
 * make the earlier-indexed candidate "dominate" every candidate it ties
 * with, which biases the ranking by input order on sparse metrics where
 * most scores are identical.
 *
 * Duplicate columns (column duplication) are realised by adding the weight
 * once instead of re-sorting the same column several times.
 *
 * Cells are Uint16; the largest value a cell can reach is the sum of all
 * weights.
 *
 * @param scores Dense array of metric scores, indexed 0..n-1
 * @param weights Column weights for the current query type
 * @param topK Only the top-K candidates per metric take part in pairwise dominance
 * @returns `[counts, totalWeight]`, where `counts` is a flat row-major n × n
 *          matrix (`counts[i * n + j]` = weight with which i dominates j)
 *          and `totalWeight` is the sum of the weights that were applied
 */
function buildDominanceCounts(scores, weights, topK) {
    const n = scores.length;
    const counts = new Uint16Array(n * n);
    let totalWeight = 0;
    // Identity permutation, copied and re-sorted for each metric.
    const indices = Array.from({ length: n }, (_, i) => i);
    // Number of leading positions considered per metric (loop-invariant).
    const k = Math.min(topK, n);
    for (const metric of METRIC_NAMES) {
        const weight = weights[metric];
        if (weight === 0)
            continue;
        totalWeight += weight;
        // Score of every candidate on this metric; a missing score counts as 0.
        const values = Float64Array.from(scores, (s) => s[metric] ?? 0);
        // Candidate indices ordered by this metric, descending.
        const sorted = indices.slice().sort((a, b) => values[b] - values[a]);
        for (let ri = 0; ri < k; ri++) {
            const i = sorted[ri];
            const iValue = values[i];
            const iBase = i * n;
            for (let rj = ri + 1; rj < k; rj++) {
                const j = sorted[rj];
                // Strict comparison: equal scores are not dominance.
                if (values[j] < iValue) {
                    counts[iBase + j] += weight;
                }
            }
        }
    }
    return [counts, totalWeight];
}
|
|
129
|
-
/**
 * Compute one fitness score per candidate from the dominance count matrix.
 *
 * fitness = meanDominance × (numDominating + ε) / (numSubmitting + ε)
 *
 * Where, for candidate i compared against every other candidate j:
 * - meanDominance = sum of counts[i, j] divided by ((n - 1) × totalWeight)
 * - numDominating = number of j with counts[i, j] above half of totalWeight
 * - numSubmitting = number of j with counts[i, j] below half of totalWeight
 *
 * When the normaliser is zero (a single candidate, or no metric carried any
 * weight) meanDominance is defined as 0, so the result never contains NaN.
 *
 * @param counts Flat row-major n × n dominance count matrix
 * @param n Number of candidates
 * @param totalWeight Sum of the column weights that were accumulated
 * @returns Fitness per candidate, indexed like the matrix rows
 */
function computeFitness(counts, n, totalWeight) {
    const fitness = new Float64Array(n);
    const threshold = totalWeight * 0.5;
    // Largest attainable row sum; zero when n <= 1 or totalWeight is 0.
    const normaliser = (n - 1) * totalWeight;
    for (let i = 0; i < n; i++) {
        let sumDom = 0;
        let numDominating = 0;
        let numSubmitting = 0;
        const iBase = i * n;
        for (let j = 0; j < n; j++) {
            if (i === j)
                continue;
            const count = counts[iBase + j];
            sumDom += count;
            if (count > threshold)
                numDominating++;
            if (count < threshold)
                numSubmitting++;
        }
        // Guard the division: 0 / 0 would otherwise produce NaN fitness.
        const meanDom = normaliser > 0 ? sumDom / normaliser : 0;
        fitness[i] = (meanDom * (numDominating + EPSILON)) / (numSubmitting + EPSILON);
    }
    return fitness;
}
|
|
162
|
-
// ============================================================================
|
|
163
|
-
// Public API
|
|
164
|
-
// ============================================================================
|
|
165
|
-
/**
 * Rank candidates using POEM / TFPR.
 *
 * Steps: prune to the union of the per-metric top-K, copy the survivors
 * into dense arrays (missing metrics become 0), accumulate weighted
 * pairwise dominance, turn it into fitness, and order by fitness.
 *
 * @param candidates Map of candidate id → metric scores (all values 0–1)
 * @param queryType Query type selecting the column-duplication weights
 * @param topK Per-metric pruning limit (default: 1000)
 * @returns Candidates ordered best-first with assigned ranks (0 = best)
 */
export function poemRank(candidates, queryType, topK = 1000) {
    if (candidates.size === 0) {
        return [];
    }
    // Dense, index-aligned views of the candidates that survive pruning.
    const ids = [];
    const scores = [];
    for (const id of pruneTopK(candidates, topK)) {
        const raw = candidates.get(id);
        ids.push(id);
        scores.push({
            bm25: raw.bm25 ?? 0,
            cosine: raw.cosine ?? 0,
            pathMatch: raw.pathMatch ?? 0,
            symbolMatch: raw.symbolMatch ?? 0,
            importGraph: raw.importGraph ?? 0,
            gitRecency: raw.gitRecency ?? 0,
        });
    }
    const n = ids.length;
    // A lone survivor needs no comparison.
    if (n === 1) {
        return [{ id: ids[0], scores: scores[0], rank: 0 }];
    }
    // Weighted dominance matrix, then per-candidate fitness.
    const [counts, totalWeight] = buildDominanceCounts(scores, COLUMN_WEIGHTS[queryType], topK);
    const fitness = computeFitness(counts, n, totalWeight);
    // Dense indices 0..n-1, highest fitness first.
    const order = Array.from(ids.keys()).sort((a, b) => fitness[b] - fitness[a]);
    return order.map((idx, rank) => ({
        id: ids[idx],
        scores: scores[idx],
        rank,
    }));
}
|
|
214
|
-
//# sourceMappingURL=poem.js.map
|
|
@@ -1 +0,0 @@
|
|
|
1
|
-
{"version":3,"file":"poem.js","sourceRoot":"","sources":["../../../src/core/search/poem.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;;GAoBG;AAGH,OAAO,EAAE,YAAY,EAAsC,MAAM,YAAY,CAAC;AAmB9E,MAAM,cAAc,GAAqC;IACxD,UAAU,EAAE;QACX,IAAI,EAAE,CAAC;QACP,MAAM,EAAE,CAAC;QACT,SAAS,EAAE,CAAC;QACZ,WAAW,EAAE,CAAC;QACd,WAAW,EAAE,CAAC;QACd,UAAU,EAAE,CAAC;KACb;IACD,gBAAgB,EAAE;QACjB,IAAI,EAAE,CAAC;QACP,MAAM,EAAE,CAAC;QACT,SAAS,EAAE,CAAC;QACZ,WAAW,EAAE,CAAC;QACd,WAAW,EAAE,CAAC;QACd,UAAU,EAAE,CAAC;KACb;IACD,SAAS,EAAE;QACV,IAAI,EAAE,CAAC;QACP,MAAM,EAAE,CAAC;QACT,SAAS,EAAE,CAAC;QACZ,WAAW,EAAE,CAAC;QACd,WAAW,EAAE,CAAC;QACd,UAAU,EAAE,CAAC;KACb;CACD,CAAC;AAEF,+EAA+E;AAC/E,uDAAuD;AACvD,+EAA+E;AAE/E,MAAM,OAAO,GAAG,IAAI,CAAC;AAErB,+EAA+E;AAC/E,UAAU;AACV,+EAA+E;AAE/E;;GAEG;AACH,SAAS,SAAS,CAAC,UAAqC,EAAE,IAAY,EAAe;IACpF,IAAI,UAAU,CAAC,IAAI,IAAI,IAAI,EAAE,CAAC;QAC7B,OAAO,IAAI,GAAG,CAAC,UAAU,CAAC,IAAI,EAAE,CAAC,CAAC;IACnC,CAAC;IAED,MAAM,KAAK,GAAG,IAAI,GAAG,EAAU,CAAC;IAEhC,KAAK,MAAM,MAAM,IAAI,YAAY,EAAE,CAAC;QACnC,MAAM,KAAK,GAA4B,EAAE,CAAC;QAC1C,KAAK,MAAM,CAAC,EAAE,EAAE,MAAM,CAAC,IAAI,UAAU,EAAE,CAAC;YACvC,KAAK,CAAC,IAAI,CAAC,CAAC,EAAE,EAAE,MAAM,CAAC,MAAM,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;QACvC,CAAC;QACD,KAAK,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC;QAClC,MAAM,KAAK,GAAG,IAAI,CAAC,GAAG,CAAC,IAAI,EAAE,KAAK,CAAC,MAAM,CAAC,CAAC;QAC3C,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,KAAK,EAAE,CAAC,EAAE,EAAE,CAAC;YAChC,KAAK,CAAC,GAAG,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC;QACxB,CAAC;IACF,CAAC;IAED,OAAO,KAAK,CAAC;AAAA,CACb;AAED,+EAA+E;AAC/E,+BAA+B;AAC/B,+EAA+E;AAE/E;;;;;;;;;;;;;;;;;;GAkBG;AACH,SAAS,oBAAoB,CAC5B,MAAsB,EACtB,OAAsB,EACtB,IAAY,EACiC;IAC7C,MAAM,CAAC,GAAG,MAAM,CAAC,MAAM,CAAC;IACxB,MAAM,MAAM,GAAG,IAAI,WAAW,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC;IACtC,IAAI,WAAW,GAAG,CAAC,CAAC;IAEpB,mCAAmC;IACnC,MAAM,OAAO,GAAG,IAAI,KAAK,CAAS,CAAC,CAAC,CAAC;IACrC,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,CAAC,EAAE,CAAC,EAAE;QAAE,OAAO,CAAC,CAAC,CAAC,GAAG,CAAC,CA
AC;IAE3C,KAAK,MAAM,MAAM,IAAI,YAAY,EAAE,CAAC;QACnC,MAAM,MAAM,GAAG,OAAO,CAAC,MAAM,CAAC,CAAC;QAC/B,IAAI,MAAM,KAAK,CAAC;YAAE,SAAS;QAC3B,WAAW,IAAI,MAAM,CAAC;QAEtB,8CAA8C;QAC9C,MAAM,MAAM,GAAG,OAAO,CAAC,KAAK,EAAE,CAAC;QAC/B,MAAM,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,CAAC,MAAM,CAAC,IAAI,CAAC,CAAC,GAAG,CAAC,MAAM,CAAC,CAAC,CAAC,CAAC,MAAM,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;QAE3E,6CAA6C;QAC7C,MAAM,CAAC,GAAG,IAAI,CAAC,GAAG,CAAC,IAAI,EAAE,CAAC,CAAC,CAAC;QAE5B,oDAAoD;QACpD,iEAA+D;QAC/D,KAAK,IAAI,EAAE,GAAG,CAAC,EAAE,EAAE,GAAG,CAAC,EAAE,EAAE,EAAE,EAAE,CAAC;YAC/B,MAAM,CAAC,GAAG,MAAM,CAAC,EAAE,CAAC,CAAC;YACrB,MAAM,KAAK,GAAG,CAAC,GAAG,CAAC,CAAC;YACpB,KAAK,IAAI,EAAE,GAAG,EAAE,GAAG,CAAC,EAAE,EAAE,GAAG,CAAC,EAAE,EAAE,EAAE,EAAE,CAAC;gBACpC,MAAM,CAAC,KAAK,GAAG,MAAM,CAAC,EAAE,CAAC,CAAC,IAAI,MAAM,CAAC;YACtC,CAAC;QACF,CAAC;IACF,CAAC;IAED,OAAO,CAAC,MAAM,EAAE,WAAW,CAAC,CAAC;AAAA,CAC7B;AAED;;;;;;;;;GASG;AACH,SAAS,cAAc,CAAC,MAAmB,EAAE,CAAS,EAAE,WAAmB,EAAgB;IAC1F,MAAM,OAAO,GAAG,IAAI,YAAY,CAAC,CAAC,CAAC,CAAC;IACpC,MAAM,SAAS,GAAG,WAAW,GAAG,GAAG,CAAC;IAEpC,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC;QAC5B,IAAI,MAAM,GAAG,CAAC,CAAC;QACf,IAAI,aAAa,GAAG,CAAC,CAAC;QACtB,IAAI,aAAa,GAAG,CAAC,CAAC;QACtB,MAAM,KAAK,GAAG,CAAC,GAAG,CAAC,CAAC;QAEpB,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC;YAC5B,IAAI,CAAC,KAAK,CAAC;gBAAE,SAAS;YACtB,MAAM,KAAK,GAAG,MAAM,CAAC,KAAK,GAAG,CAAC,CAAC,CAAC;YAChC,MAAM,IAAI,KAAK,CAAC;YAChB,IAAI,KAAK,GAAG,SAAS;gBAAE,aAAa,EAAE,CAAC;YACvC,IAAI,KAAK,GAAG,SAAS;gBAAE,aAAa,EAAE,CAAC;QACxC,CAAC;QAED,MAAM,OAAO,GAAG,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,GAAG,WAAW,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC;QAC7D,OAAO,CAAC,CAAC,CAAC,GAAG,CAAC,OAAO,GAAG,CAAC,aAAa,GAAG,OAAO,CAAC,CAAC,GAAG,CAAC,aAAa,GAAG,OAAO,CAAC,CAAC;IAChF,CAAC;IAED,OAAO,OAAO,CAAC;AAAA,CACf;AAED,+EAA+E;AAC/E,aAAa;AACb,+EAA+E;AAE/E;;;;;;;GAOG;AACH,MAAM,UAAU,QAAQ,CAAC,UAAqC,EAAE,SAAoB,EAAE,IAAI,GAAG,IAAI,EAAqB;IACrH,IAAI,UAAU,CAAC,IAAI,
KAAK,CAAC;QAAE,OAAO,EAAE,CAAC;IAErC,uCAAqC;IACrC,MAAM,SAAS,GAAG,SAAS,CAAC,UAAU,EAAE,IAAI,CAAC,CAAC;IAE9C,wBAAwB;IACxB,MAAM,GAAG,GAAa,EAAE,CAAC;IACzB,MAAM,MAAM,GAAmB,EAAE,CAAC;IAElC,KAAK,MAAM,EAAE,IAAI,SAAS,EAAE,CAAC;QAC5B,MAAM,CAAC,GAAG,UAAU,CAAC,GAAG,CAAC,EAAE,CAAE,CAAC;QAC9B,GAAG,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC;QACb,MAAM,CAAC,IAAI,CAAC;YACX,IAAI,EAAE,CAAC,CAAC,IAAI,IAAI,CAAC;YACjB,MAAM,EAAE,CAAC,CAAC,MAAM,IAAI,CAAC;YACrB,SAAS,EAAE,CAAC,CAAC,SAAS,IAAI,CAAC;YAC3B,WAAW,EAAE,CAAC,CAAC,WAAW,IAAI,CAAC;YAC/B,WAAW,EAAE,CAAC,CAAC,WAAW,IAAI,CAAC;YAC/B,UAAU,EAAE,CAAC,CAAC,UAAU,IAAI,CAAC;SAC7B,CAAC,CAAC;IACJ,CAAC;IAED,MAAM,CAAC,GAAG,GAAG,CAAC,MAAM,CAAC;IAErB,8BAA4B;IAC5B,IAAI,CAAC,KAAK,CAAC,EAAE,CAAC;QACb,OAAO,CAAC,EAAE,EAAE,EAAE,GAAG,CAAC,CAAC,CAAC,EAAE,MAAM,EAAE,MAAM,CAAC,CAAC,CAAC,EAAE,IAAI,EAAE,CAAC,EAAE,CAAC,CAAC;IACrD,CAAC;IAED,sDAAsD;IACtD,MAAM,OAAO,GAAG,cAAc,CAAC,SAAS,CAAC,CAAC;IAC1C,MAAM,CAAC,MAAM,EAAE,WAAW,CAAC,GAAG,oBAAoB,CAAC,MAAM,EAAE,OAAO,EAAE,IAAI,CAAC,CAAC;IAE1E,4BAA4B;IAC5B,MAAM,OAAO,GAAG,cAAc,CAAC,MAAM,EAAE,CAAC,EAAE,WAAW,CAAC,CAAC;IAEvD,mDAAmD;IACnD,MAAM,KAAK,GAAG,IAAI,KAAK,CAAS,CAAC,CAAC,CAAC;IACnC,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,CAAC,EAAE,CAAC,EAAE;QAAE,KAAK,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC;IACzC,KAAK,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,OAAO,CAAC,CAAC,CAAC,GAAG,OAAO,CAAC,CAAC,CAAC,CAAC,CAAC;IAE9C,OAAO,KAAK,CAAC,GAAG,CAAC,CAAC,GAAG,EAAE,IAAI,EAAE,EAAE,CAAC,CAAC;QAChC,EAAE,EAAE,GAAG,CAAC,GAAG,CAAC;QACZ,MAAM,EAAE,MAAM,CAAC,GAAG,CAAC;QACnB,IAAI;KACJ,CAAC,CAAC,CAAC;AAAA,CACJ","sourcesContent":["/**\n * POEM — Pareto-Optimal Embedded Modeling, multi-metric ranking.\n *\n * Ranks search candidates across multiple relevance metrics without requiring\n * hand-tuned weights. Uses the TFPR (Top-Fraction Pareto Ranker) approach:\n * vectorized dominance matrix computation with column duplication for\n * query-type-dependent metric weighting.\n *\n * Algorithm:\n * 1. Prune: per-metric top-K → union of survivors\n * 2. 
Build objectives matrix with column duplication for query-type weighting\n * 3. For each objective column, sort candidates and accumulate pairwise\n * dominance counts (duplicate columns contribute via weight multiplier)\n * 4. Compute fitness: meanDominance × (numDominating + ε) / (numSubmitting + ε)\n * 5. Sort by fitness, assign ranks\n *\n * References:\n * - POEM paper: https://iopscience.iop.org/article/10.1088/2632-2153/ab891b\n * - TFPR: https://github.com/merckgroup/aidd_tfpr\n * - colourdle: https://github.com/aebrer/colourdle\n */\n\nimport type { QueryType } from \"./query-classifier.js\";\nimport { METRIC_NAMES, type MetricName, type MetricScores } from \"./types.js\";\n\n// ============================================================================\n// Public types\n// ============================================================================\n\nexport interface RankedCandidate {\n\tid: number;\n\tscores: MetricScores;\n\trank: number;\n}\n\n// ============================================================================\n// Column duplication config per query type\n// ============================================================================\n\n/** How many times each metric column appears in the objectives matrix. 
*/\ntype ColumnWeights = Record<MetricName, number>;\n\nconst COLUMN_WEIGHTS: Record<QueryType, ColumnWeights> = {\n\tidentifier: {\n\t\tbm25: 2,\n\t\tcosine: 1,\n\t\tpathMatch: 1,\n\t\tsymbolMatch: 2,\n\t\timportGraph: 1,\n\t\tgitRecency: 1,\n\t},\n\tnatural_language: {\n\t\tbm25: 1,\n\t\tcosine: 2,\n\t\tpathMatch: 1,\n\t\tsymbolMatch: 1,\n\t\timportGraph: 1,\n\t\tgitRecency: 1,\n\t},\n\tpath_like: {\n\t\tbm25: 1,\n\t\tcosine: 1,\n\t\tpathMatch: 3,\n\t\tsymbolMatch: 1,\n\t\timportGraph: 1,\n\t\tgitRecency: 1,\n\t},\n};\n\n// ============================================================================\n// Fitness smoothing constant (avoids division by zero)\n// ============================================================================\n\nconst EPSILON = 0.05;\n\n// ============================================================================\n// Pruning\n// ============================================================================\n\n/**\n * Per-metric top-K pruning → union of surviving candidate IDs.\n */\nfunction pruneTopK(candidates: Map<number, MetricScores>, topK: number): Set<number> {\n\tif (candidates.size <= topK) {\n\t\treturn new Set(candidates.keys());\n\t}\n\n\tconst union = new Set<number>();\n\n\tfor (const metric of METRIC_NAMES) {\n\t\tconst pairs: Array<[number, number]> = [];\n\t\tfor (const [id, scores] of candidates) {\n\t\t\tpairs.push([id, scores[metric] ?? 
0]);\n\t\t}\n\t\tpairs.sort((a, b) => b[1] - a[1]);\n\t\tconst limit = Math.min(topK, pairs.length);\n\t\tfor (let i = 0; i < limit; i++) {\n\t\t\tunion.add(pairs[i][0]);\n\t\t}\n\t}\n\n\treturn union;\n}\n\n// ============================================================================\n// Dominance matrix computation\n// ============================================================================\n\n/**\n * Build the dominance count matrix using the TFPR approach.\n *\n * For each objective (metric), sorts candidates and accumulates pairwise\n * dominance: if candidate i ranks above candidate j on an objective,\n * dominanceCounts[i][j] increases by the column weight.\n *\n * Duplicate columns (from column duplication) are handled by multiplying\n * the contribution by the weight rather than re-sorting — same result,\n * no redundant work.\n *\n * Uses Uint16Array to keep memory compact (max possible value per cell\n * is the sum of all weights, which is ≤ 10).\n *\n * @param scores Dense array of MetricScores, indexed 0..n-1\n * @param weights Column weights from query type\n * @param topK Only top-K per objective contribute to pairwise dominance\n * @returns Flat dominance count matrix [n × n] and the total weight sum\n */\nfunction buildDominanceCounts(\n\tscores: MetricScores[],\n\tweights: ColumnWeights,\n\ttopK: number,\n): [counts: Uint16Array, totalWeight: number] {\n\tconst n = scores.length;\n\tconst counts = new Uint16Array(n * n);\n\tlet totalWeight = 0;\n\n\t// Reusable index array for sorting\n\tconst indices = new Array<number>(n);\n\tfor (let i = 0; i < n; i++) indices[i] = i;\n\n\tfor (const metric of METRIC_NAMES) {\n\t\tconst weight = weights[metric];\n\t\tif (weight === 0) continue;\n\t\ttotalWeight += weight;\n\n\t\t// Sort candidates by this metric (descending)\n\t\tconst sorted = indices.slice();\n\t\tsorted.sort((a, b) => (scores[b][metric] ?? 0) - (scores[a][metric] ?? 
0));\n\n\t\t// Only consider top-K for pairwise dominance\n\t\tconst k = Math.min(topK, n);\n\n\t\t// For each pair in the top-K where i ranks above j:\n\t\t// i dominates j on this objective → add weight to counts[i, j]\n\t\tfor (let ri = 0; ri < k; ri++) {\n\t\t\tconst i = sorted[ri];\n\t\t\tconst iBase = i * n;\n\t\t\tfor (let rj = ri + 1; rj < k; rj++) {\n\t\t\t\tcounts[iBase + sorted[rj]] += weight;\n\t\t\t}\n\t\t}\n\t}\n\n\treturn [counts, totalWeight];\n}\n\n/**\n * Compute fitness scores from the dominance count matrix.\n *\n * Fitness = meanDominance × (numDominating + ε) / (numSubmitting + ε)\n *\n * Where:\n * - meanDominance = average normalized dominance across all other candidates\n * - numDominating = count of candidates this one dominates (>50% of objectives)\n * - numSubmitting = count of candidates this one fails to dominate (<50%)\n */\nfunction computeFitness(counts: Uint16Array, n: number, totalWeight: number): Float64Array {\n\tconst fitness = new Float64Array(n);\n\tconst threshold = totalWeight * 0.5;\n\n\tfor (let i = 0; i < n; i++) {\n\t\tlet sumDom = 0;\n\t\tlet numDominating = 0;\n\t\tlet numSubmitting = 0;\n\t\tconst iBase = i * n;\n\n\t\tfor (let j = 0; j < n; j++) {\n\t\t\tif (i === j) continue;\n\t\t\tconst count = counts[iBase + j];\n\t\t\tsumDom += count;\n\t\t\tif (count > threshold) numDominating++;\n\t\t\tif (count < threshold) numSubmitting++;\n\t\t}\n\n\t\tconst meanDom = n > 1 ? 
sumDom / ((n - 1) * totalWeight) : 0;\n\t\tfitness[i] = (meanDom * (numDominating + EPSILON)) / (numSubmitting + EPSILON);\n\t}\n\n\treturn fitness;\n}\n\n// ============================================================================\n// Public API\n// ============================================================================\n\n/**\n * Rank candidates using POEM / TFPR.\n *\n * @param candidates Map of candidateId → MetricScores (all values 0–1)\n * @param queryType Query type for column duplication weighting\n * @param topK Per-metric pruning limit (default: 1000)\n * @returns Candidates ordered best-first with assigned ranks (0 = best)\n */\nexport function poemRank(candidates: Map<number, MetricScores>, queryType: QueryType, topK = 1000): RankedCandidate[] {\n\tif (candidates.size === 0) return [];\n\n\t// 1. Prune: per-metric top-K → union\n\tconst surviving = pruneTopK(candidates, topK);\n\n\t// 2. Build dense arrays\n\tconst ids: number[] = [];\n\tconst scores: MetricScores[] = [];\n\n\tfor (const id of surviving) {\n\t\tconst s = candidates.get(id)!;\n\t\tids.push(id);\n\t\tscores.push({\n\t\t\tbm25: s.bm25 ?? 0,\n\t\t\tcosine: s.cosine ?? 0,\n\t\t\tpathMatch: s.pathMatch ?? 0,\n\t\t\tsymbolMatch: s.symbolMatch ?? 0,\n\t\t\timportGraph: s.importGraph ?? 0,\n\t\t\tgitRecency: s.gitRecency ?? 0,\n\t\t});\n\t}\n\n\tconst n = ids.length;\n\n\t// Single candidate → rank 0\n\tif (n === 1) {\n\t\treturn [{ id: ids[0], scores: scores[0], rank: 0 }];\n\t}\n\n\t// 3. Compute dominance matrix with column duplication\n\tconst weights = COLUMN_WEIGHTS[queryType];\n\tconst [counts, totalWeight] = buildDominanceCounts(scores, weights, topK);\n\n\t// 4. Compute fitness scores\n\tconst fitness = computeFitness(counts, n, totalWeight);\n\n\t// 5. 
Sort by fitness (descending) and assign ranks\n\tconst order = new Array<number>(n);\n\tfor (let i = 0; i < n; i++) order[i] = i;\n\torder.sort((a, b) => fitness[b] - fitness[a]);\n\n\treturn order.map((idx, rank) => ({\n\t\tid: ids[idx],\n\t\tscores: scores[idx],\n\t\trank,\n\t}));\n}\n"]}
|
|
@@ -1,17 +0,0 @@
|
|
|
1
|
-
/**
|
|
2
|
-
* Classify search queries into types for POEM column weighting.
|
|
3
|
-
*
|
|
4
|
-
* Query types affect how metric columns are duplicated during ranking:
|
|
5
|
-
* - identifier: emphasise BM25 and symbol-match scores
|
|
6
|
-
* - path_like: emphasise path-match scores
|
|
7
|
-
* - natural_language: emphasise cosine similarity scores
|
|
8
|
-
*/
|
|
9
|
-
/** Category assigned to a search query by `classifyQuery`; it selects which metric columns POEM ranking emphasises. */
export type QueryType = "identifier" | "natural_language" | "path_like";
|
|
10
|
-
/**
 * Pick the POEM column-weighting category for a search query.
 *
 * A blank (empty or whitespace-only) query is reported as
 * `"natural_language"`.
 *
 * @param query Raw user query string
 * @returns The detected query type
 */
export declare function classifyQuery(query: string): QueryType;
|
|
17
|
-
//# sourceMappingURL=query-classifier.d.ts.map
|
|
@@ -1 +0,0 @@
|
|
|
1
|
-
{"version":3,"file":"query-classifier.d.ts","sourceRoot":"","sources":["../../../src/core/search/query-classifier.ts"],"names":[],"mappings":"AAAA;;;;;;;GAOG;AAEH,MAAM,MAAM,SAAS,GAAG,YAAY,GAAG,kBAAkB,GAAG,WAAW,CAAC;AAoBxE;;;;;GAKG;AACH,wBAAgB,aAAa,CAAC,KAAK,EAAE,MAAM,GAAG,SAAS,CAyBtD","sourcesContent":["/**\n * Classify search queries into types for POEM column weighting.\n *\n * Query types affect how metric columns are duplicated during ranking:\n * - identifier: emphasise BM25 and symbol-match scores\n * - path_like: emphasise path-match scores\n * - natural_language: emphasise cosine similarity scores\n */\n\nexport type QueryType = \"identifier\" | \"natural_language\" | \"path_like\";\n\n/** Matches camelCase or PascalCase boundaries (lowercase→uppercase). */\nconst CAMEL_RE = /[a-z][A-Z]/;\n\n/** Matches snake_case — word chars around an underscore. */\nconst SNAKE_RE = /\\w+_\\w+/;\n\n/** SCREAMING_SNAKE_CASE — two or more uppercase-letter groups joined by underscores. */\nconst SCREAMING_SNAKE_RE = /^[A-Z][A-Z0-9]*(?:_[A-Z0-9]+)+$/;\n\n/** Path separators. */\nconst PATH_SEP_RE = /[/\\\\]/;\n\n/** Dotted path like `foo.bar.baz` (3+ segments). */\nconst DOTTED_PATH_RE = /^\\w+\\.\\w+\\.\\w+/;\n\n/** File extension pattern — ends with `.ext` where ext is 1-5 alphanumeric chars. */\nconst FILE_EXT_RE = /\\.\\w{1,5}$/;\n\n/**\n * Classify a search query to guide POEM column weighting.\n *\n * @param query Raw user query string\n * @returns The detected query type\n */\nexport function classifyQuery(query: string): QueryType {\n\tconst trimmed = query.trim();\n\tif (trimmed.length === 0) return \"natural_language\";\n\n\t// --- path_like ---\n\tif (PATH_SEP_RE.test(trimmed)) return \"path_like\";\n\tif (DOTTED_PATH_RE.test(trimmed)) return \"path_like\";\n\t// File extension at end of a single token (e.g. 
\"config.yaml\", \"auth.ts\")\n\tconst words = trimmed.split(/\\s+/);\n\tif (words.length === 1 && FILE_EXT_RE.test(trimmed) && /\\./.test(trimmed)) {\n\t\treturn \"path_like\";\n\t}\n\n\t// --- identifier ---\n\t// Single token or short (≤3 words) with code-style naming\n\tif (words.length === 1) return \"identifier\";\n\tif (words.length <= 3) {\n\t\t// If any word looks like a code identifier, classify as identifier\n\t\tif (words.some((w) => CAMEL_RE.test(w) || SNAKE_RE.test(w) || SCREAMING_SNAKE_RE.test(w))) {\n\t\t\treturn \"identifier\";\n\t\t}\n\t}\n\n\t// --- natural_language ---\n\treturn \"natural_language\";\n}\n"]}
|
|
@@ -1,54 +0,0 @@
|
|
|
1
|
-
/**
|
|
2
|
-
* Classify search queries into types for POEM column weighting.
|
|
3
|
-
*
|
|
4
|
-
* Query types affect how metric columns are duplicated during ranking:
|
|
5
|
-
* - identifier: emphasise BM25 and symbol-match scores
|
|
6
|
-
* - path_like: emphasise path-match scores
|
|
7
|
-
* - natural_language: emphasise cosine similarity scores
|
|
8
|
-
*/
|
|
9
|
-
// --- Path-shaped queries ----------------------------------------------------

/** Any forward slash or backslash, anywhere in the query. */
const PATH_SEP_RE = /[/\\]/;

/**
 * Query begins with at least three dot-separated word segments, e.g.
 * `foo.bar.baz`. Anchored at the start only, so trailing text is allowed.
 */
const DOTTED_PATH_RE = /^\w+\.\w+\.\w+/;

/**
 * Query ends with `.ext`, where `ext` is 1-5 word characters
 * (letters, digits or underscore).
 */
const FILE_EXT_RE = /\.\w{1,5}$/;

// --- Identifier-shaped tokens -------------------------------------------------

/** A lowercase letter immediately followed by an uppercase one (camelCase / PascalCase hump). */
const CAMEL_RE = /[a-z][A-Z]/;

/** An underscore with at least one word character on each side (snake_case). */
const SNAKE_RE = /\w+_\w+/;

/**
 * Whole token is SCREAMING_SNAKE_CASE: starts with an uppercase letter and
 * consists of uppercase letters / digits in two or more groups joined by `_`.
 */
const SCREAMING_SNAKE_RE = /^[A-Z][A-Z0-9]*(?:_[A-Z0-9]+)+$/;
|
|
21
|
-
/**
 * Decide which POEM column-weighting category a search query belongs to.
 *
 * Rules are evaluated in order and the first match wins:
 *  1. blank query                                          -> "natural_language"
 *  2. contains `/` or `\`, starts with a three-segment
 *     dotted name, or is one token ending in `.ext`        -> "path_like"
 *  3. any other single token                               -> "identifier"
 *  4. two or three tokens, at least one code-styled
 *     (camelCase, snake_case, SCREAMING_SNAKE_CASE)        -> "identifier"
 *  5. anything else                                        -> "natural_language"
 *
 * @param query Raw user query string
 * @returns The detected query type
 */
export function classifyQuery(query) {
    const text = query.trim();
    if (text === "") {
        return "natural_language";
    }

    const tokens = text.split(/\s+/);
    const isSingleToken = tokens.length === 1;

    // Path checks look at the whole query; only the file-extension rule is
    // restricted to single-token queries (e.g. "config.yaml", "auth.ts").
    const looksLikePath =
        PATH_SEP_RE.test(text) ||
        DOTTED_PATH_RE.test(text) ||
        (isSingleToken && FILE_EXT_RE.test(text) && /\./.test(text));
    if (looksLikePath) {
        return "path_like";
    }

    // A lone token that is not path-shaped is always treated as an identifier.
    if (isSingleToken) {
        return "identifier";
    }

    // Short phrases (2-3 tokens) count as identifier queries when any token
    // uses a code-style naming convention.
    const isCodeStyled = (token) =>
        CAMEL_RE.test(token) || SNAKE_RE.test(token) || SCREAMING_SNAKE_RE.test(token);
    const isShortCodePhrase = tokens.length <= 3 && tokens.some(isCodeStyled);

    return isShortCodePhrase ? "identifier" : "natural_language";
}
|
|
54
|
-
//# sourceMappingURL=query-classifier.js.map
|
|
@@ -1 +0,0 @@
|
|
|
1
|
-
{"version":3,"file":"query-classifier.js","sourceRoot":"","sources":["../../../src/core/search/query-classifier.ts"],"names":[],"mappings":"AAAA;;;;;;;GAOG;AAIH,0EAAwE;AACxE,MAAM,QAAQ,GAAG,YAAY,CAAC;AAE9B,8DAA4D;AAC5D,MAAM,QAAQ,GAAG,SAAS,CAAC;AAE3B,0FAAwF;AACxF,MAAM,kBAAkB,GAAG,iCAAiC,CAAC;AAE7D,uBAAuB;AACvB,MAAM,WAAW,GAAG,OAAO,CAAC;AAE5B,oDAAoD;AACpD,MAAM,cAAc,GAAG,gBAAgB,CAAC;AAExC,uFAAqF;AACrF,MAAM,WAAW,GAAG,YAAY,CAAC;AAEjC;;;;;GAKG;AACH,MAAM,UAAU,aAAa,CAAC,KAAa,EAAa;IACvD,MAAM,OAAO,GAAG,KAAK,CAAC,IAAI,EAAE,CAAC;IAC7B,IAAI,OAAO,CAAC,MAAM,KAAK,CAAC;QAAE,OAAO,kBAAkB,CAAC;IAEpD,oBAAoB;IACpB,IAAI,WAAW,CAAC,IAAI,CAAC,OAAO,CAAC;QAAE,OAAO,WAAW,CAAC;IAClD,IAAI,cAAc,CAAC,IAAI,CAAC,OAAO,CAAC;QAAE,OAAO,WAAW,CAAC;IACrD,0EAA0E;IAC1E,MAAM,KAAK,GAAG,OAAO,CAAC,KAAK,CAAC,KAAK,CAAC,CAAC;IACnC,IAAI,KAAK,CAAC,MAAM,KAAK,CAAC,IAAI,WAAW,CAAC,IAAI,CAAC,OAAO,CAAC,IAAI,IAAI,CAAC,IAAI,CAAC,OAAO,CAAC,EAAE,CAAC;QAC3E,OAAO,WAAW,CAAC;IACpB,CAAC;IAED,qBAAqB;IACrB,4DAA0D;IAC1D,IAAI,KAAK,CAAC,MAAM,KAAK,CAAC;QAAE,OAAO,YAAY,CAAC;IAC5C,IAAI,KAAK,CAAC,MAAM,IAAI,CAAC,EAAE,CAAC;QACvB,mEAAmE;QACnE,IAAI,KAAK,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,QAAQ,CAAC,IAAI,CAAC,CAAC,CAAC,IAAI,QAAQ,CAAC,IAAI,CAAC,CAAC,CAAC,IAAI,kBAAkB,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC,EAAE,CAAC;YAC3F,OAAO,YAAY,CAAC;QACrB,CAAC;IACF,CAAC;IAED,2BAA2B;IAC3B,OAAO,kBAAkB,CAAC;AAAA,CAC1B","sourcesContent":["/**\n * Classify search queries into types for POEM column weighting.\n *\n * Query types affect how metric columns are duplicated during ranking:\n * - identifier: emphasise BM25 and symbol-match scores\n * - path_like: emphasise path-match scores\n * - natural_language: emphasise cosine similarity scores\n */\n\nexport type QueryType = \"identifier\" | \"natural_language\" | \"path_like\";\n\n/** Matches camelCase or PascalCase boundaries (lowercase→uppercase). */\nconst CAMEL_RE = /[a-z][A-Z]/;\n\n/** Matches snake_case — word chars around an underscore. 
*/\nconst SNAKE_RE = /\\w+_\\w+/;\n\n/** SCREAMING_SNAKE_CASE — two or more uppercase-letter groups joined by underscores. */\nconst SCREAMING_SNAKE_RE = /^[A-Z][A-Z0-9]*(?:_[A-Z0-9]+)+$/;\n\n/** Path separators. */\nconst PATH_SEP_RE = /[/\\\\]/;\n\n/** Dotted path like `foo.bar.baz` (3+ segments). */\nconst DOTTED_PATH_RE = /^\\w+\\.\\w+\\.\\w+/;\n\n/** File extension pattern — ends with `.ext` where ext is 1-5 alphanumeric chars. */\nconst FILE_EXT_RE = /\\.\\w{1,5}$/;\n\n/**\n * Classify a search query to guide POEM column weighting.\n *\n * @param query Raw user query string\n * @returns The detected query type\n */\nexport function classifyQuery(query: string): QueryType {\n\tconst trimmed = query.trim();\n\tif (trimmed.length === 0) return \"natural_language\";\n\n\t// --- path_like ---\n\tif (PATH_SEP_RE.test(trimmed)) return \"path_like\";\n\tif (DOTTED_PATH_RE.test(trimmed)) return \"path_like\";\n\t// File extension at end of a single token (e.g. \"config.yaml\", \"auth.ts\")\n\tconst words = trimmed.split(/\\s+/);\n\tif (words.length === 1 && FILE_EXT_RE.test(trimmed) && /\\./.test(trimmed)) {\n\t\treturn \"path_like\";\n\t}\n\n\t// --- identifier ---\n\t// Single token or short (≤3 words) with code-style naming\n\tif (words.length === 1) return \"identifier\";\n\tif (words.length <= 3) {\n\t\t// If any word looks like a code identifier, classify as identifier\n\t\tif (words.some((w) => CAMEL_RE.test(w) || SNAKE_RE.test(w) || SCREAMING_SNAKE_RE.test(w))) {\n\t\t\treturn \"identifier\";\n\t\t}\n\t}\n\n\t// --- natural_language ---\n\treturn \"natural_language\";\n}\n"]}
|
|
@@ -1,30 +0,0 @@
|
|
|
1
|
-
/**
|
|
2
|
-
* File scanner for the semantic search subsystem.
|
|
3
|
-
*
|
|
4
|
-
* Discovers project files for indexing by walking the directory tree,
|
|
5
|
-
* respecting .gitignore rules, and classifying files by type.
|
|
6
|
-
*/
|
|
7
|
-
import type { FileType } from "./types.js";
|
|
8
|
-
/** One indexable file found by the scanner. */
export interface ScannedFile {
    /**
     * Path relative to the project root, using `/` separators.
     * Files from a memory directory outside the project root are stored
     * with a `~memory/` prefix instead.
     */
    filePath: string;
    /** File type detected for this file. */
    fileType: FileType;
    /** Last-modified time, in milliseconds since the epoch. */
    mtime: number;
}
|
|
17
|
-
/**
 * Map a file path to its {@link FileType} using the file extension.
 * The extension is lower-cased before lookup, so matching is case-insensitive.
 *
 * @param filePath Path or bare file name to inspect
 * @returns The matching type, or `null` when the path has no extension or
 *          the extension is not recognized
 */
export declare function detectFileType(filePath: string): FileType | null;
|
|
22
|
-
/**
 * Collect every indexable file for a project.
 *
 * Recursively walks the tree under {@link projectRoot}, honouring
 * `.gitignore` rules. Files that are empty, larger than 1 MB, or have an
 * unrecognized extension are left out. When {@link projectRoot} is the
 * user's home directory, only its top-level non-dot files are scanned.
 *
 * @param projectRoot Directory to scan
 * @param globalMemoryDir Optional global memory directory; its files are
 *        added to the result when the directory exists
 * @returns Promise resolving to the discovered files
 */
export declare function scanProject(projectRoot: string, globalMemoryDir?: string): Promise<ScannedFile[]>;
|
|
30
|
-
//# sourceMappingURL=scanner.d.ts.map
|
|
@@ -1 +0,0 @@
|
|
|
1
|
-
{"version":3,"file":"scanner.d.ts","sourceRoot":"","sources":["../../../src/core/search/scanner.ts"],"names":[],"mappings":"AAAA;;;;;GAKG;AAOH,OAAO,KAAK,EAAE,QAAQ,EAAE,MAAM,YAAY,CAAC;AAM3C,4DAA4D;AAC5D,MAAM,WAAW,WAAW;IAC3B,4DAA4D;IAC5D,QAAQ,EAAE,MAAM,CAAC;IACjB,0BAA0B;IAC1B,QAAQ,EAAE,QAAQ,CAAC;IACnB,0DAA0D;IAC1D,KAAK,EAAE,MAAM,CAAC;CACd;AAiED;;;GAGG;AACH,wBAAgB,cAAc,CAAC,QAAQ,EAAE,MAAM,GAAG,QAAQ,GAAG,IAAI,CAIhE;AAED;;;;;;GAMG;AACH,wBAAsB,WAAW,CAAC,WAAW,EAAE,MAAM,EAAE,eAAe,CAAC,EAAE,MAAM,GAAG,OAAO,CAAC,WAAW,EAAE,CAAC,CAgCvG","sourcesContent":["/**\n * File scanner for the semantic search subsystem.\n *\n * Discovers project files for indexing by walking the directory tree,\n * respecting .gitignore rules, and classifying files by type.\n */\n\nimport { existsSync, readdirSync, readFileSync, type Stats, statSync } from \"node:fs\";\nimport { homedir } from \"node:os\";\nimport { extname, isAbsolute, join, relative, sep } from \"node:path\";\nimport ignore from \"ignore\";\nimport { getDrebToolVisibleDirs } from \"../tools/dreb-paths.js\";\nimport type { FileType } from \"./types.js\";\n\n// ============================================================================\n// Public types\n// ============================================================================\n\n/** A file discovered by the scanner, ready for indexing. */\nexport interface ScannedFile {\n\t/** Path relative to the project root (posix separators). */\n\tfilePath: string;\n\t/** Detected file type. */\n\tfileType: FileType;\n\t/** File modification time in milliseconds since epoch. */\n\tmtime: number;\n}\n\n// ============================================================================\n// Constants\n// ============================================================================\n\n/** Maximum file size to index (1 MB). */\nconst MAX_FILE_SIZE = 1024 * 1024;\n\n/** Directories unconditionally skipped during traversal. 
*/\nconst SKIP_DIRS = new Set([\n\t\"node_modules\",\n\t\".git\",\n\t\".dreb/index\",\n\t\".hg\",\n\t\".svn\",\n\t\"__pycache__\",\n\t\".tox\",\n\t\".venv\",\n\t\"dist\",\n\t\"build\",\n\t\".next\",\n\t\".nuxt\",\n\t\"coverage\",\n\t\".cache\",\n]);\n\n/** Extension → FileType mapping. */\nconst EXTENSION_MAP: ReadonlyMap<string, FileType> = new Map<string, FileType>([\n\t// Tree-sitter languages\n\t[\".ts\", \"typescript\"],\n\t[\".tsx\", \"tsx\"],\n\t[\".js\", \"javascript\"],\n\t[\".mjs\", \"javascript\"],\n\t[\".cjs\", \"javascript\"],\n\t[\".py\", \"python\"],\n\t[\".go\", \"go\"],\n\t[\".rs\", \"rust\"],\n\t[\".java\", \"java\"],\n\t[\".c\", \"c\"],\n\t[\".h\", \"c\"],\n\t[\".cpp\", \"cpp\"],\n\t[\".hpp\", \"cpp\"],\n\t[\".cc\", \"cpp\"],\n\t[\".cxx\", \"cpp\"],\n\t[\".hh\", \"cpp\"],\n\t[\".hxx\", \"cpp\"],\n\t// Text file types\n\t[\".md\", \"markdown\"],\n\t[\".mdx\", \"markdown\"],\n\t[\".yml\", \"yaml\"],\n\t[\".yaml\", \"yaml\"],\n\t[\".json\", \"json\"],\n\t[\".toml\", \"toml\"],\n\t[\".txt\", \"plaintext\"],\n\t[\".cfg\", \"plaintext\"],\n\t[\".ini\", \"plaintext\"],\n\t[\".env\", \"plaintext\"],\n\t[\".conf\", \"plaintext\"],\n]);\n\n// ============================================================================\n// Public API\n// ============================================================================\n\n/**\n * Detect the {@link FileType} for a file path based on its extension.\n * Returns `null` for unrecognized extensions or files without an extension.\n */\nexport function detectFileType(filePath: string): FileType | null {\n\tconst ext = extname(filePath).toLowerCase();\n\tif (!ext) return null;\n\treturn EXTENSION_MAP.get(ext) ?? 
null;\n}\n\n/**\n * Scan a project directory and return all indexable files.\n *\n * Walks the tree rooted at {@link projectRoot}, respects `.gitignore` rules,\n * skips binary / oversized files, and optionally includes memory files from\n * a global memory directory.\n */\nexport async function scanProject(projectRoot: string, globalMemoryDir?: string): Promise<ScannedFile[]> {\n\tconst results: ScannedFile[] = [];\n\n\t// Detect if projectRoot is the home directory — use shallow scan mode\n\t// to avoid recursing into the entire home dir (which would be catastrophic).\n\tconst isHomeDir = isHomeDirPath(projectRoot);\n\n\tif (isHomeDir) {\n\t\t// Shallow mode: only scan top-level files and ~/.dreb/memory/\n\t\tscanShallow(projectRoot, results);\n\t} else {\n\t\t// Normal mode: full recursive walk with .gitignore\n\t\tconst ig = ignore();\n\t\tloadGitignore(ig, projectRoot, projectRoot);\n\t\twalkDirectory(projectRoot, projectRoot, ig, results);\n\t}\n\n\t// Include tool-visible .dreb/ subdirs (bypasses gitignore).\n\t// In home dir mode, global memory is already handled separately below,\n\t// and we don't want to double-scan ~/.dreb/memory/.\n\tif (!isHomeDir) {\n\t\tfor (const dir of getDrebToolVisibleDirs(projectRoot)) {\n\t\t\tscanMemoryDir(dir, projectRoot, results);\n\t\t}\n\t}\n\n\t// Include global memory files if the directory exists\n\tif (globalMemoryDir && existsSync(globalMemoryDir)) {\n\t\tscanMemoryDir(globalMemoryDir, projectRoot, results);\n\t}\n\n\treturn results;\n}\n\n/** Check if a path is the user's home directory. 
*/\nfunction isHomeDirPath(dir: string): boolean {\n\ttry {\n\t\tconst home = homedir();\n\t\t// Normalize trailing slashes for comparison\n\t\tconst normalizedDir = dir.replace(/[/\\\\]+$/, \"\");\n\t\tconst normalizedHome = home.replace(/[/\\\\]+$/, \"\");\n\t\treturn normalizedDir === normalizedHome;\n\t} catch {\n\t\treturn false;\n\t}\n}\n\n/**\n * Shallow scan mode for home directory: only index top-level files\n * (no directory recursion) to avoid scanning the entire home directory.\n * Memory files are handled separately via scanMemoryDir.\n */\nfunction scanShallow(dir: string, results: ScannedFile[]): void {\n\tlet entries: string[];\n\ttry {\n\t\tentries = readdirSync(dir);\n\t} catch {\n\t\treturn;\n\t}\n\n\tfor (const entry of entries) {\n\t\t// Skip dotfiles/dotdirs in home dir (except specific ones we want)\n\t\tif (entry.startsWith(\".\")) continue;\n\n\t\tconst fullPath = join(dir, entry);\n\n\t\tlet stats: Stats;\n\t\ttry {\n\t\t\tstats = statSync(fullPath);\n\t\t} catch {\n\t\t\tcontinue;\n\t\t}\n\n\t\t// Only index files, not directories (shallow mode)\n\t\tif (!stats.isFile()) continue;\n\t\tif (stats.size > MAX_FILE_SIZE) continue;\n\t\tif (stats.size === 0) continue;\n\n\t\tconst fileType = detectFileType(entry);\n\t\tif (!fileType) continue;\n\n\t\tresults.push({\n\t\t\tfilePath: entry,\n\t\t\tfileType,\n\t\t\tmtime: stats.mtimeMs,\n\t\t});\n\t}\n}\n\n// ============================================================================\n// Internal helpers\n// ============================================================================\n\ntype IgnoreMatcher = ReturnType<typeof ignore>;\n\n/** Convert an OS path to posix separators for ignore matching. */\nfunction toPosix(p: string): string {\n\treturn p.split(sep).join(\"/\");\n}\n\n/** Load .gitignore rules from a directory into the ignore matcher. 
*/\nfunction loadGitignore(ig: IgnoreMatcher, dir: string, root: string): void {\n\tconst gitignorePath = join(dir, \".gitignore\");\n\tif (!existsSync(gitignorePath)) return;\n\n\ttry {\n\t\tconst content = readFileSync(gitignorePath, \"utf-8\");\n\t\tconst relDir = relative(root, dir);\n\t\tconst prefix = relDir ? `${toPosix(relDir)}/` : \"\";\n\n\t\tconst patterns = content\n\t\t\t.split(/\\r?\\n/)\n\t\t\t.map((line) => prefixPattern(line, prefix))\n\t\t\t.filter((line): line is string => line !== null);\n\n\t\tif (patterns.length > 0) {\n\t\t\tig.add(patterns);\n\t\t}\n\t} catch {\n\t\t// Unreadable .gitignore — skip silently\n\t}\n}\n\n/**\n * Prefix a .gitignore pattern with a directory path so it applies\n * correctly when matching against root-relative paths.\n */\nfunction prefixPattern(line: string, prefix: string): string | null {\n\tconst trimmed = line.trim();\n\tif (!trimmed) return null;\n\tif (trimmed.startsWith(\"#\") && !trimmed.startsWith(\"\\\\#\")) return null;\n\n\tlet pattern = line;\n\tlet negated = false;\n\n\tif (pattern.startsWith(\"!\")) {\n\t\tnegated = true;\n\t\tpattern = pattern.slice(1);\n\t} else if (pattern.startsWith(\"\\\\!\")) {\n\t\tpattern = pattern.slice(1);\n\t}\n\n\tconst prefixed = prefix ? `${prefix}${pattern}` : pattern;\n\treturn negated ? `!${prefixed}` : prefixed;\n}\n\n/**\n * Check if a directory component (relative to root) should be unconditionally skipped.\n * Handles both top-level names (\"node_modules\") and nested paths (\".dreb/index\").\n */\nfunction shouldSkipDir(relPath: string): boolean {\n\tconst posix = toPosix(relPath);\n\n\t// Check the directory name itself\n\tconst parts = posix.split(\"/\");\n\tconst name = parts[parts.length - 1];\n\tif (SKIP_DIRS.has(name)) return true;\n\n\t// Check multi-segment skip patterns (e.g. 
\".dreb/index\")\n\tfor (const skip of SKIP_DIRS) {\n\t\tif (skip.includes(\"/\") && (posix === skip || posix.endsWith(`/${skip}`))) {\n\t\t\treturn true;\n\t\t}\n\t}\n\n\treturn false;\n}\n\n/** Recursively walk a directory, collecting indexable files. */\nfunction walkDirectory(dir: string, root: string, ig: IgnoreMatcher, results: ScannedFile[]): void {\n\tlet entries: string[];\n\ttry {\n\t\tentries = readdirSync(dir);\n\t} catch {\n\t\treturn; // Permission denied, etc.\n\t}\n\n\tfor (const entry of entries) {\n\t\tconst fullPath = join(dir, entry);\n\t\tconst relPath = relative(root, fullPath);\n\t\tconst posixRel = toPosix(relPath);\n\n\t\tlet stats: Stats;\n\t\ttry {\n\t\t\tstats = statSync(fullPath);\n\t\t} catch {\n\t\t\tcontinue; // Broken symlink, etc.\n\t\t}\n\n\t\tif (stats.isDirectory()) {\n\t\t\t// Hard-coded skip list\n\t\t\tif (shouldSkipDir(relPath)) continue;\n\n\t\t\t// .gitignore check (directories need trailing slash)\n\t\t\tif (ig.ignores(`${posixRel}/`)) continue;\n\n\t\t\t// Load nested .gitignore before descending\n\t\t\tloadGitignore(ig, fullPath, root);\n\n\t\t\twalkDirectory(fullPath, root, ig, results);\n\t\t\tcontinue;\n\t\t}\n\n\t\tif (!stats.isFile()) continue;\n\n\t\t// .gitignore check for files\n\t\tif (ig.ignores(posixRel)) continue;\n\n\t\t// Size gate\n\t\tif (stats.size > MAX_FILE_SIZE) continue;\n\t\tif (stats.size === 0) continue;\n\n\t\t// File type detection\n\t\tconst fileType = detectFileType(entry);\n\t\tif (!fileType) continue;\n\n\t\tresults.push({\n\t\t\tfilePath: posixRel,\n\t\t\tfileType,\n\t\t\tmtime: stats.mtimeMs,\n\t\t});\n\t}\n}\n\n/**\n * Scan a memory directory (project or global) for indexable files.\n *\n * Memory directories are always fully included — no .gitignore filtering —\n * because they live outside the normal project tree or in `.dreb/` which\n * is typically gitignored.\n *\n * Paths for global memory files are stored with a `~memory/` prefix\n * to distinguish them from project files.\n 
*/\nfunction scanMemoryDir(memoryDir: string, projectRoot: string, results: ScannedFile[], baseMemoryDir?: string): void {\n\tlet entries: string[];\n\ttry {\n\t\tentries = readdirSync(memoryDir);\n\t} catch {\n\t\treturn;\n\t}\n\n\tfor (const entry of entries) {\n\t\tconst fullPath = join(memoryDir, entry);\n\n\t\tlet stats: Stats;\n\t\ttry {\n\t\t\tstats = statSync(fullPath);\n\t\t} catch {\n\t\t\tcontinue;\n\t\t}\n\n\t\tif (stats.isDirectory()) {\n\t\t\t// Recurse into subdirectories\n\t\t\tscanMemoryDir(fullPath, projectRoot, results, baseMemoryDir ?? memoryDir);\n\t\t\tcontinue;\n\t\t}\n\n\t\tif (!stats.isFile()) continue;\n\t\tif (stats.size > MAX_FILE_SIZE) continue;\n\t\tif (stats.size === 0) continue;\n\n\t\tconst fileType = detectFileType(entry);\n\t\tif (!fileType) continue;\n\n\t\t// If the memory dir is inside the project root, use normal relative path.\n\t\t// Otherwise, use a ~memory/ prefix so paths remain unique and identifiable.\n\t\tconst rel = relative(projectRoot, fullPath);\n\t\tconst isOutsideProject = rel.startsWith(\"..\") || isAbsolute(rel);\n\t\tconst rootMemoryDir = baseMemoryDir ?? memoryDir;\n\t\tconst filePath = isOutsideProject ? `~memory/${relative(rootMemoryDir, fullPath)}` : rel;\n\n\t\tresults.push({\n\t\t\tfilePath: toPosix(filePath),\n\t\t\tfileType,\n\t\t\tmtime: stats.mtimeMs,\n\t\t});\n\t}\n}\n"]}
|