@comfanion/usethis_search 3.0.0-dev.0 → 3.0.0-dev.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/file-indexer.ts +13 -0
- package/index.ts +5 -1
- package/package.json +3 -1
- package/tools/codeindex.ts +155 -6
- package/tools/read-interceptor.ts +78 -5
- package/vectorizer/analyzers/lsp-analyzer.ts +225 -94
- package/vectorizer/analyzers/lsp-client.ts +369 -0
- package/vectorizer/graph-builder.ts +106 -3
- package/vectorizer/graph-db.ts +192 -0
- package/vectorizer/index.js +93 -9
- package/vectorizer/usage-tracker.ts +204 -0
package/vectorizer/index.js
CHANGED
|
@@ -17,6 +17,7 @@ import { QueryCache, DEFAULT_CACHE_CONFIG } from "./query-cache.ts";
|
|
|
17
17
|
import { SearchMetrics } from "./search-metrics.ts";
|
|
18
18
|
import { GraphDB } from "./graph-db.ts";
|
|
19
19
|
import { GraphBuilder } from "./graph-builder.ts";
|
|
20
|
+
import { UsageTracker } from "./usage-tracker.ts";
|
|
20
21
|
|
|
21
22
|
// Suppress transformers.js logs unless DEBUG is set
|
|
22
23
|
const DEBUG = process.env.DEBUG?.includes("vectorizer") || process.env.DEBUG === "*";
|
|
@@ -354,6 +355,8 @@ class CodebaseIndexer {
|
|
|
354
355
|
this.metrics = null; // lazy-loaded SearchMetrics
|
|
355
356
|
this.graphDB = null; // Graph DB for relationships
|
|
356
357
|
this.graphBuilder = null; // Graph builder orchestrator
|
|
358
|
+
this._chunkCache = null; // Lazy Map<chunk_id, row> for findChunkById
|
|
359
|
+
this.usageTracker = null; // Usage tracking & provenance (v3)
|
|
357
360
|
}
|
|
358
361
|
|
|
359
362
|
async init() {
|
|
@@ -371,6 +374,10 @@ class CodebaseIndexer {
|
|
|
371
374
|
this.graphDB = await new GraphDB(graphPath).init();
|
|
372
375
|
this.graphBuilder = new GraphBuilder(this.graphDB, this.root);
|
|
373
376
|
|
|
377
|
+
// Usage tracker — provenance & usage stats
|
|
378
|
+
this.usageTracker = new UsageTracker(this.cacheDir);
|
|
379
|
+
await this.usageTracker.load();
|
|
380
|
+
|
|
374
381
|
return this;
|
|
375
382
|
}
|
|
376
383
|
|
|
@@ -399,6 +406,18 @@ class CodebaseIndexer {
|
|
|
399
406
|
}
|
|
400
407
|
this._bm25Rows = null;
|
|
401
408
|
this.metrics = null;
|
|
409
|
+
// Close graph DB to release LevelDB lock
|
|
410
|
+
if (this.graphDB) {
|
|
411
|
+
try { await this.graphDB.close(); } catch { /* best effort */ }
|
|
412
|
+
this.graphDB = null;
|
|
413
|
+
this.graphBuilder = null;
|
|
414
|
+
}
|
|
415
|
+
// Save & release usage tracker
|
|
416
|
+
if (this.usageTracker) {
|
|
417
|
+
try { await this.usageTracker.save(); } catch { /* best effort */ }
|
|
418
|
+
this.usageTracker = null;
|
|
419
|
+
}
|
|
420
|
+
this._chunkCache = null;
|
|
402
421
|
clearQueryCache();
|
|
403
422
|
if (global.gc) global.gc();
|
|
404
423
|
}
|
|
@@ -521,6 +540,13 @@ class CodebaseIndexer {
|
|
|
521
540
|
await this.graphDB.deleteByFile(relPath);
|
|
522
541
|
await this.graphBuilder.buildEdges(relPath, content, chunksWithIds, fileMeta.file_type);
|
|
523
542
|
|
|
543
|
+
// FR-054: Store graph build timestamp + file hash as metadata triple
|
|
544
|
+
try {
|
|
545
|
+
await this.graphDB.setFileMeta(relPath, hash, Date.now());
|
|
546
|
+
} catch {
|
|
547
|
+
// non-fatal — metadata is advisory
|
|
548
|
+
}
|
|
549
|
+
|
|
524
550
|
const data = [];
|
|
525
551
|
for (let i = 0; i < chunksWithIds.length; i++) {
|
|
526
552
|
const embedding = await this.embed(chunksWithIds[i].content);
|
|
@@ -769,20 +795,53 @@ class CodebaseIndexer {
|
|
|
769
795
|
|
|
770
796
|
neighbors.sort((a, b) => b.score - a.score);
|
|
771
797
|
result.relatedContext = neighbors.slice(0, 3);
|
|
798
|
+
|
|
799
|
+
// FR-060: Record provenance for each attached chunk
|
|
800
|
+
if (this.usageTracker) {
|
|
801
|
+
for (const n of result.relatedContext) {
|
|
802
|
+
this.usageTracker.recordProvenance(query, result.chunk_id, n.chunk_id, n.relation);
|
|
803
|
+
}
|
|
804
|
+
}
|
|
805
|
+
}
|
|
806
|
+
}
|
|
807
|
+
|
|
808
|
+
// FR-061: Record usage counts for all returned chunks (main + attached)
|
|
809
|
+
if (this.usageTracker) {
|
|
810
|
+
const allChunkIds = [];
|
|
811
|
+
for (const r of finalResults) {
|
|
812
|
+
if (r.chunk_id) allChunkIds.push(r.chunk_id);
|
|
813
|
+
if (r.relatedContext) {
|
|
814
|
+
for (const rc of r.relatedContext) {
|
|
815
|
+
if (rc.chunk_id) allChunkIds.push(rc.chunk_id);
|
|
816
|
+
}
|
|
817
|
+
}
|
|
772
818
|
}
|
|
819
|
+
this.usageTracker.recordSearchResults(allChunkIds);
|
|
820
|
+
// Save asynchronously (non-blocking)
|
|
821
|
+
this.usageTracker.save().catch(() => {});
|
|
773
822
|
}
|
|
774
823
|
|
|
775
824
|
return finalResults;
|
|
776
825
|
}
|
|
777
826
|
|
|
778
827
|
async findChunkById(chunkId) {
  // Look up a single chunk row by its chunk_id.
  // Lazy-build an in-memory Map keyed by chunk_id on first call.
  // The cache lives until unloadModel() clears it (this._chunkCache = null).
  if (!this._chunkCache) {
    const tableName = "chunks";
    const tables = await this.db.tableNames();
    // No chunks table yet (nothing indexed) — nothing to look up.
    // NOTE(review): the miss is not cached, so this path re-runs
    // tableNames() on every call until the table exists.
    if (!tables.includes(tableName)) return null;

    const table = await this.db.openTable(tableName);
    // Dummy one-element vector + large limit is used as a "dump all rows"
    // idiom elsewhere in this file too — assumes the vector store accepts a
    // dimension-mismatched query vector for a full scan; TODO confirm, and
    // note the 100000-row cap silently truncates larger indexes.
    const rows = await table.search([0]).limit(100000).execute();
    this._chunkCache = new Map();
    for (const row of rows) {
      // Skip rows without a chunk_id — they can never be looked up.
      if (row.chunk_id) {
        this._chunkCache.set(row.chunk_id, row);
      }
    }
  }
  return this._chunkCache.get(chunkId) || null;
}
|
|
787
846
|
|
|
788
847
|
cosineSimilarity(vecA, vecB) {
|
|
@@ -880,14 +939,17 @@ class CodebaseIndexer {
|
|
|
880
939
|
|
|
881
940
|
let indexed = 0;
|
|
882
941
|
let skipped = 0;
|
|
942
|
+
const total = files.length;
|
|
883
943
|
|
|
884
|
-
for (
|
|
944
|
+
for (let i = 0; i < files.length; i++) {
|
|
945
|
+
const relPath = files[i];
|
|
885
946
|
const filePath = path.join(this.root, relPath);
|
|
886
947
|
try {
|
|
887
948
|
const wasIndexed = await this.indexFile(filePath);
|
|
888
949
|
if (wasIndexed) {
|
|
889
950
|
indexed++;
|
|
890
|
-
|
|
951
|
+
// FR-053: progress indicator includes graph building phase
|
|
952
|
+
if (onProgress) onProgress(indexed, total, relPath, i + 1);
|
|
891
953
|
} else {
|
|
892
954
|
skipped++;
|
|
893
955
|
}
|
|
@@ -896,7 +958,29 @@ class CodebaseIndexer {
|
|
|
896
958
|
}
|
|
897
959
|
}
|
|
898
960
|
|
|
899
|
-
|
|
961
|
+
// FR-005: Build semantic similarity edges as post-pass
|
|
962
|
+
// Only if we actually indexed new files and have a graph builder
|
|
963
|
+
let semanticEdges = 0;
|
|
964
|
+
if (indexed > 0 && this.graphBuilder && this.graphDB) {
|
|
965
|
+
try {
|
|
966
|
+
const tableName = "chunks";
|
|
967
|
+
const tables = await this.db.tableNames();
|
|
968
|
+
if (tables.includes(tableName)) {
|
|
969
|
+
const table = await this.db.openTable(tableName);
|
|
970
|
+
const allRows = await table.search([0]).limit(100000).execute();
|
|
971
|
+
const chunkData = allRows
|
|
972
|
+
.filter(r => r.chunk_id && r.vector)
|
|
973
|
+
.map(r => ({ chunk_id: r.chunk_id, vector: Array.from(r.vector), file: r.file }));
|
|
974
|
+
semanticEdges = await this.graphBuilder.buildSemanticEdges(chunkData, 0.8, 3);
|
|
975
|
+
if (DEBUG) console.log(`[vectorizer] Built ${semanticEdges} semantic similarity edges`);
|
|
976
|
+
}
|
|
977
|
+
} catch (e) {
|
|
978
|
+
if (DEBUG) console.log(`[vectorizer] Semantic edge building failed:`, e.message);
|
|
979
|
+
// non-fatal — explicit edges still work
|
|
980
|
+
}
|
|
981
|
+
}
|
|
982
|
+
|
|
983
|
+
return { indexed, skipped, total, semanticEdges };
|
|
900
984
|
}
|
|
901
985
|
|
|
902
986
|
async indexSingleFile(filePath) {
|
|
@@ -0,0 +1,204 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Usage Tracker — records provenance and usage statistics for chunks.
|
|
3
|
+
*
|
|
4
|
+
* FR-060: Record provenance for each attached chunk {query, main_chunk_id, attached_via_edge_type}
|
|
5
|
+
* FR-061: Increment usage_count when chunk appears in search results
|
|
6
|
+
* FR-062: API to query "where is chunk X used?" → list of referencing chunks
|
|
7
|
+
* FR-063: Use usage_count as additional ranking signal
|
|
8
|
+
*
|
|
9
|
+
* Storage: JSON file at .opencode/vectors/<index>/usage-stats.json
|
|
10
|
+
* Updated asynchronously (non-blocking to search).
|
|
11
|
+
*/
|
|
12
|
+
|
|
13
|
+
import fs from "fs/promises"
|
|
14
|
+
import path from "path"
|
|
15
|
+
|
|
16
|
+
// ---------------------------------------------------------------------------
|
|
17
|
+
// Types
|
|
18
|
+
// ---------------------------------------------------------------------------
|
|
19
|
+
|
|
20
|
+
export interface ProvenanceRecord {
|
|
21
|
+
/** The search query that triggered this attachment */
|
|
22
|
+
query: string
|
|
23
|
+
/** The main result chunk that caused context attachment */
|
|
24
|
+
mainChunkId: string
|
|
25
|
+
/** The edge type that linked main → attached chunk */
|
|
26
|
+
edgeType: string
|
|
27
|
+
/** Timestamp */
|
|
28
|
+
timestamp: number
|
|
29
|
+
}
|
|
30
|
+
|
|
31
|
+
export interface ChunkUsageStats {
|
|
32
|
+
/** How many times this chunk appeared in search results (main or attached) */
|
|
33
|
+
usageCount: number
|
|
34
|
+
/** Last time this chunk was returned in a search result */
|
|
35
|
+
lastUsed: number
|
|
36
|
+
/** Recent provenance records (max 20 per chunk to limit storage) */
|
|
37
|
+
provenance: ProvenanceRecord[]
|
|
38
|
+
}
|
|
39
|
+
|
|
40
|
+
export interface UsageData {
|
|
41
|
+
/** Per-chunk usage statistics, keyed by chunk_id */
|
|
42
|
+
chunks: Record<string, ChunkUsageStats>
|
|
43
|
+
/** Global counters */
|
|
44
|
+
totalSearches: number
|
|
45
|
+
lastUpdated: number
|
|
46
|
+
}
|
|
47
|
+
|
|
48
|
+
const MAX_PROVENANCE_PER_CHUNK = 20
|
|
49
|
+
|
|
50
|
+
// ---------------------------------------------------------------------------
|
|
51
|
+
// UsageTracker
|
|
52
|
+
// ---------------------------------------------------------------------------
|
|
53
|
+
|
|
54
|
+
export class UsageTracker {
|
|
55
|
+
private data: UsageData | null = null
|
|
56
|
+
private dirty = false
|
|
57
|
+
private savePath: string
|
|
58
|
+
|
|
59
|
+
constructor(private cacheDir: string) {
|
|
60
|
+
this.savePath = path.join(cacheDir, "usage-stats.json")
|
|
61
|
+
}
|
|
62
|
+
|
|
63
|
+
// ---- lifecycle ----------------------------------------------------------
|
|
64
|
+
|
|
65
|
+
async load(): Promise<void> {
|
|
66
|
+
try {
|
|
67
|
+
const raw = await fs.readFile(this.savePath, "utf-8")
|
|
68
|
+
this.data = JSON.parse(raw)
|
|
69
|
+
} catch {
|
|
70
|
+
this.data = { chunks: {}, totalSearches: 0, lastUpdated: Date.now() }
|
|
71
|
+
}
|
|
72
|
+
}
|
|
73
|
+
|
|
74
|
+
async save(): Promise<void> {
|
|
75
|
+
if (!this.dirty || !this.data) return
|
|
76
|
+
this.data.lastUpdated = Date.now()
|
|
77
|
+
try {
|
|
78
|
+
await fs.mkdir(path.dirname(this.savePath), { recursive: true })
|
|
79
|
+
await fs.writeFile(this.savePath, JSON.stringify(this.data, null, 2), "utf-8")
|
|
80
|
+
this.dirty = false
|
|
81
|
+
} catch {
|
|
82
|
+
// non-fatal
|
|
83
|
+
}
|
|
84
|
+
}
|
|
85
|
+
|
|
86
|
+
// ---- FR-060: record provenance ------------------------------------------
|
|
87
|
+
|
|
88
|
+
/**
|
|
89
|
+
* Record that `attachedChunkId` was attached to `mainChunkId` as context
|
|
90
|
+
* for `query`, via `edgeType` relation.
|
|
91
|
+
*/
|
|
92
|
+
recordProvenance(
|
|
93
|
+
query: string,
|
|
94
|
+
mainChunkId: string,
|
|
95
|
+
attachedChunkId: string,
|
|
96
|
+
edgeType: string,
|
|
97
|
+
): void {
|
|
98
|
+
if (!this.data) return
|
|
99
|
+
const stats = this.ensureChunkStats(attachedChunkId)
|
|
100
|
+
stats.provenance.push({
|
|
101
|
+
query,
|
|
102
|
+
mainChunkId,
|
|
103
|
+
edgeType,
|
|
104
|
+
timestamp: Date.now(),
|
|
105
|
+
})
|
|
106
|
+
// Cap provenance history
|
|
107
|
+
if (stats.provenance.length > MAX_PROVENANCE_PER_CHUNK) {
|
|
108
|
+
stats.provenance = stats.provenance.slice(-MAX_PROVENANCE_PER_CHUNK)
|
|
109
|
+
}
|
|
110
|
+
this.dirty = true
|
|
111
|
+
}
|
|
112
|
+
|
|
113
|
+
// ---- FR-061: increment usage_count --------------------------------------
|
|
114
|
+
|
|
115
|
+
/**
|
|
116
|
+
* Record that these chunk IDs appeared in search results.
|
|
117
|
+
* Call once per search with all result chunk IDs (main + attached).
|
|
118
|
+
*/
|
|
119
|
+
recordSearchResults(chunkIds: string[]): void {
|
|
120
|
+
if (!this.data) return
|
|
121
|
+
this.data.totalSearches++
|
|
122
|
+
const now = Date.now()
|
|
123
|
+
for (const id of chunkIds) {
|
|
124
|
+
const stats = this.ensureChunkStats(id)
|
|
125
|
+
stats.usageCount++
|
|
126
|
+
stats.lastUsed = now
|
|
127
|
+
}
|
|
128
|
+
this.dirty = true
|
|
129
|
+
}
|
|
130
|
+
|
|
131
|
+
// ---- FR-062: "where is chunk X used?" -----------------------------------
|
|
132
|
+
|
|
133
|
+
/**
|
|
134
|
+
* Get provenance info for a chunk: which queries led to it,
|
|
135
|
+
* which main chunks it was attached to, via which edges.
|
|
136
|
+
*/
|
|
137
|
+
getChunkProvenance(chunkId: string): ProvenanceRecord[] {
|
|
138
|
+
if (!this.data) return []
|
|
139
|
+
return this.data.chunks[chunkId]?.provenance ?? []
|
|
140
|
+
}
|
|
141
|
+
|
|
142
|
+
/**
|
|
143
|
+
* Get usage stats for a chunk.
|
|
144
|
+
*/
|
|
145
|
+
getChunkStats(chunkId: string): ChunkUsageStats | null {
|
|
146
|
+
if (!this.data) return null
|
|
147
|
+
return this.data.chunks[chunkId] ?? null
|
|
148
|
+
}
|
|
149
|
+
|
|
150
|
+
// ---- FR-063: usage_count as ranking signal ------------------------------
|
|
151
|
+
|
|
152
|
+
/**
|
|
153
|
+
* Get usage count for a chunk (0 if never seen).
|
|
154
|
+
* Used as additional ranking signal in search.
|
|
155
|
+
*/
|
|
156
|
+
getUsageCount(chunkId: string): number {
|
|
157
|
+
if (!this.data) return 0
|
|
158
|
+
return this.data.chunks[chunkId]?.usageCount ?? 0
|
|
159
|
+
}
|
|
160
|
+
|
|
161
|
+
/**
|
|
162
|
+
* Get a usage boost factor for ranking (0.0 – 1.0).
|
|
163
|
+
* Normalized: most-used chunk → 1.0, unused → 0.0.
|
|
164
|
+
*/
|
|
165
|
+
getUsageBoost(chunkId: string): number {
|
|
166
|
+
if (!this.data) return 0
|
|
167
|
+
const stats = this.data.chunks[chunkId]
|
|
168
|
+
if (!stats || stats.usageCount === 0) return 0
|
|
169
|
+
|
|
170
|
+
// Find max usage count across all chunks for normalization
|
|
171
|
+
let maxUsage = 1
|
|
172
|
+
for (const s of Object.values(this.data.chunks)) {
|
|
173
|
+
if (s.usageCount > maxUsage) maxUsage = s.usageCount
|
|
174
|
+
}
|
|
175
|
+
return stats.usageCount / maxUsage
|
|
176
|
+
}
|
|
177
|
+
|
|
178
|
+
// ---- summary ------------------------------------------------------------
|
|
179
|
+
|
|
180
|
+
/**
|
|
181
|
+
* Get global usage summary.
|
|
182
|
+
*/
|
|
183
|
+
getSummary(): { totalSearches: number; trackedChunks: number; lastUpdated: number } {
|
|
184
|
+
if (!this.data) return { totalSearches: 0, trackedChunks: 0, lastUpdated: 0 }
|
|
185
|
+
return {
|
|
186
|
+
totalSearches: this.data.totalSearches,
|
|
187
|
+
trackedChunks: Object.keys(this.data.chunks).length,
|
|
188
|
+
lastUpdated: this.data.lastUpdated,
|
|
189
|
+
}
|
|
190
|
+
}
|
|
191
|
+
|
|
192
|
+
// ---- internals ----------------------------------------------------------
|
|
193
|
+
|
|
194
|
+
private ensureChunkStats(chunkId: string): ChunkUsageStats {
|
|
195
|
+
if (!this.data!.chunks[chunkId]) {
|
|
196
|
+
this.data!.chunks[chunkId] = {
|
|
197
|
+
usageCount: 0,
|
|
198
|
+
lastUsed: 0,
|
|
199
|
+
provenance: [],
|
|
200
|
+
}
|
|
201
|
+
}
|
|
202
|
+
return this.data!.chunks[chunkId]
|
|
203
|
+
}
|
|
204
|
+
}
|