@comfanion/usethis_search 3.0.0-dev.0 → 3.0.0-dev.10

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -17,6 +17,7 @@ import { QueryCache, DEFAULT_CACHE_CONFIG } from "./query-cache.ts";
17
17
  import { SearchMetrics } from "./search-metrics.ts";
18
18
  import { GraphDB } from "./graph-db.ts";
19
19
  import { GraphBuilder } from "./graph-builder.ts";
20
+ import { UsageTracker } from "./usage-tracker.ts";
20
21
 
21
22
  // Suppress transformers.js logs unless DEBUG is set
22
23
  const DEBUG = process.env.DEBUG?.includes("vectorizer") || process.env.DEBUG === "*";
@@ -354,6 +355,8 @@ class CodebaseIndexer {
354
355
  this.metrics = null; // lazy-loaded SearchMetrics
355
356
  this.graphDB = null; // Graph DB for relationships
356
357
  this.graphBuilder = null; // Graph builder orchestrator
358
+ this._chunkCache = null; // Lazy Map<chunk_id, row> for findChunkById
359
+ this.usageTracker = null; // Usage tracking & provenance (v3)
357
360
  }
358
361
 
359
362
  async init() {
@@ -371,6 +374,10 @@ class CodebaseIndexer {
371
374
  this.graphDB = await new GraphDB(graphPath).init();
372
375
  this.graphBuilder = new GraphBuilder(this.graphDB, this.root);
373
376
 
377
+ // Usage tracker — provenance & usage stats
378
+ this.usageTracker = new UsageTracker(this.cacheDir);
379
+ await this.usageTracker.load();
380
+
374
381
  return this;
375
382
  }
376
383
 
@@ -399,6 +406,18 @@ class CodebaseIndexer {
399
406
  }
400
407
  this._bm25Rows = null;
401
408
  this.metrics = null;
409
+ // Close graph DB to release LevelDB lock
410
+ if (this.graphDB) {
411
+ try { await this.graphDB.close(); } catch { /* best effort */ }
412
+ this.graphDB = null;
413
+ this.graphBuilder = null;
414
+ }
415
+ // Save & release usage tracker
416
+ if (this.usageTracker) {
417
+ try { await this.usageTracker.save(); } catch { /* best effort */ }
418
+ this.usageTracker = null;
419
+ }
420
+ this._chunkCache = null;
402
421
  clearQueryCache();
403
422
  if (global.gc) global.gc();
404
423
  }
@@ -521,6 +540,13 @@ class CodebaseIndexer {
521
540
  await this.graphDB.deleteByFile(relPath);
522
541
  await this.graphBuilder.buildEdges(relPath, content, chunksWithIds, fileMeta.file_type);
523
542
 
543
+ // FR-054: Store graph build timestamp + file hash as metadata triple
544
+ try {
545
+ await this.graphDB.setFileMeta(relPath, hash, Date.now());
546
+ } catch {
547
+ // non-fatal — metadata is advisory
548
+ }
549
+
524
550
  const data = [];
525
551
  for (let i = 0; i < chunksWithIds.length; i++) {
526
552
  const embedding = await this.embed(chunksWithIds[i].content);
@@ -577,7 +603,13 @@ class CodebaseIndexer {
577
603
  if (!tables.includes(tableName)) return null;
578
604
 
579
605
  const table = await this.db.openTable(tableName);
580
- const allRows = await table.search([0]).limit(100000).execute();
606
+ let allRows;
607
+ try {
608
+ allRows = await table.search([0]).limit(100000).execute();
609
+ } catch (e) {
610
+ if (DEBUG) console.log("[vectorizer] BM25 index build failed (corrupted table?):", e.message);
611
+ return null;
612
+ }
581
613
 
582
614
  if (allRows.length === 0) return null;
583
615
 
@@ -617,7 +649,14 @@ class CodebaseIndexer {
617
649
  (options.tags && options.tags.length > 0);
618
650
  const isHybrid = HYBRID_CONFIG.enabled || options.hybrid;
619
651
  const fetchLimit = (hasFilters || isHybrid) ? Math.max(limit * 3, 50) : limit;
620
- let results = await table.search(queryEmbedding).limit(fetchLimit).execute();
652
+ let results;
653
+ try {
654
+ results = await table.search(queryEmbedding).limit(fetchLimit).execute();
655
+ } catch (e) {
656
+ // LanceDB schema error (e.g. missing vector column) — index is corrupted
657
+ if (DEBUG) console.log("[vectorizer] Vector search failed (corrupted index?):", e.message);
658
+ return [];
659
+ }
621
660
 
622
661
  // ── Hybrid search ───────────────────────────────────────────────────────
623
662
  if (HYBRID_CONFIG.enabled || options.hybrid) {
@@ -769,20 +808,59 @@ class CodebaseIndexer {
769
808
 
770
809
  neighbors.sort((a, b) => b.score - a.score);
771
810
  result.relatedContext = neighbors.slice(0, 3);
811
+
812
+ // FR-060: Record provenance for each attached chunk
813
+ if (this.usageTracker) {
814
+ for (const n of result.relatedContext) {
815
+ this.usageTracker.recordProvenance(query, result.chunk_id, n.chunk_id, n.relation);
816
+ }
817
+ }
772
818
  }
773
819
  }
774
820
 
821
+ // FR-061: Record usage counts for all returned chunks (main + attached)
822
+ if (this.usageTracker) {
823
+ const allChunkIds = [];
824
+ for (const r of finalResults) {
825
+ if (r.chunk_id) allChunkIds.push(r.chunk_id);
826
+ if (r.relatedContext) {
827
+ for (const rc of r.relatedContext) {
828
+ if (rc.chunk_id) allChunkIds.push(rc.chunk_id);
829
+ }
830
+ }
831
+ }
832
+ this.usageTracker.recordSearchResults(allChunkIds);
833
+ // Save asynchronously (non-blocking)
834
+ this.usageTracker.save().catch(() => {});
835
+ }
836
+
775
837
  return finalResults;
776
838
  }
777
839
 
778
840
  async findChunkById(chunkId) {
779
- const tableName = "chunks";
780
- const tables = await this.db.tableNames();
781
- if (!tables.includes(tableName)) return null;
841
+ // Lazy-build an in-memory Map keyed by chunk_id on first call.
842
+ // The cache lives until unloadModel() clears it.
843
+ if (!this._chunkCache) {
844
+ const tableName = "chunks";
845
+ const tables = await this.db.tableNames();
846
+ if (!tables.includes(tableName)) return null;
782
847
 
783
- const table = await this.db.openTable(tableName);
784
- const rows = await table.search([0]).limit(100000).execute();
785
- return rows.find(r => r.chunk_id === chunkId) || null;
848
+ const table = await this.db.openTable(tableName);
849
+ let rows;
850
+ try {
851
+ rows = await table.search([0]).limit(100000).execute();
852
+ } catch (e) {
853
+ if (DEBUG) console.log("[vectorizer] Chunk cache build failed (corrupted table?):", e.message);
854
+ return null;
855
+ }
856
+ this._chunkCache = new Map();
857
+ for (const row of rows) {
858
+ if (row.chunk_id) {
859
+ this._chunkCache.set(row.chunk_id, row);
860
+ }
861
+ }
862
+ }
863
+ return this._chunkCache.get(chunkId) || null;
786
864
  }
787
865
 
788
866
  cosineSimilarity(vecA, vecB) {
@@ -880,14 +958,17 @@ class CodebaseIndexer {
880
958
 
881
959
  let indexed = 0;
882
960
  let skipped = 0;
961
+ const total = files.length;
883
962
 
884
- for (const relPath of files) {
963
+ for (let i = 0; i < files.length; i++) {
964
+ const relPath = files[i];
885
965
  const filePath = path.join(this.root, relPath);
886
966
  try {
887
967
  const wasIndexed = await this.indexFile(filePath);
888
968
  if (wasIndexed) {
889
969
  indexed++;
890
- if (onProgress) onProgress(indexed, files.length, relPath);
970
+ // FR-053: progress indicator includes graph building phase
971
+ if (onProgress) onProgress(indexed, total, relPath, i + 1);
891
972
  } else {
892
973
  skipped++;
893
974
  }
@@ -896,7 +977,29 @@ class CodebaseIndexer {
896
977
  }
897
978
  }
898
979
 
899
- return { indexed, skipped, total: files.length };
980
+ // FR-005: Build semantic similarity edges as post-pass
981
+ // Only if we actually indexed new files and have a graph builder
982
+ let semanticEdges = 0;
983
+ if (indexed > 0 && this.graphBuilder && this.graphDB) {
984
+ try {
985
+ const tableName = "chunks";
986
+ const tables = await this.db.tableNames();
987
+ if (tables.includes(tableName)) {
988
+ const table = await this.db.openTable(tableName);
989
+ const allRows = await table.search([0]).limit(100000).execute();
990
+ const chunkData = allRows
991
+ .filter(r => r.chunk_id && r.vector)
992
+ .map(r => ({ chunk_id: r.chunk_id, vector: Array.from(r.vector), file: r.file }));
993
+ semanticEdges = await this.graphBuilder.buildSemanticEdges(chunkData, 0.8, 3);
994
+ if (DEBUG) console.log(`[vectorizer] Built ${semanticEdges} semantic similarity edges`);
995
+ }
996
+ } catch (e) {
997
+ if (DEBUG) console.log(`[vectorizer] Semantic edge building failed:`, e.message);
998
+ // non-fatal — explicit edges still work
999
+ }
1000
+ }
1001
+
1002
+ return { indexed, skipped, total, semanticEdges };
900
1003
  }
901
1004
 
902
1005
  async indexSingleFile(filePath) {
@@ -0,0 +1,204 @@
1
+ /**
2
+ * Usage Tracker — records provenance and usage statistics for chunks.
3
+ *
4
+ * FR-060: Record provenance for each attached chunk {query, main_chunk_id, attached_via_edge_type}
5
+ * FR-061: Increment usage_count when chunk appears in search results
6
+ * FR-062: API to query "where is chunk X used?" → list of referencing chunks
7
+ * FR-063: Use usage_count as additional ranking signal
8
+ *
9
+ * Storage: JSON file at .opencode/vectors/<index>/usage-stats.json
10
+ * Updated asynchronously (non-blocking to search).
11
+ */
12
+
13
+ import fs from "fs/promises"
14
+ import path from "path"
15
+
16
+ // ---------------------------------------------------------------------------
17
+ // Types
18
+ // ---------------------------------------------------------------------------
19
+
20
+ export interface ProvenanceRecord {
21
+ /** The search query that triggered this attachment */
22
+ query: string
23
+ /** The main result chunk that caused context attachment */
24
+ mainChunkId: string
25
+ /** The edge type that linked main → attached chunk */
26
+ edgeType: string
27
+ /** Timestamp */
28
+ timestamp: number
29
+ }
30
+
31
+ export interface ChunkUsageStats {
32
+ /** How many times this chunk appeared in search results (main or attached) */
33
+ usageCount: number
34
+ /** Last time this chunk was returned in a search result */
35
+ lastUsed: number
36
+ /** Recent provenance records (max 20 per chunk to limit storage) */
37
+ provenance: ProvenanceRecord[]
38
+ }
39
+
40
+ export interface UsageData {
41
+ /** Per-chunk usage statistics, keyed by chunk_id */
42
+ chunks: Record<string, ChunkUsageStats>
43
+ /** Global counters */
44
+ totalSearches: number
45
+ lastUpdated: number
46
+ }
47
+
48
+ const MAX_PROVENANCE_PER_CHUNK = 20
49
+
50
+ // ---------------------------------------------------------------------------
51
+ // UsageTracker
52
+ // ---------------------------------------------------------------------------
53
+
54
/**
 * Tracks how often chunks are returned by search and why attached chunks
 * were pulled in (provenance). State is kept in memory and persisted as
 * JSON at `<cacheDir>/usage-stats.json`.
 *
 * Lifecycle: construct → `await load()` → record/query → `save()`.
 * Until `load()` has run, every record/query method is a no-op that returns
 * an empty/zero value.
 *
 * Persistence is best-effort: `load()` and `save()` never reject.
 */
export class UsageTracker {
  private data: UsageData | null = null
  private dirty = false
  private savePath: string
  /** Highest `usageCount` of any tracked chunk, maintained incrementally so ranking never rescans. */
  private maxUsageCount = 0
  /** Tail of the save queue; every `save()` chains onto it so writes never overlap. */
  private pendingSave: Promise<void> = Promise.resolve()

  /**
   * @param cacheDir Directory that holds (or will hold) `usage-stats.json`.
   */
  constructor(private cacheDir: string) {
    this.savePath = path.join(cacheDir, "usage-stats.json")
  }

  // ---- lifecycle ----------------------------------------------------------

  /**
   * Read persisted stats from disk. A missing, unreadable, unparsable or
   * wrongly-shaped file results in fresh (or partially recovered) state
   * instead of an error.
   */
  async load(): Promise<void> {
    let parsed: unknown = null
    try {
      parsed = JSON.parse(await fs.readFile(this.savePath, "utf-8"))
    } catch {
      // Missing or corrupt file — start from empty state.
    }
    this.data = this.normalize(parsed)
  }

  /**
   * Persist the stats if anything changed since the last successful save.
   * Calls are serialized: the returned promise resolves once this call and
   * every earlier `save()` have finished. Never rejects.
   */
  async save(): Promise<void> {
    const run = this.pendingSave.then(() => this.writeIfDirty())
    this.pendingSave = run
    return run
  }

  // ---- FR-060: record provenance ------------------------------------------

  /**
   * Record that `attachedChunkId` was attached to `mainChunkId` as context
   * for `query`, via `edgeType` relation. Only the most recent
   * MAX_PROVENANCE_PER_CHUNK records are kept per attached chunk.
   */
  recordProvenance(
    query: string,
    mainChunkId: string,
    attachedChunkId: string,
    edgeType: string,
  ): void {
    // An empty/undefined id would be stored under a junk key such as "undefined".
    if (!this.data || !attachedChunkId) return
    const stats = this.ensureChunkStats(attachedChunkId)
    stats.provenance.push({
      query,
      mainChunkId,
      edgeType,
      timestamp: Date.now(),
    })
    // Cap provenance history
    if (stats.provenance.length > MAX_PROVENANCE_PER_CHUNK) {
      stats.provenance = stats.provenance.slice(-MAX_PROVENANCE_PER_CHUNK)
    }
    this.dirty = true
  }

  // ---- FR-061: increment usage_count --------------------------------------

  /**
   * Record that these chunk IDs appeared in search results.
   * Call once per search with all result chunk IDs (main + attached).
   * An id listed several times is counted once per occurrence.
   */
  recordSearchResults(chunkIds: string[]): void {
    if (!this.data) return
    this.data.totalSearches++
    const now = Date.now()
    for (const id of chunkIds) {
      if (!id) continue
      const stats = this.ensureChunkStats(id)
      stats.usageCount++
      stats.lastUsed = now
      if (stats.usageCount > this.maxUsageCount) this.maxUsageCount = stats.usageCount
    }
    this.dirty = true
  }

  // ---- FR-062: "where is chunk X used?" -----------------------------------

  /**
   * Get provenance info for a chunk: which queries led to it,
   * which main chunks it was attached to, via which edges.
   * Returns an empty array for unknown chunks.
   */
  getChunkProvenance(chunkId: string): ProvenanceRecord[] {
    if (!this.data) return []
    return this.data.chunks[chunkId]?.provenance ?? []
  }

  /**
   * Get usage stats for a chunk, or null if it was never recorded.
   */
  getChunkStats(chunkId: string): ChunkUsageStats | null {
    if (!this.data) return null
    return this.data.chunks[chunkId] ?? null
  }

  // ---- FR-063: usage_count as ranking signal ------------------------------

  /**
   * Get usage count for a chunk (0 if never seen).
   * Used as additional ranking signal in search.
   */
  getUsageCount(chunkId: string): number {
    if (!this.data) return 0
    return this.data.chunks[chunkId]?.usageCount ?? 0
  }

  /**
   * Get a usage boost factor for ranking (0.0 – 1.0).
   * Normalized: most-used chunk → 1.0, unused → 0.0.
   */
  getUsageBoost(chunkId: string): number {
    if (!this.data) return 0
    const stats = this.data.chunks[chunkId]
    if (!stats || stats.usageCount === 0) return 0
    // Counts only ever grow, so the running maximum equals a full scan.
    return stats.usageCount / Math.max(1, this.maxUsageCount)
  }

  // ---- summary ------------------------------------------------------------

  /**
   * Get global usage summary.
   */
  getSummary(): { totalSearches: number; trackedChunks: number; lastUpdated: number } {
    if (!this.data) return { totalSearches: 0, trackedChunks: 0, lastUpdated: 0 }
    return {
      totalSearches: this.data.totalSearches,
      trackedChunks: Object.keys(this.data.chunks).length,
      lastUpdated: this.data.lastUpdated,
    }
  }

  // ---- internals ----------------------------------------------------------

  /** Return the stats record for `chunkId`, creating an empty one on first use. */
  private ensureChunkStats(chunkId: string): ChunkUsageStats {
    const chunks = this.data!.chunks
    if (!chunks[chunkId]) {
      chunks[chunkId] = {
        usageCount: 0,
        lastUsed: 0,
        provenance: [],
      }
    }
    return chunks[chunkId]
  }

  /** One queued save step: write the current state to disk if it is dirty. */
  private async writeIfDirty(): Promise<void> {
    if (!this.dirty || !this.data) return
    this.data.lastUpdated = Date.now()
    // Clear the flag BEFORE awaiting: anything recorded while the write is in
    // flight sets it again and is picked up by the next save instead of
    // being marked clean by this one.
    this.dirty = false
    const tmpPath = `${this.savePath}.tmp`
    try {
      const payload = JSON.stringify(this.data, null, 2)
      await fs.mkdir(path.dirname(this.savePath), { recursive: true })
      // Write-then-rename so an interrupted write cannot leave truncated JSON
      // at the real path.
      await fs.writeFile(tmpPath, payload, "utf-8")
      await fs.rename(tmpPath, this.savePath)
    } catch {
      // non-fatal — stay dirty so a later save retries
      this.dirty = true
      await fs.unlink(tmpPath).catch(() => {})
    }
  }

  /**
   * Build well-formed UsageData from whatever JSON was on disk, dropping
   * anything that does not have the expected shape. Also recomputes
   * `maxUsageCount` from the accepted entries.
   */
  private normalize(raw: unknown): UsageData {
    // Null-prototype map: ids like "constructor" or "__proto__" are ordinary keys.
    const chunks: Record<string, ChunkUsageStats> = Object.create(null)
    const data: UsageData = { chunks, totalSearches: 0, lastUpdated: Date.now() }
    this.maxUsageCount = 0
    if (!UsageTracker.isPlainObject(raw)) return data

    data.totalSearches = UsageTracker.toCount(raw.totalSearches)
    const lastUpdated = UsageTracker.toCount(raw.lastUpdated)
    if (lastUpdated > 0) data.lastUpdated = lastUpdated
    if (!UsageTracker.isPlainObject(raw.chunks)) return data

    for (const [id, entry] of Object.entries(raw.chunks)) {
      if (!UsageTracker.isPlainObject(entry)) continue
      const usageCount = UsageTracker.toCount(entry.usageCount)
      const provenance: ProvenanceRecord[] = Array.isArray(entry.provenance)
        ? entry.provenance
            .filter((rec: unknown) => UsageTracker.isPlainObject(rec))
            .slice(-MAX_PROVENANCE_PER_CHUNK)
        : []
      chunks[id] = { usageCount, lastUsed: UsageTracker.toCount(entry.lastUsed), provenance }
      if (usageCount > this.maxUsageCount) this.maxUsageCount = usageCount
    }
    return data
  }

  /** True for non-null, non-array objects. */
  private static isPlainObject(value: unknown): value is Record<string, any> {
    return value !== null && typeof value === "object" && !Array.isArray(value)
  }

  /** Coerce a persisted counter/timestamp to a finite, non-negative number (0 if invalid). */
  private static toCount(value: unknown): number {
    return typeof value === "number" && Number.isFinite(value) && value > 0 ? value : 0
  }
}
@@ -1,54 +0,0 @@
1
- import { tool } from "@opencode-ai/plugin"
2
- import path from "path"
3
-
4
- import { CodebaseIndexer } from "../vectorizer/index.js"
5
-
6
- export default tool({
7
- description: `Read file with graph-aware context attachment. When available, this tool searches the file in the index and returns content + related context from the graph (imports, links, etc.).
8
-
9
- Use this instead of the standard Read tool for better context awareness.`,
10
-
11
- args: {
12
- filePath: tool.schema.string().describe("Path to the file to read"),
13
- },
14
-
15
- async execute(args) {
16
- const projectRoot = process.cwd()
17
- const filePath = path.isAbsolute(args.filePath) ? args.filePath : path.join(projectRoot, args.filePath)
18
-
19
- const relPath = path.relative(projectRoot, filePath)
20
-
21
- const indexer = await new CodebaseIndexer(projectRoot, "code").init()
22
- const results = await indexer.search(relPath, 20, false, {})
23
- const fileChunks = results.filter(r => r.file === relPath)
24
- await indexer.unloadModel()
25
-
26
- if (fileChunks.length === 0) {
27
- return `File "${relPath}" not indexed. Use original Read tool or run codeindex({ action: "reindex", index: "code" })`
28
- }
29
-
30
- let output = `## ${relPath}\n\n`
31
-
32
- output += `### Content\n\n`
33
- for (const chunk of fileChunks) {
34
- output += chunk.content + "\n\n"
35
- }
36
-
37
- const allRelated = fileChunks
38
- .flatMap(c => c.relatedContext || [])
39
- .filter((r, i, arr) => arr.findIndex(x => x.chunk_id === r.chunk_id) === i)
40
-
41
- if (allRelated.length > 0) {
42
- output += `### Related Context\n\n`
43
- for (const rel of allRelated) {
44
- const snippet = rel.content.length > 300
45
- ? rel.content.substring(0, 300) + "..."
46
- : rel.content
47
- output += `**${rel.file}** (${rel.relation})\n`
48
- output += `\`\`\`\n${snippet}\n\`\`\`\n\n`
49
- }
50
- }
51
-
52
- return output
53
- },
54
- })