@comfanion/usethis_search 3.0.0-dev.0 → 3.0.0-dev.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -17,6 +17,7 @@ import { QueryCache, DEFAULT_CACHE_CONFIG } from "./query-cache.ts";
17
17
  import { SearchMetrics } from "./search-metrics.ts";
18
18
  import { GraphDB } from "./graph-db.ts";
19
19
  import { GraphBuilder } from "./graph-builder.ts";
20
+ import { UsageTracker } from "./usage-tracker.ts";
20
21
 
21
22
  // Suppress transformers.js logs unless DEBUG is set
22
23
  const DEBUG = process.env.DEBUG?.includes("vectorizer") || process.env.DEBUG === "*";
@@ -354,6 +355,8 @@ class CodebaseIndexer {
354
355
  this.metrics = null; // lazy-loaded SearchMetrics
355
356
  this.graphDB = null; // Graph DB for relationships
356
357
  this.graphBuilder = null; // Graph builder orchestrator
358
+ this._chunkCache = null; // Lazy Map<chunk_id, row> for findChunkById
359
+ this.usageTracker = null; // Usage tracking & provenance (v3)
357
360
  }
358
361
 
359
362
  async init() {
@@ -371,6 +374,10 @@ class CodebaseIndexer {
371
374
  this.graphDB = await new GraphDB(graphPath).init();
372
375
  this.graphBuilder = new GraphBuilder(this.graphDB, this.root);
373
376
 
377
+ // Usage tracker — provenance & usage stats
378
+ this.usageTracker = new UsageTracker(this.cacheDir);
379
+ await this.usageTracker.load();
380
+
374
381
  return this;
375
382
  }
376
383
 
@@ -399,6 +406,18 @@ class CodebaseIndexer {
399
406
  }
400
407
  this._bm25Rows = null;
401
408
  this.metrics = null;
409
+ // Close graph DB to release LevelDB lock
410
+ if (this.graphDB) {
411
+ try { await this.graphDB.close(); } catch { /* best effort */ }
412
+ this.graphDB = null;
413
+ this.graphBuilder = null;
414
+ }
415
+ // Save & release usage tracker
416
+ if (this.usageTracker) {
417
+ try { await this.usageTracker.save(); } catch { /* best effort */ }
418
+ this.usageTracker = null;
419
+ }
420
+ this._chunkCache = null;
402
421
  clearQueryCache();
403
422
  if (global.gc) global.gc();
404
423
  }
@@ -521,6 +540,13 @@ class CodebaseIndexer {
521
540
  await this.graphDB.deleteByFile(relPath);
522
541
  await this.graphBuilder.buildEdges(relPath, content, chunksWithIds, fileMeta.file_type);
523
542
 
543
+ // FR-054: Store graph build timestamp + file hash as metadata triple
544
+ try {
545
+ await this.graphDB.setFileMeta(relPath, hash, Date.now());
546
+ } catch {
547
+ // non-fatal — metadata is advisory
548
+ }
549
+
524
550
  const data = [];
525
551
  for (let i = 0; i < chunksWithIds.length; i++) {
526
552
  const embedding = await this.embed(chunksWithIds[i].content);
@@ -769,20 +795,53 @@ class CodebaseIndexer {
769
795
 
770
796
  neighbors.sort((a, b) => b.score - a.score);
771
797
  result.relatedContext = neighbors.slice(0, 3);
798
+
799
+ // FR-060: Record provenance for each attached chunk
800
+ if (this.usageTracker) {
801
+ for (const n of result.relatedContext) {
802
+ this.usageTracker.recordProvenance(query, result.chunk_id, n.chunk_id, n.relation);
803
+ }
804
+ }
805
+ }
806
+ }
807
+
808
+ // FR-061: Record usage counts for all returned chunks (main + attached)
809
+ if (this.usageTracker) {
810
+ const allChunkIds = [];
811
+ for (const r of finalResults) {
812
+ if (r.chunk_id) allChunkIds.push(r.chunk_id);
813
+ if (r.relatedContext) {
814
+ for (const rc of r.relatedContext) {
815
+ if (rc.chunk_id) allChunkIds.push(rc.chunk_id);
816
+ }
817
+ }
772
818
  }
819
+ this.usageTracker.recordSearchResults(allChunkIds);
820
+ // Save asynchronously (non-blocking)
821
+ this.usageTracker.save().catch(() => {});
773
822
  }
774
823
 
775
824
  return finalResults;
776
825
  }
777
826
 
778
827
  async findChunkById(chunkId) {
779
- const tableName = "chunks";
780
- const tables = await this.db.tableNames();
781
- if (!tables.includes(tableName)) return null;
828
+ // Lazy-build an in-memory Map keyed by chunk_id on first call.
829
+ // The cache lives until unloadModel() clears it.
830
+ if (!this._chunkCache) {
831
+ const tableName = "chunks";
832
+ const tables = await this.db.tableNames();
833
+ if (!tables.includes(tableName)) return null;
782
834
 
783
- const table = await this.db.openTable(tableName);
784
- const rows = await table.search([0]).limit(100000).execute();
785
- return rows.find(r => r.chunk_id === chunkId) || null;
835
+ const table = await this.db.openTable(tableName);
836
+ const rows = await table.search([0]).limit(100000).execute();
837
+ this._chunkCache = new Map();
838
+ for (const row of rows) {
839
+ if (row.chunk_id) {
840
+ this._chunkCache.set(row.chunk_id, row);
841
+ }
842
+ }
843
+ }
844
+ return this._chunkCache.get(chunkId) || null;
786
845
  }
787
846
 
788
847
  cosineSimilarity(vecA, vecB) {
@@ -880,14 +939,17 @@ class CodebaseIndexer {
880
939
 
881
940
  let indexed = 0;
882
941
  let skipped = 0;
942
+ const total = files.length;
883
943
 
884
- for (const relPath of files) {
944
+ for (let i = 0; i < files.length; i++) {
945
+ const relPath = files[i];
885
946
  const filePath = path.join(this.root, relPath);
886
947
  try {
887
948
  const wasIndexed = await this.indexFile(filePath);
888
949
  if (wasIndexed) {
889
950
  indexed++;
890
- if (onProgress) onProgress(indexed, files.length, relPath);
951
+ // FR-053: progress indicator includes graph building phase
952
+ if (onProgress) onProgress(indexed, total, relPath, i + 1);
891
953
  } else {
892
954
  skipped++;
893
955
  }
@@ -896,7 +958,29 @@ class CodebaseIndexer {
896
958
  }
897
959
  }
898
960
 
899
- return { indexed, skipped, total: files.length };
961
+ // FR-005: Build semantic similarity edges as post-pass
962
+ // Only if we actually indexed new files and have a graph builder
963
+ let semanticEdges = 0;
964
+ if (indexed > 0 && this.graphBuilder && this.graphDB) {
965
+ try {
966
+ const tableName = "chunks";
967
+ const tables = await this.db.tableNames();
968
+ if (tables.includes(tableName)) {
969
+ const table = await this.db.openTable(tableName);
970
+ const allRows = await table.search([0]).limit(100000).execute();
971
+ const chunkData = allRows
972
+ .filter(r => r.chunk_id && r.vector)
973
+ .map(r => ({ chunk_id: r.chunk_id, vector: Array.from(r.vector), file: r.file }));
974
+ semanticEdges = await this.graphBuilder.buildSemanticEdges(chunkData, 0.8, 3);
975
+ if (DEBUG) console.log(`[vectorizer] Built ${semanticEdges} semantic similarity edges`);
976
+ }
977
+ } catch (e) {
978
+ if (DEBUG) console.log(`[vectorizer] Semantic edge building failed:`, e.message);
979
+ // non-fatal — explicit edges still work
980
+ }
981
+ }
982
+
983
+ return { indexed, skipped, total, semanticEdges };
900
984
  }
901
985
 
902
986
  async indexSingleFile(filePath) {
@@ -0,0 +1,204 @@
1
+ /**
2
+ * Usage Tracker — records provenance and usage statistics for chunks.
3
+ *
4
+ * FR-060: Record provenance for each attached chunk {query, main_chunk_id, attached_via_edge_type}
5
+ * FR-061: Increment usage_count when chunk appears in search results
6
+ * FR-062: API to query "where is chunk X used?" → list of referencing chunks
7
+ * FR-063: Use usage_count as additional ranking signal
8
+ *
9
+ * Storage: JSON file at .opencode/vectors/<index>/usage-stats.json
10
+ * Updated asynchronously (non-blocking to search).
11
+ */
12
+
13
+ import fs from "fs/promises"
14
+ import path from "path"
15
+
16
+ // ---------------------------------------------------------------------------
17
+ // Types
18
+ // ---------------------------------------------------------------------------
19
+
20
/** One record of a chunk being attached as related context to a main search result. */
export interface ProvenanceRecord {
  /** The search query that triggered this attachment */
  query: string
  /** The main result chunk that caused context attachment */
  mainChunkId: string
  /** The edge type that linked main → attached chunk */
  edgeType: string
  /** Timestamp (ms since epoch) at which the attachment was recorded */
  timestamp: number
}

/** Usage statistics kept for a single chunk. */
export interface ChunkUsageStats {
  /** How many times this chunk appeared in search results (main or attached) */
  usageCount: number
  /** Last time this chunk was returned in a search result */
  lastUsed: number
  /** Recent provenance records (max 20 per chunk to limit storage) */
  provenance: ProvenanceRecord[]
}

/** Shape of the persisted usage-stats.json document. */
export interface UsageData {
  /** Per-chunk usage statistics, keyed by chunk_id */
  chunks: Record<string, ChunkUsageStats>
  /** Global counters */
  totalSearches: number
  lastUpdated: number
}

const MAX_PROVENANCE_PER_CHUNK = 20

// ---------------------------------------------------------------------------
// UsageTracker
// ---------------------------------------------------------------------------

/**
 * Tracks per-chunk usage counts and provenance and persists them as JSON at
 * `<cacheDir>/usage-stats.json`.
 *
 * All `record*` / `get*` methods are no-ops (or return empty values) until
 * `load()` has been called.
 */
export class UsageTracker {
  private data: UsageData | null = null
  private dirty = false
  private savePath: string
  /**
   * Tail of the save queue. Every `save()` call is chained onto it so that at
   * most one write to `savePath` is in flight at a time.
   */
  private pendingSave: Promise<void> = Promise.resolve()

  constructor(private cacheDir: string) {
    this.savePath = path.join(cacheDir, "usage-stats.json")
  }

  // ---- lifecycle ----------------------------------------------------------

  /**
   * Load persisted stats from disk.
   *
   * A missing or unparseable file yields empty stats. A file that parses but
   * has the wrong shape is sanitized field by field, so the in-memory state
   * always satisfies `UsageData` and later `record*` calls cannot throw.
   */
  async load(): Promise<void> {
    let parsed: unknown = null
    try {
      parsed = JSON.parse(await fs.readFile(this.savePath, "utf-8"))
    } catch {
      // Missing or corrupt file: start from empty stats.
    }
    this.data = UsageTracker.sanitize(parsed)
  }

  /**
   * Persist stats if anything changed since the last successful write.
   *
   * Calls are serialized: a save issued while another is still writing waits
   * for it instead of writing to the same file concurrently. The returned
   * promise never rejects; write failures are best-effort and leave the
   * tracker dirty so that a later `save()` retries.
   */
  save(): Promise<void> {
    const run = this.pendingSave.then(() => this.writeIfDirty())
    this.pendingSave = run
    return run
  }

  // ---- FR-060: record provenance ------------------------------------------

  /**
   * Record that `attachedChunkId` was attached to `mainChunkId` as context
   * for `query`, via `edgeType` relation. Only the most recent
   * `MAX_PROVENANCE_PER_CHUNK` records are kept per chunk.
   */
  recordProvenance(
    query: string,
    mainChunkId: string,
    attachedChunkId: string,
    edgeType: string,
  ): void {
    if (!this.data) return
    const stats = this.ensureChunkStats(this.data, attachedChunkId)
    stats.provenance.push({
      query,
      mainChunkId,
      edgeType,
      timestamp: Date.now(),
    })
    // Cap provenance history
    if (stats.provenance.length > MAX_PROVENANCE_PER_CHUNK) {
      stats.provenance = stats.provenance.slice(-MAX_PROVENANCE_PER_CHUNK)
    }
    this.dirty = true
  }

  // ---- FR-061: increment usage_count --------------------------------------

  /**
   * Record that these chunk IDs appeared in search results.
   * Call once per search with all result chunk IDs (main + attached).
   * An id listed several times is counted several times.
   */
  recordSearchResults(chunkIds: string[]): void {
    if (!this.data) return
    this.data.totalSearches++
    const now = Date.now()
    for (const id of chunkIds) {
      const stats = this.ensureChunkStats(this.data, id)
      stats.usageCount++
      stats.lastUsed = now
    }
    this.dirty = true
  }

  // ---- FR-062: "where is chunk X used?" -----------------------------------

  /**
   * Get provenance info for a chunk: which queries led to it,
   * which main chunks it was attached to, via which edges.
   * Returns an empty array for unknown chunks.
   */
  getChunkProvenance(chunkId: string): ProvenanceRecord[] {
    if (!this.data) return []
    return this.data.chunks[chunkId]?.provenance ?? []
  }

  /**
   * Get usage stats for a chunk, or `null` if the chunk was never recorded.
   */
  getChunkStats(chunkId: string): ChunkUsageStats | null {
    if (!this.data) return null
    return this.data.chunks[chunkId] ?? null
  }

  // ---- FR-063: usage_count as ranking signal ------------------------------

  /**
   * Get usage count for a chunk (0 if never seen).
   * Used as additional ranking signal in search.
   */
  getUsageCount(chunkId: string): number {
    if (!this.data) return 0
    return this.data.chunks[chunkId]?.usageCount ?? 0
  }

  /**
   * Get a usage boost factor for ranking (0.0 – 1.0).
   * Normalized: most-used chunk → 1.0, unused → 0.0.
   */
  getUsageBoost(chunkId: string): number {
    if (!this.data) return 0
    const stats = this.data.chunks[chunkId]
    if (!stats || stats.usageCount === 0) return 0

    // Find max usage count across all chunks for normalization
    let maxUsage = 1
    for (const s of Object.values(this.data.chunks)) {
      if (s.usageCount > maxUsage) maxUsage = s.usageCount
    }
    return stats.usageCount / maxUsage
  }

  // ---- summary ------------------------------------------------------------

  /**
   * Get global usage summary. All fields are 0 before `load()`.
   */
  getSummary(): { totalSearches: number; trackedChunks: number; lastUpdated: number } {
    if (!this.data) return { totalSearches: 0, trackedChunks: 0, lastUpdated: 0 }
    return {
      totalSearches: this.data.totalSearches,
      trackedChunks: Object.keys(this.data.chunks).length,
      lastUpdated: this.data.lastUpdated,
    }
  }

  // ---- internals ----------------------------------------------------------

  /** Return the stats entry for `chunkId`, creating an empty one if absent. */
  private ensureChunkStats(data: UsageData, chunkId: string): ChunkUsageStats {
    let stats = data.chunks[chunkId]
    if (!stats) {
      stats = { usageCount: 0, lastUsed: 0, provenance: [] }
      data.chunks[chunkId] = stats
    }
    return stats
  }

  /** Perform one write if there are unsaved changes. Never throws. */
  private async writeIfDirty(): Promise<void> {
    if (!this.dirty || !this.data) return
    this.data.lastUpdated = Date.now()
    // Snapshot and clear the flag BEFORE the first await: anything recorded
    // while the write is in flight re-sets `dirty` and is picked up by the
    // next save instead of being marked clean by this one.
    const payload = JSON.stringify(this.data, null, 2)
    this.dirty = false
    try {
      await fs.mkdir(path.dirname(this.savePath), { recursive: true })
      await fs.writeFile(this.savePath, payload, "utf-8")
    } catch {
      // non-fatal, but the snapshot never reached disk, so stay dirty
      this.dirty = true
    }
  }

  /** True for non-null, non-array objects. */
  private static isRecord(value: unknown): value is Record<string, unknown> {
    return typeof value === "object" && value !== null && !Array.isArray(value)
  }

  /** Coerce an untrusted value to a finite, non-negative number, else `fallback`. */
  private static toCount(value: unknown, fallback = 0): number {
    return typeof value === "number" && Number.isFinite(value) && value >= 0 ? value : fallback
  }

  /**
   * Build a well-formed `UsageData` from whatever `JSON.parse` produced.
   *
   * `chunks` is created without a prototype so that chunk ids such as
   * "constructor", "toString" or "__proto__" behave as ordinary keys instead
   * of resolving to inherited `Object.prototype` members.
   */
  private static sanitize(raw: unknown): UsageData {
    const chunks: Record<string, ChunkUsageStats> = Object.create(null)
    const result: UsageData = { chunks, totalSearches: 0, lastUpdated: Date.now() }
    if (!UsageTracker.isRecord(raw)) return result

    result.totalSearches = UsageTracker.toCount(raw.totalSearches)
    result.lastUpdated = UsageTracker.toCount(raw.lastUpdated, result.lastUpdated)

    if (!UsageTracker.isRecord(raw.chunks)) return result
    for (const [id, entry] of Object.entries(raw.chunks)) {
      if (!UsageTracker.isRecord(entry)) continue
      const provenance: ProvenanceRecord[] = []
      if (Array.isArray(entry.provenance)) {
        for (const p of entry.provenance as unknown[]) {
          if (
            UsageTracker.isRecord(p) &&
            typeof p.query === "string" &&
            typeof p.mainChunkId === "string" &&
            typeof p.edgeType === "string"
          ) {
            provenance.push({
              query: p.query,
              mainChunkId: p.mainChunkId,
              edgeType: p.edgeType,
              timestamp: UsageTracker.toCount(p.timestamp),
            })
          }
        }
      }
      chunks[id] = {
        usageCount: UsageTracker.toCount(entry.usageCount),
        lastUsed: UsageTracker.toCount(entry.lastUsed),
        provenance: provenance.slice(-MAX_PROVENANCE_PER_CHUNK),
      }
    }
    return result
  }
}