@ghcrawl/api-core 0.7.1 → 0.8.0-beta.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/service.js CHANGED
@@ -1,12 +1,15 @@
1
1
  import http from 'node:http';
2
2
  import crypto from 'node:crypto';
3
+ import fs from 'node:fs';
3
4
  import { existsSync } from 'node:fs';
5
+ import { createRequire } from 'node:module';
4
6
  import os from 'node:os';
7
+ import path from 'node:path';
5
8
  import { fileURLToPath } from 'node:url';
6
9
  import { Worker } from 'node:worker_threads';
7
10
  import { IterableMapper } from '@shutterstock/p-map-iterable';
8
11
  import { actionResponseSchema, authorThreadsResponseSchema, closeResponseSchema, clusterDetailResponseSchema, clusterResultSchema, clusterSummariesResponseSchema, clustersResponseSchema, embedResultSchema, healthResponseSchema, neighborsResponseSchema, refreshResponseSchema, repositoriesResponseSchema, searchResponseSchema, syncResultSchema, threadsResponseSchema, } from '@ghcrawl/api-contract';
9
- import { buildClusters } from './cluster/build.js';
12
+ import { buildClusters, buildRefinedClusters, buildSizeBoundedClusters } from './cluster/build.js';
10
13
  import { buildSourceKindEdges } from './cluster/exact-edges.js';
11
14
  import { ensureRuntimeDirs, isLikelyGitHubToken, isLikelyOpenAiApiKey, loadConfig, requireGithubToken, requireOpenAiKey, } from './config.js';
12
15
  import { migrate } from './db/migrate.js';
@@ -14,7 +17,8 @@ import { openDb } from './db/sqlite.js';
14
17
  import { buildCanonicalDocument, isBotLikeAuthor } from './documents/normalize.js';
15
18
  import { makeGitHubClient } from './github/client.js';
16
19
  import { OpenAiProvider } from './openai/provider.js';
17
- import { cosineSimilarity, normalizeEmbedding, rankNearestNeighbors } from './search/exact.js';
20
+ import { cosineSimilarity, dotProduct, normalizeEmbedding, rankNearestNeighbors, rankNearestNeighborsByScore } from './search/exact.js';
21
+ import { VectorliteStore } from './vector/vectorlite-store.js';
18
22
  const SYNC_BATCH_SIZE = 100;
19
23
  const SYNC_BATCH_DELAY_MS = 5000;
20
24
  const STALE_CLOSED_SWEEP_LIMIT = 1000;
@@ -23,10 +27,31 @@ const CLUSTER_PARALLEL_MIN_EMBEDDINGS = 5000;
23
27
  const EMBED_ESTIMATED_CHARS_PER_TOKEN = 3;
24
28
  const EMBED_MAX_ITEM_TOKENS = 7000;
25
29
  const EMBED_MAX_BATCH_TOKENS = 250000;
30
+ const requireFromHere = createRequire(import.meta.url);
26
31
  const EMBED_TRUNCATION_MARKER = '\n\n[truncated for embedding]';
27
32
  const EMBED_CONTEXT_RETRY_ATTEMPTS = 5;
28
33
  const EMBED_CONTEXT_RETRY_FALLBACK_SHRINK_RATIO = 0.9;
29
34
  const EMBED_CONTEXT_RETRY_TARGET_BUFFER_RATIO = 0.95;
35
+ const SUMMARY_PROMPT_VERSION = 'v1';
36
+ const ACTIVE_EMBED_DIMENSIONS = 1024;
37
+ const ACTIVE_EMBED_PIPELINE_VERSION = 'vectorlite-1024-v1';
38
+ const DEFAULT_CLUSTER_MIN_SCORE = 0.78;
39
+ const VECTORLITE_CLUSTER_EXPANDED_K = 24;
40
+ const VECTORLITE_CLUSTER_EXPANDED_MULTIPLIER = 4;
41
+ const VECTORLITE_CLUSTER_EXPANDED_CANDIDATE_K = 512;
42
+ const VECTORLITE_CLUSTER_EXPANDED_EF_SEARCH = 1024;
43
+ const SUMMARY_MODEL_PRICING = {
44
+ 'gpt-5-mini': {
45
+ inputCostPerM: 0.25,
46
+ cachedInputCostPerM: 0.025,
47
+ outputCostPerM: 2.0,
48
+ },
49
+ 'gpt-5.4-mini': {
50
+ inputCostPerM: 0.75,
51
+ cachedInputCostPerM: 0.075,
52
+ outputCostPerM: 4.5,
53
+ },
54
+ };
30
55
  function nowIso() {
31
56
  return new Date().toISOString();
32
57
  }
@@ -180,6 +205,7 @@ export class GHCrawlService {
180
205
  db;
181
206
  github;
182
207
  ai;
208
+ vectorStore;
183
209
  constructor(options = {}) {
184
210
  this.config = options.config ?? loadConfig();
185
211
  ensureRuntimeDirs(this.config);
@@ -187,8 +213,10 @@ export class GHCrawlService {
187
213
  migrate(this.db);
188
214
  this.github = options.github ?? (this.config.githubToken ? makeGitHubClient({ token: this.config.githubToken }) : undefined);
189
215
  this.ai = options.ai ?? (this.config.openaiApiKey ? new OpenAiProvider(this.config.openaiApiKey) : undefined);
216
+ this.vectorStore = options.vectorStore ?? new VectorliteStore();
190
217
  }
191
218
  close() {
219
+ this.vectorStore.close();
192
220
  this.db.close();
193
221
  }
194
222
  init() {
@@ -255,7 +283,17 @@ export class GHCrawlService {
255
283
  }
256
284
  }
257
285
  }
258
- return { health, github, openai };
286
+ const vectorliteHealth = this.vectorStore.checkRuntime();
287
+ return {
288
+ health,
289
+ github,
290
+ openai,
291
+ vectorlite: {
292
+ configured: this.config.vectorBackend === 'vectorlite',
293
+ runtimeOk: vectorliteHealth.ok,
294
+ error: vectorliteHealth.error,
295
+ },
296
+ };
259
297
  }
260
298
  listRepositories() {
261
299
  const rows = this.db.prepare('select * from repositories order by full_name asc').all();
@@ -602,31 +640,69 @@ export class GHCrawlService {
602
640
  });
603
641
  const pending = sources.filter((row) => {
604
642
  const latest = this.db
605
- .prepare('select content_hash from document_summaries where thread_id = ? and summary_kind = ? and model = ? limit 1')
643
+ .prepare('select content_hash, prompt_version from document_summaries where thread_id = ? and summary_kind = ? and model = ? limit 1')
606
644
  .get(row.id, 'dedupe_summary', this.config.summaryModel);
607
- return latest?.content_hash !== row.summaryContentHash;
645
+ return latest?.content_hash !== row.summaryContentHash || latest?.prompt_version !== SUMMARY_PROMPT_VERSION;
608
646
  });
609
647
  params.onProgress?.(`[summarize] pending=${pending.length} skipped=${rows.length - pending.length} model=${this.config.summaryModel}`);
610
648
  let summarized = 0;
611
649
  let inputTokens = 0;
612
650
  let outputTokens = 0;
613
651
  let totalTokens = 0;
614
- for (const [index, row] of pending.entries()) {
615
- params.onProgress?.(`[summarize] ${index + 1}/${pending.length} thread #${row.number}`);
652
+ let cachedInputTokens = 0;
653
+ const startTime = Date.now();
654
+ const pricing = SUMMARY_MODEL_PRICING[this.config.summaryModel] ?? null;
655
+ // Stage 1: concurrent API calls
656
+ const fetcher = new IterableMapper(pending, async (row) => {
616
657
  const result = await ai.summarizeThread({
617
658
  model: this.config.summaryModel,
618
659
  text: row.summaryInput,
619
660
  });
661
+ return { row, result };
662
+ }, { concurrency: 5 });
663
+ // Stage 2: sequential DB writes — consumes from fetcher without blocking API completions
664
+ const writer = new IterableMapper(fetcher, async ({ row, result }) => {
620
665
  const summary = result.summary;
621
666
  this.upsertSummary(row.id, row.summaryContentHash, 'problem_summary', summary.problemSummary);
622
667
  this.upsertSummary(row.id, row.summaryContentHash, 'solution_summary', summary.solutionSummary);
623
668
  this.upsertSummary(row.id, row.summaryContentHash, 'maintainer_signal_summary', summary.maintainerSignalSummary);
624
669
  this.upsertSummary(row.id, row.summaryContentHash, 'dedupe_summary', summary.dedupeSummary);
625
- if (result.usage) {
626
- inputTokens += result.usage.inputTokens;
627
- outputTokens += result.usage.outputTokens;
628
- totalTokens += result.usage.totalTokens;
629
- params.onProgress?.(`[summarize] tokens thread #${row.number} in=${result.usage.inputTokens} out=${result.usage.outputTokens} total=${result.usage.totalTokens} cached_in=${result.usage.cachedInputTokens} reasoning=${result.usage.reasoningTokens}`);
670
+ return { row, usage: result.usage };
671
+ }, { concurrency: 1 });
672
+ let index = 0;
673
+ for await (const { row, usage } of writer) {
674
+ index += 1;
675
+ if (usage) {
676
+ inputTokens += usage.inputTokens;
677
+ outputTokens += usage.outputTokens;
678
+ totalTokens += usage.totalTokens;
679
+ cachedInputTokens += usage.cachedInputTokens;
680
+ }
681
+ // Compute cost and ETA every 10 items or on the last item
682
+ if (index % 10 === 0 || index === pending.length) {
683
+ const remaining = pending.length - index;
684
+ const avgIn = inputTokens / index;
685
+ const avgOut = outputTokens / index;
686
+ const avgCachedIn = cachedInputTokens / index;
687
+ const elapsedSec = (Date.now() - startTime) / 1000;
688
+ const secPerItem = elapsedSec / index;
689
+ const etaSec = remaining * secPerItem;
690
+ const etaMin = Math.round(etaSec / 60);
691
+ const etaStr = etaMin >= 60 ? `${Math.floor(etaMin / 60)}h${etaMin % 60}m` : `${etaMin}m`;
692
+ if (pricing) {
693
+ const uncachedInput = inputTokens - cachedInputTokens;
694
+ const costSoFar = (uncachedInput / 1_000_000) * pricing.inputCostPerM +
695
+ (cachedInputTokens / 1_000_000) * pricing.cachedInputCostPerM +
696
+ (outputTokens / 1_000_000) * pricing.outputCostPerM;
697
+ const estTotalCost = costSoFar +
698
+ ((remaining * (avgIn - avgCachedIn)) / 1_000_000) * pricing.inputCostPerM +
699
+ ((remaining * avgCachedIn) / 1_000_000) * pricing.cachedInputCostPerM +
700
+ ((remaining * avgOut) / 1_000_000) * pricing.outputCostPerM;
701
+ params.onProgress?.(`[summarize] ${index}/${pending.length} thread #${row.number} | cost=$${costSoFar.toFixed(2)} est_total=$${estTotalCost.toFixed(2)} | avg_in=${Math.round(avgIn)} avg_out=${Math.round(avgOut)} | ETA ${etaStr}`);
702
+ }
703
+ else {
704
+ params.onProgress?.(`[summarize] ${index}/${pending.length} thread #${row.number} | avg_in=${Math.round(avgIn)} avg_out=${Math.round(avgOut)} | ETA ${etaStr}`);
705
+ }
630
706
  }
631
707
  summarized += 1;
632
708
  }
@@ -670,11 +746,25 @@ export class GHCrawlService {
670
746
  const repository = this.requireRepository(params.owner, params.repo);
671
747
  const runId = this.startRun('embedding_runs', repository.id, params.threadNumber ? `thread:${params.threadNumber}` : repository.fullName);
672
748
  try {
673
- const { rows, tasks, pending } = this.getEmbeddingWorkset(repository.id, params.threadNumber);
749
+ if (params.threadNumber === undefined) {
750
+ if (!this.isRepoVectorStateCurrent(repository.id)) {
751
+ this.resetRepositoryVectors(repository.id, repository.fullName);
752
+ }
753
+ else {
754
+ const pruned = this.pruneInactiveRepositoryVectors(repository.id, repository.fullName);
755
+ if (pruned > 0) {
756
+ params.onProgress?.(`[embed] pruned ${pruned} closed or inactive vector(s) before refresh`);
757
+ }
758
+ }
759
+ }
760
+ const { rows, tasks, pending, missingSummaryThreadNumbers } = this.getEmbeddingWorkset(repository.id, params.threadNumber);
674
761
  const skipped = tasks.length - pending.length;
675
762
  const truncated = tasks.filter((task) => task.wasTruncated).length;
676
- params.onProgress?.(`[embed] loaded ${rows.length} open thread(s) and ${tasks.length} embedding source(s) for ${repository.fullName}`);
677
- params.onProgress?.(`[embed] pending=${pending.length} skipped=${skipped} truncated=${truncated} model=${this.config.embedModel} batch_size=${this.config.embedBatchSize} concurrency=${this.config.embedConcurrency} max_unread=${this.config.embedMaxUnread} max_batch_tokens=${EMBED_MAX_BATCH_TOKENS}`);
763
+ if (missingSummaryThreadNumbers.length > 0) {
764
+ throw new Error(`Embedding basis ${this.config.embeddingBasis} requires summaries before embedding. Missing summaries for thread(s): ${missingSummaryThreadNumbers.slice(0, 10).join(', ')}${missingSummaryThreadNumbers.length > 10 ? ', …' : ''}.`);
765
+ }
766
+ params.onProgress?.(`[embed] loaded ${rows.length} open thread(s) and ${tasks.length} active vector task(s) for ${repository.fullName}`);
767
+ params.onProgress?.(`[embed] pending=${pending.length} skipped=${skipped} truncated=${truncated} model=${this.config.embedModel} dimensions=${ACTIVE_EMBED_DIMENSIONS} basis=${this.config.embeddingBasis} batch_size=${this.config.embedBatchSize} concurrency=${this.config.embedConcurrency} max_unread=${this.config.embedMaxUnread} max_batch_tokens=${EMBED_MAX_BATCH_TOKENS}`);
678
768
  let embedded = 0;
679
769
  const batches = this.chunkEmbeddingTasks(pending, this.config.embedBatchSize, EMBED_MAX_BATCH_TOKENS);
680
770
  const mapper = new IterableMapper(batches, async (batch) => {
@@ -686,14 +776,15 @@ export class GHCrawlService {
686
776
  let completedBatches = 0;
687
777
  for await (const batchResult of mapper) {
688
778
  completedBatches += 1;
689
- const numbers = batchResult.map(({ task }) => `#${task.threadNumber}:${task.sourceKind}`);
779
+ const numbers = batchResult.map(({ task }) => `#${task.threadNumber}:${task.basis}`);
690
780
  const estimatedTokens = batchResult.reduce((sum, { task }) => sum + task.estimatedTokens, 0);
691
781
  params.onProgress?.(`[embed] batch ${completedBatches}/${Math.max(batches.length, 1)} size=${batchResult.length} est_tokens=${estimatedTokens} items=${numbers.join(',')}`);
692
782
  for (const { task, embedding } of batchResult) {
693
- this.upsertEmbedding(task.threadId, task.sourceKind, task.contentHash, embedding);
783
+ this.upsertActiveVector(repository.id, repository.fullName, task.threadId, task.basis, task.contentHash, embedding);
694
784
  embedded += 1;
695
785
  }
696
786
  }
787
+ this.markRepoVectorsCurrent(repository.id);
697
788
  this.finishRun('embedding_runs', runId, 'completed', { embedded });
698
789
  return embedResultSchema.parse({ runId, embedded });
699
790
  }
@@ -705,16 +796,70 @@ export class GHCrawlService {
705
796
  async clusterRepository(params) {
706
797
  const repository = this.requireRepository(params.owner, params.repo);
707
798
  const runId = this.startRun('cluster_runs', repository.id, repository.fullName);
708
- const minScore = params.minScore ?? 0.82;
799
+ const minScore = params.minScore ?? DEFAULT_CLUSTER_MIN_SCORE;
709
800
  const k = params.k ?? 6;
710
801
  try {
711
- const { items, sourceKinds } = this.loadClusterableThreadMeta(repository.id);
712
- params.onProgress?.(`[cluster] loaded ${items.length} embedded thread(s) across ${sourceKinds.length} source kind(s) for ${repository.fullName} k=${k} minScore=${minScore}`);
713
- const aggregatedEdges = await this.aggregateRepositoryEdges(repository.id, sourceKinds, {
714
- limit: k,
715
- minScore,
716
- onProgress: params.onProgress,
717
- });
802
+ let items;
803
+ let aggregatedEdges;
804
+ if (this.isRepoVectorStateCurrent(repository.id)) {
805
+ const vectorItems = this.loadClusterableActiveVectorMeta(repository.id, repository.fullName);
806
+ const activeIds = new Set(vectorItems.map((item) => item.id));
807
+ const annQuery = this.getVectorliteClusterQuery(vectorItems.length, k);
808
+ aggregatedEdges = new Map();
809
+ let processed = 0;
810
+ let lastProgressAt = Date.now();
811
+ params.onProgress?.(`[cluster] loaded ${vectorItems.length} active vector(s) for ${repository.fullName} backend=${this.config.vectorBackend} k=${k} query_limit=${annQuery.limit} candidateK=${annQuery.candidateK} efSearch=${annQuery.efSearch ?? 'default'} minScore=${minScore}`);
812
+ for (const item of vectorItems) {
813
+ const neighbors = this.vectorStore.queryNearest({
814
+ storePath: this.repoVectorStorePath(repository.fullName),
815
+ dimensions: ACTIVE_EMBED_DIMENSIONS,
816
+ vector: item.embedding,
817
+ limit: annQuery.limit,
818
+ candidateK: annQuery.candidateK + 1,
819
+ efSearch: annQuery.efSearch,
820
+ excludeThreadId: item.id,
821
+ });
822
+ for (const neighbor of neighbors) {
823
+ if (!activeIds.has(neighbor.threadId))
824
+ continue;
825
+ if (neighbor.score < minScore)
826
+ continue;
827
+ const key = this.edgeKey(item.id, neighbor.threadId);
828
+ const existing = aggregatedEdges.get(key);
829
+ if (existing) {
830
+ existing.score = Math.max(existing.score, neighbor.score);
831
+ }
832
+ else {
833
+ aggregatedEdges.set(key, {
834
+ leftThreadId: Math.min(item.id, neighbor.threadId),
835
+ rightThreadId: Math.max(item.id, neighbor.threadId),
836
+ score: neighbor.score,
837
+ sourceKinds: new Set(['dedupe_summary']),
838
+ });
839
+ }
840
+ }
841
+ processed += 1;
842
+ const now = Date.now();
843
+ if (params.onProgress && now - lastProgressAt >= CLUSTER_PROGRESS_INTERVAL_MS) {
844
+ params.onProgress(`[cluster] queried ${processed}/${vectorItems.length} vectors current_edges=${aggregatedEdges.size}`);
845
+ lastProgressAt = now;
846
+ }
847
+ }
848
+ items = vectorItems;
849
+ }
850
+ else if (this.hasLegacyEmbeddings(repository.id)) {
851
+ const legacy = this.loadClusterableThreadMeta(repository.id);
852
+ items = legacy.items;
853
+ params.onProgress?.(`[cluster] loaded ${items.length} legacy embedded thread(s) across ${legacy.sourceKinds.length} source kind(s) for ${repository.fullName} k=${k} minScore=${minScore}`);
854
+ aggregatedEdges = await this.aggregateRepositoryEdges(repository.id, legacy.sourceKinds, {
855
+ limit: k,
856
+ minScore,
857
+ onProgress: params.onProgress,
858
+ });
859
+ }
860
+ else {
861
+ throw new Error(`Vectors for ${repository.fullName} are stale or missing. Run refresh or embed first.`);
862
+ }
718
863
  const edges = Array.from(aggregatedEdges.values()).map((entry) => ({
719
864
  leftThreadId: entry.leftThreadId,
720
865
  rightThreadId: entry.rightThreadId,
@@ -724,6 +869,10 @@ export class GHCrawlService {
724
869
  const clusters = buildClusters(items.map((item) => ({ threadId: item.id, number: item.number, title: item.title })), edges);
725
870
  this.persistClusterRun(repository.id, runId, aggregatedEdges, clusters);
726
871
  this.pruneOldClusterRuns(repository.id, runId);
872
+ if (this.isRepoVectorStateCurrent(repository.id)) {
873
+ this.markRepoClustersCurrent(repository.id);
874
+ this.cleanupMigratedRepositoryArtifacts(repository.id, repository.fullName, params.onProgress);
875
+ }
727
876
  params.onProgress?.(`[cluster] persisted ${clusters.length} cluster(s) and pruned older cluster runs`);
728
877
  this.finishRun('cluster_runs', runId, 'completed', { edges: edges.length, clusters: clusters.length });
729
878
  return clusterResultSchema.parse({ runId, edges: edges.length, clusters: clusters.length });
@@ -733,6 +882,263 @@ export class GHCrawlService {
733
882
  throw error;
734
883
  }
735
884
  }
885
+ clusterExperiment(params) {
886
+ const backend = params.backend ?? 'vectorlite';
887
+ const repository = this.requireRepository(params.owner, params.repo);
888
+ const loaded = this.loadClusterableThreadMeta(repository.id);
889
+ const activeVectors = this.isRepoVectorStateCurrent(repository.id) ? this.loadNormalizedActiveVectors(repository.id) : [];
890
+ const activeSourceKind = this.config.embeddingBasis === 'title_summary' ? 'dedupe_summary' : 'body';
891
+ const useActiveVectors = activeVectors.length > 0 && (params.sourceKinds === undefined || loaded.items.length === 0);
892
+ const sourceKinds = useActiveVectors ? [activeSourceKind] : (params.sourceKinds ?? loaded.sourceKinds);
893
+ const items = useActiveVectors
894
+ ? activeVectors.map((item) => ({ id: item.id, number: item.number, title: item.title }))
895
+ : loaded.items;
896
+ const aggregation = params.aggregation ?? 'max';
897
+ const minScore = params.minScore ?? DEFAULT_CLUSTER_MIN_SCORE;
898
+ const k = params.k ?? 6;
899
+ const candidateK = Math.max(k, params.candidateK ?? Math.max(k * 16, 64));
900
+ const efSearch = params.efSearch;
901
+ const startedAt = Date.now();
902
+ const memoryBefore = process.memoryUsage();
903
+ let peakRssBytes = memoryBefore.rss;
904
+ let peakHeapUsedBytes = memoryBefore.heapUsed;
905
+ const recordMemory = () => {
906
+ const usage = process.memoryUsage();
907
+ peakRssBytes = Math.max(peakRssBytes, usage.rss);
908
+ peakHeapUsedBytes = Math.max(peakHeapUsedBytes, usage.heapUsed);
909
+ };
910
+ recordMemory();
911
+ if (useActiveVectors && params.sourceKinds && loaded.items.length === 0) {
912
+ params.onProgress?.(`[cluster-experiment] legacy source embeddings are unavailable for ${repository.fullName}; falling back to active ${this.config.embeddingBasis} vectors`);
913
+ }
914
+ params.onProgress?.(`[cluster-experiment] loaded ${items.length} embedded thread(s) across ${sourceKinds.length} source kind(s) for ${repository.fullName} backend=${backend} k=${k} candidateK=${candidateK} minScore=${minScore} aggregation=${aggregation}`);
915
+ const perSourceScores = new Map();
916
+ let loadMs = 0;
917
+ let setupMs = 0;
918
+ let edgeBuildMs = 0;
919
+ let indexBuildMs = 0;
920
+ let queryMs = 0;
921
+ let clusterBuildMs = 0;
922
+ let tempDbPath = null;
923
+ let tempDb = null;
924
+ let tempDir = null;
925
+ try {
926
+ if (backend === 'exact') {
927
+ if (useActiveVectors) {
928
+ const loadStartedAt = Date.now();
929
+ const normalizedRows = activeVectors.map(({ id, embedding }) => ({ id, normalizedEmbedding: embedding }));
930
+ loadMs += Date.now() - loadStartedAt;
931
+ recordMemory();
932
+ const edgesStartedAt = Date.now();
933
+ const edges = buildSourceKindEdges(normalizedRows, {
934
+ limit: k,
935
+ minScore,
936
+ progressIntervalMs: CLUSTER_PROGRESS_INTERVAL_MS,
937
+ onProgress: (progress) => {
938
+ recordMemory();
939
+ if (!params.onProgress)
940
+ return;
941
+ params.onProgress(`[cluster-experiment] exact ${progress.processedItems}/${normalizedRows.length} active vectors processed current_edges~=${perSourceScores.size + progress.currentEdgeEstimate}`);
942
+ },
943
+ });
944
+ edgeBuildMs += Date.now() - edgesStartedAt;
945
+ this.collectSourceKindScores(perSourceScores, edges, activeSourceKind);
946
+ recordMemory();
947
+ }
948
+ else {
949
+ const totalItems = sourceKinds.reduce((sum, sourceKind) => sum + this.countEmbeddingsForSourceKind(repository.id, sourceKind), 0);
950
+ let processedItems = 0;
951
+ for (const sourceKind of sourceKinds) {
952
+ const loadStartedAt = Date.now();
953
+ const normalizedRows = this.loadNormalizedEmbeddingsForSourceKind(repository.id, sourceKind);
954
+ loadMs += Date.now() - loadStartedAt;
955
+ recordMemory();
956
+ const edgesStartedAt = Date.now();
957
+ const edges = buildSourceKindEdges(normalizedRows, {
958
+ limit: k,
959
+ minScore,
960
+ progressIntervalMs: CLUSTER_PROGRESS_INTERVAL_MS,
961
+ onProgress: (progress) => {
962
+ recordMemory();
963
+ if (!params.onProgress)
964
+ return;
965
+ params.onProgress(`[cluster-experiment] exact ${processedItems + progress.processedItems}/${totalItems} source embeddings processed current_edges~=${perSourceScores.size + progress.currentEdgeEstimate}`);
966
+ },
967
+ });
968
+ edgeBuildMs += Date.now() - edgesStartedAt;
969
+ processedItems += normalizedRows.length;
970
+ this.collectSourceKindScores(perSourceScores, edges, sourceKind);
971
+ recordMemory();
972
+ }
973
+ }
974
+ }
975
+ else {
976
+ const setupStartedAt = Date.now();
977
+ tempDir = fs.mkdtempSync(path.join(os.tmpdir(), 'ghcrawl-vectorlite-'));
978
+ tempDbPath = path.join(tempDir, 'cluster-experiment.db');
979
+ tempDb = openDb(tempDbPath);
980
+ tempDb.pragma('journal_mode = MEMORY');
981
+ tempDb.pragma('synchronous = OFF');
982
+ tempDb.pragma('temp_store = MEMORY');
983
+ const vectorlite = requireFromHere('vectorlite');
984
+ tempDb.loadExtension(vectorlite.vectorlitePath());
985
+ setupMs += Date.now() - setupStartedAt;
986
+ recordMemory();
987
+ const vectorSources = useActiveVectors
988
+ ? [
989
+ {
990
+ sourceKind: activeSourceKind,
991
+ rows: activeVectors.map(({ id, embedding }) => ({ id, normalizedEmbedding: embedding })),
992
+ },
993
+ ]
994
+ : sourceKinds.map((sourceKind) => ({
995
+ sourceKind,
996
+ rows: this.loadNormalizedEmbeddingsForSourceKind(repository.id, sourceKind).map((row) => ({
997
+ id: row.id,
998
+ normalizedEmbedding: row.normalizedEmbedding,
999
+ })),
1000
+ }));
1001
+ for (const source of vectorSources) {
1002
+ const sourceRowCount = source.rows.length;
1003
+ if (sourceRowCount === 0) {
1004
+ continue;
1005
+ }
1006
+ const dimension = source.rows[0].normalizedEmbedding.length;
1007
+ const safeCandidateK = Math.min(candidateK, Math.max(1, sourceRowCount - 1));
1008
+ const tableName = `vector_${source.sourceKind}`;
1009
+ params.onProgress?.(`[cluster-experiment] building ${source.sourceKind} HNSW index with ${sourceRowCount} vector(s)`);
1010
+ const indexStartedAt = Date.now();
1011
+ tempDb.exec(`create virtual table ${tableName} using vectorlite(vec float32[${dimension}], hnsw(max_elements=${sourceRowCount}));`);
1012
+ const insert = tempDb.prepare(`insert into ${tableName}(rowid, vec) values (?, ?)`);
1013
+ tempDb.transaction(() => {
1014
+ const loadStartedAt = Date.now();
1015
+ for (const row of source.rows) {
1016
+ insert.run(row.id, this.normalizedEmbeddingBuffer(row.normalizedEmbedding));
1017
+ }
1018
+ loadMs += Date.now() - loadStartedAt;
1019
+ })();
1020
+ indexBuildMs += Date.now() - indexStartedAt;
1021
+ recordMemory();
1022
+ const queryStartedAt = Date.now();
1023
+ const querySql = efSearch !== undefined
1024
+ ? `select rowid, distance from ${tableName} where knn_search(vec, knn_param(?, ${safeCandidateK + 1}, ${efSearch}))`
1025
+ : `select rowid, distance from ${tableName} where knn_search(vec, knn_param(?, ${safeCandidateK + 1}))`;
1026
+ const query = tempDb.prepare(querySql);
1027
+ let processed = 0;
1028
+ let lastProgressAt = Date.now();
1029
+ const queryLoadStartedAt = Date.now();
1030
+ for (const row of source.rows) {
1031
+ const candidates = query.all(this.normalizedEmbeddingBuffer(row.normalizedEmbedding));
1032
+ const ranked = rankNearestNeighborsByScore(candidates, {
1033
+ limit: k,
1034
+ minScore,
1035
+ score: (candidate) => {
1036
+ if (candidate.rowid === row.id) {
1037
+ return -1;
1038
+ }
1039
+ return this.normalizedDistanceToScore(candidate.distance);
1040
+ },
1041
+ });
1042
+ let addedThisRow = 0;
1043
+ for (const candidate of ranked) {
1044
+ const score = candidate.score;
1045
+ const key = this.edgeKey(row.id, candidate.item.rowid);
1046
+ const existing = perSourceScores.get(key);
1047
+ if (existing) {
1048
+ existing.scores.set(source.sourceKind, Math.max(existing.scores.get(source.sourceKind) ?? -1, score));
1049
+ continue;
1050
+ }
1051
+ const scores = new Map();
1052
+ scores.set(source.sourceKind, score);
1053
+ perSourceScores.set(key, {
1054
+ leftThreadId: Math.min(row.id, candidate.item.rowid),
1055
+ rightThreadId: Math.max(row.id, candidate.item.rowid),
1056
+ scores,
1057
+ });
1058
+ addedThisRow += 1;
1059
+ }
1060
+ processed += 1;
1061
+ const now = Date.now();
1062
+ if (params.onProgress && now - lastProgressAt >= CLUSTER_PROGRESS_INTERVAL_MS) {
1063
+ recordMemory();
1064
+ params.onProgress(`[cluster-experiment] querying ${source.sourceKind} index ${processed}/${sourceRowCount} current_edges=${perSourceScores.size} added_this_step=${addedThisRow}`);
1065
+ lastProgressAt = now;
1066
+ }
1067
+ }
1068
+ loadMs += Date.now() - queryLoadStartedAt;
1069
+ queryMs += Date.now() - queryStartedAt;
1070
+ tempDb.exec(`drop table ${tableName}`);
1071
+ recordMemory();
1072
+ }
1073
+ }
1074
+ // Finalize edge scores using the configured aggregation method
1075
+ const defaultWeights = { dedupe_summary: 0.5, title: 0.3, body: 0.2 };
1076
+ const weights = { ...defaultWeights, ...(params.aggregationWeights ?? {}) };
1077
+ const aggregated = this.finalizeEdgeScores(perSourceScores, aggregation, weights, minScore);
1078
+ params.onProgress?.(`[cluster-experiment] finalized ${aggregated.length} edges from ${perSourceScores.size} candidate pairs using ${aggregation} aggregation`);
1079
+ const clusterStartedAt = Date.now();
1080
+ const clusterNodes = items.map((item) => ({ threadId: item.id, number: item.number, title: item.title }));
1081
+ const clusterEdges = aggregated;
1082
+ const clusterMode = params.clusterMode ?? (params.maxClusterSize !== undefined ? 'refine' : 'basic');
1083
+ const clusters = clusterMode === 'bounded'
1084
+ ? buildSizeBoundedClusters(clusterNodes, clusterEdges, {
1085
+ maxClusterSize: params.maxClusterSize ?? 200,
1086
+ })
1087
+ : clusterMode === 'refine'
1088
+ ? buildRefinedClusters(clusterNodes, clusterEdges, {
1089
+ maxClusterSize: params.maxClusterSize ?? 200,
1090
+ refineStep: params.refineStep ?? 0.02,
1091
+ })
1092
+ : buildClusters(clusterNodes, clusterEdges);
1093
+ clusterBuildMs += Date.now() - clusterStartedAt;
1094
+ recordMemory();
1095
+ const memoryAfter = process.memoryUsage();
1096
+ const durationMs = backend === 'vectorlite'
1097
+ ? indexBuildMs + queryMs + clusterBuildMs
1098
+ : edgeBuildMs + clusterBuildMs;
1099
+ const totalDurationMs = Date.now() - startedAt;
1100
+ return {
1101
+ backend,
1102
+ repository,
1103
+ tempDbPath,
1104
+ threads: items.length,
1105
+ sourceKinds: sourceKinds.length,
1106
+ edges: aggregated.length,
1107
+ clusters: clusters.length,
1108
+ timingBasis: 'cluster-only',
1109
+ durationMs,
1110
+ totalDurationMs,
1111
+ loadMs,
1112
+ setupMs,
1113
+ edgeBuildMs,
1114
+ indexBuildMs,
1115
+ queryMs,
1116
+ clusterBuildMs,
1117
+ candidateK,
1118
+ memory: {
1119
+ rssBeforeBytes: memoryBefore.rss,
1120
+ rssAfterBytes: memoryAfter.rss,
1121
+ peakRssBytes,
1122
+ heapUsedBeforeBytes: memoryBefore.heapUsed,
1123
+ heapUsedAfterBytes: memoryAfter.heapUsed,
1124
+ peakHeapUsedBytes,
1125
+ },
1126
+ clusterSizes: this.summarizeClusterSizes(clusters),
1127
+ clustersDetail: params.includeClusters
1128
+ ? clusters.map((cluster) => ({
1129
+ representativeThreadId: cluster.representativeThreadId,
1130
+ memberThreadIds: [...cluster.members],
1131
+ }))
1132
+ : null,
1133
+ };
1134
+ }
1135
+ finally {
1136
+ tempDb?.close();
1137
+ if (tempDir) {
1138
+ fs.rmSync(tempDir, { recursive: true, force: true });
1139
+ }
1140
+ }
1141
+ }
736
1142
  async searchRepository(params) {
737
1143
  const mode = params.mode ?? 'hybrid';
738
1144
  const repository = this.requireRepository(params.owner, params.repo);
@@ -754,12 +1160,33 @@ export class GHCrawlService {
754
1160
  }
755
1161
  }
756
1162
  if (mode !== 'keyword' && this.ai) {
757
- const [queryEmbedding] = await this.ai.embedTexts({ model: this.config.embedModel, texts: [params.query] });
758
- for (const row of this.iterateStoredEmbeddings(repository.id)) {
759
- const score = cosineSimilarity(queryEmbedding, JSON.parse(row.embedding_json));
760
- if (score < 0.2)
761
- continue;
762
- semanticScores.set(row.id, Math.max(semanticScores.get(row.id) ?? -1, score));
1163
+ if (this.isRepoVectorStateCurrent(repository.id)) {
1164
+ const [queryEmbedding] = await this.ai.embedTexts({
1165
+ model: this.config.embedModel,
1166
+ texts: [params.query],
1167
+ dimensions: ACTIVE_EMBED_DIMENSIONS,
1168
+ });
1169
+ const neighbors = this.vectorStore.queryNearest({
1170
+ storePath: this.repoVectorStorePath(repository.fullName),
1171
+ dimensions: ACTIVE_EMBED_DIMENSIONS,
1172
+ vector: queryEmbedding,
1173
+ limit: limit * 2,
1174
+ candidateK: Math.max(limit * 8, 64),
1175
+ });
1176
+ for (const neighbor of neighbors) {
1177
+ if (neighbor.score < 0.2)
1178
+ continue;
1179
+ semanticScores.set(neighbor.threadId, Math.max(semanticScores.get(neighbor.threadId) ?? -1, neighbor.score));
1180
+ }
1181
+ }
1182
+ else if (this.hasLegacyEmbeddings(repository.id)) {
1183
+ const [queryEmbedding] = await this.ai.embedTexts({ model: this.config.embedModel, texts: [params.query] });
1184
+ for (const row of this.iterateStoredEmbeddings(repository.id)) {
1185
+ const score = cosineSimilarity(queryEmbedding, JSON.parse(row.embedding_json));
1186
+ if (score < 0.2)
1187
+ continue;
1188
+ semanticScores.set(row.id, Math.max(semanticScores.get(row.id) ?? -1, score));
1189
+ }
763
1190
  }
764
1191
  }
765
1192
  const candidateIds = new Set([...keywordScores.keys(), ...semanticScores.keys()]);
@@ -827,43 +1254,97 @@ export class GHCrawlService {
827
1254
  const repository = this.requireRepository(params.owner, params.repo);
828
1255
  const limit = params.limit ?? 10;
829
1256
  const minScore = params.minScore ?? 0.2;
830
- const targetRows = this.loadStoredEmbeddingsForThreadNumber(repository.id, params.threadNumber);
831
- if (targetRows.length === 0) {
832
- throw new Error(`Thread #${params.threadNumber} for ${repository.fullName} was not found with an embedding. Run embed first.`);
833
- }
834
- const targetRow = targetRows[0];
835
- const targetBySource = new Map();
836
- for (const row of targetRows) {
837
- targetBySource.set(row.source_kind, JSON.parse(row.embedding_json));
1257
+ const targetRow = this.db
1258
+ .prepare(`select t.*, tv.basis, tv.model, tv.dimensions, tv.content_hash, tv.vector_json, tv.vector_backend
1259
+ from threads t
1260
+ join thread_vectors tv on tv.thread_id = t.id
1261
+ where t.repo_id = ?
1262
+ and t.number = ?
1263
+ and t.state = 'open'
1264
+ and t.closed_at_local is null
1265
+ and tv.model = ?
1266
+ and tv.basis = ?
1267
+ and tv.dimensions = ?
1268
+ limit 1`)
1269
+ .get(repository.id, params.threadNumber, this.config.embedModel, this.config.embeddingBasis, ACTIVE_EMBED_DIMENSIONS);
1270
+ let responseThread;
1271
+ let neighbors;
1272
+ if (targetRow) {
1273
+ responseThread = targetRow;
1274
+ const candidateRows = this.vectorStore
1275
+ .queryNearest({
1276
+ storePath: this.repoVectorStorePath(repository.fullName),
1277
+ dimensions: ACTIVE_EMBED_DIMENSIONS,
1278
+ vector: this.parseStoredVector(targetRow.vector_json),
1279
+ limit: limit * 2,
1280
+ candidateK: Math.max(limit * 8, 64),
1281
+ excludeThreadId: targetRow.id,
1282
+ })
1283
+ .filter((row) => row.score >= minScore);
1284
+ const candidateIds = candidateRows.map((row) => row.threadId);
1285
+ const neighborMeta = candidateIds.length
1286
+ ? this.db
1287
+ .prepare(`select * from threads
1288
+ where repo_id = ? and state = 'open' and closed_at_local is null and id in (${candidateIds.map(() => '?').join(',')})`)
1289
+ .all(repository.id, ...candidateIds)
1290
+ : [];
1291
+ const metaById = new Map(neighborMeta.map((row) => [row.id, row]));
1292
+ neighbors = candidateRows
1293
+ .map((row) => {
1294
+ const meta = metaById.get(row.threadId);
1295
+ if (!meta) {
1296
+ return null;
1297
+ }
1298
+ return {
1299
+ threadId: row.threadId,
1300
+ number: meta.number,
1301
+ kind: meta.kind,
1302
+ title: meta.title,
1303
+ score: row.score,
1304
+ };
1305
+ })
1306
+ .filter((row) => row !== null)
1307
+ .slice(0, limit);
838
1308
  }
839
- const aggregated = new Map();
840
- for (const row of this.iterateStoredEmbeddings(repository.id)) {
841
- if (row.id === targetRow.id)
842
- continue;
843
- const targetEmbedding = targetBySource.get(row.source_kind);
844
- if (!targetEmbedding)
845
- continue;
846
- const score = cosineSimilarity(targetEmbedding, JSON.parse(row.embedding_json));
847
- if (score < minScore)
848
- continue;
849
- const previous = aggregated.get(row.id);
850
- if (!previous || score > previous.score) {
851
- aggregated.set(row.id, { number: row.number, kind: row.kind, title: row.title, score });
1309
+ else {
1310
+ const targetRows = this.loadStoredEmbeddingsForThreadNumber(repository.id, params.threadNumber);
1311
+ if (targetRows.length === 0) {
1312
+ throw new Error(`Thread #${params.threadNumber} for ${repository.fullName} was not found with an embedding. Run embed first.`);
1313
+ }
1314
+ responseThread = targetRows[0];
1315
+ const targetBySource = new Map();
1316
+ for (const row of targetRows) {
1317
+ targetBySource.set(row.source_kind, JSON.parse(row.embedding_json));
1318
+ }
1319
+ const aggregated = new Map();
1320
+ for (const row of this.iterateStoredEmbeddings(repository.id)) {
1321
+ if (row.id === responseThread.id)
1322
+ continue;
1323
+ const targetEmbedding = targetBySource.get(row.source_kind);
1324
+ if (!targetEmbedding)
1325
+ continue;
1326
+ const score = cosineSimilarity(targetEmbedding, JSON.parse(row.embedding_json));
1327
+ if (score < minScore)
1328
+ continue;
1329
+ const previous = aggregated.get(row.id);
1330
+ if (!previous || score > previous.score) {
1331
+ aggregated.set(row.id, { number: row.number, kind: row.kind, title: row.title, score });
1332
+ }
852
1333
  }
1334
+ neighbors = Array.from(aggregated.entries())
1335
+ .map(([threadId, value]) => ({
1336
+ threadId,
1337
+ number: value.number,
1338
+ kind: value.kind,
1339
+ title: value.title,
1340
+ score: value.score,
1341
+ }))
1342
+ .sort((left, right) => right.score - left.score)
1343
+ .slice(0, limit);
853
1344
  }
854
- const neighbors = Array.from(aggregated.entries())
855
- .map(([threadId, value]) => ({
856
- threadId,
857
- number: value.number,
858
- kind: value.kind,
859
- title: value.title,
860
- score: value.score,
861
- }))
862
- .sort((left, right) => right.score - left.score)
863
- .slice(0, limit);
864
1345
  return neighborsResponseSchema.parse({
865
1346
  repository,
866
- thread: threadToDto(targetRow),
1347
+ thread: threadToDto(responseThread),
867
1348
  neighbors,
868
1349
  });
869
1350
  }
@@ -940,6 +1421,14 @@ export class GHCrawlService {
940
1421
  onProgress: params.onProgress,
941
1422
  });
942
1423
  }
1424
+ if (selected.embed && this.config.embeddingBasis === 'title_summary') {
1425
+ params.onProgress?.(`[refresh] embedding basis ${this.config.embeddingBasis} requires summaries; running summarize before embed`);
1426
+ await this.summarizeRepository({
1427
+ owner: params.owner,
1428
+ repo: params.repo,
1429
+ onProgress: params.onProgress,
1430
+ });
1431
+ }
943
1432
  if (selected.embed) {
944
1433
  embed = await this.embedRepository({
945
1434
  owner: params.owner,
@@ -1144,9 +1633,9 @@ export class GHCrawlService {
1144
1633
  const summaryRows = this.db
1145
1634
  .prepare(`select summary_kind, summary_text
1146
1635
  from document_summaries
1147
- where thread_id = ? and model = ?
1636
+ where thread_id = ? and model = ? and prompt_version = ?
1148
1637
  order by summary_kind asc`)
1149
- .all(row.id, this.config.summaryModel);
1638
+ .all(row.id, this.config.summaryModel, SUMMARY_PROMPT_VERSION);
1150
1639
  const summaries = {};
1151
1640
  for (const summary of summaryRows) {
1152
1641
  if (summary.summary_kind === 'problem_summary' ||
@@ -1308,7 +1797,178 @@ export class GHCrawlService {
1308
1797
  latestClusterRunFinishedAt: latestRun?.finished_at ?? null,
1309
1798
  };
1310
1799
  }
1800
+ getDesiredPipelineState() {
1801
+ return {
1802
+ summary_model: this.config.summaryModel,
1803
+ summary_prompt_version: SUMMARY_PROMPT_VERSION,
1804
+ embedding_basis: this.config.embeddingBasis,
1805
+ embed_model: this.config.embedModel,
1806
+ embed_dimensions: ACTIVE_EMBED_DIMENSIONS,
1807
+ embed_pipeline_version: ACTIVE_EMBED_PIPELINE_VERSION,
1808
+ vector_backend: this.config.vectorBackend,
1809
+ };
1810
+ }
1811
+ getRepoPipelineState(repoId) {
1812
+ return (this.db.prepare('select * from repo_pipeline_state where repo_id = ? limit 1').get(repoId) ??
1813
+ null);
1814
+ }
1815
+ isRepoVectorStateCurrent(repoId) {
1816
+ const state = this.getRepoPipelineState(repoId);
1817
+ if (!state || !state.vectors_current_at) {
1818
+ return false;
1819
+ }
1820
+ const desired = this.getDesiredPipelineState();
1821
+ return (state.summary_model === desired.summary_model &&
1822
+ state.summary_prompt_version === desired.summary_prompt_version &&
1823
+ state.embedding_basis === desired.embedding_basis &&
1824
+ state.embed_model === desired.embed_model &&
1825
+ state.embed_dimensions === desired.embed_dimensions &&
1826
+ state.embed_pipeline_version === desired.embed_pipeline_version &&
1827
+ state.vector_backend === desired.vector_backend);
1828
+ }
1829
+ isRepoClusterStateCurrent(repoId) {
1830
+ const state = this.getRepoPipelineState(repoId);
1831
+ return this.isRepoVectorStateCurrent(repoId) && Boolean(state?.clusters_current_at);
1832
+ }
1833
+ hasLegacyEmbeddings(repoId) {
1834
+ const row = this.db
1835
+ .prepare(`select count(*) as count
1836
+ from document_embeddings e
1837
+ join threads t on t.id = e.thread_id
1838
+ where t.repo_id = ?
1839
+ and t.state = 'open'
1840
+ and t.closed_at_local is null
1841
+ and e.model = ?`)
1842
+ .get(repoId, this.config.embedModel);
1843
+ return row.count > 0;
1844
+ }
1845
+ writeRepoPipelineState(repoId, overrides) {
1846
+ const desired = this.getDesiredPipelineState();
1847
+ const current = this.getRepoPipelineState(repoId);
1848
+ this.db
1849
+ .prepare(`insert into repo_pipeline_state (
1850
+ repo_id,
1851
+ summary_model,
1852
+ summary_prompt_version,
1853
+ embedding_basis,
1854
+ embed_model,
1855
+ embed_dimensions,
1856
+ embed_pipeline_version,
1857
+ vector_backend,
1858
+ vectors_current_at,
1859
+ clusters_current_at,
1860
+ updated_at
1861
+ ) values (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
1862
+ on conflict(repo_id) do update set
1863
+ summary_model = excluded.summary_model,
1864
+ summary_prompt_version = excluded.summary_prompt_version,
1865
+ embedding_basis = excluded.embedding_basis,
1866
+ embed_model = excluded.embed_model,
1867
+ embed_dimensions = excluded.embed_dimensions,
1868
+ embed_pipeline_version = excluded.embed_pipeline_version,
1869
+ vector_backend = excluded.vector_backend,
1870
+ vectors_current_at = excluded.vectors_current_at,
1871
+ clusters_current_at = excluded.clusters_current_at,
1872
+ updated_at = excluded.updated_at`)
1873
+ .run(repoId, desired.summary_model, desired.summary_prompt_version, desired.embedding_basis, desired.embed_model, desired.embed_dimensions, desired.embed_pipeline_version, desired.vector_backend, overrides.vectors_current_at ?? current?.vectors_current_at ?? null, overrides.clusters_current_at ?? current?.clusters_current_at ?? null, nowIso());
1874
+ }
1875
+ markRepoVectorsCurrent(repoId) {
1876
+ this.writeRepoPipelineState(repoId, {
1877
+ vectors_current_at: nowIso(),
1878
+ clusters_current_at: null,
1879
+ });
1880
+ }
1881
+ markRepoClustersCurrent(repoId) {
1882
+ const state = this.getRepoPipelineState(repoId);
1883
+ this.writeRepoPipelineState(repoId, {
1884
+ vectors_current_at: state?.vectors_current_at ?? nowIso(),
1885
+ clusters_current_at: nowIso(),
1886
+ });
1887
+ }
1888
+ repoVectorStorePath(repoFullName) {
1889
+ const safeName = repoFullName.replace(/[^a-zA-Z0-9._-]+/g, '__');
1890
+ return path.join(this.config.configDir, 'vectors', `${safeName}.sqlite`);
1891
+ }
1892
// Drop all persisted vectors for a repository: delete the relational
// thread_vectors rows, reset the per-repo side-car vector store file, and
// clear both pipeline "current" markers so downstream stages rebuild.
resetRepositoryVectors(repoId, repoFullName) {
    // 1) Remove the relational copies of the vectors.
    this.db
        .prepare(`delete from thread_vectors
            where thread_id in (select id from threads where repo_id = ?)`)
        .run(repoId);
    // 2) Reset the side-car vector store for this repo.
    this.vectorStore.resetRepository({
        storePath: this.repoVectorStorePath(repoFullName),
        dimensions: ACTIVE_EMBED_DIMENSIONS,
    });
    // 3) Invalidate both pipeline stamps.
    this.writeRepoPipelineState(repoId, {
        vectors_current_at: null,
        clusters_current_at: null,
    });
}
1906
// Delete vectors belonging to threads that are no longer clusterable
// (closed upstream or closed locally), from both the relational table and
// the side-car store. Returns the number of threads pruned.
pruneInactiveRepositoryVectors(repoId, repoFullName) {
    const rows = this.db
        .prepare(`select tv.thread_id
            from thread_vectors tv
            join threads t on t.id = tv.thread_id
            where t.repo_id = ?
            and (t.state != 'open' or t.closed_at_local is not null)`)
        .all(repoId);
    if (rows.length === 0) {
        return 0;
    }
    const deleteVectorRow = this.db.prepare('delete from thread_vectors where thread_id = ?');
    // NOTE(review): vectorStore.deleteVector mutates a separate store inside
    // this SQLite transaction; if the transaction rolls back, those external
    // deletions are not undone — confirm that is acceptable for this backend.
    this.db.transaction(() => {
        for (const row of rows) {
            deleteVectorRow.run(row.thread_id);
            this.vectorStore.deleteVector({
                storePath: this.repoVectorStorePath(repoFullName),
                dimensions: ACTIVE_EMBED_DIMENSIONS,
                threadId: row.thread_id,
            });
        }
    })();
    return rows.length;
}
1930
// One-time cleanup after migrating a repository to the new vector pipeline:
// removes legacy document_embeddings rows, compacts vectors still stored as
// JSON text into binary blobs, then reclaims the freed disk space.
cleanupMigratedRepositoryArtifacts(repoId, repoFullName, onProgress) {
    const legacyEmbeddingCount = this.countLegacyEmbeddings(repoId);
    const inlineJsonVectorCount = this.countInlineJsonThreadVectors(repoId);
    if (legacyEmbeddingCount === 0 && inlineJsonVectorCount === 0) {
        return; // nothing migrated, nothing to clean
    }
    if (legacyEmbeddingCount > 0) {
        this.db
            .prepare(`delete from document_embeddings
                where thread_id in (select id from threads where repo_id = ?)`)
            .run(repoId);
        onProgress?.(`[cleanup] removed ${legacyEmbeddingCount} legacy document embedding row(s) after vector migration`);
    }
    if (inlineJsonVectorCount > 0) {
        // Re-encode JSON-text vector payloads as compact binary blobs.
        const rows = this.db
            .prepare(`select tv.thread_id, tv.vector_json
                from thread_vectors tv
                join threads t on t.id = tv.thread_id
                where t.repo_id = ?
                and typeof(tv.vector_json) = 'text'
                and tv.vector_json != ''`)
            .all(repoId);
        const update = this.db.prepare('update thread_vectors set vector_json = ?, updated_at = ? where thread_id = ?');
        this.db.transaction(() => {
            for (const row of rows) {
                update.run(this.vectorBlob(JSON.parse(row.vector_json)), nowIso(), row.thread_id);
            }
        })();
        onProgress?.(`[cleanup] compacted ${inlineJsonVectorCount} inline SQLite vector payload(s) from JSON to binary blobs`);
    }
    if (this.config.dbPath !== ':memory:') {
        // Checkpoint the WAL before and after VACUUM so freed pages are
        // actually returned to the filesystem.
        onProgress?.(`[cleanup] checkpointing WAL and vacuuming ${repoFullName} migration changes`);
        this.db.pragma('wal_checkpoint(TRUNCATE)');
        this.db.exec('VACUUM');
        this.db.pragma('wal_checkpoint(TRUNCATE)');
    }
}
1311
1967
  getLatestClusterRun(repoId) {
1968
+ const state = this.getRepoPipelineState(repoId);
1969
+ if (state && !this.isRepoClusterStateCurrent(repoId)) {
1970
+ return null;
1971
+ }
1312
1972
  return (this.db
1313
1973
  .prepare("select id, finished_at from cluster_runs where repo_id = ? and status = 'completed' order by id desc limit 1")
1314
1974
  .get(repoId) ?? null);
@@ -1778,7 +2438,7 @@ export class GHCrawlService {
1778
2438
  }
1779
2439
  }
1780
2440
  const summaryInput = parts.join('\n\n');
1781
- const summaryContentHash = stableContentHash(`summary:${includeComments ? 'with-comments' : 'metadata-only'}\n${summaryInput}`);
2441
+ const summaryContentHash = stableContentHash(`summary:${SUMMARY_PROMPT_VERSION}:${includeComments ? 'with-comments' : 'metadata-only'}\n${summaryInput}`);
1782
2442
  return { summaryInput, summaryContentHash };
1783
2443
  }
1784
2444
  buildEmbeddingTasks(params) {
@@ -1821,6 +2481,35 @@ export class GHCrawlService {
1821
2481
  }
1822
2482
  return tasks;
1823
2483
  }
2484
+ buildActiveVectorTask(params) {
2485
+ const sections = [`title: ${normalizeSummaryText(params.title)}`];
2486
+ if (this.config.embeddingBasis === 'title_summary') {
2487
+ const summary = normalizeSummaryText(params.dedupeSummary ?? '');
2488
+ if (!summary) {
2489
+ return null;
2490
+ }
2491
+ sections.push(`summary: ${summary}`);
2492
+ }
2493
+ else {
2494
+ const body = normalizeSummaryText(params.body ?? '');
2495
+ if (body) {
2496
+ sections.push(`body: ${body}`);
2497
+ }
2498
+ }
2499
+ const prepared = this.prepareEmbeddingText(sections.join('\n\n'), EMBED_MAX_ITEM_TOKENS);
2500
+ if (!prepared) {
2501
+ return null;
2502
+ }
2503
+ return {
2504
+ threadId: params.threadId,
2505
+ threadNumber: params.threadNumber,
2506
+ basis: this.config.embeddingBasis,
2507
+ text: prepared.text,
2508
+ contentHash: stableContentHash(`embedding:${ACTIVE_EMBED_PIPELINE_VERSION}:${this.config.embeddingBasis}:${this.config.embedModel}:${ACTIVE_EMBED_DIMENSIONS}\n${prepared.text}`),
2509
+ estimatedTokens: prepared.estimatedTokens,
2510
+ wasTruncated: prepared.wasTruncated,
2511
+ };
2512
+ }
1824
2513
  prepareEmbeddingText(text, maxEstimatedTokens) {
1825
2514
  if (!text) {
1826
2515
  return null;
@@ -1862,6 +2551,7 @@ export class GHCrawlService {
1862
2551
  const embeddings = await ai.embedTexts({
1863
2552
  model: this.config.embedModel,
1864
2553
  texts: batch.map((task) => task.text),
2554
+ dimensions: ACTIVE_EMBED_DIMENSIONS,
1865
2555
  });
1866
2556
  return batch.map((task, index) => ({ task, embedding: embeddings[index] }));
1867
2557
  }
@@ -1888,6 +2578,7 @@ export class GHCrawlService {
1888
2578
  const [embedding] = await ai.embedTexts({
1889
2579
  model: this.config.embedModel,
1890
2580
  texts: [current.text],
2581
+ dimensions: ACTIVE_EMBED_DIMENSIONS,
1891
2582
  });
1892
2583
  return { task: current, embedding };
1893
2584
  }
@@ -1900,11 +2591,11 @@ export class GHCrawlService {
1900
2591
  if (!next || next.text === current.text) {
1901
2592
  throw error;
1902
2593
  }
1903
- onProgress?.(`[embed] shortened #${current.threadNumber}:${current.sourceKind} after context error est_tokens=${current.estimatedTokens}->${next.estimatedTokens}`);
2594
+ onProgress?.(`[embed] shortened #${current.threadNumber}:${current.basis} after context error est_tokens=${current.estimatedTokens}->${next.estimatedTokens}`);
1904
2595
  current = next;
1905
2596
  }
1906
2597
  }
1907
- throw new Error(`Unable to shrink embedding input for #${task.threadNumber}:${task.sourceKind} below model limits`);
2598
+ throw new Error(`Unable to shrink embedding input for #${task.threadNumber}:${task.basis} below model limits`);
1908
2599
  }
1909
2600
  shrinkEmbeddingTask(task, context) {
1910
2601
  const withoutMarker = task.text.endsWith(EMBED_TRUNCATION_MARKER)
@@ -1921,7 +2612,7 @@ export class GHCrawlService {
1921
2612
  return {
1922
2613
  ...task,
1923
2614
  text: nextText,
1924
- contentHash: stableContentHash(`embedding:${task.sourceKind}\n${nextText}`),
2615
+ contentHash: stableContentHash(`embedding:${ACTIVE_EMBED_PIPELINE_VERSION}:${task.basis}:${this.config.embedModel}:${ACTIVE_EMBED_DIMENSIONS}\n${nextText}`),
1925
2616
  estimatedTokens: this.estimateEmbeddingTokens(nextText),
1926
2617
  wasTruncated: true,
1927
2618
  };
@@ -1996,6 +2687,46 @@ export class GHCrawlService {
1996
2687
  order by t.number asc, e.source_kind asc`)
1997
2688
  .iterate(repoId, this.config.embedModel);
1998
2689
  }
2690
+ loadNormalizedEmbeddingForSourceKindHead(repoId, sourceKind) {
2691
+ const row = this.db
2692
+ .prepare(`select t.id, e.embedding_json
2693
+ from threads t
2694
+ join document_embeddings e on e.thread_id = t.id
2695
+ where t.repo_id = ?
2696
+ and t.state = 'open'
2697
+ and t.closed_at_local is null
2698
+ and e.model = ?
2699
+ and e.source_kind = ?
2700
+ order by t.number asc
2701
+ limit 1`)
2702
+ .get(repoId, this.config.embedModel, sourceKind);
2703
+ if (!row) {
2704
+ return null;
2705
+ }
2706
+ return {
2707
+ id: row.id,
2708
+ normalizedEmbedding: normalizeEmbedding(JSON.parse(row.embedding_json)).normalized,
2709
+ };
2710
+ }
2711
+ *iterateNormalizedEmbeddingsForSourceKind(repoId, sourceKind) {
2712
+ const rows = this.db
2713
+ .prepare(`select t.id, e.embedding_json
2714
+ from threads t
2715
+ join document_embeddings e on e.thread_id = t.id
2716
+ where t.repo_id = ?
2717
+ and t.state = 'open'
2718
+ and t.closed_at_local is null
2719
+ and e.model = ?
2720
+ and e.source_kind = ?
2721
+ order by t.number asc`)
2722
+ .iterate(repoId, this.config.embedModel, sourceKind);
2723
+ for (const row of rows) {
2724
+ yield {
2725
+ id: row.id,
2726
+ normalizedEmbedding: normalizeEmbedding(JSON.parse(row.embedding_json)).normalized,
2727
+ };
2728
+ }
2729
+ }
1999
2730
  loadNormalizedEmbeddingsForSourceKind(repoId, sourceKind) {
2000
2731
  const rows = this.db
2001
2732
  .prepare(`select t.id, e.embedding_json
@@ -2013,6 +2744,12 @@ export class GHCrawlService {
2013
2744
  normalizedEmbedding: normalizeEmbedding(JSON.parse(row.embedding_json)).normalized,
2014
2745
  }));
2015
2746
  }
2747
+ normalizedEmbeddingBuffer(values) {
2748
+ return Buffer.from(Float32Array.from(values).buffer);
2749
+ }
2750
// Convert a vector-store distance into a similarity score.
// NOTE(review): 1 - d/2 equals cosine similarity only when `distance` is the
// *squared* Euclidean distance between unit vectors (cos = 1 - d²/2); confirm
// which distance the vector backend returns.
normalizedDistanceToScore(distance) {
    return 1 - distance / 2;
}
2016
2753
  loadClusterableThreadMeta(repoId) {
2017
2754
  const rows = this.db
2018
2755
  .prepare(`select t.id, t.number, t.title, e.source_kind
@@ -2033,6 +2770,34 @@ export class GHCrawlService {
2033
2770
  sourceKinds: Array.from(sourceKinds.values()),
2034
2771
  };
2035
2772
  }
2773
+ loadClusterableActiveVectorMeta(repoId, _repoFullName) {
2774
+ const rows = this.db
2775
+ .prepare(`select t.id, t.number, t.title, tv.vector_json
2776
+ from threads t
2777
+ join thread_vectors tv on tv.thread_id = t.id
2778
+ where t.repo_id = ?
2779
+ and t.state = 'open'
2780
+ and t.closed_at_local is null
2781
+ and tv.model = ?
2782
+ and tv.basis = ?
2783
+ and tv.dimensions = ?
2784
+ order by t.number asc`)
2785
+ .all(repoId, this.config.embedModel, this.config.embeddingBasis, ACTIVE_EMBED_DIMENSIONS);
2786
+ return rows.map((row) => ({
2787
+ id: row.id,
2788
+ number: row.number,
2789
+ title: row.title,
2790
+ embedding: this.parseStoredVector(row.vector_json),
2791
+ }));
2792
+ }
2793
+ loadNormalizedActiveVectors(repoId) {
2794
+ return this.loadClusterableActiveVectorMeta(repoId, '').map((row) => ({
2795
+ id: row.id,
2796
+ number: row.number,
2797
+ title: row.title,
2798
+ embedding: normalizeEmbedding(row.embedding).normalized,
2799
+ }));
2800
+ }
2036
2801
  listStoredClusterNeighbors(repoId, threadId, limit) {
2037
2802
  const latestRun = this.getLatestClusterRun(repoId);
2038
2803
  if (!latestRun) {
@@ -2089,56 +2854,65 @@ export class GHCrawlService {
2089
2854
  }
2090
2855
  sql += ' order by t.number asc';
2091
2856
  const rows = this.db.prepare(sql).all(...args);
2092
- const summaryTexts = this.loadCombinedSummaryTextMap(repoId, threadNumber);
2093
- const tasks = rows.flatMap((row) => this.buildEmbeddingTasks({
2094
- threadId: row.id,
2095
- threadNumber: row.number,
2096
- title: row.title,
2097
- body: row.body,
2098
- dedupeSummary: summaryTexts.get(row.id) ?? null,
2099
- }));
2857
+ const summaryTexts = this.loadDedupeSummaryTextMap(repoId, threadNumber);
2858
+ const missingSummaryThreadNumbers = [];
2859
+ const tasks = rows.flatMap((row) => {
2860
+ const task = this.buildActiveVectorTask({
2861
+ threadId: row.id,
2862
+ threadNumber: row.number,
2863
+ title: row.title,
2864
+ body: row.body,
2865
+ dedupeSummary: summaryTexts.get(row.id) ?? null,
2866
+ });
2867
+ if (task) {
2868
+ return [task];
2869
+ }
2870
+ if (this.config.embeddingBasis === 'title_summary') {
2871
+ missingSummaryThreadNumbers.push(row.number);
2872
+ }
2873
+ return [];
2874
+ });
2875
+ const pipelineCurrent = this.isRepoVectorStateCurrent(repoId);
2100
2876
  const existingRows = this.db
2101
- .prepare(`select e.thread_id, e.source_kind, e.content_hash
2102
- from document_embeddings e
2103
- join threads t on t.id = e.thread_id
2104
- where t.repo_id = ? and e.model = ?`)
2105
- .all(repoId, this.config.embedModel);
2877
+ .prepare(`select tv.thread_id, tv.content_hash
2878
+ from thread_vectors tv
2879
+ join threads t on t.id = tv.thread_id
2880
+ where t.repo_id = ?
2881
+ and tv.model = ?
2882
+ and tv.basis = ?
2883
+ and tv.dimensions = ?`)
2884
+ .all(repoId, this.config.embedModel, this.config.embeddingBasis, ACTIVE_EMBED_DIMENSIONS);
2106
2885
  const existing = new Map();
2107
2886
  for (const row of existingRows) {
2108
- existing.set(`${row.thread_id}:${row.source_kind}`, row.content_hash);
2887
+ existing.set(String(row.thread_id), row.content_hash);
2109
2888
  }
2110
- const pending = tasks.filter((task) => existing.get(`${task.threadId}:${task.sourceKind}`) !== task.contentHash);
2111
- return { rows, tasks, existing, pending };
2889
+ const pending = pipelineCurrent
2890
+ ? tasks.filter((task) => existing.get(String(task.threadId)) !== task.contentHash)
2891
+ : tasks;
2892
+ return { rows, tasks, existing, pending, missingSummaryThreadNumbers };
2112
2893
  }
2113
- loadCombinedSummaryTextMap(repoId, threadNumber) {
2114
- let sql = `select s.thread_id, s.summary_kind, s.summary_text
2894
+ loadDedupeSummaryTextMap(repoId, threadNumber) {
2895
+ let sql = `select s.thread_id, s.summary_text
2115
2896
  from document_summaries s
2116
2897
  join threads t on t.id = s.thread_id
2117
- where t.repo_id = ? and t.state = 'open' and t.closed_at_local is null and s.model = ?`;
2118
- const args = [repoId, this.config.summaryModel];
2898
+ where t.repo_id = ?
2899
+ and t.state = 'open'
2900
+ and t.closed_at_local is null
2901
+ and s.model = ?
2902
+ and s.summary_kind = 'dedupe_summary'
2903
+ and s.prompt_version = ?`;
2904
+ const args = [repoId, this.config.summaryModel, SUMMARY_PROMPT_VERSION];
2119
2905
  if (threadNumber) {
2120
2906
  sql += ' and t.number = ?';
2121
2907
  args.push(threadNumber);
2122
2908
  }
2123
- sql += ' order by t.number asc, s.summary_kind asc';
2909
+ sql += ' order by t.number asc';
2124
2910
  const rows = this.db.prepare(sql).all(...args);
2125
- const byThread = new Map();
2126
- for (const row of rows) {
2127
- const entry = byThread.get(row.thread_id) ?? new Map();
2128
- entry.set(row.summary_kind, normalizeSummaryText(row.summary_text));
2129
- byThread.set(row.thread_id, entry);
2130
- }
2131
2911
  const combined = new Map();
2132
- const order = ['problem_summary', 'solution_summary', 'maintainer_signal_summary', 'dedupe_summary'];
2133
- for (const [threadId, entry] of byThread.entries()) {
2134
- const parts = order
2135
- .map((summaryKind) => {
2136
- const text = entry.get(summaryKind);
2137
- return text ? `${summaryKind}: ${text}` : '';
2138
- })
2139
- .filter(Boolean);
2140
- if (parts.length > 0) {
2141
- combined.set(threadId, parts.join('\n\n'));
2912
+ for (const row of rows) {
2913
+ const text = normalizeSummaryText(row.summary_text);
2914
+ if (text) {
2915
+ combined.set(row.thread_id, text);
2142
2916
  }
2143
2917
  }
2144
2918
  return combined;
@@ -2235,6 +3009,71 @@ export class GHCrawlService {
2235
3009
  });
2236
3010
  }
2237
3011
  }
3012
+ collectSourceKindScores(perSourceScores, edges, sourceKind) {
3013
+ for (const edge of edges) {
3014
+ const key = this.edgeKey(edge.leftThreadId, edge.rightThreadId);
3015
+ const existing = perSourceScores.get(key);
3016
+ if (existing) {
3017
+ existing.scores.set(sourceKind, Math.max(existing.scores.get(sourceKind) ?? -1, edge.score));
3018
+ continue;
3019
+ }
3020
+ const scores = new Map();
3021
+ scores.set(sourceKind, edge.score);
3022
+ perSourceScores.set(key, {
3023
+ leftThreadId: edge.leftThreadId,
3024
+ rightThreadId: edge.rightThreadId,
3025
+ scores,
3026
+ });
3027
+ }
3028
+ }
3029
+ finalizeEdgeScores(perSourceScores, aggregation, weights, minScore) {
3030
+ const result = [];
3031
+ for (const entry of perSourceScores.values()) {
3032
+ const scoreValues = Array.from(entry.scores.values());
3033
+ let finalScore;
3034
+ switch (aggregation) {
3035
+ case 'max':
3036
+ finalScore = Math.max(...scoreValues);
3037
+ break;
3038
+ case 'mean':
3039
+ finalScore = scoreValues.reduce((a, b) => a + b, 0) / scoreValues.length;
3040
+ break;
3041
+ case 'weighted': {
3042
+ let weightedSum = 0;
3043
+ let weightSum = 0;
3044
+ for (const [kind, score] of entry.scores) {
3045
+ const w = weights[kind] ?? 0.1;
3046
+ weightedSum += score * w;
3047
+ weightSum += w;
3048
+ }
3049
+ finalScore = weightSum > 0 ? weightedSum / weightSum : 0;
3050
+ break;
3051
+ }
3052
+ case 'min-of-2':
3053
+ // Require at least 2 source kinds to agree (both above minScore)
3054
+ if (scoreValues.length < 2) {
3055
+ continue; // Skip edges with only 1 source kind
3056
+ }
3057
+ finalScore = Math.max(...scoreValues);
3058
+ break;
3059
+ case 'boost': {
3060
+ // Best score + bonus per additional agreeing source
3061
+ const best = Math.max(...scoreValues);
3062
+ const bonusSources = scoreValues.length - 1;
3063
+ finalScore = Math.min(1.0, best + bonusSources * 0.05);
3064
+ break;
3065
+ }
3066
+ }
3067
+ if (finalScore >= minScore) {
3068
+ result.push({
3069
+ leftThreadId: entry.leftThreadId,
3070
+ rightThreadId: entry.rightThreadId,
3071
+ score: finalScore,
3072
+ });
3073
+ }
3074
+ }
3075
+ return result;
3076
+ }
2238
3077
  countEmbeddingsForSourceKind(repoId, sourceKind) {
2239
3078
  const row = this.db
2240
3079
  .prepare(`select count(*) as count
@@ -2282,15 +3121,102 @@ export class GHCrawlService {
2282
3121
  pruneOldClusterRuns(repoId, keepRunId) {
2283
3122
  this.db.prepare('delete from cluster_runs where repo_id = ? and id <> ?').run(repoId, keepRunId);
2284
3123
  }
3124
+ summarizeClusterSizes(clusters) {
3125
+ const histogramCounts = new Map();
3126
+ const topClusterSizes = clusters.map((cluster) => cluster.members.length).sort((left, right) => right - left);
3127
+ let soloClusters = 0;
3128
+ for (const cluster of clusters) {
3129
+ const size = cluster.members.length;
3130
+ histogramCounts.set(size, (histogramCounts.get(size) ?? 0) + 1);
3131
+ if (size === 1) {
3132
+ soloClusters += 1;
3133
+ }
3134
+ }
3135
+ return {
3136
+ soloClusters,
3137
+ maxClusterSize: topClusterSizes[0] ?? 0,
3138
+ topClusterSizes: topClusterSizes.slice(0, 50),
3139
+ histogram: Array.from(histogramCounts.entries())
3140
+ .map(([size, count]) => ({ size, count }))
3141
+ .sort((left, right) => left.size - right.size),
3142
+ };
3143
+ }
2285
3144
  upsertSummary(threadId, contentHash, summaryKind, summaryText) {
2286
3145
  this.db
2287
- .prepare(`insert into document_summaries (thread_id, summary_kind, model, content_hash, summary_text, created_at, updated_at)
2288
- values (?, ?, ?, ?, ?, ?, ?)
3146
+ .prepare(`insert into document_summaries (thread_id, summary_kind, model, prompt_version, content_hash, summary_text, created_at, updated_at)
3147
+ values (?, ?, ?, ?, ?, ?, ?, ?)
2289
3148
  on conflict(thread_id, summary_kind, model) do update set
3149
+ prompt_version = excluded.prompt_version,
2290
3150
  content_hash = excluded.content_hash,
2291
3151
  summary_text = excluded.summary_text,
2292
3152
  updated_at = excluded.updated_at`)
2293
- .run(threadId, summaryKind, this.config.summaryModel, contentHash, summaryText, nowIso(), nowIso());
3153
+ .run(threadId, summaryKind, this.config.summaryModel, SUMMARY_PROMPT_VERSION, contentHash, summaryText, nowIso(), nowIso());
3154
+ }
3155
+ upsertActiveVector(repoId, repoFullName, threadId, basis, contentHash, embedding) {
3156
+ this.db
3157
+ .prepare(`insert into thread_vectors (thread_id, basis, model, dimensions, content_hash, vector_json, vector_backend, created_at, updated_at)
3158
+ values (?, ?, ?, ?, ?, ?, ?, ?, ?)
3159
+ on conflict(thread_id) do update set
3160
+ basis = excluded.basis,
3161
+ model = excluded.model,
3162
+ dimensions = excluded.dimensions,
3163
+ content_hash = excluded.content_hash,
3164
+ vector_json = excluded.vector_json,
3165
+ vector_backend = excluded.vector_backend,
3166
+ updated_at = excluded.updated_at`)
3167
+ .run(threadId, basis, this.config.embedModel, embedding.length, contentHash, this.vectorBlob(embedding), this.config.vectorBackend, nowIso(), nowIso());
3168
+ this.vectorStore.upsertVector({
3169
+ storePath: this.repoVectorStorePath(repoFullName),
3170
+ dimensions: ACTIVE_EMBED_DIMENSIONS,
3171
+ threadId,
3172
+ vector: embedding,
3173
+ });
3174
+ }
3175
+ countLegacyEmbeddings(repoId) {
3176
+ const row = this.db
3177
+ .prepare(`select count(*) as count
3178
+ from document_embeddings
3179
+ where thread_id in (select id from threads where repo_id = ?)`)
3180
+ .get(repoId);
3181
+ return row.count;
3182
+ }
3183
+ countInlineJsonThreadVectors(repoId) {
3184
+ const row = this.db
3185
+ .prepare(`select count(*) as count
3186
+ from thread_vectors
3187
+ where thread_id in (select id from threads where repo_id = ?)
3188
+ and typeof(vector_json) = 'text'
3189
+ and vector_json != ''`)
3190
+ .get(repoId);
3191
+ return row.count;
3192
+ }
3193
+ getVectorliteClusterQuery(totalItems, requestedK) {
3194
+ if (totalItems < CLUSTER_PARALLEL_MIN_EMBEDDINGS) {
3195
+ return {
3196
+ limit: requestedK,
3197
+ candidateK: Math.max(requestedK * 16, 64),
3198
+ };
3199
+ }
3200
+ const limit = Math.min(Math.max(requestedK * VECTORLITE_CLUSTER_EXPANDED_MULTIPLIER, VECTORLITE_CLUSTER_EXPANDED_K), Math.max(1, totalItems - 1));
3201
+ const candidateK = Math.min(Math.max(limit * 16, VECTORLITE_CLUSTER_EXPANDED_CANDIDATE_K), Math.max(limit, totalItems - 1));
3202
+ return {
3203
+ limit,
3204
+ candidateK,
3205
+ efSearch: Math.max(candidateK * 2, VECTORLITE_CLUSTER_EXPANDED_EF_SEARCH),
3206
+ };
3207
+ }
3208
+ vectorBlob(values) {
3209
+ return Buffer.from(Float32Array.from(values).buffer);
3210
+ }
3211
+ parseStoredVector(value) {
3212
+ if (typeof value === 'string') {
3213
+ if (!value) {
3214
+ throw new Error('Stored vector payload is empty. Run refresh or embed first.');
3215
+ }
3216
+ return JSON.parse(value);
3217
+ }
3218
+ const floats = new Float32Array(value.buffer, value.byteOffset, Math.floor(value.byteLength / Float32Array.BYTES_PER_ELEMENT));
3219
+ return Array.from(floats);
2294
3220
  }
2295
3221
  upsertEmbedding(threadId, sourceKind, contentHash, embedding) {
2296
3222
  this.db