@ghcrawl/api-core 0.7.0 → 0.8.0-beta.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/service.js CHANGED
@@ -1,12 +1,15 @@
1
1
  import http from 'node:http';
2
2
  import crypto from 'node:crypto';
3
+ import fs from 'node:fs';
3
4
  import { existsSync } from 'node:fs';
5
+ import { createRequire } from 'node:module';
4
6
  import os from 'node:os';
7
+ import path from 'node:path';
5
8
  import { fileURLToPath } from 'node:url';
6
9
  import { Worker } from 'node:worker_threads';
7
10
  import { IterableMapper } from '@shutterstock/p-map-iterable';
8
11
  import { actionResponseSchema, authorThreadsResponseSchema, closeResponseSchema, clusterDetailResponseSchema, clusterResultSchema, clusterSummariesResponseSchema, clustersResponseSchema, embedResultSchema, healthResponseSchema, neighborsResponseSchema, refreshResponseSchema, repositoriesResponseSchema, searchResponseSchema, syncResultSchema, threadsResponseSchema, } from '@ghcrawl/api-contract';
9
- import { buildClusters } from './cluster/build.js';
12
+ import { buildClusters, buildRefinedClusters, buildSizeBoundedClusters } from './cluster/build.js';
10
13
  import { buildSourceKindEdges } from './cluster/exact-edges.js';
11
14
  import { ensureRuntimeDirs, isLikelyGitHubToken, isLikelyOpenAiApiKey, loadConfig, requireGithubToken, requireOpenAiKey, } from './config.js';
12
15
  import { migrate } from './db/migrate.js';
@@ -14,7 +17,8 @@ import { openDb } from './db/sqlite.js';
14
17
  import { buildCanonicalDocument, isBotLikeAuthor } from './documents/normalize.js';
15
18
  import { makeGitHubClient } from './github/client.js';
16
19
  import { OpenAiProvider } from './openai/provider.js';
17
- import { cosineSimilarity, normalizeEmbedding, rankNearestNeighbors } from './search/exact.js';
20
+ import { cosineSimilarity, dotProduct, normalizeEmbedding, rankNearestNeighbors, rankNearestNeighborsByScore } from './search/exact.js';
21
+ import { VectorliteStore } from './vector/vectorlite-store.js';
18
22
  const SYNC_BATCH_SIZE = 100;
19
23
  const SYNC_BATCH_DELAY_MS = 5000;
20
24
  const STALE_CLOSED_SWEEP_LIMIT = 1000;
@@ -23,10 +27,31 @@ const CLUSTER_PARALLEL_MIN_EMBEDDINGS = 5000;
23
27
  const EMBED_ESTIMATED_CHARS_PER_TOKEN = 3;
24
28
  const EMBED_MAX_ITEM_TOKENS = 7000;
25
29
  const EMBED_MAX_BATCH_TOKENS = 250000; // passed to chunkEmbeddingTasks() and reported as max_batch_tokens in embed progress output
30
+ const requireFromHere = createRequire(import.meta.url); // require() resolved relative to this module; used to load the 'vectorlite' package
26
31
  const EMBED_TRUNCATION_MARKER = '\n\n[truncated for embedding]';
27
32
  const EMBED_CONTEXT_RETRY_ATTEMPTS = 5;
28
33
  const EMBED_CONTEXT_RETRY_FALLBACK_SHRINK_RATIO = 0.9;
29
34
  const EMBED_CONTEXT_RETRY_TARGET_BUFFER_RATIO = 0.95;
35
+ const SUMMARY_PROMPT_VERSION = 'v1'; // compared with the stored prompt_version; a mismatch marks a thread as pending re-summarization
36
+ const ACTIVE_EMBED_DIMENSIONS = 1024; // passed as `dimensions` to embedTexts() and to vector store queries
37
+ const ACTIVE_EMBED_PIPELINE_VERSION = 'vectorlite-1024-v1'; // NOTE(review): not referenced in the visible hunks — confirm where it is checked
38
+ const DEFAULT_CLUSTER_MIN_SCORE = 0.78; // fallback minScore when clustering callers do not pass params.minScore
39
+ const VECTORLITE_CLUSTER_EXPANDED_K = 24; // NOTE(review): the VECTORLITE_CLUSTER_EXPANDED_* values are not referenced in the visible hunks; presumably consumed by getVectorliteClusterQuery() — confirm
40
+ const VECTORLITE_CLUSTER_EXPANDED_MULTIPLIER = 4;
41
+ const VECTORLITE_CLUSTER_EXPANDED_CANDIDATE_K = 512;
42
+ const VECTORLITE_CLUSTER_EXPANDED_EF_SEARCH = 1024;
43
+ const SUMMARY_MODEL_PRICING = { // keyed by summary model name and looked up with config.summaryModel; models without an entry get no cost figures in progress output
44
+ 'gpt-5-mini': {
45
+ inputCostPerM: 0.25, // rates are applied per 1,000,000 tokens and printed with a `$` prefix
46
+ cachedInputCostPerM: 0.025, // applied to cached input tokens; uncached input is inputTokens minus cachedInputTokens
47
+ outputCostPerM: 2.0,
48
+ },
49
+ 'gpt-5.4-mini': {
50
+ inputCostPerM: 0.75,
51
+ cachedInputCostPerM: 0.075,
52
+ outputCostPerM: 4.5,
53
+ },
54
+ };
30
55
  function nowIso() {
31
56
  return new Date().toISOString();
32
57
  }
@@ -180,7 +205,7 @@ export class GHCrawlService {
180
205
  db;
181
206
  github;
182
207
  ai;
183
- parsedEmbeddingCache = new Map();
208
+ vectorStore;
184
209
  constructor(options = {}) {
185
210
  this.config = options.config ?? loadConfig();
186
211
  ensureRuntimeDirs(this.config);
@@ -188,9 +213,10 @@ export class GHCrawlService {
188
213
  migrate(this.db);
189
214
  this.github = options.github ?? (this.config.githubToken ? makeGitHubClient({ token: this.config.githubToken }) : undefined);
190
215
  this.ai = options.ai ?? (this.config.openaiApiKey ? new OpenAiProvider(this.config.openaiApiKey) : undefined);
216
+ this.vectorStore = options.vectorStore ?? new VectorliteStore();
191
217
  }
192
218
  close() {
193
- this.parsedEmbeddingCache.clear();
219
+ this.vectorStore.close();
194
220
  this.db.close();
195
221
  }
196
222
  init() {
@@ -257,7 +283,17 @@ export class GHCrawlService {
257
283
  }
258
284
  }
259
285
  }
260
- return { health, github, openai };
286
+ const vectorliteHealth = this.vectorStore.checkRuntime();
287
+ return {
288
+ health,
289
+ github,
290
+ openai,
291
+ vectorlite: {
292
+ configured: this.config.vectorBackend === 'vectorlite',
293
+ runtimeOk: vectorliteHealth.ok,
294
+ error: vectorliteHealth.error,
295
+ },
296
+ };
261
297
  }
262
298
  listRepositories() {
263
299
  const rows = this.db.prepare('select * from repositories order by full_name asc').all();
@@ -409,7 +445,6 @@ export class GHCrawlService {
409
445
  updated_at = ?
410
446
  where id = ?`)
411
447
  .run(closedAt, closedAt, row.id);
412
- this.parsedEmbeddingCache.delete(repository.id);
413
448
  const clusterIds = this.getLatestRunClusterIdsForThread(repository.id, row.id);
414
449
  const clusterClosed = this.reconcileClusterCloseState(repository.id, clusterIds) > 0;
415
450
  const updated = this.db.prepare('select * from threads where id = ? limit 1').get(row.id);
@@ -543,7 +578,6 @@ export class GHCrawlService {
543
578
  })
544
579
  : 0;
545
580
  const threadsClosed = threadsClosedFromClosedSweep + threadsClosedFromDirectReconcile;
546
- this.parsedEmbeddingCache.delete(repoId);
547
581
  if (threadsClosed > 0) {
548
582
  this.reconcileClusterCloseState(repoId);
549
583
  }
@@ -606,31 +640,69 @@ export class GHCrawlService {
606
640
  });
607
641
  const pending = sources.filter((row) => {
608
642
  const latest = this.db
609
- .prepare('select content_hash from document_summaries where thread_id = ? and summary_kind = ? and model = ? limit 1')
643
+ .prepare('select content_hash, prompt_version from document_summaries where thread_id = ? and summary_kind = ? and model = ? limit 1')
610
644
  .get(row.id, 'dedupe_summary', this.config.summaryModel);
611
- return latest?.content_hash !== row.summaryContentHash;
645
+ return latest?.content_hash !== row.summaryContentHash || latest?.prompt_version !== SUMMARY_PROMPT_VERSION;
612
646
  });
613
647
  params.onProgress?.(`[summarize] pending=${pending.length} skipped=${rows.length - pending.length} model=${this.config.summaryModel}`);
614
648
  let summarized = 0;
615
649
  let inputTokens = 0;
616
650
  let outputTokens = 0;
617
651
  let totalTokens = 0;
618
- for (const [index, row] of pending.entries()) {
619
- params.onProgress?.(`[summarize] ${index + 1}/${pending.length} thread #${row.number}`);
652
+ let cachedInputTokens = 0;
653
+ const startTime = Date.now();
654
+ const pricing = SUMMARY_MODEL_PRICING[this.config.summaryModel] ?? null;
655
+ // Stage 1: concurrent API calls
656
+ const fetcher = new IterableMapper(pending, async (row) => {
620
657
  const result = await ai.summarizeThread({
621
658
  model: this.config.summaryModel,
622
659
  text: row.summaryInput,
623
660
  });
661
+ return { row, result };
662
+ }, { concurrency: 5 });
663
+ // Stage 2: sequential DB writes — consumes from fetcher without blocking API completions
664
+ const writer = new IterableMapper(fetcher, async ({ row, result }) => {
624
665
  const summary = result.summary;
625
666
  this.upsertSummary(row.id, row.summaryContentHash, 'problem_summary', summary.problemSummary);
626
667
  this.upsertSummary(row.id, row.summaryContentHash, 'solution_summary', summary.solutionSummary);
627
668
  this.upsertSummary(row.id, row.summaryContentHash, 'maintainer_signal_summary', summary.maintainerSignalSummary);
628
669
  this.upsertSummary(row.id, row.summaryContentHash, 'dedupe_summary', summary.dedupeSummary);
629
- if (result.usage) {
630
- inputTokens += result.usage.inputTokens;
631
- outputTokens += result.usage.outputTokens;
632
- totalTokens += result.usage.totalTokens;
633
- params.onProgress?.(`[summarize] tokens thread #${row.number} in=${result.usage.inputTokens} out=${result.usage.outputTokens} total=${result.usage.totalTokens} cached_in=${result.usage.cachedInputTokens} reasoning=${result.usage.reasoningTokens}`);
670
+ return { row, usage: result.usage };
671
+ }, { concurrency: 1 });
672
+ let index = 0;
673
+ for await (const { row, usage } of writer) {
674
+ index += 1;
675
+ if (usage) {
676
+ inputTokens += usage.inputTokens;
677
+ outputTokens += usage.outputTokens;
678
+ totalTokens += usage.totalTokens;
679
+ cachedInputTokens += usage.cachedInputTokens;
680
+ }
681
+ // Compute cost and ETA every 10 items or on the last item
682
+ if (index % 10 === 0 || index === pending.length) {
683
+ const remaining = pending.length - index;
684
+ const avgIn = inputTokens / index;
685
+ const avgOut = outputTokens / index;
686
+ const avgCachedIn = cachedInputTokens / index;
687
+ const elapsedSec = (Date.now() - startTime) / 1000;
688
+ const secPerItem = elapsedSec / index;
689
+ const etaSec = remaining * secPerItem;
690
+ const etaMin = Math.round(etaSec / 60);
691
+ const etaStr = etaMin >= 60 ? `${Math.floor(etaMin / 60)}h${etaMin % 60}m` : `${etaMin}m`;
692
+ if (pricing) {
693
+ const uncachedInput = inputTokens - cachedInputTokens;
694
+ const costSoFar = (uncachedInput / 1_000_000) * pricing.inputCostPerM +
695
+ (cachedInputTokens / 1_000_000) * pricing.cachedInputCostPerM +
696
+ (outputTokens / 1_000_000) * pricing.outputCostPerM;
697
+ const estTotalCost = costSoFar +
698
+ ((remaining * (avgIn - avgCachedIn)) / 1_000_000) * pricing.inputCostPerM +
699
+ ((remaining * avgCachedIn) / 1_000_000) * pricing.cachedInputCostPerM +
700
+ ((remaining * avgOut) / 1_000_000) * pricing.outputCostPerM;
701
+ params.onProgress?.(`[summarize] ${index}/${pending.length} thread #${row.number} | cost=$${costSoFar.toFixed(2)} est_total=$${estTotalCost.toFixed(2)} | avg_in=${Math.round(avgIn)} avg_out=${Math.round(avgOut)} | ETA ${etaStr}`);
702
+ }
703
+ else {
704
+ params.onProgress?.(`[summarize] ${index}/${pending.length} thread #${row.number} | avg_in=${Math.round(avgIn)} avg_out=${Math.round(avgOut)} | ETA ${etaStr}`);
705
+ }
634
706
  }
635
707
  summarized += 1;
636
708
  }
@@ -674,11 +746,25 @@ export class GHCrawlService {
674
746
  const repository = this.requireRepository(params.owner, params.repo);
675
747
  const runId = this.startRun('embedding_runs', repository.id, params.threadNumber ? `thread:${params.threadNumber}` : repository.fullName);
676
748
  try {
677
- const { rows, tasks, pending } = this.getEmbeddingWorkset(repository.id, params.threadNumber);
749
+ if (params.threadNumber === undefined) {
750
+ if (!this.isRepoVectorStateCurrent(repository.id)) {
751
+ this.resetRepositoryVectors(repository.id, repository.fullName);
752
+ }
753
+ else {
754
+ const pruned = this.pruneInactiveRepositoryVectors(repository.id, repository.fullName);
755
+ if (pruned > 0) {
756
+ params.onProgress?.(`[embed] pruned ${pruned} closed or inactive vector(s) before refresh`);
757
+ }
758
+ }
759
+ }
760
+ const { rows, tasks, pending, missingSummaryThreadNumbers } = this.getEmbeddingWorkset(repository.id, params.threadNumber);
678
761
  const skipped = tasks.length - pending.length;
679
762
  const truncated = tasks.filter((task) => task.wasTruncated).length;
680
- params.onProgress?.(`[embed] loaded ${rows.length} open thread(s) and ${tasks.length} embedding source(s) for ${repository.fullName}`);
681
- params.onProgress?.(`[embed] pending=${pending.length} skipped=${skipped} truncated=${truncated} model=${this.config.embedModel} batch_size=${this.config.embedBatchSize} concurrency=${this.config.embedConcurrency} max_unread=${this.config.embedMaxUnread} max_batch_tokens=${EMBED_MAX_BATCH_TOKENS}`);
763
+ if (missingSummaryThreadNumbers.length > 0) {
764
+ throw new Error(`Embedding basis ${this.config.embeddingBasis} requires summaries before embedding. Missing summaries for thread(s): ${missingSummaryThreadNumbers.slice(0, 10).join(', ')}${missingSummaryThreadNumbers.length > 10 ? ', …' : ''}.`);
765
+ }
766
+ params.onProgress?.(`[embed] loaded ${rows.length} open thread(s) and ${tasks.length} active vector task(s) for ${repository.fullName}`);
767
+ params.onProgress?.(`[embed] pending=${pending.length} skipped=${skipped} truncated=${truncated} model=${this.config.embedModel} dimensions=${ACTIVE_EMBED_DIMENSIONS} basis=${this.config.embeddingBasis} batch_size=${this.config.embedBatchSize} concurrency=${this.config.embedConcurrency} max_unread=${this.config.embedMaxUnread} max_batch_tokens=${EMBED_MAX_BATCH_TOKENS}`);
682
768
  let embedded = 0;
683
769
  const batches = this.chunkEmbeddingTasks(pending, this.config.embedBatchSize, EMBED_MAX_BATCH_TOKENS);
684
770
  const mapper = new IterableMapper(batches, async (batch) => {
@@ -690,14 +776,15 @@ export class GHCrawlService {
690
776
  let completedBatches = 0;
691
777
  for await (const batchResult of mapper) {
692
778
  completedBatches += 1;
693
- const numbers = batchResult.map(({ task }) => `#${task.threadNumber}:${task.sourceKind}`);
779
+ const numbers = batchResult.map(({ task }) => `#${task.threadNumber}:${task.basis}`);
694
780
  const estimatedTokens = batchResult.reduce((sum, { task }) => sum + task.estimatedTokens, 0);
695
781
  params.onProgress?.(`[embed] batch ${completedBatches}/${Math.max(batches.length, 1)} size=${batchResult.length} est_tokens=${estimatedTokens} items=${numbers.join(',')}`);
696
782
  for (const { task, embedding } of batchResult) {
697
- this.upsertEmbedding(task.threadId, task.sourceKind, task.contentHash, embedding);
783
+ this.upsertActiveVector(repository.id, repository.fullName, task.threadId, task.basis, task.contentHash, embedding);
698
784
  embedded += 1;
699
785
  }
700
786
  }
787
+ this.markRepoVectorsCurrent(repository.id);
701
788
  this.finishRun('embedding_runs', runId, 'completed', { embedded });
702
789
  return embedResultSchema.parse({ runId, embedded });
703
790
  }
@@ -709,16 +796,70 @@ export class GHCrawlService {
709
796
  async clusterRepository(params) {
710
797
  const repository = this.requireRepository(params.owner, params.repo);
711
798
  const runId = this.startRun('cluster_runs', repository.id, repository.fullName);
712
- const minScore = params.minScore ?? 0.82;
799
+ const minScore = params.minScore ?? DEFAULT_CLUSTER_MIN_SCORE;
713
800
  const k = params.k ?? 6;
714
801
  try {
715
- const { items, sourceKinds } = this.loadClusterableThreadMeta(repository.id);
716
- params.onProgress?.(`[cluster] loaded ${items.length} embedded thread(s) across ${sourceKinds.length} source kind(s) for ${repository.fullName} k=${k} minScore=${minScore}`);
717
- const aggregatedEdges = await this.aggregateRepositoryEdges(repository.id, sourceKinds, {
718
- limit: k,
719
- minScore,
720
- onProgress: params.onProgress,
721
- });
802
+ let items;
803
+ let aggregatedEdges;
804
+ if (this.isRepoVectorStateCurrent(repository.id)) {
805
+ const vectorItems = this.loadClusterableActiveVectorMeta(repository.id, repository.fullName);
806
+ const activeIds = new Set(vectorItems.map((item) => item.id));
807
+ const annQuery = this.getVectorliteClusterQuery(vectorItems.length, k);
808
+ aggregatedEdges = new Map();
809
+ let processed = 0;
810
+ let lastProgressAt = Date.now();
811
+ params.onProgress?.(`[cluster] loaded ${vectorItems.length} active vector(s) for ${repository.fullName} backend=${this.config.vectorBackend} k=${k} query_limit=${annQuery.limit} candidateK=${annQuery.candidateK} efSearch=${annQuery.efSearch ?? 'default'} minScore=${minScore}`);
812
+ for (const item of vectorItems) {
813
+ const neighbors = this.vectorStore.queryNearest({
814
+ storePath: this.repoVectorStorePath(repository.fullName),
815
+ dimensions: ACTIVE_EMBED_DIMENSIONS,
816
+ vector: item.embedding,
817
+ limit: annQuery.limit,
818
+ candidateK: annQuery.candidateK + 1,
819
+ efSearch: annQuery.efSearch,
820
+ excludeThreadId: item.id,
821
+ });
822
+ for (const neighbor of neighbors) {
823
+ if (!activeIds.has(neighbor.threadId))
824
+ continue;
825
+ if (neighbor.score < minScore)
826
+ continue;
827
+ const key = this.edgeKey(item.id, neighbor.threadId);
828
+ const existing = aggregatedEdges.get(key);
829
+ if (existing) {
830
+ existing.score = Math.max(existing.score, neighbor.score);
831
+ }
832
+ else {
833
+ aggregatedEdges.set(key, {
834
+ leftThreadId: Math.min(item.id, neighbor.threadId),
835
+ rightThreadId: Math.max(item.id, neighbor.threadId),
836
+ score: neighbor.score,
837
+ sourceKinds: new Set(['dedupe_summary']),
838
+ });
839
+ }
840
+ }
841
+ processed += 1;
842
+ const now = Date.now();
843
+ if (params.onProgress && now - lastProgressAt >= CLUSTER_PROGRESS_INTERVAL_MS) {
844
+ params.onProgress(`[cluster] queried ${processed}/${vectorItems.length} vectors current_edges=${aggregatedEdges.size}`);
845
+ lastProgressAt = now;
846
+ }
847
+ }
848
+ items = vectorItems;
849
+ }
850
+ else if (this.hasLegacyEmbeddings(repository.id)) {
851
+ const legacy = this.loadClusterableThreadMeta(repository.id);
852
+ items = legacy.items;
853
+ params.onProgress?.(`[cluster] loaded ${items.length} legacy embedded thread(s) across ${legacy.sourceKinds.length} source kind(s) for ${repository.fullName} k=${k} minScore=${minScore}`);
854
+ aggregatedEdges = await this.aggregateRepositoryEdges(repository.id, legacy.sourceKinds, {
855
+ limit: k,
856
+ minScore,
857
+ onProgress: params.onProgress,
858
+ });
859
+ }
860
+ else {
861
+ throw new Error(`Vectors for ${repository.fullName} are stale or missing. Run refresh or embed first.`);
862
+ }
722
863
  const edges = Array.from(aggregatedEdges.values()).map((entry) => ({
723
864
  leftThreadId: entry.leftThreadId,
724
865
  rightThreadId: entry.rightThreadId,
@@ -728,6 +869,10 @@ export class GHCrawlService {
728
869
  const clusters = buildClusters(items.map((item) => ({ threadId: item.id, number: item.number, title: item.title })), edges);
729
870
  this.persistClusterRun(repository.id, runId, aggregatedEdges, clusters);
730
871
  this.pruneOldClusterRuns(repository.id, runId);
872
+ if (this.isRepoVectorStateCurrent(repository.id)) {
873
+ this.markRepoClustersCurrent(repository.id);
874
+ this.cleanupMigratedRepositoryArtifacts(repository.id, repository.fullName, params.onProgress);
875
+ }
731
876
  params.onProgress?.(`[cluster] persisted ${clusters.length} cluster(s) and pruned older cluster runs`);
732
877
  this.finishRun('cluster_runs', runId, 'completed', { edges: edges.length, clusters: clusters.length });
733
878
  return clusterResultSchema.parse({ runId, edges: edges.length, clusters: clusters.length });
@@ -737,6 +882,263 @@ export class GHCrawlService {
737
882
  throw error;
738
883
  }
739
884
  }
885
+ clusterExperiment(params) {
886
+ const backend = params.backend ?? 'vectorlite';
887
+ const repository = this.requireRepository(params.owner, params.repo);
888
+ const loaded = this.loadClusterableThreadMeta(repository.id);
889
+ const activeVectors = this.isRepoVectorStateCurrent(repository.id) ? this.loadNormalizedActiveVectors(repository.id) : [];
890
+ const activeSourceKind = this.config.embeddingBasis === 'title_summary' ? 'dedupe_summary' : 'body';
891
+ const useActiveVectors = activeVectors.length > 0 && (params.sourceKinds === undefined || loaded.items.length === 0);
892
+ const sourceKinds = useActiveVectors ? [activeSourceKind] : (params.sourceKinds ?? loaded.sourceKinds);
893
+ const items = useActiveVectors
894
+ ? activeVectors.map((item) => ({ id: item.id, number: item.number, title: item.title }))
895
+ : loaded.items;
896
+ const aggregation = params.aggregation ?? 'max';
897
+ const minScore = params.minScore ?? DEFAULT_CLUSTER_MIN_SCORE;
898
+ const k = params.k ?? 6;
899
+ const candidateK = Math.max(k, params.candidateK ?? Math.max(k * 16, 64));
900
+ const efSearch = params.efSearch;
901
+ const startedAt = Date.now();
902
+ const memoryBefore = process.memoryUsage();
903
+ let peakRssBytes = memoryBefore.rss;
904
+ let peakHeapUsedBytes = memoryBefore.heapUsed;
905
+ const recordMemory = () => {
906
+ const usage = process.memoryUsage();
907
+ peakRssBytes = Math.max(peakRssBytes, usage.rss);
908
+ peakHeapUsedBytes = Math.max(peakHeapUsedBytes, usage.heapUsed);
909
+ };
910
+ recordMemory();
911
+ if (useActiveVectors && params.sourceKinds && loaded.items.length === 0) {
912
+ params.onProgress?.(`[cluster-experiment] legacy source embeddings are unavailable for ${repository.fullName}; falling back to active ${this.config.embeddingBasis} vectors`);
913
+ }
914
+ params.onProgress?.(`[cluster-experiment] loaded ${items.length} embedded thread(s) across ${sourceKinds.length} source kind(s) for ${repository.fullName} backend=${backend} k=${k} candidateK=${candidateK} minScore=${minScore} aggregation=${aggregation}`);
915
+ const perSourceScores = new Map();
916
+ let loadMs = 0;
917
+ let setupMs = 0;
918
+ let edgeBuildMs = 0;
919
+ let indexBuildMs = 0;
920
+ let queryMs = 0;
921
+ let clusterBuildMs = 0;
922
+ let tempDbPath = null;
923
+ let tempDb = null;
924
+ let tempDir = null;
925
+ try {
926
+ if (backend === 'exact') {
927
+ if (useActiveVectors) {
928
+ const loadStartedAt = Date.now();
929
+ const normalizedRows = activeVectors.map(({ id, embedding }) => ({ id, normalizedEmbedding: embedding }));
930
+ loadMs += Date.now() - loadStartedAt;
931
+ recordMemory();
932
+ const edgesStartedAt = Date.now();
933
+ const edges = buildSourceKindEdges(normalizedRows, {
934
+ limit: k,
935
+ minScore,
936
+ progressIntervalMs: CLUSTER_PROGRESS_INTERVAL_MS,
937
+ onProgress: (progress) => {
938
+ recordMemory();
939
+ if (!params.onProgress)
940
+ return;
941
+ params.onProgress(`[cluster-experiment] exact ${progress.processedItems}/${normalizedRows.length} active vectors processed current_edges~=${perSourceScores.size + progress.currentEdgeEstimate}`);
942
+ },
943
+ });
944
+ edgeBuildMs += Date.now() - edgesStartedAt;
945
+ this.collectSourceKindScores(perSourceScores, edges, activeSourceKind);
946
+ recordMemory();
947
+ }
948
+ else {
949
+ const totalItems = sourceKinds.reduce((sum, sourceKind) => sum + this.countEmbeddingsForSourceKind(repository.id, sourceKind), 0);
950
+ let processedItems = 0;
951
+ for (const sourceKind of sourceKinds) {
952
+ const loadStartedAt = Date.now();
953
+ const normalizedRows = this.loadNormalizedEmbeddingsForSourceKind(repository.id, sourceKind);
954
+ loadMs += Date.now() - loadStartedAt;
955
+ recordMemory();
956
+ const edgesStartedAt = Date.now();
957
+ const edges = buildSourceKindEdges(normalizedRows, {
958
+ limit: k,
959
+ minScore,
960
+ progressIntervalMs: CLUSTER_PROGRESS_INTERVAL_MS,
961
+ onProgress: (progress) => {
962
+ recordMemory();
963
+ if (!params.onProgress)
964
+ return;
965
+ params.onProgress(`[cluster-experiment] exact ${processedItems + progress.processedItems}/${totalItems} source embeddings processed current_edges~=${perSourceScores.size + progress.currentEdgeEstimate}`);
966
+ },
967
+ });
968
+ edgeBuildMs += Date.now() - edgesStartedAt;
969
+ processedItems += normalizedRows.length;
970
+ this.collectSourceKindScores(perSourceScores, edges, sourceKind);
971
+ recordMemory();
972
+ }
973
+ }
974
+ }
975
+ else {
976
+ const setupStartedAt = Date.now();
977
+ tempDir = fs.mkdtempSync(path.join(os.tmpdir(), 'ghcrawl-vectorlite-'));
978
+ tempDbPath = path.join(tempDir, 'cluster-experiment.db');
979
+ tempDb = openDb(tempDbPath);
980
+ tempDb.pragma('journal_mode = MEMORY');
981
+ tempDb.pragma('synchronous = OFF');
982
+ tempDb.pragma('temp_store = MEMORY');
983
+ const vectorlite = requireFromHere('vectorlite');
984
+ tempDb.loadExtension(vectorlite.vectorlitePath());
985
+ setupMs += Date.now() - setupStartedAt;
986
+ recordMemory();
987
+ const vectorSources = useActiveVectors
988
+ ? [
989
+ {
990
+ sourceKind: activeSourceKind,
991
+ rows: activeVectors.map(({ id, embedding }) => ({ id, normalizedEmbedding: embedding })),
992
+ },
993
+ ]
994
+ : sourceKinds.map((sourceKind) => ({
995
+ sourceKind,
996
+ rows: this.loadNormalizedEmbeddingsForSourceKind(repository.id, sourceKind).map((row) => ({
997
+ id: row.id,
998
+ normalizedEmbedding: row.normalizedEmbedding,
999
+ })),
1000
+ }));
1001
+ for (const source of vectorSources) {
1002
+ const sourceRowCount = source.rows.length;
1003
+ if (sourceRowCount === 0) {
1004
+ continue;
1005
+ }
1006
+ const dimension = source.rows[0].normalizedEmbedding.length;
1007
+ const safeCandidateK = Math.min(candidateK, Math.max(1, sourceRowCount - 1));
1008
+ const tableName = `vector_${source.sourceKind}`;
1009
+ params.onProgress?.(`[cluster-experiment] building ${source.sourceKind} HNSW index with ${sourceRowCount} vector(s)`);
1010
+ const indexStartedAt = Date.now();
1011
+ tempDb.exec(`create virtual table ${tableName} using vectorlite(vec float32[${dimension}], hnsw(max_elements=${sourceRowCount}));`);
1012
+ const insert = tempDb.prepare(`insert into ${tableName}(rowid, vec) values (?, ?)`);
1013
+ tempDb.transaction(() => {
1014
+ const loadStartedAt = Date.now();
1015
+ for (const row of source.rows) {
1016
+ insert.run(row.id, this.normalizedEmbeddingBuffer(row.normalizedEmbedding));
1017
+ }
1018
+ loadMs += Date.now() - loadStartedAt;
1019
+ })();
1020
+ indexBuildMs += Date.now() - indexStartedAt;
1021
+ recordMemory();
1022
+ const queryStartedAt = Date.now();
1023
+ const querySql = efSearch !== undefined
1024
+ ? `select rowid, distance from ${tableName} where knn_search(vec, knn_param(?, ${safeCandidateK + 1}, ${efSearch}))`
1025
+ : `select rowid, distance from ${tableName} where knn_search(vec, knn_param(?, ${safeCandidateK + 1}))`;
1026
+ const query = tempDb.prepare(querySql);
1027
+ let processed = 0;
1028
+ let lastProgressAt = Date.now();
1029
+ const queryLoadStartedAt = Date.now();
1030
+ for (const row of source.rows) {
1031
+ const candidates = query.all(this.normalizedEmbeddingBuffer(row.normalizedEmbedding));
1032
+ const ranked = rankNearestNeighborsByScore(candidates, {
1033
+ limit: k,
1034
+ minScore,
1035
+ score: (candidate) => {
1036
+ if (candidate.rowid === row.id) {
1037
+ return -1;
1038
+ }
1039
+ return this.normalizedDistanceToScore(candidate.distance);
1040
+ },
1041
+ });
1042
+ let addedThisRow = 0;
1043
+ for (const candidate of ranked) {
1044
+ const score = candidate.score;
1045
+ const key = this.edgeKey(row.id, candidate.item.rowid);
1046
+ const existing = perSourceScores.get(key);
1047
+ if (existing) {
1048
+ existing.scores.set(source.sourceKind, Math.max(existing.scores.get(source.sourceKind) ?? -1, score));
1049
+ continue;
1050
+ }
1051
+ const scores = new Map();
1052
+ scores.set(source.sourceKind, score);
1053
+ perSourceScores.set(key, {
1054
+ leftThreadId: Math.min(row.id, candidate.item.rowid),
1055
+ rightThreadId: Math.max(row.id, candidate.item.rowid),
1056
+ scores,
1057
+ });
1058
+ addedThisRow += 1;
1059
+ }
1060
+ processed += 1;
1061
+ const now = Date.now();
1062
+ if (params.onProgress && now - lastProgressAt >= CLUSTER_PROGRESS_INTERVAL_MS) {
1063
+ recordMemory();
1064
+ params.onProgress(`[cluster-experiment] querying ${source.sourceKind} index ${processed}/${sourceRowCount} current_edges=${perSourceScores.size} added_this_step=${addedThisRow}`);
1065
+ lastProgressAt = now;
1066
+ }
1067
+ }
1068
+ loadMs += Date.now() - queryLoadStartedAt;
1069
+ queryMs += Date.now() - queryStartedAt;
1070
+ tempDb.exec(`drop table ${tableName}`);
1071
+ recordMemory();
1072
+ }
1073
+ }
1074
+ // Finalize edge scores using the configured aggregation method
1075
+ const defaultWeights = { dedupe_summary: 0.5, title: 0.3, body: 0.2 };
1076
+ const weights = { ...defaultWeights, ...(params.aggregationWeights ?? {}) };
1077
+ const aggregated = this.finalizeEdgeScores(perSourceScores, aggregation, weights, minScore);
1078
+ params.onProgress?.(`[cluster-experiment] finalized ${aggregated.length} edges from ${perSourceScores.size} candidate pairs using ${aggregation} aggregation`);
1079
+ const clusterStartedAt = Date.now();
1080
+ const clusterNodes = items.map((item) => ({ threadId: item.id, number: item.number, title: item.title }));
1081
+ const clusterEdges = aggregated;
1082
+ const clusterMode = params.clusterMode ?? (params.maxClusterSize !== undefined ? 'refine' : 'basic');
1083
+ const clusters = clusterMode === 'bounded'
1084
+ ? buildSizeBoundedClusters(clusterNodes, clusterEdges, {
1085
+ maxClusterSize: params.maxClusterSize ?? 200,
1086
+ })
1087
+ : clusterMode === 'refine'
1088
+ ? buildRefinedClusters(clusterNodes, clusterEdges, {
1089
+ maxClusterSize: params.maxClusterSize ?? 200,
1090
+ refineStep: params.refineStep ?? 0.02,
1091
+ })
1092
+ : buildClusters(clusterNodes, clusterEdges);
1093
+ clusterBuildMs += Date.now() - clusterStartedAt;
1094
+ recordMemory();
1095
+ const memoryAfter = process.memoryUsage();
1096
+ const durationMs = backend === 'vectorlite'
1097
+ ? indexBuildMs + queryMs + clusterBuildMs
1098
+ : edgeBuildMs + clusterBuildMs;
1099
+ const totalDurationMs = Date.now() - startedAt;
1100
+ return {
1101
+ backend,
1102
+ repository,
1103
+ tempDbPath,
1104
+ threads: items.length,
1105
+ sourceKinds: sourceKinds.length,
1106
+ edges: aggregated.length,
1107
+ clusters: clusters.length,
1108
+ timingBasis: 'cluster-only',
1109
+ durationMs,
1110
+ totalDurationMs,
1111
+ loadMs,
1112
+ setupMs,
1113
+ edgeBuildMs,
1114
+ indexBuildMs,
1115
+ queryMs,
1116
+ clusterBuildMs,
1117
+ candidateK,
1118
+ memory: {
1119
+ rssBeforeBytes: memoryBefore.rss,
1120
+ rssAfterBytes: memoryAfter.rss,
1121
+ peakRssBytes,
1122
+ heapUsedBeforeBytes: memoryBefore.heapUsed,
1123
+ heapUsedAfterBytes: memoryAfter.heapUsed,
1124
+ peakHeapUsedBytes,
1125
+ },
1126
+ clusterSizes: this.summarizeClusterSizes(clusters),
1127
+ clustersDetail: params.includeClusters
1128
+ ? clusters.map((cluster) => ({
1129
+ representativeThreadId: cluster.representativeThreadId,
1130
+ memberThreadIds: [...cluster.members],
1131
+ }))
1132
+ : null,
1133
+ };
1134
+ }
1135
+ finally {
1136
+ tempDb?.close();
1137
+ if (tempDir) {
1138
+ fs.rmSync(tempDir, { recursive: true, force: true });
1139
+ }
1140
+ }
1141
+ }
740
1142
  async searchRepository(params) {
741
1143
  const mode = params.mode ?? 'hybrid';
742
1144
  const repository = this.requireRepository(params.owner, params.repo);
@@ -758,13 +1160,33 @@ export class GHCrawlService {
758
1160
  }
759
1161
  }
760
1162
  if (mode !== 'keyword' && this.ai) {
761
- const [queryEmbedding] = await this.ai.embedTexts({ model: this.config.embedModel, texts: [params.query] });
762
- const rows = this.loadParsedStoredEmbeddings(repository.id);
763
- for (const row of rows) {
764
- const score = cosineSimilarity(queryEmbedding, row.embedding);
765
- if (score < 0.2)
766
- continue;
767
- semanticScores.set(row.id, Math.max(semanticScores.get(row.id) ?? -1, score));
1163
+ if (this.isRepoVectorStateCurrent(repository.id)) {
1164
+ const [queryEmbedding] = await this.ai.embedTexts({
1165
+ model: this.config.embedModel,
1166
+ texts: [params.query],
1167
+ dimensions: ACTIVE_EMBED_DIMENSIONS,
1168
+ });
1169
+ const neighbors = this.vectorStore.queryNearest({
1170
+ storePath: this.repoVectorStorePath(repository.fullName),
1171
+ dimensions: ACTIVE_EMBED_DIMENSIONS,
1172
+ vector: queryEmbedding,
1173
+ limit: limit * 2,
1174
+ candidateK: Math.max(limit * 8, 64),
1175
+ });
1176
+ for (const neighbor of neighbors) {
1177
+ if (neighbor.score < 0.2)
1178
+ continue;
1179
+ semanticScores.set(neighbor.threadId, Math.max(semanticScores.get(neighbor.threadId) ?? -1, neighbor.score));
1180
+ }
1181
+ }
1182
+ else if (this.hasLegacyEmbeddings(repository.id)) {
1183
+ const [queryEmbedding] = await this.ai.embedTexts({ model: this.config.embedModel, texts: [params.query] });
1184
+ for (const row of this.iterateStoredEmbeddings(repository.id)) {
1185
+ const score = cosineSimilarity(queryEmbedding, JSON.parse(row.embedding_json));
1186
+ if (score < 0.2)
1187
+ continue;
1188
+ semanticScores.set(row.id, Math.max(semanticScores.get(row.id) ?? -1, score));
1189
+ }
768
1190
  }
769
1191
  }
770
1192
  const candidateIds = new Set([...keywordScores.keys(), ...semanticScores.keys()]);
@@ -832,44 +1254,97 @@ export class GHCrawlService {
832
1254
  const repository = this.requireRepository(params.owner, params.repo);
833
1255
  const limit = params.limit ?? 10;
834
1256
  const minScore = params.minScore ?? 0.2;
835
- const rows = this.loadParsedStoredEmbeddings(repository.id);
836
- const targetRows = rows.filter((row) => row.number === params.threadNumber);
837
- if (targetRows.length === 0) {
838
- throw new Error(`Thread #${params.threadNumber} for ${repository.fullName} was not found with an embedding. Run embed first.`);
839
- }
840
- const targetRow = targetRows[0];
841
- const targetBySource = new Map();
842
- for (const row of targetRows) {
843
- targetBySource.set(row.source_kind, row.embedding);
1257
+ const targetRow = this.db
1258
+ .prepare(`select t.*, tv.basis, tv.model, tv.dimensions, tv.content_hash, tv.vector_json, tv.vector_backend
1259
+ from threads t
1260
+ join thread_vectors tv on tv.thread_id = t.id
1261
+ where t.repo_id = ?
1262
+ and t.number = ?
1263
+ and t.state = 'open'
1264
+ and t.closed_at_local is null
1265
+ and tv.model = ?
1266
+ and tv.basis = ?
1267
+ and tv.dimensions = ?
1268
+ limit 1`)
1269
+ .get(repository.id, params.threadNumber, this.config.embedModel, this.config.embeddingBasis, ACTIVE_EMBED_DIMENSIONS);
1270
+ let responseThread;
1271
+ let neighbors;
1272
+ if (targetRow) {
1273
+ responseThread = targetRow;
1274
+ const candidateRows = this.vectorStore
1275
+ .queryNearest({
1276
+ storePath: this.repoVectorStorePath(repository.fullName),
1277
+ dimensions: ACTIVE_EMBED_DIMENSIONS,
1278
+ vector: this.parseStoredVector(targetRow.vector_json),
1279
+ limit: limit * 2,
1280
+ candidateK: Math.max(limit * 8, 64),
1281
+ excludeThreadId: targetRow.id,
1282
+ })
1283
+ .filter((row) => row.score >= minScore);
1284
+ const candidateIds = candidateRows.map((row) => row.threadId);
1285
+ const neighborMeta = candidateIds.length
1286
+ ? this.db
1287
+ .prepare(`select * from threads
1288
+ where repo_id = ? and state = 'open' and closed_at_local is null and id in (${candidateIds.map(() => '?').join(',')})`)
1289
+ .all(repository.id, ...candidateIds)
1290
+ : [];
1291
+ const metaById = new Map(neighborMeta.map((row) => [row.id, row]));
1292
+ neighbors = candidateRows
1293
+ .map((row) => {
1294
+ const meta = metaById.get(row.threadId);
1295
+ if (!meta) {
1296
+ return null;
1297
+ }
1298
+ return {
1299
+ threadId: row.threadId,
1300
+ number: meta.number,
1301
+ kind: meta.kind,
1302
+ title: meta.title,
1303
+ score: row.score,
1304
+ };
1305
+ })
1306
+ .filter((row) => row !== null)
1307
+ .slice(0, limit);
844
1308
  }
845
- const aggregated = new Map();
846
- for (const row of rows) {
847
- if (row.id === targetRow.id)
848
- continue;
849
- const targetEmbedding = targetBySource.get(row.source_kind);
850
- if (!targetEmbedding)
851
- continue;
852
- const score = cosineSimilarity(targetEmbedding, row.embedding);
853
- if (score < minScore)
854
- continue;
855
- const previous = aggregated.get(row.id);
856
- if (!previous || score > previous.score) {
857
- aggregated.set(row.id, { number: row.number, kind: row.kind, title: row.title, score });
1309
+ else {
1310
+ const targetRows = this.loadStoredEmbeddingsForThreadNumber(repository.id, params.threadNumber);
1311
+ if (targetRows.length === 0) {
1312
+ throw new Error(`Thread #${params.threadNumber} for ${repository.fullName} was not found with an embedding. Run embed first.`);
858
1313
  }
1314
+ responseThread = targetRows[0];
1315
+ const targetBySource = new Map();
1316
+ for (const row of targetRows) {
1317
+ targetBySource.set(row.source_kind, JSON.parse(row.embedding_json));
1318
+ }
1319
+ const aggregated = new Map();
1320
+ for (const row of this.iterateStoredEmbeddings(repository.id)) {
1321
+ if (row.id === responseThread.id)
1322
+ continue;
1323
+ const targetEmbedding = targetBySource.get(row.source_kind);
1324
+ if (!targetEmbedding)
1325
+ continue;
1326
+ const score = cosineSimilarity(targetEmbedding, JSON.parse(row.embedding_json));
1327
+ if (score < minScore)
1328
+ continue;
1329
+ const previous = aggregated.get(row.id);
1330
+ if (!previous || score > previous.score) {
1331
+ aggregated.set(row.id, { number: row.number, kind: row.kind, title: row.title, score });
1332
+ }
1333
+ }
1334
+ neighbors = Array.from(aggregated.entries())
1335
+ .map(([threadId, value]) => ({
1336
+ threadId,
1337
+ number: value.number,
1338
+ kind: value.kind,
1339
+ title: value.title,
1340
+ score: value.score,
1341
+ }))
1342
+ .sort((left, right) => right.score - left.score)
1343
+ .slice(0, limit);
859
1344
  }
860
- const neighbors = Array.from(aggregated.entries())
861
- .map(([threadId, value]) => ({
862
- threadId,
863
- number: value.number,
864
- kind: value.kind,
865
- title: value.title,
866
- score: value.score,
867
- }))
868
- .sort((left, right) => right.score - left.score)
869
- .slice(0, limit);
870
1345
  return neighborsResponseSchema.parse({
871
1346
  repository,
872
- thread: threadToDto(targetRow),
1347
+ thread: threadToDto(responseThread),
873
1348
  neighbors,
874
1349
  });
875
1350
  }
@@ -946,6 +1421,14 @@ export class GHCrawlService {
946
1421
  onProgress: params.onProgress,
947
1422
  });
948
1423
  }
1424
+ if (selected.embed && this.config.embeddingBasis === 'title_summary') {
1425
+ params.onProgress?.(`[refresh] embedding basis ${this.config.embeddingBasis} requires summaries; running summarize before embed`);
1426
+ await this.summarizeRepository({
1427
+ owner: params.owner,
1428
+ repo: params.repo,
1429
+ onProgress: params.onProgress,
1430
+ });
1431
+ }
949
1432
  if (selected.embed) {
950
1433
  embed = await this.embedRepository({
951
1434
  owner: params.owner,
@@ -1150,9 +1633,9 @@ export class GHCrawlService {
1150
1633
  const summaryRows = this.db
1151
1634
  .prepare(`select summary_kind, summary_text
1152
1635
  from document_summaries
1153
- where thread_id = ? and model = ?
1636
+ where thread_id = ? and model = ? and prompt_version = ?
1154
1637
  order by summary_kind asc`)
1155
- .all(row.id, this.config.summaryModel);
1638
+ .all(row.id, this.config.summaryModel, SUMMARY_PROMPT_VERSION);
1156
1639
  const summaries = {};
1157
1640
  for (const summary of summaryRows) {
1158
1641
  if (summary.summary_kind === 'problem_summary' ||
@@ -1314,7 +1797,178 @@ export class GHCrawlService {
1314
1797
  latestClusterRunFinishedAt: latestRun?.finished_at ?? null,
1315
1798
  };
1316
1799
  }
1800
+ getDesiredPipelineState() {
1801
+ return {
1802
+ summary_model: this.config.summaryModel,
1803
+ summary_prompt_version: SUMMARY_PROMPT_VERSION,
1804
+ embedding_basis: this.config.embeddingBasis,
1805
+ embed_model: this.config.embedModel,
1806
+ embed_dimensions: ACTIVE_EMBED_DIMENSIONS,
1807
+ embed_pipeline_version: ACTIVE_EMBED_PIPELINE_VERSION,
1808
+ vector_backend: this.config.vectorBackend,
1809
+ };
1810
+ }
1811
+ getRepoPipelineState(repoId) {
1812
+ return (this.db.prepare('select * from repo_pipeline_state where repo_id = ? limit 1').get(repoId) ??
1813
+ null);
1814
+ }
1815
+ isRepoVectorStateCurrent(repoId) {
1816
+ const state = this.getRepoPipelineState(repoId);
1817
+ if (!state || !state.vectors_current_at) {
1818
+ return false;
1819
+ }
1820
+ const desired = this.getDesiredPipelineState();
1821
+ return (state.summary_model === desired.summary_model &&
1822
+ state.summary_prompt_version === desired.summary_prompt_version &&
1823
+ state.embedding_basis === desired.embedding_basis &&
1824
+ state.embed_model === desired.embed_model &&
1825
+ state.embed_dimensions === desired.embed_dimensions &&
1826
+ state.embed_pipeline_version === desired.embed_pipeline_version &&
1827
+ state.vector_backend === desired.vector_backend);
1828
+ }
1829
+ isRepoClusterStateCurrent(repoId) {
1830
+ const state = this.getRepoPipelineState(repoId);
1831
+ return this.isRepoVectorStateCurrent(repoId) && Boolean(state?.clusters_current_at);
1832
+ }
1833
+ hasLegacyEmbeddings(repoId) {
1834
+ const row = this.db
1835
+ .prepare(`select count(*) as count
1836
+ from document_embeddings e
1837
+ join threads t on t.id = e.thread_id
1838
+ where t.repo_id = ?
1839
+ and t.state = 'open'
1840
+ and t.closed_at_local is null
1841
+ and e.model = ?`)
1842
+ .get(repoId, this.config.embedModel);
1843
+ return row.count > 0;
1844
+ }
1845
+ writeRepoPipelineState(repoId, overrides) {
1846
+ const desired = this.getDesiredPipelineState();
1847
+ const current = this.getRepoPipelineState(repoId);
1848
+ this.db
1849
+ .prepare(`insert into repo_pipeline_state (
1850
+ repo_id,
1851
+ summary_model,
1852
+ summary_prompt_version,
1853
+ embedding_basis,
1854
+ embed_model,
1855
+ embed_dimensions,
1856
+ embed_pipeline_version,
1857
+ vector_backend,
1858
+ vectors_current_at,
1859
+ clusters_current_at,
1860
+ updated_at
1861
+ ) values (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
1862
+ on conflict(repo_id) do update set
1863
+ summary_model = excluded.summary_model,
1864
+ summary_prompt_version = excluded.summary_prompt_version,
1865
+ embedding_basis = excluded.embedding_basis,
1866
+ embed_model = excluded.embed_model,
1867
+ embed_dimensions = excluded.embed_dimensions,
1868
+ embed_pipeline_version = excluded.embed_pipeline_version,
1869
+ vector_backend = excluded.vector_backend,
1870
+ vectors_current_at = excluded.vectors_current_at,
1871
+ clusters_current_at = excluded.clusters_current_at,
1872
+ updated_at = excluded.updated_at`)
1873
+ .run(repoId, desired.summary_model, desired.summary_prompt_version, desired.embedding_basis, desired.embed_model, desired.embed_dimensions, desired.embed_pipeline_version, desired.vector_backend, overrides.vectors_current_at ?? current?.vectors_current_at ?? null, overrides.clusters_current_at ?? current?.clusters_current_at ?? null, nowIso());
1874
+ }
1875
+ markRepoVectorsCurrent(repoId) {
1876
+ this.writeRepoPipelineState(repoId, {
1877
+ vectors_current_at: nowIso(),
1878
+ clusters_current_at: null,
1879
+ });
1880
+ }
1881
+ markRepoClustersCurrent(repoId) {
1882
+ const state = this.getRepoPipelineState(repoId);
1883
+ this.writeRepoPipelineState(repoId, {
1884
+ vectors_current_at: state?.vectors_current_at ?? nowIso(),
1885
+ clusters_current_at: nowIso(),
1886
+ });
1887
+ }
1888
+ repoVectorStorePath(repoFullName) {
1889
+ const safeName = repoFullName.replace(/[^a-zA-Z0-9._-]+/g, '__');
1890
+ return path.join(this.config.configDir, 'vectors', `${safeName}.sqlite`);
1891
+ }
1892
+ resetRepositoryVectors(repoId, repoFullName) {
1893
+ this.db
1894
+ .prepare(`delete from thread_vectors
1895
+ where thread_id in (select id from threads where repo_id = ?)`)
1896
+ .run(repoId);
1897
+ this.vectorStore.resetRepository({
1898
+ storePath: this.repoVectorStorePath(repoFullName),
1899
+ dimensions: ACTIVE_EMBED_DIMENSIONS,
1900
+ });
1901
+ this.writeRepoPipelineState(repoId, {
1902
+ vectors_current_at: null,
1903
+ clusters_current_at: null,
1904
+ });
1905
+ }
1906
+ pruneInactiveRepositoryVectors(repoId, repoFullName) {
1907
+ const rows = this.db
1908
+ .prepare(`select tv.thread_id
1909
+ from thread_vectors tv
1910
+ join threads t on t.id = tv.thread_id
1911
+ where t.repo_id = ?
1912
+ and (t.state != 'open' or t.closed_at_local is not null)`)
1913
+ .all(repoId);
1914
+ if (rows.length === 0) {
1915
+ return 0;
1916
+ }
1917
+ const deleteVectorRow = this.db.prepare('delete from thread_vectors where thread_id = ?');
1918
+ this.db.transaction(() => {
1919
+ for (const row of rows) {
1920
+ deleteVectorRow.run(row.thread_id);
1921
+ this.vectorStore.deleteVector({
1922
+ storePath: this.repoVectorStorePath(repoFullName),
1923
+ dimensions: ACTIVE_EMBED_DIMENSIONS,
1924
+ threadId: row.thread_id,
1925
+ });
1926
+ }
1927
+ })();
1928
+ return rows.length;
1929
+ }
1930
+ cleanupMigratedRepositoryArtifacts(repoId, repoFullName, onProgress) {
1931
+ const legacyEmbeddingCount = this.countLegacyEmbeddings(repoId);
1932
+ const inlineJsonVectorCount = this.countInlineJsonThreadVectors(repoId);
1933
+ if (legacyEmbeddingCount === 0 && inlineJsonVectorCount === 0) {
1934
+ return;
1935
+ }
1936
+ if (legacyEmbeddingCount > 0) {
1937
+ this.db
1938
+ .prepare(`delete from document_embeddings
1939
+ where thread_id in (select id from threads where repo_id = ?)`)
1940
+ .run(repoId);
1941
+ onProgress?.(`[cleanup] removed ${legacyEmbeddingCount} legacy document embedding row(s) after vector migration`);
1942
+ }
1943
+ if (inlineJsonVectorCount > 0) {
1944
+ const rows = this.db
1945
+ .prepare(`select tv.thread_id, tv.vector_json
1946
+ from thread_vectors tv
1947
+ join threads t on t.id = tv.thread_id
1948
+ where t.repo_id = ?
1949
+ and typeof(tv.vector_json) = 'text'
1950
+ and tv.vector_json != ''`)
1951
+ .all(repoId);
1952
+ const update = this.db.prepare('update thread_vectors set vector_json = ?, updated_at = ? where thread_id = ?');
1953
+ this.db.transaction(() => {
1954
+ for (const row of rows) {
1955
+ update.run(this.vectorBlob(JSON.parse(row.vector_json)), nowIso(), row.thread_id);
1956
+ }
1957
+ })();
1958
+ onProgress?.(`[cleanup] compacted ${inlineJsonVectorCount} inline SQLite vector payload(s) from JSON to binary blobs`);
1959
+ }
1960
+ if (this.config.dbPath !== ':memory:') {
1961
+ onProgress?.(`[cleanup] checkpointing WAL and vacuuming ${repoFullName} migration changes`);
1962
+ this.db.pragma('wal_checkpoint(TRUNCATE)');
1963
+ this.db.exec('VACUUM');
1964
+ this.db.pragma('wal_checkpoint(TRUNCATE)');
1965
+ }
1966
+ }
1317
1967
  getLatestClusterRun(repoId) {
1968
+ const state = this.getRepoPipelineState(repoId);
1969
+ if (state && !this.isRepoClusterStateCurrent(repoId)) {
1970
+ return null;
1971
+ }
1318
1972
  return (this.db
1319
1973
  .prepare("select id, finished_at from cluster_runs where repo_id = ? and status = 'completed' order by id desc limit 1")
1320
1974
  .get(repoId) ?? null);
@@ -1784,7 +2438,7 @@ export class GHCrawlService {
1784
2438
  }
1785
2439
  }
1786
2440
  const summaryInput = parts.join('\n\n');
1787
- const summaryContentHash = stableContentHash(`summary:${includeComments ? 'with-comments' : 'metadata-only'}\n${summaryInput}`);
2441
+ const summaryContentHash = stableContentHash(`summary:${SUMMARY_PROMPT_VERSION}:${includeComments ? 'with-comments' : 'metadata-only'}\n${summaryInput}`);
1788
2442
  return { summaryInput, summaryContentHash };
1789
2443
  }
1790
2444
  buildEmbeddingTasks(params) {
@@ -1827,6 +2481,35 @@ export class GHCrawlService {
1827
2481
  }
1828
2482
  return tasks;
1829
2483
  }
2484
+ buildActiveVectorTask(params) {
2485
+ const sections = [`title: ${normalizeSummaryText(params.title)}`];
2486
+ if (this.config.embeddingBasis === 'title_summary') {
2487
+ const summary = normalizeSummaryText(params.dedupeSummary ?? '');
2488
+ if (!summary) {
2489
+ return null;
2490
+ }
2491
+ sections.push(`summary: ${summary}`);
2492
+ }
2493
+ else {
2494
+ const body = normalizeSummaryText(params.body ?? '');
2495
+ if (body) {
2496
+ sections.push(`body: ${body}`);
2497
+ }
2498
+ }
2499
+ const prepared = this.prepareEmbeddingText(sections.join('\n\n'), EMBED_MAX_ITEM_TOKENS);
2500
+ if (!prepared) {
2501
+ return null;
2502
+ }
2503
+ return {
2504
+ threadId: params.threadId,
2505
+ threadNumber: params.threadNumber,
2506
+ basis: this.config.embeddingBasis,
2507
+ text: prepared.text,
2508
+ contentHash: stableContentHash(`embedding:${ACTIVE_EMBED_PIPELINE_VERSION}:${this.config.embeddingBasis}:${this.config.embedModel}:${ACTIVE_EMBED_DIMENSIONS}\n${prepared.text}`),
2509
+ estimatedTokens: prepared.estimatedTokens,
2510
+ wasTruncated: prepared.wasTruncated,
2511
+ };
2512
+ }
1830
2513
  prepareEmbeddingText(text, maxEstimatedTokens) {
1831
2514
  if (!text) {
1832
2515
  return null;
@@ -1868,6 +2551,7 @@ export class GHCrawlService {
1868
2551
  const embeddings = await ai.embedTexts({
1869
2552
  model: this.config.embedModel,
1870
2553
  texts: batch.map((task) => task.text),
2554
+ dimensions: ACTIVE_EMBED_DIMENSIONS,
1871
2555
  });
1872
2556
  return batch.map((task, index) => ({ task, embedding: embeddings[index] }));
1873
2557
  }
@@ -1894,6 +2578,7 @@ export class GHCrawlService {
1894
2578
  const [embedding] = await ai.embedTexts({
1895
2579
  model: this.config.embedModel,
1896
2580
  texts: [current.text],
2581
+ dimensions: ACTIVE_EMBED_DIMENSIONS,
1897
2582
  });
1898
2583
  return { task: current, embedding };
1899
2584
  }
@@ -1906,11 +2591,11 @@ export class GHCrawlService {
1906
2591
  if (!next || next.text === current.text) {
1907
2592
  throw error;
1908
2593
  }
1909
- onProgress?.(`[embed] shortened #${current.threadNumber}:${current.sourceKind} after context error est_tokens=${current.estimatedTokens}->${next.estimatedTokens}`);
2594
+ onProgress?.(`[embed] shortened #${current.threadNumber}:${current.basis} after context error est_tokens=${current.estimatedTokens}->${next.estimatedTokens}`);
1910
2595
  current = next;
1911
2596
  }
1912
2597
  }
1913
- throw new Error(`Unable to shrink embedding input for #${task.threadNumber}:${task.sourceKind} below model limits`);
2598
+ throw new Error(`Unable to shrink embedding input for #${task.threadNumber}:${task.basis} below model limits`);
1914
2599
  }
1915
2600
  shrinkEmbeddingTask(task, context) {
1916
2601
  const withoutMarker = task.text.endsWith(EMBED_TRUNCATION_MARKER)
@@ -1927,7 +2612,7 @@ export class GHCrawlService {
1927
2612
  return {
1928
2613
  ...task,
1929
2614
  text: nextText,
1930
- contentHash: stableContentHash(`embedding:${task.sourceKind}\n${nextText}`),
2615
+ contentHash: stableContentHash(`embedding:${ACTIVE_EMBED_PIPELINE_VERSION}:${task.basis}:${this.config.embedModel}:${ACTIVE_EMBED_DIMENSIONS}\n${nextText}`),
1931
2616
  estimatedTokens: this.estimateEmbeddingTokens(nextText),
1932
2617
  wasTruncated: true,
1933
2618
  };
@@ -1976,23 +2661,71 @@ export class GHCrawlService {
1976
2661
  order by t.number asc, e.source_kind asc`)
1977
2662
  .all(repoId, this.config.embedModel);
1978
2663
  }
1979
- loadParsedStoredEmbeddings(repoId) {
1980
- const cached = this.parsedEmbeddingCache.get(repoId);
1981
- if (cached) {
1982
- return cached;
2664
+ loadStoredEmbeddingsForThreadNumber(repoId, threadNumber) {
2665
+ return this.db
2666
+ .prepare(`select t.id, t.repo_id, t.number, t.kind, t.state, t.closed_at_gh, t.closed_at_local, t.close_reason_local,
2667
+ t.title, t.body, t.author_login, t.html_url, t.labels_json,
2668
+ t.updated_at_gh, t.first_pulled_at, t.last_pulled_at, e.source_kind, e.embedding_json
2669
+ from threads t
2670
+ join document_embeddings e on e.thread_id = t.id
2671
+ where t.repo_id = ?
2672
+ and t.number = ?
2673
+ and t.state = 'open'
2674
+ and t.closed_at_local is null
2675
+ and e.model = ?
2676
+ order by e.source_kind asc`)
2677
+ .all(repoId, threadNumber, this.config.embedModel);
2678
+ }
2679
+ iterateStoredEmbeddings(repoId) {
2680
+ return this.db
2681
+ .prepare(`select t.id, t.repo_id, t.number, t.kind, t.state, t.closed_at_gh, t.closed_at_local, t.close_reason_local,
2682
+ t.title, t.body, t.author_login, t.html_url, t.labels_json,
2683
+ t.updated_at_gh, t.first_pulled_at, t.last_pulled_at, e.source_kind, e.embedding_json
2684
+ from threads t
2685
+ join document_embeddings e on e.thread_id = t.id
2686
+ where t.repo_id = ? and t.state = 'open' and t.closed_at_local is null and e.model = ?
2687
+ order by t.number asc, e.source_kind asc`)
2688
+ .iterate(repoId, this.config.embedModel);
2689
+ }
2690
+ loadNormalizedEmbeddingForSourceKindHead(repoId, sourceKind) {
2691
+ const row = this.db
2692
+ .prepare(`select t.id, e.embedding_json
2693
+ from threads t
2694
+ join document_embeddings e on e.thread_id = t.id
2695
+ where t.repo_id = ?
2696
+ and t.state = 'open'
2697
+ and t.closed_at_local is null
2698
+ and e.model = ?
2699
+ and e.source_kind = ?
2700
+ order by t.number asc
2701
+ limit 1`)
2702
+ .get(repoId, this.config.embedModel, sourceKind);
2703
+ if (!row) {
2704
+ return null;
1983
2705
  }
1984
- const parsed = this.loadStoredEmbeddings(repoId).map((row) => {
1985
- const embedding = JSON.parse(row.embedding_json);
1986
- const normalized = normalizeEmbedding(embedding);
1987
- return {
1988
- ...row,
1989
- embedding,
1990
- normalizedEmbedding: normalized.normalized,
1991
- embeddingNorm: normalized.norm,
2706
+ return {
2707
+ id: row.id,
2708
+ normalizedEmbedding: normalizeEmbedding(JSON.parse(row.embedding_json)).normalized,
2709
+ };
2710
+ }
2711
+ *iterateNormalizedEmbeddingsForSourceKind(repoId, sourceKind) {
2712
+ const rows = this.db
2713
+ .prepare(`select t.id, e.embedding_json
2714
+ from threads t
2715
+ join document_embeddings e on e.thread_id = t.id
2716
+ where t.repo_id = ?
2717
+ and t.state = 'open'
2718
+ and t.closed_at_local is null
2719
+ and e.model = ?
2720
+ and e.source_kind = ?
2721
+ order by t.number asc`)
2722
+ .iterate(repoId, this.config.embedModel, sourceKind);
2723
+ for (const row of rows) {
2724
+ yield {
2725
+ id: row.id,
2726
+ normalizedEmbedding: normalizeEmbedding(JSON.parse(row.embedding_json)).normalized,
1992
2727
  };
1993
- });
1994
- this.parsedEmbeddingCache.set(repoId, parsed);
1995
- return parsed;
2728
+ }
1996
2729
  }
1997
2730
  loadNormalizedEmbeddingsForSourceKind(repoId, sourceKind) {
1998
2731
  const rows = this.db
@@ -2011,6 +2744,12 @@ export class GHCrawlService {
2011
2744
  normalizedEmbedding: normalizeEmbedding(JSON.parse(row.embedding_json)).normalized,
2012
2745
  }));
2013
2746
  }
2747
+ normalizedEmbeddingBuffer(values) {
2748
+ return Buffer.from(Float32Array.from(values).buffer);
2749
+ }
2750
+ normalizedDistanceToScore(distance) {
2751
+ return 1 - distance / 2;
2752
+ }
2014
2753
  loadClusterableThreadMeta(repoId) {
2015
2754
  const rows = this.db
2016
2755
  .prepare(`select t.id, t.number, t.title, e.source_kind
@@ -2031,6 +2770,34 @@ export class GHCrawlService {
2031
2770
  sourceKinds: Array.from(sourceKinds.values()),
2032
2771
  };
2033
2772
  }
2773
+ loadClusterableActiveVectorMeta(repoId, _repoFullName) {
2774
+ const rows = this.db
2775
+ .prepare(`select t.id, t.number, t.title, tv.vector_json
2776
+ from threads t
2777
+ join thread_vectors tv on tv.thread_id = t.id
2778
+ where t.repo_id = ?
2779
+ and t.state = 'open'
2780
+ and t.closed_at_local is null
2781
+ and tv.model = ?
2782
+ and tv.basis = ?
2783
+ and tv.dimensions = ?
2784
+ order by t.number asc`)
2785
+ .all(repoId, this.config.embedModel, this.config.embeddingBasis, ACTIVE_EMBED_DIMENSIONS);
2786
+ return rows.map((row) => ({
2787
+ id: row.id,
2788
+ number: row.number,
2789
+ title: row.title,
2790
+ embedding: this.parseStoredVector(row.vector_json),
2791
+ }));
2792
+ }
2793
+ loadNormalizedActiveVectors(repoId) {
2794
+ return this.loadClusterableActiveVectorMeta(repoId, '').map((row) => ({
2795
+ id: row.id,
2796
+ number: row.number,
2797
+ title: row.title,
2798
+ embedding: normalizeEmbedding(row.embedding).normalized,
2799
+ }));
2800
+ }
2034
2801
  listStoredClusterNeighbors(repoId, threadId, limit) {
2035
2802
  const latestRun = this.getLatestClusterRun(repoId);
2036
2803
  if (!latestRun) {
@@ -2087,56 +2854,65 @@ export class GHCrawlService {
2087
2854
  }
2088
2855
  sql += ' order by t.number asc';
2089
2856
  const rows = this.db.prepare(sql).all(...args);
2090
- const summaryTexts = this.loadCombinedSummaryTextMap(repoId, threadNumber);
2091
- const tasks = rows.flatMap((row) => this.buildEmbeddingTasks({
2092
- threadId: row.id,
2093
- threadNumber: row.number,
2094
- title: row.title,
2095
- body: row.body,
2096
- dedupeSummary: summaryTexts.get(row.id) ?? null,
2097
- }));
2857
+ const summaryTexts = this.loadDedupeSummaryTextMap(repoId, threadNumber);
2858
+ const missingSummaryThreadNumbers = [];
2859
+ const tasks = rows.flatMap((row) => {
2860
+ const task = this.buildActiveVectorTask({
2861
+ threadId: row.id,
2862
+ threadNumber: row.number,
2863
+ title: row.title,
2864
+ body: row.body,
2865
+ dedupeSummary: summaryTexts.get(row.id) ?? null,
2866
+ });
2867
+ if (task) {
2868
+ return [task];
2869
+ }
2870
+ if (this.config.embeddingBasis === 'title_summary') {
2871
+ missingSummaryThreadNumbers.push(row.number);
2872
+ }
2873
+ return [];
2874
+ });
2875
+ const pipelineCurrent = this.isRepoVectorStateCurrent(repoId);
2098
2876
  const existingRows = this.db
2099
- .prepare(`select e.thread_id, e.source_kind, e.content_hash
2100
- from document_embeddings e
2101
- join threads t on t.id = e.thread_id
2102
- where t.repo_id = ? and e.model = ?`)
2103
- .all(repoId, this.config.embedModel);
2877
+ .prepare(`select tv.thread_id, tv.content_hash
2878
+ from thread_vectors tv
2879
+ join threads t on t.id = tv.thread_id
2880
+ where t.repo_id = ?
2881
+ and tv.model = ?
2882
+ and tv.basis = ?
2883
+ and tv.dimensions = ?`)
2884
+ .all(repoId, this.config.embedModel, this.config.embeddingBasis, ACTIVE_EMBED_DIMENSIONS);
2104
2885
  const existing = new Map();
2105
2886
  for (const row of existingRows) {
2106
- existing.set(`${row.thread_id}:${row.source_kind}`, row.content_hash);
2887
+ existing.set(String(row.thread_id), row.content_hash);
2107
2888
  }
2108
- const pending = tasks.filter((task) => existing.get(`${task.threadId}:${task.sourceKind}`) !== task.contentHash);
2109
- return { rows, tasks, existing, pending };
2889
+ const pending = pipelineCurrent
2890
+ ? tasks.filter((task) => existing.get(String(task.threadId)) !== task.contentHash)
2891
+ : tasks;
2892
+ return { rows, tasks, existing, pending, missingSummaryThreadNumbers };
2110
2893
  }
2111
- loadCombinedSummaryTextMap(repoId, threadNumber) {
2112
- let sql = `select s.thread_id, s.summary_kind, s.summary_text
2894
+ loadDedupeSummaryTextMap(repoId, threadNumber) {
2895
+ let sql = `select s.thread_id, s.summary_text
2113
2896
  from document_summaries s
2114
2897
  join threads t on t.id = s.thread_id
2115
- where t.repo_id = ? and t.state = 'open' and t.closed_at_local is null and s.model = ?`;
2116
- const args = [repoId, this.config.summaryModel];
2898
+ where t.repo_id = ?
2899
+ and t.state = 'open'
2900
+ and t.closed_at_local is null
2901
+ and s.model = ?
2902
+ and s.summary_kind = 'dedupe_summary'
2903
+ and s.prompt_version = ?`;
2904
+ const args = [repoId, this.config.summaryModel, SUMMARY_PROMPT_VERSION];
2117
2905
  if (threadNumber) {
2118
2906
  sql += ' and t.number = ?';
2119
2907
  args.push(threadNumber);
2120
2908
  }
2121
- sql += ' order by t.number asc, s.summary_kind asc';
2909
+ sql += ' order by t.number asc';
2122
2910
  const rows = this.db.prepare(sql).all(...args);
2123
- const byThread = new Map();
2124
- for (const row of rows) {
2125
- const entry = byThread.get(row.thread_id) ?? new Map();
2126
- entry.set(row.summary_kind, normalizeSummaryText(row.summary_text));
2127
- byThread.set(row.thread_id, entry);
2128
- }
2129
2911
  const combined = new Map();
2130
- const order = ['problem_summary', 'solution_summary', 'maintainer_signal_summary', 'dedupe_summary'];
2131
- for (const [threadId, entry] of byThread.entries()) {
2132
- const parts = order
2133
- .map((summaryKind) => {
2134
- const text = entry.get(summaryKind);
2135
- return text ? `${summaryKind}: ${text}` : '';
2136
- })
2137
- .filter(Boolean);
2138
- if (parts.length > 0) {
2139
- combined.set(threadId, parts.join('\n\n'));
2912
+ for (const row of rows) {
2913
+ const text = normalizeSummaryText(row.summary_text);
2914
+ if (text) {
2915
+ combined.set(row.thread_id, text);
2140
2916
  }
2141
2917
  }
2142
2918
  return combined;
@@ -2233,6 +3009,71 @@ export class GHCrawlService {
2233
3009
  });
2234
3010
  }
2235
3011
  }
3012
+ collectSourceKindScores(perSourceScores, edges, sourceKind) {
3013
+ for (const edge of edges) {
3014
+ const key = this.edgeKey(edge.leftThreadId, edge.rightThreadId);
3015
+ const existing = perSourceScores.get(key);
3016
+ if (existing) {
3017
+ existing.scores.set(sourceKind, Math.max(existing.scores.get(sourceKind) ?? -1, edge.score));
3018
+ continue;
3019
+ }
3020
+ const scores = new Map();
3021
+ scores.set(sourceKind, edge.score);
3022
+ perSourceScores.set(key, {
3023
+ leftThreadId: edge.leftThreadId,
3024
+ rightThreadId: edge.rightThreadId,
3025
+ scores,
3026
+ });
3027
+ }
3028
+ }
3029
+ finalizeEdgeScores(perSourceScores, aggregation, weights, minScore) {
3030
+ const result = [];
3031
+ for (const entry of perSourceScores.values()) {
3032
+ const scoreValues = Array.from(entry.scores.values());
3033
+ let finalScore;
3034
+ switch (aggregation) {
3035
+ case 'max':
3036
+ finalScore = Math.max(...scoreValues);
3037
+ break;
3038
+ case 'mean':
3039
+ finalScore = scoreValues.reduce((a, b) => a + b, 0) / scoreValues.length;
3040
+ break;
3041
+ case 'weighted': {
3042
+ let weightedSum = 0;
3043
+ let weightSum = 0;
3044
+ for (const [kind, score] of entry.scores) {
3045
+ const w = weights[kind] ?? 0.1;
3046
+ weightedSum += score * w;
3047
+ weightSum += w;
3048
+ }
3049
+ finalScore = weightSum > 0 ? weightedSum / weightSum : 0;
3050
+ break;
3051
+ }
3052
+ case 'min-of-2':
3053
+ // Require at least 2 source kinds to agree (both above minScore)
3054
+ if (scoreValues.length < 2) {
3055
+ continue; // Skip edges with only 1 source kind
3056
+ }
3057
+ finalScore = Math.max(...scoreValues);
3058
+ break;
3059
+ case 'boost': {
3060
+ // Best score + bonus per additional agreeing source
3061
+ const best = Math.max(...scoreValues);
3062
+ const bonusSources = scoreValues.length - 1;
3063
+ finalScore = Math.min(1.0, best + bonusSources * 0.05);
3064
+ break;
3065
+ }
3066
+ }
3067
+ if (finalScore >= minScore) {
3068
+ result.push({
3069
+ leftThreadId: entry.leftThreadId,
3070
+ rightThreadId: entry.rightThreadId,
3071
+ score: finalScore,
3072
+ });
3073
+ }
3074
+ }
3075
+ return result;
3076
+ }
2236
3077
  countEmbeddingsForSourceKind(repoId, sourceKind) {
2237
3078
  const row = this.db
2238
3079
  .prepare(`select count(*) as count
@@ -2280,15 +3121,102 @@ export class GHCrawlService {
2280
3121
  pruneOldClusterRuns(repoId, keepRunId) {
2281
3122
  this.db.prepare('delete from cluster_runs where repo_id = ? and id <> ?').run(repoId, keepRunId);
2282
3123
  }
3124
+ summarizeClusterSizes(clusters) {
3125
+ const histogramCounts = new Map();
3126
+ const topClusterSizes = clusters.map((cluster) => cluster.members.length).sort((left, right) => right - left);
3127
+ let soloClusters = 0;
3128
+ for (const cluster of clusters) {
3129
+ const size = cluster.members.length;
3130
+ histogramCounts.set(size, (histogramCounts.get(size) ?? 0) + 1);
3131
+ if (size === 1) {
3132
+ soloClusters += 1;
3133
+ }
3134
+ }
3135
+ return {
3136
+ soloClusters,
3137
+ maxClusterSize: topClusterSizes[0] ?? 0,
3138
+ topClusterSizes: topClusterSizes.slice(0, 50),
3139
+ histogram: Array.from(histogramCounts.entries())
3140
+ .map(([size, count]) => ({ size, count }))
3141
+ .sort((left, right) => left.size - right.size),
3142
+ };
3143
+ }
2283
3144
  upsertSummary(threadId, contentHash, summaryKind, summaryText) {
2284
3145
  this.db
2285
- .prepare(`insert into document_summaries (thread_id, summary_kind, model, content_hash, summary_text, created_at, updated_at)
2286
- values (?, ?, ?, ?, ?, ?, ?)
3146
+ .prepare(`insert into document_summaries (thread_id, summary_kind, model, prompt_version, content_hash, summary_text, created_at, updated_at)
3147
+ values (?, ?, ?, ?, ?, ?, ?, ?)
2287
3148
  on conflict(thread_id, summary_kind, model) do update set
3149
+ prompt_version = excluded.prompt_version,
2288
3150
  content_hash = excluded.content_hash,
2289
3151
  summary_text = excluded.summary_text,
2290
3152
  updated_at = excluded.updated_at`)
2291
- .run(threadId, summaryKind, this.config.summaryModel, contentHash, summaryText, nowIso(), nowIso());
3153
+ .run(threadId, summaryKind, this.config.summaryModel, SUMMARY_PROMPT_VERSION, contentHash, summaryText, nowIso(), nowIso());
3154
+ }
3155
+ upsertActiveVector(repoId, repoFullName, threadId, basis, contentHash, embedding) {
3156
+ this.db
3157
+ .prepare(`insert into thread_vectors (thread_id, basis, model, dimensions, content_hash, vector_json, vector_backend, created_at, updated_at)
3158
+ values (?, ?, ?, ?, ?, ?, ?, ?, ?)
3159
+ on conflict(thread_id) do update set
3160
+ basis = excluded.basis,
3161
+ model = excluded.model,
3162
+ dimensions = excluded.dimensions,
3163
+ content_hash = excluded.content_hash,
3164
+ vector_json = excluded.vector_json,
3165
+ vector_backend = excluded.vector_backend,
3166
+ updated_at = excluded.updated_at`)
3167
+ .run(threadId, basis, this.config.embedModel, embedding.length, contentHash, this.vectorBlob(embedding), this.config.vectorBackend, nowIso(), nowIso());
3168
+ this.vectorStore.upsertVector({
3169
+ storePath: this.repoVectorStorePath(repoFullName),
3170
+ dimensions: ACTIVE_EMBED_DIMENSIONS,
3171
+ threadId,
3172
+ vector: embedding,
3173
+ });
3174
+ }
3175
+ countLegacyEmbeddings(repoId) {
3176
+ const row = this.db
3177
+ .prepare(`select count(*) as count
3178
+ from document_embeddings
3179
+ where thread_id in (select id from threads where repo_id = ?)`)
3180
+ .get(repoId);
3181
+ return row.count;
3182
+ }
3183
+ countInlineJsonThreadVectors(repoId) {
3184
+ const row = this.db
3185
+ .prepare(`select count(*) as count
3186
+ from thread_vectors
3187
+ where thread_id in (select id from threads where repo_id = ?)
3188
+ and typeof(vector_json) = 'text'
3189
+ and vector_json != ''`)
3190
+ .get(repoId);
3191
+ return row.count;
3192
+ }
3193
+ getVectorliteClusterQuery(totalItems, requestedK) {
3194
+ if (totalItems < CLUSTER_PARALLEL_MIN_EMBEDDINGS) {
3195
+ return {
3196
+ limit: requestedK,
3197
+ candidateK: Math.max(requestedK * 16, 64),
3198
+ };
3199
+ }
3200
+ const limit = Math.min(Math.max(requestedK * VECTORLITE_CLUSTER_EXPANDED_MULTIPLIER, VECTORLITE_CLUSTER_EXPANDED_K), Math.max(1, totalItems - 1));
3201
+ const candidateK = Math.min(Math.max(limit * 16, VECTORLITE_CLUSTER_EXPANDED_CANDIDATE_K), Math.max(limit, totalItems - 1));
3202
+ return {
3203
+ limit,
3204
+ candidateK,
3205
+ efSearch: Math.max(candidateK * 2, VECTORLITE_CLUSTER_EXPANDED_EF_SEARCH),
3206
+ };
3207
+ }
3208
+ vectorBlob(values) {
3209
+ return Buffer.from(Float32Array.from(values).buffer);
3210
+ }
3211
+ parseStoredVector(value) {
3212
+ if (typeof value === 'string') {
3213
+ if (!value) {
3214
+ throw new Error('Stored vector payload is empty. Run refresh or embed first.');
3215
+ }
3216
+ return JSON.parse(value);
3217
+ }
3218
+ const floats = new Float32Array(value.buffer, value.byteOffset, Math.floor(value.byteLength / Float32Array.BYTES_PER_ELEMENT));
3219
+ return Array.from(floats);
2292
3220
  }
2293
3221
  upsertEmbedding(threadId, sourceKind, contentHash, embedding) {
2294
3222
  this.db
@@ -2300,10 +3228,6 @@ export class GHCrawlService {
2300
3228
  embedding_json = excluded.embedding_json,
2301
3229
  updated_at = excluded.updated_at`)
2302
3230
  .run(threadId, sourceKind, this.config.embedModel, embedding.length, contentHash, asJson(embedding), nowIso(), nowIso());
2303
- const row = this.db.prepare('select repo_id from threads where id = ? limit 1').get(threadId);
2304
- if (row) {
2305
- this.parsedEmbeddingCache.delete(row.repo_id);
2306
- }
2307
3231
  }
2308
3232
  startRun(table, repoId, scope) {
2309
3233
  const result = this.db