@ghcrawl/api-core 0.7.1 → 0.8.0-beta.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cluster/build.d.ts +30 -0
- package/dist/cluster/build.d.ts.map +1 -1
- package/dist/cluster/build.js +178 -7
- package/dist/cluster/build.js.map +1 -1
- package/dist/cluster/perf.integration.js +186 -20
- package/dist/cluster/perf.integration.js.map +1 -1
- package/dist/config.d.ts +9 -1
- package/dist/config.d.ts.map +1 -1
- package/dist/config.js +29 -2
- package/dist/config.js.map +1 -1
- package/dist/db/migrate.d.ts.map +1 -1
- package/dist/db/migrate.js +37 -0
- package/dist/db/migrate.js.map +1 -1
- package/dist/index.d.ts +2 -0
- package/dist/index.d.ts.map +1 -1
- package/dist/index.js +2 -0
- package/dist/index.js.map +1 -1
- package/dist/openai/provider.d.ts +2 -0
- package/dist/openai/provider.d.ts.map +1 -1
- package/dist/openai/provider.js +15 -1
- package/dist/openai/provider.js.map +1 -1
- package/dist/service.d.ts +99 -1
- package/dist/service.d.ts.map +1 -1
- package/dist/service.js +1035 -109
- package/dist/service.js.map +1 -1
- package/dist/vector/store.d.ts +38 -0
- package/dist/vector/store.d.ts.map +1 -0
- package/dist/vector/store.js +2 -0
- package/dist/vector/store.js.map +1 -0
- package/dist/vector/vectorlite-store.d.ts +34 -0
- package/dist/vector/vectorlite-store.d.ts.map +1 -0
- package/dist/vector/vectorlite-store.js +124 -0
- package/dist/vector/vectorlite-store.js.map +1 -0
- package/package.json +7 -6
package/dist/service.js
CHANGED
|
@@ -1,12 +1,15 @@
|
|
|
1
1
|
import http from 'node:http';
|
|
2
2
|
import crypto from 'node:crypto';
|
|
3
|
+
import fs from 'node:fs';
|
|
3
4
|
import { existsSync } from 'node:fs';
|
|
5
|
+
import { createRequire } from 'node:module';
|
|
4
6
|
import os from 'node:os';
|
|
7
|
+
import path from 'node:path';
|
|
5
8
|
import { fileURLToPath } from 'node:url';
|
|
6
9
|
import { Worker } from 'node:worker_threads';
|
|
7
10
|
import { IterableMapper } from '@shutterstock/p-map-iterable';
|
|
8
11
|
import { actionResponseSchema, authorThreadsResponseSchema, closeResponseSchema, clusterDetailResponseSchema, clusterResultSchema, clusterSummariesResponseSchema, clustersResponseSchema, embedResultSchema, healthResponseSchema, neighborsResponseSchema, refreshResponseSchema, repositoriesResponseSchema, searchResponseSchema, syncResultSchema, threadsResponseSchema, } from '@ghcrawl/api-contract';
|
|
9
|
-
import { buildClusters } from './cluster/build.js';
|
|
12
|
+
import { buildClusters, buildRefinedClusters, buildSizeBoundedClusters } from './cluster/build.js';
|
|
10
13
|
import { buildSourceKindEdges } from './cluster/exact-edges.js';
|
|
11
14
|
import { ensureRuntimeDirs, isLikelyGitHubToken, isLikelyOpenAiApiKey, loadConfig, requireGithubToken, requireOpenAiKey, } from './config.js';
|
|
12
15
|
import { migrate } from './db/migrate.js';
|
|
@@ -14,7 +17,8 @@ import { openDb } from './db/sqlite.js';
|
|
|
14
17
|
import { buildCanonicalDocument, isBotLikeAuthor } from './documents/normalize.js';
|
|
15
18
|
import { makeGitHubClient } from './github/client.js';
|
|
16
19
|
import { OpenAiProvider } from './openai/provider.js';
|
|
17
|
-
import { cosineSimilarity, normalizeEmbedding, rankNearestNeighbors } from './search/exact.js';
|
|
20
|
+
import { cosineSimilarity, dotProduct, normalizeEmbedding, rankNearestNeighbors, rankNearestNeighborsByScore } from './search/exact.js';
|
|
21
|
+
import { VectorliteStore } from './vector/vectorlite-store.js';
|
|
18
22
|
const SYNC_BATCH_SIZE = 100;
|
|
19
23
|
const SYNC_BATCH_DELAY_MS = 5000;
|
|
20
24
|
const STALE_CLOSED_SWEEP_LIMIT = 1000;
|
|
@@ -23,10 +27,31 @@ const CLUSTER_PARALLEL_MIN_EMBEDDINGS = 5000;
|
|
|
23
27
|
const EMBED_ESTIMATED_CHARS_PER_TOKEN = 3;
|
|
24
28
|
const EMBED_MAX_ITEM_TOKENS = 7000;
|
|
25
29
|
const EMBED_MAX_BATCH_TOKENS = 250000;
|
|
30
|
+
const requireFromHere = createRequire(import.meta.url);
|
|
26
31
|
const EMBED_TRUNCATION_MARKER = '\n\n[truncated for embedding]';
|
|
27
32
|
const EMBED_CONTEXT_RETRY_ATTEMPTS = 5;
|
|
28
33
|
const EMBED_CONTEXT_RETRY_FALLBACK_SHRINK_RATIO = 0.9;
|
|
29
34
|
const EMBED_CONTEXT_RETRY_TARGET_BUFFER_RATIO = 0.95;
|
|
35
|
+
const SUMMARY_PROMPT_VERSION = 'v1';
|
|
36
|
+
const ACTIVE_EMBED_DIMENSIONS = 1024;
|
|
37
|
+
const ACTIVE_EMBED_PIPELINE_VERSION = 'vectorlite-1024-v1';
|
|
38
|
+
const DEFAULT_CLUSTER_MIN_SCORE = 0.78;
|
|
39
|
+
const VECTORLITE_CLUSTER_EXPANDED_K = 24;
|
|
40
|
+
const VECTORLITE_CLUSTER_EXPANDED_MULTIPLIER = 4;
|
|
41
|
+
const VECTORLITE_CLUSTER_EXPANDED_CANDIDATE_K = 512;
|
|
42
|
+
const VECTORLITE_CLUSTER_EXPANDED_EF_SEARCH = 1024;
|
|
43
|
+
const SUMMARY_MODEL_PRICING = {
|
|
44
|
+
'gpt-5-mini': {
|
|
45
|
+
inputCostPerM: 0.25,
|
|
46
|
+
cachedInputCostPerM: 0.025,
|
|
47
|
+
outputCostPerM: 2.0,
|
|
48
|
+
},
|
|
49
|
+
'gpt-5.4-mini': {
|
|
50
|
+
inputCostPerM: 0.75,
|
|
51
|
+
cachedInputCostPerM: 0.075,
|
|
52
|
+
outputCostPerM: 4.5,
|
|
53
|
+
},
|
|
54
|
+
};
|
|
30
55
|
function nowIso() {
|
|
31
56
|
return new Date().toISOString();
|
|
32
57
|
}
|
|
@@ -180,6 +205,7 @@ export class GHCrawlService {
|
|
|
180
205
|
db;
|
|
181
206
|
github;
|
|
182
207
|
ai;
|
|
208
|
+
vectorStore;
|
|
183
209
|
constructor(options = {}) {
|
|
184
210
|
this.config = options.config ?? loadConfig();
|
|
185
211
|
ensureRuntimeDirs(this.config);
|
|
@@ -187,8 +213,10 @@ export class GHCrawlService {
|
|
|
187
213
|
migrate(this.db);
|
|
188
214
|
this.github = options.github ?? (this.config.githubToken ? makeGitHubClient({ token: this.config.githubToken }) : undefined);
|
|
189
215
|
this.ai = options.ai ?? (this.config.openaiApiKey ? new OpenAiProvider(this.config.openaiApiKey) : undefined);
|
|
216
|
+
this.vectorStore = options.vectorStore ?? new VectorliteStore();
|
|
190
217
|
}
|
|
191
218
|
close() {
|
|
219
|
+
this.vectorStore.close();
|
|
192
220
|
this.db.close();
|
|
193
221
|
}
|
|
194
222
|
init() {
|
|
@@ -255,7 +283,17 @@ export class GHCrawlService {
|
|
|
255
283
|
}
|
|
256
284
|
}
|
|
257
285
|
}
|
|
258
|
-
|
|
286
|
+
const vectorliteHealth = this.vectorStore.checkRuntime();
|
|
287
|
+
return {
|
|
288
|
+
health,
|
|
289
|
+
github,
|
|
290
|
+
openai,
|
|
291
|
+
vectorlite: {
|
|
292
|
+
configured: this.config.vectorBackend === 'vectorlite',
|
|
293
|
+
runtimeOk: vectorliteHealth.ok,
|
|
294
|
+
error: vectorliteHealth.error,
|
|
295
|
+
},
|
|
296
|
+
};
|
|
259
297
|
}
|
|
260
298
|
listRepositories() {
|
|
261
299
|
const rows = this.db.prepare('select * from repositories order by full_name asc').all();
|
|
@@ -602,31 +640,69 @@ export class GHCrawlService {
|
|
|
602
640
|
});
|
|
603
641
|
const pending = sources.filter((row) => {
|
|
604
642
|
const latest = this.db
|
|
605
|
-
.prepare('select content_hash from document_summaries where thread_id = ? and summary_kind = ? and model = ? limit 1')
|
|
643
|
+
.prepare('select content_hash, prompt_version from document_summaries where thread_id = ? and summary_kind = ? and model = ? limit 1')
|
|
606
644
|
.get(row.id, 'dedupe_summary', this.config.summaryModel);
|
|
607
|
-
return latest?.content_hash !== row.summaryContentHash;
|
|
645
|
+
return latest?.content_hash !== row.summaryContentHash || latest?.prompt_version !== SUMMARY_PROMPT_VERSION;
|
|
608
646
|
});
|
|
609
647
|
params.onProgress?.(`[summarize] pending=${pending.length} skipped=${rows.length - pending.length} model=${this.config.summaryModel}`);
|
|
610
648
|
let summarized = 0;
|
|
611
649
|
let inputTokens = 0;
|
|
612
650
|
let outputTokens = 0;
|
|
613
651
|
let totalTokens = 0;
|
|
614
|
-
|
|
615
|
-
|
|
652
|
+
let cachedInputTokens = 0;
|
|
653
|
+
const startTime = Date.now();
|
|
654
|
+
const pricing = SUMMARY_MODEL_PRICING[this.config.summaryModel] ?? null;
|
|
655
|
+
// Stage 1: concurrent API calls
|
|
656
|
+
const fetcher = new IterableMapper(pending, async (row) => {
|
|
616
657
|
const result = await ai.summarizeThread({
|
|
617
658
|
model: this.config.summaryModel,
|
|
618
659
|
text: row.summaryInput,
|
|
619
660
|
});
|
|
661
|
+
return { row, result };
|
|
662
|
+
}, { concurrency: 5 });
|
|
663
|
+
// Stage 2: sequential DB writes — consumes from fetcher without blocking API completions
|
|
664
|
+
const writer = new IterableMapper(fetcher, async ({ row, result }) => {
|
|
620
665
|
const summary = result.summary;
|
|
621
666
|
this.upsertSummary(row.id, row.summaryContentHash, 'problem_summary', summary.problemSummary);
|
|
622
667
|
this.upsertSummary(row.id, row.summaryContentHash, 'solution_summary', summary.solutionSummary);
|
|
623
668
|
this.upsertSummary(row.id, row.summaryContentHash, 'maintainer_signal_summary', summary.maintainerSignalSummary);
|
|
624
669
|
this.upsertSummary(row.id, row.summaryContentHash, 'dedupe_summary', summary.dedupeSummary);
|
|
625
|
-
|
|
626
|
-
|
|
627
|
-
|
|
628
|
-
|
|
629
|
-
|
|
670
|
+
return { row, usage: result.usage };
|
|
671
|
+
}, { concurrency: 1 });
|
|
672
|
+
let index = 0;
|
|
673
|
+
for await (const { row, usage } of writer) {
|
|
674
|
+
index += 1;
|
|
675
|
+
if (usage) {
|
|
676
|
+
inputTokens += usage.inputTokens;
|
|
677
|
+
outputTokens += usage.outputTokens;
|
|
678
|
+
totalTokens += usage.totalTokens;
|
|
679
|
+
cachedInputTokens += usage.cachedInputTokens;
|
|
680
|
+
}
|
|
681
|
+
// Compute cost and ETA every 10 items or on the last item
|
|
682
|
+
if (index % 10 === 0 || index === pending.length) {
|
|
683
|
+
const remaining = pending.length - index;
|
|
684
|
+
const avgIn = inputTokens / index;
|
|
685
|
+
const avgOut = outputTokens / index;
|
|
686
|
+
const avgCachedIn = cachedInputTokens / index;
|
|
687
|
+
const elapsedSec = (Date.now() - startTime) / 1000;
|
|
688
|
+
const secPerItem = elapsedSec / index;
|
|
689
|
+
const etaSec = remaining * secPerItem;
|
|
690
|
+
const etaMin = Math.round(etaSec / 60);
|
|
691
|
+
const etaStr = etaMin >= 60 ? `${Math.floor(etaMin / 60)}h${etaMin % 60}m` : `${etaMin}m`;
|
|
692
|
+
if (pricing) {
|
|
693
|
+
const uncachedInput = inputTokens - cachedInputTokens;
|
|
694
|
+
const costSoFar = (uncachedInput / 1_000_000) * pricing.inputCostPerM +
|
|
695
|
+
(cachedInputTokens / 1_000_000) * pricing.cachedInputCostPerM +
|
|
696
|
+
(outputTokens / 1_000_000) * pricing.outputCostPerM;
|
|
697
|
+
const estTotalCost = costSoFar +
|
|
698
|
+
((remaining * (avgIn - avgCachedIn)) / 1_000_000) * pricing.inputCostPerM +
|
|
699
|
+
((remaining * avgCachedIn) / 1_000_000) * pricing.cachedInputCostPerM +
|
|
700
|
+
((remaining * avgOut) / 1_000_000) * pricing.outputCostPerM;
|
|
701
|
+
params.onProgress?.(`[summarize] ${index}/${pending.length} thread #${row.number} | cost=$${costSoFar.toFixed(2)} est_total=$${estTotalCost.toFixed(2)} | avg_in=${Math.round(avgIn)} avg_out=${Math.round(avgOut)} | ETA ${etaStr}`);
|
|
702
|
+
}
|
|
703
|
+
else {
|
|
704
|
+
params.onProgress?.(`[summarize] ${index}/${pending.length} thread #${row.number} | avg_in=${Math.round(avgIn)} avg_out=${Math.round(avgOut)} | ETA ${etaStr}`);
|
|
705
|
+
}
|
|
630
706
|
}
|
|
631
707
|
summarized += 1;
|
|
632
708
|
}
|
|
@@ -670,11 +746,25 @@ export class GHCrawlService {
|
|
|
670
746
|
const repository = this.requireRepository(params.owner, params.repo);
|
|
671
747
|
const runId = this.startRun('embedding_runs', repository.id, params.threadNumber ? `thread:${params.threadNumber}` : repository.fullName);
|
|
672
748
|
try {
|
|
673
|
-
|
|
749
|
+
if (params.threadNumber === undefined) {
|
|
750
|
+
if (!this.isRepoVectorStateCurrent(repository.id)) {
|
|
751
|
+
this.resetRepositoryVectors(repository.id, repository.fullName);
|
|
752
|
+
}
|
|
753
|
+
else {
|
|
754
|
+
const pruned = this.pruneInactiveRepositoryVectors(repository.id, repository.fullName);
|
|
755
|
+
if (pruned > 0) {
|
|
756
|
+
params.onProgress?.(`[embed] pruned ${pruned} closed or inactive vector(s) before refresh`);
|
|
757
|
+
}
|
|
758
|
+
}
|
|
759
|
+
}
|
|
760
|
+
const { rows, tasks, pending, missingSummaryThreadNumbers } = this.getEmbeddingWorkset(repository.id, params.threadNumber);
|
|
674
761
|
const skipped = tasks.length - pending.length;
|
|
675
762
|
const truncated = tasks.filter((task) => task.wasTruncated).length;
|
|
676
|
-
|
|
677
|
-
|
|
763
|
+
if (missingSummaryThreadNumbers.length > 0) {
|
|
764
|
+
throw new Error(`Embedding basis ${this.config.embeddingBasis} requires summaries before embedding. Missing summaries for thread(s): ${missingSummaryThreadNumbers.slice(0, 10).join(', ')}${missingSummaryThreadNumbers.length > 10 ? ', …' : ''}.`);
|
|
765
|
+
}
|
|
766
|
+
params.onProgress?.(`[embed] loaded ${rows.length} open thread(s) and ${tasks.length} active vector task(s) for ${repository.fullName}`);
|
|
767
|
+
params.onProgress?.(`[embed] pending=${pending.length} skipped=${skipped} truncated=${truncated} model=${this.config.embedModel} dimensions=${ACTIVE_EMBED_DIMENSIONS} basis=${this.config.embeddingBasis} batch_size=${this.config.embedBatchSize} concurrency=${this.config.embedConcurrency} max_unread=${this.config.embedMaxUnread} max_batch_tokens=${EMBED_MAX_BATCH_TOKENS}`);
|
|
678
768
|
let embedded = 0;
|
|
679
769
|
const batches = this.chunkEmbeddingTasks(pending, this.config.embedBatchSize, EMBED_MAX_BATCH_TOKENS);
|
|
680
770
|
const mapper = new IterableMapper(batches, async (batch) => {
|
|
@@ -686,14 +776,15 @@ export class GHCrawlService {
|
|
|
686
776
|
let completedBatches = 0;
|
|
687
777
|
for await (const batchResult of mapper) {
|
|
688
778
|
completedBatches += 1;
|
|
689
|
-
const numbers = batchResult.map(({ task }) => `#${task.threadNumber}:${task.
|
|
779
|
+
const numbers = batchResult.map(({ task }) => `#${task.threadNumber}:${task.basis}`);
|
|
690
780
|
const estimatedTokens = batchResult.reduce((sum, { task }) => sum + task.estimatedTokens, 0);
|
|
691
781
|
params.onProgress?.(`[embed] batch ${completedBatches}/${Math.max(batches.length, 1)} size=${batchResult.length} est_tokens=${estimatedTokens} items=${numbers.join(',')}`);
|
|
692
782
|
for (const { task, embedding } of batchResult) {
|
|
693
|
-
this.
|
|
783
|
+
this.upsertActiveVector(repository.id, repository.fullName, task.threadId, task.basis, task.contentHash, embedding);
|
|
694
784
|
embedded += 1;
|
|
695
785
|
}
|
|
696
786
|
}
|
|
787
|
+
this.markRepoVectorsCurrent(repository.id);
|
|
697
788
|
this.finishRun('embedding_runs', runId, 'completed', { embedded });
|
|
698
789
|
return embedResultSchema.parse({ runId, embedded });
|
|
699
790
|
}
|
|
@@ -705,16 +796,70 @@ export class GHCrawlService {
|
|
|
705
796
|
async clusterRepository(params) {
|
|
706
797
|
const repository = this.requireRepository(params.owner, params.repo);
|
|
707
798
|
const runId = this.startRun('cluster_runs', repository.id, repository.fullName);
|
|
708
|
-
const minScore = params.minScore ??
|
|
799
|
+
const minScore = params.minScore ?? DEFAULT_CLUSTER_MIN_SCORE;
|
|
709
800
|
const k = params.k ?? 6;
|
|
710
801
|
try {
|
|
711
|
-
|
|
712
|
-
|
|
713
|
-
|
|
714
|
-
|
|
715
|
-
|
|
716
|
-
|
|
717
|
-
|
|
802
|
+
let items;
|
|
803
|
+
let aggregatedEdges;
|
|
804
|
+
if (this.isRepoVectorStateCurrent(repository.id)) {
|
|
805
|
+
const vectorItems = this.loadClusterableActiveVectorMeta(repository.id, repository.fullName);
|
|
806
|
+
const activeIds = new Set(vectorItems.map((item) => item.id));
|
|
807
|
+
const annQuery = this.getVectorliteClusterQuery(vectorItems.length, k);
|
|
808
|
+
aggregatedEdges = new Map();
|
|
809
|
+
let processed = 0;
|
|
810
|
+
let lastProgressAt = Date.now();
|
|
811
|
+
params.onProgress?.(`[cluster] loaded ${vectorItems.length} active vector(s) for ${repository.fullName} backend=${this.config.vectorBackend} k=${k} query_limit=${annQuery.limit} candidateK=${annQuery.candidateK} efSearch=${annQuery.efSearch ?? 'default'} minScore=${minScore}`);
|
|
812
|
+
for (const item of vectorItems) {
|
|
813
|
+
const neighbors = this.vectorStore.queryNearest({
|
|
814
|
+
storePath: this.repoVectorStorePath(repository.fullName),
|
|
815
|
+
dimensions: ACTIVE_EMBED_DIMENSIONS,
|
|
816
|
+
vector: item.embedding,
|
|
817
|
+
limit: annQuery.limit,
|
|
818
|
+
candidateK: annQuery.candidateK + 1,
|
|
819
|
+
efSearch: annQuery.efSearch,
|
|
820
|
+
excludeThreadId: item.id,
|
|
821
|
+
});
|
|
822
|
+
for (const neighbor of neighbors) {
|
|
823
|
+
if (!activeIds.has(neighbor.threadId))
|
|
824
|
+
continue;
|
|
825
|
+
if (neighbor.score < minScore)
|
|
826
|
+
continue;
|
|
827
|
+
const key = this.edgeKey(item.id, neighbor.threadId);
|
|
828
|
+
const existing = aggregatedEdges.get(key);
|
|
829
|
+
if (existing) {
|
|
830
|
+
existing.score = Math.max(existing.score, neighbor.score);
|
|
831
|
+
}
|
|
832
|
+
else {
|
|
833
|
+
aggregatedEdges.set(key, {
|
|
834
|
+
leftThreadId: Math.min(item.id, neighbor.threadId),
|
|
835
|
+
rightThreadId: Math.max(item.id, neighbor.threadId),
|
|
836
|
+
score: neighbor.score,
|
|
837
|
+
sourceKinds: new Set(['dedupe_summary']),
|
|
838
|
+
});
|
|
839
|
+
}
|
|
840
|
+
}
|
|
841
|
+
processed += 1;
|
|
842
|
+
const now = Date.now();
|
|
843
|
+
if (params.onProgress && now - lastProgressAt >= CLUSTER_PROGRESS_INTERVAL_MS) {
|
|
844
|
+
params.onProgress(`[cluster] queried ${processed}/${vectorItems.length} vectors current_edges=${aggregatedEdges.size}`);
|
|
845
|
+
lastProgressAt = now;
|
|
846
|
+
}
|
|
847
|
+
}
|
|
848
|
+
items = vectorItems;
|
|
849
|
+
}
|
|
850
|
+
else if (this.hasLegacyEmbeddings(repository.id)) {
|
|
851
|
+
const legacy = this.loadClusterableThreadMeta(repository.id);
|
|
852
|
+
items = legacy.items;
|
|
853
|
+
params.onProgress?.(`[cluster] loaded ${items.length} legacy embedded thread(s) across ${legacy.sourceKinds.length} source kind(s) for ${repository.fullName} k=${k} minScore=${minScore}`);
|
|
854
|
+
aggregatedEdges = await this.aggregateRepositoryEdges(repository.id, legacy.sourceKinds, {
|
|
855
|
+
limit: k,
|
|
856
|
+
minScore,
|
|
857
|
+
onProgress: params.onProgress,
|
|
858
|
+
});
|
|
859
|
+
}
|
|
860
|
+
else {
|
|
861
|
+
throw new Error(`Vectors for ${repository.fullName} are stale or missing. Run refresh or embed first.`);
|
|
862
|
+
}
|
|
718
863
|
const edges = Array.from(aggregatedEdges.values()).map((entry) => ({
|
|
719
864
|
leftThreadId: entry.leftThreadId,
|
|
720
865
|
rightThreadId: entry.rightThreadId,
|
|
@@ -724,6 +869,10 @@ export class GHCrawlService {
|
|
|
724
869
|
const clusters = buildClusters(items.map((item) => ({ threadId: item.id, number: item.number, title: item.title })), edges);
|
|
725
870
|
this.persistClusterRun(repository.id, runId, aggregatedEdges, clusters);
|
|
726
871
|
this.pruneOldClusterRuns(repository.id, runId);
|
|
872
|
+
if (this.isRepoVectorStateCurrent(repository.id)) {
|
|
873
|
+
this.markRepoClustersCurrent(repository.id);
|
|
874
|
+
this.cleanupMigratedRepositoryArtifacts(repository.id, repository.fullName, params.onProgress);
|
|
875
|
+
}
|
|
727
876
|
params.onProgress?.(`[cluster] persisted ${clusters.length} cluster(s) and pruned older cluster runs`);
|
|
728
877
|
this.finishRun('cluster_runs', runId, 'completed', { edges: edges.length, clusters: clusters.length });
|
|
729
878
|
return clusterResultSchema.parse({ runId, edges: edges.length, clusters: clusters.length });
|
|
@@ -733,6 +882,263 @@ export class GHCrawlService {
|
|
|
733
882
|
throw error;
|
|
734
883
|
}
|
|
735
884
|
}
|
|
885
|
+
clusterExperiment(params) {
|
|
886
|
+
const backend = params.backend ?? 'vectorlite';
|
|
887
|
+
const repository = this.requireRepository(params.owner, params.repo);
|
|
888
|
+
const loaded = this.loadClusterableThreadMeta(repository.id);
|
|
889
|
+
const activeVectors = this.isRepoVectorStateCurrent(repository.id) ? this.loadNormalizedActiveVectors(repository.id) : [];
|
|
890
|
+
const activeSourceKind = this.config.embeddingBasis === 'title_summary' ? 'dedupe_summary' : 'body';
|
|
891
|
+
const useActiveVectors = activeVectors.length > 0 && (params.sourceKinds === undefined || loaded.items.length === 0);
|
|
892
|
+
const sourceKinds = useActiveVectors ? [activeSourceKind] : (params.sourceKinds ?? loaded.sourceKinds);
|
|
893
|
+
const items = useActiveVectors
|
|
894
|
+
? activeVectors.map((item) => ({ id: item.id, number: item.number, title: item.title }))
|
|
895
|
+
: loaded.items;
|
|
896
|
+
const aggregation = params.aggregation ?? 'max';
|
|
897
|
+
const minScore = params.minScore ?? DEFAULT_CLUSTER_MIN_SCORE;
|
|
898
|
+
const k = params.k ?? 6;
|
|
899
|
+
const candidateK = Math.max(k, params.candidateK ?? Math.max(k * 16, 64));
|
|
900
|
+
const efSearch = params.efSearch;
|
|
901
|
+
const startedAt = Date.now();
|
|
902
|
+
const memoryBefore = process.memoryUsage();
|
|
903
|
+
let peakRssBytes = memoryBefore.rss;
|
|
904
|
+
let peakHeapUsedBytes = memoryBefore.heapUsed;
|
|
905
|
+
const recordMemory = () => {
|
|
906
|
+
const usage = process.memoryUsage();
|
|
907
|
+
peakRssBytes = Math.max(peakRssBytes, usage.rss);
|
|
908
|
+
peakHeapUsedBytes = Math.max(peakHeapUsedBytes, usage.heapUsed);
|
|
909
|
+
};
|
|
910
|
+
recordMemory();
|
|
911
|
+
if (useActiveVectors && params.sourceKinds && loaded.items.length === 0) {
|
|
912
|
+
params.onProgress?.(`[cluster-experiment] legacy source embeddings are unavailable for ${repository.fullName}; falling back to active ${this.config.embeddingBasis} vectors`);
|
|
913
|
+
}
|
|
914
|
+
params.onProgress?.(`[cluster-experiment] loaded ${items.length} embedded thread(s) across ${sourceKinds.length} source kind(s) for ${repository.fullName} backend=${backend} k=${k} candidateK=${candidateK} minScore=${minScore} aggregation=${aggregation}`);
|
|
915
|
+
const perSourceScores = new Map();
|
|
916
|
+
let loadMs = 0;
|
|
917
|
+
let setupMs = 0;
|
|
918
|
+
let edgeBuildMs = 0;
|
|
919
|
+
let indexBuildMs = 0;
|
|
920
|
+
let queryMs = 0;
|
|
921
|
+
let clusterBuildMs = 0;
|
|
922
|
+
let tempDbPath = null;
|
|
923
|
+
let tempDb = null;
|
|
924
|
+
let tempDir = null;
|
|
925
|
+
try {
|
|
926
|
+
if (backend === 'exact') {
|
|
927
|
+
if (useActiveVectors) {
|
|
928
|
+
const loadStartedAt = Date.now();
|
|
929
|
+
const normalizedRows = activeVectors.map(({ id, embedding }) => ({ id, normalizedEmbedding: embedding }));
|
|
930
|
+
loadMs += Date.now() - loadStartedAt;
|
|
931
|
+
recordMemory();
|
|
932
|
+
const edgesStartedAt = Date.now();
|
|
933
|
+
const edges = buildSourceKindEdges(normalizedRows, {
|
|
934
|
+
limit: k,
|
|
935
|
+
minScore,
|
|
936
|
+
progressIntervalMs: CLUSTER_PROGRESS_INTERVAL_MS,
|
|
937
|
+
onProgress: (progress) => {
|
|
938
|
+
recordMemory();
|
|
939
|
+
if (!params.onProgress)
|
|
940
|
+
return;
|
|
941
|
+
params.onProgress(`[cluster-experiment] exact ${progress.processedItems}/${normalizedRows.length} active vectors processed current_edges~=${perSourceScores.size + progress.currentEdgeEstimate}`);
|
|
942
|
+
},
|
|
943
|
+
});
|
|
944
|
+
edgeBuildMs += Date.now() - edgesStartedAt;
|
|
945
|
+
this.collectSourceKindScores(perSourceScores, edges, activeSourceKind);
|
|
946
|
+
recordMemory();
|
|
947
|
+
}
|
|
948
|
+
else {
|
|
949
|
+
const totalItems = sourceKinds.reduce((sum, sourceKind) => sum + this.countEmbeddingsForSourceKind(repository.id, sourceKind), 0);
|
|
950
|
+
let processedItems = 0;
|
|
951
|
+
for (const sourceKind of sourceKinds) {
|
|
952
|
+
const loadStartedAt = Date.now();
|
|
953
|
+
const normalizedRows = this.loadNormalizedEmbeddingsForSourceKind(repository.id, sourceKind);
|
|
954
|
+
loadMs += Date.now() - loadStartedAt;
|
|
955
|
+
recordMemory();
|
|
956
|
+
const edgesStartedAt = Date.now();
|
|
957
|
+
const edges = buildSourceKindEdges(normalizedRows, {
|
|
958
|
+
limit: k,
|
|
959
|
+
minScore,
|
|
960
|
+
progressIntervalMs: CLUSTER_PROGRESS_INTERVAL_MS,
|
|
961
|
+
onProgress: (progress) => {
|
|
962
|
+
recordMemory();
|
|
963
|
+
if (!params.onProgress)
|
|
964
|
+
return;
|
|
965
|
+
params.onProgress(`[cluster-experiment] exact ${processedItems + progress.processedItems}/${totalItems} source embeddings processed current_edges~=${perSourceScores.size + progress.currentEdgeEstimate}`);
|
|
966
|
+
},
|
|
967
|
+
});
|
|
968
|
+
edgeBuildMs += Date.now() - edgesStartedAt;
|
|
969
|
+
processedItems += normalizedRows.length;
|
|
970
|
+
this.collectSourceKindScores(perSourceScores, edges, sourceKind);
|
|
971
|
+
recordMemory();
|
|
972
|
+
}
|
|
973
|
+
}
|
|
974
|
+
}
|
|
975
|
+
else {
|
|
976
|
+
const setupStartedAt = Date.now();
|
|
977
|
+
tempDir = fs.mkdtempSync(path.join(os.tmpdir(), 'ghcrawl-vectorlite-'));
|
|
978
|
+
tempDbPath = path.join(tempDir, 'cluster-experiment.db');
|
|
979
|
+
tempDb = openDb(tempDbPath);
|
|
980
|
+
tempDb.pragma('journal_mode = MEMORY');
|
|
981
|
+
tempDb.pragma('synchronous = OFF');
|
|
982
|
+
tempDb.pragma('temp_store = MEMORY');
|
|
983
|
+
const vectorlite = requireFromHere('vectorlite');
|
|
984
|
+
tempDb.loadExtension(vectorlite.vectorlitePath());
|
|
985
|
+
setupMs += Date.now() - setupStartedAt;
|
|
986
|
+
recordMemory();
|
|
987
|
+
const vectorSources = useActiveVectors
|
|
988
|
+
? [
|
|
989
|
+
{
|
|
990
|
+
sourceKind: activeSourceKind,
|
|
991
|
+
rows: activeVectors.map(({ id, embedding }) => ({ id, normalizedEmbedding: embedding })),
|
|
992
|
+
},
|
|
993
|
+
]
|
|
994
|
+
: sourceKinds.map((sourceKind) => ({
|
|
995
|
+
sourceKind,
|
|
996
|
+
rows: this.loadNormalizedEmbeddingsForSourceKind(repository.id, sourceKind).map((row) => ({
|
|
997
|
+
id: row.id,
|
|
998
|
+
normalizedEmbedding: row.normalizedEmbedding,
|
|
999
|
+
})),
|
|
1000
|
+
}));
|
|
1001
|
+
for (const source of vectorSources) {
|
|
1002
|
+
const sourceRowCount = source.rows.length;
|
|
1003
|
+
if (sourceRowCount === 0) {
|
|
1004
|
+
continue;
|
|
1005
|
+
}
|
|
1006
|
+
const dimension = source.rows[0].normalizedEmbedding.length;
|
|
1007
|
+
const safeCandidateK = Math.min(candidateK, Math.max(1, sourceRowCount - 1));
|
|
1008
|
+
const tableName = `vector_${source.sourceKind}`;
|
|
1009
|
+
params.onProgress?.(`[cluster-experiment] building ${source.sourceKind} HNSW index with ${sourceRowCount} vector(s)`);
|
|
1010
|
+
const indexStartedAt = Date.now();
|
|
1011
|
+
tempDb.exec(`create virtual table ${tableName} using vectorlite(vec float32[${dimension}], hnsw(max_elements=${sourceRowCount}));`);
|
|
1012
|
+
const insert = tempDb.prepare(`insert into ${tableName}(rowid, vec) values (?, ?)`);
|
|
1013
|
+
tempDb.transaction(() => {
|
|
1014
|
+
const loadStartedAt = Date.now();
|
|
1015
|
+
for (const row of source.rows) {
|
|
1016
|
+
insert.run(row.id, this.normalizedEmbeddingBuffer(row.normalizedEmbedding));
|
|
1017
|
+
}
|
|
1018
|
+
loadMs += Date.now() - loadStartedAt;
|
|
1019
|
+
})();
|
|
1020
|
+
indexBuildMs += Date.now() - indexStartedAt;
|
|
1021
|
+
recordMemory();
|
|
1022
|
+
const queryStartedAt = Date.now();
|
|
1023
|
+
const querySql = efSearch !== undefined
|
|
1024
|
+
? `select rowid, distance from ${tableName} where knn_search(vec, knn_param(?, ${safeCandidateK + 1}, ${efSearch}))`
|
|
1025
|
+
: `select rowid, distance from ${tableName} where knn_search(vec, knn_param(?, ${safeCandidateK + 1}))`;
|
|
1026
|
+
const query = tempDb.prepare(querySql);
|
|
1027
|
+
let processed = 0;
|
|
1028
|
+
let lastProgressAt = Date.now();
|
|
1029
|
+
const queryLoadStartedAt = Date.now();
|
|
1030
|
+
for (const row of source.rows) {
|
|
1031
|
+
const candidates = query.all(this.normalizedEmbeddingBuffer(row.normalizedEmbedding));
|
|
1032
|
+
const ranked = rankNearestNeighborsByScore(candidates, {
|
|
1033
|
+
limit: k,
|
|
1034
|
+
minScore,
|
|
1035
|
+
score: (candidate) => {
|
|
1036
|
+
if (candidate.rowid === row.id) {
|
|
1037
|
+
return -1;
|
|
1038
|
+
}
|
|
1039
|
+
return this.normalizedDistanceToScore(candidate.distance);
|
|
1040
|
+
},
|
|
1041
|
+
});
|
|
1042
|
+
let addedThisRow = 0;
|
|
1043
|
+
for (const candidate of ranked) {
|
|
1044
|
+
const score = candidate.score;
|
|
1045
|
+
const key = this.edgeKey(row.id, candidate.item.rowid);
|
|
1046
|
+
const existing = perSourceScores.get(key);
|
|
1047
|
+
if (existing) {
|
|
1048
|
+
existing.scores.set(source.sourceKind, Math.max(existing.scores.get(source.sourceKind) ?? -1, score));
|
|
1049
|
+
continue;
|
|
1050
|
+
}
|
|
1051
|
+
const scores = new Map();
|
|
1052
|
+
scores.set(source.sourceKind, score);
|
|
1053
|
+
perSourceScores.set(key, {
|
|
1054
|
+
leftThreadId: Math.min(row.id, candidate.item.rowid),
|
|
1055
|
+
rightThreadId: Math.max(row.id, candidate.item.rowid),
|
|
1056
|
+
scores,
|
|
1057
|
+
});
|
|
1058
|
+
addedThisRow += 1;
|
|
1059
|
+
}
|
|
1060
|
+
processed += 1;
|
|
1061
|
+
const now = Date.now();
|
|
1062
|
+
if (params.onProgress && now - lastProgressAt >= CLUSTER_PROGRESS_INTERVAL_MS) {
|
|
1063
|
+
recordMemory();
|
|
1064
|
+
params.onProgress(`[cluster-experiment] querying ${source.sourceKind} index ${processed}/${sourceRowCount} current_edges=${perSourceScores.size} added_this_step=${addedThisRow}`);
|
|
1065
|
+
lastProgressAt = now;
|
|
1066
|
+
}
|
|
1067
|
+
}
|
|
1068
|
+
loadMs += Date.now() - queryLoadStartedAt;
|
|
1069
|
+
queryMs += Date.now() - queryStartedAt;
|
|
1070
|
+
tempDb.exec(`drop table ${tableName}`);
|
|
1071
|
+
recordMemory();
|
|
1072
|
+
}
|
|
1073
|
+
}
|
|
1074
|
+
// Finalize edge scores using the configured aggregation method
|
|
1075
|
+
const defaultWeights = { dedupe_summary: 0.5, title: 0.3, body: 0.2 };
|
|
1076
|
+
const weights = { ...defaultWeights, ...(params.aggregationWeights ?? {}) };
|
|
1077
|
+
const aggregated = this.finalizeEdgeScores(perSourceScores, aggregation, weights, minScore);
|
|
1078
|
+
params.onProgress?.(`[cluster-experiment] finalized ${aggregated.length} edges from ${perSourceScores.size} candidate pairs using ${aggregation} aggregation`);
|
|
1079
|
+
const clusterStartedAt = Date.now();
|
|
1080
|
+
const clusterNodes = items.map((item) => ({ threadId: item.id, number: item.number, title: item.title }));
|
|
1081
|
+
const clusterEdges = aggregated;
|
|
1082
|
+
const clusterMode = params.clusterMode ?? (params.maxClusterSize !== undefined ? 'refine' : 'basic');
|
|
1083
|
+
const clusters = clusterMode === 'bounded'
|
|
1084
|
+
? buildSizeBoundedClusters(clusterNodes, clusterEdges, {
|
|
1085
|
+
maxClusterSize: params.maxClusterSize ?? 200,
|
|
1086
|
+
})
|
|
1087
|
+
: clusterMode === 'refine'
|
|
1088
|
+
? buildRefinedClusters(clusterNodes, clusterEdges, {
|
|
1089
|
+
maxClusterSize: params.maxClusterSize ?? 200,
|
|
1090
|
+
refineStep: params.refineStep ?? 0.02,
|
|
1091
|
+
})
|
|
1092
|
+
: buildClusters(clusterNodes, clusterEdges);
|
|
1093
|
+
clusterBuildMs += Date.now() - clusterStartedAt;
|
|
1094
|
+
recordMemory();
|
|
1095
|
+
const memoryAfter = process.memoryUsage();
|
|
1096
|
+
const durationMs = backend === 'vectorlite'
|
|
1097
|
+
? indexBuildMs + queryMs + clusterBuildMs
|
|
1098
|
+
: edgeBuildMs + clusterBuildMs;
|
|
1099
|
+
const totalDurationMs = Date.now() - startedAt;
|
|
1100
|
+
return {
|
|
1101
|
+
backend,
|
|
1102
|
+
repository,
|
|
1103
|
+
tempDbPath,
|
|
1104
|
+
threads: items.length,
|
|
1105
|
+
sourceKinds: sourceKinds.length,
|
|
1106
|
+
edges: aggregated.length,
|
|
1107
|
+
clusters: clusters.length,
|
|
1108
|
+
timingBasis: 'cluster-only',
|
|
1109
|
+
durationMs,
|
|
1110
|
+
totalDurationMs,
|
|
1111
|
+
loadMs,
|
|
1112
|
+
setupMs,
|
|
1113
|
+
edgeBuildMs,
|
|
1114
|
+
indexBuildMs,
|
|
1115
|
+
queryMs,
|
|
1116
|
+
clusterBuildMs,
|
|
1117
|
+
candidateK,
|
|
1118
|
+
memory: {
|
|
1119
|
+
rssBeforeBytes: memoryBefore.rss,
|
|
1120
|
+
rssAfterBytes: memoryAfter.rss,
|
|
1121
|
+
peakRssBytes,
|
|
1122
|
+
heapUsedBeforeBytes: memoryBefore.heapUsed,
|
|
1123
|
+
heapUsedAfterBytes: memoryAfter.heapUsed,
|
|
1124
|
+
peakHeapUsedBytes,
|
|
1125
|
+
},
|
|
1126
|
+
clusterSizes: this.summarizeClusterSizes(clusters),
|
|
1127
|
+
clustersDetail: params.includeClusters
|
|
1128
|
+
? clusters.map((cluster) => ({
|
|
1129
|
+
representativeThreadId: cluster.representativeThreadId,
|
|
1130
|
+
memberThreadIds: [...cluster.members],
|
|
1131
|
+
}))
|
|
1132
|
+
: null,
|
|
1133
|
+
};
|
|
1134
|
+
}
|
|
1135
|
+
finally {
|
|
1136
|
+
tempDb?.close();
|
|
1137
|
+
if (tempDir) {
|
|
1138
|
+
fs.rmSync(tempDir, { recursive: true, force: true });
|
|
1139
|
+
}
|
|
1140
|
+
}
|
|
1141
|
+
}
|
|
736
1142
|
async searchRepository(params) {
|
|
737
1143
|
const mode = params.mode ?? 'hybrid';
|
|
738
1144
|
const repository = this.requireRepository(params.owner, params.repo);
|
|
@@ -754,12 +1160,33 @@ export class GHCrawlService {
|
|
|
754
1160
|
}
|
|
755
1161
|
}
|
|
756
1162
|
if (mode !== 'keyword' && this.ai) {
|
|
757
|
-
|
|
758
|
-
|
|
759
|
-
|
|
760
|
-
|
|
761
|
-
|
|
762
|
-
|
|
1163
|
+
if (this.isRepoVectorStateCurrent(repository.id)) {
|
|
1164
|
+
const [queryEmbedding] = await this.ai.embedTexts({
|
|
1165
|
+
model: this.config.embedModel,
|
|
1166
|
+
texts: [params.query],
|
|
1167
|
+
dimensions: ACTIVE_EMBED_DIMENSIONS,
|
|
1168
|
+
});
|
|
1169
|
+
const neighbors = this.vectorStore.queryNearest({
|
|
1170
|
+
storePath: this.repoVectorStorePath(repository.fullName),
|
|
1171
|
+
dimensions: ACTIVE_EMBED_DIMENSIONS,
|
|
1172
|
+
vector: queryEmbedding,
|
|
1173
|
+
limit: limit * 2,
|
|
1174
|
+
candidateK: Math.max(limit * 8, 64),
|
|
1175
|
+
});
|
|
1176
|
+
for (const neighbor of neighbors) {
|
|
1177
|
+
if (neighbor.score < 0.2)
|
|
1178
|
+
continue;
|
|
1179
|
+
semanticScores.set(neighbor.threadId, Math.max(semanticScores.get(neighbor.threadId) ?? -1, neighbor.score));
|
|
1180
|
+
}
|
|
1181
|
+
}
|
|
1182
|
+
else if (this.hasLegacyEmbeddings(repository.id)) {
|
|
1183
|
+
const [queryEmbedding] = await this.ai.embedTexts({ model: this.config.embedModel, texts: [params.query] });
|
|
1184
|
+
for (const row of this.iterateStoredEmbeddings(repository.id)) {
|
|
1185
|
+
const score = cosineSimilarity(queryEmbedding, JSON.parse(row.embedding_json));
|
|
1186
|
+
if (score < 0.2)
|
|
1187
|
+
continue;
|
|
1188
|
+
semanticScores.set(row.id, Math.max(semanticScores.get(row.id) ?? -1, score));
|
|
1189
|
+
}
|
|
763
1190
|
}
|
|
764
1191
|
}
|
|
765
1192
|
const candidateIds = new Set([...keywordScores.keys(), ...semanticScores.keys()]);
|
|
@@ -827,43 +1254,97 @@ export class GHCrawlService {
|
|
|
827
1254
|
const repository = this.requireRepository(params.owner, params.repo);
|
|
828
1255
|
const limit = params.limit ?? 10;
|
|
829
1256
|
const minScore = params.minScore ?? 0.2;
|
|
830
|
-
const
|
|
831
|
-
|
|
832
|
-
|
|
833
|
-
|
|
834
|
-
|
|
835
|
-
|
|
836
|
-
|
|
837
|
-
|
|
1257
|
+
const targetRow = this.db
|
|
1258
|
+
.prepare(`select t.*, tv.basis, tv.model, tv.dimensions, tv.content_hash, tv.vector_json, tv.vector_backend
|
|
1259
|
+
from threads t
|
|
1260
|
+
join thread_vectors tv on tv.thread_id = t.id
|
|
1261
|
+
where t.repo_id = ?
|
|
1262
|
+
and t.number = ?
|
|
1263
|
+
and t.state = 'open'
|
|
1264
|
+
and t.closed_at_local is null
|
|
1265
|
+
and tv.model = ?
|
|
1266
|
+
and tv.basis = ?
|
|
1267
|
+
and tv.dimensions = ?
|
|
1268
|
+
limit 1`)
|
|
1269
|
+
.get(repository.id, params.threadNumber, this.config.embedModel, this.config.embeddingBasis, ACTIVE_EMBED_DIMENSIONS);
|
|
1270
|
+
let responseThread;
|
|
1271
|
+
let neighbors;
|
|
1272
|
+
if (targetRow) {
|
|
1273
|
+
responseThread = targetRow;
|
|
1274
|
+
const candidateRows = this.vectorStore
|
|
1275
|
+
.queryNearest({
|
|
1276
|
+
storePath: this.repoVectorStorePath(repository.fullName),
|
|
1277
|
+
dimensions: ACTIVE_EMBED_DIMENSIONS,
|
|
1278
|
+
vector: this.parseStoredVector(targetRow.vector_json),
|
|
1279
|
+
limit: limit * 2,
|
|
1280
|
+
candidateK: Math.max(limit * 8, 64),
|
|
1281
|
+
excludeThreadId: targetRow.id,
|
|
1282
|
+
})
|
|
1283
|
+
.filter((row) => row.score >= minScore);
|
|
1284
|
+
const candidateIds = candidateRows.map((row) => row.threadId);
|
|
1285
|
+
const neighborMeta = candidateIds.length
|
|
1286
|
+
? this.db
|
|
1287
|
+
.prepare(`select * from threads
|
|
1288
|
+
where repo_id = ? and state = 'open' and closed_at_local is null and id in (${candidateIds.map(() => '?').join(',')})`)
|
|
1289
|
+
.all(repository.id, ...candidateIds)
|
|
1290
|
+
: [];
|
|
1291
|
+
const metaById = new Map(neighborMeta.map((row) => [row.id, row]));
|
|
1292
|
+
neighbors = candidateRows
|
|
1293
|
+
.map((row) => {
|
|
1294
|
+
const meta = metaById.get(row.threadId);
|
|
1295
|
+
if (!meta) {
|
|
1296
|
+
return null;
|
|
1297
|
+
}
|
|
1298
|
+
return {
|
|
1299
|
+
threadId: row.threadId,
|
|
1300
|
+
number: meta.number,
|
|
1301
|
+
kind: meta.kind,
|
|
1302
|
+
title: meta.title,
|
|
1303
|
+
score: row.score,
|
|
1304
|
+
};
|
|
1305
|
+
})
|
|
1306
|
+
.filter((row) => row !== null)
|
|
1307
|
+
.slice(0, limit);
|
|
838
1308
|
}
|
|
839
|
-
|
|
840
|
-
|
|
841
|
-
if (
|
|
842
|
-
|
|
843
|
-
|
|
844
|
-
|
|
845
|
-
|
|
846
|
-
const
|
|
847
|
-
|
|
848
|
-
|
|
849
|
-
const
|
|
850
|
-
|
|
851
|
-
|
|
1309
|
+
else {
|
|
1310
|
+
const targetRows = this.loadStoredEmbeddingsForThreadNumber(repository.id, params.threadNumber);
|
|
1311
|
+
if (targetRows.length === 0) {
|
|
1312
|
+
throw new Error(`Thread #${params.threadNumber} for ${repository.fullName} was not found with an embedding. Run embed first.`);
|
|
1313
|
+
}
|
|
1314
|
+
responseThread = targetRows[0];
|
|
1315
|
+
const targetBySource = new Map();
|
|
1316
|
+
for (const row of targetRows) {
|
|
1317
|
+
targetBySource.set(row.source_kind, JSON.parse(row.embedding_json));
|
|
1318
|
+
}
|
|
1319
|
+
const aggregated = new Map();
|
|
1320
|
+
for (const row of this.iterateStoredEmbeddings(repository.id)) {
|
|
1321
|
+
if (row.id === responseThread.id)
|
|
1322
|
+
continue;
|
|
1323
|
+
const targetEmbedding = targetBySource.get(row.source_kind);
|
|
1324
|
+
if (!targetEmbedding)
|
|
1325
|
+
continue;
|
|
1326
|
+
const score = cosineSimilarity(targetEmbedding, JSON.parse(row.embedding_json));
|
|
1327
|
+
if (score < minScore)
|
|
1328
|
+
continue;
|
|
1329
|
+
const previous = aggregated.get(row.id);
|
|
1330
|
+
if (!previous || score > previous.score) {
|
|
1331
|
+
aggregated.set(row.id, { number: row.number, kind: row.kind, title: row.title, score });
|
|
1332
|
+
}
|
|
852
1333
|
}
|
|
1334
|
+
neighbors = Array.from(aggregated.entries())
|
|
1335
|
+
.map(([threadId, value]) => ({
|
|
1336
|
+
threadId,
|
|
1337
|
+
number: value.number,
|
|
1338
|
+
kind: value.kind,
|
|
1339
|
+
title: value.title,
|
|
1340
|
+
score: value.score,
|
|
1341
|
+
}))
|
|
1342
|
+
.sort((left, right) => right.score - left.score)
|
|
1343
|
+
.slice(0, limit);
|
|
853
1344
|
}
|
|
854
|
-
const neighbors = Array.from(aggregated.entries())
|
|
855
|
-
.map(([threadId, value]) => ({
|
|
856
|
-
threadId,
|
|
857
|
-
number: value.number,
|
|
858
|
-
kind: value.kind,
|
|
859
|
-
title: value.title,
|
|
860
|
-
score: value.score,
|
|
861
|
-
}))
|
|
862
|
-
.sort((left, right) => right.score - left.score)
|
|
863
|
-
.slice(0, limit);
|
|
864
1345
|
return neighborsResponseSchema.parse({
|
|
865
1346
|
repository,
|
|
866
|
-
thread: threadToDto(
|
|
1347
|
+
thread: threadToDto(responseThread),
|
|
867
1348
|
neighbors,
|
|
868
1349
|
});
|
|
869
1350
|
}
|
|
@@ -940,6 +1421,14 @@ export class GHCrawlService {
|
|
|
940
1421
|
onProgress: params.onProgress,
|
|
941
1422
|
});
|
|
942
1423
|
}
|
|
1424
|
+
if (selected.embed && this.config.embeddingBasis === 'title_summary') {
|
|
1425
|
+
params.onProgress?.(`[refresh] embedding basis ${this.config.embeddingBasis} requires summaries; running summarize before embed`);
|
|
1426
|
+
await this.summarizeRepository({
|
|
1427
|
+
owner: params.owner,
|
|
1428
|
+
repo: params.repo,
|
|
1429
|
+
onProgress: params.onProgress,
|
|
1430
|
+
});
|
|
1431
|
+
}
|
|
943
1432
|
if (selected.embed) {
|
|
944
1433
|
embed = await this.embedRepository({
|
|
945
1434
|
owner: params.owner,
|
|
@@ -1144,9 +1633,9 @@ export class GHCrawlService {
|
|
|
1144
1633
|
const summaryRows = this.db
|
|
1145
1634
|
.prepare(`select summary_kind, summary_text
|
|
1146
1635
|
from document_summaries
|
|
1147
|
-
where thread_id = ? and model = ?
|
|
1636
|
+
where thread_id = ? and model = ? and prompt_version = ?
|
|
1148
1637
|
order by summary_kind asc`)
|
|
1149
|
-
.all(row.id, this.config.summaryModel);
|
|
1638
|
+
.all(row.id, this.config.summaryModel, SUMMARY_PROMPT_VERSION);
|
|
1150
1639
|
const summaries = {};
|
|
1151
1640
|
for (const summary of summaryRows) {
|
|
1152
1641
|
if (summary.summary_kind === 'problem_summary' ||
|
|
@@ -1308,7 +1797,178 @@ export class GHCrawlService {
|
|
|
1308
1797
|
latestClusterRunFinishedAt: latestRun?.finished_at ?? null,
|
|
1309
1798
|
};
|
|
1310
1799
|
}
|
|
1800
|
+
getDesiredPipelineState() {
|
|
1801
|
+
return {
|
|
1802
|
+
summary_model: this.config.summaryModel,
|
|
1803
|
+
summary_prompt_version: SUMMARY_PROMPT_VERSION,
|
|
1804
|
+
embedding_basis: this.config.embeddingBasis,
|
|
1805
|
+
embed_model: this.config.embedModel,
|
|
1806
|
+
embed_dimensions: ACTIVE_EMBED_DIMENSIONS,
|
|
1807
|
+
embed_pipeline_version: ACTIVE_EMBED_PIPELINE_VERSION,
|
|
1808
|
+
vector_backend: this.config.vectorBackend,
|
|
1809
|
+
};
|
|
1810
|
+
}
|
|
1811
|
+
getRepoPipelineState(repoId) {
|
|
1812
|
+
return (this.db.prepare('select * from repo_pipeline_state where repo_id = ? limit 1').get(repoId) ??
|
|
1813
|
+
null);
|
|
1814
|
+
}
|
|
1815
|
+
isRepoVectorStateCurrent(repoId) {
|
|
1816
|
+
const state = this.getRepoPipelineState(repoId);
|
|
1817
|
+
if (!state || !state.vectors_current_at) {
|
|
1818
|
+
return false;
|
|
1819
|
+
}
|
|
1820
|
+
const desired = this.getDesiredPipelineState();
|
|
1821
|
+
return (state.summary_model === desired.summary_model &&
|
|
1822
|
+
state.summary_prompt_version === desired.summary_prompt_version &&
|
|
1823
|
+
state.embedding_basis === desired.embedding_basis &&
|
|
1824
|
+
state.embed_model === desired.embed_model &&
|
|
1825
|
+
state.embed_dimensions === desired.embed_dimensions &&
|
|
1826
|
+
state.embed_pipeline_version === desired.embed_pipeline_version &&
|
|
1827
|
+
state.vector_backend === desired.vector_backend);
|
|
1828
|
+
}
|
|
1829
|
+
isRepoClusterStateCurrent(repoId) {
|
|
1830
|
+
const state = this.getRepoPipelineState(repoId);
|
|
1831
|
+
return this.isRepoVectorStateCurrent(repoId) && Boolean(state?.clusters_current_at);
|
|
1832
|
+
}
|
|
1833
|
+
hasLegacyEmbeddings(repoId) {
|
|
1834
|
+
const row = this.db
|
|
1835
|
+
.prepare(`select count(*) as count
|
|
1836
|
+
from document_embeddings e
|
|
1837
|
+
join threads t on t.id = e.thread_id
|
|
1838
|
+
where t.repo_id = ?
|
|
1839
|
+
and t.state = 'open'
|
|
1840
|
+
and t.closed_at_local is null
|
|
1841
|
+
and e.model = ?`)
|
|
1842
|
+
.get(repoId, this.config.embedModel);
|
|
1843
|
+
return row.count > 0;
|
|
1844
|
+
}
|
|
1845
|
+
writeRepoPipelineState(repoId, overrides) {
|
|
1846
|
+
const desired = this.getDesiredPipelineState();
|
|
1847
|
+
const current = this.getRepoPipelineState(repoId);
|
|
1848
|
+
this.db
|
|
1849
|
+
.prepare(`insert into repo_pipeline_state (
|
|
1850
|
+
repo_id,
|
|
1851
|
+
summary_model,
|
|
1852
|
+
summary_prompt_version,
|
|
1853
|
+
embedding_basis,
|
|
1854
|
+
embed_model,
|
|
1855
|
+
embed_dimensions,
|
|
1856
|
+
embed_pipeline_version,
|
|
1857
|
+
vector_backend,
|
|
1858
|
+
vectors_current_at,
|
|
1859
|
+
clusters_current_at,
|
|
1860
|
+
updated_at
|
|
1861
|
+
) values (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
|
|
1862
|
+
on conflict(repo_id) do update set
|
|
1863
|
+
summary_model = excluded.summary_model,
|
|
1864
|
+
summary_prompt_version = excluded.summary_prompt_version,
|
|
1865
|
+
embedding_basis = excluded.embedding_basis,
|
|
1866
|
+
embed_model = excluded.embed_model,
|
|
1867
|
+
embed_dimensions = excluded.embed_dimensions,
|
|
1868
|
+
embed_pipeline_version = excluded.embed_pipeline_version,
|
|
1869
|
+
vector_backend = excluded.vector_backend,
|
|
1870
|
+
vectors_current_at = excluded.vectors_current_at,
|
|
1871
|
+
clusters_current_at = excluded.clusters_current_at,
|
|
1872
|
+
updated_at = excluded.updated_at`)
|
|
1873
|
+
.run(repoId, desired.summary_model, desired.summary_prompt_version, desired.embedding_basis, desired.embed_model, desired.embed_dimensions, desired.embed_pipeline_version, desired.vector_backend, overrides.vectors_current_at ?? current?.vectors_current_at ?? null, overrides.clusters_current_at ?? current?.clusters_current_at ?? null, nowIso());
|
|
1874
|
+
}
|
|
1875
|
+
markRepoVectorsCurrent(repoId) {
|
|
1876
|
+
this.writeRepoPipelineState(repoId, {
|
|
1877
|
+
vectors_current_at: nowIso(),
|
|
1878
|
+
clusters_current_at: null,
|
|
1879
|
+
});
|
|
1880
|
+
}
|
|
1881
|
+
markRepoClustersCurrent(repoId) {
|
|
1882
|
+
const state = this.getRepoPipelineState(repoId);
|
|
1883
|
+
this.writeRepoPipelineState(repoId, {
|
|
1884
|
+
vectors_current_at: state?.vectors_current_at ?? nowIso(),
|
|
1885
|
+
clusters_current_at: nowIso(),
|
|
1886
|
+
});
|
|
1887
|
+
}
|
|
1888
|
+
repoVectorStorePath(repoFullName) {
|
|
1889
|
+
const safeName = repoFullName.replace(/[^a-zA-Z0-9._-]+/g, '__');
|
|
1890
|
+
return path.join(this.config.configDir, 'vectors', `${safeName}.sqlite`);
|
|
1891
|
+
}
|
|
1892
|
+
resetRepositoryVectors(repoId, repoFullName) {
|
|
1893
|
+
this.db
|
|
1894
|
+
.prepare(`delete from thread_vectors
|
|
1895
|
+
where thread_id in (select id from threads where repo_id = ?)`)
|
|
1896
|
+
.run(repoId);
|
|
1897
|
+
this.vectorStore.resetRepository({
|
|
1898
|
+
storePath: this.repoVectorStorePath(repoFullName),
|
|
1899
|
+
dimensions: ACTIVE_EMBED_DIMENSIONS,
|
|
1900
|
+
});
|
|
1901
|
+
this.writeRepoPipelineState(repoId, {
|
|
1902
|
+
vectors_current_at: null,
|
|
1903
|
+
clusters_current_at: null,
|
|
1904
|
+
});
|
|
1905
|
+
}
|
|
1906
|
+
pruneInactiveRepositoryVectors(repoId, repoFullName) {
|
|
1907
|
+
const rows = this.db
|
|
1908
|
+
.prepare(`select tv.thread_id
|
|
1909
|
+
from thread_vectors tv
|
|
1910
|
+
join threads t on t.id = tv.thread_id
|
|
1911
|
+
where t.repo_id = ?
|
|
1912
|
+
and (t.state != 'open' or t.closed_at_local is not null)`)
|
|
1913
|
+
.all(repoId);
|
|
1914
|
+
if (rows.length === 0) {
|
|
1915
|
+
return 0;
|
|
1916
|
+
}
|
|
1917
|
+
const deleteVectorRow = this.db.prepare('delete from thread_vectors where thread_id = ?');
|
|
1918
|
+
this.db.transaction(() => {
|
|
1919
|
+
for (const row of rows) {
|
|
1920
|
+
deleteVectorRow.run(row.thread_id);
|
|
1921
|
+
this.vectorStore.deleteVector({
|
|
1922
|
+
storePath: this.repoVectorStorePath(repoFullName),
|
|
1923
|
+
dimensions: ACTIVE_EMBED_DIMENSIONS,
|
|
1924
|
+
threadId: row.thread_id,
|
|
1925
|
+
});
|
|
1926
|
+
}
|
|
1927
|
+
})();
|
|
1928
|
+
return rows.length;
|
|
1929
|
+
}
|
|
1930
|
+
cleanupMigratedRepositoryArtifacts(repoId, repoFullName, onProgress) {
|
|
1931
|
+
const legacyEmbeddingCount = this.countLegacyEmbeddings(repoId);
|
|
1932
|
+
const inlineJsonVectorCount = this.countInlineJsonThreadVectors(repoId);
|
|
1933
|
+
if (legacyEmbeddingCount === 0 && inlineJsonVectorCount === 0) {
|
|
1934
|
+
return;
|
|
1935
|
+
}
|
|
1936
|
+
if (legacyEmbeddingCount > 0) {
|
|
1937
|
+
this.db
|
|
1938
|
+
.prepare(`delete from document_embeddings
|
|
1939
|
+
where thread_id in (select id from threads where repo_id = ?)`)
|
|
1940
|
+
.run(repoId);
|
|
1941
|
+
onProgress?.(`[cleanup] removed ${legacyEmbeddingCount} legacy document embedding row(s) after vector migration`);
|
|
1942
|
+
}
|
|
1943
|
+
if (inlineJsonVectorCount > 0) {
|
|
1944
|
+
const rows = this.db
|
|
1945
|
+
.prepare(`select tv.thread_id, tv.vector_json
|
|
1946
|
+
from thread_vectors tv
|
|
1947
|
+
join threads t on t.id = tv.thread_id
|
|
1948
|
+
where t.repo_id = ?
|
|
1949
|
+
and typeof(tv.vector_json) = 'text'
|
|
1950
|
+
and tv.vector_json != ''`)
|
|
1951
|
+
.all(repoId);
|
|
1952
|
+
const update = this.db.prepare('update thread_vectors set vector_json = ?, updated_at = ? where thread_id = ?');
|
|
1953
|
+
this.db.transaction(() => {
|
|
1954
|
+
for (const row of rows) {
|
|
1955
|
+
update.run(this.vectorBlob(JSON.parse(row.vector_json)), nowIso(), row.thread_id);
|
|
1956
|
+
}
|
|
1957
|
+
})();
|
|
1958
|
+
onProgress?.(`[cleanup] compacted ${inlineJsonVectorCount} inline SQLite vector payload(s) from JSON to binary blobs`);
|
|
1959
|
+
}
|
|
1960
|
+
if (this.config.dbPath !== ':memory:') {
|
|
1961
|
+
onProgress?.(`[cleanup] checkpointing WAL and vacuuming ${repoFullName} migration changes`);
|
|
1962
|
+
this.db.pragma('wal_checkpoint(TRUNCATE)');
|
|
1963
|
+
this.db.exec('VACUUM');
|
|
1964
|
+
this.db.pragma('wal_checkpoint(TRUNCATE)');
|
|
1965
|
+
}
|
|
1966
|
+
}
|
|
1311
1967
|
getLatestClusterRun(repoId) {
|
|
1968
|
+
const state = this.getRepoPipelineState(repoId);
|
|
1969
|
+
if (state && !this.isRepoClusterStateCurrent(repoId)) {
|
|
1970
|
+
return null;
|
|
1971
|
+
}
|
|
1312
1972
|
return (this.db
|
|
1313
1973
|
.prepare("select id, finished_at from cluster_runs where repo_id = ? and status = 'completed' order by id desc limit 1")
|
|
1314
1974
|
.get(repoId) ?? null);
|
|
@@ -1778,7 +2438,7 @@ export class GHCrawlService {
|
|
|
1778
2438
|
}
|
|
1779
2439
|
}
|
|
1780
2440
|
const summaryInput = parts.join('\n\n');
|
|
1781
|
-
const summaryContentHash = stableContentHash(`summary:${includeComments ? 'with-comments' : 'metadata-only'}\n${summaryInput}`);
|
|
2441
|
+
const summaryContentHash = stableContentHash(`summary:${SUMMARY_PROMPT_VERSION}:${includeComments ? 'with-comments' : 'metadata-only'}\n${summaryInput}`);
|
|
1782
2442
|
return { summaryInput, summaryContentHash };
|
|
1783
2443
|
}
|
|
1784
2444
|
buildEmbeddingTasks(params) {
|
|
@@ -1821,6 +2481,35 @@ export class GHCrawlService {
|
|
|
1821
2481
|
}
|
|
1822
2482
|
return tasks;
|
|
1823
2483
|
}
|
|
2484
|
+
buildActiveVectorTask(params) {
|
|
2485
|
+
const sections = [`title: ${normalizeSummaryText(params.title)}`];
|
|
2486
|
+
if (this.config.embeddingBasis === 'title_summary') {
|
|
2487
|
+
const summary = normalizeSummaryText(params.dedupeSummary ?? '');
|
|
2488
|
+
if (!summary) {
|
|
2489
|
+
return null;
|
|
2490
|
+
}
|
|
2491
|
+
sections.push(`summary: ${summary}`);
|
|
2492
|
+
}
|
|
2493
|
+
else {
|
|
2494
|
+
const body = normalizeSummaryText(params.body ?? '');
|
|
2495
|
+
if (body) {
|
|
2496
|
+
sections.push(`body: ${body}`);
|
|
2497
|
+
}
|
|
2498
|
+
}
|
|
2499
|
+
const prepared = this.prepareEmbeddingText(sections.join('\n\n'), EMBED_MAX_ITEM_TOKENS);
|
|
2500
|
+
if (!prepared) {
|
|
2501
|
+
return null;
|
|
2502
|
+
}
|
|
2503
|
+
return {
|
|
2504
|
+
threadId: params.threadId,
|
|
2505
|
+
threadNumber: params.threadNumber,
|
|
2506
|
+
basis: this.config.embeddingBasis,
|
|
2507
|
+
text: prepared.text,
|
|
2508
|
+
contentHash: stableContentHash(`embedding:${ACTIVE_EMBED_PIPELINE_VERSION}:${this.config.embeddingBasis}:${this.config.embedModel}:${ACTIVE_EMBED_DIMENSIONS}\n${prepared.text}`),
|
|
2509
|
+
estimatedTokens: prepared.estimatedTokens,
|
|
2510
|
+
wasTruncated: prepared.wasTruncated,
|
|
2511
|
+
};
|
|
2512
|
+
}
|
|
1824
2513
|
prepareEmbeddingText(text, maxEstimatedTokens) {
|
|
1825
2514
|
if (!text) {
|
|
1826
2515
|
return null;
|
|
@@ -1862,6 +2551,7 @@ export class GHCrawlService {
|
|
|
1862
2551
|
const embeddings = await ai.embedTexts({
|
|
1863
2552
|
model: this.config.embedModel,
|
|
1864
2553
|
texts: batch.map((task) => task.text),
|
|
2554
|
+
dimensions: ACTIVE_EMBED_DIMENSIONS,
|
|
1865
2555
|
});
|
|
1866
2556
|
return batch.map((task, index) => ({ task, embedding: embeddings[index] }));
|
|
1867
2557
|
}
|
|
@@ -1888,6 +2578,7 @@ export class GHCrawlService {
|
|
|
1888
2578
|
const [embedding] = await ai.embedTexts({
|
|
1889
2579
|
model: this.config.embedModel,
|
|
1890
2580
|
texts: [current.text],
|
|
2581
|
+
dimensions: ACTIVE_EMBED_DIMENSIONS,
|
|
1891
2582
|
});
|
|
1892
2583
|
return { task: current, embedding };
|
|
1893
2584
|
}
|
|
@@ -1900,11 +2591,11 @@ export class GHCrawlService {
|
|
|
1900
2591
|
if (!next || next.text === current.text) {
|
|
1901
2592
|
throw error;
|
|
1902
2593
|
}
|
|
1903
|
-
onProgress?.(`[embed] shortened #${current.threadNumber}:${current.
|
|
2594
|
+
onProgress?.(`[embed] shortened #${current.threadNumber}:${current.basis} after context error est_tokens=${current.estimatedTokens}->${next.estimatedTokens}`);
|
|
1904
2595
|
current = next;
|
|
1905
2596
|
}
|
|
1906
2597
|
}
|
|
1907
|
-
throw new Error(`Unable to shrink embedding input for #${task.threadNumber}:${task.
|
|
2598
|
+
throw new Error(`Unable to shrink embedding input for #${task.threadNumber}:${task.basis} below model limits`);
|
|
1908
2599
|
}
|
|
1909
2600
|
shrinkEmbeddingTask(task, context) {
|
|
1910
2601
|
const withoutMarker = task.text.endsWith(EMBED_TRUNCATION_MARKER)
|
|
@@ -1921,7 +2612,7 @@ export class GHCrawlService {
|
|
|
1921
2612
|
return {
|
|
1922
2613
|
...task,
|
|
1923
2614
|
text: nextText,
|
|
1924
|
-
contentHash: stableContentHash(`embedding:${task.
|
|
2615
|
+
contentHash: stableContentHash(`embedding:${ACTIVE_EMBED_PIPELINE_VERSION}:${task.basis}:${this.config.embedModel}:${ACTIVE_EMBED_DIMENSIONS}\n${nextText}`),
|
|
1925
2616
|
estimatedTokens: this.estimateEmbeddingTokens(nextText),
|
|
1926
2617
|
wasTruncated: true,
|
|
1927
2618
|
};
|
|
@@ -1996,6 +2687,46 @@ export class GHCrawlService {
|
|
|
1996
2687
|
order by t.number asc, e.source_kind asc`)
|
|
1997
2688
|
.iterate(repoId, this.config.embedModel);
|
|
1998
2689
|
}
|
|
2690
|
+
loadNormalizedEmbeddingForSourceKindHead(repoId, sourceKind) {
|
|
2691
|
+
const row = this.db
|
|
2692
|
+
.prepare(`select t.id, e.embedding_json
|
|
2693
|
+
from threads t
|
|
2694
|
+
join document_embeddings e on e.thread_id = t.id
|
|
2695
|
+
where t.repo_id = ?
|
|
2696
|
+
and t.state = 'open'
|
|
2697
|
+
and t.closed_at_local is null
|
|
2698
|
+
and e.model = ?
|
|
2699
|
+
and e.source_kind = ?
|
|
2700
|
+
order by t.number asc
|
|
2701
|
+
limit 1`)
|
|
2702
|
+
.get(repoId, this.config.embedModel, sourceKind);
|
|
2703
|
+
if (!row) {
|
|
2704
|
+
return null;
|
|
2705
|
+
}
|
|
2706
|
+
return {
|
|
2707
|
+
id: row.id,
|
|
2708
|
+
normalizedEmbedding: normalizeEmbedding(JSON.parse(row.embedding_json)).normalized,
|
|
2709
|
+
};
|
|
2710
|
+
}
|
|
2711
|
+
*iterateNormalizedEmbeddingsForSourceKind(repoId, sourceKind) {
|
|
2712
|
+
const rows = this.db
|
|
2713
|
+
.prepare(`select t.id, e.embedding_json
|
|
2714
|
+
from threads t
|
|
2715
|
+
join document_embeddings e on e.thread_id = t.id
|
|
2716
|
+
where t.repo_id = ?
|
|
2717
|
+
and t.state = 'open'
|
|
2718
|
+
and t.closed_at_local is null
|
|
2719
|
+
and e.model = ?
|
|
2720
|
+
and e.source_kind = ?
|
|
2721
|
+
order by t.number asc`)
|
|
2722
|
+
.iterate(repoId, this.config.embedModel, sourceKind);
|
|
2723
|
+
for (const row of rows) {
|
|
2724
|
+
yield {
|
|
2725
|
+
id: row.id,
|
|
2726
|
+
normalizedEmbedding: normalizeEmbedding(JSON.parse(row.embedding_json)).normalized,
|
|
2727
|
+
};
|
|
2728
|
+
}
|
|
2729
|
+
}
|
|
1999
2730
|
loadNormalizedEmbeddingsForSourceKind(repoId, sourceKind) {
|
|
2000
2731
|
const rows = this.db
|
|
2001
2732
|
.prepare(`select t.id, e.embedding_json
|
|
@@ -2013,6 +2744,12 @@ export class GHCrawlService {
|
|
|
2013
2744
|
normalizedEmbedding: normalizeEmbedding(JSON.parse(row.embedding_json)).normalized,
|
|
2014
2745
|
}));
|
|
2015
2746
|
}
|
|
2747
|
+
normalizedEmbeddingBuffer(values) {
|
|
2748
|
+
return Buffer.from(Float32Array.from(values).buffer);
|
|
2749
|
+
}
|
|
2750
|
+
normalizedDistanceToScore(distance) {
|
|
2751
|
+
return 1 - distance / 2;
|
|
2752
|
+
}
|
|
2016
2753
|
loadClusterableThreadMeta(repoId) {
|
|
2017
2754
|
const rows = this.db
|
|
2018
2755
|
.prepare(`select t.id, t.number, t.title, e.source_kind
|
|
@@ -2033,6 +2770,34 @@ export class GHCrawlService {
|
|
|
2033
2770
|
sourceKinds: Array.from(sourceKinds.values()),
|
|
2034
2771
|
};
|
|
2035
2772
|
}
|
|
2773
|
+
loadClusterableActiveVectorMeta(repoId, _repoFullName) {
|
|
2774
|
+
const rows = this.db
|
|
2775
|
+
.prepare(`select t.id, t.number, t.title, tv.vector_json
|
|
2776
|
+
from threads t
|
|
2777
|
+
join thread_vectors tv on tv.thread_id = t.id
|
|
2778
|
+
where t.repo_id = ?
|
|
2779
|
+
and t.state = 'open'
|
|
2780
|
+
and t.closed_at_local is null
|
|
2781
|
+
and tv.model = ?
|
|
2782
|
+
and tv.basis = ?
|
|
2783
|
+
and tv.dimensions = ?
|
|
2784
|
+
order by t.number asc`)
|
|
2785
|
+
.all(repoId, this.config.embedModel, this.config.embeddingBasis, ACTIVE_EMBED_DIMENSIONS);
|
|
2786
|
+
return rows.map((row) => ({
|
|
2787
|
+
id: row.id,
|
|
2788
|
+
number: row.number,
|
|
2789
|
+
title: row.title,
|
|
2790
|
+
embedding: this.parseStoredVector(row.vector_json),
|
|
2791
|
+
}));
|
|
2792
|
+
}
|
|
2793
|
+
loadNormalizedActiveVectors(repoId) {
|
|
2794
|
+
return this.loadClusterableActiveVectorMeta(repoId, '').map((row) => ({
|
|
2795
|
+
id: row.id,
|
|
2796
|
+
number: row.number,
|
|
2797
|
+
title: row.title,
|
|
2798
|
+
embedding: normalizeEmbedding(row.embedding).normalized,
|
|
2799
|
+
}));
|
|
2800
|
+
}
|
|
2036
2801
|
listStoredClusterNeighbors(repoId, threadId, limit) {
|
|
2037
2802
|
const latestRun = this.getLatestClusterRun(repoId);
|
|
2038
2803
|
if (!latestRun) {
|
|
@@ -2089,56 +2854,65 @@ export class GHCrawlService {
|
|
|
2089
2854
|
}
|
|
2090
2855
|
sql += ' order by t.number asc';
|
|
2091
2856
|
const rows = this.db.prepare(sql).all(...args);
|
|
2092
|
-
const summaryTexts = this.
|
|
2093
|
-
const
|
|
2094
|
-
|
|
2095
|
-
|
|
2096
|
-
|
|
2097
|
-
|
|
2098
|
-
|
|
2099
|
-
|
|
2857
|
+
const summaryTexts = this.loadDedupeSummaryTextMap(repoId, threadNumber);
|
|
2858
|
+
const missingSummaryThreadNumbers = [];
|
|
2859
|
+
const tasks = rows.flatMap((row) => {
|
|
2860
|
+
const task = this.buildActiveVectorTask({
|
|
2861
|
+
threadId: row.id,
|
|
2862
|
+
threadNumber: row.number,
|
|
2863
|
+
title: row.title,
|
|
2864
|
+
body: row.body,
|
|
2865
|
+
dedupeSummary: summaryTexts.get(row.id) ?? null,
|
|
2866
|
+
});
|
|
2867
|
+
if (task) {
|
|
2868
|
+
return [task];
|
|
2869
|
+
}
|
|
2870
|
+
if (this.config.embeddingBasis === 'title_summary') {
|
|
2871
|
+
missingSummaryThreadNumbers.push(row.number);
|
|
2872
|
+
}
|
|
2873
|
+
return [];
|
|
2874
|
+
});
|
|
2875
|
+
const pipelineCurrent = this.isRepoVectorStateCurrent(repoId);
|
|
2100
2876
|
const existingRows = this.db
|
|
2101
|
-
.prepare(`select
|
|
2102
|
-
from
|
|
2103
|
-
join threads t on t.id =
|
|
2104
|
-
where t.repo_id = ?
|
|
2105
|
-
|
|
2877
|
+
.prepare(`select tv.thread_id, tv.content_hash
|
|
2878
|
+
from thread_vectors tv
|
|
2879
|
+
join threads t on t.id = tv.thread_id
|
|
2880
|
+
where t.repo_id = ?
|
|
2881
|
+
and tv.model = ?
|
|
2882
|
+
and tv.basis = ?
|
|
2883
|
+
and tv.dimensions = ?`)
|
|
2884
|
+
.all(repoId, this.config.embedModel, this.config.embeddingBasis, ACTIVE_EMBED_DIMENSIONS);
|
|
2106
2885
|
const existing = new Map();
|
|
2107
2886
|
for (const row of existingRows) {
|
|
2108
|
-
existing.set(
|
|
2887
|
+
existing.set(String(row.thread_id), row.content_hash);
|
|
2109
2888
|
}
|
|
2110
|
-
const pending =
|
|
2111
|
-
|
|
2889
|
+
const pending = pipelineCurrent
|
|
2890
|
+
? tasks.filter((task) => existing.get(String(task.threadId)) !== task.contentHash)
|
|
2891
|
+
: tasks;
|
|
2892
|
+
return { rows, tasks, existing, pending, missingSummaryThreadNumbers };
|
|
2112
2893
|
}
|
|
2113
|
-
|
|
2114
|
-
let sql = `select s.thread_id, s.
|
|
2894
|
+
loadDedupeSummaryTextMap(repoId, threadNumber) {
|
|
2895
|
+
let sql = `select s.thread_id, s.summary_text
|
|
2115
2896
|
from document_summaries s
|
|
2116
2897
|
join threads t on t.id = s.thread_id
|
|
2117
|
-
where t.repo_id = ?
|
|
2118
|
-
|
|
2898
|
+
where t.repo_id = ?
|
|
2899
|
+
and t.state = 'open'
|
|
2900
|
+
and t.closed_at_local is null
|
|
2901
|
+
and s.model = ?
|
|
2902
|
+
and s.summary_kind = 'dedupe_summary'
|
|
2903
|
+
and s.prompt_version = ?`;
|
|
2904
|
+
const args = [repoId, this.config.summaryModel, SUMMARY_PROMPT_VERSION];
|
|
2119
2905
|
if (threadNumber) {
|
|
2120
2906
|
sql += ' and t.number = ?';
|
|
2121
2907
|
args.push(threadNumber);
|
|
2122
2908
|
}
|
|
2123
|
-
sql += ' order by t.number asc
|
|
2909
|
+
sql += ' order by t.number asc';
|
|
2124
2910
|
const rows = this.db.prepare(sql).all(...args);
|
|
2125
|
-
const byThread = new Map();
|
|
2126
|
-
for (const row of rows) {
|
|
2127
|
-
const entry = byThread.get(row.thread_id) ?? new Map();
|
|
2128
|
-
entry.set(row.summary_kind, normalizeSummaryText(row.summary_text));
|
|
2129
|
-
byThread.set(row.thread_id, entry);
|
|
2130
|
-
}
|
|
2131
2911
|
const combined = new Map();
|
|
2132
|
-
const
|
|
2133
|
-
|
|
2134
|
-
|
|
2135
|
-
.
|
|
2136
|
-
const text = entry.get(summaryKind);
|
|
2137
|
-
return text ? `${summaryKind}: ${text}` : '';
|
|
2138
|
-
})
|
|
2139
|
-
.filter(Boolean);
|
|
2140
|
-
if (parts.length > 0) {
|
|
2141
|
-
combined.set(threadId, parts.join('\n\n'));
|
|
2912
|
+
for (const row of rows) {
|
|
2913
|
+
const text = normalizeSummaryText(row.summary_text);
|
|
2914
|
+
if (text) {
|
|
2915
|
+
combined.set(row.thread_id, text);
|
|
2142
2916
|
}
|
|
2143
2917
|
}
|
|
2144
2918
|
return combined;
|
|
@@ -2235,6 +3009,71 @@ export class GHCrawlService {
|
|
|
2235
3009
|
});
|
|
2236
3010
|
}
|
|
2237
3011
|
}
|
|
3012
|
+
collectSourceKindScores(perSourceScores, edges, sourceKind) {
|
|
3013
|
+
for (const edge of edges) {
|
|
3014
|
+
const key = this.edgeKey(edge.leftThreadId, edge.rightThreadId);
|
|
3015
|
+
const existing = perSourceScores.get(key);
|
|
3016
|
+
if (existing) {
|
|
3017
|
+
existing.scores.set(sourceKind, Math.max(existing.scores.get(sourceKind) ?? -1, edge.score));
|
|
3018
|
+
continue;
|
|
3019
|
+
}
|
|
3020
|
+
const scores = new Map();
|
|
3021
|
+
scores.set(sourceKind, edge.score);
|
|
3022
|
+
perSourceScores.set(key, {
|
|
3023
|
+
leftThreadId: edge.leftThreadId,
|
|
3024
|
+
rightThreadId: edge.rightThreadId,
|
|
3025
|
+
scores,
|
|
3026
|
+
});
|
|
3027
|
+
}
|
|
3028
|
+
}
|
|
3029
|
+
finalizeEdgeScores(perSourceScores, aggregation, weights, minScore) {
|
|
3030
|
+
const result = [];
|
|
3031
|
+
for (const entry of perSourceScores.values()) {
|
|
3032
|
+
const scoreValues = Array.from(entry.scores.values());
|
|
3033
|
+
let finalScore;
|
|
3034
|
+
switch (aggregation) {
|
|
3035
|
+
case 'max':
|
|
3036
|
+
finalScore = Math.max(...scoreValues);
|
|
3037
|
+
break;
|
|
3038
|
+
case 'mean':
|
|
3039
|
+
finalScore = scoreValues.reduce((a, b) => a + b, 0) / scoreValues.length;
|
|
3040
|
+
break;
|
|
3041
|
+
case 'weighted': {
|
|
3042
|
+
let weightedSum = 0;
|
|
3043
|
+
let weightSum = 0;
|
|
3044
|
+
for (const [kind, score] of entry.scores) {
|
|
3045
|
+
const w = weights[kind] ?? 0.1;
|
|
3046
|
+
weightedSum += score * w;
|
|
3047
|
+
weightSum += w;
|
|
3048
|
+
}
|
|
3049
|
+
finalScore = weightSum > 0 ? weightedSum / weightSum : 0;
|
|
3050
|
+
break;
|
|
3051
|
+
}
|
|
3052
|
+
case 'min-of-2':
|
|
3053
|
+
// Require at least 2 source kinds to agree (both above minScore)
|
|
3054
|
+
if (scoreValues.length < 2) {
|
|
3055
|
+
continue; // Skip edges with only 1 source kind
|
|
3056
|
+
}
|
|
3057
|
+
finalScore = Math.max(...scoreValues);
|
|
3058
|
+
break;
|
|
3059
|
+
case 'boost': {
|
|
3060
|
+
// Best score + bonus per additional agreeing source
|
|
3061
|
+
const best = Math.max(...scoreValues);
|
|
3062
|
+
const bonusSources = scoreValues.length - 1;
|
|
3063
|
+
finalScore = Math.min(1.0, best + bonusSources * 0.05);
|
|
3064
|
+
break;
|
|
3065
|
+
}
|
|
3066
|
+
}
|
|
3067
|
+
if (finalScore >= minScore) {
|
|
3068
|
+
result.push({
|
|
3069
|
+
leftThreadId: entry.leftThreadId,
|
|
3070
|
+
rightThreadId: entry.rightThreadId,
|
|
3071
|
+
score: finalScore,
|
|
3072
|
+
});
|
|
3073
|
+
}
|
|
3074
|
+
}
|
|
3075
|
+
return result;
|
|
3076
|
+
}
|
|
2238
3077
|
countEmbeddingsForSourceKind(repoId, sourceKind) {
|
|
2239
3078
|
const row = this.db
|
|
2240
3079
|
.prepare(`select count(*) as count
|
|
@@ -2282,15 +3121,102 @@ export class GHCrawlService {
|
|
|
2282
3121
|
pruneOldClusterRuns(repoId, keepRunId) {
|
|
2283
3122
|
this.db.prepare('delete from cluster_runs where repo_id = ? and id <> ?').run(repoId, keepRunId);
|
|
2284
3123
|
}
|
|
3124
|
+
summarizeClusterSizes(clusters) {
|
|
3125
|
+
const histogramCounts = new Map();
|
|
3126
|
+
const topClusterSizes = clusters.map((cluster) => cluster.members.length).sort((left, right) => right - left);
|
|
3127
|
+
let soloClusters = 0;
|
|
3128
|
+
for (const cluster of clusters) {
|
|
3129
|
+
const size = cluster.members.length;
|
|
3130
|
+
histogramCounts.set(size, (histogramCounts.get(size) ?? 0) + 1);
|
|
3131
|
+
if (size === 1) {
|
|
3132
|
+
soloClusters += 1;
|
|
3133
|
+
}
|
|
3134
|
+
}
|
|
3135
|
+
return {
|
|
3136
|
+
soloClusters,
|
|
3137
|
+
maxClusterSize: topClusterSizes[0] ?? 0,
|
|
3138
|
+
topClusterSizes: topClusterSizes.slice(0, 50),
|
|
3139
|
+
histogram: Array.from(histogramCounts.entries())
|
|
3140
|
+
.map(([size, count]) => ({ size, count }))
|
|
3141
|
+
.sort((left, right) => left.size - right.size),
|
|
3142
|
+
};
|
|
3143
|
+
}
|
|
2285
3144
|
upsertSummary(threadId, contentHash, summaryKind, summaryText) {
|
|
2286
3145
|
this.db
|
|
2287
|
-
.prepare(`insert into document_summaries (thread_id, summary_kind, model, content_hash, summary_text, created_at, updated_at)
|
|
2288
|
-
values (?, ?, ?, ?, ?, ?, ?)
|
|
3146
|
+
.prepare(`insert into document_summaries (thread_id, summary_kind, model, prompt_version, content_hash, summary_text, created_at, updated_at)
|
|
3147
|
+
values (?, ?, ?, ?, ?, ?, ?, ?)
|
|
2289
3148
|
on conflict(thread_id, summary_kind, model) do update set
|
|
3149
|
+
prompt_version = excluded.prompt_version,
|
|
2290
3150
|
content_hash = excluded.content_hash,
|
|
2291
3151
|
summary_text = excluded.summary_text,
|
|
2292
3152
|
updated_at = excluded.updated_at`)
|
|
2293
|
-
.run(threadId, summaryKind, this.config.summaryModel, contentHash, summaryText, nowIso(), nowIso());
|
|
3153
|
+
.run(threadId, summaryKind, this.config.summaryModel, SUMMARY_PROMPT_VERSION, contentHash, summaryText, nowIso(), nowIso());
|
|
3154
|
+
}
|
|
3155
|
+
upsertActiveVector(repoId, repoFullName, threadId, basis, contentHash, embedding) {
|
|
3156
|
+
this.db
|
|
3157
|
+
.prepare(`insert into thread_vectors (thread_id, basis, model, dimensions, content_hash, vector_json, vector_backend, created_at, updated_at)
|
|
3158
|
+
values (?, ?, ?, ?, ?, ?, ?, ?, ?)
|
|
3159
|
+
on conflict(thread_id) do update set
|
|
3160
|
+
basis = excluded.basis,
|
|
3161
|
+
model = excluded.model,
|
|
3162
|
+
dimensions = excluded.dimensions,
|
|
3163
|
+
content_hash = excluded.content_hash,
|
|
3164
|
+
vector_json = excluded.vector_json,
|
|
3165
|
+
vector_backend = excluded.vector_backend,
|
|
3166
|
+
updated_at = excluded.updated_at`)
|
|
3167
|
+
.run(threadId, basis, this.config.embedModel, embedding.length, contentHash, this.vectorBlob(embedding), this.config.vectorBackend, nowIso(), nowIso());
|
|
3168
|
+
this.vectorStore.upsertVector({
|
|
3169
|
+
storePath: this.repoVectorStorePath(repoFullName),
|
|
3170
|
+
dimensions: ACTIVE_EMBED_DIMENSIONS,
|
|
3171
|
+
threadId,
|
|
3172
|
+
vector: embedding,
|
|
3173
|
+
});
|
|
3174
|
+
}
|
|
3175
|
+
countLegacyEmbeddings(repoId) {
|
|
3176
|
+
const row = this.db
|
|
3177
|
+
.prepare(`select count(*) as count
|
|
3178
|
+
from document_embeddings
|
|
3179
|
+
where thread_id in (select id from threads where repo_id = ?)`)
|
|
3180
|
+
.get(repoId);
|
|
3181
|
+
return row.count;
|
|
3182
|
+
}
|
|
3183
|
+
countInlineJsonThreadVectors(repoId) {
|
|
3184
|
+
const row = this.db
|
|
3185
|
+
.prepare(`select count(*) as count
|
|
3186
|
+
from thread_vectors
|
|
3187
|
+
where thread_id in (select id from threads where repo_id = ?)
|
|
3188
|
+
and typeof(vector_json) = 'text'
|
|
3189
|
+
and vector_json != ''`)
|
|
3190
|
+
.get(repoId);
|
|
3191
|
+
return row.count;
|
|
3192
|
+
}
|
|
3193
|
+
getVectorliteClusterQuery(totalItems, requestedK) {
|
|
3194
|
+
if (totalItems < CLUSTER_PARALLEL_MIN_EMBEDDINGS) {
|
|
3195
|
+
return {
|
|
3196
|
+
limit: requestedK,
|
|
3197
|
+
candidateK: Math.max(requestedK * 16, 64),
|
|
3198
|
+
};
|
|
3199
|
+
}
|
|
3200
|
+
const limit = Math.min(Math.max(requestedK * VECTORLITE_CLUSTER_EXPANDED_MULTIPLIER, VECTORLITE_CLUSTER_EXPANDED_K), Math.max(1, totalItems - 1));
|
|
3201
|
+
const candidateK = Math.min(Math.max(limit * 16, VECTORLITE_CLUSTER_EXPANDED_CANDIDATE_K), Math.max(limit, totalItems - 1));
|
|
3202
|
+
return {
|
|
3203
|
+
limit,
|
|
3204
|
+
candidateK,
|
|
3205
|
+
efSearch: Math.max(candidateK * 2, VECTORLITE_CLUSTER_EXPANDED_EF_SEARCH),
|
|
3206
|
+
};
|
|
3207
|
+
}
|
|
3208
|
+
vectorBlob(values) {
|
|
3209
|
+
return Buffer.from(Float32Array.from(values).buffer);
|
|
3210
|
+
}
|
|
3211
|
+
parseStoredVector(value) {
|
|
3212
|
+
if (typeof value === 'string') {
|
|
3213
|
+
if (!value) {
|
|
3214
|
+
throw new Error('Stored vector payload is empty. Run refresh or embed first.');
|
|
3215
|
+
}
|
|
3216
|
+
return JSON.parse(value);
|
|
3217
|
+
}
|
|
3218
|
+
const floats = new Float32Array(value.buffer, value.byteOffset, Math.floor(value.byteLength / Float32Array.BYTES_PER_ELEMENT));
|
|
3219
|
+
return Array.from(floats);
|
|
2294
3220
|
}
|
|
2295
3221
|
upsertEmbedding(threadId, sourceKind, contentHash, embedding) {
|
|
2296
3222
|
this.db
|