@ghcrawl/api-core 0.7.0 → 0.8.0-beta.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cluster/build.d.ts +30 -0
- package/dist/cluster/build.d.ts.map +1 -1
- package/dist/cluster/build.js +178 -7
- package/dist/cluster/build.js.map +1 -1
- package/dist/cluster/perf.integration.js +186 -20
- package/dist/cluster/perf.integration.js.map +1 -1
- package/dist/config.d.ts +9 -1
- package/dist/config.d.ts.map +1 -1
- package/dist/config.js +29 -2
- package/dist/config.js.map +1 -1
- package/dist/db/migrate.d.ts.map +1 -1
- package/dist/db/migrate.js +37 -0
- package/dist/db/migrate.js.map +1 -1
- package/dist/index.d.ts +2 -0
- package/dist/index.d.ts.map +1 -1
- package/dist/index.js +2 -0
- package/dist/index.js.map +1 -1
- package/dist/openai/provider.d.ts +2 -0
- package/dist/openai/provider.d.ts.map +1 -1
- package/dist/openai/provider.js +15 -1
- package/dist/openai/provider.js.map +1 -1
- package/dist/service.d.ts +101 -3
- package/dist/service.d.ts.map +1 -1
- package/dist/service.js +1058 -134
- package/dist/service.js.map +1 -1
- package/dist/vector/store.d.ts +38 -0
- package/dist/vector/store.d.ts.map +1 -0
- package/dist/vector/store.js +2 -0
- package/dist/vector/store.js.map +1 -0
- package/dist/vector/vectorlite-store.d.ts +34 -0
- package/dist/vector/vectorlite-store.d.ts.map +1 -0
- package/dist/vector/vectorlite-store.js +124 -0
- package/dist/vector/vectorlite-store.js.map +1 -0
- package/package.json +7 -6
package/dist/service.js
CHANGED
|
@@ -1,12 +1,15 @@
|
|
|
1
1
|
import http from 'node:http';
|
|
2
2
|
import crypto from 'node:crypto';
|
|
3
|
+
import fs from 'node:fs';
|
|
3
4
|
import { existsSync } from 'node:fs';
|
|
5
|
+
import { createRequire } from 'node:module';
|
|
4
6
|
import os from 'node:os';
|
|
7
|
+
import path from 'node:path';
|
|
5
8
|
import { fileURLToPath } from 'node:url';
|
|
6
9
|
import { Worker } from 'node:worker_threads';
|
|
7
10
|
import { IterableMapper } from '@shutterstock/p-map-iterable';
|
|
8
11
|
import { actionResponseSchema, authorThreadsResponseSchema, closeResponseSchema, clusterDetailResponseSchema, clusterResultSchema, clusterSummariesResponseSchema, clustersResponseSchema, embedResultSchema, healthResponseSchema, neighborsResponseSchema, refreshResponseSchema, repositoriesResponseSchema, searchResponseSchema, syncResultSchema, threadsResponseSchema, } from '@ghcrawl/api-contract';
|
|
9
|
-
import { buildClusters } from './cluster/build.js';
|
|
12
|
+
import { buildClusters, buildRefinedClusters, buildSizeBoundedClusters } from './cluster/build.js';
|
|
10
13
|
import { buildSourceKindEdges } from './cluster/exact-edges.js';
|
|
11
14
|
import { ensureRuntimeDirs, isLikelyGitHubToken, isLikelyOpenAiApiKey, loadConfig, requireGithubToken, requireOpenAiKey, } from './config.js';
|
|
12
15
|
import { migrate } from './db/migrate.js';
|
|
@@ -14,7 +17,8 @@ import { openDb } from './db/sqlite.js';
|
|
|
14
17
|
import { buildCanonicalDocument, isBotLikeAuthor } from './documents/normalize.js';
|
|
15
18
|
import { makeGitHubClient } from './github/client.js';
|
|
16
19
|
import { OpenAiProvider } from './openai/provider.js';
|
|
17
|
-
import { cosineSimilarity, normalizeEmbedding, rankNearestNeighbors } from './search/exact.js';
|
|
20
|
+
import { cosineSimilarity, dotProduct, normalizeEmbedding, rankNearestNeighbors, rankNearestNeighborsByScore } from './search/exact.js';
|
|
21
|
+
import { VectorliteStore } from './vector/vectorlite-store.js';
|
|
18
22
|
const SYNC_BATCH_SIZE = 100;
|
|
19
23
|
const SYNC_BATCH_DELAY_MS = 5000;
|
|
20
24
|
const STALE_CLOSED_SWEEP_LIMIT = 1000;
|
|
@@ -23,10 +27,31 @@ const CLUSTER_PARALLEL_MIN_EMBEDDINGS = 5000;
|
|
|
23
27
|
const EMBED_ESTIMATED_CHARS_PER_TOKEN = 3;
|
|
24
28
|
const EMBED_MAX_ITEM_TOKENS = 7000;
|
|
25
29
|
const EMBED_MAX_BATCH_TOKENS = 250000;
|
|
30
|
+
const requireFromHere = createRequire(import.meta.url);
|
|
26
31
|
const EMBED_TRUNCATION_MARKER = '\n\n[truncated for embedding]';
|
|
27
32
|
const EMBED_CONTEXT_RETRY_ATTEMPTS = 5;
|
|
28
33
|
const EMBED_CONTEXT_RETRY_FALLBACK_SHRINK_RATIO = 0.9;
|
|
29
34
|
const EMBED_CONTEXT_RETRY_TARGET_BUFFER_RATIO = 0.95;
|
|
35
|
+
const SUMMARY_PROMPT_VERSION = 'v1';
|
|
36
|
+
const ACTIVE_EMBED_DIMENSIONS = 1024;
|
|
37
|
+
const ACTIVE_EMBED_PIPELINE_VERSION = 'vectorlite-1024-v1';
|
|
38
|
+
const DEFAULT_CLUSTER_MIN_SCORE = 0.78;
|
|
39
|
+
const VECTORLITE_CLUSTER_EXPANDED_K = 24;
|
|
40
|
+
const VECTORLITE_CLUSTER_EXPANDED_MULTIPLIER = 4;
|
|
41
|
+
const VECTORLITE_CLUSTER_EXPANDED_CANDIDATE_K = 512;
|
|
42
|
+
const VECTORLITE_CLUSTER_EXPANDED_EF_SEARCH = 1024;
|
|
43
|
+
const SUMMARY_MODEL_PRICING = {
|
|
44
|
+
'gpt-5-mini': {
|
|
45
|
+
inputCostPerM: 0.25,
|
|
46
|
+
cachedInputCostPerM: 0.025,
|
|
47
|
+
outputCostPerM: 2.0,
|
|
48
|
+
},
|
|
49
|
+
'gpt-5.4-mini': {
|
|
50
|
+
inputCostPerM: 0.75,
|
|
51
|
+
cachedInputCostPerM: 0.075,
|
|
52
|
+
outputCostPerM: 4.5,
|
|
53
|
+
},
|
|
54
|
+
};
|
|
30
55
|
function nowIso() {
|
|
31
56
|
return new Date().toISOString();
|
|
32
57
|
}
|
|
@@ -180,7 +205,7 @@ export class GHCrawlService {
|
|
|
180
205
|
db;
|
|
181
206
|
github;
|
|
182
207
|
ai;
|
|
183
|
-
|
|
208
|
+
vectorStore;
|
|
184
209
|
constructor(options = {}) {
|
|
185
210
|
this.config = options.config ?? loadConfig();
|
|
186
211
|
ensureRuntimeDirs(this.config);
|
|
@@ -188,9 +213,10 @@ export class GHCrawlService {
|
|
|
188
213
|
migrate(this.db);
|
|
189
214
|
this.github = options.github ?? (this.config.githubToken ? makeGitHubClient({ token: this.config.githubToken }) : undefined);
|
|
190
215
|
this.ai = options.ai ?? (this.config.openaiApiKey ? new OpenAiProvider(this.config.openaiApiKey) : undefined);
|
|
216
|
+
this.vectorStore = options.vectorStore ?? new VectorliteStore();
|
|
191
217
|
}
|
|
192
218
|
close() {
|
|
193
|
-
this.
|
|
219
|
+
this.vectorStore.close();
|
|
194
220
|
this.db.close();
|
|
195
221
|
}
|
|
196
222
|
init() {
|
|
@@ -257,7 +283,17 @@ export class GHCrawlService {
|
|
|
257
283
|
}
|
|
258
284
|
}
|
|
259
285
|
}
|
|
260
|
-
|
|
286
|
+
const vectorliteHealth = this.vectorStore.checkRuntime();
|
|
287
|
+
return {
|
|
288
|
+
health,
|
|
289
|
+
github,
|
|
290
|
+
openai,
|
|
291
|
+
vectorlite: {
|
|
292
|
+
configured: this.config.vectorBackend === 'vectorlite',
|
|
293
|
+
runtimeOk: vectorliteHealth.ok,
|
|
294
|
+
error: vectorliteHealth.error,
|
|
295
|
+
},
|
|
296
|
+
};
|
|
261
297
|
}
|
|
262
298
|
listRepositories() {
|
|
263
299
|
const rows = this.db.prepare('select * from repositories order by full_name asc').all();
|
|
@@ -409,7 +445,6 @@ export class GHCrawlService {
|
|
|
409
445
|
updated_at = ?
|
|
410
446
|
where id = ?`)
|
|
411
447
|
.run(closedAt, closedAt, row.id);
|
|
412
|
-
this.parsedEmbeddingCache.delete(repository.id);
|
|
413
448
|
const clusterIds = this.getLatestRunClusterIdsForThread(repository.id, row.id);
|
|
414
449
|
const clusterClosed = this.reconcileClusterCloseState(repository.id, clusterIds) > 0;
|
|
415
450
|
const updated = this.db.prepare('select * from threads where id = ? limit 1').get(row.id);
|
|
@@ -543,7 +578,6 @@ export class GHCrawlService {
|
|
|
543
578
|
})
|
|
544
579
|
: 0;
|
|
545
580
|
const threadsClosed = threadsClosedFromClosedSweep + threadsClosedFromDirectReconcile;
|
|
546
|
-
this.parsedEmbeddingCache.delete(repoId);
|
|
547
581
|
if (threadsClosed > 0) {
|
|
548
582
|
this.reconcileClusterCloseState(repoId);
|
|
549
583
|
}
|
|
@@ -606,31 +640,69 @@ export class GHCrawlService {
|
|
|
606
640
|
});
|
|
607
641
|
const pending = sources.filter((row) => {
|
|
608
642
|
const latest = this.db
|
|
609
|
-
.prepare('select content_hash from document_summaries where thread_id = ? and summary_kind = ? and model = ? limit 1')
|
|
643
|
+
.prepare('select content_hash, prompt_version from document_summaries where thread_id = ? and summary_kind = ? and model = ? limit 1')
|
|
610
644
|
.get(row.id, 'dedupe_summary', this.config.summaryModel);
|
|
611
|
-
return latest?.content_hash !== row.summaryContentHash;
|
|
645
|
+
return latest?.content_hash !== row.summaryContentHash || latest?.prompt_version !== SUMMARY_PROMPT_VERSION;
|
|
612
646
|
});
|
|
613
647
|
params.onProgress?.(`[summarize] pending=${pending.length} skipped=${rows.length - pending.length} model=${this.config.summaryModel}`);
|
|
614
648
|
let summarized = 0;
|
|
615
649
|
let inputTokens = 0;
|
|
616
650
|
let outputTokens = 0;
|
|
617
651
|
let totalTokens = 0;
|
|
618
|
-
|
|
619
|
-
|
|
652
|
+
let cachedInputTokens = 0;
|
|
653
|
+
const startTime = Date.now();
|
|
654
|
+
const pricing = SUMMARY_MODEL_PRICING[this.config.summaryModel] ?? null;
|
|
655
|
+
// Stage 1: concurrent API calls
|
|
656
|
+
const fetcher = new IterableMapper(pending, async (row) => {
|
|
620
657
|
const result = await ai.summarizeThread({
|
|
621
658
|
model: this.config.summaryModel,
|
|
622
659
|
text: row.summaryInput,
|
|
623
660
|
});
|
|
661
|
+
return { row, result };
|
|
662
|
+
}, { concurrency: 5 });
|
|
663
|
+
// Stage 2: sequential DB writes — consumes from fetcher without blocking API completions
|
|
664
|
+
const writer = new IterableMapper(fetcher, async ({ row, result }) => {
|
|
624
665
|
const summary = result.summary;
|
|
625
666
|
this.upsertSummary(row.id, row.summaryContentHash, 'problem_summary', summary.problemSummary);
|
|
626
667
|
this.upsertSummary(row.id, row.summaryContentHash, 'solution_summary', summary.solutionSummary);
|
|
627
668
|
this.upsertSummary(row.id, row.summaryContentHash, 'maintainer_signal_summary', summary.maintainerSignalSummary);
|
|
628
669
|
this.upsertSummary(row.id, row.summaryContentHash, 'dedupe_summary', summary.dedupeSummary);
|
|
629
|
-
|
|
630
|
-
|
|
631
|
-
|
|
632
|
-
|
|
633
|
-
|
|
670
|
+
return { row, usage: result.usage };
|
|
671
|
+
}, { concurrency: 1 });
|
|
672
|
+
let index = 0;
|
|
673
|
+
for await (const { row, usage } of writer) {
|
|
674
|
+
index += 1;
|
|
675
|
+
if (usage) {
|
|
676
|
+
inputTokens += usage.inputTokens;
|
|
677
|
+
outputTokens += usage.outputTokens;
|
|
678
|
+
totalTokens += usage.totalTokens;
|
|
679
|
+
cachedInputTokens += usage.cachedInputTokens;
|
|
680
|
+
}
|
|
681
|
+
// Compute cost and ETA every 10 items or on the last item
|
|
682
|
+
if (index % 10 === 0 || index === pending.length) {
|
|
683
|
+
const remaining = pending.length - index;
|
|
684
|
+
const avgIn = inputTokens / index;
|
|
685
|
+
const avgOut = outputTokens / index;
|
|
686
|
+
const avgCachedIn = cachedInputTokens / index;
|
|
687
|
+
const elapsedSec = (Date.now() - startTime) / 1000;
|
|
688
|
+
const secPerItem = elapsedSec / index;
|
|
689
|
+
const etaSec = remaining * secPerItem;
|
|
690
|
+
const etaMin = Math.round(etaSec / 60);
|
|
691
|
+
const etaStr = etaMin >= 60 ? `${Math.floor(etaMin / 60)}h${etaMin % 60}m` : `${etaMin}m`;
|
|
692
|
+
if (pricing) {
|
|
693
|
+
const uncachedInput = inputTokens - cachedInputTokens;
|
|
694
|
+
const costSoFar = (uncachedInput / 1_000_000) * pricing.inputCostPerM +
|
|
695
|
+
(cachedInputTokens / 1_000_000) * pricing.cachedInputCostPerM +
|
|
696
|
+
(outputTokens / 1_000_000) * pricing.outputCostPerM;
|
|
697
|
+
const estTotalCost = costSoFar +
|
|
698
|
+
((remaining * (avgIn - avgCachedIn)) / 1_000_000) * pricing.inputCostPerM +
|
|
699
|
+
((remaining * avgCachedIn) / 1_000_000) * pricing.cachedInputCostPerM +
|
|
700
|
+
((remaining * avgOut) / 1_000_000) * pricing.outputCostPerM;
|
|
701
|
+
params.onProgress?.(`[summarize] ${index}/${pending.length} thread #${row.number} | cost=$${costSoFar.toFixed(2)} est_total=$${estTotalCost.toFixed(2)} | avg_in=${Math.round(avgIn)} avg_out=${Math.round(avgOut)} | ETA ${etaStr}`);
|
|
702
|
+
}
|
|
703
|
+
else {
|
|
704
|
+
params.onProgress?.(`[summarize] ${index}/${pending.length} thread #${row.number} | avg_in=${Math.round(avgIn)} avg_out=${Math.round(avgOut)} | ETA ${etaStr}`);
|
|
705
|
+
}
|
|
634
706
|
}
|
|
635
707
|
summarized += 1;
|
|
636
708
|
}
|
|
@@ -674,11 +746,25 @@ export class GHCrawlService {
|
|
|
674
746
|
const repository = this.requireRepository(params.owner, params.repo);
|
|
675
747
|
const runId = this.startRun('embedding_runs', repository.id, params.threadNumber ? `thread:${params.threadNumber}` : repository.fullName);
|
|
676
748
|
try {
|
|
677
|
-
|
|
749
|
+
if (params.threadNumber === undefined) {
|
|
750
|
+
if (!this.isRepoVectorStateCurrent(repository.id)) {
|
|
751
|
+
this.resetRepositoryVectors(repository.id, repository.fullName);
|
|
752
|
+
}
|
|
753
|
+
else {
|
|
754
|
+
const pruned = this.pruneInactiveRepositoryVectors(repository.id, repository.fullName);
|
|
755
|
+
if (pruned > 0) {
|
|
756
|
+
params.onProgress?.(`[embed] pruned ${pruned} closed or inactive vector(s) before refresh`);
|
|
757
|
+
}
|
|
758
|
+
}
|
|
759
|
+
}
|
|
760
|
+
const { rows, tasks, pending, missingSummaryThreadNumbers } = this.getEmbeddingWorkset(repository.id, params.threadNumber);
|
|
678
761
|
const skipped = tasks.length - pending.length;
|
|
679
762
|
const truncated = tasks.filter((task) => task.wasTruncated).length;
|
|
680
|
-
|
|
681
|
-
|
|
763
|
+
if (missingSummaryThreadNumbers.length > 0) {
|
|
764
|
+
throw new Error(`Embedding basis ${this.config.embeddingBasis} requires summaries before embedding. Missing summaries for thread(s): ${missingSummaryThreadNumbers.slice(0, 10).join(', ')}${missingSummaryThreadNumbers.length > 10 ? ', …' : ''}.`);
|
|
765
|
+
}
|
|
766
|
+
params.onProgress?.(`[embed] loaded ${rows.length} open thread(s) and ${tasks.length} active vector task(s) for ${repository.fullName}`);
|
|
767
|
+
params.onProgress?.(`[embed] pending=${pending.length} skipped=${skipped} truncated=${truncated} model=${this.config.embedModel} dimensions=${ACTIVE_EMBED_DIMENSIONS} basis=${this.config.embeddingBasis} batch_size=${this.config.embedBatchSize} concurrency=${this.config.embedConcurrency} max_unread=${this.config.embedMaxUnread} max_batch_tokens=${EMBED_MAX_BATCH_TOKENS}`);
|
|
682
768
|
let embedded = 0;
|
|
683
769
|
const batches = this.chunkEmbeddingTasks(pending, this.config.embedBatchSize, EMBED_MAX_BATCH_TOKENS);
|
|
684
770
|
const mapper = new IterableMapper(batches, async (batch) => {
|
|
@@ -690,14 +776,15 @@ export class GHCrawlService {
|
|
|
690
776
|
let completedBatches = 0;
|
|
691
777
|
for await (const batchResult of mapper) {
|
|
692
778
|
completedBatches += 1;
|
|
693
|
-
const numbers = batchResult.map(({ task }) => `#${task.threadNumber}:${task.
|
|
779
|
+
const numbers = batchResult.map(({ task }) => `#${task.threadNumber}:${task.basis}`);
|
|
694
780
|
const estimatedTokens = batchResult.reduce((sum, { task }) => sum + task.estimatedTokens, 0);
|
|
695
781
|
params.onProgress?.(`[embed] batch ${completedBatches}/${Math.max(batches.length, 1)} size=${batchResult.length} est_tokens=${estimatedTokens} items=${numbers.join(',')}`);
|
|
696
782
|
for (const { task, embedding } of batchResult) {
|
|
697
|
-
this.
|
|
783
|
+
this.upsertActiveVector(repository.id, repository.fullName, task.threadId, task.basis, task.contentHash, embedding);
|
|
698
784
|
embedded += 1;
|
|
699
785
|
}
|
|
700
786
|
}
|
|
787
|
+
this.markRepoVectorsCurrent(repository.id);
|
|
701
788
|
this.finishRun('embedding_runs', runId, 'completed', { embedded });
|
|
702
789
|
return embedResultSchema.parse({ runId, embedded });
|
|
703
790
|
}
|
|
@@ -709,16 +796,70 @@ export class GHCrawlService {
|
|
|
709
796
|
async clusterRepository(params) {
|
|
710
797
|
const repository = this.requireRepository(params.owner, params.repo);
|
|
711
798
|
const runId = this.startRun('cluster_runs', repository.id, repository.fullName);
|
|
712
|
-
const minScore = params.minScore ??
|
|
799
|
+
const minScore = params.minScore ?? DEFAULT_CLUSTER_MIN_SCORE;
|
|
713
800
|
const k = params.k ?? 6;
|
|
714
801
|
try {
|
|
715
|
-
|
|
716
|
-
|
|
717
|
-
|
|
718
|
-
|
|
719
|
-
|
|
720
|
-
|
|
721
|
-
|
|
802
|
+
let items;
|
|
803
|
+
let aggregatedEdges;
|
|
804
|
+
if (this.isRepoVectorStateCurrent(repository.id)) {
|
|
805
|
+
const vectorItems = this.loadClusterableActiveVectorMeta(repository.id, repository.fullName);
|
|
806
|
+
const activeIds = new Set(vectorItems.map((item) => item.id));
|
|
807
|
+
const annQuery = this.getVectorliteClusterQuery(vectorItems.length, k);
|
|
808
|
+
aggregatedEdges = new Map();
|
|
809
|
+
let processed = 0;
|
|
810
|
+
let lastProgressAt = Date.now();
|
|
811
|
+
params.onProgress?.(`[cluster] loaded ${vectorItems.length} active vector(s) for ${repository.fullName} backend=${this.config.vectorBackend} k=${k} query_limit=${annQuery.limit} candidateK=${annQuery.candidateK} efSearch=${annQuery.efSearch ?? 'default'} minScore=${minScore}`);
|
|
812
|
+
for (const item of vectorItems) {
|
|
813
|
+
const neighbors = this.vectorStore.queryNearest({
|
|
814
|
+
storePath: this.repoVectorStorePath(repository.fullName),
|
|
815
|
+
dimensions: ACTIVE_EMBED_DIMENSIONS,
|
|
816
|
+
vector: item.embedding,
|
|
817
|
+
limit: annQuery.limit,
|
|
818
|
+
candidateK: annQuery.candidateK + 1,
|
|
819
|
+
efSearch: annQuery.efSearch,
|
|
820
|
+
excludeThreadId: item.id,
|
|
821
|
+
});
|
|
822
|
+
for (const neighbor of neighbors) {
|
|
823
|
+
if (!activeIds.has(neighbor.threadId))
|
|
824
|
+
continue;
|
|
825
|
+
if (neighbor.score < minScore)
|
|
826
|
+
continue;
|
|
827
|
+
const key = this.edgeKey(item.id, neighbor.threadId);
|
|
828
|
+
const existing = aggregatedEdges.get(key);
|
|
829
|
+
if (existing) {
|
|
830
|
+
existing.score = Math.max(existing.score, neighbor.score);
|
|
831
|
+
}
|
|
832
|
+
else {
|
|
833
|
+
aggregatedEdges.set(key, {
|
|
834
|
+
leftThreadId: Math.min(item.id, neighbor.threadId),
|
|
835
|
+
rightThreadId: Math.max(item.id, neighbor.threadId),
|
|
836
|
+
score: neighbor.score,
|
|
837
|
+
sourceKinds: new Set(['dedupe_summary']),
|
|
838
|
+
});
|
|
839
|
+
}
|
|
840
|
+
}
|
|
841
|
+
processed += 1;
|
|
842
|
+
const now = Date.now();
|
|
843
|
+
if (params.onProgress && now - lastProgressAt >= CLUSTER_PROGRESS_INTERVAL_MS) {
|
|
844
|
+
params.onProgress(`[cluster] queried ${processed}/${vectorItems.length} vectors current_edges=${aggregatedEdges.size}`);
|
|
845
|
+
lastProgressAt = now;
|
|
846
|
+
}
|
|
847
|
+
}
|
|
848
|
+
items = vectorItems;
|
|
849
|
+
}
|
|
850
|
+
else if (this.hasLegacyEmbeddings(repository.id)) {
|
|
851
|
+
const legacy = this.loadClusterableThreadMeta(repository.id);
|
|
852
|
+
items = legacy.items;
|
|
853
|
+
params.onProgress?.(`[cluster] loaded ${items.length} legacy embedded thread(s) across ${legacy.sourceKinds.length} source kind(s) for ${repository.fullName} k=${k} minScore=${minScore}`);
|
|
854
|
+
aggregatedEdges = await this.aggregateRepositoryEdges(repository.id, legacy.sourceKinds, {
|
|
855
|
+
limit: k,
|
|
856
|
+
minScore,
|
|
857
|
+
onProgress: params.onProgress,
|
|
858
|
+
});
|
|
859
|
+
}
|
|
860
|
+
else {
|
|
861
|
+
throw new Error(`Vectors for ${repository.fullName} are stale or missing. Run refresh or embed first.`);
|
|
862
|
+
}
|
|
722
863
|
const edges = Array.from(aggregatedEdges.values()).map((entry) => ({
|
|
723
864
|
leftThreadId: entry.leftThreadId,
|
|
724
865
|
rightThreadId: entry.rightThreadId,
|
|
@@ -728,6 +869,10 @@ export class GHCrawlService {
|
|
|
728
869
|
const clusters = buildClusters(items.map((item) => ({ threadId: item.id, number: item.number, title: item.title })), edges);
|
|
729
870
|
this.persistClusterRun(repository.id, runId, aggregatedEdges, clusters);
|
|
730
871
|
this.pruneOldClusterRuns(repository.id, runId);
|
|
872
|
+
if (this.isRepoVectorStateCurrent(repository.id)) {
|
|
873
|
+
this.markRepoClustersCurrent(repository.id);
|
|
874
|
+
this.cleanupMigratedRepositoryArtifacts(repository.id, repository.fullName, params.onProgress);
|
|
875
|
+
}
|
|
731
876
|
params.onProgress?.(`[cluster] persisted ${clusters.length} cluster(s) and pruned older cluster runs`);
|
|
732
877
|
this.finishRun('cluster_runs', runId, 'completed', { edges: edges.length, clusters: clusters.length });
|
|
733
878
|
return clusterResultSchema.parse({ runId, edges: edges.length, clusters: clusters.length });
|
|
@@ -737,6 +882,263 @@ export class GHCrawlService {
|
|
|
737
882
|
throw error;
|
|
738
883
|
}
|
|
739
884
|
}
|
|
885
|
+
clusterExperiment(params) {
|
|
886
|
+
const backend = params.backend ?? 'vectorlite';
|
|
887
|
+
const repository = this.requireRepository(params.owner, params.repo);
|
|
888
|
+
const loaded = this.loadClusterableThreadMeta(repository.id);
|
|
889
|
+
const activeVectors = this.isRepoVectorStateCurrent(repository.id) ? this.loadNormalizedActiveVectors(repository.id) : [];
|
|
890
|
+
const activeSourceKind = this.config.embeddingBasis === 'title_summary' ? 'dedupe_summary' : 'body';
|
|
891
|
+
const useActiveVectors = activeVectors.length > 0 && (params.sourceKinds === undefined || loaded.items.length === 0);
|
|
892
|
+
const sourceKinds = useActiveVectors ? [activeSourceKind] : (params.sourceKinds ?? loaded.sourceKinds);
|
|
893
|
+
const items = useActiveVectors
|
|
894
|
+
? activeVectors.map((item) => ({ id: item.id, number: item.number, title: item.title }))
|
|
895
|
+
: loaded.items;
|
|
896
|
+
const aggregation = params.aggregation ?? 'max';
|
|
897
|
+
const minScore = params.minScore ?? DEFAULT_CLUSTER_MIN_SCORE;
|
|
898
|
+
const k = params.k ?? 6;
|
|
899
|
+
const candidateK = Math.max(k, params.candidateK ?? Math.max(k * 16, 64));
|
|
900
|
+
const efSearch = params.efSearch;
|
|
901
|
+
const startedAt = Date.now();
|
|
902
|
+
const memoryBefore = process.memoryUsage();
|
|
903
|
+
let peakRssBytes = memoryBefore.rss;
|
|
904
|
+
let peakHeapUsedBytes = memoryBefore.heapUsed;
|
|
905
|
+
const recordMemory = () => {
|
|
906
|
+
const usage = process.memoryUsage();
|
|
907
|
+
peakRssBytes = Math.max(peakRssBytes, usage.rss);
|
|
908
|
+
peakHeapUsedBytes = Math.max(peakHeapUsedBytes, usage.heapUsed);
|
|
909
|
+
};
|
|
910
|
+
recordMemory();
|
|
911
|
+
if (useActiveVectors && params.sourceKinds && loaded.items.length === 0) {
|
|
912
|
+
params.onProgress?.(`[cluster-experiment] legacy source embeddings are unavailable for ${repository.fullName}; falling back to active ${this.config.embeddingBasis} vectors`);
|
|
913
|
+
}
|
|
914
|
+
params.onProgress?.(`[cluster-experiment] loaded ${items.length} embedded thread(s) across ${sourceKinds.length} source kind(s) for ${repository.fullName} backend=${backend} k=${k} candidateK=${candidateK} minScore=${minScore} aggregation=${aggregation}`);
|
|
915
|
+
const perSourceScores = new Map();
|
|
916
|
+
let loadMs = 0;
|
|
917
|
+
let setupMs = 0;
|
|
918
|
+
let edgeBuildMs = 0;
|
|
919
|
+
let indexBuildMs = 0;
|
|
920
|
+
let queryMs = 0;
|
|
921
|
+
let clusterBuildMs = 0;
|
|
922
|
+
let tempDbPath = null;
|
|
923
|
+
let tempDb = null;
|
|
924
|
+
let tempDir = null;
|
|
925
|
+
try {
|
|
926
|
+
if (backend === 'exact') {
|
|
927
|
+
if (useActiveVectors) {
|
|
928
|
+
const loadStartedAt = Date.now();
|
|
929
|
+
const normalizedRows = activeVectors.map(({ id, embedding }) => ({ id, normalizedEmbedding: embedding }));
|
|
930
|
+
loadMs += Date.now() - loadStartedAt;
|
|
931
|
+
recordMemory();
|
|
932
|
+
const edgesStartedAt = Date.now();
|
|
933
|
+
const edges = buildSourceKindEdges(normalizedRows, {
|
|
934
|
+
limit: k,
|
|
935
|
+
minScore,
|
|
936
|
+
progressIntervalMs: CLUSTER_PROGRESS_INTERVAL_MS,
|
|
937
|
+
onProgress: (progress) => {
|
|
938
|
+
recordMemory();
|
|
939
|
+
if (!params.onProgress)
|
|
940
|
+
return;
|
|
941
|
+
params.onProgress(`[cluster-experiment] exact ${progress.processedItems}/${normalizedRows.length} active vectors processed current_edges~=${perSourceScores.size + progress.currentEdgeEstimate}`);
|
|
942
|
+
},
|
|
943
|
+
});
|
|
944
|
+
edgeBuildMs += Date.now() - edgesStartedAt;
|
|
945
|
+
this.collectSourceKindScores(perSourceScores, edges, activeSourceKind);
|
|
946
|
+
recordMemory();
|
|
947
|
+
}
|
|
948
|
+
else {
|
|
949
|
+
const totalItems = sourceKinds.reduce((sum, sourceKind) => sum + this.countEmbeddingsForSourceKind(repository.id, sourceKind), 0);
|
|
950
|
+
let processedItems = 0;
|
|
951
|
+
for (const sourceKind of sourceKinds) {
|
|
952
|
+
const loadStartedAt = Date.now();
|
|
953
|
+
const normalizedRows = this.loadNormalizedEmbeddingsForSourceKind(repository.id, sourceKind);
|
|
954
|
+
loadMs += Date.now() - loadStartedAt;
|
|
955
|
+
recordMemory();
|
|
956
|
+
const edgesStartedAt = Date.now();
|
|
957
|
+
const edges = buildSourceKindEdges(normalizedRows, {
|
|
958
|
+
limit: k,
|
|
959
|
+
minScore,
|
|
960
|
+
progressIntervalMs: CLUSTER_PROGRESS_INTERVAL_MS,
|
|
961
|
+
onProgress: (progress) => {
|
|
962
|
+
recordMemory();
|
|
963
|
+
if (!params.onProgress)
|
|
964
|
+
return;
|
|
965
|
+
params.onProgress(`[cluster-experiment] exact ${processedItems + progress.processedItems}/${totalItems} source embeddings processed current_edges~=${perSourceScores.size + progress.currentEdgeEstimate}`);
|
|
966
|
+
},
|
|
967
|
+
});
|
|
968
|
+
edgeBuildMs += Date.now() - edgesStartedAt;
|
|
969
|
+
processedItems += normalizedRows.length;
|
|
970
|
+
this.collectSourceKindScores(perSourceScores, edges, sourceKind);
|
|
971
|
+
recordMemory();
|
|
972
|
+
}
|
|
973
|
+
}
|
|
974
|
+
}
|
|
975
|
+
else {
|
|
976
|
+
const setupStartedAt = Date.now();
|
|
977
|
+
tempDir = fs.mkdtempSync(path.join(os.tmpdir(), 'ghcrawl-vectorlite-'));
|
|
978
|
+
tempDbPath = path.join(tempDir, 'cluster-experiment.db');
|
|
979
|
+
tempDb = openDb(tempDbPath);
|
|
980
|
+
tempDb.pragma('journal_mode = MEMORY');
|
|
981
|
+
tempDb.pragma('synchronous = OFF');
|
|
982
|
+
tempDb.pragma('temp_store = MEMORY');
|
|
983
|
+
const vectorlite = requireFromHere('vectorlite');
|
|
984
|
+
tempDb.loadExtension(vectorlite.vectorlitePath());
|
|
985
|
+
setupMs += Date.now() - setupStartedAt;
|
|
986
|
+
recordMemory();
|
|
987
|
+
const vectorSources = useActiveVectors
|
|
988
|
+
? [
|
|
989
|
+
{
|
|
990
|
+
sourceKind: activeSourceKind,
|
|
991
|
+
rows: activeVectors.map(({ id, embedding }) => ({ id, normalizedEmbedding: embedding })),
|
|
992
|
+
},
|
|
993
|
+
]
|
|
994
|
+
: sourceKinds.map((sourceKind) => ({
|
|
995
|
+
sourceKind,
|
|
996
|
+
rows: this.loadNormalizedEmbeddingsForSourceKind(repository.id, sourceKind).map((row) => ({
|
|
997
|
+
id: row.id,
|
|
998
|
+
normalizedEmbedding: row.normalizedEmbedding,
|
|
999
|
+
})),
|
|
1000
|
+
}));
|
|
1001
|
+
for (const source of vectorSources) {
|
|
1002
|
+
const sourceRowCount = source.rows.length;
|
|
1003
|
+
if (sourceRowCount === 0) {
|
|
1004
|
+
continue;
|
|
1005
|
+
}
|
|
1006
|
+
const dimension = source.rows[0].normalizedEmbedding.length;
|
|
1007
|
+
const safeCandidateK = Math.min(candidateK, Math.max(1, sourceRowCount - 1));
|
|
1008
|
+
const tableName = `vector_${source.sourceKind}`;
|
|
1009
|
+
params.onProgress?.(`[cluster-experiment] building ${source.sourceKind} HNSW index with ${sourceRowCount} vector(s)`);
|
|
1010
|
+
const indexStartedAt = Date.now();
|
|
1011
|
+
tempDb.exec(`create virtual table ${tableName} using vectorlite(vec float32[${dimension}], hnsw(max_elements=${sourceRowCount}));`);
|
|
1012
|
+
const insert = tempDb.prepare(`insert into ${tableName}(rowid, vec) values (?, ?)`);
|
|
1013
|
+
tempDb.transaction(() => {
|
|
1014
|
+
const loadStartedAt = Date.now();
|
|
1015
|
+
for (const row of source.rows) {
|
|
1016
|
+
insert.run(row.id, this.normalizedEmbeddingBuffer(row.normalizedEmbedding));
|
|
1017
|
+
}
|
|
1018
|
+
loadMs += Date.now() - loadStartedAt;
|
|
1019
|
+
})();
|
|
1020
|
+
indexBuildMs += Date.now() - indexStartedAt;
|
|
1021
|
+
recordMemory();
|
|
1022
|
+
const queryStartedAt = Date.now();
|
|
1023
|
+
const querySql = efSearch !== undefined
|
|
1024
|
+
? `select rowid, distance from ${tableName} where knn_search(vec, knn_param(?, ${safeCandidateK + 1}, ${efSearch}))`
|
|
1025
|
+
: `select rowid, distance from ${tableName} where knn_search(vec, knn_param(?, ${safeCandidateK + 1}))`;
|
|
1026
|
+
const query = tempDb.prepare(querySql);
|
|
1027
|
+
let processed = 0;
|
|
1028
|
+
let lastProgressAt = Date.now();
|
|
1029
|
+
const queryLoadStartedAt = Date.now();
|
|
1030
|
+
for (const row of source.rows) {
|
|
1031
|
+
const candidates = query.all(this.normalizedEmbeddingBuffer(row.normalizedEmbedding));
|
|
1032
|
+
const ranked = rankNearestNeighborsByScore(candidates, {
|
|
1033
|
+
limit: k,
|
|
1034
|
+
minScore,
|
|
1035
|
+
score: (candidate) => {
|
|
1036
|
+
if (candidate.rowid === row.id) {
|
|
1037
|
+
return -1;
|
|
1038
|
+
}
|
|
1039
|
+
return this.normalizedDistanceToScore(candidate.distance);
|
|
1040
|
+
},
|
|
1041
|
+
});
|
|
1042
|
+
let addedThisRow = 0;
|
|
1043
|
+
for (const candidate of ranked) {
|
|
1044
|
+
const score = candidate.score;
|
|
1045
|
+
const key = this.edgeKey(row.id, candidate.item.rowid);
|
|
1046
|
+
const existing = perSourceScores.get(key);
|
|
1047
|
+
if (existing) {
|
|
1048
|
+
existing.scores.set(source.sourceKind, Math.max(existing.scores.get(source.sourceKind) ?? -1, score));
|
|
1049
|
+
continue;
|
|
1050
|
+
}
|
|
1051
|
+
const scores = new Map();
|
|
1052
|
+
scores.set(source.sourceKind, score);
|
|
1053
|
+
perSourceScores.set(key, {
|
|
1054
|
+
leftThreadId: Math.min(row.id, candidate.item.rowid),
|
|
1055
|
+
rightThreadId: Math.max(row.id, candidate.item.rowid),
|
|
1056
|
+
scores,
|
|
1057
|
+
});
|
|
1058
|
+
addedThisRow += 1;
|
|
1059
|
+
}
|
|
1060
|
+
processed += 1;
|
|
1061
|
+
const now = Date.now();
|
|
1062
|
+
if (params.onProgress && now - lastProgressAt >= CLUSTER_PROGRESS_INTERVAL_MS) {
|
|
1063
|
+
recordMemory();
|
|
1064
|
+
params.onProgress(`[cluster-experiment] querying ${source.sourceKind} index ${processed}/${sourceRowCount} current_edges=${perSourceScores.size} added_this_step=${addedThisRow}`);
|
|
1065
|
+
lastProgressAt = now;
|
|
1066
|
+
}
|
|
1067
|
+
}
|
|
1068
|
+
loadMs += Date.now() - queryLoadStartedAt;
|
|
1069
|
+
queryMs += Date.now() - queryStartedAt;
|
|
1070
|
+
tempDb.exec(`drop table ${tableName}`);
|
|
1071
|
+
recordMemory();
|
|
1072
|
+
}
|
|
1073
|
+
}
|
|
1074
|
+
// Finalize edge scores using the configured aggregation method
|
|
1075
|
+
const defaultWeights = { dedupe_summary: 0.5, title: 0.3, body: 0.2 };
|
|
1076
|
+
const weights = { ...defaultWeights, ...(params.aggregationWeights ?? {}) };
|
|
1077
|
+
const aggregated = this.finalizeEdgeScores(perSourceScores, aggregation, weights, minScore);
|
|
1078
|
+
params.onProgress?.(`[cluster-experiment] finalized ${aggregated.length} edges from ${perSourceScores.size} candidate pairs using ${aggregation} aggregation`);
|
|
1079
|
+
const clusterStartedAt = Date.now();
|
|
1080
|
+
const clusterNodes = items.map((item) => ({ threadId: item.id, number: item.number, title: item.title }));
|
|
1081
|
+
const clusterEdges = aggregated;
|
|
1082
|
+
const clusterMode = params.clusterMode ?? (params.maxClusterSize !== undefined ? 'refine' : 'basic');
|
|
1083
|
+
const clusters = clusterMode === 'bounded'
|
|
1084
|
+
? buildSizeBoundedClusters(clusterNodes, clusterEdges, {
|
|
1085
|
+
maxClusterSize: params.maxClusterSize ?? 200,
|
|
1086
|
+
})
|
|
1087
|
+
: clusterMode === 'refine'
|
|
1088
|
+
? buildRefinedClusters(clusterNodes, clusterEdges, {
|
|
1089
|
+
maxClusterSize: params.maxClusterSize ?? 200,
|
|
1090
|
+
refineStep: params.refineStep ?? 0.02,
|
|
1091
|
+
})
|
|
1092
|
+
: buildClusters(clusterNodes, clusterEdges);
|
|
1093
|
+
clusterBuildMs += Date.now() - clusterStartedAt;
|
|
1094
|
+
recordMemory();
|
|
1095
|
+
const memoryAfter = process.memoryUsage();
|
|
1096
|
+
const durationMs = backend === 'vectorlite'
|
|
1097
|
+
? indexBuildMs + queryMs + clusterBuildMs
|
|
1098
|
+
: edgeBuildMs + clusterBuildMs;
|
|
1099
|
+
const totalDurationMs = Date.now() - startedAt;
|
|
1100
|
+
return {
|
|
1101
|
+
backend,
|
|
1102
|
+
repository,
|
|
1103
|
+
tempDbPath,
|
|
1104
|
+
threads: items.length,
|
|
1105
|
+
sourceKinds: sourceKinds.length,
|
|
1106
|
+
edges: aggregated.length,
|
|
1107
|
+
clusters: clusters.length,
|
|
1108
|
+
timingBasis: 'cluster-only',
|
|
1109
|
+
durationMs,
|
|
1110
|
+
totalDurationMs,
|
|
1111
|
+
loadMs,
|
|
1112
|
+
setupMs,
|
|
1113
|
+
edgeBuildMs,
|
|
1114
|
+
indexBuildMs,
|
|
1115
|
+
queryMs,
|
|
1116
|
+
clusterBuildMs,
|
|
1117
|
+
candidateK,
|
|
1118
|
+
memory: {
|
|
1119
|
+
rssBeforeBytes: memoryBefore.rss,
|
|
1120
|
+
rssAfterBytes: memoryAfter.rss,
|
|
1121
|
+
peakRssBytes,
|
|
1122
|
+
heapUsedBeforeBytes: memoryBefore.heapUsed,
|
|
1123
|
+
heapUsedAfterBytes: memoryAfter.heapUsed,
|
|
1124
|
+
peakHeapUsedBytes,
|
|
1125
|
+
},
|
|
1126
|
+
clusterSizes: this.summarizeClusterSizes(clusters),
|
|
1127
|
+
clustersDetail: params.includeClusters
|
|
1128
|
+
? clusters.map((cluster) => ({
|
|
1129
|
+
representativeThreadId: cluster.representativeThreadId,
|
|
1130
|
+
memberThreadIds: [...cluster.members],
|
|
1131
|
+
}))
|
|
1132
|
+
: null,
|
|
1133
|
+
};
|
|
1134
|
+
}
|
|
1135
|
+
finally {
|
|
1136
|
+
tempDb?.close();
|
|
1137
|
+
if (tempDir) {
|
|
1138
|
+
fs.rmSync(tempDir, { recursive: true, force: true });
|
|
1139
|
+
}
|
|
1140
|
+
}
|
|
1141
|
+
}
|
|
740
1142
|
async searchRepository(params) {
|
|
741
1143
|
const mode = params.mode ?? 'hybrid';
|
|
742
1144
|
const repository = this.requireRepository(params.owner, params.repo);
|
|
@@ -758,13 +1160,33 @@ export class GHCrawlService {
|
|
|
758
1160
|
}
|
|
759
1161
|
}
|
|
760
1162
|
if (mode !== 'keyword' && this.ai) {
|
|
761
|
-
|
|
762
|
-
|
|
763
|
-
|
|
764
|
-
|
|
765
|
-
|
|
766
|
-
|
|
767
|
-
|
|
1163
|
+
if (this.isRepoVectorStateCurrent(repository.id)) {
|
|
1164
|
+
const [queryEmbedding] = await this.ai.embedTexts({
|
|
1165
|
+
model: this.config.embedModel,
|
|
1166
|
+
texts: [params.query],
|
|
1167
|
+
dimensions: ACTIVE_EMBED_DIMENSIONS,
|
|
1168
|
+
});
|
|
1169
|
+
const neighbors = this.vectorStore.queryNearest({
|
|
1170
|
+
storePath: this.repoVectorStorePath(repository.fullName),
|
|
1171
|
+
dimensions: ACTIVE_EMBED_DIMENSIONS,
|
|
1172
|
+
vector: queryEmbedding,
|
|
1173
|
+
limit: limit * 2,
|
|
1174
|
+
candidateK: Math.max(limit * 8, 64),
|
|
1175
|
+
});
|
|
1176
|
+
for (const neighbor of neighbors) {
|
|
1177
|
+
if (neighbor.score < 0.2)
|
|
1178
|
+
continue;
|
|
1179
|
+
semanticScores.set(neighbor.threadId, Math.max(semanticScores.get(neighbor.threadId) ?? -1, neighbor.score));
|
|
1180
|
+
}
|
|
1181
|
+
}
|
|
1182
|
+
else if (this.hasLegacyEmbeddings(repository.id)) {
|
|
1183
|
+
const [queryEmbedding] = await this.ai.embedTexts({ model: this.config.embedModel, texts: [params.query] });
|
|
1184
|
+
for (const row of this.iterateStoredEmbeddings(repository.id)) {
|
|
1185
|
+
const score = cosineSimilarity(queryEmbedding, JSON.parse(row.embedding_json));
|
|
1186
|
+
if (score < 0.2)
|
|
1187
|
+
continue;
|
|
1188
|
+
semanticScores.set(row.id, Math.max(semanticScores.get(row.id) ?? -1, score));
|
|
1189
|
+
}
|
|
768
1190
|
}
|
|
769
1191
|
}
|
|
770
1192
|
const candidateIds = new Set([...keywordScores.keys(), ...semanticScores.keys()]);
|
|
@@ -832,44 +1254,97 @@ export class GHCrawlService {
|
|
|
832
1254
|
const repository = this.requireRepository(params.owner, params.repo);
|
|
833
1255
|
const limit = params.limit ?? 10;
|
|
834
1256
|
const minScore = params.minScore ?? 0.2;
|
|
835
|
-
const
|
|
836
|
-
|
|
837
|
-
|
|
838
|
-
|
|
839
|
-
|
|
840
|
-
|
|
841
|
-
|
|
842
|
-
|
|
843
|
-
|
|
1257
|
+
const targetRow = this.db
|
|
1258
|
+
.prepare(`select t.*, tv.basis, tv.model, tv.dimensions, tv.content_hash, tv.vector_json, tv.vector_backend
|
|
1259
|
+
from threads t
|
|
1260
|
+
join thread_vectors tv on tv.thread_id = t.id
|
|
1261
|
+
where t.repo_id = ?
|
|
1262
|
+
and t.number = ?
|
|
1263
|
+
and t.state = 'open'
|
|
1264
|
+
and t.closed_at_local is null
|
|
1265
|
+
and tv.model = ?
|
|
1266
|
+
and tv.basis = ?
|
|
1267
|
+
and tv.dimensions = ?
|
|
1268
|
+
limit 1`)
|
|
1269
|
+
.get(repository.id, params.threadNumber, this.config.embedModel, this.config.embeddingBasis, ACTIVE_EMBED_DIMENSIONS);
|
|
1270
|
+
let responseThread;
|
|
1271
|
+
let neighbors;
|
|
1272
|
+
if (targetRow) {
|
|
1273
|
+
responseThread = targetRow;
|
|
1274
|
+
const candidateRows = this.vectorStore
|
|
1275
|
+
.queryNearest({
|
|
1276
|
+
storePath: this.repoVectorStorePath(repository.fullName),
|
|
1277
|
+
dimensions: ACTIVE_EMBED_DIMENSIONS,
|
|
1278
|
+
vector: this.parseStoredVector(targetRow.vector_json),
|
|
1279
|
+
limit: limit * 2,
|
|
1280
|
+
candidateK: Math.max(limit * 8, 64),
|
|
1281
|
+
excludeThreadId: targetRow.id,
|
|
1282
|
+
})
|
|
1283
|
+
.filter((row) => row.score >= minScore);
|
|
1284
|
+
const candidateIds = candidateRows.map((row) => row.threadId);
|
|
1285
|
+
const neighborMeta = candidateIds.length
|
|
1286
|
+
? this.db
|
|
1287
|
+
.prepare(`select * from threads
|
|
1288
|
+
where repo_id = ? and state = 'open' and closed_at_local is null and id in (${candidateIds.map(() => '?').join(',')})`)
|
|
1289
|
+
.all(repository.id, ...candidateIds)
|
|
1290
|
+
: [];
|
|
1291
|
+
const metaById = new Map(neighborMeta.map((row) => [row.id, row]));
|
|
1292
|
+
neighbors = candidateRows
|
|
1293
|
+
.map((row) => {
|
|
1294
|
+
const meta = metaById.get(row.threadId);
|
|
1295
|
+
if (!meta) {
|
|
1296
|
+
return null;
|
|
1297
|
+
}
|
|
1298
|
+
return {
|
|
1299
|
+
threadId: row.threadId,
|
|
1300
|
+
number: meta.number,
|
|
1301
|
+
kind: meta.kind,
|
|
1302
|
+
title: meta.title,
|
|
1303
|
+
score: row.score,
|
|
1304
|
+
};
|
|
1305
|
+
})
|
|
1306
|
+
.filter((row) => row !== null)
|
|
1307
|
+
.slice(0, limit);
|
|
844
1308
|
}
|
|
845
|
-
|
|
846
|
-
|
|
847
|
-
if (
|
|
848
|
-
|
|
849
|
-
const targetEmbedding = targetBySource.get(row.source_kind);
|
|
850
|
-
if (!targetEmbedding)
|
|
851
|
-
continue;
|
|
852
|
-
const score = cosineSimilarity(targetEmbedding, row.embedding);
|
|
853
|
-
if (score < minScore)
|
|
854
|
-
continue;
|
|
855
|
-
const previous = aggregated.get(row.id);
|
|
856
|
-
if (!previous || score > previous.score) {
|
|
857
|
-
aggregated.set(row.id, { number: row.number, kind: row.kind, title: row.title, score });
|
|
1309
|
+
else {
|
|
1310
|
+
const targetRows = this.loadStoredEmbeddingsForThreadNumber(repository.id, params.threadNumber);
|
|
1311
|
+
if (targetRows.length === 0) {
|
|
1312
|
+
throw new Error(`Thread #${params.threadNumber} for ${repository.fullName} was not found with an embedding. Run embed first.`);
|
|
858
1313
|
}
|
|
1314
|
+
responseThread = targetRows[0];
|
|
1315
|
+
const targetBySource = new Map();
|
|
1316
|
+
for (const row of targetRows) {
|
|
1317
|
+
targetBySource.set(row.source_kind, JSON.parse(row.embedding_json));
|
|
1318
|
+
}
|
|
1319
|
+
const aggregated = new Map();
|
|
1320
|
+
for (const row of this.iterateStoredEmbeddings(repository.id)) {
|
|
1321
|
+
if (row.id === responseThread.id)
|
|
1322
|
+
continue;
|
|
1323
|
+
const targetEmbedding = targetBySource.get(row.source_kind);
|
|
1324
|
+
if (!targetEmbedding)
|
|
1325
|
+
continue;
|
|
1326
|
+
const score = cosineSimilarity(targetEmbedding, JSON.parse(row.embedding_json));
|
|
1327
|
+
if (score < minScore)
|
|
1328
|
+
continue;
|
|
1329
|
+
const previous = aggregated.get(row.id);
|
|
1330
|
+
if (!previous || score > previous.score) {
|
|
1331
|
+
aggregated.set(row.id, { number: row.number, kind: row.kind, title: row.title, score });
|
|
1332
|
+
}
|
|
1333
|
+
}
|
|
1334
|
+
neighbors = Array.from(aggregated.entries())
|
|
1335
|
+
.map(([threadId, value]) => ({
|
|
1336
|
+
threadId,
|
|
1337
|
+
number: value.number,
|
|
1338
|
+
kind: value.kind,
|
|
1339
|
+
title: value.title,
|
|
1340
|
+
score: value.score,
|
|
1341
|
+
}))
|
|
1342
|
+
.sort((left, right) => right.score - left.score)
|
|
1343
|
+
.slice(0, limit);
|
|
859
1344
|
}
|
|
860
|
-
const neighbors = Array.from(aggregated.entries())
|
|
861
|
-
.map(([threadId, value]) => ({
|
|
862
|
-
threadId,
|
|
863
|
-
number: value.number,
|
|
864
|
-
kind: value.kind,
|
|
865
|
-
title: value.title,
|
|
866
|
-
score: value.score,
|
|
867
|
-
}))
|
|
868
|
-
.sort((left, right) => right.score - left.score)
|
|
869
|
-
.slice(0, limit);
|
|
870
1345
|
return neighborsResponseSchema.parse({
|
|
871
1346
|
repository,
|
|
872
|
-
thread: threadToDto(
|
|
1347
|
+
thread: threadToDto(responseThread),
|
|
873
1348
|
neighbors,
|
|
874
1349
|
});
|
|
875
1350
|
}
|
|
@@ -946,6 +1421,14 @@ export class GHCrawlService {
|
|
|
946
1421
|
onProgress: params.onProgress,
|
|
947
1422
|
});
|
|
948
1423
|
}
|
|
1424
|
+
if (selected.embed && this.config.embeddingBasis === 'title_summary') {
|
|
1425
|
+
params.onProgress?.(`[refresh] embedding basis ${this.config.embeddingBasis} requires summaries; running summarize before embed`);
|
|
1426
|
+
await this.summarizeRepository({
|
|
1427
|
+
owner: params.owner,
|
|
1428
|
+
repo: params.repo,
|
|
1429
|
+
onProgress: params.onProgress,
|
|
1430
|
+
});
|
|
1431
|
+
}
|
|
949
1432
|
if (selected.embed) {
|
|
950
1433
|
embed = await this.embedRepository({
|
|
951
1434
|
owner: params.owner,
|
|
@@ -1150,9 +1633,9 @@ export class GHCrawlService {
|
|
|
1150
1633
|
const summaryRows = this.db
|
|
1151
1634
|
.prepare(`select summary_kind, summary_text
|
|
1152
1635
|
from document_summaries
|
|
1153
|
-
where thread_id = ? and model = ?
|
|
1636
|
+
where thread_id = ? and model = ? and prompt_version = ?
|
|
1154
1637
|
order by summary_kind asc`)
|
|
1155
|
-
.all(row.id, this.config.summaryModel);
|
|
1638
|
+
.all(row.id, this.config.summaryModel, SUMMARY_PROMPT_VERSION);
|
|
1156
1639
|
const summaries = {};
|
|
1157
1640
|
for (const summary of summaryRows) {
|
|
1158
1641
|
if (summary.summary_kind === 'problem_summary' ||
|
|
@@ -1314,7 +1797,178 @@ export class GHCrawlService {
|
|
|
1314
1797
|
latestClusterRunFinishedAt: latestRun?.finished_at ?? null,
|
|
1315
1798
|
};
|
|
1316
1799
|
}
|
|
1800
|
+
getDesiredPipelineState() {
|
|
1801
|
+
return {
|
|
1802
|
+
summary_model: this.config.summaryModel,
|
|
1803
|
+
summary_prompt_version: SUMMARY_PROMPT_VERSION,
|
|
1804
|
+
embedding_basis: this.config.embeddingBasis,
|
|
1805
|
+
embed_model: this.config.embedModel,
|
|
1806
|
+
embed_dimensions: ACTIVE_EMBED_DIMENSIONS,
|
|
1807
|
+
embed_pipeline_version: ACTIVE_EMBED_PIPELINE_VERSION,
|
|
1808
|
+
vector_backend: this.config.vectorBackend,
|
|
1809
|
+
};
|
|
1810
|
+
}
|
|
1811
|
+
getRepoPipelineState(repoId) {
|
|
1812
|
+
return (this.db.prepare('select * from repo_pipeline_state where repo_id = ? limit 1').get(repoId) ??
|
|
1813
|
+
null);
|
|
1814
|
+
}
|
|
1815
|
+
isRepoVectorStateCurrent(repoId) {
|
|
1816
|
+
const state = this.getRepoPipelineState(repoId);
|
|
1817
|
+
if (!state || !state.vectors_current_at) {
|
|
1818
|
+
return false;
|
|
1819
|
+
}
|
|
1820
|
+
const desired = this.getDesiredPipelineState();
|
|
1821
|
+
return (state.summary_model === desired.summary_model &&
|
|
1822
|
+
state.summary_prompt_version === desired.summary_prompt_version &&
|
|
1823
|
+
state.embedding_basis === desired.embedding_basis &&
|
|
1824
|
+
state.embed_model === desired.embed_model &&
|
|
1825
|
+
state.embed_dimensions === desired.embed_dimensions &&
|
|
1826
|
+
state.embed_pipeline_version === desired.embed_pipeline_version &&
|
|
1827
|
+
state.vector_backend === desired.vector_backend);
|
|
1828
|
+
}
|
|
1829
|
+
isRepoClusterStateCurrent(repoId) {
|
|
1830
|
+
const state = this.getRepoPipelineState(repoId);
|
|
1831
|
+
return this.isRepoVectorStateCurrent(repoId) && Boolean(state?.clusters_current_at);
|
|
1832
|
+
}
|
|
1833
|
+
hasLegacyEmbeddings(repoId) {
|
|
1834
|
+
const row = this.db
|
|
1835
|
+
.prepare(`select count(*) as count
|
|
1836
|
+
from document_embeddings e
|
|
1837
|
+
join threads t on t.id = e.thread_id
|
|
1838
|
+
where t.repo_id = ?
|
|
1839
|
+
and t.state = 'open'
|
|
1840
|
+
and t.closed_at_local is null
|
|
1841
|
+
and e.model = ?`)
|
|
1842
|
+
.get(repoId, this.config.embedModel);
|
|
1843
|
+
return row.count > 0;
|
|
1844
|
+
}
|
|
1845
|
+
writeRepoPipelineState(repoId, overrides) {
|
|
1846
|
+
const desired = this.getDesiredPipelineState();
|
|
1847
|
+
const current = this.getRepoPipelineState(repoId);
|
|
1848
|
+
this.db
|
|
1849
|
+
.prepare(`insert into repo_pipeline_state (
|
|
1850
|
+
repo_id,
|
|
1851
|
+
summary_model,
|
|
1852
|
+
summary_prompt_version,
|
|
1853
|
+
embedding_basis,
|
|
1854
|
+
embed_model,
|
|
1855
|
+
embed_dimensions,
|
|
1856
|
+
embed_pipeline_version,
|
|
1857
|
+
vector_backend,
|
|
1858
|
+
vectors_current_at,
|
|
1859
|
+
clusters_current_at,
|
|
1860
|
+
updated_at
|
|
1861
|
+
) values (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
|
|
1862
|
+
on conflict(repo_id) do update set
|
|
1863
|
+
summary_model = excluded.summary_model,
|
|
1864
|
+
summary_prompt_version = excluded.summary_prompt_version,
|
|
1865
|
+
embedding_basis = excluded.embedding_basis,
|
|
1866
|
+
embed_model = excluded.embed_model,
|
|
1867
|
+
embed_dimensions = excluded.embed_dimensions,
|
|
1868
|
+
embed_pipeline_version = excluded.embed_pipeline_version,
|
|
1869
|
+
vector_backend = excluded.vector_backend,
|
|
1870
|
+
vectors_current_at = excluded.vectors_current_at,
|
|
1871
|
+
clusters_current_at = excluded.clusters_current_at,
|
|
1872
|
+
updated_at = excluded.updated_at`)
|
|
1873
|
+
.run(repoId, desired.summary_model, desired.summary_prompt_version, desired.embedding_basis, desired.embed_model, desired.embed_dimensions, desired.embed_pipeline_version, desired.vector_backend, overrides.vectors_current_at ?? current?.vectors_current_at ?? null, overrides.clusters_current_at ?? current?.clusters_current_at ?? null, nowIso());
|
|
1874
|
+
}
|
|
1875
|
+
markRepoVectorsCurrent(repoId) {
|
|
1876
|
+
this.writeRepoPipelineState(repoId, {
|
|
1877
|
+
vectors_current_at: nowIso(),
|
|
1878
|
+
clusters_current_at: null,
|
|
1879
|
+
});
|
|
1880
|
+
}
|
|
1881
|
+
markRepoClustersCurrent(repoId) {
|
|
1882
|
+
const state = this.getRepoPipelineState(repoId);
|
|
1883
|
+
this.writeRepoPipelineState(repoId, {
|
|
1884
|
+
vectors_current_at: state?.vectors_current_at ?? nowIso(),
|
|
1885
|
+
clusters_current_at: nowIso(),
|
|
1886
|
+
});
|
|
1887
|
+
}
|
|
1888
|
+
repoVectorStorePath(repoFullName) {
|
|
1889
|
+
const safeName = repoFullName.replace(/[^a-zA-Z0-9._-]+/g, '__');
|
|
1890
|
+
return path.join(this.config.configDir, 'vectors', `${safeName}.sqlite`);
|
|
1891
|
+
}
|
|
1892
|
+
resetRepositoryVectors(repoId, repoFullName) {
|
|
1893
|
+
this.db
|
|
1894
|
+
.prepare(`delete from thread_vectors
|
|
1895
|
+
where thread_id in (select id from threads where repo_id = ?)`)
|
|
1896
|
+
.run(repoId);
|
|
1897
|
+
this.vectorStore.resetRepository({
|
|
1898
|
+
storePath: this.repoVectorStorePath(repoFullName),
|
|
1899
|
+
dimensions: ACTIVE_EMBED_DIMENSIONS,
|
|
1900
|
+
});
|
|
1901
|
+
this.writeRepoPipelineState(repoId, {
|
|
1902
|
+
vectors_current_at: null,
|
|
1903
|
+
clusters_current_at: null,
|
|
1904
|
+
});
|
|
1905
|
+
}
|
|
1906
|
+
pruneInactiveRepositoryVectors(repoId, repoFullName) {
|
|
1907
|
+
const rows = this.db
|
|
1908
|
+
.prepare(`select tv.thread_id
|
|
1909
|
+
from thread_vectors tv
|
|
1910
|
+
join threads t on t.id = tv.thread_id
|
|
1911
|
+
where t.repo_id = ?
|
|
1912
|
+
and (t.state != 'open' or t.closed_at_local is not null)`)
|
|
1913
|
+
.all(repoId);
|
|
1914
|
+
if (rows.length === 0) {
|
|
1915
|
+
return 0;
|
|
1916
|
+
}
|
|
1917
|
+
const deleteVectorRow = this.db.prepare('delete from thread_vectors where thread_id = ?');
|
|
1918
|
+
this.db.transaction(() => {
|
|
1919
|
+
for (const row of rows) {
|
|
1920
|
+
deleteVectorRow.run(row.thread_id);
|
|
1921
|
+
this.vectorStore.deleteVector({
|
|
1922
|
+
storePath: this.repoVectorStorePath(repoFullName),
|
|
1923
|
+
dimensions: ACTIVE_EMBED_DIMENSIONS,
|
|
1924
|
+
threadId: row.thread_id,
|
|
1925
|
+
});
|
|
1926
|
+
}
|
|
1927
|
+
})();
|
|
1928
|
+
return rows.length;
|
|
1929
|
+
}
|
|
1930
|
+
cleanupMigratedRepositoryArtifacts(repoId, repoFullName, onProgress) {
|
|
1931
|
+
const legacyEmbeddingCount = this.countLegacyEmbeddings(repoId);
|
|
1932
|
+
const inlineJsonVectorCount = this.countInlineJsonThreadVectors(repoId);
|
|
1933
|
+
if (legacyEmbeddingCount === 0 && inlineJsonVectorCount === 0) {
|
|
1934
|
+
return;
|
|
1935
|
+
}
|
|
1936
|
+
if (legacyEmbeddingCount > 0) {
|
|
1937
|
+
this.db
|
|
1938
|
+
.prepare(`delete from document_embeddings
|
|
1939
|
+
where thread_id in (select id from threads where repo_id = ?)`)
|
|
1940
|
+
.run(repoId);
|
|
1941
|
+
onProgress?.(`[cleanup] removed ${legacyEmbeddingCount} legacy document embedding row(s) after vector migration`);
|
|
1942
|
+
}
|
|
1943
|
+
if (inlineJsonVectorCount > 0) {
|
|
1944
|
+
const rows = this.db
|
|
1945
|
+
.prepare(`select tv.thread_id, tv.vector_json
|
|
1946
|
+
from thread_vectors tv
|
|
1947
|
+
join threads t on t.id = tv.thread_id
|
|
1948
|
+
where t.repo_id = ?
|
|
1949
|
+
and typeof(tv.vector_json) = 'text'
|
|
1950
|
+
and tv.vector_json != ''`)
|
|
1951
|
+
.all(repoId);
|
|
1952
|
+
const update = this.db.prepare('update thread_vectors set vector_json = ?, updated_at = ? where thread_id = ?');
|
|
1953
|
+
this.db.transaction(() => {
|
|
1954
|
+
for (const row of rows) {
|
|
1955
|
+
update.run(this.vectorBlob(JSON.parse(row.vector_json)), nowIso(), row.thread_id);
|
|
1956
|
+
}
|
|
1957
|
+
})();
|
|
1958
|
+
onProgress?.(`[cleanup] compacted ${inlineJsonVectorCount} inline SQLite vector payload(s) from JSON to binary blobs`);
|
|
1959
|
+
}
|
|
1960
|
+
if (this.config.dbPath !== ':memory:') {
|
|
1961
|
+
onProgress?.(`[cleanup] checkpointing WAL and vacuuming ${repoFullName} migration changes`);
|
|
1962
|
+
this.db.pragma('wal_checkpoint(TRUNCATE)');
|
|
1963
|
+
this.db.exec('VACUUM');
|
|
1964
|
+
this.db.pragma('wal_checkpoint(TRUNCATE)');
|
|
1965
|
+
}
|
|
1966
|
+
}
|
|
1317
1967
|
getLatestClusterRun(repoId) {
|
|
1968
|
+
const state = this.getRepoPipelineState(repoId);
|
|
1969
|
+
if (state && !this.isRepoClusterStateCurrent(repoId)) {
|
|
1970
|
+
return null;
|
|
1971
|
+
}
|
|
1318
1972
|
return (this.db
|
|
1319
1973
|
.prepare("select id, finished_at from cluster_runs where repo_id = ? and status = 'completed' order by id desc limit 1")
|
|
1320
1974
|
.get(repoId) ?? null);
|
|
@@ -1784,7 +2438,7 @@ export class GHCrawlService {
|
|
|
1784
2438
|
}
|
|
1785
2439
|
}
|
|
1786
2440
|
const summaryInput = parts.join('\n\n');
|
|
1787
|
-
const summaryContentHash = stableContentHash(`summary:${includeComments ? 'with-comments' : 'metadata-only'}\n${summaryInput}`);
|
|
2441
|
+
const summaryContentHash = stableContentHash(`summary:${SUMMARY_PROMPT_VERSION}:${includeComments ? 'with-comments' : 'metadata-only'}\n${summaryInput}`);
|
|
1788
2442
|
return { summaryInput, summaryContentHash };
|
|
1789
2443
|
}
|
|
1790
2444
|
buildEmbeddingTasks(params) {
|
|
@@ -1827,6 +2481,35 @@ export class GHCrawlService {
|
|
|
1827
2481
|
}
|
|
1828
2482
|
return tasks;
|
|
1829
2483
|
}
|
|
2484
|
+
buildActiveVectorTask(params) {
|
|
2485
|
+
const sections = [`title: ${normalizeSummaryText(params.title)}`];
|
|
2486
|
+
if (this.config.embeddingBasis === 'title_summary') {
|
|
2487
|
+
const summary = normalizeSummaryText(params.dedupeSummary ?? '');
|
|
2488
|
+
if (!summary) {
|
|
2489
|
+
return null;
|
|
2490
|
+
}
|
|
2491
|
+
sections.push(`summary: ${summary}`);
|
|
2492
|
+
}
|
|
2493
|
+
else {
|
|
2494
|
+
const body = normalizeSummaryText(params.body ?? '');
|
|
2495
|
+
if (body) {
|
|
2496
|
+
sections.push(`body: ${body}`);
|
|
2497
|
+
}
|
|
2498
|
+
}
|
|
2499
|
+
const prepared = this.prepareEmbeddingText(sections.join('\n\n'), EMBED_MAX_ITEM_TOKENS);
|
|
2500
|
+
if (!prepared) {
|
|
2501
|
+
return null;
|
|
2502
|
+
}
|
|
2503
|
+
return {
|
|
2504
|
+
threadId: params.threadId,
|
|
2505
|
+
threadNumber: params.threadNumber,
|
|
2506
|
+
basis: this.config.embeddingBasis,
|
|
2507
|
+
text: prepared.text,
|
|
2508
|
+
contentHash: stableContentHash(`embedding:${ACTIVE_EMBED_PIPELINE_VERSION}:${this.config.embeddingBasis}:${this.config.embedModel}:${ACTIVE_EMBED_DIMENSIONS}\n${prepared.text}`),
|
|
2509
|
+
estimatedTokens: prepared.estimatedTokens,
|
|
2510
|
+
wasTruncated: prepared.wasTruncated,
|
|
2511
|
+
};
|
|
2512
|
+
}
|
|
1830
2513
|
prepareEmbeddingText(text, maxEstimatedTokens) {
|
|
1831
2514
|
if (!text) {
|
|
1832
2515
|
return null;
|
|
@@ -1868,6 +2551,7 @@ export class GHCrawlService {
|
|
|
1868
2551
|
const embeddings = await ai.embedTexts({
|
|
1869
2552
|
model: this.config.embedModel,
|
|
1870
2553
|
texts: batch.map((task) => task.text),
|
|
2554
|
+
dimensions: ACTIVE_EMBED_DIMENSIONS,
|
|
1871
2555
|
});
|
|
1872
2556
|
return batch.map((task, index) => ({ task, embedding: embeddings[index] }));
|
|
1873
2557
|
}
|
|
@@ -1894,6 +2578,7 @@ export class GHCrawlService {
|
|
|
1894
2578
|
const [embedding] = await ai.embedTexts({
|
|
1895
2579
|
model: this.config.embedModel,
|
|
1896
2580
|
texts: [current.text],
|
|
2581
|
+
dimensions: ACTIVE_EMBED_DIMENSIONS,
|
|
1897
2582
|
});
|
|
1898
2583
|
return { task: current, embedding };
|
|
1899
2584
|
}
|
|
@@ -1906,11 +2591,11 @@ export class GHCrawlService {
|
|
|
1906
2591
|
if (!next || next.text === current.text) {
|
|
1907
2592
|
throw error;
|
|
1908
2593
|
}
|
|
1909
|
-
onProgress?.(`[embed] shortened #${current.threadNumber}:${current.
|
|
2594
|
+
onProgress?.(`[embed] shortened #${current.threadNumber}:${current.basis} after context error est_tokens=${current.estimatedTokens}->${next.estimatedTokens}`);
|
|
1910
2595
|
current = next;
|
|
1911
2596
|
}
|
|
1912
2597
|
}
|
|
1913
|
-
throw new Error(`Unable to shrink embedding input for #${task.threadNumber}:${task.
|
|
2598
|
+
throw new Error(`Unable to shrink embedding input for #${task.threadNumber}:${task.basis} below model limits`);
|
|
1914
2599
|
}
|
|
1915
2600
|
shrinkEmbeddingTask(task, context) {
|
|
1916
2601
|
const withoutMarker = task.text.endsWith(EMBED_TRUNCATION_MARKER)
|
|
@@ -1927,7 +2612,7 @@ export class GHCrawlService {
|
|
|
1927
2612
|
return {
|
|
1928
2613
|
...task,
|
|
1929
2614
|
text: nextText,
|
|
1930
|
-
contentHash: stableContentHash(`embedding:${task.
|
|
2615
|
+
contentHash: stableContentHash(`embedding:${ACTIVE_EMBED_PIPELINE_VERSION}:${task.basis}:${this.config.embedModel}:${ACTIVE_EMBED_DIMENSIONS}\n${nextText}`),
|
|
1931
2616
|
estimatedTokens: this.estimateEmbeddingTokens(nextText),
|
|
1932
2617
|
wasTruncated: true,
|
|
1933
2618
|
};
|
|
@@ -1976,23 +2661,71 @@ export class GHCrawlService {
|
|
|
1976
2661
|
order by t.number asc, e.source_kind asc`)
|
|
1977
2662
|
.all(repoId, this.config.embedModel);
|
|
1978
2663
|
}
|
|
1979
|
-
|
|
1980
|
-
|
|
1981
|
-
|
|
1982
|
-
|
|
2664
|
+
loadStoredEmbeddingsForThreadNumber(repoId, threadNumber) {
|
|
2665
|
+
return this.db
|
|
2666
|
+
.prepare(`select t.id, t.repo_id, t.number, t.kind, t.state, t.closed_at_gh, t.closed_at_local, t.close_reason_local,
|
|
2667
|
+
t.title, t.body, t.author_login, t.html_url, t.labels_json,
|
|
2668
|
+
t.updated_at_gh, t.first_pulled_at, t.last_pulled_at, e.source_kind, e.embedding_json
|
|
2669
|
+
from threads t
|
|
2670
|
+
join document_embeddings e on e.thread_id = t.id
|
|
2671
|
+
where t.repo_id = ?
|
|
2672
|
+
and t.number = ?
|
|
2673
|
+
and t.state = 'open'
|
|
2674
|
+
and t.closed_at_local is null
|
|
2675
|
+
and e.model = ?
|
|
2676
|
+
order by e.source_kind asc`)
|
|
2677
|
+
.all(repoId, threadNumber, this.config.embedModel);
|
|
2678
|
+
}
|
|
2679
|
+
iterateStoredEmbeddings(repoId) {
|
|
2680
|
+
return this.db
|
|
2681
|
+
.prepare(`select t.id, t.repo_id, t.number, t.kind, t.state, t.closed_at_gh, t.closed_at_local, t.close_reason_local,
|
|
2682
|
+
t.title, t.body, t.author_login, t.html_url, t.labels_json,
|
|
2683
|
+
t.updated_at_gh, t.first_pulled_at, t.last_pulled_at, e.source_kind, e.embedding_json
|
|
2684
|
+
from threads t
|
|
2685
|
+
join document_embeddings e on e.thread_id = t.id
|
|
2686
|
+
where t.repo_id = ? and t.state = 'open' and t.closed_at_local is null and e.model = ?
|
|
2687
|
+
order by t.number asc, e.source_kind asc`)
|
|
2688
|
+
.iterate(repoId, this.config.embedModel);
|
|
2689
|
+
}
|
|
2690
|
+
loadNormalizedEmbeddingForSourceKindHead(repoId, sourceKind) {
|
|
2691
|
+
const row = this.db
|
|
2692
|
+
.prepare(`select t.id, e.embedding_json
|
|
2693
|
+
from threads t
|
|
2694
|
+
join document_embeddings e on e.thread_id = t.id
|
|
2695
|
+
where t.repo_id = ?
|
|
2696
|
+
and t.state = 'open'
|
|
2697
|
+
and t.closed_at_local is null
|
|
2698
|
+
and e.model = ?
|
|
2699
|
+
and e.source_kind = ?
|
|
2700
|
+
order by t.number asc
|
|
2701
|
+
limit 1`)
|
|
2702
|
+
.get(repoId, this.config.embedModel, sourceKind);
|
|
2703
|
+
if (!row) {
|
|
2704
|
+
return null;
|
|
1983
2705
|
}
|
|
1984
|
-
|
|
1985
|
-
|
|
1986
|
-
|
|
1987
|
-
|
|
1988
|
-
|
|
1989
|
-
|
|
1990
|
-
|
|
1991
|
-
|
|
2706
|
+
return {
|
|
2707
|
+
id: row.id,
|
|
2708
|
+
normalizedEmbedding: normalizeEmbedding(JSON.parse(row.embedding_json)).normalized,
|
|
2709
|
+
};
|
|
2710
|
+
}
|
|
2711
|
+
*iterateNormalizedEmbeddingsForSourceKind(repoId, sourceKind) {
|
|
2712
|
+
const rows = this.db
|
|
2713
|
+
.prepare(`select t.id, e.embedding_json
|
|
2714
|
+
from threads t
|
|
2715
|
+
join document_embeddings e on e.thread_id = t.id
|
|
2716
|
+
where t.repo_id = ?
|
|
2717
|
+
and t.state = 'open'
|
|
2718
|
+
and t.closed_at_local is null
|
|
2719
|
+
and e.model = ?
|
|
2720
|
+
and e.source_kind = ?
|
|
2721
|
+
order by t.number asc`)
|
|
2722
|
+
.iterate(repoId, this.config.embedModel, sourceKind);
|
|
2723
|
+
for (const row of rows) {
|
|
2724
|
+
yield {
|
|
2725
|
+
id: row.id,
|
|
2726
|
+
normalizedEmbedding: normalizeEmbedding(JSON.parse(row.embedding_json)).normalized,
|
|
1992
2727
|
};
|
|
1993
|
-
}
|
|
1994
|
-
this.parsedEmbeddingCache.set(repoId, parsed);
|
|
1995
|
-
return parsed;
|
|
2728
|
+
}
|
|
1996
2729
|
}
|
|
1997
2730
|
loadNormalizedEmbeddingsForSourceKind(repoId, sourceKind) {
|
|
1998
2731
|
const rows = this.db
|
|
@@ -2011,6 +2744,12 @@ export class GHCrawlService {
|
|
|
2011
2744
|
normalizedEmbedding: normalizeEmbedding(JSON.parse(row.embedding_json)).normalized,
|
|
2012
2745
|
}));
|
|
2013
2746
|
}
|
|
2747
|
+
normalizedEmbeddingBuffer(values) {
|
|
2748
|
+
return Buffer.from(Float32Array.from(values).buffer);
|
|
2749
|
+
}
|
|
2750
|
+
normalizedDistanceToScore(distance) {
|
|
2751
|
+
return 1 - distance / 2;
|
|
2752
|
+
}
|
|
2014
2753
|
loadClusterableThreadMeta(repoId) {
|
|
2015
2754
|
const rows = this.db
|
|
2016
2755
|
.prepare(`select t.id, t.number, t.title, e.source_kind
|
|
@@ -2031,6 +2770,34 @@ export class GHCrawlService {
|
|
|
2031
2770
|
sourceKinds: Array.from(sourceKinds.values()),
|
|
2032
2771
|
};
|
|
2033
2772
|
}
|
|
2773
|
+
loadClusterableActiveVectorMeta(repoId, _repoFullName) {
|
|
2774
|
+
const rows = this.db
|
|
2775
|
+
.prepare(`select t.id, t.number, t.title, tv.vector_json
|
|
2776
|
+
from threads t
|
|
2777
|
+
join thread_vectors tv on tv.thread_id = t.id
|
|
2778
|
+
where t.repo_id = ?
|
|
2779
|
+
and t.state = 'open'
|
|
2780
|
+
and t.closed_at_local is null
|
|
2781
|
+
and tv.model = ?
|
|
2782
|
+
and tv.basis = ?
|
|
2783
|
+
and tv.dimensions = ?
|
|
2784
|
+
order by t.number asc`)
|
|
2785
|
+
.all(repoId, this.config.embedModel, this.config.embeddingBasis, ACTIVE_EMBED_DIMENSIONS);
|
|
2786
|
+
return rows.map((row) => ({
|
|
2787
|
+
id: row.id,
|
|
2788
|
+
number: row.number,
|
|
2789
|
+
title: row.title,
|
|
2790
|
+
embedding: this.parseStoredVector(row.vector_json),
|
|
2791
|
+
}));
|
|
2792
|
+
}
|
|
2793
|
+
loadNormalizedActiveVectors(repoId) {
|
|
2794
|
+
return this.loadClusterableActiveVectorMeta(repoId, '').map((row) => ({
|
|
2795
|
+
id: row.id,
|
|
2796
|
+
number: row.number,
|
|
2797
|
+
title: row.title,
|
|
2798
|
+
embedding: normalizeEmbedding(row.embedding).normalized,
|
|
2799
|
+
}));
|
|
2800
|
+
}
|
|
2034
2801
|
listStoredClusterNeighbors(repoId, threadId, limit) {
|
|
2035
2802
|
const latestRun = this.getLatestClusterRun(repoId);
|
|
2036
2803
|
if (!latestRun) {
|
|
@@ -2087,56 +2854,65 @@ export class GHCrawlService {
|
|
|
2087
2854
|
}
|
|
2088
2855
|
sql += ' order by t.number asc';
|
|
2089
2856
|
const rows = this.db.prepare(sql).all(...args);
|
|
2090
|
-
const summaryTexts = this.
|
|
2091
|
-
const
|
|
2092
|
-
|
|
2093
|
-
|
|
2094
|
-
|
|
2095
|
-
|
|
2096
|
-
|
|
2097
|
-
|
|
2857
|
+
const summaryTexts = this.loadDedupeSummaryTextMap(repoId, threadNumber);
|
|
2858
|
+
const missingSummaryThreadNumbers = [];
|
|
2859
|
+
const tasks = rows.flatMap((row) => {
|
|
2860
|
+
const task = this.buildActiveVectorTask({
|
|
2861
|
+
threadId: row.id,
|
|
2862
|
+
threadNumber: row.number,
|
|
2863
|
+
title: row.title,
|
|
2864
|
+
body: row.body,
|
|
2865
|
+
dedupeSummary: summaryTexts.get(row.id) ?? null,
|
|
2866
|
+
});
|
|
2867
|
+
if (task) {
|
|
2868
|
+
return [task];
|
|
2869
|
+
}
|
|
2870
|
+
if (this.config.embeddingBasis === 'title_summary') {
|
|
2871
|
+
missingSummaryThreadNumbers.push(row.number);
|
|
2872
|
+
}
|
|
2873
|
+
return [];
|
|
2874
|
+
});
|
|
2875
|
+
const pipelineCurrent = this.isRepoVectorStateCurrent(repoId);
|
|
2098
2876
|
const existingRows = this.db
|
|
2099
|
-
.prepare(`select
|
|
2100
|
-
from
|
|
2101
|
-
join threads t on t.id =
|
|
2102
|
-
where t.repo_id = ?
|
|
2103
|
-
|
|
2877
|
+
.prepare(`select tv.thread_id, tv.content_hash
|
|
2878
|
+
from thread_vectors tv
|
|
2879
|
+
join threads t on t.id = tv.thread_id
|
|
2880
|
+
where t.repo_id = ?
|
|
2881
|
+
and tv.model = ?
|
|
2882
|
+
and tv.basis = ?
|
|
2883
|
+
and tv.dimensions = ?`)
|
|
2884
|
+
.all(repoId, this.config.embedModel, this.config.embeddingBasis, ACTIVE_EMBED_DIMENSIONS);
|
|
2104
2885
|
const existing = new Map();
|
|
2105
2886
|
for (const row of existingRows) {
|
|
2106
|
-
existing.set(
|
|
2887
|
+
existing.set(String(row.thread_id), row.content_hash);
|
|
2107
2888
|
}
|
|
2108
|
-
const pending =
|
|
2109
|
-
|
|
2889
|
+
const pending = pipelineCurrent
|
|
2890
|
+
? tasks.filter((task) => existing.get(String(task.threadId)) !== task.contentHash)
|
|
2891
|
+
: tasks;
|
|
2892
|
+
return { rows, tasks, existing, pending, missingSummaryThreadNumbers };
|
|
2110
2893
|
}
|
|
2111
|
-
|
|
2112
|
-
let sql = `select s.thread_id, s.
|
|
2894
|
+
loadDedupeSummaryTextMap(repoId, threadNumber) {
|
|
2895
|
+
let sql = `select s.thread_id, s.summary_text
|
|
2113
2896
|
from document_summaries s
|
|
2114
2897
|
join threads t on t.id = s.thread_id
|
|
2115
|
-
where t.repo_id = ?
|
|
2116
|
-
|
|
2898
|
+
where t.repo_id = ?
|
|
2899
|
+
and t.state = 'open'
|
|
2900
|
+
and t.closed_at_local is null
|
|
2901
|
+
and s.model = ?
|
|
2902
|
+
and s.summary_kind = 'dedupe_summary'
|
|
2903
|
+
and s.prompt_version = ?`;
|
|
2904
|
+
const args = [repoId, this.config.summaryModel, SUMMARY_PROMPT_VERSION];
|
|
2117
2905
|
if (threadNumber) {
|
|
2118
2906
|
sql += ' and t.number = ?';
|
|
2119
2907
|
args.push(threadNumber);
|
|
2120
2908
|
}
|
|
2121
|
-
sql += ' order by t.number asc
|
|
2909
|
+
sql += ' order by t.number asc';
|
|
2122
2910
|
const rows = this.db.prepare(sql).all(...args);
|
|
2123
|
-
const byThread = new Map();
|
|
2124
|
-
for (const row of rows) {
|
|
2125
|
-
const entry = byThread.get(row.thread_id) ?? new Map();
|
|
2126
|
-
entry.set(row.summary_kind, normalizeSummaryText(row.summary_text));
|
|
2127
|
-
byThread.set(row.thread_id, entry);
|
|
2128
|
-
}
|
|
2129
2911
|
const combined = new Map();
|
|
2130
|
-
const
|
|
2131
|
-
|
|
2132
|
-
|
|
2133
|
-
.
|
|
2134
|
-
const text = entry.get(summaryKind);
|
|
2135
|
-
return text ? `${summaryKind}: ${text}` : '';
|
|
2136
|
-
})
|
|
2137
|
-
.filter(Boolean);
|
|
2138
|
-
if (parts.length > 0) {
|
|
2139
|
-
combined.set(threadId, parts.join('\n\n'));
|
|
2912
|
+
for (const row of rows) {
|
|
2913
|
+
const text = normalizeSummaryText(row.summary_text);
|
|
2914
|
+
if (text) {
|
|
2915
|
+
combined.set(row.thread_id, text);
|
|
2140
2916
|
}
|
|
2141
2917
|
}
|
|
2142
2918
|
return combined;
|
|
@@ -2233,6 +3009,71 @@ export class GHCrawlService {
|
|
|
2233
3009
|
});
|
|
2234
3010
|
}
|
|
2235
3011
|
}
|
|
3012
|
+
collectSourceKindScores(perSourceScores, edges, sourceKind) {
|
|
3013
|
+
for (const edge of edges) {
|
|
3014
|
+
const key = this.edgeKey(edge.leftThreadId, edge.rightThreadId);
|
|
3015
|
+
const existing = perSourceScores.get(key);
|
|
3016
|
+
if (existing) {
|
|
3017
|
+
existing.scores.set(sourceKind, Math.max(existing.scores.get(sourceKind) ?? -1, edge.score));
|
|
3018
|
+
continue;
|
|
3019
|
+
}
|
|
3020
|
+
const scores = new Map();
|
|
3021
|
+
scores.set(sourceKind, edge.score);
|
|
3022
|
+
perSourceScores.set(key, {
|
|
3023
|
+
leftThreadId: edge.leftThreadId,
|
|
3024
|
+
rightThreadId: edge.rightThreadId,
|
|
3025
|
+
scores,
|
|
3026
|
+
});
|
|
3027
|
+
}
|
|
3028
|
+
}
|
|
3029
|
+
finalizeEdgeScores(perSourceScores, aggregation, weights, minScore) {
|
|
3030
|
+
const result = [];
|
|
3031
|
+
for (const entry of perSourceScores.values()) {
|
|
3032
|
+
const scoreValues = Array.from(entry.scores.values());
|
|
3033
|
+
let finalScore;
|
|
3034
|
+
switch (aggregation) {
|
|
3035
|
+
case 'max':
|
|
3036
|
+
finalScore = Math.max(...scoreValues);
|
|
3037
|
+
break;
|
|
3038
|
+
case 'mean':
|
|
3039
|
+
finalScore = scoreValues.reduce((a, b) => a + b, 0) / scoreValues.length;
|
|
3040
|
+
break;
|
|
3041
|
+
case 'weighted': {
|
|
3042
|
+
let weightedSum = 0;
|
|
3043
|
+
let weightSum = 0;
|
|
3044
|
+
for (const [kind, score] of entry.scores) {
|
|
3045
|
+
const w = weights[kind] ?? 0.1;
|
|
3046
|
+
weightedSum += score * w;
|
|
3047
|
+
weightSum += w;
|
|
3048
|
+
}
|
|
3049
|
+
finalScore = weightSum > 0 ? weightedSum / weightSum : 0;
|
|
3050
|
+
break;
|
|
3051
|
+
}
|
|
3052
|
+
case 'min-of-2':
|
|
3053
|
+
// Require at least 2 source kinds to agree (both above minScore)
|
|
3054
|
+
if (scoreValues.length < 2) {
|
|
3055
|
+
continue; // Skip edges with only 1 source kind
|
|
3056
|
+
}
|
|
3057
|
+
finalScore = Math.max(...scoreValues);
|
|
3058
|
+
break;
|
|
3059
|
+
case 'boost': {
|
|
3060
|
+
// Best score + bonus per additional agreeing source
|
|
3061
|
+
const best = Math.max(...scoreValues);
|
|
3062
|
+
const bonusSources = scoreValues.length - 1;
|
|
3063
|
+
finalScore = Math.min(1.0, best + bonusSources * 0.05);
|
|
3064
|
+
break;
|
|
3065
|
+
}
|
|
3066
|
+
}
|
|
3067
|
+
if (finalScore >= minScore) {
|
|
3068
|
+
result.push({
|
|
3069
|
+
leftThreadId: entry.leftThreadId,
|
|
3070
|
+
rightThreadId: entry.rightThreadId,
|
|
3071
|
+
score: finalScore,
|
|
3072
|
+
});
|
|
3073
|
+
}
|
|
3074
|
+
}
|
|
3075
|
+
return result;
|
|
3076
|
+
}
|
|
2236
3077
|
countEmbeddingsForSourceKind(repoId, sourceKind) {
|
|
2237
3078
|
const row = this.db
|
|
2238
3079
|
.prepare(`select count(*) as count
|
|
@@ -2280,15 +3121,102 @@ export class GHCrawlService {
|
|
|
2280
3121
|
pruneOldClusterRuns(repoId, keepRunId) {
|
|
2281
3122
|
this.db.prepare('delete from cluster_runs where repo_id = ? and id <> ?').run(repoId, keepRunId);
|
|
2282
3123
|
}
|
|
3124
|
+
summarizeClusterSizes(clusters) {
|
|
3125
|
+
const histogramCounts = new Map();
|
|
3126
|
+
const topClusterSizes = clusters.map((cluster) => cluster.members.length).sort((left, right) => right - left);
|
|
3127
|
+
let soloClusters = 0;
|
|
3128
|
+
for (const cluster of clusters) {
|
|
3129
|
+
const size = cluster.members.length;
|
|
3130
|
+
histogramCounts.set(size, (histogramCounts.get(size) ?? 0) + 1);
|
|
3131
|
+
if (size === 1) {
|
|
3132
|
+
soloClusters += 1;
|
|
3133
|
+
}
|
|
3134
|
+
}
|
|
3135
|
+
return {
|
|
3136
|
+
soloClusters,
|
|
3137
|
+
maxClusterSize: topClusterSizes[0] ?? 0,
|
|
3138
|
+
topClusterSizes: topClusterSizes.slice(0, 50),
|
|
3139
|
+
histogram: Array.from(histogramCounts.entries())
|
|
3140
|
+
.map(([size, count]) => ({ size, count }))
|
|
3141
|
+
.sort((left, right) => left.size - right.size),
|
|
3142
|
+
};
|
|
3143
|
+
}
|
|
2283
3144
|
upsertSummary(threadId, contentHash, summaryKind, summaryText) {
|
|
2284
3145
|
this.db
|
|
2285
|
-
.prepare(`insert into document_summaries (thread_id, summary_kind, model, content_hash, summary_text, created_at, updated_at)
|
|
2286
|
-
values (?, ?, ?, ?, ?, ?, ?)
|
|
3146
|
+
.prepare(`insert into document_summaries (thread_id, summary_kind, model, prompt_version, content_hash, summary_text, created_at, updated_at)
|
|
3147
|
+
values (?, ?, ?, ?, ?, ?, ?, ?)
|
|
2287
3148
|
on conflict(thread_id, summary_kind, model) do update set
|
|
3149
|
+
prompt_version = excluded.prompt_version,
|
|
2288
3150
|
content_hash = excluded.content_hash,
|
|
2289
3151
|
summary_text = excluded.summary_text,
|
|
2290
3152
|
updated_at = excluded.updated_at`)
|
|
2291
|
-
.run(threadId, summaryKind, this.config.summaryModel, contentHash, summaryText, nowIso(), nowIso());
|
|
3153
|
+
.run(threadId, summaryKind, this.config.summaryModel, SUMMARY_PROMPT_VERSION, contentHash, summaryText, nowIso(), nowIso());
|
|
3154
|
+
}
|
|
3155
|
+
upsertActiveVector(repoId, repoFullName, threadId, basis, contentHash, embedding) {
|
|
3156
|
+
this.db
|
|
3157
|
+
.prepare(`insert into thread_vectors (thread_id, basis, model, dimensions, content_hash, vector_json, vector_backend, created_at, updated_at)
|
|
3158
|
+
values (?, ?, ?, ?, ?, ?, ?, ?, ?)
|
|
3159
|
+
on conflict(thread_id) do update set
|
|
3160
|
+
basis = excluded.basis,
|
|
3161
|
+
model = excluded.model,
|
|
3162
|
+
dimensions = excluded.dimensions,
|
|
3163
|
+
content_hash = excluded.content_hash,
|
|
3164
|
+
vector_json = excluded.vector_json,
|
|
3165
|
+
vector_backend = excluded.vector_backend,
|
|
3166
|
+
updated_at = excluded.updated_at`)
|
|
3167
|
+
.run(threadId, basis, this.config.embedModel, embedding.length, contentHash, this.vectorBlob(embedding), this.config.vectorBackend, nowIso(), nowIso());
|
|
3168
|
+
this.vectorStore.upsertVector({
|
|
3169
|
+
storePath: this.repoVectorStorePath(repoFullName),
|
|
3170
|
+
dimensions: ACTIVE_EMBED_DIMENSIONS,
|
|
3171
|
+
threadId,
|
|
3172
|
+
vector: embedding,
|
|
3173
|
+
});
|
|
3174
|
+
}
|
|
3175
|
+
countLegacyEmbeddings(repoId) {
|
|
3176
|
+
const row = this.db
|
|
3177
|
+
.prepare(`select count(*) as count
|
|
3178
|
+
from document_embeddings
|
|
3179
|
+
where thread_id in (select id from threads where repo_id = ?)`)
|
|
3180
|
+
.get(repoId);
|
|
3181
|
+
return row.count;
|
|
3182
|
+
}
|
|
3183
|
+
countInlineJsonThreadVectors(repoId) {
|
|
3184
|
+
const row = this.db
|
|
3185
|
+
.prepare(`select count(*) as count
|
|
3186
|
+
from thread_vectors
|
|
3187
|
+
where thread_id in (select id from threads where repo_id = ?)
|
|
3188
|
+
and typeof(vector_json) = 'text'
|
|
3189
|
+
and vector_json != ''`)
|
|
3190
|
+
.get(repoId);
|
|
3191
|
+
return row.count;
|
|
3192
|
+
}
|
|
3193
|
+
getVectorliteClusterQuery(totalItems, requestedK) {
|
|
3194
|
+
if (totalItems < CLUSTER_PARALLEL_MIN_EMBEDDINGS) {
|
|
3195
|
+
return {
|
|
3196
|
+
limit: requestedK,
|
|
3197
|
+
candidateK: Math.max(requestedK * 16, 64),
|
|
3198
|
+
};
|
|
3199
|
+
}
|
|
3200
|
+
const limit = Math.min(Math.max(requestedK * VECTORLITE_CLUSTER_EXPANDED_MULTIPLIER, VECTORLITE_CLUSTER_EXPANDED_K), Math.max(1, totalItems - 1));
|
|
3201
|
+
const candidateK = Math.min(Math.max(limit * 16, VECTORLITE_CLUSTER_EXPANDED_CANDIDATE_K), Math.max(limit, totalItems - 1));
|
|
3202
|
+
return {
|
|
3203
|
+
limit,
|
|
3204
|
+
candidateK,
|
|
3205
|
+
efSearch: Math.max(candidateK * 2, VECTORLITE_CLUSTER_EXPANDED_EF_SEARCH),
|
|
3206
|
+
};
|
|
3207
|
+
}
|
|
3208
|
+
vectorBlob(values) {
|
|
3209
|
+
return Buffer.from(Float32Array.from(values).buffer);
|
|
3210
|
+
}
|
|
3211
|
+
parseStoredVector(value) {
|
|
3212
|
+
if (typeof value === 'string') {
|
|
3213
|
+
if (!value) {
|
|
3214
|
+
throw new Error('Stored vector payload is empty. Run refresh or embed first.');
|
|
3215
|
+
}
|
|
3216
|
+
return JSON.parse(value);
|
|
3217
|
+
}
|
|
3218
|
+
const floats = new Float32Array(value.buffer, value.byteOffset, Math.floor(value.byteLength / Float32Array.BYTES_PER_ELEMENT));
|
|
3219
|
+
return Array.from(floats);
|
|
2292
3220
|
}
|
|
2293
3221
|
upsertEmbedding(threadId, sourceKind, contentHash, embedding) {
|
|
2294
3222
|
this.db
|
|
@@ -2300,10 +3228,6 @@ export class GHCrawlService {
|
|
|
2300
3228
|
embedding_json = excluded.embedding_json,
|
|
2301
3229
|
updated_at = excluded.updated_at`)
|
|
2302
3230
|
.run(threadId, sourceKind, this.config.embedModel, embedding.length, contentHash, asJson(embedding), nowIso(), nowIso());
|
|
2303
|
-
const row = this.db.prepare('select repo_id from threads where id = ? limit 1').get(threadId);
|
|
2304
|
-
if (row) {
|
|
2305
|
-
this.parsedEmbeddingCache.delete(row.repo_id);
|
|
2306
|
-
}
|
|
2307
3231
|
}
|
|
2308
3232
|
startRun(table, repoId, scope) {
|
|
2309
3233
|
const result = this.db
|