@ghcrawl/api-core 0.2.0 → 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/api/server.d.ts.map +1 -1
- package/dist/api/server.js +37 -3
- package/dist/api/server.js.map +1 -1
- package/dist/config.d.ts +3 -0
- package/dist/config.d.ts.map +1 -1
- package/dist/config.js +7 -2
- package/dist/config.js.map +1 -1
- package/dist/db/migrate.d.ts.map +1 -1
- package/dist/db/migrate.js +13 -0
- package/dist/db/migrate.js.map +1 -1
- package/dist/service.d.ts +35 -1
- package/dist/service.d.ts.map +1 -1
- package/dist/service.js +382 -47
- package/dist/service.js.map +1 -1
- package/package.json +2 -2
package/dist/service.js
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
import http from 'node:http';
|
|
2
2
|
import crypto from 'node:crypto';
|
|
3
3
|
import { IterableMapper } from '@shutterstock/p-map-iterable';
|
|
4
|
-
import { actionResponseSchema, clusterDetailResponseSchema, clusterResultSchema, clusterSummariesResponseSchema, clustersResponseSchema, embedResultSchema, healthResponseSchema, neighborsResponseSchema, refreshResponseSchema, repositoriesResponseSchema, searchResponseSchema, syncResultSchema, threadsResponseSchema, } from '@ghcrawl/api-contract';
|
|
4
|
+
import { actionResponseSchema, authorThreadsResponseSchema, closeResponseSchema, clusterDetailResponseSchema, clusterResultSchema, clusterSummariesResponseSchema, clustersResponseSchema, embedResultSchema, healthResponseSchema, neighborsResponseSchema, refreshResponseSchema, repositoriesResponseSchema, searchResponseSchema, syncResultSchema, threadsResponseSchema, } from '@ghcrawl/api-contract';
|
|
5
5
|
import { buildClusters } from './cluster/build.js';
|
|
6
6
|
import { ensureRuntimeDirs, isLikelyGitHubToken, isLikelyOpenAiApiKey, loadConfig, requireGithubToken, requireOpenAiKey, } from './config.js';
|
|
7
7
|
import { migrate } from './db/migrate.js';
|
|
@@ -13,6 +13,7 @@ import { cosineSimilarity, rankNearestNeighbors } from './search/exact.js';
|
|
|
13
13
|
const SYNC_BATCH_SIZE = 100;
|
|
14
14
|
const SYNC_BATCH_DELAY_MS = 5000;
|
|
15
15
|
const STALE_CLOSED_SWEEP_LIMIT = 1000;
|
|
16
|
+
const CLUSTER_PROGRESS_INTERVAL_MS = 5000;
|
|
16
17
|
const EMBED_ESTIMATED_CHARS_PER_TOKEN = 3;
|
|
17
18
|
const EMBED_MAX_ITEM_TOKENS = 7000;
|
|
18
19
|
const EMBED_MAX_BATCH_TOKENS = 250000;
|
|
@@ -26,6 +27,9 @@ function parseIso(value) {
|
|
|
26
27
|
const parsed = Date.parse(value);
|
|
27
28
|
return Number.isNaN(parsed) ? null : parsed;
|
|
28
29
|
}
|
|
30
|
+
/**
 * Reports whether a thread row should be treated as closed.
 *
 * A row counts as closed when its `state` is anything other than 'open', or
 * when it carries a local close timestamp in `closed_at_local`.
 *
 * @param {{ state: string, closed_at_local?: string | null }} row - Thread row,
 *   or a partial projection that exposes these two columns.
 * @returns {boolean} True when the row is closed by state or closed locally.
 */
function isEffectivelyClosed(row) {
    // `!= null` intentionally matches both null and undefined, so a row that
    // was projected without the closed_at_local column is not misreported as
    // closed (threadToDto normalises the same column with `?? null`).
    return row.state !== 'open' || row.closed_at_local != null;
}
|
|
29
33
|
function isMissingGitHubResourceError(error) {
|
|
30
34
|
const status = typeof error?.status === 'number' ? Number(error.status) : null;
|
|
31
35
|
if (status === 404 || status === 410) {
|
|
@@ -149,6 +153,10 @@ function threadToDto(row, clusterId) {
|
|
|
149
153
|
number: row.number,
|
|
150
154
|
kind: row.kind,
|
|
151
155
|
state: row.state,
|
|
156
|
+
isClosed: isEffectivelyClosed(row),
|
|
157
|
+
closedAtGh: row.closed_at_gh ?? null,
|
|
158
|
+
closedAtLocal: row.closed_at_local ?? null,
|
|
159
|
+
closeReasonLocal: row.close_reason_local ?? null,
|
|
152
160
|
title: row.title,
|
|
153
161
|
body: row.body,
|
|
154
162
|
authorLogin: row.author_login,
|
|
@@ -259,17 +267,177 @@ export class GHCrawlService {
|
|
|
259
267
|
.all(repository.id, repository.id);
|
|
260
268
|
for (const row of clusterRows)
|
|
261
269
|
clusterIds.set(row.thread_id, row.cluster_id);
|
|
262
|
-
let sql =
|
|
270
|
+
let sql = 'select * from threads where repo_id = ?';
|
|
263
271
|
const args = [repository.id];
|
|
272
|
+
if (!params.includeClosed) {
|
|
273
|
+
sql += " and state = 'open' and closed_at_local is null";
|
|
274
|
+
}
|
|
264
275
|
if (params.kind) {
|
|
265
276
|
sql += ' and kind = ?';
|
|
266
277
|
args.push(params.kind);
|
|
267
278
|
}
|
|
279
|
+
if (params.numbers && params.numbers.length > 0) {
|
|
280
|
+
const uniqueNumbers = Array.from(new Set(params.numbers.filter((value) => Number.isSafeInteger(value) && value > 0)));
|
|
281
|
+
if (uniqueNumbers.length === 0) {
|
|
282
|
+
return threadsResponseSchema.parse({
|
|
283
|
+
repository,
|
|
284
|
+
threads: [],
|
|
285
|
+
});
|
|
286
|
+
}
|
|
287
|
+
sql += ` and number in (${uniqueNumbers.map(() => '?').join(', ')})`;
|
|
288
|
+
args.push(...uniqueNumbers);
|
|
289
|
+
}
|
|
268
290
|
sql += ' order by updated_at_gh desc, number desc';
|
|
269
291
|
const rows = this.db.prepare(sql).all(...args);
|
|
292
|
+
const orderedRows = params.numbers && params.numbers.length > 0
|
|
293
|
+
? (() => {
|
|
294
|
+
const byNumber = new Map(rows.map((row) => [row.number, row]));
|
|
295
|
+
const uniqueRequested = Array.from(new Set(params.numbers));
|
|
296
|
+
return uniqueRequested.map((number) => byNumber.get(number)).filter((row) => row !== undefined);
|
|
297
|
+
})()
|
|
298
|
+
: rows;
|
|
270
299
|
return threadsResponseSchema.parse({
|
|
271
300
|
repository,
|
|
272
|
-
threads:
|
|
301
|
+
threads: orderedRows.map((row) => threadToDto(row, clusterIds.get(row.id) ?? null)),
|
|
302
|
+
});
|
|
303
|
+
}
|
|
304
|
+
listAuthorThreads(params) {
|
|
305
|
+
const repository = this.requireRepository(params.owner, params.repo);
|
|
306
|
+
const normalizedLogin = params.login.trim();
|
|
307
|
+
if (!normalizedLogin) {
|
|
308
|
+
return authorThreadsResponseSchema.parse({
|
|
309
|
+
repository,
|
|
310
|
+
authorLogin: '',
|
|
311
|
+
threads: [],
|
|
312
|
+
});
|
|
313
|
+
}
|
|
314
|
+
const clusterIds = new Map();
|
|
315
|
+
const clusterRows = this.db
|
|
316
|
+
.prepare(`select cm.thread_id, cm.cluster_id
|
|
317
|
+
from cluster_members cm
|
|
318
|
+
join clusters c on c.id = cm.cluster_id
|
|
319
|
+
where c.repo_id = ? and c.cluster_run_id = (
|
|
320
|
+
select id from cluster_runs where repo_id = ? and status = 'completed' order by id desc limit 1
|
|
321
|
+
)`)
|
|
322
|
+
.all(repository.id, repository.id);
|
|
323
|
+
for (const row of clusterRows)
|
|
324
|
+
clusterIds.set(row.thread_id, row.cluster_id);
|
|
325
|
+
const rows = this.db
|
|
326
|
+
.prepare(`select *
|
|
327
|
+
from threads
|
|
328
|
+
where repo_id = ? and lower(author_login) = lower(?)
|
|
329
|
+
${params.includeClosed ? '' : "and state = 'open' and closed_at_local is null"}
|
|
330
|
+
order by updated_at_gh desc, number desc`)
|
|
331
|
+
.all(repository.id, normalizedLogin);
|
|
332
|
+
const latestRun = this.getLatestClusterRun(repository.id);
|
|
333
|
+
const strongestByThread = new Map();
|
|
334
|
+
if (latestRun && rows.length > 1) {
|
|
335
|
+
const edges = this.db
|
|
336
|
+
.prepare(`select
|
|
337
|
+
se.left_thread_id,
|
|
338
|
+
se.right_thread_id,
|
|
339
|
+
se.score,
|
|
340
|
+
t1.number as left_number,
|
|
341
|
+
t1.kind as left_kind,
|
|
342
|
+
t1.title as left_title,
|
|
343
|
+
t2.number as right_number,
|
|
344
|
+
t2.kind as right_kind,
|
|
345
|
+
t2.title as right_title
|
|
346
|
+
from similarity_edges se
|
|
347
|
+
join threads t1 on t1.id = se.left_thread_id
|
|
348
|
+
join threads t2 on t2.id = se.right_thread_id
|
|
349
|
+
where se.repo_id = ?
|
|
350
|
+
and se.cluster_run_id = ?
|
|
351
|
+
and lower(t1.author_login) = lower(?)
|
|
352
|
+
and lower(t2.author_login) = lower(?)
|
|
353
|
+
${params.includeClosed ? '' : "and t1.state = 'open' and t1.closed_at_local is null and t2.state = 'open' and t2.closed_at_local is null"}`)
|
|
354
|
+
.all(repository.id, latestRun.id, normalizedLogin, normalizedLogin);
|
|
355
|
+
const updateStrongest = (sourceThreadId, match) => {
|
|
356
|
+
const previous = strongestByThread.get(sourceThreadId);
|
|
357
|
+
if (!previous || match.score > previous.score) {
|
|
358
|
+
strongestByThread.set(sourceThreadId, match);
|
|
359
|
+
}
|
|
360
|
+
};
|
|
361
|
+
for (const edge of edges) {
|
|
362
|
+
updateStrongest(edge.left_thread_id, {
|
|
363
|
+
threadId: edge.right_thread_id,
|
|
364
|
+
number: edge.right_number,
|
|
365
|
+
kind: edge.right_kind,
|
|
366
|
+
title: edge.right_title,
|
|
367
|
+
score: edge.score,
|
|
368
|
+
});
|
|
369
|
+
updateStrongest(edge.right_thread_id, {
|
|
370
|
+
threadId: edge.left_thread_id,
|
|
371
|
+
number: edge.left_number,
|
|
372
|
+
kind: edge.left_kind,
|
|
373
|
+
title: edge.left_title,
|
|
374
|
+
score: edge.score,
|
|
375
|
+
});
|
|
376
|
+
}
|
|
377
|
+
}
|
|
378
|
+
return authorThreadsResponseSchema.parse({
|
|
379
|
+
repository,
|
|
380
|
+
authorLogin: normalizedLogin,
|
|
381
|
+
threads: rows.map((row) => ({
|
|
382
|
+
thread: threadToDto(row, clusterIds.get(row.id) ?? null),
|
|
383
|
+
strongestSameAuthorMatch: strongestByThread.get(row.id) ?? null,
|
|
384
|
+
})),
|
|
385
|
+
});
|
|
386
|
+
}
|
|
387
|
+
closeThreadLocally(params) {
|
|
388
|
+
const repository = this.requireRepository(params.owner, params.repo);
|
|
389
|
+
const row = this.db
|
|
390
|
+
.prepare('select * from threads where repo_id = ? and number = ? limit 1')
|
|
391
|
+
.get(repository.id, params.threadNumber);
|
|
392
|
+
if (!row) {
|
|
393
|
+
throw new Error(`Thread #${params.threadNumber} was not found for ${repository.fullName}.`);
|
|
394
|
+
}
|
|
395
|
+
const closedAt = nowIso();
|
|
396
|
+
this.db
|
|
397
|
+
.prepare(`update threads
|
|
398
|
+
set closed_at_local = ?,
|
|
399
|
+
close_reason_local = 'manual',
|
|
400
|
+
updated_at = ?
|
|
401
|
+
where id = ?`)
|
|
402
|
+
.run(closedAt, closedAt, row.id);
|
|
403
|
+
this.parsedEmbeddingCache.delete(repository.id);
|
|
404
|
+
const clusterIds = this.getLatestRunClusterIdsForThread(repository.id, row.id);
|
|
405
|
+
const clusterClosed = this.reconcileClusterCloseState(repository.id, clusterIds) > 0;
|
|
406
|
+
const updated = this.db.prepare('select * from threads where id = ? limit 1').get(row.id);
|
|
407
|
+
return closeResponseSchema.parse({
|
|
408
|
+
ok: true,
|
|
409
|
+
repository,
|
|
410
|
+
thread: threadToDto(updated),
|
|
411
|
+
clusterId: clusterIds[0] ?? null,
|
|
412
|
+
clusterClosed,
|
|
413
|
+
message: `Marked ${updated.kind} #${updated.number} closed locally.`,
|
|
414
|
+
});
|
|
415
|
+
}
|
|
416
|
+
closeClusterLocally(params) {
|
|
417
|
+
const repository = this.requireRepository(params.owner, params.repo);
|
|
418
|
+
const latestRun = this.getLatestClusterRun(repository.id);
|
|
419
|
+
if (!latestRun) {
|
|
420
|
+
throw new Error(`No completed cluster run found for ${repository.fullName}.`);
|
|
421
|
+
}
|
|
422
|
+
const row = this.db
|
|
423
|
+
.prepare('select id from clusters where repo_id = ? and cluster_run_id = ? and id = ? limit 1')
|
|
424
|
+
.get(repository.id, latestRun.id, params.clusterId);
|
|
425
|
+
if (!row) {
|
|
426
|
+
throw new Error(`Cluster ${params.clusterId} was not found for ${repository.fullName}.`);
|
|
427
|
+
}
|
|
428
|
+
const closedAt = nowIso();
|
|
429
|
+
this.db
|
|
430
|
+
.prepare(`update clusters
|
|
431
|
+
set closed_at_local = ?,
|
|
432
|
+
close_reason_local = 'manual'
|
|
433
|
+
where id = ?`)
|
|
434
|
+
.run(closedAt, row.id);
|
|
435
|
+
return closeResponseSchema.parse({
|
|
436
|
+
ok: true,
|
|
437
|
+
repository,
|
|
438
|
+
clusterId: row.id,
|
|
439
|
+
clusterClosed: true,
|
|
440
|
+
message: `Marked cluster ${row.id} closed locally.`,
|
|
273
441
|
});
|
|
274
442
|
}
|
|
275
443
|
async syncRepository(params) {
|
|
@@ -366,6 +534,10 @@ export class GHCrawlService {
|
|
|
366
534
|
})
|
|
367
535
|
: 0;
|
|
368
536
|
const threadsClosed = threadsClosedFromClosedSweep + threadsClosedFromDirectReconcile;
|
|
537
|
+
this.parsedEmbeddingCache.delete(repoId);
|
|
538
|
+
if (threadsClosed > 0) {
|
|
539
|
+
this.reconcileClusterCloseState(repoId);
|
|
540
|
+
}
|
|
369
541
|
const finishedAt = nowIso();
|
|
370
542
|
const reconciledOpenCloseAt = shouldSweepClosedOverlap || shouldReconcileMissingOpenThreads ? finishedAt : null;
|
|
371
543
|
const nextSyncCursor = {
|
|
@@ -542,34 +714,21 @@ export class GHCrawlService {
|
|
|
542
714
|
title: meta.title,
|
|
543
715
|
}));
|
|
544
716
|
params.onProgress?.(`[cluster] loaded ${items.length} embedded thread(s) across ${new Set(rows.map((row) => row.source_kind)).size} source kind(s) for ${repository.fullName} k=${k} minScore=${minScore}`);
|
|
545
|
-
|
|
546
|
-
|
|
547
|
-
|
|
548
|
-
|
|
717
|
+
const aggregatedEdges = this.aggregateRepositoryEdges(rows, {
|
|
718
|
+
limit: k,
|
|
719
|
+
minScore,
|
|
720
|
+
onProgress: params.onProgress,
|
|
721
|
+
});
|
|
549
722
|
const edges = Array.from(aggregatedEdges.values()).map((entry) => ({
|
|
550
723
|
leftThreadId: entry.leftThreadId,
|
|
551
724
|
rightThreadId: entry.rightThreadId,
|
|
552
725
|
score: entry.score,
|
|
553
726
|
}));
|
|
554
|
-
const insertEdge = this.db.prepare(`insert into similarity_edges (repo_id, cluster_run_id, left_thread_id, right_thread_id, method, score, explanation_json, created_at)
|
|
555
|
-
values (?, ?, ?, ?, ?, ?, ?, ?)`);
|
|
556
|
-
for (const edge of aggregatedEdges.values()) {
|
|
557
|
-
insertEdge.run(repository.id, runId, edge.leftThreadId, edge.rightThreadId, 'exact_cosine', edge.score, asJson({ sources: Array.from(edge.sourceKinds).sort(), model: this.config.embedModel }), nowIso());
|
|
558
|
-
}
|
|
559
727
|
params.onProgress?.(`[cluster] built ${edges.length} similarity edge(s)`);
|
|
560
728
|
const clusters = buildClusters(items.map((item) => ({ threadId: item.id, number: item.number, title: item.title })), edges);
|
|
561
|
-
|
|
562
|
-
|
|
563
|
-
|
|
564
|
-
const clusterResult = insertCluster.run(repository.id, runId, cluster.representativeThreadId, cluster.members.length, nowIso());
|
|
565
|
-
const clusterId = Number(clusterResult.lastInsertRowid);
|
|
566
|
-
for (const memberId of cluster.members) {
|
|
567
|
-
const key = this.edgeKey(cluster.representativeThreadId, memberId);
|
|
568
|
-
const score = memberId === cluster.representativeThreadId ? null : (aggregatedEdges.get(key)?.score ?? null);
|
|
569
|
-
insertMember.run(clusterId, memberId, score, nowIso());
|
|
570
|
-
}
|
|
571
|
-
}
|
|
572
|
-
params.onProgress?.(`[cluster] persisted ${clusters.length} cluster(s)`);
|
|
729
|
+
this.persistClusterRun(repository.id, runId, aggregatedEdges, clusters);
|
|
730
|
+
this.pruneOldClusterRuns(repository.id, runId);
|
|
731
|
+
params.onProgress?.(`[cluster] persisted ${clusters.length} cluster(s) and pruned older cluster runs`);
|
|
573
732
|
this.finishRun('cluster_runs', runId, 'completed', { edges: edges.length, clusters: clusters.length });
|
|
574
733
|
return clusterResultSchema.parse({ runId, edges: edges.length, clusters: clusters.length });
|
|
575
734
|
}
|
|
@@ -590,7 +749,7 @@ export class GHCrawlService {
|
|
|
590
749
|
from documents_fts
|
|
591
750
|
join documents d on d.id = documents_fts.rowid
|
|
592
751
|
join threads t on t.id = d.thread_id
|
|
593
|
-
|
|
752
|
+
where t.repo_id = ? and t.state = 'open' and t.closed_at_local is null and documents_fts match ?
|
|
594
753
|
order by rank
|
|
595
754
|
limit ?`)
|
|
596
755
|
.all(repository.id, params.query, limit * 2);
|
|
@@ -612,7 +771,7 @@ export class GHCrawlService {
|
|
|
612
771
|
const threadRows = candidateIds.size
|
|
613
772
|
? this.db
|
|
614
773
|
.prepare(`select * from threads
|
|
615
|
-
where repo_id = ? and state = 'open' and id in (${[...candidateIds].map(() => '?').join(',')})
|
|
774
|
+
where repo_id = ? and state = 'open' and closed_at_local is null and id in (${[...candidateIds].map(() => '?').join(',')})
|
|
616
775
|
order by updated_at_gh desc, number desc`)
|
|
617
776
|
.all(repository.id, ...candidateIds)
|
|
618
777
|
: [];
|
|
@@ -724,7 +883,8 @@ export class GHCrawlService {
|
|
|
724
883
|
}
|
|
725
884
|
const rows = this.db
|
|
726
885
|
.prepare(`select c.id, c.repo_id, c.representative_thread_id, c.member_count,
|
|
727
|
-
|
|
886
|
+
c.closed_at_local, c.close_reason_local,
|
|
887
|
+
cm.thread_id, cm.score_to_representative, t.number, t.kind, t.title, t.state, t.closed_at_local as thread_closed_at_local
|
|
728
888
|
from clusters c
|
|
729
889
|
left join cluster_members cm on cm.cluster_id = c.id
|
|
730
890
|
left join threads t on t.id = cm.thread_id
|
|
@@ -736,6 +896,9 @@ export class GHCrawlService {
|
|
|
736
896
|
const cluster = clusters.get(row.id) ?? {
|
|
737
897
|
id: row.id,
|
|
738
898
|
repoId: row.repo_id,
|
|
899
|
+
isClosed: row.close_reason_local !== null,
|
|
900
|
+
closedAtLocal: row.closed_at_local,
|
|
901
|
+
closeReasonLocal: row.close_reason_local,
|
|
739
902
|
representativeThreadId: row.representative_thread_id,
|
|
740
903
|
memberCount: row.member_count,
|
|
741
904
|
members: [],
|
|
@@ -745,15 +908,20 @@ export class GHCrawlService {
|
|
|
745
908
|
threadId: row.thread_id,
|
|
746
909
|
number: row.number,
|
|
747
910
|
kind: row.kind,
|
|
911
|
+
isClosed: row.state !== null && isEffectivelyClosed({ state: row.state, closed_at_local: row.thread_closed_at_local }),
|
|
748
912
|
title: row.title,
|
|
749
913
|
scoreToRepresentative: row.score_to_representative,
|
|
750
914
|
});
|
|
751
915
|
}
|
|
752
916
|
clusters.set(row.id, cluster);
|
|
753
917
|
}
|
|
918
|
+
const clusterValues = Array.from(clusters.values()).map((cluster) => ({
|
|
919
|
+
...cluster,
|
|
920
|
+
isClosed: cluster.isClosed || (cluster.memberCount > 0 && cluster.members.every((member) => member.isClosed)),
|
|
921
|
+
}));
|
|
754
922
|
return clustersResponseSchema.parse({
|
|
755
923
|
repository,
|
|
756
|
-
clusters:
|
|
924
|
+
clusters: clusterValues.filter((cluster) => (params.includeClosed ? true : !cluster.isClosed)),
|
|
757
925
|
});
|
|
758
926
|
}
|
|
759
927
|
async refreshRepository(params) {
|
|
@@ -808,6 +976,7 @@ export class GHCrawlService {
|
|
|
808
976
|
minSize: params.minSize,
|
|
809
977
|
sort: params.sort,
|
|
810
978
|
search: params.search,
|
|
979
|
+
includeClosedClusters: params.includeClosed === true,
|
|
811
980
|
});
|
|
812
981
|
const clusters = params.limit ? snapshot.clusters.slice(0, params.limit) : snapshot.clusters;
|
|
813
982
|
return clusterSummariesResponseSchema.parse({
|
|
@@ -816,6 +985,9 @@ export class GHCrawlService {
|
|
|
816
985
|
clusters: clusters.map((cluster) => ({
|
|
817
986
|
clusterId: cluster.clusterId,
|
|
818
987
|
displayTitle: cluster.displayTitle,
|
|
988
|
+
isClosed: cluster.isClosed,
|
|
989
|
+
closedAtLocal: cluster.closedAtLocal,
|
|
990
|
+
closeReasonLocal: cluster.closeReasonLocal,
|
|
819
991
|
totalCount: cluster.totalCount,
|
|
820
992
|
issueCount: cluster.issueCount,
|
|
821
993
|
pullRequestCount: cluster.pullRequestCount,
|
|
@@ -831,6 +1003,7 @@ export class GHCrawlService {
|
|
|
831
1003
|
owner: params.owner,
|
|
832
1004
|
repo: params.repo,
|
|
833
1005
|
minSize: 0,
|
|
1006
|
+
includeClosedClusters: params.includeClosed === true,
|
|
834
1007
|
});
|
|
835
1008
|
const cluster = snapshot.clusters.find((item) => item.clusterId === params.clusterId);
|
|
836
1009
|
if (!cluster) {
|
|
@@ -863,6 +1036,9 @@ export class GHCrawlService {
|
|
|
863
1036
|
cluster: {
|
|
864
1037
|
clusterId: cluster.clusterId,
|
|
865
1038
|
displayTitle: cluster.displayTitle,
|
|
1039
|
+
isClosed: cluster.isClosed,
|
|
1040
|
+
closedAtLocal: cluster.closedAtLocal,
|
|
1041
|
+
closeReasonLocal: cluster.closeReasonLocal,
|
|
866
1042
|
totalCount: cluster.totalCount,
|
|
867
1043
|
issueCount: cluster.issueCount,
|
|
868
1044
|
pullRequestCount: cluster.pullRequestCount,
|
|
@@ -881,7 +1057,9 @@ export class GHCrawlService {
|
|
|
881
1057
|
if (!latestRun) {
|
|
882
1058
|
return { repository, stats, clusters: [] };
|
|
883
1059
|
}
|
|
1060
|
+
const includeClosedClusters = params.includeClosedClusters ?? true;
|
|
884
1061
|
const clusters = this.listRawTuiClusters(repository.id, latestRun.id)
|
|
1062
|
+
.filter((cluster) => (includeClosedClusters ? true : !cluster.isClosed))
|
|
885
1063
|
.filter((cluster) => cluster.totalCount >= (params.minSize ?? 10))
|
|
886
1064
|
.filter((cluster) => {
|
|
887
1065
|
const search = params.search?.trim().toLowerCase();
|
|
@@ -907,7 +1085,7 @@ export class GHCrawlService {
|
|
|
907
1085
|
throw new Error(`Cluster ${params.clusterId} was not found for ${repository.fullName}.`);
|
|
908
1086
|
}
|
|
909
1087
|
const rows = this.db
|
|
910
|
-
.prepare(`select t.id, t.number, t.kind, t.title, t.updated_at_gh, t.html_url, t.labels_json, cm.score_to_representative
|
|
1088
|
+
.prepare(`select t.id, t.number, t.kind, t.state, t.closed_at_local, t.title, t.updated_at_gh, t.html_url, t.labels_json, cm.score_to_representative
|
|
911
1089
|
from cluster_members cm
|
|
912
1090
|
join threads t on t.id = cm.thread_id
|
|
913
1091
|
where cm.cluster_id = ?
|
|
@@ -919,6 +1097,9 @@ export class GHCrawlService {
|
|
|
919
1097
|
return {
|
|
920
1098
|
clusterId: summary.clusterId,
|
|
921
1099
|
displayTitle: summary.displayTitle,
|
|
1100
|
+
isClosed: summary.isClosed,
|
|
1101
|
+
closedAtLocal: summary.closedAtLocal,
|
|
1102
|
+
closeReasonLocal: summary.closeReasonLocal,
|
|
922
1103
|
totalCount: summary.totalCount,
|
|
923
1104
|
issueCount: summary.issueCount,
|
|
924
1105
|
pullRequestCount: summary.pullRequestCount,
|
|
@@ -930,6 +1111,7 @@ export class GHCrawlService {
|
|
|
930
1111
|
id: row.id,
|
|
931
1112
|
number: row.number,
|
|
932
1113
|
kind: row.kind,
|
|
1114
|
+
isClosed: isEffectivelyClosed(row),
|
|
933
1115
|
title: row.title,
|
|
934
1116
|
updatedAtGh: row.updated_at_gh,
|
|
935
1117
|
htmlUrl: row.html_url,
|
|
@@ -942,11 +1124,11 @@ export class GHCrawlService {
|
|
|
942
1124
|
const repository = this.requireRepository(params.owner, params.repo);
|
|
943
1125
|
const row = params.threadId
|
|
944
1126
|
? (this.db
|
|
945
|
-
.prepare('select * from threads where repo_id = ? and id = ?
|
|
1127
|
+
.prepare('select * from threads where repo_id = ? and id = ? limit 1')
|
|
946
1128
|
.get(repository.id, params.threadId) ?? null)
|
|
947
1129
|
: params.threadNumber
|
|
948
1130
|
? (this.db
|
|
949
|
-
.prepare('select * from threads where repo_id = ? and number = ?
|
|
1131
|
+
.prepare('select * from threads where repo_id = ? and number = ? limit 1')
|
|
950
1132
|
.get(repository.id, params.threadNumber) ?? null)
|
|
951
1133
|
: null;
|
|
952
1134
|
if (!row) {
|
|
@@ -979,17 +1161,20 @@ export class GHCrawlService {
|
|
|
979
1161
|
}
|
|
980
1162
|
let neighbors = [];
|
|
981
1163
|
if (params.includeNeighbors !== false) {
|
|
982
|
-
|
|
983
|
-
|
|
984
|
-
|
|
985
|
-
|
|
986
|
-
|
|
987
|
-
|
|
988
|
-
|
|
989
|
-
|
|
990
|
-
|
|
991
|
-
|
|
992
|
-
|
|
1164
|
+
neighbors = this.listStoredClusterNeighbors(repository.id, row.id, 8);
|
|
1165
|
+
if (neighbors.length === 0) {
|
|
1166
|
+
try {
|
|
1167
|
+
neighbors = this.listNeighbors({
|
|
1168
|
+
owner: params.owner,
|
|
1169
|
+
repo: params.repo,
|
|
1170
|
+
threadNumber: row.number,
|
|
1171
|
+
limit: 8,
|
|
1172
|
+
minScore: 0.2,
|
|
1173
|
+
}).neighbors;
|
|
1174
|
+
}
|
|
1175
|
+
catch {
|
|
1176
|
+
neighbors = [];
|
|
1177
|
+
}
|
|
993
1178
|
}
|
|
994
1179
|
}
|
|
995
1180
|
return {
|
|
@@ -1103,7 +1288,7 @@ export class GHCrawlService {
|
|
|
1103
1288
|
const counts = this.db
|
|
1104
1289
|
.prepare(`select kind, count(*) as count
|
|
1105
1290
|
from threads
|
|
1106
|
-
where repo_id = ? and state = 'open'
|
|
1291
|
+
where repo_id = ? and state = 'open' and closed_at_local is null
|
|
1107
1292
|
group by kind`)
|
|
1108
1293
|
.all(repoId);
|
|
1109
1294
|
const latestRun = this.getLatestClusterRun(repoId);
|
|
@@ -1131,11 +1316,73 @@ export class GHCrawlService {
|
|
|
1131
1316
|
.prepare("select id, finished_at from cluster_runs where repo_id = ? and status = 'completed' order by id desc limit 1")
|
|
1132
1317
|
.get(repoId) ?? null);
|
|
1133
1318
|
}
|
|
1319
|
+
getLatestRunClusterIdsForThread(repoId, threadId) {
|
|
1320
|
+
const latestRun = this.getLatestClusterRun(repoId);
|
|
1321
|
+
if (!latestRun) {
|
|
1322
|
+
return [];
|
|
1323
|
+
}
|
|
1324
|
+
return this.db
|
|
1325
|
+
.prepare(`select cm.cluster_id
|
|
1326
|
+
from cluster_members cm
|
|
1327
|
+
join clusters c on c.id = cm.cluster_id
|
|
1328
|
+
where c.repo_id = ? and c.cluster_run_id = ? and cm.thread_id = ?
|
|
1329
|
+
order by cm.cluster_id asc`)
|
|
1330
|
+
.all(repoId, latestRun.id, threadId).map((row) => row.cluster_id);
|
|
1331
|
+
}
|
|
1332
|
+
reconcileClusterCloseState(repoId, clusterIds) {
|
|
1333
|
+
const latestRun = this.getLatestClusterRun(repoId);
|
|
1334
|
+
if (!latestRun) {
|
|
1335
|
+
return 0;
|
|
1336
|
+
}
|
|
1337
|
+
const resolvedClusterIds = clusterIds && clusterIds.length > 0
|
|
1338
|
+
? Array.from(new Set(clusterIds))
|
|
1339
|
+
: this.db
|
|
1340
|
+
.prepare('select id from clusters where repo_id = ? and cluster_run_id = ? order by id asc')
|
|
1341
|
+
.all(repoId, latestRun.id).map((row) => row.id);
|
|
1342
|
+
if (resolvedClusterIds.length === 0) {
|
|
1343
|
+
return 0;
|
|
1344
|
+
}
|
|
1345
|
+
const summarize = this.db.prepare(`select
|
|
1346
|
+
c.id,
|
|
1347
|
+
c.close_reason_local,
|
|
1348
|
+
count(*) as member_count,
|
|
1349
|
+
sum(case when t.state != 'open' or t.closed_at_local is not null then 1 else 0 end) as closed_member_count
|
|
1350
|
+
from clusters c
|
|
1351
|
+
join cluster_members cm on cm.cluster_id = c.id
|
|
1352
|
+
join threads t on t.id = cm.thread_id
|
|
1353
|
+
where c.id = ?
|
|
1354
|
+
group by c.id, c.close_reason_local`);
|
|
1355
|
+
const markClosed = this.db.prepare(`update clusters
|
|
1356
|
+
set closed_at_local = coalesce(closed_at_local, ?),
|
|
1357
|
+
close_reason_local = 'all_members_closed'
|
|
1358
|
+
where id = ?`);
|
|
1359
|
+
const clearClosed = this.db.prepare(`update clusters
|
|
1360
|
+
set closed_at_local = null,
|
|
1361
|
+
close_reason_local = null
|
|
1362
|
+
where id = ? and close_reason_local = 'all_members_closed'`);
|
|
1363
|
+
let changed = 0;
|
|
1364
|
+
for (const clusterId of resolvedClusterIds) {
|
|
1365
|
+
const row = summarize.get(clusterId);
|
|
1366
|
+
if (!row || row.close_reason_local === 'manual') {
|
|
1367
|
+
continue;
|
|
1368
|
+
}
|
|
1369
|
+
if (row.member_count > 0 && row.closed_member_count >= row.member_count) {
|
|
1370
|
+
const result = markClosed.run(nowIso(), clusterId);
|
|
1371
|
+
changed += result.changes;
|
|
1372
|
+
continue;
|
|
1373
|
+
}
|
|
1374
|
+
const cleared = clearClosed.run(clusterId);
|
|
1375
|
+
changed += cleared.changes;
|
|
1376
|
+
}
|
|
1377
|
+
return changed;
|
|
1378
|
+
}
|
|
1134
1379
|
listRawTuiClusters(repoId, clusterRunId) {
|
|
1135
1380
|
const rows = this.db
|
|
1136
1381
|
.prepare(`select
|
|
1137
1382
|
c.id as cluster_id,
|
|
1138
1383
|
c.member_count,
|
|
1384
|
+
c.closed_at_local,
|
|
1385
|
+
c.close_reason_local,
|
|
1139
1386
|
c.representative_thread_id,
|
|
1140
1387
|
rt.number as representative_number,
|
|
1141
1388
|
rt.kind as representative_kind,
|
|
@@ -1143,6 +1390,7 @@ export class GHCrawlService {
|
|
|
1143
1390
|
max(coalesce(t.updated_at_gh, t.updated_at)) as latest_updated_at,
|
|
1144
1391
|
sum(case when t.kind = 'issue' then 1 else 0 end) as issue_count,
|
|
1145
1392
|
sum(case when t.kind = 'pull_request' then 1 else 0 end) as pull_request_count,
|
|
1393
|
+
sum(case when t.state != 'open' or t.closed_at_local is not null then 1 else 0 end) as closed_member_count,
|
|
1146
1394
|
group_concat(lower(coalesce(t.title, '')), ' ') as search_text
|
|
1147
1395
|
from clusters c
|
|
1148
1396
|
left join threads rt on rt.id = c.representative_thread_id
|
|
@@ -1160,6 +1408,9 @@ export class GHCrawlService {
|
|
|
1160
1408
|
return rows.map((row) => ({
|
|
1161
1409
|
clusterId: row.cluster_id,
|
|
1162
1410
|
displayTitle: row.representative_title ?? `Cluster ${row.cluster_id}`,
|
|
1411
|
+
isClosed: row.close_reason_local !== null || row.closed_member_count >= row.member_count,
|
|
1412
|
+
closedAtLocal: row.closed_at_local,
|
|
1413
|
+
closeReasonLocal: row.close_reason_local,
|
|
1163
1414
|
totalCount: row.member_count,
|
|
1164
1415
|
issueCount: row.issue_count,
|
|
1165
1416
|
pullRequestCount: row.pull_request_count,
|
|
@@ -1297,6 +1548,7 @@ export class GHCrawlService {
|
|
|
1297
1548
|
from threads
|
|
1298
1549
|
where repo_id = ?
|
|
1299
1550
|
and state = 'open'
|
|
1551
|
+
and closed_at_local is null
|
|
1300
1552
|
and (last_pulled_at is null or last_pulled_at < ?)
|
|
1301
1553
|
order by number asc`)
|
|
1302
1554
|
.all(params.repoId, params.crawlStartedAt);
|
|
@@ -1341,6 +1593,7 @@ export class GHCrawlService {
|
|
|
1341
1593
|
from threads
|
|
1342
1594
|
where repo_id = ?
|
|
1343
1595
|
and state = 'open'
|
|
1596
|
+
and closed_at_local is null
|
|
1344
1597
|
and (last_pulled_at is null or last_pulled_at < ?)
|
|
1345
1598
|
order by number asc`)
|
|
1346
1599
|
.all(params.repoId, params.crawlStartedAt);
|
|
@@ -1628,11 +1881,12 @@ export class GHCrawlService {
|
|
|
1628
1881
|
}
|
|
1629
1882
|
loadStoredEmbeddings(repoId) {
|
|
1630
1883
|
return this.db
|
|
1631
|
-
.prepare(`select t.id, t.repo_id, t.number, t.kind, t.state, t.
|
|
1884
|
+
.prepare(`select t.id, t.repo_id, t.number, t.kind, t.state, t.closed_at_gh, t.closed_at_local, t.close_reason_local,
|
|
1885
|
+
t.title, t.body, t.author_login, t.html_url, t.labels_json,
|
|
1632
1886
|
t.updated_at_gh, t.first_pulled_at, t.last_pulled_at, e.source_kind, e.embedding_json
|
|
1633
1887
|
from threads t
|
|
1634
1888
|
join document_embeddings e on e.thread_id = t.id
|
|
1635
|
-
where t.repo_id = ? and t.state = 'open' and e.model = ?
|
|
1889
|
+
where t.repo_id = ? and t.state = 'open' and t.closed_at_local is null and e.model = ?
|
|
1636
1890
|
order by t.number asc, e.source_kind asc`)
|
|
1637
1891
|
.all(repoId, this.config.embedModel);
|
|
1638
1892
|
}
|
|
@@ -1648,10 +1902,55 @@ export class GHCrawlService {
|
|
|
1648
1902
|
this.parsedEmbeddingCache.set(repoId, parsed);
|
|
1649
1903
|
return parsed;
|
|
1650
1904
|
}
|
|
1905
|
+
listStoredClusterNeighbors(repoId, threadId, limit) {
|
|
1906
|
+
const latestRun = this.getLatestClusterRun(repoId);
|
|
1907
|
+
if (!latestRun) {
|
|
1908
|
+
return [];
|
|
1909
|
+
}
|
|
1910
|
+
const rows = this.db
|
|
1911
|
+
.prepare(`select
|
|
1912
|
+
case
|
|
1913
|
+
when se.left_thread_id = ? then se.right_thread_id
|
|
1914
|
+
else se.left_thread_id
|
|
1915
|
+
end as neighbor_thread_id,
|
|
1916
|
+
case
|
|
1917
|
+
when se.left_thread_id = ? then t2.number
|
|
1918
|
+
else t1.number
|
|
1919
|
+
end as neighbor_number,
|
|
1920
|
+
case
|
|
1921
|
+
when se.left_thread_id = ? then t2.kind
|
|
1922
|
+
else t1.kind
|
|
1923
|
+
end as neighbor_kind,
|
|
1924
|
+
case
|
|
1925
|
+
when se.left_thread_id = ? then t2.title
|
|
1926
|
+
else t1.title
|
|
1927
|
+
end as neighbor_title,
|
|
1928
|
+
se.score
|
|
1929
|
+
from similarity_edges se
|
|
1930
|
+
join threads t1 on t1.id = se.left_thread_id
|
|
1931
|
+
join threads t2 on t2.id = se.right_thread_id
|
|
1932
|
+
where se.repo_id = ?
|
|
1933
|
+
and se.cluster_run_id = ?
|
|
1934
|
+
and (se.left_thread_id = ? or se.right_thread_id = ?)
|
|
1935
|
+
and t1.state = 'open'
|
|
1936
|
+
and t1.closed_at_local is null
|
|
1937
|
+
and t2.state = 'open'
|
|
1938
|
+
and t2.closed_at_local is null
|
|
1939
|
+
order by se.score desc
|
|
1940
|
+
limit ?`)
|
|
1941
|
+
.all(threadId, threadId, threadId, threadId, repoId, latestRun.id, threadId, threadId, limit);
|
|
1942
|
+
return rows.map((row) => ({
|
|
1943
|
+
threadId: row.neighbor_thread_id,
|
|
1944
|
+
number: row.neighbor_number,
|
|
1945
|
+
kind: row.neighbor_kind,
|
|
1946
|
+
title: row.neighbor_title,
|
|
1947
|
+
score: row.score,
|
|
1948
|
+
}));
|
|
1949
|
+
}
|
|
1651
1950
|
getEmbeddingWorkset(repoId, threadNumber) {
|
|
1652
1951
|
let sql = `select t.id, t.number, t.title, t.body
|
|
1653
1952
|
from threads t
|
|
1654
|
-
where t.repo_id = ? and t.state = 'open'`;
|
|
1953
|
+
where t.repo_id = ? and t.state = 'open' and t.closed_at_local is null`;
|
|
1655
1954
|
const args = [repoId];
|
|
1656
1955
|
if (threadNumber) {
|
|
1657
1956
|
sql += ' and t.number = ?';
|
|
@@ -1684,7 +1983,7 @@ export class GHCrawlService {
|
|
|
1684
1983
|
let sql = `select s.thread_id, s.summary_kind, s.summary_text
|
|
1685
1984
|
from document_summaries s
|
|
1686
1985
|
join threads t on t.id = s.thread_id
|
|
1687
|
-
where t.repo_id = ? and t.state = 'open' and s.model = ?`;
|
|
1986
|
+
where t.repo_id = ? and t.state = 'open' and t.closed_at_local is null and s.model = ?`;
|
|
1688
1987
|
const args = [repoId, this.config.summaryModel];
|
|
1689
1988
|
if (threadNumber) {
|
|
1690
1989
|
sql += ' and t.number = ?';
|
|
@@ -1726,6 +2025,9 @@ export class GHCrawlService {
|
|
|
1726
2025
|
bySource.set(row.source_kind, list);
|
|
1727
2026
|
}
|
|
1728
2027
|
const aggregated = new Map();
|
|
2028
|
+
const totalItems = Array.from(bySource.values()).reduce((sum, items) => sum + items.length, 0);
|
|
2029
|
+
let processedItems = 0;
|
|
2030
|
+
let lastProgressAt = Date.now();
|
|
1729
2031
|
for (const [sourceKind, items] of bySource.entries()) {
|
|
1730
2032
|
for (const item of items) {
|
|
1731
2033
|
const neighbors = rankNearestNeighbors(items, {
|
|
@@ -1749,10 +2051,43 @@ export class GHCrawlService {
|
|
|
1749
2051
|
sourceKinds: new Set([sourceKind]),
|
|
1750
2052
|
});
|
|
1751
2053
|
}
|
|
2054
|
+
processedItems += 1;
|
|
2055
|
+
const now = Date.now();
|
|
2056
|
+
if (params.onProgress && now - lastProgressAt >= CLUSTER_PROGRESS_INTERVAL_MS) {
|
|
2057
|
+
params.onProgress(`[cluster] identifying similarity edges ${processedItems}/${totalItems} source embeddings processed current_edges=${aggregated.size}`);
|
|
2058
|
+
lastProgressAt = now;
|
|
2059
|
+
}
|
|
1752
2060
|
}
|
|
1753
2061
|
}
|
|
1754
2062
|
return aggregated;
|
|
1755
2063
|
}
|
|
2064
|
+
persistClusterRun(repoId, runId, aggregatedEdges, clusters) {
|
|
2065
|
+
const insertEdge = this.db.prepare(`insert into similarity_edges (repo_id, cluster_run_id, left_thread_id, right_thread_id, method, score, explanation_json, created_at)
|
|
2066
|
+
values (?, ?, ?, ?, ?, ?, ?, ?)`);
|
|
2067
|
+
const insertCluster = this.db.prepare('insert into clusters (repo_id, cluster_run_id, representative_thread_id, member_count, created_at) values (?, ?, ?, ?, ?)');
|
|
2068
|
+
const insertMember = this.db.prepare('insert into cluster_members (cluster_id, thread_id, score_to_representative, created_at) values (?, ?, ?, ?)');
|
|
2069
|
+
this.db.transaction(() => {
|
|
2070
|
+
this.db.prepare('delete from cluster_members where cluster_id in (select id from clusters where cluster_run_id = ?)').run(runId);
|
|
2071
|
+
this.db.prepare('delete from clusters where cluster_run_id = ?').run(runId);
|
|
2072
|
+
this.db.prepare('delete from similarity_edges where cluster_run_id = ?').run(runId);
|
|
2073
|
+
const createdAt = nowIso();
|
|
2074
|
+
for (const edge of aggregatedEdges.values()) {
|
|
2075
|
+
insertEdge.run(repoId, runId, edge.leftThreadId, edge.rightThreadId, 'exact_cosine', edge.score, asJson({ sources: Array.from(edge.sourceKinds).sort(), model: this.config.embedModel }), createdAt);
|
|
2076
|
+
}
|
|
2077
|
+
for (const cluster of clusters) {
|
|
2078
|
+
const clusterResult = insertCluster.run(repoId, runId, cluster.representativeThreadId, cluster.members.length, createdAt);
|
|
2079
|
+
const clusterId = Number(clusterResult.lastInsertRowid);
|
|
2080
|
+
for (const memberId of cluster.members) {
|
|
2081
|
+
const key = this.edgeKey(cluster.representativeThreadId, memberId);
|
|
2082
|
+
const score = memberId === cluster.representativeThreadId ? null : (aggregatedEdges.get(key)?.score ?? null);
|
|
2083
|
+
insertMember.run(clusterId, memberId, score, createdAt);
|
|
2084
|
+
}
|
|
2085
|
+
}
|
|
2086
|
+
})();
|
|
2087
|
+
}
|
|
2088
|
+
pruneOldClusterRuns(repoId, keepRunId) {
|
|
2089
|
+
this.db.prepare('delete from cluster_runs where repo_id = ? and id <> ?').run(repoId, keepRunId);
|
|
2090
|
+
}
|
|
1756
2091
|
upsertSummary(threadId, contentHash, summaryKind, summaryText) {
|
|
1757
2092
|
this.db
|
|
1758
2093
|
.prepare(`insert into document_summaries (thread_id, summary_kind, model, content_hash, summary_text, created_at, updated_at)
|