@ghcrawl/api-core 0.2.0 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/service.js CHANGED
@@ -1,7 +1,7 @@
1
1
  import http from 'node:http';
2
2
  import crypto from 'node:crypto';
3
3
  import { IterableMapper } from '@shutterstock/p-map-iterable';
4
- import { actionResponseSchema, clusterDetailResponseSchema, clusterResultSchema, clusterSummariesResponseSchema, clustersResponseSchema, embedResultSchema, healthResponseSchema, neighborsResponseSchema, refreshResponseSchema, repositoriesResponseSchema, searchResponseSchema, syncResultSchema, threadsResponseSchema, } from '@ghcrawl/api-contract';
4
+ import { actionResponseSchema, authorThreadsResponseSchema, closeResponseSchema, clusterDetailResponseSchema, clusterResultSchema, clusterSummariesResponseSchema, clustersResponseSchema, embedResultSchema, healthResponseSchema, neighborsResponseSchema, refreshResponseSchema, repositoriesResponseSchema, searchResponseSchema, syncResultSchema, threadsResponseSchema, } from '@ghcrawl/api-contract';
5
5
  import { buildClusters } from './cluster/build.js';
6
6
  import { ensureRuntimeDirs, isLikelyGitHubToken, isLikelyOpenAiApiKey, loadConfig, requireGithubToken, requireOpenAiKey, } from './config.js';
7
7
  import { migrate } from './db/migrate.js';
@@ -13,6 +13,7 @@ import { cosineSimilarity, rankNearestNeighbors } from './search/exact.js';
13
13
  const SYNC_BATCH_SIZE = 100;
14
14
  const SYNC_BATCH_DELAY_MS = 5000;
15
15
  const STALE_CLOSED_SWEEP_LIMIT = 1000;
16
+ const CLUSTER_PROGRESS_INTERVAL_MS = 5000;
16
17
  const EMBED_ESTIMATED_CHARS_PER_TOKEN = 3;
17
18
  const EMBED_MAX_ITEM_TOKENS = 7000;
18
19
  const EMBED_MAX_BATCH_TOKENS = 250000;
@@ -26,6 +27,9 @@ function parseIso(value) {
26
27
  const parsed = Date.parse(value);
27
28
  return Number.isNaN(parsed) ? null : parsed;
28
29
  }
30
+ function isEffectivelyClosed(row) {
31
+ return row.state !== 'open' || row.closed_at_local !== null;
32
+ }
29
33
  function isMissingGitHubResourceError(error) {
30
34
  const status = typeof error?.status === 'number' ? Number(error.status) : null;
31
35
  if (status === 404 || status === 410) {
@@ -149,6 +153,10 @@ function threadToDto(row, clusterId) {
149
153
  number: row.number,
150
154
  kind: row.kind,
151
155
  state: row.state,
156
+ isClosed: isEffectivelyClosed(row),
157
+ closedAtGh: row.closed_at_gh ?? null,
158
+ closedAtLocal: row.closed_at_local ?? null,
159
+ closeReasonLocal: row.close_reason_local ?? null,
152
160
  title: row.title,
153
161
  body: row.body,
154
162
  authorLogin: row.author_login,
@@ -259,17 +267,177 @@ export class GHCrawlService {
259
267
  .all(repository.id, repository.id);
260
268
  for (const row of clusterRows)
261
269
  clusterIds.set(row.thread_id, row.cluster_id);
262
- let sql = "select * from threads where repo_id = ? and state = 'open'";
270
+ let sql = 'select * from threads where repo_id = ?';
263
271
  const args = [repository.id];
272
+ if (!params.includeClosed) {
273
+ sql += " and state = 'open' and closed_at_local is null";
274
+ }
264
275
  if (params.kind) {
265
276
  sql += ' and kind = ?';
266
277
  args.push(params.kind);
267
278
  }
279
+ if (params.numbers && params.numbers.length > 0) {
280
+ const uniqueNumbers = Array.from(new Set(params.numbers.filter((value) => Number.isSafeInteger(value) && value > 0)));
281
+ if (uniqueNumbers.length === 0) {
282
+ return threadsResponseSchema.parse({
283
+ repository,
284
+ threads: [],
285
+ });
286
+ }
287
+ sql += ` and number in (${uniqueNumbers.map(() => '?').join(', ')})`;
288
+ args.push(...uniqueNumbers);
289
+ }
268
290
  sql += ' order by updated_at_gh desc, number desc';
269
291
  const rows = this.db.prepare(sql).all(...args);
292
+ const orderedRows = params.numbers && params.numbers.length > 0
293
+ ? (() => {
294
+ const byNumber = new Map(rows.map((row) => [row.number, row]));
295
+ const uniqueRequested = Array.from(new Set(params.numbers));
296
+ return uniqueRequested.map((number) => byNumber.get(number)).filter((row) => row !== undefined);
297
+ })()
298
+ : rows;
270
299
  return threadsResponseSchema.parse({
271
300
  repository,
272
- threads: rows.map((row) => threadToDto(row, clusterIds.get(row.id) ?? null)),
301
+ threads: orderedRows.map((row) => threadToDto(row, clusterIds.get(row.id) ?? null)),
302
+ });
303
+ }
304
+ listAuthorThreads(params) {
305
+ const repository = this.requireRepository(params.owner, params.repo);
306
+ const normalizedLogin = params.login.trim();
307
+ if (!normalizedLogin) {
308
+ return authorThreadsResponseSchema.parse({
309
+ repository,
310
+ authorLogin: '',
311
+ threads: [],
312
+ });
313
+ }
314
+ const clusterIds = new Map();
315
+ const clusterRows = this.db
316
+ .prepare(`select cm.thread_id, cm.cluster_id
317
+ from cluster_members cm
318
+ join clusters c on c.id = cm.cluster_id
319
+ where c.repo_id = ? and c.cluster_run_id = (
320
+ select id from cluster_runs where repo_id = ? and status = 'completed' order by id desc limit 1
321
+ )`)
322
+ .all(repository.id, repository.id);
323
+ for (const row of clusterRows)
324
+ clusterIds.set(row.thread_id, row.cluster_id);
325
+ const rows = this.db
326
+ .prepare(`select *
327
+ from threads
328
+ where repo_id = ? and lower(author_login) = lower(?)
329
+ ${params.includeClosed ? '' : "and state = 'open' and closed_at_local is null"}
330
+ order by updated_at_gh desc, number desc`)
331
+ .all(repository.id, normalizedLogin);
332
+ const latestRun = this.getLatestClusterRun(repository.id);
333
+ const strongestByThread = new Map();
334
+ if (latestRun && rows.length > 1) {
335
+ const edges = this.db
336
+ .prepare(`select
337
+ se.left_thread_id,
338
+ se.right_thread_id,
339
+ se.score,
340
+ t1.number as left_number,
341
+ t1.kind as left_kind,
342
+ t1.title as left_title,
343
+ t2.number as right_number,
344
+ t2.kind as right_kind,
345
+ t2.title as right_title
346
+ from similarity_edges se
347
+ join threads t1 on t1.id = se.left_thread_id
348
+ join threads t2 on t2.id = se.right_thread_id
349
+ where se.repo_id = ?
350
+ and se.cluster_run_id = ?
351
+ and lower(t1.author_login) = lower(?)
352
+ and lower(t2.author_login) = lower(?)
353
+ ${params.includeClosed ? '' : "and t1.state = 'open' and t1.closed_at_local is null and t2.state = 'open' and t2.closed_at_local is null"}`)
354
+ .all(repository.id, latestRun.id, normalizedLogin, normalizedLogin);
355
+ const updateStrongest = (sourceThreadId, match) => {
356
+ const previous = strongestByThread.get(sourceThreadId);
357
+ if (!previous || match.score > previous.score) {
358
+ strongestByThread.set(sourceThreadId, match);
359
+ }
360
+ };
361
+ for (const edge of edges) {
362
+ updateStrongest(edge.left_thread_id, {
363
+ threadId: edge.right_thread_id,
364
+ number: edge.right_number,
365
+ kind: edge.right_kind,
366
+ title: edge.right_title,
367
+ score: edge.score,
368
+ });
369
+ updateStrongest(edge.right_thread_id, {
370
+ threadId: edge.left_thread_id,
371
+ number: edge.left_number,
372
+ kind: edge.left_kind,
373
+ title: edge.left_title,
374
+ score: edge.score,
375
+ });
376
+ }
377
+ }
378
+ return authorThreadsResponseSchema.parse({
379
+ repository,
380
+ authorLogin: normalizedLogin,
381
+ threads: rows.map((row) => ({
382
+ thread: threadToDto(row, clusterIds.get(row.id) ?? null),
383
+ strongestSameAuthorMatch: strongestByThread.get(row.id) ?? null,
384
+ })),
385
+ });
386
+ }
387
+ closeThreadLocally(params) {
388
+ const repository = this.requireRepository(params.owner, params.repo);
389
+ const row = this.db
390
+ .prepare('select * from threads where repo_id = ? and number = ? limit 1')
391
+ .get(repository.id, params.threadNumber);
392
+ if (!row) {
393
+ throw new Error(`Thread #${params.threadNumber} was not found for ${repository.fullName}.`);
394
+ }
395
+ const closedAt = nowIso();
396
+ this.db
397
+ .prepare(`update threads
398
+ set closed_at_local = ?,
399
+ close_reason_local = 'manual',
400
+ updated_at = ?
401
+ where id = ?`)
402
+ .run(closedAt, closedAt, row.id);
403
+ this.parsedEmbeddingCache.delete(repository.id);
404
+ const clusterIds = this.getLatestRunClusterIdsForThread(repository.id, row.id);
405
+ const clusterClosed = this.reconcileClusterCloseState(repository.id, clusterIds) > 0;
406
+ const updated = this.db.prepare('select * from threads where id = ? limit 1').get(row.id);
407
+ return closeResponseSchema.parse({
408
+ ok: true,
409
+ repository,
410
+ thread: threadToDto(updated),
411
+ clusterId: clusterIds[0] ?? null,
412
+ clusterClosed,
413
+ message: `Marked ${updated.kind} #${updated.number} closed locally.`,
414
+ });
415
+ }
416
+ closeClusterLocally(params) {
417
+ const repository = this.requireRepository(params.owner, params.repo);
418
+ const latestRun = this.getLatestClusterRun(repository.id);
419
+ if (!latestRun) {
420
+ throw new Error(`No completed cluster run found for ${repository.fullName}.`);
421
+ }
422
+ const row = this.db
423
+ .prepare('select id from clusters where repo_id = ? and cluster_run_id = ? and id = ? limit 1')
424
+ .get(repository.id, latestRun.id, params.clusterId);
425
+ if (!row) {
426
+ throw new Error(`Cluster ${params.clusterId} was not found for ${repository.fullName}.`);
427
+ }
428
+ const closedAt = nowIso();
429
+ this.db
430
+ .prepare(`update clusters
431
+ set closed_at_local = ?,
432
+ close_reason_local = 'manual'
433
+ where id = ?`)
434
+ .run(closedAt, row.id);
435
+ return closeResponseSchema.parse({
436
+ ok: true,
437
+ repository,
438
+ clusterId: row.id,
439
+ clusterClosed: true,
440
+ message: `Marked cluster ${row.id} closed locally.`,
273
441
  });
274
442
  }
275
443
  async syncRepository(params) {
@@ -366,6 +534,10 @@ export class GHCrawlService {
366
534
  })
367
535
  : 0;
368
536
  const threadsClosed = threadsClosedFromClosedSweep + threadsClosedFromDirectReconcile;
537
+ this.parsedEmbeddingCache.delete(repoId);
538
+ if (threadsClosed > 0) {
539
+ this.reconcileClusterCloseState(repoId);
540
+ }
369
541
  const finishedAt = nowIso();
370
542
  const reconciledOpenCloseAt = shouldSweepClosedOverlap || shouldReconcileMissingOpenThreads ? finishedAt : null;
371
543
  const nextSyncCursor = {
@@ -542,34 +714,21 @@ export class GHCrawlService {
542
714
  title: meta.title,
543
715
  }));
544
716
  params.onProgress?.(`[cluster] loaded ${items.length} embedded thread(s) across ${new Set(rows.map((row) => row.source_kind)).size} source kind(s) for ${repository.fullName} k=${k} minScore=${minScore}`);
545
- this.db.prepare('delete from cluster_members where cluster_id in (select id from clusters where cluster_run_id = ?)').run(runId);
546
- this.db.prepare('delete from clusters where cluster_run_id = ?').run(runId);
547
- this.db.prepare('delete from similarity_edges where cluster_run_id = ?').run(runId);
548
- const aggregatedEdges = this.aggregateRepositoryEdges(rows, { limit: k, minScore });
717
+ const aggregatedEdges = this.aggregateRepositoryEdges(rows, {
718
+ limit: k,
719
+ minScore,
720
+ onProgress: params.onProgress,
721
+ });
549
722
  const edges = Array.from(aggregatedEdges.values()).map((entry) => ({
550
723
  leftThreadId: entry.leftThreadId,
551
724
  rightThreadId: entry.rightThreadId,
552
725
  score: entry.score,
553
726
  }));
554
- const insertEdge = this.db.prepare(`insert into similarity_edges (repo_id, cluster_run_id, left_thread_id, right_thread_id, method, score, explanation_json, created_at)
555
- values (?, ?, ?, ?, ?, ?, ?, ?)`);
556
- for (const edge of aggregatedEdges.values()) {
557
- insertEdge.run(repository.id, runId, edge.leftThreadId, edge.rightThreadId, 'exact_cosine', edge.score, asJson({ sources: Array.from(edge.sourceKinds).sort(), model: this.config.embedModel }), nowIso());
558
- }
559
727
  params.onProgress?.(`[cluster] built ${edges.length} similarity edge(s)`);
560
728
  const clusters = buildClusters(items.map((item) => ({ threadId: item.id, number: item.number, title: item.title })), edges);
561
- const insertCluster = this.db.prepare('insert into clusters (repo_id, cluster_run_id, representative_thread_id, member_count, created_at) values (?, ?, ?, ?, ?)');
562
- const insertMember = this.db.prepare('insert into cluster_members (cluster_id, thread_id, score_to_representative, created_at) values (?, ?, ?, ?)');
563
- for (const cluster of clusters) {
564
- const clusterResult = insertCluster.run(repository.id, runId, cluster.representativeThreadId, cluster.members.length, nowIso());
565
- const clusterId = Number(clusterResult.lastInsertRowid);
566
- for (const memberId of cluster.members) {
567
- const key = this.edgeKey(cluster.representativeThreadId, memberId);
568
- const score = memberId === cluster.representativeThreadId ? null : (aggregatedEdges.get(key)?.score ?? null);
569
- insertMember.run(clusterId, memberId, score, nowIso());
570
- }
571
- }
572
- params.onProgress?.(`[cluster] persisted ${clusters.length} cluster(s)`);
729
+ this.persistClusterRun(repository.id, runId, aggregatedEdges, clusters);
730
+ this.pruneOldClusterRuns(repository.id, runId);
731
+ params.onProgress?.(`[cluster] persisted ${clusters.length} cluster(s) and pruned older cluster runs`);
573
732
  this.finishRun('cluster_runs', runId, 'completed', { edges: edges.length, clusters: clusters.length });
574
733
  return clusterResultSchema.parse({ runId, edges: edges.length, clusters: clusters.length });
575
734
  }
@@ -590,7 +749,7 @@ export class GHCrawlService {
590
749
  from documents_fts
591
750
  join documents d on d.id = documents_fts.rowid
592
751
  join threads t on t.id = d.thread_id
593
- where t.repo_id = ? and t.state = 'open' and documents_fts match ?
752
+ where t.repo_id = ? and t.state = 'open' and t.closed_at_local is null and documents_fts match ?
594
753
  order by rank
595
754
  limit ?`)
596
755
  .all(repository.id, params.query, limit * 2);
@@ -612,7 +771,7 @@ export class GHCrawlService {
612
771
  const threadRows = candidateIds.size
613
772
  ? this.db
614
773
  .prepare(`select * from threads
615
- where repo_id = ? and state = 'open' and id in (${[...candidateIds].map(() => '?').join(',')})
774
+ where repo_id = ? and state = 'open' and closed_at_local is null and id in (${[...candidateIds].map(() => '?').join(',')})
616
775
  order by updated_at_gh desc, number desc`)
617
776
  .all(repository.id, ...candidateIds)
618
777
  : [];
@@ -724,7 +883,8 @@ export class GHCrawlService {
724
883
  }
725
884
  const rows = this.db
726
885
  .prepare(`select c.id, c.repo_id, c.representative_thread_id, c.member_count,
727
- cm.thread_id, cm.score_to_representative, t.number, t.kind, t.title
886
+ c.closed_at_local, c.close_reason_local,
887
+ cm.thread_id, cm.score_to_representative, t.number, t.kind, t.title, t.state, t.closed_at_local as thread_closed_at_local
728
888
  from clusters c
729
889
  left join cluster_members cm on cm.cluster_id = c.id
730
890
  left join threads t on t.id = cm.thread_id
@@ -736,6 +896,9 @@ export class GHCrawlService {
736
896
  const cluster = clusters.get(row.id) ?? {
737
897
  id: row.id,
738
898
  repoId: row.repo_id,
899
+ isClosed: row.close_reason_local !== null,
900
+ closedAtLocal: row.closed_at_local,
901
+ closeReasonLocal: row.close_reason_local,
739
902
  representativeThreadId: row.representative_thread_id,
740
903
  memberCount: row.member_count,
741
904
  members: [],
@@ -745,15 +908,20 @@ export class GHCrawlService {
745
908
  threadId: row.thread_id,
746
909
  number: row.number,
747
910
  kind: row.kind,
911
+ isClosed: row.state !== null && isEffectivelyClosed({ state: row.state, closed_at_local: row.thread_closed_at_local }),
748
912
  title: row.title,
749
913
  scoreToRepresentative: row.score_to_representative,
750
914
  });
751
915
  }
752
916
  clusters.set(row.id, cluster);
753
917
  }
918
+ const clusterValues = Array.from(clusters.values()).map((cluster) => ({
919
+ ...cluster,
920
+ isClosed: cluster.isClosed || (cluster.memberCount > 0 && cluster.members.every((member) => member.isClosed)),
921
+ }));
754
922
  return clustersResponseSchema.parse({
755
923
  repository,
756
- clusters: Array.from(clusters.values()),
924
+ clusters: clusterValues.filter((cluster) => (params.includeClosed ? true : !cluster.isClosed)),
757
925
  });
758
926
  }
759
927
  async refreshRepository(params) {
@@ -808,6 +976,7 @@ export class GHCrawlService {
808
976
  minSize: params.minSize,
809
977
  sort: params.sort,
810
978
  search: params.search,
979
+ includeClosedClusters: params.includeClosed === true,
811
980
  });
812
981
  const clusters = params.limit ? snapshot.clusters.slice(0, params.limit) : snapshot.clusters;
813
982
  return clusterSummariesResponseSchema.parse({
@@ -816,6 +985,9 @@ export class GHCrawlService {
816
985
  clusters: clusters.map((cluster) => ({
817
986
  clusterId: cluster.clusterId,
818
987
  displayTitle: cluster.displayTitle,
988
+ isClosed: cluster.isClosed,
989
+ closedAtLocal: cluster.closedAtLocal,
990
+ closeReasonLocal: cluster.closeReasonLocal,
819
991
  totalCount: cluster.totalCount,
820
992
  issueCount: cluster.issueCount,
821
993
  pullRequestCount: cluster.pullRequestCount,
@@ -831,6 +1003,7 @@ export class GHCrawlService {
831
1003
  owner: params.owner,
832
1004
  repo: params.repo,
833
1005
  minSize: 0,
1006
+ includeClosedClusters: params.includeClosed === true,
834
1007
  });
835
1008
  const cluster = snapshot.clusters.find((item) => item.clusterId === params.clusterId);
836
1009
  if (!cluster) {
@@ -863,6 +1036,9 @@ export class GHCrawlService {
863
1036
  cluster: {
864
1037
  clusterId: cluster.clusterId,
865
1038
  displayTitle: cluster.displayTitle,
1039
+ isClosed: cluster.isClosed,
1040
+ closedAtLocal: cluster.closedAtLocal,
1041
+ closeReasonLocal: cluster.closeReasonLocal,
866
1042
  totalCount: cluster.totalCount,
867
1043
  issueCount: cluster.issueCount,
868
1044
  pullRequestCount: cluster.pullRequestCount,
@@ -881,7 +1057,9 @@ export class GHCrawlService {
881
1057
  if (!latestRun) {
882
1058
  return { repository, stats, clusters: [] };
883
1059
  }
1060
+ const includeClosedClusters = params.includeClosedClusters ?? true;
884
1061
  const clusters = this.listRawTuiClusters(repository.id, latestRun.id)
1062
+ .filter((cluster) => (includeClosedClusters ? true : !cluster.isClosed))
885
1063
  .filter((cluster) => cluster.totalCount >= (params.minSize ?? 10))
886
1064
  .filter((cluster) => {
887
1065
  const search = params.search?.trim().toLowerCase();
@@ -907,7 +1085,7 @@ export class GHCrawlService {
907
1085
  throw new Error(`Cluster ${params.clusterId} was not found for ${repository.fullName}.`);
908
1086
  }
909
1087
  const rows = this.db
910
- .prepare(`select t.id, t.number, t.kind, t.title, t.updated_at_gh, t.html_url, t.labels_json, cm.score_to_representative
1088
+ .prepare(`select t.id, t.number, t.kind, t.state, t.closed_at_local, t.title, t.updated_at_gh, t.html_url, t.labels_json, cm.score_to_representative
911
1089
  from cluster_members cm
912
1090
  join threads t on t.id = cm.thread_id
913
1091
  where cm.cluster_id = ?
@@ -919,6 +1097,9 @@ export class GHCrawlService {
919
1097
  return {
920
1098
  clusterId: summary.clusterId,
921
1099
  displayTitle: summary.displayTitle,
1100
+ isClosed: summary.isClosed,
1101
+ closedAtLocal: summary.closedAtLocal,
1102
+ closeReasonLocal: summary.closeReasonLocal,
922
1103
  totalCount: summary.totalCount,
923
1104
  issueCount: summary.issueCount,
924
1105
  pullRequestCount: summary.pullRequestCount,
@@ -930,6 +1111,7 @@ export class GHCrawlService {
930
1111
  id: row.id,
931
1112
  number: row.number,
932
1113
  kind: row.kind,
1114
+ isClosed: isEffectivelyClosed(row),
933
1115
  title: row.title,
934
1116
  updatedAtGh: row.updated_at_gh,
935
1117
  htmlUrl: row.html_url,
@@ -942,11 +1124,11 @@ export class GHCrawlService {
942
1124
  const repository = this.requireRepository(params.owner, params.repo);
943
1125
  const row = params.threadId
944
1126
  ? (this.db
945
- .prepare('select * from threads where repo_id = ? and id = ? and state = \'open\' limit 1')
1127
+ .prepare('select * from threads where repo_id = ? and id = ? limit 1')
946
1128
  .get(repository.id, params.threadId) ?? null)
947
1129
  : params.threadNumber
948
1130
  ? (this.db
949
- .prepare('select * from threads where repo_id = ? and number = ? and state = \'open\' limit 1')
1131
+ .prepare('select * from threads where repo_id = ? and number = ? limit 1')
950
1132
  .get(repository.id, params.threadNumber) ?? null)
951
1133
  : null;
952
1134
  if (!row) {
@@ -979,17 +1161,20 @@ export class GHCrawlService {
979
1161
  }
980
1162
  let neighbors = [];
981
1163
  if (params.includeNeighbors !== false) {
982
- try {
983
- neighbors = this.listNeighbors({
984
- owner: params.owner,
985
- repo: params.repo,
986
- threadNumber: row.number,
987
- limit: 8,
988
- minScore: 0.2,
989
- }).neighbors;
990
- }
991
- catch {
992
- neighbors = [];
1164
+ neighbors = this.listStoredClusterNeighbors(repository.id, row.id, 8);
1165
+ if (neighbors.length === 0) {
1166
+ try {
1167
+ neighbors = this.listNeighbors({
1168
+ owner: params.owner,
1169
+ repo: params.repo,
1170
+ threadNumber: row.number,
1171
+ limit: 8,
1172
+ minScore: 0.2,
1173
+ }).neighbors;
1174
+ }
1175
+ catch {
1176
+ neighbors = [];
1177
+ }
993
1178
  }
994
1179
  }
995
1180
  return {
@@ -1103,7 +1288,7 @@ export class GHCrawlService {
1103
1288
  const counts = this.db
1104
1289
  .prepare(`select kind, count(*) as count
1105
1290
  from threads
1106
- where repo_id = ? and state = 'open'
1291
+ where repo_id = ? and state = 'open' and closed_at_local is null
1107
1292
  group by kind`)
1108
1293
  .all(repoId);
1109
1294
  const latestRun = this.getLatestClusterRun(repoId);
@@ -1131,11 +1316,73 @@ export class GHCrawlService {
1131
1316
  .prepare("select id, finished_at from cluster_runs where repo_id = ? and status = 'completed' order by id desc limit 1")
1132
1317
  .get(repoId) ?? null);
1133
1318
  }
1319
+ getLatestRunClusterIdsForThread(repoId, threadId) {
1320
+ const latestRun = this.getLatestClusterRun(repoId);
1321
+ if (!latestRun) {
1322
+ return [];
1323
+ }
1324
+ return this.db
1325
+ .prepare(`select cm.cluster_id
1326
+ from cluster_members cm
1327
+ join clusters c on c.id = cm.cluster_id
1328
+ where c.repo_id = ? and c.cluster_run_id = ? and cm.thread_id = ?
1329
+ order by cm.cluster_id asc`)
1330
+ .all(repoId, latestRun.id, threadId).map((row) => row.cluster_id);
1331
+ }
1332
+ reconcileClusterCloseState(repoId, clusterIds) {
1333
+ const latestRun = this.getLatestClusterRun(repoId);
1334
+ if (!latestRun) {
1335
+ return 0;
1336
+ }
1337
+ const resolvedClusterIds = clusterIds && clusterIds.length > 0
1338
+ ? Array.from(new Set(clusterIds))
1339
+ : this.db
1340
+ .prepare('select id from clusters where repo_id = ? and cluster_run_id = ? order by id asc')
1341
+ .all(repoId, latestRun.id).map((row) => row.id);
1342
+ if (resolvedClusterIds.length === 0) {
1343
+ return 0;
1344
+ }
1345
+ const summarize = this.db.prepare(`select
1346
+ c.id,
1347
+ c.close_reason_local,
1348
+ count(*) as member_count,
1349
+ sum(case when t.state != 'open' or t.closed_at_local is not null then 1 else 0 end) as closed_member_count
1350
+ from clusters c
1351
+ join cluster_members cm on cm.cluster_id = c.id
1352
+ join threads t on t.id = cm.thread_id
1353
+ where c.id = ?
1354
+ group by c.id, c.close_reason_local`);
1355
+ const markClosed = this.db.prepare(`update clusters
1356
+ set closed_at_local = coalesce(closed_at_local, ?),
1357
+ close_reason_local = 'all_members_closed'
1358
+ where id = ?`);
1359
+ const clearClosed = this.db.prepare(`update clusters
1360
+ set closed_at_local = null,
1361
+ close_reason_local = null
1362
+ where id = ? and close_reason_local = 'all_members_closed'`);
1363
+ let changed = 0;
1364
+ for (const clusterId of resolvedClusterIds) {
1365
+ const row = summarize.get(clusterId);
1366
+ if (!row || row.close_reason_local === 'manual') {
1367
+ continue;
1368
+ }
1369
+ if (row.member_count > 0 && row.closed_member_count >= row.member_count) {
1370
+ const result = markClosed.run(nowIso(), clusterId);
1371
+ changed += result.changes;
1372
+ continue;
1373
+ }
1374
+ const cleared = clearClosed.run(clusterId);
1375
+ changed += cleared.changes;
1376
+ }
1377
+ return changed;
1378
+ }
1134
1379
  listRawTuiClusters(repoId, clusterRunId) {
1135
1380
  const rows = this.db
1136
1381
  .prepare(`select
1137
1382
  c.id as cluster_id,
1138
1383
  c.member_count,
1384
+ c.closed_at_local,
1385
+ c.close_reason_local,
1139
1386
  c.representative_thread_id,
1140
1387
  rt.number as representative_number,
1141
1388
  rt.kind as representative_kind,
@@ -1143,6 +1390,7 @@ export class GHCrawlService {
1143
1390
  max(coalesce(t.updated_at_gh, t.updated_at)) as latest_updated_at,
1144
1391
  sum(case when t.kind = 'issue' then 1 else 0 end) as issue_count,
1145
1392
  sum(case when t.kind = 'pull_request' then 1 else 0 end) as pull_request_count,
1393
+ sum(case when t.state != 'open' or t.closed_at_local is not null then 1 else 0 end) as closed_member_count,
1146
1394
  group_concat(lower(coalesce(t.title, '')), ' ') as search_text
1147
1395
  from clusters c
1148
1396
  left join threads rt on rt.id = c.representative_thread_id
@@ -1160,6 +1408,9 @@ export class GHCrawlService {
1160
1408
  return rows.map((row) => ({
1161
1409
  clusterId: row.cluster_id,
1162
1410
  displayTitle: row.representative_title ?? `Cluster ${row.cluster_id}`,
1411
+ isClosed: row.close_reason_local !== null || row.closed_member_count >= row.member_count,
1412
+ closedAtLocal: row.closed_at_local,
1413
+ closeReasonLocal: row.close_reason_local,
1163
1414
  totalCount: row.member_count,
1164
1415
  issueCount: row.issue_count,
1165
1416
  pullRequestCount: row.pull_request_count,
@@ -1297,6 +1548,7 @@ export class GHCrawlService {
1297
1548
  from threads
1298
1549
  where repo_id = ?
1299
1550
  and state = 'open'
1551
+ and closed_at_local is null
1300
1552
  and (last_pulled_at is null or last_pulled_at < ?)
1301
1553
  order by number asc`)
1302
1554
  .all(params.repoId, params.crawlStartedAt);
@@ -1341,6 +1593,7 @@ export class GHCrawlService {
1341
1593
  from threads
1342
1594
  where repo_id = ?
1343
1595
  and state = 'open'
1596
+ and closed_at_local is null
1344
1597
  and (last_pulled_at is null or last_pulled_at < ?)
1345
1598
  order by number asc`)
1346
1599
  .all(params.repoId, params.crawlStartedAt);
@@ -1628,11 +1881,12 @@ export class GHCrawlService {
1628
1881
  }
1629
1882
  loadStoredEmbeddings(repoId) {
1630
1883
  return this.db
1631
- .prepare(`select t.id, t.repo_id, t.number, t.kind, t.state, t.title, t.body, t.author_login, t.html_url, t.labels_json,
1884
+ .prepare(`select t.id, t.repo_id, t.number, t.kind, t.state, t.closed_at_gh, t.closed_at_local, t.close_reason_local,
1885
+ t.title, t.body, t.author_login, t.html_url, t.labels_json,
1632
1886
  t.updated_at_gh, t.first_pulled_at, t.last_pulled_at, e.source_kind, e.embedding_json
1633
1887
  from threads t
1634
1888
  join document_embeddings e on e.thread_id = t.id
1635
- where t.repo_id = ? and t.state = 'open' and e.model = ?
1889
+ where t.repo_id = ? and t.state = 'open' and t.closed_at_local is null and e.model = ?
1636
1890
  order by t.number asc, e.source_kind asc`)
1637
1891
  .all(repoId, this.config.embedModel);
1638
1892
  }
@@ -1648,10 +1902,55 @@ export class GHCrawlService {
1648
1902
  this.parsedEmbeddingCache.set(repoId, parsed);
1649
1903
  return parsed;
1650
1904
  }
1905
+ listStoredClusterNeighbors(repoId, threadId, limit) {
1906
+ const latestRun = this.getLatestClusterRun(repoId);
1907
+ if (!latestRun) {
1908
+ return [];
1909
+ }
1910
+ const rows = this.db
1911
+ .prepare(`select
1912
+ case
1913
+ when se.left_thread_id = ? then se.right_thread_id
1914
+ else se.left_thread_id
1915
+ end as neighbor_thread_id,
1916
+ case
1917
+ when se.left_thread_id = ? then t2.number
1918
+ else t1.number
1919
+ end as neighbor_number,
1920
+ case
1921
+ when se.left_thread_id = ? then t2.kind
1922
+ else t1.kind
1923
+ end as neighbor_kind,
1924
+ case
1925
+ when se.left_thread_id = ? then t2.title
1926
+ else t1.title
1927
+ end as neighbor_title,
1928
+ se.score
1929
+ from similarity_edges se
1930
+ join threads t1 on t1.id = se.left_thread_id
1931
+ join threads t2 on t2.id = se.right_thread_id
1932
+ where se.repo_id = ?
1933
+ and se.cluster_run_id = ?
1934
+ and (se.left_thread_id = ? or se.right_thread_id = ?)
1935
+ and t1.state = 'open'
1936
+ and t1.closed_at_local is null
1937
+ and t2.state = 'open'
1938
+ and t2.closed_at_local is null
1939
+ order by se.score desc
1940
+ limit ?`)
1941
+ .all(threadId, threadId, threadId, threadId, repoId, latestRun.id, threadId, threadId, limit);
1942
+ return rows.map((row) => ({
1943
+ threadId: row.neighbor_thread_id,
1944
+ number: row.neighbor_number,
1945
+ kind: row.neighbor_kind,
1946
+ title: row.neighbor_title,
1947
+ score: row.score,
1948
+ }));
1949
+ }
1651
1950
  getEmbeddingWorkset(repoId, threadNumber) {
1652
1951
  let sql = `select t.id, t.number, t.title, t.body
1653
1952
  from threads t
1654
- where t.repo_id = ? and t.state = 'open'`;
1953
+ where t.repo_id = ? and t.state = 'open' and t.closed_at_local is null`;
1655
1954
  const args = [repoId];
1656
1955
  if (threadNumber) {
1657
1956
  sql += ' and t.number = ?';
@@ -1684,7 +1983,7 @@ export class GHCrawlService {
1684
1983
  let sql = `select s.thread_id, s.summary_kind, s.summary_text
1685
1984
  from document_summaries s
1686
1985
  join threads t on t.id = s.thread_id
1687
- where t.repo_id = ? and t.state = 'open' and s.model = ?`;
1986
+ where t.repo_id = ? and t.state = 'open' and t.closed_at_local is null and s.model = ?`;
1688
1987
  const args = [repoId, this.config.summaryModel];
1689
1988
  if (threadNumber) {
1690
1989
  sql += ' and t.number = ?';
@@ -1726,6 +2025,9 @@ export class GHCrawlService {
1726
2025
  bySource.set(row.source_kind, list);
1727
2026
  }
1728
2027
  const aggregated = new Map();
2028
+ const totalItems = Array.from(bySource.values()).reduce((sum, items) => sum + items.length, 0);
2029
+ let processedItems = 0;
2030
+ let lastProgressAt = Date.now();
1729
2031
  for (const [sourceKind, items] of bySource.entries()) {
1730
2032
  for (const item of items) {
1731
2033
  const neighbors = rankNearestNeighbors(items, {
@@ -1749,10 +2051,43 @@ export class GHCrawlService {
1749
2051
  sourceKinds: new Set([sourceKind]),
1750
2052
  });
1751
2053
  }
2054
+ processedItems += 1;
2055
+ const now = Date.now();
2056
+ if (params.onProgress && now - lastProgressAt >= CLUSTER_PROGRESS_INTERVAL_MS) {
2057
+ params.onProgress(`[cluster] identifying similarity edges ${processedItems}/${totalItems} source embeddings processed current_edges=${aggregated.size}`);
2058
+ lastProgressAt = now;
2059
+ }
1752
2060
  }
1753
2061
  }
1754
2062
  return aggregated;
1755
2063
  }
2064
+ persistClusterRun(repoId, runId, aggregatedEdges, clusters) {
2065
+ const insertEdge = this.db.prepare(`insert into similarity_edges (repo_id, cluster_run_id, left_thread_id, right_thread_id, method, score, explanation_json, created_at)
2066
+ values (?, ?, ?, ?, ?, ?, ?, ?)`);
2067
+ const insertCluster = this.db.prepare('insert into clusters (repo_id, cluster_run_id, representative_thread_id, member_count, created_at) values (?, ?, ?, ?, ?)');
2068
+ const insertMember = this.db.prepare('insert into cluster_members (cluster_id, thread_id, score_to_representative, created_at) values (?, ?, ?, ?)');
2069
+ this.db.transaction(() => {
2070
+ this.db.prepare('delete from cluster_members where cluster_id in (select id from clusters where cluster_run_id = ?)').run(runId);
2071
+ this.db.prepare('delete from clusters where cluster_run_id = ?').run(runId);
2072
+ this.db.prepare('delete from similarity_edges where cluster_run_id = ?').run(runId);
2073
+ const createdAt = nowIso();
2074
+ for (const edge of aggregatedEdges.values()) {
2075
+ insertEdge.run(repoId, runId, edge.leftThreadId, edge.rightThreadId, 'exact_cosine', edge.score, asJson({ sources: Array.from(edge.sourceKinds).sort(), model: this.config.embedModel }), createdAt);
2076
+ }
2077
+ for (const cluster of clusters) {
2078
+ const clusterResult = insertCluster.run(repoId, runId, cluster.representativeThreadId, cluster.members.length, createdAt);
2079
+ const clusterId = Number(clusterResult.lastInsertRowid);
2080
+ for (const memberId of cluster.members) {
2081
+ const key = this.edgeKey(cluster.representativeThreadId, memberId);
2082
+ const score = memberId === cluster.representativeThreadId ? null : (aggregatedEdges.get(key)?.score ?? null);
2083
+ insertMember.run(clusterId, memberId, score, createdAt);
2084
+ }
2085
+ }
2086
+ })();
2087
+ }
2088
+ pruneOldClusterRuns(repoId, keepRunId) {
2089
+ this.db.prepare('delete from cluster_runs where repo_id = ? and id <> ?').run(repoId, keepRunId);
2090
+ }
1756
2091
  upsertSummary(threadId, contentHash, summaryKind, summaryText) {
1757
2092
  this.db
1758
2093
  .prepare(`insert into document_summaries (thread_id, summary_kind, model, content_hash, summary_text, created_at, updated_at)