@ghcrawl/api-core 0.3.0 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/service.js CHANGED
@@ -1,7 +1,7 @@
1
1
  import http from 'node:http';
2
2
  import crypto from 'node:crypto';
3
3
  import { IterableMapper } from '@shutterstock/p-map-iterable';
4
- import { actionResponseSchema, clusterDetailResponseSchema, clusterResultSchema, clusterSummariesResponseSchema, clustersResponseSchema, embedResultSchema, healthResponseSchema, neighborsResponseSchema, refreshResponseSchema, repositoriesResponseSchema, searchResponseSchema, syncResultSchema, threadsResponseSchema, } from '@ghcrawl/api-contract';
4
+ import { actionResponseSchema, authorThreadsResponseSchema, closeResponseSchema, clusterDetailResponseSchema, clusterResultSchema, clusterSummariesResponseSchema, clustersResponseSchema, embedResultSchema, healthResponseSchema, neighborsResponseSchema, refreshResponseSchema, repositoriesResponseSchema, searchResponseSchema, syncResultSchema, threadsResponseSchema, } from '@ghcrawl/api-contract';
5
5
  import { buildClusters } from './cluster/build.js';
6
6
  import { ensureRuntimeDirs, isLikelyGitHubToken, isLikelyOpenAiApiKey, loadConfig, requireGithubToken, requireOpenAiKey, } from './config.js';
7
7
  import { migrate } from './db/migrate.js';
@@ -27,6 +27,9 @@ function parseIso(value) {
27
27
  const parsed = Date.parse(value);
28
28
  return Number.isNaN(parsed) ? null : parsed;
29
29
  }
30
/**
 * Reports whether a thread row should be treated as closed.
 *
 * A row counts as closed when its `state` is anything other than 'open', or
 * when it carries a local close timestamp in `closed_at_local`.
 *
 * @param {{ state: string, closed_at_local?: string | null }} row - Thread row, or a projection of one.
 * @returns {boolean} True when the row is closed by state or by a local close marker.
 */
function isEffectivelyClosed(row) {
    // `!= null` intentionally matches both null and undefined, so a row that was
    // selected without the closed_at_local column is not misreported as closed.
    return row.state !== 'open' || row.closed_at_local != null;
}
30
33
  function isMissingGitHubResourceError(error) {
31
34
  const status = typeof error?.status === 'number' ? Number(error.status) : null;
32
35
  if (status === 404 || status === 410) {
@@ -150,6 +153,10 @@ function threadToDto(row, clusterId) {
150
153
  number: row.number,
151
154
  kind: row.kind,
152
155
  state: row.state,
156
+ isClosed: isEffectivelyClosed(row),
157
+ closedAtGh: row.closed_at_gh ?? null,
158
+ closedAtLocal: row.closed_at_local ?? null,
159
+ closeReasonLocal: row.close_reason_local ?? null,
153
160
  title: row.title,
154
161
  body: row.body,
155
162
  authorLogin: row.author_login,
@@ -260,8 +267,11 @@ export class GHCrawlService {
260
267
  .all(repository.id, repository.id);
261
268
  for (const row of clusterRows)
262
269
  clusterIds.set(row.thread_id, row.cluster_id);
263
- let sql = "select * from threads where repo_id = ? and state = 'open'";
270
+ let sql = 'select * from threads where repo_id = ?';
264
271
  const args = [repository.id];
272
+ if (!params.includeClosed) {
273
+ sql += " and state = 'open' and closed_at_local is null";
274
+ }
265
275
  if (params.kind) {
266
276
  sql += ' and kind = ?';
267
277
  args.push(params.kind);
@@ -291,6 +301,145 @@ export class GHCrawlService {
291
301
  threads: orderedRows.map((row) => threadToDto(row, clusterIds.get(row.id) ?? null)),
292
302
  });
293
303
  }
304
+ listAuthorThreads(params) {
305
+ const repository = this.requireRepository(params.owner, params.repo);
306
+ const normalizedLogin = params.login.trim();
307
+ if (!normalizedLogin) {
308
+ return authorThreadsResponseSchema.parse({
309
+ repository,
310
+ authorLogin: '',
311
+ threads: [],
312
+ });
313
+ }
314
+ const clusterIds = new Map();
315
+ const clusterRows = this.db
316
+ .prepare(`select cm.thread_id, cm.cluster_id
317
+ from cluster_members cm
318
+ join clusters c on c.id = cm.cluster_id
319
+ where c.repo_id = ? and c.cluster_run_id = (
320
+ select id from cluster_runs where repo_id = ? and status = 'completed' order by id desc limit 1
321
+ )`)
322
+ .all(repository.id, repository.id);
323
+ for (const row of clusterRows)
324
+ clusterIds.set(row.thread_id, row.cluster_id);
325
+ const rows = this.db
326
+ .prepare(`select *
327
+ from threads
328
+ where repo_id = ? and lower(author_login) = lower(?)
329
+ ${params.includeClosed ? '' : "and state = 'open' and closed_at_local is null"}
330
+ order by updated_at_gh desc, number desc`)
331
+ .all(repository.id, normalizedLogin);
332
+ const latestRun = this.getLatestClusterRun(repository.id);
333
+ const strongestByThread = new Map();
334
+ if (latestRun && rows.length > 1) {
335
+ const edges = this.db
336
+ .prepare(`select
337
+ se.left_thread_id,
338
+ se.right_thread_id,
339
+ se.score,
340
+ t1.number as left_number,
341
+ t1.kind as left_kind,
342
+ t1.title as left_title,
343
+ t2.number as right_number,
344
+ t2.kind as right_kind,
345
+ t2.title as right_title
346
+ from similarity_edges se
347
+ join threads t1 on t1.id = se.left_thread_id
348
+ join threads t2 on t2.id = se.right_thread_id
349
+ where se.repo_id = ?
350
+ and se.cluster_run_id = ?
351
+ and lower(t1.author_login) = lower(?)
352
+ and lower(t2.author_login) = lower(?)
353
+ ${params.includeClosed ? '' : "and t1.state = 'open' and t1.closed_at_local is null and t2.state = 'open' and t2.closed_at_local is null"}`)
354
+ .all(repository.id, latestRun.id, normalizedLogin, normalizedLogin);
355
+ const updateStrongest = (sourceThreadId, match) => {
356
+ const previous = strongestByThread.get(sourceThreadId);
357
+ if (!previous || match.score > previous.score) {
358
+ strongestByThread.set(sourceThreadId, match);
359
+ }
360
+ };
361
+ for (const edge of edges) {
362
+ updateStrongest(edge.left_thread_id, {
363
+ threadId: edge.right_thread_id,
364
+ number: edge.right_number,
365
+ kind: edge.right_kind,
366
+ title: edge.right_title,
367
+ score: edge.score,
368
+ });
369
+ updateStrongest(edge.right_thread_id, {
370
+ threadId: edge.left_thread_id,
371
+ number: edge.left_number,
372
+ kind: edge.left_kind,
373
+ title: edge.left_title,
374
+ score: edge.score,
375
+ });
376
+ }
377
+ }
378
+ return authorThreadsResponseSchema.parse({
379
+ repository,
380
+ authorLogin: normalizedLogin,
381
+ threads: rows.map((row) => ({
382
+ thread: threadToDto(row, clusterIds.get(row.id) ?? null),
383
+ strongestSameAuthorMatch: strongestByThread.get(row.id) ?? null,
384
+ })),
385
+ });
386
+ }
387
+ closeThreadLocally(params) {
388
+ const repository = this.requireRepository(params.owner, params.repo);
389
+ const row = this.db
390
+ .prepare('select * from threads where repo_id = ? and number = ? limit 1')
391
+ .get(repository.id, params.threadNumber);
392
+ if (!row) {
393
+ throw new Error(`Thread #${params.threadNumber} was not found for ${repository.fullName}.`);
394
+ }
395
+ const closedAt = nowIso();
396
+ this.db
397
+ .prepare(`update threads
398
+ set closed_at_local = ?,
399
+ close_reason_local = 'manual',
400
+ updated_at = ?
401
+ where id = ?`)
402
+ .run(closedAt, closedAt, row.id);
403
+ this.parsedEmbeddingCache.delete(repository.id);
404
+ const clusterIds = this.getLatestRunClusterIdsForThread(repository.id, row.id);
405
+ const clusterClosed = this.reconcileClusterCloseState(repository.id, clusterIds) > 0;
406
+ const updated = this.db.prepare('select * from threads where id = ? limit 1').get(row.id);
407
+ return closeResponseSchema.parse({
408
+ ok: true,
409
+ repository,
410
+ thread: threadToDto(updated),
411
+ clusterId: clusterIds[0] ?? null,
412
+ clusterClosed,
413
+ message: `Marked ${updated.kind} #${updated.number} closed locally.`,
414
+ });
415
+ }
416
+ closeClusterLocally(params) {
417
+ const repository = this.requireRepository(params.owner, params.repo);
418
+ const latestRun = this.getLatestClusterRun(repository.id);
419
+ if (!latestRun) {
420
+ throw new Error(`No completed cluster run found for ${repository.fullName}.`);
421
+ }
422
+ const row = this.db
423
+ .prepare('select id from clusters where repo_id = ? and cluster_run_id = ? and id = ? limit 1')
424
+ .get(repository.id, latestRun.id, params.clusterId);
425
+ if (!row) {
426
+ throw new Error(`Cluster ${params.clusterId} was not found for ${repository.fullName}.`);
427
+ }
428
+ const closedAt = nowIso();
429
+ this.db
430
+ .prepare(`update clusters
431
+ set closed_at_local = ?,
432
+ close_reason_local = 'manual'
433
+ where id = ?`)
434
+ .run(closedAt, row.id);
435
+ return closeResponseSchema.parse({
436
+ ok: true,
437
+ repository,
438
+ clusterId: row.id,
439
+ clusterClosed: true,
440
+ message: `Marked cluster ${row.id} closed locally.`,
441
+ });
442
+ }
294
443
  async syncRepository(params) {
295
444
  const crawlStartedAt = params.startedAt ?? nowIso();
296
445
  const includeComments = params.includeComments ?? false;
@@ -385,6 +534,10 @@ export class GHCrawlService {
385
534
  })
386
535
  : 0;
387
536
  const threadsClosed = threadsClosedFromClosedSweep + threadsClosedFromDirectReconcile;
537
+ this.parsedEmbeddingCache.delete(repoId);
538
+ if (threadsClosed > 0) {
539
+ this.reconcileClusterCloseState(repoId);
540
+ }
388
541
  const finishedAt = nowIso();
389
542
  const reconciledOpenCloseAt = shouldSweepClosedOverlap || shouldReconcileMissingOpenThreads ? finishedAt : null;
390
543
  const nextSyncCursor = {
@@ -596,7 +749,7 @@ export class GHCrawlService {
596
749
  from documents_fts
597
750
  join documents d on d.id = documents_fts.rowid
598
751
  join threads t on t.id = d.thread_id
599
- where t.repo_id = ? and t.state = 'open' and documents_fts match ?
752
+ where t.repo_id = ? and t.state = 'open' and t.closed_at_local is null and documents_fts match ?
600
753
  order by rank
601
754
  limit ?`)
602
755
  .all(repository.id, params.query, limit * 2);
@@ -618,7 +771,7 @@ export class GHCrawlService {
618
771
  const threadRows = candidateIds.size
619
772
  ? this.db
620
773
  .prepare(`select * from threads
621
- where repo_id = ? and state = 'open' and id in (${[...candidateIds].map(() => '?').join(',')})
774
+ where repo_id = ? and state = 'open' and closed_at_local is null and id in (${[...candidateIds].map(() => '?').join(',')})
622
775
  order by updated_at_gh desc, number desc`)
623
776
  .all(repository.id, ...candidateIds)
624
777
  : [];
@@ -730,7 +883,8 @@ export class GHCrawlService {
730
883
  }
731
884
  const rows = this.db
732
885
  .prepare(`select c.id, c.repo_id, c.representative_thread_id, c.member_count,
733
- cm.thread_id, cm.score_to_representative, t.number, t.kind, t.title
886
+ c.closed_at_local, c.close_reason_local,
887
+ cm.thread_id, cm.score_to_representative, t.number, t.kind, t.title, t.state, t.closed_at_local as thread_closed_at_local
734
888
  from clusters c
735
889
  left join cluster_members cm on cm.cluster_id = c.id
736
890
  left join threads t on t.id = cm.thread_id
@@ -742,6 +896,9 @@ export class GHCrawlService {
742
896
  const cluster = clusters.get(row.id) ?? {
743
897
  id: row.id,
744
898
  repoId: row.repo_id,
899
+ isClosed: row.close_reason_local !== null,
900
+ closedAtLocal: row.closed_at_local,
901
+ closeReasonLocal: row.close_reason_local,
745
902
  representativeThreadId: row.representative_thread_id,
746
903
  memberCount: row.member_count,
747
904
  members: [],
@@ -751,15 +908,20 @@ export class GHCrawlService {
751
908
  threadId: row.thread_id,
752
909
  number: row.number,
753
910
  kind: row.kind,
911
+ isClosed: row.state !== null && isEffectivelyClosed({ state: row.state, closed_at_local: row.thread_closed_at_local }),
754
912
  title: row.title,
755
913
  scoreToRepresentative: row.score_to_representative,
756
914
  });
757
915
  }
758
916
  clusters.set(row.id, cluster);
759
917
  }
918
+ const clusterValues = Array.from(clusters.values()).map((cluster) => ({
919
+ ...cluster,
920
+ isClosed: cluster.isClosed || (cluster.memberCount > 0 && cluster.members.every((member) => member.isClosed)),
921
+ }));
760
922
  return clustersResponseSchema.parse({
761
923
  repository,
762
- clusters: Array.from(clusters.values()),
924
+ clusters: clusterValues.filter((cluster) => (params.includeClosed ? true : !cluster.isClosed)),
763
925
  });
764
926
  }
765
927
  async refreshRepository(params) {
@@ -814,6 +976,7 @@ export class GHCrawlService {
814
976
  minSize: params.minSize,
815
977
  sort: params.sort,
816
978
  search: params.search,
979
+ includeClosedClusters: params.includeClosed === true,
817
980
  });
818
981
  const clusters = params.limit ? snapshot.clusters.slice(0, params.limit) : snapshot.clusters;
819
982
  return clusterSummariesResponseSchema.parse({
@@ -822,6 +985,9 @@ export class GHCrawlService {
822
985
  clusters: clusters.map((cluster) => ({
823
986
  clusterId: cluster.clusterId,
824
987
  displayTitle: cluster.displayTitle,
988
+ isClosed: cluster.isClosed,
989
+ closedAtLocal: cluster.closedAtLocal,
990
+ closeReasonLocal: cluster.closeReasonLocal,
825
991
  totalCount: cluster.totalCount,
826
992
  issueCount: cluster.issueCount,
827
993
  pullRequestCount: cluster.pullRequestCount,
@@ -837,6 +1003,7 @@ export class GHCrawlService {
837
1003
  owner: params.owner,
838
1004
  repo: params.repo,
839
1005
  minSize: 0,
1006
+ includeClosedClusters: params.includeClosed === true,
840
1007
  });
841
1008
  const cluster = snapshot.clusters.find((item) => item.clusterId === params.clusterId);
842
1009
  if (!cluster) {
@@ -869,6 +1036,9 @@ export class GHCrawlService {
869
1036
  cluster: {
870
1037
  clusterId: cluster.clusterId,
871
1038
  displayTitle: cluster.displayTitle,
1039
+ isClosed: cluster.isClosed,
1040
+ closedAtLocal: cluster.closedAtLocal,
1041
+ closeReasonLocal: cluster.closeReasonLocal,
872
1042
  totalCount: cluster.totalCount,
873
1043
  issueCount: cluster.issueCount,
874
1044
  pullRequestCount: cluster.pullRequestCount,
@@ -887,7 +1057,9 @@ export class GHCrawlService {
887
1057
  if (!latestRun) {
888
1058
  return { repository, stats, clusters: [] };
889
1059
  }
1060
+ const includeClosedClusters = params.includeClosedClusters ?? true;
890
1061
  const clusters = this.listRawTuiClusters(repository.id, latestRun.id)
1062
+ .filter((cluster) => (includeClosedClusters ? true : !cluster.isClosed))
891
1063
  .filter((cluster) => cluster.totalCount >= (params.minSize ?? 10))
892
1064
  .filter((cluster) => {
893
1065
  const search = params.search?.trim().toLowerCase();
@@ -913,7 +1085,7 @@ export class GHCrawlService {
913
1085
  throw new Error(`Cluster ${params.clusterId} was not found for ${repository.fullName}.`);
914
1086
  }
915
1087
  const rows = this.db
916
- .prepare(`select t.id, t.number, t.kind, t.title, t.updated_at_gh, t.html_url, t.labels_json, cm.score_to_representative
1088
+ .prepare(`select t.id, t.number, t.kind, t.state, t.closed_at_local, t.title, t.updated_at_gh, t.html_url, t.labels_json, cm.score_to_representative
917
1089
  from cluster_members cm
918
1090
  join threads t on t.id = cm.thread_id
919
1091
  where cm.cluster_id = ?
@@ -925,6 +1097,9 @@ export class GHCrawlService {
925
1097
  return {
926
1098
  clusterId: summary.clusterId,
927
1099
  displayTitle: summary.displayTitle,
1100
+ isClosed: summary.isClosed,
1101
+ closedAtLocal: summary.closedAtLocal,
1102
+ closeReasonLocal: summary.closeReasonLocal,
928
1103
  totalCount: summary.totalCount,
929
1104
  issueCount: summary.issueCount,
930
1105
  pullRequestCount: summary.pullRequestCount,
@@ -936,6 +1111,7 @@ export class GHCrawlService {
936
1111
  id: row.id,
937
1112
  number: row.number,
938
1113
  kind: row.kind,
1114
+ isClosed: isEffectivelyClosed(row),
939
1115
  title: row.title,
940
1116
  updatedAtGh: row.updated_at_gh,
941
1117
  htmlUrl: row.html_url,
@@ -948,11 +1124,11 @@ export class GHCrawlService {
948
1124
  const repository = this.requireRepository(params.owner, params.repo);
949
1125
  const row = params.threadId
950
1126
  ? (this.db
951
- .prepare('select * from threads where repo_id = ? and id = ? and state = \'open\' limit 1')
1127
+ .prepare('select * from threads where repo_id = ? and id = ? limit 1')
952
1128
  .get(repository.id, params.threadId) ?? null)
953
1129
  : params.threadNumber
954
1130
  ? (this.db
955
- .prepare('select * from threads where repo_id = ? and number = ? and state = \'open\' limit 1')
1131
+ .prepare('select * from threads where repo_id = ? and number = ? limit 1')
956
1132
  .get(repository.id, params.threadNumber) ?? null)
957
1133
  : null;
958
1134
  if (!row) {
@@ -1112,7 +1288,7 @@ export class GHCrawlService {
1112
1288
  const counts = this.db
1113
1289
  .prepare(`select kind, count(*) as count
1114
1290
  from threads
1115
- where repo_id = ? and state = 'open'
1291
+ where repo_id = ? and state = 'open' and closed_at_local is null
1116
1292
  group by kind`)
1117
1293
  .all(repoId);
1118
1294
  const latestRun = this.getLatestClusterRun(repoId);
@@ -1140,11 +1316,73 @@ export class GHCrawlService {
1140
1316
  .prepare("select id, finished_at from cluster_runs where repo_id = ? and status = 'completed' order by id desc limit 1")
1141
1317
  .get(repoId) ?? null);
1142
1318
  }
1319
+ getLatestRunClusterIdsForThread(repoId, threadId) {
1320
+ const latestRun = this.getLatestClusterRun(repoId);
1321
+ if (!latestRun) {
1322
+ return [];
1323
+ }
1324
+ return this.db
1325
+ .prepare(`select cm.cluster_id
1326
+ from cluster_members cm
1327
+ join clusters c on c.id = cm.cluster_id
1328
+ where c.repo_id = ? and c.cluster_run_id = ? and cm.thread_id = ?
1329
+ order by cm.cluster_id asc`)
1330
+ .all(repoId, latestRun.id, threadId).map((row) => row.cluster_id);
1331
+ }
1332
+ reconcileClusterCloseState(repoId, clusterIds) {
1333
+ const latestRun = this.getLatestClusterRun(repoId);
1334
+ if (!latestRun) {
1335
+ return 0;
1336
+ }
1337
+ const resolvedClusterIds = clusterIds && clusterIds.length > 0
1338
+ ? Array.from(new Set(clusterIds))
1339
+ : this.db
1340
+ .prepare('select id from clusters where repo_id = ? and cluster_run_id = ? order by id asc')
1341
+ .all(repoId, latestRun.id).map((row) => row.id);
1342
+ if (resolvedClusterIds.length === 0) {
1343
+ return 0;
1344
+ }
1345
+ const summarize = this.db.prepare(`select
1346
+ c.id,
1347
+ c.close_reason_local,
1348
+ count(*) as member_count,
1349
+ sum(case when t.state != 'open' or t.closed_at_local is not null then 1 else 0 end) as closed_member_count
1350
+ from clusters c
1351
+ join cluster_members cm on cm.cluster_id = c.id
1352
+ join threads t on t.id = cm.thread_id
1353
+ where c.id = ?
1354
+ group by c.id, c.close_reason_local`);
1355
+ const markClosed = this.db.prepare(`update clusters
1356
+ set closed_at_local = coalesce(closed_at_local, ?),
1357
+ close_reason_local = 'all_members_closed'
1358
+ where id = ?`);
1359
+ const clearClosed = this.db.prepare(`update clusters
1360
+ set closed_at_local = null,
1361
+ close_reason_local = null
1362
+ where id = ? and close_reason_local = 'all_members_closed'`);
1363
+ let changed = 0;
1364
+ for (const clusterId of resolvedClusterIds) {
1365
+ const row = summarize.get(clusterId);
1366
+ if (!row || row.close_reason_local === 'manual') {
1367
+ continue;
1368
+ }
1369
+ if (row.member_count > 0 && row.closed_member_count >= row.member_count) {
1370
+ const result = markClosed.run(nowIso(), clusterId);
1371
+ changed += result.changes;
1372
+ continue;
1373
+ }
1374
+ const cleared = clearClosed.run(clusterId);
1375
+ changed += cleared.changes;
1376
+ }
1377
+ return changed;
1378
+ }
1143
1379
  listRawTuiClusters(repoId, clusterRunId) {
1144
1380
  const rows = this.db
1145
1381
  .prepare(`select
1146
1382
  c.id as cluster_id,
1147
1383
  c.member_count,
1384
+ c.closed_at_local,
1385
+ c.close_reason_local,
1148
1386
  c.representative_thread_id,
1149
1387
  rt.number as representative_number,
1150
1388
  rt.kind as representative_kind,
@@ -1152,6 +1390,7 @@ export class GHCrawlService {
1152
1390
  max(coalesce(t.updated_at_gh, t.updated_at)) as latest_updated_at,
1153
1391
  sum(case when t.kind = 'issue' then 1 else 0 end) as issue_count,
1154
1392
  sum(case when t.kind = 'pull_request' then 1 else 0 end) as pull_request_count,
1393
+ sum(case when t.state != 'open' or t.closed_at_local is not null then 1 else 0 end) as closed_member_count,
1155
1394
  group_concat(lower(coalesce(t.title, '')), ' ') as search_text
1156
1395
  from clusters c
1157
1396
  left join threads rt on rt.id = c.representative_thread_id
@@ -1169,6 +1408,9 @@ export class GHCrawlService {
1169
1408
  return rows.map((row) => ({
1170
1409
  clusterId: row.cluster_id,
1171
1410
  displayTitle: row.representative_title ?? `Cluster ${row.cluster_id}`,
1411
+ isClosed: row.close_reason_local !== null || row.closed_member_count >= row.member_count,
1412
+ closedAtLocal: row.closed_at_local,
1413
+ closeReasonLocal: row.close_reason_local,
1172
1414
  totalCount: row.member_count,
1173
1415
  issueCount: row.issue_count,
1174
1416
  pullRequestCount: row.pull_request_count,
@@ -1306,6 +1548,7 @@ export class GHCrawlService {
1306
1548
  from threads
1307
1549
  where repo_id = ?
1308
1550
  and state = 'open'
1551
+ and closed_at_local is null
1309
1552
  and (last_pulled_at is null or last_pulled_at < ?)
1310
1553
  order by number asc`)
1311
1554
  .all(params.repoId, params.crawlStartedAt);
@@ -1350,6 +1593,7 @@ export class GHCrawlService {
1350
1593
  from threads
1351
1594
  where repo_id = ?
1352
1595
  and state = 'open'
1596
+ and closed_at_local is null
1353
1597
  and (last_pulled_at is null or last_pulled_at < ?)
1354
1598
  order by number asc`)
1355
1599
  .all(params.repoId, params.crawlStartedAt);
@@ -1637,11 +1881,12 @@ export class GHCrawlService {
1637
1881
  }
1638
1882
  loadStoredEmbeddings(repoId) {
1639
1883
  return this.db
1640
- .prepare(`select t.id, t.repo_id, t.number, t.kind, t.state, t.title, t.body, t.author_login, t.html_url, t.labels_json,
1884
+ .prepare(`select t.id, t.repo_id, t.number, t.kind, t.state, t.closed_at_gh, t.closed_at_local, t.close_reason_local,
1885
+ t.title, t.body, t.author_login, t.html_url, t.labels_json,
1641
1886
  t.updated_at_gh, t.first_pulled_at, t.last_pulled_at, e.source_kind, e.embedding_json
1642
1887
  from threads t
1643
1888
  join document_embeddings e on e.thread_id = t.id
1644
- where t.repo_id = ? and t.state = 'open' and e.model = ?
1889
+ where t.repo_id = ? and t.state = 'open' and t.closed_at_local is null and e.model = ?
1645
1890
  order by t.number asc, e.source_kind asc`)
1646
1891
  .all(repoId, this.config.embedModel);
1647
1892
  }
@@ -1688,7 +1933,9 @@ export class GHCrawlService {
1688
1933
  and se.cluster_run_id = ?
1689
1934
  and (se.left_thread_id = ? or se.right_thread_id = ?)
1690
1935
  and t1.state = 'open'
1936
+ and t1.closed_at_local is null
1691
1937
  and t2.state = 'open'
1938
+ and t2.closed_at_local is null
1692
1939
  order by se.score desc
1693
1940
  limit ?`)
1694
1941
  .all(threadId, threadId, threadId, threadId, repoId, latestRun.id, threadId, threadId, limit);
@@ -1703,7 +1950,7 @@ export class GHCrawlService {
1703
1950
  getEmbeddingWorkset(repoId, threadNumber) {
1704
1951
  let sql = `select t.id, t.number, t.title, t.body
1705
1952
  from threads t
1706
- where t.repo_id = ? and t.state = 'open'`;
1953
+ where t.repo_id = ? and t.state = 'open' and t.closed_at_local is null`;
1707
1954
  const args = [repoId];
1708
1955
  if (threadNumber) {
1709
1956
  sql += ' and t.number = ?';
@@ -1736,7 +1983,7 @@ export class GHCrawlService {
1736
1983
  let sql = `select s.thread_id, s.summary_kind, s.summary_text
1737
1984
  from document_summaries s
1738
1985
  join threads t on t.id = s.thread_id
1739
- where t.repo_id = ? and t.state = 'open' and s.model = ?`;
1986
+ where t.repo_id = ? and t.state = 'open' and t.closed_at_local is null and s.model = ?`;
1740
1987
  const args = [repoId, this.config.summaryModel];
1741
1988
  if (threadNumber) {
1742
1989
  sql += ' and t.number = ?';