@ghcrawl/api-core 0.1.0 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (63) hide show
  1. package/README.md +25 -0
  2. package/dist/api/server.d.ts +4 -0
  3. package/dist/api/server.d.ts.map +1 -0
  4. package/dist/api/server.js +142 -0
  5. package/dist/api/server.js.map +1 -0
  6. package/dist/cluster/build.d.ts +16 -0
  7. package/dist/cluster/build.d.ts.map +1 -0
  8. package/dist/cluster/build.js +62 -0
  9. package/dist/cluster/build.js.map +1 -0
  10. package/dist/config.d.ts +83 -0
  11. package/dist/config.d.ts.map +1 -0
  12. package/dist/config.js +257 -0
  13. package/dist/config.js.map +1 -0
  14. package/dist/db/migrate.d.ts +3 -0
  15. package/dist/db/migrate.d.ts.map +1 -0
  16. package/{src/db/migrate.ts → dist/db/migrate.js} +30 -36
  17. package/dist/db/migrate.js.map +1 -0
  18. package/dist/db/sqlite.d.ts +4 -0
  19. package/dist/db/sqlite.d.ts.map +1 -0
  20. package/dist/db/sqlite.js +11 -0
  21. package/dist/db/sqlite.js.map +1 -0
  22. package/dist/documents/normalize.d.ts +23 -0
  23. package/dist/documents/normalize.d.ts.map +1 -0
  24. package/dist/documents/normalize.js +36 -0
  25. package/dist/documents/normalize.js.map +1 -0
  26. package/dist/github/client.d.ts +24 -0
  27. package/dist/github/client.d.ts.map +1 -0
  28. package/dist/github/client.js +170 -0
  29. package/dist/github/client.js.map +1 -0
  30. package/dist/index.d.ts +7 -0
  31. package/dist/index.d.ts.map +1 -0
  32. package/{src/index.ts → dist/index.js} +1 -0
  33. package/dist/index.js.map +1 -0
  34. package/dist/openai/provider.d.ts +44 -0
  35. package/dist/openai/provider.d.ts.map +1 -0
  36. package/dist/openai/provider.js +107 -0
  37. package/dist/openai/provider.js.map +1 -0
  38. package/dist/search/exact.d.ts +14 -0
  39. package/dist/search/exact.d.ts.map +1 -0
  40. package/dist/search/exact.js +26 -0
  41. package/dist/search/exact.js.map +1 -0
  42. package/dist/service.d.ts +249 -0
  43. package/dist/service.d.ts.map +1 -0
  44. package/dist/service.js +1801 -0
  45. package/dist/service.js.map +1 -0
  46. package/package.json +8 -6
  47. package/src/api/server.test.ts +0 -296
  48. package/src/api/server.ts +0 -171
  49. package/src/cluster/build.test.ts +0 -18
  50. package/src/cluster/build.ts +0 -74
  51. package/src/config.test.ts +0 -247
  52. package/src/config.ts +0 -421
  53. package/src/db/migrate.test.ts +0 -30
  54. package/src/db/sqlite.ts +0 -14
  55. package/src/documents/normalize.test.ts +0 -25
  56. package/src/documents/normalize.ts +0 -52
  57. package/src/github/client.ts +0 -241
  58. package/src/openai/provider.ts +0 -141
  59. package/src/search/exact.test.ts +0 -22
  60. package/src/search/exact.ts +0 -28
  61. package/src/service.test.ts +0 -2036
  62. package/src/service.ts +0 -2497
  63. package/src/types/better-sqlite3.d.ts +0 -1
package/src/service.ts DELETED
@@ -1,2497 +0,0 @@
1
- import http from 'node:http';
2
- import crypto from 'node:crypto';
3
-
4
- import { IterableMapper } from '@shutterstock/p-map-iterable';
5
- import {
6
- actionResponseSchema,
7
- clusterDetailResponseSchema,
8
- clusterResultSchema,
9
- clusterSummariesResponseSchema,
10
- clustersResponseSchema,
11
- embedResultSchema,
12
- healthResponseSchema,
13
- neighborsResponseSchema,
14
- refreshResponseSchema,
15
- repositoriesResponseSchema,
16
- searchResponseSchema,
17
- syncResultSchema,
18
- threadsResponseSchema,
19
- type ActionRequest,
20
- type ActionResponse,
21
- type ClusterDetailResponse,
22
- type ClusterDto,
23
- type ClusterResultDto,
24
- type ClusterSummariesResponse,
25
- type ClustersResponse,
26
- type EmbedResultDto,
27
- type HealthResponse,
28
- type NeighborsResponse,
29
- type RefreshResponse,
30
- type RepositoriesResponse,
31
- type RepositoryDto,
32
- type SearchHitDto,
33
- type SearchMode,
34
- type SearchResponse,
35
- type SyncResultDto,
36
- type ThreadDto,
37
- type ThreadsResponse,
38
- } from '@ghcrawl/api-contract';
39
-
40
- import { buildClusters } from './cluster/build.js';
41
- import {
42
- ensureRuntimeDirs,
43
- isLikelyGitHubToken,
44
- isLikelyOpenAiApiKey,
45
- loadConfig,
46
- requireGithubToken,
47
- requireOpenAiKey,
48
- type ConfigValueSource,
49
- type GitcrawlConfig,
50
- } from './config.js';
51
- import { migrate } from './db/migrate.js';
52
- import { openDb, type SqliteDatabase } from './db/sqlite.js';
53
- import { buildCanonicalDocument, isBotLikeAuthor } from './documents/normalize.js';
54
- import { makeGitHubClient, type GitHubClient } from './github/client.js';
55
- import { OpenAiProvider, type AiProvider } from './openai/provider.js';
56
- import { cosineSimilarity, rankNearestNeighbors } from './search/exact.js';
57
-
58
- type RunTable = 'sync_runs' | 'summary_runs' | 'embedding_runs' | 'cluster_runs';
59
-
60
- type ThreadRow = {
61
- id: number;
62
- repo_id: number;
63
- number: number;
64
- kind: 'issue' | 'pull_request';
65
- state: string;
66
- title: string;
67
- body: string | null;
68
- author_login: string | null;
69
- html_url: string;
70
- labels_json: string;
71
- updated_at_gh: string | null;
72
- first_pulled_at: string | null;
73
- last_pulled_at: string | null;
74
- };
75
-
76
- type CommentSeed = {
77
- githubId: string;
78
- commentType: string;
79
- authorLogin: string | null;
80
- authorType: string | null;
81
- body: string;
82
- isBot: boolean;
83
- rawJson: string;
84
- createdAtGh: string | null;
85
- updatedAtGh: string | null;
86
- };
87
-
88
- type EmbeddingSourceKind = 'title' | 'body' | 'dedupe_summary';
89
-
90
- type EmbeddingTask = {
91
- threadId: number;
92
- threadNumber: number;
93
- sourceKind: EmbeddingSourceKind;
94
- text: string;
95
- contentHash: string;
96
- estimatedTokens: number;
97
- wasTruncated: boolean;
98
- };
99
-
100
- type StoredEmbeddingRow = ThreadRow & {
101
- source_kind: EmbeddingSourceKind;
102
- embedding_json: string;
103
- };
104
-
105
- type ParsedStoredEmbeddingRow = Omit<StoredEmbeddingRow, 'embedding_json'> & {
106
- embedding: number[];
107
- };
108
-
109
- type EmbeddingWorkset = {
110
- rows: Array<{
111
- id: number;
112
- number: number;
113
- title: string;
114
- body: string | null;
115
- }>;
116
- tasks: EmbeddingTask[];
117
- existing: Map<string, string>;
118
- pending: EmbeddingTask[];
119
- };
120
-
121
- type SyncCursorState = {
122
- lastFullOpenScanStartedAt: string | null;
123
- lastOverlappingOpenScanCompletedAt: string | null;
124
- lastNonOverlappingScanCompletedAt: string | null;
125
- lastReconciledOpenCloseAt: string | null;
126
- };
127
-
128
- type SyncRunStats = {
129
- threadsSynced: number;
130
- commentsSynced: number;
131
- threadsClosed: number;
132
- crawlStartedAt: string;
133
- requestedSince: string | null;
134
- effectiveSince: string | null;
135
- limit: number | null;
136
- includeComments: boolean;
137
- isFullOpenScan: boolean;
138
- isOverlappingOpenScan: boolean;
139
- overlapReferenceAt: string | null;
140
- reconciledOpenCloseAt: string | null;
141
- };
142
-
143
- export type TuiClusterSortMode = 'recent' | 'size';
144
-
145
- export type TuiRepoStats = {
146
- openIssueCount: number;
147
- openPullRequestCount: number;
148
- lastGithubReconciliationAt: string | null;
149
- lastEmbedRefreshAt: string | null;
150
- staleEmbedThreadCount: number;
151
- staleEmbedSourceCount: number;
152
- latestClusterRunId: number | null;
153
- latestClusterRunFinishedAt: string | null;
154
- };
155
-
156
- export type TuiClusterSummary = {
157
- clusterId: number;
158
- displayTitle: string;
159
- totalCount: number;
160
- issueCount: number;
161
- pullRequestCount: number;
162
- latestUpdatedAt: string | null;
163
- representativeThreadId: number | null;
164
- representativeNumber: number | null;
165
- representativeKind: 'issue' | 'pull_request' | null;
166
- searchText: string;
167
- };
168
-
169
- export type TuiClusterMember = {
170
- id: number;
171
- number: number;
172
- kind: 'issue' | 'pull_request';
173
- title: string;
174
- updatedAtGh: string | null;
175
- htmlUrl: string;
176
- labels: string[];
177
- clusterScore: number | null;
178
- };
179
-
180
- export type TuiClusterDetail = {
181
- clusterId: number;
182
- displayTitle: string;
183
- totalCount: number;
184
- issueCount: number;
185
- pullRequestCount: number;
186
- latestUpdatedAt: string | null;
187
- representativeThreadId: number | null;
188
- representativeNumber: number | null;
189
- representativeKind: 'issue' | 'pull_request' | null;
190
- members: TuiClusterMember[];
191
- };
192
-
193
- export type TuiThreadDetail = {
194
- thread: ThreadDto;
195
- summaries: Partial<Record<'problem_summary' | 'solution_summary' | 'maintainer_signal_summary' | 'dedupe_summary', string>>;
196
- neighbors: SearchHitDto['neighbors'];
197
- };
198
-
199
- export type TuiSnapshot = {
200
- repository: RepositoryDto;
201
- stats: TuiRepoStats;
202
- clusters: TuiClusterSummary[];
203
- };
204
-
205
- export type DoctorResult = {
206
- health: HealthResponse;
207
- github: {
208
- configured: boolean;
209
- source: ConfigValueSource;
210
- formatOk: boolean;
211
- authOk: boolean;
212
- error: string | null;
213
- };
214
- openai: {
215
- configured: boolean;
216
- source: ConfigValueSource;
217
- formatOk: boolean;
218
- authOk: boolean;
219
- error: string | null;
220
- };
221
- };
222
-
223
- type SyncOptions = {
224
- owner: string;
225
- repo: string;
226
- since?: string;
227
- limit?: number;
228
- includeComments?: boolean;
229
- onProgress?: (message: string) => void;
230
- startedAt?: string;
231
- };
232
-
233
- type SearchResultInternal = SearchResponse;
234
- type NeighborsResultInternal = NeighborsResponse;
235
-
236
- const SYNC_BATCH_SIZE = 100;
237
- const SYNC_BATCH_DELAY_MS = 5000;
238
- const EMBED_ESTIMATED_CHARS_PER_TOKEN = 3;
239
- const EMBED_MAX_ITEM_TOKENS = 7000;
240
- const EMBED_MAX_BATCH_TOKENS = 250000;
241
- const EMBED_TRUNCATION_MARKER = '\n\n[truncated for embedding]';
242
-
243
- function nowIso(): string {
244
- return new Date().toISOString();
245
- }
246
-
247
- function parseIso(value: string | null | undefined): number | null {
248
- if (!value) return null;
249
- const parsed = Date.parse(value);
250
- return Number.isNaN(parsed) ? null : parsed;
251
- }
252
-
253
- function isMissingGitHubResourceError(error: unknown): boolean {
254
- const status = typeof (error as { status?: unknown })?.status === 'number' ? Number((error as { status?: unknown }).status) : null;
255
- if (status === 404 || status === 410) {
256
- return true;
257
- }
258
- const message = error instanceof Error ? error.message : String(error);
259
- return /\b(404|410)\b/.test(message) || /Not Found|Gone/i.test(message);
260
- }
261
-
262
- function deriveIncrementalSince(referenceAt: string, crawlStartedAt: string): string {
263
- const referenceMs = parseIso(referenceAt) ?? Date.now();
264
- const crawlMs = parseIso(crawlStartedAt) ?? Date.now();
265
- const gapMs = Math.max(0, crawlMs - referenceMs);
266
- const hourMs = 60 * 60 * 1000;
267
- const roundedHours = Math.max(2, Math.ceil(gapMs / hourMs));
268
- return new Date(crawlMs - roundedHours * hourMs).toISOString();
269
- }
270
-
271
- function parseSyncRunStats(statsJson: string | null): SyncRunStats | null {
272
- if (!statsJson) return null;
273
- try {
274
- const parsed = JSON.parse(statsJson) as Partial<SyncRunStats>;
275
- if (typeof parsed.crawlStartedAt !== 'string') {
276
- return null;
277
- }
278
- return {
279
- threadsSynced: typeof parsed.threadsSynced === 'number' ? parsed.threadsSynced : 0,
280
- commentsSynced: typeof parsed.commentsSynced === 'number' ? parsed.commentsSynced : 0,
281
- threadsClosed: typeof parsed.threadsClosed === 'number' ? parsed.threadsClosed : 0,
282
- crawlStartedAt: parsed.crawlStartedAt,
283
- requestedSince: typeof parsed.requestedSince === 'string' ? parsed.requestedSince : null,
284
- effectiveSince: typeof parsed.effectiveSince === 'string' ? parsed.effectiveSince : null,
285
- limit: typeof parsed.limit === 'number' ? parsed.limit : null,
286
- includeComments: parsed.includeComments === true,
287
- isFullOpenScan: parsed.isFullOpenScan === true,
288
- isOverlappingOpenScan: parsed.isOverlappingOpenScan === true,
289
- overlapReferenceAt: typeof parsed.overlapReferenceAt === 'string' ? parsed.overlapReferenceAt : null,
290
- reconciledOpenCloseAt: typeof parsed.reconciledOpenCloseAt === 'string' ? parsed.reconciledOpenCloseAt : null,
291
- };
292
- } catch {
293
- return null;
294
- }
295
- }
296
-
297
- function asJson(value: unknown): string {
298
- return JSON.stringify(value ?? null);
299
- }
300
-
301
- function parseArray(value: string): string[] {
302
- return JSON.parse(value) as string[];
303
- }
304
-
305
- function userLogin(payload: Record<string, unknown>): string | null {
306
- const user = payload.user as Record<string, unknown> | undefined;
307
- const login = user?.login;
308
- return typeof login === 'string' ? login : null;
309
- }
310
-
311
- function userType(payload: Record<string, unknown>): string | null {
312
- const user = payload.user as Record<string, unknown> | undefined;
313
- const type = user?.type;
314
- return typeof type === 'string' ? type : null;
315
- }
316
-
317
- function isPullRequestPayload(payload: Record<string, unknown>): boolean {
318
- return Boolean(payload.pull_request);
319
- }
320
-
321
- function parseLabels(payload: Record<string, unknown>): string[] {
322
- const labels = payload.labels;
323
- if (!Array.isArray(labels)) return [];
324
- return labels
325
- .map((label) => {
326
- if (typeof label === 'string') return label;
327
- if (label && typeof label === 'object' && typeof (label as Record<string, unknown>).name === 'string') {
328
- return String((label as Record<string, unknown>).name);
329
- }
330
- return null;
331
- })
332
- .filter((value): value is string => Boolean(value));
333
- }
334
-
335
- function parseAssignees(payload: Record<string, unknown>): string[] {
336
- const assignees = payload.assignees;
337
- if (!Array.isArray(assignees)) return [];
338
- return assignees
339
- .map((assignee) => {
340
- if (assignee && typeof assignee === 'object' && typeof (assignee as Record<string, unknown>).login === 'string') {
341
- return String((assignee as Record<string, unknown>).login);
342
- }
343
- return null;
344
- })
345
- .filter((value): value is string => Boolean(value));
346
- }
347
-
348
- function stableContentHash(input: string): string {
349
- return crypto.createHash('sha256').update(input).digest('hex');
350
- }
351
-
352
- function normalizeSummaryText(value: string): string {
353
- return value.replace(/\r/g, '\n').replace(/\s+/g, ' ').trim();
354
- }
355
-
356
- function snippetText(value: string | null | undefined, maxChars: number): string | null {
357
- if (!value) return null;
358
- const normalized = value.replace(/\s+/g, ' ').trim();
359
- if (!normalized) return null;
360
- if (normalized.length <= maxChars) return normalized;
361
- return `${normalized.slice(0, Math.max(0, maxChars - 1)).trimEnd()}…`;
362
- }
363
-
364
- function repositoryToDto(row: Record<string, unknown>): RepositoryDto {
365
- return {
366
- id: Number(row.id),
367
- owner: String(row.owner),
368
- name: String(row.name),
369
- fullName: String(row.full_name),
370
- githubRepoId: row.github_repo_id === null ? null : String(row.github_repo_id),
371
- updatedAt: String(row.updated_at),
372
- };
373
- }
374
-
375
- function threadToDto(row: ThreadRow, clusterId?: number | null): ThreadDto {
376
- return {
377
- id: row.id,
378
- repoId: row.repo_id,
379
- number: row.number,
380
- kind: row.kind,
381
- state: row.state,
382
- title: row.title,
383
- body: row.body,
384
- authorLogin: row.author_login,
385
- htmlUrl: row.html_url,
386
- labels: parseArray(row.labels_json),
387
- updatedAtGh: row.updated_at_gh,
388
- clusterId: clusterId ?? null,
389
- };
390
- }
391
-
392
- export class GHCrawlService {
393
- readonly config: GitcrawlConfig;
394
- readonly db: SqliteDatabase;
395
- readonly github?: GitHubClient;
396
- readonly ai?: AiProvider;
397
- private readonly parsedEmbeddingCache = new Map<number, ParsedStoredEmbeddingRow[]>();
398
-
399
- constructor(options: {
400
- config?: GitcrawlConfig;
401
- db?: SqliteDatabase;
402
- github?: GitHubClient;
403
- ai?: AiProvider;
404
- } = {}) {
405
- this.config = options.config ?? loadConfig();
406
- ensureRuntimeDirs(this.config);
407
- this.db = options.db ?? openDb(this.config.dbPath);
408
- migrate(this.db);
409
- this.github = options.github ?? (this.config.githubToken ? makeGitHubClient({ token: this.config.githubToken }) : undefined);
410
- this.ai = options.ai ?? (this.config.openaiApiKey ? new OpenAiProvider(this.config.openaiApiKey) : undefined);
411
- }
412
-
413
- close(): void {
414
- this.parsedEmbeddingCache.clear();
415
- this.db.close();
416
- }
417
-
418
- init(): HealthResponse {
419
- ensureRuntimeDirs(this.config);
420
- migrate(this.db);
421
- const response = {
422
- ok: true,
423
- configPath: this.config.configPath,
424
- configFileExists: this.config.configFileExists,
425
- dbPath: this.config.dbPath,
426
- apiPort: this.config.apiPort,
427
- githubConfigured: Boolean(this.config.githubToken),
428
- openaiConfigured: Boolean(this.config.openaiApiKey),
429
- };
430
- return healthResponseSchema.parse(response);
431
- }
432
-
433
- async doctor(): Promise<DoctorResult> {
434
- const health = this.init();
435
- const github = {
436
- configured: Boolean(this.config.githubToken),
437
- source: this.config.githubTokenSource,
438
- formatOk: this.config.githubToken ? isLikelyGitHubToken(this.config.githubToken) : false,
439
- authOk: false,
440
- error: null as string | null,
441
- };
442
- const openai = {
443
- configured: Boolean(this.config.openaiApiKey),
444
- source: this.config.openaiApiKeySource,
445
- formatOk: this.config.openaiApiKey ? isLikelyOpenAiApiKey(this.config.openaiApiKey) : false,
446
- authOk: false,
447
- error: null as string | null,
448
- };
449
- if (!github.configured && this.config.secretProvider === 'op' && this.config.opVaultName && this.config.opItemName) {
450
- github.error = `Configured for 1Password CLI via ${this.config.opVaultName}/${this.config.opItemName}; run ghcrawl through your op wrapper so GITHUB_TOKEN is present in the environment.`;
451
- }
452
- if (!openai.configured && this.config.secretProvider === 'op' && this.config.opVaultName && this.config.opItemName) {
453
- openai.error = `Configured for 1Password CLI via ${this.config.opVaultName}/${this.config.opItemName}; run ghcrawl through your op wrapper so OPENAI_API_KEY is present in the environment.`;
454
- }
455
- if (github.configured) {
456
- if (!github.formatOk) {
457
- github.error = 'Token format does not look like a GitHub personal access token.';
458
- } else {
459
- try {
460
- await this.requireGithub().checkAuth();
461
- github.authOk = true;
462
- } catch (error) {
463
- github.error = error instanceof Error ? error.message : String(error);
464
- }
465
- }
466
- }
467
-
468
- if (openai.configured) {
469
- if (!openai.formatOk) {
470
- openai.error = 'Key format does not look like an OpenAI API key.';
471
- } else {
472
- try {
473
- await this.requireAi().checkAuth();
474
- openai.authOk = true;
475
- } catch (error) {
476
- openai.error = error instanceof Error ? error.message : String(error);
477
- }
478
- }
479
- }
480
-
481
- return { health, github, openai };
482
- }
483
-
484
- listRepositories(): RepositoriesResponse {
485
- const rows = this.db.prepare('select * from repositories order by full_name asc').all() as Array<Record<string, unknown>>;
486
- return repositoriesResponseSchema.parse({ repositories: rows.map(repositoryToDto) });
487
- }
488
-
489
- listThreads(params: { owner: string; repo: string; kind?: 'issue' | 'pull_request' }): ThreadsResponse {
490
- const repository = this.requireRepository(params.owner, params.repo);
491
- const clusterIds = new Map<number, number>();
492
- const clusterRows = this.db
493
- .prepare(
494
- `select cm.thread_id, cm.cluster_id
495
- from cluster_members cm
496
- join clusters c on c.id = cm.cluster_id
497
- where c.repo_id = ? and c.cluster_run_id = (
498
- select id from cluster_runs where repo_id = ? and status = 'completed' order by id desc limit 1
499
- )`,
500
- )
501
- .all(repository.id, repository.id) as Array<{ thread_id: number; cluster_id: number }>;
502
- for (const row of clusterRows) clusterIds.set(row.thread_id, row.cluster_id);
503
-
504
- let sql = "select * from threads where repo_id = ? and state = 'open'";
505
- const args: Array<string | number> = [repository.id];
506
- if (params.kind) {
507
- sql += ' and kind = ?';
508
- args.push(params.kind);
509
- }
510
- sql += ' order by updated_at_gh desc, number desc';
511
- const rows = this.db.prepare(sql).all(...args) as ThreadRow[];
512
- return threadsResponseSchema.parse({
513
- repository,
514
- threads: rows.map((row) => threadToDto(row, clusterIds.get(row.id) ?? null)),
515
- });
516
- }
517
-
518
- async syncRepository(
519
- params: SyncOptions,
520
- ): Promise<SyncResultDto> {
521
- const crawlStartedAt = params.startedAt ?? nowIso();
522
- const includeComments = params.includeComments ?? false;
523
- const github = this.requireGithub();
524
- params.onProgress?.(`[sync] fetching repository metadata for ${params.owner}/${params.repo}`);
525
- const reporter = params.onProgress ? (message: string) => params.onProgress?.(message.replace(/^\[github\]/, '[sync/github]')) : undefined;
526
- const repoData = await github.getRepo(params.owner, params.repo, reporter);
527
- const repoId = this.upsertRepository(params.owner, params.repo, repoData);
528
- const runId = this.startRun('sync_runs', repoId, `${params.owner}/${params.repo}`);
529
- const syncCursor = this.getSyncCursorState(repoId);
530
- const overlapReferenceAt = syncCursor.lastOverlappingOpenScanCompletedAt ?? syncCursor.lastFullOpenScanStartedAt;
531
- const effectiveSince =
532
- params.since ??
533
- (params.limit === undefined && overlapReferenceAt ? deriveIncrementalSince(overlapReferenceAt, crawlStartedAt) : undefined);
534
- const isFullOpenScan = params.limit === undefined && params.since === undefined && overlapReferenceAt === null;
535
- const isOverlappingOpenScan =
536
- params.limit === undefined &&
537
- overlapReferenceAt !== null &&
538
- effectiveSince !== undefined &&
539
- (parseIso(effectiveSince) ?? Number.POSITIVE_INFINITY) <= (parseIso(overlapReferenceAt) ?? Number.NEGATIVE_INFINITY);
540
-
541
- try {
542
- params.onProgress?.(`[sync] listing issues and pull requests for ${params.owner}/${params.repo}`);
543
- params.onProgress?.(
544
- includeComments
545
- ? '[sync] comment hydration enabled; fetching issue comments, reviews, and review comments'
546
- : '[sync] metadata-only mode; skipping comment, review, and review-comment fetches',
547
- );
548
- if (isFullOpenScan) {
549
- params.onProgress?.('[sync] full open scan; no prior completed overlap/full cursor was found for this repository');
550
- } else if (params.since === undefined && effectiveSince && overlapReferenceAt) {
551
- params.onProgress?.(
552
- `[sync] derived incremental window since=${effectiveSince} from overlap reference ${overlapReferenceAt}`,
553
- );
554
- } else if (params.since !== undefined) {
555
- params.onProgress?.(`[sync] using requested since=${params.since}`);
556
- }
557
- const items = await github.listRepositoryIssues(params.owner, params.repo, effectiveSince, params.limit, reporter);
558
- params.onProgress?.(`[sync] discovered ${items.length} threads to process`);
559
- let threadsSynced = 0;
560
- let commentsSynced = 0;
561
-
562
- for (const [index, item] of items.entries()) {
563
- if (index > 0 && index % SYNC_BATCH_SIZE === 0) {
564
- params.onProgress?.(`[sync] batch boundary reached at ${index} threads; sleeping 5s before continuing`);
565
- await new Promise((resolve) => setTimeout(resolve, SYNC_BATCH_DELAY_MS));
566
- }
567
- const number = Number(item.number);
568
- const isPr = isPullRequestPayload(item);
569
- const kind = isPr ? 'pull_request' : 'issue';
570
- params.onProgress?.(`[sync] ${index + 1}/${items.length} ${kind} #${number}`);
571
- try {
572
- const threadPayload = isPr ? await github.getPull(params.owner, params.repo, number, reporter) : item;
573
- const threadId = this.upsertThread(repoId, kind, threadPayload, crawlStartedAt);
574
- if (includeComments) {
575
- const comments = await this.fetchThreadComments(params.owner, params.repo, number, isPr, reporter);
576
- this.replaceComments(threadId, comments);
577
- commentsSynced += comments.length;
578
- }
579
- this.refreshDocument(threadId);
580
- threadsSynced += 1;
581
- } catch (error) {
582
- const message = error instanceof Error ? error.message : String(error);
583
- throw new Error(`sync failed while processing ${kind} #${number}: ${message}`);
584
- }
585
- }
586
-
587
- const shouldReconcileMissingOpenThreads = params.limit === undefined && (isFullOpenScan || isOverlappingOpenScan);
588
- if (!shouldReconcileMissingOpenThreads) {
589
- params.onProgress?.('[sync] skipping stale-open reconciliation because this scan did not overlap a confirmed full/overlap cursor');
590
- }
591
- const threadsClosed = shouldReconcileMissingOpenThreads
592
- ? await this.reconcileMissingOpenThreads({
593
- repoId,
594
- owner: params.owner,
595
- repo: params.repo,
596
- crawlStartedAt,
597
- reporter,
598
- onProgress: params.onProgress,
599
- })
600
- : 0;
601
- const finishedAt = nowIso();
602
- const reconciledOpenCloseAt = shouldReconcileMissingOpenThreads ? finishedAt : null;
603
- const nextSyncCursor: SyncCursorState = {
604
- lastFullOpenScanStartedAt: isFullOpenScan ? crawlStartedAt : syncCursor.lastFullOpenScanStartedAt,
605
- lastOverlappingOpenScanCompletedAt: isOverlappingOpenScan ? finishedAt : syncCursor.lastOverlappingOpenScanCompletedAt,
606
- lastNonOverlappingScanCompletedAt:
607
- !isFullOpenScan && !isOverlappingOpenScan ? finishedAt : syncCursor.lastNonOverlappingScanCompletedAt,
608
- lastReconciledOpenCloseAt: reconciledOpenCloseAt ?? syncCursor.lastReconciledOpenCloseAt,
609
- };
610
- this.writeSyncCursorState(repoId, nextSyncCursor);
611
-
612
- this.finishRun('sync_runs', runId, 'completed', {
613
- threadsSynced,
614
- commentsSynced,
615
- threadsClosed,
616
- crawlStartedAt,
617
- requestedSince: params.since ?? null,
618
- effectiveSince: effectiveSince ?? null,
619
- limit: params.limit ?? null,
620
- includeComments,
621
- isFullOpenScan,
622
- isOverlappingOpenScan,
623
- overlapReferenceAt,
624
- reconciledOpenCloseAt,
625
- } satisfies SyncRunStats, undefined, finishedAt);
626
- return syncResultSchema.parse({ runId, threadsSynced, commentsSynced, threadsClosed });
627
- } catch (error) {
628
- this.finishRun('sync_runs', runId, 'failed', null, error);
629
- throw error;
630
- }
631
- }
632
-
633
- async summarizeRepository(params: {
634
- owner: string;
635
- repo: string;
636
- threadNumber?: number;
637
- includeComments?: boolean;
638
- onProgress?: (message: string) => void;
639
- }): Promise<{ runId: number; summarized: number; inputTokens: number; outputTokens: number; totalTokens: number }> {
640
- const ai = this.requireAi();
641
- const repository = this.requireRepository(params.owner, params.repo);
642
- const runId = this.startRun('summary_runs', repository.id, params.threadNumber ? `thread:${params.threadNumber}` : repository.fullName);
643
- const includeComments = params.includeComments ?? false;
644
-
645
- try {
646
- let sql =
647
- `select t.id, t.number, t.title, t.body, t.labels_json
648
- from threads t
649
- where t.repo_id = ? and t.state = 'open'`;
650
- const args: Array<number> = [repository.id];
651
- if (params.threadNumber) {
652
- sql += ' and t.number = ?';
653
- args.push(params.threadNumber);
654
- }
655
- sql += ' order by t.number asc';
656
-
657
- const rows = this.db.prepare(sql).all(...args) as Array<{
658
- id: number;
659
- number: number;
660
- title: string;
661
- body: string | null;
662
- labels_json: string;
663
- }>;
664
-
665
- params.onProgress?.(`[summarize] loaded ${rows.length} candidate thread(s) for ${repository.fullName}`);
666
- params.onProgress?.(
667
- includeComments
668
- ? '[summarize] include-comments enabled; hydrated human comments may be included in the summary input'
669
- : '[summarize] metadata-only mode; comments are excluded from the summary input',
670
- );
671
-
672
- const sources = rows.map((row) => {
673
- const source = this.buildSummarySource(row.id, row.title, row.body, parseArray(row.labels_json), includeComments);
674
- return { ...row, ...source };
675
- });
676
-
677
- const pending = sources.filter((row) => {
678
- const latest = this.db
679
- .prepare(
680
- 'select content_hash from document_summaries where thread_id = ? and summary_kind = ? and model = ? limit 1',
681
- )
682
- .get(row.id, 'dedupe_summary', this.config.summaryModel) as { content_hash: string } | undefined;
683
- return latest?.content_hash !== row.summaryContentHash;
684
- });
685
-
686
- params.onProgress?.(
687
- `[summarize] pending=${pending.length} skipped=${rows.length - pending.length} model=${this.config.summaryModel}`,
688
- );
689
-
690
- let summarized = 0;
691
- let inputTokens = 0;
692
- let outputTokens = 0;
693
- let totalTokens = 0;
694
- for (const [index, row] of pending.entries()) {
695
- params.onProgress?.(`[summarize] ${index + 1}/${pending.length} thread #${row.number}`);
696
- const result = await ai.summarizeThread({
697
- model: this.config.summaryModel,
698
- text: row.summaryInput,
699
- });
700
- const summary = result.summary;
701
-
702
- this.upsertSummary(row.id, row.summaryContentHash, 'problem_summary', summary.problemSummary);
703
- this.upsertSummary(row.id, row.summaryContentHash, 'solution_summary', summary.solutionSummary);
704
- this.upsertSummary(row.id, row.summaryContentHash, 'maintainer_signal_summary', summary.maintainerSignalSummary);
705
- this.upsertSummary(row.id, row.summaryContentHash, 'dedupe_summary', summary.dedupeSummary);
706
- if (result.usage) {
707
- inputTokens += result.usage.inputTokens;
708
- outputTokens += result.usage.outputTokens;
709
- totalTokens += result.usage.totalTokens;
710
- params.onProgress?.(
711
- `[summarize] tokens thread #${row.number} in=${result.usage.inputTokens} out=${result.usage.outputTokens} total=${result.usage.totalTokens} cached_in=${result.usage.cachedInputTokens} reasoning=${result.usage.reasoningTokens}`,
712
- );
713
- }
714
- summarized += 1;
715
- }
716
-
717
- this.finishRun('summary_runs', runId, 'completed', { summarized, inputTokens, outputTokens, totalTokens });
718
- return { runId, summarized, inputTokens, outputTokens, totalTokens };
719
- } catch (error) {
720
- this.finishRun('summary_runs', runId, 'failed', null, error);
721
- throw error;
722
- }
723
- }
724
-
725
- purgeComments(params: {
726
- owner: string;
727
- repo: string;
728
- threadNumber?: number;
729
- onProgress?: (message: string) => void;
730
- }): { purgedComments: number; refreshedThreads: number } {
731
- const repository = this.requireRepository(params.owner, params.repo);
732
-
733
- let sql = 'select id, number from threads where repo_id = ?';
734
- const args: Array<number> = [repository.id];
735
- if (params.threadNumber) {
736
- sql += ' and number = ?';
737
- args.push(params.threadNumber);
738
- }
739
- sql += ' order by number asc';
740
-
741
- const threads = this.db.prepare(sql).all(...args) as Array<{ id: number; number: number }>;
742
- if (threads.length === 0) {
743
- return { purgedComments: 0, refreshedThreads: 0 };
744
- }
745
-
746
- params.onProgress?.(`[purge-comments] removing hydrated comments from ${threads.length} thread(s) in ${repository.fullName}`);
747
-
748
- const deleteComments = this.db.prepare('delete from comments where thread_id = ?');
749
- let purgedComments = 0;
750
- for (const thread of threads) {
751
- const row = this.db.prepare('select count(*) as count from comments where thread_id = ?').get(thread.id) as { count: number };
752
- if (row.count > 0) {
753
- deleteComments.run(thread.id);
754
- purgedComments += row.count;
755
- }
756
- this.refreshDocument(thread.id);
757
- }
758
-
759
- params.onProgress?.(
760
- `[purge-comments] removed ${purgedComments} comment(s) and refreshed ${threads.length} document(s) for ${repository.fullName}`,
761
- );
762
-
763
- return { purgedComments, refreshedThreads: threads.length };
764
- }
765
-
766
  /**
   * Embeds pending embedding sources for a repository's open threads
   * (optionally restricted to one thread number).
   *
   * Chunks pending tasks by batch size and a token budget, runs batches
   * concurrently via IterableMapper, and upserts each resulting vector.
   * The run is recorded in `embedding_runs` and marked completed or failed.
   *
   * @throws whatever `requireAi`/`requireRepository` or a batch raises;
   *         the run row is marked 'failed' before rethrowing.
   */
  async embedRepository(params: {
    owner: string;
    repo: string;
    threadNumber?: number;
    onProgress?: (message: string) => void;
  }): Promise<EmbedResultDto> {
    const ai = this.requireAi();
    const repository = this.requireRepository(params.owner, params.repo);
    const runId = this.startRun('embedding_runs', repository.id, params.threadNumber ? `thread:${params.threadNumber}` : repository.fullName);

    try {
      // Only `pending` tasks are embedded; the rest are counted as skipped.
      const { rows, tasks, pending } = this.getEmbeddingWorkset(repository.id, params.threadNumber);
      const skipped = tasks.length - pending.length;
      const truncated = tasks.filter((task) => task.wasTruncated).length;

      params.onProgress?.(
        `[embed] loaded ${rows.length} open thread(s) and ${tasks.length} embedding source(s) for ${repository.fullName}`,
      );
      params.onProgress?.(
        `[embed] pending=${pending.length} skipped=${skipped} truncated=${truncated} model=${this.config.embedModel} batch_size=${this.config.embedBatchSize} concurrency=${this.config.embedConcurrency} max_unread=${this.config.embedMaxUnread} max_batch_tokens=${EMBED_MAX_BATCH_TOKENS}`,
      );

      let embedded = 0;
      // Batches respect both the configured size and the per-batch token cap.
      const batches = this.chunkEmbeddingTasks(pending, this.config.embedBatchSize, EMBED_MAX_BATCH_TOKENS);
      const mapper = new IterableMapper(
        batches,
        async (batch: EmbeddingTask[]) => {
          return this.embedBatchWithRecovery(ai, batch, params.onProgress);
        },
        {
          // Bounded concurrency with back-pressure via maxUnread.
          concurrency: this.config.embedConcurrency,
          maxUnread: this.config.embedMaxUnread,
        },
      );

      let completedBatches = 0;
      for await (const batchResult of mapper) {
        completedBatches += 1;
        const numbers = batchResult.map(({ task }) => `#${task.threadNumber}:${task.sourceKind}`);
        const estimatedTokens = batchResult.reduce((sum, { task }) => sum + task.estimatedTokens, 0);
        params.onProgress?.(
          `[embed] batch ${completedBatches}/${Math.max(batches.length, 1)} size=${batchResult.length} est_tokens=${estimatedTokens} items=${numbers.join(',')}`,
        );
        // Persist vectors as each batch completes so progress survives failures.
        for (const { task, embedding } of batchResult) {
          this.upsertEmbedding(task.threadId, task.sourceKind, task.contentHash, embedding);
          embedded += 1;
        }
      }

      this.finishRun('embedding_runs', runId, 'completed', { embedded });
      return embedResultSchema.parse({ runId, embedded });
    } catch (error) {
      this.finishRun('embedding_runs', runId, 'failed', null, error);
      throw error;
    }
  }
822
-
823
  /**
   * Clusters a repository's embedded threads by cosine similarity.
   *
   * Aggregates per-source-kind kNN edges (top `k` per thread, cutoff
   * `minScore`), persists the edges, runs `buildClusters` over them, and
   * stores the resulting clusters and memberships under a new cluster run.
   *
   * @param params.minScore similarity cutoff for edges (default 0.82)
   * @param params.k neighbors kept per thread when aggregating (default 6)
   */
  clusterRepository(params: {
    owner: string;
    repo: string;
    minScore?: number;
    k?: number;
    onProgress?: (message: string) => void;
  }): ClusterResultDto {
    const repository = this.requireRepository(params.owner, params.repo);
    const runId = this.startRun('cluster_runs', repository.id, repository.fullName);
    const minScore = params.minScore ?? 0.82;
    const k = params.k ?? 6;

    try {
      // One row per (thread, source_kind) embedding; collapse to unique threads.
      const rows = this.loadParsedStoredEmbeddings(repository.id);
      const threadMeta = new Map<number, { number: number; title: string }>();
      for (const row of rows) {
        threadMeta.set(row.id, { number: row.number, title: row.title });
      }
      const items = Array.from(threadMeta.entries()).map(([id, meta]) => ({
        id,
        number: meta.number,
        title: meta.title,
      }));

      params.onProgress?.(
        `[cluster] loaded ${items.length} embedded thread(s) across ${new Set(rows.map((row) => row.source_kind)).size} source kind(s) for ${repository.fullName} k=${k} minScore=${minScore}`,
      );

      // NOTE(review): these deletes target the run id just created above, which
      // should have no rows yet — presumably defensive idempotency; confirm the
      // intent wasn't to clear a *previous* run's rows.
      this.db.prepare('delete from cluster_members where cluster_id in (select id from clusters where cluster_run_id = ?)').run(runId);
      this.db.prepare('delete from clusters where cluster_run_id = ?').run(runId);
      this.db.prepare('delete from similarity_edges where cluster_run_id = ?').run(runId);

      // Edges are aggregated across source kinds, keyed by thread pair.
      const aggregatedEdges = this.aggregateRepositoryEdges(rows, { limit: k, minScore });
      const edges = Array.from(aggregatedEdges.values()).map((entry) => ({
        leftThreadId: entry.leftThreadId,
        rightThreadId: entry.rightThreadId,
        score: entry.score,
      }));
      const insertEdge = this.db.prepare(
        `insert into similarity_edges (repo_id, cluster_run_id, left_thread_id, right_thread_id, method, score, explanation_json, created_at)
         values (?, ?, ?, ?, ?, ?, ?, ?)`,
      );
      for (const edge of aggregatedEdges.values()) {
        insertEdge.run(
          repository.id,
          runId,
          edge.leftThreadId,
          edge.rightThreadId,
          'exact_cosine',
          edge.score,
          // Record which source kinds contributed, for later explanation.
          asJson({ sources: Array.from(edge.sourceKinds).sort(), model: this.config.embedModel }),
          nowIso(),
        );
      }

      params.onProgress?.(`[cluster] built ${edges.length} similarity edge(s)`);

      const clusters = buildClusters(
        items.map((item) => ({ threadId: item.id, number: item.number, title: item.title })),
        edges,
      );

      const insertCluster = this.db.prepare(
        'insert into clusters (repo_id, cluster_run_id, representative_thread_id, member_count, created_at) values (?, ?, ?, ?, ?)',
      );
      const insertMember = this.db.prepare(
        'insert into cluster_members (cluster_id, thread_id, score_to_representative, created_at) values (?, ?, ?, ?)',
      );

      for (const cluster of clusters) {
        const clusterResult = insertCluster.run(
          repository.id,
          runId,
          cluster.representativeThreadId,
          cluster.members.length,
          nowIso(),
        );
        const clusterId = Number(clusterResult.lastInsertRowid);
        for (const memberId of cluster.members) {
          // The representative has no score to itself; members use the
          // aggregated edge score to the representative when one exists.
          const key = this.edgeKey(cluster.representativeThreadId, memberId);
          const score = memberId === cluster.representativeThreadId ? null : (aggregatedEdges.get(key)?.score ?? null);
          insertMember.run(clusterId, memberId, score, nowIso());
        }
      }

      params.onProgress?.(`[cluster] persisted ${clusters.length} cluster(s)`);

      this.finishRun('cluster_runs', runId, 'completed', { edges: edges.length, clusters: clusters.length });
      return clusterResultSchema.parse({ runId, edges: edges.length, clusters: clusters.length });
    } catch (error) {
      this.finishRun('cluster_runs', runId, 'failed', null, error);
      throw error;
    }
  }
917
-
918
  /**
   * Searches a repository's open threads.
   *
   * 'keyword' uses SQLite FTS5 (bm25) only; 'semantic' uses cosine similarity
   * against stored embeddings only (requires the AI provider); 'hybrid'
   * (default) sums both scores. Hits are decorated with up to 3 similar-thread
   * neighbors from the latest completed cluster run.
   */
  async searchRepository(params: {
    owner: string;
    repo: string;
    query: string;
    mode?: SearchMode;
    limit?: number;
  }): Promise<SearchResultInternal> {
    const mode = params.mode ?? 'hybrid';
    const repository = this.requireRepository(params.owner, params.repo);
    const limit = params.limit ?? 20;
    const keywordScores = new Map<number, number>();
    const semanticScores = new Map<number, number>();

    if (mode !== 'semantic') {
      const rows = this.db
        .prepare(
          `select d.thread_id, bm25(documents_fts) as rank
           from documents_fts
           join documents d on d.id = documents_fts.rowid
           join threads t on t.id = d.thread_id
           where t.repo_id = ? and t.state = 'open' and documents_fts match ?
           order by rank
           limit ?`,
        )
        .all(repository.id, params.query, limit * 2) as Array<{ thread_id: number; rank: number }>;
      for (const row of rows) {
        // NOTE(review): FTS5 bm25() returns lower-is-better values; mapping via
        // 1/(1+|rank|) shrinks as |rank| grows — confirm this ordering is intended.
        keywordScores.set(row.thread_id, 1 / (1 + Math.abs(row.rank)));
      }
    }

    if (mode !== 'keyword' && this.ai) {
      // Embed the query once, then score every stored embedding; keep the best
      // score per thread across its source kinds, discarding scores below 0.2.
      const [queryEmbedding] = await this.ai.embedTexts({ model: this.config.embedModel, texts: [params.query] });
      const rows = this.loadParsedStoredEmbeddings(repository.id);
      for (const row of rows) {
        const score = cosineSimilarity(queryEmbedding, row.embedding);
        if (score < 0.2) continue;
        semanticScores.set(row.id, Math.max(semanticScores.get(row.id) ?? -1, score));
      }
    }

    const candidateIds = new Set<number>([...keywordScores.keys(), ...semanticScores.keys()]);
    const threadRows = candidateIds.size
      ? (this.db
          .prepare(
            `select * from threads
             where repo_id = ? and state = 'open' and id in (${[...candidateIds].map(() => '?').join(',')})
             order by updated_at_gh desc, number desc`,
          )
          .all(repository.id, ...candidateIds) as ThreadRow[])
      : [];

    // Similarity edges from the latest completed cluster run, used to attach
    // "related thread" suggestions to each hit.
    const neighborRows = this.db
      .prepare(
        `select se.left_thread_id, se.right_thread_id, se.score, t1.number as left_number, t2.number as right_number,
                t1.kind as left_kind, t2.kind as right_kind, t1.title as left_title, t2.title as right_title
         from similarity_edges se
         join threads t1 on t1.id = se.left_thread_id
         join threads t2 on t2.id = se.right_thread_id
         where se.repo_id = ? and se.cluster_run_id = (
           select id from cluster_runs where repo_id = ? and status = 'completed' order by id desc limit 1
         )`,
      )
      .all(repository.id, repository.id) as Array<{
      left_thread_id: number;
      right_thread_id: number;
      score: number;
      left_number: number;
      right_number: number;
      left_kind: 'issue' | 'pull_request';
      right_kind: 'issue' | 'pull_request';
      left_title: string;
      right_title: string;
    }>;

    // Edges are undirected: index each edge under both endpoints.
    const neighborsByThread = new Map<number, SearchHitDto['neighbors']>();
    for (const edge of neighborRows) {
      const leftList = neighborsByThread.get(edge.left_thread_id) ?? [];
      leftList.push({
        threadId: edge.right_thread_id,
        number: edge.right_number,
        kind: edge.right_kind,
        title: edge.right_title,
        score: edge.score,
      });
      neighborsByThread.set(edge.left_thread_id, leftList);

      const rightList = neighborsByThread.get(edge.right_thread_id) ?? [];
      rightList.push({
        threadId: edge.left_thread_id,
        number: edge.left_number,
        kind: edge.left_kind,
        title: edge.left_title,
        score: edge.score,
      });
      neighborsByThread.set(edge.right_thread_id, rightList);
    }

    const hits = threadRows
      .map((row) => {
        const keywordScore = keywordScores.get(row.id) ?? null;
        const semanticScore = semanticScores.get(row.id) ?? null;
        // Hybrid score is a simple sum; a missing component contributes 0.
        const hybridScore = (keywordScore ?? 0) + (semanticScore ?? 0);
        return {
          thread: threadToDto(row),
          keywordScore,
          semanticScore,
          hybridScore,
          neighbors: (neighborsByThread.get(row.id) ?? []).sort((left, right) => right.score - left.score).slice(0, 3),
        };
      })
      .sort((left, right) => right.hybridScore - left.hybridScore)
      .slice(0, limit);

    return searchResponseSchema.parse({
      repository,
      query: params.query,
      mode,
      hits,
    });
  }
1038
-
1039
  /**
   * Lists the most similar threads to a given thread number, ranked by the
   * best cosine similarity across matching embedding source kinds.
   *
   * @param params.limit max neighbors returned (default 10)
   * @param params.minScore similarity cutoff (default 0.2)
   * @throws if the thread has no stored embedding (embed must run first)
   */
  listNeighbors(params: {
    owner: string;
    repo: string;
    threadNumber: number;
    limit?: number;
    minScore?: number;
  }): NeighborsResultInternal {
    const repository = this.requireRepository(params.owner, params.repo);
    const limit = params.limit ?? 10;
    const minScore = params.minScore ?? 0.2;

    const rows = this.loadParsedStoredEmbeddings(repository.id);
    // A thread may have several rows, one per embedding source kind.
    const targetRows = rows.filter((row) => row.number === params.threadNumber);
    if (targetRows.length === 0) {
      throw new Error(
        `Thread #${params.threadNumber} for ${repository.fullName} was not found with an embedding. Run embed first.`,
      );
    }
    const targetRow = targetRows[0];
    const targetBySource = new Map<EmbeddingSourceKind, number[]>();
    for (const row of targetRows) {
      targetBySource.set(row.source_kind, row.embedding);
    }

    // Compare like-with-like: each candidate row is scored against the target
    // embedding of the same source kind; keep the best score per thread.
    const aggregated = new Map<number, { number: number; kind: 'issue' | 'pull_request'; title: string; score: number }>();
    for (const row of rows) {
      if (row.id === targetRow.id) continue;
      const targetEmbedding = targetBySource.get(row.source_kind);
      if (!targetEmbedding) continue;
      const score = cosineSimilarity(targetEmbedding, row.embedding);
      if (score < minScore) continue;
      const previous = aggregated.get(row.id);
      if (!previous || score > previous.score) {
        aggregated.set(row.id, { number: row.number, kind: row.kind, title: row.title, score });
      }
    }

    const neighbors = Array.from(aggregated.entries())
      .map(([threadId, value]) => ({
        threadId,
        number: value.number,
        kind: value.kind,
        title: value.title,
        score: value.score,
      }))
      .sort((left, right) => right.score - left.score)
      .slice(0, limit);

    return neighborsResponseSchema.parse({
      repository,
      thread: threadToDto(targetRow),
      neighbors,
    });
  }
1093
-
1094
  /**
   * Returns the clusters (with members) from the latest completed cluster run
   * for a repository, or an empty list when no run has completed yet.
   */
  listClusters(params: { owner: string; repo: string }): ClustersResponse {
    const repository = this.requireRepository(params.owner, params.repo);
    const latestRun = this.db
      .prepare("select id from cluster_runs where repo_id = ? and status = 'completed' order by id desc limit 1")
      .get(repository.id) as { id: number } | undefined;

    if (!latestRun) {
      return clustersResponseSchema.parse({ repository, clusters: [] });
    }

    // One row per (cluster, member); left joins keep empty clusters visible.
    const rows = this.db
      .prepare(
        `select c.id, c.repo_id, c.representative_thread_id, c.member_count,
                cm.thread_id, cm.score_to_representative, t.number, t.kind, t.title
         from clusters c
         left join cluster_members cm on cm.cluster_id = c.id
         left join threads t on t.id = cm.thread_id
         where c.cluster_run_id = ?
         order by c.member_count desc, c.id asc, t.number asc`,
      )
      .all(latestRun.id) as Array<{
      id: number;
      repo_id: number;
      representative_thread_id: number | null;
      member_count: number;
      thread_id: number | null;
      score_to_representative: number | null;
      number: number | null;
      kind: 'issue' | 'pull_request' | null;
      title: string | null;
    }>;

    // Fold the flat join rows back into cluster DTOs, appending members.
    const clusters = new Map<number, ClusterDto>();
    for (const row of rows) {
      const cluster = clusters.get(row.id) ?? {
        id: row.id,
        repoId: row.repo_id,
        representativeThreadId: row.representative_thread_id,
        memberCount: row.member_count,
        members: [],
      };
      // Null member columns come from the left joins (cluster with no members).
      if (row.thread_id !== null && row.number !== null && row.kind !== null && row.title !== null) {
        cluster.members.push({
          threadId: row.thread_id,
          number: row.number,
          kind: row.kind,
          title: row.title,
          scoreToRepresentative: row.score_to_representative,
        });
      }
      clusters.set(row.id, cluster);
    }

    return clustersResponseSchema.parse({
      repository,
      clusters: Array.from(clusters.values()),
    });
  }
1152
-
1153
- async refreshRepository(params: {
1154
- owner: string;
1155
- repo: string;
1156
- sync?: boolean;
1157
- embed?: boolean;
1158
- cluster?: boolean;
1159
- onProgress?: (message: string) => void;
1160
- }): Promise<RefreshResponse> {
1161
- const selected = {
1162
- sync: params.sync ?? true,
1163
- embed: params.embed ?? true,
1164
- cluster: params.cluster ?? true,
1165
- };
1166
- if (!selected.sync && !selected.embed && !selected.cluster) {
1167
- throw new Error('Refresh requires at least one selected step');
1168
- }
1169
- if (!selected.sync) {
1170
- this.requireRepository(params.owner, params.repo);
1171
- }
1172
-
1173
- let sync: SyncResultDto | null = null;
1174
- let embed: EmbedResultDto | null = null;
1175
- let cluster: ClusterResultDto | null = null;
1176
-
1177
- if (selected.sync) {
1178
- sync = await this.syncRepository({
1179
- owner: params.owner,
1180
- repo: params.repo,
1181
- onProgress: params.onProgress,
1182
- });
1183
- }
1184
- if (selected.embed) {
1185
- embed = await this.embedRepository({
1186
- owner: params.owner,
1187
- repo: params.repo,
1188
- onProgress: params.onProgress,
1189
- });
1190
- }
1191
- if (selected.cluster) {
1192
- cluster = this.clusterRepository({
1193
- owner: params.owner,
1194
- repo: params.repo,
1195
- onProgress: params.onProgress,
1196
- });
1197
- }
1198
-
1199
- const repository = this.requireRepository(params.owner, params.repo);
1200
-
1201
- return refreshResponseSchema.parse({
1202
- repository,
1203
- selected,
1204
- sync,
1205
- embed,
1206
- cluster,
1207
- });
1208
- }
1209
-
1210
- listClusterSummaries(params: {
1211
- owner: string;
1212
- repo: string;
1213
- minSize?: number;
1214
- limit?: number;
1215
- sort?: TuiClusterSortMode;
1216
- search?: string;
1217
- }): ClusterSummariesResponse {
1218
- const snapshot = this.getTuiSnapshot({
1219
- owner: params.owner,
1220
- repo: params.repo,
1221
- minSize: params.minSize,
1222
- sort: params.sort,
1223
- search: params.search,
1224
- });
1225
- const clusters = params.limit ? snapshot.clusters.slice(0, params.limit) : snapshot.clusters;
1226
- return clusterSummariesResponseSchema.parse({
1227
- repository: snapshot.repository,
1228
- stats: snapshot.stats,
1229
- clusters: clusters.map((cluster) => ({
1230
- clusterId: cluster.clusterId,
1231
- displayTitle: cluster.displayTitle,
1232
- totalCount: cluster.totalCount,
1233
- issueCount: cluster.issueCount,
1234
- pullRequestCount: cluster.pullRequestCount,
1235
- latestUpdatedAt: cluster.latestUpdatedAt,
1236
- representativeThreadId: cluster.representativeThreadId,
1237
- representativeNumber: cluster.representativeNumber,
1238
- representativeKind: cluster.representativeKind,
1239
- })),
1240
- });
1241
- }
1242
-
1243
  /**
   * Produces a full dump of one cluster: its summary card plus, for each
   * member (up to `memberLimit`), the thread detail with its body replaced
   * by a snippet of at most `bodyChars` characters (default 280).
   *
   * @throws if the cluster id is not present in the repository's snapshot
   */
  getClusterDetailDump(params: {
    owner: string;
    repo: string;
    clusterId: number;
    memberLimit?: number;
    bodyChars?: number;
  }): ClusterDetailResponse {
    // minSize 0 so the target cluster is visible regardless of its size.
    const snapshot = this.getTuiSnapshot({
      owner: params.owner,
      repo: params.repo,
      minSize: 0,
    });
    const cluster = snapshot.clusters.find((item) => item.clusterId === params.clusterId);
    if (!cluster) {
      throw new Error(`Cluster ${params.clusterId} was not found for ${snapshot.repository.fullName}.`);
    }

    const detail = this.getTuiClusterDetail({
      owner: params.owner,
      repo: params.repo,
      clusterId: params.clusterId,
    });
    const members = detail.members.slice(0, params.memberLimit ?? detail.members.length).map((member) => {
      // Neighbors are skipped per member to avoid the per-thread similarity scan.
      const threadDetail = this.getTuiThreadDetail({
        owner: params.owner,
        repo: params.repo,
        threadId: member.id,
        includeNeighbors: false,
      });
      return {
        thread: {
          ...threadDetail.thread,
          // Full body is dropped; only the snippet below is returned.
          body: null,
        },
        bodySnippet: snippetText(threadDetail.thread.body, params.bodyChars ?? 280),
        summaries: threadDetail.summaries,
      };
    });

    return clusterDetailResponseSchema.parse({
      repository: snapshot.repository,
      stats: snapshot.stats,
      cluster: {
        clusterId: cluster.clusterId,
        displayTitle: cluster.displayTitle,
        totalCount: cluster.totalCount,
        issueCount: cluster.issueCount,
        pullRequestCount: cluster.pullRequestCount,
        latestUpdatedAt: cluster.latestUpdatedAt,
        representativeThreadId: cluster.representativeThreadId,
        representativeNumber: cluster.representativeNumber,
        representativeKind: cluster.representativeKind,
      },
      members,
    });
  }
1299
-
1300
- getTuiSnapshot(params: {
1301
- owner: string;
1302
- repo: string;
1303
- minSize?: number;
1304
- sort?: TuiClusterSortMode;
1305
- search?: string;
1306
- }): TuiSnapshot {
1307
- const repository = this.requireRepository(params.owner, params.repo);
1308
- const stats = this.getTuiRepoStats(repository.id);
1309
- const latestRun = this.getLatestClusterRun(repository.id);
1310
- if (!latestRun) {
1311
- return { repository, stats, clusters: [] };
1312
- }
1313
-
1314
- const clusters = this.listRawTuiClusters(repository.id, latestRun.id)
1315
- .filter((cluster) => cluster.totalCount >= (params.minSize ?? 10))
1316
- .filter((cluster) => {
1317
- const search = params.search?.trim().toLowerCase();
1318
- if (!search) return true;
1319
- return cluster.searchText.includes(search);
1320
- })
1321
- .sort((left, right) => this.compareTuiClusterSummary(left, right, params.sort ?? 'recent'));
1322
-
1323
- return {
1324
- repository,
1325
- stats,
1326
- clusters,
1327
- };
1328
- }
1329
-
1330
  /**
   * Loads the detail view for one cluster: its summary fields plus every
   * member thread (issues before pull requests, newest first within kind).
   *
   * @throws if no completed cluster run exists or the cluster id is unknown
   */
  getTuiClusterDetail(params: { owner: string; repo: string; clusterId: number }): TuiClusterDetail {
    const repository = this.requireRepository(params.owner, params.repo);
    const latestRun = this.getLatestClusterRun(repository.id);
    if (!latestRun) {
      throw new Error(`No completed cluster run found for ${repository.fullName}. Run cluster first.`);
    }

    const summary = this.listRawTuiClusters(repository.id, latestRun.id).find((cluster) => cluster.clusterId === params.clusterId);
    if (!summary) {
      throw new Error(`Cluster ${params.clusterId} was not found for ${repository.fullName}.`);
    }

    const rows = this.db
      .prepare(
        `select t.id, t.number, t.kind, t.title, t.updated_at_gh, t.html_url, t.labels_json, cm.score_to_representative
         from cluster_members cm
         join threads t on t.id = cm.thread_id
         where cm.cluster_id = ?
         order by
           case t.kind when 'issue' then 0 else 1 end asc,
           coalesce(t.updated_at_gh, t.updated_at) desc,
           t.number desc`,
      )
      .all(params.clusterId) as Array<{
      id: number;
      number: number;
      kind: 'issue' | 'pull_request';
      title: string;
      updated_at_gh: string | null;
      html_url: string;
      labels_json: string;
      score_to_representative: number | null;
    }>;

    return {
      clusterId: summary.clusterId,
      displayTitle: summary.displayTitle,
      totalCount: summary.totalCount,
      issueCount: summary.issueCount,
      pullRequestCount: summary.pullRequestCount,
      latestUpdatedAt: summary.latestUpdatedAt,
      representativeThreadId: summary.representativeThreadId,
      representativeNumber: summary.representativeNumber,
      representativeKind: summary.representativeKind,
      members: rows.map((row) => ({
        id: row.id,
        number: row.number,
        kind: row.kind,
        title: row.title,
        updatedAtGh: row.updated_at_gh,
        htmlUrl: row.html_url,
        labels: parseArray(row.labels_json),
        clusterScore: row.score_to_representative,
      })),
    };
  }
1386
-
1387
  /**
   * Loads the detail view for one open thread, addressed either by internal
   * thread id or by GitHub thread number. Includes the thread's cluster
   * membership (from the latest completed run), its model summaries, and —
   * unless `includeNeighbors` is false — up to 8 similar threads.
   *
   * @throws if neither identifier matches an open thread in the repository
   */
  getTuiThreadDetail(params: {
    owner: string;
    repo: string;
    threadId?: number;
    threadNumber?: number;
    includeNeighbors?: boolean;
  }): TuiThreadDetail {
    const repository = this.requireRepository(params.owner, params.repo);
    // NOTE(review): truthiness check means an id/number of 0 is treated as
    // absent — presumably ids and numbers start at 1; confirm.
    const row = params.threadId
      ? ((this.db
          .prepare('select * from threads where repo_id = ? and id = ? and state = \'open\' limit 1')
          .get(repository.id, params.threadId) as ThreadRow | undefined) ?? null)
      : params.threadNumber
        ? ((this.db
            .prepare('select * from threads where repo_id = ? and number = ? and state = \'open\' limit 1')
            .get(repository.id, params.threadNumber) as ThreadRow | undefined) ?? null)
        : null;

    if (!row) {
      throw new Error(`Thread was not found for ${repository.fullName}.`);
    }

    // Cluster membership is resolved against the latest completed run only.
    const latestRun = this.getLatestClusterRun(repository.id);
    const clusterMembership = latestRun
      ? ((this.db
          .prepare(
            `select cm.cluster_id
             from cluster_members cm
             join clusters c on c.id = cm.cluster_id
             where c.cluster_run_id = ? and cm.thread_id = ?
             limit 1`,
          )
          .get(latestRun.id, row.id) as { cluster_id: number } | undefined) ?? null)
      : null;

    // Summaries are scoped to the currently configured summary model.
    const summaryRows = this.db
      .prepare(
        `select summary_kind, summary_text
         from document_summaries
         where thread_id = ? and model = ?
         order by summary_kind asc`,
      )
      .all(row.id, this.config.summaryModel) as Array<{ summary_kind: string; summary_text: string }>;
    const summaries: TuiThreadDetail['summaries'] = {};
    for (const summary of summaryRows) {
      // Only the four known summary kinds are surfaced; others are ignored.
      if (
        summary.summary_kind === 'problem_summary' ||
        summary.summary_kind === 'solution_summary' ||
        summary.summary_kind === 'maintainer_signal_summary' ||
        summary.summary_kind === 'dedupe_summary'
      ) {
        summaries[summary.summary_kind] = summary.summary_text;
      }
    }

    let neighbors: SearchHitDto['neighbors'] = [];
    if (params.includeNeighbors !== false) {
      // Best effort: missing embeddings make listNeighbors throw; treat as none.
      try {
        neighbors = this.listNeighbors({
          owner: params.owner,
          repo: params.repo,
          threadNumber: row.number,
          limit: 8,
          minScore: 0.2,
        }).neighbors;
      } catch {
        neighbors = [];
      }
    }

    return {
      thread: threadToDto(row, clusterMembership?.cluster_id ?? null),
      summaries,
      neighbors,
    };
  }
1463
-
1464
- async rerunAction(request: ActionRequest): Promise<ActionResponse> {
1465
- switch (request.action) {
1466
- case 'summarize': {
1467
- const result = await this.summarizeRepository(request);
1468
- return actionResponseSchema.parse({
1469
- ok: true,
1470
- action: request.action,
1471
- runId: result.runId,
1472
- message: `Summarized ${result.summarized} thread(s)`,
1473
- });
1474
- }
1475
- case 'embed': {
1476
- const result = await this.embedRepository(request);
1477
- return actionResponseSchema.parse({
1478
- ok: true,
1479
- action: request.action,
1480
- runId: result.runId,
1481
- message: `Embedded ${result.embedded} source vector(s)`,
1482
- });
1483
- }
1484
- case 'cluster': {
1485
- const result = this.clusterRepository(request);
1486
- return actionResponseSchema.parse({
1487
- ok: true,
1488
- action: request.action,
1489
- runId: result.runId,
1490
- message: `Clustered ${result.clusters} group(s) from ${result.edges} edge(s)`,
1491
- });
1492
- }
1493
- }
1494
- }
1495
-
1496
  /**
   * Returns the sync cursor state for a repository.
   *
   * Prefers the persisted `repo_sync_state` row. When none exists, the state
   * is reconstructed from completed `sync_runs` stats (scanned newest-first so
   * the first match per field wins) and, if anything was recovered, written
   * back so later calls hit the fast path.
   */
  private getSyncCursorState(repoId: number): SyncCursorState {
    const persisted = (this.db
      .prepare(
        `select
           last_full_open_scan_started_at,
           last_overlapping_open_scan_completed_at,
           last_non_overlapping_scan_completed_at,
           last_open_close_reconciled_at
         from repo_sync_state
         where repo_id = ?`,
      )
      .get(repoId) as
      | {
          last_full_open_scan_started_at: string | null;
          last_overlapping_open_scan_completed_at: string | null;
          last_non_overlapping_scan_completed_at: string | null;
          last_open_close_reconciled_at: string | null;
        }
      | undefined) ?? null;
    if (persisted) {
      return {
        lastFullOpenScanStartedAt: persisted.last_full_open_scan_started_at,
        lastOverlappingOpenScanCompletedAt: persisted.last_overlapping_open_scan_completed_at,
        lastNonOverlappingScanCompletedAt: persisted.last_non_overlapping_scan_completed_at,
        lastReconciledOpenCloseAt: persisted.last_open_close_reconciled_at,
      };
    }

    // Legacy fallback: derive cursors from historical completed sync runs.
    const rows = this.db
      .prepare("select finished_at, stats_json from sync_runs where repo_id = ? and status = 'completed' order by id desc")
      .all(repoId) as Array<{ finished_at: string | null; stats_json: string | null }>;
    const state: SyncCursorState = {
      lastFullOpenScanStartedAt: null,
      lastOverlappingOpenScanCompletedAt: null,
      lastNonOverlappingScanCompletedAt: null,
      lastReconciledOpenCloseAt: null,
    };

    for (const row of rows) {
      const stats = parseSyncRunStats(row.stats_json);
      if (!stats) continue;
      // Rows are newest-first, so only the first (latest) hit per field sticks.
      if (state.lastFullOpenScanStartedAt === null && stats.isFullOpenScan) {
        state.lastFullOpenScanStartedAt = stats.crawlStartedAt;
      }
      if (state.lastOverlappingOpenScanCompletedAt === null && stats.isOverlappingOpenScan && row.finished_at) {
        state.lastOverlappingOpenScanCompletedAt = row.finished_at;
      }
      if (state.lastNonOverlappingScanCompletedAt === null && !stats.isFullOpenScan && !stats.isOverlappingOpenScan && row.finished_at) {
        state.lastNonOverlappingScanCompletedAt = row.finished_at;
      }
      if (state.lastReconciledOpenCloseAt === null && stats.reconciledOpenCloseAt) {
        state.lastReconciledOpenCloseAt = stats.reconciledOpenCloseAt;
      }
    }

    // Persist the recovered state so the fallback scan only runs once.
    if (
      state.lastFullOpenScanStartedAt !== null ||
      state.lastOverlappingOpenScanCompletedAt !== null ||
      state.lastNonOverlappingScanCompletedAt !== null ||
      state.lastReconciledOpenCloseAt !== null
    ) {
      this.writeSyncCursorState(repoId, state);
    }

    return state;
  }
1562
-
1563
  /**
   * Persists the sync cursor state for a repository as a single
   * `repo_sync_state` row, inserting or fully overwriting on conflict.
   */
  private writeSyncCursorState(repoId: number, state: SyncCursorState): void {
    this.db
      .prepare(
        `insert into repo_sync_state (
           repo_id,
           last_full_open_scan_started_at,
           last_overlapping_open_scan_completed_at,
           last_non_overlapping_scan_completed_at,
           last_open_close_reconciled_at,
           updated_at
         ) values (?, ?, ?, ?, ?, ?)
         on conflict(repo_id) do update set
           last_full_open_scan_started_at = excluded.last_full_open_scan_started_at,
           last_overlapping_open_scan_completed_at = excluded.last_overlapping_open_scan_completed_at,
           last_non_overlapping_scan_completed_at = excluded.last_non_overlapping_scan_completed_at,
           last_open_close_reconciled_at = excluded.last_open_close_reconciled_at,
           updated_at = excluded.updated_at`,
      )
      .run(
        repoId,
        state.lastFullOpenScanStartedAt,
        state.lastOverlappingOpenScanCompletedAt,
        state.lastNonOverlappingScanCompletedAt,
        state.lastReconciledOpenCloseAt,
        nowIso(),
      );
  }
1590
-
1591
  /**
   * Computes the TUI header statistics for a repository: open issue/PR
   * counts, last completed sync and embed timestamps, pending-embedding
   * counts (threads and individual sources), and latest cluster run info.
   */
  private getTuiRepoStats(repoId: number): TuiRepoStats {
    const counts = this.db
      .prepare(
        `select kind, count(*) as count
         from threads
         where repo_id = ? and state = 'open'
         group by kind`,
      )
      .all(repoId) as Array<{ kind: 'issue' | 'pull_request'; count: number }>;
    const latestRun = this.getLatestClusterRun(repoId);
    const latestSync = (this.db
      .prepare("select finished_at from sync_runs where repo_id = ? and status = 'completed' order by id desc limit 1")
      .get(repoId) as { finished_at: string | null } | undefined) ?? null;
    const latestEmbed = (this.db
      .prepare("select finished_at from embedding_runs where repo_id = ? and status = 'completed' order by id desc limit 1")
      .get(repoId) as { finished_at: string | null } | undefined) ?? null;
    // Pending embedding tasks; deduped by thread for the thread-level count.
    const embeddingWorkset = this.getEmbeddingWorkset(repoId);
    const staleThreadIds = new Set<number>(embeddingWorkset.pending.map((task) => task.threadId));
    return {
      openIssueCount: counts.find((row) => row.kind === 'issue')?.count ?? 0,
      openPullRequestCount: counts.find((row) => row.kind === 'pull_request')?.count ?? 0,
      lastGithubReconciliationAt: latestSync?.finished_at ?? null,
      lastEmbedRefreshAt: latestEmbed?.finished_at ?? null,
      staleEmbedThreadCount: staleThreadIds.size,
      staleEmbedSourceCount: embeddingWorkset.pending.length,
      latestClusterRunId: latestRun?.id ?? null,
      latestClusterRunFinishedAt: latestRun?.finished_at ?? null,
    };
  }
1620
-
1621
- private getLatestClusterRun(repoId: number): { id: number; finished_at: string | null } | null {
1622
- return (
1623
- (this.db
1624
- .prepare("select id, finished_at from cluster_runs where repo_id = ? and status = 'completed' order by id desc limit 1")
1625
- .get(repoId) as { id: number; finished_at: string | null } | undefined) ?? null
1626
- );
1627
- }
1628
-
1629
  /**
   * Loads one summary row per cluster for a given cluster run.
   *
   * Each cluster's members are joined in to compute per-kind member counts,
   * the most recent member update timestamp, and a lowercase concatenation of
   * member titles that the TUI uses for substring search. The representative
   * thread is LEFT-joined because it may no longer exist.
   */
  private listRawTuiClusters(repoId: number, clusterRunId: number): TuiClusterSummary[] {
    const rows = this.db
      .prepare(
        `select
           c.id as cluster_id,
           c.member_count,
           c.representative_thread_id,
           rt.number as representative_number,
           rt.kind as representative_kind,
           rt.title as representative_title,
           max(coalesce(t.updated_at_gh, t.updated_at)) as latest_updated_at,
           sum(case when t.kind = 'issue' then 1 else 0 end) as issue_count,
           sum(case when t.kind = 'pull_request' then 1 else 0 end) as pull_request_count,
           group_concat(lower(coalesce(t.title, '')), ' ') as search_text
         from clusters c
         left join threads rt on rt.id = c.representative_thread_id
         join cluster_members cm on cm.cluster_id = c.id
         join threads t on t.id = cm.thread_id
         where c.repo_id = ? and c.cluster_run_id = ?
         group by
           c.id,
           c.member_count,
           c.representative_thread_id,
           rt.number,
           rt.kind,
           rt.title`,
      )
      .all(repoId, clusterRunId) as Array<{
      cluster_id: number;
      member_count: number;
      representative_thread_id: number | null;
      representative_number: number | null;
      representative_kind: 'issue' | 'pull_request' | null;
      representative_title: string | null;
      latest_updated_at: string | null;
      issue_count: number;
      pull_request_count: number;
      search_text: string | null;
    }>;

    return rows.map((row) => ({
      clusterId: row.cluster_id,
      // Fall back to a synthetic title when the representative thread is gone.
      displayTitle: row.representative_title ?? `Cluster ${row.cluster_id}`,
      totalCount: row.member_count,
      issueCount: row.issue_count,
      pullRequestCount: row.pull_request_count,
      latestUpdatedAt: row.latest_updated_at,
      representativeThreadId: row.representative_thread_id,
      representativeNumber: row.representative_number,
      representativeKind: row.representative_kind,
      // Search haystack: representative title plus all member titles, lowercased.
      searchText: `${(row.representative_title ?? '').toLowerCase()} ${row.search_text ?? ''}`.trim(),
    }));
  }
1682
-
1683
- private compareTuiClusterSummary(left: TuiClusterSummary, right: TuiClusterSummary, sort: TuiClusterSortMode): number {
1684
- const leftTime = left.latestUpdatedAt ? Date.parse(left.latestUpdatedAt) : 0;
1685
- const rightTime = right.latestUpdatedAt ? Date.parse(right.latestUpdatedAt) : 0;
1686
- if (sort === 'size') {
1687
- return right.totalCount - left.totalCount || rightTime - leftTime || left.clusterId - right.clusterId;
1688
- }
1689
- return rightTime - leftTime || right.totalCount - left.totalCount || left.clusterId - right.clusterId;
1690
- }
1691
-
1692
  /**
   * Fetches every comment-like record for a thread from GitHub.
   *
   * For all threads this pulls issue comments; for pull requests it also
   * pulls reviews and inline review comments. Each record is normalized into
   * a CommentSeed with bot detection and raw-JSON preservation.
   *
   * @param isPr whether the thread is a pull request (adds reviews + review comments)
   * @param reporter optional callback forwarded to the GitHub client for progress/rate-limit messages
   */
  private async fetchThreadComments(
    owner: string,
    repo: string,
    number: number,
    isPr: boolean,
    reporter?: (message: string) => void,
  ): Promise<CommentSeed[]> {
    const github = this.requireGithub();
    const comments: CommentSeed[] = [];

    // Plain issue-style comments exist for both issues and pull requests.
    const issueComments = await github.listIssueComments(owner, repo, number, reporter);
    comments.push(
      ...issueComments.map((comment) => ({
        githubId: String(comment.id),
        commentType: 'issue_comment',
        authorLogin: userLogin(comment),
        authorType: userType(comment),
        body: String(comment.body ?? ''),
        isBot: isBotLikeAuthor({ authorLogin: userLogin(comment), authorType: userType(comment) }),
        rawJson: asJson(comment),
        createdAtGh: typeof comment.created_at === 'string' ? comment.created_at : null,
        updatedAtGh: typeof comment.updated_at === 'string' ? comment.updated_at : null,
      })),
    );

    if (isPr) {
      // Review summaries. A review may have an empty body, in which case its
      // state (e.g. APPROVED) is stored as the body instead.
      const reviews = await github.listPullReviews(owner, repo, number, reporter);
      comments.push(
        ...reviews.map((review) => ({
          githubId: String(review.id),
          commentType: 'review',
          authorLogin: userLogin(review),
          authorType: userType(review),
          body: String(review.body ?? review.state ?? ''),
          isBot: isBotLikeAuthor({ authorLogin: userLogin(review), authorType: userType(review) }),
          rawJson: asJson(review),
          // Reviews only expose submitted_at, so it serves as both timestamps.
          createdAtGh: typeof review.submitted_at === 'string' ? review.submitted_at : null,
          updatedAtGh: typeof review.submitted_at === 'string' ? review.submitted_at : null,
        })),
      );

      // Inline (diff-anchored) review comments.
      const reviewComments = await github.listPullReviewComments(owner, repo, number, reporter);
      comments.push(
        ...reviewComments.map((comment) => ({
          githubId: String(comment.id),
          commentType: 'review_comment',
          authorLogin: userLogin(comment),
          authorType: userType(comment),
          body: String(comment.body ?? ''),
          isBot: isBotLikeAuthor({ authorLogin: userLogin(comment), authorType: userType(comment) }),
          rawJson: asJson(comment),
          createdAtGh: typeof comment.created_at === 'string' ? comment.created_at : null,
          updatedAtGh: typeof comment.updated_at === 'string' ? comment.updated_at : null,
        })),
      );
    }

    return comments;
  }
1751
-
1752
- private requireAi(): AiProvider {
1753
- if (!this.ai) {
1754
- requireOpenAiKey(this.config);
1755
- }
1756
- return this.ai as AiProvider;
1757
- }
1758
-
1759
- private requireGithub(): GitHubClient {
1760
- if (!this.github) {
1761
- requireGithubToken(this.config);
1762
- }
1763
- return this.github as GitHubClient;
1764
- }
1765
-
1766
- private requireRepository(owner: string, repo: string): RepositoryDto {
1767
- const fullName = `${owner}/${repo}`;
1768
- const row = this.db.prepare('select * from repositories where full_name = ? limit 1').get(fullName) as Record<string, unknown> | undefined;
1769
- if (!row) {
1770
- throw new Error(`Repository ${fullName} not found. Run sync first.`);
1771
- }
1772
- return repositoryToDto(row);
1773
- }
1774
-
1775
  /**
   * Inserts or refreshes the repositories row for owner/repo and returns its
   * local id. On conflict (same full_name) the GitHub id, raw payload, and
   * updated_at are refreshed while the local id is preserved.
   */
  private upsertRepository(owner: string, repo: string, payload: Record<string, unknown>): number {
    const fullName = `${owner}/${repo}`;
    this.db
      .prepare(
        `insert into repositories (owner, name, full_name, github_repo_id, raw_json, updated_at)
         values (?, ?, ?, ?, ?, ?)
         on conflict(full_name) do update set
           github_repo_id = excluded.github_repo_id,
           raw_json = excluded.raw_json,
           updated_at = excluded.updated_at`,
      )
      // Argument order mirrors the column list above.
      .run(owner, repo, fullName, payload.id ? String(payload.id) : null, asJson(payload), nowIso());
    // Re-select rather than relying on lastInsertRowid, which is not set on
    // the conflict/update path.
    const row = this.db.prepare('select id from repositories where full_name = ?').get(fullName) as { id: number };
    return row.id;
  }
1790
-
1791
  /**
   * Inserts or refreshes one thread (issue or pull request) row and returns
   * its local id.
   *
   * Conflict key is (repo_id, kind, number); on conflict every mutable field
   * is refreshed, while first_pulled_at keeps its original value (only
   * last_pulled_at is updated).
   *
   * @param pulledAt timestamp recorded as both first_pulled_at (insert only)
   *                 and last_pulled_at
   */
  private upsertThread(
    repoId: number,
    kind: 'issue' | 'pull_request',
    payload: Record<string, unknown>,
    pulledAt: string,
  ): number {
    // Fall back to "#<number>" if GitHub ever returns a null title.
    const title = String(payload.title ?? `#${payload.number}`);
    const body = typeof payload.body === 'string' ? payload.body : null;
    const labels = parseLabels(payload);
    const assignees = parseAssignees(payload);
    // Cheap title+body hash; refreshDocument later replaces it with the
    // full canonical-document hash.
    const contentHash = stableContentHash(`${title}\n${body ?? ''}`);
    this.db
      .prepare(
        `insert into threads (
           repo_id, github_id, number, kind, state, title, body, author_login, author_type, html_url,
           labels_json, assignees_json, raw_json, content_hash, is_draft,
           created_at_gh, updated_at_gh, closed_at_gh, merged_at_gh, first_pulled_at, last_pulled_at, updated_at
         ) values (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
         on conflict(repo_id, kind, number) do update set
           github_id = excluded.github_id,
           state = excluded.state,
           title = excluded.title,
           body = excluded.body,
           author_login = excluded.author_login,
           author_type = excluded.author_type,
           html_url = excluded.html_url,
           labels_json = excluded.labels_json,
           assignees_json = excluded.assignees_json,
           raw_json = excluded.raw_json,
           content_hash = excluded.content_hash,
           is_draft = excluded.is_draft,
           created_at_gh = excluded.created_at_gh,
           updated_at_gh = excluded.updated_at_gh,
           closed_at_gh = excluded.closed_at_gh,
           merged_at_gh = excluded.merged_at_gh,
           last_pulled_at = excluded.last_pulled_at,
           updated_at = excluded.updated_at`,
      )
      // NOTE: positional argument order must exactly match the 22-column
      // list in the insert above.
      .run(
        repoId,
        String(payload.id),
        Number(payload.number),
        kind,
        String(payload.state ?? 'open'),
        title,
        body,
        userLogin(payload),
        userType(payload),
        String(payload.html_url),
        asJson(labels),
        asJson(assignees),
        asJson(payload),
        contentHash,
        payload.draft ? 1 : 0,
        typeof payload.created_at === 'string' ? payload.created_at : null,
        typeof payload.updated_at === 'string' ? payload.updated_at : null,
        typeof payload.closed_at === 'string' ? payload.closed_at : null,
        typeof payload.merged_at === 'string' ? payload.merged_at : null,
        pulledAt,
        pulledAt,
        nowIso(),
      );
    // Re-select the id: lastInsertRowid is unreliable on the update path.
    const row = this.db
      .prepare('select id from threads where repo_id = ? and kind = ? and number = ?')
      .get(repoId, kind, Number(payload.number)) as { id: number };
    return row.id;
  }
1858
-
1859
  /**
   * Reconciles locally-open threads that a full open-state crawl did not see.
   *
   * Any thread still marked open whose last_pulled_at predates the crawl
   * start is re-fetched individually from GitHub. Threads confirmed (or
   * discovered missing — deleted/transferred) are marked closed. Fetching is
   * throttled in batches to stay friendly to the GitHub rate limit.
   *
   * @returns the number of threads transitioned out of the open state
   */
  private async reconcileMissingOpenThreads(params: {
    repoId: number;
    owner: string;
    repo: string;
    crawlStartedAt: string;
    reporter?: (message: string) => void;
    onProgress?: (message: string) => void;
  }): Promise<number> {
    const github = this.requireGithub();
    // Open threads not touched since the crawl began — i.e. candidates that
    // the open crawl should have seen but did not.
    const staleRows = this.db
      .prepare(
        `select id, number, kind
         from threads
         where repo_id = ?
           and state = 'open'
           and (last_pulled_at is null or last_pulled_at < ?)
         order by number asc`,
      )
      .all(params.repoId, params.crawlStartedAt) as Array<{ id: number; number: number; kind: 'issue' | 'pull_request' }>;

    if (staleRows.length === 0) {
      return 0;
    }

    params.onProgress?.(
      `[sync] reconciling ${staleRows.length} previously-open thread(s) not seen in the open crawl`,
    );

    let threadsClosed = 0;
    for (const [index, row] of staleRows.entries()) {
      // Pause between batches to avoid hammering the GitHub API.
      if (index > 0 && index % SYNC_BATCH_SIZE === 0) {
        params.onProgress?.(`[sync] stale reconciliation batch boundary reached at ${index} threads; sleeping 5s before continuing`);
        await new Promise((resolve) => setTimeout(resolve, SYNC_BATCH_DELAY_MS));
      }
      params.onProgress?.(`[sync] reconciling stale ${row.kind} #${row.number}`);
      const pulledAt = nowIso();
      let payload: Record<string, unknown> | null = null;
      // Default to 'closed': a thread missing on GitHub is treated as closed.
      let state = 'closed';

      try {
        payload =
          row.kind === 'pull_request'
            ? await github.getPull(params.owner, params.repo, row.number, params.reporter)
            : await github.getIssue(params.owner, params.repo, row.number, params.reporter);
        state = String(payload.state ?? 'open');
      } catch (error) {
        // Only a 404-style "resource missing" error is tolerated; anything
        // else (auth, rate limit, network) aborts the reconciliation.
        if (!isMissingGitHubResourceError(error)) {
          throw error;
        }
        params.onProgress?.(
          `[sync] stale ${row.kind} #${row.number} is missing on GitHub; marking it closed locally and continuing`,
        );
      }

      if (payload) {
        // Thread still exists: record GitHub's current state and timestamps.
        this.db
          .prepare(
            `update threads
             set state = ?,
                 raw_json = ?,
                 updated_at_gh = ?,
                 closed_at_gh = ?,
                 merged_at_gh = ?,
                 last_pulled_at = ?,
                 updated_at = ?
             where id = ?`,
          )
          .run(
            state,
            asJson(payload),
            typeof payload.updated_at === 'string' ? payload.updated_at : null,
            typeof payload.closed_at === 'string' ? payload.closed_at : null,
            typeof payload.merged_at === 'string' ? payload.merged_at : null,
            pulledAt,
            pulledAt,
            row.id,
          );
      } else {
        // Thread vanished from GitHub: close it locally, keeping any known
        // close timestamp and otherwise stamping the reconciliation time.
        this.db
          .prepare(
            `update threads
             set state = 'closed',
                 closed_at_gh = coalesce(closed_at_gh, ?),
                 last_pulled_at = ?,
                 updated_at = ?
             where id = ?`,
          )
          .run(pulledAt, pulledAt, pulledAt, row.id);
      }

      if (state !== 'open') {
        threadsClosed += 1;
      }
    }

    if (threadsClosed > 0) {
      params.onProgress?.(`[sync] marked ${threadsClosed} stale thread(s) as closed after GitHub confirmation`);
    }

    return threadsClosed;
  }
1960
-
1961
- private replaceComments(threadId: number, comments: CommentSeed[]): void {
1962
- const insert = this.db.prepare(
1963
- `insert into comments (
1964
- thread_id, github_id, comment_type, author_login, author_type, body, is_bot, raw_json, created_at_gh, updated_at_gh
1965
- ) values (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`,
1966
- );
1967
- const tx = this.db.transaction((commentRows: CommentSeed[]) => {
1968
- this.db.prepare('delete from comments where thread_id = ?').run(threadId);
1969
- for (const comment of commentRows) {
1970
- insert.run(
1971
- threadId,
1972
- comment.githubId,
1973
- comment.commentType,
1974
- comment.authorLogin,
1975
- comment.authorType,
1976
- comment.body,
1977
- comment.isBot ? 1 : 0,
1978
- comment.rawJson,
1979
- comment.createdAtGh,
1980
- comment.updatedAtGh,
1981
- );
1982
- }
1983
- });
1984
- tx(comments);
1985
- }
1986
-
1987
  /**
   * Rebuilds the canonical document for a thread from its current title,
   * body, labels, and comments, upserts it into `documents`, and refreshes
   * the thread's content_hash to the canonical-document hash.
   */
  private refreshDocument(threadId: number): void {
    const thread = this.db.prepare('select * from threads where id = ?').get(threadId) as ThreadRow;
    // Comments in chronological order; fall back to updated_at_gh when
    // created_at_gh is missing (e.g. reviews only carry submitted_at).
    const comments = this.db
      .prepare(
        'select body, author_login, author_type, is_bot from comments where thread_id = ? order by coalesce(created_at_gh, updated_at_gh) asc, id asc',
      )
      .all(threadId) as Array<{ body: string; author_login: string | null; author_type: string | null; is_bot: number }>;

    const canonical = buildCanonicalDocument({
      title: thread.title,
      body: thread.body,
      labels: parseArray(thread.labels_json),
      comments: comments.map((comment) => ({
        body: comment.body,
        authorLogin: comment.author_login,
        authorType: comment.author_type,
        isBot: comment.is_bot === 1,
      })),
    });

    this.db
      .prepare(
        `insert into documents (thread_id, title, body, raw_text, dedupe_text, updated_at)
         values (?, ?, ?, ?, ?, ?)
         on conflict(thread_id) do update set
           title = excluded.title,
           body = excluded.body,
           raw_text = excluded.raw_text,
           dedupe_text = excluded.dedupe_text,
           updated_at = excluded.updated_at`,
      )
      .run(threadId, thread.title, thread.body, canonical.rawText, canonical.dedupeText, nowIso());

    // Supersede the cheap title+body hash written by upsertThread with the
    // canonical-document hash.
    this.db.prepare('update threads set content_hash = ?, updated_at = ? where id = ?').run(canonical.contentHash, nowIso(), threadId);
  }
2022
-
2023
  /**
   * Assembles the text fed to the summarization model for one thread, plus a
   * content hash used to skip re-summarizing unchanged input.
   *
   * The source always includes title, body (if any), and labels (if any).
   * When includeComments is true, human (non-bot) comments are appended in
   * chronological order as a "discussion" section. The hash is prefixed with
   * the mode so toggling includeComments invalidates cached summaries.
   */
  private buildSummarySource(
    threadId: number,
    title: string,
    body: string | null,
    labels: string[],
    includeComments: boolean,
  ): { summaryInput: string; summaryContentHash: string } {
    const parts = [`title: ${normalizeSummaryText(title)}`];
    const normalizedBody = normalizeSummaryText(body ?? '');
    if (normalizedBody) {
      parts.push(`body: ${normalizedBody}`);
    }
    if (labels.length > 0) {
      parts.push(`labels: ${labels.join(', ')}`);
    }

    if (includeComments) {
      const comments = this.db
        .prepare(
          `select body, author_login, author_type, is_bot
           from comments
           where thread_id = ?
           order by coalesce(created_at_gh, updated_at_gh) asc, id asc`,
        )
        .all(threadId) as Array<{ body: string; author_login: string | null; author_type: string | null; is_bot: number }>;

      // Keep only human comments and render each as "@login: text",
      // dropping comments that normalize to empty.
      const humanComments = comments
        .filter((comment) =>
          !isBotLikeAuthor({
            authorLogin: comment.author_login,
            authorType: comment.author_type,
            isBot: comment.is_bot === 1,
          }),
        )
        .map((comment) => {
          const author = comment.author_login ? `@${comment.author_login}` : 'unknown';
          const normalized = normalizeSummaryText(comment.body);
          return normalized ? `${author}: ${normalized}` : '';
        })
        .filter(Boolean);

      if (humanComments.length > 0) {
        parts.push(`discussion:\n${humanComments.join('\n')}`);
      }
    }

    const summaryInput = parts.join('\n\n');
    // Mode prefix ensures with-comments and metadata-only summaries never
    // share a cache entry.
    const summaryContentHash = stableContentHash(`summary:${includeComments ? 'with-comments' : 'metadata-only'}\n${summaryInput}`);
    return { summaryInput, summaryContentHash };
  }
2073
-
2074
- private buildEmbeddingTasks(params: {
2075
- threadId: number;
2076
- threadNumber: number;
2077
- title: string;
2078
- body: string | null;
2079
- dedupeSummary: string | null;
2080
- }): EmbeddingTask[] {
2081
- const tasks: EmbeddingTask[] = [];
2082
- const titleText = this.prepareEmbeddingText(normalizeSummaryText(params.title), EMBED_MAX_ITEM_TOKENS);
2083
- if (titleText) {
2084
- tasks.push({
2085
- threadId: params.threadId,
2086
- threadNumber: params.threadNumber,
2087
- sourceKind: 'title',
2088
- text: titleText.text,
2089
- contentHash: stableContentHash(`embedding:title\n${titleText.text}`),
2090
- estimatedTokens: titleText.estimatedTokens,
2091
- wasTruncated: titleText.wasTruncated,
2092
- });
2093
- }
2094
-
2095
- const bodyText = this.prepareEmbeddingText(normalizeSummaryText(params.body ?? ''), EMBED_MAX_ITEM_TOKENS);
2096
- if (bodyText) {
2097
- tasks.push({
2098
- threadId: params.threadId,
2099
- threadNumber: params.threadNumber,
2100
- sourceKind: 'body',
2101
- text: bodyText.text,
2102
- contentHash: stableContentHash(`embedding:body\n${bodyText.text}`),
2103
- estimatedTokens: bodyText.estimatedTokens,
2104
- wasTruncated: bodyText.wasTruncated,
2105
- });
2106
- }
2107
-
2108
- const summaryText = this.prepareEmbeddingText(normalizeSummaryText(params.dedupeSummary ?? ''), EMBED_MAX_ITEM_TOKENS);
2109
- if (summaryText) {
2110
- tasks.push({
2111
- threadId: params.threadId,
2112
- threadNumber: params.threadNumber,
2113
- sourceKind: 'dedupe_summary',
2114
- text: summaryText.text,
2115
- contentHash: stableContentHash(`embedding:dedupe_summary\n${summaryText.text}`),
2116
- estimatedTokens: summaryText.estimatedTokens,
2117
- wasTruncated: summaryText.wasTruncated,
2118
- });
2119
- }
2120
-
2121
- return tasks;
2122
- }
2123
-
2124
- private prepareEmbeddingText(
2125
- text: string,
2126
- maxEstimatedTokens: number,
2127
- ): { text: string; estimatedTokens: number; wasTruncated: boolean } | null {
2128
- if (!text) {
2129
- return null;
2130
- }
2131
-
2132
- const maxChars = maxEstimatedTokens * EMBED_ESTIMATED_CHARS_PER_TOKEN;
2133
- const wasTruncated = text.length > maxChars;
2134
- const prepared = wasTruncated
2135
- ? `${text.slice(0, Math.max(0, maxChars - EMBED_TRUNCATION_MARKER.length)).trimEnd()}${EMBED_TRUNCATION_MARKER}`
2136
- : text;
2137
- return {
2138
- text: prepared,
2139
- estimatedTokens: this.estimateEmbeddingTokens(prepared),
2140
- wasTruncated,
2141
- };
2142
- }
2143
-
2144
- private estimateEmbeddingTokens(text: string): number {
2145
- return Math.max(1, Math.ceil(text.length / EMBED_ESTIMATED_CHARS_PER_TOKEN));
2146
- }
2147
-
2148
- private isEmbeddingContextError(error: unknown): boolean {
2149
- const message = error instanceof Error ? error.message : String(error);
2150
- return /maximum context length/i.test(message) || /requested \d+ tokens/i.test(message);
2151
- }
2152
-
2153
- private async embedBatchWithRecovery(
2154
- ai: AiProvider,
2155
- batch: EmbeddingTask[],
2156
- onProgress?: (message: string) => void,
2157
- ): Promise<Array<{ task: EmbeddingTask; embedding: number[] }>> {
2158
- try {
2159
- const embeddings = await ai.embedTexts({
2160
- model: this.config.embedModel,
2161
- texts: batch.map((task) => task.text),
2162
- });
2163
- return batch.map((task, index) => ({ task, embedding: embeddings[index] }));
2164
- } catch (error) {
2165
- if (!this.isEmbeddingContextError(error) || batch.length === 1) {
2166
- if (batch.length === 1 && this.isEmbeddingContextError(error)) {
2167
- const recovered = await this.embedSingleTaskWithRecovery(ai, batch[0], onProgress);
2168
- return [recovered];
2169
- }
2170
- throw error;
2171
- }
2172
-
2173
- onProgress?.(
2174
- `[embed] batch context error; isolating ${batch.length} item(s) to find oversized input(s)`,
2175
- );
2176
-
2177
- const recovered: Array<{ task: EmbeddingTask; embedding: number[] }> = [];
2178
- for (const task of batch) {
2179
- recovered.push(await this.embedSingleTaskWithRecovery(ai, task, onProgress));
2180
- }
2181
- return recovered;
2182
- }
2183
- }
2184
-
2185
  /**
   * Embeds one task, repeatedly shrinking its text on context-length errors.
   *
   * Up to four attempts are made, halving the text each time via
   * shrinkEmbeddingTask. Non-context errors are rethrown immediately; if the
   * text cannot be shrunk further (or attempts are exhausted) an error is
   * thrown naming the offending thread/source.
   */
  private async embedSingleTaskWithRecovery(
    ai: AiProvider,
    task: EmbeddingTask,
    onProgress?: (message: string) => void,
  ): Promise<{ task: EmbeddingTask; embedding: number[] }> {
    let current = task;

    for (let attempt = 0; attempt < 4; attempt += 1) {
      try {
        const [embedding] = await ai.embedTexts({
          model: this.config.embedModel,
          texts: [current.text],
        });
        // Return the (possibly shrunk) task so callers persist the hash that
        // matches the text that was actually embedded.
        return { task: current, embedding };
      } catch (error) {
        if (!this.isEmbeddingContextError(error)) {
          throw error;
        }

        const next = this.shrinkEmbeddingTask(current);
        // No further shrinkage possible — surface the original context error.
        if (!next || next.text === current.text) {
          throw error;
        }
        onProgress?.(
          `[embed] shortened #${current.threadNumber}:${current.sourceKind} after context error est_tokens=${current.estimatedTokens}->${next.estimatedTokens}`,
        );
        current = next;
      }
    }

    throw new Error(`Unable to shrink embedding input for #${task.threadNumber}:${task.sourceKind} below model limits`);
  }
2217
-
2218
- private shrinkEmbeddingTask(task: EmbeddingTask): EmbeddingTask | null {
2219
- const withoutMarker = task.text.endsWith(EMBED_TRUNCATION_MARKER)
2220
- ? task.text.slice(0, -EMBED_TRUNCATION_MARKER.length)
2221
- : task.text;
2222
- if (withoutMarker.length < 256) {
2223
- return null;
2224
- }
2225
-
2226
- const nextLength = Math.max(256, Math.floor(withoutMarker.length * 0.5));
2227
- const nextText = `${withoutMarker.slice(0, Math.max(0, nextLength - EMBED_TRUNCATION_MARKER.length)).trimEnd()}${EMBED_TRUNCATION_MARKER}`;
2228
- return {
2229
- ...task,
2230
- text: nextText,
2231
- contentHash: stableContentHash(`embedding:${task.sourceKind}\n${nextText}`),
2232
- estimatedTokens: this.estimateEmbeddingTokens(nextText),
2233
- wasTruncated: true,
2234
- };
2235
- }
2236
-
2237
- private chunkEmbeddingTasks(items: EmbeddingTask[], maxItems: number, maxEstimatedTokens: number): EmbeddingTask[][] {
2238
- const chunks: EmbeddingTask[][] = [];
2239
- let current: EmbeddingTask[] = [];
2240
- let currentEstimatedTokens = 0;
2241
-
2242
- for (const item of items) {
2243
- const wouldExceedItemCount = current.length >= maxItems;
2244
- const wouldExceedTokenBudget = current.length > 0 && currentEstimatedTokens + item.estimatedTokens > maxEstimatedTokens;
2245
- if (wouldExceedItemCount || wouldExceedTokenBudget) {
2246
- chunks.push(current);
2247
- current = [];
2248
- currentEstimatedTokens = 0;
2249
- }
2250
-
2251
- current.push(item);
2252
- currentEstimatedTokens += item.estimatedTokens;
2253
- }
2254
-
2255
- if (current.length > 0) {
2256
- chunks.push(current);
2257
- }
2258
- return chunks;
2259
- }
2260
-
2261
  /**
   * Loads all stored embeddings for a repo's open threads (current embed
   * model only), one row per (thread, source_kind), joined with the thread
   * metadata needed by similarity/clustering consumers. Vectors are returned
   * as raw JSON strings; see loadParsedStoredEmbeddings for parsed vectors.
   */
  private loadStoredEmbeddings(repoId: number): StoredEmbeddingRow[] {
    return this.db
      .prepare(
        `select t.id, t.repo_id, t.number, t.kind, t.state, t.title, t.body, t.author_login, t.html_url, t.labels_json,
                t.updated_at_gh, t.first_pulled_at, t.last_pulled_at, e.source_kind, e.embedding_json
         from threads t
         join document_embeddings e on e.thread_id = t.id
         where t.repo_id = ? and t.state = 'open' and e.model = ?
         order by t.number asc, e.source_kind asc`,
      )
      .all(repoId, this.config.embedModel) as StoredEmbeddingRow[];
  }
2273
-
2274
- private loadParsedStoredEmbeddings(repoId: number): ParsedStoredEmbeddingRow[] {
2275
- const cached = this.parsedEmbeddingCache.get(repoId);
2276
- if (cached) {
2277
- return cached;
2278
- }
2279
-
2280
- const parsed = this.loadStoredEmbeddings(repoId).map((row) => ({
2281
- ...row,
2282
- embedding: JSON.parse(row.embedding_json) as number[],
2283
- }));
2284
- this.parsedEmbeddingCache.set(repoId, parsed);
2285
- return parsed;
2286
- }
2287
-
2288
  /**
   * Computes the embedding workset for a repo (optionally one thread).
   *
   * Builds candidate embedding tasks for every open thread, loads the
   * content hashes of embeddings already stored for the current model, and
   * derives `pending` as the tasks whose hash differs from (or is absent
   * from) what is stored — i.e. the work that still needs embedding.
   */
  private getEmbeddingWorkset(repoId: number, threadNumber?: number): EmbeddingWorkset {
    // Dynamic SQL: the optional thread filter is appended alongside its arg.
    let sql =
      `select t.id, t.number, t.title, t.body
       from threads t
       where t.repo_id = ? and t.state = 'open'`;
    const args: Array<string | number> = [repoId];
    if (threadNumber) {
      sql += ' and t.number = ?';
      args.push(threadNumber);
    }
    sql += ' order by t.number asc';
    const rows = this.db.prepare(sql).all(...args) as Array<{
      id: number;
      number: number;
      title: string;
      body: string | null;
    }>;
    const summaryTexts = this.loadCombinedSummaryTextMap(repoId, threadNumber);
    // One thread fans out into up to three tasks (title/body/dedupe summary).
    const tasks = rows.flatMap((row) =>
      this.buildEmbeddingTasks({
        threadId: row.id,
        threadNumber: row.number,
        title: row.title,
        body: row.body,
        dedupeSummary: summaryTexts.get(row.id) ?? null,
      }),
    );
    // Hashes of embeddings already stored for the current model.
    const existingRows = this.db
      .prepare(
        `select e.thread_id, e.source_kind, e.content_hash
         from document_embeddings e
         join threads t on t.id = e.thread_id
         where t.repo_id = ? and e.model = ?`,
      )
      .all(repoId, this.config.embedModel) as Array<{
      thread_id: number;
      source_kind: EmbeddingSourceKind;
      content_hash: string;
    }>;
    const existing = new Map<string, string>();
    for (const row of existingRows) {
      // Key mirrors the task identity: "<threadId>:<sourceKind>".
      existing.set(`${row.thread_id}:${row.source_kind}`, row.content_hash);
    }
    // A task is pending when no stored hash exists or the stored hash is stale.
    const pending = tasks.filter((task) => existing.get(`${task.threadId}:${task.sourceKind}`) !== task.contentHash);
    return { rows, tasks, existing, pending };
  }
2334
-
2335
  /**
   * Builds a map of thread id -> combined summary text for a repo's open
   * threads (current summary model only, optionally one thread).
   *
   * Per-kind summaries are concatenated in a fixed order (problem, solution,
   * maintainer signal, dedupe) as "kind: text" sections; threads with no
   * summaries are absent from the map.
   */
  private loadCombinedSummaryTextMap(repoId: number, threadNumber?: number): Map<number, string> {
    // Dynamic SQL: the optional thread filter is appended alongside its arg.
    let sql =
      `select s.thread_id, s.summary_kind, s.summary_text
       from document_summaries s
       join threads t on t.id = s.thread_id
       where t.repo_id = ? and t.state = 'open' and s.model = ?`;
    const args: Array<number | string> = [repoId, this.config.summaryModel];
    if (threadNumber) {
      sql += ' and t.number = ?';
      args.push(threadNumber);
    }
    sql += ' order by t.number asc, s.summary_kind asc';

    const rows = this.db.prepare(sql).all(...args) as Array<{
      thread_id: number;
      summary_kind: string;
      summary_text: string;
    }>;
    // Group normalized summary texts by thread, keyed by summary kind.
    const byThread = new Map<number, Map<string, string>>();
    for (const row of rows) {
      const entry = byThread.get(row.thread_id) ?? new Map<string, string>();
      entry.set(row.summary_kind, normalizeSummaryText(row.summary_text));
      byThread.set(row.thread_id, entry);
    }

    const combined = new Map<number, string>();
    // Fixed presentation order, independent of SQL row order.
    const order = ['problem_summary', 'solution_summary', 'maintainer_signal_summary', 'dedupe_summary'];
    for (const [threadId, entry] of byThread.entries()) {
      const parts = order
        .map((summaryKind) => {
          const text = entry.get(summaryKind);
          return text ? `${summaryKind}: ${text}` : '';
        })
        .filter(Boolean);
      if (parts.length > 0) {
        combined.set(threadId, parts.join('\n\n'));
      }
    }
    return combined;
  }
2375
-
2376
- private edgeKey(leftThreadId: number, rightThreadId: number): string {
2377
- const left = Math.min(leftThreadId, rightThreadId);
2378
- const right = Math.max(leftThreadId, rightThreadId);
2379
- return `${left}:${right}`;
2380
- }
2381
-
2382
  /**
   * Aggregates nearest-neighbor similarity edges across all embedding source
   * kinds for a repository.
   *
   * Neighbors are ranked within each source kind separately; edges found in
   * multiple kinds are merged into one entry keeping the maximum score and
   * recording every contributing source kind. Keys are canonical undirected
   * pair keys from edgeKey.
   *
   * @param params.limit max neighbors per item per source kind
   * @param params.minScore minimum cosine score to keep an edge
   */
  private aggregateRepositoryEdges(
    rows: ParsedStoredEmbeddingRow[],
    params: { limit: number; minScore: number },
  ): Map<string, { leftThreadId: number; rightThreadId: number; score: number; sourceKinds: Set<EmbeddingSourceKind> }> {
    // Bucket embeddings by source kind so title vectors are only compared
    // with title vectors, body with body, etc.
    const bySource = new Map<EmbeddingSourceKind, Array<{ id: number; embedding: number[] }>>();
    for (const row of rows) {
      const list = bySource.get(row.source_kind) ?? [];
      list.push({ id: row.id, embedding: row.embedding });
      bySource.set(row.source_kind, list);
    }

    const aggregated = new Map<string, { leftThreadId: number; rightThreadId: number; score: number; sourceKinds: Set<EmbeddingSourceKind> }>();
    for (const [sourceKind, items] of bySource.entries()) {
      for (const item of items) {
        const neighbors = rankNearestNeighbors(items, {
          targetEmbedding: item.embedding,
          limit: params.limit,
          minScore: params.minScore,
          skipId: item.id, // never pair an item with itself
        });
        for (const neighbor of neighbors) {
          const key = this.edgeKey(item.id, neighbor.item.id);
          const existing = aggregated.get(key);
          if (existing) {
            // Same pair seen before (other direction or other source kind):
            // keep the best score and record this kind's contribution.
            existing.score = Math.max(existing.score, neighbor.score);
            existing.sourceKinds.add(sourceKind);
            continue;
          }
          aggregated.set(key, {
            leftThreadId: Math.min(item.id, neighbor.item.id),
            rightThreadId: Math.max(item.id, neighbor.item.id),
            score: neighbor.score,
            sourceKinds: new Set([sourceKind]),
          });
        }
      }
    }

    return aggregated;
  }
2422
-
2423
  /**
   * Inserts or refreshes one generated summary for a thread, keyed by
   * (thread, summary kind, current summary model). On conflict only the
   * hash, text, and updated_at are refreshed; created_at is preserved.
   */
  private upsertSummary(threadId: number, contentHash: string, summaryKind: string, summaryText: string): void {
    this.db
      .prepare(
        `insert into document_summaries (thread_id, summary_kind, model, content_hash, summary_text, created_at, updated_at)
         values (?, ?, ?, ?, ?, ?, ?)
         on conflict(thread_id, summary_kind, model) do update set
           content_hash = excluded.content_hash,
           summary_text = excluded.summary_text,
           updated_at = excluded.updated_at`,
      )
      .run(threadId, summaryKind, this.config.summaryModel, contentHash, summaryText, nowIso(), nowIso());
  }
2435
-
2436
  /**
   * Inserts or refreshes one embedding vector for a thread, keyed by
   * (thread, source kind, current embed model), then invalidates the repo's
   * parsed-embedding cache so subsequent reads see the new vector.
   */
  private upsertEmbedding(threadId: number, sourceKind: EmbeddingSourceKind, contentHash: string, embedding: number[]): void {
    this.db
      .prepare(
        `insert into document_embeddings (thread_id, source_kind, model, dimensions, content_hash, embedding_json, created_at, updated_at)
         values (?, ?, ?, ?, ?, ?, ?, ?)
         on conflict(thread_id, source_kind, model) do update set
           dimensions = excluded.dimensions,
           content_hash = excluded.content_hash,
           embedding_json = excluded.embedding_json,
           updated_at = excluded.updated_at`,
      )
      .run(
        threadId,
        sourceKind,
        this.config.embedModel,
        embedding.length,
        contentHash,
        asJson(embedding),
        nowIso(),
        nowIso(),
      );
    // Bust the parsed-embedding cache for the thread's repo; the cache is
    // keyed by repo id, which must be looked up from the thread.
    const row = this.db.prepare('select repo_id from threads where id = ? limit 1').get(threadId) as { repo_id: number } | undefined;
    if (row) {
      this.parsedEmbeddingCache.delete(row.repo_id);
    }
  }
2462
-
2463
  /**
   * Inserts a 'running' row into one of the run-tracking tables and returns
   * its rowid.
   *
   * NOTE(review): `table` is interpolated into the SQL rather than bound.
   * RunTable appears to be a project-declared closed union of table names
   * (never user input), which would make this safe — confirm its definition.
   */
  private startRun(table: RunTable, repoId: number, scope: string): number {
    const result = this.db
      .prepare(`insert into ${table} (repo_id, scope, status, started_at) values (?, ?, 'running', ?)`)
      .run(repoId, scope, nowIso());
    return Number(result.lastInsertRowid);
  }
2469
-
2470
- private finishRun(
2471
- table: RunTable,
2472
- runId: number,
2473
- status: 'completed' | 'failed',
2474
- stats?: unknown,
2475
- error?: unknown,
2476
- finishedAt = nowIso(),
2477
- ): void {
2478
- this.db
2479
- .prepare(`update ${table} set status = ?, finished_at = ?, stats_json = ?, error_text = ? where id = ?`)
2480
- .run(
2481
- status,
2482
- finishedAt,
2483
- stats === undefined ? null : asJson(stats),
2484
- error instanceof Error ? error.message : error ? String(error) : null,
2485
- runId,
2486
- );
2487
- }
2488
- }
2489
-
2490
- export function parseRepoParams(url: URL): { owner: string; repo: string } {
2491
- const owner = url.searchParams.get('owner');
2492
- const repo = url.searchParams.get('repo');
2493
- if (!owner || !repo) {
2494
- throw new Error('Missing owner or repo query parameter');
2495
- }
2496
- return { owner, repo };
2497
- }