@ghcrawl/api-core 0.1.0 → 0.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +25 -0
- package/dist/api/server.d.ts +4 -0
- package/dist/api/server.d.ts.map +1 -0
- package/dist/api/server.js +142 -0
- package/dist/api/server.js.map +1 -0
- package/dist/cluster/build.d.ts +16 -0
- package/dist/cluster/build.d.ts.map +1 -0
- package/dist/cluster/build.js +62 -0
- package/dist/cluster/build.js.map +1 -0
- package/dist/config.d.ts +83 -0
- package/dist/config.d.ts.map +1 -0
- package/dist/config.js +257 -0
- package/dist/config.js.map +1 -0
- package/dist/db/migrate.d.ts +3 -0
- package/dist/db/migrate.d.ts.map +1 -0
- package/{src/db/migrate.ts → dist/db/migrate.js} +30 -36
- package/dist/db/migrate.js.map +1 -0
- package/dist/db/sqlite.d.ts +4 -0
- package/dist/db/sqlite.d.ts.map +1 -0
- package/dist/db/sqlite.js +11 -0
- package/dist/db/sqlite.js.map +1 -0
- package/dist/documents/normalize.d.ts +23 -0
- package/dist/documents/normalize.d.ts.map +1 -0
- package/dist/documents/normalize.js +36 -0
- package/dist/documents/normalize.js.map +1 -0
- package/dist/github/client.d.ts +24 -0
- package/dist/github/client.d.ts.map +1 -0
- package/dist/github/client.js +170 -0
- package/dist/github/client.js.map +1 -0
- package/dist/index.d.ts +7 -0
- package/dist/index.d.ts.map +1 -0
- package/{src/index.ts → dist/index.js} +1 -0
- package/dist/index.js.map +1 -0
- package/dist/openai/provider.d.ts +44 -0
- package/dist/openai/provider.d.ts.map +1 -0
- package/dist/openai/provider.js +107 -0
- package/dist/openai/provider.js.map +1 -0
- package/dist/search/exact.d.ts +14 -0
- package/dist/search/exact.d.ts.map +1 -0
- package/dist/search/exact.js +26 -0
- package/dist/search/exact.js.map +1 -0
- package/dist/service.d.ts +247 -0
- package/dist/service.d.ts.map +1 -0
- package/dist/service.js +1735 -0
- package/dist/service.js.map +1 -0
- package/package.json +6 -5
- package/src/api/server.test.ts +0 -296
- package/src/api/server.ts +0 -171
- package/src/cluster/build.test.ts +0 -18
- package/src/cluster/build.ts +0 -74
- package/src/config.test.ts +0 -247
- package/src/config.ts +0 -421
- package/src/db/migrate.test.ts +0 -30
- package/src/db/sqlite.ts +0 -14
- package/src/documents/normalize.test.ts +0 -25
- package/src/documents/normalize.ts +0 -52
- package/src/github/client.ts +0 -241
- package/src/openai/provider.ts +0 -141
- package/src/search/exact.test.ts +0 -22
- package/src/search/exact.ts +0 -28
- package/src/service.test.ts +0 -2036
- package/src/service.ts +0 -2497
- package/src/types/better-sqlite3.d.ts +0 -1
package/dist/service.js
ADDED
|
@@ -0,0 +1,1735 @@
|
|
|
1
|
+
import http from 'node:http';
|
|
2
|
+
import crypto from 'node:crypto';
|
|
3
|
+
import { IterableMapper } from '@shutterstock/p-map-iterable';
|
|
4
|
+
import { actionResponseSchema, clusterDetailResponseSchema, clusterResultSchema, clusterSummariesResponseSchema, clustersResponseSchema, embedResultSchema, healthResponseSchema, neighborsResponseSchema, refreshResponseSchema, repositoriesResponseSchema, searchResponseSchema, syncResultSchema, threadsResponseSchema, } from '@ghcrawl/api-contract';
|
|
5
|
+
import { buildClusters } from './cluster/build.js';
|
|
6
|
+
import { ensureRuntimeDirs, isLikelyGitHubToken, isLikelyOpenAiApiKey, loadConfig, requireGithubToken, requireOpenAiKey, } from './config.js';
|
|
7
|
+
import { migrate } from './db/migrate.js';
|
|
8
|
+
import { openDb } from './db/sqlite.js';
|
|
9
|
+
import { buildCanonicalDocument, isBotLikeAuthor } from './documents/normalize.js';
|
|
10
|
+
import { makeGitHubClient } from './github/client.js';
|
|
11
|
+
import { OpenAiProvider } from './openai/provider.js';
|
|
12
|
+
import { cosineSimilarity, rankNearestNeighbors } from './search/exact.js';
|
|
13
|
+
// Sync pacing: after every SYNC_BATCH_SIZE threads, sleep SYNC_BATCH_DELAY_MS
// before continuing (see syncRepository's batch-boundary handling).
const SYNC_BATCH_SIZE = 100;
const SYNC_BATCH_DELAY_MS = 5000;
// Embedding sizing heuristics. Chars-per-token is a rough estimate used when
// budgeting inputs — presumably tuned for English prose; confirm against the
// chunking code in getEmbeddingWorkset/chunkEmbeddingTasks.
const EMBED_ESTIMATED_CHARS_PER_TOKEN = 3;
// Per-item and per-batch token ceilings enforced when building embedding batches.
const EMBED_MAX_ITEM_TOKENS = 7000;
const EMBED_MAX_BATCH_TOKENS = 250000;
// Marker appended to embedding inputs that were cut down to fit the item budget.
const EMBED_TRUNCATION_MARKER = '\n\n[truncated for embedding]';
|
|
19
|
+
/** Returns the current wall-clock time as an ISO-8601 UTC string. */
function nowIso() {
    const timestamp = new Date();
    return timestamp.toISOString();
}
|
|
22
|
+
/**
 * Parses an ISO timestamp string into epoch milliseconds.
 * Returns null for empty/nullish input or an unparseable string.
 */
function parseIso(value) {
    if (!value) {
        return null;
    }
    const epochMs = Date.parse(value);
    if (Number.isNaN(epochMs)) {
        return null;
    }
    return epochMs;
}
|
|
28
|
+
/**
 * Heuristically detects a "resource is gone" GitHub API failure.
 * True when the error carries HTTP status 404/410, or when its message
 * mentions those codes or the phrases "Not Found"/"Gone".
 */
function isMissingGitHubResourceError(error) {
    // Prefer the structured status when the error object exposes one.
    if (typeof error?.status === 'number') {
        const status = Number(error.status);
        if (status === 404 || status === 410) {
            return true;
        }
    }
    // Fall back to scanning the message text for the same signals.
    const text = error instanceof Error ? error.message : String(error);
    if (/\b(404|410)\b/.test(text)) {
        return true;
    }
    return /Not Found|Gone/i.test(text);
}
|
|
36
|
+
/**
 * Derives a `since` window for an incremental sync: look back from the crawl
 * start by the reference-to-crawl gap rounded UP to whole hours, with a
 * minimum of 2 hours of overlap. Unparseable timestamps fall back to "now".
 */
function deriveIncrementalSince(referenceAt, crawlStartedAt) {
    // Inlined ISO parsing: empty/invalid values default to the current time.
    const toEpochMs = (value) => {
        if (!value) {
            return Date.now();
        }
        const parsed = Date.parse(value);
        return Number.isNaN(parsed) ? Date.now() : parsed;
    };
    const referenceMs = toEpochMs(referenceAt);
    const crawlMs = toEpochMs(crawlStartedAt);
    const hourMs = 3600 * 1000;
    const gapHours = Math.ceil(Math.max(0, crawlMs - referenceMs) / hourMs);
    // Never look back less than two hours, so consecutive scans overlap.
    const lookbackHours = Math.max(2, gapHours);
    return new Date(crawlMs - lookbackHours * hourMs).toISOString();
}
|
|
44
|
+
/**
 * Decodes the stats JSON persisted on a sync run row into a normalized
 * object. Returns null for missing input, malformed JSON, or a payload
 * whose `crawlStartedAt` is not a string (the one required field).
 */
function parseSyncRunStats(statsJson) {
    if (!statsJson) {
        return null;
    }
    let raw;
    try {
        raw = JSON.parse(statsJson);
    }
    catch {
        return null;
    }
    if (raw == null || typeof raw.crawlStartedAt !== 'string') {
        return null;
    }
    // Field-level normalizers: wrong-typed values collapse to safe defaults.
    const countOf = (value) => (typeof value === 'number' ? value : 0);
    const stringOrNull = (value) => (typeof value === 'string' ? value : null);
    return {
        threadsSynced: countOf(raw.threadsSynced),
        commentsSynced: countOf(raw.commentsSynced),
        threadsClosed: countOf(raw.threadsClosed),
        crawlStartedAt: raw.crawlStartedAt,
        requestedSince: stringOrNull(raw.requestedSince),
        effectiveSince: stringOrNull(raw.effectiveSince),
        limit: typeof raw.limit === 'number' ? raw.limit : null,
        includeComments: raw.includeComments === true,
        isFullOpenScan: raw.isFullOpenScan === true,
        isOverlappingOpenScan: raw.isOverlappingOpenScan === true,
        overlapReferenceAt: stringOrNull(raw.overlapReferenceAt),
        reconciledOpenCloseAt: stringOrNull(raw.reconciledOpenCloseAt),
    };
}
|
|
71
|
+
/** Serializes a value to JSON, mapping null/undefined to the string 'null'. */
function asJson(value) {
    const normalized = value === undefined || value === null ? null : value;
    return JSON.stringify(normalized);
}
|
|
74
|
+
/**
 * Parses a stored JSON column that is expected to hold an array.
 *
 * Hardened against bad rows: the original `JSON.parse(value)` threw on
 * null/undefined/malformed input and could return a non-array (e.g. for
 * 'null' or '{"a":1}'), which would break every caller that immediately
 * `.map()`s the result. Any non-array or unparseable input now yields [].
 */
function parseArray(value) {
    if (!value) {
        return [];
    }
    try {
        const parsed = JSON.parse(value);
        return Array.isArray(parsed) ? parsed : [];
    }
    catch {
        // Corrupt column contents — treat as "no items" rather than crash.
        return [];
    }
}
|
|
77
|
+
/** Extracts `payload.user.login` when it is a string; otherwise null. */
function userLogin(payload) {
    const candidate = payload.user?.login;
    if (typeof candidate !== 'string') {
        return null;
    }
    return candidate;
}
|
|
82
|
+
/** Extracts `payload.user.type` when it is a string; otherwise null. */
function userType(payload) {
    const candidate = payload.user?.type;
    if (typeof candidate !== 'string') {
        return null;
    }
    return candidate;
}
|
|
87
|
+
/** True when the issue payload carries a `pull_request` stub (i.e. it is a PR). */
function isPullRequestPayload(payload) {
    return !!payload.pull_request;
}
|
|
90
|
+
/**
 * Normalizes GitHub label payloads (strings or `{ name }` objects) into a
 * list of non-empty label names. Non-array input yields [].
 */
function parseLabels(payload) {
    const raw = payload.labels;
    if (!Array.isArray(raw)) {
        return [];
    }
    const names = [];
    for (const entry of raw) {
        if (typeof entry === 'string') {
            // Empty strings are dropped, matching the truthiness filter.
            if (entry) {
                names.push(entry);
            }
            continue;
        }
        if (entry && typeof entry === 'object' && typeof entry.name === 'string' && entry.name) {
            names.push(String(entry.name));
        }
    }
    return names;
}
|
|
105
|
+
/**
 * Normalizes GitHub assignee payloads into a list of non-empty login
 * strings. Non-array input yields [].
 */
function parseAssignees(payload) {
    const raw = payload.assignees;
    if (!Array.isArray(raw)) {
        return [];
    }
    const logins = [];
    for (const entry of raw) {
        if (entry && typeof entry === 'object' && typeof entry.login === 'string' && entry.login) {
            logins.push(String(entry.login));
        }
    }
    return logins;
}
|
|
118
|
+
/** Deterministic SHA-256 hex digest of the input, used for change detection. */
function stableContentHash(input) {
    const hasher = crypto.createHash('sha256');
    hasher.update(input);
    return hasher.digest('hex');
}
|
|
121
|
+
/** Collapses all whitespace (including carriage returns) to single spaces and trims. */
function normalizeSummaryText(value) {
    const withoutCarriageReturns = value.replace(/\r/g, '\n');
    const collapsed = withoutCarriageReturns.replace(/\s+/g, ' ');
    return collapsed.trim();
}
|
|
124
|
+
/**
 * Produces a single-line snippet of at most `maxChars` characters.
 * Whitespace is collapsed; overlong text is cut and suffixed with an
 * ellipsis. Returns null for empty/nullish or whitespace-only input.
 */
function snippetText(value, maxChars) {
    if (!value) {
        return null;
    }
    const collapsed = value.replace(/\s+/g, ' ').trim();
    if (collapsed === '') {
        return null;
    }
    if (collapsed.length > maxChars) {
        // Reserve one character for the ellipsis itself.
        const cut = collapsed.slice(0, Math.max(0, maxChars - 1)).trimEnd();
        return `${cut}…`;
    }
    return collapsed;
}
|
|
134
|
+
/** Maps a `repositories` DB row to the API repository DTO shape. */
function repositoryToDto(row) {
    // github_repo_id is nullable; everything else is coerced to its DTO type.
    const githubRepoId = row.github_repo_id === null ? null : String(row.github_repo_id);
    return {
        id: Number(row.id),
        owner: String(row.owner),
        name: String(row.name),
        fullName: String(row.full_name),
        githubRepoId,
        updatedAt: String(row.updated_at),
    };
}
|
|
144
|
+
/**
 * Maps a `threads` DB row to the API thread DTO, attaching an optional
 * cluster id (null when the thread belongs to no cluster).
 */
function threadToDto(row, clusterId) {
    const dto = {
        id: row.id,
        repoId: row.repo_id,
        number: row.number,
        kind: row.kind,
        state: row.state,
        title: row.title,
        body: row.body,
        authorLogin: row.author_login,
        htmlUrl: row.html_url,
        // Stored as a JSON string column; decode into a real array.
        labels: parseArray(row.labels_json),
        updatedAtGh: row.updated_at_gh,
        clusterId: clusterId == null ? null : clusterId,
    };
    return dto;
}
|
|
160
|
+
export class GHCrawlService {
|
|
161
|
+
    // Resolved runtime configuration (paths, port, model names, credentials).
    config;
    // SQLite handle opened from config.dbPath (or injected for tests).
    db;
    // GitHub client; left undefined when no token is configured.
    github;
    // OpenAI provider; left undefined when no API key is configured.
    ai;
    // Cache of stored embeddings already parsed from their serialized form.
    parsedEmbeddingCache = new Map();
    /**
     * Builds the service: loads configuration, ensures runtime directories
     * exist, opens and migrates the database, and wires the optional
     * GitHub/OpenAI clients. Every dependency can be injected via `options`
     * (config, db, github, ai), which overrides the default construction.
     */
    constructor(options = {}) {
        this.config = options.config ?? loadConfig();
        ensureRuntimeDirs(this.config);
        this.db = options.db ?? openDb(this.config.dbPath);
        migrate(this.db);
        // Clients stay undefined when the corresponding credential is absent;
        // commands that need them go through requireGithub()/requireAi().
        this.github = options.github ?? (this.config.githubToken ? makeGitHubClient({ token: this.config.githubToken }) : undefined);
        this.ai = options.ai ?? (this.config.openaiApiKey ? new OpenAiProvider(this.config.openaiApiKey) : undefined);
    }
|
|
174
|
+
    /**
     * Releases held resources: clears the in-memory embedding cache and
     * closes the SQLite handle. The instance must not be used afterwards.
     */
    close() {
        this.parsedEmbeddingCache.clear();
        this.db.close();
    }
|
|
178
|
+
    /**
     * Idempotently prepares the runtime (directories + schema migrations)
     * and returns a health snapshot validated by healthResponseSchema.
     */
    init() {
        ensureRuntimeDirs(this.config);
        migrate(this.db);
        const response = {
            ok: true,
            configPath: this.config.configPath,
            configFileExists: this.config.configFileExists,
            dbPath: this.config.dbPath,
            apiPort: this.config.apiPort,
            githubConfigured: Boolean(this.config.githubToken),
            openaiConfigured: Boolean(this.config.openaiApiKey),
        };
        return healthResponseSchema.parse(response);
    }
|
|
192
|
+
    /**
     * Diagnostic report: runs init(), then checks each credential in three
     * stages — configured at all, plausible format, and live auth probe.
     * Returns `{ health, github, openai }`; never throws for auth failures
     * (they are captured into the per-service `error` field).
     */
    async doctor() {
        const health = this.init();
        const github = {
            configured: Boolean(this.config.githubToken),
            source: this.config.githubTokenSource,
            formatOk: this.config.githubToken ? isLikelyGitHubToken(this.config.githubToken) : false,
            authOk: false,
            error: null,
        };
        const openai = {
            configured: Boolean(this.config.openaiApiKey),
            source: this.config.openaiApiKeySource,
            formatOk: this.config.openaiApiKey ? isLikelyOpenAiApiKey(this.config.openaiApiKey) : false,
            authOk: false,
            error: null,
        };
        // When secrets are expected from a 1Password wrapper but missing,
        // explain how to supply them instead of reporting a bare failure.
        if (!github.configured && this.config.secretProvider === 'op' && this.config.opVaultName && this.config.opItemName) {
            github.error = `Configured for 1Password CLI via ${this.config.opVaultName}/${this.config.opItemName}; run ghcrawl through your op wrapper so GITHUB_TOKEN is present in the environment.`;
        }
        if (!openai.configured && this.config.secretProvider === 'op' && this.config.opVaultName && this.config.opItemName) {
            openai.error = `Configured for 1Password CLI via ${this.config.opVaultName}/${this.config.opItemName}; run ghcrawl through your op wrapper so OPENAI_API_KEY is present in the environment.`;
        }
        if (github.configured) {
            if (!github.formatOk) {
                github.error = 'Token format does not look like a GitHub personal access token.';
            }
            else {
                // Live probe: only attempted when the format already looks valid.
                try {
                    await this.requireGithub().checkAuth();
                    github.authOk = true;
                }
                catch (error) {
                    github.error = error instanceof Error ? error.message : String(error);
                }
            }
        }
        if (openai.configured) {
            if (!openai.formatOk) {
                openai.error = 'Key format does not look like an OpenAI API key.';
            }
            else {
                try {
                    await this.requireAi().checkAuth();
                    openai.authOk = true;
                }
                catch (error) {
                    openai.error = error instanceof Error ? error.message : String(error);
                }
            }
        }
        return { health, github, openai };
    }
|
|
244
|
+
    /** Lists every tracked repository, ordered by full name ascending. */
    listRepositories() {
        const rows = this.db.prepare('select * from repositories order by full_name asc').all();
        return repositoriesResponseSchema.parse({ repositories: rows.map(repositoryToDto) });
    }
|
|
248
|
+
    /**
     * Lists the OPEN threads of a repository (optionally filtered by kind),
     * newest GitHub update first, each annotated with the cluster id from
     * the repository's most recent completed cluster run (or null).
     */
    listThreads(params) {
        const repository = this.requireRepository(params.owner, params.repo);
        // Map thread_id -> cluster_id using only the latest completed run.
        const clusterIds = new Map();
        const clusterRows = this.db
            .prepare(`select cm.thread_id, cm.cluster_id
from cluster_members cm
join clusters c on c.id = cm.cluster_id
where c.repo_id = ? and c.cluster_run_id = (
select id from cluster_runs where repo_id = ? and status = 'completed' order by id desc limit 1
)`)
            .all(repository.id, repository.id);
        for (const row of clusterRows)
            clusterIds.set(row.thread_id, row.cluster_id);
        // Build the thread query incrementally; only the optional kind
        // filter is appended, and always via a bound parameter.
        let sql = "select * from threads where repo_id = ? and state = 'open'";
        const args = [repository.id];
        if (params.kind) {
            sql += ' and kind = ?';
            args.push(params.kind);
        }
        sql += ' order by updated_at_gh desc, number desc';
        const rows = this.db.prepare(sql).all(...args);
        return threadsResponseSchema.parse({
            repository,
            threads: rows.map((row) => threadToDto(row, clusterIds.get(row.id) ?? null)),
        });
    }
|
|
274
|
+
    /**
     * Crawls a repository's issues/PRs into the local database.
     *
     * Scan modes (decided from params + the stored sync cursor):
     *  - full open scan: no limit/since and no prior cursor — everything;
     *  - overlapping open scan: no limit/since but a prior cursor exists —
     *    a derived `since` window that overlaps the previous scan;
     *  - bounded scan: explicit `since` and/or `limit` — no reconciliation.
     * Only full/overlapping scans reconcile threads that are still marked
     * open locally but were not seen (reconcileMissingOpenThreads).
     *
     * Records progress via params.onProgress, persists run bookkeeping in
     * sync_runs, updates the cursor state, and returns a validated
     * `{ runId, threadsSynced, commentsSynced, threadsClosed }` result.
     * On any failure the run is marked failed and the error rethrown.
     */
    async syncRepository(params) {
        const crawlStartedAt = params.startedAt ?? nowIso();
        const includeComments = params.includeComments ?? false;
        const github = this.requireGithub();
        params.onProgress?.(`[sync] fetching repository metadata for ${params.owner}/${params.repo}`);
        // Re-tag client progress lines so they read as sync sub-steps.
        const reporter = params.onProgress ? (message) => params.onProgress?.(message.replace(/^\[github\]/, '[sync/github]')) : undefined;
        const repoData = await github.getRepo(params.owner, params.repo, reporter);
        const repoId = this.upsertRepository(params.owner, params.repo, repoData);
        const runId = this.startRun('sync_runs', repoId, `${params.owner}/${params.repo}`);
        const syncCursor = this.getSyncCursorState(repoId);
        const overlapReferenceAt = syncCursor.lastOverlappingOpenScanCompletedAt ?? syncCursor.lastFullOpenScanStartedAt;
        // An explicit `since` wins; otherwise derive a window from the cursor
        // (only for unlimited scans).
        const effectiveSince = params.since ??
            (params.limit === undefined && overlapReferenceAt ? deriveIncrementalSince(overlapReferenceAt, crawlStartedAt) : undefined);
        const isFullOpenScan = params.limit === undefined && params.since === undefined && overlapReferenceAt === null;
        // Overlapping iff the derived window reaches back to (or before) the
        // overlap reference point, so no update can slip between scans.
        const isOverlappingOpenScan = params.limit === undefined &&
            overlapReferenceAt !== null &&
            effectiveSince !== undefined &&
            (parseIso(effectiveSince) ?? Number.POSITIVE_INFINITY) <= (parseIso(overlapReferenceAt) ?? Number.NEGATIVE_INFINITY);
        try {
            params.onProgress?.(`[sync] listing issues and pull requests for ${params.owner}/${params.repo}`);
            params.onProgress?.(includeComments
                ? '[sync] comment hydration enabled; fetching issue comments, reviews, and review comments'
                : '[sync] metadata-only mode; skipping comment, review, and review-comment fetches');
            if (isFullOpenScan) {
                params.onProgress?.('[sync] full open scan; no prior completed overlap/full cursor was found for this repository');
            }
            else if (params.since === undefined && effectiveSince && overlapReferenceAt) {
                params.onProgress?.(`[sync] derived incremental window since=${effectiveSince} from overlap reference ${overlapReferenceAt}`);
            }
            else if (params.since !== undefined) {
                params.onProgress?.(`[sync] using requested since=${params.since}`);
            }
            const items = await github.listRepositoryIssues(params.owner, params.repo, effectiveSince, params.limit, reporter);
            params.onProgress?.(`[sync] discovered ${items.length} threads to process`);
            let threadsSynced = 0;
            let commentsSynced = 0;
            for (const [index, item] of items.entries()) {
                // Rate-limit friendliness: pause at every batch boundary.
                if (index > 0 && index % SYNC_BATCH_SIZE === 0) {
                    params.onProgress?.(`[sync] batch boundary reached at ${index} threads; sleeping 5s before continuing`);
                    await new Promise((resolve) => setTimeout(resolve, SYNC_BATCH_DELAY_MS));
                }
                const number = Number(item.number);
                const isPr = isPullRequestPayload(item);
                const kind = isPr ? 'pull_request' : 'issue';
                params.onProgress?.(`[sync] ${index + 1}/${items.length} ${kind} #${number}`);
                try {
                    // PRs need a dedicated fetch for full detail; issues come
                    // complete from the listing payload.
                    const threadPayload = isPr ? await github.getPull(params.owner, params.repo, number, reporter) : item;
                    const threadId = this.upsertThread(repoId, kind, threadPayload, crawlStartedAt);
                    if (includeComments) {
                        const comments = await this.fetchThreadComments(params.owner, params.repo, number, isPr, reporter);
                        this.replaceComments(threadId, comments);
                        commentsSynced += comments.length;
                    }
                    this.refreshDocument(threadId);
                    threadsSynced += 1;
                }
                catch (error) {
                    // Fail the whole run with context about which thread broke.
                    const message = error instanceof Error ? error.message : String(error);
                    throw new Error(`sync failed while processing ${kind} #${number}: ${message}`);
                }
            }
            const shouldReconcileMissingOpenThreads = params.limit === undefined && (isFullOpenScan || isOverlappingOpenScan);
            if (!shouldReconcileMissingOpenThreads) {
                params.onProgress?.('[sync] skipping stale-open reconciliation because this scan did not overlap a confirmed full/overlap cursor');
            }
            const threadsClosed = shouldReconcileMissingOpenThreads
                ? await this.reconcileMissingOpenThreads({
                    repoId,
                    owner: params.owner,
                    repo: params.repo,
                    crawlStartedAt,
                    reporter,
                    onProgress: params.onProgress,
                })
                : 0;
            const finishedAt = nowIso();
            const reconciledOpenCloseAt = shouldReconcileMissingOpenThreads ? finishedAt : null;
            // Advance only the cursor field matching this scan's mode; keep
            // the others from the previous state.
            const nextSyncCursor = {
                lastFullOpenScanStartedAt: isFullOpenScan ? crawlStartedAt : syncCursor.lastFullOpenScanStartedAt,
                lastOverlappingOpenScanCompletedAt: isOverlappingOpenScan ? finishedAt : syncCursor.lastOverlappingOpenScanCompletedAt,
                lastNonOverlappingScanCompletedAt: !isFullOpenScan && !isOverlappingOpenScan ? finishedAt : syncCursor.lastNonOverlappingScanCompletedAt,
                lastReconciledOpenCloseAt: reconciledOpenCloseAt ?? syncCursor.lastReconciledOpenCloseAt,
            };
            this.writeSyncCursorState(repoId, nextSyncCursor);
            this.finishRun('sync_runs', runId, 'completed', {
                threadsSynced,
                commentsSynced,
                threadsClosed,
                crawlStartedAt,
                requestedSince: params.since ?? null,
                effectiveSince: effectiveSince ?? null,
                limit: params.limit ?? null,
                includeComments,
                isFullOpenScan,
                isOverlappingOpenScan,
                overlapReferenceAt,
                reconciledOpenCloseAt,
            }, undefined, finishedAt);
            return syncResultSchema.parse({ runId, threadsSynced, commentsSynced, threadsClosed });
        }
        catch (error) {
            this.finishRun('sync_runs', runId, 'failed', null, error);
            throw error;
        }
    }
|
|
379
|
+
    /**
     * Generates AI summaries for open threads (optionally a single thread).
     * Threads whose dedupe-summary content hash already matches the stored
     * one for the configured model are skipped; each remaining thread gets
     * four summary kinds persisted (problem/solution/maintainer-signal/
     * dedupe). Token usage is accumulated when the provider reports it.
     * Returns `{ runId, summarized, inputTokens, outputTokens, totalTokens }`;
     * on failure the run is marked failed and the error rethrown.
     */
    async summarizeRepository(params) {
        const ai = this.requireAi();
        const repository = this.requireRepository(params.owner, params.repo);
        const runId = this.startRun('summary_runs', repository.id, params.threadNumber ? `thread:${params.threadNumber}` : repository.fullName);
        const includeComments = params.includeComments ?? false;
        try {
            let sql = `select t.id, t.number, t.title, t.body, t.labels_json
from threads t
where t.repo_id = ? and t.state = 'open'`;
            const args = [repository.id];
            if (params.threadNumber) {
                sql += ' and t.number = ?';
                args.push(params.threadNumber);
            }
            sql += ' order by t.number asc';
            const rows = this.db.prepare(sql).all(...args);
            params.onProgress?.(`[summarize] loaded ${rows.length} candidate thread(s) for ${repository.fullName}`);
            params.onProgress?.(includeComments
                ? '[summarize] include-comments enabled; hydrated human comments may be included in the summary input'
                : '[summarize] metadata-only mode; comments are excluded from the summary input');
            // Attach the summary input text + content hash to each row.
            const sources = rows.map((row) => {
                const source = this.buildSummarySource(row.id, row.title, row.body, parseArray(row.labels_json), includeComments);
                return { ...row, ...source };
            });
            // Skip threads whose stored dedupe summary already covers this
            // exact content hash under the current model.
            const pending = sources.filter((row) => {
                const latest = this.db
                    .prepare('select content_hash from document_summaries where thread_id = ? and summary_kind = ? and model = ? limit 1')
                    .get(row.id, 'dedupe_summary', this.config.summaryModel);
                return latest?.content_hash !== row.summaryContentHash;
            });
            params.onProgress?.(`[summarize] pending=${pending.length} skipped=${rows.length - pending.length} model=${this.config.summaryModel}`);
            let summarized = 0;
            let inputTokens = 0;
            let outputTokens = 0;
            let totalTokens = 0;
            for (const [index, row] of pending.entries()) {
                params.onProgress?.(`[summarize] ${index + 1}/${pending.length} thread #${row.number}`);
                const result = await ai.summarizeThread({
                    model: this.config.summaryModel,
                    text: row.summaryInput,
                });
                const summary = result.summary;
                // One provider call yields all four summary kinds; they share
                // the same content hash so staleness is detected uniformly.
                this.upsertSummary(row.id, row.summaryContentHash, 'problem_summary', summary.problemSummary);
                this.upsertSummary(row.id, row.summaryContentHash, 'solution_summary', summary.solutionSummary);
                this.upsertSummary(row.id, row.summaryContentHash, 'maintainer_signal_summary', summary.maintainerSignalSummary);
                this.upsertSummary(row.id, row.summaryContentHash, 'dedupe_summary', summary.dedupeSummary);
                if (result.usage) {
                    inputTokens += result.usage.inputTokens;
                    outputTokens += result.usage.outputTokens;
                    totalTokens += result.usage.totalTokens;
                    params.onProgress?.(`[summarize] tokens thread #${row.number} in=${result.usage.inputTokens} out=${result.usage.outputTokens} total=${result.usage.totalTokens} cached_in=${result.usage.cachedInputTokens} reasoning=${result.usage.reasoningTokens}`);
                }
                summarized += 1;
            }
            this.finishRun('summary_runs', runId, 'completed', { summarized, inputTokens, outputTokens, totalTokens });
            return { runId, summarized, inputTokens, outputTokens, totalTokens };
        }
        catch (error) {
            this.finishRun('summary_runs', runId, 'failed', null, error);
            throw error;
        }
    }
|
|
441
|
+
    /**
     * Deletes hydrated comments for a repository (or one thread) and
     * refreshes each affected thread's canonical document. Returns
     * `{ purgedComments, refreshedThreads }`; documents are refreshed even
     * for threads that had no comments, so they reflect metadata-only state.
     */
    purgeComments(params) {
        const repository = this.requireRepository(params.owner, params.repo);
        let sql = 'select id, number from threads where repo_id = ?';
        const args = [repository.id];
        if (params.threadNumber) {
            sql += ' and number = ?';
            args.push(params.threadNumber);
        }
        sql += ' order by number asc';
        const threads = this.db.prepare(sql).all(...args);
        if (threads.length === 0) {
            return { purgedComments: 0, refreshedThreads: 0 };
        }
        params.onProgress?.(`[purge-comments] removing hydrated comments from ${threads.length} thread(s) in ${repository.fullName}`);
        const deleteComments = this.db.prepare('delete from comments where thread_id = ?');
        let purgedComments = 0;
        for (const thread of threads) {
            // Count first so the total reflects what was actually deleted.
            const row = this.db.prepare('select count(*) as count from comments where thread_id = ?').get(thread.id);
            if (row.count > 0) {
                deleteComments.run(thread.id);
                purgedComments += row.count;
            }
            this.refreshDocument(thread.id);
        }
        params.onProgress?.(`[purge-comments] removed ${purgedComments} comment(s) and refreshed ${threads.length} document(s) for ${repository.fullName}`);
        return { purgedComments, refreshedThreads: threads.length };
    }
|
|
468
|
+
    /**
     * Computes embeddings for the repository's pending embedding sources
     * (optionally limited to one thread). Work is chunked into token-capped
     * batches and run through IterableMapper with configured concurrency and
     * back-pressure (maxUnread); results are upserted per (thread, source
     * kind, content hash). Returns a validated `{ runId, embedded }`; on
     * failure the run is marked failed and the error rethrown.
     */
    async embedRepository(params) {
        const ai = this.requireAi();
        const repository = this.requireRepository(params.owner, params.repo);
        const runId = this.startRun('embedding_runs', repository.id, params.threadNumber ? `thread:${params.threadNumber}` : repository.fullName);
        try {
            // Workset: all candidate tasks plus the pending subset whose
            // content hash has no stored embedding yet.
            const { rows, tasks, pending } = this.getEmbeddingWorkset(repository.id, params.threadNumber);
            const skipped = tasks.length - pending.length;
            const truncated = tasks.filter((task) => task.wasTruncated).length;
            params.onProgress?.(`[embed] loaded ${rows.length} open thread(s) and ${tasks.length} embedding source(s) for ${repository.fullName}`);
            params.onProgress?.(`[embed] pending=${pending.length} skipped=${skipped} truncated=${truncated} model=${this.config.embedModel} batch_size=${this.config.embedBatchSize} concurrency=${this.config.embedConcurrency} max_unread=${this.config.embedMaxUnread} max_batch_tokens=${EMBED_MAX_BATCH_TOKENS}`);
            let embedded = 0;
            const batches = this.chunkEmbeddingTasks(pending, this.config.embedBatchSize, EMBED_MAX_BATCH_TOKENS);
            const mapper = new IterableMapper(batches, async (batch) => {
                return this.embedBatchWithRecovery(ai, batch, params.onProgress);
            }, {
                concurrency: this.config.embedConcurrency,
                maxUnread: this.config.embedMaxUnread,
            });
            let completedBatches = 0;
            // Results stream back as each batch completes; persist as we go.
            for await (const batchResult of mapper) {
                completedBatches += 1;
                const numbers = batchResult.map(({ task }) => `#${task.threadNumber}:${task.sourceKind}`);
                const estimatedTokens = batchResult.reduce((sum, { task }) => sum + task.estimatedTokens, 0);
                params.onProgress?.(`[embed] batch ${completedBatches}/${Math.max(batches.length, 1)} size=${batchResult.length} est_tokens=${estimatedTokens} items=${numbers.join(',')}`);
                for (const { task, embedding } of batchResult) {
                    this.upsertEmbedding(task.threadId, task.sourceKind, task.contentHash, embedding);
                    embedded += 1;
                }
            }
            this.finishRun('embedding_runs', runId, 'completed', { embedded });
            return embedResultSchema.parse({ runId, embedded });
        }
        catch (error) {
            this.finishRun('embedding_runs', runId, 'failed', null, error);
            throw error;
        }
    }
|
|
505
|
+
    /**
     * Builds similarity clusters for a repository from stored embeddings.
     * Aggregates per-source-kind cosine edges above `minScore` (default
     * 0.82) with at most `k` (default 6) neighbors, persists the edges and
     * the resulting clusters/members for this run, and returns a validated
     * `{ runId, edges, clusters }`. On failure the run is marked failed and
     * the error rethrown.
     */
    clusterRepository(params) {
        const repository = this.requireRepository(params.owner, params.repo);
        const runId = this.startRun('cluster_runs', repository.id, repository.fullName);
        const minScore = params.minScore ?? 0.82;
        const k = params.k ?? 6;
        try {
            const rows = this.loadParsedStoredEmbeddings(repository.id);
            // Collapse per-source-kind rows to one entry per thread.
            const threadMeta = new Map();
            for (const row of rows) {
                threadMeta.set(row.id, { number: row.number, title: row.title });
            }
            const items = Array.from(threadMeta.entries()).map(([id, meta]) => ({
                id,
                number: meta.number,
                title: meta.title,
            }));
            params.onProgress?.(`[cluster] loaded ${items.length} embedded thread(s) across ${new Set(rows.map((row) => row.source_kind)).size} source kind(s) for ${repository.fullName} k=${k} minScore=${minScore}`);
            // Clear any leftovers for this run id before re-inserting.
            this.db.prepare('delete from cluster_members where cluster_id in (select id from clusters where cluster_run_id = ?)').run(runId);
            this.db.prepare('delete from clusters where cluster_run_id = ?').run(runId);
            this.db.prepare('delete from similarity_edges where cluster_run_id = ?').run(runId);
            const aggregatedEdges = this.aggregateRepositoryEdges(rows, { limit: k, minScore });
            const edges = Array.from(aggregatedEdges.values()).map((entry) => ({
                leftThreadId: entry.leftThreadId,
                rightThreadId: entry.rightThreadId,
                score: entry.score,
            }));
            const insertEdge = this.db.prepare(`insert into similarity_edges (repo_id, cluster_run_id, left_thread_id, right_thread_id, method, score, explanation_json, created_at)
values (?, ?, ?, ?, ?, ?, ?, ?)`);
            for (const edge of aggregatedEdges.values()) {
                // explanation_json records which source kinds and model
                // contributed to the aggregated score.
                insertEdge.run(repository.id, runId, edge.leftThreadId, edge.rightThreadId, 'exact_cosine', edge.score, asJson({ sources: Array.from(edge.sourceKinds).sort(), model: this.config.embedModel }), nowIso());
            }
            params.onProgress?.(`[cluster] built ${edges.length} similarity edge(s)`);
            const clusters = buildClusters(items.map((item) => ({ threadId: item.id, number: item.number, title: item.title })), edges);
            const insertCluster = this.db.prepare('insert into clusters (repo_id, cluster_run_id, representative_thread_id, member_count, created_at) values (?, ?, ?, ?, ?)');
            const insertMember = this.db.prepare('insert into cluster_members (cluster_id, thread_id, score_to_representative, created_at) values (?, ?, ?, ?)');
            for (const cluster of clusters) {
                const clusterResult = insertCluster.run(repository.id, runId, cluster.representativeThreadId, cluster.members.length, nowIso());
                const clusterId = Number(clusterResult.lastInsertRowid);
                for (const memberId of cluster.members) {
                    // The representative itself gets a null score; other
                    // members use their edge score to the representative.
                    const key = this.edgeKey(cluster.representativeThreadId, memberId);
                    const score = memberId === cluster.representativeThreadId ? null : (aggregatedEdges.get(key)?.score ?? null);
                    insertMember.run(clusterId, memberId, score, nowIso());
                }
            }
            params.onProgress?.(`[cluster] persisted ${clusters.length} cluster(s)`);
            this.finishRun('cluster_runs', runId, 'completed', { edges: edges.length, clusters: clusters.length });
            return clusterResultSchema.parse({ runId, edges: edges.length, clusters: clusters.length });
        }
        catch (error) {
            this.finishRun('cluster_runs', runId, 'failed', null, error);
            throw error;
        }
    }
|
|
558
|
+
/**
 * Searches a repository's open threads by keyword match, semantic similarity,
 * or both combined.
 *
 * Modes:
 *  - 'keyword'  : FTS5/bm25 full-text matching only.
 *  - 'semantic' : embedding cosine similarity only (requires this.ai).
 *  - 'hybrid'   : both channels; scores are summed (default).
 *
 * @param params { owner, repo, query, mode?, limit? } — limit defaults to 20.
 * @returns searchResponseSchema-validated payload with scored hits; each hit
 *          carries up to 3 of its strongest similarity-edge neighbors from the
 *          latest completed cluster run.
 */
async searchRepository(params) {
    const mode = params.mode ?? 'hybrid';
    const repository = this.requireRepository(params.owner, params.repo);
    const limit = params.limit ?? 20;
    // thread id -> per-channel relevance score
    const keywordScores = new Map();
    const semanticScores = new Map();
    if (mode !== 'semantic') {
        // Keyword channel. bm25() ranks lower-is-better; over-fetch 2x the
        // final limit so the hybrid re-rank below has candidates to work with.
        const rows = this.db
            .prepare(`select d.thread_id, bm25(documents_fts) as rank
          from documents_fts
          join documents d on d.id = documents_fts.rowid
          join threads t on t.id = d.thread_id
          where t.repo_id = ? and t.state = 'open' and documents_fts match ?
          order by rank
          limit ?`)
            .all(repository.id, params.query, limit * 2);
        for (const row of rows) {
            // Fold the bm25 rank into (0, 1]; smaller |rank| => higher score.
            keywordScores.set(row.thread_id, 1 / (1 + Math.abs(row.rank)));
        }
    }
    if (mode !== 'keyword' && this.ai) {
        // Semantic channel: embed the query once, then score every stored
        // per-source embedding; keep the best score per thread, dropping
        // anything below the 0.2 noise floor.
        const [queryEmbedding] = await this.ai.embedTexts({ model: this.config.embedModel, texts: [params.query] });
        const rows = this.loadParsedStoredEmbeddings(repository.id);
        for (const row of rows) {
            const score = cosineSimilarity(queryEmbedding, row.embedding);
            if (score < 0.2)
                continue;
            semanticScores.set(row.id, Math.max(semanticScores.get(row.id) ?? -1, score));
        }
    }
    // Union of ids hit by either channel; fetch their full thread rows.
    const candidateIds = new Set([...keywordScores.keys(), ...semanticScores.keys()]);
    const threadRows = candidateIds.size
        ? this.db
            .prepare(`select * from threads
          where repo_id = ? and state = 'open' and id in (${[...candidateIds].map(() => '?').join(',')})
          order by updated_at_gh desc, number desc`)
            .all(repository.id, ...candidateIds)
        : [];
    // Neighbor context comes from the latest completed cluster run's edges.
    const neighborRows = this.db
        .prepare(`select se.left_thread_id, se.right_thread_id, se.score, t1.number as left_number, t2.number as right_number,
          t1.kind as left_kind, t2.kind as right_kind, t1.title as left_title, t2.title as right_title
      from similarity_edges se
      join threads t1 on t1.id = se.left_thread_id
      join threads t2 on t2.id = se.right_thread_id
      where se.repo_id = ? and se.cluster_run_id = (
        select id from cluster_runs where repo_id = ? and status = 'completed' order by id desc limit 1
      )`)
        .all(repository.id, repository.id);
    // Edges are undirected: index each edge under both endpoints.
    const neighborsByThread = new Map();
    for (const edge of neighborRows) {
        const leftList = neighborsByThread.get(edge.left_thread_id) ?? [];
        leftList.push({
            threadId: edge.right_thread_id,
            number: edge.right_number,
            kind: edge.right_kind,
            title: edge.right_title,
            score: edge.score,
        });
        neighborsByThread.set(edge.left_thread_id, leftList);
        const rightList = neighborsByThread.get(edge.right_thread_id) ?? [];
        rightList.push({
            threadId: edge.left_thread_id,
            number: edge.left_number,
            kind: edge.left_kind,
            title: edge.left_title,
            score: edge.score,
        });
        neighborsByThread.set(edge.right_thread_id, rightList);
    }
    // Hybrid score is a simple sum of the two channels (missing channel => 0).
    const hits = threadRows
        .map((row) => {
        const keywordScore = keywordScores.get(row.id) ?? null;
        const semanticScore = semanticScores.get(row.id) ?? null;
        const hybridScore = (keywordScore ?? 0) + (semanticScore ?? 0);
        return {
            thread: threadToDto(row),
            keywordScore,
            semanticScore,
            hybridScore,
            neighbors: (neighborsByThread.get(row.id) ?? []).sort((left, right) => right.score - left.score).slice(0, 3),
        };
    })
        .sort((left, right) => right.hybridScore - left.hybridScore)
        .slice(0, limit);
    return searchResponseSchema.parse({
        repository,
        query: params.query,
        mode,
        hits,
    });
}
|
|
649
|
+
/**
 * Lists the most similar threads to a given thread number, computed directly
 * from stored embeddings (not from persisted similarity edges).
 *
 * A thread may have one embedding per source_kind; similarity is only
 * compared within the same source kind, and each candidate thread keeps its
 * best score across kinds.
 *
 * @param params { owner, repo, threadNumber, limit?, minScore? } — limit
 *   defaults to 10, minScore to 0.2.
 * @returns neighborsResponseSchema-validated payload, neighbors sorted by
 *   descending score.
 * @throws if the target thread has no stored embedding (embed not yet run).
 */
listNeighbors(params) {
    const repository = this.requireRepository(params.owner, params.repo);
    const limit = params.limit ?? 10;
    const minScore = params.minScore ?? 0.2;
    const rows = this.loadParsedStoredEmbeddings(repository.id);
    // All embedding rows (one per source_kind) belonging to the target thread.
    const targetRows = rows.filter((row) => row.number === params.threadNumber);
    if (targetRows.length === 0) {
        throw new Error(`Thread #${params.threadNumber} for ${repository.fullName} was not found with an embedding. Run embed first.`);
    }
    const targetRow = targetRows[0];
    // source_kind -> target embedding vector, for like-for-like comparison.
    const targetBySource = new Map();
    for (const row of targetRows) {
        targetBySource.set(row.source_kind, row.embedding);
    }
    // thread id -> best-scoring neighbor metadata seen so far.
    const aggregated = new Map();
    for (const row of rows) {
        // NOTE(review): this only skips the first target row; other target
        // rows share the same thread id, so they are excluded by id equality
        // only if loadParsedStoredEmbeddings uses one id per thread — confirm.
        if (row.id === targetRow.id)
            continue;
        const targetEmbedding = targetBySource.get(row.source_kind);
        if (!targetEmbedding)
            continue;
        const score = cosineSimilarity(targetEmbedding, row.embedding);
        if (score < minScore)
            continue;
        const previous = aggregated.get(row.id);
        if (!previous || score > previous.score) {
            aggregated.set(row.id, { number: row.number, kind: row.kind, title: row.title, score });
        }
    }
    const neighbors = Array.from(aggregated.entries())
        .map(([threadId, value]) => ({
        threadId,
        number: value.number,
        kind: value.kind,
        title: value.title,
        score: value.score,
    }))
        .sort((left, right) => right.score - left.score)
        .slice(0, limit);
    return neighborsResponseSchema.parse({
        repository,
        thread: threadToDto(targetRow),
        neighbors,
    });
}
|
|
694
|
+
/**
 * Lists all clusters from the latest completed cluster run, with their
 * member threads.
 *
 * Uses left joins so clusters with no resolvable members still appear
 * (with an empty members array); the flat join rows are folded back into
 * one object per cluster.
 *
 * @param params { owner, repo }
 * @returns clustersResponseSchema-validated payload; empty clusters array
 *   when no completed run exists.
 */
listClusters(params) {
    const repository = this.requireRepository(params.owner, params.repo);
    const latestRun = this.db
        .prepare("select id from cluster_runs where repo_id = ? and status = 'completed' order by id desc limit 1")
        .get(repository.id);
    if (!latestRun) {
        return clustersResponseSchema.parse({ repository, clusters: [] });
    }
    // One row per (cluster, member); member columns are null for clusters
    // whose members could not be joined.
    const rows = this.db
        .prepare(`select c.id, c.repo_id, c.representative_thread_id, c.member_count,
          cm.thread_id, cm.score_to_representative, t.number, t.kind, t.title
      from clusters c
      left join cluster_members cm on cm.cluster_id = c.id
      left join threads t on t.id = cm.thread_id
      where c.cluster_run_id = ?
      order by c.member_count desc, c.id asc, t.number asc`)
        .all(latestRun.id);
    // Fold join rows into cluster objects; Map preserves the SQL ordering.
    const clusters = new Map();
    for (const row of rows) {
        const cluster = clusters.get(row.id) ?? {
            id: row.id,
            repoId: row.repo_id,
            representativeThreadId: row.representative_thread_id,
            memberCount: row.member_count,
            members: [],
        };
        // Only attach a member when the left joins actually matched.
        if (row.thread_id !== null && row.number !== null && row.kind !== null && row.title !== null) {
            cluster.members.push({
                threadId: row.thread_id,
                number: row.number,
                kind: row.kind,
                title: row.title,
                scoreToRepresentative: row.score_to_representative,
            });
        }
        clusters.set(row.id, cluster);
    }
    return clustersResponseSchema.parse({
        repository,
        clusters: Array.from(clusters.values()),
    });
}
|
|
736
|
+
async refreshRepository(params) {
|
|
737
|
+
const selected = {
|
|
738
|
+
sync: params.sync ?? true,
|
|
739
|
+
embed: params.embed ?? true,
|
|
740
|
+
cluster: params.cluster ?? true,
|
|
741
|
+
};
|
|
742
|
+
if (!selected.sync && !selected.embed && !selected.cluster) {
|
|
743
|
+
throw new Error('Refresh requires at least one selected step');
|
|
744
|
+
}
|
|
745
|
+
if (!selected.sync) {
|
|
746
|
+
this.requireRepository(params.owner, params.repo);
|
|
747
|
+
}
|
|
748
|
+
let sync = null;
|
|
749
|
+
let embed = null;
|
|
750
|
+
let cluster = null;
|
|
751
|
+
if (selected.sync) {
|
|
752
|
+
sync = await this.syncRepository({
|
|
753
|
+
owner: params.owner,
|
|
754
|
+
repo: params.repo,
|
|
755
|
+
onProgress: params.onProgress,
|
|
756
|
+
});
|
|
757
|
+
}
|
|
758
|
+
if (selected.embed) {
|
|
759
|
+
embed = await this.embedRepository({
|
|
760
|
+
owner: params.owner,
|
|
761
|
+
repo: params.repo,
|
|
762
|
+
onProgress: params.onProgress,
|
|
763
|
+
});
|
|
764
|
+
}
|
|
765
|
+
if (selected.cluster) {
|
|
766
|
+
cluster = this.clusterRepository({
|
|
767
|
+
owner: params.owner,
|
|
768
|
+
repo: params.repo,
|
|
769
|
+
onProgress: params.onProgress,
|
|
770
|
+
});
|
|
771
|
+
}
|
|
772
|
+
const repository = this.requireRepository(params.owner, params.repo);
|
|
773
|
+
return refreshResponseSchema.parse({
|
|
774
|
+
repository,
|
|
775
|
+
selected,
|
|
776
|
+
sync,
|
|
777
|
+
embed,
|
|
778
|
+
cluster,
|
|
779
|
+
});
|
|
780
|
+
}
|
|
781
|
+
listClusterSummaries(params) {
|
|
782
|
+
const snapshot = this.getTuiSnapshot({
|
|
783
|
+
owner: params.owner,
|
|
784
|
+
repo: params.repo,
|
|
785
|
+
minSize: params.minSize,
|
|
786
|
+
sort: params.sort,
|
|
787
|
+
search: params.search,
|
|
788
|
+
});
|
|
789
|
+
const clusters = params.limit ? snapshot.clusters.slice(0, params.limit) : snapshot.clusters;
|
|
790
|
+
return clusterSummariesResponseSchema.parse({
|
|
791
|
+
repository: snapshot.repository,
|
|
792
|
+
stats: snapshot.stats,
|
|
793
|
+
clusters: clusters.map((cluster) => ({
|
|
794
|
+
clusterId: cluster.clusterId,
|
|
795
|
+
displayTitle: cluster.displayTitle,
|
|
796
|
+
totalCount: cluster.totalCount,
|
|
797
|
+
issueCount: cluster.issueCount,
|
|
798
|
+
pullRequestCount: cluster.pullRequestCount,
|
|
799
|
+
latestUpdatedAt: cluster.latestUpdatedAt,
|
|
800
|
+
representativeThreadId: cluster.representativeThreadId,
|
|
801
|
+
representativeNumber: cluster.representativeNumber,
|
|
802
|
+
representativeKind: cluster.representativeKind,
|
|
803
|
+
})),
|
|
804
|
+
});
|
|
805
|
+
}
|
|
806
|
+
/**
 * Builds a dump of one cluster: summary metadata plus per-member thread
 * details (with summaries and a body snippet, but the full body nulled out).
 *
 * @param params { owner, repo, clusterId, memberLimit?, bodyChars? } —
 *   bodyChars (snippet length) defaults to 280; memberLimit defaults to all.
 * @returns clusterDetailResponseSchema-validated payload.
 * @throws when the cluster id is not present in the latest snapshot.
 */
getClusterDetailDump(params) {
    // minSize 0 so the lookup sees every cluster, not just the TUI-visible ones.
    const snapshot = this.getTuiSnapshot({
        owner: params.owner,
        repo: params.repo,
        minSize: 0,
    });
    const cluster = snapshot.clusters.find((item) => item.clusterId === params.clusterId);
    if (!cluster) {
        throw new Error(`Cluster ${params.clusterId} was not found for ${snapshot.repository.fullName}.`);
    }
    const detail = this.getTuiClusterDetail({
        owner: params.owner,
        repo: params.repo,
        clusterId: params.clusterId,
    });
    // One getTuiThreadDetail call per member (neighbors skipped for speed).
    const members = detail.members.slice(0, params.memberLimit ?? detail.members.length).map((member) => {
        const threadDetail = this.getTuiThreadDetail({
            owner: params.owner,
            repo: params.repo,
            threadId: member.id,
            includeNeighbors: false,
        });
        return {
            thread: {
                ...threadDetail.thread,
                // Full body is replaced by the snippet below to keep dumps small.
                body: null,
            },
            bodySnippet: snippetText(threadDetail.thread.body, params.bodyChars ?? 280),
            summaries: threadDetail.summaries,
        };
    });
    return clusterDetailResponseSchema.parse({
        repository: snapshot.repository,
        stats: snapshot.stats,
        cluster: {
            clusterId: cluster.clusterId,
            displayTitle: cluster.displayTitle,
            totalCount: cluster.totalCount,
            issueCount: cluster.issueCount,
            pullRequestCount: cluster.pullRequestCount,
            latestUpdatedAt: cluster.latestUpdatedAt,
            representativeThreadId: cluster.representativeThreadId,
            representativeNumber: cluster.representativeNumber,
            representativeKind: cluster.representativeKind,
        },
        members,
    });
}
|
|
854
|
+
getTuiSnapshot(params) {
|
|
855
|
+
const repository = this.requireRepository(params.owner, params.repo);
|
|
856
|
+
const stats = this.getTuiRepoStats(repository.id);
|
|
857
|
+
const latestRun = this.getLatestClusterRun(repository.id);
|
|
858
|
+
if (!latestRun) {
|
|
859
|
+
return { repository, stats, clusters: [] };
|
|
860
|
+
}
|
|
861
|
+
const clusters = this.listRawTuiClusters(repository.id, latestRun.id)
|
|
862
|
+
.filter((cluster) => cluster.totalCount >= (params.minSize ?? 10))
|
|
863
|
+
.filter((cluster) => {
|
|
864
|
+
const search = params.search?.trim().toLowerCase();
|
|
865
|
+
if (!search)
|
|
866
|
+
return true;
|
|
867
|
+
return cluster.searchText.includes(search);
|
|
868
|
+
})
|
|
869
|
+
.sort((left, right) => this.compareTuiClusterSummary(left, right, params.sort ?? 'recent'));
|
|
870
|
+
return {
|
|
871
|
+
repository,
|
|
872
|
+
stats,
|
|
873
|
+
clusters,
|
|
874
|
+
};
|
|
875
|
+
}
|
|
876
|
+
/**
 * Returns one cluster's summary plus its full member list, members ordered
 * issues-before-PRs, then most recently updated, then highest number.
 *
 * @param params { owner, repo, clusterId }
 * @returns cluster summary fields with a members array (ids, labels, and the
 *   stored score-to-representative per member).
 * @throws when there is no completed cluster run, or the cluster id is not
 *   part of the latest run.
 */
getTuiClusterDetail(params) {
    const repository = this.requireRepository(params.owner, params.repo);
    const latestRun = this.getLatestClusterRun(repository.id);
    if (!latestRun) {
        throw new Error(`No completed cluster run found for ${repository.fullName}. Run cluster first.`);
    }
    // Re-derive the summary from the raw listing so counts/titles match the
    // snapshot view exactly.
    const summary = this.listRawTuiClusters(repository.id, latestRun.id).find((cluster) => cluster.clusterId === params.clusterId);
    if (!summary) {
        throw new Error(`Cluster ${params.clusterId} was not found for ${repository.fullName}.`);
    }
    const rows = this.db
        .prepare(`select t.id, t.number, t.kind, t.title, t.updated_at_gh, t.html_url, t.labels_json, cm.score_to_representative
      from cluster_members cm
      join threads t on t.id = cm.thread_id
      where cm.cluster_id = ?
      order by
        case t.kind when 'issue' then 0 else 1 end asc,
        coalesce(t.updated_at_gh, t.updated_at) desc,
        t.number desc`)
        .all(params.clusterId);
    return {
        clusterId: summary.clusterId,
        displayTitle: summary.displayTitle,
        totalCount: summary.totalCount,
        issueCount: summary.issueCount,
        pullRequestCount: summary.pullRequestCount,
        latestUpdatedAt: summary.latestUpdatedAt,
        representativeThreadId: summary.representativeThreadId,
        representativeNumber: summary.representativeNumber,
        representativeKind: summary.representativeKind,
        members: rows.map((row) => ({
            id: row.id,
            number: row.number,
            kind: row.kind,
            title: row.title,
            updatedAtGh: row.updated_at_gh,
            htmlUrl: row.html_url,
            labels: parseArray(row.labels_json),
            clusterScore: row.score_to_representative,
        })),
    };
}
|
|
918
|
+
/**
 * Fetches one open thread's detail for the TUI: the thread DTO (annotated
 * with its cluster id from the latest run, if any), its model summaries, and
 * optionally its embedding neighbors.
 *
 * Lookup is by threadId when provided, else by threadNumber; note both
 * checks are truthy, so an id/number of 0 would be treated as absent.
 *
 * @param params { owner, repo, threadId?, threadNumber?, includeNeighbors? }
 * @returns { thread, summaries, neighbors }
 * @throws when neither lookup finds an open thread.
 */
getTuiThreadDetail(params) {
    const repository = this.requireRepository(params.owner, params.repo);
    const row = params.threadId
        ? (this.db
            .prepare('select * from threads where repo_id = ? and id = ? and state = \'open\' limit 1')
            .get(repository.id, params.threadId) ?? null)
        : params.threadNumber
            ? (this.db
                .prepare('select * from threads where repo_id = ? and number = ? and state = \'open\' limit 1')
                .get(repository.id, params.threadNumber) ?? null)
            : null;
    if (!row) {
        throw new Error(`Thread was not found for ${repository.fullName}.`);
    }
    // Cluster membership is only meaningful relative to the latest completed run.
    const latestRun = this.getLatestClusterRun(repository.id);
    const clusterMembership = latestRun
        ? (this.db
            .prepare(`select cm.cluster_id
          from cluster_members cm
          join clusters c on c.id = cm.cluster_id
          where c.cluster_run_id = ? and cm.thread_id = ?
          limit 1`)
            .get(latestRun.id, row.id) ?? null)
        : null;
    // Only summaries produced by the currently configured model are surfaced.
    const summaryRows = this.db
        .prepare(`select summary_kind, summary_text
      from document_summaries
      where thread_id = ? and model = ?
      order by summary_kind asc`)
        .all(row.id, this.config.summaryModel);
    const summaries = {};
    for (const summary of summaryRows) {
        // Whitelist of known summary kinds; anything else is ignored.
        if (summary.summary_kind === 'problem_summary' ||
            summary.summary_kind === 'solution_summary' ||
            summary.summary_kind === 'maintainer_signal_summary' ||
            summary.summary_kind === 'dedupe_summary') {
            summaries[summary.summary_kind] = summary.summary_text;
        }
    }
    let neighbors = [];
    if (params.includeNeighbors !== false) {
        // Neighbors are best-effort: listNeighbors throws when the thread has
        // no embedding yet, which here just means "no neighbors".
        try {
            neighbors = this.listNeighbors({
                owner: params.owner,
                repo: params.repo,
                threadNumber: row.number,
                limit: 8,
                minScore: 0.2,
            }).neighbors;
        }
        catch {
            neighbors = [];
        }
    }
    return {
        thread: threadToDto(row, clusterMembership?.cluster_id ?? null),
        summaries,
        neighbors,
    };
}
|
|
978
|
+
async rerunAction(request) {
|
|
979
|
+
switch (request.action) {
|
|
980
|
+
case 'summarize': {
|
|
981
|
+
const result = await this.summarizeRepository(request);
|
|
982
|
+
return actionResponseSchema.parse({
|
|
983
|
+
ok: true,
|
|
984
|
+
action: request.action,
|
|
985
|
+
runId: result.runId,
|
|
986
|
+
message: `Summarized ${result.summarized} thread(s)`,
|
|
987
|
+
});
|
|
988
|
+
}
|
|
989
|
+
case 'embed': {
|
|
990
|
+
const result = await this.embedRepository(request);
|
|
991
|
+
return actionResponseSchema.parse({
|
|
992
|
+
ok: true,
|
|
993
|
+
action: request.action,
|
|
994
|
+
runId: result.runId,
|
|
995
|
+
message: `Embedded ${result.embedded} source vector(s)`,
|
|
996
|
+
});
|
|
997
|
+
}
|
|
998
|
+
case 'cluster': {
|
|
999
|
+
const result = this.clusterRepository(request);
|
|
1000
|
+
return actionResponseSchema.parse({
|
|
1001
|
+
ok: true,
|
|
1002
|
+
action: request.action,
|
|
1003
|
+
runId: result.runId,
|
|
1004
|
+
message: `Clustered ${result.clusters} group(s) from ${result.edges} edge(s)`,
|
|
1005
|
+
});
|
|
1006
|
+
}
|
|
1007
|
+
}
|
|
1008
|
+
}
|
|
1009
|
+
/**
 * Returns the repository's sync cursor state (timestamps of the last full
 * open scan, overlapping scan, non-overlapping scan, and open/close
 * reconciliation).
 *
 * Fast path: read the persisted repo_sync_state row. Legacy fallback:
 * reconstruct the cursors from completed sync_runs stats (newest run first,
 * keeping the most recent match per cursor) and persist the result so later
 * calls take the fast path.
 *
 * @param repoId local repository id
 * @returns object with the four cursor timestamps, each possibly null.
 */
getSyncCursorState(repoId) {
    const persisted = this.db
        .prepare(`select
      last_full_open_scan_started_at,
      last_overlapping_open_scan_completed_at,
      last_non_overlapping_scan_completed_at,
      last_open_close_reconciled_at
    from repo_sync_state
    where repo_id = ?`)
        .get(repoId) ?? null;
    if (persisted) {
        return {
            lastFullOpenScanStartedAt: persisted.last_full_open_scan_started_at,
            lastOverlappingOpenScanCompletedAt: persisted.last_overlapping_open_scan_completed_at,
            lastNonOverlappingScanCompletedAt: persisted.last_non_overlapping_scan_completed_at,
            lastReconciledOpenCloseAt: persisted.last_open_close_reconciled_at,
        };
    }
    // Legacy fallback: walk completed runs newest-first and fill each cursor
    // from the first run whose stats qualify for it.
    const rows = this.db
        .prepare("select finished_at, stats_json from sync_runs where repo_id = ? and status = 'completed' order by id desc")
        .all(repoId);
    const state = {
        lastFullOpenScanStartedAt: null,
        lastOverlappingOpenScanCompletedAt: null,
        lastNonOverlappingScanCompletedAt: null,
        lastReconciledOpenCloseAt: null,
    };
    for (const row of rows) {
        const stats = parseSyncRunStats(row.stats_json);
        if (!stats)
            continue;
        if (state.lastFullOpenScanStartedAt === null && stats.isFullOpenScan) {
            // Full scans are cursored by when the crawl STARTED, not finished.
            state.lastFullOpenScanStartedAt = stats.crawlStartedAt;
        }
        if (state.lastOverlappingOpenScanCompletedAt === null && stats.isOverlappingOpenScan && row.finished_at) {
            state.lastOverlappingOpenScanCompletedAt = row.finished_at;
        }
        if (state.lastNonOverlappingScanCompletedAt === null && !stats.isFullOpenScan && !stats.isOverlappingOpenScan && row.finished_at) {
            state.lastNonOverlappingScanCompletedAt = row.finished_at;
        }
        if (state.lastReconciledOpenCloseAt === null && stats.reconciledOpenCloseAt) {
            state.lastReconciledOpenCloseAt = stats.reconciledOpenCloseAt;
        }
    }
    // Persist only when at least one cursor was recovered, so future calls
    // hit the fast path above.
    if (state.lastFullOpenScanStartedAt !== null ||
        state.lastOverlappingOpenScanCompletedAt !== null ||
        state.lastNonOverlappingScanCompletedAt !== null ||
        state.lastReconciledOpenCloseAt !== null) {
        this.writeSyncCursorState(repoId, state);
    }
    return state;
}
|
|
1061
|
+
/**
 * Upserts the repository's sync cursor state into repo_sync_state
 * (one row per repo; conflict on repo_id overwrites all cursor columns
 * and refreshes updated_at).
 *
 * @param repoId local repository id
 * @param state object with the four cursor timestamps (nullable), as
 *   returned/built by getSyncCursorState.
 */
writeSyncCursorState(repoId, state) {
    this.db
        .prepare(`insert into repo_sync_state (
      repo_id,
      last_full_open_scan_started_at,
      last_overlapping_open_scan_completed_at,
      last_non_overlapping_scan_completed_at,
      last_open_close_reconciled_at,
      updated_at
    ) values (?, ?, ?, ?, ?, ?)
    on conflict(repo_id) do update set
      last_full_open_scan_started_at = excluded.last_full_open_scan_started_at,
      last_overlapping_open_scan_completed_at = excluded.last_overlapping_open_scan_completed_at,
      last_non_overlapping_scan_completed_at = excluded.last_non_overlapping_scan_completed_at,
      last_open_close_reconciled_at = excluded.last_open_close_reconciled_at,
      updated_at = excluded.updated_at`)
        .run(repoId, state.lastFullOpenScanStartedAt, state.lastOverlappingOpenScanCompletedAt, state.lastNonOverlappingScanCompletedAt, state.lastReconciledOpenCloseAt, nowIso());
}
|
|
1079
|
+
/**
 * Aggregates repository stats for the TUI header: open issue/PR counts,
 * timestamps of the last completed sync/embed/cluster runs, and how much
 * embedding work is still pending.
 *
 * @param repoId local repository id
 * @returns plain stats object (all timestamps nullable).
 */
getTuiRepoStats(repoId) {
    // Open-thread counts, one row per kind ('issue' / 'pull_request').
    const counts = this.db
        .prepare(`select kind, count(*) as count
      from threads
      where repo_id = ? and state = 'open'
      group by kind`)
        .all(repoId);
    const latestRun = this.getLatestClusterRun(repoId);
    const latestSync = this.db
        .prepare("select finished_at from sync_runs where repo_id = ? and status = 'completed' order by id desc limit 1")
        .get(repoId) ?? null;
    const latestEmbed = this.db
        .prepare("select finished_at from embedding_runs where repo_id = ? and status = 'completed' order by id desc limit 1")
        .get(repoId) ?? null;
    // Pending embedding tasks: one per (thread, source) pair; the Set
    // collapses them to distinct stale threads.
    const embeddingWorkset = this.getEmbeddingWorkset(repoId);
    const staleThreadIds = new Set(embeddingWorkset.pending.map((task) => task.threadId));
    return {
        openIssueCount: counts.find((row) => row.kind === 'issue')?.count ?? 0,
        openPullRequestCount: counts.find((row) => row.kind === 'pull_request')?.count ?? 0,
        lastGithubReconciliationAt: latestSync?.finished_at ?? null,
        lastEmbedRefreshAt: latestEmbed?.finished_at ?? null,
        staleEmbedThreadCount: staleThreadIds.size,
        staleEmbedSourceCount: embeddingWorkset.pending.length,
        latestClusterRunId: latestRun?.id ?? null,
        latestClusterRunFinishedAt: latestRun?.finished_at ?? null,
    };
}
|
|
1106
|
+
getLatestClusterRun(repoId) {
|
|
1107
|
+
return (this.db
|
|
1108
|
+
.prepare("select id, finished_at from cluster_runs where repo_id = ? and status = 'completed' order by id desc limit 1")
|
|
1109
|
+
.get(repoId) ?? null);
|
|
1110
|
+
}
|
|
1111
|
+
/**
 * Loads the raw cluster summaries for one cluster run: per-cluster counts,
 * latest member activity, representative-thread metadata, and a lowercased
 * concatenation of member titles used for substring search.
 *
 * @param repoId local repository id
 * @param clusterRunId the cluster run to read
 * @returns array of summary objects (unsorted; callers sort/filter).
 */
listRawTuiClusters(repoId, clusterRunId) {
    // One aggregated row per cluster. The representative thread is joined
    // with a LEFT join so a cluster survives a missing/deleted representative.
    const rows = this.db
        .prepare(`select
      c.id as cluster_id,
      c.member_count,
      c.representative_thread_id,
      rt.number as representative_number,
      rt.kind as representative_kind,
      rt.title as representative_title,
      max(coalesce(t.updated_at_gh, t.updated_at)) as latest_updated_at,
      sum(case when t.kind = 'issue' then 1 else 0 end) as issue_count,
      sum(case when t.kind = 'pull_request' then 1 else 0 end) as pull_request_count,
      group_concat(lower(coalesce(t.title, '')), ' ') as search_text
    from clusters c
    left join threads rt on rt.id = c.representative_thread_id
    join cluster_members cm on cm.cluster_id = c.id
    join threads t on t.id = cm.thread_id
    where c.repo_id = ? and c.cluster_run_id = ?
    group by
      c.id,
      c.member_count,
      c.representative_thread_id,
      rt.number,
      rt.kind,
      rt.title`)
        .all(repoId, clusterRunId);
    return rows.map((row) => ({
        clusterId: row.cluster_id,
        // Fall back to a synthetic title when the representative is missing.
        displayTitle: row.representative_title ?? `Cluster ${row.cluster_id}`,
        totalCount: row.member_count,
        issueCount: row.issue_count,
        pullRequestCount: row.pull_request_count,
        latestUpdatedAt: row.latest_updated_at,
        representativeThreadId: row.representative_thread_id,
        representativeNumber: row.representative_number,
        representativeKind: row.representative_kind,
        // Representative title is prepended so it is searchable even when it
        // is not among the member titles.
        searchText: `${(row.representative_title ?? '').toLowerCase()} ${row.search_text ?? ''}`.trim(),
    }));
}
|
|
1150
|
+
compareTuiClusterSummary(left, right, sort) {
|
|
1151
|
+
const leftTime = left.latestUpdatedAt ? Date.parse(left.latestUpdatedAt) : 0;
|
|
1152
|
+
const rightTime = right.latestUpdatedAt ? Date.parse(right.latestUpdatedAt) : 0;
|
|
1153
|
+
if (sort === 'size') {
|
|
1154
|
+
return right.totalCount - left.totalCount || rightTime - leftTime || left.clusterId - right.clusterId;
|
|
1155
|
+
}
|
|
1156
|
+
return rightTime - leftTime || right.totalCount - left.totalCount || left.clusterId - right.clusterId;
|
|
1157
|
+
}
|
|
1158
|
+
async fetchThreadComments(owner, repo, number, isPr, reporter) {
|
|
1159
|
+
const github = this.requireGithub();
|
|
1160
|
+
const comments = [];
|
|
1161
|
+
const issueComments = await github.listIssueComments(owner, repo, number, reporter);
|
|
1162
|
+
comments.push(...issueComments.map((comment) => ({
|
|
1163
|
+
githubId: String(comment.id),
|
|
1164
|
+
commentType: 'issue_comment',
|
|
1165
|
+
authorLogin: userLogin(comment),
|
|
1166
|
+
authorType: userType(comment),
|
|
1167
|
+
body: String(comment.body ?? ''),
|
|
1168
|
+
isBot: isBotLikeAuthor({ authorLogin: userLogin(comment), authorType: userType(comment) }),
|
|
1169
|
+
rawJson: asJson(comment),
|
|
1170
|
+
createdAtGh: typeof comment.created_at === 'string' ? comment.created_at : null,
|
|
1171
|
+
updatedAtGh: typeof comment.updated_at === 'string' ? comment.updated_at : null,
|
|
1172
|
+
})));
|
|
1173
|
+
if (isPr) {
|
|
1174
|
+
const reviews = await github.listPullReviews(owner, repo, number, reporter);
|
|
1175
|
+
comments.push(...reviews.map((review) => ({
|
|
1176
|
+
githubId: String(review.id),
|
|
1177
|
+
commentType: 'review',
|
|
1178
|
+
authorLogin: userLogin(review),
|
|
1179
|
+
authorType: userType(review),
|
|
1180
|
+
body: String(review.body ?? review.state ?? ''),
|
|
1181
|
+
isBot: isBotLikeAuthor({ authorLogin: userLogin(review), authorType: userType(review) }),
|
|
1182
|
+
rawJson: asJson(review),
|
|
1183
|
+
createdAtGh: typeof review.submitted_at === 'string' ? review.submitted_at : null,
|
|
1184
|
+
updatedAtGh: typeof review.submitted_at === 'string' ? review.submitted_at : null,
|
|
1185
|
+
})));
|
|
1186
|
+
const reviewComments = await github.listPullReviewComments(owner, repo, number, reporter);
|
|
1187
|
+
comments.push(...reviewComments.map((comment) => ({
|
|
1188
|
+
githubId: String(comment.id),
|
|
1189
|
+
commentType: 'review_comment',
|
|
1190
|
+
authorLogin: userLogin(comment),
|
|
1191
|
+
authorType: userType(comment),
|
|
1192
|
+
body: String(comment.body ?? ''),
|
|
1193
|
+
isBot: isBotLikeAuthor({ authorLogin: userLogin(comment), authorType: userType(comment) }),
|
|
1194
|
+
rawJson: asJson(comment),
|
|
1195
|
+
createdAtGh: typeof comment.created_at === 'string' ? comment.created_at : null,
|
|
1196
|
+
updatedAtGh: typeof comment.updated_at === 'string' ? comment.updated_at : null,
|
|
1197
|
+
})));
|
|
1198
|
+
}
|
|
1199
|
+
return comments;
|
|
1200
|
+
}
|
|
1201
|
+
requireAi() {
|
|
1202
|
+
if (!this.ai) {
|
|
1203
|
+
requireOpenAiKey(this.config);
|
|
1204
|
+
}
|
|
1205
|
+
return this.ai;
|
|
1206
|
+
}
|
|
1207
|
+
requireGithub() {
|
|
1208
|
+
if (!this.github) {
|
|
1209
|
+
requireGithubToken(this.config);
|
|
1210
|
+
}
|
|
1211
|
+
return this.github;
|
|
1212
|
+
}
|
|
1213
|
+
requireRepository(owner, repo) {
|
|
1214
|
+
const fullName = `${owner}/${repo}`;
|
|
1215
|
+
const row = this.db.prepare('select * from repositories where full_name = ? limit 1').get(fullName);
|
|
1216
|
+
if (!row) {
|
|
1217
|
+
throw new Error(`Repository ${fullName} not found. Run sync first.`);
|
|
1218
|
+
}
|
|
1219
|
+
return repositoryToDto(row);
|
|
1220
|
+
}
|
|
1221
|
+
/**
 * Inserts or updates the repositories row for owner/repo from a raw GitHub
 * payload, then returns its local id.
 *
 * The id is re-selected instead of using lastInsertRowid because the
 * ON CONFLICT ... DO UPDATE path does not reliably report the existing
 * row's id.
 *
 * @param owner   repository owner login
 * @param repo    repository name
 * @param payload raw GitHub repository payload (stored verbatim as raw_json)
 * @returns the local repositories.id.
 */
upsertRepository(owner, repo, payload) {
    const fullName = `${owner}/${repo}`;
    this.db
        .prepare(`insert into repositories (owner, name, full_name, github_repo_id, raw_json, updated_at)
      values (?, ?, ?, ?, ?, ?)
      on conflict(full_name) do update set
        github_repo_id = excluded.github_repo_id,
        raw_json = excluded.raw_json,
        updated_at = excluded.updated_at`)
        .run(owner, repo, fullName, payload.id ? String(payload.id) : null, asJson(payload), nowIso());
    const row = this.db.prepare('select id from repositories where full_name = ?').get(fullName);
    return row.id;
}
|
|
1234
|
+
// Insert or update one thread row (issue or pull request) keyed by
// (repo_id, kind, number) and return the local thread id.
// `payload` is the raw GitHub API object; `pulledAt` stamps both
// first_pulled_at (insert only) and last_pulled_at.
upsertThread(repoId, kind, payload, pulledAt) {
// Fall back to "#<number>" when GitHub returns no title.
const title = String(payload.title ?? `#${payload.number}`);
const body = typeof payload.body === 'string' ? payload.body : null;
const labels = parseLabels(payload);
const assignees = parseAssignees(payload);
// Hash of title+body only; refreshDocument later recomputes it over the
// full canonical document including comments.
const contentHash = stableContentHash(`${title}\n${body ?? ''}`);
this.db
// NOTE: first_pulled_at is deliberately absent from the update set, so
// it is preserved across re-syncs. The 22 placeholders below must stay
// aligned with the .run() argument order.
.prepare(`insert into threads (
repo_id, github_id, number, kind, state, title, body, author_login, author_type, html_url,
labels_json, assignees_json, raw_json, content_hash, is_draft,
created_at_gh, updated_at_gh, closed_at_gh, merged_at_gh, first_pulled_at, last_pulled_at, updated_at
) values (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
on conflict(repo_id, kind, number) do update set
github_id = excluded.github_id,
state = excluded.state,
title = excluded.title,
body = excluded.body,
author_login = excluded.author_login,
author_type = excluded.author_type,
html_url = excluded.html_url,
labels_json = excluded.labels_json,
assignees_json = excluded.assignees_json,
raw_json = excluded.raw_json,
content_hash = excluded.content_hash,
is_draft = excluded.is_draft,
created_at_gh = excluded.created_at_gh,
updated_at_gh = excluded.updated_at_gh,
closed_at_gh = excluded.closed_at_gh,
merged_at_gh = excluded.merged_at_gh,
last_pulled_at = excluded.last_pulled_at,
updated_at = excluded.updated_at`)
// GitHub timestamp fields are copied only when they are strings; draft is
// normalized to 0/1 for SQLite.
.run(repoId, String(payload.id), Number(payload.number), kind, String(payload.state ?? 'open'), title, body, userLogin(payload), userType(payload), String(payload.html_url), asJson(labels), asJson(assignees), asJson(payload), contentHash, payload.draft ? 1 : 0, typeof payload.created_at === 'string' ? payload.created_at : null, typeof payload.updated_at === 'string' ? payload.updated_at : null, typeof payload.closed_at === 'string' ? payload.closed_at : null, typeof payload.merged_at === 'string' ? payload.merged_at : null, pulledAt, pulledAt, nowIso());
// Re-select to obtain the rowid for either the insert or update path.
const row = this.db
.prepare('select id from threads where repo_id = ? and kind = ? and number = ?')
.get(repoId, kind, Number(payload.number));
return row.id;
}
|
|
1271
|
+
// Reconcile threads that are locally 'open' but were NOT seen during the
// open-state crawl that started at params.crawlStartedAt: each is re-fetched
// individually from GitHub; threads missing on GitHub are marked closed
// locally. Returns the number of threads that ended up non-open.
async reconcileMissingOpenThreads(params) {
const github = this.requireGithub();
// Candidates: open threads whose last pull predates this crawl.
const staleRows = this.db
.prepare(`select id, number, kind
from threads
where repo_id = ?
and state = 'open'
and (last_pulled_at is null or last_pulled_at < ?)
order by number asc`)
.all(params.repoId, params.crawlStartedAt);
if (staleRows.length === 0) {
return 0;
}
params.onProgress?.(`[sync] reconciling ${staleRows.length} previously-open thread(s) not seen in the open crawl`);
let threadsClosed = 0;
for (const [index, row] of staleRows.entries()) {
// Rate-limit friendliness: pause between batches of SYNC_BATCH_SIZE.
// NOTE(review): the message says "5s" — presumably SYNC_BATCH_DELAY_MS
// is 5000; confirm the constant matches the message.
if (index > 0 && index % SYNC_BATCH_SIZE === 0) {
params.onProgress?.(`[sync] stale reconciliation batch boundary reached at ${index} threads; sleeping 5s before continuing`);
await new Promise((resolve) => setTimeout(resolve, SYNC_BATCH_DELAY_MS));
}
params.onProgress?.(`[sync] reconciling stale ${row.kind} #${row.number}`);
const pulledAt = nowIso();
let payload = null;
// Default when the thread is gone from GitHub entirely.
let state = 'closed';
try {
payload =
row.kind === 'pull_request'
? await github.getPull(params.owner, params.repo, row.number, params.reporter)
: await github.getIssue(params.owner, params.repo, row.number, params.reporter);
state = String(payload.state ?? 'open');
}
catch (error) {
// Only 404-style "resource missing" errors are tolerated; anything
// else (auth, rate limit, network) aborts the reconciliation.
if (!isMissingGitHubResourceError(error)) {
throw error;
}
params.onProgress?.(`[sync] stale ${row.kind} #${row.number} is missing on GitHub; marking it closed locally and continuing`);
}
if (payload) {
// Fetched successfully: record GitHub's authoritative state/timestamps.
this.db
.prepare(`update threads
set state = ?,
raw_json = ?,
updated_at_gh = ?,
closed_at_gh = ?,
merged_at_gh = ?,
last_pulled_at = ?,
updated_at = ?
where id = ?`)
.run(state, asJson(payload), typeof payload.updated_at === 'string' ? payload.updated_at : null, typeof payload.closed_at === 'string' ? payload.closed_at : null, typeof payload.merged_at === 'string' ? payload.merged_at : null, pulledAt, pulledAt, row.id);
}
else {
// Missing on GitHub: force-close locally, keeping any prior close time.
this.db
.prepare(`update threads
set state = 'closed',
closed_at_gh = coalesce(closed_at_gh, ?),
last_pulled_at = ?,
updated_at = ?
where id = ?`)
.run(pulledAt, pulledAt, pulledAt, row.id);
}
if (state !== 'open') {
threadsClosed += 1;
}
}
if (threadsClosed > 0) {
params.onProgress?.(`[sync] marked ${threadsClosed} stale thread(s) as closed after GitHub confirmation`);
}
return threadsClosed;
}
|
|
1340
|
+
replaceComments(threadId, comments) {
|
|
1341
|
+
const insert = this.db.prepare(`insert into comments (
|
|
1342
|
+
thread_id, github_id, comment_type, author_login, author_type, body, is_bot, raw_json, created_at_gh, updated_at_gh
|
|
1343
|
+
) values (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`);
|
|
1344
|
+
const tx = this.db.transaction((commentRows) => {
|
|
1345
|
+
this.db.prepare('delete from comments where thread_id = ?').run(threadId);
|
|
1346
|
+
for (const comment of commentRows) {
|
|
1347
|
+
insert.run(threadId, comment.githubId, comment.commentType, comment.authorLogin, comment.authorType, comment.body, comment.isBot ? 1 : 0, comment.rawJson, comment.createdAtGh, comment.updatedAtGh);
|
|
1348
|
+
}
|
|
1349
|
+
});
|
|
1350
|
+
tx(comments);
|
|
1351
|
+
}
|
|
1352
|
+
// Rebuild the canonical searchable document for a thread from its current
// title/body/labels and stored comments, upsert it into `documents`, and
// refresh the thread's content_hash to match the canonical text.
refreshDocument(threadId) {
const thread = this.db.prepare('select * from threads where id = ?').get(threadId);
// Comments in stable chronological order (created, else updated, else id).
const comments = this.db
.prepare('select body, author_login, author_type, is_bot from comments where thread_id = ? order by coalesce(created_at_gh, updated_at_gh) asc, id asc')
.all(threadId);
const canonical = buildCanonicalDocument({
title: thread.title,
body: thread.body,
labels: parseArray(thread.labels_json),
comments: comments.map((comment) => ({
body: comment.body,
authorLogin: comment.author_login,
authorType: comment.author_type,
// SQLite stores booleans as 0/1.
isBot: comment.is_bot === 1,
})),
});
// One document row per thread; raw_text/dedupe_text come from the builder.
this.db
.prepare(`insert into documents (thread_id, title, body, raw_text, dedupe_text, updated_at)
values (?, ?, ?, ?, ?, ?)
on conflict(thread_id) do update set
title = excluded.title,
body = excluded.body,
raw_text = excluded.raw_text,
dedupe_text = excluded.dedupe_text,
updated_at = excluded.updated_at`)
.run(threadId, thread.title, thread.body, canonical.rawText, canonical.dedupeText, nowIso());
// Keep the thread's hash in sync with the canonical document, superseding
// the title+body-only hash written at sync time.
this.db.prepare('update threads set content_hash = ?, updated_at = ? where id = ?').run(canonical.contentHash, nowIso(), threadId);
}
|
|
1380
|
+
// Assemble the text fed to the summarization model for one thread:
// "title:", optional "body:" and "labels:" sections, plus (when
// includeComments is true) a "discussion:" section of human (non-bot)
// comments. Returns { summaryInput, summaryContentHash }; the hash keys
// on whether comments were included so the two variants never collide.
buildSummarySource(threadId, title, body, labels, includeComments) {
const parts = [`title: ${normalizeSummaryText(title)}`];
const normalizedBody = normalizeSummaryText(body ?? '');
if (normalizedBody) {
parts.push(`body: ${normalizedBody}`);
}
if (labels.length > 0) {
parts.push(`labels: ${labels.join(', ')}`);
}
if (includeComments) {
// Chronological comment order (created, else updated, else insert order).
const comments = this.db
.prepare(`select body, author_login, author_type, is_bot
from comments
where thread_id = ?
order by coalesce(created_at_gh, updated_at_gh) asc, id asc`)
.all(threadId);
// Keep only human-authored comments with non-empty normalized text,
// each rendered as "@login: text".
const humanComments = comments
.filter((comment) => !isBotLikeAuthor({
authorLogin: comment.author_login,
authorType: comment.author_type,
isBot: comment.is_bot === 1,
}))
.map((comment) => {
const author = comment.author_login ? `@${comment.author_login}` : 'unknown';
const normalized = normalizeSummaryText(comment.body);
return normalized ? `${author}: ${normalized}` : '';
})
.filter(Boolean);
if (humanComments.length > 0) {
parts.push(`discussion:\n${humanComments.join('\n')}`);
}
}
const summaryInput = parts.join('\n\n');
// The mode tag makes metadata-only and with-comments hashes distinct even
// for identical section text.
const summaryContentHash = stableContentHash(`summary:${includeComments ? 'with-comments' : 'metadata-only'}\n${summaryInput}`);
return { summaryInput, summaryContentHash };
}
|
|
1416
|
+
buildEmbeddingTasks(params) {
|
|
1417
|
+
const tasks = [];
|
|
1418
|
+
const titleText = this.prepareEmbeddingText(normalizeSummaryText(params.title), EMBED_MAX_ITEM_TOKENS);
|
|
1419
|
+
if (titleText) {
|
|
1420
|
+
tasks.push({
|
|
1421
|
+
threadId: params.threadId,
|
|
1422
|
+
threadNumber: params.threadNumber,
|
|
1423
|
+
sourceKind: 'title',
|
|
1424
|
+
text: titleText.text,
|
|
1425
|
+
contentHash: stableContentHash(`embedding:title\n${titleText.text}`),
|
|
1426
|
+
estimatedTokens: titleText.estimatedTokens,
|
|
1427
|
+
wasTruncated: titleText.wasTruncated,
|
|
1428
|
+
});
|
|
1429
|
+
}
|
|
1430
|
+
const bodyText = this.prepareEmbeddingText(normalizeSummaryText(params.body ?? ''), EMBED_MAX_ITEM_TOKENS);
|
|
1431
|
+
if (bodyText) {
|
|
1432
|
+
tasks.push({
|
|
1433
|
+
threadId: params.threadId,
|
|
1434
|
+
threadNumber: params.threadNumber,
|
|
1435
|
+
sourceKind: 'body',
|
|
1436
|
+
text: bodyText.text,
|
|
1437
|
+
contentHash: stableContentHash(`embedding:body\n${bodyText.text}`),
|
|
1438
|
+
estimatedTokens: bodyText.estimatedTokens,
|
|
1439
|
+
wasTruncated: bodyText.wasTruncated,
|
|
1440
|
+
});
|
|
1441
|
+
}
|
|
1442
|
+
const summaryText = this.prepareEmbeddingText(normalizeSummaryText(params.dedupeSummary ?? ''), EMBED_MAX_ITEM_TOKENS);
|
|
1443
|
+
if (summaryText) {
|
|
1444
|
+
tasks.push({
|
|
1445
|
+
threadId: params.threadId,
|
|
1446
|
+
threadNumber: params.threadNumber,
|
|
1447
|
+
sourceKind: 'dedupe_summary',
|
|
1448
|
+
text: summaryText.text,
|
|
1449
|
+
contentHash: stableContentHash(`embedding:dedupe_summary\n${summaryText.text}`),
|
|
1450
|
+
estimatedTokens: summaryText.estimatedTokens,
|
|
1451
|
+
wasTruncated: summaryText.wasTruncated,
|
|
1452
|
+
});
|
|
1453
|
+
}
|
|
1454
|
+
return tasks;
|
|
1455
|
+
}
|
|
1456
|
+
prepareEmbeddingText(text, maxEstimatedTokens) {
|
|
1457
|
+
if (!text) {
|
|
1458
|
+
return null;
|
|
1459
|
+
}
|
|
1460
|
+
const maxChars = maxEstimatedTokens * EMBED_ESTIMATED_CHARS_PER_TOKEN;
|
|
1461
|
+
const wasTruncated = text.length > maxChars;
|
|
1462
|
+
const prepared = wasTruncated
|
|
1463
|
+
? `${text.slice(0, Math.max(0, maxChars - EMBED_TRUNCATION_MARKER.length)).trimEnd()}${EMBED_TRUNCATION_MARKER}`
|
|
1464
|
+
: text;
|
|
1465
|
+
return {
|
|
1466
|
+
text: prepared,
|
|
1467
|
+
estimatedTokens: this.estimateEmbeddingTokens(prepared),
|
|
1468
|
+
wasTruncated,
|
|
1469
|
+
};
|
|
1470
|
+
}
|
|
1471
|
+
estimateEmbeddingTokens(text) {
|
|
1472
|
+
return Math.max(1, Math.ceil(text.length / EMBED_ESTIMATED_CHARS_PER_TOKEN));
|
|
1473
|
+
}
|
|
1474
|
+
isEmbeddingContextError(error) {
    // Classify context-length failures from the embedding provider by
    // matching known phrases in the error message.
    let message;
    if (error instanceof Error) {
        message = error.message;
    } else {
        message = String(error);
    }
    if (/maximum context length/i.test(message)) {
        return true;
    }
    return /requested \d+ tokens/i.test(message);
}
|
|
1478
|
+
async embedBatchWithRecovery(ai, batch, onProgress) {
|
|
1479
|
+
try {
|
|
1480
|
+
const embeddings = await ai.embedTexts({
|
|
1481
|
+
model: this.config.embedModel,
|
|
1482
|
+
texts: batch.map((task) => task.text),
|
|
1483
|
+
});
|
|
1484
|
+
return batch.map((task, index) => ({ task, embedding: embeddings[index] }));
|
|
1485
|
+
}
|
|
1486
|
+
catch (error) {
|
|
1487
|
+
if (!this.isEmbeddingContextError(error) || batch.length === 1) {
|
|
1488
|
+
if (batch.length === 1 && this.isEmbeddingContextError(error)) {
|
|
1489
|
+
const recovered = await this.embedSingleTaskWithRecovery(ai, batch[0], onProgress);
|
|
1490
|
+
return [recovered];
|
|
1491
|
+
}
|
|
1492
|
+
throw error;
|
|
1493
|
+
}
|
|
1494
|
+
onProgress?.(`[embed] batch context error; isolating ${batch.length} item(s) to find oversized input(s)`);
|
|
1495
|
+
const recovered = [];
|
|
1496
|
+
for (const task of batch) {
|
|
1497
|
+
recovered.push(await this.embedSingleTaskWithRecovery(ai, task, onProgress));
|
|
1498
|
+
}
|
|
1499
|
+
return recovered;
|
|
1500
|
+
}
|
|
1501
|
+
}
|
|
1502
|
+
async embedSingleTaskWithRecovery(ai, task, onProgress) {
|
|
1503
|
+
let current = task;
|
|
1504
|
+
for (let attempt = 0; attempt < 4; attempt += 1) {
|
|
1505
|
+
try {
|
|
1506
|
+
const [embedding] = await ai.embedTexts({
|
|
1507
|
+
model: this.config.embedModel,
|
|
1508
|
+
texts: [current.text],
|
|
1509
|
+
});
|
|
1510
|
+
return { task: current, embedding };
|
|
1511
|
+
}
|
|
1512
|
+
catch (error) {
|
|
1513
|
+
if (!this.isEmbeddingContextError(error)) {
|
|
1514
|
+
throw error;
|
|
1515
|
+
}
|
|
1516
|
+
const next = this.shrinkEmbeddingTask(current);
|
|
1517
|
+
if (!next || next.text === current.text) {
|
|
1518
|
+
throw error;
|
|
1519
|
+
}
|
|
1520
|
+
onProgress?.(`[embed] shortened #${current.threadNumber}:${current.sourceKind} after context error est_tokens=${current.estimatedTokens}->${next.estimatedTokens}`);
|
|
1521
|
+
current = next;
|
|
1522
|
+
}
|
|
1523
|
+
}
|
|
1524
|
+
throw new Error(`Unable to shrink embedding input for #${task.threadNumber}:${task.sourceKind} below model limits`);
|
|
1525
|
+
}
|
|
1526
|
+
shrinkEmbeddingTask(task) {
|
|
1527
|
+
const withoutMarker = task.text.endsWith(EMBED_TRUNCATION_MARKER)
|
|
1528
|
+
? task.text.slice(0, -EMBED_TRUNCATION_MARKER.length)
|
|
1529
|
+
: task.text;
|
|
1530
|
+
if (withoutMarker.length < 256) {
|
|
1531
|
+
return null;
|
|
1532
|
+
}
|
|
1533
|
+
const nextLength = Math.max(256, Math.floor(withoutMarker.length * 0.5));
|
|
1534
|
+
const nextText = `${withoutMarker.slice(0, Math.max(0, nextLength - EMBED_TRUNCATION_MARKER.length)).trimEnd()}${EMBED_TRUNCATION_MARKER}`;
|
|
1535
|
+
return {
|
|
1536
|
+
...task,
|
|
1537
|
+
text: nextText,
|
|
1538
|
+
contentHash: stableContentHash(`embedding:${task.sourceKind}\n${nextText}`),
|
|
1539
|
+
estimatedTokens: this.estimateEmbeddingTokens(nextText),
|
|
1540
|
+
wasTruncated: true,
|
|
1541
|
+
};
|
|
1542
|
+
}
|
|
1543
|
+
chunkEmbeddingTasks(items, maxItems, maxEstimatedTokens) {
|
|
1544
|
+
const chunks = [];
|
|
1545
|
+
let current = [];
|
|
1546
|
+
let currentEstimatedTokens = 0;
|
|
1547
|
+
for (const item of items) {
|
|
1548
|
+
const wouldExceedItemCount = current.length >= maxItems;
|
|
1549
|
+
const wouldExceedTokenBudget = current.length > 0 && currentEstimatedTokens + item.estimatedTokens > maxEstimatedTokens;
|
|
1550
|
+
if (wouldExceedItemCount || wouldExceedTokenBudget) {
|
|
1551
|
+
chunks.push(current);
|
|
1552
|
+
current = [];
|
|
1553
|
+
currentEstimatedTokens = 0;
|
|
1554
|
+
}
|
|
1555
|
+
current.push(item);
|
|
1556
|
+
currentEstimatedTokens += item.estimatedTokens;
|
|
1557
|
+
}
|
|
1558
|
+
if (current.length > 0) {
|
|
1559
|
+
chunks.push(current);
|
|
1560
|
+
}
|
|
1561
|
+
return chunks;
|
|
1562
|
+
}
|
|
1563
|
+
loadStoredEmbeddings(repoId) {
|
|
1564
|
+
return this.db
|
|
1565
|
+
.prepare(`select t.id, t.repo_id, t.number, t.kind, t.state, t.title, t.body, t.author_login, t.html_url, t.labels_json,
|
|
1566
|
+
t.updated_at_gh, t.first_pulled_at, t.last_pulled_at, e.source_kind, e.embedding_json
|
|
1567
|
+
from threads t
|
|
1568
|
+
join document_embeddings e on e.thread_id = t.id
|
|
1569
|
+
where t.repo_id = ? and t.state = 'open' and e.model = ?
|
|
1570
|
+
order by t.number asc, e.source_kind asc`)
|
|
1571
|
+
.all(repoId, this.config.embedModel);
|
|
1572
|
+
}
|
|
1573
|
+
loadParsedStoredEmbeddings(repoId) {
|
|
1574
|
+
const cached = this.parsedEmbeddingCache.get(repoId);
|
|
1575
|
+
if (cached) {
|
|
1576
|
+
return cached;
|
|
1577
|
+
}
|
|
1578
|
+
const parsed = this.loadStoredEmbeddings(repoId).map((row) => ({
|
|
1579
|
+
...row,
|
|
1580
|
+
embedding: JSON.parse(row.embedding_json),
|
|
1581
|
+
}));
|
|
1582
|
+
this.parsedEmbeddingCache.set(repoId, parsed);
|
|
1583
|
+
return parsed;
|
|
1584
|
+
}
|
|
1585
|
+
// Compute the embedding workset for a repo (optionally restricted to one
// thread number): all candidate tasks, the existing embedding hashes, and
// the pending subset whose content changed since last embedding.
getEmbeddingWorkset(repoId, threadNumber) {
// Open threads only; optional single-thread filter appended below.
let sql = `select t.id, t.number, t.title, t.body
from threads t
where t.repo_id = ? and t.state = 'open'`;
const args = [repoId];
if (threadNumber) {
sql += ' and t.number = ?';
args.push(threadNumber);
}
sql += ' order by t.number asc';
const rows = this.db.prepare(sql).all(...args);
// Pre-load combined summary text per thread for the dedupe_summary source.
const summaryTexts = this.loadCombinedSummaryTextMap(repoId, threadNumber);
// Up to three tasks per thread (title / body / dedupe_summary).
const tasks = rows.flatMap((row) => this.buildEmbeddingTasks({
threadId: row.id,
threadNumber: row.number,
title: row.title,
body: row.body,
dedupeSummary: summaryTexts.get(row.id) ?? null,
}));
// Hashes of embeddings already stored for this model.
const existingRows = this.db
.prepare(`select e.thread_id, e.source_kind, e.content_hash
from document_embeddings e
join threads t on t.id = e.thread_id
where t.repo_id = ? and e.model = ?`)
.all(repoId, this.config.embedModel);
// Keyed "threadId:sourceKind" -> content_hash for O(1) lookup.
const existing = new Map();
for (const row of existingRows) {
existing.set(`${row.thread_id}:${row.source_kind}`, row.content_hash);
}
// Pending = tasks whose hash differs from (or is absent in) the store.
const pending = tasks.filter((task) => existing.get(`${task.threadId}:${task.sourceKind}`) !== task.contentHash);
return { rows, tasks, existing, pending };
}
|
|
1617
|
+
// Build a map of thread id -> combined summary text for open threads of a
// repo (optionally a single thread number), concatenating the available
// summary kinds in a fixed order as "<kind>: <text>" sections.
loadCombinedSummaryTextMap(repoId, threadNumber) {
let sql = `select s.thread_id, s.summary_kind, s.summary_text
from document_summaries s
join threads t on t.id = s.thread_id
where t.repo_id = ? and t.state = 'open' and s.model = ?`;
const args = [repoId, this.config.summaryModel];
if (threadNumber) {
sql += ' and t.number = ?';
args.push(threadNumber);
}
sql += ' order by t.number asc, s.summary_kind asc';
const rows = this.db.prepare(sql).all(...args);
// thread_id -> Map(summary_kind -> normalized text).
const byThread = new Map();
for (const row of rows) {
const entry = byThread.get(row.thread_id) ?? new Map();
entry.set(row.summary_kind, normalizeSummaryText(row.summary_text));
byThread.set(row.thread_id, entry);
}
const combined = new Map();
// Fixed presentation order, independent of SQL row order.
const order = ['problem_summary', 'solution_summary', 'maintainer_signal_summary', 'dedupe_summary'];
for (const [threadId, entry] of byThread.entries()) {
// Drop kinds that are missing or normalized to empty text.
const parts = order
.map((summaryKind) => {
const text = entry.get(summaryKind);
return text ? `${summaryKind}: ${text}` : '';
})
.filter(Boolean);
if (parts.length > 0) {
combined.set(threadId, parts.join('\n\n'));
}
}
return combined;
}
|
|
1650
|
+
edgeKey(leftThreadId, rightThreadId) {
|
|
1651
|
+
const left = Math.min(leftThreadId, rightThreadId);
|
|
1652
|
+
const right = Math.max(leftThreadId, rightThreadId);
|
|
1653
|
+
return `${left}:${right}`;
|
|
1654
|
+
}
|
|
1655
|
+
// Aggregate nearest-neighbor similarity edges across all source kinds.
// For each kind, every item is ranked against its peers; undirected edges
// are deduplicated via edgeKey, keeping the MAX score seen and the set of
// source kinds that contributed. Note: this is O(n^2) per source kind
// (full ranking per item) — acceptable at current repo sizes, presumably;
// confirm before using on very large repos.
aggregateRepositoryEdges(rows, params) {
// Group parsed embedding rows by their source kind.
const bySource = new Map();
for (const row of rows) {
const list = bySource.get(row.source_kind) ?? [];
list.push({ id: row.id, embedding: row.embedding });
bySource.set(row.source_kind, list);
}
// Keyed by canonical edge key ("min:max" thread-id pair).
const aggregated = new Map();
for (const [sourceKind, items] of bySource.entries()) {
for (const item of items) {
const neighbors = rankNearestNeighbors(items, {
targetEmbedding: item.embedding,
limit: params.limit,
minScore: params.minScore,
// Exclude the item itself from its own neighbor list.
skipId: item.id,
});
for (const neighbor of neighbors) {
const key = this.edgeKey(item.id, neighbor.item.id);
const existing = aggregated.get(key);
if (existing) {
// Edge already seen (other direction or other kind): keep the
// best score and record this kind's contribution.
existing.score = Math.max(existing.score, neighbor.score);
existing.sourceKinds.add(sourceKind);
continue;
}
aggregated.set(key, {
leftThreadId: Math.min(item.id, neighbor.item.id),
rightThreadId: Math.max(item.id, neighbor.item.id),
score: neighbor.score,
sourceKinds: new Set([sourceKind]),
});
}
}
}
return aggregated;
}
|
|
1690
|
+
upsertSummary(threadId, contentHash, summaryKind, summaryText) {
|
|
1691
|
+
this.db
|
|
1692
|
+
.prepare(`insert into document_summaries (thread_id, summary_kind, model, content_hash, summary_text, created_at, updated_at)
|
|
1693
|
+
values (?, ?, ?, ?, ?, ?, ?)
|
|
1694
|
+
on conflict(thread_id, summary_kind, model) do update set
|
|
1695
|
+
content_hash = excluded.content_hash,
|
|
1696
|
+
summary_text = excluded.summary_text,
|
|
1697
|
+
updated_at = excluded.updated_at`)
|
|
1698
|
+
.run(threadId, summaryKind, this.config.summaryModel, contentHash, summaryText, nowIso(), nowIso());
|
|
1699
|
+
}
|
|
1700
|
+
upsertEmbedding(threadId, sourceKind, contentHash, embedding) {
|
|
1701
|
+
this.db
|
|
1702
|
+
.prepare(`insert into document_embeddings (thread_id, source_kind, model, dimensions, content_hash, embedding_json, created_at, updated_at)
|
|
1703
|
+
values (?, ?, ?, ?, ?, ?, ?, ?)
|
|
1704
|
+
on conflict(thread_id, source_kind, model) do update set
|
|
1705
|
+
dimensions = excluded.dimensions,
|
|
1706
|
+
content_hash = excluded.content_hash,
|
|
1707
|
+
embedding_json = excluded.embedding_json,
|
|
1708
|
+
updated_at = excluded.updated_at`)
|
|
1709
|
+
.run(threadId, sourceKind, this.config.embedModel, embedding.length, contentHash, asJson(embedding), nowIso(), nowIso());
|
|
1710
|
+
const row = this.db.prepare('select repo_id from threads where id = ? limit 1').get(threadId);
|
|
1711
|
+
if (row) {
|
|
1712
|
+
this.parsedEmbeddingCache.delete(row.repo_id);
|
|
1713
|
+
}
|
|
1714
|
+
}
|
|
1715
|
+
startRun(table, repoId, scope) {
|
|
1716
|
+
const result = this.db
|
|
1717
|
+
.prepare(`insert into ${table} (repo_id, scope, status, started_at) values (?, ?, 'running', ?)`)
|
|
1718
|
+
.run(repoId, scope, nowIso());
|
|
1719
|
+
return Number(result.lastInsertRowid);
|
|
1720
|
+
}
|
|
1721
|
+
finishRun(table, runId, status, stats, error, finishedAt = nowIso()) {
    // Close out a run row with its final status, optional stats payload,
    // and optional error text. `table` must be an internal identifier
    // (it is interpolated into the SQL), never external input.
    const statsJson = stats === undefined ? null : asJson(stats);
    let errorText = null;
    if (error instanceof Error) {
        errorText = error.message;
    } else if (error) {
        errorText = String(error);
    }
    this.db
        .prepare(`update ${table} set status = ?, finished_at = ?, stats_json = ?, error_text = ? where id = ?`)
        .run(status, finishedAt, statsJson, errorText, runId);
}
|
|
1726
|
+
}
|
|
1727
|
+
// Extract the required `owner` and `repo` query parameters from a request
// URL. Throws when either is absent or empty.
export function parseRepoParams(url) {
    const owner = url.searchParams.get('owner');
    const repo = url.searchParams.get('repo');
    if (owner && repo) {
        return { owner, repo };
    }
    throw new Error('Missing owner or repo query parameter');
}
|
|
1735
|
+
//# sourceMappingURL=service.js.map
|