@ghcrawl/api-core 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,235 @@
1
+ import type { SqliteDatabase } from './sqlite.js';
2
+
3
// Ordered list of schema statements executed one by one by `migrate`.
// Every statement is guarded with `if not exists`, so running the list again
// leaves already-created tables, virtual tables and triggers untouched.
+ const migrationStatements = [
4
// repositories: one row per repository; `full_name` is unique.
+ `
5
+ create table if not exists repositories (
6
+ id integer primary key,
7
+ owner text not null,
8
+ name text not null,
9
+ full_name text not null unique,
10
+ github_repo_id text,
11
+ raw_json text not null,
12
+ updated_at text not null
13
+ )
14
+ `,
// threads: rows owned by a repository (cascade delete), unique per (repo_id, kind, number).
15
+ `
16
+ create table if not exists threads (
17
+ id integer primary key,
18
+ repo_id integer not null references repositories(id) on delete cascade,
19
+ github_id text not null,
20
+ number integer not null,
21
+ kind text not null,
22
+ state text not null,
23
+ title text not null,
24
+ body text,
25
+ author_login text,
26
+ author_type text,
27
+ html_url text not null,
28
+ labels_json text not null,
29
+ assignees_json text not null,
30
+ raw_json text not null,
31
+ content_hash text not null,
32
+ is_draft integer not null default 0,
33
+ created_at_gh text,
34
+ updated_at_gh text,
35
+ closed_at_gh text,
36
+ merged_at_gh text,
37
+ first_pulled_at text,
38
+ last_pulled_at text,
39
+ updated_at text not null,
40
+ unique(repo_id, kind, number)
41
+ )
42
+ `,
// comments: rows owned by a thread (cascade delete), unique per (thread_id, comment_type, github_id).
43
+ `
44
+ create table if not exists comments (
45
+ id integer primary key,
46
+ thread_id integer not null references threads(id) on delete cascade,
47
+ github_id text not null,
48
+ comment_type text not null,
49
+ author_login text,
50
+ author_type text,
51
+ body text not null,
52
+ is_bot integer not null default 0,
53
+ raw_json text not null,
54
+ created_at_gh text,
55
+ updated_at_gh text,
56
+ unique(thread_id, comment_type, github_id)
57
+ )
58
+ `,
// documents: at most one row per thread (`thread_id` is unique).
59
+ `
60
+ create table if not exists documents (
61
+ id integer primary key,
62
+ thread_id integer not null unique references threads(id) on delete cascade,
63
+ title text not null,
64
+ body text,
65
+ raw_text text not null,
66
+ dedupe_text text not null,
67
+ updated_at text not null
68
+ )
69
+ `,
// documents_fts: FTS5 index that uses `documents` as its external content table
// (content='documents', content_rowid='id'); the three triggers below keep it in sync.
70
+ `
71
+ create virtual table if not exists documents_fts using fts5(
72
+ title,
73
+ body,
74
+ raw_text,
75
+ dedupe_text,
76
+ content='documents',
77
+ content_rowid='id'
78
+ )
79
+ `,
// After insert on documents: add the new row to the FTS index.
80
+ `
81
+ create trigger if not exists documents_ai after insert on documents begin
82
+ insert into documents_fts(rowid, title, body, raw_text, dedupe_text)
83
+ values (new.id, new.title, new.body, new.raw_text, new.dedupe_text);
84
+ end
85
+ `,
// After delete on documents: issue the FTS 'delete' command with the old column values.
86
+ `
87
+ create trigger if not exists documents_ad after delete on documents begin
88
+ insert into documents_fts(documents_fts, rowid, title, body, raw_text, dedupe_text)
89
+ values ('delete', old.id, old.title, old.body, old.raw_text, old.dedupe_text);
90
+ end
91
+ `,
// After update on documents: remove the old index entry, then insert the new values.
92
+ `
93
+ create trigger if not exists documents_au after update on documents begin
94
+ insert into documents_fts(documents_fts, rowid, title, body, raw_text, dedupe_text)
95
+ values ('delete', old.id, old.title, old.body, old.raw_text, old.dedupe_text);
96
+ insert into documents_fts(rowid, title, body, raw_text, dedupe_text)
97
+ values (new.id, new.title, new.body, new.raw_text, new.dedupe_text);
98
+ end
99
+ `,
// document_summaries: unique per (thread_id, summary_kind, model).
100
+ `
101
+ create table if not exists document_summaries (
102
+ id integer primary key,
103
+ thread_id integer not null references threads(id) on delete cascade,
104
+ summary_kind text not null,
105
+ model text not null,
106
+ content_hash text not null,
107
+ summary_text text not null,
108
+ created_at text not null,
109
+ updated_at text not null,
110
+ unique(thread_id, summary_kind, model)
111
+ )
112
+ `,
// document_embeddings: unique per (thread_id, source_kind, model); the vector is stored as JSON text.
113
+ `
114
+ create table if not exists document_embeddings (
115
+ id integer primary key,
116
+ thread_id integer not null references threads(id) on delete cascade,
117
+ source_kind text not null,
118
+ model text not null,
119
+ dimensions integer not null,
120
+ content_hash text not null,
121
+ embedding_json text not null,
122
+ created_at text not null,
123
+ updated_at text not null,
124
+ unique(thread_id, source_kind, model)
125
+ )
126
+ `,
// sync_runs / summary_runs / embedding_runs / cluster_runs share one column layout;
// `repo_id` is nullable in all four.
127
+ `
128
+ create table if not exists sync_runs (
129
+ id integer primary key,
130
+ repo_id integer references repositories(id) on delete cascade,
131
+ scope text not null,
132
+ status text not null,
133
+ started_at text not null,
134
+ finished_at text,
135
+ stats_json text,
136
+ error_text text
137
+ )
138
+ `,
// repo_sync_state: a single row per repository (repo_id is the primary key).
139
+ `
140
+ create table if not exists repo_sync_state (
141
+ repo_id integer primary key references repositories(id) on delete cascade,
142
+ last_full_open_scan_started_at text,
143
+ last_overlapping_open_scan_completed_at text,
144
+ last_non_overlapping_scan_completed_at text,
145
+ last_open_close_reconciled_at text,
146
+ updated_at text not null
147
+ )
148
+ `,
149
+ `
150
+ create table if not exists summary_runs (
151
+ id integer primary key,
152
+ repo_id integer references repositories(id) on delete cascade,
153
+ scope text not null,
154
+ status text not null,
155
+ started_at text not null,
156
+ finished_at text,
157
+ stats_json text,
158
+ error_text text
159
+ )
160
+ `,
161
+ `
162
+ create table if not exists embedding_runs (
163
+ id integer primary key,
164
+ repo_id integer references repositories(id) on delete cascade,
165
+ scope text not null,
166
+ status text not null,
167
+ started_at text not null,
168
+ finished_at text,
169
+ stats_json text,
170
+ error_text text
171
+ )
172
+ `,
173
+ `
174
+ create table if not exists cluster_runs (
175
+ id integer primary key,
176
+ repo_id integer references repositories(id) on delete cascade,
177
+ scope text not null,
178
+ status text not null,
179
+ started_at text not null,
180
+ finished_at text,
181
+ stats_json text,
182
+ error_text text
183
+ )
184
+ `,
// similarity_edges: scored pair of threads, unique per (cluster_run_id, left_thread_id, right_thread_id).
// NOTE(review): cluster_run_id is nullable and SQLite treats NULLs as distinct in unique
// constraints, so rows without a run id are not deduplicated by this constraint - confirm intended.
185
+ `
186
+ create table if not exists similarity_edges (
187
+ id integer primary key,
188
+ repo_id integer not null references repositories(id) on delete cascade,
189
+ cluster_run_id integer references cluster_runs(id) on delete cascade,
190
+ left_thread_id integer not null references threads(id) on delete cascade,
191
+ right_thread_id integer not null references threads(id) on delete cascade,
192
+ method text not null,
193
+ score real not null,
194
+ explanation_json text not null,
195
+ created_at text not null,
196
+ unique(cluster_run_id, left_thread_id, right_thread_id)
197
+ )
198
+ `,
// clusters: belongs to a cluster run; the representative thread reference is nulled
// (not cascaded) when that thread is deleted.
199
+ `
200
+ create table if not exists clusters (
201
+ id integer primary key,
202
+ repo_id integer not null references repositories(id) on delete cascade,
203
+ cluster_run_id integer not null references cluster_runs(id) on delete cascade,
204
+ representative_thread_id integer references threads(id) on delete set null,
205
+ member_count integer not null,
206
+ created_at text not null
207
+ )
208
+ `,
// cluster_members: join table keyed by (cluster_id, thread_id).
209
+ `
210
+ create table if not exists cluster_members (
211
+ cluster_id integer not null references clusters(id) on delete cascade,
212
+ thread_id integer not null references threads(id) on delete cascade,
213
+ score_to_representative real,
214
+ created_at text not null,
215
+ primary key (cluster_id, thread_id)
216
+ )
217
+ `
218
+ ];
219
+
220
/**
 * Applies the schema to `db`.
 *
 * First runs every entry of `migrationStatements` in order, then inspects the
 * existing `threads` table and adds the `first_pulled_at` / `last_pulled_at`
 * columns when they are absent (the `create table if not exists` statement
 * does not alter a `threads` table that already exists).
 *
 * @param db - Open database handle to migrate in place.
 */
export function migrate(db: SqliteDatabase): void {
  migrationStatements.forEach((sql) => {
    db.exec(sql);
  });

  const columnRows = db.prepare('pragma table_info(threads)').all() as Array<{ name: string }>;
  const existingColumns = new Set<string>();
  for (const row of columnRows) {
    existingColumns.add(row.name);
  }

  // Columns are checked against the snapshot taken above, in a fixed order.
  for (const columnName of ['first_pulled_at', 'last_pulled_at']) {
    if (existingColumns.has(columnName)) continue;
    db.exec(`alter table threads add column ${columnName} text`);
  }
}
@@ -0,0 +1,14 @@
1
+ import fs from 'node:fs';
2
+ import path from 'node:path';
3
+
4
+ import BetterSqlite3 from 'better-sqlite3';
5
+
6
/** Instance type of the `better-sqlite3` database class; the handle type returned by `openDb`. */
+ export type SqliteDatabase = InstanceType<typeof BetterSqlite3>;
7
+
8
/**
 * Opens the SQLite database at `dbPath`, creating the parent directory first.
 *
 * The connection is switched to WAL journaling and has foreign-key
 * enforcement turned on before it is returned.
 *
 * @param dbPath - Filesystem path of the database file.
 * @returns The configured database handle.
 */
export function openDb(dbPath: string): SqliteDatabase {
  const parentDir = path.dirname(dbPath);
  fs.mkdirSync(parentDir, { recursive: true });

  const connection = new BetterSqlite3(dbPath);
  for (const setting of ['journal_mode = WAL', 'foreign_keys = ON']) {
    connection.pragma(setting);
  }
  return connection;
}
@@ -0,0 +1,25 @@
1
+ import test from 'node:test';
2
+ import assert from 'node:assert/strict';
3
+
4
+ import { buildCanonicalDocument, isBotLikeAuthor } from './normalize.js';
5
+
6
// Table-driven check of isBotLikeAuthor: a `[bot]` login suffix and a `Bot`
// author type are flagged, a plain login is not.
test('bot detection catches bot users and common automation', () => {
  const cases: Array<[Parameters<typeof isBotLikeAuthor>[0], boolean]> = [
    [{ authorLogin: 'dependabot[bot]' }, true],
    [{ authorType: 'Bot' }, true],
    [{ authorLogin: 'maintainer' }, false],
  ];

  for (const [author, expected] of cases) {
    assert.equal(isBotLikeAuthor(author), expected);
  }
});
11
+
12
// A thread with one human and one bot comment: the human comment must appear
// in rawText and the bot comment must be absent from dedupeText.
test('canonical document excludes bot comments from dedupe text', () => {
  const humanComment = { body: 'same failure on macOS', authorLogin: 'alice', authorType: 'User', isBot: false };
  const botComment = { body: 'automated reminder', authorLogin: 'github-actions[bot]', authorType: 'Bot', isBot: true };

  const { rawText, dedupeText } = buildCanonicalDocument({
    title: 'Downloader stalls',
    body: 'The transfer never finishes.',
    labels: ['bug'],
    comments: [humanComment, botComment],
  });

  assert.match(rawText, /same failure on macOS/);
  assert.doesNotMatch(dedupeText, /automated reminder/);
});
@@ -0,0 +1,52 @@
1
+ import crypto from 'node:crypto';
2
+
3
/**
 * A single comment as consumed by `buildCanonicalDocument` (through
 * `NormalizedThread.comments`). `authorLogin`, `authorType` and `isBot` are
 * the inputs `isBotLikeAuthor` looks at when deciding whether to drop the comment.
 */
+ export type NormalizedComment = {
4
+ body: string;
5
+ authorLogin: string | null;
6
+ authorType: string | null;
7
+ isBot: boolean;
8
+ };
9
+
10
/**
 * Input of `buildCanonicalDocument`: a thread's title, optional body
 * (`null` is treated as empty text there), label names and comments.
 */
+ export type NormalizedThread = {
11
+ title: string;
12
+ body: string | null;
13
+ labels: string[];
14
+ comments: NormalizedComment[];
15
+ };
16
+
17
/**
 * Collapses every run of whitespace (spaces, tabs, CR, LF, ...) into a single
 * space and strips leading/trailing whitespace.
 */
function normalizeWhitespace(value: string): string {
  // `\s` already matches `\r`, so carriage returns need no separate pass.
  const collapsed = value.replace(/\s+/g, ' ');
  return collapsed.trim();
}
20
+
21
/**
 * Heuristic bot detection for a comment or thread author.
 *
 * Returns `true` when any of the following holds:
 * - `isBot` is truthy,
 * - `authorType` equals `bot` (case-insensitive),
 * - the lower-cased login ends with `[bot]` or contains `renovate` / `dependabot`.
 *
 * Missing or `null` fields are treated as empty strings.
 */
export function isBotLikeAuthor(input: { authorType?: string | null; authorLogin?: string | null; isBot?: boolean }): boolean {
  const { isBot, authorType, authorLogin } = input;
  if (isBot) {
    return true;
  }

  const normalizedType = (authorType ?? '').toLowerCase();
  if (normalizedType === 'bot') {
    return true;
  }

  const normalizedLogin = (authorLogin ?? '').toLowerCase();
  if (normalizedLogin.endsWith('[bot]')) {
    return true;
  }
  return ['renovate', 'dependabot'].some((marker) => normalizedLogin.includes(marker));
}
27
+
28
/**
 * Builds the text representations and content hash for a thread.
 *
 * Comments whose author looks like a bot (see `isBotLikeAuthor`) are left out
 * of both texts. Title, body and comment bodies are whitespace-normalized.
 *
 * @param thread - Normalized thread data.
 * @returns
 * - `rawText`: title, body, label line and human comments joined by blank lines
 *   (empty parts skipped);
 * - `dedupeText`: the same content with `title:` / `body:` / `discussion:` prefixes;
 * - `contentHash`: hex SHA-256 of `rawText`, a `---` separator line and `dedupeText`.
 */
export function buildCanonicalDocument(thread: NormalizedThread): { rawText: string; dedupeText: string; contentHash: string } {
  const labels = thread.labels.length > 0 ? `labels: ${thread.labels.join(', ')}` : '';

  // Each kept comment is rendered as "<author>: <text>", so entries are never empty.
  const humanComments: string[] = [];
  for (const comment of thread.comments) {
    if (isBotLikeAuthor(comment)) continue;
    const author = comment.authorLogin ? `@${comment.authorLogin}` : 'unknown';
    humanComments.push(`${author}: ${normalizeWhitespace(comment.body)}`);
  }

  const title = normalizeWhitespace(thread.title);
  const body = normalizeWhitespace(thread.body ?? '');

  const rawText = [title, body, labels, ...humanComments].filter((part) => part.length > 0).join('\n\n');

  // The title section is always present, even when the title itself is empty.
  const dedupeSections: string[] = [`title: ${title}`];
  if (body) dedupeSections.push(`body: ${body}`);
  if (labels) dedupeSections.push(labels);
  if (humanComments.length > 0) dedupeSections.push(`discussion: ${humanComments.join('\n')}`);
  const dedupeText = dedupeSections.join('\n\n');

  const hasher = crypto.createHash('sha256');
  hasher.update(`${rawText}\n---\n${dedupeText}`);
  const contentHash = hasher.digest('hex');

  return { rawText, dedupeText, contentHash };
}
@@ -0,0 +1,241 @@
1
+ import { retry } from '@octokit/plugin-retry';
2
+ import { throttling } from '@octokit/plugin-throttling';
3
+ import { Octokit } from 'octokit';
4
+
5
/**
 * Surface of the GitHub client returned by `makeGitHubClient`.
 *
 * Every method accepts an optional `reporter` that receives progress/log
 * lines. Single-resource getters resolve to the response payload as an
 * untyped record; list methods resolve to an array of such records.
 * `listRepositoryIssues` additionally takes an optional `since` filter and an
 * optional `limit` on the number of returned items.
 */
+ export type GitHubClient = {
6
+ checkAuth: (reporter?: GitHubReporter) => Promise<void>;
7
+ getRepo: (owner: string, repo: string, reporter?: GitHubReporter) => Promise<Record<string, unknown>>;
8
+ listRepositoryIssues: (
9
+ owner: string,
10
+ repo: string,
11
+ since?: string,
12
+ limit?: number,
13
+ reporter?: GitHubReporter,
14
+ ) => Promise<Array<Record<string, unknown>>>;
15
+ getIssue: (owner: string, repo: string, number: number, reporter?: GitHubReporter) => Promise<Record<string, unknown>>;
16
+ getPull: (owner: string, repo: string, number: number, reporter?: GitHubReporter) => Promise<Record<string, unknown>>;
17
+ listIssueComments: (owner: string, repo: string, number: number, reporter?: GitHubReporter) => Promise<Array<Record<string, unknown>>>;
18
+ listPullReviews: (owner: string, repo: string, number: number, reporter?: GitHubReporter) => Promise<Array<Record<string, unknown>>>;
19
+ listPullReviewComments: (
20
+ owner: string,
21
+ repo: string,
22
+ number: number,
23
+ reporter?: GitHubReporter,
24
+ ) => Promise<Array<Record<string, unknown>>>;
25
+ };
26
+
27
/** Callback that receives human-readable progress lines (requests, fetched pages, rate-limit backoff). */
+ export type GitHubReporter = (message: string) => void;
28
+
29
/**
 * Error raised when a GitHub request or pagination fails.
 *
 * `status` carries the numeric status of the underlying failure when one was
 * available, and is `undefined` otherwise.
 */
export class GitHubRequestError extends Error {
  /** Numeric status of the failed request, if known. */
  readonly status?: number;

  /**
   * @param message - Description of the failure.
   * @param status - Optional numeric status taken from the underlying error.
   */
  constructor(message: string, status?: number) {
    super(message);
    // Override the default "Error" name so logs and `name` checks identify this class.
    this.name = 'GitHubRequestError';
    this.status = status;
  }
}
38
+
39
/**
 * Options accepted by `makeGitHubClient`.
 *
 * - `token`: credential passed to Octokit as `auth`.
 * - `userAgent`: defaults to `'ghcrawl'`.
 * - `timeoutMs`: per-request timeout, defaults to 30 000 ms.
 * - `pageDelayMs`: pause used while paginating, defaults to 5000 ms; rounded
 *   up to whole seconds it is also the fallback retry delay for secondary rate limits.
 */
+ type RequestOptions = {
40
+ token: string;
41
+ userAgent?: string;
42
+ timeoutMs?: number;
43
+ pageDelayMs?: number;
44
+ };
45
+
46
/** Minimal view of one page yielded by a pagination iterator: only the `data` array is read. */
+ type OctokitPage<T> = {
47
+ data: T[];
48
+ };
49
+
50
/** Returns a promise that resolves after roughly `ms` milliseconds. */
function delay(ms: number): Promise<void> {
  return new Promise<void>((done) => {
    setTimeout(done, ms);
  });
}
53
+
54
/**
 * Renders a millisecond duration as a short label.
 *
 * - below one second: `"<ms>ms"`
 * - below one minute: whole seconds, rounded up (`"12s"`)
 * - below one hour: `"<m>m"` or `"<m>m <s>s"`
 * - otherwise: `"<h>h"` or `"<h>h <m>m"` (seconds are dropped)
 */
function formatDuration(ms: number): string {
  if (ms < 1000) return `${ms}ms`;

  // Round up so a partially elapsed second is never under-reported.
  const totalSeconds = Math.ceil(ms / 1000);
  if (totalSeconds < 60) return `${totalSeconds}s`;

  const totalMinutes = Math.floor(totalSeconds / 60);
  if (totalMinutes < 60) {
    const secondsPart = totalSeconds % 60;
    return secondsPart === 0 ? `${totalMinutes}m` : `${totalMinutes}m ${secondsPart}s`;
  }

  const hours = Math.floor(totalMinutes / 60);
  const minutesPart = totalMinutes % 60;
  if (minutesPart === 0) {
    return `${hours}h`;
  }
  return `${hours}h ${minutesPart}m`;
}
65
+
66
/**
 * Converts a rate-limit reset value (epoch seconds, as a string) into an
 * ISO-8601 timestamp.
 *
 * @param resetSeconds - Raw header value; may be missing.
 * @returns The ISO timestamp, or `null` when the value is missing, not a
 *   positive finite number, or too large to be represented as a `Date`.
 */
function formatResetTime(resetSeconds: string | null | undefined): string | null {
  if (!resetSeconds) return null;
  const value = Number(resetSeconds);
  if (!Number.isFinite(value) || value <= 0) return null;

  const resetAt = new Date(value * 1000);
  // A finite but out-of-range value yields an Invalid Date, on which
  // toISOString() throws RangeError; this helper is only used for log output,
  // so report "unknown" instead of throwing.
  if (Number.isNaN(resetAt.getTime())) return null;
  return resetAt.toISOString();
}
72
+
73
/**
 * Creates a GitHub API client backed by Octokit with the retry and throttling
 * plugins enabled.
 *
 * Behavior shared by all methods:
 * - a fresh Octokit instance is created per call so that rate-limit backoff
 *   messages go to that call's `reporter`;
 * - 400/401/403/404/422 responses are not retried, other failures are retried
 *   up to 4 times by the retry plugin;
 * - primary and secondary rate limits are always waited out and retried;
 * - any failure is rethrown as `GitHubRequestError`, carrying the numeric
 *   `status` of the underlying error when present.
 *
 * List methods page through results 100 at a time and pause `pageDelayMs`
 * between consecutive page fetches.
 *
 * @param options - Token plus optional user agent, timeout and page delay.
 * @returns The client object.
 */
export function makeGitHubClient(options: RequestOptions): GitHubClient {
  const userAgent = options.userAgent ?? 'ghcrawl';
  const timeoutMs = options.timeoutMs ?? 30_000;
  const pageDelayMs = options.pageDelayMs ?? 5000;
  const BaseOctokit = Octokit.plugin(retry, throttling);
  type OctokitInstance = InstanceType<typeof BaseOctokit>;

  /** Builds an Octokit instance whose throttling callbacks log through `reporter`. */
  function createOctokit(reporter?: GitHubReporter) {
    return new BaseOctokit({
      auth: options.token,
      request: {
        timeout: timeoutMs,
      },
      userAgent,
      retry: {
        doNotRetry: [400, 401, 403, 404, 422],
        retries: 4,
      },
      throttle: {
        fallbackSecondaryRateRetryAfter: Math.ceil(pageDelayMs / 1000),
        onRateLimit: (retryAfter, requestOptions) => {
          const responseHeaders = (requestOptions.response as { headers?: Record<string, string> } | undefined)?.headers;
          const resetAt = formatResetTime(responseHeaders?.['x-ratelimit-reset']);
          const remaining = responseHeaders?.['x-ratelimit-remaining'];
          const method = requestOptions.method ?? 'GET';
          const url = requestOptions.url ?? 'unknown';
          reporter?.(
            `[github] backoff rate-limited wait=${formatDuration(retryAfter * 1000)}${remaining ? ` remaining=${remaining}` : ''}${resetAt ? ` reset_at=${resetAt}` : ''} method=${method} url=${url}`,
          );
          // Returning true tells the throttling plugin to retry after the wait.
          return true;
        },
        onSecondaryRateLimit: (retryAfter, requestOptions) => {
          const method = requestOptions.method ?? 'GET';
          const url = requestOptions.url ?? 'unknown';
          reporter?.(
            `[github] backoff secondary-rate-limit wait=${formatDuration(retryAfter * 1000)} method=${method} url=${url}`,
          );
          return true;
        },
      },
    });
  }

  /** Wraps an arbitrary thrown value in a GitHubRequestError, preserving a numeric `status`. */
  function toRequestError(what: string, label: string, error: unknown): GitHubRequestError {
    const message = error instanceof Error ? error.message : String(error);
    const rawStatus = (error as { status?: unknown } | null | undefined)?.status;
    const status = typeof rawStatus === 'number' ? rawStatus : undefined;
    return new GitHubRequestError(`${what} for ${label}: ${message}`, status);
  }

  /**
   * Reports whether a fetched page advertises a following page through its
   * `Link` response header (`rel="next"`). Pages without headers count as last.
   */
  function hasNextPage(page: unknown): boolean {
    const link = (page as { headers?: { link?: unknown } } | null | undefined)?.headers?.link;
    return typeof link === 'string' && /<[^<>]+>;\s*rel="next"/.test(link);
  }

  /** Runs a single request, logging it and normalizing failures. */
  async function request<T>(label: string, reporter: GitHubReporter | undefined, fn: (octokit: OctokitInstance) => Promise<T>): Promise<T> {
    reporter?.(`[github] request ${label}`);
    const octokit = createOctokit(reporter);
    try {
      return await fn(octokit);
    } catch (error) {
      throw toRequestError('GitHub request failed', label, error);
    }
  }

  /**
   * Collects items from a paginated endpoint.
   *
   * Stops as soon as `limit` items have been gathered (when a limit is given)
   * and never returns more than `limit` items.
   */
  async function paginate<T>(
    label: string,
    limit: number | undefined,
    reporter: GitHubReporter | undefined,
    iteratorFactory: (octokit: OctokitInstance) => AsyncIterable<OctokitPage<T>>,
  ): Promise<T[]> {
    reporter?.(`[github] request ${label}`);
    const octokit = createOctokit(reporter);
    const out: T[] = [];

    try {
      let pageIndex = 0;
      for await (const page of iteratorFactory(octokit)) {
        pageIndex += 1;
        const remaining = typeof limit === 'number' ? Math.max(limit - out.length, 0) : page.data.length;
        out.push(...page.data.slice(0, remaining));
        reporter?.(`[github] page ${pageIndex} fetched count=${page.data.length} accumulated=${out.length}`);
        if (typeof limit === 'number' && out.length >= limit) {
          break;
        }
        // Only pause when another page will actually be requested; sleeping
        // after the final page would add `pageDelayMs` to every listing,
        // including the common single-page case.
        if (hasNextPage(page)) {
          await delay(pageDelayMs);
        }
      }
      return out;
    } catch (error) {
      throw toRequestError('GitHub pagination failed', label, error);
    }
  }

  return {
    // Uses the rate-limit endpoint as a cheap authenticated probe.
    async checkAuth(reporter) {
      await request('GET /rate_limit', reporter, async (octokit) => {
        await octokit.request('GET /rate_limit');
      });
    },
    async getRepo(owner, repo, reporter) {
      return request(`GET /repos/${owner}/${repo}`, reporter, async (octokit) => {
        const response = await octokit.rest.repos.get({ owner, repo });
        return response.data as Record<string, unknown>;
      });
    },
    // Open issues only, most recently updated first, optionally filtered by `since`.
    async listRepositoryIssues(owner, repo, since, limit, reporter) {
      return paginate(
        `GET /repos/${owner}/${repo}/issues state=open per_page=100`,
        limit,
        reporter,
        (octokit) =>
          octokit.paginate.iterator(octokit.rest.issues.listForRepo, {
            owner,
            repo,
            state: 'open',
            sort: 'updated',
            direction: 'desc',
            per_page: 100,
            since,
          }) as AsyncIterable<OctokitPage<Record<string, unknown>>>,
      );
    },
    async getIssue(owner, repo, number, reporter) {
      return request(`GET /repos/${owner}/${repo}/issues/${number}`, reporter, async (octokit) => {
        const response = await octokit.rest.issues.get({ owner, repo, issue_number: number });
        return response.data as Record<string, unknown>;
      });
    },
    async getPull(owner, repo, number, reporter) {
      return request(`GET /repos/${owner}/${repo}/pulls/${number}`, reporter, async (octokit) => {
        const response = await octokit.rest.pulls.get({ owner, repo, pull_number: number });
        return response.data as Record<string, unknown>;
      });
    },
    async listIssueComments(owner, repo, number, reporter) {
      return paginate(
        `GET /repos/${owner}/${repo}/issues/${number}/comments per_page=100`,
        undefined,
        reporter,
        (octokit) =>
          octokit.paginate.iterator(octokit.rest.issues.listComments, {
            owner,
            repo,
            issue_number: number,
            per_page: 100,
          }) as AsyncIterable<OctokitPage<Record<string, unknown>>>,
      );
    },
    async listPullReviews(owner, repo, number, reporter) {
      return paginate(
        `GET /repos/${owner}/${repo}/pulls/${number}/reviews per_page=100`,
        undefined,
        reporter,
        (octokit) =>
          octokit.paginate.iterator(octokit.rest.pulls.listReviews, {
            owner,
            repo,
            pull_number: number,
            per_page: 100,
          }) as AsyncIterable<OctokitPage<Record<string, unknown>>>,
      );
    },
    async listPullReviewComments(owner, repo, number, reporter) {
      return paginate(
        `GET /repos/${owner}/${repo}/pulls/${number}/comments per_page=100`,
        undefined,
        reporter,
        (octokit) =>
          octokit.paginate.iterator(octokit.rest.pulls.listReviewComments, {
            owner,
            repo,
            pull_number: number,
            per_page: 100,
          }) as AsyncIterable<OctokitPage<Record<string, unknown>>>,
      );
    },
  };
}
package/src/index.ts ADDED
@@ -0,0 +1,6 @@
1
+ export * from './api/server.js';
2
+ export * from './config.js';
3
+ export * from './documents/normalize.js';
4
+ export * from './search/exact.js';
5
+ export * from './cluster/build.js';
6
+ export * from './service.js';