@ghcrawl/api-core 0.1.0 → 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (63)
  1. package/README.md +25 -0
  2. package/dist/api/server.d.ts +4 -0
  3. package/dist/api/server.d.ts.map +1 -0
  4. package/dist/api/server.js +142 -0
  5. package/dist/api/server.js.map +1 -0
  6. package/dist/cluster/build.d.ts +16 -0
  7. package/dist/cluster/build.d.ts.map +1 -0
  8. package/dist/cluster/build.js +62 -0
  9. package/dist/cluster/build.js.map +1 -0
  10. package/dist/config.d.ts +83 -0
  11. package/dist/config.d.ts.map +1 -0
  12. package/dist/config.js +257 -0
  13. package/dist/config.js.map +1 -0
  14. package/dist/db/migrate.d.ts +3 -0
  15. package/dist/db/migrate.d.ts.map +1 -0
  16. package/{src/db/migrate.ts → dist/db/migrate.js} +30 -36
  17. package/dist/db/migrate.js.map +1 -0
  18. package/dist/db/sqlite.d.ts +4 -0
  19. package/dist/db/sqlite.d.ts.map +1 -0
  20. package/dist/db/sqlite.js +11 -0
  21. package/dist/db/sqlite.js.map +1 -0
  22. package/dist/documents/normalize.d.ts +23 -0
  23. package/dist/documents/normalize.d.ts.map +1 -0
  24. package/dist/documents/normalize.js +36 -0
  25. package/dist/documents/normalize.js.map +1 -0
  26. package/dist/github/client.d.ts +24 -0
  27. package/dist/github/client.d.ts.map +1 -0
  28. package/dist/github/client.js +170 -0
  29. package/dist/github/client.js.map +1 -0
  30. package/dist/index.d.ts +7 -0
  31. package/dist/index.d.ts.map +1 -0
  32. package/{src/index.ts → dist/index.js} +1 -0
  33. package/dist/index.js.map +1 -0
  34. package/dist/openai/provider.d.ts +44 -0
  35. package/dist/openai/provider.d.ts.map +1 -0
  36. package/dist/openai/provider.js +107 -0
  37. package/dist/openai/provider.js.map +1 -0
  38. package/dist/search/exact.d.ts +14 -0
  39. package/dist/search/exact.d.ts.map +1 -0
  40. package/dist/search/exact.js +26 -0
  41. package/dist/search/exact.js.map +1 -0
  42. package/dist/service.d.ts +247 -0
  43. package/dist/service.d.ts.map +1 -0
  44. package/dist/service.js +1735 -0
  45. package/dist/service.js.map +1 -0
  46. package/package.json +6 -5
  47. package/src/api/server.test.ts +0 -296
  48. package/src/api/server.ts +0 -171
  49. package/src/cluster/build.test.ts +0 -18
  50. package/src/cluster/build.ts +0 -74
  51. package/src/config.test.ts +0 -247
  52. package/src/config.ts +0 -421
  53. package/src/db/migrate.test.ts +0 -30
  54. package/src/db/sqlite.ts +0 -14
  55. package/src/documents/normalize.test.ts +0 -25
  56. package/src/documents/normalize.ts +0 -52
  57. package/src/github/client.ts +0 -241
  58. package/src/openai/provider.ts +0 -141
  59. package/src/search/exact.test.ts +0 -22
  60. package/src/search/exact.ts +0 -28
  61. package/src/service.test.ts +0 -2036
  62. package/src/service.ts +0 -2497
  63. package/src/types/better-sqlite3.d.ts +0 -1
package/src/config.ts DELETED
@@ -1,421 +0,0 @@
1
- import fs from 'node:fs';
2
- import os from 'node:os';
3
- import path from 'node:path';
4
-
5
- import dotenv from 'dotenv';
6
-
7
- export type ConfigValueSource = 'env' | 'config' | 'dotenv' | 'default' | 'none';
8
- export type SecretProvider = 'plaintext' | 'op';
9
- export type TuiSortPreference = 'recent' | 'size';
10
- export type TuiMinClusterSize = 0 | 1 | 10 | 20 | 50;
11
-
12
- export type TuiRepositoryPreference = {
13
- minClusterSize: TuiMinClusterSize;
14
- sortMode: TuiSortPreference;
15
- };
16
-
17
- export type PersistedGitcrawlConfig = {
18
- githubToken?: string;
19
- openaiApiKey?: string;
20
- secretProvider?: SecretProvider;
21
- opVaultName?: string;
22
- opItemName?: string;
23
- dbPath?: string;
24
- apiPort?: number;
25
- summaryModel?: string;
26
- embedModel?: string;
27
- embedBatchSize?: number;
28
- embedConcurrency?: number;
29
- embedMaxUnread?: number;
30
- openSearchUrl?: string;
31
- openSearchIndex?: string;
32
- tuiPreferences?: Record<string, TuiRepositoryPreference>;
33
- };
34
-
35
- export type GitcrawlConfig = {
36
- workspaceRoot: string;
37
- configDir: string;
38
- configPath: string;
39
- configFileExists: boolean;
40
- dbPath: string;
41
- dbPathSource: ConfigValueSource;
42
- apiPort: number;
43
- githubToken?: string;
44
- githubTokenSource: ConfigValueSource;
45
- openaiApiKey?: string;
46
- openaiApiKeySource: ConfigValueSource;
47
- secretProvider: SecretProvider;
48
- opVaultName?: string;
49
- opItemName?: string;
50
- summaryModel: string;
51
- embedModel: string;
52
- embedBatchSize: number;
53
- embedConcurrency: number;
54
- embedMaxUnread: number;
55
- openSearchUrl?: string;
56
- openSearchIndex: string;
57
- tuiPreferences: Record<string, TuiRepositoryPreference>;
58
- };
59
-
60
- type LoadedStoredConfig = {
61
- configDir: string;
62
- configPath: string;
63
- exists: boolean;
64
- data: PersistedGitcrawlConfig;
65
- };
66
-
67
- type LoadConfigOptions = {
68
- cwd?: string;
69
- env?: NodeJS.ProcessEnv;
70
- platform?: NodeJS.Platform;
71
- };
72
-
73
- type LayeredValue<T> = {
74
- source: ConfigValueSource;
75
- value: T | undefined;
76
- };
77
-
78
- function pathModuleForPlatform(platform: NodeJS.Platform) {
79
- return platform === 'win32' ? path.win32 : path;
80
- }
81
-
82
- function findWorkspaceRoot(start: string): string {
83
- let current = path.resolve(start);
84
- while (true) {
85
- if (fs.existsSync(path.join(current, 'pnpm-workspace.yaml'))) {
86
- return current;
87
- }
88
- const parent = path.dirname(current);
89
- if (parent === current) return path.resolve(start);
90
- current = parent;
91
- }
92
- }
93
-
94
- function resolveHomeDirectory(env: NodeJS.ProcessEnv): string {
95
- const home = env.HOME ?? env.USERPROFILE ?? os.homedir();
96
- return path.resolve(home);
97
- }
98
-
99
- export function getConfigDir(options: LoadConfigOptions = {}): string {
100
- const env = options.env ?? process.env;
101
- const platform = options.platform ?? process.platform;
102
- const pathModule = pathModuleForPlatform(platform);
103
- if (env.XDG_CONFIG_HOME) {
104
- return pathModule.resolve(env.XDG_CONFIG_HOME, 'ghcrawl');
105
- }
106
- if (platform === 'win32' && env.APPDATA) {
107
- return pathModule.resolve(env.APPDATA, 'ghcrawl');
108
- }
109
- return pathModule.join(resolveHomeDirectory(env), '.config', 'ghcrawl');
110
- }
111
-
112
- export function getConfigPath(options: LoadConfigOptions = {}): string {
113
- const platform = options.platform ?? process.platform;
114
- const pathModule = pathModuleForPlatform(platform);
115
- return pathModule.join(getConfigDir(options), 'config.json');
116
- }
117
-
118
- function readDotenvFile(workspaceRoot: string): Record<string, string> {
119
- const dotenvPath = path.join(workspaceRoot, '.env.local');
120
- if (!fs.existsSync(dotenvPath)) {
121
- return {};
122
- }
123
- return dotenv.parse(fs.readFileSync(dotenvPath, 'utf8'));
124
- }
125
-
126
- function pickDefined<T>(...values: Array<LayeredValue<T>>): LayeredValue<T> {
127
- for (const entry of values) {
128
- if (entry.value !== undefined && entry.value !== null) {
129
- return entry;
130
- }
131
- }
132
- return { source: 'none', value: undefined };
133
- }
134
-
135
- function getString(value: unknown): string | undefined {
136
- return typeof value === 'string' && value.trim().length > 0 ? value : undefined;
137
- }
138
-
139
- function getEnvString(env: NodeJS.ProcessEnv, primary: string, legacy?: string): string | undefined {
140
- return getString(env[primary]) ?? (legacy ? getString(env[legacy]) : undefined);
141
- }
142
-
143
- function getDotenvString(values: Record<string, string>, primary: string, legacy?: string): string | undefined {
144
- return getString(values[primary]) ?? (legacy ? getString(values[legacy]) : undefined);
145
- }
146
-
147
- function getNumber(value: unknown): number | undefined {
148
- return typeof value === 'number' && Number.isFinite(value) ? value : undefined;
149
- }
150
-
151
- function getSecretProvider(value: unknown): SecretProvider | undefined {
152
- return value === 'plaintext' || value === 'op' ? value : undefined;
153
- }
154
-
155
- function getTuiSortPreference(value: unknown): TuiSortPreference | undefined {
156
- return value === 'recent' || value === 'size' ? value : undefined;
157
- }
158
-
159
- function getTuiMinClusterSize(value: unknown): TuiMinClusterSize | undefined {
160
- return value === 0 || value === 1 || value === 10 || value === 20 || value === 50 ? value : undefined;
161
- }
162
-
163
- function getTuiPreferences(value: unknown): Record<string, TuiRepositoryPreference> | undefined {
164
- if (!value || typeof value !== 'object') {
165
- return undefined;
166
- }
167
-
168
- const preferences: Record<string, TuiRepositoryPreference> = {};
169
- for (const [fullName, preference] of Object.entries(value as Record<string, unknown>)) {
170
- if (!preference || typeof preference !== 'object') {
171
- continue;
172
- }
173
- const record = preference as Record<string, unknown>;
174
- const minClusterSize = getTuiMinClusterSize(record.minClusterSize);
175
- const sortMode = getTuiSortPreference(record.sortMode);
176
- if (minClusterSize === undefined || sortMode === undefined) {
177
- continue;
178
- }
179
- preferences[fullName] = { minClusterSize, sortMode };
180
- }
181
-
182
- return preferences;
183
- }
184
-
185
- export function readPersistedConfig(options: LoadConfigOptions = {}): LoadedStoredConfig {
186
- const configDir = getConfigDir(options);
187
- const configPath = getConfigPath(options);
188
- if (!fs.existsSync(configPath)) {
189
- return { configDir, configPath, exists: false, data: {} };
190
- }
191
-
192
- const raw = JSON.parse(fs.readFileSync(configPath, 'utf8')) as Record<string, unknown>;
193
- return {
194
- configDir,
195
- configPath,
196
- exists: true,
197
- data: {
198
- githubToken: getString(raw.githubToken),
199
- openaiApiKey: getString(raw.openaiApiKey),
200
- secretProvider: getSecretProvider(raw.secretProvider),
201
- opVaultName: getString(raw.opVaultName),
202
- opItemName: getString(raw.opItemName),
203
- dbPath: getString(raw.dbPath),
204
- apiPort: getNumber(raw.apiPort),
205
- summaryModel: getString(raw.summaryModel),
206
- embedModel: getString(raw.embedModel),
207
- embedBatchSize: getNumber(raw.embedBatchSize),
208
- embedConcurrency: getNumber(raw.embedConcurrency),
209
- embedMaxUnread: getNumber(raw.embedMaxUnread),
210
- openSearchUrl: getString(raw.openSearchUrl),
211
- openSearchIndex: getString(raw.openSearchIndex),
212
- tuiPreferences: getTuiPreferences(raw.tuiPreferences),
213
- },
214
- };
215
- }
216
-
217
- export function writePersistedConfig(values: PersistedGitcrawlConfig, options: LoadConfigOptions = {}): { configPath: string } {
218
- const current = readPersistedConfig(options);
219
- fs.mkdirSync(current.configDir, { recursive: true });
220
- const next = {
221
- ...current.data,
222
- ...values,
223
- };
224
- fs.writeFileSync(current.configPath, `${JSON.stringify(next, null, 2)}\n`, { mode: 0o600 });
225
- return { configPath: current.configPath };
226
- }
227
-
228
- function resolveConfiguredPath(configDir: string, value: string): string {
229
- return path.isAbsolute(value) ? value : path.resolve(configDir, value);
230
- }
231
-
232
- function getWorkspaceDbPath(workspaceRoot: string): string | null {
233
- const workspacePath = path.join(workspaceRoot, 'data', 'ghcrawl.db');
234
- return fs.existsSync(workspacePath) ? workspacePath : null;
235
- }
236
-
237
- function parseIntegerSetting(name: string, raw: string): number {
238
- const parsed = Number(raw);
239
- if (!Number.isSafeInteger(parsed) || parsed <= 0) {
240
- throw new Error(`Invalid ${name}: ${raw}`);
241
- }
242
- return parsed;
243
- }
244
-
245
- export function isLikelyGitHubToken(value: string): boolean {
246
- return /^(gh[pousr]_[A-Za-z0-9_]+|github_pat_[A-Za-z0-9_]+)$/.test(value.trim());
247
- }
248
-
249
- export function isLikelyOpenAiApiKey(value: string): boolean {
250
- return /^sk-[A-Za-z0-9._-]+$/.test(value.trim());
251
- }
252
-
253
- export function loadConfig(options: LoadConfigOptions = {}): GitcrawlConfig {
254
- const cwd = options.cwd ?? process.cwd();
255
- const env = options.env ?? process.env;
256
- const platform = options.platform ?? process.platform;
257
- const workspaceRoot = findWorkspaceRoot(cwd);
258
- const stored = readPersistedConfig({ cwd, env, platform });
259
- const dotenvValues = readDotenvFile(workspaceRoot);
260
-
261
- const githubToken = pickDefined<string>(
262
- { source: 'env', value: getString(env.GITHUB_TOKEN) },
263
- { source: 'config', value: stored.data.githubToken },
264
- { source: 'dotenv', value: getString(dotenvValues.GITHUB_TOKEN) },
265
- );
266
- const openaiApiKey = pickDefined<string>(
267
- { source: 'env', value: getString(env.OPENAI_API_KEY) },
268
- { source: 'config', value: stored.data.openaiApiKey },
269
- { source: 'dotenv', value: getString(dotenvValues.OPENAI_API_KEY) },
270
- );
271
- const configuredDbPath = pickDefined<string>(
272
- { source: 'env', value: getEnvString(env, 'GHCRAWL_DB_PATH', 'GHCRAWL_DB_PATH') },
273
- { source: 'config', value: stored.data.dbPath },
274
- { source: 'dotenv', value: getDotenvString(dotenvValues, 'GHCRAWL_DB_PATH', 'GHCRAWL_DB_PATH') },
275
- );
276
- const workspaceDbPath = configuredDbPath.value === undefined ? getWorkspaceDbPath(workspaceRoot) : null;
277
- const dbPathValue =
278
- workspaceDbPath !== null
279
- ? { source: 'default' as const, value: workspaceDbPath }
280
- : pickDefined<string>(configuredDbPath, { source: 'default', value: 'ghcrawl.db' });
281
- const apiPortValue = pickDefined<string | number>(
282
- { source: 'env', value: getEnvString(env, 'GHCRAWL_API_PORT', 'GHCRAWL_API_PORT') },
283
- { source: 'config', value: stored.data.apiPort },
284
- { source: 'dotenv', value: getDotenvString(dotenvValues, 'GHCRAWL_API_PORT', 'GHCRAWL_API_PORT') },
285
- { source: 'default', value: '5179' },
286
- );
287
- const embedBatchSizeValue = pickDefined<string | number>(
288
- { source: 'env', value: getEnvString(env, 'GHCRAWL_EMBED_BATCH_SIZE', 'GHCRAWL_EMBED_BATCH_SIZE') },
289
- { source: 'config', value: stored.data.embedBatchSize },
290
- { source: 'dotenv', value: getDotenvString(dotenvValues, 'GHCRAWL_EMBED_BATCH_SIZE', 'GHCRAWL_EMBED_BATCH_SIZE') },
291
- { source: 'default', value: '8' },
292
- );
293
- const embedConcurrencyValue = pickDefined<string | number>(
294
- { source: 'env', value: getEnvString(env, 'GHCRAWL_EMBED_CONCURRENCY', 'GHCRAWL_EMBED_CONCURRENCY') },
295
- { source: 'config', value: stored.data.embedConcurrency },
296
- { source: 'dotenv', value: getDotenvString(dotenvValues, 'GHCRAWL_EMBED_CONCURRENCY', 'GHCRAWL_EMBED_CONCURRENCY') },
297
- { source: 'default', value: '10' },
298
- );
299
- const embedMaxUnreadValue = pickDefined<string | number>(
300
- { source: 'env', value: getEnvString(env, 'GHCRAWL_EMBED_MAX_UNREAD', 'GHCRAWL_EMBED_MAX_UNREAD') },
301
- { source: 'config', value: stored.data.embedMaxUnread },
302
- { source: 'dotenv', value: getDotenvString(dotenvValues, 'GHCRAWL_EMBED_MAX_UNREAD', 'GHCRAWL_EMBED_MAX_UNREAD') },
303
- { source: 'default', value: '20' },
304
- );
305
- const summaryModel = pickDefined<string>(
306
- { source: 'env', value: getEnvString(env, 'GHCRAWL_SUMMARY_MODEL', 'GHCRAWL_SUMMARY_MODEL') },
307
- { source: 'config', value: stored.data.summaryModel },
308
- { source: 'dotenv', value: getDotenvString(dotenvValues, 'GHCRAWL_SUMMARY_MODEL', 'GHCRAWL_SUMMARY_MODEL') },
309
- { source: 'default', value: 'gpt-5-mini' },
310
- );
311
- const embedModel = pickDefined<string>(
312
- { source: 'env', value: getEnvString(env, 'GHCRAWL_EMBED_MODEL', 'GHCRAWL_EMBED_MODEL') },
313
- { source: 'config', value: stored.data.embedModel },
314
- { source: 'dotenv', value: getDotenvString(dotenvValues, 'GHCRAWL_EMBED_MODEL', 'GHCRAWL_EMBED_MODEL') },
315
- { source: 'default', value: 'text-embedding-3-large' },
316
- );
317
- const openSearchUrl = pickDefined<string>(
318
- { source: 'env', value: getEnvString(env, 'GHCRAWL_OPENSEARCH_URL', 'GHCRAWL_OPENSEARCH_URL') },
319
- { source: 'config', value: stored.data.openSearchUrl },
320
- { source: 'dotenv', value: getDotenvString(dotenvValues, 'GHCRAWL_OPENSEARCH_URL', 'GHCRAWL_OPENSEARCH_URL') },
321
- );
322
- const openSearchIndex = pickDefined<string>(
323
- { source: 'env', value: getEnvString(env, 'GHCRAWL_OPENSEARCH_INDEX', 'GHCRAWL_OPENSEARCH_INDEX') },
324
- { source: 'config', value: stored.data.openSearchIndex },
325
- { source: 'dotenv', value: getDotenvString(dotenvValues, 'GHCRAWL_OPENSEARCH_INDEX', 'GHCRAWL_OPENSEARCH_INDEX') },
326
- { source: 'default', value: 'ghcrawl-threads' },
327
- );
328
-
329
- const dbPath =
330
- dbPathValue.value && path.isAbsolute(dbPathValue.value)
331
- ? dbPathValue.value
332
- : resolveConfiguredPath(stored.configDir, dbPathValue.value ?? 'ghcrawl.db');
333
- const apiPort = parseIntegerSetting('GHCRAWL_API_PORT', String(apiPortValue.value ?? '5179'));
334
- const embedBatchSize = parseIntegerSetting('GHCRAWL_EMBED_BATCH_SIZE', String(embedBatchSizeValue.value ?? '8'));
335
- const embedConcurrency = parseIntegerSetting('GHCRAWL_EMBED_CONCURRENCY', String(embedConcurrencyValue.value ?? '10'));
336
- const embedMaxUnread = parseIntegerSetting('GHCRAWL_EMBED_MAX_UNREAD', String(embedMaxUnreadValue.value ?? '20'));
337
-
338
- return {
339
- workspaceRoot,
340
- configDir: stored.configDir,
341
- configPath: stored.configPath,
342
- configFileExists: stored.exists,
343
- dbPath,
344
- dbPathSource: dbPathValue.source,
345
- apiPort,
346
- githubToken: githubToken.value,
347
- githubTokenSource: githubToken.source,
348
- openaiApiKey: openaiApiKey.value,
349
- openaiApiKeySource: openaiApiKey.source,
350
- secretProvider: stored.data.secretProvider ?? 'plaintext',
351
- opVaultName: stored.data.opVaultName,
352
- opItemName: stored.data.opItemName,
353
- summaryModel: summaryModel.value ?? 'gpt-5-mini',
354
- embedModel: embedModel.value ?? 'text-embedding-3-large',
355
- embedBatchSize,
356
- embedConcurrency,
357
- embedMaxUnread,
358
- openSearchUrl: openSearchUrl.value,
359
- openSearchIndex: openSearchIndex.value ?? 'ghcrawl-threads',
360
- tuiPreferences: stored.data.tuiPreferences ?? {},
361
- };
362
- }
363
-
364
- export function ensureRuntimeDirs(config: GitcrawlConfig): void {
365
- fs.mkdirSync(config.configDir, { recursive: true });
366
- fs.mkdirSync(path.dirname(config.dbPath), { recursive: true });
367
- }
368
-
369
- export function getTuiRepositoryPreference(config: GitcrawlConfig, owner: string, repo: string): TuiRepositoryPreference {
370
- return config.tuiPreferences[`${owner}/${repo}`] ?? { minClusterSize: 10, sortMode: 'recent' };
371
- }
372
-
373
- export function writeTuiRepositoryPreference(
374
- config: GitcrawlConfig,
375
- params: { owner: string; repo: string; minClusterSize: TuiMinClusterSize; sortMode: TuiSortPreference },
376
- ): { configPath: string } {
377
- const fullName = `${params.owner}/${params.repo}`;
378
- const nextPreferences = {
379
- ...config.tuiPreferences,
380
- [fullName]: {
381
- minClusterSize: params.minClusterSize,
382
- sortMode: params.sortMode,
383
- },
384
- };
385
- config.tuiPreferences = nextPreferences;
386
- const next = fs.existsSync(config.configPath)
387
- ? ({
388
- ...(JSON.parse(fs.readFileSync(config.configPath, 'utf8')) as PersistedGitcrawlConfig),
389
- tuiPreferences: nextPreferences,
390
- } satisfies PersistedGitcrawlConfig)
391
- : ({
392
- tuiPreferences: nextPreferences,
393
- } satisfies PersistedGitcrawlConfig);
394
- fs.mkdirSync(config.configDir, { recursive: true });
395
- fs.writeFileSync(config.configPath, `${JSON.stringify(next, null, 2)}\n`, { mode: 0o600 });
396
- return { configPath: config.configPath };
397
- }
398
-
399
- export function requireGithubToken(config: GitcrawlConfig): string {
400
- if (!config.githubToken) {
401
- if (config.secretProvider === 'op' && config.opVaultName && config.opItemName) {
402
- throw new Error(
403
- `Missing GitHub token in the environment. This config is set to use 1Password CLI via ${config.opVaultName}/${config.opItemName}; run ghcrawl through your op wrapper or set GITHUB_TOKEN. Expected config at ${config.configPath}`,
404
- );
405
- }
406
- throw new Error(`Missing GitHub token. Run ghcrawl init or set GITHUB_TOKEN. Expected config at ${config.configPath}`);
407
- }
408
- return config.githubToken;
409
- }
410
-
411
- export function requireOpenAiKey(config: GitcrawlConfig): string {
412
- if (!config.openaiApiKey) {
413
- if (config.secretProvider === 'op' && config.opVaultName && config.opItemName) {
414
- throw new Error(
415
- `Missing OpenAI API key in the environment. This config is set to use 1Password CLI via ${config.opVaultName}/${config.opItemName}; run ghcrawl through your op wrapper or set OPENAI_API_KEY. Expected config at ${config.configPath}`,
416
- );
417
- }
418
- throw new Error(`Missing OpenAI API key. Run ghcrawl init or set OPENAI_API_KEY. Expected config at ${config.configPath}`);
419
- }
420
- return config.openaiApiKey;
421
- }
@@ -1,30 +0,0 @@
1
- import test from 'node:test';
2
- import assert from 'node:assert/strict';
3
-
4
- import { migrate } from './migrate.js';
5
- import { openDb } from './sqlite.js';
6
-
7
- test('migrate creates core tables', () => {
8
- const db = openDb(':memory:');
9
- try {
10
- migrate(db);
11
- const rows = db
12
- .prepare("select name from sqlite_master where type in ('table', 'view') order by name asc")
13
- .all() as Array<{ name: string }>;
14
- const names = rows.map((row) => row.name);
15
-
16
- assert.ok(names.includes('repositories'));
17
- assert.ok(names.includes('threads'));
18
- assert.ok(names.includes('documents'));
19
- assert.ok(names.includes('document_embeddings'));
20
- assert.ok(names.includes('cluster_runs'));
21
- assert.ok(names.includes('repo_sync_state'));
22
-
23
- const threadColumns = db.prepare('pragma table_info(threads)').all() as Array<{ name: string }>;
24
- const threadColumnNames = threadColumns.map((column) => column.name);
25
- assert.ok(threadColumnNames.includes('first_pulled_at'));
26
- assert.ok(threadColumnNames.includes('last_pulled_at'));
27
- } finally {
28
- db.close();
29
- }
30
- });
package/src/db/sqlite.ts DELETED
@@ -1,14 +0,0 @@
1
- import fs from 'node:fs';
2
- import path from 'node:path';
3
-
4
- import BetterSqlite3 from 'better-sqlite3';
5
-
6
- export type SqliteDatabase = InstanceType<typeof BetterSqlite3>;
7
-
8
- export function openDb(dbPath: string): SqliteDatabase {
9
- fs.mkdirSync(path.dirname(dbPath), { recursive: true });
10
- const db = new BetterSqlite3(dbPath);
11
- db.pragma('journal_mode = WAL');
12
- db.pragma('foreign_keys = ON');
13
- return db;
14
- }
@@ -1,25 +0,0 @@
1
- import test from 'node:test';
2
- import assert from 'node:assert/strict';
3
-
4
- import { buildCanonicalDocument, isBotLikeAuthor } from './normalize.js';
5
-
6
- test('bot detection catches bot users and common automation', () => {
7
- assert.equal(isBotLikeAuthor({ authorLogin: 'dependabot[bot]' }), true);
8
- assert.equal(isBotLikeAuthor({ authorType: 'Bot' }), true);
9
- assert.equal(isBotLikeAuthor({ authorLogin: 'maintainer' }), false);
10
- });
11
-
12
- test('canonical document excludes bot comments from dedupe text', () => {
13
- const document = buildCanonicalDocument({
14
- title: 'Downloader stalls',
15
- body: 'The transfer never finishes.',
16
- labels: ['bug'],
17
- comments: [
18
- { body: 'same failure on macOS', authorLogin: 'alice', authorType: 'User', isBot: false },
19
- { body: 'automated reminder', authorLogin: 'github-actions[bot]', authorType: 'Bot', isBot: true },
20
- ],
21
- });
22
-
23
- assert.match(document.rawText, /same failure on macOS/);
24
- assert.doesNotMatch(document.dedupeText, /automated reminder/);
25
- });
@@ -1,52 +0,0 @@
1
- import crypto from 'node:crypto';
2
-
3
- export type NormalizedComment = {
4
- body: string;
5
- authorLogin: string | null;
6
- authorType: string | null;
7
- isBot: boolean;
8
- };
9
-
10
- export type NormalizedThread = {
11
- title: string;
12
- body: string | null;
13
- labels: string[];
14
- comments: NormalizedComment[];
15
- };
16
-
17
- function normalizeWhitespace(value: string): string {
18
- return value.replace(/\r/g, '\n').replace(/\s+/g, ' ').trim();
19
- }
20
-
21
- export function isBotLikeAuthor(input: { authorType?: string | null; authorLogin?: string | null; isBot?: boolean }): boolean {
22
- if (input.isBot) return true;
23
- if ((input.authorType ?? '').toLowerCase() === 'bot') return true;
24
- const login = (input.authorLogin ?? '').toLowerCase();
25
- return login.endsWith('[bot]') || login.includes('renovate') || login.includes('dependabot');
26
- }
27
-
28
- export function buildCanonicalDocument(thread: NormalizedThread): { rawText: string; dedupeText: string; contentHash: string } {
29
- const labels = thread.labels.length > 0 ? `labels: ${thread.labels.join(', ')}` : '';
30
- const humanComments = thread.comments
31
- .filter((comment) => !isBotLikeAuthor(comment))
32
- .map((comment) => {
33
- const author = comment.authorLogin ? `@${comment.authorLogin}` : 'unknown';
34
- return `${author}: ${normalizeWhitespace(comment.body)}`;
35
- })
36
- .filter(Boolean);
37
-
38
- const title = normalizeWhitespace(thread.title);
39
- const body = normalizeWhitespace(thread.body ?? '');
40
- const rawParts = [title, body, labels, ...humanComments].filter(Boolean);
41
- const dedupeParts = [
42
- `title: ${title}`,
43
- body ? `body: ${body}` : '',
44
- labels ? labels : '',
45
- humanComments.length > 0 ? `discussion: ${humanComments.join('\n')}` : '',
46
- ].filter(Boolean);
47
-
48
- const rawText = rawParts.join('\n\n');
49
- const dedupeText = dedupeParts.join('\n\n');
50
- const contentHash = crypto.createHash('sha256').update(`${rawText}\n---\n${dedupeText}`).digest('hex');
51
- return { rawText, dedupeText, contentHash };
52
- }