@bodhi-ventures/aiocs 0.1.2 → 0.3.0

This diff shows the content of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between these versions as they appear in the public registry.
@@ -1,4601 +0,0 @@
1
- // src/errors.ts
2
- var AIOCS_ERROR_CODES = {
3
- invalidArgument: "INVALID_ARGUMENT",
4
- sourceNotFound: "SOURCE_NOT_FOUND",
5
- snapshotNotFound: "SNAPSHOT_NOT_FOUND",
6
- snapshotDiffBaseNotFound: "SNAPSHOT_DIFF_BASE_NOT_FOUND",
7
- noPagesFetched: "NO_PAGES_FETCHED",
8
- noProjectScope: "NO_PROJECT_SCOPE",
9
- chunkNotFound: "CHUNK_NOT_FOUND",
10
- referenceFileNotFound: "REFERENCE_FILE_NOT_FOUND",
11
- invalidReferenceFile: "INVALID_REFERENCE_FILE",
12
- authEnvMissing: "AUTH_ENV_MISSING",
13
- canaryFailed: "CANARY_FAILED",
14
- backupConflict: "BACKUP_CONFLICT",
15
- backupInvalid: "BACKUP_INVALID",
16
- backupSourceMissing: "BACKUP_SOURCE_MISSING",
17
- embeddingConfigInvalid: "EMBEDDING_CONFIG_INVALID",
18
- embeddingProviderUnavailable: "EMBEDDING_PROVIDER_UNAVAILABLE",
19
- vectorStoreUnavailable: "VECTOR_STORE_UNAVAILABLE",
20
- embeddingJobNotFound: "EMBEDDING_JOB_NOT_FOUND",
21
- internalError: "INTERNAL_ERROR"
22
- };
23
- var AiocsError = class extends Error {
24
- code;
25
- details;
26
- constructor(code, message, details) {
27
- super(message);
28
- this.name = "AiocsError";
29
- this.code = code;
30
- this.details = details;
31
- }
32
- };
33
- function isAiocsError(error) {
34
- return error instanceof AiocsError;
35
- }
36
- function toAiocsError(error) {
37
- if (isAiocsError(error)) {
38
- return error;
39
- }
40
- if (error instanceof Error) {
41
- return new AiocsError(AIOCS_ERROR_CODES.internalError, error.message);
42
- }
43
- return new AiocsError(AIOCS_ERROR_CODES.internalError, String(error));
44
- }
45
-
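For orientation, a minimal sketch of how the error helpers above could be used, assuming AiocsError, AIOCS_ERROR_CODES, isAiocsError and toAiocsError are exported from the package entry point (the export map is not part of this diff):

  import { AiocsError, AIOCS_ERROR_CODES, isAiocsError, toAiocsError } from "@bodhi-ventures/aiocs";

  try {
    // Hypothetical failure: looking up a source id that was never registered.
    throw new AiocsError(AIOCS_ERROR_CODES.sourceNotFound, "Unknown source 'react-docs'");
  } catch (error) {
    const aiocsError = toAiocsError(error); // wraps non-AiocsError values as INTERNAL_ERROR
    if (isAiocsError(aiocsError)) {
      console.error(`${aiocsError.code}: ${aiocsError.message}`);
    }
  }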
46
- // src/runtime/paths.ts
47
- import { homedir } from "os";
48
- import { join as join2, relative, resolve, sep } from "path";
49
- import { mkdirSync } from "fs";
50
-
51
- // src/runtime/bundled-sources.ts
52
- import { existsSync } from "fs";
53
- import { dirname, join } from "path";
54
- import { fileURLToPath } from "url";
55
- function findPackageRoot(startDir) {
56
- let currentDir = startDir;
57
- while (true) {
58
- if (existsSync(join(currentDir, "package.json")) && existsSync(join(currentDir, "sources"))) {
59
- return currentDir;
60
- }
61
- const parentDir = dirname(currentDir);
62
- if (parentDir === currentDir) {
63
- throw new Error(`Could not locate aiocs package root from ${startDir}`);
64
- }
65
- currentDir = parentDir;
66
- }
67
- }
68
- function getBundledSourcesDir() {
69
- const currentFilePath = fileURLToPath(import.meta.url);
70
- const packageRoot = findPackageRoot(dirname(currentFilePath));
71
- return join(packageRoot, "sources");
72
- }
73
-
74
- // src/runtime/paths.ts
75
- var PORTABLE_USER_SOURCES_PREFIX = "~/.aiocs/sources";
76
- var PORTABLE_BUNDLED_SOURCES_PREFIX = "aiocs://bundled";
77
- var CONTAINER_USER_SOURCES_DIR = "/root/.aiocs/sources";
78
- var CONTAINER_BUNDLED_SOURCES_DIR = "/app/sources";
79
- function expandTilde(path) {
80
- if (path === "~") {
81
- return homedir();
82
- }
83
- if (path.startsWith("~/")) {
84
- return join2(homedir(), path.slice(2));
85
- }
86
- return path;
87
- }
88
- function getAiocsDataDir(env = process.env) {
89
- const override = env.AIOCS_DATA_DIR;
90
- if (override) {
91
- mkdirSync(expandTilde(override), { recursive: true });
92
- return expandTilde(override);
93
- }
94
- const target = join2(homedir(), ".aiocs", "data");
95
- mkdirSync(target, { recursive: true });
96
- return target;
97
- }
98
- function getAiocsConfigDir(env = process.env) {
99
- const override = env.AIOCS_CONFIG_DIR;
100
- if (override) {
101
- mkdirSync(expandTilde(override), { recursive: true });
102
- return expandTilde(override);
103
- }
104
- const target = join2(homedir(), ".aiocs", "config");
105
- mkdirSync(target, { recursive: true });
106
- return target;
107
- }
108
- function getAiocsSourcesDir(env = process.env) {
109
- const override = env.AIOCS_SOURCES_DIR;
110
- if (override) {
111
- mkdirSync(expandTilde(override), { recursive: true });
112
- return expandTilde(override);
113
- }
114
- const target = join2(homedir(), ".aiocs", "sources");
115
- mkdirSync(target, { recursive: true });
116
- return target;
117
- }
118
- function isWithinRoot(candidatePath, rootPath) {
119
- return candidatePath === rootPath || candidatePath.startsWith(`${rootPath}${sep}`);
120
- }
121
- function toPortablePath(prefix, rootPath, candidatePath) {
122
- const relativePath = relative(rootPath, candidatePath).split(sep).join("/");
123
- return relativePath ? `${prefix}/${relativePath}` : prefix;
124
- }
125
- function canonicalizeManagedSpecPath(specPath, env = process.env) {
126
- if (specPath === PORTABLE_USER_SOURCES_PREFIX || specPath.startsWith(`${PORTABLE_USER_SOURCES_PREFIX}/`) || specPath === PORTABLE_BUNDLED_SOURCES_PREFIX || specPath.startsWith(`${PORTABLE_BUNDLED_SOURCES_PREFIX}/`)) {
127
- return specPath;
128
- }
129
- const resolvedPath = resolve(specPath);
130
- const userRoots = [resolve(getAiocsSourcesDir(env)), CONTAINER_USER_SOURCES_DIR];
131
- for (const rootPath of userRoots) {
132
- if (isWithinRoot(resolvedPath, rootPath)) {
133
- return toPortablePath(PORTABLE_USER_SOURCES_PREFIX, rootPath, resolvedPath);
134
- }
135
- }
136
- const bundledRoots = [resolve(getBundledSourcesDir()), CONTAINER_BUNDLED_SOURCES_DIR];
137
- for (const rootPath of bundledRoots) {
138
- if (isWithinRoot(resolvedPath, rootPath)) {
139
- return toPortablePath(PORTABLE_BUNDLED_SOURCES_PREFIX, rootPath, resolvedPath);
140
- }
141
- }
142
- return resolvedPath;
143
- }
144
-
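A sketch of the path handling above: spec paths under the per-user sources directory are rewritten to the portable ~/.aiocs/sources prefix, and paths under the bundled sources directory to aiocs://bundled, so catalog rows stay valid across machines and containers. Assuming canonicalizeManagedSpecPath is reachable from the package and AIOCS_SOURCES_DIR is not overridden:

  import { homedir } from "os";
  import { join } from "path";
  // Assumed export; the package entry point is not shown in this diff.
  import { canonicalizeManagedSpecPath } from "@bodhi-ventures/aiocs";

  // Hypothetical spec file stored in the per-user sources directory.
  const absoluteSpecPath = join(homedir(), ".aiocs", "sources", "react-docs", "spec.yaml");

  // Expected portable form per canonicalizeManagedSpecPath above:
  // "~/.aiocs/sources/react-docs/spec.yaml"
  const portableSpecPath = canonicalizeManagedSpecPath(absoluteSpecPath);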
145
- // src/catalog/catalog.ts
146
- import { mkdirSync as mkdirSync2 } from "fs";
147
- import { join as join3, resolve as resolve3 } from "path";
148
- import { randomUUID } from "crypto";
149
- import Database from "better-sqlite3";
150
-
151
- // src/catalog/chunking.ts
152
- var MAX_CHUNK_BYTES = 16384;
153
- var HEADING_PATTERN = /^(#{1,6})\s+(.*)$/;
154
- function byteLength(value) {
155
- return Buffer.byteLength(value, "utf8");
156
- }
157
- function splitLargeSection(sectionTitle, markdown, startOrder) {
158
- const lines = markdown.split("\n");
159
- const chunks = [];
160
- let current = "";
161
- let order = startOrder;
162
- const flush = () => {
163
- const trimmed = current.trim();
164
- if (!trimmed) {
165
- current = "";
166
- return;
167
- }
168
- chunks.push({
169
- sectionTitle,
170
- markdown: trimmed,
171
- chunkOrder: order
172
- });
173
- order += 1;
174
- current = "";
175
- };
176
- for (const line of lines) {
177
- const next = current ? `${current}
178
- ${line}` : line;
179
- if (current && byteLength(next) > MAX_CHUNK_BYTES) {
180
- flush();
181
- }
182
- current = current ? `${current}
183
- ${line}` : line;
184
- }
185
- flush();
186
- return chunks;
187
- }
188
- function chunkMarkdown(pageTitle, markdown) {
189
- const trimmed = markdown.trim();
190
- if (!trimmed) {
191
- return [];
192
- }
193
- if (byteLength(trimmed) <= MAX_CHUNK_BYTES) {
194
- return [{ sectionTitle: pageTitle, markdown: trimmed, chunkOrder: 0 }];
195
- }
196
- const lines = trimmed.split("\n");
197
- const sections = [];
198
- let currentTitle = pageTitle;
199
- let currentLines = [];
200
- const flushSection = () => {
201
- const content = currentLines.join("\n").trim();
202
- if (!content) {
203
- currentLines = [];
204
- return;
205
- }
206
- sections.push({ title: currentTitle, markdown: content });
207
- currentLines = [];
208
- };
209
- for (const line of lines) {
210
- const match = line.trim().match(HEADING_PATTERN);
211
- if (match && match[1].length >= 2) {
212
- flushSection();
213
- currentTitle = match[2].trim() || pageTitle;
214
- }
215
- currentLines.push(line);
216
- }
217
- flushSection();
218
- const chunks = [];
219
- let order = 0;
220
- for (const section of sections) {
221
- if (byteLength(section.markdown) <= MAX_CHUNK_BYTES) {
222
- chunks.push({
223
- sectionTitle: section.title,
224
- markdown: section.markdown,
225
- chunkOrder: order
226
- });
227
- order += 1;
228
- continue;
229
- }
230
- const split = splitLargeSection(section.title, section.markdown, order);
231
- chunks.push(...split);
232
- order = chunks.length;
233
- }
234
- return chunks;
235
- }
236
-
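The chunker above caps chunks at MAX_CHUNK_BYTES (16 KiB), splitting on level-2+ headings first and falling back to line-based splits for oversized sections. A usage sketch; chunkMarkdown is internal to this bundle, so the direct call is illustrative only:

  const pageMarkdown = "## Install\nnpm install something\n\n## Usage\nSee the guide.";

  // chunkMarkdown(pageTitle, markdown) -> { sectionTitle, markdown, chunkOrder }[]
  const chunks = chunkMarkdown("Getting Started", pageMarkdown);
  for (const chunk of chunks) {
    // Chunks normally stay under 16384 bytes; a single line longer than the cap
    // can still exceed it, since splitLargeSection never breaks inside a line.
    console.log(chunk.chunkOrder, chunk.sectionTitle, Buffer.byteLength(chunk.markdown, "utf8"));
  }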
237
- // src/catalog/fingerprint.ts
238
- import { createHash } from "crypto";
239
- function sha256(value) {
240
- return createHash("sha256").update(value).digest("hex");
241
- }
242
- function buildSnapshotFingerprint(input) {
243
- const normalizedPages = [...input.pages].sort((left, right) => left.url.localeCompare(right.url));
244
- const payload = JSON.stringify({
245
- sourceId: input.sourceId,
246
- configHash: input.configHash,
247
- pages: normalizedPages
248
- });
249
- return sha256(payload);
250
- }
251
-
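buildSnapshotFingerprint sorts pages by URL before hashing, so the result depends only on the source id, the config hash, and each page's URL and content hash, not on crawl order. A sketch using the same helpers:

  const pages = [
    { url: "https://react.dev/learn", contentHash: sha256("page one") },
    { url: "https://react.dev/reference", contentHash: sha256("page two") },
  ];

  const fingerprintA = buildSnapshotFingerprint({ sourceId: "react-docs", configHash: sha256("config"), pages });
  const fingerprintB = buildSnapshotFingerprint({ sourceId: "react-docs", configHash: sha256("config"), pages: [...pages].reverse() });
  // fingerprintA === fingerprintB, because pages are sorted by url inside the helper.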
252
- // src/catalog/project-scope.ts
253
- import { realpathSync } from "fs";
254
- import { resolve as resolve2 } from "path";
255
- function isWithin(candidate, root) {
256
- return candidate === root || candidate.startsWith(`${root}/`);
257
- }
258
- function canonicalizeProjectPath(path) {
259
- const resolved = resolve2(path);
260
- try {
261
- return realpathSync.native(resolved);
262
- } catch {
263
- return resolved;
264
- }
265
- }
266
- function resolveProjectScope(cwd, scopes) {
267
- const normalizedCwd = canonicalizeProjectPath(cwd);
268
- const normalizedScopes = scopes.map((scope) => ({
269
- projectPath: canonicalizeProjectPath(scope.projectPath),
270
- sourceIds: [...scope.sourceIds]
271
- })).filter((scope) => isWithin(normalizedCwd, scope.projectPath)).sort((left, right) => right.projectPath.length - left.projectPath.length);
272
- return normalizedScopes[0] ?? null;
273
- }
274
-
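resolveProjectScope keeps only the linked project paths that contain the current working directory and returns the deepest one (longest projectPath wins). A sketch with hypothetical paths:

  const scope = resolveProjectScope("/home/dev/app/packages/web", [
    { projectPath: "/home/dev/app", sourceIds: ["react-docs"] },
    { projectPath: "/home/dev/app/packages/web", sourceIds: ["vite-docs"] },
  ]);
  // scope?.sourceIds -> ["vite-docs"]: the more specific project link shadows the broader one.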
275
- // src/spec/source-spec.ts
276
- import { readFile } from "fs/promises";
277
- import { extname } from "path";
278
- import YAML from "yaml";
279
- import { z } from "zod";
280
- var patternSchema = z.string().min(1);
281
- var interactionSchema = z.discriminatedUnion("action", [
282
- z.object({
283
- action: z.literal("hover"),
284
- selector: z.string().min(1),
285
- timeoutMs: z.number().int().positive().optional()
286
- }),
287
- z.object({
288
- action: z.literal("click"),
289
- selector: z.string().min(1),
290
- timeoutMs: z.number().int().positive().optional()
291
- }),
292
- z.object({
293
- action: z.literal("press"),
294
- key: z.string().min(1)
295
- }),
296
- z.object({
297
- action: z.literal("wait"),
298
- timeoutMs: z.number().int().positive()
299
- })
300
- ]);
301
- var clipboardExtractSchema = z.object({
302
- strategy: z.literal("clipboardButton"),
303
- interactions: z.array(interactionSchema).min(1),
304
- clipboardTimeoutMs: z.number().int().positive().default(1e4)
305
- });
306
- var selectorExtractSchema = z.object({
307
- strategy: z.literal("selector"),
308
- selector: z.string().min(1)
309
- });
310
- var readabilityExtractSchema = z.object({
311
- strategy: z.literal("readability")
312
- });
313
- var authHeaderSchema = z.object({
314
- name: z.string().min(1),
315
- valueFromEnv: z.string().min(1),
316
- hosts: z.array(z.string().min(1)).min(1).optional(),
317
- include: z.array(patternSchema).min(1).optional()
318
- });
319
- var authCookieSchema = z.object({
320
- name: z.string().min(1),
321
- valueFromEnv: z.string().min(1),
322
- domain: z.string().min(1),
323
- path: z.string().min(1).default("/"),
324
- secure: z.boolean().optional(),
325
- httpOnly: z.boolean().optional(),
326
- sameSite: z.enum(["Strict", "Lax", "None"]).optional()
327
- });
328
- var canaryCheckSchema = z.object({
329
- url: z.string().url(),
330
- expectedTitle: z.string().min(1).optional(),
331
- expectedText: z.string().min(1).optional(),
332
- minMarkdownLength: z.number().int().positive().default(40)
333
- });
334
- var sourceSpecSchema = z.object({
335
- id: z.string().min(1).regex(/^[a-z0-9-]+$/),
336
- label: z.string().min(1),
337
- startUrls: z.array(z.string().url()).min(1),
338
- allowedHosts: z.array(z.string().min(1)).min(1),
339
- discovery: z.object({
340
- include: z.array(patternSchema).min(1),
341
- exclude: z.array(patternSchema),
342
- maxPages: z.number().int().positive()
343
- }),
344
- extract: z.discriminatedUnion("strategy", [
345
- clipboardExtractSchema,
346
- selectorExtractSchema,
347
- readabilityExtractSchema
348
- ]),
349
- normalize: z.object({
350
- prependSourceComment: z.boolean().default(true)
351
- }),
352
- schedule: z.object({
353
- everyHours: z.number().int().positive()
354
- }),
355
- auth: z.object({
356
- headers: z.array(authHeaderSchema).default([]),
357
- cookies: z.array(authCookieSchema).default([])
358
- }).optional(),
359
- canary: z.object({
360
- everyHours: z.number().int().positive().optional(),
361
- checks: z.array(canaryCheckSchema).min(1)
362
- }).optional()
363
- }).superRefine((spec, context) => {
364
- for (const [index, header] of (spec.auth?.headers ?? []).entries()) {
365
- if (!header.hosts) {
366
- continue;
367
- }
368
- for (const host of header.hosts) {
369
- if (!spec.allowedHosts.includes(host)) {
370
- context.addIssue({
371
- code: z.ZodIssueCode.custom,
372
- path: ["auth", "headers", index, "hosts"],
373
- message: `Authenticated header host '${host}' must be included in allowedHosts`
374
- });
375
- }
376
- }
377
- }
378
- });
379
- function parseSourceSpec(raw, ext) {
380
- if (ext === ".json") {
381
- return JSON.parse(raw);
382
- }
383
- return YAML.parse(raw);
384
- }
385
- async function loadSourceSpec(path) {
386
- const raw = await readFile(path, "utf8");
387
- const parsed = parseSourceSpec(raw, extname(path).toLowerCase());
388
- return sourceSpecSchema.parse(parsed);
389
- }
390
- function resolveSourceCanary(spec) {
391
- return {
392
- everyHours: spec.canary?.everyHours ?? Math.max(1, Math.min(spec.schedule.everyHours, 6)),
393
- checks: spec.canary?.checks ?? [
394
- {
395
- url: spec.startUrls[0],
396
- minMarkdownLength: 40
397
- }
398
- ]
399
- };
400
- }
401
-
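A minimal object that satisfies sourceSpecSchema above; the field names come from the schema, while the concrete values are illustrative only. In practice loadSourceSpec reads the same shape from a YAML or JSON file:

  const spec = sourceSpecSchema.parse({
    id: "react-docs", // must match /^[a-z0-9-]+$/
    label: "React documentation",
    startUrls: ["https://react.dev/learn"],
    allowedHosts: ["react.dev"],
    discovery: {
      include: ["https://react.dev/learn/**"],
      exclude: [],
      maxPages: 200,
    },
    extract: { strategy: "readability" },
    normalize: { prependSourceComment: true },
    schedule: { everyHours: 24 },
    // auth and canary are optional; when canary is omitted, resolveSourceCanary
    // falls back to a single check against the first start URL.
  });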
402
- // src/catalog/catalog.ts
403
- function initSchema(db) {
404
- db.exec(`
405
- PRAGMA foreign_keys = ON;
406
-
407
- CREATE TABLE IF NOT EXISTS sources (
408
- id TEXT PRIMARY KEY,
409
- label TEXT NOT NULL,
410
- spec_json TEXT NOT NULL,
411
- spec_path TEXT,
412
- config_hash TEXT NOT NULL,
413
- created_at TEXT NOT NULL,
414
- updated_at TEXT NOT NULL,
415
- last_checked_at TEXT,
416
- last_successful_snapshot_at TEXT,
417
- last_successful_snapshot_id TEXT,
418
- last_canary_checked_at TEXT,
419
- last_successful_canary_at TEXT,
420
- last_canary_status TEXT,
421
- next_canary_due_at TEXT,
422
- next_due_at TEXT NOT NULL
423
- );
424
-
425
- CREATE TABLE IF NOT EXISTS snapshots (
426
- id TEXT PRIMARY KEY,
427
- source_id TEXT NOT NULL REFERENCES sources(id) ON DELETE CASCADE,
428
- fingerprint TEXT NOT NULL,
429
- config_hash TEXT NOT NULL,
430
- detected_version TEXT,
431
- page_count INTEGER NOT NULL,
432
- created_at TEXT NOT NULL,
433
- UNIQUE(source_id, fingerprint)
434
- );
435
-
436
- CREATE TABLE IF NOT EXISTS pages (
437
- id INTEGER PRIMARY KEY AUTOINCREMENT,
438
- snapshot_id TEXT NOT NULL REFERENCES snapshots(id) ON DELETE CASCADE,
439
- url TEXT NOT NULL,
440
- title TEXT NOT NULL,
441
- markdown TEXT NOT NULL,
442
- content_hash TEXT NOT NULL,
443
- UNIQUE(snapshot_id, url)
444
- );
445
-
446
- CREATE TABLE IF NOT EXISTS chunks (
447
- id INTEGER PRIMARY KEY AUTOINCREMENT,
448
- source_id TEXT NOT NULL REFERENCES sources(id) ON DELETE CASCADE,
449
- snapshot_id TEXT NOT NULL REFERENCES snapshots(id) ON DELETE CASCADE,
450
- page_id INTEGER NOT NULL REFERENCES pages(id) ON DELETE CASCADE,
451
- page_url TEXT NOT NULL,
452
- page_title TEXT NOT NULL,
453
- section_title TEXT NOT NULL,
454
- chunk_order INTEGER NOT NULL,
455
- markdown TEXT NOT NULL
456
- );
457
-
458
- CREATE VIRTUAL TABLE IF NOT EXISTS chunks_fts USING fts5(
459
- page_title,
460
- section_title,
461
- markdown,
462
- content=chunks,
463
- content_rowid=id,
464
- tokenize='porter unicode61'
465
- );
466
-
467
- CREATE TRIGGER IF NOT EXISTS chunks_ai AFTER INSERT ON chunks BEGIN
468
- INSERT INTO chunks_fts(rowid, page_title, section_title, markdown)
469
- VALUES (new.id, new.page_title, new.section_title, new.markdown);
470
- END;
471
-
472
- CREATE TRIGGER IF NOT EXISTS chunks_ad AFTER DELETE ON chunks BEGIN
473
- INSERT INTO chunks_fts(chunks_fts, rowid, page_title, section_title, markdown)
474
- VALUES ('delete', old.id, old.page_title, old.section_title, old.markdown);
475
- END;
476
-
477
- CREATE TRIGGER IF NOT EXISTS chunks_au AFTER UPDATE ON chunks BEGIN
478
- INSERT INTO chunks_fts(chunks_fts, rowid, page_title, section_title, markdown)
479
- VALUES ('delete', old.id, old.page_title, old.section_title, old.markdown);
480
- INSERT INTO chunks_fts(rowid, page_title, section_title, markdown)
481
- VALUES (new.id, new.page_title, new.section_title, new.markdown);
482
- END;
483
-
484
- CREATE TABLE IF NOT EXISTS fetch_runs (
485
- id TEXT PRIMARY KEY,
486
- source_id TEXT NOT NULL REFERENCES sources(id) ON DELETE CASCADE,
487
- status TEXT NOT NULL CHECK(status IN ('success', 'failed')),
488
- error_message TEXT,
489
- snapshot_id TEXT REFERENCES snapshots(id) ON DELETE SET NULL,
490
- started_at TEXT NOT NULL,
491
- finished_at TEXT NOT NULL
492
- );
493
-
494
- CREATE TABLE IF NOT EXISTS canary_runs (
495
- id TEXT PRIMARY KEY,
496
- source_id TEXT NOT NULL REFERENCES sources(id) ON DELETE CASCADE,
497
- status TEXT NOT NULL CHECK(status IN ('pass', 'fail')),
498
- checked_at TEXT NOT NULL,
499
- details_json TEXT NOT NULL
500
- );
501
-
502
- CREATE TABLE IF NOT EXISTS project_links (
503
- project_path TEXT NOT NULL,
504
- source_id TEXT NOT NULL REFERENCES sources(id) ON DELETE CASCADE,
505
- created_at TEXT NOT NULL,
506
- PRIMARY KEY(project_path, source_id)
507
- );
508
-
509
- CREATE TABLE IF NOT EXISTS daemon_state (
510
- singleton_id INTEGER PRIMARY KEY CHECK(singleton_id = 1),
511
- last_started_at TEXT,
512
- last_cycle_started_at TEXT,
513
- last_cycle_completed_at TEXT,
514
- last_cycle_status TEXT,
515
- interval_minutes INTEGER,
516
- fetch_on_start INTEGER
517
- );
518
-
519
- CREATE TABLE IF NOT EXISTS embedding_state (
520
- chunk_id INTEGER PRIMARY KEY REFERENCES chunks(id) ON DELETE CASCADE,
521
- source_id TEXT NOT NULL REFERENCES sources(id) ON DELETE CASCADE,
522
- snapshot_id TEXT NOT NULL REFERENCES snapshots(id) ON DELETE CASCADE,
523
- content_hash TEXT NOT NULL,
524
- model_key TEXT,
525
- status TEXT NOT NULL CHECK(status IN ('pending', 'indexed', 'failed', 'stale')),
526
- vector_point_id TEXT,
527
- last_attempted_at TEXT,
528
- indexed_at TEXT,
529
- error_message TEXT
530
- );
531
-
532
- CREATE TABLE IF NOT EXISTS embedding_jobs (
533
- source_id TEXT NOT NULL REFERENCES sources(id) ON DELETE CASCADE,
534
- snapshot_id TEXT NOT NULL REFERENCES snapshots(id) ON DELETE CASCADE,
535
- status TEXT NOT NULL CHECK(status IN ('pending', 'running', 'succeeded', 'failed')),
536
- attempt_count INTEGER NOT NULL DEFAULT 0,
537
- chunk_count INTEGER NOT NULL,
538
- created_at TEXT NOT NULL,
539
- updated_at TEXT NOT NULL,
540
- claimed_at TEXT,
541
- completed_at TEXT,
542
- error_message TEXT,
543
- PRIMARY KEY(source_id, snapshot_id)
544
- );
545
-
546
- CREATE INDEX IF NOT EXISTS idx_embedding_jobs_status_updated
547
- ON embedding_jobs(status, updated_at, source_id, snapshot_id);
548
-
549
- CREATE INDEX IF NOT EXISTS idx_embedding_state_source_snapshot
550
- ON embedding_state(source_id, snapshot_id, status);
551
- `);
552
- const sourceColumns = db.prepare("PRAGMA table_info(sources)").all();
553
- if (!sourceColumns.some((column) => column.name === "spec_path")) {
554
- db.exec("ALTER TABLE sources ADD COLUMN spec_path TEXT");
555
- }
556
- if (!sourceColumns.some((column) => column.name === "last_successful_snapshot_at")) {
557
- db.exec("ALTER TABLE sources ADD COLUMN last_successful_snapshot_at TEXT");
558
- }
559
- if (!sourceColumns.some((column) => column.name === "last_canary_checked_at")) {
560
- db.exec("ALTER TABLE sources ADD COLUMN last_canary_checked_at TEXT");
561
- }
562
- if (!sourceColumns.some((column) => column.name === "last_successful_canary_at")) {
563
- db.exec("ALTER TABLE sources ADD COLUMN last_successful_canary_at TEXT");
564
- }
565
- if (!sourceColumns.some((column) => column.name === "last_canary_status")) {
566
- db.exec("ALTER TABLE sources ADD COLUMN last_canary_status TEXT");
567
- }
568
- if (!sourceColumns.some((column) => column.name === "next_canary_due_at")) {
569
- db.exec("ALTER TABLE sources ADD COLUMN next_canary_due_at TEXT");
570
- }
571
- }
572
- function nowIso() {
573
- return (/* @__PURE__ */ new Date()).toISOString();
574
- }
575
- function addHoursIso(hours) {
576
- return new Date(Date.now() + hours * 60 * 60 * 1e3).toISOString();
577
- }
578
- function stableStringify(value) {
579
- if (Array.isArray(value)) {
580
- return `[${value.map((entry) => stableStringify(entry)).join(",")}]`;
581
- }
582
- if (value && typeof value === "object") {
583
- const entries = Object.entries(value).sort(
584
- ([left], [right]) => left.localeCompare(right)
585
- );
586
- return `{${entries.map(([key, entry]) => `${JSON.stringify(key)}:${stableStringify(entry)}`).join(",")}}`;
587
- }
588
- return JSON.stringify(value);
589
- }
590
- function normalizeQuery(query) {
591
- const words = query.replace(/[^\p{L}\p{N}]+/gu, " ").split(/\s+/).map((part) => part.trim()).filter(Boolean);
592
- return words.join(" ");
593
- }
594
- function assertPaginationValue(value, field, fallback) {
595
- if (typeof value === "undefined") {
596
- return fallback;
597
- }
598
- if (!Number.isInteger(value) || value < 0) {
599
- throw new AiocsError(
600
- AIOCS_ERROR_CODES.invalidArgument,
601
- `${field} must be a non-negative integer`
602
- );
603
- }
604
- if (field === "limit" && value === 0) {
605
- throw new AiocsError(
606
- AIOCS_ERROR_CODES.invalidArgument,
607
- "limit must be greater than zero"
608
- );
609
- }
610
- return value;
611
- }
612
- function openCatalog(options) {
613
- const dataDir = resolve3(options.dataDir);
614
- mkdirSync2(dataDir, { recursive: true });
615
- const db = new Database(join3(dataDir, "catalog.sqlite"));
616
- initSchema(db);
617
- const listProjectLinks = () => {
618
- const rows = db.prepare("SELECT project_path, source_id FROM project_links ORDER BY project_path, source_id").all();
619
- const grouped = /* @__PURE__ */ new Map();
620
- for (const row of rows) {
621
- const current = grouped.get(row.project_path) ?? [];
622
- current.push(row.source_id);
623
- grouped.set(row.project_path, current);
624
- }
625
- return [...grouped.entries()].map(([projectPath, sourceIds]) => ({ projectPath, sourceIds }));
626
- };
627
- const resolveSearchScope = (input) => {
628
- const limit = assertPaginationValue(input.limit, "limit", 20);
629
- const offset = assertPaginationValue(input.offset, "offset", 0);
630
- let sourceIds = input.sourceIds ? [...input.sourceIds] : void 0;
631
- if (!sourceIds || sourceIds.length === 0) {
632
- if (input.cwd) {
633
- const scope = resolveProjectScope(
634
- input.cwd,
635
- listProjectLinks().map((link) => ({
636
- projectPath: link.projectPath,
637
- sourceIds: link.sourceIds
638
- }))
639
- );
640
- if (scope) {
641
- sourceIds = scope.sourceIds;
642
- }
643
- }
644
- }
645
- if ((!sourceIds || sourceIds.length === 0) && !input.all) {
646
- return {
647
- limit,
648
- offset,
649
- sourceIds: null,
650
- snapshotIds: []
651
- };
652
- }
653
- const filterSourceIds = sourceIds && sourceIds.length > 0 ? [...new Set(sourceIds)] : null;
654
- const latestSnapshotIds = input.snapshotId ? [input.snapshotId] : db.prepare(`
655
- SELECT last_successful_snapshot_id AS snapshot_id
656
- FROM sources
657
- WHERE last_successful_snapshot_id IS NOT NULL
658
- ${filterSourceIds ? `AND id IN (${filterSourceIds.map(() => "?").join(",")})` : ""}
659
- `).all(...filterSourceIds ?? []).map((row) => row.snapshot_id);
660
- return {
661
- limit,
662
- offset,
663
- sourceIds: filterSourceIds,
664
- snapshotIds: latestSnapshotIds
665
- };
666
- };
667
- const searchLexicalByScope = (input) => {
668
- const normalized = normalizeQuery(input.query);
669
- const limit = assertPaginationValue(input.limit, "limit", input.scope.limit);
670
- const offset = assertPaginationValue(input.offset, "offset", input.scope.offset);
671
- if (!normalized || input.scope.snapshotIds.length === 0) {
672
- return {
673
- total: 0,
674
- limit,
675
- offset,
676
- hasMore: false,
677
- results: []
678
- };
679
- }
680
- const whereSnapshotPlaceholders = input.scope.snapshotIds.map(() => "?").join(",");
681
- const sourceSql = input.scope.sourceIds ? `AND c.source_id IN (${input.scope.sourceIds.map(() => "?").join(",")})` : "";
682
- const queryArgs = [
683
- normalized,
684
- ...input.scope.snapshotIds,
685
- ...input.scope.sourceIds ?? []
686
- ];
687
- const totalRow = db.prepare(`
688
- SELECT COUNT(*) AS total
689
- FROM chunks_fts
690
- JOIN chunks c ON c.id = chunks_fts.rowid
691
- WHERE chunks_fts MATCH ?
692
- AND c.snapshot_id IN (${whereSnapshotPlaceholders})
693
- ${sourceSql}
694
- `).get(...queryArgs);
695
- const rows = db.prepare(`
696
- SELECT
697
- c.id AS chunk_id,
698
- c.source_id,
699
- c.snapshot_id,
700
- c.page_url,
701
- c.page_title,
702
- c.section_title,
703
- c.markdown
704
- FROM chunks_fts
705
- JOIN chunks c ON c.id = chunks_fts.rowid
706
- WHERE chunks_fts MATCH ?
707
- AND c.snapshot_id IN (${whereSnapshotPlaceholders})
708
- ${sourceSql}
709
- ORDER BY bm25(chunks_fts), c.id
710
- LIMIT ?
711
- OFFSET ?
712
- `).all(...queryArgs, limit, offset);
713
- const results = rows.map((row) => ({
714
- chunkId: row.chunk_id,
715
- sourceId: row.source_id,
716
- snapshotId: row.snapshot_id,
717
- pageUrl: row.page_url,
718
- pageTitle: row.page_title,
719
- sectionTitle: row.section_title,
720
- markdown: row.markdown
721
- }));
722
- return {
723
- total: totalRow.total,
724
- limit,
725
- offset,
726
- hasMore: offset + results.length < totalRow.total,
727
- results
728
- };
729
- };
730
- const listLatestSnapshots = (sourceIds) => {
731
- const filterSourceIds = sourceIds && sourceIds.length > 0 ? [...new Set(sourceIds)] : null;
732
- const rows = db.prepare(`
733
- SELECT id AS source_id, last_successful_snapshot_id AS snapshot_id
734
- FROM sources
735
- WHERE last_successful_snapshot_id IS NOT NULL
736
- ${filterSourceIds ? `AND id IN (${filterSourceIds.map(() => "?").join(",")})` : ""}
737
- ORDER BY id
738
- `).all(...filterSourceIds ?? []);
739
- return rows.map((row) => ({
740
- sourceId: row.source_id,
741
- snapshotId: row.snapshot_id
742
- }));
743
- };
744
- const queueEmbeddingJobForSnapshot = (sourceId, snapshotId, previousLatestSnapshotId) => {
745
- const timestamp = nowIso();
746
- if (previousLatestSnapshotId && previousLatestSnapshotId !== snapshotId) {
747
- db.prepare(`
748
- UPDATE embedding_state
749
- SET
750
- status = 'stale',
751
- vector_point_id = NULL,
752
- indexed_at = NULL,
753
- error_message = NULL
754
- WHERE source_id = ?
755
- AND snapshot_id = ?
756
- `).run(sourceId, previousLatestSnapshotId);
757
- db.prepare(`
758
- DELETE FROM embedding_jobs
759
- WHERE source_id = ?
760
- AND snapshot_id = ?
761
- `).run(sourceId, previousLatestSnapshotId);
762
- }
763
- const chunkRows = db.prepare(`
764
- SELECT id, markdown
765
- FROM chunks
766
- WHERE source_id = ?
767
- AND snapshot_id = ?
768
- ORDER BY id
769
- `).all(sourceId, snapshotId);
770
- const upsertState = db.prepare(`
771
- INSERT INTO embedding_state (
772
- chunk_id,
773
- source_id,
774
- snapshot_id,
775
- content_hash,
776
- model_key,
777
- status,
778
- vector_point_id,
779
- last_attempted_at,
780
- indexed_at,
781
- error_message
782
- ) VALUES (?, ?, ?, ?, NULL, 'pending', NULL, NULL, NULL, NULL)
783
- ON CONFLICT(chunk_id) DO UPDATE SET
784
- source_id = excluded.source_id,
785
- snapshot_id = excluded.snapshot_id,
786
- content_hash = excluded.content_hash,
787
- model_key = CASE
788
- WHEN embedding_state.status = 'indexed' AND embedding_state.content_hash = excluded.content_hash
789
- THEN embedding_state.model_key
790
- ELSE NULL
791
- END,
792
- status = CASE
793
- WHEN embedding_state.status = 'indexed' AND embedding_state.content_hash = excluded.content_hash
794
- THEN 'indexed'
795
- ELSE 'pending'
796
- END,
797
- vector_point_id = CASE
798
- WHEN embedding_state.status = 'indexed' AND embedding_state.content_hash = excluded.content_hash
799
- THEN embedding_state.vector_point_id
800
- ELSE NULL
801
- END,
802
- last_attempted_at = CASE
803
- WHEN embedding_state.status = 'indexed' AND embedding_state.content_hash = excluded.content_hash
804
- THEN embedding_state.last_attempted_at
805
- ELSE NULL
806
- END,
807
- indexed_at = CASE
808
- WHEN embedding_state.status = 'indexed' AND embedding_state.content_hash = excluded.content_hash
809
- THEN embedding_state.indexed_at
810
- ELSE NULL
811
- END,
812
- error_message = CASE
813
- WHEN embedding_state.status = 'indexed' AND embedding_state.content_hash = excluded.content_hash
814
- THEN embedding_state.error_message
815
- ELSE NULL
816
- END
817
- `);
818
- const transaction = db.transaction(() => {
819
- for (const chunk of chunkRows) {
820
- upsertState.run(
821
- chunk.id,
822
- sourceId,
823
- snapshotId,
824
- sha256(chunk.markdown)
825
- );
826
- }
827
- });
828
- transaction();
829
- const pendingRow = db.prepare(`
830
- SELECT COUNT(*) AS pending_count
831
- FROM embedding_state
832
- WHERE source_id = ?
833
- AND snapshot_id = ?
834
- AND status != 'indexed'
835
- `).get(sourceId, snapshotId);
836
- if (pendingRow.pending_count === 0) {
837
- db.prepare(`
838
- INSERT INTO embedding_jobs (
839
- source_id,
840
- snapshot_id,
841
- status,
842
- attempt_count,
843
- chunk_count,
844
- created_at,
845
- updated_at,
846
- claimed_at,
847
- completed_at,
848
- error_message
849
- ) VALUES (?, ?, 'succeeded', 0, ?, ?, ?, NULL, ?, NULL)
850
- ON CONFLICT(source_id, snapshot_id) DO UPDATE SET
851
- status = 'succeeded',
852
- chunk_count = excluded.chunk_count,
853
- updated_at = excluded.updated_at,
854
- claimed_at = NULL,
855
- completed_at = excluded.completed_at,
856
- error_message = NULL
857
- `).run(sourceId, snapshotId, chunkRows.length, timestamp, timestamp, timestamp);
858
- return;
859
- }
860
- db.prepare(`
861
- INSERT INTO embedding_jobs (
862
- source_id,
863
- snapshot_id,
864
- status,
865
- attempt_count,
866
- chunk_count,
867
- created_at,
868
- updated_at,
869
- claimed_at,
870
- completed_at,
871
- error_message
872
- ) VALUES (?, ?, 'pending', 0, ?, ?, ?, NULL, NULL, NULL)
873
- ON CONFLICT(source_id, snapshot_id) DO UPDATE SET
874
- status = 'pending',
875
- chunk_count = excluded.chunk_count,
876
- updated_at = excluded.updated_at,
877
- claimed_at = NULL,
878
- completed_at = NULL,
879
- error_message = NULL
880
- `).run(sourceId, snapshotId, chunkRows.length, timestamp, timestamp);
881
- };
882
- return {
883
- close() {
884
- db.close();
885
- },
886
- upsertSource(spec, options2) {
887
- const timestamp = nowIso();
888
- const configHash = sha256(stableStringify(spec));
889
- const existing = db.prepare("SELECT id, created_at, next_due_at, next_canary_due_at, config_hash FROM sources WHERE id = ?").get(spec.id);
890
- const resolvedSpecPath = options2?.specPath ? canonicalizeManagedSpecPath(options2.specPath) : null;
891
- const nextDueAt = !existing ? timestamp : existing.config_hash === configHash ? existing.next_due_at : timestamp;
892
- const canaryConfig = resolveSourceCanary(spec);
893
- const nextCanaryDueAt = !existing ? timestamp : existing.config_hash === configHash ? existing.next_canary_due_at ?? addHoursIso(canaryConfig.everyHours) : timestamp;
894
- const configChanged = Boolean(existing && existing.config_hash !== configHash);
895
- db.prepare(`
896
- INSERT INTO sources (
897
- id, label, spec_json, spec_path, config_hash, created_at, updated_at, next_due_at, next_canary_due_at
898
- ) VALUES (
899
- @id, @label, @specJson, @specPath, @configHash, @createdAt, @updatedAt, @nextDueAt, @nextCanaryDueAt
900
- )
901
- ON CONFLICT(id) DO UPDATE SET
902
- label = excluded.label,
903
- spec_json = excluded.spec_json,
904
- spec_path = excluded.spec_path,
905
- config_hash = excluded.config_hash,
906
- updated_at = excluded.updated_at,
907
- next_due_at = excluded.next_due_at,
908
- next_canary_due_at = excluded.next_canary_due_at
909
- `).run({
910
- id: spec.id,
911
- label: spec.label,
912
- specJson: JSON.stringify(spec),
913
- specPath: resolvedSpecPath,
914
- configHash,
915
- createdAt: existing?.created_at ?? timestamp,
916
- updatedAt: timestamp,
917
- nextDueAt,
918
- nextCanaryDueAt
919
- });
920
- return {
921
- sourceId: spec.id,
922
- configHash,
923
- configChanged
924
- };
925
- },
926
- getSourceSpec(sourceId) {
927
- const row = db.prepare("SELECT spec_json FROM sources WHERE id = ?").get(sourceId);
928
- if (!row) {
929
- return null;
930
- }
931
- return JSON.parse(row.spec_json);
932
- },
933
- listSources() {
934
- const rows = db.prepare(`
935
- SELECT
936
- id,
937
- label,
938
- spec_path,
939
- next_due_at,
940
- next_canary_due_at,
941
- last_checked_at,
942
- last_successful_snapshot_at,
943
- last_successful_snapshot_id,
944
- last_canary_checked_at,
945
- last_successful_canary_at,
946
- last_canary_status
947
- FROM sources
948
- ORDER BY id
949
- `).all();
950
- return rows.map((row) => ({
951
- id: row.id,
952
- label: row.label,
953
- specPath: row.spec_path ? canonicalizeManagedSpecPath(row.spec_path) : null,
954
- nextDueAt: row.next_due_at,
955
- isDue: Date.parse(row.next_due_at) <= Date.now(),
956
- nextCanaryDueAt: row.next_canary_due_at,
957
- isCanaryDue: row.next_canary_due_at ? Date.parse(row.next_canary_due_at) <= Date.now() : false,
958
- lastCheckedAt: row.last_checked_at,
959
- lastSuccessfulSnapshotAt: row.last_successful_snapshot_at,
960
- lastSuccessfulSnapshotId: row.last_successful_snapshot_id,
961
- lastCanaryCheckedAt: row.last_canary_checked_at,
962
- lastSuccessfulCanaryAt: row.last_successful_canary_at,
963
- lastCanaryStatus: row.last_canary_status
964
- }));
965
- },
966
- listDueSourceIds(referenceTime = nowIso()) {
967
- const rows = db.prepare(`
968
- SELECT id
969
- FROM sources
970
- WHERE next_due_at <= ?
971
- ORDER BY next_due_at, id
972
- `).all(referenceTime);
973
- return rows.map((row) => row.id);
974
- },
975
- listCanaryDueSourceIds(referenceTime = nowIso()) {
976
- const rows = db.prepare(`
977
- SELECT id
978
- FROM sources
979
- WHERE next_canary_due_at IS NOT NULL
980
- AND next_canary_due_at <= ?
981
- ORDER BY next_canary_due_at, id
982
- `).all(referenceTime);
983
- return rows.map((row) => row.id);
984
- },
985
- linkProject(projectPath, sourceIds) {
986
- const normalizedPath = canonicalizeProjectPath(projectPath);
987
- const timestamp = nowIso();
988
- const insert = db.prepare(`
989
- INSERT INTO project_links (project_path, source_id, created_at)
990
- VALUES (?, ?, ?)
991
- ON CONFLICT(project_path, source_id) DO NOTHING
992
- `);
993
- const transaction = db.transaction((ids) => {
994
- for (const sourceId of ids) {
995
- insert.run(normalizedPath, sourceId, timestamp);
996
- }
997
- });
998
- transaction(sourceIds);
999
- },
1000
- unlinkProject(projectPath, sourceIds) {
1001
- const normalizedPath = canonicalizeProjectPath(projectPath);
1002
- if (!sourceIds || sourceIds.length === 0) {
1003
- db.prepare("DELETE FROM project_links WHERE project_path = ?").run(normalizedPath);
1004
- return;
1005
- }
1006
- const statement = db.prepare("DELETE FROM project_links WHERE project_path = ? AND source_id = ?");
1007
- const transaction = db.transaction((ids) => {
1008
- for (const sourceId of ids) {
1009
- statement.run(normalizedPath, sourceId);
1010
- }
1011
- });
1012
- transaction(sourceIds);
1013
- },
1014
- recordSuccessfulSnapshot(input) {
1015
- const sourceRow = db.prepare("SELECT config_hash, spec_json, last_successful_snapshot_id FROM sources WHERE id = ?").get(input.sourceId);
1016
- if (!sourceRow) {
1017
- throw new AiocsError(
1018
- AIOCS_ERROR_CODES.sourceNotFound,
1019
- `Unknown source '${input.sourceId}'`
1020
- );
1021
- }
1022
- const pagesWithHashes = input.pages.map((page) => ({
1023
- ...page,
1024
- markdown: page.markdown.trim(),
1025
- contentHash: sha256(page.markdown.trim())
1026
- }));
1027
- const fingerprint = buildSnapshotFingerprint({
1028
- sourceId: input.sourceId,
1029
- configHash: sourceRow.config_hash,
1030
- pages: pagesWithHashes.map((page) => ({
1031
- url: page.url,
1032
- contentHash: page.contentHash
1033
- }))
1034
- });
1035
- const existing = db.prepare("SELECT id FROM snapshots WHERE source_id = ? AND fingerprint = ?").get(input.sourceId, fingerprint);
1036
- const spec = JSON.parse(sourceRow.spec_json);
1037
- const checkedAt = nowIso();
1038
- const nextDueAt = addHoursIso(spec.schedule.everyHours);
1039
- if (existing) {
1040
- db.prepare(`
1041
- UPDATE sources
1042
- SET last_checked_at = ?, last_successful_snapshot_at = ?, last_successful_snapshot_id = ?, next_due_at = ?, updated_at = ?
1043
- WHERE id = ?
1044
- `).run(checkedAt, checkedAt, existing.id, nextDueAt, checkedAt, input.sourceId);
1045
- queueEmbeddingJobForSnapshot(
1046
- input.sourceId,
1047
- existing.id,
1048
- sourceRow.last_successful_snapshot_id
1049
- );
1050
- db.prepare(`
1051
- INSERT INTO fetch_runs (id, source_id, status, snapshot_id, started_at, finished_at)
1052
- VALUES (?, ?, 'success', ?, ?, ?)
1053
- `).run(randomUUID(), input.sourceId, existing.id, checkedAt, checkedAt);
1054
- return {
1055
- snapshotId: existing.id,
1056
- reused: true
1057
- };
1058
- }
1059
- const snapshotId = `snp_${checkedAt.replace(/[-:.TZ]/g, "")}_${fingerprint.slice(0, 12)}`;
1060
- const insertSnapshot = db.prepare(`
1061
- INSERT INTO snapshots (
1062
- id, source_id, fingerprint, config_hash, detected_version, page_count, created_at
1063
- ) VALUES (?, ?, ?, ?, ?, ?, ?)
1064
- `);
1065
- const insertPage = db.prepare(`
1066
- INSERT INTO pages (snapshot_id, url, title, markdown, content_hash)
1067
- VALUES (?, ?, ?, ?, ?)
1068
- `);
1069
- const insertChunk = db.prepare(`
1070
- INSERT INTO chunks (
1071
- source_id, snapshot_id, page_id, page_url, page_title, section_title, chunk_order, markdown
1072
- ) VALUES (?, ?, ?, ?, ?, ?, ?, ?)
1073
- `);
1074
- const insertRun = db.prepare(`
1075
- INSERT INTO fetch_runs (id, source_id, status, snapshot_id, started_at, finished_at)
1076
- VALUES (?, ?, 'success', ?, ?, ?)
1077
- `);
1078
- const transaction = db.transaction(() => {
1079
- insertSnapshot.run(
1080
- snapshotId,
1081
- input.sourceId,
1082
- fingerprint,
1083
- sourceRow.config_hash,
1084
- input.detectedVersion ?? null,
1085
- pagesWithHashes.length,
1086
- checkedAt
1087
- );
1088
- for (const page of pagesWithHashes) {
1089
- const pageInsert = insertPage.run(snapshotId, page.url, page.title, page.markdown, page.contentHash);
1090
- const pageId = Number(pageInsert.lastInsertRowid);
1091
- const chunks = chunkMarkdown(page.title, page.markdown);
1092
- for (const chunk of chunks) {
1093
- insertChunk.run(
1094
- input.sourceId,
1095
- snapshotId,
1096
- pageId,
1097
- page.url,
1098
- page.title,
1099
- chunk.sectionTitle,
1100
- chunk.chunkOrder,
1101
- chunk.markdown
1102
- );
1103
- }
1104
- }
1105
- db.prepare(`
1106
- UPDATE sources
1107
- SET last_checked_at = ?, last_successful_snapshot_at = ?, last_successful_snapshot_id = ?, next_due_at = ?, updated_at = ?
1108
- WHERE id = ?
1109
- `).run(checkedAt, checkedAt, snapshotId, nextDueAt, checkedAt, input.sourceId);
1110
- queueEmbeddingJobForSnapshot(
1111
- input.sourceId,
1112
- snapshotId,
1113
- sourceRow.last_successful_snapshot_id
1114
- );
1115
- insertRun.run(randomUUID(), input.sourceId, snapshotId, checkedAt, checkedAt);
1116
- });
1117
- transaction();
1118
- return {
1119
- snapshotId,
1120
- reused: false
1121
- };
1122
- },
1123
- recordFailedFetchRun(input) {
1124
- const sourceRow = db.prepare("SELECT spec_json FROM sources WHERE id = ?").get(input.sourceId);
1125
- if (!sourceRow) {
1126
- throw new AiocsError(
1127
- AIOCS_ERROR_CODES.sourceNotFound,
1128
- `Unknown source '${input.sourceId}'`
1129
- );
1130
- }
1131
- const spec = JSON.parse(sourceRow.spec_json);
1132
- const timestamp = nowIso();
1133
- db.prepare(`
1134
- INSERT INTO fetch_runs (id, source_id, status, error_message, started_at, finished_at)
1135
- VALUES (?, ?, 'failed', ?, ?, ?)
1136
- `).run(randomUUID(), input.sourceId, input.errorMessage, timestamp, timestamp);
1137
- db.prepare(`
1138
- UPDATE sources
1139
- SET last_checked_at = ?, next_due_at = ?, updated_at = ?
1140
- WHERE id = ?
1141
- `).run(timestamp, addHoursIso(spec.schedule.everyHours), timestamp, input.sourceId);
1142
- },
1143
- recordCanaryRun(input) {
1144
- const sourceRow = db.prepare("SELECT spec_json FROM sources WHERE id = ?").get(input.sourceId);
1145
- if (!sourceRow) {
1146
- throw new AiocsError(
1147
- AIOCS_ERROR_CODES.sourceNotFound,
1148
- `Unknown source '${input.sourceId}'`
1149
- );
1150
- }
1151
- const spec = JSON.parse(sourceRow.spec_json);
1152
- const canary = resolveSourceCanary(spec);
1153
- db.prepare(`
1154
- INSERT INTO canary_runs (id, source_id, status, checked_at, details_json)
1155
- VALUES (?, ?, ?, ?, ?)
1156
- `).run(
1157
- randomUUID(),
1158
- input.sourceId,
1159
- input.status,
1160
- input.checkedAt,
1161
- JSON.stringify(input.details)
1162
- );
1163
- db.prepare(`
1164
- UPDATE sources
1165
- SET
1166
- last_canary_checked_at = ?,
1167
- last_successful_canary_at = CASE WHEN ? = 'pass' THEN ? ELSE last_successful_canary_at END,
1168
- last_canary_status = ?,
1169
- next_canary_due_at = ?,
1170
- updated_at = ?
1171
- WHERE id = ?
1172
- `).run(
1173
- input.checkedAt,
1174
- input.status,
1175
- input.checkedAt,
1176
- input.status,
1177
- addHoursIso(canary.everyHours),
1178
- input.checkedAt,
1179
- input.sourceId
1180
- );
1181
- },
1182
- listProjectLinks,
1183
- removeManagedSources(input) {
1184
- if (input.managedRoots.length === 0) {
1185
- return [];
1186
- }
1187
- const activeSourceKeys = new Set(
1188
- input.activeSources.map((source) => `${source.sourceId}::${canonicalizeManagedSpecPath(source.specPath)}`)
1189
- );
1190
- const normalizedManagedRoots = input.managedRoots.map((managedRoot) => canonicalizeManagedSpecPath(managedRoot));
1191
- const rows = db.prepare(`
1192
- SELECT id, spec_path
1193
- FROM sources
1194
- WHERE spec_path IS NOT NULL
1195
- ORDER BY id
1196
- `).all();
1197
- const toDelete = rows.filter((row) => {
1198
- if (!row.spec_path) {
1199
- return false;
1200
- }
1201
- const normalizedSpecPath = canonicalizeManagedSpecPath(row.spec_path);
1202
- return normalizedManagedRoots.some(
1203
- (managedRoot) => normalizedSpecPath === managedRoot || normalizedSpecPath.startsWith(`${managedRoot}/`)
1204
- ) && !activeSourceKeys.has(`${row.id}::${normalizedSpecPath}`);
1205
- }).map((row) => row.id);
1206
- if (toDelete.length === 0) {
1207
- return [];
1208
- }
1209
- const deleteStatement = db.prepare("DELETE FROM sources WHERE id = ?");
1210
- const transaction = db.transaction((sourceIds) => {
1211
- for (const sourceId of sourceIds) {
1212
- deleteStatement.run(sourceId);
1213
- }
1214
- });
1215
- transaction(toDelete);
1216
- return toDelete;
1217
- },
1218
- listSnapshots(sourceId) {
1219
- const rows = db.prepare(`
1220
- SELECT id, source_id, detected_version, created_at, page_count
1221
- FROM snapshots
1222
- WHERE source_id = ?
1223
- ORDER BY rowid DESC
1224
- `).all(sourceId);
1225
- return rows.map((row) => ({
1226
- snapshotId: row.id,
1227
- sourceId: row.source_id,
1228
- detectedVersion: row.detected_version,
1229
- createdAt: row.created_at,
1230
- pageCount: row.page_count
1231
- }));
1232
- },
1233
- diffSnapshots(input) {
1234
- const snapshots = this.listSnapshots(input.sourceId);
1235
- if (snapshots.length === 0) {
1236
- throw new AiocsError(
1237
- AIOCS_ERROR_CODES.snapshotNotFound,
1238
- `No successful snapshot found for source '${input.sourceId}'`
1239
- );
1240
- }
1241
- const toSnapshot = input.toSnapshotId ? snapshots.find((snapshot) => snapshot.snapshotId === input.toSnapshotId) : snapshots[0];
1242
- if (!toSnapshot) {
1243
- throw new AiocsError(
1244
- AIOCS_ERROR_CODES.snapshotNotFound,
1245
- `Snapshot '${input.toSnapshotId}' not found for source '${input.sourceId}'`
1246
- );
1247
- }
1248
- const toSnapshotIndex = snapshots.findIndex((snapshot) => snapshot.snapshotId === toSnapshot.snapshotId);
1249
- const fromSnapshot = input.fromSnapshotId ? snapshots.find((snapshot) => snapshot.snapshotId === input.fromSnapshotId) : snapshots[toSnapshotIndex + 1];
1250
- if (!fromSnapshot) {
1251
- throw new AiocsError(
1252
- AIOCS_ERROR_CODES.snapshotDiffBaseNotFound,
1253
- `No base snapshot available to diff source '${input.sourceId}'`
1254
- );
1255
- }
1256
- const loadSnapshotPages = (snapshotId) => db.prepare(`
1257
- SELECT url, title, markdown, content_hash
1258
- FROM pages
1259
- WHERE snapshot_id = ?
1260
- ORDER BY url
1261
- `).all(snapshotId);
1262
- const beforePages = loadSnapshotPages(fromSnapshot.snapshotId);
1263
- const afterPages = loadSnapshotPages(toSnapshot.snapshotId);
1264
- const beforeMap = new Map(beforePages.map((page) => [page.url, page]));
1265
- const afterMap = new Map(afterPages.map((page) => [page.url, page]));
1266
- const addedPages = afterPages.filter((page) => !beforeMap.has(page.url)).map((page) => ({
1267
- url: page.url,
1268
- title: page.title
1269
- }));
1270
- const removedPages = beforePages.filter((page) => !afterMap.has(page.url)).map((page) => ({
1271
- url: page.url,
1272
- title: page.title
1273
- }));
1274
- const summarizeLineDiff = (beforeMarkdown, afterMarkdown) => {
1275
- const beforeLines = beforeMarkdown.split("\n");
1276
- const afterLines = afterMarkdown.split("\n");
1277
- let prefix = 0;
1278
- while (prefix < beforeLines.length && prefix < afterLines.length && beforeLines[prefix] === afterLines[prefix]) {
1279
- prefix += 1;
1280
- }
1281
- let suffix = 0;
1282
- while (suffix < beforeLines.length - prefix && suffix < afterLines.length - prefix && beforeLines[beforeLines.length - 1 - suffix] === afterLines[afterLines.length - 1 - suffix]) {
1283
- suffix += 1;
1284
- }
1285
- return {
1286
- addedLineCount: Math.max(0, afterLines.length - prefix - suffix),
1287
- removedLineCount: Math.max(0, beforeLines.length - prefix - suffix)
1288
- };
1289
- };
1290
- const changedPages = beforePages.filter((page) => afterMap.has(page.url)).map((page) => ({
1291
- before: page,
1292
- after: afterMap.get(page.url)
1293
- })).filter(({ before, after }) => before.content_hash !== after.content_hash || before.title !== after.title).map(({ before, after }) => ({
1294
- url: before.url,
1295
- beforeTitle: before.title,
1296
- afterTitle: after.title,
1297
- lineSummary: summarizeLineDiff(before.markdown, after.markdown)
1298
- }));
1299
- const unchangedPageCount = beforePages.filter((page) => {
1300
- const next = afterMap.get(page.url);
1301
- return next && next.content_hash === page.content_hash && next.title === page.title;
1302
- }).length;
1303
- return {
1304
- sourceId: input.sourceId,
1305
- fromSnapshotId: fromSnapshot.snapshotId,
1306
- toSnapshotId: toSnapshot.snapshotId,
1307
- summary: {
1308
- addedPageCount: addedPages.length,
1309
- removedPageCount: removedPages.length,
1310
- changedPageCount: changedPages.length,
1311
- unchangedPageCount
1312
- },
1313
- addedPages,
1314
- removedPages,
1315
- changedPages
1316
- };
1317
- },
1318
- resolveSearchScope(input) {
1319
- return resolveSearchScope(input);
1320
- },
1321
- searchLexical(input) {
1322
- return searchLexicalByScope(input);
1323
- },
1324
- search(input) {
1325
- return searchLexicalByScope({
1326
- query: input.query,
1327
- scope: resolveSearchScope(input)
1328
- });
1329
- },
1330
- listLatestSnapshots(sourceIds) {
1331
- return listLatestSnapshots(sourceIds);
1332
- },
1333
- listSnapshotChunks(input) {
1334
- const rows = db.prepare(`
1335
- SELECT
1336
- c.id AS chunk_id,
1337
- c.source_id,
1338
- c.snapshot_id,
1339
- c.page_url,
1340
- c.page_title,
1341
- c.section_title,
1342
- c.markdown
1343
- FROM chunks c
1344
- WHERE c.source_id = ?
1345
- AND c.snapshot_id = ?
1346
- ORDER BY c.id
1347
- `).all(input.sourceId, input.snapshotId);
1348
- return rows.map((row) => ({
1349
- chunkId: row.chunk_id,
1350
- sourceId: row.source_id,
1351
- snapshotId: row.snapshot_id,
1352
- pageUrl: row.page_url,
1353
- pageTitle: row.page_title,
1354
- sectionTitle: row.section_title,
1355
- markdown: row.markdown,
1356
- contentHash: sha256(row.markdown)
1357
- }));
1358
- },
1359
- getSnapshotEmbeddingState(input) {
1360
- const rows = db.prepare(`
1361
- SELECT chunk_id, status, model_key, content_hash
1362
- FROM embedding_state
1363
- WHERE source_id = ?
1364
- AND snapshot_id = ?
1365
- ORDER BY chunk_id
1366
- `).all(input.sourceId, input.snapshotId);
1367
- return rows.map((row) => ({
1368
- chunkId: row.chunk_id,
1369
- status: row.status,
1370
- modelKey: row.model_key,
1371
- contentHash: row.content_hash
1372
- }));
1373
- },
1374
- listStaleEmbeddingChunkIds(sourceId) {
1375
- const rows = db.prepare(`
1376
- SELECT chunk_id
1377
- FROM embedding_state
1378
- WHERE source_id = ?
1379
- AND status = 'stale'
1380
- ORDER BY chunk_id
1381
- `).all(sourceId);
1382
- return rows.map((row) => row.chunk_id);
1383
- },
1384
- listEmbeddingChunkIds(sourceIds) {
1385
- const filterSourceIds = sourceIds && sourceIds.length > 0 ? [...new Set(sourceIds)] : null;
1386
- const rows = db.prepare(`
1387
- SELECT chunk_id
1388
- FROM embedding_state
1389
- ${filterSourceIds ? `WHERE source_id IN (${filterSourceIds.map(() => "?").join(",")})` : ""}
1390
- ORDER BY chunk_id
1391
- `).all(...filterSourceIds ?? []);
1392
- return rows.map((row) => row.chunk_id);
1393
- },
1394
- getChunksByIds(chunkIds) {
1395
- if (chunkIds.length === 0) {
1396
- return [];
1397
- }
1398
- const rows = db.prepare(`
1399
- SELECT
1400
- c.id AS chunk_id,
1401
- c.source_id,
1402
- c.snapshot_id,
1403
- c.page_url,
1404
- c.page_title,
1405
- c.section_title,
1406
- c.markdown
1407
- FROM chunks c
1408
- WHERE c.id IN (${chunkIds.map(() => "?").join(",")})
1409
- `).all(...chunkIds);
1410
- return rows.map((row) => ({
1411
- chunkId: row.chunk_id,
1412
- sourceId: row.source_id,
1413
- snapshotId: row.snapshot_id,
1414
- pageUrl: row.page_url,
1415
- pageTitle: row.page_title,
1416
- sectionTitle: row.section_title,
1417
- markdown: row.markdown
1418
- }));
1419
- },
1420
- queueLatestEmbeddingJobs(sourceIds) {
1421
- const latestSnapshots = listLatestSnapshots(sourceIds);
1422
- const transaction = db.transaction((snapshots) => {
1423
- for (const snapshot of snapshots) {
1424
- queueEmbeddingJobForSnapshot(snapshot.sourceId, snapshot.snapshotId);
1425
- }
1426
- });
1427
- transaction(latestSnapshots);
1428
- return {
1429
- queuedJobs: latestSnapshots.length
1430
- };
1431
- },
1432
- requeueLatestEmbeddingJobs(sourceIds) {
1433
- const latestSnapshots = listLatestSnapshots(sourceIds);
1434
- const transaction = db.transaction((snapshots) => {
1435
- for (const snapshot of snapshots) {
1436
- db.prepare(`
1437
- UPDATE embedding_state
1438
- SET
1439
- status = 'pending',
1440
- model_key = NULL,
1441
- vector_point_id = NULL,
1442
- last_attempted_at = NULL,
1443
- indexed_at = NULL,
1444
- error_message = NULL
1445
- WHERE source_id = ?
1446
- AND snapshot_id = ?
1447
- `).run(snapshot.sourceId, snapshot.snapshotId);
1448
- queueEmbeddingJobForSnapshot(snapshot.sourceId, snapshot.snapshotId);
1449
- }
1450
- });
1451
- transaction(latestSnapshots);
1452
- return {
1453
- queuedJobs: latestSnapshots.length
1454
- };
1455
- },
1456
- resetEmbeddingsAfterImport() {
1457
- const transaction = db.transaction(() => {
1458
- db.prepare("DELETE FROM embedding_jobs").run();
1459
- db.prepare("DELETE FROM embedding_state").run();
1460
- });
1461
- transaction();
1462
- const latestSnapshots = listLatestSnapshots();
1463
- const queueTransaction = db.transaction((snapshots) => {
1464
- for (const snapshot of snapshots) {
1465
- queueEmbeddingJobForSnapshot(snapshot.sourceId, snapshot.snapshotId);
1466
- }
1467
- });
1468
- queueTransaction(latestSnapshots);
1469
- return {
1470
- queuedJobs: latestSnapshots.length
1471
- };
1472
- },
1473
- resetRunningEmbeddingJobs() {
1474
- const result = db.prepare(`
1475
- UPDATE embedding_jobs
1476
- SET
1477
- status = 'pending',
1478
- updated_at = ?,
1479
- claimed_at = NULL,
1480
- error_message = NULL
1481
- WHERE status = 'running'
1482
- `).run(nowIso());
1483
- return result.changes;
1484
- },
1485
- claimEmbeddingJobs(limit) {
1486
- const normalizedLimit = assertPaginationValue(limit, "limit", limit);
1487
- if (normalizedLimit === 0) {
1488
- return [];
1489
- }
1490
- const claimedAt = nowIso();
1491
- const transaction = db.transaction(() => {
1492
- const pending = db.prepare(`
1493
- SELECT
1494
- source_id,
1495
- snapshot_id,
1496
- status,
1497
- attempt_count,
1498
- chunk_count,
1499
- created_at,
1500
- updated_at,
1501
- claimed_at,
1502
- completed_at,
1503
- error_message
1504
- FROM embedding_jobs
1505
- WHERE status = 'pending'
1506
- ORDER BY updated_at, source_id, snapshot_id
1507
- LIMIT ?
1508
- `).all(normalizedLimit);
1509
- const claim = db.prepare(`
1510
- UPDATE embedding_jobs
1511
- SET
1512
- status = 'running',
1513
- attempt_count = attempt_count + 1,
1514
- updated_at = ?,
1515
- claimed_at = ?,
1516
- error_message = NULL
1517
- WHERE source_id = ?
1518
- AND snapshot_id = ?
1519
- `);
1520
- for (const job of pending) {
1521
- claim.run(claimedAt, claimedAt, job.source_id, job.snapshot_id);
1522
- }
1523
- return pending.map((job) => ({
1524
- sourceId: job.source_id,
1525
- snapshotId: job.snapshot_id,
1526
- status: "running",
1527
- attemptCount: job.attempt_count + 1,
1528
- chunkCount: job.chunk_count,
1529
- createdAt: job.created_at,
1530
- updatedAt: claimedAt,
1531
- claimedAt,
1532
- completedAt: job.completed_at,
1533
- errorMessage: null
1534
- }));
1535
- });
1536
- return transaction();
1537
- },
1538
- markEmbeddingJobSucceeded(input) {
1539
- const timestamp = nowIso();
1540
- const staleChunkIds = [...new Set(input.staleChunkIds ?? [])];
1541
- const indexedChunkIds = [...new Set(input.indexedChunkIds)];
1542
- const indexedPlaceholders = indexedChunkIds.length > 0 ? indexedChunkIds.map(() => "?").join(",") : null;
1543
- const stalePlaceholders = staleChunkIds.length > 0 ? staleChunkIds.map(() => "?").join(",") : null;
1544
- const transaction = db.transaction(() => {
1545
- if (indexedPlaceholders) {
1546
- db.prepare(`
1547
- UPDATE embedding_state
1548
- SET
1549
- status = 'indexed',
1550
- model_key = ?,
1551
- vector_point_id = CAST(chunk_id AS TEXT),
1552
- last_attempted_at = ?,
1553
- indexed_at = ?,
1554
- error_message = NULL
1555
- WHERE chunk_id IN (${indexedPlaceholders})
1556
- `).run(input.modelKey, timestamp, timestamp, ...indexedChunkIds);
1557
- }
1558
- db.prepare(`
1559
- UPDATE embedding_state
1560
- SET
1561
- status = 'failed',
1562
- model_key = NULL,
1563
- vector_point_id = NULL,
1564
- last_attempted_at = ?,
1565
- indexed_at = NULL,
1566
- error_message = 'Chunk was not indexed during the latest embedding run'
1567
- WHERE source_id = ?
1568
- AND snapshot_id = ?
1569
- AND status != 'indexed'
1570
- `).run(timestamp, input.sourceId, input.snapshotId);
1571
- if (stalePlaceholders) {
1572
- db.prepare(`
1573
- DELETE FROM embedding_state
1574
- WHERE chunk_id IN (${stalePlaceholders})
1575
- `).run(...staleChunkIds);
1576
- }
1577
- db.prepare(`
1578
- UPDATE embedding_jobs
1579
- SET
1580
- status = 'succeeded',
1581
- updated_at = ?,
1582
- completed_at = ?,
1583
- claimed_at = NULL,
1584
- error_message = NULL
1585
- WHERE source_id = ?
1586
- AND snapshot_id = ?
1587
- `).run(timestamp, timestamp, input.sourceId, input.snapshotId);
1588
- });
1589
- transaction();
1590
- },
1591
- markEmbeddingJobFailed(input) {
1592
- const timestamp = nowIso();
1593
- const transaction = db.transaction(() => {
1594
- db.prepare(`
1595
- UPDATE embedding_state
1596
- SET
1597
- status = 'failed',
1598
- model_key = NULL,
1599
- vector_point_id = NULL,
1600
- last_attempted_at = ?,
1601
- indexed_at = NULL,
1602
- error_message = ?
1603
- WHERE source_id = ?
1604
- AND snapshot_id = ?
1605
- AND status != 'indexed'
1606
- `).run(timestamp, input.errorMessage, input.sourceId, input.snapshotId);
1607
- db.prepare(`
1608
- UPDATE embedding_jobs
1609
- SET
1610
- status = 'failed',
1611
- updated_at = ?,
1612
- completed_at = ?,
1613
- claimed_at = NULL,
1614
- error_message = ?
1615
- WHERE source_id = ?
1616
- AND snapshot_id = ?
1617
- `).run(timestamp, timestamp, input.errorMessage, input.sourceId, input.snapshotId);
1618
- });
1619
- transaction();
1620
- },
1621
- clearEmbeddings(sourceIds) {
1622
- const latestSnapshots = listLatestSnapshots(sourceIds);
1623
- const clearedSources = latestSnapshots.map((snapshot) => snapshot.sourceId);
1624
- const filterSourceIds = sourceIds && sourceIds.length > 0 ? [...new Set(sourceIds)] : null;
1625
- const transaction = db.transaction(() => {
1626
- if (filterSourceIds && filterSourceIds.length > 0) {
1627
- db.prepare(`
1628
- DELETE FROM embedding_jobs
1629
- WHERE source_id IN (${filterSourceIds.map(() => "?").join(",")})
1630
- `).run(...filterSourceIds);
1631
- db.prepare(`
1632
- DELETE FROM embedding_state
1633
- WHERE source_id IN (${filterSourceIds.map(() => "?").join(",")})
1634
- `).run(...filterSourceIds);
1635
- } else {
1636
- db.prepare("DELETE FROM embedding_jobs").run();
1637
- db.prepare("DELETE FROM embedding_state").run();
1638
- }
1639
- });
1640
- transaction();
1641
- return {
1642
- clearedSources
1643
- };
1644
- },
1645
- getEmbeddingOverview() {
1646
- const queueCounts = db.prepare(`
1647
- SELECT
1648
- SUM(CASE WHEN status = 'pending' THEN 1 ELSE 0 END) AS pending_jobs,
1649
- SUM(CASE WHEN status = 'running' THEN 1 ELSE 0 END) AS running_jobs,
1650
- SUM(CASE WHEN status = 'failed' THEN 1 ELSE 0 END) AS failed_jobs
1651
- FROM embedding_jobs
1652
- `).get();
1653
- const rows = db.prepare(`
1654
- SELECT
1655
- s.id AS source_id,
1656
- s.last_successful_snapshot_id AS snapshot_id,
1657
- COUNT(c.id) AS total_chunks,
1658
- SUM(CASE WHEN es.status = 'indexed' THEN 1 ELSE 0 END) AS indexed_chunks,
1659
- SUM(CASE WHEN es.status = 'pending' THEN 1 ELSE 0 END) AS pending_chunks,
1660
- SUM(CASE WHEN es.status = 'failed' THEN 1 ELSE 0 END) AS failed_chunks,
1661
- SUM(CASE WHEN es.status = 'stale' THEN 1 ELSE 0 END) AS stale_chunks
1662
- FROM sources s
1663
- LEFT JOIN chunks c
1664
- ON c.snapshot_id = s.last_successful_snapshot_id
1665
- LEFT JOIN embedding_state es
1666
- ON es.chunk_id = c.id
1667
- GROUP BY s.id, s.last_successful_snapshot_id
1668
- ORDER BY s.id
1669
- `).all();
1670
- return {
1671
- queue: {
1672
- pendingJobs: queueCounts.pending_jobs ?? 0,
1673
- runningJobs: queueCounts.running_jobs ?? 0,
1674
- failedJobs: queueCounts.failed_jobs ?? 0
1675
- },
1676
- sources: rows.map((row) => ({
1677
- sourceId: row.source_id,
1678
- snapshotId: row.snapshot_id,
1679
- totalChunks: row.total_chunks,
1680
- indexedChunks: row.indexed_chunks,
1681
- pendingChunks: row.pending_chunks,
1682
- failedChunks: row.failed_chunks,
1683
- staleChunks: row.stale_chunks,
1684
- coverageRatio: row.total_chunks === 0 ? 0 : row.indexed_chunks / row.total_chunks
1685
- }))
1686
- };
1687
- },
1688
- markDaemonStarted(input) {
1689
- db.prepare(`
1690
- INSERT INTO daemon_state (
1691
- singleton_id,
1692
- last_started_at,
1693
- interval_minutes,
1694
- fetch_on_start
1695
- ) VALUES (1, ?, ?, ?)
1696
- ON CONFLICT(singleton_id) DO UPDATE SET
1697
- last_started_at = excluded.last_started_at,
1698
- interval_minutes = excluded.interval_minutes,
1699
- fetch_on_start = excluded.fetch_on_start
1700
- `).run(
1701
- input.startedAt,
1702
- input.intervalMinutes,
1703
- input.fetchOnStart ? 1 : 0
1704
- );
1705
- },
1706
- markDaemonCycleStarted(startedAt) {
1707
- db.prepare(`
1708
- INSERT INTO daemon_state (singleton_id, last_cycle_started_at)
1709
- VALUES (1, ?)
1710
- ON CONFLICT(singleton_id) DO UPDATE SET
1711
- last_cycle_started_at = excluded.last_cycle_started_at
1712
- `).run(startedAt);
1713
- },
1714
- markDaemonCycleCompleted(input) {
1715
- db.prepare(`
1716
- INSERT INTO daemon_state (
1717
- singleton_id,
1718
- last_cycle_completed_at,
1719
- last_cycle_status
1720
- ) VALUES (1, ?, ?)
1721
- ON CONFLICT(singleton_id) DO UPDATE SET
1722
- last_cycle_completed_at = excluded.last_cycle_completed_at,
1723
- last_cycle_status = excluded.last_cycle_status
1724
- `).run(
1725
- input.completedAt,
1726
- input.status
1727
- );
1728
- },
1729
- getDaemonState() {
1730
- const row = db.prepare(`
1731
- SELECT
1732
- last_started_at,
1733
- last_cycle_started_at,
1734
- last_cycle_completed_at,
1735
- last_cycle_status,
1736
- interval_minutes,
1737
- fetch_on_start
1738
- FROM daemon_state
1739
- WHERE singleton_id = 1
1740
- `).get();
1741
- if (!row) {
1742
- return null;
1743
- }
1744
- return {
1745
- lastStartedAt: row.last_started_at,
1746
- lastCycleStartedAt: row.last_cycle_started_at,
1747
- lastCycleCompletedAt: row.last_cycle_completed_at,
1748
- lastCycleStatus: row.last_cycle_status,
1749
- intervalMinutes: row.interval_minutes,
1750
- fetchOnStart: row.fetch_on_start === null ? null : row.fetch_on_start === 1
1751
- };
1752
- },
1753
- getCoverageCorpus(input) {
1754
- const sourceRow = db.prepare("SELECT last_successful_snapshot_id FROM sources WHERE id = ?").get(input.sourceId);
1755
- if (!sourceRow) {
1756
- throw new AiocsError(
1757
- AIOCS_ERROR_CODES.sourceNotFound,
1758
- `Unknown source '${input.sourceId}'`
1759
- );
1760
- }
1761
- const snapshotId = input.snapshotId ?? sourceRow.last_successful_snapshot_id;
1762
- if (!snapshotId) {
1763
- throw new AiocsError(
1764
- AIOCS_ERROR_CODES.snapshotNotFound,
1765
- `No successful snapshot found for source '${input.sourceId}'`
1766
- );
1767
- }
1768
- const snapshotRow = db.prepare("SELECT id FROM snapshots WHERE id = ? AND source_id = ?").get(snapshotId, input.sourceId);
1769
- if (!snapshotRow) {
1770
- throw new AiocsError(
1771
- AIOCS_ERROR_CODES.snapshotNotFound,
1772
- `Snapshot '${snapshotId}' not found for source '${input.sourceId}'`
1773
- );
1774
- }
1775
- const rows = db.prepare(`
1776
- SELECT page_title, section_title, markdown
1777
- FROM chunks
1778
- WHERE source_id = ?
1779
- AND snapshot_id = ?
1780
- ORDER BY page_id, chunk_order
1781
- `).all(input.sourceId, snapshotId);
1782
- return {
1783
- sourceId: input.sourceId,
1784
- snapshotId,
1785
- entries: rows.map((row) => ({
1786
- pageTitle: row.page_title,
1787
- sectionTitle: row.section_title,
1788
- markdown: row.markdown
1789
- }))
1790
- };
1791
- },
1792
- getChunkById(chunkId) {
1793
- const row = db.prepare(`
1794
- SELECT
1795
- c.id AS chunk_id,
1796
- c.source_id,
1797
- c.snapshot_id,
1798
- c.page_url,
1799
- c.page_title,
1800
- c.section_title,
1801
- c.markdown
1802
- FROM chunks c
1803
- WHERE c.id = ?
1804
- `).get(chunkId);
1805
- if (!row) {
1806
- return null;
1807
- }
1808
- return {
1809
- chunkId: row.chunk_id,
1810
- sourceId: row.source_id,
1811
- snapshotId: row.snapshot_id,
1812
- pageUrl: row.page_url,
1813
- pageTitle: row.page_title,
1814
- sectionTitle: row.section_title,
1815
- markdown: row.markdown
1816
- };
1817
- }
1818
- };
1819
- }
1820
-
1821
- // src/daemon.ts
1822
- import { existsSync as existsSync2 } from "fs";
1823
- import { resolve as resolve5 } from "path";
1824
- import { setTimeout as sleep2 } from "timers/promises";
1825
-
1826
- // src/fetch/fetch-source.ts
1827
- import { mkdirSync as mkdirSync3, writeFileSync } from "fs";
1828
- import { join as join4 } from "path";
1829
- import { setTimeout as sleep } from "timers/promises";
1830
- import { chromium } from "playwright";
1831
-
1832
- // src/fetch/extract.ts
1833
- import { JSDOM } from "jsdom";
1834
- import { Readability } from "@mozilla/readability";
1835
-
1836
- // src/fetch/normalize.ts
1837
- import TurndownService from "turndown";
1838
- import { gfm } from "turndown-plugin-gfm";
1839
- var turndown = new TurndownService({
1840
- headingStyle: "atx",
1841
- codeBlockStyle: "fenced"
1842
- });
1843
- turndown.use(gfm);
1844
- function htmlToMarkdown(html) {
1845
- return turndown.turndown(html).trim();
1846
- }
1847
- function ensureTitle(markdown, title) {
1848
- const trimmed = markdown.trim();
1849
- if (!trimmed) {
1850
- return `# ${title}`;
1851
- }
1852
- if (trimmed.startsWith("# ")) {
1853
- return trimmed;
1854
- }
1855
- return `# ${title}
1856
-
1857
- ${trimmed}`;
1858
- }
1859
- function normalizeMarkdown(spec, page) {
1860
- const titled = ensureTitle(page.markdown, page.title);
1861
- if (!spec.normalize.prependSourceComment) {
1862
- return titled;
1863
- }
1864
- return `<!-- source: ${page.url} -->
1865
-
1866
- ${titled}`;
1867
- }
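// Illustrative sketch (not part of the package; URL and title are made up):
// ensureTitle guarantees a leading H1, and normalizeMarkdown optionally prepends a
// provenance comment when the spec enables normalize.prependSourceComment.
var exampleNormalized = normalizeMarkdown(
  { normalize: { prependSourceComment: true } },
  { url: "https://docs.example.com/guide", title: "Guide", markdown: "Some intro text." }
);
// => "<!-- source: https://docs.example.com/guide -->\n\n# Guide\n\nSome intro text."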
1868
-
1869
- // src/fetch/extract.ts
1870
- var CLIPBOARD_INTERACTION_DEFAULT_TIMEOUT_MS = 1e3;
1871
- async function readClipboard(page) {
1872
- return page.evaluate(() => navigator.clipboard.readText());
1873
- }
1874
- async function writeClipboard(page, value) {
1875
- return page.evaluate(async (nextValue) => {
1876
- try {
1877
- await navigator.clipboard.writeText(nextValue);
1878
- return true;
1879
- } catch {
1880
- return false;
1881
- }
1882
- }, value);
1883
- }
1884
- async function waitForClipboardChange(page, previousText, timeoutMs) {
1885
- const startedAt = Date.now();
1886
- while (Date.now() - startedAt < timeoutMs) {
1887
- const current = (await readClipboard(page)).trim();
1888
- if (current && current !== previousText.trim()) {
1889
- return current;
1890
- }
1891
- await page.waitForTimeout(100);
1892
- }
1893
- throw new Error("Timed out waiting for clipboard content to change");
1894
- }
1895
- async function performClipboardInteractions(page, strategy, deadlineAt) {
1896
- for (const interaction of strategy.interactions) {
1897
- const remainingMs = deadlineAt - Date.now();
1898
- if (remainingMs <= 0) {
1899
- throw new Error("Timed out before clipboard copy controls became ready");
1900
- }
1901
- if (interaction.action === "hover") {
1902
- const locator = page.locator(interaction.selector).first();
1903
- const interactionTimeout = Math.min(
1904
- interaction.timeoutMs ?? CLIPBOARD_INTERACTION_DEFAULT_TIMEOUT_MS,
1905
- remainingMs
1906
- );
1907
- await locator.waitFor({
1908
- state: "visible",
1909
- timeout: interactionTimeout
1910
- });
1911
- await locator.hover({
1912
- timeout: interactionTimeout
1913
- });
1914
- continue;
1915
- }
1916
- if (interaction.action === "click") {
1917
- const locator = page.locator(interaction.selector).first();
1918
- const interactionTimeout = Math.min(
1919
- interaction.timeoutMs ?? CLIPBOARD_INTERACTION_DEFAULT_TIMEOUT_MS,
1920
- remainingMs
1921
- );
1922
- await locator.waitFor({
1923
- state: "visible",
1924
- timeout: interactionTimeout
1925
- });
1926
- await locator.click({
1927
- timeout: interactionTimeout
1928
- });
1929
- continue;
1930
- }
1931
- if (interaction.action === "press") {
1932
- await page.keyboard.press(interaction.key);
1933
- continue;
1934
- }
1935
- await page.waitForTimeout(Math.min(interaction.timeoutMs, remainingMs));
1936
- }
1937
- }
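// Illustrative sketch (selectors are hypothetical, not taken from any bundled source
// spec): a clipboardButton extract strategy lists the interactions replayed above
// before the clipboard is polled for the copied markdown.
var exampleClipboardStrategy = {
  strategy: "clipboardButton",
  clipboardTimeoutMs: 5e3,
  interactions: [
    { action: "hover", selector: "[data-testid='page-actions']", timeoutMs: 1e3 },
    { action: "click", selector: "button.copy-markdown", timeoutMs: 1e3 }
  ]
};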
1938
- async function runClipboardStrategy(page, strategy) {
1939
- const sentinel = `__aiocs_clipboard_marker__${Date.now()}__${Math.random().toString(36).slice(2)}__`;
1940
- const before = await writeClipboard(page, sentinel).catch(() => false) ? sentinel : await readClipboard(page).catch(() => "");
1941
- const deadlineAt = Date.now() + strategy.clipboardTimeoutMs;
1942
- let lastError = null;
1943
- let markdown = null;
1944
- while (Date.now() < deadlineAt && !markdown) {
1945
- try {
1946
- await performClipboardInteractions(page, strategy, deadlineAt);
1947
- } catch (error) {
1948
- lastError = error instanceof Error ? error : new Error(String(error));
1949
- }
1950
- const remainingMs = deadlineAt - Date.now();
1951
- if (remainingMs <= 0) {
1952
- break;
1953
- }
1954
- try {
1955
- markdown = await waitForClipboardChange(page, before, Math.min(400, remainingMs));
1956
- } catch (error) {
1957
- lastError = error instanceof Error ? error : new Error(String(error));
1958
- }
1959
- }
1960
- if (!markdown) {
1961
- throw lastError ?? new Error("Timed out waiting for clipboard content to change");
1962
- }
1963
- const title = extractTitleFromMarkdown(markdown) ?? await page.title();
1964
- return {
1965
- title,
1966
- markdown: markdown.trim()
1967
- };
1968
- }
1969
- async function runSelectorStrategy(page, selector) {
1970
- const locator = page.locator(selector).first();
1971
- await locator.waitFor({ state: "visible", timeout: 1e4 });
1972
- const html = await locator.innerHTML();
1973
- const heading = await locator.locator("h1").first().textContent().catch(() => null);
1974
- const title = (heading ?? await page.title()).trim();
1975
- return {
1976
- title,
1977
- markdown: htmlToMarkdown(html)
1978
- };
1979
- }
1980
- async function runReadabilityStrategy(page) {
1981
- const html = await page.content();
1982
- const dom = new JSDOM(html, { url: page.url() });
1983
- const reader = new Readability(dom.window.document);
1984
- const article = reader.parse();
1985
- if (!article?.content) {
1986
- throw new Error(`Readability could not extract content for ${page.url()}`);
1987
- }
1988
- return {
1989
- title: article.title?.trim() || await page.title(),
1990
- markdown: htmlToMarkdown(article.content)
1991
- };
1992
- }
1993
- function extractTitleFromMarkdown(markdown) {
1994
- for (const line of markdown.split("\n")) {
1995
- const trimmed = line.trim();
1996
- if (trimmed.startsWith("# ")) {
1997
- return trimmed.slice(2).trim();
1998
- }
1999
- }
2000
- return null;
2001
- }
2002
- async function extractPage(page, strategy) {
2003
- if (strategy.strategy === "clipboardButton") {
2004
- return runClipboardStrategy(page, strategy);
2005
- }
2006
- if (strategy.strategy === "selector") {
2007
- return runSelectorStrategy(page, strategy.selector);
2008
- }
2009
- return runReadabilityStrategy(page);
2010
- }
2011
-
2012
- // src/fetch/url-patterns.ts
2013
- function escapeRegex(value) {
2014
- return value.replace(/[|\\{}()[\]^$+?.]/g, "\\$&");
2015
- }
2016
- function patternToRegex(pattern) {
2017
- let regex = "^";
2018
- for (let index = 0; index < pattern.length; index += 1) {
2019
- const current = pattern[index];
2020
- const next = pattern[index + 1];
2021
- if (current === "*" && next === "*") {
2022
- regex += ".*";
2023
- index += 1;
2024
- continue;
2025
- }
2026
- if (current === "*") {
2027
- regex += "[^?#]*";
2028
- continue;
2029
- }
2030
- regex += escapeRegex(current ?? "");
2031
- }
2032
- return new RegExp(`${regex}$`);
2033
- }
2034
- function matchesPatterns(url, patterns) {
2035
- return patterns.some((pattern) => patternToRegex(pattern).test(url));
2036
- }
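// Illustrative sketch (URLs are hypothetical): a single "*" matches path characters
// but stops at "?" or "#", while "**" also spans query strings and fragments.
matchesPatterns("https://docs.example.com/guide/intro", ["https://docs.example.com/guide/*"]);
// => true
matchesPatterns("https://docs.example.com/guide/intro?page=2", ["https://docs.example.com/guide/*"]);
// => false ("*" stops at the query string)
matchesPatterns("https://docs.example.com/guide/intro?page=2", ["https://docs.example.com/guide/**"]);
// => true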
2037
-
2038
- // src/fetch/fetch-source.ts
2039
- var MAX_FETCH_ATTEMPTS = 3;
2040
- var RETRY_DELAY_MS = 250;
2041
- function nowIso2() {
2042
- return (/* @__PURE__ */ new Date()).toISOString();
2043
- }
2044
- function canonicalizeUrl(raw) {
2045
- const url = new URL(raw);
2046
- url.hash = "";
2047
- if (url.pathname !== "/" && url.pathname.endsWith("/")) {
2048
- url.pathname = url.pathname.slice(0, -1);
2049
- }
2050
- return url.toString();
2051
- }
2052
- function getCrawlKey(raw) {
2053
- const url = new URL(canonicalizeUrl(raw));
2054
- if (/\.(md|markdown)$/i.test(url.pathname)) {
2055
- url.pathname = url.pathname.replace(/\.(md|markdown)$/i, "");
2056
- }
2057
- return url.toString();
2058
- }
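// Illustrative sketch (URLs are hypothetical): canonicalizeUrl drops fragments and
// trailing slashes, and getCrawlKey also strips a .md/.markdown extension so a raw
// markdown variant shares its crawl key with the rendered page.
canonicalizeUrl("https://docs.example.com/guide/#setup"); // => "https://docs.example.com/guide"
getCrawlKey("https://docs.example.com/guide.md");         // => "https://docs.example.com/guide"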
2059
- function isAllowed(url, allowedHosts, include, exclude) {
2060
- const parsed = new URL(url);
2061
- if (!allowedHosts.includes(parsed.hostname)) {
2062
- return false;
2063
- }
2064
- if (parsed.pathname.startsWith("/~gitbook/")) {
2065
- return false;
2066
- }
2067
- if (!matchesPatterns(url, include)) {
2068
- return false;
2069
- }
2070
- if (exclude.length > 0 && matchesPatterns(url, exclude)) {
2071
- return false;
2072
- }
2073
- return true;
2074
- }
2075
- function slugify(value) {
2076
- return value.toLowerCase().replace(/[^a-z0-9]+/g, "-").replace(/^-+|-+$/g, "").slice(0, 80) || "page";
2077
- }
2078
- function extractTitleFromMarkdown2(markdown) {
2079
- for (const line of markdown.split("\n")) {
2080
- const trimmed = line.trim();
2081
- if (trimmed.startsWith("# ")) {
2082
- return trimmed.slice(2).trim();
2083
- }
2084
- }
2085
- return null;
2086
- }
2087
- function deriveTitleFromUrl(url) {
2088
- const pathname = new URL(url).pathname;
2089
- const lastSegment = pathname.split("/").filter(Boolean).pop() ?? "page";
2090
- return lastSegment.replace(/\.(md|markdown)$/i, "").replace(/[-_]+/g, " ").trim() || "page";
2091
- }
2092
- function isRawMarkdownResponse(url, response) {
2093
- if (!response) {
2094
- return false;
2095
- }
2096
- const contentType = response.headers()["content-type"]?.toLowerCase() ?? "";
2097
- if (contentType.includes("text/markdown") || contentType.includes("text/x-markdown")) {
2098
- return true;
2099
- }
2100
- return contentType.includes("text/plain") && /\.(md|markdown)$/i.test(new URL(url).pathname);
2101
- }
2102
- async function extractRawMarkdownPage(url, response) {
2103
- const markdown = (await response.text()).trim();
2104
- return {
2105
- url,
2106
- title: extractTitleFromMarkdown2(markdown) ?? deriveTitleFromUrl(url),
2107
- markdown
2108
- };
2109
- }
2110
- function persistSnapshotPages(input, snapshotId, pages) {
2111
- const snapshotDir = join4(input.dataDir, "sources", input.sourceId, "snapshots", snapshotId, "pages");
2112
- mkdirSync3(snapshotDir, { recursive: true });
2113
- pages.forEach((page, index) => {
2114
- const filename = `${String(index + 1).padStart(3, "0")}-${slugify(page.title)}.md`;
2115
- writeFileSync(join4(snapshotDir, filename), page.markdown, "utf8");
2116
- });
2117
- }
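// Illustrative sketch (title is hypothetical): snapshot pages are written in crawl
// order with zero-padded, slugified filenames under
// <dataDir>/sources/<sourceId>/snapshots/<snapshotId>/pages/.
slugify("Getting Started: CLI & MCP"); // => "getting-started-cli-mcp"
// first crawled page -> 001-getting-started-cli-mcp.md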
2118
- function resolveEnvValue(name, env) {
2119
- const value = env[name];
2120
- if (!value) {
2121
- throw new AiocsError(
2122
- AIOCS_ERROR_CODES.authEnvMissing,
2123
- `Missing required environment variable '${name}' for authenticated source access`,
2124
- {
2125
- envVar: name
2126
- }
2127
- );
2128
- }
2129
- return value;
2130
- }
2131
- function resolveSourceAuth(spec, env) {
2132
- const scopedHeaders = (spec.auth?.headers ?? []).map((header) => ({
2133
- name: header.name,
2134
- value: resolveEnvValue(header.valueFromEnv, env),
2135
- hosts: header.hosts ?? spec.allowedHosts,
2136
- ...header.include ? { include: header.include } : {}
2137
- }));
2138
- const cookies = (spec.auth?.cookies ?? []).map((cookie) => ({
2139
- name: cookie.name,
2140
- value: resolveEnvValue(cookie.valueFromEnv, env),
2141
- domain: cookie.domain,
2142
- path: cookie.path,
2143
- ...typeof cookie.secure === "boolean" ? { secure: cookie.secure } : {},
2144
- ...typeof cookie.httpOnly === "boolean" ? { httpOnly: cookie.httpOnly } : {},
2145
- ...cookie.sameSite ? { sameSite: cookie.sameSite } : {}
2146
- }));
2147
- return {
2148
- scopedHeaders,
2149
- cookies
2150
- };
2151
- }
2152
- function applyScopedAuthHeaders(requestUrl, headers, scopedHeaders) {
2153
- if (scopedHeaders.length === 0) {
2154
- return headers;
2155
- }
2156
- const hostname = new URL(requestUrl).hostname;
2157
- const nextHeaders = { ...headers };
2158
- for (const header of scopedHeaders) {
2159
- if (!header.hosts.includes(hostname)) {
2160
- continue;
2161
- }
2162
- if (header.include && !matchesPatterns(requestUrl, header.include)) {
2163
- continue;
2164
- }
2165
- nextHeaders[header.name] = header.value;
2166
- }
2167
- return nextHeaders;
2168
- }
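// Illustrative sketch (header name, hosts, and URLs are hypothetical): a header
// resolved from the environment is only attached when the request hostname is listed
// in `hosts` and, if an `include` list is present, the URL matches one of its patterns.
var exampleScopedHeaders = [{
  name: "Authorization",
  value: "Bearer <token resolved from env>",
  hosts: ["docs.example.com"],
  include: ["https://docs.example.com/api/**"]
}];
applyScopedAuthHeaders("https://docs.example.com/api/v1/pages", {}, exampleScopedHeaders);
// => { Authorization: "Bearer <token resolved from env>" }
applyScopedAuthHeaders("https://other.example.com/page", {}, exampleScopedHeaders);
// => {} (hostname out of scope, header omitted)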
2169
- async function createSourceContext(spec, env) {
2170
- const { scopedHeaders, cookies } = resolveSourceAuth(spec, env);
2171
- const browser = await chromium.launch({ headless: true });
2172
- const context = await browser.newContext({
2173
- viewport: {
2174
- width: 1440,
2175
- height: 1200
2176
- }
2177
- });
2178
- if (scopedHeaders.length > 0) {
2179
- await context.route("**/*", async (route) => {
2180
- await route.continue({
2181
- headers: applyScopedAuthHeaders(route.request().url(), route.request().headers(), scopedHeaders)
2182
- });
2183
- });
2184
- }
2185
- if (cookies.length > 0) {
2186
- await context.addCookies(cookies);
2187
- }
2188
- const uniqueOrigins = [...new Set(spec.startUrls.map((url) => new URL(url).origin))];
2189
- for (const origin of uniqueOrigins) {
2190
- await context.grantPermissions(["clipboard-read", "clipboard-write"], { origin });
2191
- }
2192
- const page = await context.newPage();
2193
- page.setDefaultTimeout(15e3);
2194
- return {
2195
- page,
2196
- async close() {
2197
- await context.close();
2198
- await browser.close();
2199
- }
2200
- };
2201
- }
2202
- async function discoverLinks(page) {
2203
- return page.locator("a[href]").evaluateAll(
2204
- (anchors) => anchors.map((anchor) => anchor.href).filter((href) => typeof href === "string" && href.length > 0)
2205
- );
2206
- }
2207
- async function extractFetchedPage(spec, page, url, response) {
2208
- if (response && isRawMarkdownResponse(url, response)) {
2209
- const extracted2 = await extractRawMarkdownPage(url, response);
2210
- const markdown2 = normalizeMarkdown(spec, extracted2);
2211
- return {
2212
- ...extracted2,
2213
- markdown: markdown2,
2214
- markdownLength: markdown2.trim().length
2215
- };
2216
- }
2217
- await page.waitForTimeout(150);
2218
- const extracted = await extractPage(page, spec.extract);
2219
- const markdown = normalizeMarkdown(spec, {
2220
- title: extracted.title,
2221
- url,
2222
- markdown: extracted.markdown
2223
- });
2224
- return {
2225
- url,
2226
- title: extracted.title,
2227
- markdown,
2228
- markdownLength: markdown.trim().length
2229
- };
2230
- }
2231
- async function fetchSourceOnce(input) {
2232
- const spec = input.catalog.getSourceSpec(input.sourceId);
2233
- if (!spec) {
2234
- throw new AiocsError(
2235
- AIOCS_ERROR_CODES.sourceNotFound,
2236
- `Unknown source '${input.sourceId}'`
2237
- );
2238
- }
2239
- const session = await createSourceContext(spec, input.env ?? process.env);
2240
- const { page } = session;
2241
- const queue = spec.startUrls.map((url) => canonicalizeUrl(url));
2242
- const seen = /* @__PURE__ */ new Set();
2243
- const pageOrder = [];
2244
- const pagesByCrawlKey = /* @__PURE__ */ new Map();
2245
- const pendingRawFallbacks = /* @__PURE__ */ new Map();
2246
- try {
2247
- while (queue.length > 0 && pagesByCrawlKey.size < spec.discovery.maxPages) {
2248
- const next = queue.shift();
2249
- if (!next) {
2250
- break;
2251
- }
2252
- const url = canonicalizeUrl(next);
2253
- const crawlKey = getCrawlKey(url);
2254
- const isRawMarkdownUrl = crawlKey !== url;
2255
- const existing = pagesByCrawlKey.get(crawlKey);
2256
- if (isRawMarkdownUrl) {
2257
- if (existing && !existing.isRawMarkdown) {
2258
- continue;
2259
- }
2260
- if (!seen.has(crawlKey) && !existing) {
2261
- pendingRawFallbacks.set(crawlKey, url);
2262
- const canonicalQueued = queue.some((queuedUrl) => canonicalizeUrl(queuedUrl) === crawlKey);
2263
- if (!canonicalQueued) {
2264
- queue.unshift(crawlKey);
2265
- }
2266
- continue;
2267
- }
2268
- }
2269
- if (seen.has(url)) {
2270
- continue;
2271
- }
2272
- seen.add(url);
2273
- if (!isAllowed(url, spec.allowedHosts, spec.discovery.include, spec.discovery.exclude)) {
2274
- continue;
2275
- }
2276
- const response = await page.goto(url, { waitUntil: "domcontentloaded" });
2277
- if (response && response.status() >= 400) {
2278
- const pendingRawFallback = pendingRawFallbacks.get(crawlKey);
2279
- if (!isRawMarkdownUrl && pendingRawFallback && !seen.has(pendingRawFallback)) {
2280
- queue.unshift(pendingRawFallback);
2281
- }
2282
- continue;
2283
- }
2284
- let fetchedPage;
2285
- try {
2286
- fetchedPage = await extractFetchedPage(spec, page, url, response);
2287
- } catch (error) {
2288
- const pendingRawFallback = pendingRawFallbacks.get(crawlKey);
2289
- if (!isRawMarkdownUrl && pendingRawFallback && !seen.has(pendingRawFallback)) {
2290
- queue.unshift(pendingRawFallback);
2291
- continue;
2292
- }
2293
- throw error;
2294
- }
2295
- const isRawMarkdown = response !== null && isRawMarkdownResponse(url, response);
2296
- if (!existing) {
2297
- pageOrder.push(crawlKey);
2298
- pagesByCrawlKey.set(crawlKey, { page: fetchedPage, isRawMarkdown });
2299
- } else if (existing.isRawMarkdown && !isRawMarkdown) {
2300
- pagesByCrawlKey.set(crawlKey, { page: fetchedPage, isRawMarkdown });
2301
- }
2302
- if (!isRawMarkdown) {
2303
- pendingRawFallbacks.delete(crawlKey);
2304
- }
2305
- if (!isRawMarkdown) {
2306
- const links = await discoverLinks(page);
2307
- for (const link of links) {
2308
- const canonical = canonicalizeUrl(link);
2309
- if (!seen.has(canonical) && isAllowed(canonical, spec.allowedHosts, spec.discovery.include, spec.discovery.exclude)) {
2310
- queue.push(canonical);
2311
- }
2312
- }
2313
- }
2314
- }
2315
- const pages = pageOrder.map((crawlKey) => pagesByCrawlKey.get(crawlKey)?.page).filter((pageEntry) => pageEntry !== void 0);
2316
- if (pages.length === 0) {
2317
- throw new AiocsError(
2318
- AIOCS_ERROR_CODES.noPagesFetched,
2319
- `No pages fetched for source '${input.sourceId}'`
2320
- );
2321
- }
2322
- const result = input.catalog.recordSuccessfulSnapshot({
2323
- sourceId: input.sourceId,
2324
- pages
2325
- });
2326
- if (!result.reused) {
2327
- persistSnapshotPages(input, result.snapshotId, pages);
2328
- }
2329
- return {
2330
- snapshotId: result.snapshotId,
2331
- pageCount: pages.length,
2332
- reused: result.reused
2333
- };
2334
- } finally {
2335
- await session.close();
2336
- }
2337
- }
2338
- async function fetchSource(input) {
2339
- let lastError;
2340
- for (let attempt = 1; attempt <= MAX_FETCH_ATTEMPTS; attempt += 1) {
2341
- try {
2342
- return await fetchSourceOnce(input);
2343
- } catch (error) {
2344
- lastError = error;
2345
- if (attempt >= MAX_FETCH_ATTEMPTS) {
2346
- input.catalog.recordFailedFetchRun({
2347
- sourceId: input.sourceId,
2348
- errorMessage: error instanceof Error ? error.message : String(error)
2349
- });
2350
- throw error;
2351
- }
2352
- await sleep(RETRY_DELAY_MS * attempt);
2353
- }
2354
- }
2355
- throw lastError instanceof Error ? lastError : new Error(String(lastError));
2356
- }
2357
- async function runSourceCanaryOnce(input) {
2358
- const spec = input.catalog.getSourceSpec(input.sourceId);
2359
- if (!spec) {
2360
- throw new AiocsError(
2361
- AIOCS_ERROR_CODES.sourceNotFound,
2362
- `Unknown source '${input.sourceId}'`
2363
- );
2364
- }
2365
- const canary = resolveSourceCanary(spec);
2366
- const session = await createSourceContext(spec, input.env ?? process.env);
2367
- const { page } = session;
2368
- const checks = [];
2369
- try {
2370
- for (const check of canary.checks) {
2371
- const url = canonicalizeUrl(check.url);
2372
- try {
2373
- if (!isAllowed(url, spec.allowedHosts, spec.discovery.include, spec.discovery.exclude)) {
2374
- throw new AiocsError(
2375
- AIOCS_ERROR_CODES.invalidArgument,
2376
- `Canary URL '${url}' is outside the allowed source scope`
2377
- );
2378
- }
2379
- const response = await page.goto(url, { waitUntil: "domcontentloaded" });
2380
- if (response && response.status() >= 400) {
2381
- throw new Error(`Canary request failed with HTTP ${response.status()}`);
2382
- }
2383
- const extracted = await extractFetchedPage(spec, page, url, response);
2384
- if (check.expectedTitle && !extracted.title.includes(check.expectedTitle)) {
2385
- throw new Error(`Expected title to include '${check.expectedTitle}'`);
2386
- }
2387
- if (check.expectedText && !extracted.markdown.includes(check.expectedText)) {
2388
- throw new Error(`Expected markdown to include '${check.expectedText}'`);
2389
- }
2390
- if (extracted.markdownLength < check.minMarkdownLength) {
2391
- throw new Error(
2392
- `Expected markdown length to be at least ${check.minMarkdownLength}, received ${extracted.markdownLength}`
2393
- );
2394
- }
2395
- checks.push({
2396
- url,
2397
- status: "pass",
2398
- title: extracted.title,
2399
- markdownLength: extracted.markdownLength
2400
- });
2401
- } catch (error) {
2402
- checks.push({
2403
- url,
2404
- status: "fail",
2405
- errorMessage: error instanceof Error ? error.message : String(error)
2406
- });
2407
- }
2408
- }
2409
- } finally {
2410
- await session.close();
2411
- }
2412
- const result = {
2413
- sourceId: input.sourceId,
2414
- status: checks.every((check) => check.status === "pass") ? "pass" : "fail",
2415
- checkedAt: nowIso2(),
2416
- summary: {
2417
- checkCount: checks.length,
2418
- passCount: checks.filter((check) => check.status === "pass").length,
2419
- failCount: checks.filter((check) => check.status === "fail").length
2420
- },
2421
- checks
2422
- };
2423
- input.catalog.recordCanaryRun({
2424
- sourceId: input.sourceId,
2425
- status: result.status,
2426
- checkedAt: result.checkedAt,
2427
- details: result
2428
- });
2429
- if (result.status === "fail") {
2430
- throw new AiocsError(
2431
- AIOCS_ERROR_CODES.canaryFailed,
2432
- `Canary failed for source '${input.sourceId}'`,
2433
- result
2434
- );
2435
- }
2436
- return result;
2437
- }
2438
- async function runSourceCanary(input) {
2439
- let lastError;
2440
- for (let attempt = 1; attempt <= MAX_FETCH_ATTEMPTS; attempt += 1) {
2441
- try {
2442
- return await runSourceCanaryOnce(input);
2443
- } catch (error) {
2444
- lastError = error;
2445
- if (attempt >= MAX_FETCH_ATTEMPTS) {
2446
- if (error instanceof AiocsError && error.code === AIOCS_ERROR_CODES.canaryFailed) {
2447
- return error.details;
2448
- }
2449
- throw error;
2450
- }
2451
- await sleep(RETRY_DELAY_MS * attempt);
2452
- }
2453
- }
2454
- throw lastError instanceof Error ? lastError : new Error(String(lastError));
2455
- }
2456
-
2457
- // src/hybrid/ollama.ts
2458
- function getEmbeddingModelKey(config) {
2459
- return `${config.embeddingProvider}:${config.ollamaEmbeddingModel}`;
2460
- }
2461
- function normalizeBaseUrl(baseUrl) {
2462
- return baseUrl.endsWith("/") ? baseUrl.slice(0, -1) : baseUrl;
2463
- }
2464
- function normalizeEmbeddingWhitespace(value) {
2465
- return value.replace(/\s+/g, " ").trim();
2466
- }
2467
- function truncateEmbeddingText(value, maxChars) {
2468
- if (value.length <= maxChars) {
2469
- return value;
2470
- }
2471
- const slice = value.slice(0, maxChars);
2472
- const lastWhitespace = slice.lastIndexOf(" ");
2473
- if (lastWhitespace >= Math.floor(maxChars * 0.8)) {
2474
- return slice.slice(0, lastWhitespace).trim();
2475
- }
2476
- return slice.trim();
2477
- }
2478
- function prepareTextForEmbedding(markdown, maxChars) {
2479
- const withoutComments = markdown.replace(/<!--[\s\S]*?-->/g, " ");
2480
- const withoutImages = withoutComments.replace(/!\[([^\]]*)\]\(([^)]+)\)/g, "$1");
2481
- const withoutLinks = withoutImages.replace(/\[([^\]]+)\]\(([^)]+)\)/g, "$1");
2482
- const withoutHtml = withoutLinks.replace(/<[^>]+>/g, " ");
2483
- const withoutCodeFenceMarkers = withoutHtml.replace(/```[^\n]*\n/g, "\n").replace(/```/g, "\n");
2484
- const withoutInlineCodeTicks = withoutCodeFenceMarkers.replace(/`([^`]+)`/g, "$1");
2485
- const normalized = normalizeEmbeddingWhitespace(withoutInlineCodeTicks);
2486
- return truncateEmbeddingText(normalized, maxChars);
2487
- }
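// Illustrative sketch (input is made up): before embedding, markdown is reduced to
// plain prose -- HTML comments, image/link syntax, tags, and code-fence/backtick
// markers are stripped, whitespace is collapsed, and the text is truncated near a
// word boundary at ollamaMaxInputChars.
prepareTextForEmbedding(
  "<!-- source: x -->\n# Title\n\nSee [the guide](https://example.com) and run `docs search`.",
  4e3
);
// => "# Title See the guide and run docs search."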
2488
- async function parseJsonResponse(response) {
2489
- const text = await response.text();
2490
- if (!text) {
2491
- return {};
2492
- }
2493
- try {
2494
- return JSON.parse(text);
2495
- } catch {
2496
- throw new AiocsError(
2497
- AIOCS_ERROR_CODES.embeddingProviderUnavailable,
2498
- `Ollama returned a non-JSON response with status ${response.status}`
2499
- );
2500
- }
2501
- }
2502
- async function embedTexts(config, texts) {
2503
- if (texts.length === 0) {
2504
- return [];
2505
- }
2506
- const preparedTexts = texts.map((text) => prepareTextForEmbedding(text, config.ollamaMaxInputChars));
2507
- const response = await fetch(`${normalizeBaseUrl(config.ollamaBaseUrl)}/api/embed`, {
2508
- method: "POST",
2509
- headers: {
2510
- "content-type": "application/json"
2511
- },
2512
- signal: AbortSignal.timeout(config.ollamaTimeoutMs),
2513
- body: JSON.stringify({
2514
- model: config.ollamaEmbeddingModel,
2515
- input: preparedTexts
2516
- })
2517
- }).catch((error) => {
2518
- throw new AiocsError(
2519
- AIOCS_ERROR_CODES.embeddingProviderUnavailable,
2520
- `Unable to reach Ollama at ${config.ollamaBaseUrl}: ${error instanceof Error ? error.message : String(error)}`
2521
- );
2522
- });
2523
- if (!response.ok) {
2524
- const body = await response.text();
2525
- throw new AiocsError(
2526
- AIOCS_ERROR_CODES.embeddingProviderUnavailable,
2527
- `Ollama embed request failed with status ${response.status}`,
2528
- body ? { body } : void 0
2529
- );
2530
- }
2531
- const payload = await parseJsonResponse(response);
2532
- if (!Array.isArray(payload.embeddings)) {
2533
- throw new AiocsError(
2534
- AIOCS_ERROR_CODES.embeddingProviderUnavailable,
2535
- "Ollama embed response did not include an embeddings array"
2536
- );
2537
- }
2538
- const embeddings = payload.embeddings.map((entry) => {
2539
- if (!Array.isArray(entry) || !entry.every((value) => typeof value === "number")) {
2540
- throw new AiocsError(
2541
- AIOCS_ERROR_CODES.embeddingProviderUnavailable,
2542
- "Ollama embed response contained an invalid embedding vector"
2543
- );
2544
- }
2545
- return entry;
2546
- });
2547
- if (embeddings.length !== texts.length) {
2548
- throw new AiocsError(
2549
- AIOCS_ERROR_CODES.embeddingProviderUnavailable,
2550
- `Ollama returned ${embeddings.length} embeddings for ${texts.length} inputs`
2551
- );
2552
- }
2553
- return embeddings;
2554
- }
2555
- async function getEmbeddingProviderStatus(config) {
2556
- const response = await fetch(`${normalizeBaseUrl(config.ollamaBaseUrl)}/api/tags`, {
2557
- signal: AbortSignal.timeout(config.ollamaTimeoutMs)
2558
- }).catch((error) => {
2559
- throw new AiocsError(
2560
- AIOCS_ERROR_CODES.embeddingProviderUnavailable,
2561
- `Unable to reach Ollama at ${config.ollamaBaseUrl}: ${error instanceof Error ? error.message : String(error)}`
2562
- );
2563
- });
2564
- if (!response.ok) {
2565
- throw new AiocsError(
2566
- AIOCS_ERROR_CODES.embeddingProviderUnavailable,
2567
- `Ollama tags request failed with status ${response.status}`
2568
- );
2569
- }
2570
- const payload = await parseJsonResponse(response);
2571
- const availableModels = (payload.models ?? []).map((entry) => entry.name ?? entry.model).filter((entry) => typeof entry === "string" && entry.length > 0);
2572
- const modelPresent = availableModels.some(
2573
- (name) => name === config.ollamaEmbeddingModel || name.startsWith(`${config.ollamaEmbeddingModel}:`)
2574
- );
2575
- return {
2576
- ok: modelPresent,
2577
- modelPresent,
2578
- baseUrl: config.ollamaBaseUrl,
2579
- model: config.ollamaEmbeddingModel,
2580
- availableModels
2581
- };
2582
- }
2583
-
2584
- // src/hybrid/qdrant.ts
2585
- import { QdrantClient } from "@qdrant/js-client-rest";
2586
- var AiocsVectorStore = class {
2587
- client;
2588
- collectionName;
2589
- constructor(config) {
2590
- this.client = new QdrantClient({
2591
- url: config.qdrantUrl,
2592
- timeout: config.qdrantTimeoutMs,
2593
- checkCompatibility: false
2594
- });
2595
- this.collectionName = config.qdrantCollection;
2596
- }
2597
- pointIdForChunk(chunkId) {
2598
- return chunkId;
2599
- }
2600
- async ensureCollection(dimension) {
2601
- const existsResponse = await this.client.collectionExists(this.collectionName).catch((error) => {
2602
- throw new AiocsError(
2603
- AIOCS_ERROR_CODES.vectorStoreUnavailable,
2604
- `Unable to reach Qdrant collection '${this.collectionName}': ${error instanceof Error ? error.message : String(error)}`
2605
- );
2606
- });
2607
- const exists = typeof existsResponse === "boolean" ? existsResponse : Boolean(existsResponse.exists);
2608
- if (!exists) {
2609
- await this.client.createCollection(this.collectionName, {
2610
- vectors: {
2611
- size: dimension,
2612
- distance: "Cosine"
2613
- }
2614
- }).catch((error) => {
2615
- throw new AiocsError(
2616
- AIOCS_ERROR_CODES.vectorStoreUnavailable,
2617
- `Unable to create Qdrant collection '${this.collectionName}': ${error instanceof Error ? error.message : String(error)}`
2618
- );
2619
- });
2620
- return;
2621
- }
2622
- const collection = await this.client.getCollection(this.collectionName).catch((error) => {
2623
- throw new AiocsError(
2624
- AIOCS_ERROR_CODES.vectorStoreUnavailable,
2625
- `Unable to inspect Qdrant collection '${this.collectionName}': ${error instanceof Error ? error.message : String(error)}`
2626
- );
2627
- });
2628
- const params = collection.config?.params?.vectors;
2629
- const currentSize = typeof params === "object" && params && "size" in params ? Number(params.size) : null;
2630
- if (!currentSize || currentSize !== dimension) {
2631
- await this.client.recreateCollection(this.collectionName, {
2632
- vectors: {
2633
- size: dimension,
2634
- distance: "Cosine"
2635
- }
2636
- }).catch((error) => {
2637
- throw new AiocsError(
2638
- AIOCS_ERROR_CODES.vectorStoreUnavailable,
2639
- `Unable to recreate Qdrant collection '${this.collectionName}' for dimension ${dimension}: ${error instanceof Error ? error.message : String(error)}`
2640
- );
2641
- });
2642
- }
2643
- }
2644
- async upsertChunks(input) {
2645
- if (input.points.length === 0) {
2646
- return;
2647
- }
2648
- const points = input.points.map((point) => ({
2649
- id: this.pointIdForChunk(point.chunkId),
2650
- vector: point.vector,
2651
- payload: {
2652
- chunkId: point.chunkId,
2653
- sourceId: point.sourceId,
2654
- snapshotId: point.snapshotId,
2655
- pageUrl: point.pageUrl,
2656
- pageTitle: point.pageTitle,
2657
- sectionTitle: point.sectionTitle,
2658
- modelKey: input.modelKey
2659
- }
2660
- }));
2661
- await this.client.upsert(this.collectionName, {
2662
- wait: true,
2663
- points
2664
- }).catch((error) => {
2665
- throw new AiocsError(
2666
- AIOCS_ERROR_CODES.vectorStoreUnavailable,
2667
- `Unable to upsert vectors into Qdrant collection '${this.collectionName}': ${error instanceof Error ? error.message : String(error)}`
2668
- );
2669
- });
2670
- }
2671
- async deleteChunkIds(chunkIds) {
2672
- if (chunkIds.length === 0) {
2673
- return;
2674
- }
2675
- await this.client.delete(this.collectionName, {
2676
- wait: true,
2677
- points: chunkIds.map((chunkId) => this.pointIdForChunk(chunkId))
2678
- }).catch((error) => {
2679
- throw new AiocsError(
2680
- AIOCS_ERROR_CODES.vectorStoreUnavailable,
2681
- `Unable to delete vectors from Qdrant collection '${this.collectionName}': ${error instanceof Error ? error.message : String(error)}`
2682
- );
2683
- });
2684
- }
2685
- async clearCollection() {
2686
- const existsResponse = await this.client.collectionExists(this.collectionName).catch((error) => {
2687
- throw new AiocsError(
2688
- AIOCS_ERROR_CODES.vectorStoreUnavailable,
2689
- `Unable to reach Qdrant collection '${this.collectionName}': ${error instanceof Error ? error.message : String(error)}`
2690
- );
2691
- });
2692
- const exists = typeof existsResponse === "boolean" ? existsResponse : Boolean(existsResponse.exists);
2693
- if (!exists) {
2694
- return;
2695
- }
2696
- await this.client.deleteCollection(this.collectionName).catch((error) => {
2697
- throw new AiocsError(
2698
- AIOCS_ERROR_CODES.vectorStoreUnavailable,
2699
- `Unable to delete Qdrant collection '${this.collectionName}': ${error instanceof Error ? error.message : String(error)}`
2700
- );
2701
- });
2702
- }
2703
- async search(input) {
2704
- if (input.snapshotIds.length === 0) {
2705
- return [];
2706
- }
2707
- const results = await this.client.search(this.collectionName, {
2708
- vector: input.vector,
2709
- limit: input.limit,
2710
- ...typeof input.offset === "number" ? { offset: input.offset } : {},
2711
- with_payload: ["chunkId", "snapshotId", "sourceId", "modelKey"],
2712
- filter: {
2713
- must: [
2714
- {
2715
- key: "snapshotId",
2716
- match: {
2717
- any: input.snapshotIds
2718
- }
2719
- },
2720
- {
2721
- key: "modelKey",
2722
- match: {
2723
- value: input.modelKey
2724
- }
2725
- },
2726
- ...input.sourceIds && input.sourceIds.length > 0 ? [{
2727
- key: "sourceId",
2728
- match: {
2729
- any: input.sourceIds
2730
- }
2731
- }] : []
2732
- ]
2733
- }
2734
- }).catch((error) => {
2735
- throw new AiocsError(
2736
- AIOCS_ERROR_CODES.vectorStoreUnavailable,
2737
- `Unable to search Qdrant collection '${this.collectionName}': ${error instanceof Error ? error.message : String(error)}`
2738
- );
2739
- });
2740
- return results.map((result) => {
2741
- const payload = result.payload ?? {};
2742
- const chunkId = typeof payload.chunkId === "number" ? payload.chunkId : typeof result.id === "number" ? result.id : Number(result.id);
2743
- if (!Number.isInteger(chunkId)) {
2744
- return null;
2745
- }
2746
- return {
2747
- chunkId,
2748
- score: result.score
2749
- };
2750
- }).filter((result) => result !== null);
2751
- }
2752
- async getHealth() {
2753
- try {
2754
- const response = await this.client.getCollections();
2755
- return {
2756
- ok: true,
2757
- collections: response.collections?.map((entry) => entry.name) ?? []
2758
- };
2759
- } catch (error) {
2760
- return {
2761
- ok: false,
2762
- errorMessage: error instanceof Error ? error.message : String(error)
2763
- };
2764
- }
2765
- }
2766
- };
2767
-
2768
- // src/hybrid/worker.ts
2769
- function chunkArray(values, size) {
2770
- const chunks = [];
2771
- for (let index = 0; index < values.length; index += size) {
2772
- chunks.push(values.slice(index, index + size));
2773
- }
2774
- return chunks;
2775
- }
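// Illustrative sketch: snapshot chunks are embedded and upserted in fixed-size
// batches (embeddingBatchSize, default 32), so one snapshot never turns into a
// single oversized Ollama or Qdrant request.
chunkArray([1, 2, 3, 4, 5], 2); // => [[1, 2], [3, 4], [5]]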
2776
- async function processEmbeddingJobs(input) {
2777
- const claimedJobs = input.catalog.claimEmbeddingJobs(input.config.embeddingJobsPerCycle);
2778
- if (claimedJobs.length === 0) {
2779
- return {
2780
- processedJobs: 0,
2781
- succeededJobs: [],
2782
- failedJobs: []
2783
- };
2784
- }
2785
- const vectorStore = new AiocsVectorStore(input.config);
2786
- const modelKey = getEmbeddingModelKey(input.config);
2787
- const succeededJobs = [];
2788
- const failedJobs = [];
2789
- for (const job of claimedJobs) {
2790
- try {
2791
- const chunks = input.catalog.listSnapshotChunks({
2792
- sourceId: job.sourceId,
2793
- snapshotId: job.snapshotId
2794
- });
2795
- if (chunks.length === 0) {
2796
- input.catalog.markEmbeddingJobFailed({
2797
- sourceId: job.sourceId,
2798
- snapshotId: job.snapshotId,
2799
- errorMessage: "No chunks found for embedding job snapshot"
2800
- });
2801
- failedJobs.push({
2802
- sourceId: job.sourceId,
2803
- snapshotId: job.snapshotId,
2804
- errorMessage: "No chunks found for embedding job snapshot"
2805
- });
2806
- continue;
2807
- }
2808
- const existingState = input.catalog.getSnapshotEmbeddingState({
2809
- sourceId: job.sourceId,
2810
- snapshotId: job.snapshotId
2811
- });
2812
- const staleChunkIds = [
2813
- .../* @__PURE__ */ new Set([
2814
- ...input.catalog.listStaleEmbeddingChunkIds(job.sourceId),
2815
- ...existingState.filter((entry) => entry.modelKey && entry.modelKey !== modelKey).map((entry) => entry.chunkId)
2816
- ])
2817
- ];
2818
- const needsReindex = existingState.some((entry) => entry.status !== "indexed" || entry.modelKey !== modelKey);
2819
- if (!needsReindex) {
2820
- input.catalog.markEmbeddingJobSucceeded({
2821
- sourceId: job.sourceId,
2822
- snapshotId: job.snapshotId,
2823
- modelKey,
2824
- indexedChunkIds: chunks.map((chunk) => chunk.chunkId),
2825
- staleChunkIds
2826
- });
2827
- succeededJobs.push({
2828
- sourceId: job.sourceId,
2829
- snapshotId: job.snapshotId,
2830
- chunkCount: chunks.length
2831
- });
2832
- continue;
2833
- }
2834
- const dimensionProbe = await embedTexts(input.config, [chunks[0].markdown]);
2835
- const vectorDimension = dimensionProbe[0]?.length;
2836
- if (!vectorDimension) {
2837
- throw new AiocsError(
2838
- AIOCS_ERROR_CODES.embeddingProviderUnavailable,
2839
- "Embedding provider returned an empty vector for the first chunk"
2840
- );
2841
- }
2842
- await vectorStore.ensureCollection(vectorDimension);
2843
- if (staleChunkIds.length > 0) {
2844
- await vectorStore.deleteChunkIds(staleChunkIds);
2845
- }
2846
- const indexedChunkIds = [];
2847
- const batchedChunks = chunkArray(chunks, input.config.embeddingBatchSize);
2848
- let dimensionProbeConsumed = false;
2849
- for (const batch of batchedChunks) {
2850
- const embeddings = dimensionProbeConsumed ? await embedTexts(input.config, batch.map((chunk) => chunk.markdown)) : [
2851
- dimensionProbe[0],
2852
- ...batch.length > 1 ? await embedTexts(input.config, batch.slice(1).map((chunk) => chunk.markdown)) : []
2853
- ];
2854
- dimensionProbeConsumed = true;
2855
- if (embeddings.length !== batch.length) {
2856
- throw new AiocsError(
2857
- AIOCS_ERROR_CODES.embeddingProviderUnavailable,
2858
- `Embedding provider returned ${embeddings.length} embeddings for a batch of ${batch.length}`
2859
- );
2860
- }
2861
- await vectorStore.upsertChunks({
2862
- modelKey,
2863
- points: batch.map((chunk, index) => ({
2864
- chunkId: chunk.chunkId,
2865
- vector: embeddings[index],
2866
- sourceId: chunk.sourceId,
2867
- snapshotId: chunk.snapshotId,
2868
- pageUrl: chunk.pageUrl,
2869
- pageTitle: chunk.pageTitle,
2870
- sectionTitle: chunk.sectionTitle
2871
- }))
2872
- });
2873
- indexedChunkIds.push(...batch.map((chunk) => chunk.chunkId));
2874
- }
2875
- input.catalog.markEmbeddingJobSucceeded({
2876
- sourceId: job.sourceId,
2877
- snapshotId: job.snapshotId,
2878
- modelKey,
2879
- indexedChunkIds,
2880
- staleChunkIds
2881
- });
2882
- succeededJobs.push({
2883
- sourceId: job.sourceId,
2884
- snapshotId: job.snapshotId,
2885
- chunkCount: indexedChunkIds.length
2886
- });
2887
- } catch (error) {
2888
- const errorMessage = error instanceof Error ? error.message : String(error);
2889
- input.catalog.markEmbeddingJobFailed({
2890
- sourceId: job.sourceId,
2891
- snapshotId: job.snapshotId,
2892
- errorMessage
2893
- });
2894
- failedJobs.push({
2895
- sourceId: job.sourceId,
2896
- snapshotId: job.snapshotId,
2897
- errorMessage
2898
- });
2899
- }
2900
- }
2901
- return {
2902
- processedJobs: claimedJobs.length,
2903
- succeededJobs,
2904
- failedJobs
2905
- };
2906
- }
2907
-
2908
- // src/runtime/hybrid-config.ts
2909
- function parsePositiveInteger(value, field, fallback) {
2910
- if (typeof value === "undefined" || value.trim() === "") {
2911
- return fallback;
2912
- }
2913
- const parsed = Number(value);
2914
- if (!Number.isInteger(parsed) || parsed <= 0) {
2915
- throw new AiocsError(
2916
- AIOCS_ERROR_CODES.embeddingConfigInvalid,
2917
- `${field} must be a positive integer`
2918
- );
2919
- }
2920
- return parsed;
2921
- }
2922
- function parseSearchMode(value) {
2923
- if (!value) {
2924
- return "auto";
2925
- }
2926
- if (value === "auto" || value === "lexical" || value === "hybrid" || value === "semantic") {
2927
- return value;
2928
- }
2929
- throw new AiocsError(
2930
- AIOCS_ERROR_CODES.embeddingConfigInvalid,
2931
- "AIOCS_SEARCH_MODE_DEFAULT must be one of: auto, lexical, hybrid, semantic"
2932
- );
2933
- }
2934
- function getHybridRuntimeConfig(env = process.env) {
2935
- const embeddingProvider = env.AIOCS_EMBEDDING_PROVIDER ?? "ollama";
2936
- if (embeddingProvider !== "ollama") {
2937
- throw new AiocsError(
2938
- AIOCS_ERROR_CODES.embeddingConfigInvalid,
2939
- "AIOCS_EMBEDDING_PROVIDER currently supports only ollama"
2940
- );
2941
- }
2942
- return {
2943
- defaultSearchMode: parseSearchMode(env.AIOCS_SEARCH_MODE_DEFAULT),
2944
- qdrantUrl: env.AIOCS_QDRANT_URL ?? "http://127.0.0.1:6333",
2945
- qdrantCollection: env.AIOCS_QDRANT_COLLECTION ?? "aiocs_docs_chunks",
2946
- qdrantTimeoutMs: parsePositiveInteger(env.AIOCS_QDRANT_TIMEOUT_MS, "AIOCS_QDRANT_TIMEOUT_MS", 1e3),
2947
- embeddingProvider: "ollama",
2948
- ollamaBaseUrl: env.AIOCS_OLLAMA_BASE_URL ?? "http://127.0.0.1:11434",
2949
- ollamaEmbeddingModel: env.AIOCS_OLLAMA_EMBEDDING_MODEL ?? "nomic-embed-text",
2950
- ollamaTimeoutMs: parsePositiveInteger(env.AIOCS_OLLAMA_TIMEOUT_MS, "AIOCS_OLLAMA_TIMEOUT_MS", 1e4),
2951
- ollamaMaxInputChars: parsePositiveInteger(env.AIOCS_OLLAMA_MAX_INPUT_CHARS, "AIOCS_OLLAMA_MAX_INPUT_CHARS", 4e3),
2952
- embeddingBatchSize: parsePositiveInteger(env.AIOCS_EMBEDDING_BATCH_SIZE, "AIOCS_EMBEDDING_BATCH_SIZE", 32),
2953
- embeddingJobsPerCycle: parsePositiveInteger(env.AIOCS_EMBEDDING_JOB_LIMIT_PER_CYCLE, "AIOCS_EMBEDDING_JOB_LIMIT_PER_CYCLE", 2),
2954
- lexicalCandidateWindow: parsePositiveInteger(env.AIOCS_LEXICAL_CANDIDATE_WINDOW, "AIOCS_LEXICAL_CANDIDATE_WINDOW", 40),
2955
- vectorCandidateWindow: parsePositiveInteger(env.AIOCS_VECTOR_CANDIDATE_WINDOW, "AIOCS_VECTOR_CANDIDATE_WINDOW", 40),
2956
- rrfK: parsePositiveInteger(env.AIOCS_RRF_K, "AIOCS_RRF_K", 60)
2957
- };
2958
- }
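// Illustrative sketch (values shown are the built-in defaults above): the hybrid
// search runtime is configured entirely through environment variables, e.g.
//   AIOCS_SEARCH_MODE_DEFAULT=auto            (auto | lexical | hybrid | semantic)
//   AIOCS_QDRANT_URL=http://127.0.0.1:6333
//   AIOCS_OLLAMA_BASE_URL=http://127.0.0.1:11434
//   AIOCS_OLLAMA_EMBEDDING_MODEL=nomic-embed-text
//   AIOCS_EMBEDDING_BATCH_SIZE=32
var exampleHybridConfig = getHybridRuntimeConfig(process.env);
// exampleHybridConfig.rrfK === 60 unless AIOCS_RRF_K overrides it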
2959
-
2960
- // src/spec/source-spec-files.ts
2961
- import { access, readdir } from "fs/promises";
2962
- import { constants as fsConstants } from "fs";
2963
- import { extname as extname2, join as join5, resolve as resolve4 } from "path";
2964
- var SOURCE_SPEC_EXTENSIONS = /* @__PURE__ */ new Set([".yaml", ".yml", ".json"]);
2965
- function uniqueResolvedPaths(paths) {
2966
- const seen = /* @__PURE__ */ new Set();
2967
- const unique = [];
2968
- for (const rawPath of paths) {
2969
- const normalized = resolve4(rawPath);
2970
- if (seen.has(normalized)) {
2971
- continue;
2972
- }
2973
- seen.add(normalized);
2974
- unique.push(normalized);
2975
- }
2976
- return unique;
2977
- }
2978
- async function pathExists(targetPath) {
2979
- try {
2980
- await access(targetPath, fsConstants.F_OK);
2981
- return true;
2982
- } catch {
2983
- return false;
2984
- }
2985
- }
2986
- async function walkSourceSpecFiles(rootDir) {
2987
- const entries = await readdir(rootDir, { withFileTypes: true });
2988
- const discovered = [];
2989
- for (const entry of entries.sort((left, right) => left.name.localeCompare(right.name))) {
2990
- const entryPath = join5(rootDir, entry.name);
2991
- if (entry.isDirectory()) {
2992
- discovered.push(...await walkSourceSpecFiles(entryPath));
2993
- continue;
2994
- }
2995
- if (!entry.isFile()) {
2996
- continue;
2997
- }
2998
- if (SOURCE_SPEC_EXTENSIONS.has(extname2(entry.name).toLowerCase())) {
2999
- discovered.push(entryPath);
3000
- }
3001
- }
3002
- return discovered;
3003
- }
3004
-
3005
- // src/daemon.ts
3006
- var DEFAULT_INTERVAL_MINUTES = 60;
3007
- var DEFAULT_CONTAINER_SOURCE_DIR = "/app/sources";
3008
- var BOOLEAN_TRUE_VALUES = /* @__PURE__ */ new Set(["1", "true", "yes", "on"]);
3009
- var BOOLEAN_FALSE_VALUES = /* @__PURE__ */ new Set(["0", "false", "no", "off"]);
3010
- function nowIso3() {
3011
- return (/* @__PURE__ */ new Date()).toISOString();
3012
- }
3013
- function parsePositiveInteger2(raw, variableName) {
3014
- if (!/^\d+$/.test(raw)) {
3015
- throw new Error(`${variableName} must be a positive integer`);
3016
- }
3017
- const parsed = Number(raw);
3018
- if (!Number.isInteger(parsed) || parsed <= 0) {
3019
- throw new Error(`${variableName} must be a positive integer`);
3020
- }
3021
- return parsed;
3022
- }
3023
- function parseBoolean(raw, variableName) {
3024
- const normalized = raw.trim().toLowerCase();
3025
- if (BOOLEAN_TRUE_VALUES.has(normalized)) {
3026
- return true;
3027
- }
3028
- if (BOOLEAN_FALSE_VALUES.has(normalized)) {
3029
- return false;
3030
- }
3031
- throw new Error(`${variableName} must be one of: true, false, 1, 0, yes, no, on, off`);
3032
- }
3033
- function parseDaemonConfig(env, options = {}) {
3034
- const intervalMinutes = env.AIOCS_DAEMON_INTERVAL_MINUTES ? parsePositiveInteger2(env.AIOCS_DAEMON_INTERVAL_MINUTES, "AIOCS_DAEMON_INTERVAL_MINUTES") : DEFAULT_INTERVAL_MINUTES;
3035
- const fetchOnStart = env.AIOCS_DAEMON_FETCH_ON_START ? parseBoolean(env.AIOCS_DAEMON_FETCH_ON_START, "AIOCS_DAEMON_FETCH_ON_START") : true;
3036
- const defaultContainerSourceDir = options.containerSourceDir ?? (existsSync2(DEFAULT_CONTAINER_SOURCE_DIR) ? DEFAULT_CONTAINER_SOURCE_DIR : void 0);
3037
- const defaultSourceDirs = uniqueResolvedPaths([
3038
- options.bundledSourceDir ?? getBundledSourcesDir(),
3039
- options.userSourceDir ?? getAiocsSourcesDir(env),
3040
- ...defaultContainerSourceDir ? [defaultContainerSourceDir] : []
3041
- ]);
3042
- const sourceSpecDirs = env.AIOCS_SOURCE_SPEC_DIRS ? uniqueResolvedPaths(
3043
- env.AIOCS_SOURCE_SPEC_DIRS.split(",").map((entry) => entry.trim()).filter(Boolean)
3044
- ) : defaultSourceDirs;
3045
- if (env.AIOCS_SOURCE_SPEC_DIRS && sourceSpecDirs.length === 0) {
3046
- throw new Error("AIOCS_SOURCE_SPEC_DIRS must include at least one directory");
3047
- }
3048
- return {
3049
- intervalMinutes,
3050
- fetchOnStart,
3051
- strictSourceSpecDirs: Boolean(env.AIOCS_SOURCE_SPEC_DIRS),
3052
- sourceSpecDirs
3053
- };
3054
- }
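// Illustrative sketch (directory path is hypothetical): the daemon schedule comes
// from the environment, e.g.
//   AIOCS_DAEMON_INTERVAL_MINUTES=60          positive integer, default 60
//   AIOCS_DAEMON_FETCH_ON_START=true          true/false/1/0/yes/no/on/off, default true
//   AIOCS_SOURCE_SPEC_DIRS=/etc/aiocs/sources comma-separated; when set, a missing
//                                             directory fails bootstrap instead of
//                                             being skipped
var exampleDaemonConfig = parseDaemonConfig(process.env);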
3055
- async function bootstrapSourceSpecs(input) {
3056
- const normalizedSourceSpecDirs = uniqueResolvedPaths(input.sourceSpecDirs);
3057
- const missingDirs = [];
3058
- const existingDirs = [];
3059
- const sources = [];
3060
- for (const sourceSpecDir of normalizedSourceSpecDirs) {
3061
- if (!await pathExists(sourceSpecDir)) {
3062
- missingDirs.push(sourceSpecDir);
3063
- continue;
3064
- }
3065
- existingDirs.push(sourceSpecDir);
3066
- }
3067
- if (input.strictSourceSpecDirs && missingDirs.length > 0) {
3068
- throw new Error(`Missing source spec directories: ${missingDirs.join(", ")}`);
3069
- }
3070
- for (const sourceSpecDir of existingDirs) {
3071
- const specPaths = await walkSourceSpecFiles(sourceSpecDir);
3072
- for (const specPath of specPaths) {
3073
- const spec = await loadSourceSpec(specPath);
3074
- const upserted = input.catalog.upsertSource(spec, { specPath });
3075
- sources.push({
3076
- sourceId: upserted.sourceId,
3077
- configHash: upserted.configHash,
3078
- configChanged: upserted.configChanged,
3079
- specPath
3080
- });
3081
- }
3082
- }
3083
- if (input.strictSourceSpecDirs && sources.length === 0) {
3084
- throw new Error(`No source spec files found in configured directories: ${normalizedSourceSpecDirs.join(", ")}`);
3085
- }
3086
- const removedSourceIds = input.catalog.removeManagedSources({
3087
- managedRoots: existingDirs.map((sourceSpecDir) => resolve5(sourceSpecDir)),
3088
- activeSources: sources.map((source) => ({
3089
- sourceId: source.sourceId,
3090
- specPath: source.specPath
3091
- }))
3092
- });
3093
- return {
3094
- processedSpecCount: sources.length,
3095
- removedSourceIds,
3096
- sources
3097
- };
3098
- }
3099
- async function runDaemonCycle(input) {
3100
- const startedAt = nowIso3();
3101
- const bootstrapped = await bootstrapSourceSpecs({
3102
- catalog: input.catalog,
3103
- sourceSpecDirs: input.sourceSpecDirs,
3104
- ...input.strictSourceSpecDirs !== void 0 ? { strictSourceSpecDirs: input.strictSourceSpecDirs } : {}
3105
- });
3106
- const dueSourceIds = [
3107
- .../* @__PURE__ */ new Set([
3108
- ...input.catalog.listDueSourceIds(input.referenceTime ?? startedAt),
3109
- ...bootstrapped.sources.filter((source) => source.configChanged).map((source) => source.sourceId)
3110
- ])
3111
- ];
3112
- const canaryDueSourceIds = [
3113
- .../* @__PURE__ */ new Set([
3114
- ...input.catalog.listCanaryDueSourceIds(input.referenceTime ?? startedAt),
3115
- ...bootstrapped.sources.filter((source) => source.configChanged).map((source) => source.sourceId),
3116
- ...input.catalog.listSources().filter((source) => source.lastCanaryCheckedAt === null).map((source) => source.id)
3117
- ])
3118
- ];
3119
- const canaried = [];
3120
- const canaryFailed = [];
3121
- const refreshed = [];
3122
- const failed = [];
3123
- const embedded = [];
3124
- const embeddingFailed = [];
3125
- for (const sourceId of canaryDueSourceIds) {
3126
- try {
3127
- const result = await runSourceCanary({
3128
- catalog: input.catalog,
3129
- sourceId,
3130
- env: process.env
3131
- });
3132
- canaried.push({
3133
- sourceId,
3134
- status: result.status,
3135
- checkedAt: result.checkedAt,
3136
- summary: result.summary
3137
- });
3138
- if (result.status === "fail") {
3139
- canaryFailed.push({
3140
- sourceId,
3141
- errorMessage: `One or more canary checks failed for ${sourceId}`
3142
- });
3143
- }
3144
- } catch (error) {
3145
- canaryFailed.push({
3146
- sourceId,
3147
- errorMessage: error instanceof Error ? error.message : String(error)
3148
- });
3149
- }
3150
- }
3151
- for (const sourceId of dueSourceIds) {
3152
- try {
3153
- const result = await fetchSource({
3154
- catalog: input.catalog,
3155
- dataDir: input.dataDir,
3156
- sourceId
3157
- });
3158
- refreshed.push({
3159
- sourceId,
3160
- snapshotId: result.snapshotId,
3161
- pageCount: result.pageCount,
3162
- reused: result.reused
3163
- });
3164
- } catch (error) {
3165
- const errorMessage = error instanceof Error ? error.message : String(error);
3166
- failed.push({
3167
- sourceId,
3168
- errorMessage
3169
- });
3170
- }
3171
- }
3172
- try {
3173
- const embeddingResult = await processEmbeddingJobs({
3174
- catalog: input.catalog,
3175
- config: getHybridRuntimeConfig(process.env)
3176
- });
3177
- embedded.push(...embeddingResult.succeededJobs);
3178
- embeddingFailed.push(...embeddingResult.failedJobs);
3179
- } catch (error) {
3180
- embeddingFailed.push({
3181
- sourceId: "system",
3182
- snapshotId: "system",
3183
- errorMessage: error instanceof Error ? error.message : String(error)
3184
- });
3185
- }
3186
- return {
3187
- startedAt,
3188
- finishedAt: nowIso3(),
3189
- dueSourceIds,
3190
- canaryDueSourceIds,
3191
- bootstrapped,
3192
- canaried,
3193
- canaryFailed,
3194
- refreshed,
3195
- failed,
3196
- embedded,
3197
- embeddingFailed
3198
- };
3199
- }
3200
- async function startDaemon(input) {
3201
- const intervalMs = input.config.intervalMinutes * 6e4;
3202
- input.catalog.resetRunningEmbeddingJobs();
3203
- input.catalog.markDaemonStarted({
3204
- startedAt: nowIso3(),
3205
- intervalMinutes: input.config.intervalMinutes,
3206
- fetchOnStart: input.config.fetchOnStart
3207
- });
3208
- input.logger.emit({
3209
- type: "daemon.started",
3210
- intervalMinutes: input.config.intervalMinutes,
3211
- fetchOnStart: input.config.fetchOnStart,
3212
- sourceSpecDirs: input.config.sourceSpecDirs
3213
- });
3214
- const runCycle = async (reason) => {
3215
- const startedAt = nowIso3();
3216
- input.catalog.markDaemonCycleStarted(startedAt);
3217
- input.logger.emit({
3218
- type: "daemon.cycle.started",
3219
- reason,
3220
- startedAt
3221
- });
3222
- try {
3223
- const result = await runDaemonCycle({
3224
- catalog: input.catalog,
3225
- dataDir: input.dataDir,
3226
- sourceSpecDirs: input.config.sourceSpecDirs,
3227
- strictSourceSpecDirs: input.config.strictSourceSpecDirs,
3228
- referenceTime: startedAt
3229
- });
3230
- input.catalog.markDaemonCycleCompleted({
3231
- completedAt: result.finishedAt,
3232
- status: result.failed.length > 0 || result.canaryFailed.length > 0 || result.embeddingFailed.length > 0 ? "degraded" : "success"
3233
- });
3234
- input.logger.emit({
3235
- type: "daemon.cycle.completed",
3236
- reason,
3237
- result
3238
- });
3239
- } catch (error) {
3240
- input.catalog.markDaemonCycleCompleted({
3241
- completedAt: nowIso3(),
3242
- status: "failed"
3243
- });
3244
- throw error;
3245
- }
3246
- };
3247
- if (input.config.fetchOnStart && !input.signal?.aborted) {
3248
- await runCycle("startup");
3249
- }
3250
- while (!input.signal?.aborted) {
3251
- try {
3252
- await sleep2(intervalMs, void 0, { signal: input.signal });
3253
- } catch (error) {
3254
- if (input.signal?.aborted) {
3255
- break;
3256
- }
3257
- throw error;
3258
- }
3259
- if (input.signal?.aborted) {
3260
- break;
3261
- }
3262
- await runCycle("interval");
3263
- }
3264
- input.logger.emit({
3265
- type: "daemon.stopped"
3266
- });
3267
- }
3268
-
3269
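// Illustrative usage sketch (not part of the published bundle): driving startDaemon with an
// AbortController so the signal-aware sleep above is interrupted cleanly on shutdown. The
// logger shape is inferred from the emit() calls above; the config values are examples only.
const daemonDataDir = getAiocsDataDir();
const daemonCatalog = openCatalog({ dataDir: daemonDataDir });
const daemonAbort = new AbortController();
process.once("SIGTERM", () => daemonAbort.abort());
await startDaemon({
  catalog: daemonCatalog,
  dataDir: daemonDataDir,
  logger: { emit: (event) => console.error(JSON.stringify(event)) },
  signal: daemonAbort.signal,
  config: {
    intervalMinutes: 60,              // sleep between cycles is intervalMinutes * 6e4 ms
    fetchOnStart: true,               // run one cycle immediately before the interval loop
    sourceSpecDirs: [getBundledSourcesDir()],
    strictSourceSpecDirs: false
  }
});
daemonCatalog.close();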
- // package.json
3270
- var package_default = {
3271
- name: "@bodhi-ventures/aiocs",
3272
- version: "0.1.2",
3273
- license: "MIT",
3274
- type: "module",
3275
- description: "Local-only documentation store, fetcher, and search CLI for AI agents.",
3276
- keywords: [
3277
- "ai",
3278
- "docs",
3279
- "search",
3280
- "mcp",
3281
- "cli"
3282
- ],
3283
- homepage: "https://github.com/Bodhi-Ventures/aiocs",
3284
- bugs: {
3285
- url: "https://github.com/Bodhi-Ventures/aiocs/issues"
3286
- },
3287
- repository: {
3288
- type: "git",
3289
- url: "https://github.com/Bodhi-Ventures/aiocs.git"
3290
- },
3291
- publishConfig: {
3292
- access: "public",
3293
- provenance: true
3294
- },
3295
- packageManager: "pnpm@9.15.9",
3296
- files: [
3297
- "dist",
3298
- "sources",
3299
- "docs",
3300
- "README.md",
3301
- "LICENSE",
3302
- "skills"
3303
- ],
3304
- bin: {
3305
- docs: "./dist/cli.js",
3306
- "aiocs-mcp": "./dist/mcp-server.js"
3307
- },
3308
- engines: {
3309
- node: ">=22"
3310
- },
3311
- scripts: {
3312
- build: "tsup --config tsup.config.ts",
3313
- dev: "tsx src/cli.ts",
3314
- "dev:mcp": "tsx src/mcp-server.ts",
3315
- lint: "tsc --noEmit",
3316
- test: "vitest run",
3317
- "test:watch": "vitest"
3318
- },
3319
- dependencies: {
3320
- "@modelcontextprotocol/sdk": "^1.28.0",
3321
- "@mozilla/readability": "^0.6.0",
3322
- "@qdrant/js-client-rest": "1.17.0",
3323
- "better-sqlite3": "^12.4.1",
3324
- commander: "^14.0.1",
3325
- jsdom: "^27.0.1",
3326
- playwright: "^1.57.0",
3327
- turndown: "^7.2.1",
3328
- "turndown-plugin-gfm": "^1.0.2",
3329
- yaml: "^2.8.1",
3330
- zod: "^4.1.12"
3331
- },
3332
- devDependencies: {
3333
- "@types/better-sqlite3": "^7.6.13",
3334
- "@types/jsdom": "^21.1.7",
3335
- "@types/node": "^24.7.2",
3336
- "@types/turndown": "^5.0.5",
3337
- execa: "^9.6.0",
3338
- tsup: "^8.5.0",
3339
- tsx: "^4.20.6",
3340
- typescript: "^5.9.3",
3341
- vitest: "^3.2.4"
3342
- }
3343
- };
3344
-
3345
- // src/runtime/package-metadata.ts
3346
- var packageName = package_default.name;
3347
- var packageVersion = package_default.version;
3348
- var packageDescription = package_default.description;
3349
-
3350
- // src/services.ts
3351
- import { resolve as resolve8 } from "path";
3352
-
3353
- // src/backup.ts
3354
- import { cp, mkdir, readdir as readdir2, readFile as readFile2, rename, rm, stat, writeFile } from "fs/promises";
3355
- import { basename, dirname as dirname2, join as join6, resolve as resolve6 } from "path";
3356
- import { randomUUID as randomUUID2 } from "crypto";
3357
- import Database2 from "better-sqlite3";
3358
- var CATALOG_DB_FILENAME = "catalog.sqlite";
3359
- var SQLITE_SIDE_CAR_SUFFIXES = ["-wal", "-shm"];
3360
- async function pathExists2(path) {
3361
- try {
3362
- await stat(path);
3363
- return true;
3364
- } catch {
3365
- return false;
3366
- }
3367
- }
3368
- async function assertSourceDirExists(path) {
3369
- if (!await pathExists2(path)) {
3370
- throw new AiocsError(
3371
- AIOCS_ERROR_CODES.backupSourceMissing,
3372
- `Backup source path does not exist: ${path}`
3373
- );
3374
- }
3375
- }
3376
- async function isDirectoryEmpty(path) {
3377
- if (!await pathExists2(path)) {
3378
- return true;
3379
- }
3380
- return (await readdir2(path)).length === 0;
3381
- }
3382
- async function listEntries(root, relativePath = "") {
3383
- const absolutePath = relativePath ? join6(root, relativePath) : root;
3384
- const stats = await stat(absolutePath);
3385
- if (!stats.isDirectory()) {
3386
- return [{
3387
- relativePath,
3388
- type: "file",
3389
- size: stats.size
3390
- }];
3391
- }
3392
- const childNames = await readdir2(absolutePath);
3393
- const entries = relativePath ? [{
3394
- relativePath,
3395
- type: "directory",
3396
- size: 0
3397
- }] : [];
3398
- for (const childName of childNames.sort()) {
3399
- entries.push(...await listEntries(root, relativePath ? join6(relativePath, childName) : childName));
3400
- }
3401
- return entries;
3402
- }
3403
- async function copyIfPresent(from, to, entries, relativePrefix) {
3404
- if (!await pathExists2(from)) {
3405
- return;
3406
- }
3407
- await mkdir(to, { recursive: true });
3408
- await cp(from, to, { recursive: true, force: true });
3409
- const copiedEntries = await listEntries(to);
3410
- entries.push(
3411
- ...copiedEntries.map((entry) => ({
3412
- ...entry,
3413
- relativePath: join6(relativePrefix, entry.relativePath)
3414
- }))
3415
- );
3416
- }
3417
- async function copyDataDirForBackup(from, to) {
3418
- const sourceCatalogPath = join6(from, CATALOG_DB_FILENAME);
3419
- if (!await pathExists2(sourceCatalogPath)) {
3420
- throw new AiocsError(
3421
- AIOCS_ERROR_CODES.backupSourceMissing,
3422
- `Backup source is missing the catalog database: ${sourceCatalogPath}`
3423
- );
3424
- }
3425
- await mkdir(to, { recursive: true });
3426
- await cp(from, to, {
3427
- recursive: true,
3428
- force: true,
3429
- filter: (source) => {
3430
- const name = basename(source);
3431
- if (name === CATALOG_DB_FILENAME) {
3432
- return false;
3433
- }
3434
- return !SQLITE_SIDE_CAR_SUFFIXES.some((suffix) => name === `${CATALOG_DB_FILENAME}${suffix}`);
3435
- }
3436
- });
3437
- const targetCatalogPath = join6(to, CATALOG_DB_FILENAME);
3438
- const sourceCatalog = new Database2(sourceCatalogPath, { readonly: true });
3439
- try {
3440
- await sourceCatalog.backup(targetCatalogPath);
3441
- } finally {
3442
- sourceCatalog.close();
3443
- }
3444
- }
3445
- async function loadValidatedBackupPayload(inputDir) {
3446
- const manifestPath = join6(inputDir, "manifest.json");
3447
- await assertSourceDirExists(inputDir);
3448
- if (!await pathExists2(manifestPath)) {
3449
- throw new AiocsError(
3450
- AIOCS_ERROR_CODES.backupInvalid,
3451
- `Backup manifest not found: ${manifestPath}`
3452
- );
3453
- }
3454
- const manifest = JSON.parse(await readFile2(manifestPath, "utf8"));
3455
- if (manifest.formatVersion !== 1 || !Array.isArray(manifest.entries)) {
3456
- throw new AiocsError(
3457
- AIOCS_ERROR_CODES.backupInvalid,
3458
- `Invalid backup manifest: ${manifestPath}`
3459
- );
3460
- }
3461
- const backupDataDir = join6(inputDir, "data");
3462
- if (!await pathExists2(backupDataDir)) {
3463
- throw new AiocsError(
3464
- AIOCS_ERROR_CODES.backupInvalid,
3465
- `Backup payload is missing the data directory: ${backupDataDir}`
3466
- );
3467
- }
3468
- const backupCatalogPath = join6(backupDataDir, CATALOG_DB_FILENAME);
3469
- if (!await pathExists2(backupCatalogPath)) {
3470
- throw new AiocsError(
3471
- AIOCS_ERROR_CODES.backupInvalid,
3472
- `Backup payload is missing the catalog database: ${backupCatalogPath}`
3473
- );
3474
- }
3475
- const backupConfigDir = join6(inputDir, "config");
3476
- return {
3477
- manifest,
3478
- backupDataDir,
3479
- ...await pathExists2(backupConfigDir) ? { backupConfigDir } : {}
3480
- };
3481
- }
3482
- async function prepareReplacementTarget(backupDir, targetDir) {
3483
- const parentDir = dirname2(targetDir);
3484
- const stagingDir = join6(parentDir, `.${basename(targetDir)}.import-${randomUUID2()}`);
3485
- await rm(stagingDir, { recursive: true, force: true });
3486
- await mkdir(parentDir, { recursive: true });
3487
- await cp(backupDir, stagingDir, { recursive: true, force: true });
3488
- return stagingDir;
3489
- }
3490
- async function exportBackup(input) {
3491
- const dataDir = resolve6(input.dataDir);
3492
- const outputDir = resolve6(input.outputDir);
3493
- const configDir = input.configDir ? resolve6(input.configDir) : void 0;
3494
- await assertSourceDirExists(dataDir);
3495
- if (!await isDirectoryEmpty(outputDir)) {
3496
- if (!input.replaceExisting) {
3497
- throw new AiocsError(
3498
- AIOCS_ERROR_CODES.backupConflict,
3499
- `Backup output directory is not empty: ${outputDir}`
3500
- );
3501
- }
3502
- await rm(outputDir, { recursive: true, force: true });
3503
- }
3504
- await mkdir(outputDir, { recursive: true });
3505
- const entries = [];
3506
- await copyDataDirForBackup(dataDir, join6(outputDir, "data"));
3507
- entries.push(...(await listEntries(join6(outputDir, "data"))).map((entry) => ({
3508
- ...entry,
3509
- relativePath: join6("data", entry.relativePath)
3510
- })));
3511
- if (configDir) {
3512
- await copyIfPresent(configDir, join6(outputDir, "config"), entries, "config");
3513
- }
3514
- const manifest = {
3515
- formatVersion: 1,
3516
- createdAt: (/* @__PURE__ */ new Date()).toISOString(),
3517
- packageVersion,
3518
- entries
3519
- };
3520
- const manifestPath = join6(outputDir, "manifest.json");
3521
- await writeFile(manifestPath, JSON.stringify(manifest, null, 2), "utf8");
3522
- return {
3523
- outputDir,
3524
- manifestPath,
3525
- manifest
3526
- };
3527
- }
3528
- async function importBackup(input) {
3529
- const inputDir = resolve6(input.inputDir);
3530
- const dataDir = resolve6(input.dataDir);
3531
- const configDir = input.configDir ? resolve6(input.configDir) : void 0;
3532
- const { manifest, backupDataDir, backupConfigDir } = await loadValidatedBackupPayload(inputDir);
3533
- if (!await isDirectoryEmpty(dataDir)) {
3534
- if (!input.replaceExisting) {
3535
- throw new AiocsError(
3536
- AIOCS_ERROR_CODES.backupConflict,
3537
- `Backup target data directory is not empty: ${dataDir}`
3538
- );
3539
- }
3540
- }
3541
- if (configDir && backupConfigDir && !await isDirectoryEmpty(configDir)) {
3542
- if (!input.replaceExisting) {
3543
- throw new AiocsError(
3544
- AIOCS_ERROR_CODES.backupConflict,
3545
- `Backup target config directory is not empty: ${configDir}`
3546
- );
3547
- }
3548
- }
3549
- const stagedDataDir = await prepareReplacementTarget(backupDataDir, dataDir);
3550
- const stagedConfigDir = configDir && backupConfigDir ? await prepareReplacementTarget(backupConfigDir, configDir) : void 0;
3551
- try {
3552
- await rm(dataDir, { recursive: true, force: true });
3553
- await rename(stagedDataDir, dataDir);
3554
- if (configDir && stagedConfigDir) {
3555
- await rm(configDir, { recursive: true, force: true });
3556
- await rename(stagedConfigDir, configDir);
3557
- }
3558
- } catch (error) {
3559
- await rm(stagedDataDir, { recursive: true, force: true });
3560
- if (stagedConfigDir) {
3561
- await rm(stagedConfigDir, { recursive: true, force: true });
3562
- }
3563
- throw error;
3564
- }
3565
- return {
3566
- inputDir,
3567
- dataDir,
3568
- ...configDir ? { configDir } : {},
3569
- manifest
3570
- };
3571
- }
3572
-
3573
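// Illustrative usage sketch (not part of the published bundle): a backup round trip with the
// helpers above. The output path is an example; replaceExisting opts into overwriting
// non-empty targets instead of raising BACKUP_CONFLICT.
const exported = await exportBackup({
  dataDir: getAiocsDataDir(),
  configDir: getAiocsConfigDir(),
  outputDir: "/tmp/aiocs-backup",
  replaceExisting: true
});
console.log(`exported ${exported.manifest.entries.length} entries to ${exported.manifestPath}`);
await importBackup({
  inputDir: exported.outputDir,
  dataDir: getAiocsDataDir(),
  configDir: getAiocsConfigDir(),
  replaceExisting: true
});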
- // src/coverage.ts
3574
- import { readFile as readFile3 } from "fs/promises";
3575
- import { resolve as resolve7 } from "path";
3576
- function normalizeText(value) {
3577
- return value.replace(/[`*_~]+/g, "").replace(/\s+/g, " ").trim().toLowerCase();
3578
- }
3579
- function extractHeadings(markdown) {
3580
- const matches = [...markdown.matchAll(/^#{1,6}\s+(.+)$/gm)];
3581
- return matches.map((match) => match[1]?.trim() ?? "").filter(Boolean);
3582
- }
3583
- function extractComparableLines(markdown) {
3584
- return markdown.split("\n").map((line) => line.replace(/^\s*(#{1,6}|\d+\.\s+|[-*+]\s+)/, "").trim()).map((line) => normalizeText(line)).filter(Boolean);
3585
- }
3586
- function classifyHeading(heading, pageTitles, sectionTitles, comparableMarkdownLines) {
3587
- const normalizedHeading = normalizeText(heading);
3588
- if (!normalizedHeading) {
3589
- return null;
3590
- }
3591
- if (pageTitles.has(normalizedHeading)) {
3592
- return "page_title";
3593
- }
3594
- if (sectionTitles.has(normalizedHeading)) {
3595
- return "section_title";
3596
- }
3597
- if (comparableMarkdownLines.has(normalizedHeading)) {
3598
- return "body";
3599
- }
3600
- return null;
3601
- }
3602
- async function verifyCoverageAgainstReferences(corpus, referenceFiles) {
3603
- if (referenceFiles.length === 0) {
3604
- throw new AiocsError(
3605
- AIOCS_ERROR_CODES.invalidArgument,
3606
- "At least one reference file is required for coverage verification."
3607
- );
3608
- }
3609
- const pageTitles = new Set(corpus.entries.map((entry) => normalizeText(entry.pageTitle)).filter(Boolean));
3610
- const sectionTitles = new Set(corpus.entries.map((entry) => normalizeText(entry.sectionTitle)).filter(Boolean));
3611
- const comparableMarkdownLines = new Set(
3612
- corpus.entries.flatMap((entry) => extractComparableLines(entry.markdown))
3613
- );
3614
- const files = [];
3615
- let headingCount = 0;
3616
- let matchedHeadingCount = 0;
3617
- let missingHeadingCount = 0;
3618
- const matchCounts = {
3619
- pageTitle: 0,
3620
- sectionTitle: 0,
3621
- body: 0
3622
- };
3623
- for (const referenceFile of referenceFiles) {
3624
- const resolvedReferenceFile = resolve7(referenceFile);
3625
- let raw;
3626
- try {
3627
- raw = await readFile3(resolvedReferenceFile, "utf8");
3628
- } catch (error) {
3629
- if (error?.code === "ENOENT") {
3630
- throw new AiocsError(
3631
- AIOCS_ERROR_CODES.referenceFileNotFound,
3632
- `Reference file not found: ${resolvedReferenceFile}`
3633
- );
3634
- }
3635
- throw error;
3636
- }
3637
- const headings = extractHeadings(raw);
3638
- if (headings.length === 0) {
3639
- throw new AiocsError(
3640
- AIOCS_ERROR_CODES.invalidReferenceFile,
3641
- `Reference file does not contain any markdown headings: ${resolvedReferenceFile}`
3642
- );
3643
- }
3644
- const fileMatchCounts = {
3645
- pageTitle: 0,
3646
- sectionTitle: 0,
3647
- body: 0
3648
- };
3649
- const missingHeadings = [];
3650
- for (const heading of headings) {
3651
- const matchType = classifyHeading(heading, pageTitles, sectionTitles, comparableMarkdownLines);
3652
- if (matchType === "page_title") {
3653
- fileMatchCounts.pageTitle += 1;
3654
- matchCounts.pageTitle += 1;
3655
- matchedHeadingCount += 1;
3656
- } else if (matchType === "section_title") {
3657
- fileMatchCounts.sectionTitle += 1;
3658
- matchCounts.sectionTitle += 1;
3659
- matchedHeadingCount += 1;
3660
- } else if (matchType === "body") {
3661
- fileMatchCounts.body += 1;
3662
- matchCounts.body += 1;
3663
- matchedHeadingCount += 1;
3664
- } else {
3665
- missingHeadings.push(heading);
3666
- missingHeadingCount += 1;
3667
- }
3668
- }
3669
- headingCount += headings.length;
3670
- files.push({
3671
- referenceFile: resolvedReferenceFile,
3672
- headingCount: headings.length,
3673
- matchedHeadingCount: headings.length - missingHeadings.length,
3674
- missingHeadingCount: missingHeadings.length,
3675
- missingHeadings,
3676
- matchCounts: fileMatchCounts
3677
- });
3678
- }
3679
- return {
3680
- sourceId: corpus.sourceId,
3681
- snapshotId: corpus.snapshotId,
3682
- complete: missingHeadingCount === 0,
3683
- summary: {
3684
- fileCount: files.length,
3685
- headingCount,
3686
- matchedHeadingCount,
3687
- missingHeadingCount,
3688
- matchCounts
3689
- },
3690
- files
3691
- };
3692
- }
3693
-
3694
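// Worked example (illustrative only): the normalization above strips markdown emphasis,
// collapses whitespace, and lowercases before headings are compared against the corpus, so
// "### `Getting Started`" in a reference file matches a page titled "Getting  Started".
console.assert(normalizeText("`Getting Started`") === "getting started");
console.assert(normalizeText("Getting  Started") === "getting started");
console.assert(extractHeadings("# Intro\n\n## Setup").join(",") === "Intro,Setup");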
- // src/doctor.ts
3695
- import { access as access2 } from "fs/promises";
3696
- import { execFile } from "child_process";
3697
- import { promisify } from "util";
3698
- var execFileAsync = promisify(execFile);
3699
- function summarize(checks) {
3700
- const passCount = checks.filter((check) => check.status === "pass").length;
3701
- const warnCount = checks.filter((check) => check.status === "warn").length;
3702
- const failCount = checks.filter((check) => check.status === "fail").length;
3703
- return {
3704
- status: failCount > 0 ? "unhealthy" : warnCount > 0 ? "degraded" : "healthy",
3705
- checkCount: checks.length,
3706
- passCount,
3707
- warnCount,
3708
- failCount
3709
- };
3710
- }
3711
- function toErrorMessage(error) {
3712
- if (error instanceof Error) {
3713
- return error.message;
3714
- }
3715
- return String(error);
3716
- }
3717
- function parseTimestamp(value) {
3718
- if (!value) {
3719
- return null;
3720
- }
3721
- const parsed = Date.parse(value);
3722
- return Number.isNaN(parsed) ? null : parsed;
3723
- }
3724
- async function checkCatalog(env) {
3725
- const dataDir = getAiocsDataDir(env);
3726
- const configDir = getAiocsConfigDir(env);
3727
- let catalog = null;
3728
- try {
3729
- catalog = openCatalog({ dataDir });
3730
- const sourceCount = catalog.listSources().length;
3731
- const projectLinkCount = catalog.listProjectLinks().length;
3732
- return {
3733
- id: "catalog",
3734
- status: "pass",
3735
- summary: `Catalog opened successfully at ${dataDir}`,
3736
- details: {
3737
- dataDir,
3738
- configDir,
3739
- sourceCount,
3740
- projectLinkCount
3741
- }
3742
- };
3743
- } catch (error) {
3744
- return {
3745
- id: "catalog",
3746
- status: "fail",
3747
- summary: `Catalog unavailable: ${toErrorMessage(error)}`,
3748
- details: {
3749
- dataDir,
3750
- configDir
3751
- }
3752
- };
3753
- } finally {
3754
- catalog?.close();
3755
- }
3756
- }
3757
- async function checkPlaywright() {
3758
- try {
3759
- const { chromium: chromium2 } = await import("playwright");
3760
- const executablePath = chromium2.executablePath();
3761
- if (!executablePath) {
3762
- return {
3763
- id: "playwright",
3764
- status: "fail",
3765
- summary: "Playwright is installed but Chromium has no resolved executable path."
3766
- };
3767
- }
3768
- await access2(executablePath);
3769
- return {
3770
- id: "playwright",
3771
- status: "pass",
3772
- summary: "Playwright Chromium executable is available.",
3773
- details: {
3774
- executablePath
3775
- }
3776
- };
3777
- } catch (error) {
3778
- return {
3779
- id: "playwright",
3780
- status: "fail",
3781
- summary: `Playwright is not ready: ${toErrorMessage(error)}`
3782
- };
3783
- }
3784
- }
3785
- async function checkDaemonConfig(env) {
3786
- try {
3787
- const daemonConfig = parseDaemonConfig(env, {
3788
- bundledSourceDir: getBundledSourcesDir()
3789
- });
3790
- return {
3791
- daemonConfig,
3792
- daemonConfigCheck: {
3793
- id: "daemon-config",
3794
- status: "pass",
3795
- summary: "Daemon configuration parsed successfully.",
3796
- details: daemonConfig
3797
- }
3798
- };
3799
- } catch (error) {
3800
- return {
3801
- daemonConfig: null,
3802
- daemonConfigCheck: {
3803
- id: "daemon-config",
3804
- status: "fail",
3805
- summary: `Daemon configuration is invalid: ${toErrorMessage(error)}`
3806
- }
3807
- };
3808
- }
3809
- }
3810
- async function checkSourceSpecDirs(daemonConfig) {
3811
- if (!daemonConfig) {
3812
- return {
3813
- id: "source-spec-dirs",
3814
- status: "fail",
3815
- summary: "Source spec directories cannot be validated until daemon configuration is valid."
3816
- };
3817
- }
3818
- const directories = await Promise.all(daemonConfig.sourceSpecDirs.map(async (directory) => {
3819
- const exists = await pathExists(directory);
3820
- const specFiles = exists ? await walkSourceSpecFiles(directory) : [];
3821
- return {
3822
- directory,
3823
- exists,
3824
- specCount: specFiles.length
3825
- };
3826
- }));
3827
- const existingCount = directories.filter((directory) => directory.exists).length;
3828
- const totalSpecCount = directories.reduce((sum, directory) => sum + directory.specCount, 0);
3829
- let status = "pass";
3830
- let summary = `Validated ${directories.length} source spec director${directories.length === 1 ? "y" : "ies"}.`;
3831
- if (directories.length === 0) {
3832
- status = "fail";
3833
- summary = "No source spec directories are configured.";
3834
- } else if (daemonConfig.strictSourceSpecDirs && directories.some((directory) => !directory.exists)) {
3835
- status = "fail";
3836
- summary = "One or more explicitly configured source spec directories are missing.";
3837
- } else if (existingCount === 0) {
3838
- status = "warn";
3839
- summary = "No configured source spec directories currently exist.";
3840
- } else if (totalSpecCount === 0) {
3841
- status = "warn";
3842
- summary = "Configured source spec directories exist but contain no source specs.";
3843
- } else if (directories.some((directory) => !directory.exists)) {
3844
- status = "warn";
3845
- summary = "Some optional source spec directories are missing.";
3846
- }
3847
- return {
3848
- id: "source-spec-dirs",
3849
- status,
3850
- summary,
3851
- details: {
3852
- strict: daemonConfig.strictSourceSpecDirs,
3853
- directories
3854
- }
3855
- };
3856
- }
3857
- async function checkFreshness(env) {
3858
- const dataDir = getAiocsDataDir(env);
3859
- let catalog = null;
3860
- try {
3861
- catalog = openCatalog({ dataDir });
3862
- const sources = catalog.listSources();
3863
- const referenceTime = Date.now();
3864
- if (sources.length === 0) {
3865
- return {
3866
- id: "freshness",
3867
- status: "pass",
3868
- summary: "No sources are registered, so no source freshness checks are pending.",
3869
- details: {
3870
- sourceCount: 0
3871
- }
3872
- };
3873
- }
3874
- const staleSources = sources.filter((source) => !source.lastSuccessfulSnapshotId || Date.parse(source.nextDueAt) <= referenceTime).map((source) => ({
3875
- sourceId: source.id,
3876
- nextDueAt: source.nextDueAt,
3877
- lastSuccessfulSnapshotAt: source.lastSuccessfulSnapshotAt,
3878
- lastSuccessfulSnapshotAgeMinutes: source.lastSuccessfulSnapshotAt ? Math.floor((referenceTime - Date.parse(source.lastSuccessfulSnapshotAt)) / 6e4) : null
3879
- }));
3880
- const staleCanaries = sources.filter(
3881
- (source) => source.nextCanaryDueAt && Date.parse(source.nextCanaryDueAt) <= referenceTime || source.lastCanaryStatus === "fail"
3882
- ).map((source) => ({
3883
- sourceId: source.id,
3884
- nextCanaryDueAt: source.nextCanaryDueAt,
3885
- lastCanaryCheckedAt: source.lastCanaryCheckedAt,
3886
- lastCanaryStatus: source.lastCanaryStatus
3887
- }));
3888
- const status = staleSources.length > 0 || staleCanaries.length > 0 ? "warn" : "pass";
3889
- const summary = status === "pass" ? "Source snapshots and canaries are fresh." : `Source freshness issues detected: ${staleSources.length} stale snapshot scope(s), ${staleCanaries.length} stale/failed canary scope(s).`;
3890
- return {
3891
- id: "freshness",
3892
- status,
3893
- summary,
3894
- details: {
3895
- sourceCount: sources.length,
3896
- staleSources,
3897
- staleCanaries,
3898
- checkedAt: new Date(referenceTime).toISOString()
3899
- }
3900
- };
3901
- } catch (error) {
3902
- return {
3903
- id: "freshness",
3904
- status: "fail",
3905
- summary: `Freshness checks failed: ${toErrorMessage(error)}`
3906
- };
3907
- } finally {
3908
- catalog?.close();
3909
- }
3910
- }
3911
- async function checkDaemonHeartbeat(env) {
3912
- const dataDir = getAiocsDataDir(env);
3913
- let catalog = null;
3914
- try {
3915
- catalog = openCatalog({ dataDir });
3916
- const daemonState = catalog.getDaemonState();
3917
- if (!daemonState) {
3918
- return {
3919
- id: "daemon-heartbeat",
3920
- status: "warn",
3921
- summary: "No daemon heartbeat has been recorded yet."
3922
- };
3923
- }
3924
- const intervalMinutes = daemonState.intervalMinutes ?? 60;
3925
- const completedAt = parseTimestamp(daemonState.lastCycleCompletedAt);
3926
- if (!completedAt) {
3927
- return {
3928
- id: "daemon-heartbeat",
3929
- status: "warn",
3930
- summary: "Daemon heartbeat exists but no completed cycle has been recorded yet.",
3931
- details: daemonState
3932
- };
3933
- }
3934
- const ageMinutes = Math.floor((Date.now() - completedAt) / 6e4);
3935
- const stale = ageMinutes > intervalMinutes * 2;
3936
- const unhealthyStatus = daemonState.lastCycleStatus === "failed" || daemonState.lastCycleStatus === "degraded";
3937
- return {
3938
- id: "daemon-heartbeat",
3939
- status: stale || unhealthyStatus ? "warn" : "pass",
3940
- summary: stale || unhealthyStatus ? `Daemon heartbeat is stale or unhealthy (age=${ageMinutes}m, status=${daemonState.lastCycleStatus ?? "unknown"}).` : `Daemon heartbeat is recent (age=${ageMinutes}m).`,
3941
- details: {
3942
- ...daemonState,
3943
- ageMinutes
3944
- }
3945
- };
3946
- } catch (error) {
3947
- return {
3948
- id: "daemon-heartbeat",
3949
- status: "fail",
3950
- summary: `Daemon heartbeat check failed: ${toErrorMessage(error)}`
3951
- };
3952
- } finally {
3953
- catalog?.close();
3954
- }
3955
- }
3956
- async function checkEmbeddingProvider(env) {
3957
- try {
3958
- const config = getHybridRuntimeConfig(env);
3959
- const status = await getEmbeddingProviderStatus(config);
3960
- return {
3961
- id: "embedding-provider",
3962
- status: status.ok ? "pass" : "warn",
3963
- summary: status.ok ? `Embedding provider is ready with model ${status.model}.` : `Embedding provider is reachable but model ${status.model} is not available locally.`,
3964
- details: status
3965
- };
3966
- } catch (error) {
3967
- return {
3968
- id: "embedding-provider",
3969
- status: "fail",
3970
- summary: `Embedding provider check failed: ${toErrorMessage(error)}`
3971
- };
3972
- }
3973
- }
3974
- async function checkVectorStore(env) {
3975
- try {
3976
- const config = getHybridRuntimeConfig(env);
3977
- const status = await new AiocsVectorStore(config).getHealth();
3978
- return {
3979
- id: "vector-store",
3980
- status: status.ok ? "pass" : "warn",
3981
- summary: status.ok ? `Qdrant is reachable at ${config.qdrantUrl}.` : `Qdrant is not ready at ${config.qdrantUrl}: ${status.errorMessage ?? "unknown error"}`,
3982
- details: {
3983
- qdrantUrl: config.qdrantUrl,
3984
- collection: config.qdrantCollection,
3985
- ...status.collections ? { collections: status.collections } : {}
3986
- }
3987
- };
3988
- } catch (error) {
3989
- return {
3990
- id: "vector-store",
3991
- status: "fail",
3992
- summary: `Vector store check failed: ${toErrorMessage(error)}`
3993
- };
3994
- }
3995
- }
3996
- async function checkEmbeddings(env) {
3997
- const dataDir = getAiocsDataDir(env);
3998
- let catalog = null;
3999
- try {
4000
- catalog = openCatalog({ dataDir });
4001
- const overview = catalog.getEmbeddingOverview();
4002
- const underIndexedSources = overview.sources.filter((source) => source.totalChunks > 0 && source.indexedChunks < source.totalChunks).map((source) => ({
4003
- sourceId: source.sourceId,
4004
- snapshotId: source.snapshotId,
4005
- coverageRatio: source.coverageRatio,
4006
- totalChunks: source.totalChunks,
4007
- indexedChunks: source.indexedChunks,
4008
- pendingChunks: source.pendingChunks,
4009
- failedChunks: source.failedChunks,
4010
- staleChunks: source.staleChunks
4011
- }));
4012
- const status = overview.queue.failedJobs > 0 ? "warn" : underIndexedSources.length > 0 || overview.queue.pendingJobs > 0 || overview.queue.runningJobs > 0 ? "warn" : "pass";
4013
- return {
4014
- id: "embeddings",
4015
- status,
4016
- summary: status === "pass" ? "Embedding coverage is complete for latest snapshots." : `Embedding backlog detected: ${overview.queue.pendingJobs} pending, ${overview.queue.runningJobs} running, ${overview.queue.failedJobs} failed job(s).`,
4017
- details: {
4018
- queue: overview.queue,
4019
- underIndexedSources
4020
- }
4021
- };
4022
- } catch (error) {
4023
- return {
4024
- id: "embeddings",
4025
- status: "fail",
4026
- summary: `Embedding status check failed: ${toErrorMessage(error)}`
4027
- };
4028
- } finally {
4029
- catalog?.close();
4030
- }
4031
- }
4032
- async function checkDocker() {
4033
- try {
4034
- const { stdout } = await execFileAsync("docker", ["info", "--format", "{{json .ServerVersion}}"]);
4035
- const version = JSON.parse(stdout.trim());
4036
- return {
4037
- id: "docker",
4038
- status: "pass",
4039
- summary: `Docker is available (server ${version}).`,
4040
- details: {
4041
- serverVersion: version
4042
- }
4043
- };
4044
- } catch (error) {
4045
- const message = toErrorMessage(error);
4046
- if (message.includes("ENOENT")) {
4047
- return {
4048
- id: "docker",
4049
- status: "warn",
4050
- summary: "Docker CLI is not installed; Docker-based daemon deployment is unavailable on this machine."
4051
- };
4052
- }
4053
- return {
4054
- id: "docker",
4055
- status: "warn",
4056
- summary: `Docker is not ready: ${message}`
4057
- };
4058
- }
4059
- }
4060
- async function runDoctor(env = process.env) {
4061
- const catalogCheck = await checkCatalog(env);
4062
- const playwrightCheck = await checkPlaywright();
4063
- const { daemonConfigCheck, daemonConfig } = await checkDaemonConfig(env);
4064
- const sourceSpecDirsCheck = await checkSourceSpecDirs(daemonConfig);
4065
- const freshnessCheck = await checkFreshness(env);
4066
- const daemonHeartbeatCheck = await checkDaemonHeartbeat(env);
4067
- const embeddingProviderCheck = await checkEmbeddingProvider(env);
4068
- const vectorStoreCheck = await checkVectorStore(env);
4069
- const embeddingsCheck = await checkEmbeddings(env);
4070
- const dockerCheck = await checkDocker();
4071
- const checks = [
4072
- catalogCheck,
4073
- playwrightCheck,
4074
- daemonConfigCheck,
4075
- sourceSpecDirsCheck,
4076
- freshnessCheck,
4077
- daemonHeartbeatCheck,
4078
- embeddingProviderCheck,
4079
- vectorStoreCheck,
4080
- embeddingsCheck,
4081
- dockerCheck
4082
- ];
4083
- return {
4084
- summary: summarize(checks),
4085
- checks
4086
- };
4087
- }
4088
-
4089
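// Illustrative usage sketch (not part of the published bundle): reading a doctor report. Per
// summarize() above, the overall status is "unhealthy" if any check fails, "degraded" if any
// check warns, and "healthy" otherwise.
const doctorReport = await runDoctor(process.env);
for (const check of doctorReport.checks) {
  console.log(`${check.status.padEnd(4)} ${check.id}: ${check.summary}`);
}
process.exitCode = doctorReport.summary.status === "healthy" ? 0 : 1;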
- // src/hybrid/rank.ts
4090
- function reciprocalRankFusion(candidateLists, rrfK) {
4091
- const byChunkId = /* @__PURE__ */ new Map();
4092
- for (const candidates of candidateLists) {
4093
- for (const candidate of candidates) {
4094
- const current = byChunkId.get(candidate.chunkId) ?? {
4095
- fusedScore: 0,
4096
- signals: /* @__PURE__ */ new Set()
4097
- };
4098
- current.fusedScore += 1 / (rrfK + candidate.rank);
4099
- current.signals.add(candidate.signal);
4100
- byChunkId.set(candidate.chunkId, current);
4101
- }
4102
- }
4103
- return [...byChunkId.entries()].map(([chunkId, value]) => ({
4104
- chunkId,
4105
- fusedScore: value.fusedScore,
4106
- signals: [...value.signals]
4107
- })).sort((left, right) => right.fusedScore - left.fusedScore || left.chunkId - right.chunkId);
4108
- }
4109
-
4110
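// Worked example (illustrative only): with rrfK = 60, a chunk ranked 2nd lexically and 1st by
// vector scores 1/62 + 1/61 ≈ 0.0325, edging out a chunk ranked 1st lexically and 3rd by
// vector (1/61 + 1/63 ≈ 0.0323); each result keeps the set of signals it appeared under.
const fusedExample = reciprocalRankFusion([
  [{ chunkId: 1, rank: 1, signal: "lexical" }, { chunkId: 2, rank: 2, signal: "lexical" }],
  [{ chunkId: 2, rank: 1, signal: "vector" }, { chunkId: 1, rank: 3, signal: "vector" }]
], 60);
// fusedExample[0] -> { chunkId: 2, fusedScore: ≈0.0325, signals: ["lexical", "vector"] }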
- // src/hybrid/search.ts
4111
- function windowSize(limit, offset, minimum) {
4112
- return Math.max(limit + offset, minimum);
4113
- }
4114
- function withScores(rows, scoreLookup) {
4115
- return rows.map((row) => {
4116
- const score = scoreLookup.get(row.chunkId) ?? {
4117
- score: 0,
4118
- signals: ["lexical"]
4119
- };
4120
- return {
4121
- ...row,
4122
- score: score.score,
4123
- signals: score.signals
4124
- };
4125
- });
4126
- }
4127
- async function searchHybridCatalog(input) {
4128
- const scope = input.catalog.resolveSearchScope({
4129
- query: input.query,
4130
- ...input.searchInput.cwd ? { cwd: input.searchInput.cwd } : {},
4131
- ...input.searchInput.sourceIds ? { sourceIds: input.searchInput.sourceIds } : {},
4132
- ...input.searchInput.snapshotId ? { snapshotId: input.searchInput.snapshotId } : {},
4133
- ...input.searchInput.all ? { all: true } : {},
4134
- ...typeof input.searchInput.limit === "number" ? { limit: input.searchInput.limit } : {},
4135
- ...typeof input.searchInput.offset === "number" ? { offset: input.searchInput.offset } : {}
4136
- });
4137
- const lexicalOnly = () => {
4138
- const lexical = input.catalog.searchLexical({
4139
- query: input.query,
4140
- scope
4141
- });
4142
- return {
4143
- query: input.query,
4144
- total: lexical.total,
4145
- limit: lexical.limit,
4146
- offset: lexical.offset,
4147
- hasMore: lexical.hasMore,
4148
- modeRequested: input.mode,
4149
- modeUsed: "lexical",
4150
- results: lexical.results.map((result, index) => ({
4151
- ...result,
4152
- score: 1 / (index + 1),
4153
- signals: ["lexical"]
4154
- }))
4155
- };
4156
- };
4157
- if (scope.snapshotIds.length === 0) {
4158
- return {
4159
- query: input.query,
4160
- total: 0,
4161
- limit: scope.limit,
4162
- offset: scope.offset,
4163
- hasMore: false,
4164
- modeRequested: input.mode,
4165
- modeUsed: input.mode === "semantic" ? "semantic" : "lexical",
4166
- results: []
4167
- };
4168
- }
4169
- if (input.mode === "lexical") {
4170
- return lexicalOnly();
4171
- }
4172
- const overview = input.catalog.getEmbeddingOverview();
4173
- const snapshotIdSet = new Set(scope.snapshotIds);
4174
- const scopedSources = overview.sources.filter(
4175
- (source) => source.snapshotId ? snapshotIdSet.has(source.snapshotId) : false
4176
- );
4177
- const allSnapshotsIndexed = scopedSources.every(
4178
- (source) => !source.snapshotId || source.totalChunks > 0 && source.indexedChunks === source.totalChunks
4179
- );
4180
- if (input.mode === "auto" && !allSnapshotsIndexed) {
4181
- return lexicalOnly();
4182
- }
4183
- let queryVector;
4184
- let vectorCandidates = [];
4185
- const modelKey = getEmbeddingModelKey(input.config);
4186
- try {
4187
- const embedding = await embedTexts(input.config, [input.query]);
4188
- queryVector = embedding[0];
4189
- if (!queryVector) {
4190
- throw new AiocsError(
4191
- AIOCS_ERROR_CODES.embeddingProviderUnavailable,
4192
- "Embedding provider returned no vector for the search query"
4193
- );
4194
- }
4195
- const vectorStore = new AiocsVectorStore(input.config);
4196
- vectorCandidates = await vectorStore.search({
4197
- vector: queryVector,
4198
- snapshotIds: scope.snapshotIds,
4199
- sourceIds: scope.sourceIds,
4200
- modelKey,
4201
- limit: windowSize(scope.limit, scope.offset, input.config.vectorCandidateWindow)
4202
- });
4203
- } catch (error) {
4204
- if (input.mode === "auto") {
4205
- return lexicalOnly();
4206
- }
4207
- throw error;
4208
- }
4209
- if (input.mode === "auto" && vectorCandidates.length === 0) {
4210
- return lexicalOnly();
4211
- }
4212
- if (input.mode === "semantic") {
4213
- const orderedChunkIds2 = vectorCandidates.map((candidate) => candidate.chunkId);
4214
- const chunkRows2 = input.catalog.getChunksByIds(orderedChunkIds2);
4215
- const chunkMap2 = new Map(chunkRows2.map((row) => [row.chunkId, row]));
4216
- const orderedRows2 = orderedChunkIds2.map((chunkId) => chunkMap2.get(chunkId)).filter((row) => Boolean(row));
4217
- const pagedRows2 = orderedRows2.slice(scope.offset, scope.offset + scope.limit);
4218
- const scoreLookup2 = new Map(vectorCandidates.map((candidate) => [
4219
- candidate.chunkId,
4220
- { score: candidate.score, signals: ["vector"] }
4221
- ]));
4222
- return {
4223
- query: input.query,
4224
- total: orderedRows2.length,
4225
- limit: scope.limit,
4226
- offset: scope.offset,
4227
- hasMore: scope.offset + pagedRows2.length < orderedRows2.length,
4228
- modeRequested: input.mode,
4229
- modeUsed: "semantic",
4230
- results: withScores(pagedRows2, scoreLookup2)
4231
- };
4232
- }
4233
- const lexicalCandidates = input.catalog.searchLexical({
4234
- query: input.query,
4235
- scope,
4236
- limit: windowSize(scope.limit, scope.offset, input.config.lexicalCandidateWindow),
4237
- offset: 0
4238
- });
4239
- const fused = reciprocalRankFusion([
4240
- lexicalCandidates.results.map((result, index) => ({
4241
- chunkId: result.chunkId,
4242
- rank: index + 1,
4243
- signal: "lexical"
4244
- })),
4245
- vectorCandidates.map((result, index) => ({
4246
- chunkId: result.chunkId,
4247
- rank: index + 1,
4248
- signal: "vector",
4249
- score: result.score
4250
- }))
4251
- ], input.config.rrfK);
4252
- const orderedChunkIds = fused.map((result) => result.chunkId);
4253
- const chunkRows = input.catalog.getChunksByIds(orderedChunkIds);
4254
- const chunkMap = new Map(chunkRows.map((row) => [row.chunkId, row]));
4255
- const orderedRows = orderedChunkIds.map((chunkId) => chunkMap.get(chunkId)).filter((row) => Boolean(row));
4256
- const pagedRows = orderedRows.slice(scope.offset, scope.offset + scope.limit);
4257
- const scoreLookup = new Map(fused.map((candidate) => [
4258
- candidate.chunkId,
4259
- {
4260
- score: candidate.fusedScore,
4261
- signals: candidate.signals
4262
- }
4263
- ]));
4264
- return {
4265
- query: input.query,
4266
- total: orderedRows.length,
4267
- limit: scope.limit,
4268
- offset: scope.offset,
4269
- hasMore: scope.offset + pagedRows.length < orderedRows.length,
4270
- modeRequested: input.mode,
4271
- modeUsed: "hybrid",
4272
- results: withScores(pagedRows, scoreLookup)
4273
- };
4274
- }
4275
-
4276
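// Illustrative usage sketch (not part of the published bundle): how the mode handling above
// plays out. "lexical" always uses the FTS path; "semantic" requires the embedding provider
// and vector store; "auto" tries vectors but falls back to lexical when a scoped snapshot is
// not fully indexed, the provider errors, or no vector candidates are returned.
const hybridCatalog = openCatalog({ dataDir: getAiocsDataDir() });
try {
  const hybridResults = await searchHybridCatalog({
    catalog: hybridCatalog,
    config: getHybridRuntimeConfig(process.env),
    query: "configure retry backoff",
    mode: "auto",
    searchInput: { all: true, limit: 5 }
  });
  console.log(hybridResults.modeUsed, hybridResults.total);
} finally {
  hybridCatalog.close();
}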
- // src/services.ts
4277
- function createCatalog() {
4278
- const dataDir = getAiocsDataDir();
4279
- getAiocsConfigDir();
4280
- return {
4281
- dataDir,
4282
- catalog: openCatalog({ dataDir })
4283
- };
4284
- }
4285
- function withCatalog(run) {
4286
- const ctx = createCatalog();
4287
- return Promise.resolve(run(ctx)).finally(() => ctx.catalog.close());
4288
- }
4289
- async function upsertSourceFromSpecFile(specFile) {
4290
- const specPath = resolve8(specFile);
4291
- const spec = await loadSourceSpec(specPath);
4292
- const result = await withCatalog(({ catalog }) => catalog.upsertSource(spec, { specPath }));
4293
- return {
4294
- sourceId: result.sourceId,
4295
- configHash: result.configHash,
4296
- specPath
4297
- };
4298
- }
4299
- async function listSources() {
4300
- const sources = await withCatalog(({ catalog }) => catalog.listSources());
4301
- return { sources };
4302
- }
4303
- async function fetchSources(sourceIdOrAll) {
4304
- const results = await withCatalog(async ({ catalog, dataDir }) => {
4305
- const sourceIds = sourceIdOrAll === "all" ? catalog.listSources().map((item) => item.id) : [sourceIdOrAll];
4306
- if (sourceIds.length === 0) {
4307
- return [];
4308
- }
4309
- const fetched = [];
4310
- for (const sourceId of sourceIds) {
4311
- const result = await fetchSource({ catalog, sourceId, dataDir });
4312
- fetched.push({
4313
- sourceId,
4314
- snapshotId: result.snapshotId,
4315
- pageCount: result.pageCount,
4316
- reused: result.reused
4317
- });
4318
- }
4319
- await processEmbeddingJobs({
4320
- catalog,
4321
- config: getHybridRuntimeConfig()
4322
- });
4323
- return fetched;
4324
- });
4325
- return { results };
4326
- }
4327
- async function refreshDueSources(sourceIdOrAll = "all") {
4328
- const results = await withCatalog(async ({ catalog, dataDir }) => {
4329
- const dueIds = sourceIdOrAll === "all" ? catalog.listDueSourceIds() : (() => {
4330
- const spec = catalog.getSourceSpec(sourceIdOrAll);
4331
- if (!spec) {
4332
- throw new AiocsError(
4333
- AIOCS_ERROR_CODES.sourceNotFound,
4334
- `Unknown source '${sourceIdOrAll}'`
4335
- );
4336
- }
4337
- return catalog.listDueSourceIds().includes(sourceIdOrAll) ? [sourceIdOrAll] : [];
4338
- })();
4339
- const fetched = [];
4340
- for (const sourceId of dueIds) {
4341
- const result = await fetchSource({ catalog, sourceId, dataDir });
4342
- fetched.push({
4343
- sourceId,
4344
- snapshotId: result.snapshotId,
4345
- pageCount: result.pageCount,
4346
- reused: result.reused
4347
- });
4348
- }
4349
- await processEmbeddingJobs({
4350
- catalog,
4351
- config: getHybridRuntimeConfig()
4352
- });
4353
- return fetched;
4354
- });
4355
- return { results };
4356
- }
4357
- async function runSourceCanaries(sourceIdOrAll) {
4358
- const results = await withCatalog(async ({ catalog }) => {
4359
- const sourceIds = sourceIdOrAll === "all" ? catalog.listSources().map((item) => item.id) : [sourceIdOrAll];
4360
- if (sourceIds.length === 0) {
4361
- return [];
4362
- }
4363
- const canaried = [];
4364
- for (const sourceId of sourceIds) {
4365
- canaried.push(await runSourceCanary({
4366
- catalog,
4367
- sourceId,
4368
- env: process.env
4369
- }));
4370
- }
4371
- return canaried;
4372
- });
4373
- return { results };
4374
- }
4375
- async function listSnapshotsForSource(sourceId) {
4376
- const snapshots = await withCatalog(({ catalog }) => catalog.listSnapshots(sourceId));
4377
- return {
4378
- sourceId,
4379
- snapshots
4380
- };
4381
- }
4382
- async function diffSnapshotsForSource(input) {
4383
- return withCatalog(({ catalog }) => catalog.diffSnapshots(input));
4384
- }
4385
- async function linkProjectSources(projectPath, sourceIds) {
4386
- const resolvedProjectPath = resolve8(projectPath);
4387
- await withCatalog(({ catalog }) => {
4388
- catalog.linkProject(resolvedProjectPath, sourceIds);
4389
- });
4390
- return {
4391
- projectPath: resolvedProjectPath,
4392
- sourceIds
4393
- };
4394
- }
4395
- async function unlinkProjectSources(projectPath, sourceIds) {
4396
- const resolvedProjectPath = resolve8(projectPath);
4397
- await withCatalog(({ catalog }) => {
4398
- catalog.unlinkProject(resolvedProjectPath, sourceIds);
4399
- });
4400
- return {
4401
- projectPath: resolvedProjectPath,
4402
- sourceIds
4403
- };
4404
- }
4405
- async function searchCatalog(query, options) {
4406
- const cwd = options.project ? resolve8(options.project) : process.cwd();
4407
- const explicitSources = options.source.length > 0;
4408
- const results = await withCatalog(({ catalog }) => {
4409
- const hybridConfig = getHybridRuntimeConfig();
4410
- const scope = resolveProjectScope(cwd, catalog.listProjectLinks());
4411
- if (!explicitSources && !options.all && !scope) {
4412
- throw new AiocsError(
4413
- AIOCS_ERROR_CODES.noProjectScope,
4414
- "No linked project scope found. Use --source or --all."
4415
- );
4416
- }
4417
- return searchHybridCatalog({
4418
- catalog,
4419
- config: hybridConfig,
4420
- query,
4421
- mode: options.mode ?? hybridConfig.defaultSearchMode,
4422
- searchInput: {
4423
- cwd,
4424
- ...explicitSources ? { sourceIds: options.source } : {},
4425
- ...options.snapshot ? { snapshotId: options.snapshot } : {},
4426
- ...options.all ? { all: true } : {},
4427
- ...typeof options.limit === "number" ? { limit: options.limit } : {},
4428
- ...typeof options.offset === "number" ? { offset: options.offset } : {}
4429
- }
4430
- });
4431
- });
4432
- return {
4433
- query,
4434
- total: results.total,
4435
- limit: results.limit,
4436
- offset: results.offset,
4437
- hasMore: results.hasMore,
4438
- modeRequested: results.modeRequested,
4439
- modeUsed: results.modeUsed,
4440
- results: results.results
4441
- };
4442
- }
4443
- async function showChunk(chunkId) {
4444
- const chunk = await withCatalog(({ catalog }) => catalog.getChunkById(chunkId));
4445
- if (!chunk) {
4446
- throw new AiocsError(
4447
- AIOCS_ERROR_CODES.chunkNotFound,
4448
- `Chunk ${chunkId} not found`
4449
- );
4450
- }
4451
- return { chunk };
4452
- }
4453
- async function verifyCoverage(input) {
4454
- return withCatalog(async ({ catalog }) => {
4455
- const corpus = catalog.getCoverageCorpus({
4456
- sourceId: input.sourceId,
4457
- ...input.snapshotId ? { snapshotId: input.snapshotId } : {}
4458
- });
4459
- return verifyCoverageAgainstReferences(corpus, input.referenceFiles);
4460
- });
4461
- }
4462
- async function initManagedSources(options) {
4463
- const sourceSpecDirs = uniqueResolvedPaths(
4464
- options?.sourceSpecDirs ?? [
4465
- getBundledSourcesDir(),
4466
- getAiocsSourcesDir()
4467
- ]
4468
- );
4469
- const fetched = options?.fetch ?? false;
4470
- const userSourceDir = getAiocsSourcesDir();
4471
- return withCatalog(async ({ catalog, dataDir }) => {
4472
- const bootstrapped = await bootstrapSourceSpecs({
4473
- catalog,
4474
- sourceSpecDirs,
4475
- strictSourceSpecDirs: true
4476
- });
4477
- const fetchResults = [];
4478
- if (fetched) {
4479
- for (const source of bootstrapped.sources) {
4480
- const result = await fetchSource({
4481
- catalog,
4482
- dataDir,
4483
- sourceId: source.sourceId
4484
- });
4485
- fetchResults.push({
4486
- sourceId: source.sourceId,
4487
- snapshotId: result.snapshotId,
4488
- pageCount: result.pageCount,
4489
- reused: result.reused
4490
- });
4491
- }
4492
- await processEmbeddingJobs({
4493
- catalog,
4494
- config: getHybridRuntimeConfig()
4495
- });
4496
- }
4497
- return {
4498
- sourceSpecDirs,
4499
- userSourceDir,
4500
- fetched,
4501
- initializedSources: bootstrapped.sources,
4502
- removedSourceIds: bootstrapped.removedSourceIds,
4503
- fetchResults
4504
- };
4505
- });
4506
- }
4507
- function getManagedSourceSpecDirectories() {
4508
- return {
4509
- bundledSourceDir: getBundledSourcesDir(),
4510
- userSourceDir: getAiocsSourcesDir()
4511
- };
4512
- }
4513
- function getDoctorReport(env = process.env) {
4514
- return runDoctor(env);
4515
- }
4516
- async function exportCatalogBackup(input) {
4517
- return exportBackup({
4518
- dataDir: getAiocsDataDir(),
4519
- configDir: getAiocsConfigDir(),
4520
- outputDir: input.outputDir,
4521
- ...typeof input.replaceExisting === "boolean" ? { replaceExisting: input.replaceExisting } : {}
4522
- });
4523
- }
4524
- async function importCatalogBackup(input) {
4525
- const result = await importBackup({
4526
- inputDir: input.inputDir,
4527
- dataDir: getAiocsDataDir(),
4528
- configDir: getAiocsConfigDir(),
4529
- ...typeof input.replaceExisting === "boolean" ? { replaceExisting: input.replaceExisting } : {}
4530
- });
4531
- try {
4532
- await new AiocsVectorStore(getHybridRuntimeConfig()).clearCollection();
4533
- } catch {
4534
- }
4535
- await withCatalog(({ catalog }) => {
4536
- catalog.resetEmbeddingsAfterImport();
4537
- });
4538
- return result;
4539
- }
4540
- async function getEmbeddingStatus() {
4541
- return withCatalog(({ catalog }) => catalog.getEmbeddingOverview());
4542
- }
4543
- async function backfillEmbeddings(sourceIdOrAll) {
4544
- return withCatalog(({ catalog }) => sourceIdOrAll === "all" ? catalog.requeueLatestEmbeddingJobs() : catalog.requeueLatestEmbeddingJobs([sourceIdOrAll]));
4545
- }
4546
- async function clearEmbeddings(sourceIdOrAll) {
4547
- return withCatalog(async ({ catalog }) => {
4548
- const hybridConfig = getHybridRuntimeConfig();
4549
- const vectorStore = new AiocsVectorStore(hybridConfig);
4550
- if (sourceIdOrAll === "all") {
4551
- await vectorStore.clearCollection();
4552
- return catalog.clearEmbeddings();
4553
- }
4554
- const chunkIds = catalog.listEmbeddingChunkIds([sourceIdOrAll]);
4555
- if (chunkIds.length > 0) {
4556
- await vectorStore.deleteChunkIds(chunkIds);
4557
- }
4558
- return catalog.clearEmbeddings([sourceIdOrAll]);
4559
- });
4560
- }
4561
- async function runEmbeddingWorker() {
4562
- return withCatalog(({ catalog }) => processEmbeddingJobs({
4563
- catalog,
4564
- config: getHybridRuntimeConfig()
4565
- }));
4566
- }
4567
-
4568
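// Illustrative usage sketch (not part of the published bundle): the service helpers above each
// open and close the catalog via withCatalog, so a typical flow is register a spec, fetch it,
// then search it. The spec path is an example only.
const registered = await upsertSourceFromSpecFile("./sources/example.yaml");
await fetchSources(registered.sourceId);
const hits = await searchCatalog("rate limits", { source: [registered.sourceId], all: false });
console.log(hits.modeUsed, hits.total);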
- export {
4569
- AIOCS_ERROR_CODES,
4570
- AiocsError,
4571
- toAiocsError,
4572
- getAiocsDataDir,
4573
- getAiocsConfigDir,
4574
- openCatalog,
4575
- parseDaemonConfig,
4576
- startDaemon,
4577
- packageName,
4578
- packageVersion,
4579
- packageDescription,
4580
- upsertSourceFromSpecFile,
4581
- listSources,
4582
- fetchSources,
4583
- refreshDueSources,
4584
- runSourceCanaries,
4585
- listSnapshotsForSource,
4586
- diffSnapshotsForSource,
4587
- linkProjectSources,
4588
- unlinkProjectSources,
4589
- searchCatalog,
4590
- showChunk,
4591
- verifyCoverage,
4592
- initManagedSources,
4593
- getManagedSourceSpecDirectories,
4594
- getDoctorReport,
4595
- exportCatalogBackup,
4596
- importCatalogBackup,
4597
- getEmbeddingStatus,
4598
- backfillEmbeddings,
4599
- clearEmbeddings,
4600
- runEmbeddingWorker
4601
- };