@bodhi-ventures/aiocs 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +488 -0
- package/dist/chunk-ID3PUSMY.js +4535 -0
- package/dist/cli.js +624 -0
- package/dist/mcp-server.js +720 -0
- package/docs/2026-03-26-agent-json-and-daemon-design.md +157 -0
- package/docs/2026-03-28-hybrid-search-design.md +423 -0
- package/docs/README.md +12 -0
- package/docs/codex-integration.md +125 -0
- package/docs/examples/codex-agents/aiocs-docs-specialist.example.toml +21 -0
- package/docs/json-contract.md +524 -0
- package/docs/superpowers/specs/2026-03-29-tag-driven-release-pipeline-design.md +135 -0
- package/package.json +74 -0
- package/skills/aiocs/SKILL.md +174 -0
- package/sources/ethereal.yaml +20 -0
- package/sources/hyperliquid.yaml +20 -0
- package/sources/lighter.yaml +24 -0
- package/sources/nado.yaml +22 -0
- package/sources/synthetix.yaml +24 -0
|
@@ -0,0 +1,4535 @@
|
|
|
1
|
+
// src/errors.ts
|
|
2
|
+
/**
 * Machine-readable error codes attached to `AiocsError` instances.
 *
 * Keys are the camelCase names used inside the code base; values are the
 * SCREAMING_SNAKE_CASE strings stored in `AiocsError#code`.
 *
 * The table is frozen so a stray assignment cannot silently change a code
 * for the rest of the process.
 */
var AIOCS_ERROR_CODES = Object.freeze({
  invalidArgument: "INVALID_ARGUMENT",
  sourceNotFound: "SOURCE_NOT_FOUND",
  snapshotNotFound: "SNAPSHOT_NOT_FOUND",
  snapshotDiffBaseNotFound: "SNAPSHOT_DIFF_BASE_NOT_FOUND",
  noPagesFetched: "NO_PAGES_FETCHED",
  noProjectScope: "NO_PROJECT_SCOPE",
  chunkNotFound: "CHUNK_NOT_FOUND",
  referenceFileNotFound: "REFERENCE_FILE_NOT_FOUND",
  invalidReferenceFile: "INVALID_REFERENCE_FILE",
  authEnvMissing: "AUTH_ENV_MISSING",
  canaryFailed: "CANARY_FAILED",
  backupConflict: "BACKUP_CONFLICT",
  backupInvalid: "BACKUP_INVALID",
  backupSourceMissing: "BACKUP_SOURCE_MISSING",
  embeddingConfigInvalid: "EMBEDDING_CONFIG_INVALID",
  embeddingProviderUnavailable: "EMBEDDING_PROVIDER_UNAVAILABLE",
  vectorStoreUnavailable: "VECTOR_STORE_UNAVAILABLE",
  embeddingJobNotFound: "EMBEDDING_JOB_NOT_FOUND",
  internalError: "INTERNAL_ERROR"
});
|
|
23
|
+
/**
 * Error type that carries a machine-readable code and optional details.
 *
 * @property {string} code - Error code supplied by the caller.
 * @property {unknown} details - Optional extra context; `undefined` when omitted.
 */
var AiocsError = class AiocsError extends Error {
  code;
  details;

  /**
   * @param {string} code - Machine-readable error code.
   * @param {string} message - Human-readable description.
   * @param {unknown} [details] - Optional extra context.
   */
  constructor(code, message, details) {
    super(message);
    // The field declarations above already created `code` and `details`,
    // so own-property order stays code, details, name.
    Object.assign(this, { name: "AiocsError", code, details });
  }
};
|
|
33
|
+
/**
 * Reports whether a value is an `AiocsError` instance.
 *
 * @param {unknown} error - Any value, typically something caught.
 * @returns {boolean} `true` when `error` is an instance of `AiocsError`.
 */
function isAiocsError(error) {
  const isDomainError = error instanceof AiocsError;
  return isDomainError;
}
|
|
36
|
+
/**
 * Coerces any thrown value into an `AiocsError`.
 *
 * - An existing `AiocsError` is returned unchanged (same instance).
 * - Any other `Error` is wrapped with the `internalError` code, keeping its message.
 * - Any non-error value is wrapped with `String(value)` as the message.
 *
 * The wrapped value is attached as a non-enumerable `cause`, so the original
 * stack trace stays reachable without changing `Object.keys` or JSON output.
 *
 * @param {unknown} error - The value that was thrown or rejected.
 * @returns {AiocsError} An `AiocsError` describing `error`.
 */
function toAiocsError(error) {
  if (isAiocsError(error)) {
    return error;
  }
  const message = error instanceof Error ? error.message : String(error);
  const wrapped = new AiocsError(AIOCS_ERROR_CODES.internalError, message);
  // Same attributes the native `new Error(msg, { cause })` option uses.
  Object.defineProperty(wrapped, "cause", {
    value: error,
    writable: true,
    configurable: true,
    enumerable: false
  });
  return wrapped;
}
|
|
45
|
+
|
|
46
|
+
// src/catalog/catalog.ts
|
|
47
|
+
import { mkdirSync } from "fs";
|
|
48
|
+
import { join, resolve as resolve2 } from "path";
|
|
49
|
+
import { randomUUID } from "crypto";
|
|
50
|
+
import Database from "better-sqlite3";
|
|
51
|
+
|
|
52
|
+
// src/catalog/chunking.ts
|
|
53
|
+
// Chunk size budget in UTF-8 bytes (16 KiB), used by the chunking functions below.
var MAX_CHUNK_BYTES = 16 * 1024;
// Matches an ATX markdown heading: group 1 is the run of 1-6 `#`, group 2 the heading text.
var HEADING_PATTERN = /^(#{1,6})\s+(.*)$/;
|
|
55
|
+
/**
 * Measures a string in UTF-8 bytes rather than UTF-16 code units.
 *
 * @param {string} value - Text to measure.
 * @returns {number} Number of bytes `value` occupies when encoded as UTF-8.
 */
function byteLength(value) {
  const encoding = "utf8";
  return Buffer.byteLength(value, encoding);
}
|
|
58
|
+
/**
 * Splits one markdown section into consecutive chunks on line boundaries.
 *
 * Lines are accumulated until adding the next one would push the buffer past
 * `MAX_CHUNK_BYTES`; the buffer is then emitted and a new one started.
 * A single line is never split, so a line that is itself larger than the
 * budget yields a chunk larger than the budget.
 *
 * @param {string} sectionTitle - Title recorded on every produced chunk.
 * @param {string} markdown - Section text to split.
 * @param {number} startOrder - `chunkOrder` assigned to the first produced chunk.
 * @returns {Array<{sectionTitle: string, markdown: string, chunkOrder: number}>}
 *   Chunks in document order; whitespace-only buffers are dropped.
 */
function splitLargeSection(sectionTitle, markdown, startOrder) {
  const pieces = [];
  let buffer = "";

  // Emits the buffered text (trimmed) as the next chunk and resets the buffer.
  const emit = () => {
    const text = buffer.trim();
    buffer = "";
    if (text.length > 0) {
      pieces.push({
        sectionTitle,
        markdown: text,
        chunkOrder: startOrder + pieces.length
      });
    }
  };

  for (const line of markdown.split("\n")) {
    // Only flush when something is already buffered; an oversized first line is kept whole.
    if (buffer && byteLength(`${buffer}\n${line}`) > MAX_CHUNK_BYTES) {
      emit();
    }
    buffer = buffer ? `${buffer}\n${line}` : line;
  }
  emit();
  return pieces;
}
|
|
89
|
+
/**
 * Breaks a page's markdown into ordered chunks.
 *
 * - Blank input produces no chunks.
 * - Input that fits in `MAX_CHUNK_BYTES` becomes a single chunk titled with the page title.
 * - Otherwise the text is cut at headings of level 2 or deeper; each section that
 *   still exceeds the budget is passed to `splitLargeSection`.
 *
 * Level-1 headings do not start a new section. Heading detection is purely
 * line-based, so a matching line inside a fenced code block also starts a section.
 *
 * @param {string} pageTitle - Title used for content that precedes the first
 *   section heading, and as the fallback when a heading has no text.
 * @param {string} markdown - Full page markdown.
 * @returns {Array<{sectionTitle: string, markdown: string, chunkOrder: number}>}
 *   Chunks with `chunkOrder` running 0..n-1 in document order.
 */
function chunkMarkdown(pageTitle, markdown) {
  const body = markdown.trim();
  if (!body) {
    return [];
  }
  if (byteLength(body) <= MAX_CHUNK_BYTES) {
    return [{ sectionTitle: pageTitle, markdown: body, chunkOrder: 0 }];
  }

  // Pass 1: group lines into sections delimited by level >= 2 headings.
  const sections = [];
  let heading = pageTitle;
  let pendingLines = [];
  const closeSection = () => {
    const text = pendingLines.join("\n").trim();
    pendingLines = [];
    if (text) {
      sections.push({ title: heading, markdown: text });
    }
  };
  for (const rawLine of body.split("\n")) {
    const found = HEADING_PATTERN.exec(rawLine.trim());
    if (found !== null && found[1].length >= 2) {
      closeSection();
      heading = found[2].trim() || pageTitle;
    }
    // The heading line itself belongs to the section it opens.
    pendingLines.push(rawLine);
  }
  closeSection();

  // Pass 2: emit sections as chunks; the next chunkOrder is always the count so far.
  const result = [];
  for (const section of sections) {
    if (byteLength(section.markdown) > MAX_CHUNK_BYTES) {
      result.push(...splitLargeSection(section.title, section.markdown, result.length));
    } else {
      result.push({
        sectionTitle: section.title,
        markdown: section.markdown,
        chunkOrder: result.length
      });
    }
  }
  return result;
}
|
|
137
|
+
|
|
138
|
+
// src/catalog/fingerprint.ts
|
|
139
|
+
import { createHash } from "crypto";
|
|
140
|
+
/**
 * Computes the SHA-256 digest of a value.
 *
 * @param {string | Buffer | Uint8Array} value - Data to hash.
 * @returns {string} Lowercase hexadecimal digest.
 */
function sha256(value) {
  const hasher = createHash("sha256");
  hasher.update(value);
  return hasher.digest("hex");
}
|
|
143
|
+
/**
 * Derives a fingerprint for a snapshot from its source id, config hash and pages.
 *
 * Pages are sorted by URL on a copy before hashing, so the order in which they
 * were supplied does not affect the result and `input.pages` is not mutated.
 *
 * NOTE(review): the sort uses `localeCompare`, so ordering follows the runtime's
 * default collation — confirm this is stable enough across environments.
 *
 * @param {{sourceId: string, configHash: string, pages: Iterable<{url: string}>}} input
 *   Snapshot description; every page object is serialized in full.
 * @returns {string} Hex SHA-256 of the JSON-serialized, URL-sorted payload.
 */
function buildSnapshotFingerprint(input) {
  const orderedPages = [...input.pages];
  orderedPages.sort((a, b) => a.url.localeCompare(b.url));
  const { sourceId, configHash } = input;
  const serialized = JSON.stringify({ sourceId, configHash, pages: orderedPages });
  return sha256(serialized);
}
|
|
152
|
+
|
|
153
|
+
// src/catalog/project-scope.ts
|
|
154
|
+
import { realpathSync } from "fs";
|
|
155
|
+
import { resolve } from "path";
|
|
156
|
+
/**
 * Tests whether `candidate` is `root` itself or a path nested below it.
 *
 * The comparison is textual and segment-aware: `/work/app2` is not within
 * `/work/app`. A root that already ends in `/` (notably the filesystem root
 * `/`) is handled without doubling the separator; previously the prefix became
 * `//`, so nothing was ever considered to be within `/`.
 *
 * NOTE(review): the separator is hard-coded to `/`; paths using `\` as the
 * separator would never match as descendants — confirm whether that matters.
 *
 * @param {string} candidate - Path to test.
 * @param {string} root - Directory that may contain `candidate`.
 * @returns {boolean} `true` when `candidate` equals `root` or lies beneath it.
 */
function isWithin(candidate, root) {
  if (candidate === root) {
    return true;
  }
  const prefix = root.endsWith("/") ? root : `${root}/`;
  return candidate.startsWith(prefix);
}
|
|
159
|
+
/**
 * Produces a canonical absolute form of a project path.
 *
 * The path is made absolute first, then passed to `realpathSync.native`.
 * If that call throws, the absolute path is returned as a best-effort fallback.
 *
 * @param {string} path - Absolute or relative path.
 * @returns {string} The real path when it can be resolved, otherwise the absolute path.
 */
function canonicalizeProjectPath(path) {
  const absolutePath = resolve(path);
  let canonical = absolutePath;
  try {
    canonical = realpathSync.native(absolutePath);
  } catch {
    // Deliberate best effort: keep the absolute path when realpath fails.
  }
  return canonical;
}
|
|
167
|
+
/**
 * Finds the most specific project scope that contains a working directory.
 *
 * Both `cwd` and every scope path are canonicalized before comparison. Among
 * the scopes that contain `cwd`, the one with the longest path wins; on a tie
 * the scope that appears first in `scopes` is kept.
 *
 * @param {string} cwd - Working directory to locate.
 * @param {Array<{projectPath: string, sourceIds: string[]}>} scopes - Known project scopes.
 * @returns {{projectPath: string, sourceIds: string[]} | null}
 *   A fresh object with the canonical path and a copy of the source ids, or
 *   `null` when no scope contains `cwd`.
 */
function resolveProjectScope(cwd, scopes) {
  const target = canonicalizeProjectPath(cwd);
  const candidates = scopes.map((scope) => ({
    projectPath: canonicalizeProjectPath(scope.projectPath),
    sourceIds: [...scope.sourceIds]
  }));

  let best = null;
  for (const candidate of candidates) {
    if (!isWithin(target, candidate.projectPath)) {
      continue;
    }
    // Strictly longer only, so the earliest scope wins a tie.
    if (best === null || candidate.projectPath.length > best.projectPath.length) {
      best = candidate;
    }
  }
  return best;
}
|
|
175
|
+
|
|
176
|
+
// src/spec/source-spec.ts
|
|
177
|
+
import { readFile } from "fs/promises";
|
|
178
|
+
import { extname } from "path";
|
|
179
|
+
import YAML from "yaml";
|
|
180
|
+
import { z } from "zod";
|
|
181
|
+
// A URL pattern is any non-empty string.
var patternSchema = z.string().min(1);

// One scripted page interaction, discriminated by its `action`.
var interactionSchema = z.discriminatedUnion("action", [
  // `hover` and `click` share one shape: a target selector plus an optional timeout.
  ...["hover", "click"].map((action) => z.object({
    action: z.literal(action),
    selector: z.string().min(1),
    timeoutMs: z.number().int().positive().optional()
  })),
  // `press` sends a single key.
  z.object({
    action: z.literal("press"),
    key: z.string().min(1)
  }),
  // `wait` requires an explicit timeout.
  z.object({
    action: z.literal("wait"),
    timeoutMs: z.number().int().positive()
  })
]);

// Extraction strategy "clipboardButton": at least one interaction, clipboard timeout defaults to 10 000 ms.
var clipboardExtractSchema = z.object({
  strategy: z.literal("clipboardButton"),
  interactions: z.array(interactionSchema).min(1),
  clipboardTimeoutMs: z.number().int().positive().default(10000)
});

// Extraction strategy "selector": requires a non-empty selector.
var selectorExtractSchema = z.object({
  strategy: z.literal("selector"),
  selector: z.string().min(1)
});

// Extraction strategy "readability": takes no further options.
var readabilityExtractSchema = z.object({
  strategy: z.literal("readability")
});

// Authenticated header whose value is named by an environment variable;
// `hosts` and `include` are optional but must be non-empty when given.
var authHeaderSchema = z.object({
  name: z.string().min(1),
  valueFromEnv: z.string().min(1),
  hosts: z.array(z.string().min(1)).min(1).optional(),
  include: z.array(patternSchema).min(1).optional()
});

// Authenticated cookie whose value is named by an environment variable; `path` defaults to "/".
var authCookieSchema = z.object({
  name: z.string().min(1),
  valueFromEnv: z.string().min(1),
  domain: z.string().min(1),
  path: z.string().min(1).default("/"),
  secure: z.boolean().optional(),
  httpOnly: z.boolean().optional(),
  sameSite: z.enum(["Strict", "Lax", "None"]).optional()
});

// A canary check: a URL plus optional expected title/text; minimum markdown length defaults to 40.
var canaryCheckSchema = z.object({
  url: z.string().url(),
  expectedTitle: z.string().min(1).optional(),
  expectedText: z.string().min(1).optional(),
  minMarkdownLength: z.number().int().positive().default(40)
});
|
|
235
|
+
/**
 * Schema for a complete source specification.
 *
 * Beyond per-field validation, a cross-field rule requires every host listed
 * on an authenticated header to appear in `allowedHosts`.
 */
var sourceSpecSchema = z.object({
  // Lowercase letters, digits and dashes only.
  id: z.string().min(1).regex(/^[a-z0-9-]+$/),
  label: z.string().min(1),
  startUrls: z.array(z.string().url()).min(1),
  allowedHosts: z.array(z.string().min(1)).min(1),
  discovery: z.object({
    include: z.array(patternSchema).min(1),
    exclude: z.array(patternSchema),
    maxPages: z.number().int().positive()
  }),
  extract: z.discriminatedUnion("strategy", [
    clipboardExtractSchema,
    selectorExtractSchema,
    readabilityExtractSchema
  ]),
  normalize: z.object({
    prependSourceComment: z.boolean().default(true)
  }),
  schedule: z.object({
    everyHours: z.number().int().positive()
  }),
  auth: z.object({
    headers: z.array(authHeaderSchema).default([]),
    cookies: z.array(authCookieSchema).default([])
  }).optional(),
  canary: z.object({
    everyHours: z.number().int().positive().optional(),
    checks: z.array(canaryCheckSchema).min(1)
  }).optional()
}).superRefine((spec, context) => {
  const headers = spec.auth?.headers ?? [];
  headers.forEach((header, index) => {
    // Headers without an explicit host list have nothing to cross-check.
    for (const host of header.hosts ?? []) {
      if (spec.allowedHosts.includes(host)) {
        continue;
      }
      context.addIssue({
        code: z.ZodIssueCode.custom,
        path: ["auth", "headers", index, "hosts"],
        message: `Authenticated header host '${host}' must be included in allowedHosts`
      });
    }
  });
});
|
|
280
|
+
/**
 * Parses the raw text of a source spec file.
 *
 * @param {string} raw - File contents.
 * @param {string} ext - File extension including the dot; only the exact value
 *   `".json"` selects JSON, anything else is parsed as YAML.
 * @returns {unknown} The parsed, not yet validated document.
 */
function parseSourceSpec(raw, ext) {
  const parser = ext === ".json" ? JSON : YAML;
  return parser.parse(raw);
}
|
|
286
|
+
/**
 * Reads, parses and validates a source spec file.
 *
 * The parser is selected from the lower-cased file extension and the parsed
 * document is validated with `sourceSpecSchema`.
 *
 * @param {string} path - Location of the spec file.
 * @returns {Promise<object>} The validated spec as returned by `sourceSpecSchema.parse`.
 * @throws Rejects when the file cannot be read, parsed, or fails validation.
 */
async function loadSourceSpec(path) {
  const contents = await readFile(path, "utf8");
  const extension = extname(path).toLowerCase();
  const document = parseSourceSpec(contents, extension);
  return sourceSpecSchema.parse(document);
}
|
|
291
|
+
/**
 * Resolves the effective canary configuration for a source spec.
 *
 * - `everyHours`: the configured canary interval, otherwise the fetch schedule
 *   interval clamped to the range 1..6.
 * - `checks`: the configured checks, otherwise one check against the first
 *   start URL with a minimum markdown length of 40.
 *
 * @param {object} spec - Source spec; reads `canary`, `schedule.everyHours` and `startUrls`.
 * @returns {{everyHours: number, checks: Array<object>}} Effective canary settings.
 */
function resolveSourceCanary(spec) {
  const configured = spec.canary;

  let everyHours = configured?.everyHours;
  if (everyHours == null) {
    const cappedAtSix = Math.min(spec.schedule.everyHours, 6);
    everyHours = Math.max(1, cappedAtSix);
  }

  let checks = configured?.checks;
  if (checks == null) {
    checks = [
      {
        url: spec.startUrls[0],
        minMarkdownLength: 40
      }
    ];
  }

  return { everyHours, checks };
}
|
|
302
|
+
|
|
303
|
+
// src/catalog/catalog.ts
|
|
304
|
+
/**
 * Creates the catalog schema if it does not exist and adds missing `sources` columns.
 *
 * Step 1 runs one idempotent script (`IF NOT EXISTS` throughout) that creates
 * all tables, the FTS5 index over `chunks` with its sync triggers, and the
 * embedding indexes.
 * Step 2 reads `PRAGMA table_info(sources)` and adds any of six nullable TEXT
 * columns that the existing table lacks.
 *
 * @param {object} db - Database handle exposing `exec(sql)` and `prepare(sql).all()`.
 * @returns {void}
 */
function initSchema(db) {
  const schemaSql = `
    PRAGMA foreign_keys = ON;

    CREATE TABLE IF NOT EXISTS sources (
      id TEXT PRIMARY KEY,
      label TEXT NOT NULL,
      spec_json TEXT NOT NULL,
      spec_path TEXT,
      config_hash TEXT NOT NULL,
      created_at TEXT NOT NULL,
      updated_at TEXT NOT NULL,
      last_checked_at TEXT,
      last_successful_snapshot_at TEXT,
      last_successful_snapshot_id TEXT,
      last_canary_checked_at TEXT,
      last_successful_canary_at TEXT,
      last_canary_status TEXT,
      next_canary_due_at TEXT,
      next_due_at TEXT NOT NULL
    );

    CREATE TABLE IF NOT EXISTS snapshots (
      id TEXT PRIMARY KEY,
      source_id TEXT NOT NULL REFERENCES sources(id) ON DELETE CASCADE,
      fingerprint TEXT NOT NULL,
      config_hash TEXT NOT NULL,
      detected_version TEXT,
      page_count INTEGER NOT NULL,
      created_at TEXT NOT NULL,
      UNIQUE(source_id, fingerprint)
    );

    CREATE TABLE IF NOT EXISTS pages (
      id INTEGER PRIMARY KEY AUTOINCREMENT,
      snapshot_id TEXT NOT NULL REFERENCES snapshots(id) ON DELETE CASCADE,
      url TEXT NOT NULL,
      title TEXT NOT NULL,
      markdown TEXT NOT NULL,
      content_hash TEXT NOT NULL,
      UNIQUE(snapshot_id, url)
    );

    CREATE TABLE IF NOT EXISTS chunks (
      id INTEGER PRIMARY KEY AUTOINCREMENT,
      source_id TEXT NOT NULL REFERENCES sources(id) ON DELETE CASCADE,
      snapshot_id TEXT NOT NULL REFERENCES snapshots(id) ON DELETE CASCADE,
      page_id INTEGER NOT NULL REFERENCES pages(id) ON DELETE CASCADE,
      page_url TEXT NOT NULL,
      page_title TEXT NOT NULL,
      section_title TEXT NOT NULL,
      chunk_order INTEGER NOT NULL,
      markdown TEXT NOT NULL
    );

    CREATE VIRTUAL TABLE IF NOT EXISTS chunks_fts USING fts5(
      page_title,
      section_title,
      markdown,
      content=chunks,
      content_rowid=id,
      tokenize='porter unicode61'
    );

    CREATE TRIGGER IF NOT EXISTS chunks_ai AFTER INSERT ON chunks BEGIN
      INSERT INTO chunks_fts(rowid, page_title, section_title, markdown)
      VALUES (new.id, new.page_title, new.section_title, new.markdown);
    END;

    CREATE TRIGGER IF NOT EXISTS chunks_ad AFTER DELETE ON chunks BEGIN
      INSERT INTO chunks_fts(chunks_fts, rowid, page_title, section_title, markdown)
      VALUES ('delete', old.id, old.page_title, old.section_title, old.markdown);
    END;

    CREATE TRIGGER IF NOT EXISTS chunks_au AFTER UPDATE ON chunks BEGIN
      INSERT INTO chunks_fts(chunks_fts, rowid, page_title, section_title, markdown)
      VALUES ('delete', old.id, old.page_title, old.section_title, old.markdown);
      INSERT INTO chunks_fts(rowid, page_title, section_title, markdown)
      VALUES (new.id, new.page_title, new.section_title, new.markdown);
    END;

    CREATE TABLE IF NOT EXISTS fetch_runs (
      id TEXT PRIMARY KEY,
      source_id TEXT NOT NULL REFERENCES sources(id) ON DELETE CASCADE,
      status TEXT NOT NULL CHECK(status IN ('success', 'failed')),
      error_message TEXT,
      snapshot_id TEXT REFERENCES snapshots(id) ON DELETE SET NULL,
      started_at TEXT NOT NULL,
      finished_at TEXT NOT NULL
    );

    CREATE TABLE IF NOT EXISTS canary_runs (
      id TEXT PRIMARY KEY,
      source_id TEXT NOT NULL REFERENCES sources(id) ON DELETE CASCADE,
      status TEXT NOT NULL CHECK(status IN ('pass', 'fail')),
      checked_at TEXT NOT NULL,
      details_json TEXT NOT NULL
    );

    CREATE TABLE IF NOT EXISTS project_links (
      project_path TEXT NOT NULL,
      source_id TEXT NOT NULL REFERENCES sources(id) ON DELETE CASCADE,
      created_at TEXT NOT NULL,
      PRIMARY KEY(project_path, source_id)
    );

    CREATE TABLE IF NOT EXISTS daemon_state (
      singleton_id INTEGER PRIMARY KEY CHECK(singleton_id = 1),
      last_started_at TEXT,
      last_cycle_started_at TEXT,
      last_cycle_completed_at TEXT,
      last_cycle_status TEXT,
      interval_minutes INTEGER,
      fetch_on_start INTEGER
    );

    CREATE TABLE IF NOT EXISTS embedding_state (
      chunk_id INTEGER PRIMARY KEY REFERENCES chunks(id) ON DELETE CASCADE,
      source_id TEXT NOT NULL REFERENCES sources(id) ON DELETE CASCADE,
      snapshot_id TEXT NOT NULL REFERENCES snapshots(id) ON DELETE CASCADE,
      content_hash TEXT NOT NULL,
      model_key TEXT,
      status TEXT NOT NULL CHECK(status IN ('pending', 'indexed', 'failed', 'stale')),
      vector_point_id TEXT,
      last_attempted_at TEXT,
      indexed_at TEXT,
      error_message TEXT
    );

    CREATE TABLE IF NOT EXISTS embedding_jobs (
      source_id TEXT NOT NULL REFERENCES sources(id) ON DELETE CASCADE,
      snapshot_id TEXT NOT NULL REFERENCES snapshots(id) ON DELETE CASCADE,
      status TEXT NOT NULL CHECK(status IN ('pending', 'running', 'succeeded', 'failed')),
      attempt_count INTEGER NOT NULL DEFAULT 0,
      chunk_count INTEGER NOT NULL,
      created_at TEXT NOT NULL,
      updated_at TEXT NOT NULL,
      claimed_at TEXT,
      completed_at TEXT,
      error_message TEXT,
      PRIMARY KEY(source_id, snapshot_id)
    );

    CREATE INDEX IF NOT EXISTS idx_embedding_jobs_status_updated
      ON embedding_jobs(status, updated_at, source_id, snapshot_id);

    CREATE INDEX IF NOT EXISTS idx_embedding_state_source_snapshot
      ON embedding_state(source_id, snapshot_id, status);
  `;
  db.exec(schemaSql);

  // Columns an already-existing `sources` table may lack; each is added as
  // nullable TEXT, in this order.
  const migratedColumns = [
    "spec_path",
    "last_successful_snapshot_at",
    "last_canary_checked_at",
    "last_successful_canary_at",
    "last_canary_status",
    "next_canary_due_at"
  ];
  const existingColumns = new Set(
    db.prepare("PRAGMA table_info(sources)").all().map((column) => column.name)
  );
  for (const columnName of migratedColumns) {
    if (!existingColumns.has(columnName)) {
      db.exec(`ALTER TABLE sources ADD COLUMN ${columnName} TEXT`);
    }
  }
}
|
|
473
|
+
/**
 * Returns the current instant as an ISO 8601 string.
 *
 * @returns {string} Current time as produced by `Date#toISOString` (UTC).
 */
function nowIso() {
  const now = new Date();
  return now.toISOString();
}
|
|
476
|
+
/**
 * Returns the instant a number of hours from now as an ISO 8601 string.
 *
 * @param {number} hours - Offset from the current time, in hours.
 * @returns {string} `Date#toISOString` form (UTC) of now plus `hours`.
 */
function addHoursIso(hours) {
  const offsetMs = hours * 60 * 60 * 1e3;
  const due = new Date(Date.now() + offsetMs);
  return due.toISOString();
}
|
|
479
|
+
/**
 * Serializes a value to JSON-like text with object keys in sorted order.
 *
 * Arrays keep their element order; objects are emitted with keys sorted via
 * `localeCompare`; everything else is delegated to `JSON.stringify`.
 *
 * @param {unknown} value - Value to serialize.
 * @returns {string} Text that does not depend on key insertion order.
 */
function stableStringify(value) {
  if (Array.isArray(value)) {
    const items = value.map((entry) => stableStringify(entry));
    return `[${items.join(",")}]`;
  }
  // Primitives and null go straight to JSON.stringify.
  if (!value || typeof value !== "object") {
    return JSON.stringify(value);
  }
  const sortedEntries = Object.entries(value).sort(
    ([left], [right]) => left.localeCompare(right)
  );
  const members = [];
  for (const [key, entry] of sortedEntries) {
    members.push(`${JSON.stringify(key)}:${stableStringify(entry)}`);
  }
  return `{${members.join(",")}}`;
}
|
|
491
|
+
/**
 * Reduces a free-text query to plain words.
 *
 * Every run of characters that is not a Unicode letter or number becomes a
 * single space, which strips punctuation such as quotes, `*`, `:`, `-` and
 * parentheses. The remaining words are joined with single spaces.
 *
 * The bare words `AND`, `OR` and `NOT` (upper case) are lower-cased, because
 * FTS5 treats them as operators. Without this, a query consisting of `NOT`, or
 * one ending in `OR`, makes the `MATCH` statement fail with a syntax error.
 *
 * @param {string} query - Raw user query.
 * @returns {string} Space-separated words; empty when the query has no letters or digits.
 */
function normalizeQuery(query) {
  const reservedOperators = new Set(["AND", "OR", "NOT"]);
  return query
    .replace(/[^\p{L}\p{N}]+/gu, " ")
    .split(/\s+/)
    .filter(Boolean)
    .map((word) => (reservedOperators.has(word) ? word.toLowerCase() : word))
    .join(" ");
}
|
|
495
|
+
/**
 * Validates a pagination argument.
 *
 * @param {number | undefined} value - Caller-supplied value; `undefined` selects the fallback.
 * @param {string} field - Argument name used in error messages; `"limit"` additionally rejects zero.
 * @param {number} fallback - Returned as-is, without validation, when `value` is `undefined`.
 * @returns {number} `fallback` or the validated `value`.
 * @throws {AiocsError} With code `invalidArgument` when `value` is not a
 *   non-negative integer, or when `field` is `"limit"` and `value` is zero.
 */
function assertPaginationValue(value, field, fallback) {
  if (value === undefined) {
    return fallback;
  }
  const isNonNegativeInteger = Number.isInteger(value) && value >= 0;
  if (!isNonNegativeInteger) {
    throw new AiocsError(
      AIOCS_ERROR_CODES.invalidArgument,
      `${field} must be a non-negative integer`
    );
  }
  // Zero is a valid offset but not a valid page size.
  if (value === 0 && field === "limit") {
    throw new AiocsError(
      AIOCS_ERROR_CODES.invalidArgument,
      "limit must be greater than zero"
    );
  }
  return value;
}
|
|
513
|
+
function openCatalog(options) {
|
|
514
|
+
const dataDir = resolve2(options.dataDir);
|
|
515
|
+
mkdirSync(dataDir, { recursive: true });
|
|
516
|
+
const db = new Database(join(dataDir, "catalog.sqlite"));
|
|
517
|
+
initSchema(db);
|
|
518
|
+
const listProjectLinks = () => {
|
|
519
|
+
const rows = db.prepare("SELECT project_path, source_id FROM project_links ORDER BY project_path, source_id").all();
|
|
520
|
+
const grouped = /* @__PURE__ */ new Map();
|
|
521
|
+
for (const row of rows) {
|
|
522
|
+
const current = grouped.get(row.project_path) ?? [];
|
|
523
|
+
current.push(row.source_id);
|
|
524
|
+
grouped.set(row.project_path, current);
|
|
525
|
+
}
|
|
526
|
+
return [...grouped.entries()].map(([projectPath, sourceIds]) => ({ projectPath, sourceIds }));
|
|
527
|
+
};
|
|
528
|
+
const resolveSearchScope = (input) => {
|
|
529
|
+
const limit = assertPaginationValue(input.limit, "limit", 20);
|
|
530
|
+
const offset = assertPaginationValue(input.offset, "offset", 0);
|
|
531
|
+
let sourceIds = input.sourceIds ? [...input.sourceIds] : void 0;
|
|
532
|
+
if (!sourceIds || sourceIds.length === 0) {
|
|
533
|
+
if (input.cwd) {
|
|
534
|
+
const scope = resolveProjectScope(
|
|
535
|
+
input.cwd,
|
|
536
|
+
listProjectLinks().map((link) => ({
|
|
537
|
+
projectPath: link.projectPath,
|
|
538
|
+
sourceIds: link.sourceIds
|
|
539
|
+
}))
|
|
540
|
+
);
|
|
541
|
+
if (scope) {
|
|
542
|
+
sourceIds = scope.sourceIds;
|
|
543
|
+
}
|
|
544
|
+
}
|
|
545
|
+
}
|
|
546
|
+
if ((!sourceIds || sourceIds.length === 0) && !input.all) {
|
|
547
|
+
return {
|
|
548
|
+
limit,
|
|
549
|
+
offset,
|
|
550
|
+
sourceIds: null,
|
|
551
|
+
snapshotIds: []
|
|
552
|
+
};
|
|
553
|
+
}
|
|
554
|
+
const filterSourceIds = sourceIds && sourceIds.length > 0 ? [...new Set(sourceIds)] : null;
|
|
555
|
+
const latestSnapshotIds = input.snapshotId ? [input.snapshotId] : db.prepare(`
|
|
556
|
+
SELECT last_successful_snapshot_id AS snapshot_id
|
|
557
|
+
FROM sources
|
|
558
|
+
WHERE last_successful_snapshot_id IS NOT NULL
|
|
559
|
+
${filterSourceIds ? `AND id IN (${filterSourceIds.map(() => "?").join(",")})` : ""}
|
|
560
|
+
`).all(...filterSourceIds ?? []).map((row) => row.snapshot_id);
|
|
561
|
+
return {
|
|
562
|
+
limit,
|
|
563
|
+
offset,
|
|
564
|
+
sourceIds: filterSourceIds,
|
|
565
|
+
snapshotIds: latestSnapshotIds
|
|
566
|
+
};
|
|
567
|
+
};
|
|
568
|
+
const searchLexicalByScope = (input) => {
|
|
569
|
+
const normalized = normalizeQuery(input.query);
|
|
570
|
+
const limit = assertPaginationValue(input.limit, "limit", input.scope.limit);
|
|
571
|
+
const offset = assertPaginationValue(input.offset, "offset", input.scope.offset);
|
|
572
|
+
if (!normalized || input.scope.snapshotIds.length === 0) {
|
|
573
|
+
return {
|
|
574
|
+
total: 0,
|
|
575
|
+
limit,
|
|
576
|
+
offset,
|
|
577
|
+
hasMore: false,
|
|
578
|
+
results: []
|
|
579
|
+
};
|
|
580
|
+
}
|
|
581
|
+
const whereSnapshotPlaceholders = input.scope.snapshotIds.map(() => "?").join(",");
|
|
582
|
+
const sourceSql = input.scope.sourceIds ? `AND c.source_id IN (${input.scope.sourceIds.map(() => "?").join(",")})` : "";
|
|
583
|
+
const queryArgs = [
|
|
584
|
+
normalized,
|
|
585
|
+
...input.scope.snapshotIds,
|
|
586
|
+
...input.scope.sourceIds ?? []
|
|
587
|
+
];
|
|
588
|
+
const totalRow = db.prepare(`
|
|
589
|
+
SELECT COUNT(*) AS total
|
|
590
|
+
FROM chunks_fts
|
|
591
|
+
JOIN chunks c ON c.id = chunks_fts.rowid
|
|
592
|
+
WHERE chunks_fts MATCH ?
|
|
593
|
+
AND c.snapshot_id IN (${whereSnapshotPlaceholders})
|
|
594
|
+
${sourceSql}
|
|
595
|
+
`).get(...queryArgs);
|
|
596
|
+
const rows = db.prepare(`
|
|
597
|
+
SELECT
|
|
598
|
+
c.id AS chunk_id,
|
|
599
|
+
c.source_id,
|
|
600
|
+
c.snapshot_id,
|
|
601
|
+
c.page_url,
|
|
602
|
+
c.page_title,
|
|
603
|
+
c.section_title,
|
|
604
|
+
c.markdown
|
|
605
|
+
FROM chunks_fts
|
|
606
|
+
JOIN chunks c ON c.id = chunks_fts.rowid
|
|
607
|
+
WHERE chunks_fts MATCH ?
|
|
608
|
+
AND c.snapshot_id IN (${whereSnapshotPlaceholders})
|
|
609
|
+
${sourceSql}
|
|
610
|
+
ORDER BY bm25(chunks_fts), c.id
|
|
611
|
+
LIMIT ?
|
|
612
|
+
OFFSET ?
|
|
613
|
+
`).all(...queryArgs, limit, offset);
|
|
614
|
+
const results = rows.map((row) => ({
|
|
615
|
+
chunkId: row.chunk_id,
|
|
616
|
+
sourceId: row.source_id,
|
|
617
|
+
snapshotId: row.snapshot_id,
|
|
618
|
+
pageUrl: row.page_url,
|
|
619
|
+
pageTitle: row.page_title,
|
|
620
|
+
sectionTitle: row.section_title,
|
|
621
|
+
markdown: row.markdown
|
|
622
|
+
}));
|
|
623
|
+
return {
|
|
624
|
+
total: totalRow.total,
|
|
625
|
+
limit,
|
|
626
|
+
offset,
|
|
627
|
+
hasMore: offset + results.length < totalRow.total,
|
|
628
|
+
results
|
|
629
|
+
};
|
|
630
|
+
};
|
|
631
|
+
const listLatestSnapshots = (sourceIds) => {
|
|
632
|
+
const filterSourceIds = sourceIds && sourceIds.length > 0 ? [...new Set(sourceIds)] : null;
|
|
633
|
+
const rows = db.prepare(`
|
|
634
|
+
SELECT id AS source_id, last_successful_snapshot_id AS snapshot_id
|
|
635
|
+
FROM sources
|
|
636
|
+
WHERE last_successful_snapshot_id IS NOT NULL
|
|
637
|
+
${filterSourceIds ? `AND id IN (${filterSourceIds.map(() => "?").join(",")})` : ""}
|
|
638
|
+
ORDER BY id
|
|
639
|
+
`).all(...filterSourceIds ?? []);
|
|
640
|
+
return rows.map((row) => ({
|
|
641
|
+
sourceId: row.source_id,
|
|
642
|
+
snapshotId: row.snapshot_id
|
|
643
|
+
}));
|
|
644
|
+
};
|
|
645
|
+
const queueEmbeddingJobForSnapshot = (sourceId, snapshotId, previousLatestSnapshotId) => {
|
|
646
|
+
const timestamp = nowIso();
|
|
647
|
+
if (previousLatestSnapshotId && previousLatestSnapshotId !== snapshotId) {
|
|
648
|
+
db.prepare(`
|
|
649
|
+
UPDATE embedding_state
|
|
650
|
+
SET
|
|
651
|
+
status = 'stale',
|
|
652
|
+
vector_point_id = NULL,
|
|
653
|
+
indexed_at = NULL,
|
|
654
|
+
error_message = NULL
|
|
655
|
+
WHERE source_id = ?
|
|
656
|
+
AND snapshot_id = ?
|
|
657
|
+
`).run(sourceId, previousLatestSnapshotId);
|
|
658
|
+
db.prepare(`
|
|
659
|
+
DELETE FROM embedding_jobs
|
|
660
|
+
WHERE source_id = ?
|
|
661
|
+
AND snapshot_id = ?
|
|
662
|
+
`).run(sourceId, previousLatestSnapshotId);
|
|
663
|
+
}
|
|
664
|
+
const chunkRows = db.prepare(`
|
|
665
|
+
SELECT id, markdown
|
|
666
|
+
FROM chunks
|
|
667
|
+
WHERE source_id = ?
|
|
668
|
+
AND snapshot_id = ?
|
|
669
|
+
ORDER BY id
|
|
670
|
+
`).all(sourceId, snapshotId);
|
|
671
|
+
const upsertState = db.prepare(`
|
|
672
|
+
INSERT INTO embedding_state (
|
|
673
|
+
chunk_id,
|
|
674
|
+
source_id,
|
|
675
|
+
snapshot_id,
|
|
676
|
+
content_hash,
|
|
677
|
+
model_key,
|
|
678
|
+
status,
|
|
679
|
+
vector_point_id,
|
|
680
|
+
last_attempted_at,
|
|
681
|
+
indexed_at,
|
|
682
|
+
error_message
|
|
683
|
+
) VALUES (?, ?, ?, ?, NULL, 'pending', NULL, NULL, NULL, NULL)
|
|
684
|
+
ON CONFLICT(chunk_id) DO UPDATE SET
|
|
685
|
+
source_id = excluded.source_id,
|
|
686
|
+
snapshot_id = excluded.snapshot_id,
|
|
687
|
+
content_hash = excluded.content_hash,
|
|
688
|
+
model_key = CASE
|
|
689
|
+
WHEN embedding_state.status = 'indexed' AND embedding_state.content_hash = excluded.content_hash
|
|
690
|
+
THEN embedding_state.model_key
|
|
691
|
+
ELSE NULL
|
|
692
|
+
END,
|
|
693
|
+
status = CASE
|
|
694
|
+
WHEN embedding_state.status = 'indexed' AND embedding_state.content_hash = excluded.content_hash
|
|
695
|
+
THEN 'indexed'
|
|
696
|
+
ELSE 'pending'
|
|
697
|
+
END,
|
|
698
|
+
vector_point_id = CASE
|
|
699
|
+
WHEN embedding_state.status = 'indexed' AND embedding_state.content_hash = excluded.content_hash
|
|
700
|
+
THEN embedding_state.vector_point_id
|
|
701
|
+
ELSE NULL
|
|
702
|
+
END,
|
|
703
|
+
last_attempted_at = CASE
|
|
704
|
+
WHEN embedding_state.status = 'indexed' AND embedding_state.content_hash = excluded.content_hash
|
|
705
|
+
THEN embedding_state.last_attempted_at
|
|
706
|
+
ELSE NULL
|
|
707
|
+
END,
|
|
708
|
+
indexed_at = CASE
|
|
709
|
+
WHEN embedding_state.status = 'indexed' AND embedding_state.content_hash = excluded.content_hash
|
|
710
|
+
THEN embedding_state.indexed_at
|
|
711
|
+
ELSE NULL
|
|
712
|
+
END,
|
|
713
|
+
error_message = CASE
|
|
714
|
+
WHEN embedding_state.status = 'indexed' AND embedding_state.content_hash = excluded.content_hash
|
|
715
|
+
THEN embedding_state.error_message
|
|
716
|
+
ELSE NULL
|
|
717
|
+
END
|
|
718
|
+
`);
|
|
719
|
+
const transaction = db.transaction(() => {
|
|
720
|
+
for (const chunk of chunkRows) {
|
|
721
|
+
upsertState.run(
|
|
722
|
+
chunk.id,
|
|
723
|
+
sourceId,
|
|
724
|
+
snapshotId,
|
|
725
|
+
sha256(chunk.markdown)
|
|
726
|
+
);
|
|
727
|
+
}
|
|
728
|
+
});
|
|
729
|
+
transaction();
|
|
730
|
+
const pendingRow = db.prepare(`
|
|
731
|
+
SELECT COUNT(*) AS pending_count
|
|
732
|
+
FROM embedding_state
|
|
733
|
+
WHERE source_id = ?
|
|
734
|
+
AND snapshot_id = ?
|
|
735
|
+
AND status != 'indexed'
|
|
736
|
+
`).get(sourceId, snapshotId);
|
|
737
|
+
if (pendingRow.pending_count === 0) {
|
|
738
|
+
db.prepare(`
|
|
739
|
+
INSERT INTO embedding_jobs (
|
|
740
|
+
source_id,
|
|
741
|
+
snapshot_id,
|
|
742
|
+
status,
|
|
743
|
+
attempt_count,
|
|
744
|
+
chunk_count,
|
|
745
|
+
created_at,
|
|
746
|
+
updated_at,
|
|
747
|
+
claimed_at,
|
|
748
|
+
completed_at,
|
|
749
|
+
error_message
|
|
750
|
+
) VALUES (?, ?, 'succeeded', 0, ?, ?, ?, NULL, ?, NULL)
|
|
751
|
+
ON CONFLICT(source_id, snapshot_id) DO UPDATE SET
|
|
752
|
+
status = 'succeeded',
|
|
753
|
+
chunk_count = excluded.chunk_count,
|
|
754
|
+
updated_at = excluded.updated_at,
|
|
755
|
+
claimed_at = NULL,
|
|
756
|
+
completed_at = excluded.completed_at,
|
|
757
|
+
error_message = NULL
|
|
758
|
+
`).run(sourceId, snapshotId, chunkRows.length, timestamp, timestamp, timestamp);
|
|
759
|
+
return;
|
|
760
|
+
}
|
|
761
|
+
db.prepare(`
|
|
762
|
+
INSERT INTO embedding_jobs (
|
|
763
|
+
source_id,
|
|
764
|
+
snapshot_id,
|
|
765
|
+
status,
|
|
766
|
+
attempt_count,
|
|
767
|
+
chunk_count,
|
|
768
|
+
created_at,
|
|
769
|
+
updated_at,
|
|
770
|
+
claimed_at,
|
|
771
|
+
completed_at,
|
|
772
|
+
error_message
|
|
773
|
+
) VALUES (?, ?, 'pending', 0, ?, ?, ?, NULL, NULL, NULL)
|
|
774
|
+
ON CONFLICT(source_id, snapshot_id) DO UPDATE SET
|
|
775
|
+
status = 'pending',
|
|
776
|
+
chunk_count = excluded.chunk_count,
|
|
777
|
+
updated_at = excluded.updated_at,
|
|
778
|
+
claimed_at = NULL,
|
|
779
|
+
completed_at = NULL,
|
|
780
|
+
error_message = NULL
|
|
781
|
+
`).run(sourceId, snapshotId, chunkRows.length, timestamp, timestamp);
|
|
782
|
+
};
|
|
783
|
+
return {
|
|
784
|
+
close() {
|
|
785
|
+
db.close();
|
|
786
|
+
},
|
|
787
|
+
upsertSource(spec, options2) {
|
|
788
|
+
const timestamp = nowIso();
|
|
789
|
+
const configHash = sha256(stableStringify(spec));
|
|
790
|
+
const existing = db.prepare("SELECT id, created_at, next_due_at, next_canary_due_at, config_hash FROM sources WHERE id = ?").get(spec.id);
|
|
791
|
+
const resolvedSpecPath = options2?.specPath ? resolve2(options2.specPath) : null;
|
|
792
|
+
const nextDueAt = !existing ? timestamp : existing.config_hash === configHash ? existing.next_due_at : timestamp;
|
|
793
|
+
const canaryConfig = resolveSourceCanary(spec);
|
|
794
|
+
const nextCanaryDueAt = !existing ? timestamp : existing.config_hash === configHash ? existing.next_canary_due_at ?? addHoursIso(canaryConfig.everyHours) : timestamp;
|
|
795
|
+
const configChanged = Boolean(existing && existing.config_hash !== configHash);
|
|
796
|
+
db.prepare(`
|
|
797
|
+
INSERT INTO sources (
|
|
798
|
+
id, label, spec_json, spec_path, config_hash, created_at, updated_at, next_due_at, next_canary_due_at
|
|
799
|
+
) VALUES (
|
|
800
|
+
@id, @label, @specJson, @specPath, @configHash, @createdAt, @updatedAt, @nextDueAt, @nextCanaryDueAt
|
|
801
|
+
)
|
|
802
|
+
ON CONFLICT(id) DO UPDATE SET
|
|
803
|
+
label = excluded.label,
|
|
804
|
+
spec_json = excluded.spec_json,
|
|
805
|
+
spec_path = excluded.spec_path,
|
|
806
|
+
config_hash = excluded.config_hash,
|
|
807
|
+
updated_at = excluded.updated_at,
|
|
808
|
+
next_due_at = excluded.next_due_at,
|
|
809
|
+
next_canary_due_at = excluded.next_canary_due_at
|
|
810
|
+
`).run({
|
|
811
|
+
id: spec.id,
|
|
812
|
+
label: spec.label,
|
|
813
|
+
specJson: JSON.stringify(spec),
|
|
814
|
+
specPath: resolvedSpecPath,
|
|
815
|
+
configHash,
|
|
816
|
+
createdAt: existing?.created_at ?? timestamp,
|
|
817
|
+
updatedAt: timestamp,
|
|
818
|
+
nextDueAt,
|
|
819
|
+
nextCanaryDueAt
|
|
820
|
+
});
|
|
821
|
+
return {
|
|
822
|
+
sourceId: spec.id,
|
|
823
|
+
configHash,
|
|
824
|
+
configChanged
|
|
825
|
+
};
|
|
826
|
+
},
|
|
827
|
+
getSourceSpec(sourceId) {
|
|
828
|
+
const row = db.prepare("SELECT spec_json FROM sources WHERE id = ?").get(sourceId);
|
|
829
|
+
if (!row) {
|
|
830
|
+
return null;
|
|
831
|
+
}
|
|
832
|
+
return JSON.parse(row.spec_json);
|
|
833
|
+
},
|
|
834
|
+
listSources() {
|
|
835
|
+
const rows = db.prepare(`
|
|
836
|
+
SELECT
|
|
837
|
+
id,
|
|
838
|
+
label,
|
|
839
|
+
spec_path,
|
|
840
|
+
next_due_at,
|
|
841
|
+
next_canary_due_at,
|
|
842
|
+
last_checked_at,
|
|
843
|
+
last_successful_snapshot_at,
|
|
844
|
+
last_successful_snapshot_id,
|
|
845
|
+
last_canary_checked_at,
|
|
846
|
+
last_successful_canary_at,
|
|
847
|
+
last_canary_status
|
|
848
|
+
FROM sources
|
|
849
|
+
ORDER BY id
|
|
850
|
+
`).all();
|
|
851
|
+
return rows.map((row) => ({
|
|
852
|
+
id: row.id,
|
|
853
|
+
label: row.label,
|
|
854
|
+
specPath: row.spec_path,
|
|
855
|
+
nextDueAt: row.next_due_at,
|
|
856
|
+
isDue: Date.parse(row.next_due_at) <= Date.now(),
|
|
857
|
+
nextCanaryDueAt: row.next_canary_due_at,
|
|
858
|
+
isCanaryDue: row.next_canary_due_at ? Date.parse(row.next_canary_due_at) <= Date.now() : false,
|
|
859
|
+
lastCheckedAt: row.last_checked_at,
|
|
860
|
+
lastSuccessfulSnapshotAt: row.last_successful_snapshot_at,
|
|
861
|
+
lastSuccessfulSnapshotId: row.last_successful_snapshot_id,
|
|
862
|
+
lastCanaryCheckedAt: row.last_canary_checked_at,
|
|
863
|
+
lastSuccessfulCanaryAt: row.last_successful_canary_at,
|
|
864
|
+
lastCanaryStatus: row.last_canary_status
|
|
865
|
+
}));
|
|
866
|
+
},
|
|
867
|
+
listDueSourceIds(referenceTime = nowIso()) {
|
|
868
|
+
const rows = db.prepare(`
|
|
869
|
+
SELECT id
|
|
870
|
+
FROM sources
|
|
871
|
+
WHERE next_due_at <= ?
|
|
872
|
+
ORDER BY next_due_at, id
|
|
873
|
+
`).all(referenceTime);
|
|
874
|
+
return rows.map((row) => row.id);
|
|
875
|
+
},
|
|
876
|
+
listCanaryDueSourceIds(referenceTime = nowIso()) {
|
|
877
|
+
const rows = db.prepare(`
|
|
878
|
+
SELECT id
|
|
879
|
+
FROM sources
|
|
880
|
+
WHERE next_canary_due_at IS NOT NULL
|
|
881
|
+
AND next_canary_due_at <= ?
|
|
882
|
+
ORDER BY next_canary_due_at, id
|
|
883
|
+
`).all(referenceTime);
|
|
884
|
+
return rows.map((row) => row.id);
|
|
885
|
+
},
|
|
886
|
+
linkProject(projectPath, sourceIds) {
|
|
887
|
+
const normalizedPath = canonicalizeProjectPath(projectPath);
|
|
888
|
+
const timestamp = nowIso();
|
|
889
|
+
const insert = db.prepare(`
|
|
890
|
+
INSERT INTO project_links (project_path, source_id, created_at)
|
|
891
|
+
VALUES (?, ?, ?)
|
|
892
|
+
ON CONFLICT(project_path, source_id) DO NOTHING
|
|
893
|
+
`);
|
|
894
|
+
const transaction = db.transaction((ids) => {
|
|
895
|
+
for (const sourceId of ids) {
|
|
896
|
+
insert.run(normalizedPath, sourceId, timestamp);
|
|
897
|
+
}
|
|
898
|
+
});
|
|
899
|
+
transaction(sourceIds);
|
|
900
|
+
},
|
|
901
|
+
unlinkProject(projectPath, sourceIds) {
|
|
902
|
+
const normalizedPath = canonicalizeProjectPath(projectPath);
|
|
903
|
+
if (!sourceIds || sourceIds.length === 0) {
|
|
904
|
+
db.prepare("DELETE FROM project_links WHERE project_path = ?").run(normalizedPath);
|
|
905
|
+
return;
|
|
906
|
+
}
|
|
907
|
+
const statement = db.prepare("DELETE FROM project_links WHERE project_path = ? AND source_id = ?");
|
|
908
|
+
const transaction = db.transaction((ids) => {
|
|
909
|
+
for (const sourceId of ids) {
|
|
910
|
+
statement.run(normalizedPath, sourceId);
|
|
911
|
+
}
|
|
912
|
+
});
|
|
913
|
+
transaction(sourceIds);
|
|
914
|
+
},
|
|
915
|
+
recordSuccessfulSnapshot(input) {
|
|
916
|
+
const sourceRow = db.prepare("SELECT config_hash, spec_json, last_successful_snapshot_id FROM sources WHERE id = ?").get(input.sourceId);
|
|
917
|
+
if (!sourceRow) {
|
|
918
|
+
throw new AiocsError(
|
|
919
|
+
AIOCS_ERROR_CODES.sourceNotFound,
|
|
920
|
+
`Unknown source '${input.sourceId}'`
|
|
921
|
+
);
|
|
922
|
+
}
|
|
923
|
+
const pagesWithHashes = input.pages.map((page) => ({
|
|
924
|
+
...page,
|
|
925
|
+
markdown: page.markdown.trim(),
|
|
926
|
+
contentHash: sha256(page.markdown.trim())
|
|
927
|
+
}));
|
|
928
|
+
const fingerprint = buildSnapshotFingerprint({
|
|
929
|
+
sourceId: input.sourceId,
|
|
930
|
+
configHash: sourceRow.config_hash,
|
|
931
|
+
pages: pagesWithHashes.map((page) => ({
|
|
932
|
+
url: page.url,
|
|
933
|
+
contentHash: page.contentHash
|
|
934
|
+
}))
|
|
935
|
+
});
|
|
936
|
+
const existing = db.prepare("SELECT id FROM snapshots WHERE source_id = ? AND fingerprint = ?").get(input.sourceId, fingerprint);
|
|
937
|
+
const spec = JSON.parse(sourceRow.spec_json);
|
|
938
|
+
const checkedAt = nowIso();
|
|
939
|
+
const nextDueAt = addHoursIso(spec.schedule.everyHours);
|
|
940
|
+
if (existing) {
|
|
941
|
+
db.prepare(`
|
|
942
|
+
UPDATE sources
|
|
943
|
+
SET last_checked_at = ?, last_successful_snapshot_at = ?, last_successful_snapshot_id = ?, next_due_at = ?, updated_at = ?
|
|
944
|
+
WHERE id = ?
|
|
945
|
+
`).run(checkedAt, checkedAt, existing.id, nextDueAt, checkedAt, input.sourceId);
|
|
946
|
+
queueEmbeddingJobForSnapshot(
|
|
947
|
+
input.sourceId,
|
|
948
|
+
existing.id,
|
|
949
|
+
sourceRow.last_successful_snapshot_id
|
|
950
|
+
);
|
|
951
|
+
db.prepare(`
|
|
952
|
+
INSERT INTO fetch_runs (id, source_id, status, snapshot_id, started_at, finished_at)
|
|
953
|
+
VALUES (?, ?, 'success', ?, ?, ?)
|
|
954
|
+
`).run(randomUUID(), input.sourceId, existing.id, checkedAt, checkedAt);
|
|
955
|
+
return {
|
|
956
|
+
snapshotId: existing.id,
|
|
957
|
+
reused: true
|
|
958
|
+
};
|
|
959
|
+
}
|
|
960
|
+
const snapshotId = `snp_${checkedAt.replace(/[-:.TZ]/g, "")}_${fingerprint.slice(0, 12)}`;
|
|
961
|
+
const insertSnapshot = db.prepare(`
|
|
962
|
+
INSERT INTO snapshots (
|
|
963
|
+
id, source_id, fingerprint, config_hash, detected_version, page_count, created_at
|
|
964
|
+
) VALUES (?, ?, ?, ?, ?, ?, ?)
|
|
965
|
+
`);
|
|
966
|
+
const insertPage = db.prepare(`
|
|
967
|
+
INSERT INTO pages (snapshot_id, url, title, markdown, content_hash)
|
|
968
|
+
VALUES (?, ?, ?, ?, ?)
|
|
969
|
+
`);
|
|
970
|
+
const insertChunk = db.prepare(`
|
|
971
|
+
INSERT INTO chunks (
|
|
972
|
+
source_id, snapshot_id, page_id, page_url, page_title, section_title, chunk_order, markdown
|
|
973
|
+
) VALUES (?, ?, ?, ?, ?, ?, ?, ?)
|
|
974
|
+
`);
|
|
975
|
+
const insertRun = db.prepare(`
|
|
976
|
+
INSERT INTO fetch_runs (id, source_id, status, snapshot_id, started_at, finished_at)
|
|
977
|
+
VALUES (?, ?, 'success', ?, ?, ?)
|
|
978
|
+
`);
|
|
979
|
+
const transaction = db.transaction(() => {
|
|
980
|
+
insertSnapshot.run(
|
|
981
|
+
snapshotId,
|
|
982
|
+
input.sourceId,
|
|
983
|
+
fingerprint,
|
|
984
|
+
sourceRow.config_hash,
|
|
985
|
+
input.detectedVersion ?? null,
|
|
986
|
+
pagesWithHashes.length,
|
|
987
|
+
checkedAt
|
|
988
|
+
);
|
|
989
|
+
for (const page of pagesWithHashes) {
|
|
990
|
+
const pageInsert = insertPage.run(snapshotId, page.url, page.title, page.markdown, page.contentHash);
|
|
991
|
+
const pageId = Number(pageInsert.lastInsertRowid);
|
|
992
|
+
const chunks = chunkMarkdown(page.title, page.markdown);
|
|
993
|
+
for (const chunk of chunks) {
|
|
994
|
+
insertChunk.run(
|
|
995
|
+
input.sourceId,
|
|
996
|
+
snapshotId,
|
|
997
|
+
pageId,
|
|
998
|
+
page.url,
|
|
999
|
+
page.title,
|
|
1000
|
+
chunk.sectionTitle,
|
|
1001
|
+
chunk.chunkOrder,
|
|
1002
|
+
chunk.markdown
|
|
1003
|
+
);
|
|
1004
|
+
}
|
|
1005
|
+
}
|
|
1006
|
+
db.prepare(`
|
|
1007
|
+
UPDATE sources
|
|
1008
|
+
SET last_checked_at = ?, last_successful_snapshot_at = ?, last_successful_snapshot_id = ?, next_due_at = ?, updated_at = ?
|
|
1009
|
+
WHERE id = ?
|
|
1010
|
+
`).run(checkedAt, checkedAt, snapshotId, nextDueAt, checkedAt, input.sourceId);
|
|
1011
|
+
queueEmbeddingJobForSnapshot(
|
|
1012
|
+
input.sourceId,
|
|
1013
|
+
snapshotId,
|
|
1014
|
+
sourceRow.last_successful_snapshot_id
|
|
1015
|
+
);
|
|
1016
|
+
insertRun.run(randomUUID(), input.sourceId, snapshotId, checkedAt, checkedAt);
|
|
1017
|
+
});
|
|
1018
|
+
transaction();
|
|
1019
|
+
return {
|
|
1020
|
+
snapshotId,
|
|
1021
|
+
reused: false
|
|
1022
|
+
};
|
|
1023
|
+
},
|
|
1024
|
+
recordFailedFetchRun(input) {
|
|
1025
|
+
const sourceRow = db.prepare("SELECT spec_json FROM sources WHERE id = ?").get(input.sourceId);
|
|
1026
|
+
if (!sourceRow) {
|
|
1027
|
+
throw new AiocsError(
|
|
1028
|
+
AIOCS_ERROR_CODES.sourceNotFound,
|
|
1029
|
+
`Unknown source '${input.sourceId}'`
|
|
1030
|
+
);
|
|
1031
|
+
}
|
|
1032
|
+
const spec = JSON.parse(sourceRow.spec_json);
|
|
1033
|
+
const timestamp = nowIso();
|
|
1034
|
+
db.prepare(`
|
|
1035
|
+
INSERT INTO fetch_runs (id, source_id, status, error_message, started_at, finished_at)
|
|
1036
|
+
VALUES (?, ?, 'failed', ?, ?, ?)
|
|
1037
|
+
`).run(randomUUID(), input.sourceId, input.errorMessage, timestamp, timestamp);
|
|
1038
|
+
db.prepare(`
|
|
1039
|
+
UPDATE sources
|
|
1040
|
+
SET last_checked_at = ?, next_due_at = ?, updated_at = ?
|
|
1041
|
+
WHERE id = ?
|
|
1042
|
+
`).run(timestamp, addHoursIso(spec.schedule.everyHours), timestamp, input.sourceId);
|
|
1043
|
+
},
|
|
1044
|
+
recordCanaryRun(input) {
|
|
1045
|
+
const sourceRow = db.prepare("SELECT spec_json FROM sources WHERE id = ?").get(input.sourceId);
|
|
1046
|
+
if (!sourceRow) {
|
|
1047
|
+
throw new AiocsError(
|
|
1048
|
+
AIOCS_ERROR_CODES.sourceNotFound,
|
|
1049
|
+
`Unknown source '${input.sourceId}'`
|
|
1050
|
+
);
|
|
1051
|
+
}
|
|
1052
|
+
const spec = JSON.parse(sourceRow.spec_json);
|
|
1053
|
+
const canary = resolveSourceCanary(spec);
|
|
1054
|
+
db.prepare(`
|
|
1055
|
+
INSERT INTO canary_runs (id, source_id, status, checked_at, details_json)
|
|
1056
|
+
VALUES (?, ?, ?, ?, ?)
|
|
1057
|
+
`).run(
|
|
1058
|
+
randomUUID(),
|
|
1059
|
+
input.sourceId,
|
|
1060
|
+
input.status,
|
|
1061
|
+
input.checkedAt,
|
|
1062
|
+
JSON.stringify(input.details)
|
|
1063
|
+
);
|
|
1064
|
+
db.prepare(`
|
|
1065
|
+
UPDATE sources
|
|
1066
|
+
SET
|
|
1067
|
+
last_canary_checked_at = ?,
|
|
1068
|
+
last_successful_canary_at = CASE WHEN ? = 'pass' THEN ? ELSE last_successful_canary_at END,
|
|
1069
|
+
last_canary_status = ?,
|
|
1070
|
+
next_canary_due_at = ?,
|
|
1071
|
+
updated_at = ?
|
|
1072
|
+
WHERE id = ?
|
|
1073
|
+
`).run(
|
|
1074
|
+
input.checkedAt,
|
|
1075
|
+
input.status,
|
|
1076
|
+
input.checkedAt,
|
|
1077
|
+
input.status,
|
|
1078
|
+
addHoursIso(canary.everyHours),
|
|
1079
|
+
input.checkedAt,
|
|
1080
|
+
input.sourceId
|
|
1081
|
+
);
|
|
1082
|
+
},
|
|
1083
|
+
listProjectLinks,
|
|
1084
|
+
removeManagedSources(input) {
|
|
1085
|
+
if (input.managedRoots.length === 0) {
|
|
1086
|
+
return [];
|
|
1087
|
+
}
|
|
1088
|
+
const activeSourceKeys = new Set(
|
|
1089
|
+
input.activeSources.map((source) => `${source.sourceId}::${resolve2(source.specPath)}`)
|
|
1090
|
+
);
|
|
1091
|
+
const rows = db.prepare(`
|
|
1092
|
+
SELECT id, spec_path
|
|
1093
|
+
FROM sources
|
|
1094
|
+
WHERE spec_path IS NOT NULL
|
|
1095
|
+
ORDER BY id
|
|
1096
|
+
`).all();
|
|
1097
|
+
const toDelete = rows.filter((row) => {
|
|
1098
|
+
if (!row.spec_path) {
|
|
1099
|
+
return false;
|
|
1100
|
+
}
|
|
1101
|
+
const normalizedSpecPath = resolve2(row.spec_path);
|
|
1102
|
+
return input.managedRoots.some(
|
|
1103
|
+
(managedRoot) => normalizedSpecPath === managedRoot || normalizedSpecPath.startsWith(`${managedRoot}/`)
|
|
1104
|
+
) && !activeSourceKeys.has(`${row.id}::${normalizedSpecPath}`);
|
|
1105
|
+
}).map((row) => row.id);
|
|
1106
|
+
if (toDelete.length === 0) {
|
|
1107
|
+
return [];
|
|
1108
|
+
}
|
|
1109
|
+
const deleteStatement = db.prepare("DELETE FROM sources WHERE id = ?");
|
|
1110
|
+
const transaction = db.transaction((sourceIds) => {
|
|
1111
|
+
for (const sourceId of sourceIds) {
|
|
1112
|
+
deleteStatement.run(sourceId);
|
|
1113
|
+
}
|
|
1114
|
+
});
|
|
1115
|
+
transaction(toDelete);
|
|
1116
|
+
return toDelete;
|
|
1117
|
+
},
|
|
1118
|
+
listSnapshots(sourceId) {
|
|
1119
|
+
const rows = db.prepare(`
|
|
1120
|
+
SELECT id, source_id, detected_version, created_at, page_count
|
|
1121
|
+
FROM snapshots
|
|
1122
|
+
WHERE source_id = ?
|
|
1123
|
+
ORDER BY rowid DESC
|
|
1124
|
+
`).all(sourceId);
|
|
1125
|
+
return rows.map((row) => ({
|
|
1126
|
+
snapshotId: row.id,
|
|
1127
|
+
sourceId: row.source_id,
|
|
1128
|
+
detectedVersion: row.detected_version,
|
|
1129
|
+
createdAt: row.created_at,
|
|
1130
|
+
pageCount: row.page_count
|
|
1131
|
+
}));
|
|
1132
|
+
},
|
|
1133
|
+
diffSnapshots(input) {
|
|
1134
|
+
const snapshots = this.listSnapshots(input.sourceId);
|
|
1135
|
+
if (snapshots.length === 0) {
|
|
1136
|
+
throw new AiocsError(
|
|
1137
|
+
AIOCS_ERROR_CODES.snapshotNotFound,
|
|
1138
|
+
`No successful snapshot found for source '${input.sourceId}'`
|
|
1139
|
+
);
|
|
1140
|
+
}
|
|
1141
|
+
const toSnapshot = input.toSnapshotId ? snapshots.find((snapshot) => snapshot.snapshotId === input.toSnapshotId) : snapshots[0];
|
|
1142
|
+
if (!toSnapshot) {
|
|
1143
|
+
throw new AiocsError(
|
|
1144
|
+
AIOCS_ERROR_CODES.snapshotNotFound,
|
|
1145
|
+
`Snapshot '${input.toSnapshotId}' not found for source '${input.sourceId}'`
|
|
1146
|
+
);
|
|
1147
|
+
}
|
|
1148
|
+
const toSnapshotIndex = snapshots.findIndex((snapshot) => snapshot.snapshotId === toSnapshot.snapshotId);
|
|
1149
|
+
const fromSnapshot = input.fromSnapshotId ? snapshots.find((snapshot) => snapshot.snapshotId === input.fromSnapshotId) : snapshots[toSnapshotIndex + 1];
|
|
1150
|
+
if (!fromSnapshot) {
|
|
1151
|
+
throw new AiocsError(
|
|
1152
|
+
AIOCS_ERROR_CODES.snapshotDiffBaseNotFound,
|
|
1153
|
+
`No base snapshot available to diff source '${input.sourceId}'`
|
|
1154
|
+
);
|
|
1155
|
+
}
|
|
1156
|
+
const loadSnapshotPages = (snapshotId) => db.prepare(`
|
|
1157
|
+
SELECT url, title, markdown, content_hash
|
|
1158
|
+
FROM pages
|
|
1159
|
+
WHERE snapshot_id = ?
|
|
1160
|
+
ORDER BY url
|
|
1161
|
+
`).all(snapshotId);
|
|
1162
|
+
const beforePages = loadSnapshotPages(fromSnapshot.snapshotId);
|
|
1163
|
+
const afterPages = loadSnapshotPages(toSnapshot.snapshotId);
|
|
1164
|
+
const beforeMap = new Map(beforePages.map((page) => [page.url, page]));
|
|
1165
|
+
const afterMap = new Map(afterPages.map((page) => [page.url, page]));
|
|
1166
|
+
const addedPages = afterPages.filter((page) => !beforeMap.has(page.url)).map((page) => ({
|
|
1167
|
+
url: page.url,
|
|
1168
|
+
title: page.title
|
|
1169
|
+
}));
|
|
1170
|
+
const removedPages = beforePages.filter((page) => !afterMap.has(page.url)).map((page) => ({
|
|
1171
|
+
url: page.url,
|
|
1172
|
+
title: page.title
|
|
1173
|
+
}));
|
|
1174
|
+
const summarizeLineDiff = (beforeMarkdown, afterMarkdown) => {
|
|
1175
|
+
const beforeLines = beforeMarkdown.split("\n");
|
|
1176
|
+
const afterLines = afterMarkdown.split("\n");
|
|
1177
|
+
let prefix = 0;
|
|
1178
|
+
while (prefix < beforeLines.length && prefix < afterLines.length && beforeLines[prefix] === afterLines[prefix]) {
|
|
1179
|
+
prefix += 1;
|
|
1180
|
+
}
|
|
1181
|
+
let suffix = 0;
|
|
1182
|
+
while (suffix < beforeLines.length - prefix && suffix < afterLines.length - prefix && beforeLines[beforeLines.length - 1 - suffix] === afterLines[afterLines.length - 1 - suffix]) {
|
|
1183
|
+
suffix += 1;
|
|
1184
|
+
}
|
|
1185
|
+
return {
|
|
1186
|
+
addedLineCount: Math.max(0, afterLines.length - prefix - suffix),
|
|
1187
|
+
removedLineCount: Math.max(0, beforeLines.length - prefix - suffix)
|
|
1188
|
+
};
|
|
1189
|
+
};
|
|
1190
|
+
const changedPages = beforePages.filter((page) => afterMap.has(page.url)).map((page) => ({
|
|
1191
|
+
before: page,
|
|
1192
|
+
after: afterMap.get(page.url)
|
|
1193
|
+
})).filter(({ before, after }) => before.content_hash !== after.content_hash || before.title !== after.title).map(({ before, after }) => ({
|
|
1194
|
+
url: before.url,
|
|
1195
|
+
beforeTitle: before.title,
|
|
1196
|
+
afterTitle: after.title,
|
|
1197
|
+
lineSummary: summarizeLineDiff(before.markdown, after.markdown)
|
|
1198
|
+
}));
|
|
1199
|
+
const unchangedPageCount = beforePages.filter((page) => {
|
|
1200
|
+
const next = afterMap.get(page.url);
|
|
1201
|
+
return next && next.content_hash === page.content_hash && next.title === page.title;
|
|
1202
|
+
}).length;
|
|
1203
|
+
return {
|
|
1204
|
+
sourceId: input.sourceId,
|
|
1205
|
+
fromSnapshotId: fromSnapshot.snapshotId,
|
|
1206
|
+
toSnapshotId: toSnapshot.snapshotId,
|
|
1207
|
+
summary: {
|
|
1208
|
+
addedPageCount: addedPages.length,
|
|
1209
|
+
removedPageCount: removedPages.length,
|
|
1210
|
+
changedPageCount: changedPages.length,
|
|
1211
|
+
unchangedPageCount
|
|
1212
|
+
},
|
|
1213
|
+
addedPages,
|
|
1214
|
+
removedPages,
|
|
1215
|
+
changedPages
|
|
1216
|
+
};
|
|
1217
|
+
},
|
|
1218
|
+
resolveSearchScope(input) {
|
|
1219
|
+
return resolveSearchScope(input);
|
|
1220
|
+
},
|
|
1221
|
+
searchLexical(input) {
|
|
1222
|
+
return searchLexicalByScope(input);
|
|
1223
|
+
},
|
|
1224
|
+
search(input) {
|
|
1225
|
+
return searchLexicalByScope({
|
|
1226
|
+
query: input.query,
|
|
1227
|
+
scope: resolveSearchScope(input)
|
|
1228
|
+
});
|
|
1229
|
+
},
|
|
1230
|
+
listLatestSnapshots(sourceIds) {
|
|
1231
|
+
return listLatestSnapshots(sourceIds);
|
|
1232
|
+
},
|
|
1233
|
+
listSnapshotChunks(input) {
|
|
1234
|
+
const rows = db.prepare(`
|
|
1235
|
+
SELECT
|
|
1236
|
+
c.id AS chunk_id,
|
|
1237
|
+
c.source_id,
|
|
1238
|
+
c.snapshot_id,
|
|
1239
|
+
c.page_url,
|
|
1240
|
+
c.page_title,
|
|
1241
|
+
c.section_title,
|
|
1242
|
+
c.markdown
|
|
1243
|
+
FROM chunks c
|
|
1244
|
+
WHERE c.source_id = ?
|
|
1245
|
+
AND c.snapshot_id = ?
|
|
1246
|
+
ORDER BY c.id
|
|
1247
|
+
`).all(input.sourceId, input.snapshotId);
|
|
1248
|
+
return rows.map((row) => ({
|
|
1249
|
+
chunkId: row.chunk_id,
|
|
1250
|
+
sourceId: row.source_id,
|
|
1251
|
+
snapshotId: row.snapshot_id,
|
|
1252
|
+
pageUrl: row.page_url,
|
|
1253
|
+
pageTitle: row.page_title,
|
|
1254
|
+
sectionTitle: row.section_title,
|
|
1255
|
+
markdown: row.markdown,
|
|
1256
|
+
contentHash: sha256(row.markdown)
|
|
1257
|
+
}));
|
|
1258
|
+
},
|
|
1259
|
+
getSnapshotEmbeddingState(input) {
|
|
1260
|
+
const rows = db.prepare(`
|
|
1261
|
+
SELECT chunk_id, status, model_key, content_hash
|
|
1262
|
+
FROM embedding_state
|
|
1263
|
+
WHERE source_id = ?
|
|
1264
|
+
AND snapshot_id = ?
|
|
1265
|
+
ORDER BY chunk_id
|
|
1266
|
+
`).all(input.sourceId, input.snapshotId);
|
|
1267
|
+
return rows.map((row) => ({
|
|
1268
|
+
chunkId: row.chunk_id,
|
|
1269
|
+
status: row.status,
|
|
1270
|
+
modelKey: row.model_key,
|
|
1271
|
+
contentHash: row.content_hash
|
|
1272
|
+
}));
|
|
1273
|
+
},
|
|
1274
|
+
listStaleEmbeddingChunkIds(sourceId) {
|
|
1275
|
+
const rows = db.prepare(`
|
|
1276
|
+
SELECT chunk_id
|
|
1277
|
+
FROM embedding_state
|
|
1278
|
+
WHERE source_id = ?
|
|
1279
|
+
AND status = 'stale'
|
|
1280
|
+
ORDER BY chunk_id
|
|
1281
|
+
`).all(sourceId);
|
|
1282
|
+
return rows.map((row) => row.chunk_id);
|
|
1283
|
+
},
|
|
1284
|
+
listEmbeddingChunkIds(sourceIds) {
|
|
1285
|
+
const filterSourceIds = sourceIds && sourceIds.length > 0 ? [...new Set(sourceIds)] : null;
|
|
1286
|
+
const rows = db.prepare(`
|
|
1287
|
+
SELECT chunk_id
|
|
1288
|
+
FROM embedding_state
|
|
1289
|
+
${filterSourceIds ? `WHERE source_id IN (${filterSourceIds.map(() => "?").join(",")})` : ""}
|
|
1290
|
+
ORDER BY chunk_id
|
|
1291
|
+
`).all(...filterSourceIds ?? []);
|
|
1292
|
+
return rows.map((row) => row.chunk_id);
|
|
1293
|
+
},
|
|
1294
|
+
getChunksByIds(chunkIds) {
|
|
1295
|
+
if (chunkIds.length === 0) {
|
|
1296
|
+
return [];
|
|
1297
|
+
}
|
|
1298
|
+
const rows = db.prepare(`
|
|
1299
|
+
SELECT
|
|
1300
|
+
c.id AS chunk_id,
|
|
1301
|
+
c.source_id,
|
|
1302
|
+
c.snapshot_id,
|
|
1303
|
+
c.page_url,
|
|
1304
|
+
c.page_title,
|
|
1305
|
+
c.section_title,
|
|
1306
|
+
c.markdown
|
|
1307
|
+
FROM chunks c
|
|
1308
|
+
WHERE c.id IN (${chunkIds.map(() => "?").join(",")})
|
|
1309
|
+
`).all(...chunkIds);
|
|
1310
|
+
return rows.map((row) => ({
|
|
1311
|
+
chunkId: row.chunk_id,
|
|
1312
|
+
sourceId: row.source_id,
|
|
1313
|
+
snapshotId: row.snapshot_id,
|
|
1314
|
+
pageUrl: row.page_url,
|
|
1315
|
+
pageTitle: row.page_title,
|
|
1316
|
+
sectionTitle: row.section_title,
|
|
1317
|
+
markdown: row.markdown
|
|
1318
|
+
}));
|
|
1319
|
+
},
|
|
1320
|
+
queueLatestEmbeddingJobs(sourceIds) {
|
|
1321
|
+
const latestSnapshots = listLatestSnapshots(sourceIds);
|
|
1322
|
+
const transaction = db.transaction((snapshots) => {
|
|
1323
|
+
for (const snapshot of snapshots) {
|
|
1324
|
+
queueEmbeddingJobForSnapshot(snapshot.sourceId, snapshot.snapshotId);
|
|
1325
|
+
}
|
|
1326
|
+
});
|
|
1327
|
+
transaction(latestSnapshots);
|
|
1328
|
+
return {
|
|
1329
|
+
queuedJobs: latestSnapshots.length
|
|
1330
|
+
};
|
|
1331
|
+
},
|
|
1332
|
+
requeueLatestEmbeddingJobs(sourceIds) {
|
|
1333
|
+
const latestSnapshots = listLatestSnapshots(sourceIds);
|
|
1334
|
+
const transaction = db.transaction((snapshots) => {
|
|
1335
|
+
for (const snapshot of snapshots) {
|
|
1336
|
+
db.prepare(`
|
|
1337
|
+
UPDATE embedding_state
|
|
1338
|
+
SET
|
|
1339
|
+
status = 'pending',
|
|
1340
|
+
model_key = NULL,
|
|
1341
|
+
vector_point_id = NULL,
|
|
1342
|
+
last_attempted_at = NULL,
|
|
1343
|
+
indexed_at = NULL,
|
|
1344
|
+
error_message = NULL
|
|
1345
|
+
WHERE source_id = ?
|
|
1346
|
+
AND snapshot_id = ?
|
|
1347
|
+
`).run(snapshot.sourceId, snapshot.snapshotId);
|
|
1348
|
+
queueEmbeddingJobForSnapshot(snapshot.sourceId, snapshot.snapshotId);
|
|
1349
|
+
}
|
|
1350
|
+
});
|
|
1351
|
+
transaction(latestSnapshots);
|
|
1352
|
+
return {
|
|
1353
|
+
queuedJobs: latestSnapshots.length
|
|
1354
|
+
};
|
|
1355
|
+
},
|
|
1356
|
+
resetEmbeddingsAfterImport() {
|
|
1357
|
+
const transaction = db.transaction(() => {
|
|
1358
|
+
db.prepare("DELETE FROM embedding_jobs").run();
|
|
1359
|
+
db.prepare("DELETE FROM embedding_state").run();
|
|
1360
|
+
});
|
|
1361
|
+
transaction();
|
|
1362
|
+
const latestSnapshots = listLatestSnapshots();
|
|
1363
|
+
const queueTransaction = db.transaction((snapshots) => {
|
|
1364
|
+
for (const snapshot of snapshots) {
|
|
1365
|
+
queueEmbeddingJobForSnapshot(snapshot.sourceId, snapshot.snapshotId);
|
|
1366
|
+
}
|
|
1367
|
+
});
|
|
1368
|
+
queueTransaction(latestSnapshots);
|
|
1369
|
+
return {
|
|
1370
|
+
queuedJobs: latestSnapshots.length
|
|
1371
|
+
};
|
|
1372
|
+
},
|
|
1373
|
+
resetRunningEmbeddingJobs() {
|
|
1374
|
+
const result = db.prepare(`
|
|
1375
|
+
UPDATE embedding_jobs
|
|
1376
|
+
SET
|
|
1377
|
+
status = 'pending',
|
|
1378
|
+
updated_at = ?,
|
|
1379
|
+
claimed_at = NULL,
|
|
1380
|
+
error_message = NULL
|
|
1381
|
+
WHERE status = 'running'
|
|
1382
|
+
`).run(nowIso());
|
|
1383
|
+
return result.changes;
|
|
1384
|
+
},
|
|
1385
|
+
claimEmbeddingJobs(limit) {
|
|
1386
|
+
const normalizedLimit = assertPaginationValue(limit, "limit", limit);
|
|
1387
|
+
if (normalizedLimit === 0) {
|
|
1388
|
+
return [];
|
|
1389
|
+
}
|
|
1390
|
+
const claimedAt = nowIso();
|
|
1391
|
+
const transaction = db.transaction(() => {
|
|
1392
|
+
const pending = db.prepare(`
|
|
1393
|
+
SELECT
|
|
1394
|
+
source_id,
|
|
1395
|
+
snapshot_id,
|
|
1396
|
+
status,
|
|
1397
|
+
attempt_count,
|
|
1398
|
+
chunk_count,
|
|
1399
|
+
created_at,
|
|
1400
|
+
updated_at,
|
|
1401
|
+
claimed_at,
|
|
1402
|
+
completed_at,
|
|
1403
|
+
error_message
|
|
1404
|
+
FROM embedding_jobs
|
|
1405
|
+
WHERE status = 'pending'
|
|
1406
|
+
ORDER BY updated_at, source_id, snapshot_id
|
|
1407
|
+
LIMIT ?
|
|
1408
|
+
`).all(normalizedLimit);
|
|
1409
|
+
const claim = db.prepare(`
|
|
1410
|
+
UPDATE embedding_jobs
|
|
1411
|
+
SET
|
|
1412
|
+
status = 'running',
|
|
1413
|
+
attempt_count = attempt_count + 1,
|
|
1414
|
+
updated_at = ?,
|
|
1415
|
+
claimed_at = ?,
|
|
1416
|
+
error_message = NULL
|
|
1417
|
+
WHERE source_id = ?
|
|
1418
|
+
AND snapshot_id = ?
|
|
1419
|
+
`);
|
|
1420
|
+
for (const job of pending) {
|
|
1421
|
+
claim.run(claimedAt, claimedAt, job.source_id, job.snapshot_id);
|
|
1422
|
+
}
|
|
1423
|
+
return pending.map((job) => ({
|
|
1424
|
+
sourceId: job.source_id,
|
|
1425
|
+
snapshotId: job.snapshot_id,
|
|
1426
|
+
status: "running",
|
|
1427
|
+
attemptCount: job.attempt_count + 1,
|
|
1428
|
+
chunkCount: job.chunk_count,
|
|
1429
|
+
createdAt: job.created_at,
|
|
1430
|
+
updatedAt: claimedAt,
|
|
1431
|
+
claimedAt,
|
|
1432
|
+
completedAt: job.completed_at,
|
|
1433
|
+
errorMessage: null
|
|
1434
|
+
}));
|
|
1435
|
+
});
|
|
1436
|
+
return transaction();
|
|
1437
|
+
},
|
|
1438
|
+
markEmbeddingJobSucceeded(input) {
|
|
1439
|
+
const timestamp = nowIso();
|
|
1440
|
+
const staleChunkIds = [...new Set(input.staleChunkIds ?? [])];
|
|
1441
|
+
const indexedChunkIds = [...new Set(input.indexedChunkIds)];
|
|
1442
|
+
const indexedPlaceholders = indexedChunkIds.length > 0 ? indexedChunkIds.map(() => "?").join(",") : null;
|
|
1443
|
+
const stalePlaceholders = staleChunkIds.length > 0 ? staleChunkIds.map(() => "?").join(",") : null;
|
|
1444
|
+
const transaction = db.transaction(() => {
|
|
1445
|
+
if (indexedPlaceholders) {
|
|
1446
|
+
db.prepare(`
|
|
1447
|
+
UPDATE embedding_state
|
|
1448
|
+
SET
|
|
1449
|
+
status = 'indexed',
|
|
1450
|
+
model_key = ?,
|
|
1451
|
+
vector_point_id = CAST(chunk_id AS TEXT),
|
|
1452
|
+
last_attempted_at = ?,
|
|
1453
|
+
indexed_at = ?,
|
|
1454
|
+
error_message = NULL
|
|
1455
|
+
WHERE chunk_id IN (${indexedPlaceholders})
|
|
1456
|
+
`).run(input.modelKey, timestamp, timestamp, ...indexedChunkIds);
|
|
1457
|
+
}
|
|
1458
|
+
db.prepare(`
|
|
1459
|
+
UPDATE embedding_state
|
|
1460
|
+
SET
|
|
1461
|
+
status = 'failed',
|
|
1462
|
+
model_key = NULL,
|
|
1463
|
+
vector_point_id = NULL,
|
|
1464
|
+
last_attempted_at = ?,
|
|
1465
|
+
indexed_at = NULL,
|
|
1466
|
+
error_message = 'Chunk was not indexed during the latest embedding run'
|
|
1467
|
+
WHERE source_id = ?
|
|
1468
|
+
AND snapshot_id = ?
|
|
1469
|
+
AND status != 'indexed'
|
|
1470
|
+
`).run(timestamp, input.sourceId, input.snapshotId);
|
|
1471
|
+
if (stalePlaceholders) {
|
|
1472
|
+
db.prepare(`
|
|
1473
|
+
DELETE FROM embedding_state
|
|
1474
|
+
WHERE chunk_id IN (${stalePlaceholders})
|
|
1475
|
+
`).run(...staleChunkIds);
|
|
1476
|
+
}
|
|
1477
|
+
db.prepare(`
|
|
1478
|
+
UPDATE embedding_jobs
|
|
1479
|
+
SET
|
|
1480
|
+
status = 'succeeded',
|
|
1481
|
+
updated_at = ?,
|
|
1482
|
+
completed_at = ?,
|
|
1483
|
+
claimed_at = NULL,
|
|
1484
|
+
error_message = NULL
|
|
1485
|
+
WHERE source_id = ?
|
|
1486
|
+
AND snapshot_id = ?
|
|
1487
|
+
`).run(timestamp, timestamp, input.sourceId, input.snapshotId);
|
|
1488
|
+
});
|
|
1489
|
+
transaction();
|
|
1490
|
+
},
|
|
1491
|
+
markEmbeddingJobFailed(input) {
|
|
1492
|
+
const timestamp = nowIso();
|
|
1493
|
+
const transaction = db.transaction(() => {
|
|
1494
|
+
db.prepare(`
|
|
1495
|
+
UPDATE embedding_state
|
|
1496
|
+
SET
|
|
1497
|
+
status = 'failed',
|
|
1498
|
+
model_key = NULL,
|
|
1499
|
+
vector_point_id = NULL,
|
|
1500
|
+
last_attempted_at = ?,
|
|
1501
|
+
indexed_at = NULL,
|
|
1502
|
+
error_message = ?
|
|
1503
|
+
WHERE source_id = ?
|
|
1504
|
+
AND snapshot_id = ?
|
|
1505
|
+
AND status != 'indexed'
|
|
1506
|
+
`).run(timestamp, input.errorMessage, input.sourceId, input.snapshotId);
|
|
1507
|
+
db.prepare(`
|
|
1508
|
+
UPDATE embedding_jobs
|
|
1509
|
+
SET
|
|
1510
|
+
status = 'failed',
|
|
1511
|
+
updated_at = ?,
|
|
1512
|
+
completed_at = ?,
|
|
1513
|
+
claimed_at = NULL,
|
|
1514
|
+
error_message = ?
|
|
1515
|
+
WHERE source_id = ?
|
|
1516
|
+
AND snapshot_id = ?
|
|
1517
|
+
`).run(timestamp, timestamp, input.errorMessage, input.sourceId, input.snapshotId);
|
|
1518
|
+
});
|
|
1519
|
+
transaction();
|
|
1520
|
+
},
|
|
1521
|
+
clearEmbeddings(sourceIds) {
|
|
1522
|
+
const latestSnapshots = listLatestSnapshots(sourceIds);
|
|
1523
|
+
const clearedSources = latestSnapshots.map((snapshot) => snapshot.sourceId);
|
|
1524
|
+
const filterSourceIds = sourceIds && sourceIds.length > 0 ? [...new Set(sourceIds)] : null;
|
|
1525
|
+
const transaction = db.transaction(() => {
|
|
1526
|
+
if (filterSourceIds && filterSourceIds.length > 0) {
|
|
1527
|
+
db.prepare(`
|
|
1528
|
+
DELETE FROM embedding_jobs
|
|
1529
|
+
WHERE source_id IN (${filterSourceIds.map(() => "?").join(",")})
|
|
1530
|
+
`).run(...filterSourceIds);
|
|
1531
|
+
db.prepare(`
|
|
1532
|
+
DELETE FROM embedding_state
|
|
1533
|
+
WHERE source_id IN (${filterSourceIds.map(() => "?").join(",")})
|
|
1534
|
+
`).run(...filterSourceIds);
|
|
1535
|
+
} else {
|
|
1536
|
+
db.prepare("DELETE FROM embedding_jobs").run();
|
|
1537
|
+
db.prepare("DELETE FROM embedding_state").run();
|
|
1538
|
+
}
|
|
1539
|
+
});
|
|
1540
|
+
transaction();
|
|
1541
|
+
return {
|
|
1542
|
+
clearedSources
|
|
1543
|
+
};
|
|
1544
|
+
},
|
|
1545
|
+
getEmbeddingOverview() {
|
|
1546
|
+
const queueCounts = db.prepare(`
|
|
1547
|
+
SELECT
|
|
1548
|
+
SUM(CASE WHEN status = 'pending' THEN 1 ELSE 0 END) AS pending_jobs,
|
|
1549
|
+
SUM(CASE WHEN status = 'running' THEN 1 ELSE 0 END) AS running_jobs,
|
|
1550
|
+
SUM(CASE WHEN status = 'failed' THEN 1 ELSE 0 END) AS failed_jobs
|
|
1551
|
+
FROM embedding_jobs
|
|
1552
|
+
`).get();
|
|
1553
|
+
const rows = db.prepare(`
|
|
1554
|
+
SELECT
|
|
1555
|
+
s.id AS source_id,
|
|
1556
|
+
s.last_successful_snapshot_id AS snapshot_id,
|
|
1557
|
+
COUNT(c.id) AS total_chunks,
|
|
1558
|
+
SUM(CASE WHEN es.status = 'indexed' THEN 1 ELSE 0 END) AS indexed_chunks,
|
|
1559
|
+
SUM(CASE WHEN es.status = 'pending' THEN 1 ELSE 0 END) AS pending_chunks,
|
|
1560
|
+
SUM(CASE WHEN es.status = 'failed' THEN 1 ELSE 0 END) AS failed_chunks,
|
|
1561
|
+
SUM(CASE WHEN es.status = 'stale' THEN 1 ELSE 0 END) AS stale_chunks
|
|
1562
|
+
FROM sources s
|
|
1563
|
+
LEFT JOIN chunks c
|
|
1564
|
+
ON c.snapshot_id = s.last_successful_snapshot_id
|
|
1565
|
+
LEFT JOIN embedding_state es
|
|
1566
|
+
ON es.chunk_id = c.id
|
|
1567
|
+
GROUP BY s.id, s.last_successful_snapshot_id
|
|
1568
|
+
ORDER BY s.id
|
|
1569
|
+
`).all();
|
|
1570
|
+
return {
|
|
1571
|
+
queue: {
|
|
1572
|
+
pendingJobs: queueCounts.pending_jobs ?? 0,
|
|
1573
|
+
runningJobs: queueCounts.running_jobs ?? 0,
|
|
1574
|
+
failedJobs: queueCounts.failed_jobs ?? 0
|
|
1575
|
+
},
|
|
1576
|
+
sources: rows.map((row) => ({
|
|
1577
|
+
sourceId: row.source_id,
|
|
1578
|
+
snapshotId: row.snapshot_id,
|
|
1579
|
+
totalChunks: row.total_chunks,
|
|
1580
|
+
indexedChunks: row.indexed_chunks,
|
|
1581
|
+
pendingChunks: row.pending_chunks,
|
|
1582
|
+
failedChunks: row.failed_chunks,
|
|
1583
|
+
staleChunks: row.stale_chunks,
|
|
1584
|
+
coverageRatio: row.total_chunks === 0 ? 0 : row.indexed_chunks / row.total_chunks
|
|
1585
|
+
}))
|
|
1586
|
+
};
|
|
1587
|
+
},
|
|
1588
|
+
markDaemonStarted(input) {
|
|
1589
|
+
db.prepare(`
|
|
1590
|
+
INSERT INTO daemon_state (
|
|
1591
|
+
singleton_id,
|
|
1592
|
+
last_started_at,
|
|
1593
|
+
interval_minutes,
|
|
1594
|
+
fetch_on_start
|
|
1595
|
+
) VALUES (1, ?, ?, ?)
|
|
1596
|
+
ON CONFLICT(singleton_id) DO UPDATE SET
|
|
1597
|
+
last_started_at = excluded.last_started_at,
|
|
1598
|
+
interval_minutes = excluded.interval_minutes,
|
|
1599
|
+
fetch_on_start = excluded.fetch_on_start
|
|
1600
|
+
`).run(
|
|
1601
|
+
input.startedAt,
|
|
1602
|
+
input.intervalMinutes,
|
|
1603
|
+
input.fetchOnStart ? 1 : 0
|
|
1604
|
+
);
|
|
1605
|
+
},
|
|
1606
|
+
markDaemonCycleStarted(startedAt) {
|
|
1607
|
+
db.prepare(`
|
|
1608
|
+
INSERT INTO daemon_state (singleton_id, last_cycle_started_at)
|
|
1609
|
+
VALUES (1, ?)
|
|
1610
|
+
ON CONFLICT(singleton_id) DO UPDATE SET
|
|
1611
|
+
last_cycle_started_at = excluded.last_cycle_started_at
|
|
1612
|
+
`).run(startedAt);
|
|
1613
|
+
},
|
|
1614
|
+
markDaemonCycleCompleted(input) {
|
|
1615
|
+
db.prepare(`
|
|
1616
|
+
INSERT INTO daemon_state (
|
|
1617
|
+
singleton_id,
|
|
1618
|
+
last_cycle_completed_at,
|
|
1619
|
+
last_cycle_status
|
|
1620
|
+
) VALUES (1, ?, ?)
|
|
1621
|
+
ON CONFLICT(singleton_id) DO UPDATE SET
|
|
1622
|
+
last_cycle_completed_at = excluded.last_cycle_completed_at,
|
|
1623
|
+
last_cycle_status = excluded.last_cycle_status
|
|
1624
|
+
`).run(
|
|
1625
|
+
input.completedAt,
|
|
1626
|
+
input.status
|
|
1627
|
+
);
|
|
1628
|
+
},
|
|
1629
|
+
getDaemonState() {
|
|
1630
|
+
const row = db.prepare(`
|
|
1631
|
+
SELECT
|
|
1632
|
+
last_started_at,
|
|
1633
|
+
last_cycle_started_at,
|
|
1634
|
+
last_cycle_completed_at,
|
|
1635
|
+
last_cycle_status,
|
|
1636
|
+
interval_minutes,
|
|
1637
|
+
fetch_on_start
|
|
1638
|
+
FROM daemon_state
|
|
1639
|
+
WHERE singleton_id = 1
|
|
1640
|
+
`).get();
|
|
1641
|
+
if (!row) {
|
|
1642
|
+
return null;
|
|
1643
|
+
}
|
|
1644
|
+
return {
|
|
1645
|
+
lastStartedAt: row.last_started_at,
|
|
1646
|
+
lastCycleStartedAt: row.last_cycle_started_at,
|
|
1647
|
+
lastCycleCompletedAt: row.last_cycle_completed_at,
|
|
1648
|
+
lastCycleStatus: row.last_cycle_status,
|
|
1649
|
+
intervalMinutes: row.interval_minutes,
|
|
1650
|
+
fetchOnStart: row.fetch_on_start === null ? null : row.fetch_on_start === 1
|
|
1651
|
+
};
|
|
1652
|
+
},
|
|
1653
|
+
getCoverageCorpus(input) {
|
|
1654
|
+
const sourceRow = db.prepare("SELECT last_successful_snapshot_id FROM sources WHERE id = ?").get(input.sourceId);
|
|
1655
|
+
if (!sourceRow) {
|
|
1656
|
+
throw new AiocsError(
|
|
1657
|
+
AIOCS_ERROR_CODES.sourceNotFound,
|
|
1658
|
+
`Unknown source '${input.sourceId}'`
|
|
1659
|
+
);
|
|
1660
|
+
}
|
|
1661
|
+
const snapshotId = input.snapshotId ?? sourceRow.last_successful_snapshot_id;
|
|
1662
|
+
if (!snapshotId) {
|
|
1663
|
+
throw new AiocsError(
|
|
1664
|
+
AIOCS_ERROR_CODES.snapshotNotFound,
|
|
1665
|
+
`No successful snapshot found for source '${input.sourceId}'`
|
|
1666
|
+
);
|
|
1667
|
+
}
|
|
1668
|
+
const snapshotRow = db.prepare("SELECT id FROM snapshots WHERE id = ? AND source_id = ?").get(snapshotId, input.sourceId);
|
|
1669
|
+
if (!snapshotRow) {
|
|
1670
|
+
throw new AiocsError(
|
|
1671
|
+
AIOCS_ERROR_CODES.snapshotNotFound,
|
|
1672
|
+
`Snapshot '${snapshotId}' not found for source '${input.sourceId}'`
|
|
1673
|
+
);
|
|
1674
|
+
}
|
|
1675
|
+
const rows = db.prepare(`
|
|
1676
|
+
SELECT page_title, section_title, markdown
|
|
1677
|
+
FROM chunks
|
|
1678
|
+
WHERE source_id = ?
|
|
1679
|
+
AND snapshot_id = ?
|
|
1680
|
+
ORDER BY page_id, chunk_order
|
|
1681
|
+
`).all(input.sourceId, snapshotId);
|
|
1682
|
+
return {
|
|
1683
|
+
sourceId: input.sourceId,
|
|
1684
|
+
snapshotId,
|
|
1685
|
+
entries: rows.map((row) => ({
|
|
1686
|
+
pageTitle: row.page_title,
|
|
1687
|
+
sectionTitle: row.section_title,
|
|
1688
|
+
markdown: row.markdown
|
|
1689
|
+
}))
|
|
1690
|
+
};
|
|
1691
|
+
},
|
|
1692
|
+
getChunkById(chunkId) {
|
|
1693
|
+
const row = db.prepare(`
|
|
1694
|
+
SELECT
|
|
1695
|
+
c.id AS chunk_id,
|
|
1696
|
+
c.source_id,
|
|
1697
|
+
c.snapshot_id,
|
|
1698
|
+
c.page_url,
|
|
1699
|
+
c.page_title,
|
|
1700
|
+
c.section_title,
|
|
1701
|
+
c.markdown
|
|
1702
|
+
FROM chunks c
|
|
1703
|
+
WHERE c.id = ?
|
|
1704
|
+
`).get(chunkId);
|
|
1705
|
+
if (!row) {
|
|
1706
|
+
return null;
|
|
1707
|
+
}
|
|
1708
|
+
return {
|
|
1709
|
+
chunkId: row.chunk_id,
|
|
1710
|
+
sourceId: row.source_id,
|
|
1711
|
+
snapshotId: row.snapshot_id,
|
|
1712
|
+
pageUrl: row.page_url,
|
|
1713
|
+
pageTitle: row.page_title,
|
|
1714
|
+
sectionTitle: row.section_title,
|
|
1715
|
+
markdown: row.markdown
|
|
1716
|
+
};
|
|
1717
|
+
}
|
|
1718
|
+
};
|
|
1719
|
+
}
|
|
1720
|
+
|
|
1721
|
+
// src/runtime/paths.ts
|
|
1722
|
+
import { homedir } from "os";
|
|
1723
|
+
import { join as join2 } from "path";
|
|
1724
|
+
import { mkdirSync as mkdirSync2 } from "fs";
|
|
1725
|
+
/**
 * Expand a leading `~` to the current user's home directory.
 *
 * @param {string} path - A filesystem path.
 * @returns {string} The home directory for `"~"`, a home-relative path for `"~/..."`,
 *   and the input unchanged otherwise (forms such as `"~user"` are not expanded).
 */
function expandTilde(path) {
  const isBareHome = path === "~";
  if (!isBareHome && !path.startsWith("~/")) {
    return path;
  }
  return isBareHome ? homedir() : join2(homedir(), path.slice(2));
}
|
|
1734
|
+
/**
 * Resolve one aiocs directory and make sure it exists on disk.
 *
 * Shared by the three public getters below, which previously each repeated this
 * logic and expanded the override twice.
 *
 * @param {string | undefined} override - Value of the overriding environment variable;
 *   any falsy value (unset or empty) selects the default location.
 * @param {string} leaf - Directory name under `~/.aiocs` used when there is no override.
 * @returns {string} The resolved directory path (created recursively if missing).
 */
function ensureAiocsDir(override, leaf) {
  const target = override ? expandTilde(override) : join2(homedir(), ".aiocs", leaf);
  mkdirSync2(target, { recursive: true });
  return target;
}

/**
 * Directory for aiocs data: `AIOCS_DATA_DIR` if set, else `~/.aiocs/data`.
 *
 * @param {Record<string, string | undefined>} [env=process.env]
 * @returns {string} Existing directory path.
 */
function getAiocsDataDir(env = process.env) {
  return ensureAiocsDir(env.AIOCS_DATA_DIR, "data");
}

/**
 * Directory for aiocs configuration: `AIOCS_CONFIG_DIR` if set, else `~/.aiocs/config`.
 *
 * @param {Record<string, string | undefined>} [env=process.env]
 * @returns {string} Existing directory path.
 */
function getAiocsConfigDir(env = process.env) {
  return ensureAiocsDir(env.AIOCS_CONFIG_DIR, "config");
}

/**
 * Directory for aiocs source definitions: `AIOCS_SOURCES_DIR` if set, else `~/.aiocs/sources`.
 *
 * @param {Record<string, string | undefined>} [env=process.env]
 * @returns {string} Existing directory path.
 */
function getAiocsSourcesDir(env = process.env) {
  return ensureAiocsDir(env.AIOCS_SOURCES_DIR, "sources");
}
|
|
1764
|
+
|
|
1765
|
+
// src/daemon.ts
|
|
1766
|
+
import { resolve as resolve4 } from "path";
|
|
1767
|
+
import { setTimeout as sleep2 } from "timers/promises";
|
|
1768
|
+
|
|
1769
|
+
// src/fetch/fetch-source.ts
|
|
1770
|
+
import { mkdirSync as mkdirSync3, writeFileSync } from "fs";
|
|
1771
|
+
import { join as join3 } from "path";
|
|
1772
|
+
import { setTimeout as sleep } from "timers/promises";
|
|
1773
|
+
import { chromium } from "playwright";
|
|
1774
|
+
|
|
1775
|
+
// src/fetch/extract.ts
|
|
1776
|
+
import { JSDOM } from "jsdom";
|
|
1777
|
+
import { Readability } from "@mozilla/readability";
|
|
1778
|
+
|
|
1779
|
+
// src/fetch/normalize.ts
|
|
1780
|
+
import TurndownService from "turndown";
|
|
1781
|
+
import { gfm } from "turndown-plugin-gfm";
|
|
1782
|
+
// Shared HTML-to-Markdown converter: ATX (`#`) headings, fenced code blocks,
// and the GFM plugin registered via `use`, which returns the service instance.
var turndown = new TurndownService({
  headingStyle: "atx",
  codeBlockStyle: "fenced"
}).use(gfm);
|
|
1787
|
+
/**
 * Convert an HTML fragment to Markdown with the shared `turndown` instance.
 *
 * @param {string} html - HTML to convert.
 * @returns {string} The converted Markdown with surrounding whitespace trimmed.
 */
function htmlToMarkdown(html) {
  const converted = turndown.turndown(html);
  return converted.trim();
}
|
|
1790
|
+
/**
 * Guarantee that a Markdown document opens with a level-1 heading.
 *
 * @param {string} markdown - Document body; surrounding whitespace is ignored.
 * @param {string} title - Title used when the body does not already start with `# `.
 * @returns {string} Just the heading for an empty body, the trimmed body when it already
 *   starts with `# `, otherwise the heading, a blank line, and the trimmed body.
 */
function ensureTitle(markdown, title) {
  const body = markdown.trim();
  const heading = `# ${title}`;
  if (body.length === 0) {
    return heading;
  }
  return body.startsWith("# ") ? body : `${heading}\n\n${body}`;
}
|
|
1802
|
+
/**
 * Produce the stored Markdown for a page: ensure it has a title heading and,
 * when the source spec asks for it, prefix an HTML comment naming the page URL.
 *
 * @param {{ normalize: { prependSourceComment: boolean } }} spec - Source spec.
 * @param {{ url: string, title: string, markdown: string }} page - Extracted page.
 * @returns {string} Normalized Markdown.
 */
function normalizeMarkdown(spec, page) {
  const withTitle = ensureTitle(page.markdown, page.title);
  const sourceComment = spec.normalize.prependSourceComment ? `<!-- source: ${page.url} -->\n\n` : "";
  return `${sourceComment}${withTitle}`;
}
|
|
1811
|
+
|
|
1812
|
+
// src/fetch/extract.ts
|
|
1813
|
+
// Timeout (ms) applied to clipboard hover/click interactions that do not set their own `timeoutMs`.
var CLIPBOARD_INTERACTION_DEFAULT_TIMEOUT_MS = 1e3;
|
|
1814
|
+
/**
 * Read the clipboard text from inside the page.
 *
 * @param {object} page - Page-like object exposing `evaluate`.
 * @returns {Promise<string>} Whatever `navigator.clipboard.readText()` resolves to in the page.
 */
async function readClipboard(page) {
  const clipboardText = await page.evaluate(() => navigator.clipboard.readText());
  return clipboardText;
}
|
|
1817
|
+
/**
 * Try to write text to the clipboard from inside the page.
 *
 * @param {object} page - Page-like object exposing `evaluate`.
 * @param {string} value - Text to place on the clipboard.
 * @returns {Promise<boolean>} `true` when the in-page write succeeded, `false` when it threw.
 */
async function writeClipboard(page, value) {
  // Runs in the page; any failure of the clipboard API is reported as `false`.
  const attemptWrite = async (nextValue) => {
    try {
      await navigator.clipboard.writeText(nextValue);
      return true;
    } catch {
      return false;
    }
  };
  return page.evaluate(attemptWrite, value);
}
|
|
1827
|
+
/**
 * Poll the clipboard until it holds non-empty text that differs from a baseline.
 *
 * @param {object} page - Page-like object used for clipboard reads and `waitForTimeout`.
 * @param {string} previousText - Baseline clipboard text; compared after trimming.
 * @param {number} timeoutMs - Maximum time to keep polling.
 * @returns {Promise<string>} The new clipboard text, trimmed.
 * @throws {Error} When the clipboard has not changed within `timeoutMs`.
 */
async function waitForClipboardChange(page, previousText, timeoutMs) {
  const baseline = previousText.trim();
  const deadline = Date.now() + timeoutMs;
  while (Date.now() < deadline) {
    const candidate = (await readClipboard(page)).trim();
    const hasChanged = candidate !== "" && candidate !== baseline;
    if (hasChanged) {
      return candidate;
    }
    // Poll every 100 ms.
    await page.waitForTimeout(100);
  }
  throw new Error("Timed out waiting for clipboard content to change");
}
|
|
1838
|
+
/**
 * Run a clipboard strategy's interaction steps in order, never exceeding the
 * overall deadline.
 *
 * Supported actions: `hover` and `click` (wait for the first element matching
 * `selector` to be visible, then act on it), `press` (send `key` to the page
 * keyboard), and any other action, which is treated as a plain wait.
 *
 * @param {object} page - Playwright-style page (`locator`, `keyboard`, `waitForTimeout`).
 * @param {{ interactions: object[] }} strategy - Strategy holding the interaction list.
 * @param {number} deadlineAt - Epoch milliseconds after which no further step is started.
 * @returns {Promise<void>}
 * @throws {Error} When the deadline has passed before a step starts, or when a step fails.
 */
async function performClipboardInteractions(page, strategy, deadlineAt) {
  for (const interaction of strategy.interactions) {
    const remainingMs = deadlineAt - Date.now();
    if (remainingMs <= 0) {
      throw new Error("Timed out before clipboard copy controls became ready");
    }
    const { action } = interaction;
    if (action === "press") {
      await page.keyboard.press(interaction.key);
      continue;
    }
    // Every timed step shares the same budget rule: the step's own timeout, or the
    // default when it has none, capped by the time left. The wait branch used to
    // skip the default and passed NaN to waitForTimeout when `timeoutMs` was absent.
    const stepTimeout = Math.min(
      interaction.timeoutMs ?? CLIPBOARD_INTERACTION_DEFAULT_TIMEOUT_MS,
      remainingMs
    );
    if (action === "hover" || action === "click") {
      const locator = page.locator(interaction.selector).first();
      await locator.waitFor({
        state: "visible",
        timeout: stepTimeout
      });
      if (action === "hover") {
        await locator.hover({ timeout: stepTimeout });
      } else {
        await locator.click({ timeout: stepTimeout });
      }
      continue;
    }
    await page.waitForTimeout(stepTimeout);
  }
}
|
|
1881
|
+
/**
 * Extract a page's Markdown by driving its "copy" controls and reading the clipboard.
 *
 * A unique sentinel is written to the clipboard first so that a later change can be
 * detected; if that write fails, the current clipboard text is used as the baseline.
 * The interactions are then repeated, each followed by a short clipboard poll, until
 * content appears or `strategy.clipboardTimeoutMs` elapses.
 *
 * @param {object} page - Playwright-style page.
 * @param {{ interactions: object[], clipboardTimeoutMs: number }} strategy
 * @returns {Promise<{ title: string, markdown: string }>} Title from the first `# ` line
 *   of the Markdown, falling back to the page title.
 * @throws {Error} The last recorded failure, or a timeout error, when nothing was copied.
 */
async function runClipboardStrategy(page, strategy) {
  const sentinel = `__aiocs_clipboard_marker__${Date.now()}__${Math.random().toString(36).slice(2)}__`;
  const sentinelWritten = await writeClipboard(page, sentinel).catch(() => false);
  const before = sentinelWritten ? sentinel : await readClipboard(page).catch(() => "");
  const deadlineAt = Date.now() + strategy.clipboardTimeoutMs;
  const asError = (value) => (value instanceof Error ? value : new Error(String(value)));
  let lastError = null;
  let markdown = null;
  while (Date.now() < deadlineAt && !markdown) {
    // A failed interaction is remembered but does not stop the clipboard check below.
    try {
      await performClipboardInteractions(page, strategy, deadlineAt);
    } catch (error) {
      lastError = asError(error);
    }
    const remainingMs = deadlineAt - Date.now();
    if (remainingMs <= 0) {
      break;
    }
    try {
      markdown = await waitForClipboardChange(page, before, Math.min(400, remainingMs));
    } catch (error) {
      lastError = asError(error);
    }
  }
  if (!markdown) {
    throw lastError ?? new Error("Timed out waiting for clipboard content to change");
  }
  const title = extractTitleFromMarkdown(markdown) ?? await page.title();
  return { title, markdown: markdown.trim() };
}
|
|
1912
|
+
/**
 * Extract a page by converting the inner HTML of the first element matching
 * `selector` to Markdown.
 *
 * The title is the trimmed text of the first `<h1>` inside that element. When the
 * heading cannot be read, or is empty or whitespace-only, the trimmed page title is
 * used instead (previously a blank heading produced an empty title).
 *
 * @param {object} page - Playwright-style page.
 * @param {string} selector - Selector of the content container.
 * @returns {Promise<{ title: string, markdown: string }>}
 * @throws {Error} When the container does not become visible within 10 seconds.
 */
async function runSelectorStrategy(page, selector) {
  const locator = page.locator(selector).first();
  await locator.waitFor({ state: "visible", timeout: 1e4 });
  const html = await locator.innerHTML();
  // A missing or unreadable heading is expected; fall back rather than fail.
  const heading = await locator.locator("h1").first().textContent().catch(() => null);
  const headingTitle = heading?.trim();
  const title = headingTitle ? headingTitle : (await page.title()).trim();
  return {
    title,
    markdown: htmlToMarkdown(html)
  };
}
|
|
1923
|
+
/**
 * Extract a page by parsing its full HTML with JSDOM and Readability, then
 * converting the resulting article HTML to Markdown.
 *
 * @param {object} page - Playwright-style page.
 * @returns {Promise<{ title: string, markdown: string }>} Title is the trimmed article
 *   title, or the page title when the article title is missing or blank.
 * @throws {Error} When Readability yields no article content.
 */
async function runReadabilityStrategy(page) {
  const html = await page.content();
  const pageUrl = page.url();
  const { document } = new JSDOM(html, { url: pageUrl }).window;
  const article = new Readability(document).parse();
  if (!article?.content) {
    throw new Error(`Readability could not extract content for ${pageUrl}`);
  }
  const articleTitle = article.title?.trim();
  const title = articleTitle || await page.title();
  const markdown = htmlToMarkdown(article.content);
  return { title, markdown };
}
|
|
1936
|
+
/**
 * Find the text of the first level-1 heading (`# Title`) in a Markdown document.
 *
 * Lines inside fenced code blocks (``` or ~~~) are ignored, so a shell or Python
 * comment such as `# install deps` is not mistaken for the title.
 *
 * @param {string} markdown - Markdown source.
 * @returns {string | null} The trimmed heading text, or null when there is no such heading.
 */
function extractTitleFromMarkdown(markdown) {
  // Marker of the currently open fence, or null when outside a code block.
  let openFence = null;
  for (const line of markdown.split("\n")) {
    const trimmed = line.trim();
    const fence = /^(`{3,}|~{3,})/.exec(trimmed)?.[1];
    if (openFence !== null) {
      // A fence closes on the same marker character, at least as long as the opener.
      if (fence && fence[0] === openFence[0] && fence.length >= openFence.length) {
        openFence = null;
      }
      continue;
    }
    if (fence) {
      openFence = fence;
      continue;
    }
    if (trimmed.startsWith("# ")) {
      return trimmed.slice(2).trim();
    }
  }
  return null;
}
|
|
1945
|
+
/**
 * Extract a page's title and Markdown using the configured extraction strategy.
 *
 * @param {object} page - Playwright-style page.
 * @param {{ strategy: string, selector?: string }} strategy - `clipboardButton` and
 *   `selector` have dedicated handlers; anything else uses Readability.
 * @returns {Promise<{ title: string, markdown: string }>}
 */
async function extractPage(page, strategy) {
  switch (strategy.strategy) {
    case "clipboardButton":
      return runClipboardStrategy(page, strategy);
    case "selector":
      return runSelectorStrategy(page, strategy.selector);
    default:
      return runReadabilityStrategy(page);
  }
}
|
|
1954
|
+
|
|
1955
|
+
// src/fetch/url-patterns.ts
|
|
1956
|
+
/**
 * Backslash-escape regular-expression metacharacters in a literal string.
 * `*` is intentionally not in the escaped set.
 *
 * @param {string} value - Literal text.
 * @returns {string} Text with each of `| \ { } ( ) [ ] ^ $ + ? .` prefixed by a backslash.
 */
function escapeRegex(value) {
  const metaCharacters = /[|\\{}()[\]^$+?.]/g;
  return value.replace(metaCharacters, (match) => `\\${match}`);
}
|
|
1959
|
+
/**
 * Compile a URL glob pattern into an anchored regular expression.
 *
 * `**` matches any sequence of characters, `*` matches any sequence that contains
 * neither `?` nor `#`, and every other character matches literally.
 *
 * @param {string} pattern - Glob pattern.
 * @returns {RegExp} Expression anchored at both ends.
 */
function patternToRegex(pattern) {
  const parts = ["^"];
  let index = 0;
  while (index < pattern.length) {
    const char = pattern[index];
    if (char !== "*") {
      parts.push(escapeRegex(char));
      index += 1;
      continue;
    }
    const isDoubleStar = pattern[index + 1] === "*";
    parts.push(isDoubleStar ? ".*" : "[^?#]*");
    // Consume both stars of a `**` pair.
    index += isDoubleStar ? 2 : 1;
  }
  parts.push("$");
  return new RegExp(parts.join(""));
}
|
|
1977
|
+
/**
 * Test whether a URL matches at least one glob pattern.
 *
 * @param {string} url - URL to test.
 * @param {string[]} patterns - Glob patterns compiled with `patternToRegex`.
 * @returns {boolean} `true` on the first match; `false` for no match or an empty list.
 */
function matchesPatterns(url, patterns) {
  for (const pattern of patterns) {
    if (patternToRegex(pattern).test(url)) {
      return true;
    }
  }
  return false;
}
|
|
1980
|
+
|
|
1981
|
+
// src/fetch/fetch-source.ts
|
|
1982
|
+
// Maximum number of times fetchSource runs fetchSourceOnce before recording a failed run and rethrowing.
var MAX_FETCH_ATTEMPTS = 3;
|
|
1983
|
+
// Base delay (ms) between fetch attempts; fetchSource sleeps RETRY_DELAY_MS * attempt before retrying.
var RETRY_DELAY_MS = 250;
|
|
1984
|
+
/**
 * Current time as an ISO 8601 string.
 *
 * @returns {string} `Date#toISOString()` of the current instant.
 */
function nowIso2() {
  const now = new Date();
  return now.toISOString();
}
|
|
1987
|
+
/**
 * Normalize a URL for crawl de-duplication: drop the fragment and remove one
 * trailing slash from any path other than the root `/`.
 *
 * @param {string} raw - Absolute URL.
 * @returns {string} The normalized URL string.
 * @throws {TypeError} When `raw` cannot be parsed by `URL`.
 */
function canonicalizeUrl(raw) {
  const parsed = new URL(raw);
  parsed.hash = "";
  const { pathname } = parsed;
  const hasRemovableSlash = pathname !== "/" && pathname.endsWith("/");
  if (hasRemovableSlash) {
    parsed.pathname = pathname.slice(0, -1);
  }
  return parsed.toString();
}
|
|
1995
|
+
/**
 * Compute the key under which a URL is tracked during a crawl: the canonical URL
 * with a trailing `.md` / `.markdown` path extension (any case) removed, so a raw
 * Markdown URL and its extension-less page share one key.
 *
 * @param {string} raw - Absolute URL.
 * @returns {string} The crawl key.
 */
function getCrawlKey(raw) {
  const markdownExtension = /\.(md|markdown)$/i;
  const parsed = new URL(canonicalizeUrl(raw));
  const { pathname } = parsed;
  if (markdownExtension.test(pathname)) {
    parsed.pathname = pathname.replace(markdownExtension, "");
  }
  return parsed.toString();
}
|
|
2002
|
+
/**
 * Decide whether a URL falls inside a source's crawl scope.
 *
 * @param {string} url - Absolute URL.
 * @param {string[]} allowedHosts - Hostnames the source may fetch from.
 * @param {string[]} include - Glob patterns; the URL must match at least one.
 * @param {string[]} exclude - Glob patterns; a match rejects the URL.
 * @returns {boolean} `false` for a foreign host, a `/~gitbook/` path, no include
 *   match, or an exclude match; `true` otherwise.
 */
function isAllowed(url, allowedHosts, include, exclude) {
  const { hostname, pathname } = new URL(url);
  const withinScope =
    allowedHosts.includes(hostname) &&
    !pathname.startsWith("/~gitbook/") &&
    matchesPatterns(url, include);
  if (!withinScope) {
    return false;
  }
  const isExcluded = exclude.length > 0 && matchesPatterns(url, exclude);
  return !isExcluded;
}
|
|
2018
|
+
/**
 * Turn arbitrary text into a short filename-safe slug.
 *
 * @param {string} value - Text to slugify (for example a page title).
 * @returns {string} Lower-cased text with each run of characters outside `a-z0-9`
 *   replaced by `-`, edge dashes removed, cut to 80 characters; `"page"` when empty.
 */
function slugify(value) {
  const dashed = value.toLowerCase().replace(/[^a-z0-9]+/g, "-");
  const withoutEdgeDashes = dashed.replace(/^-+|-+$/g, "");
  const slug = withoutEdgeDashes.slice(0, 80);
  return slug || "page";
}
|
|
2021
|
+
/**
 * Find the text of the first level-1 heading (`# Title`) in a Markdown document.
 *
 * Lines inside fenced code blocks (``` or ~~~) are ignored, so a code comment such
 * as `# install deps` in a raw Markdown file is not mistaken for the title.
 *
 * @param {string} markdown - Markdown source.
 * @returns {string | null} The trimmed heading text, or null when there is no such heading.
 */
function extractTitleFromMarkdown2(markdown) {
  // Marker of the currently open fence, or null when outside a code block.
  let openFence = null;
  for (const line of markdown.split("\n")) {
    const trimmed = line.trim();
    const fence = /^(`{3,}|~{3,})/.exec(trimmed)?.[1];
    if (openFence !== null) {
      // A fence closes on the same marker character, at least as long as the opener.
      if (fence && fence[0] === openFence[0] && fence.length >= openFence.length) {
        openFence = null;
      }
      continue;
    }
    if (fence) {
      openFence = fence;
      continue;
    }
    if (trimmed.startsWith("# ")) {
      return trimmed.slice(2).trim();
    }
  }
  return null;
}
|
|
2030
|
+
/**
 * Derive a fallback page title from the last path segment of a URL.
 *
 * @param {string} url - Absolute URL.
 * @returns {string} The last non-empty path segment without a `.md` / `.markdown`
 *   extension and with runs of `-` / `_` turned into spaces; `"page"` when nothing remains.
 */
function deriveTitleFromUrl(url) {
  const segments = new URL(url).pathname.split("/").filter(Boolean);
  const lastSegment = segments.pop() ?? "page";
  const withoutExtension = lastSegment.replace(/\.(md|markdown)$/i, "");
  const spaced = withoutExtension.replace(/[-_]+/g, " ").trim();
  return spaced || "page";
}
|
|
2035
|
+
/**
 * Decide whether a navigation response carries raw Markdown instead of an HTML page.
 *
 * @param {string} url - The requested URL.
 * @param {object | null} response - Response exposing `headers()`, or null.
 * @returns {boolean} `true` for a `text/markdown` or `text/x-markdown` content type, or
 *   for `text/plain` when the URL path ends in `.md` / `.markdown`; `false` otherwise,
 *   including when there is no response.
 */
function isRawMarkdownResponse(url, response) {
  if (!response) {
    return false;
  }
  const contentType = response.headers()["content-type"]?.toLowerCase() ?? "";
  const declaresMarkdown = ["text/markdown", "text/x-markdown"].some((type) => contentType.includes(type));
  if (declaresMarkdown) {
    return true;
  }
  if (!contentType.includes("text/plain")) {
    return false;
  }
  // Plain text only counts as Markdown when the path says so.
  return /\.(md|markdown)$/i.test(new URL(url).pathname);
}
|
|
2045
|
+
/**
 * Build a page record straight from a raw Markdown response body.
 *
 * @param {string} url - URL the Markdown was fetched from.
 * @param {object} response - Response exposing an async `text()`.
 * @returns {Promise<{ url: string, title: string, markdown: string }>} Title is the first
 *   `# ` heading of the body, or a title derived from the URL when there is none.
 */
async function extractRawMarkdownPage(url, response) {
  const body = await response.text();
  const markdown = body.trim();
  const title = extractTitleFromMarkdown2(markdown) ?? deriveTitleFromUrl(url);
  return { url, title, markdown };
}
|
|
2053
|
+
/**
 * Write each page of a snapshot as a Markdown file under
 * `<dataDir>/sources/<sourceId>/snapshots/<snapshotId>/pages`.
 *
 * Files are named `<NNN>-<slug>.md`, where NNN is the 1-based page position padded
 * to three digits and the slug comes from the page title.
 *
 * @param {{ dataDir: string, sourceId: string }} input
 * @param {string} snapshotId - Snapshot the pages belong to.
 * @param {{ title: string, markdown: string }[]} pages - Pages in output order.
 */
function persistSnapshotPages(input, snapshotId, pages) {
  const pagesDir = join3(input.dataDir, "sources", input.sourceId, "snapshots", snapshotId, "pages");
  mkdirSync3(pagesDir, { recursive: true });
  for (const [index, page] of pages.entries()) {
    const ordinal = String(index + 1).padStart(3, "0");
    const filename = `${ordinal}-${slugify(page.title)}.md`;
    writeFileSync(join3(pagesDir, filename), page.markdown, "utf8");
  }
}
|
|
2061
|
+
/**
 * Read a required environment variable used for authenticated source access.
 *
 * @param {string} name - Environment variable name.
 * @param {Record<string, string | undefined>} env - Environment to read from.
 * @returns {string} The variable's value.
 * @throws {AiocsError} `authEnvMissing` (with `envVar` in the details) when the
 *   variable is unset or empty.
 */
function resolveEnvValue(name, env) {
  const value = env[name];
  if (value) {
    return value;
  }
  throw new AiocsError(
    AIOCS_ERROR_CODES.authEnvMissing,
    `Missing required environment variable '${name}' for authenticated source access`,
    { envVar: name }
  );
}
|
|
2074
|
+
/**
 * Resolve a source spec's auth section into concrete header and cookie values,
 * reading every secret from the environment.
 *
 * @param {object} spec - Source spec; `spec.auth.headers` and `spec.auth.cookies` are optional.
 * @param {Record<string, string | undefined>} env - Environment holding the secret values.
 * @returns {{ scopedHeaders: object[], cookies: object[] }} Headers default their `hosts`
 *   to `spec.allowedHosts`; optional fields are only present when set in the spec.
 * @throws {AiocsError} When a referenced environment variable is missing.
 */
function resolveSourceAuth(spec, env) {
  const headerSpecs = spec.auth?.headers ?? [];
  const cookieSpecs = spec.auth?.cookies ?? [];
  const scopedHeaders = headerSpecs.map((header) => {
    const resolved = {
      name: header.name,
      value: resolveEnvValue(header.valueFromEnv, env),
      hosts: header.hosts ?? spec.allowedHosts
    };
    if (header.include) {
      resolved.include = header.include;
    }
    return resolved;
  });
  const cookies = cookieSpecs.map((cookie) => {
    const resolved = {
      name: cookie.name,
      value: resolveEnvValue(cookie.valueFromEnv, env),
      domain: cookie.domain,
      path: cookie.path
    };
    if (typeof cookie.secure === "boolean") {
      resolved.secure = cookie.secure;
    }
    if (typeof cookie.httpOnly === "boolean") {
      resolved.httpOnly = cookie.httpOnly;
    }
    if (cookie.sameSite) {
      resolved.sameSite = cookie.sameSite;
    }
    return resolved;
  });
  return { scopedHeaders, cookies };
}
|
|
2095
|
+
/**
 * Add the auth headers that apply to one outgoing request.
 *
 * A scoped header applies when the request's hostname is listed in its `hosts` and,
 * if it has `include` patterns, the request URL matches one of them.
 *
 * @param {string} requestUrl - URL of the outgoing request.
 * @param {Record<string, string>} headers - Existing request headers (never mutated).
 * @param {object[]} scopedHeaders - Resolved auth headers.
 * @returns {Record<string, string>} The original object when there are no scoped headers,
 *   otherwise a copy with every applicable header set (later entries win).
 */
function applyScopedAuthHeaders(requestUrl, headers, scopedHeaders) {
  if (scopedHeaders.length === 0) {
    return headers;
  }
  const { hostname } = new URL(requestUrl);
  const appliesToRequest = (header) =>
    header.hosts.includes(hostname) && (!header.include || matchesPatterns(requestUrl, header.include));
  return scopedHeaders.filter(appliesToRequest).reduce(
    (merged, header) => {
      merged[header.name] = header.value;
      return merged;
    },
    { ...headers }
  );
}
|
|
2112
|
+
/**
 * Launch a headless Chromium session configured for one source: auth headers
 * injected per request, auth cookies installed, clipboard permissions granted
 * for every start-URL origin, and a page with a 15 s default timeout.
 *
 * The browser is closed again if any setup step after launch fails, and the returned
 * `close()` always closes the browser even when closing the context throws, so no
 * Chromium process is left behind.
 *
 * @param {object} spec - Source spec (`startUrls`, `allowedHosts`, optional `auth`).
 * @param {Record<string, string | undefined>} env - Environment used to resolve auth secrets.
 * @returns {Promise<{ page: object, close: () => Promise<void> }>}
 * @throws {AiocsError} When an auth environment variable is missing (before any launch).
 */
async function createSourceContext(spec, env) {
  const { scopedHeaders, cookies } = resolveSourceAuth(spec, env);
  const browser = await chromium.launch({ headless: true });
  try {
    const context = await browser.newContext({
      viewport: {
        width: 1440,
        height: 1200
      }
    });
    if (scopedHeaders.length > 0) {
      // Intercept every request so auth headers are only attached where they are scoped.
      await context.route("**/*", async (route) => {
        await route.continue({
          headers: applyScopedAuthHeaders(route.request().url(), route.request().headers(), scopedHeaders)
        });
      });
    }
    if (cookies.length > 0) {
      await context.addCookies(cookies);
    }
    const uniqueOrigins = [...new Set(spec.startUrls.map((url) => new URL(url).origin))];
    for (const origin of uniqueOrigins) {
      await context.grantPermissions(["clipboard-read", "clipboard-write"], { origin });
    }
    const page = await context.newPage();
    page.setDefaultTimeout(15e3);
    return {
      page,
      async close() {
        try {
          await context.close();
        } finally {
          await browser.close();
        }
      }
    };
  } catch (error) {
    // Best-effort cleanup: a failure to close must not hide the original setup error.
    await browser.close().catch(() => {});
    throw error;
  }
}
|
|
2145
|
+
/**
 * Collect the `href` of every `<a href>` element on the current page.
 *
 * @param {object} page - Playwright-style page.
 * @returns {Promise<string[]>} Non-empty string hrefs in document order (duplicates kept).
 */
async function discoverLinks(page) {
  const anchors = page.locator("a[href]");
  // The callback is evaluated against the matched elements, so it must be self-contained.
  return anchors.evaluateAll((elements) => {
    const hrefs = [];
    for (const element of elements) {
      const href = element.href;
      if (typeof href === "string" && href.length > 0) {
        hrefs.push(href);
      }
    }
    return hrefs;
  });
}
|
|
2150
|
+
/**
 * Turn the page that was just navigated to into a normalized page record.
 *
 * Raw Markdown responses are read directly from the response body. Anything else
 * is extracted from the rendered page with the source's extraction strategy after
 * a 150 ms pause.
 *
 * @param {object} spec - Source spec (`extract`, `normalize`).
 * @param {object} page - Playwright-style page positioned on `url`.
 * @param {string} url - Canonical URL of the page.
 * @param {object | null} response - Navigation response, if any.
 * @returns {Promise<{ url: string, title: string, markdown: string, markdownLength: number }>}
 *   `markdownLength` is the length of the normalized Markdown after trimming.
 */
async function extractFetchedPage(spec, page, url, response) {
  let extracted;
  if (response && isRawMarkdownResponse(url, response)) {
    extracted = await extractRawMarkdownPage(url, response);
  } else {
    await page.waitForTimeout(150);
    extracted = await extractPage(page, spec.extract);
  }
  const { title } = extracted;
  const markdown = normalizeMarkdown(spec, { title, url, markdown: extracted.markdown });
  return {
    url,
    title,
    markdown,
    markdownLength: markdown.trim().length
  };
}
|
|
2174
|
+
/**
 * Crawl one source a single time and record the result as a snapshot.
 *
 * Starting from the spec's start URLs, pages are visited breadth-first until the
 * queue is empty or `discovery.maxPages` distinct pages have been collected. Pages
 * are keyed by crawl key, so a raw Markdown URL (`/page.md`) and its rendered
 * counterpart (`/page`) count as one page. The rendered page is preferred: a raw URL
 * is deferred until the rendered URL has been tried, and is only fetched when the
 * rendered URL is out of scope, answers with HTTP >= 400, or fails extraction.
 *
 * @param {object} input - `{ catalog, sourceId, dataDir, env? }`; `env` defaults to `process.env`.
 * @returns {Promise<{ snapshotId: string, pageCount: number, reused: boolean }>}
 * @throws {AiocsError} `sourceNotFound` for an unknown source; `noPagesFetched` when the
 *   crawl produced no pages.
 * @throws {Error} Navigation or extraction failures that have no raw Markdown fallback.
 */
async function fetchSourceOnce(input) {
  const spec = input.catalog.getSourceSpec(input.sourceId);
  if (!spec) {
    throw new AiocsError(
      AIOCS_ERROR_CODES.sourceNotFound,
      `Unknown source '${input.sourceId}'`
    );
  }
  const session = await createSourceContext(spec, input.env ?? process.env);
  const { page } = session;
  const queue = spec.startUrls.map((url) => canonicalizeUrl(url));
  // URLs that have already been taken off the queue and processed.
  const seen = new Set();
  // Crawl keys in first-fetched order; fixes the page order of the snapshot.
  const pageOrder = [];
  const pagesByCrawlKey = new Map();
  // crawlKey -> raw Markdown URL that was deferred in favor of the rendered page.
  const pendingRawFallbacks = new Map();
  const inScope = (candidate) =>
    isAllowed(candidate, spec.allowedHosts, spec.discovery.include, spec.discovery.exclude);
  // Put the deferred raw Markdown URL back at the front of the queue when the
  // rendered page could not be used. Returns whether a fallback was queued.
  const requeueRawFallback = (crawlKey, isRawMarkdownUrl) => {
    const fallbackUrl = pendingRawFallbacks.get(crawlKey);
    if (isRawMarkdownUrl || !fallbackUrl || seen.has(fallbackUrl)) {
      return false;
    }
    queue.unshift(fallbackUrl);
    return true;
  };
  try {
    while (queue.length > 0 && pagesByCrawlKey.size < spec.discovery.maxPages) {
      const next = queue.shift();
      if (!next) {
        break;
      }
      const url = canonicalizeUrl(next);
      const crawlKey = getCrawlKey(url);
      const isRawMarkdownUrl = crawlKey !== url;
      const existing = pagesByCrawlKey.get(crawlKey);
      if (isRawMarkdownUrl) {
        if (existing && !existing.isRawMarkdown) {
          // The rendered page is already stored; the raw variant adds nothing.
          continue;
        }
        if (!seen.has(crawlKey) && !existing) {
          // Try the rendered page first and remember the raw URL as a fallback.
          pendingRawFallbacks.set(crawlKey, url);
          const canonicalQueued = queue.some((queuedUrl) => canonicalizeUrl(queuedUrl) === crawlKey);
          if (!canonicalQueued) {
            queue.unshift(crawlKey);
          }
          continue;
        }
      }
      if (seen.has(url)) {
        continue;
      }
      seen.add(url);
      if (!inScope(url)) {
        // Fix: an out-of-scope rendered URL used to discard its deferred raw
        // Markdown URL, so in-scope `.md` pages were never fetched.
        requeueRawFallback(crawlKey, isRawMarkdownUrl);
        continue;
      }
      const response = await page.goto(url, { waitUntil: "domcontentloaded" });
      if (response && response.status() >= 400) {
        requeueRawFallback(crawlKey, isRawMarkdownUrl);
        continue;
      }
      let fetchedPage;
      try {
        fetchedPage = await extractFetchedPage(spec, page, url, response);
      } catch (error) {
        if (requeueRawFallback(crawlKey, isRawMarkdownUrl)) {
          continue;
        }
        throw error;
      }
      const isRawMarkdown = response !== null && isRawMarkdownResponse(url, response);
      if (!existing) {
        pageOrder.push(crawlKey);
        pagesByCrawlKey.set(crawlKey, { page: fetchedPage, isRawMarkdown });
      } else if (existing.isRawMarkdown && !isRawMarkdown) {
        // Upgrade a stored raw Markdown page to its rendered version.
        pagesByCrawlKey.set(crawlKey, { page: fetchedPage, isRawMarkdown });
      }
      if (!isRawMarkdown) {
        pendingRawFallbacks.delete(crawlKey);
        // Only rendered pages are scanned for further links.
        const links = await discoverLinks(page);
        for (const link of links) {
          const canonical = canonicalizeUrl(link);
          if (!seen.has(canonical) && inScope(canonical)) {
            queue.push(canonical);
          }
        }
      }
    }
    const pages = pageOrder.map((crawlKey) => pagesByCrawlKey.get(crawlKey)?.page).filter((pageEntry) => pageEntry !== void 0);
    if (pages.length === 0) {
      throw new AiocsError(
        AIOCS_ERROR_CODES.noPagesFetched,
        `No pages fetched for source '${input.sourceId}'`
      );
    }
    const result = input.catalog.recordSuccessfulSnapshot({
      sourceId: input.sourceId,
      pages
    });
    if (!result.reused) {
      // Page files are only written for a newly created snapshot.
      persistSnapshotPages(input, result.snapshotId, pages);
    }
    return {
      snapshotId: result.snapshotId,
      pageCount: pages.length,
      reused: result.reused
    };
  } finally {
    await session.close();
  }
}
|
|
2281
|
+
/**
 * Fetch a source with retries.
 *
 * `fetchSourceOnce` is attempted up to `MAX_FETCH_ATTEMPTS` times, sleeping
 * `RETRY_DELAY_MS * attempt` between attempts. When the final attempt fails, the
 * failure is recorded in the catalog and the error is rethrown.
 *
 * @param {object} input - Passed through to `fetchSourceOnce`; needs `catalog` and `sourceId`.
 * @returns {Promise<object>} The result of the first successful attempt.
 * @throws The error of the final failed attempt.
 */
async function fetchSource(input) {
  let attempt = 0;
  let lastError;
  while (attempt < MAX_FETCH_ATTEMPTS) {
    attempt += 1;
    try {
      return await fetchSourceOnce(input);
    } catch (error) {
      lastError = error;
      const isFinalAttempt = attempt >= MAX_FETCH_ATTEMPTS;
      if (isFinalAttempt) {
        const errorMessage = error instanceof Error ? error.message : String(error);
        input.catalog.recordFailedFetchRun({ sourceId: input.sourceId, errorMessage });
        throw error;
      }
      // Linear backoff before the next attempt.
      await sleep(RETRY_DELAY_MS * attempt);
    }
  }
  // Only reachable when no attempt ran at all (MAX_FETCH_ATTEMPTS < 1).
  throw lastError instanceof Error ? lastError : new Error(String(lastError));
}
|
|
2300
|
+
/**
 * Run every canary check of a source once and record the outcome.
 *
 * Each check navigates the session page to the check URL, extracts the page
 * and validates the optional expected title, the optional expected text and
 * the minimum markdown length. A failing check is captured as a `"fail"`
 * entry instead of aborting the remaining checks. The session is always
 * closed, and the aggregated result is stored via
 * `input.catalog.recordCanaryRun` before the function returns or throws.
 *
 * @param {object} input - `catalog`, `sourceId` and optional `env`
 *   (falls back to `process.env`).
 * @returns {Promise<object>} The aggregated result when every check passed.
 * @throws {AiocsError} `sourceNotFound` when the catalog has no spec for the
 *   source; `canaryFailed` (with the result as details) when any check failed.
 */
async function runSourceCanaryOnce(input) {
  const spec = input.catalog.getSourceSpec(input.sourceId);
  if (!spec) {
    throw new AiocsError(
      AIOCS_ERROR_CODES.sourceNotFound,
      `Unknown source '${input.sourceId}'`
    );
  }
  const canary = resolveSourceCanary(spec);
  const session = await createSourceContext(spec, input.env ?? process.env);
  const { page } = session;

  // Evaluates a single check and resolves with its "pass" record; any
  // violation rejects and is converted into a "fail" record by the caller.
  const evaluateCheck = async (check, url) => {
    const inScope = isAllowed(url, spec.allowedHosts, spec.discovery.include, spec.discovery.exclude);
    if (!inScope) {
      throw new AiocsError(
        AIOCS_ERROR_CODES.invalidArgument,
        `Canary URL '${url}' is outside the allowed source scope`
      );
    }
    const response = await page.goto(url, { waitUntil: "domcontentloaded" });
    if (response && response.status() >= 400) {
      throw new Error(`Canary request failed with HTTP ${response.status()}`);
    }
    const extracted = await extractFetchedPage(spec, page, url, response);
    if (check.expectedTitle && !extracted.title.includes(check.expectedTitle)) {
      throw new Error(`Expected title to include '${check.expectedTitle}'`);
    }
    if (check.expectedText && !extracted.markdown.includes(check.expectedText)) {
      throw new Error(`Expected markdown to include '${check.expectedText}'`);
    }
    if (extracted.markdownLength < check.minMarkdownLength) {
      throw new Error(
        `Expected markdown length to be at least ${check.minMarkdownLength}, received ${extracted.markdownLength}`
      );
    }
    return {
      url,
      status: "pass",
      title: extracted.title,
      markdownLength: extracted.markdownLength
    };
  };

  const checks = [];
  try {
    for (const check of canary.checks) {
      // Canonicalization happens outside the per-check error capture, so a
      // failure here aborts the whole run (the session is still closed).
      const url = canonicalizeUrl(check.url);
      const outcome = await evaluateCheck(check, url).catch((error) => ({
        url,
        status: "fail",
        errorMessage: error instanceof Error ? error.message : String(error)
      }));
      checks.push(outcome);
    }
  } finally {
    await session.close();
  }

  const passCount = checks.filter((entry) => entry.status === "pass").length;
  const failCount = checks.filter((entry) => entry.status === "fail").length;
  const result = {
    sourceId: input.sourceId,
    status: passCount === checks.length ? "pass" : "fail",
    checkedAt: nowIso2(),
    summary: {
      checkCount: checks.length,
      passCount,
      failCount
    },
    checks
  };
  input.catalog.recordCanaryRun({
    sourceId: input.sourceId,
    status: result.status,
    checkedAt: result.checkedAt,
    details: result
  });
  if (result.status === "fail") {
    throw new AiocsError(
      AIOCS_ERROR_CODES.canaryFailed,
      `Canary failed for source '${input.sourceId}'`,
      result
    );
  }
  return result;
}
|
|
2381
|
+
/**
 * Run a source canary, retrying failed attempts.
 *
 * Calls `runSourceCanaryOnce(input)` up to `MAX_FETCH_ATTEMPTS` times and
 * sleeps `RETRY_DELAY_MS * attempt` between attempts. If the final attempt
 * fails with a `canaryFailed` AiocsError, the error's details are returned
 * instead of thrown; any other final error is rethrown.
 *
 * @param {object} input - Forwarded unchanged to `runSourceCanaryOnce`.
 * @returns {Promise<*>} The canary result, or the `details` carried by the
 *   final `canaryFailed` error.
 * @throws The last error when it is not a `canaryFailed` AiocsError.
 */
async function runSourceCanary(input) {
  let failure;
  let attempt = 0;
  while (attempt < MAX_FETCH_ATTEMPTS) {
    attempt += 1;
    try {
      return await runSourceCanaryOnce(input);
    } catch (caught) {
      failure = caught;
      if (attempt < MAX_FETCH_ATTEMPTS) {
        await sleep(RETRY_DELAY_MS * attempt);
        continue;
      }
      const isCanaryFailure = caught instanceof AiocsError && caught.code === AIOCS_ERROR_CODES.canaryFailed;
      if (isCanaryFailure) {
        // A failed canary is a reportable result, not an exceptional state.
        return caught.details;
      }
      throw caught;
    }
  }
  // Only reachable when the loop body never ran (no attempts configured).
  throw failure instanceof Error ? failure : new Error(String(failure));
}
|
|
2399
|
+
|
|
2400
|
+
// src/hybrid/ollama.ts
|
|
2401
|
+
/**
 * Build the key that identifies an embedding model: the provider name and the
 * Ollama model name joined by a colon.
 *
 * @param {object} config - Reads `embeddingProvider` and `ollamaEmbeddingModel`.
 * @returns {string} `"<provider>:<model>"`.
 */
function getEmbeddingModelKey(config) {
  const { embeddingProvider, ollamaEmbeddingModel } = config;
  return [embeddingProvider, ollamaEmbeddingModel].map(String).join(":");
}
|
|
2404
|
+
/**
 * Remove a single trailing slash from a base URL so paths can be appended
 * with a leading slash.
 *
 * @param {string} baseUrl
 * @returns {string} The URL without its last `/`, or unchanged if it has none.
 */
function normalizeBaseUrl(baseUrl) {
  if (!baseUrl.endsWith("/")) {
    return baseUrl;
  }
  return baseUrl.slice(0, baseUrl.length - 1);
}
|
|
2407
|
+
/**
 * Read a response body and parse it as JSON.
 *
 * @param {{ text(): Promise<string>, status: number }} response
 * @returns {Promise<*>} The parsed value, or `{}` when the body is empty.
 * @throws {AiocsError} `embeddingProviderUnavailable` when the body is not
 *   valid JSON.
 */
async function parseJsonResponse(response) {
  const raw = await response.text();
  if (raw) {
    try {
      return JSON.parse(raw);
    } catch {
      throw new AiocsError(
        AIOCS_ERROR_CODES.embeddingProviderUnavailable,
        `Ollama returned a non-JSON response with status ${response.status}`
      );
    }
  }
  // An empty body is treated as an empty payload rather than an error.
  return {};
}
|
|
2421
|
+
/**
 * Request embeddings for a list of texts from the Ollama `/api/embed` endpoint.
 *
 * Every failure is reported as an `embeddingProviderUnavailable` AiocsError so
 * callers can rely on the error code. This includes failures while reading the
 * response body (for example when the request timeout aborts mid-stream) and
 * payloads that are not objects, which previously escaped as raw errors.
 *
 * @param {object} config - Reads `ollamaBaseUrl`, `ollamaEmbeddingModel` and
 *   `ollamaTimeoutMs`.
 * @param {string[]} texts - Inputs to embed; an empty list short-circuits
 *   without a request.
 * @returns {Promise<number[][]>} One vector per input, in the order returned
 *   by the provider.
 * @throws {AiocsError} `embeddingProviderUnavailable` on any transport,
 *   status, parsing or shape problem.
 */
async function embedTexts(config, texts) {
  if (texts.length === 0) {
    return [];
  }
  const unavailable = (message, details) => new AiocsError(
    AIOCS_ERROR_CODES.embeddingProviderUnavailable,
    message,
    details
  );
  const describe = (error) => error instanceof Error ? error.message : String(error);

  const response = await fetch(`${normalizeBaseUrl(config.ollamaBaseUrl)}/api/embed`, {
    method: "POST",
    headers: {
      "content-type": "application/json"
    },
    signal: AbortSignal.timeout(config.ollamaTimeoutMs),
    body: JSON.stringify({
      model: config.ollamaEmbeddingModel,
      input: texts
    })
  }).catch((error) => {
    throw unavailable(`Unable to reach Ollama at ${config.ollamaBaseUrl}: ${describe(error)}`);
  });

  if (!response.ok) {
    // The body is only diagnostic detail; failing to read it must not hide
    // the status error.
    const body = await response.text().catch(() => "");
    throw unavailable(
      `Ollama embed request failed with status ${response.status}`,
      body ? { body } : void 0
    );
  }

  let payload;
  try {
    payload = await parseJsonResponse(response);
  } catch (error) {
    if (error instanceof AiocsError) {
      throw error;
    }
    throw unavailable(`Unable to read Ollama embed response: ${describe(error)}`);
  }

  // `payload` may legally be `null` or a primitive (valid JSON), so guard
  // before touching `.embeddings`.
  const rawEmbeddings = payload !== null && typeof payload === "object" ? payload.embeddings : void 0;
  if (!Array.isArray(rawEmbeddings)) {
    throw unavailable("Ollama embed response did not include an embeddings array");
  }
  const embeddings = rawEmbeddings.map((entry) => {
    if (!Array.isArray(entry) || !entry.every((value) => typeof value === "number")) {
      throw unavailable("Ollama embed response contained an invalid embedding vector");
    }
    return entry;
  });
  if (embeddings.length !== texts.length) {
    throw unavailable(`Ollama returned ${embeddings.length} embeddings for ${texts.length} inputs`);
  }
  return embeddings;
}
|
|
2473
|
+
/**
 * Query the Ollama `/api/tags` endpoint and report whether the configured
 * embedding model is available.
 *
 * A model counts as present when a listed name equals the configured model or
 * starts with `"<model>:"` (a tagged variant). The listing is parsed
 * defensively: a missing or non-array `models` value and null entries are
 * treated as "no models" instead of raising a TypeError, and body-read
 * failures are reported with the provider error code.
 *
 * @param {object} config - Reads `ollamaBaseUrl`, `ollamaEmbeddingModel` and
 *   `ollamaTimeoutMs`.
 * @returns {Promise<{ok: boolean, modelPresent: boolean, baseUrl: string,
 *   model: string, availableModels: string[]}>}
 * @throws {AiocsError} `embeddingProviderUnavailable` when Ollama cannot be
 *   reached, answers with a non-OK status, or returns an unreadable body.
 */
async function getEmbeddingProviderStatus(config) {
  const unavailable = (message) => new AiocsError(
    AIOCS_ERROR_CODES.embeddingProviderUnavailable,
    message
  );
  const describe = (error) => error instanceof Error ? error.message : String(error);

  const response = await fetch(`${normalizeBaseUrl(config.ollamaBaseUrl)}/api/tags`, {
    signal: AbortSignal.timeout(config.ollamaTimeoutMs)
  }).catch((error) => {
    throw unavailable(`Unable to reach Ollama at ${config.ollamaBaseUrl}: ${describe(error)}`);
  });
  if (!response.ok) {
    throw unavailable(`Ollama tags request failed with status ${response.status}`);
  }

  let payload;
  try {
    payload = await parseJsonResponse(response);
  } catch (error) {
    if (error instanceof AiocsError) {
      throw error;
    }
    throw unavailable(`Unable to read Ollama tags response: ${describe(error)}`);
  }

  const listedModels = Array.isArray(payload?.models) ? payload.models : [];
  const availableModels = listedModels
    .map((entry) => entry?.name ?? entry?.model)
    .filter((name) => typeof name === "string" && name.length > 0);
  const taggedPrefix = `${config.ollamaEmbeddingModel}:`;
  const modelPresent = availableModels.some(
    (name) => name === config.ollamaEmbeddingModel || name.startsWith(taggedPrefix)
  );
  return {
    ok: modelPresent,
    modelPresent,
    baseUrl: config.ollamaBaseUrl,
    model: config.ollamaEmbeddingModel,
    availableModels
  };
}
|
|
2501
|
+
|
|
2502
|
+
// src/hybrid/qdrant.ts
|
|
2503
|
+
import { QdrantClient } from "@qdrant/js-client-rest";
|
|
2504
|
+
/**
 * Thin wrapper around a Qdrant collection holding one point per docs chunk.
 *
 * Every client failure is rethrown as a `vectorStoreUnavailable` AiocsError
 * whose message names the attempted action and the collection. `getHealth`
 * is the exception: it reports failures in its return value.
 */
var AiocsVectorStore = class {
  client;
  collectionName;

  /**
   * @param {object} config - Reads `qdrantUrl`, `qdrantTimeoutMs` and
   *   `qdrantCollection`.
   */
  constructor(config) {
    const clientOptions = {
      url: config.qdrantUrl,
      timeout: config.qdrantTimeoutMs,
      checkCompatibility: false
    };
    this.client = new QdrantClient(clientOptions);
    this.collectionName = config.qdrantCollection;
  }

  /**
   * Build a rejection handler that rethrows as a `vectorStoreUnavailable`
   * error reading "Unable to <action> Qdrant collection '<name>'<suffix>: <reason>".
   */
  #unavailable(action, suffix = "") {
    return (error) => {
      const reason = error instanceof Error ? error.message : String(error);
      throw new AiocsError(
        AIOCS_ERROR_CODES.vectorStoreUnavailable,
        `Unable to ${action} Qdrant collection '${this.collectionName}'${suffix}: ${reason}`
      );
    };
  }

  /** Ask the client whether the collection exists, accepting a bare boolean or an `{ exists }` reply. */
  async #collectionExists() {
    const reply = await this.client.collectionExists(this.collectionName).catch(this.#unavailable("reach"));
    if (typeof reply === "boolean") {
      return reply;
    }
    return Boolean(reply.exists);
  }

  /** Map a chunk id to its point id (currently the identity). */
  pointIdForChunk(chunkId) {
    return chunkId;
  }

  /**
   * Make sure the collection exists with cosine distance and the given vector
   * size. A missing collection is created; an existing one whose size is
   * unknown or different is recreated.
   *
   * @param {number} dimension - Required vector size.
   */
  async ensureCollection(dimension) {
    const exists = await this.#collectionExists();
    if (!exists) {
      await this.client
        .createCollection(this.collectionName, { vectors: { size: dimension, distance: "Cosine" } })
        .catch(this.#unavailable("create"));
      return;
    }
    const collection = await this.client.getCollection(this.collectionName).catch(this.#unavailable("inspect"));
    const params = collection.config?.params?.vectors;
    const hasSize = typeof params === "object" && params && "size" in params;
    const currentSize = hasSize ? Number(params.size) : null;
    const sizeMatches = Boolean(currentSize) && currentSize === dimension;
    if (sizeMatches) {
      return;
    }
    await this.client
      .recreateCollection(this.collectionName, { vectors: { size: dimension, distance: "Cosine" } })
      .catch(this.#unavailable("recreate", ` for dimension ${dimension}`));
  }

  /**
   * Upsert chunk vectors and wait for the write to complete.
   *
   * @param {{ modelKey: string, points: object[] }} input - Each point carries
   *   `chunkId`, `vector`, `sourceId`, `snapshotId`, `pageUrl`, `pageTitle`
   *   and `sectionTitle`. An empty list is a no-op.
   */
  async upsertChunks(input) {
    if (input.points.length === 0) {
      return;
    }
    const toQdrantPoint = (point) => {
      const { chunkId, vector, sourceId, snapshotId, pageUrl, pageTitle, sectionTitle } = point;
      return {
        id: this.pointIdForChunk(chunkId),
        vector,
        payload: {
          chunkId,
          sourceId,
          snapshotId,
          pageUrl,
          pageTitle,
          sectionTitle,
          modelKey: input.modelKey
        }
      };
    };
    const points = input.points.map(toQdrantPoint);
    await this.client
      .upsert(this.collectionName, { wait: true, points })
      .catch(this.#unavailable("upsert vectors into"));
  }

  /**
   * Delete the points of the given chunk ids and wait for completion.
   *
   * @param {Array} chunkIds - An empty list is a no-op.
   */
  async deleteChunkIds(chunkIds) {
    if (chunkIds.length === 0) {
      return;
    }
    const points = chunkIds.map((chunkId) => this.pointIdForChunk(chunkId));
    await this.client
      .delete(this.collectionName, { wait: true, points })
      .catch(this.#unavailable("delete vectors from"));
  }

  /** Delete the whole collection if it exists. */
  async clearCollection() {
    const exists = await this.#collectionExists();
    if (!exists) {
      return;
    }
    await this.client.deleteCollection(this.collectionName).catch(this.#unavailable("delete"));
  }

  /**
   * Vector search restricted to the given snapshots and model key, and to the
   * given sources when `sourceIds` is non-empty.
   *
   * @param {object} input - Reads `vector`, `limit`, optional numeric
   *   `offset`, `snapshotIds`, `modelKey` and optional `sourceIds`.
   * @returns {Promise<Array<{chunkId: number, score: number}>>} Hits whose
   *   chunk id resolves to an integer; other hits are dropped. Empty when
   *   `snapshotIds` is empty.
   */
  async search(input) {
    if (input.snapshotIds.length === 0) {
      return [];
    }
    const must = [
      { key: "snapshotId", match: { any: input.snapshotIds } },
      { key: "modelKey", match: { value: input.modelKey } }
    ];
    if (input.sourceIds && input.sourceIds.length > 0) {
      must.push({ key: "sourceId", match: { any: input.sourceIds } });
    }
    const request = { vector: input.vector, limit: input.limit };
    if (typeof input.offset === "number") {
      request.offset = input.offset;
    }
    request.with_payload = ["chunkId", "snapshotId", "sourceId", "modelKey"];
    request.filter = { must };

    const results = await this.client.search(this.collectionName, request).catch(this.#unavailable("search"));

    const hits = [];
    for (const result of results) {
      const payload = result.payload ?? {};
      // Prefer the payload's chunk id, then fall back to the point id.
      let chunkId;
      if (typeof payload.chunkId === "number") {
        chunkId = payload.chunkId;
      } else if (typeof result.id === "number") {
        chunkId = result.id;
      } else {
        chunkId = Number(result.id);
      }
      if (Number.isInteger(chunkId)) {
        hits.push({ chunkId, score: result.score });
      }
    }
    return hits;
  }

  /**
   * Probe the vector store by listing collections.
   *
   * @returns {Promise<{ok: true, collections: string[]} | {ok: false, errorMessage: string}>}
   */
  async getHealth() {
    let response;
    try {
      response = await this.client.getCollections();
    } catch (error) {
      return {
        ok: false,
        errorMessage: error instanceof Error ? error.message : String(error)
      };
    }
    try {
      return {
        ok: true,
        collections: response.collections?.map((entry) => entry.name) ?? []
      };
    } catch (error) {
      return {
        ok: false,
        errorMessage: error instanceof Error ? error.message : String(error)
      };
    }
  }
};
|
|
2685
|
+
|
|
2686
|
+
// src/hybrid/worker.ts
|
|
2687
|
+
/**
 * Split an array into consecutive slices of at most `size` elements.
 *
 * @param {Array} values - Items to partition; not modified.
 * @param {number} size - Maximum slice length; must be greater than zero
 *   whenever `values` is non-empty.
 * @returns {Array<Array>} The slices in order; empty when `values` is empty.
 * @throws {RangeError} When `values` is non-empty and `size` is not greater
 *   than zero. A zero or negative step would otherwise never advance past the
 *   end of the array and the loop would not terminate.
 */
function chunkArray(values, size) {
  if (values.length === 0) {
    return [];
  }
  if (!(size > 0)) {
    throw new RangeError(`chunkArray size must be greater than zero, received ${size}`);
  }
  const chunks = [];
  for (let index = 0; index < values.length; index += size) {
    chunks.push(values.slice(index, index + size));
  }
  return chunks;
}
|
|
2694
|
+
/**
 * Claim pending embedding jobs and index their snapshot chunks into the
 * vector store.
 *
 * For each claimed job:
 *  - a snapshot without chunks is marked failed;
 *  - a snapshot whose every chunk is already indexed under the current model
 *    key is marked succeeded without contacting the provider;
 *  - otherwise the chunks are embedded batch by batch and upserted, stale
 *    vectors are deleted, and the job is marked succeeded.
 * A failure in one job is recorded on that job and does not stop the others.
 *
 * Changes compared with the previous implementation:
 *  - The "already indexed" shortcut now also requires that every chunk has an
 *    indexed state row for the current model. Previously a state list that
 *    did not cover all chunks (for example an empty list) skipped indexing
 *    and still reported success.
 *  - The vector dimension is taken from the first embedded batch, so the
 *    first chunk is no longer embedded in a separate request.
 *
 * @param {object} input - `catalog` (job and state bookkeeping) and `config`
 *   (reads `embeddingJobsPerCycle` and `embeddingBatchSize`; also passed to
 *   the vector store, the model key helper and `embedTexts`).
 * @returns {Promise<{processedJobs: number, succeededJobs: object[],
 *   failedJobs: object[]}>}
 */
async function processEmbeddingJobs(input) {
  const { catalog, config } = input;
  const claimedJobs = catalog.claimEmbeddingJobs(config.embeddingJobsPerCycle);
  if (claimedJobs.length === 0) {
    return {
      processedJobs: 0,
      succeededJobs: [],
      failedJobs: []
    };
  }
  const NO_CHUNKS_MESSAGE = "No chunks found for embedding job snapshot";
  const vectorStore = new AiocsVectorStore(config);
  const modelKey = getEmbeddingModelKey(config);
  const succeededJobs = [];
  const failedJobs = [];

  for (const job of claimedJobs) {
    const jobRef = { sourceId: job.sourceId, snapshotId: job.snapshotId };
    try {
      const chunks = catalog.listSnapshotChunks({ ...jobRef });
      if (chunks.length === 0) {
        catalog.markEmbeddingJobFailed({ ...jobRef, errorMessage: NO_CHUNKS_MESSAGE });
        failedJobs.push({ ...jobRef, errorMessage: NO_CHUNKS_MESSAGE });
        continue;
      }

      const existingState = catalog.getSnapshotEmbeddingState({ ...jobRef });
      // Stale vectors: those the catalog reports for the source plus those
      // indexed under a different model key.
      const staleChunkIds = [
        ...new Set([
          ...catalog.listStaleEmbeddingChunkIds(job.sourceId),
          ...existingState
            .filter((entry) => entry.modelKey && entry.modelKey !== modelKey)
            .map((entry) => entry.chunkId)
        ])
      ];

      const indexedForModel = new Set(
        existingState
          .filter((entry) => entry.status === "indexed" && entry.modelKey === modelKey)
          .map((entry) => entry.chunkId)
      );
      const needsReindex =
        existingState.some((entry) => entry.status !== "indexed" || entry.modelKey !== modelKey) ||
        chunks.some((chunk) => !indexedForModel.has(chunk.chunkId));

      if (!needsReindex) {
        catalog.markEmbeddingJobSucceeded({
          ...jobRef,
          modelKey,
          indexedChunkIds: chunks.map((chunk) => chunk.chunkId),
          staleChunkIds
        });
        succeededJobs.push({ ...jobRef, chunkCount: chunks.length });
        continue;
      }

      const indexedChunkIds = [];
      let collectionReady = false;
      for (const batch of chunkArray(chunks, config.embeddingBatchSize)) {
        const embeddings = await embedTexts(config, batch.map((chunk) => chunk.markdown));
        if (embeddings.length !== batch.length) {
          throw new AiocsError(
            AIOCS_ERROR_CODES.embeddingProviderUnavailable,
            `Embedding provider returned ${embeddings.length} embeddings for a batch of ${batch.length}`
          );
        }
        if (!collectionReady) {
          // The first batch tells us the vector size the collection needs.
          const vectorDimension = embeddings[0]?.length;
          if (!vectorDimension) {
            throw new AiocsError(
              AIOCS_ERROR_CODES.embeddingProviderUnavailable,
              "Embedding provider returned an empty vector for the first chunk"
            );
          }
          await vectorStore.ensureCollection(vectorDimension);
          if (staleChunkIds.length > 0) {
            await vectorStore.deleteChunkIds(staleChunkIds);
          }
          collectionReady = true;
        }
        await vectorStore.upsertChunks({
          modelKey,
          points: batch.map((chunk, index) => ({
            chunkId: chunk.chunkId,
            vector: embeddings[index],
            sourceId: chunk.sourceId,
            snapshotId: chunk.snapshotId,
            pageUrl: chunk.pageUrl,
            pageTitle: chunk.pageTitle,
            sectionTitle: chunk.sectionTitle
          }))
        });
        indexedChunkIds.push(...batch.map((chunk) => chunk.chunkId));
      }

      catalog.markEmbeddingJobSucceeded({
        ...jobRef,
        modelKey,
        indexedChunkIds,
        staleChunkIds
      });
      succeededJobs.push({ ...jobRef, chunkCount: indexedChunkIds.length });
    } catch (error) {
      const errorMessage = error instanceof Error ? error.message : String(error);
      catalog.markEmbeddingJobFailed({ ...jobRef, errorMessage });
      failedJobs.push({ ...jobRef, errorMessage });
    }
  }

  return {
    processedJobs: claimedJobs.length,
    succeededJobs,
    failedJobs
  };
}
|
|
2825
|
+
|
|
2826
|
+
// src/runtime/bundled-sources.ts
|
|
2827
|
+
import { existsSync } from "fs";
|
|
2828
|
+
import { dirname, join as join4 } from "path";
|
|
2829
|
+
import { fileURLToPath } from "url";
|
|
2830
|
+
/**
 * Walk up from `startDir` until a directory containing both `package.json`
 * and a `sources` entry is found.
 *
 * @param {string} startDir - Directory to start the upward search from.
 * @returns {string} The first matching directory.
 * @throws {Error} When the filesystem root is reached without a match.
 */
function findPackageRoot(startDir) {
  let candidate = startDir;
  for (;;) {
    const hasManifest = existsSync(join4(candidate, "package.json"));
    if (hasManifest && existsSync(join4(candidate, "sources"))) {
      return candidate;
    }
    const parent = dirname(candidate);
    // dirname() of the root is the root itself: nothing left to search.
    if (parent === candidate) {
      throw new Error(`Could not locate aiocs package root from ${startDir}`);
    }
    candidate = parent;
  }
}
|
|
2843
|
+
/**
 * Resolve the `sources` directory of the package root found by searching
 * upward from this module's own directory.
 *
 * @returns {string} Path of the `sources` directory under the package root.
 */
function getBundledSourcesDir() {
  const moduleDir = dirname(fileURLToPath(import.meta.url));
  return join4(findPackageRoot(moduleDir), "sources");
}
|
|
2848
|
+
|
|
2849
|
+
// src/runtime/hybrid-config.ts
|
|
2850
|
+
/**
 * Parse an optional numeric setting.
 *
 * @param {string|undefined} value - Raw setting; `undefined` or a blank
 *   string selects the fallback.
 * @param {string} field - Setting name used in the error message.
 * @param {number} fallback - Value returned when the setting is unset.
 * @returns {number} The parsed positive integer, or `fallback`.
 * @throws {AiocsError} `embeddingConfigInvalid` when the value does not
 *   convert to an integer greater than zero.
 */
function parsePositiveInteger(value, field, fallback) {
  const isUnset = typeof value === "undefined" || value.trim() === "";
  if (isUnset) {
    return fallback;
  }
  const parsed = Number(value);
  if (Number.isInteger(parsed) && parsed > 0) {
    return parsed;
  }
  throw new AiocsError(
    AIOCS_ERROR_CODES.embeddingConfigInvalid,
    `${field} must be a positive integer`
  );
}
|
|
2863
|
+
/**
 * Validate the default search mode setting.
 *
 * @param {string|undefined} value - Raw setting; any falsy value selects
 *   `"auto"`.
 * @returns {"auto"|"lexical"|"hybrid"|"semantic"}
 * @throws {AiocsError} `embeddingConfigInvalid` for any other value.
 */
function parseSearchMode(value) {
  if (!value) {
    return "auto";
  }
  switch (value) {
    case "auto":
    case "lexical":
    case "hybrid":
    case "semantic":
      return value;
    default:
      throw new AiocsError(
        AIOCS_ERROR_CODES.embeddingConfigInvalid,
        "AIOCS_SEARCH_MODE_DEFAULT must be one of: auto, lexical, hybrid, semantic"
      );
  }
}
|
|
2875
|
+
/**
 * Build the hybrid-search runtime configuration from environment variables.
 *
 * String settings that are unset or blank (empty or whitespace only, e.g.
 * `AIOCS_QDRANT_URL=` in an env file) fall back to their defaults. This
 * matches how blank numeric settings and a blank search mode are already
 * handled; previously a blank string setting was used verbatim.
 *
 * @param {Record<string, string|undefined>} [env=process.env]
 * @returns {object} Search mode, Qdrant and Ollama connection settings,
 *   batching limits and ranking parameters.
 * @throws {AiocsError} `embeddingConfigInvalid` when the provider is not
 *   `ollama`, or when a numeric setting or the search mode is rejected by its
 *   parser.
 */
function getHybridRuntimeConfig(env = process.env) {
  // Returns the raw value unless it is unset or blank.
  const readString = (name, fallback) => {
    const raw = env[name];
    return typeof raw === "string" && raw.trim() !== "" ? raw : fallback;
  };
  const readInteger = (name, fallback) => parsePositiveInteger(env[name], name, fallback);

  const embeddingProvider = readString("AIOCS_EMBEDDING_PROVIDER", "ollama");
  if (embeddingProvider !== "ollama") {
    throw new AiocsError(
      AIOCS_ERROR_CODES.embeddingConfigInvalid,
      "AIOCS_EMBEDDING_PROVIDER currently supports only ollama"
    );
  }
  return {
    defaultSearchMode: parseSearchMode(env.AIOCS_SEARCH_MODE_DEFAULT),
    qdrantUrl: readString("AIOCS_QDRANT_URL", "http://127.0.0.1:6333"),
    qdrantCollection: readString("AIOCS_QDRANT_COLLECTION", "aiocs_docs_chunks"),
    qdrantTimeoutMs: readInteger("AIOCS_QDRANT_TIMEOUT_MS", 1e3),
    embeddingProvider: "ollama",
    ollamaBaseUrl: readString("AIOCS_OLLAMA_BASE_URL", "http://127.0.0.1:11434"),
    ollamaEmbeddingModel: readString("AIOCS_OLLAMA_EMBEDDING_MODEL", "nomic-embed-text"),
    ollamaTimeoutMs: readInteger("AIOCS_OLLAMA_TIMEOUT_MS", 1e3),
    embeddingBatchSize: readInteger("AIOCS_EMBEDDING_BATCH_SIZE", 32),
    embeddingJobsPerCycle: readInteger("AIOCS_EMBEDDING_JOB_LIMIT_PER_CYCLE", 2),
    lexicalCandidateWindow: readInteger("AIOCS_LEXICAL_CANDIDATE_WINDOW", 40),
    vectorCandidateWindow: readInteger("AIOCS_VECTOR_CANDIDATE_WINDOW", 40),
    rrfK: readInteger("AIOCS_RRF_K", 60)
  };
}
|
|
2899
|
+
|
|
2900
|
+
// src/spec/source-spec-files.ts
|
|
2901
|
+
import { access, readdir } from "fs/promises";
|
|
2902
|
+
import { constants as fsConstants } from "fs";
|
|
2903
|
+
import { extname as extname2, join as join5, resolve as resolve3 } from "path";
|
|
2904
|
+
// File extensions (lower-case, with leading dot) recognised as source specs.
var SOURCE_SPEC_EXTENSIONS = /* @__PURE__ */ new Set([".yaml", ".yml", ".json"]);

/**
 * Resolve every path to an absolute path and drop duplicates, keeping the
 * position of each path's first occurrence.
 *
 * @param {Iterable<string>} paths
 * @returns {string[]} Unique absolute paths in first-seen order.
 */
function uniqueResolvedPaths(paths) {
  // A Set iterates in insertion order, which preserves first-seen order.
  const resolvedOnce = /* @__PURE__ */ new Set();
  for (const candidate of paths) {
    resolvedOnce.add(resolve3(candidate));
  }
  return [...resolvedOnce];
}
|
|
2918
|
+
/**
 * Check whether a path is visible to the process.
 *
 * @param {string} targetPath
 * @returns {Promise<boolean>} `true` when `access(targetPath, F_OK)`
 *   succeeds, `false` on any failure.
 */
async function pathExists(targetPath) {
  return access(targetPath, fsConstants.F_OK).then(
    () => true,
    () => false
  );
}
|
|
2926
|
+
/**
 * Recursively collect source spec files under a directory.
 *
 * Entries of each directory are visited in `localeCompare` order of their
 * names. Sub-directories are descended into; regular files are kept when
 * their lower-cased extension is in `SOURCE_SPEC_EXTENSIONS`; other entry
 * types are ignored.
 *
 * @param {string} rootDir - Directory to scan.
 * @returns {Promise<string[]>} Paths of the matching files, in visit order.
 */
async function walkSourceSpecFiles(rootDir) {
  const listing = await readdir(rootDir, { withFileTypes: true });
  listing.sort((left, right) => left.name.localeCompare(right.name));

  const found = [];
  for (const item of listing) {
    const itemPath = join5(rootDir, item.name);
    if (item.isDirectory()) {
      const nested = await walkSourceSpecFiles(itemPath);
      found.push(...nested);
    } else if (item.isFile() && SOURCE_SPEC_EXTENSIONS.has(extname2(item.name).toLowerCase())) {
      found.push(itemPath);
    }
  }
  return found;
}
|
|
2944
|
+
|
|
2945
|
+
// src/daemon.ts
|
|
2946
|
+
// Daemon interval used when AIOCS_DAEMON_INTERVAL_MINUTES is not set.
var DEFAULT_INTERVAL_MINUTES = 60;
// Source spec directory used when no container source directory is supplied.
var DEFAULT_CONTAINER_SOURCE_DIR = "/app/sources";
// Accepted spellings (lower-case) for boolean settings.
var BOOLEAN_TRUE_VALUES = /* @__PURE__ */ new Set(["1", "true", "yes", "on"]);
var BOOLEAN_FALSE_VALUES = /* @__PURE__ */ new Set(["0", "false", "no", "off"]);

/**
 * @returns {string} The current time as an ISO 8601 string.
 */
function nowIso3() {
  const current = /* @__PURE__ */ new Date();
  return current.toISOString();
}
|
|
2953
|
+
/**
 * Parse a daemon setting that must be a positive decimal integer.
 *
 * Surrounding whitespace is ignored, consistent with `parseBoolean`. Only
 * plain ASCII digits are accepted (no sign, decimal point or exponent), and
 * the value must fit in the safe-integer range so it is represented exactly.
 *
 * @param {string} raw - Raw setting value.
 * @param {string} variableName - Setting name used in the error message.
 * @returns {number} The parsed integer, greater than zero.
 * @throws {Error} When the value is not a positive safe integer.
 */
function parsePositiveInteger2(raw, variableName) {
  const candidate = raw.trim();
  const parsed = /^\d+$/.test(candidate) ? Number(candidate) : Number.NaN;
  if (!Number.isSafeInteger(parsed) || parsed <= 0) {
    throw new Error(`${variableName} must be a positive integer`);
  }
  return parsed;
}
|
|
2963
|
+
/**
 * Parse a boolean setting, ignoring case and surrounding whitespace.
 *
 * @param {string} raw - Raw setting value.
 * @param {string} variableName - Setting name used in the error message.
 * @returns {boolean} `true` for values in `BOOLEAN_TRUE_VALUES`, `false` for
 *   values in `BOOLEAN_FALSE_VALUES`.
 * @throws {Error} For any other value.
 */
function parseBoolean(raw, variableName) {
  const token = raw.trim().toLowerCase();
  const isTrue = BOOLEAN_TRUE_VALUES.has(token);
  if (!isTrue && !BOOLEAN_FALSE_VALUES.has(token)) {
    throw new Error(`${variableName} must be one of: true, false, 1, 0, yes, no, on, off`);
  }
  return isTrue;
}
|
|
2973
|
+
/**
 * Build the daemon configuration from environment variables.
 *
 * - `AIOCS_DAEMON_INTERVAL_MINUTES`: positive integer, defaults to
 *   `DEFAULT_INTERVAL_MINUTES`.
 * - `AIOCS_DAEMON_FETCH_ON_START`: boolean, defaults to `true`.
 * - `AIOCS_SOURCE_SPEC_DIRS`: comma-separated directories. When set, only
 *   these directories are used and `strictSourceSpecDirs` is `true`.
 *   Otherwise the bundled, user and container directories are used.
 *
 * The default directories are resolved only when `AIOCS_SOURCE_SPEC_DIRS` is
 * not set. Previously they were always computed, so a failure to locate the
 * bundled sources directory aborted configuration even when that directory
 * was not going to be used.
 *
 * @param {Record<string, string|undefined>} env
 * @param {{bundledSourceDir?: string, userSourceDir?: string,
 *   containerSourceDir?: string}} [options] - Overrides for the default
 *   directories.
 * @returns {{intervalMinutes: number, fetchOnStart: boolean,
 *   strictSourceSpecDirs: boolean, sourceSpecDirs: string[]}}
 * @throws {Error} When a setting is malformed or `AIOCS_SOURCE_SPEC_DIRS`
 *   contains no directory.
 */
function parseDaemonConfig(env, options = {}) {
  const intervalMinutes = env.AIOCS_DAEMON_INTERVAL_MINUTES
    ? parsePositiveInteger2(env.AIOCS_DAEMON_INTERVAL_MINUTES, "AIOCS_DAEMON_INTERVAL_MINUTES")
    : DEFAULT_INTERVAL_MINUTES;
  const fetchOnStart = env.AIOCS_DAEMON_FETCH_ON_START
    ? parseBoolean(env.AIOCS_DAEMON_FETCH_ON_START, "AIOCS_DAEMON_FETCH_ON_START")
    : true;

  const explicitDirs = env.AIOCS_SOURCE_SPEC_DIRS;
  let sourceSpecDirs;
  if (explicitDirs) {
    sourceSpecDirs = uniqueResolvedPaths(
      explicitDirs.split(",").map((entry) => entry.trim()).filter(Boolean)
    );
    if (sourceSpecDirs.length === 0) {
      throw new Error("AIOCS_SOURCE_SPEC_DIRS must include at least one directory");
    }
  } else {
    sourceSpecDirs = uniqueResolvedPaths([
      options.bundledSourceDir ?? getBundledSourcesDir(),
      options.userSourceDir ?? getAiocsSourcesDir(env),
      options.containerSourceDir ?? DEFAULT_CONTAINER_SOURCE_DIR
    ]);
  }

  return {
    intervalMinutes,
    fetchOnStart,
    strictSourceSpecDirs: Boolean(explicitDirs),
    sourceSpecDirs
  };
}
|
|
2994
|
+
/**
 * Load every source spec file from the configured directories into the
 * catalog and remove managed sources that are no longer backed by a spec.
 *
 * Directories that do not exist are skipped, unless
 * `input.strictSourceSpecDirs` is truthy, in which case they are an error.
 * In strict mode finding no spec file at all is also an error.
 *
 * @param {object} input - `catalog`, `sourceSpecDirs` and optional
 *   `strictSourceSpecDirs`.
 * @returns {Promise<{processedSpecCount: number, removedSourceIds: *,
 *   sources: Array<{sourceId: *, configHash: *, configChanged: *,
 *   specPath: string}>}>}
 * @throws {Error} In strict mode, when a directory is missing or no spec
 *   file was found.
 */
async function bootstrapSourceSpecs(input) {
  const configuredDirs = uniqueResolvedPaths(input.sourceSpecDirs);

  // Partition the configured directories by whether they exist.
  const presentDirs = [];
  const absentDirs = [];
  for (const dir of configuredDirs) {
    const bucket = (await pathExists(dir)) ? presentDirs : absentDirs;
    bucket.push(dir);
  }
  if (input.strictSourceSpecDirs && absentDirs.length > 0) {
    throw new Error(`Missing source spec directories: ${absentDirs.join(", ")}`);
  }

  const sources = [];
  for (const dir of presentDirs) {
    for (const specPath of await walkSourceSpecFiles(dir)) {
      const spec = await loadSourceSpec(specPath);
      const { sourceId, configHash, configChanged } = input.catalog.upsertSource(spec, { specPath });
      sources.push({ sourceId, configHash, configChanged, specPath });
    }
  }
  if (input.strictSourceSpecDirs && sources.length === 0) {
    throw new Error(`No source spec files found in configured directories: ${configuredDirs.join(", ")}`);
  }

  // Only directories that exist are passed as managed roots.
  const removedSourceIds = input.catalog.removeManagedSources({
    managedRoots: presentDirs.map((dir) => resolve4(dir)),
    activeSources: sources.map(({ sourceId, specPath }) => ({ sourceId, specPath }))
  });
  return {
    processedSpecCount: sources.length,
    removedSourceIds,
    sources
  };
}
|
|
3038
|
+
/**
 * Runs a single daemon pass: bootstraps source specs, runs canaries for the
 * sources that need one, refetches the sources that are due, and finally
 * processes embedding jobs.
 *
 * Failures of individual canaries, fetches, or the embedding step are recorded
 * in the returned report instead of being thrown. A failure while
 * bootstrapping or while querying the catalog still rejects.
 *
 * @param {object} input
 * @param {object} input.catalog - Catalog handle queried for due sources and passed to the helpers.
 * @param {string} input.dataDir - Forwarded to `fetchSource`.
 * @param {string[]} input.sourceSpecDirs - Forwarded to `bootstrapSourceSpecs`.
 * @param {boolean} [input.strictSourceSpecDirs] - Forwarded to `bootstrapSourceSpecs` only when defined.
 * @param {string} [input.referenceTime] - Time used for the due queries; defaults to the cycle start time.
 * @returns {Promise<object>} Report with timestamps, the due id lists, the
 *   bootstrap result, and the per-step success/failure lists.
 */
async function runDaemonCycle(input) {
  const startedAt = nowIso3();
  const { catalog } = input;
  const describeError = (error) => error instanceof Error ? error.message : String(error);

  const bootstrapOptions = { catalog, sourceSpecDirs: input.sourceSpecDirs };
  if (input.strictSourceSpecDirs !== void 0) {
    bootstrapOptions.strictSourceSpecDirs = input.strictSourceSpecDirs;
  }
  const bootstrapped = await bootstrapSourceSpecs(bootstrapOptions);

  const referenceTime = input.referenceTime ?? startedAt;

  // Sources whose spec changed are refreshed and canaried regardless of schedule.
  const scheduledIds = catalog.listDueSourceIds(referenceTime);
  const changedSourceIds = bootstrapped.sources
    .filter((source) => source.configChanged)
    .map((source) => source.sourceId);
  const dueSourceIds = Array.from(new Set([...scheduledIds, ...changedSourceIds]));

  // Canaries additionally cover every source that has never been checked.
  const scheduledCanaryIds = catalog.listCanaryDueSourceIds(referenceTime);
  const neverCheckedIds = catalog
    .listSources()
    .filter((source) => source.lastCanaryCheckedAt === null)
    .map((source) => source.id);
  const canaryDueSourceIds = Array.from(
    new Set([...scheduledCanaryIds, ...changedSourceIds, ...neverCheckedIds])
  );

  const canaried = [];
  const canaryFailed = [];
  const refreshed = [];
  const failed = [];
  const embedded = [];
  const embeddingFailed = [];

  for (const sourceId of canaryDueSourceIds) {
    try {
      const outcome = await runSourceCanary({ catalog, sourceId, env: process.env });
      canaried.push({
        sourceId,
        status: outcome.status,
        checkedAt: outcome.checkedAt,
        summary: outcome.summary
      });
      // A completed canary run that reports "fail" is listed in both arrays.
      if (outcome.status === "fail") {
        canaryFailed.push({
          sourceId,
          errorMessage: `One or more canary checks failed for ${sourceId}`
        });
      }
    } catch (error) {
      canaryFailed.push({ sourceId, errorMessage: describeError(error) });
    }
  }

  for (const sourceId of dueSourceIds) {
    try {
      const fetched = await fetchSource({ catalog, dataDir: input.dataDir, sourceId });
      refreshed.push({
        sourceId,
        snapshotId: fetched.snapshotId,
        pageCount: fetched.pageCount,
        reused: fetched.reused
      });
    } catch (error) {
      failed.push({ sourceId, errorMessage: describeError(error) });
    }
  }

  try {
    const jobs = await processEmbeddingJobs({
      catalog,
      config: getHybridRuntimeConfig(process.env)
    });
    embedded.push(...jobs.succeededJobs);
    embeddingFailed.push(...jobs.failedJobs);
  } catch (error) {
    // A failure of the embedding step as a whole is reported under a synthetic "system" scope.
    embeddingFailed.push({
      sourceId: "system",
      snapshotId: "system",
      errorMessage: describeError(error)
    });
  }

  return {
    startedAt,
    finishedAt: nowIso3(),
    dueSourceIds,
    canaryDueSourceIds,
    bootstrapped,
    canaried,
    canaryFailed,
    refreshed,
    failed,
    embedded,
    embeddingFailed
  };
}
|
|
3139
|
+
/**
 * Runs the daemon loop until `input.signal` is aborted.
 *
 * Resets running embedding jobs, records and logs the daemon start, optionally
 * runs a "startup" cycle, then alternates between sleeping for the configured
 * interval and running an "interval" cycle. A cycle that throws is recorded as
 * "failed" and the error is rethrown, which ends the loop.
 *
 * @param {object} input
 * @param {object} input.catalog - Catalog handle used for daemon bookkeeping.
 * @param {string} input.dataDir - Forwarded to `runDaemonCycle`.
 * @param {object} input.config - Daemon settings (`intervalMinutes`, `fetchOnStart`,
 *   `sourceSpecDirs`, `strictSourceSpecDirs`).
 * @param {object} input.logger - Object whose `emit` method receives lifecycle events.
 * @param {AbortSignal} [input.signal] - Aborting it stops the loop.
 * @returns {Promise<void>} Resolves after "daemon.stopped" has been emitted.
 */
async function startDaemon(input) {
  const { catalog, logger, config } = input;
  const { intervalMinutes, fetchOnStart } = config;
  const intervalMs = intervalMinutes * 6e4;
  const isAborted = () => Boolean(input.signal?.aborted);

  catalog.resetRunningEmbeddingJobs();
  catalog.markDaemonStarted({
    startedAt: nowIso3(),
    intervalMinutes,
    fetchOnStart
  });
  logger.emit({
    type: "daemon.started",
    intervalMinutes,
    fetchOnStart,
    sourceSpecDirs: config.sourceSpecDirs
  });

  const runCycle = async (reason) => {
    const startedAt = nowIso3();
    catalog.markDaemonCycleStarted(startedAt);
    logger.emit({ type: "daemon.cycle.started", reason, startedAt });

    let result;
    try {
      result = await runDaemonCycle({
        catalog,
        dataDir: input.dataDir,
        sourceSpecDirs: config.sourceSpecDirs,
        strictSourceSpecDirs: config.strictSourceSpecDirs,
        referenceTime: startedAt
      });
      // Any recorded failure downgrades the cycle from "success" to "degraded".
      const hasFailures =
        result.failed.length > 0 ||
        result.canaryFailed.length > 0 ||
        result.embeddingFailed.length > 0;
      catalog.markDaemonCycleCompleted({
        completedAt: result.finishedAt,
        status: hasFailures ? "degraded" : "success"
      });
      logger.emit({ type: "daemon.cycle.completed", reason, result });
    } catch (error) {
      catalog.markDaemonCycleCompleted({
        completedAt: nowIso3(),
        status: "failed"
      });
      throw error;
    }
  };

  if (fetchOnStart && !isAborted()) {
    await runCycle("startup");
  }

  while (!isAborted()) {
    try {
      await sleep2(intervalMs, void 0, { signal: input.signal });
    } catch (error) {
      // A rejected sleep is only expected when the signal was aborted.
      if (!isAborted()) {
        throw error;
      }
    }
    if (isAborted()) {
      break;
    }
    await runCycle("interval");
  }

  logger.emit({ type: "daemon.stopped" });
}
|
|
3207
|
+
|
|
3208
|
+
// package.json
|
|
3209
|
+
// Package manifest inlined into the bundle as an object literal.
var package_default = {
  "name": "@bodhi-ventures/aiocs",
  "version": "0.1.0",
  "license": "MIT",
  "type": "module",
  "description": "Local-only documentation store, fetcher, and search CLI for AI agents.",
  "keywords": ["ai", "docs", "search", "mcp", "cli"],
  "homepage": "https://github.com/Bodhi-Ventures/aiocs",
  "bugs": {
    "url": "https://github.com/Bodhi-Ventures/aiocs/issues"
  },
  "repository": {
    "type": "git",
    "url": "https://github.com/Bodhi-Ventures/aiocs.git"
  },
  "publishConfig": {
    "access": "public",
    "provenance": true
  },
  "packageManager": "pnpm@9.15.9",
  "files": ["dist", "sources", "docs", "README.md", "LICENSE", "skills"],
  "bin": {
    "docs": "./dist/cli.js",
    "aiocs-mcp": "./dist/mcp-server.js"
  },
  "engines": {
    "node": ">=22"
  },
  "scripts": {
    "build": "tsup --config tsup.config.ts",
    "dev": "tsx src/cli.ts",
    "dev:mcp": "tsx src/mcp-server.ts",
    "lint": "tsc --noEmit",
    "test": "vitest run",
    "test:watch": "vitest"
  },
  "dependencies": {
    "@modelcontextprotocol/sdk": "^1.28.0",
    "@mozilla/readability": "^0.6.0",
    "@qdrant/js-client-rest": "1.17.0",
    "better-sqlite3": "^12.4.1",
    "commander": "^14.0.1",
    "jsdom": "^27.0.1",
    "playwright": "^1.57.0",
    "turndown": "^7.2.1",
    "turndown-plugin-gfm": "^1.0.2",
    "yaml": "^2.8.1",
    "zod": "^4.1.12"
  },
  "devDependencies": {
    "@types/better-sqlite3": "^7.6.13",
    "@types/jsdom": "^21.1.7",
    "@types/node": "^24.7.2",
    "@types/turndown": "^5.0.5",
    "execa": "^9.6.0",
    "tsup": "^8.5.0",
    "tsx": "^4.20.6",
    "typescript": "^5.9.3",
    "vitest": "^3.2.4"
  }
};
|
|
3283
|
+
|
|
3284
|
+
// src/runtime/package-metadata.ts
|
|
3285
|
+
// Package identity fields read from the inlined manifest.
var {
  name: packageName,
  version: packageVersion,
  description: packageDescription
} = package_default;
|
|
3288
|
+
|
|
3289
|
+
// src/services.ts
|
|
3290
|
+
import { resolve as resolve7 } from "path";
|
|
3291
|
+
|
|
3292
|
+
// src/backup.ts
|
|
3293
|
+
import { cp, mkdir, readdir as readdir2, readFile as readFile2, rename, rm, stat, writeFile } from "fs/promises";
|
|
3294
|
+
import { basename, dirname as dirname2, join as join6, resolve as resolve5 } from "path";
|
|
3295
|
+
import { randomUUID as randomUUID2 } from "crypto";
|
|
3296
|
+
import Database2 from "better-sqlite3";
|
|
3297
|
+
// File name of the catalog database inside a data directory.
var CATALOG_DB_FILENAME = "catalog.sqlite";
// Suffixes appended to the catalog file name to form its SQLite companion files.
var SQLITE_SIDE_CAR_SUFFIXES = [
  "-wal",
  "-shm"
];
|
|
3299
|
+
/**
 * Reports whether `stat` succeeds for the given path.
 *
 * Every `stat` failure (not only "not found") is reported as `false`.
 *
 * @param {string} path - Path to probe.
 * @returns {Promise<boolean>} `true` when `stat` resolved, otherwise `false`.
 */
async function pathExists2(path) {
  let found = true;
  try {
    await stat(path);
  } catch {
    found = false;
  }
  return found;
}
|
|
3307
|
+
/**
 * Ensures a backup source path is present.
 *
 * @param {string} path - Path that must exist.
 * @returns {Promise<void>}
 * @throws {AiocsError} With code `backupSourceMissing` when the path cannot be stat'ed.
 */
async function assertSourceDirExists(path) {
  const present = await pathExists2(path);
  if (present) {
    return;
  }
  throw new AiocsError(
    AIOCS_ERROR_CODES.backupSourceMissing,
    `Backup source path does not exist: ${path}`
  );
}
|
|
3315
|
+
/**
 * Reports whether a directory has no entries. A path that does not exist
 * counts as empty.
 *
 * @param {string} path - Directory to inspect.
 * @returns {Promise<boolean>} `true` when the path is missing or lists zero entries.
 */
async function isDirectoryEmpty(path) {
  if (await pathExists2(path)) {
    const names = await readdir2(path);
    return names.length === 0;
  }
  return true;
}
|
|
3321
|
+
/**
 * Recursively lists everything below `root` as flat manifest entries.
 *
 * Directories are emitted before their children with `size: 0`; children are
 * visited in the order produced by the default array sort of their names. The
 * root directory itself gets no entry.
 *
 * @param {string} root - Directory (or file) the listing is relative to.
 * @param {string} [relativePath=""] - Path below `root` to start from.
 * @returns {Promise<Array<{relativePath: string, type: "file"|"directory", size: number}>>}
 */
async function listEntries(root, relativePath = "") {
  let absolutePath = root;
  if (relativePath) {
    absolutePath = join6(root, relativePath);
  }

  const info = await stat(absolutePath);
  if (!info.isDirectory()) {
    return [{ relativePath, type: "file", size: info.size }];
  }

  const names = await readdir2(absolutePath);
  names.sort();

  const collected = [];
  if (relativePath) {
    collected.push({ relativePath, type: "directory", size: 0 });
  }
  for (const name of names) {
    const childPath = relativePath ? join6(relativePath, name) : name;
    const nested = await listEntries(root, childPath);
    collected.push(...nested);
  }
  return collected;
}
|
|
3342
|
+
/**
 * Copies `from` into `to` when `from` exists and appends a manifest entry for
 * everything found under `to` afterwards. Does nothing when `from` is missing.
 *
 * @param {string} from - Source path to copy.
 * @param {string} to - Destination directory (created if needed).
 * @param {Array<object>} entries - Manifest entry list that is appended to in place.
 * @param {string} relativePrefix - Prefix joined in front of each recorded relative path.
 * @returns {Promise<void>}
 */
async function copyIfPresent(from, to, entries, relativePrefix) {
  const sourcePresent = await pathExists2(from);
  if (!sourcePresent) {
    return;
  }

  await mkdir(to, { recursive: true });
  await cp(from, to, { recursive: true, force: true });

  for (const entry of await listEntries(to)) {
    entries.push({
      ...entry,
      relativePath: join6(relativePrefix, entry.relativePath)
    });
  }
}
|
|
3356
|
+
/**
 * Copies a data directory into a backup location.
 *
 * Regular files are copied with `cp`; the catalog database is excluded from
 * that copy and written separately through the database handle's `backup()`
 * call, opened read-only.
 *
 * Only the top-level catalog database and its `-wal`/`-shm` companions are
 * excluded from the file copy. Matching by full path (instead of by base name)
 * keeps unrelated nested files that merely share those names in the backup.
 *
 * @param {string} from - Data directory to back up; must contain the catalog database.
 * @param {string} to - Destination directory (created if needed).
 * @returns {Promise<void>}
 * @throws {AiocsError} With code `backupSourceMissing` when the catalog database is absent.
 */
async function copyDataDirForBackup(from, to) {
  const sourceCatalogPath = join6(from, CATALOG_DB_FILENAME);
  if (!await pathExists2(sourceCatalogPath)) {
    throw new AiocsError(
      AIOCS_ERROR_CODES.backupSourceMissing,
      `Backup source is missing the catalog database: ${sourceCatalogPath}`
    );
  }

  // Absolute paths of the files that must not be copied byte-for-byte.
  const excludedPaths = new Set([
    resolve5(sourceCatalogPath),
    ...SQLITE_SIDE_CAR_SUFFIXES.map((suffix) => resolve5(`${sourceCatalogPath}${suffix}`))
  ]);

  await mkdir(to, { recursive: true });
  await cp(from, to, {
    recursive: true,
    force: true,
    filter: (source) => !excludedPaths.has(resolve5(source))
  });

  const targetCatalogPath = join6(to, CATALOG_DB_FILENAME);
  const sourceCatalog = new Database2(sourceCatalogPath, { readonly: true });
  try {
    await sourceCatalog.backup(targetCatalogPath);
  } finally {
    sourceCatalog.close();
  }
}
|
|
3384
|
+
/**
 * Loads a backup directory's manifest and verifies the payload layout.
 *
 * Checks, in order: the input directory exists, `manifest.json` exists and
 * parses to an object with `formatVersion === 1` and an `entries` array, the
 * `data` directory exists, and it contains the catalog database.
 *
 * @param {string} inputDir - Backup directory to validate.
 * @returns {Promise<{manifest: object, backupDataDir: string, backupConfigDir?: string}>}
 *   `backupConfigDir` is only present when the backup contains a `config` directory.
 * @throws {AiocsError} `backupSourceMissing` when `inputDir` is absent;
 *   `backupInvalid` for a missing, unparsable, or malformed manifest, or a
 *   missing data directory / catalog database.
 */
async function loadValidatedBackupPayload(inputDir) {
  const manifestPath = join6(inputDir, "manifest.json");
  await assertSourceDirExists(inputDir);
  if (!await pathExists2(manifestPath)) {
    throw new AiocsError(
      AIOCS_ERROR_CODES.backupInvalid,
      `Backup manifest not found: ${manifestPath}`
    );
  }

  const rawManifest = await readFile2(manifestPath, "utf8");
  let manifest;
  try {
    manifest = JSON.parse(rawManifest);
  } catch (error) {
    // Report malformed JSON with the same error code as any other bad manifest
    // instead of leaking a raw SyntaxError to the caller.
    if (error instanceof SyntaxError) {
      throw new AiocsError(
        AIOCS_ERROR_CODES.backupInvalid,
        `Invalid backup manifest: ${manifestPath} (${error.message})`
      );
    }
    throw error;
  }

  // JSON.parse may legally yield null or a primitive; guard before reading fields.
  const isManifestObject = manifest !== null && typeof manifest === "object";
  if (!isManifestObject || manifest.formatVersion !== 1 || !Array.isArray(manifest.entries)) {
    throw new AiocsError(
      AIOCS_ERROR_CODES.backupInvalid,
      `Invalid backup manifest: ${manifestPath}`
    );
  }

  const backupDataDir = join6(inputDir, "data");
  if (!await pathExists2(backupDataDir)) {
    throw new AiocsError(
      AIOCS_ERROR_CODES.backupInvalid,
      `Backup payload is missing the data directory: ${backupDataDir}`
    );
  }

  const backupCatalogPath = join6(backupDataDir, CATALOG_DB_FILENAME);
  if (!await pathExists2(backupCatalogPath)) {
    throw new AiocsError(
      AIOCS_ERROR_CODES.backupInvalid,
      `Backup payload is missing the catalog database: ${backupCatalogPath}`
    );
  }

  const backupConfigDir = join6(inputDir, "config");
  return {
    manifest,
    backupDataDir,
    ...await pathExists2(backupConfigDir) ? { backupConfigDir } : {}
  };
}
|
|
3421
|
+
/**
 * Copies `backupDir` into a uniquely named hidden staging directory that sits
 * next to `targetDir`, so it can later be renamed into place.
 *
 * @param {string} backupDir - Directory whose contents are staged.
 * @param {string} targetDir - Final destination; only its parent and base name are used here.
 * @returns {Promise<string>} Path of the staging directory.
 */
async function prepareReplacementTarget(backupDir, targetDir) {
  const parentDir = dirname2(targetDir);
  const stagingName = `.${basename(targetDir)}.import-${randomUUID2()}`;
  const stagingDir = join6(parentDir, stagingName);

  await rm(stagingDir, { recursive: true, force: true });
  await mkdir(parentDir, { recursive: true });
  await cp(backupDir, stagingDir, { recursive: true, force: true });

  return stagingDir;
}
|
|
3429
|
+
/**
 * Exports the data directory (and optionally the config directory) into
 * `outputDir` and writes a `manifest.json` describing every copied entry.
 *
 * The output directory must not overlap the data directory or an existing
 * config directory. Without this guard, `replaceExisting` would delete the
 * very directories that are about to be backed up.
 *
 * @param {object} input
 * @param {string} input.dataDir - Data directory to back up.
 * @param {string} input.outputDir - Destination directory for the backup.
 * @param {string} [input.configDir] - Optional config directory to include.
 * @param {boolean} [input.replaceExisting] - Allow wiping a non-empty `outputDir` first.
 * @returns {Promise<{outputDir: string, manifestPath: string, manifest: object}>}
 * @throws {AiocsError} `backupSourceMissing` when the data directory is absent;
 *   `invalidArgument` when `outputDir` overlaps a source directory;
 *   `backupConflict` when `outputDir` is non-empty and `replaceExisting` is not set.
 */
async function exportBackup(input) {
  const dataDir = resolve5(input.dataDir);
  const outputDir = resolve5(input.outputDir);
  const configDir = input.configDir ? resolve5(input.configDir) : void 0;
  await assertSourceDirExists(dataDir);

  // True when `candidate` equals `container` or lies somewhere below it.
  const isSameOrInside = (candidate, container) => {
    let current = candidate;
    for (;;) {
      if (current === container) {
        return true;
      }
      const parent = dirname2(current);
      if (parent === current) {
        return false;
      }
      current = parent;
    }
  };
  const overlaps = (left, right) => isSameOrInside(left, right) || isSameOrInside(right, left);

  if (overlaps(outputDir, dataDir)) {
    throw new AiocsError(
      AIOCS_ERROR_CODES.invalidArgument,
      `Backup output directory must not overlap the data directory: ${outputDir}`
    );
  }
  if (configDir && await pathExists2(configDir) && overlaps(outputDir, configDir)) {
    throw new AiocsError(
      AIOCS_ERROR_CODES.invalidArgument,
      `Backup output directory must not overlap the config directory: ${outputDir}`
    );
  }

  if (!await isDirectoryEmpty(outputDir)) {
    if (!input.replaceExisting) {
      throw new AiocsError(
        AIOCS_ERROR_CODES.backupConflict,
        `Backup output directory is not empty: ${outputDir}`
      );
    }
    await rm(outputDir, { recursive: true, force: true });
  }
  await mkdir(outputDir, { recursive: true });

  const entries = [];
  const backupDataDir = join6(outputDir, "data");
  await copyDataDirForBackup(dataDir, backupDataDir);
  for (const entry of await listEntries(backupDataDir)) {
    entries.push({
      ...entry,
      relativePath: join6("data", entry.relativePath)
    });
  }
  if (configDir) {
    await copyIfPresent(configDir, join6(outputDir, "config"), entries, "config");
  }

  const manifest = {
    formatVersion: 1,
    createdAt: (/* @__PURE__ */ new Date()).toISOString(),
    packageVersion,
    entries
  };
  const manifestPath = join6(outputDir, "manifest.json");
  await writeFile(manifestPath, JSON.stringify(manifest, null, 2), "utf8");

  return {
    outputDir,
    manifestPath,
    manifest
  };
}
|
|
3467
|
+
/**
 * Restores a backup into the data directory (and, when both are available,
 * the config directory).
 *
 * The backup is first copied into staging directories next to the targets.
 * Each target is then replaced by renaming the existing directory aside,
 * renaming the staged copy into place, and only then deleting the old
 * directory. If the rename into place fails, the previous directory is moved
 * back, so a failed import never leaves a target missing. Staging directories
 * are removed on any failure, including a failure while staging.
 *
 * @param {object} input
 * @param {string} input.inputDir - Backup directory to restore from.
 * @param {string} input.dataDir - Data directory to replace.
 * @param {string} [input.configDir] - Config directory to replace when the backup has one.
 * @param {boolean} [input.replaceExisting] - Allow replacing non-empty target directories.
 * @returns {Promise<{inputDir: string, dataDir: string, configDir?: string, manifest: object}>}
 * @throws {AiocsError} `backupConflict` when a target is non-empty and
 *   `replaceExisting` is not set, plus any error from payload validation.
 */
async function importBackup(input) {
  const inputDir = resolve5(input.inputDir);
  const dataDir = resolve5(input.dataDir);
  const configDir = input.configDir ? resolve5(input.configDir) : void 0;
  const { manifest, backupDataDir, backupConfigDir } = await loadValidatedBackupPayload(inputDir);

  if (!input.replaceExisting && !await isDirectoryEmpty(dataDir)) {
    throw new AiocsError(
      AIOCS_ERROR_CODES.backupConflict,
      `Backup target data directory is not empty: ${dataDir}`
    );
  }
  if (!input.replaceExisting && configDir && backupConfigDir && !await isDirectoryEmpty(configDir)) {
    throw new AiocsError(
      AIOCS_ERROR_CODES.backupConflict,
      `Backup target config directory is not empty: ${configDir}`
    );
  }

  // Replaces `targetDir` with `stagedDir`, keeping the old directory until the
  // new one is in place so a failed rename can be rolled back.
  const swapIntoPlace = async (stagedDir, targetDir) => {
    const retiredDir = `${stagedDir}.previous`;
    const hadTarget = await pathExists2(targetDir);
    if (hadTarget) {
      await rename(targetDir, retiredDir);
    }
    try {
      await rename(stagedDir, targetDir);
    } catch (error) {
      if (hadTarget) {
        await rename(retiredDir, targetDir);
      }
      throw error;
    }
    if (hadTarget) {
      await rm(retiredDir, { recursive: true, force: true });
    }
  };

  const stagedDirs = [];
  try {
    const stagedDataDir = await prepareReplacementTarget(backupDataDir, dataDir);
    stagedDirs.push(stagedDataDir);

    let stagedConfigDir;
    if (configDir && backupConfigDir) {
      stagedConfigDir = await prepareReplacementTarget(backupConfigDir, configDir);
      stagedDirs.push(stagedConfigDir);
    }

    await swapIntoPlace(stagedDataDir, dataDir);
    if (configDir && stagedConfigDir) {
      await swapIntoPlace(stagedConfigDir, configDir);
    }
  } catch (error) {
    // Staging directories that were already renamed into place no longer exist
    // under their staging name, so `force` makes this a no-op for them.
    for (const stagedDir of stagedDirs) {
      await rm(stagedDir, { recursive: true, force: true });
    }
    throw error;
  }

  return {
    inputDir,
    dataDir,
    ...configDir ? { configDir } : {},
    manifest
  };
}
|
|
3511
|
+
|
|
3512
|
+
// src/coverage.ts
|
|
3513
|
+
import { readFile as readFile3 } from "fs/promises";
|
|
3514
|
+
import { resolve as resolve6 } from "path";
|
|
3515
|
+
/**
 * Produces a comparison key for a piece of markdown text: strips runs of
 * backticks, asterisks, underscores and tildes, collapses whitespace runs to a
 * single space, trims, and lowercases.
 *
 * @param {string} value - Text to normalize.
 * @returns {string} The normalized key (may be empty).
 */
function normalizeText(value) {
  const withoutEmphasis = value.replace(/[`*_~]+/g, "");
  const singleSpaced = withoutEmphasis.replace(/\s+/g, " ");
  return singleSpaced.trim().toLowerCase();
}
|
|
3518
|
+
/**
 * Extracts the text of every ATX-style markdown heading (`#` through `######`).
 *
 * The separator between the hashes and the heading text is restricted to
 * spaces and tabs. Allowing any whitespace would let the match run across a
 * line break, so an empty heading such as "#" followed by a newline would
 * wrongly capture the next line as heading text.
 *
 * @param {string} markdown - Markdown document to scan.
 * @returns {string[]} Trimmed heading texts in document order; empty ones are dropped.
 */
function extractHeadings(markdown) {
  const headingPattern = /^#{1,6}[ \t]+(.+)$/gm;
  const headings = [];
  for (const match of markdown.matchAll(headingPattern)) {
    const text = match[1]?.trim() ?? "";
    if (text) {
      headings.push(text);
    }
  }
  return headings;
}
|
|
3522
|
+
/**
 * Splits markdown into lines and converts each into a normalized comparison
 * key, after dropping a leading heading marker, ordered-list number, or bullet.
 *
 * @param {string} markdown - Markdown text to split.
 * @returns {string[]} Normalized non-empty lines in document order.
 */
function extractComparableLines(markdown) {
  const comparable = [];
  for (const line of markdown.split("\n")) {
    const withoutMarker = line.replace(/^\s*(#{1,6}|\d+\.\s+|[-*+]\s+)/, "").trim();
    const normalized = normalizeText(withoutMarker);
    if (normalized) {
      comparable.push(normalized);
    }
  }
  return comparable;
}
|
|
3525
|
+
/**
 * Determines where a reference heading is found in the corpus.
 *
 * The normalized heading is looked up in the page titles first, then the
 * section titles, then the comparable body lines; the first hit wins.
 *
 * @param {string} heading - Heading text from a reference file.
 * @param {Set<string>} pageTitles - Normalized page titles.
 * @param {Set<string>} sectionTitles - Normalized section titles.
 * @param {Set<string>} comparableMarkdownLines - Normalized body lines.
 * @returns {"page_title"|"section_title"|"body"|null} Match kind, or `null`
 *   when the heading normalizes to an empty string or is not found.
 */
function classifyHeading(heading, pageTitles, sectionTitles, comparableMarkdownLines) {
  const key = normalizeText(heading);
  if (!key) {
    return null;
  }

  const lookupOrder = [
    ["page_title", pageTitles],
    ["section_title", sectionTitles],
    ["body", comparableMarkdownLines]
  ];
  for (const [label, pool] of lookupOrder) {
    if (pool.has(key)) {
      return label;
    }
  }
  return null;
}
|
|
3541
|
+
/**
 * Checks whether every markdown heading in the given reference files can be
 * found in the corpus, as a page title, a section title, or a body line.
 *
 * @param {object} corpus - Object with `sourceId`, `snapshotId`, and `entries`
 *   (each entry provides `pageTitle`, `sectionTitle`, and `markdown`).
 * @param {string[]} referenceFiles - Paths of markdown reference files.
 * @returns {Promise<object>} Report with `complete`, an aggregate `summary`,
 *   and one record per reference file listing its missing headings.
 * @throws {AiocsError} `invalidArgument` when no reference file is given;
 *   `referenceFileNotFound` when a file does not exist (ENOENT);
 *   `invalidReferenceFile` when a file contains no headings.
 */
async function verifyCoverageAgainstReferences(corpus, referenceFiles) {
  if (referenceFiles.length === 0) {
    throw new AiocsError(
      AIOCS_ERROR_CODES.invalidArgument,
      "At least one reference file is required for coverage verification."
    );
  }

  const normalizedSetOf = (pick) => new Set(
    corpus.entries.map((entry) => normalizeText(pick(entry))).filter(Boolean)
  );
  const pageTitles = normalizedSetOf((entry) => entry.pageTitle);
  const sectionTitles = normalizedSetOf((entry) => entry.sectionTitle);
  const comparableMarkdownLines = new Set(
    corpus.entries.flatMap((entry) => extractComparableLines(entry.markdown))
  );

  // Maps a classification result to the counter it increments.
  const counterByMatchType = new Map([
    ["page_title", "pageTitle"],
    ["section_title", "sectionTitle"],
    ["body", "body"]
  ]);

  const files = [];
  const matchCounts = { pageTitle: 0, sectionTitle: 0, body: 0 };
  let headingCount = 0;
  let matchedHeadingCount = 0;
  let missingHeadingCount = 0;

  for (const referenceFile of referenceFiles) {
    const resolvedReferenceFile = resolve6(referenceFile);

    let raw;
    try {
      raw = await readFile3(resolvedReferenceFile, "utf8");
    } catch (error) {
      if (error?.code === "ENOENT") {
        throw new AiocsError(
          AIOCS_ERROR_CODES.referenceFileNotFound,
          `Reference file not found: ${resolvedReferenceFile}`
        );
      }
      throw error;
    }

    const headings = extractHeadings(raw);
    if (headings.length === 0) {
      throw new AiocsError(
        AIOCS_ERROR_CODES.invalidReferenceFile,
        `Reference file does not contain any markdown headings: ${resolvedReferenceFile}`
      );
    }

    const fileMatchCounts = { pageTitle: 0, sectionTitle: 0, body: 0 };
    const missingHeadings = [];
    for (const heading of headings) {
      const matchType = classifyHeading(heading, pageTitles, sectionTitles, comparableMarkdownLines);
      const counter = counterByMatchType.get(matchType);
      if (counter === void 0) {
        missingHeadings.push(heading);
        missingHeadingCount += 1;
        continue;
      }
      fileMatchCounts[counter] += 1;
      matchCounts[counter] += 1;
      matchedHeadingCount += 1;
    }

    headingCount += headings.length;
    files.push({
      referenceFile: resolvedReferenceFile,
      headingCount: headings.length,
      matchedHeadingCount: headings.length - missingHeadings.length,
      missingHeadingCount: missingHeadings.length,
      missingHeadings,
      matchCounts: fileMatchCounts
    });
  }

  return {
    sourceId: corpus.sourceId,
    snapshotId: corpus.snapshotId,
    complete: missingHeadingCount === 0,
    summary: {
      fileCount: files.length,
      headingCount,
      matchedHeadingCount,
      missingHeadingCount,
      matchCounts
    },
    files
  };
}
|
|
3632
|
+
|
|
3633
|
+
// src/doctor.ts
|
|
3634
|
+
import { access as access2 } from "fs/promises";
|
|
3635
|
+
import { execFile } from "child_process";
|
|
3636
|
+
import { promisify } from "util";
|
|
3637
|
+
// Promise-returning wrapper around `execFile`, created with `util.promisify`.
var execFileAsync = promisify(execFile);
|
|
3638
|
+
/**
 * Aggregates individual check results into an overall health summary.
 *
 * Any "fail" makes the result "unhealthy"; otherwise any "warn" makes it
 * "degraded"; otherwise it is "healthy". Checks with another status count
 * towards `checkCount` only.
 *
 * @param {Array<{status: string}>} checks - Check results to tally.
 * @returns {{status: string, checkCount: number, passCount: number, warnCount: number, failCount: number}}
 */
function summarize(checks) {
  const tally = { pass: 0, warn: 0, fail: 0 };
  for (const check of checks) {
    if (Object.hasOwn(tally, check.status)) {
      tally[check.status] += 1;
    }
  }

  let status = "healthy";
  if (tally.fail > 0) {
    status = "unhealthy";
  } else if (tally.warn > 0) {
    status = "degraded";
  }

  return {
    status,
    checkCount: checks.length,
    passCount: tally.pass,
    warnCount: tally.warn,
    failCount: tally.fail
  };
}
|
|
3650
|
+
/**
 * Converts a thrown value into a printable message.
 *
 * @param {unknown} error - Caught value.
 * @returns {string} `error.message` for `Error` instances, otherwise `String(error)`.
 */
function toErrorMessage(error) {
  return error instanceof Error ? error.message : String(error);
}
|
|
3656
|
+
/**
 * Parses a timestamp string with `Date.parse`.
 *
 * @param {string|null|undefined} value - Timestamp text.
 * @returns {number|null} Milliseconds since the epoch, or `null` when the
 *   value is falsy or cannot be parsed.
 */
function parseTimestamp(value) {
  const epochMs = value ? Date.parse(value) : Number.NaN;
  return Number.isNaN(epochMs) ? null : epochMs;
}
|
|
3663
|
+
/**
 * Doctor check: verifies that the catalog can be opened and queried.
 *
 * @param {object} env - Environment used to resolve the data and config directories.
 * @returns {Promise<object>} A "catalog" check with status "pass" (including
 *   source and project-link counts) or "fail" (including the error message).
 */
async function checkCatalog(env) {
  const dataDir = getAiocsDataDir(env);
  const configDir = getAiocsConfigDir(env);
  const locations = { dataDir, configDir };

  let catalog = null;
  try {
    catalog = openCatalog({ dataDir });
    const { length: sourceCount } = catalog.listSources();
    const { length: projectLinkCount } = catalog.listProjectLinks();
    return {
      id: "catalog",
      status: "pass",
      summary: `Catalog opened successfully at ${dataDir}`,
      details: { ...locations, sourceCount, projectLinkCount }
    };
  } catch (error) {
    return {
      id: "catalog",
      status: "fail",
      summary: `Catalog unavailable: ${toErrorMessage(error)}`,
      details: { ...locations }
    };
  } finally {
    // The catalog is closed whether or not the queries succeeded.
    catalog?.close();
  }
}
|
|
3696
|
+
/**
 * Doctor check: verifies that Playwright can be imported and that the Chromium
 * executable it resolves is accessible on disk.
 *
 * @returns {Promise<object>} A "playwright" check with status "pass"
 *   (including the executable path) or "fail".
 */
async function checkPlaywright() {
  const id = "playwright";
  try {
    const playwright = await import("playwright");
    const executablePath = playwright.chromium.executablePath();
    if (!executablePath) {
      return {
        id,
        status: "fail",
        summary: "Playwright is installed but Chromium has no resolved executable path."
      };
    }

    // Rejects (and falls through to the catch below) when the file is not accessible.
    await access2(executablePath);
    return {
      id,
      status: "pass",
      summary: "Playwright Chromium executable is available.",
      details: { executablePath }
    };
  } catch (error) {
    return {
      id,
      status: "fail",
      summary: `Playwright is not ready: ${toErrorMessage(error)}`
    };
  }
}
|
|
3724
|
+
/**
 * Doctor check: parses the daemon configuration from the environment.
 *
 * @param {object} env - Environment passed to `parseDaemonConfig`.
 * @returns {Promise<{daemonConfig: object|null, daemonConfigCheck: object}>}
 *   The parsed configuration (or `null` when parsing threw) together with the
 *   corresponding "daemon-config" check.
 */
async function checkDaemonConfig(env) {
  const id = "daemon-config";
  let daemonConfig;
  try {
    daemonConfig = parseDaemonConfig(env, {
      bundledSourceDir: getBundledSourcesDir()
    });
  } catch (error) {
    return {
      daemonConfig: null,
      daemonConfigCheck: {
        id,
        status: "fail",
        summary: `Daemon configuration is invalid: ${toErrorMessage(error)}`
      }
    };
  }

  return {
    daemonConfig,
    daemonConfigCheck: {
      id,
      status: "pass",
      summary: "Daemon configuration parsed successfully.",
      details: daemonConfig
    }
  };
}
|
|
3749
|
+
/**
 * Doctor check: inspects the configured source spec directories.
 *
 * Outcome, first matching rule wins:
 *  - no daemon config            -> fail
 *  - no directories configured   -> fail
 *  - strict mode and one missing -> fail
 *  - none of them exists         -> warn
 *  - no spec files found         -> warn
 *  - some of them missing        -> warn
 *  - otherwise                   -> pass
 *
 * @param {object|null} daemonConfig - Parsed daemon configuration, or `null` when it is invalid.
 * @returns {Promise<object>} A "source-spec-dirs" check; `details` lists each
 *   directory with its existence flag and spec count.
 */
async function checkSourceSpecDirs(daemonConfig) {
  const id = "source-spec-dirs";
  if (!daemonConfig) {
    return {
      id,
      status: "fail",
      summary: "Source spec directories cannot be validated until daemon configuration is valid."
    };
  }

  const inspectDirectory = async (directory) => {
    const exists = await pathExists(directory);
    const specFiles = exists ? await walkSourceSpecFiles(directory) : [];
    return { directory, exists, specCount: specFiles.length };
  };
  const directories = await Promise.all(daemonConfig.sourceSpecDirs.map(inspectDirectory));

  const anyMissing = directories.some((directory) => !directory.exists);
  const existingCount = directories.filter((directory) => directory.exists).length;
  let totalSpecCount = 0;
  for (const directory of directories) {
    totalSpecCount += directory.specCount;
  }

  const verdict = () => {
    if (directories.length === 0) {
      return ["fail", "No source spec directories are configured."];
    }
    if (daemonConfig.strictSourceSpecDirs && anyMissing) {
      return ["fail", "One or more explicitly configured source spec directories are missing."];
    }
    if (existingCount === 0) {
      return ["warn", "No configured source spec directories currently exist."];
    }
    if (totalSpecCount === 0) {
      return ["warn", "Configured source spec directories exist but contain no source specs."];
    }
    if (anyMissing) {
      return ["warn", "Some optional source spec directories are missing."];
    }
    return [
      "pass",
      `Validated ${directories.length} source spec director${directories.length === 1 ? "y" : "ies"}.`
    ];
  };
  const [status, summary] = verdict();

  return {
    id,
    status,
    summary,
    details: {
      strict: daemonConfig.strictSourceSpecDirs,
      directories
    }
  };
}
|
|
3796
|
+
/**
 * Doctor check: reports sources whose snapshot or canary is overdue.
 *
 * A source counts as a stale snapshot when it has no successful snapshot id or
 * its `nextDueAt` is not in the future. It counts as a stale canary when its
 * `nextCanaryDueAt` is set and not in the future, or its last canary status is
 * "fail".
 *
 * @param {object} env - Environment used to resolve the data directory.
 * @returns {Promise<object>} A "freshness" check: "pass", "warn" (with the
 *   stale lists in `details`), or "fail" when the catalog could not be read.
 */
async function checkFreshness(env) {
  const dataDir = getAiocsDataDir(env);
  let catalog = null;
  try {
    catalog = openCatalog({ dataDir });
    const sources = catalog.listSources();
    const referenceTime = Date.now();

    if (sources.length === 0) {
      return {
        id: "freshness",
        status: "pass",
        summary: "No sources are registered, so no source freshness checks are pending.",
        details: { sourceCount: 0 }
      };
    }

    const isDue = (timestamp) => Date.parse(timestamp) <= referenceTime;
    const minutesSince = (timestamp) => Math.floor((referenceTime - Date.parse(timestamp)) / 6e4);

    const staleSources = [];
    const staleCanaries = [];
    for (const source of sources) {
      if (!source.lastSuccessfulSnapshotId || isDue(source.nextDueAt)) {
        staleSources.push({
          sourceId: source.id,
          nextDueAt: source.nextDueAt,
          lastSuccessfulSnapshotAt: source.lastSuccessfulSnapshotAt,
          lastSuccessfulSnapshotAgeMinutes: source.lastSuccessfulSnapshotAt
            ? minutesSince(source.lastSuccessfulSnapshotAt)
            : null
        });
      }
      const canaryOverdue = source.nextCanaryDueAt && isDue(source.nextCanaryDueAt);
      if (canaryOverdue || source.lastCanaryStatus === "fail") {
        staleCanaries.push({
          sourceId: source.id,
          nextCanaryDueAt: source.nextCanaryDueAt,
          lastCanaryCheckedAt: source.lastCanaryCheckedAt,
          lastCanaryStatus: source.lastCanaryStatus
        });
      }
    }

    const fresh = staleSources.length === 0 && staleCanaries.length === 0;
    return {
      id: "freshness",
      status: fresh ? "pass" : "warn",
      summary: fresh
        ? "Source snapshots and canaries are fresh."
        : `Source freshness issues detected: ${staleSources.length} stale snapshot scope(s), ${staleCanaries.length} stale/failed canary scope(s).`,
      details: {
        sourceCount: sources.length,
        staleSources,
        staleCanaries,
        checkedAt: new Date(referenceTime).toISOString()
      }
    };
  } catch (error) {
    return {
      id: "freshness",
      status: "fail",
      summary: `Freshness checks failed: ${toErrorMessage(error)}`
    };
  } finally {
    catalog?.close();
  }
}
|
|
3850
|
+
/**
 * Doctor check: evaluates the daemon state recorded in the catalog.
 *
 * Warns when no state or no completed cycle is recorded, when the last
 * completed cycle is older than twice the recorded interval (60 minutes when
 * none is recorded), or when the last cycle ended "failed" or "degraded".
 *
 * @param {object} env - Environment used to resolve the data directory.
 * @returns {Promise<object>} A "daemon-heartbeat" check: "pass", "warn", or
 *   "fail" when the catalog could not be read.
 */
async function checkDaemonHeartbeat(env) {
  const id = "daemon-heartbeat";
  const dataDir = getAiocsDataDir(env);
  let catalog = null;
  try {
    catalog = openCatalog({ dataDir });
    const daemonState = catalog.getDaemonState();
    if (!daemonState) {
      return {
        id,
        status: "warn",
        summary: "No daemon heartbeat has been recorded yet."
      };
    }

    const intervalMinutes = daemonState.intervalMinutes ?? 60;
    const completedAt = parseTimestamp(daemonState.lastCycleCompletedAt);
    if (!completedAt) {
      return {
        id,
        status: "warn",
        summary: "Daemon heartbeat exists but no completed cycle has been recorded yet.",
        details: daemonState
      };
    }

    const ageMinutes = Math.floor((Date.now() - completedAt) / 6e4);
    const lastStatus = daemonState.lastCycleStatus;
    const stale = ageMinutes > intervalMinutes * 2;
    const unhealthyStatus = lastStatus === "failed" || lastStatus === "degraded";
    const needsAttention = stale || unhealthyStatus;

    let summary = `Daemon heartbeat is recent (age=${ageMinutes}m).`;
    if (needsAttention) {
      summary = `Daemon heartbeat is stale or unhealthy (age=${ageMinutes}m, status=${lastStatus ?? "unknown"}).`;
    }

    return {
      id,
      status: needsAttention ? "warn" : "pass",
      summary,
      details: { ...daemonState, ageMinutes }
    };
  } catch (error) {
    return {
      id,
      status: "fail",
      summary: `Daemon heartbeat check failed: ${toErrorMessage(error)}`
    };
  } finally {
    catalog?.close();
  }
}
|
|
3895
|
+
/**
 * Doctor check for the embedding provider.
 *
 * Builds the hybrid runtime config from `env`, asks getEmbeddingProviderStatus for the
 * provider status and maps `ok` to "pass" / not-ok to "warn". Any thrown error is
 * reported as "fail" instead of propagating.
 *
 * @param {object} env - Environment map forwarded to getHybridRuntimeConfig.
 * @returns {Promise<object>} Check result with `id`, `status`, `summary` and, unless failed, `details`.
 */
async function checkEmbeddingProvider(env) {
  const id = "embedding-provider";
  try {
    const runtimeConfig = getHybridRuntimeConfig(env);
    const providerStatus = await getEmbeddingProviderStatus(runtimeConfig);
    if (providerStatus.ok) {
      return {
        id,
        status: "pass",
        summary: `Embedding provider is ready with model ${providerStatus.model}.`,
        details: providerStatus
      };
    }
    return {
      id,
      status: "warn",
      summary: `Embedding provider is reachable but model ${providerStatus.model} is not available locally.`,
      details: providerStatus
    };
  } catch (error) {
    return {
      id,
      status: "fail",
      summary: `Embedding provider check failed: ${toErrorMessage(error)}`
    };
  }
}
|
|
3913
|
+
/**
 * Doctor check for the vector store (Qdrant).
 *
 * Builds the hybrid runtime config from `env`, asks a fresh AiocsVectorStore for its
 * health and reports "pass" when healthy, "warn" when not, and "fail" when anything throws.
 * `details` always carries the configured URL and collection, plus the reported
 * `collections` when the health result includes them.
 *
 * @param {object} env - Environment map forwarded to getHybridRuntimeConfig.
 * @returns {Promise<object>} Check result with `id`, `status`, `summary` and, unless failed, `details`.
 */
async function checkVectorStore(env) {
  const id = "vector-store";
  try {
    const config = getHybridRuntimeConfig(env);
    const health = await new AiocsVectorStore(config).getHealth();

    const details = {
      qdrantUrl: config.qdrantUrl,
      collection: config.qdrantCollection
    };
    if (health.collections) {
      details.collections = health.collections;
    }

    if (health.ok) {
      return {
        id,
        status: "pass",
        summary: `Qdrant is reachable at ${config.qdrantUrl}.`,
        details
      };
    }
    return {
      id,
      status: "warn",
      summary: `Qdrant is not ready at ${config.qdrantUrl}: ${health.errorMessage ?? "unknown error"}`,
      details
    };
  } catch (error) {
    return {
      id,
      status: "fail",
      summary: `Vector store check failed: ${toErrorMessage(error)}`
    };
  }
}
|
|
3935
|
+
/**
 * Doctor check for embedding coverage and the embedding job queue.
 *
 * Reads the embedding overview from the catalog and reports "warn" when any job is
 * failed, pending or running, or when any source with chunks has fewer indexed chunks
 * than total chunks; "pass" otherwise; "fail" when reading the catalog throws.
 *
 * @param {object} env - Environment map forwarded to getAiocsDataDir.
 * @returns {Promise<object>} Check result with `id`, `status`, `summary` and, unless failed,
 *   `details` holding the queue counters and the under-indexed sources.
 */
async function checkEmbeddings(env) {
  const id = "embeddings";
  const dataDir = getAiocsDataDir(env);
  let catalog = null;
  try {
    catalog = openCatalog({ dataDir });
    const { sources, queue } = catalog.getEmbeddingOverview();

    const underIndexedSources = sources
      .filter(({ totalChunks, indexedChunks }) => totalChunks > 0 && indexedChunks < totalChunks)
      .map(({
        sourceId,
        snapshotId,
        coverageRatio,
        totalChunks,
        indexedChunks,
        pendingChunks,
        failedChunks,
        staleChunks
      }) => ({
        sourceId,
        snapshotId,
        coverageRatio,
        totalChunks,
        indexedChunks,
        pendingChunks,
        failedChunks,
        staleChunks
      }));

    // Failed jobs and any remaining backlog are both reported as "warn".
    const hasBacklog = queue.failedJobs > 0
      || underIndexedSources.length > 0
      || queue.pendingJobs > 0
      || queue.runningJobs > 0;

    const details = { queue, underIndexedSources };
    if (hasBacklog) {
      return {
        id,
        status: "warn",
        summary: `Embedding backlog detected: ${queue.pendingJobs} pending, ${queue.runningJobs} running, ${queue.failedJobs} failed job(s).`,
        details
      };
    }
    return {
      id,
      status: "pass",
      summary: "Embedding coverage is complete for latest snapshots.",
      details
    };
  } catch (error) {
    return {
      id,
      status: "fail",
      summary: `Embedding status check failed: ${toErrorMessage(error)}`
    };
  } finally {
    catalog?.close();
  }
}
|
|
3971
|
+
/**
 * Doctor check for Docker availability.
 *
 * Runs `docker info --format "{{json .ServerVersion}}"` and parses the JSON-encoded
 * server version from stdout. Any failure (spawn error, non-parsable output, ...) is
 * reported as "warn"; an error message containing "ENOENT" is reported as a missing CLI.
 *
 * @returns {Promise<object>} Check result with `id`, `status`, `summary` and, on success, `details`.
 */
async function checkDocker() {
  const id = "docker";
  try {
    const dockerInfo = await execFileAsync("docker", ["info", "--format", "{{json .ServerVersion}}"]);
    const serverVersion = JSON.parse(dockerInfo.stdout.trim());
    return {
      id,
      status: "pass",
      summary: `Docker is available (server ${serverVersion}).`,
      details: { serverVersion }
    };
  } catch (error) {
    const message = toErrorMessage(error);
    const summary = message.includes("ENOENT")
      ? "Docker CLI is not installed; Docker-based daemon deployment is unavailable on this machine."
      : `Docker is not ready: ${message}`;
    return { id, status: "warn", summary };
  }
}
|
|
3999
|
+
/**
 * Runs every doctor check, one after another, and returns the combined report.
 *
 * The checks run sequentially in a fixed order; the source-spec-directory check receives
 * the daemon config produced by the daemon-config check.
 *
 * @param {object} [env=process.env] - Environment map forwarded to the env-aware checks.
 * @returns {Promise<{summary: *, checks: object[]}>} All check results in execution order,
 *   plus whatever `summarize` derives from them.
 */
async function runDoctor(env = process.env) {
  const checks = [];

  checks.push(await checkCatalog(env));
  checks.push(await checkPlaywright());

  const daemonConfigOutcome = await checkDaemonConfig(env);
  checks.push(daemonConfigOutcome.daemonConfigCheck);
  checks.push(await checkSourceSpecDirs(daemonConfigOutcome.daemonConfig));

  const envAwareChecks = [
    checkFreshness,
    checkDaemonHeartbeat,
    checkEmbeddingProvider,
    checkVectorStore,
    checkEmbeddings
  ];
  for (const runCheck of envAwareChecks) {
    checks.push(await runCheck(env));
  }

  checks.push(await checkDocker());

  return {
    summary: summarize(checks),
    checks
  };
}
|
|
4027
|
+
|
|
4028
|
+
// src/hybrid/rank.ts
|
|
4029
|
+
/**
 * Fuses several ranked candidate lists with reciprocal rank fusion.
 *
 * Every occurrence of a chunk contributes `1 / (rrfK + rank)` to that chunk's fused
 * score, and the distinct `signal` labels that produced it are collected.
 *
 * @param {Iterable<Iterable<{chunkId: *, rank: number, signal: string}>>} candidateLists - Ranked lists to merge.
 * @param {number} rrfK - Smoothing constant added to each rank.
 * @returns {Array<{chunkId: *, fusedScore: number, signals: string[]}>} Entries sorted by
 *   descending fused score, ties broken by ascending `chunkId` (numeric subtraction).
 */
function reciprocalRankFusion(candidateLists, rrfK) {
  const accumulators = /* @__PURE__ */ new Map();

  for (const list of candidateLists) {
    for (const { chunkId, rank, signal } of list) {
      let entry = accumulators.get(chunkId);
      if (entry == null) {
        entry = { fusedScore: 0, signals: /* @__PURE__ */ new Set() };
        accumulators.set(chunkId, entry);
      }
      entry.fusedScore += 1 / (rrfK + rank);
      entry.signals.add(signal);
    }
  }

  const fused = [];
  for (const [chunkId, entry] of accumulators) {
    fused.push({
      chunkId,
      fusedScore: entry.fusedScore,
      signals: [...entry.signals]
    });
  }

  fused.sort((left, right) => {
    const byScore = right.fusedScore - left.fusedScore;
    return byScore || left.chunkId - right.chunkId;
  });
  return fused;
}
|
|
4048
|
+
|
|
4049
|
+
// src/hybrid/search.ts
|
|
4050
|
+
/**
 * Computes how many candidates to request so that the page `[offset, offset + limit)`
 * can be served, never going below `minimum`.
 *
 * @param {number} limit - Page size.
 * @param {number} offset - Number of leading results skipped.
 * @param {number} minimum - Lower bound for the window.
 * @returns {number} The larger of `limit + offset` and `minimum`.
 */
function windowSize(limit, offset, minimum) {
  const requested = limit + offset;
  return Math.max(minimum, requested);
}
|
|
4053
|
+
/**
 * Attaches `score` and `signals` to each row, looked up by the row's `chunkId`.
 *
 * Rows missing from the lookup get score 0 and signals ["lexical"].
 *
 * @param {Array<object>} rows - Rows carrying a `chunkId`.
 * @param {Map<*, {score: number, signals: string[]}>} scoreLookup - Scores keyed by chunk id.
 * @returns {Array<object>} New row objects; the input rows are not modified.
 */
function withScores(rows, scoreLookup) {
  return rows.map((row) => {
    const { score, signals } = scoreLookup.get(row.chunkId) ?? {
      score: 0,
      signals: ["lexical"]
    };
    return { ...row, score, signals };
  });
}
|
|
4066
|
+
/**
 * Searches the catalog in the requested mode and returns one page of scored results.
 *
 * Flow:
 * 1. Resolve the search scope; an empty scope (no snapshot ids) yields an empty page.
 * 2. "lexical" mode returns lexical results only.
 * 3. "auto" mode falls back to lexical when a scoped snapshot is not fully indexed, when
 *    embedding/vector search throws, or when vector search returns nothing.
 * 4. "semantic" mode returns vector results in vector order.
 * 5. Any other mode (including "auto" that did not fall back) fuses lexical and vector
 *    candidates with reciprocal rank fusion ("hybrid").
 *
 * For semantic and hybrid results, `total` counts the candidates that could be resolved to
 * chunk rows within the requested candidate window.
 *
 * @param {object} input
 * @param {object} input.catalog - Catalog exposing resolveSearchScope, searchLexical,
 *   getEmbeddingOverview and getChunksByIds.
 * @param {object} input.config - Hybrid runtime config (candidate windows, rrfK, ...).
 * @param {string} input.query - Search text.
 * @param {string} input.mode - Requested search mode.
 * @param {object} input.searchInput - Optional cwd, sourceIds, snapshotId, all, limit, offset.
 * @returns {Promise<object>} Page with query, total, limit, offset, hasMore, modeRequested,
 *   modeUsed and results.
 * @throws Re-throws embedding/vector-store errors unless the mode is "auto".
 */
async function searchHybridCatalog(input) {
  const { catalog, config, query, mode, searchInput } = input;

  // Forward only the scope options that were actually supplied.
  const scopeRequest = { query };
  if (searchInput.cwd) {
    scopeRequest.cwd = searchInput.cwd;
  }
  if (searchInput.sourceIds) {
    scopeRequest.sourceIds = searchInput.sourceIds;
  }
  if (searchInput.snapshotId) {
    scopeRequest.snapshotId = searchInput.snapshotId;
  }
  if (searchInput.all) {
    scopeRequest.all = true;
  }
  if (typeof searchInput.limit === "number") {
    scopeRequest.limit = searchInput.limit;
  }
  if (typeof searchInput.offset === "number") {
    scopeRequest.offset = searchInput.offset;
  }
  const scope = catalog.resolveSearchScope(scopeRequest);

  // Lexical-only search; also the fallback for "auto" mode.
  const runLexicalSearch = () => {
    const lexical = catalog.searchLexical({ query, scope });
    return {
      query,
      total: lexical.total,
      limit: lexical.limit,
      offset: lexical.offset,
      hasMore: lexical.hasMore,
      modeRequested: mode,
      modeUsed: "lexical",
      results: lexical.results.map((result, index) => ({
        ...result,
        score: 1 / (index + 1),
        signals: ["lexical"]
      }))
    };
  };

  // Loads the rows for the ranked chunk ids, drops ids without a row and returns the page.
  const buildRankedPage = (rankedChunkIds, scoreLookup, modeUsed) => {
    const rowsByChunkId = new Map(
      catalog.getChunksByIds(rankedChunkIds).map((row) => [row.chunkId, row])
    );
    const rankedRows = rankedChunkIds
      .map((chunkId) => rowsByChunkId.get(chunkId))
      .filter((row) => Boolean(row));
    const pageRows = rankedRows.slice(scope.offset, scope.offset + scope.limit);
    return {
      query,
      total: rankedRows.length,
      limit: scope.limit,
      offset: scope.offset,
      hasMore: scope.offset + pageRows.length < rankedRows.length,
      modeRequested: mode,
      modeUsed,
      results: withScores(pageRows, scoreLookup)
    };
  };

  if (scope.snapshotIds.length === 0) {
    return {
      query,
      total: 0,
      limit: scope.limit,
      offset: scope.offset,
      hasMore: false,
      modeRequested: mode,
      modeUsed: mode === "semantic" ? "semantic" : "lexical",
      results: []
    };
  }

  if (mode === "lexical") {
    return runLexicalSearch();
  }

  const overview = catalog.getEmbeddingOverview();
  const scopedSnapshotIds = new Set(scope.snapshotIds);
  const allSnapshotsIndexed = overview.sources
    .filter((source) => Boolean(source.snapshotId) && scopedSnapshotIds.has(source.snapshotId))
    .every((source) => source.totalChunks > 0 && source.indexedChunks === source.totalChunks);
  if (mode === "auto" && !allSnapshotsIndexed) {
    return runLexicalSearch();
  }

  // Resolved outside the try block: a failure here propagates even in "auto" mode.
  const modelKey = getEmbeddingModelKey(config);
  let vectorCandidates = [];
  try {
    const queryVector = (await embedTexts(config, [query]))[0];
    if (!queryVector) {
      throw new AiocsError(
        AIOCS_ERROR_CODES.embeddingProviderUnavailable,
        "Embedding provider returned no vector for the search query"
      );
    }
    vectorCandidates = await new AiocsVectorStore(config).search({
      vector: queryVector,
      snapshotIds: scope.snapshotIds,
      sourceIds: scope.sourceIds,
      modelKey,
      limit: windowSize(scope.limit, scope.offset, config.vectorCandidateWindow)
    });
  } catch (error) {
    if (mode === "auto") {
      return runLexicalSearch();
    }
    throw error;
  }

  if (mode === "auto" && vectorCandidates.length === 0) {
    return runLexicalSearch();
  }

  if (mode === "semantic") {
    const vectorScores = new Map(vectorCandidates.map((candidate) => [
      candidate.chunkId,
      { score: candidate.score, signals: ["vector"] }
    ]));
    return buildRankedPage(
      vectorCandidates.map((candidate) => candidate.chunkId),
      vectorScores,
      "semantic"
    );
  }

  // Hybrid: always fetch lexical candidates from offset 0 so both lists are ranked from the top.
  const lexicalCandidates = catalog.searchLexical({
    query,
    scope,
    limit: windowSize(scope.limit, scope.offset, config.lexicalCandidateWindow),
    offset: 0
  });
  const lexicalRanked = lexicalCandidates.results.map((result, index) => ({
    chunkId: result.chunkId,
    rank: index + 1,
    signal: "lexical"
  }));
  const vectorRanked = vectorCandidates.map((result, index) => ({
    chunkId: result.chunkId,
    rank: index + 1,
    signal: "vector",
    score: result.score
  }));
  const fused = reciprocalRankFusion([lexicalRanked, vectorRanked], config.rrfK);
  const fusedScores = new Map(fused.map((candidate) => [
    candidate.chunkId,
    { score: candidate.fusedScore, signals: candidate.signals }
  ]));
  return buildRankedPage(fused.map((result) => result.chunkId), fusedScores, "hybrid");
}
|
|
4214
|
+
|
|
4215
|
+
// src/services.ts
|
|
4216
|
+
/**
 * Opens the catalog stored under the default aiocs data directory.
 *
 * @returns {{dataDir: *, catalog: object}} The resolved data directory and the opened
 *   catalog; the caller is responsible for closing the catalog.
 */
function createCatalog() {
  const dataDir = getAiocsDataDir();
  // Return value unused; presumably called for a side effect such as preparing the
  // config directory. TODO confirm against getAiocsConfigDir.
  getAiocsConfigDir();
  const catalog = openCatalog({ dataDir });
  return { dataDir, catalog };
}
|
|
4224
|
+
/**
 * Opens the catalog, hands it to `run`, and always closes it afterwards.
 *
 * `run` may be synchronous or asynchronous. The catalog is closed whether `run` returns,
 * throws synchronously, or returns a promise that rejects. A synchronous throw is surfaced
 * as a rejection of the returned promise, so callers need only one error path.
 *
 * @param {(ctx: {dataDir: *, catalog: object}) => *} run - Work to perform with the open catalog.
 * @returns {Promise<*>} Resolves with the (awaited) return value of `run`.
 */
async function withCatalog(run) {
  const ctx = createCatalog();
  try {
    // `await` keeps the catalog open until asynchronous work finishes. try/finally also
    // covers a synchronous throw from `run`, which a `.finally()` chained onto
    // `Promise.resolve(run(ctx))` would never see.
    return await run(ctx);
  } finally {
    ctx.catalog.close();
  }
}
|
|
4228
|
+
/**
 * Loads a source spec file and upserts the source it describes into the catalog.
 *
 * @param {string} specFile - Path to the spec file; resolved to an absolute path.
 * @returns {Promise<{sourceId: *, configHash: *, specPath: string}>} Identifiers reported by
 *   the catalog plus the absolute spec path.
 */
async function upsertSourceFromSpecFile(specFile) {
  const specPath = resolve7(specFile);
  const spec = await loadSourceSpec(specPath);
  const { sourceId, configHash } = await withCatalog(
    ({ catalog }) => catalog.upsertSource(spec, { specPath })
  );
  return { sourceId, configHash, specPath };
}
|
|
4238
|
+
/**
 * Lists every source registered in the catalog.
 *
 * @returns {Promise<{sources: *}>} Whatever `catalog.listSources()` returns, wrapped under `sources`.
 */
async function listSources() {
  return {
    sources: await withCatalog((ctx) => ctx.catalog.listSources())
  };
}
|
|
4242
|
+
/**
 * Fetches one source, or every registered source when given "all", then processes the
 * embedding job queue.
 *
 * Sources are fetched sequentially. When "all" is requested and no sources are
 * registered, nothing is fetched and the embedding queue is not processed.
 *
 * @param {string} sourceIdOrAll - A source id, or the literal "all".
 * @returns {Promise<{results: Array<{sourceId: *, snapshotId: *, pageCount: *, reused: *}>}>}
 *   One entry per fetched source, in fetch order.
 */
async function fetchSources(sourceIdOrAll) {
  const results = await withCatalog(async ({ catalog, dataDir }) => {
    let sourceIds;
    if (sourceIdOrAll === "all") {
      sourceIds = catalog.listSources().map((item) => item.id);
    } else {
      sourceIds = [sourceIdOrAll];
    }
    if (sourceIds.length === 0) {
      return [];
    }

    const fetched = [];
    for (const sourceId of sourceIds) {
      const { snapshotId, pageCount, reused } = await fetchSource({ catalog, sourceId, dataDir });
      fetched.push({ sourceId, snapshotId, pageCount, reused });
    }

    await processEmbeddingJobs({
      catalog,
      config: getHybridRuntimeConfig()
    });
    return fetched;
  });
  return { results };
}
|
|
4266
|
+
/**
 * Re-fetches the sources the catalog reports as due, then processes the embedding queue.
 *
 * With "all", every due source is fetched. With a specific id, the source must exist and
 * is fetched only if it is among the due ids. The embedding queue is processed even when
 * nothing was due.
 *
 * @param {string} [sourceIdOrAll="all"] - A source id, or the literal "all".
 * @returns {Promise<{results: Array<{sourceId: *, snapshotId: *, pageCount: *, reused: *}>}>}
 *   One entry per re-fetched source.
 * @throws {AiocsError} SOURCE_NOT_FOUND when a specific id has no source spec.
 */
async function refreshDueSources(sourceIdOrAll = "all") {
  const results = await withCatalog(async ({ catalog, dataDir }) => {
    let dueIds;
    if (sourceIdOrAll === "all") {
      dueIds = catalog.listDueSourceIds();
    } else {
      // Validate the id first so an unknown source is an error, not an empty result.
      if (!catalog.getSourceSpec(sourceIdOrAll)) {
        throw new AiocsError(
          AIOCS_ERROR_CODES.sourceNotFound,
          `Unknown source '${sourceIdOrAll}'`
        );
      }
      const isDue = catalog.listDueSourceIds().includes(sourceIdOrAll);
      dueIds = isDue ? [sourceIdOrAll] : [];
    }

    const fetched = [];
    for (const sourceId of dueIds) {
      const { snapshotId, pageCount, reused } = await fetchSource({ catalog, sourceId, dataDir });
      fetched.push({ sourceId, snapshotId, pageCount, reused });
    }

    await processEmbeddingJobs({
      catalog,
      config: getHybridRuntimeConfig()
    });
    return fetched;
  });
  return { results };
}
|
|
4296
|
+
/**
 * Runs the canary for one source, or for every registered source when given "all".
 *
 * Canaries run sequentially and receive `process.env` as their environment.
 *
 * @param {string} sourceIdOrAll - A source id, or the literal "all".
 * @returns {Promise<{results: Array<*>}>} The runSourceCanary results in execution order.
 */
async function runSourceCanaries(sourceIdOrAll) {
  const results = await withCatalog(async ({ catalog }) => {
    let sourceIds;
    if (sourceIdOrAll === "all") {
      sourceIds = catalog.listSources().map((item) => item.id);
    } else {
      sourceIds = [sourceIdOrAll];
    }
    if (sourceIds.length === 0) {
      return [];
    }

    const canaried = [];
    for (const sourceId of sourceIds) {
      const outcome = await runSourceCanary({ catalog, sourceId, env: process.env });
      canaried.push(outcome);
    }
    return canaried;
  });
  return { results };
}
|
|
4314
|
+
/**
 * Lists the snapshots the catalog holds for one source.
 *
 * @param {*} sourceId - Source identifier passed to `catalog.listSnapshots`.
 * @returns {Promise<{sourceId: *, snapshots: *}>} The source id echoed back with its snapshots.
 */
async function listSnapshotsForSource(sourceId) {
  const snapshots = await withCatalog((ctx) => ctx.catalog.listSnapshots(sourceId));
  return { sourceId, snapshots };
}
|
|
4321
|
+
/**
 * Computes a snapshot diff by delegating to `catalog.diffSnapshots`.
 *
 * @param {object} input - Passed through unchanged to `catalog.diffSnapshots`.
 * @returns {Promise<*>} Whatever the catalog returns for the diff.
 */
async function diffSnapshotsForSource(input) {
  const diff = await withCatalog((ctx) => ctx.catalog.diffSnapshots(input));
  return diff;
}
|
|
4324
|
+
/**
 * Links sources to a project directory in the catalog.
 *
 * @param {string} projectPath - Project directory; resolved to an absolute path.
 * @param {Array<*>} sourceIds - Source ids to link.
 * @returns {Promise<{projectPath: string, sourceIds: Array<*>}>} The absolute project path
 *   and the ids that were passed in.
 */
async function linkProjectSources(projectPath, sourceIds) {
  const absoluteProjectPath = resolve7(projectPath);
  await withCatalog((ctx) => {
    ctx.catalog.linkProject(absoluteProjectPath, sourceIds);
  });
  return { projectPath: absoluteProjectPath, sourceIds };
}
|
|
4334
|
+
/**
 * Removes links between a project directory and sources in the catalog.
 *
 * @param {string} projectPath - Project directory; resolved to an absolute path.
 * @param {*} sourceIds - Forwarded unchanged to `catalog.unlinkProject`.
 * @returns {Promise<{projectPath: string, sourceIds: *}>} The absolute project path and the
 *   `sourceIds` argument as given.
 */
async function unlinkProjectSources(projectPath, sourceIds) {
  const absoluteProjectPath = resolve7(projectPath);
  await withCatalog((ctx) => {
    ctx.catalog.unlinkProject(absoluteProjectPath, sourceIds);
  });
  return { projectPath: absoluteProjectPath, sourceIds };
}
|
|
4344
|
+
/**
 * Searches the catalog for `query`, scoped by explicit sources, by `--all`, or by the
 * project linked to the working directory.
 *
 * @param {string} query - Search text.
 * @param {object} options
 * @param {string} [options.project] - Project directory used as the working directory;
 *   defaults to `process.cwd()`.
 * @param {Array<*>} options.source - Explicit source ids (may be empty).
 * @param {*} [options.snapshot] - Snapshot id to search.
 * @param {boolean} [options.all] - Search across all sources.
 * @param {string} [options.mode] - Search mode; defaults to the configured default mode.
 * @param {number} [options.limit] - Page size.
 * @param {number} [options.offset] - Page offset.
 * @returns {Promise<object>} Page with query, total, limit, offset, hasMore, modeRequested,
 *   modeUsed and results.
 * @throws {AiocsError} NO_PROJECT_SCOPE when neither explicit sources, `all`, nor a linked
 *   project scope is available.
 */
async function searchCatalog(query, options) {
  const cwd = options.project ? resolve7(options.project) : process.cwd();
  const explicitSources = options.source.length > 0;

  const page = await withCatalog(({ catalog }) => {
    const hybridConfig = getHybridRuntimeConfig();
    const projectScope = resolveProjectScope(cwd, catalog.listProjectLinks());
    const hasAnyScope = explicitSources || options.all || projectScope;
    if (!hasAnyScope) {
      throw new AiocsError(
        AIOCS_ERROR_CODES.noProjectScope,
        "No linked project scope found. Use --source or --all."
      );
    }

    // Forward only the options the caller actually supplied.
    const searchInput = { cwd };
    if (explicitSources) {
      searchInput.sourceIds = options.source;
    }
    if (options.snapshot) {
      searchInput.snapshotId = options.snapshot;
    }
    if (options.all) {
      searchInput.all = true;
    }
    if (typeof options.limit === "number") {
      searchInput.limit = options.limit;
    }
    if (typeof options.offset === "number") {
      searchInput.offset = options.offset;
    }

    return searchHybridCatalog({
      catalog,
      config: hybridConfig,
      query,
      mode: options.mode ?? hybridConfig.defaultSearchMode,
      searchInput
    });
  });

  const { total, limit, offset, hasMore, modeRequested, modeUsed, results } = page;
  return { query, total, limit, offset, hasMore, modeRequested, modeUsed, results };
}
|
|
4382
|
+
/**
 * Loads a single chunk by id.
 *
 * @param {*} chunkId - Chunk identifier passed to `catalog.getChunkById`.
 * @returns {Promise<{chunk: object}>} The chunk wrapped under `chunk`.
 * @throws {AiocsError} CHUNK_NOT_FOUND when the catalog returns a falsy value.
 */
async function showChunk(chunkId) {
  const chunk = await withCatalog((ctx) => ctx.catalog.getChunkById(chunkId));
  if (chunk) {
    return { chunk };
  }
  throw new AiocsError(
    AIOCS_ERROR_CODES.chunkNotFound,
    `Chunk ${chunkId} not found`
  );
}
|
|
4392
|
+
/**
 * Verifies a source's coverage corpus against a set of reference files.
 *
 * @param {object} input
 * @param {*} input.sourceId - Source whose corpus is loaded.
 * @param {*} [input.snapshotId] - Snapshot to load; omitted from the request when falsy.
 * @param {*} input.referenceFiles - Forwarded to verifyCoverageAgainstReferences.
 * @returns {Promise<*>} The result of verifyCoverageAgainstReferences.
 */
async function verifyCoverage(input) {
  return withCatalog(async ({ catalog }) => {
    const corpusRequest = { sourceId: input.sourceId };
    if (input.snapshotId) {
      corpusRequest.snapshotId = input.snapshotId;
    }
    const corpus = catalog.getCoverageCorpus(corpusRequest);
    return verifyCoverageAgainstReferences(corpus, input.referenceFiles);
  });
}
|
|
4401
|
+
/**
 * Bootstraps source specs from a spec directory (the bundled sources directory by
 * default) and optionally fetches every bootstrapped source.
 *
 * When fetching is enabled, sources are fetched sequentially and the embedding job queue
 * is processed afterwards.
 *
 * @param {object} [options]
 * @param {string} [options.sourceSpecDir] - Directory holding the source specs.
 * @param {boolean} [options.fetch] - Whether to fetch the bootstrapped sources (default false).
 * @returns {Promise<object>} sourceSpecDir, userSourceDir, fetched, initializedSources,
 *   removedSourceIds and fetchResults (empty when nothing was fetched).
 */
async function initBuiltInSources(options) {
  const sourceSpecDir = options?.sourceSpecDir ?? getBundledSourcesDir();
  const fetched = options?.fetch ?? false;
  const userSourceDir = getAiocsSourcesDir();

  return withCatalog(async ({ catalog, dataDir }) => {
    const { sources: initializedSources, removedSourceIds } = await bootstrapSourceSpecs({
      catalog,
      sourceSpecDirs: [sourceSpecDir],
      strictSourceSpecDirs: true
    });

    const fetchResults = [];
    if (fetched) {
      for (const { sourceId } of initializedSources) {
        const { snapshotId, pageCount, reused } = await fetchSource({ catalog, dataDir, sourceId });
        fetchResults.push({ sourceId, snapshotId, pageCount, reused });
      }
      await processEmbeddingJobs({
        catalog,
        config: getHybridRuntimeConfig()
      });
    }

    return {
      sourceSpecDir,
      userSourceDir,
      fetched,
      initializedSources,
      removedSourceIds,
      fetchResults
    };
  });
}
|
|
4441
|
+
/**
 * Reports the two source-spec directories: the bundled one and the user one.
 *
 * @returns {{bundledSourceDir: *, userSourceDir: *}} Values of getBundledSourcesDir() and
 *   getAiocsSourcesDir().
 */
function getManagedSourceSpecDirectories() {
  const bundledSourceDir = getBundledSourcesDir();
  const userSourceDir = getAiocsSourcesDir();
  return { bundledSourceDir, userSourceDir };
}
|
|
4447
|
+
/**
 * Service-level entry point for the doctor report; delegates to runDoctor.
 *
 * @param {object} [env=process.env] - Environment map forwarded to runDoctor.
 * @returns {Promise<object>} The promise returned by runDoctor.
 */
function getDoctorReport(env = process.env) {
  const report = runDoctor(env);
  return report;
}
|
|
4450
|
+
/**
 * Exports a backup of the default data and config directories.
 *
 * @param {object} input
 * @param {*} input.outputDir - Destination forwarded to exportBackup.
 * @param {boolean} [input.replaceExisting] - Forwarded only when it is a boolean.
 * @returns {Promise<*>} The result of exportBackup.
 */
async function exportCatalogBackup(input) {
  const exportOptions = {
    dataDir: getAiocsDataDir(),
    configDir: getAiocsConfigDir(),
    outputDir: input.outputDir
  };
  if (typeof input.replaceExisting === "boolean") {
    exportOptions.replaceExisting = input.replaceExisting;
  }
  return exportBackup(exportOptions);
}
|
|
4458
|
+
/**
 * Imports a backup into the default data and config directories, then resets embedding
 * state so it no longer refers to pre-import vectors.
 *
 * After the import, the vector collection is cleared on a best-effort basis and the
 * catalog's embedding bookkeeping is reset via `resetEmbeddingsAfterImport`.
 *
 * @param {object} input
 * @param {*} input.inputDir - Backup location forwarded to importBackup.
 * @param {boolean} [input.replaceExisting] - Forwarded only when it is a boolean.
 * @returns {Promise<*>} The result of importBackup.
 */
async function importCatalogBackup(input) {
  const importOptions = {
    inputDir: input.inputDir,
    dataDir: getAiocsDataDir(),
    configDir: getAiocsConfigDir()
  };
  if (typeof input.replaceExisting === "boolean") {
    importOptions.replaceExisting = input.replaceExisting;
  }
  const result = await importBackup(importOptions);

  try {
    const vectorStore = new AiocsVectorStore(getHybridRuntimeConfig());
    await vectorStore.clearCollection();
  } catch {
    // Deliberately ignored: a failure to clear the vector collection must not fail an
    // import that already succeeded.
  }

  await withCatalog((ctx) => {
    ctx.catalog.resetEmbeddingsAfterImport();
  });
  return result;
}
|
|
4474
|
+
/**
 * Returns the catalog's embedding overview.
 *
 * @returns {Promise<*>} Whatever `catalog.getEmbeddingOverview()` returns.
 */
async function getEmbeddingStatus() {
  const overview = await withCatalog((ctx) => ctx.catalog.getEmbeddingOverview());
  return overview;
}
|
|
4477
|
+
/**
 * Re-queues embedding jobs for the latest snapshots of one source or of all sources.
 *
 * @param {string} sourceIdOrAll - A source id, or the literal "all".
 * @returns {Promise<*>} Whatever `catalog.requeueLatestEmbeddingJobs` returns.
 */
async function backfillEmbeddings(sourceIdOrAll) {
  return withCatalog(({ catalog }) => {
    if (sourceIdOrAll === "all") {
      // No argument: the catalog call is not restricted to specific sources.
      return catalog.requeueLatestEmbeddingJobs();
    }
    return catalog.requeueLatestEmbeddingJobs([sourceIdOrAll]);
  });
}
|
|
4480
|
+
/**
 * Clears embeddings for one source or for all sources, in both the vector store and the catalog.
 *
 * The vector store is updated first: the whole collection is cleared for "all", otherwise
 * only the chunk ids the catalog lists for the source are deleted (skipped when there are
 * none). The catalog's embedding state is cleared afterwards.
 *
 * @param {string} sourceIdOrAll - A source id, or the literal "all".
 * @returns {Promise<*>} Whatever `catalog.clearEmbeddings` returns.
 */
async function clearEmbeddings(sourceIdOrAll) {
  return withCatalog(async ({ catalog }) => {
    const vectorStore = new AiocsVectorStore(getHybridRuntimeConfig());

    if (sourceIdOrAll === "all") {
      await vectorStore.clearCollection();
      return catalog.clearEmbeddings();
    }

    const sourceIds = [sourceIdOrAll];
    const chunkIds = catalog.listEmbeddingChunkIds(sourceIds);
    if (chunkIds.length > 0) {
      await vectorStore.deleteChunkIds(chunkIds);
    }
    return catalog.clearEmbeddings([sourceIdOrAll]);
  });
}
|
|
4495
|
+
/**
 * Processes the embedding job queue once, using the default hybrid runtime config.
 *
 * @returns {Promise<*>} Whatever processEmbeddingJobs returns.
 */
async function runEmbeddingWorker() {
  return withCatalog(({ catalog }) => {
    const config = getHybridRuntimeConfig();
    return processEmbeddingJobs({ catalog, config });
  });
}
|
|
4501
|
+
|
|
4502
|
+
export {
|
|
4503
|
+
AIOCS_ERROR_CODES,
|
|
4504
|
+
AiocsError,
|
|
4505
|
+
toAiocsError,
|
|
4506
|
+
openCatalog,
|
|
4507
|
+
getAiocsDataDir,
|
|
4508
|
+
getAiocsConfigDir,
|
|
4509
|
+
parseDaemonConfig,
|
|
4510
|
+
startDaemon,
|
|
4511
|
+
packageName,
|
|
4512
|
+
packageVersion,
|
|
4513
|
+
packageDescription,
|
|
4514
|
+
upsertSourceFromSpecFile,
|
|
4515
|
+
listSources,
|
|
4516
|
+
fetchSources,
|
|
4517
|
+
refreshDueSources,
|
|
4518
|
+
runSourceCanaries,
|
|
4519
|
+
listSnapshotsForSource,
|
|
4520
|
+
diffSnapshotsForSource,
|
|
4521
|
+
linkProjectSources,
|
|
4522
|
+
unlinkProjectSources,
|
|
4523
|
+
searchCatalog,
|
|
4524
|
+
showChunk,
|
|
4525
|
+
verifyCoverage,
|
|
4526
|
+
initBuiltInSources,
|
|
4527
|
+
getManagedSourceSpecDirectories,
|
|
4528
|
+
getDoctorReport,
|
|
4529
|
+
exportCatalogBackup,
|
|
4530
|
+
importCatalogBackup,
|
|
4531
|
+
getEmbeddingStatus,
|
|
4532
|
+
backfillEmbeddings,
|
|
4533
|
+
clearEmbeddings,
|
|
4534
|
+
runEmbeddingWorker
|
|
4535
|
+
};
|