@bodhi-ventures/aiocs 0.1.2 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +44 -3
- package/dist/{chunk-CZ6C4YUX.js → chunk-M7YEYMJL.js} +940 -168
- package/dist/cli.js +14 -2
- package/dist/mcp-server.js +22 -4
- package/docs/README.md +1 -1
- package/docs/codex-integration.md +25 -18
- package/docs/json-contract.md +21 -3
- package/package.json +20 -20
- package/skills/aiocs/SKILL.md +23 -38
- package/skills/aiocs-curation/SKILL.md +110 -0
- package/sources/nktkas-hyperliquid.yaml +30 -0
- package/docs/2026-03-26-agent-json-and-daemon-design.md +0 -157
- package/docs/2026-03-28-hybrid-search-design.md +0 -423
- package/docs/superpowers/specs/2026-03-29-tag-driven-release-pipeline-design.md +0 -135
|
@@ -150,42 +150,91 @@ import Database from "better-sqlite3";
|
|
|
150
150
|
|
|
151
151
|
// src/catalog/chunking.ts
|
|
152
152
|
var MAX_CHUNK_BYTES = 16384;
|
|
153
|
+
var CHUNK_OVERLAP_LINES = 6;
|
|
153
154
|
var HEADING_PATTERN = /^(#{1,6})\s+(.*)$/;
|
|
154
155
|
/**
 * Measure how many bytes a string occupies once encoded as UTF-8.
 *
 * @param {string} value - Text to measure.
 * @returns {number} Encoded size in bytes (not the character count).
 */
function byteLength(value) {
  const encoding = "utf8";
  return Buffer.byteLength(value, encoding);
}
|
|
158
|
+
/**
 * Resolve the language label for a piece of content.
 *
 * An explicit `language` always wins and is returned lower-cased. Without
 * one, the label is derived from the (case-insensitive) suffix of
 * `filePath`. A missing path or an unrecognized suffix yields `null`.
 *
 * @param {string | null | undefined} filePath - Path used for suffix lookup.
 * @param {string | null | undefined} language - Explicit language, if any.
 * @returns {string | null} Normalized language label or `null`.
 */
function normalizeLanguage(filePath, language) {
  if (language) {
    return language.toLowerCase();
  }
  if (!filePath) {
    return null;
  }
  // Checked top to bottom; no listed suffix is a suffix of another entry,
  // so the order only documents intent.
  const suffixTable = [
    [[".md", ".mdx"], "markdown"],
    [[".ts"], "typescript"],
    [[".tsx"], "tsx"],
    [[".js"], "javascript"],
    [[".jsx"], "jsx"],
    [[".json"], "json"],
    [[".yaml", ".yml"], "yaml"],
    [[".toml"], "toml"],
    [[".py"], "python"],
    [[".rs"], "rust"],
    [[".go"], "go"],
    [[".sql"], "sql"],
    [[".sh"], "shell"]
  ];
  const candidate = filePath.toLowerCase();
  const hit = suffixTable.find(([suffixes]) => suffixes.some((suffix) => candidate.endsWith(suffix)));
  return hit ? hit[1] : null;
}
|
|
207
|
+
/**
 * Append the accumulated text to `chunks` as one chunk, unless it is blank.
 *
 * @param {Array<object>} chunks - Output list; mutated in place.
 * @param {string} sectionTitle - Title recorded on the emitted chunk.
 * @param {string} current - Accumulated text; trimmed before storing.
 * @param {number} chunkOrder - Order value to assign to the emitted chunk.
 * @returns {number} The order value the next chunk should use: unchanged
 *   when nothing was emitted, otherwise `chunkOrder + 1`.
 */
function flushChunk(chunks, sectionTitle, current, chunkOrder) {
  const body = current.trim();
  if (body.length === 0) {
    return chunkOrder;
  }
  chunks.push({ sectionTitle, markdown: body, chunkOrder });
  return chunkOrder + 1;
}
|
|
157
219
|
/**
 * Split one section into consecutive chunks along line boundaries.
 *
 * Lines are accumulated until adding the next one would push the buffer
 * past MAX_CHUNK_BYTES; the buffer is then flushed and the line starts a
 * new buffer. A single line larger than the limit is kept whole as its
 * own chunk because the size check only runs on a non-empty buffer.
 *
 * @param {string} sectionTitle - Title recorded on every emitted chunk.
 * @param {string} markdown - Section text to split.
 * @param {number} startOrder - Order value for the first emitted chunk.
 * @returns {Array<{sectionTitle: string, markdown: string, chunkOrder: number}>}
 */
function splitLargeSection(sectionTitle, markdown, startOrder) {
  const chunks = [];
  let pending = "";
  let nextOrder = startOrder;
  for (const line of markdown.split("\n")) {
    if (!pending) {
      pending = line;
      continue;
    }
    const grown = `${pending}\n${line}`;
    if (byteLength(grown) > MAX_CHUNK_BYTES) {
      nextOrder = flushChunk(chunks, sectionTitle, pending, nextOrder);
      pending = line;
    } else {
      pending = grown;
    }
  }
  flushChunk(chunks, sectionTitle, pending, nextOrder);
  return chunks;
}
|
|
188
|
-
function
|
|
237
|
+
function chunkMarkdownSectioned(pageTitle, markdown) {
|
|
189
238
|
const trimmed = markdown.trim();
|
|
190
239
|
if (!trimmed) {
|
|
191
240
|
return [];
|
|
@@ -233,6 +282,154 @@ function chunkMarkdown(pageTitle, markdown) {
|
|
|
233
282
|
}
|
|
234
283
|
return chunks;
|
|
235
284
|
}
|
|
285
|
+
/**
 * Detect whether a line opens a named declaration and return that name.
 *
 * The line is trimmed first, so indented declarations match as well. The
 * set of patterns tried depends on `language`; languages without a
 * dedicated set use a generic function/class/interface/type/enum set.
 *
 * @param {string} line - One line of source text.
 * @param {string} language - Normalized language label.
 * @returns {string | null} The captured symbol name, or `null` when the
 *   line is blank or no pattern matches.
 */
function symbolBoundary(line, language) {
  const text = line.trim();
  if (text === "") {
    return null;
  }

  const scriptFamily = ["typescript", "tsx", "javascript", "jsx"];
  const dataFamily = ["json", "yaml", "toml"];
  let candidates;
  if (scriptFamily.includes(language)) {
    candidates = [
      /^(?:export\s+)?(?:default\s+)?(?:async\s+)?function\s+([A-Za-z0-9_$]+)/,
      /^(?:export\s+)?(?:default\s+)?class\s+([A-Za-z0-9_$]+)/,
      /^(?:export\s+)?(?:interface|type|enum)\s+([A-Za-z0-9_$]+)/,
      /^(?:export\s+)?const\s+([A-Za-z0-9_$]+)\s*=/
    ];
  } else if (language === "python") {
    candidates = [/^(?:async\s+def|def|class)\s+([A-Za-z0-9_]+)/];
  } else if (language === "rust") {
    candidates = [
      /^(?:pub\s+)?(?:async\s+)?fn\s+([A-Za-z0-9_]+)/,
      /^(?:pub\s+)?(?:struct|enum|trait)\s+([A-Za-z0-9_]+)/
    ];
  } else if (language === "go") {
    candidates = [/^func\s+([A-Za-z0-9_]+)/, /^type\s+([A-Za-z0-9_]+)/];
  } else if (dataFamily.includes(language)) {
    // Key of a `key: value` / `key = value` entry, optionally quoted.
    candidates = [/^["']?([A-Za-z0-9_.-]+)["']?\s*[:=]/];
  } else {
    candidates = [
      /^(?:export\s+)?(?:async\s+)?function\s+([A-Za-z0-9_$]+)/,
      /^(?:class|interface|type|enum)\s+([A-Za-z0-9_$]+)/
    ];
  }

  for (const expression of candidates) {
    const found = expression.exec(text);
    if (found !== null && found[1]) {
      return found[1];
    }
  }
  return null;
}
|
|
329
|
+
/**
 * Locate the lines that open a named symbol and return them as section
 * boundaries, guaranteeing that the first boundary sits at line index 0.
 *
 * @param {string[]} lines - Content split into lines.
 * @param {string} title - Title used for the synthetic leading boundary
 *   when the first line does not itself open a symbol.
 * @param {string} language - Normalized language label passed to the
 *   symbol detector.
 * @returns {Array<{index: number, title: string}>} Boundaries in line order.
 */
function discoverBoundaries(lines, title, language) {
  const found = [];
  for (let index = 0; index < lines.length; index += 1) {
    const symbol = symbolBoundary(lines[index], language);
    if (symbol) {
      found.push({ index, title: symbol });
    }
  }
  const startsAtTop = found.length > 0 && found[0].index === 0;
  if (startsAtTop) {
    return found;
  }
  // Content before the first symbol (or all of it) falls under `title`.
  return [{ index: 0, title }, ...found];
}
|
|
348
|
+
/**
 * Compose a chunk title that carries the line range it covers.
 *
 * @param {string} title - Base title.
 * @param {number} startLine - First line of the range.
 * @param {number} endLine - Last line of the range.
 * @returns {string} `"<title> (<startLine>-<endLine>)"`.
 */
function buildWindowTitle(title, startLine, endLine) {
  const range = `${startLine}-${endLine}`;
  return `${title} (${range})`;
}
|
|
351
|
+
/**
 * Split content into overlapping line windows of at most MAX_CHUNK_BYTES.
 *
 * Each window greedily takes whole lines while the joined text stays
 * within the byte limit; a single line larger than the limit is kept
 * whole as its own window. Consecutive windows share trailing lines so
 * that context around a cut is preserved.
 *
 * Two safeguards apply:
 * - A window containing only whitespace is skipped instead of ending the
 *   scan, so content that follows it is still chunked.
 * - The overlap never exceeds half of the window that was just emitted.
 *   With long lines a window can hold fewer lines than
 *   CHUNK_OVERLAP_LINES; an uncapped overlap would then advance by a
 *   single line per window and emit many near-identical chunks.
 *
 * @param {string} title - Base title; each chunk's title appends the
 *   1-based line range (relative to `content`) it covers.
 * @param {string} content - Text to split.
 * @param {number} startOrder - Order value for the first emitted chunk.
 * @returns {Array<{sectionTitle: string, markdown: string, chunkOrder: number}>}
 */
function chunkLineWindows(title, content, startOrder) {
  const lines = content.split("\n");
  const chunks = [];
  let start = 0;
  let order = startOrder;
  while (start < lines.length) {
    let end = start;
    let current = "";
    // The first line is always accepted (the size check needs a non-empty
    // buffer), so `end` ends up strictly greater than `start`.
    while (end < lines.length) {
      const candidate = current ? `${current}\n${lines[end]}` : lines[end];
      if (current && byteLength(candidate) > MAX_CHUNK_BYTES) {
        break;
      }
      current = candidate;
      end += 1;
    }
    const trimmed = current.trim();
    if (trimmed) {
      chunks.push({
        sectionTitle: buildWindowTitle(title, start + 1, end),
        markdown: trimmed,
        chunkOrder: order
      });
      order += 1;
    }
    if (end >= lines.length) {
      break;
    }
    if (!trimmed) {
      // Nothing worth overlapping in a blank window; resume right after it.
      start = end;
      continue;
    }
    const overlap = Math.min(CHUNK_OVERLAP_LINES, Math.floor((end - start) / 2));
    start = Math.max(start + 1, end - overlap);
  }
  return chunks;
}
|
|
385
|
+
/**
 * Chunk source content along symbol boundaries.
 *
 * Content that fits within MAX_CHUNK_BYTES becomes a single chunk titled
 * with `input.title`. Larger content is cut at each detected symbol; a
 * section that is still too large is further split into line windows.
 * Chunk order values are consecutive, starting at 0.
 *
 * @param {{title: string, content: string}} input - Title and text.
 * @param {string} language - Normalized language label used for symbol
 *   detection.
 * @returns {Array<{sectionTitle: string, markdown: string, chunkOrder: number}>}
 */
function chunkByBoundaries(input, language) {
  const body = input.content.trim();
  if (body === "") {
    return [];
  }
  if (byteLength(body) <= MAX_CHUNK_BYTES) {
    return [{ sectionTitle: input.title, markdown: body, chunkOrder: 0 }];
  }

  const lines = body.split("\n");
  const boundaries = discoverBoundaries(lines, input.title, language);
  const chunks = [];
  boundaries.forEach((boundary, position) => {
    const following = boundaries[position + 1];
    const stop = following ? following.index : lines.length;
    const section = lines.slice(boundary.index, stop).join("\n").trim();
    if (section === "") {
      return;
    }
    // Every chunk takes the next free slot, so the next order value is
    // always the number of chunks emitted so far.
    if (byteLength(section) > MAX_CHUNK_BYTES) {
      chunks.push(...chunkLineWindows(boundary.title, section, chunks.length));
      return;
    }
    chunks.push({
      sectionTitle: boundary.title,
      markdown: section,
      chunkOrder: chunks.length
    });
  });

  if (chunks.length > 0) {
    return chunks;
  }
  return chunkLineWindows(input.title, body, 0);
}
|
|
420
|
+
/**
 * Entry point for chunking: pick a strategy from the resolved language.
 *
 * - no resolvable language: plain overlapping line windows
 * - "markdown": heading-based sectioning
 * - anything else: symbol-boundary chunking
 *
 * NOTE(review): input carrying neither a language nor a file path takes
 * the line-window path, not the markdown path — confirm that callers label
 * markdown documents explicitly.
 *
 * @param {{title: string, content: string, filePath?: string | null, language?: string | null}} input
 * @returns {Array<{sectionTitle: string, markdown: string, chunkOrder: number}>}
 */
function chunkContent(input) {
  const { title, content, filePath } = input;
  const language = normalizeLanguage(filePath, input.language);
  if (!language) {
    return chunkLineWindows(title, content.trim(), 0);
  }
  return language === "markdown"
    ? chunkMarkdownSectioned(title, content)
    : chunkByBoundaries(input, language);
}
|
|
430
|
+
/**
 * Resolve the language label for a file; delegates to normalizeLanguage.
 *
 * @param {string | null | undefined} filePath - Path used for suffix lookup.
 * @param {string | null | undefined} language - Explicit language, if any.
 * @returns {string | null} Normalized language label or `null`.
 */
function detectLanguage(filePath, language) {
  const resolved = normalizeLanguage(filePath, language);
  return resolved;
}
|
|
236
433
|
|
|
237
434
|
// src/catalog/fingerprint.ts
|
|
238
435
|
import { createHash } from "crypto";
|
|
@@ -244,6 +441,7 @@ function buildSnapshotFingerprint(input) {
|
|
|
244
441
|
const payload = JSON.stringify({
|
|
245
442
|
sourceId: input.sourceId,
|
|
246
443
|
configHash: input.configHash,
|
|
444
|
+
revisionKey: input.revisionKey ?? null,
|
|
247
445
|
pages: normalizedPages
|
|
248
446
|
});
|
|
249
447
|
return sha256(payload);
|
|
@@ -272,22 +470,55 @@ function resolveProjectScope(cwd, scopes) {
|
|
|
272
470
|
return normalizedScopes[0] ?? null;
|
|
273
471
|
}
|
|
274
472
|
|
|
473
|
+
// src/patterns.ts
|
|
474
|
+
/**
 * Backslash-escape regular-expression metacharacters in `value`.
 *
 * `*` is deliberately not in the escaped set.
 *
 * @param {string} value - Literal text.
 * @returns {string} Text safe to embed in a RegExp source.
 */
function escapeRegex(value) {
  const metacharacters = /[|\\{}()[\]^$+?.]/g;
  return value.replace(metacharacters, (character) => `\\${character}`);
}
|
|
477
|
+
/**
 * Compile a wildcard pattern into an anchored regular expression.
 *
 * - `**` matches any run of characters
 * - `*`  matches any run of characters except `?` and `#`
 * - every other character is matched literally
 *
 * @param {string} pattern - Wildcard pattern.
 * @returns {RegExp} Expression matching the whole input against `pattern`.
 */
function patternToRegex(pattern) {
  const parts = ["^"];
  let cursor = 0;
  while (cursor < pattern.length) {
    const character = pattern[cursor];
    if (character !== "*") {
      parts.push(escapeRegex(character ?? ""));
      cursor += 1;
    } else if (pattern[cursor + 1] === "*") {
      parts.push(".*");
      cursor += 2;
    } else {
      parts.push("[^?#]*");
      cursor += 1;
    }
  }
  parts.push("$");
  return new RegExp(parts.join(""));
}
|
|
495
|
+
/**
 * Report whether `value` matches at least one wildcard pattern.
 *
 * @param {string} value - Text to test.
 * @param {string[]} patterns - Wildcard patterns.
 * @returns {boolean} `true` on the first match, `false` when none match
 *   (including an empty pattern list).
 */
function matchesPatterns(value, patterns) {
  for (const pattern of patterns) {
    if (patternToRegex(pattern).test(value)) {
      return true;
    }
  }
  return false;
}
|
|
498
|
+
/**
 * Translate a wildcard pattern into an SQLite GLOB pattern.
 *
 * `**` collapses to `*`, as GLOB's `*` already matches any run of
 * characters. `?` and `[` are wildcards in GLOB but literal characters in
 * this pattern language (patternToRegex escapes them), so each is wrapped
 * in a one-character set (`[?]`, `[[]`) to keep it literal. Without this,
 * a path such as `app/[id]/page.tsx` would be read as a character class.
 *
 * @param {string} pattern - Wildcard pattern using `*` / `**`.
 * @returns {string} Pattern suitable for `column GLOB ?`.
 */
function toSqliteGlob(pattern) {
  return pattern.replace(/\*\*|[[?]/g, (token) => (token === "**" ? "*" : `[${token}]`));
}
|
|
501
|
+
|
|
275
502
|
// src/spec/source-spec.ts
|
|
276
503
|
import { readFile } from "fs/promises";
|
|
277
504
|
import { extname } from "path";
|
|
278
505
|
import YAML from "yaml";
|
|
279
506
|
import { z } from "zod";
|
|
280
507
|
var patternSchema = z.string().min(1);
|
|
508
|
+
var positiveIntSchema = z.number().int().positive();
|
|
509
|
+
var scheduleSchema = z.object({
|
|
510
|
+
everyHours: positiveIntSchema
|
|
511
|
+
});
|
|
281
512
|
var interactionSchema = z.discriminatedUnion("action", [
|
|
282
513
|
z.object({
|
|
283
514
|
action: z.literal("hover"),
|
|
284
515
|
selector: z.string().min(1),
|
|
285
|
-
timeoutMs:
|
|
516
|
+
timeoutMs: positiveIntSchema.optional()
|
|
286
517
|
}),
|
|
287
518
|
z.object({
|
|
288
519
|
action: z.literal("click"),
|
|
289
520
|
selector: z.string().min(1),
|
|
290
|
-
timeoutMs:
|
|
521
|
+
timeoutMs: positiveIntSchema.optional()
|
|
291
522
|
}),
|
|
292
523
|
z.object({
|
|
293
524
|
action: z.literal("press"),
|
|
@@ -295,13 +526,13 @@ var interactionSchema = z.discriminatedUnion("action", [
|
|
|
295
526
|
}),
|
|
296
527
|
z.object({
|
|
297
528
|
action: z.literal("wait"),
|
|
298
|
-
timeoutMs:
|
|
529
|
+
timeoutMs: positiveIntSchema
|
|
299
530
|
})
|
|
300
531
|
]);
|
|
301
532
|
var clipboardExtractSchema = z.object({
|
|
302
533
|
strategy: z.literal("clipboardButton"),
|
|
303
534
|
interactions: z.array(interactionSchema).min(1),
|
|
304
|
-
clipboardTimeoutMs:
|
|
535
|
+
clipboardTimeoutMs: positiveIntSchema.default(1e4)
|
|
305
536
|
});
|
|
306
537
|
var selectorExtractSchema = z.object({
|
|
307
538
|
strategy: z.literal("selector"),
|
|
@@ -310,13 +541,13 @@ var selectorExtractSchema = z.object({
|
|
|
310
541
|
var readabilityExtractSchema = z.object({
|
|
311
542
|
strategy: z.literal("readability")
|
|
312
543
|
});
|
|
313
|
-
var
|
|
544
|
+
var webAuthHeaderSchema = z.object({
|
|
314
545
|
name: z.string().min(1),
|
|
315
546
|
valueFromEnv: z.string().min(1),
|
|
316
547
|
hosts: z.array(z.string().min(1)).min(1).optional(),
|
|
317
548
|
include: z.array(patternSchema).min(1).optional()
|
|
318
549
|
});
|
|
319
|
-
var
|
|
550
|
+
var webAuthCookieSchema = z.object({
|
|
320
551
|
name: z.string().min(1),
|
|
321
552
|
valueFromEnv: z.string().min(1),
|
|
322
553
|
domain: z.string().min(1),
|
|
@@ -325,21 +556,36 @@ var authCookieSchema = z.object({
|
|
|
325
556
|
httpOnly: z.boolean().optional(),
|
|
326
557
|
sameSite: z.enum(["Strict", "Lax", "None"]).optional()
|
|
327
558
|
});
|
|
328
|
-
var
|
|
559
|
+
var webCanaryCheckSchema = z.object({
|
|
329
560
|
url: z.string().url(),
|
|
330
561
|
expectedTitle: z.string().min(1).optional(),
|
|
331
562
|
expectedText: z.string().min(1).optional(),
|
|
332
|
-
minMarkdownLength:
|
|
563
|
+
minMarkdownLength: positiveIntSchema.default(40)
|
|
564
|
+
});
|
|
565
|
+
var gitAuthSchema = z.object({
|
|
566
|
+
tokenFromEnv: z.string().min(1),
|
|
567
|
+
username: z.string().min(1).default("x-access-token"),
|
|
568
|
+
scheme: z.enum(["basic", "bearer"]).default("basic")
|
|
569
|
+
});
|
|
570
|
+
var gitCanaryCheckSchema = z.object({
|
|
571
|
+
path: z.string().min(1),
|
|
572
|
+
expectedTitle: z.string().min(1).optional(),
|
|
573
|
+
expectedText: z.string().min(1).optional(),
|
|
574
|
+
minContentLength: positiveIntSchema.default(40)
|
|
333
575
|
});
|
|
334
|
-
var
|
|
576
|
+
var baseSourceSpecSchema = z.object({
|
|
335
577
|
id: z.string().min(1).regex(/^[a-z0-9-]+$/),
|
|
336
578
|
label: z.string().min(1),
|
|
579
|
+
schedule: scheduleSchema
|
|
580
|
+
});
|
|
581
|
+
var webSourceSpecSchema = baseSourceSpecSchema.extend({
|
|
582
|
+
kind: z.literal("web").default("web"),
|
|
337
583
|
startUrls: z.array(z.string().url()).min(1),
|
|
338
584
|
allowedHosts: z.array(z.string().min(1)).min(1),
|
|
339
585
|
discovery: z.object({
|
|
340
586
|
include: z.array(patternSchema).min(1),
|
|
341
|
-
exclude: z.array(patternSchema),
|
|
342
|
-
maxPages:
|
|
587
|
+
exclude: z.array(patternSchema).default([]),
|
|
588
|
+
maxPages: positiveIntSchema
|
|
343
589
|
}),
|
|
344
590
|
extract: z.discriminatedUnion("strategy", [
|
|
345
591
|
clipboardExtractSchema,
|
|
@@ -349,16 +595,13 @@ var sourceSpecSchema = z.object({
|
|
|
349
595
|
normalize: z.object({
|
|
350
596
|
prependSourceComment: z.boolean().default(true)
|
|
351
597
|
}),
|
|
352
|
-
schedule: z.object({
|
|
353
|
-
everyHours: z.number().int().positive()
|
|
354
|
-
}),
|
|
355
598
|
auth: z.object({
|
|
356
|
-
headers: z.array(
|
|
357
|
-
cookies: z.array(
|
|
599
|
+
headers: z.array(webAuthHeaderSchema).default([]),
|
|
600
|
+
cookies: z.array(webAuthCookieSchema).default([])
|
|
358
601
|
}).optional(),
|
|
359
602
|
canary: z.object({
|
|
360
|
-
everyHours:
|
|
361
|
-
checks: z.array(
|
|
603
|
+
everyHours: positiveIntSchema.optional(),
|
|
604
|
+
checks: z.array(webCanaryCheckSchema).min(1)
|
|
362
605
|
}).optional()
|
|
363
606
|
}).superRefine((spec, context) => {
|
|
364
607
|
for (const [index, header] of (spec.auth?.headers ?? []).entries()) {
|
|
@@ -376,6 +619,47 @@ var sourceSpecSchema = z.object({
|
|
|
376
619
|
}
|
|
377
620
|
}
|
|
378
621
|
});
|
|
622
|
+
// Spec for a source backed by a git repository: which repo/ref to read,
// which files to include, size limits, optional auth and canary checks.
var gitSourceSpecSchema = baseSourceSpecSchema.extend({
  kind: z.literal("git"),
  repo: z.object({
    url: z.string().url(),
    ref: z.string().min(1).default("HEAD"),
    include: z.array(patternSchema).min(1),
    exclude: z.array(patternSchema).default([]),
    maxFiles: positiveIntSchema.default(2e3),
    textFileMaxBytes: positiveIntSchema.default(262144),
    auth: gitAuthSchema.optional()
  }),
  canary: z.object({
    everyHours: positiveIntSchema.optional(),
    checks: z.array(gitCanaryCheckSchema).min(1)
  }).optional()
}).superRefine((spec, context) => {
  // Refinements can still run after a field-level check such as `.url()`
  // has failed. Parsing must therefore not throw here: a raw TypeError
  // would escape instead of the validation error already recorded.
  let protocol;
  try {
    protocol = new URL(spec.repo.url).protocol;
  } catch {
    return;
  }
  if (!["https:", "http:", "file:"].includes(protocol)) {
    context.addIssue({
      code: z.ZodIssueCode.custom,
      path: ["repo", "url"],
      message: `Unsupported git source protocol '${protocol}'. Use https:// or file://.`
    });
  }
});
|
|
647
|
+
// Top-level source spec: a union of the web and git specs discriminated by
// `kind`. Object inputs that omit `kind` are treated as web specs; any
// other input is handed to the union untouched.
var sourceSpecSchema = z.preprocess(
  (value) => {
    const isRecord = Boolean(value) && typeof value === "object" && !Array.isArray(value);
    if (!isRecord || "kind" in value) {
      return value;
    }
    return { ...value, kind: "web" };
  },
  z.discriminatedUnion("kind", [webSourceSpecSchema, gitSourceSpecSchema])
);
|
|
379
663
|
function parseSourceSpec(raw, ext) {
|
|
380
664
|
if (ext === ".json") {
|
|
381
665
|
return JSON.parse(raw);
|
|
@@ -387,8 +671,27 @@ async function loadSourceSpec(path) {
|
|
|
387
671
|
const parsed = parseSourceSpec(raw, extname(path).toLowerCase());
|
|
388
672
|
return sourceSpecSchema.parse(parsed);
|
|
389
673
|
}
|
|
674
|
+
/**
 * Validate an already-decoded value against the source spec schema.
 *
 * @param {unknown} value - Candidate spec object.
 * @returns {object} The parsed spec as returned by the schema's `parse`.
 */
function parseSourceSpecObject(value) {
  const spec = sourceSpecSchema.parse(value);
  return spec;
}
|
|
677
|
+
/**
 * Tell whether a source spec describes a git source.
 *
 * @param {{kind: string}} spec - Parsed source spec.
 * @returns {boolean} `true` when `spec.kind` is `"git"`.
 */
function isGitSourceSpec(spec) {
  const { kind } = spec;
  return kind === "git";
}
|
|
390
680
|
function resolveSourceCanary(spec) {
|
|
681
|
+
if (spec.kind === "git") {
|
|
682
|
+
return {
|
|
683
|
+
kind: "git",
|
|
684
|
+
everyHours: spec.canary?.everyHours ?? Math.max(1, Math.min(spec.schedule.everyHours, 6)),
|
|
685
|
+
checks: spec.canary?.checks ?? [
|
|
686
|
+
{
|
|
687
|
+
path: "README.md",
|
|
688
|
+
minContentLength: 40
|
|
689
|
+
}
|
|
690
|
+
]
|
|
691
|
+
};
|
|
692
|
+
}
|
|
391
693
|
return {
|
|
694
|
+
kind: "web",
|
|
392
695
|
everyHours: spec.canary?.everyHours ?? Math.max(1, Math.min(spec.schedule.everyHours, 6)),
|
|
393
696
|
checks: spec.canary?.checks ?? [
|
|
394
697
|
{
|
|
@@ -440,6 +743,9 @@ function initSchema(db) {
|
|
|
440
743
|
title TEXT NOT NULL,
|
|
441
744
|
markdown TEXT NOT NULL,
|
|
442
745
|
content_hash TEXT NOT NULL,
|
|
746
|
+
page_kind TEXT NOT NULL DEFAULT 'document' CHECK(page_kind IN ('document', 'file')),
|
|
747
|
+
file_path TEXT,
|
|
748
|
+
language TEXT,
|
|
443
749
|
UNIQUE(snapshot_id, url)
|
|
444
750
|
);
|
|
445
751
|
|
|
@@ -452,7 +758,10 @@ function initSchema(db) {
|
|
|
452
758
|
page_title TEXT NOT NULL,
|
|
453
759
|
section_title TEXT NOT NULL,
|
|
454
760
|
chunk_order INTEGER NOT NULL,
|
|
455
|
-
markdown TEXT NOT NULL
|
|
761
|
+
markdown TEXT NOT NULL,
|
|
762
|
+
page_kind TEXT NOT NULL DEFAULT 'document' CHECK(page_kind IN ('document', 'file')),
|
|
763
|
+
file_path TEXT,
|
|
764
|
+
language TEXT
|
|
456
765
|
);
|
|
457
766
|
|
|
458
767
|
CREATE VIRTUAL TABLE IF NOT EXISTS chunks_fts USING fts5(
|
|
@@ -568,6 +877,26 @@ function initSchema(db) {
|
|
|
568
877
|
if (!sourceColumns.some((column) => column.name === "next_canary_due_at")) {
|
|
569
878
|
db.exec("ALTER TABLE sources ADD COLUMN next_canary_due_at TEXT");
|
|
570
879
|
}
|
|
880
|
+
const pageColumns = db.prepare("PRAGMA table_info(pages)").all();
|
|
881
|
+
if (!pageColumns.some((column) => column.name === "page_kind")) {
|
|
882
|
+
db.exec(`ALTER TABLE pages ADD COLUMN page_kind TEXT NOT NULL DEFAULT 'document'`);
|
|
883
|
+
}
|
|
884
|
+
if (!pageColumns.some((column) => column.name === "file_path")) {
|
|
885
|
+
db.exec("ALTER TABLE pages ADD COLUMN file_path TEXT");
|
|
886
|
+
}
|
|
887
|
+
if (!pageColumns.some((column) => column.name === "language")) {
|
|
888
|
+
db.exec("ALTER TABLE pages ADD COLUMN language TEXT");
|
|
889
|
+
}
|
|
890
|
+
const chunkColumns = db.prepare("PRAGMA table_info(chunks)").all();
|
|
891
|
+
if (!chunkColumns.some((column) => column.name === "page_kind")) {
|
|
892
|
+
db.exec(`ALTER TABLE chunks ADD COLUMN page_kind TEXT NOT NULL DEFAULT 'document'`);
|
|
893
|
+
}
|
|
894
|
+
if (!chunkColumns.some((column) => column.name === "file_path")) {
|
|
895
|
+
db.exec("ALTER TABLE chunks ADD COLUMN file_path TEXT");
|
|
896
|
+
}
|
|
897
|
+
if (!chunkColumns.some((column) => column.name === "language")) {
|
|
898
|
+
db.exec("ALTER TABLE chunks ADD COLUMN language TEXT");
|
|
899
|
+
}
|
|
571
900
|
}
|
|
572
901
|
function nowIso() {
|
|
573
902
|
return (/* @__PURE__ */ new Date()).toISOString();
|
|
@@ -575,6 +904,9 @@ function nowIso() {
|
|
|
575
904
|
/**
 * Produce the ISO-8601 timestamp that lies `hours` hours after now.
 *
 * @param {number} hours - Offset from the current time, in hours.
 * @returns {string} Timestamp in `Date.prototype.toISOString` format.
 */
function addHoursIso(hours) {
  const offsetMs = hours * 60 * 60 * 1e3;
  const target = new Date(Date.now() + offsetMs);
  return target.toISOString();
}
|
|
907
|
+
/**
 * Determine how often, in hours, a source's canary should run.
 *
 * An interval set on `spec.canary.everyHours` is used as-is. Otherwise
 * the refresh interval `spec.schedule.everyHours` is used, clamped to the
 * range 1..6.
 *
 * @param {{canary?: {everyHours?: number}, schedule: {everyHours: number}}} spec
 * @returns {number} Canary interval in hours.
 */
function resolveCanaryEveryHours(spec) {
  const explicit = spec.canary?.everyHours;
  if (explicit !== undefined && explicit !== null) {
    return explicit;
  }
  const capped = Math.min(spec.schedule.everyHours, 6);
  return Math.max(1, capped);
}
|
|
578
910
|
function stableStringify(value) {
|
|
579
911
|
if (Array.isArray(value)) {
|
|
580
912
|
return `[${value.map((entry) => stableStringify(entry)).join(",")}]`;
|
|
@@ -591,6 +923,20 @@ function normalizeQuery(query) {
|
|
|
591
923
|
const words = query.replace(/[^\p{L}\p{N}]+/gu, " ").split(/\s+/).map((part) => part.trim()).filter(Boolean);
|
|
592
924
|
return words.join(" ");
|
|
593
925
|
}
|
|
926
|
+
/**
 * Clean up a list of path-pattern filters.
 *
 * Patterns are trimmed, blank entries are dropped, and duplicates are
 * removed while keeping first-seen order.
 *
 * @param {string[] | null | undefined} patterns - Raw filter list.
 * @returns {string[] | null} Cleaned list, or `null` when nothing remains.
 */
function normalizePatternFilters(patterns) {
  if (!patterns || patterns.length === 0) {
    return null;
  }
  const unique = new Set();
  for (const pattern of patterns) {
    const cleaned = pattern.trim();
    if (cleaned) {
      unique.add(cleaned);
    }
  }
  return unique.size > 0 ? [...unique] : null;
}
|
|
933
|
+
/**
 * Clean up a list of language filters.
 *
 * Entries are trimmed and lower-cased, blank entries are dropped, and
 * duplicates are removed while keeping first-seen order.
 *
 * @param {string[] | null | undefined} languages - Raw filter list.
 * @returns {string[] | null} Cleaned list, or `null` when nothing remains.
 */
function normalizeLanguageFilters(languages) {
  if (!languages || languages.length === 0) {
    return null;
  }
  const unique = new Set();
  for (const language of languages) {
    const cleaned = language.trim().toLowerCase();
    if (cleaned) {
      unique.add(cleaned);
    }
  }
  return unique.size > 0 ? [...unique] : null;
}
|
|
594
940
|
function assertPaginationValue(value, field, fallback) {
|
|
595
941
|
if (typeof value === "undefined") {
|
|
596
942
|
return fallback;
|
|
@@ -647,7 +993,9 @@ function openCatalog(options) {
|
|
|
647
993
|
limit,
|
|
648
994
|
offset,
|
|
649
995
|
sourceIds: null,
|
|
650
|
-
snapshotIds: []
|
|
996
|
+
snapshotIds: [],
|
|
997
|
+
pathPatterns: normalizePatternFilters(input.pathPatterns),
|
|
998
|
+
languages: normalizeLanguageFilters(input.languages)
|
|
651
999
|
};
|
|
652
1000
|
}
|
|
653
1001
|
const filterSourceIds = sourceIds && sourceIds.length > 0 ? [...new Set(sourceIds)] : null;
|
|
@@ -661,7 +1009,9 @@ function openCatalog(options) {
|
|
|
661
1009
|
limit,
|
|
662
1010
|
offset,
|
|
663
1011
|
sourceIds: filterSourceIds,
|
|
664
|
-
snapshotIds: latestSnapshotIds
|
|
1012
|
+
snapshotIds: latestSnapshotIds,
|
|
1013
|
+
pathPatterns: normalizePatternFilters(input.pathPatterns),
|
|
1014
|
+
languages: normalizeLanguageFilters(input.languages)
|
|
665
1015
|
};
|
|
666
1016
|
};
|
|
667
1017
|
const searchLexicalByScope = (input) => {
|
|
@@ -679,10 +1029,14 @@ function openCatalog(options) {
|
|
|
679
1029
|
}
|
|
680
1030
|
const whereSnapshotPlaceholders = input.scope.snapshotIds.map(() => "?").join(",");
|
|
681
1031
|
const sourceSql = input.scope.sourceIds ? `AND c.source_id IN (${input.scope.sourceIds.map(() => "?").join(",")})` : "";
|
|
1032
|
+
const pathSql = input.scope.pathPatterns ? `AND c.file_path IS NOT NULL AND (${input.scope.pathPatterns.map(() => "c.file_path GLOB ?").join(" OR ")})` : "";
|
|
1033
|
+
const languageSql = input.scope.languages ? `AND c.language IN (${input.scope.languages.map(() => "?").join(",")})` : "";
|
|
682
1034
|
const queryArgs = [
|
|
683
1035
|
normalized,
|
|
684
1036
|
...input.scope.snapshotIds,
|
|
685
|
-
...input.scope.sourceIds ?? []
|
|
1037
|
+
...input.scope.sourceIds ?? [],
|
|
1038
|
+
...(input.scope.pathPatterns ?? []).map((pattern) => toSqliteGlob(pattern)),
|
|
1039
|
+
...input.scope.languages ?? []
|
|
686
1040
|
];
|
|
687
1041
|
const totalRow = db.prepare(`
|
|
688
1042
|
SELECT COUNT(*) AS total
|
|
@@ -691,6 +1045,8 @@ function openCatalog(options) {
|
|
|
691
1045
|
WHERE chunks_fts MATCH ?
|
|
692
1046
|
AND c.snapshot_id IN (${whereSnapshotPlaceholders})
|
|
693
1047
|
${sourceSql}
|
|
1048
|
+
${pathSql}
|
|
1049
|
+
${languageSql}
|
|
694
1050
|
`).get(...queryArgs);
|
|
695
1051
|
const rows = db.prepare(`
|
|
696
1052
|
SELECT
|
|
@@ -700,12 +1056,17 @@ function openCatalog(options) {
|
|
|
700
1056
|
c.page_url,
|
|
701
1057
|
c.page_title,
|
|
702
1058
|
c.section_title,
|
|
703
|
-
c.markdown
|
|
1059
|
+
c.markdown,
|
|
1060
|
+
c.page_kind,
|
|
1061
|
+
c.file_path,
|
|
1062
|
+
c.language
|
|
704
1063
|
FROM chunks_fts
|
|
705
1064
|
JOIN chunks c ON c.id = chunks_fts.rowid
|
|
706
1065
|
WHERE chunks_fts MATCH ?
|
|
707
1066
|
AND c.snapshot_id IN (${whereSnapshotPlaceholders})
|
|
708
1067
|
${sourceSql}
|
|
1068
|
+
${pathSql}
|
|
1069
|
+
${languageSql}
|
|
709
1070
|
ORDER BY bm25(chunks_fts), c.id
|
|
710
1071
|
LIMIT ?
|
|
711
1072
|
OFFSET ?
|
|
@@ -717,7 +1078,10 @@ function openCatalog(options) {
|
|
|
717
1078
|
pageUrl: row.page_url,
|
|
718
1079
|
pageTitle: row.page_title,
|
|
719
1080
|
sectionTitle: row.section_title,
|
|
720
|
-
markdown: row.markdown
|
|
1081
|
+
markdown: row.markdown,
|
|
1082
|
+
pageKind: row.page_kind,
|
|
1083
|
+
filePath: row.file_path,
|
|
1084
|
+
language: row.language
|
|
721
1085
|
}));
|
|
722
1086
|
return {
|
|
723
1087
|
total: totalRow.total,
|
|
@@ -889,8 +1253,8 @@ function openCatalog(options) {
|
|
|
889
1253
|
const existing = db.prepare("SELECT id, created_at, next_due_at, next_canary_due_at, config_hash FROM sources WHERE id = ?").get(spec.id);
|
|
890
1254
|
const resolvedSpecPath = options2?.specPath ? canonicalizeManagedSpecPath(options2.specPath) : null;
|
|
891
1255
|
const nextDueAt = !existing ? timestamp : existing.config_hash === configHash ? existing.next_due_at : timestamp;
|
|
892
|
-
const
|
|
893
|
-
const nextCanaryDueAt = !existing ? timestamp : existing.config_hash === configHash ? existing.next_canary_due_at ?? addHoursIso(
|
|
1256
|
+
const canaryEveryHours = resolveCanaryEveryHours(spec);
|
|
1257
|
+
const nextCanaryDueAt = !existing ? timestamp : existing.config_hash === configHash ? existing.next_canary_due_at ?? addHoursIso(canaryEveryHours) : timestamp;
|
|
894
1258
|
const configChanged = Boolean(existing && existing.config_hash !== configHash);
|
|
895
1259
|
db.prepare(`
|
|
896
1260
|
INSERT INTO sources (
|
|
@@ -935,6 +1299,7 @@ function openCatalog(options) {
|
|
|
935
1299
|
SELECT
|
|
936
1300
|
id,
|
|
937
1301
|
label,
|
|
1302
|
+
spec_json,
|
|
938
1303
|
spec_path,
|
|
939
1304
|
next_due_at,
|
|
940
1305
|
next_canary_due_at,
|
|
@@ -947,21 +1312,25 @@ function openCatalog(options) {
|
|
|
947
1312
|
FROM sources
|
|
948
1313
|
ORDER BY id
|
|
949
1314
|
`).all();
|
|
950
|
-
return rows.map((row) =>
|
|
951
|
-
|
|
952
|
-
|
|
953
|
-
|
|
954
|
-
|
|
955
|
-
|
|
956
|
-
|
|
957
|
-
|
|
958
|
-
|
|
959
|
-
|
|
960
|
-
|
|
961
|
-
|
|
962
|
-
|
|
963
|
-
|
|
964
|
-
|
|
1315
|
+
return rows.map((row) => {
|
|
1316
|
+
const storedSpec = parseSourceSpecObject(JSON.parse(row.spec_json));
|
|
1317
|
+
return {
|
|
1318
|
+
id: row.id,
|
|
1319
|
+
kind: storedSpec.kind,
|
|
1320
|
+
label: row.label,
|
|
1321
|
+
specPath: row.spec_path ? canonicalizeManagedSpecPath(row.spec_path) : null,
|
|
1322
|
+
nextDueAt: row.next_due_at,
|
|
1323
|
+
isDue: Date.parse(row.next_due_at) <= Date.now(),
|
|
1324
|
+
nextCanaryDueAt: row.next_canary_due_at,
|
|
1325
|
+
isCanaryDue: row.next_canary_due_at ? Date.parse(row.next_canary_due_at) <= Date.now() : false,
|
|
1326
|
+
lastCheckedAt: row.last_checked_at,
|
|
1327
|
+
lastSuccessfulSnapshotAt: row.last_successful_snapshot_at,
|
|
1328
|
+
lastSuccessfulSnapshotId: row.last_successful_snapshot_id,
|
|
1329
|
+
lastCanaryCheckedAt: row.last_canary_checked_at,
|
|
1330
|
+
lastSuccessfulCanaryAt: row.last_successful_canary_at,
|
|
1331
|
+
lastCanaryStatus: row.last_canary_status
|
|
1332
|
+
};
|
|
1333
|
+
});
|
|
965
1334
|
},
|
|
966
1335
|
listDueSourceIds(referenceTime = nowIso()) {
|
|
967
1336
|
const rows = db.prepare(`
|
|
@@ -1022,11 +1391,15 @@ function openCatalog(options) {
|
|
|
1022
1391
|
const pagesWithHashes = input.pages.map((page) => ({
|
|
1023
1392
|
...page,
|
|
1024
1393
|
markdown: page.markdown.trim(),
|
|
1025
|
-
contentHash: sha256(page.markdown.trim())
|
|
1394
|
+
contentHash: sha256(page.markdown.trim()),
|
|
1395
|
+
pageKind: page.pageKind ?? "document",
|
|
1396
|
+
filePath: page.filePath ?? null,
|
|
1397
|
+
language: detectLanguage(page.filePath, page.language)
|
|
1026
1398
|
}));
|
|
1027
1399
|
const fingerprint = buildSnapshotFingerprint({
|
|
1028
1400
|
sourceId: input.sourceId,
|
|
1029
1401
|
configHash: sourceRow.config_hash,
|
|
1402
|
+
...input.revisionKey ? { revisionKey: input.revisionKey } : {},
|
|
1030
1403
|
pages: pagesWithHashes.map((page) => ({
|
|
1031
1404
|
url: page.url,
|
|
1032
1405
|
contentHash: page.contentHash
|
|
@@ -1063,13 +1436,13 @@ function openCatalog(options) {
|
|
|
1063
1436
|
) VALUES (?, ?, ?, ?, ?, ?, ?)
|
|
1064
1437
|
`);
|
|
1065
1438
|
const insertPage = db.prepare(`
|
|
1066
|
-
INSERT INTO pages (snapshot_id, url, title, markdown, content_hash)
|
|
1067
|
-
VALUES (?, ?, ?, ?, ?)
|
|
1439
|
+
INSERT INTO pages (snapshot_id, url, title, markdown, content_hash, page_kind, file_path, language)
|
|
1440
|
+
VALUES (?, ?, ?, ?, ?, ?, ?, ?)
|
|
1068
1441
|
`);
|
|
1069
1442
|
const insertChunk = db.prepare(`
|
|
1070
1443
|
INSERT INTO chunks (
|
|
1071
|
-
source_id, snapshot_id, page_id, page_url, page_title, section_title, chunk_order, markdown
|
|
1072
|
-
) VALUES (?, ?, ?, ?, ?, ?, ?, ?)
|
|
1444
|
+
source_id, snapshot_id, page_id, page_url, page_title, section_title, chunk_order, markdown, page_kind, file_path, language
|
|
1445
|
+
) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
|
|
1073
1446
|
`);
|
|
1074
1447
|
const insertRun = db.prepare(`
|
|
1075
1448
|
INSERT INTO fetch_runs (id, source_id, status, snapshot_id, started_at, finished_at)
|
|
@@ -1086,9 +1459,23 @@ function openCatalog(options) {
|
|
|
1086
1459
|
checkedAt
|
|
1087
1460
|
);
|
|
1088
1461
|
for (const page of pagesWithHashes) {
|
|
1089
|
-
const pageInsert = insertPage.run(
|
|
1462
|
+
const pageInsert = insertPage.run(
|
|
1463
|
+
snapshotId,
|
|
1464
|
+
page.url,
|
|
1465
|
+
page.title,
|
|
1466
|
+
page.markdown,
|
|
1467
|
+
page.contentHash,
|
|
1468
|
+
page.pageKind,
|
|
1469
|
+
page.filePath,
|
|
1470
|
+
page.language
|
|
1471
|
+
);
|
|
1090
1472
|
const pageId = Number(pageInsert.lastInsertRowid);
|
|
1091
|
-
const chunks =
|
|
1473
|
+
const chunks = chunkContent({
|
|
1474
|
+
title: page.title,
|
|
1475
|
+
content: page.markdown,
|
|
1476
|
+
filePath: page.filePath,
|
|
1477
|
+
language: page.language
|
|
1478
|
+
});
|
|
1092
1479
|
for (const chunk of chunks) {
|
|
1093
1480
|
insertChunk.run(
|
|
1094
1481
|
input.sourceId,
|
|
@@ -1098,7 +1485,10 @@ function openCatalog(options) {
|
|
|
1098
1485
|
page.title,
|
|
1099
1486
|
chunk.sectionTitle,
|
|
1100
1487
|
chunk.chunkOrder,
|
|
1101
|
-
chunk.markdown
|
|
1488
|
+
chunk.markdown,
|
|
1489
|
+
page.pageKind,
|
|
1490
|
+
page.filePath,
|
|
1491
|
+
page.language
|
|
1102
1492
|
);
|
|
1103
1493
|
}
|
|
1104
1494
|
}
|
|
@@ -1149,7 +1539,6 @@ function openCatalog(options) {
|
|
|
1149
1539
|
);
|
|
1150
1540
|
}
|
|
1151
1541
|
const spec = JSON.parse(sourceRow.spec_json);
|
|
1152
|
-
const canary = resolveSourceCanary(spec);
|
|
1153
1542
|
db.prepare(`
|
|
1154
1543
|
INSERT INTO canary_runs (id, source_id, status, checked_at, details_json)
|
|
1155
1544
|
VALUES (?, ?, ?, ?, ?)
|
|
@@ -1174,7 +1563,7 @@ function openCatalog(options) {
|
|
|
1174
1563
|
input.status,
|
|
1175
1564
|
input.checkedAt,
|
|
1176
1565
|
input.status,
|
|
1177
|
-
addHoursIso(
|
|
1566
|
+
addHoursIso(resolveCanaryEveryHours(spec)),
|
|
1178
1567
|
input.checkedAt,
|
|
1179
1568
|
input.sourceId
|
|
1180
1569
|
);
|
|
@@ -1254,7 +1643,7 @@ function openCatalog(options) {
|
|
|
1254
1643
|
);
|
|
1255
1644
|
}
|
|
1256
1645
|
const loadSnapshotPages = (snapshotId) => db.prepare(`
|
|
1257
|
-
SELECT url, title, markdown, content_hash
|
|
1646
|
+
SELECT url, title, markdown, content_hash, page_kind, file_path, language
|
|
1258
1647
|
FROM pages
|
|
1259
1648
|
WHERE snapshot_id = ?
|
|
1260
1649
|
ORDER BY url
|
|
@@ -1265,11 +1654,17 @@ function openCatalog(options) {
|
|
|
1265
1654
|
const afterMap = new Map(afterPages.map((page) => [page.url, page]));
|
|
1266
1655
|
const addedPages = afterPages.filter((page) => !beforeMap.has(page.url)).map((page) => ({
|
|
1267
1656
|
url: page.url,
|
|
1268
|
-
title: page.title
|
|
1657
|
+
title: page.title,
|
|
1658
|
+
pageKind: page.page_kind,
|
|
1659
|
+
filePath: page.file_path,
|
|
1660
|
+
language: page.language
|
|
1269
1661
|
}));
|
|
1270
1662
|
const removedPages = beforePages.filter((page) => !afterMap.has(page.url)).map((page) => ({
|
|
1271
1663
|
url: page.url,
|
|
1272
|
-
title: page.title
|
|
1664
|
+
title: page.title,
|
|
1665
|
+
pageKind: page.page_kind,
|
|
1666
|
+
filePath: page.file_path,
|
|
1667
|
+
language: page.language
|
|
1273
1668
|
}));
|
|
1274
1669
|
const summarizeLineDiff = (beforeMarkdown, afterMarkdown) => {
|
|
1275
1670
|
const beforeLines = beforeMarkdown.split("\n");
|
|
@@ -1294,6 +1689,9 @@ function openCatalog(options) {
|
|
|
1294
1689
|
url: before.url,
|
|
1295
1690
|
beforeTitle: before.title,
|
|
1296
1691
|
afterTitle: after.title,
|
|
1692
|
+
pageKind: after.page_kind,
|
|
1693
|
+
filePath: after.file_path,
|
|
1694
|
+
language: after.language,
|
|
1297
1695
|
lineSummary: summarizeLineDiff(before.markdown, after.markdown)
|
|
1298
1696
|
}));
|
|
1299
1697
|
const unchangedPageCount = beforePages.filter((page) => {
|
|
@@ -1339,11 +1737,14 @@ function openCatalog(options) {
|
|
|
1339
1737
|
c.page_url,
|
|
1340
1738
|
c.page_title,
|
|
1341
1739
|
c.section_title,
|
|
1342
|
-
c.markdown
|
|
1740
|
+
c.markdown,
|
|
1741
|
+
c.page_kind,
|
|
1742
|
+
c.file_path,
|
|
1743
|
+
c.language
|
|
1343
1744
|
FROM chunks c
|
|
1344
1745
|
WHERE c.source_id = ?
|
|
1345
1746
|
AND c.snapshot_id = ?
|
|
1346
|
-
|
|
1747
|
+
ORDER BY c.id
|
|
1347
1748
|
`).all(input.sourceId, input.snapshotId);
|
|
1348
1749
|
return rows.map((row) => ({
|
|
1349
1750
|
chunkId: row.chunk_id,
|
|
@@ -1353,6 +1754,9 @@ function openCatalog(options) {
|
|
|
1353
1754
|
pageTitle: row.page_title,
|
|
1354
1755
|
sectionTitle: row.section_title,
|
|
1355
1756
|
markdown: row.markdown,
|
|
1757
|
+
pageKind: row.page_kind,
|
|
1758
|
+
filePath: row.file_path,
|
|
1759
|
+
language: row.language,
|
|
1356
1760
|
contentHash: sha256(row.markdown)
|
|
1357
1761
|
}));
|
|
1358
1762
|
},
|
|
@@ -1403,7 +1807,10 @@ function openCatalog(options) {
|
|
|
1403
1807
|
c.page_url,
|
|
1404
1808
|
c.page_title,
|
|
1405
1809
|
c.section_title,
|
|
1406
|
-
c.markdown
|
|
1810
|
+
c.markdown,
|
|
1811
|
+
c.page_kind,
|
|
1812
|
+
c.file_path,
|
|
1813
|
+
c.language
|
|
1407
1814
|
FROM chunks c
|
|
1408
1815
|
WHERE c.id IN (${chunkIds.map(() => "?").join(",")})
|
|
1409
1816
|
`).all(...chunkIds);
|
|
@@ -1414,7 +1821,10 @@ function openCatalog(options) {
|
|
|
1414
1821
|
pageUrl: row.page_url,
|
|
1415
1822
|
pageTitle: row.page_title,
|
|
1416
1823
|
sectionTitle: row.section_title,
|
|
1417
|
-
markdown: row.markdown
|
|
1824
|
+
markdown: row.markdown,
|
|
1825
|
+
pageKind: row.page_kind,
|
|
1826
|
+
filePath: row.file_path,
|
|
1827
|
+
language: row.language
|
|
1418
1828
|
}));
|
|
1419
1829
|
},
|
|
1420
1830
|
queueLatestEmbeddingJobs(sourceIds) {
|
|
@@ -1798,7 +2208,10 @@ function openCatalog(options) {
|
|
|
1798
2208
|
c.page_url,
|
|
1799
2209
|
c.page_title,
|
|
1800
2210
|
c.section_title,
|
|
1801
|
-
c.markdown
|
|
2211
|
+
c.markdown,
|
|
2212
|
+
c.page_kind,
|
|
2213
|
+
c.file_path,
|
|
2214
|
+
c.language
|
|
1802
2215
|
FROM chunks c
|
|
1803
2216
|
WHERE c.id = ?
|
|
1804
2217
|
`).get(chunkId);
|
|
@@ -1812,23 +2225,336 @@ function openCatalog(options) {
|
|
|
1812
2225
|
pageUrl: row.page_url,
|
|
1813
2226
|
pageTitle: row.page_title,
|
|
1814
2227
|
sectionTitle: row.section_title,
|
|
1815
|
-
markdown: row.markdown
|
|
2228
|
+
markdown: row.markdown,
|
|
2229
|
+
pageKind: row.page_kind,
|
|
2230
|
+
filePath: row.file_path,
|
|
2231
|
+
language: row.language
|
|
1816
2232
|
};
|
|
1817
2233
|
}
|
|
1818
2234
|
};
|
|
1819
2235
|
}
|
|
1820
2236
|
|
|
1821
2237
|
// src/daemon.ts
|
|
1822
|
-
import { existsSync as
|
|
2238
|
+
import { existsSync as existsSync3 } from "fs";
|
|
1823
2239
|
import { resolve as resolve5 } from "path";
|
|
1824
2240
|
import { setTimeout as sleep2 } from "timers/promises";
|
|
1825
2241
|
|
|
1826
2242
|
// src/fetch/fetch-source.ts
|
|
1827
|
-
import { mkdirSync as
|
|
1828
|
-
import { join as
|
|
2243
|
+
import { mkdirSync as mkdirSync4, writeFileSync as writeFileSync2 } from "fs";
|
|
2244
|
+
import { join as join5 } from "path";
|
|
1829
2245
|
import { setTimeout as sleep } from "timers/promises";
|
|
1830
2246
|
import { chromium } from "playwright";
|
|
1831
2247
|
|
|
2248
|
+
// src/git/git-source.ts
|
|
2249
|
+
import { existsSync as existsSync2, mkdirSync as mkdirSync3, writeFileSync } from "fs";
|
|
2250
|
+
import { dirname as dirname2, join as join4 } from "path";
|
|
2251
|
+
import { execFile } from "child_process";
|
|
2252
|
+
import { promisify } from "util";
|
|
2253
|
+
var execFileAsync = promisify(execFile);
|
|
2254
|
+
/**
 * Returns the current time as an ISO-8601 string (the output of
 * `Date.prototype.toISOString`, which is always expressed in UTC).
 *
 * @returns {string} Timestamp such as `2025-01-31T12:34:56.789Z`.
 */
function nowIso2() {
  const current = /* @__PURE__ */ new Date();
  return current.toISOString();
}
|
|
2257
|
+
/**
 * Builds the location of the mirror clone kept for a git source:
 * `<dataDir>/git-mirrors/<sourceId>.git`.
 *
 * @param {string} dataDir Root data directory.
 * @param {string} sourceId Identifier of the source the mirror belongs to.
 * @returns {string} Path of the mirror directory (not created here).
 */
function getGitMirrorDir(dataDir, sourceId) {
  const mirrorName = `${sourceId}.git`;
  return join4(dataDir, "git-mirrors", mirrorName);
}
|
|
2260
|
+
/**
 * Looks up a required variable in the supplied environment map.
 *
 * @param {string} name Name of the environment variable.
 * @param {Record<string, string | undefined>} env Environment to read from.
 * @returns {string} The variable's value.
 * @throws {AiocsError} With code `authEnvMissing` when the variable is unset
 *   or empty (any falsy value is treated as missing).
 */
function resolveEnvValue(name, env) {
  const resolved = env[name];
  if (resolved) {
    return resolved;
  }
  throw new AiocsError(
    AIOCS_ERROR_CODES.authEnvMissing,
    `Missing required environment variable '${name}' for authenticated source access`,
    { envVar: name }
  );
}
|
|
2271
|
+
/**
 * Produces the HTTP `AUTHORIZATION` header line for a git source, or `null`
 * when the spec declares no `repo.auth` section.
 *
 * The token is read from the environment variable named by
 * `repo.auth.tokenFromEnv`. The `"bearer"` scheme yields a Bearer header;
 * any other scheme yields a Basic header built from `username:token`.
 *
 * @param {object} spec Git source spec (reads `spec.repo.auth`).
 * @param {Record<string, string | undefined>} env Environment holding the token.
 * @returns {string | null} Header line, or `null` when no auth is configured.
 */
function buildGitAuthHeader(spec, env) {
  const auth = spec.repo.auth;
  if (!auth) {
    return null;
  }
  const token = resolveEnvValue(auth.tokenFromEnv, env);
  if (auth.scheme === "bearer") {
    return `AUTHORIZATION: Bearer ${token}`;
  }
  const encoded = Buffer.from(`${auth.username}:${token}`, "utf8").toString("base64");
  return `AUTHORIZATION: Basic ${encoded}`;
}
|
|
2282
|
+
/**
 * Runs `git` with the given arguments and resolves with its stdout.
 *
 * When `options.authHeader` is set it is injected for this invocation only
 * via `-c http.extraHeader=<header>`. Terminal prompts are disabled through
 * `GIT_TERMINAL_PROMPT=0`, which is applied after `process.env` and
 * `options.env` so neither can override it.
 *
 * @param {string[]} args Arguments passed to `git`.
 * @param {{ cwd?: string, env?: object, authHeader?: string | null, encoding?: string }} [options]
 *   `encoding` defaults to `"utf8"`; pass `"buffer"` to receive a Buffer.
 * @returns {Promise<string | Buffer>} The command's stdout.
 * @throws {AiocsError} With code `internalError` when the command fails. The
 *   error details carry the caller's `args` (never the auth header).
 */
async function runGit(args, options = {}) {
  const { authHeader } = options;
  const commandArgs = authHeader ? ["-c", `http.extraHeader=${authHeader}`, ...args] : args;
  try {
    const result = await execFileAsync("git", commandArgs, {
      cwd: options.cwd,
      env: {
        ...process.env,
        ...options.env,
        GIT_TERMINAL_PROMPT: "0"
      },
      encoding: options.encoding ?? "utf8",
      maxBuffer: 32 * 1024 * 1024
    });
    return result.stdout;
  } catch (error) {
    const rawMessage = error instanceof Error ? error.message : String(error);
    // execFile's failure message embeds the full command line (and stderr),
    // which includes the injected auth header. Strip it so the credential
    // never reaches logs or error output.
    const safeMessage = authHeader ? rawMessage.split(authHeader).join("[redacted]") : rawMessage;
    throw new AiocsError(
      AIOCS_ERROR_CODES.internalError,
      `Git command failed: ${safeMessage}`,
      {
        args
      }
    );
  }
}
|
|
2304
|
+
/**
 * Makes sure an up-to-date mirror clone of the source repository exists.
 *
 * If the mirror directory is absent, the repository is cloned with
 * `git clone --mirror`. Otherwise the `origin` URL is re-pointed at
 * `spec.repo.url` and the mirror is fetched with pruning of stale refs and
 * tags.
 *
 * @param {object} spec Git source spec (reads `spec.id`, `spec.repo.url`, `spec.repo.auth`).
 * @param {string} dataDir Root data directory containing `git-mirrors/`.
 * @param {Record<string, string | undefined>} env Environment for git and for auth lookup.
 * @returns {Promise<string>} Path of the mirror directory.
 */
async function ensureGitMirror(spec, dataDir, env) {
  const mirrorDir = getGitMirrorDir(dataDir, spec.id);
  mkdirSync3(dirname2(mirrorDir), { recursive: true });
  const gitOptions = { env, authHeader: buildGitAuthHeader(spec, env) };
  if (existsSync2(mirrorDir)) {
    const inMirror = ["--git-dir", mirrorDir];
    await runGit([...inMirror, "remote", "set-url", "origin", spec.repo.url], gitOptions);
    await runGit([...inMirror, "fetch", "--prune", "--prune-tags", "--tags", "origin"], gitOptions);
  } else {
    await runGit(["clone", "--mirror", spec.repo.url, mirrorDir], gitOptions);
  }
  return mirrorDir;
}
|
|
2325
|
+
/**
 * Resolves a ref to the commit it points at by running
 * `git rev-parse <ref>^{commit}` inside the mirror.
 *
 * @param {string} mirrorDir Path of the mirror repository.
 * @param {string} ref Ref to resolve.
 * @param {Record<string, string | undefined>} env Environment for the git process.
 * @returns {Promise<string>} Trimmed stdout of `rev-parse`.
 */
async function resolveGitCommit(mirrorDir, ref, env) {
  const revision = `${ref}^{commit}`;
  const output = await runGit(["--git-dir", mirrorDir, "rev-parse", revision], { env });
  return String(output).trim();
}
|
|
2331
|
+
/**
 * Lists the paths of all regular file entries (blobs) reachable from a commit.
 *
 * Runs `git ls-tree -r -z <commit>` and parses each NUL-terminated record of
 * the form `<mode> <type> <object>\t<path>`. Only `blob` entries are
 * returned: submodule entries (type `commit`) have no object in this
 * repository, so asking git for their size or content would fail. Paths are
 * returned exactly as git reports them; they are not trimmed, because
 * leading or trailing whitespace is part of the file name.
 *
 * @param {string} mirrorDir Path of the mirror repository.
 * @param {string} commitSha Commit whose tree is listed.
 * @param {Record<string, string | undefined>} env Environment for the git process.
 * @returns {Promise<string[]>} File paths relative to the repository root.
 */
async function listRepoFiles(mirrorDir, commitSha, env) {
  const stdout = await runGit(["--git-dir", mirrorDir, "ls-tree", "-r", "-z", commitSha], {
    env,
    encoding: "buffer"
  });
  const listing = stdout instanceof Buffer ? stdout.toString("utf8") : String(stdout);
  const files = [];
  for (const record of listing.split("\0")) {
    // The path follows the first TAB; everything before it is metadata.
    const tabIndex = record.indexOf("\t");
    if (tabIndex === -1) {
      continue;
    }
    const [, type] = record.slice(0, tabIndex).split(" ");
    const filePath = record.slice(tabIndex + 1);
    if (type === "blob" && filePath) {
      files.push(filePath);
    }
  }
  return files;
}
|
|
2339
|
+
/**
 * Decides whether a repository path is in scope for a git source: it must
 * match `spec.repo.include` and must not match `spec.repo.exclude`.
 *
 * @param {object} spec Git source spec (reads `spec.repo.include` / `spec.repo.exclude`).
 * @param {string} filePath Repository-relative path to test.
 * @returns {boolean} `true` when the path is included and not excluded.
 */
function isIncluded(spec, filePath) {
  if (matchesPatterns(filePath, spec.repo.include)) {
    // Exclusion is only consulted for paths that passed the include filter.
    const excluded = spec.repo.exclude.length > 0 && matchesPatterns(filePath, spec.repo.exclude);
    return !excluded;
  }
  return false;
}
|
|
2348
|
+
/**
 * Reads the size of a file at a given commit using
 * `git cat-file -s <commit>:<path>`.
 *
 * @param {string} mirrorDir Path of the mirror repository.
 * @param {string} commitSha Commit containing the file.
 * @param {string} filePath Repository-relative file path.
 * @param {Record<string, string | undefined>} env Environment for the git process.
 * @returns {Promise<number>} The trimmed stdout converted with `Number()`
 *   (`NaN` if git prints something non-numeric).
 */
async function getObjectSize(mirrorDir, commitSha, filePath, env) {
  const objectSpec = `${commitSha}:${filePath}`;
  const sizeText = await runGit(["--git-dir", mirrorDir, "cat-file", "-s", objectSpec], { env });
  return Number(String(sizeText).trim());
}
|
|
2354
|
+
/**
 * Heuristic binary check: content counts as binary when it contains at
 * least one NUL byte.
 *
 * @param {Buffer} buffer File content.
 * @returns {boolean} `true` if a zero byte is present.
 */
function isProbablyBinary(buffer) {
  const firstNul = buffer.indexOf(0);
  return firstNul !== -1;
}
|
|
2357
|
+
/**
 * Reads the raw bytes of a file at a given commit via
 * `git show <commit>:<path>`.
 *
 * @param {string} mirrorDir Path of the mirror repository.
 * @param {string} commitSha Commit containing the file.
 * @param {string} filePath Repository-relative file path.
 * @param {Record<string, string | undefined>} env Environment for the git process.
 * @returns {Promise<Buffer>} File content; non-Buffer output is converted
 *   to a UTF-8 Buffer.
 */
async function readRepoFile(mirrorDir, commitSha, filePath, env) {
  const objectSpec = `${commitSha}:${filePath}`;
  const output = await runGit(["--git-dir", mirrorDir, "show", objectSpec], {
    env,
    encoding: "buffer"
  });
  if (output instanceof Buffer) {
    return output;
  }
  return Buffer.from(String(output), "utf8");
}
|
|
2364
|
+
/**
 * Parses a repository URL string into a WHATWG `URL` object.
 *
 * @param {string} repoUrl Repository URL.
 * @returns {URL} Parsed URL.
 * @throws {TypeError} When `repoUrl` is not a valid absolute URL.
 */
function normalizeRepoUrl(repoUrl) {
  const parsed = new URL(repoUrl);
  return parsed;
}
|
|
2367
|
+
/**
 * Derives the browsable base URL of a repository: the URL's origin plus its
 * path with a trailing `.git` suffix (case-insensitive) removed.
 *
 * @param {string} repoUrl Repository URL.
 * @returns {string} Origin and path without the `.git` suffix.
 */
function normalizeRepoWebBase(repoUrl) {
  const { origin, pathname } = normalizeRepoUrl(repoUrl);
  const webPath = pathname.replace(/\.git$/i, "");
  return `${origin}${webPath}`;
}
|
|
2372
|
+
/**
 * Builds a URL identifying one file of a git source at the configured ref.
 *
 * - `file:` repositories: the original repo URL plus a `#ref=…&path=…` fragment.
 * - `github.com`: `<base>/blob/<ref>/<path>`.
 * - `gitlab.com`: `<base>/-/blob/<ref>/<path>`.
 * - any other host: `<base>#ref=…&path=…`.
 *
 * In the blob forms, ref and path are percent-encoded per `/`-separated
 * segment so the slashes stay literal; in the fragment forms they are
 * encoded as a whole.
 *
 * @param {object} spec Git source spec (reads `spec.repo.url` and `spec.repo.ref`).
 * @param {string} filePath Repository-relative file path.
 * @returns {string} URL for the file.
 */
function buildRepoFileUrl(spec, filePath) {
  const { url: repoUrl, ref } = spec.repo;
  const parsed = normalizeRepoUrl(repoUrl);
  const encodeSegments = (value) => value.split("/").map((segment) => encodeURIComponent(segment)).join("/");
  const pathSegments = encodeSegments(filePath);
  const refSegments = encodeSegments(ref);
  const fragment = `#ref=${encodeURIComponent(ref)}&path=${encodeURIComponent(filePath)}`;
  if (parsed.protocol === "file:") {
    return `${repoUrl}${fragment}`;
  }
  const base = normalizeRepoWebBase(repoUrl);
  switch (parsed.hostname) {
    case "github.com":
      return `${base}/blob/${refSegments}/${pathSegments}`;
    case "gitlab.com":
      return `${base}/-/blob/${refSegments}/${pathSegments}`;
    default:
      return `${base}${fragment}`;
  }
}
|
|
2388
|
+
/**
 * Writes every page of a git snapshot to disk under
 * `<dataDir>/sources/<sourceId>/snapshots/<snapshotId>/files/<filePath>`,
 * creating parent directories as needed.
 *
 * `page.filePath` comes from the repository's tree and is therefore
 * untrusted: a path containing `..` segments could otherwise resolve outside
 * the snapshot directory. All targets are validated first, so nothing is
 * written if any page would escape.
 *
 * @param {{ dataDir: string, sourceId: string }} input Fetch input.
 * @param {string} snapshotId Snapshot the files belong to.
 * @param {Array<{ filePath: string, markdown: string }>} pages Pages to persist.
 * @throws {Error} When a page's path resolves outside the snapshot directory.
 */
function persistGitSnapshotFiles(input, snapshotId, pages) {
  const snapshotDir = join4(input.dataDir, "sources", input.sourceId, "snapshots", snapshotId, "files");
  // Joining with "/" yields the normalized directory with a trailing
  // platform separator, giving a prefix that cannot match sibling directories.
  const snapshotRoot = join4(snapshotDir, "/");
  const targets = pages.map((page) => {
    const target = join4(snapshotDir, page.filePath);
    if (target === snapshotRoot || !target.startsWith(snapshotRoot)) {
      throw new Error(`Refusing to write '${page.filePath}': path resolves outside the snapshot directory`);
    }
    return { target, content: page.markdown };
  });
  for (const { target, content } of targets) {
    mkdirSync3(dirname2(target), { recursive: true });
    writeFileSync(target, content, "utf8");
  }
}
|
|
2396
|
+
/**
 * Turns the in-scope text files of a commit into page records.
 *
 * Files are listed, filtered through the spec's include/exclude patterns and
 * rejected as a whole if more than `spec.repo.maxFiles` match. Each remaining
 * file is skipped when its size is non-finite or exceeds
 * `spec.repo.textFileMaxBytes`, when reading it fails, when it contains a NUL
 * byte, or when it is empty after trimming. Files are processed one at a
 * time, in listing order.
 *
 * @param {object} spec Git source spec.
 * @param {string} mirrorDir Path of the mirror repository.
 * @param {string} commitSha Commit to read from.
 * @param {Record<string, string | undefined>} env Environment for git processes.
 * @returns {Promise<object[]>} Pages (`url`, `title`, `markdown`, `pageKind`,
 *   `filePath`, `language`) sorted by `filePath`.
 * @throws {AiocsError} With code `invalidArgument` when too many files match.
 */
async function materializeGitPages(spec, mirrorDir, commitSha, env) {
  const allFiles = await listRepoFiles(mirrorDir, commitSha, env);
  const selected = allFiles.filter((candidate) => isIncluded(spec, candidate));
  const selectedCount = selected.length;
  if (selectedCount > spec.repo.maxFiles) {
    throw new AiocsError(
      AIOCS_ERROR_CODES.invalidArgument,
      `Git source '${spec.id}' matched ${selectedCount} files, exceeding maxFiles=${spec.repo.maxFiles}`
    );
  }
  const pages = [];
  for (const filePath of selected) {
    const size = await getObjectSize(mirrorDir, commitSha, filePath, env);
    const sizeAcceptable = Number.isFinite(size) && !(size > spec.repo.textFileMaxBytes);
    if (!sizeAcceptable) {
      continue;
    }
    // A file that cannot be read is skipped rather than failing the fetch.
    let content;
    try {
      content = await readRepoFile(mirrorDir, commitSha, filePath, env);
    } catch {
      content = null;
    }
    if (content && !isProbablyBinary(content)) {
      const markdown = content.toString("utf8").trimEnd();
      if (markdown.trim()) {
        pages.push({
          url: buildRepoFileUrl(spec, filePath),
          title: filePath,
          markdown,
          pageKind: "file",
          filePath,
          language: detectLanguage(filePath)
        });
      }
    }
  }
  return pages.sort((left, right) => left.filePath.localeCompare(right.filePath));
}
|
|
2430
|
+
/**
 * Guards a canary check: its target path must satisfy the source's
 * include/exclude patterns.
 *
 * @param {object} spec Git source spec.
 * @param {{ path: string }} check Canary check definition.
 * @throws {AiocsError} With code `invalidArgument` when the path is out of scope.
 */
function assertCanaryPathInScope(spec, check) {
  const inScope = isIncluded(spec, check.path);
  if (inScope) {
    return;
  }
  throw new AiocsError(
    AIOCS_ERROR_CODES.invalidArgument,
    `Git canary path '${check.path}' is outside the configured include/exclude scope`
  );
}
|
|
2438
|
+
/**
 * Loads the file a canary check points at and shapes it like a fetched page.
 *
 * @param {object} spec Git source spec.
 * @param {string} mirrorDir Path of the mirror repository.
 * @param {string} commitSha Commit to read from.
 * @param {{ path: string }} check Canary check definition.
 * @param {Record<string, string | undefined>} env Environment for git processes.
 * @returns {Promise<object>} Page record with `url`, `title`, `markdown`
 *   (trailing whitespace removed), `pageKind: "file"`, `filePath`, `language`.
 * @throws {AiocsError} When the path is outside the configured scope.
 * @throws {Error} When the file contains a NUL byte and is treated as binary.
 */
async function readCanaryTarget(spec, mirrorDir, commitSha, check, env) {
  assertCanaryPathInScope(spec, check);
  const targetPath = check.path;
  const raw = await readRepoFile(mirrorDir, commitSha, targetPath, env);
  if (isProbablyBinary(raw)) {
    throw new Error(`Canary target '${targetPath}' is binary`);
  }
  const url = buildRepoFileUrl(spec, targetPath);
  const markdown = raw.toString("utf8").trimEnd();
  const language = detectLanguage(targetPath);
  return {
    url,
    title: targetPath,
    markdown,
    pageKind: "file",
    filePath: targetPath,
    language
  };
}
|
|
2453
|
+
/**
 * Fetches a git-backed source: refreshes the mirror, resolves the configured
 * ref to a commit, materializes the in-scope files as pages and records them
 * as a snapshot. Files are persisted to disk only when the catalog reports
 * that it did not reuse an existing snapshot.
 *
 * @param {{ catalog: object, sourceId: string, dataDir: string, env?: object }} input
 *   `env` defaults to `process.env`.
 * @returns {Promise<{ snapshotId: string, pageCount: number, reused: boolean, detectedVersion: string }>}
 *   `detectedVersion` is the resolved commit SHA.
 * @throws {AiocsError} `sourceNotFound` when the id is unknown or not a git
 *   source; `noPagesFetched` when no text file qualified.
 */
async function fetchGitSource(input) {
  const { catalog, sourceId } = input;
  const spec = catalog.getSourceSpec(sourceId);
  const isKnownGitSource = Boolean(spec) && isGitSourceSpec(spec);
  if (!isKnownGitSource) {
    throw new AiocsError(
      AIOCS_ERROR_CODES.sourceNotFound,
      `Unknown git source '${input.sourceId}'`
    );
  }
  const env = input.env ?? process.env;
  const mirrorDir = await ensureGitMirror(spec, input.dataDir, env);
  const commitSha = await resolveGitCommit(mirrorDir, spec.repo.ref, env);
  const pages = await materializeGitPages(spec, mirrorDir, commitSha, env);
  if (pages.length === 0) {
    throw new AiocsError(
      AIOCS_ERROR_CODES.noPagesFetched,
      `No text files fetched for git source '${input.sourceId}'`
    );
  }
  // The commit SHA serves both as the reported version and as the revision key.
  const { snapshotId, reused } = catalog.recordSuccessfulSnapshot({
    sourceId,
    detectedVersion: commitSha,
    revisionKey: commitSha,
    pages
  });
  if (!reused) {
    persistGitSnapshotFiles(input, snapshotId, pages);
  }
  return {
    snapshotId,
    pageCount: pages.length,
    reused,
    detectedVersion: commitSha
  };
}
|
|
2487
|
+
/**
 * Runs the canary checks of a git-backed source against the current commit
 * of its configured ref and records the outcome in the catalog.
 *
 * Each check reads its target file and verifies, in order: the optional
 * `expectedTitle` substring (the title is the file path), the optional
 * `expectedText` substring, and the minimum trimmed content length. Checks
 * run sequentially; a failing check is recorded and does not stop the rest.
 *
 * @param {{ catalog: object, sourceId: string, dataDir?: string, env?: object }} input
 *   `env` defaults to `process.env`; `dataDir` defaults to `$HOME/.aiocs/data`
 *   (with `HOME` read from `process.env`).
 * @returns {Promise<object>} Result with `sourceId`, `status`, `checkedAt`,
 *   `summary` and per-check `checks`, returned only when every check passed.
 * @throws {AiocsError} `sourceNotFound` for an unknown or non-git source;
 *   `canaryFailed` (carrying the result as details) when any check failed.
 *   The run is recorded in the catalog before the failure is thrown.
 */
async function runGitSourceCanary(input) {
  const { catalog, sourceId } = input;
  const spec = catalog.getSourceSpec(sourceId);
  const isKnownGitSource = Boolean(spec) && isGitSourceSpec(spec);
  if (!isKnownGitSource) {
    throw new AiocsError(
      AIOCS_ERROR_CODES.sourceNotFound,
      `Unknown git source '${input.sourceId}'`
    );
  }
  const env = input.env ?? process.env;
  const dataDir = input.dataDir ?? join4(process.env.HOME ?? "", ".aiocs", "data");
  const mirrorDir = await ensureGitMirror(spec, dataDir, env);
  const commitSha = await resolveGitCommit(mirrorDir, spec.repo.ref, env);
  const canary = resolveSourceCanary(spec);
  const checkedAt = nowIso2();

  // Evaluates one check and returns its report entry; never rejects.
  const evaluateCheck = async (check) => {
    try {
      const page = await readCanaryTarget(spec, mirrorDir, commitSha, check, env);
      if (check.expectedTitle && !page.title.includes(check.expectedTitle)) {
        throw new Error(`Expected title to include '${check.expectedTitle}'`);
      }
      if (check.expectedText && !page.markdown.includes(check.expectedText)) {
        throw new Error(`Expected markdown to include '${check.expectedText}'`);
      }
      const contentLength = page.markdown.trim().length;
      if (contentLength < check.minContentLength) {
        throw new Error(`Expected content length to be at least ${check.minContentLength}`);
      }
      return {
        path: check.path,
        status: "pass",
        title: page.title,
        markdownLength: contentLength
      };
    } catch (error) {
      return {
        path: check.path,
        status: "fail",
        errorMessage: error instanceof Error ? error.message : String(error)
      };
    }
  };

  const checks = [];
  let passCount = 0;
  for (const check of canary.checks) {
    const entry = await evaluateCheck(check);
    if (entry.status === "pass") {
      passCount += 1;
    }
    checks.push(entry);
  }
  const failCount = checks.length - passCount;
  const status = failCount > 0 ? "fail" : "pass";
  const result = {
    sourceId,
    status,
    checkedAt,
    summary: {
      checkCount: checks.length,
      passCount,
      failCount
    },
    checks
  };
  catalog.recordCanaryRun({
    sourceId,
    status,
    checkedAt,
    details: result
  });
  if (status === "fail") {
    throw new AiocsError(
      AIOCS_ERROR_CODES.canaryFailed,
      `Git source canary failed for '${input.sourceId}'`,
      result
    );
  }
  return result;
}
|
|
2557
|
+
|
|
1832
2558
|
// src/fetch/extract.ts
|
|
1833
2559
|
import { JSDOM } from "jsdom";
|
|
1834
2560
|
import { Readability } from "@mozilla/readability";
|
|
@@ -2009,36 +2735,10 @@ async function extractPage(page, strategy) {
|
|
|
2009
2735
|
return runReadabilityStrategy(page);
|
|
2010
2736
|
}
|
|
2011
2737
|
|
|
2012
|
-
// src/fetch/url-patterns.ts
|
|
2013
|
-
function escapeRegex(value) {
|
|
2014
|
-
return value.replace(/[|\\{}()[\]^$+?.]/g, "\\$&");
|
|
2015
|
-
}
|
|
2016
|
-
function patternToRegex(pattern) {
|
|
2017
|
-
let regex = "^";
|
|
2018
|
-
for (let index = 0; index < pattern.length; index += 1) {
|
|
2019
|
-
const current = pattern[index];
|
|
2020
|
-
const next = pattern[index + 1];
|
|
2021
|
-
if (current === "*" && next === "*") {
|
|
2022
|
-
regex += ".*";
|
|
2023
|
-
index += 1;
|
|
2024
|
-
continue;
|
|
2025
|
-
}
|
|
2026
|
-
if (current === "*") {
|
|
2027
|
-
regex += "[^?#]*";
|
|
2028
|
-
continue;
|
|
2029
|
-
}
|
|
2030
|
-
regex += escapeRegex(current ?? "");
|
|
2031
|
-
}
|
|
2032
|
-
return new RegExp(`${regex}$`);
|
|
2033
|
-
}
|
|
2034
|
-
function matchesPatterns(url, patterns) {
|
|
2035
|
-
return patterns.some((pattern) => patternToRegex(pattern).test(url));
|
|
2036
|
-
}
|
|
2037
|
-
|
|
2038
2738
|
// src/fetch/fetch-source.ts
|
|
2039
2739
|
var MAX_FETCH_ATTEMPTS = 3;
|
|
2040
2740
|
var RETRY_DELAY_MS = 250;
|
|
2041
|
-
function
|
|
2741
|
+
function nowIso3() {
|
|
2042
2742
|
return (/* @__PURE__ */ new Date()).toISOString();
|
|
2043
2743
|
}
|
|
2044
2744
|
function canonicalizeUrl(raw) {
|
|
@@ -2108,14 +2808,14 @@ async function extractRawMarkdownPage(url, response) {
|
|
|
2108
2808
|
};
|
|
2109
2809
|
}
|
|
2110
2810
|
function persistSnapshotPages(input, snapshotId, pages) {
|
|
2111
|
-
const snapshotDir =
|
|
2112
|
-
|
|
2811
|
+
const snapshotDir = join5(input.dataDir, "sources", input.sourceId, "snapshots", snapshotId, "pages");
|
|
2812
|
+
mkdirSync4(snapshotDir, { recursive: true });
|
|
2113
2813
|
pages.forEach((page, index) => {
|
|
2114
2814
|
const filename = `${String(index + 1).padStart(3, "0")}-${slugify(page.title)}.md`;
|
|
2115
|
-
|
|
2815
|
+
writeFileSync2(join5(snapshotDir, filename), page.markdown, "utf8");
|
|
2116
2816
|
});
|
|
2117
2817
|
}
|
|
2118
|
-
function
|
|
2818
|
+
function resolveEnvValue2(name, env) {
|
|
2119
2819
|
const value = env[name];
|
|
2120
2820
|
if (!value) {
|
|
2121
2821
|
throw new AiocsError(
|
|
@@ -2131,13 +2831,13 @@ function resolveEnvValue(name, env) {
|
|
|
2131
2831
|
function resolveSourceAuth(spec, env) {
|
|
2132
2832
|
const scopedHeaders = (spec.auth?.headers ?? []).map((header) => ({
|
|
2133
2833
|
name: header.name,
|
|
2134
|
-
value:
|
|
2834
|
+
value: resolveEnvValue2(header.valueFromEnv, env),
|
|
2135
2835
|
hosts: header.hosts ?? spec.allowedHosts,
|
|
2136
2836
|
...header.include ? { include: header.include } : {}
|
|
2137
2837
|
}));
|
|
2138
2838
|
const cookies = (spec.auth?.cookies ?? []).map((cookie) => ({
|
|
2139
2839
|
name: cookie.name,
|
|
2140
|
-
value:
|
|
2840
|
+
value: resolveEnvValue2(cookie.valueFromEnv, env),
|
|
2141
2841
|
domain: cookie.domain,
|
|
2142
2842
|
path: cookie.path,
|
|
2143
2843
|
...typeof cookie.secure === "boolean" ? { secure: cookie.secure } : {},
|
|
@@ -2236,6 +2936,14 @@ async function fetchSourceOnce(input) {
|
|
|
2236
2936
|
`Unknown source '${input.sourceId}'`
|
|
2237
2937
|
);
|
|
2238
2938
|
}
|
|
2939
|
+
if (isGitSourceSpec(spec)) {
|
|
2940
|
+
const result = await fetchGitSource(input);
|
|
2941
|
+
return {
|
|
2942
|
+
snapshotId: result.snapshotId,
|
|
2943
|
+
pageCount: result.pageCount,
|
|
2944
|
+
reused: result.reused
|
|
2945
|
+
};
|
|
2946
|
+
}
|
|
2239
2947
|
const session = await createSourceContext(spec, input.env ?? process.env);
|
|
2240
2948
|
const { page } = session;
|
|
2241
2949
|
const queue = spec.startUrls.map((url) => canonicalizeUrl(url));
|
|
@@ -2362,6 +3070,9 @@ async function runSourceCanaryOnce(input) {
|
|
|
2362
3070
|
`Unknown source '${input.sourceId}'`
|
|
2363
3071
|
);
|
|
2364
3072
|
}
|
|
3073
|
+
if (isGitSourceSpec(spec)) {
|
|
3074
|
+
return runGitSourceCanary(input);
|
|
3075
|
+
}
|
|
2365
3076
|
const canary = resolveSourceCanary(spec);
|
|
2366
3077
|
const session = await createSourceContext(spec, input.env ?? process.env);
|
|
2367
3078
|
const { page } = session;
|
|
@@ -2412,7 +3123,7 @@ async function runSourceCanaryOnce(input) {
|
|
|
2412
3123
|
const result = {
|
|
2413
3124
|
sourceId: input.sourceId,
|
|
2414
3125
|
status: checks.every((check) => check.status === "pass") ? "pass" : "fail",
|
|
2415
|
-
checkedAt:
|
|
3126
|
+
checkedAt: nowIso3(),
|
|
2416
3127
|
summary: {
|
|
2417
3128
|
checkCount: checks.length,
|
|
2418
3129
|
passCount: checks.filter((check) => check.status === "pass").length,
|
|
@@ -2655,6 +3366,9 @@ var AiocsVectorStore = class {
|
|
|
2655
3366
|
pageUrl: point.pageUrl,
|
|
2656
3367
|
pageTitle: point.pageTitle,
|
|
2657
3368
|
sectionTitle: point.sectionTitle,
|
|
3369
|
+
pageKind: point.pageKind,
|
|
3370
|
+
filePath: point.filePath,
|
|
3371
|
+
language: point.language,
|
|
2658
3372
|
modelKey: input.modelKey
|
|
2659
3373
|
}
|
|
2660
3374
|
}));
|
|
@@ -2867,7 +3581,10 @@ async function processEmbeddingJobs(input) {
|
|
|
2867
3581
|
snapshotId: chunk.snapshotId,
|
|
2868
3582
|
pageUrl: chunk.pageUrl,
|
|
2869
3583
|
pageTitle: chunk.pageTitle,
|
|
2870
|
-
sectionTitle: chunk.sectionTitle
|
|
3584
|
+
sectionTitle: chunk.sectionTitle,
|
|
3585
|
+
pageKind: chunk.pageKind,
|
|
3586
|
+
filePath: chunk.filePath,
|
|
3587
|
+
language: chunk.language
|
|
2871
3588
|
}))
|
|
2872
3589
|
});
|
|
2873
3590
|
indexedChunkIds.push(...batch.map((chunk) => chunk.chunkId));
|
|
@@ -2960,7 +3677,7 @@ function getHybridRuntimeConfig(env = process.env) {
|
|
|
2960
3677
|
// src/spec/source-spec-files.ts
|
|
2961
3678
|
import { access, readdir } from "fs/promises";
|
|
2962
3679
|
import { constants as fsConstants } from "fs";
|
|
2963
|
-
import { extname as extname2, join as
|
|
3680
|
+
import { extname as extname2, join as join6, resolve as resolve4 } from "path";
|
|
2964
3681
|
var SOURCE_SPEC_EXTENSIONS = /* @__PURE__ */ new Set([".yaml", ".yml", ".json"]);
|
|
2965
3682
|
function uniqueResolvedPaths(paths) {
|
|
2966
3683
|
const seen = /* @__PURE__ */ new Set();
|
|
@@ -2987,7 +3704,7 @@ async function walkSourceSpecFiles(rootDir) {
|
|
|
2987
3704
|
const entries = await readdir(rootDir, { withFileTypes: true });
|
|
2988
3705
|
const discovered = [];
|
|
2989
3706
|
for (const entry of entries.sort((left, right) => left.name.localeCompare(right.name))) {
|
|
2990
|
-
const entryPath =
|
|
3707
|
+
const entryPath = join6(rootDir, entry.name);
|
|
2991
3708
|
if (entry.isDirectory()) {
|
|
2992
3709
|
discovered.push(...await walkSourceSpecFiles(entryPath));
|
|
2993
3710
|
continue;
|
|
@@ -3007,7 +3724,7 @@ var DEFAULT_INTERVAL_MINUTES = 60;
|
|
|
3007
3724
|
var DEFAULT_CONTAINER_SOURCE_DIR = "/app/sources";
|
|
3008
3725
|
var BOOLEAN_TRUE_VALUES = /* @__PURE__ */ new Set(["1", "true", "yes", "on"]);
|
|
3009
3726
|
var BOOLEAN_FALSE_VALUES = /* @__PURE__ */ new Set(["0", "false", "no", "off"]);
|
|
3010
|
-
function
|
|
3727
|
+
function nowIso4() {
|
|
3011
3728
|
return (/* @__PURE__ */ new Date()).toISOString();
|
|
3012
3729
|
}
|
|
3013
3730
|
function parsePositiveInteger2(raw, variableName) {
|
|
@@ -3033,7 +3750,7 @@ function parseBoolean(raw, variableName) {
|
|
|
3033
3750
|
function parseDaemonConfig(env, options = {}) {
|
|
3034
3751
|
const intervalMinutes = env.AIOCS_DAEMON_INTERVAL_MINUTES ? parsePositiveInteger2(env.AIOCS_DAEMON_INTERVAL_MINUTES, "AIOCS_DAEMON_INTERVAL_MINUTES") : DEFAULT_INTERVAL_MINUTES;
|
|
3035
3752
|
const fetchOnStart = env.AIOCS_DAEMON_FETCH_ON_START ? parseBoolean(env.AIOCS_DAEMON_FETCH_ON_START, "AIOCS_DAEMON_FETCH_ON_START") : true;
|
|
3036
|
-
const defaultContainerSourceDir = options.containerSourceDir ?? (
|
|
3753
|
+
const defaultContainerSourceDir = options.containerSourceDir ?? (existsSync3(DEFAULT_CONTAINER_SOURCE_DIR) ? DEFAULT_CONTAINER_SOURCE_DIR : void 0);
|
|
3037
3754
|
const defaultSourceDirs = uniqueResolvedPaths([
|
|
3038
3755
|
options.bundledSourceDir ?? getBundledSourcesDir(),
|
|
3039
3756
|
options.userSourceDir ?? getAiocsSourcesDir(env),
|
|
@@ -3097,7 +3814,7 @@ async function bootstrapSourceSpecs(input) {
|
|
|
3097
3814
|
};
|
|
3098
3815
|
}
|
|
3099
3816
|
async function runDaemonCycle(input) {
|
|
3100
|
-
const startedAt =
|
|
3817
|
+
const startedAt = nowIso4();
|
|
3101
3818
|
const bootstrapped = await bootstrapSourceSpecs({
|
|
3102
3819
|
catalog: input.catalog,
|
|
3103
3820
|
sourceSpecDirs: input.sourceSpecDirs,
|
|
@@ -3127,6 +3844,7 @@ async function runDaemonCycle(input) {
|
|
|
3127
3844
|
const result = await runSourceCanary({
|
|
3128
3845
|
catalog: input.catalog,
|
|
3129
3846
|
sourceId,
|
|
3847
|
+
dataDir: input.dataDir,
|
|
3130
3848
|
env: process.env
|
|
3131
3849
|
});
|
|
3132
3850
|
canaried.push({
|
|
@@ -3185,7 +3903,7 @@ async function runDaemonCycle(input) {
|
|
|
3185
3903
|
}
|
|
3186
3904
|
return {
|
|
3187
3905
|
startedAt,
|
|
3188
|
-
finishedAt:
|
|
3906
|
+
finishedAt: nowIso4(),
|
|
3189
3907
|
dueSourceIds,
|
|
3190
3908
|
canaryDueSourceIds,
|
|
3191
3909
|
bootstrapped,
|
|
@@ -3201,7 +3919,7 @@ async function startDaemon(input) {
|
|
|
3201
3919
|
const intervalMs = input.config.intervalMinutes * 6e4;
|
|
3202
3920
|
input.catalog.resetRunningEmbeddingJobs();
|
|
3203
3921
|
input.catalog.markDaemonStarted({
|
|
3204
|
-
startedAt:
|
|
3922
|
+
startedAt: nowIso4(),
|
|
3205
3923
|
intervalMinutes: input.config.intervalMinutes,
|
|
3206
3924
|
fetchOnStart: input.config.fetchOnStart
|
|
3207
3925
|
});
|
|
@@ -3212,7 +3930,7 @@ async function startDaemon(input) {
|
|
|
3212
3930
|
sourceSpecDirs: input.config.sourceSpecDirs
|
|
3213
3931
|
});
|
|
3214
3932
|
const runCycle = async (reason) => {
|
|
3215
|
-
const startedAt =
|
|
3933
|
+
const startedAt = nowIso4();
|
|
3216
3934
|
input.catalog.markDaemonCycleStarted(startedAt);
|
|
3217
3935
|
input.logger.emit({
|
|
3218
3936
|
type: "daemon.cycle.started",
|
|
@@ -3238,7 +3956,7 @@ async function startDaemon(input) {
|
|
|
3238
3956
|
});
|
|
3239
3957
|
} catch (error) {
|
|
3240
3958
|
input.catalog.markDaemonCycleCompleted({
|
|
3241
|
-
completedAt:
|
|
3959
|
+
completedAt: nowIso4(),
|
|
3242
3960
|
status: "failed"
|
|
3243
3961
|
});
|
|
3244
3962
|
throw error;
|
|
@@ -3269,7 +3987,7 @@ async function startDaemon(input) {
|
|
|
3269
3987
|
// package.json
|
|
3270
3988
|
var package_default = {
|
|
3271
3989
|
name: "@bodhi-ventures/aiocs",
|
|
3272
|
-
version: "0.
|
|
3990
|
+
version: "0.2.0",
|
|
3273
3991
|
license: "MIT",
|
|
3274
3992
|
type: "module",
|
|
3275
3993
|
description: "Local-only documentation store, fetcher, and search CLI for AI agents.",
|
|
@@ -3317,28 +4035,28 @@ var package_default = {
|
|
|
3317
4035
|
"test:watch": "vitest"
|
|
3318
4036
|
},
|
|
3319
4037
|
dependencies: {
|
|
3320
|
-
"@modelcontextprotocol/sdk": "
|
|
3321
|
-
"@mozilla/readability": "
|
|
4038
|
+
"@modelcontextprotocol/sdk": "1.28.0",
|
|
4039
|
+
"@mozilla/readability": "0.6.0",
|
|
3322
4040
|
"@qdrant/js-client-rest": "1.17.0",
|
|
3323
|
-
"better-sqlite3": "
|
|
3324
|
-
commander: "
|
|
3325
|
-
jsdom: "
|
|
3326
|
-
playwright: "
|
|
3327
|
-
turndown: "
|
|
3328
|
-
"turndown-plugin-gfm": "
|
|
3329
|
-
yaml: "
|
|
3330
|
-
zod: "
|
|
4041
|
+
"better-sqlite3": "12.4.1",
|
|
4042
|
+
commander: "14.0.1",
|
|
4043
|
+
jsdom: "27.0.1",
|
|
4044
|
+
playwright: "1.57.0",
|
|
4045
|
+
turndown: "7.2.1",
|
|
4046
|
+
"turndown-plugin-gfm": "1.0.2",
|
|
4047
|
+
yaml: "2.8.1",
|
|
4048
|
+
zod: "4.1.12"
|
|
3331
4049
|
},
|
|
3332
4050
|
devDependencies: {
|
|
3333
|
-
"@types/better-sqlite3": "
|
|
3334
|
-
"@types/jsdom": "
|
|
3335
|
-
"@types/node": "
|
|
3336
|
-
"@types/turndown": "
|
|
3337
|
-
execa: "
|
|
3338
|
-
tsup: "
|
|
3339
|
-
tsx: "
|
|
3340
|
-
typescript: "
|
|
3341
|
-
vitest: "
|
|
4051
|
+
"@types/better-sqlite3": "7.6.13",
|
|
4052
|
+
"@types/jsdom": "21.1.7",
|
|
4053
|
+
"@types/node": "24.7.2",
|
|
4054
|
+
"@types/turndown": "5.0.5",
|
|
4055
|
+
execa: "9.6.0",
|
|
4056
|
+
tsup: "8.5.0",
|
|
4057
|
+
tsx: "4.20.6",
|
|
4058
|
+
typescript: "5.9.3",
|
|
4059
|
+
vitest: "3.2.4"
|
|
3342
4060
|
}
|
|
3343
4061
|
};
|
|
3344
4062
|
|
|
@@ -3352,7 +4070,7 @@ import { resolve as resolve8 } from "path";
|
|
|
3352
4070
|
|
|
3353
4071
|
// src/backup.ts
|
|
3354
4072
|
import { cp, mkdir, readdir as readdir2, readFile as readFile2, rename, rm, stat, writeFile } from "fs/promises";
|
|
3355
|
-
import { basename, dirname as
|
|
4073
|
+
import { basename, dirname as dirname3, join as join7, resolve as resolve6 } from "path";
|
|
3356
4074
|
import { randomUUID as randomUUID2 } from "crypto";
|
|
3357
4075
|
import Database2 from "better-sqlite3";
|
|
3358
4076
|
var CATALOG_DB_FILENAME = "catalog.sqlite";
|
|
@@ -3380,7 +4098,7 @@ async function isDirectoryEmpty(path) {
|
|
|
3380
4098
|
return (await readdir2(path)).length === 0;
|
|
3381
4099
|
}
|
|
3382
4100
|
async function listEntries(root, relativePath = "") {
|
|
3383
|
-
const absolutePath = relativePath ?
|
|
4101
|
+
const absolutePath = relativePath ? join7(root, relativePath) : root;
|
|
3384
4102
|
const stats = await stat(absolutePath);
|
|
3385
4103
|
if (!stats.isDirectory()) {
|
|
3386
4104
|
return [{
|
|
@@ -3396,7 +4114,7 @@ async function listEntries(root, relativePath = "") {
|
|
|
3396
4114
|
size: 0
|
|
3397
4115
|
}] : [];
|
|
3398
4116
|
for (const childName of childNames.sort()) {
|
|
3399
|
-
entries.push(...await listEntries(root, relativePath ?
|
|
4117
|
+
entries.push(...await listEntries(root, relativePath ? join7(relativePath, childName) : childName));
|
|
3400
4118
|
}
|
|
3401
4119
|
return entries;
|
|
3402
4120
|
}
|
|
@@ -3410,12 +4128,12 @@ async function copyIfPresent(from, to, entries, relativePrefix) {
|
|
|
3410
4128
|
entries.push(
|
|
3411
4129
|
...copiedEntries.map((entry) => ({
|
|
3412
4130
|
...entry,
|
|
3413
|
-
relativePath:
|
|
4131
|
+
relativePath: join7(relativePrefix, entry.relativePath)
|
|
3414
4132
|
}))
|
|
3415
4133
|
);
|
|
3416
4134
|
}
|
|
3417
4135
|
async function copyDataDirForBackup(from, to) {
|
|
3418
|
-
const sourceCatalogPath =
|
|
4136
|
+
const sourceCatalogPath = join7(from, CATALOG_DB_FILENAME);
|
|
3419
4137
|
if (!await pathExists2(sourceCatalogPath)) {
|
|
3420
4138
|
throw new AiocsError(
|
|
3421
4139
|
AIOCS_ERROR_CODES.backupSourceMissing,
|
|
@@ -3431,10 +4149,13 @@ async function copyDataDirForBackup(from, to) {
|
|
|
3431
4149
|
if (name === CATALOG_DB_FILENAME) {
|
|
3432
4150
|
return false;
|
|
3433
4151
|
}
|
|
4152
|
+
if (name === "git-mirrors") {
|
|
4153
|
+
return false;
|
|
4154
|
+
}
|
|
3434
4155
|
return !SQLITE_SIDE_CAR_SUFFIXES.some((suffix) => name === `${CATALOG_DB_FILENAME}${suffix}`);
|
|
3435
4156
|
}
|
|
3436
4157
|
});
|
|
3437
|
-
const targetCatalogPath =
|
|
4158
|
+
const targetCatalogPath = join7(to, CATALOG_DB_FILENAME);
|
|
3438
4159
|
const sourceCatalog = new Database2(sourceCatalogPath, { readonly: true });
|
|
3439
4160
|
try {
|
|
3440
4161
|
await sourceCatalog.backup(targetCatalogPath);
|
|
@@ -3443,7 +4164,7 @@ async function copyDataDirForBackup(from, to) {
|
|
|
3443
4164
|
}
|
|
3444
4165
|
}
|
|
3445
4166
|
async function loadValidatedBackupPayload(inputDir) {
|
|
3446
|
-
const manifestPath =
|
|
4167
|
+
const manifestPath = join7(inputDir, "manifest.json");
|
|
3447
4168
|
await assertSourceDirExists(inputDir);
|
|
3448
4169
|
if (!await pathExists2(manifestPath)) {
|
|
3449
4170
|
throw new AiocsError(
|
|
@@ -3458,21 +4179,21 @@ async function loadValidatedBackupPayload(inputDir) {
|
|
|
3458
4179
|
`Invalid backup manifest: ${manifestPath}`
|
|
3459
4180
|
);
|
|
3460
4181
|
}
|
|
3461
|
-
const backupDataDir =
|
|
4182
|
+
const backupDataDir = join7(inputDir, "data");
|
|
3462
4183
|
if (!await pathExists2(backupDataDir)) {
|
|
3463
4184
|
throw new AiocsError(
|
|
3464
4185
|
AIOCS_ERROR_CODES.backupInvalid,
|
|
3465
4186
|
`Backup payload is missing the data directory: ${backupDataDir}`
|
|
3466
4187
|
);
|
|
3467
4188
|
}
|
|
3468
|
-
const backupCatalogPath =
|
|
4189
|
+
const backupCatalogPath = join7(backupDataDir, CATALOG_DB_FILENAME);
|
|
3469
4190
|
if (!await pathExists2(backupCatalogPath)) {
|
|
3470
4191
|
throw new AiocsError(
|
|
3471
4192
|
AIOCS_ERROR_CODES.backupInvalid,
|
|
3472
4193
|
`Backup payload is missing the catalog database: ${backupCatalogPath}`
|
|
3473
4194
|
);
|
|
3474
4195
|
}
|
|
3475
|
-
const backupConfigDir =
|
|
4196
|
+
const backupConfigDir = join7(inputDir, "config");
|
|
3476
4197
|
return {
|
|
3477
4198
|
manifest,
|
|
3478
4199
|
backupDataDir,
|
|
@@ -3480,8 +4201,8 @@ async function loadValidatedBackupPayload(inputDir) {
|
|
|
3480
4201
|
};
|
|
3481
4202
|
}
|
|
3482
4203
|
async function prepareReplacementTarget(backupDir, targetDir) {
|
|
3483
|
-
const parentDir =
|
|
3484
|
-
const stagingDir =
|
|
4204
|
+
const parentDir = dirname3(targetDir);
|
|
4205
|
+
const stagingDir = join7(parentDir, `.${basename(targetDir)}.import-${randomUUID2()}`);
|
|
3485
4206
|
await rm(stagingDir, { recursive: true, force: true });
|
|
3486
4207
|
await mkdir(parentDir, { recursive: true });
|
|
3487
4208
|
await cp(backupDir, stagingDir, { recursive: true, force: true });
|
|
@@ -3503,13 +4224,13 @@ async function exportBackup(input) {
|
|
|
3503
4224
|
}
|
|
3504
4225
|
await mkdir(outputDir, { recursive: true });
|
|
3505
4226
|
const entries = [];
|
|
3506
|
-
await copyDataDirForBackup(dataDir,
|
|
3507
|
-
entries.push(...(await listEntries(
|
|
4227
|
+
await copyDataDirForBackup(dataDir, join7(outputDir, "data"));
|
|
4228
|
+
entries.push(...(await listEntries(join7(outputDir, "data"))).map((entry) => ({
|
|
3508
4229
|
...entry,
|
|
3509
|
-
relativePath:
|
|
4230
|
+
relativePath: join7("data", entry.relativePath)
|
|
3510
4231
|
})));
|
|
3511
4232
|
if (configDir) {
|
|
3512
|
-
await copyIfPresent(configDir,
|
|
4233
|
+
await copyIfPresent(configDir, join7(outputDir, "config"), entries, "config");
|
|
3513
4234
|
}
|
|
3514
4235
|
const manifest = {
|
|
3515
4236
|
formatVersion: 1,
|
|
@@ -3517,7 +4238,7 @@ async function exportBackup(input) {
|
|
|
3517
4238
|
packageVersion,
|
|
3518
4239
|
entries
|
|
3519
4240
|
};
|
|
3520
|
-
const manifestPath =
|
|
4241
|
+
const manifestPath = join7(outputDir, "manifest.json");
|
|
3521
4242
|
await writeFile(manifestPath, JSON.stringify(manifest, null, 2), "utf8");
|
|
3522
4243
|
return {
|
|
3523
4244
|
outputDir,
|
|
@@ -3693,9 +4414,9 @@ async function verifyCoverageAgainstReferences(corpus, referenceFiles) {
|
|
|
3693
4414
|
|
|
3694
4415
|
// src/doctor.ts
|
|
3695
4416
|
import { access as access2 } from "fs/promises";
|
|
3696
|
-
import { execFile } from "child_process";
|
|
3697
|
-
import { promisify } from "util";
|
|
3698
|
-
var
|
|
4417
|
+
import { execFile as execFile2 } from "child_process";
|
|
4418
|
+
import { promisify as promisify2 } from "util";
|
|
4419
|
+
var execFileAsync2 = promisify2(execFile2);
|
|
3699
4420
|
function summarize(checks) {
|
|
3700
4421
|
const passCount = checks.filter((check) => check.status === "pass").length;
|
|
3701
4422
|
const warnCount = checks.filter((check) => check.status === "warn").length;
|
|
@@ -3782,6 +4503,25 @@ async function checkPlaywright() {
|
|
|
3782
4503
|
};
|
|
3783
4504
|
}
|
|
3784
4505
|
}
|
|
4506
|
+
async function checkGit() {
|
|
4507
|
+
try {
|
|
4508
|
+
const { stdout } = await execFileAsync2("git", ["--version"]);
|
|
4509
|
+
return {
|
|
4510
|
+
id: "git",
|
|
4511
|
+
status: "pass",
|
|
4512
|
+
summary: "Git executable is available.",
|
|
4513
|
+
details: {
|
|
4514
|
+
version: stdout.trim()
|
|
4515
|
+
}
|
|
4516
|
+
};
|
|
4517
|
+
} catch (error) {
|
|
4518
|
+
return {
|
|
4519
|
+
id: "git",
|
|
4520
|
+
status: "fail",
|
|
4521
|
+
summary: `Git is not ready: ${toErrorMessage(error)}`
|
|
4522
|
+
};
|
|
4523
|
+
}
|
|
4524
|
+
}
|
|
3785
4525
|
async function checkDaemonConfig(env) {
|
|
3786
4526
|
try {
|
|
3787
4527
|
const daemonConfig = parseDaemonConfig(env, {
|
|
@@ -4031,7 +4771,7 @@ async function checkEmbeddings(env) {
|
|
|
4031
4771
|
}
|
|
4032
4772
|
async function checkDocker() {
|
|
4033
4773
|
try {
|
|
4034
|
-
const { stdout } = await
|
|
4774
|
+
const { stdout } = await execFileAsync2("docker", ["info", "--format", "{{json .ServerVersion}}"]);
|
|
4035
4775
|
const version = JSON.parse(stdout.trim());
|
|
4036
4776
|
return {
|
|
4037
4777
|
id: "docker",
|
|
@@ -4047,7 +4787,7 @@ async function checkDocker() {
|
|
|
4047
4787
|
return {
|
|
4048
4788
|
id: "docker",
|
|
4049
4789
|
status: "warn",
|
|
4050
|
-
summary: "Docker CLI is not installed; Docker-based daemon deployment is unavailable
|
|
4790
|
+
summary: "Docker CLI is not installed; Docker-based daemon deployment is unavailable in this environment."
|
|
4051
4791
|
};
|
|
4052
4792
|
}
|
|
4053
4793
|
return {
|
|
@@ -4059,6 +4799,7 @@ async function checkDocker() {
|
|
|
4059
4799
|
}
|
|
4060
4800
|
async function runDoctor(env = process.env) {
|
|
4061
4801
|
const catalogCheck = await checkCatalog(env);
|
|
4802
|
+
const gitCheck = await checkGit();
|
|
4062
4803
|
const playwrightCheck = await checkPlaywright();
|
|
4063
4804
|
const { daemonConfigCheck, daemonConfig } = await checkDaemonConfig(env);
|
|
4064
4805
|
const sourceSpecDirsCheck = await checkSourceSpecDirs(daemonConfig);
|
|
@@ -4070,6 +4811,7 @@ async function runDoctor(env = process.env) {
|
|
|
4070
4811
|
const dockerCheck = await checkDocker();
|
|
4071
4812
|
const checks = [
|
|
4072
4813
|
catalogCheck,
|
|
4814
|
+
gitCheck,
|
|
4073
4815
|
playwrightCheck,
|
|
4074
4816
|
daemonConfigCheck,
|
|
4075
4817
|
sourceSpecDirsCheck,
|
|
@@ -4124,6 +4866,19 @@ function withScores(rows, scoreLookup) {
|
|
|
4124
4866
|
};
|
|
4125
4867
|
});
|
|
4126
4868
|
}
|
|
4869
|
+
function matchesChunkFilters(row, filters) {
|
|
4870
|
+
if (filters.pathPatterns && filters.pathPatterns.length > 0) {
|
|
4871
|
+
if (!row.filePath || !matchesPatterns(row.filePath, filters.pathPatterns)) {
|
|
4872
|
+
return false;
|
|
4873
|
+
}
|
|
4874
|
+
}
|
|
4875
|
+
if (filters.languages && filters.languages.length > 0) {
|
|
4876
|
+
if (!row.language || !filters.languages.includes(row.language.toLowerCase())) {
|
|
4877
|
+
return false;
|
|
4878
|
+
}
|
|
4879
|
+
}
|
|
4880
|
+
return true;
|
|
4881
|
+
}
|
|
4127
4882
|
async function searchHybridCatalog(input) {
|
|
4128
4883
|
const scope = input.catalog.resolveSearchScope({
|
|
4129
4884
|
query: input.query,
|
|
@@ -4131,6 +4886,8 @@ async function searchHybridCatalog(input) {
|
|
|
4131
4886
|
...input.searchInput.sourceIds ? { sourceIds: input.searchInput.sourceIds } : {},
|
|
4132
4887
|
...input.searchInput.snapshotId ? { snapshotId: input.searchInput.snapshotId } : {},
|
|
4133
4888
|
...input.searchInput.all ? { all: true } : {},
|
|
4889
|
+
...input.searchInput.pathPatterns ? { pathPatterns: input.searchInput.pathPatterns } : {},
|
|
4890
|
+
...input.searchInput.languages ? { languages: input.searchInput.languages } : {},
|
|
4134
4891
|
...typeof input.searchInput.limit === "number" ? { limit: input.searchInput.limit } : {},
|
|
4135
4892
|
...typeof input.searchInput.offset === "number" ? { offset: input.searchInput.offset } : {}
|
|
4136
4893
|
});
|
|
@@ -4193,13 +4950,25 @@ async function searchHybridCatalog(input) {
|
|
|
4193
4950
|
);
|
|
4194
4951
|
}
|
|
4195
4952
|
const vectorStore = new AiocsVectorStore(input.config);
|
|
4196
|
-
|
|
4953
|
+
const rawVectorCandidates = await vectorStore.search({
|
|
4197
4954
|
vector: queryVector,
|
|
4198
4955
|
snapshotIds: scope.snapshotIds,
|
|
4199
4956
|
sourceIds: scope.sourceIds,
|
|
4200
4957
|
modelKey,
|
|
4201
4958
|
limit: windowSize(scope.limit, scope.offset, input.config.vectorCandidateWindow)
|
|
4202
4959
|
});
|
|
4960
|
+
if (rawVectorCandidates.length > 0 && (scope.pathPatterns || scope.languages)) {
|
|
4961
|
+
const candidateRows = input.catalog.getChunksByIds(rawVectorCandidates.map((candidate) => candidate.chunkId));
|
|
4962
|
+
const allowedIds = new Set(
|
|
4963
|
+
candidateRows.filter((row) => matchesChunkFilters(row, {
|
|
4964
|
+
pathPatterns: scope.pathPatterns,
|
|
4965
|
+
languages: scope.languages
|
|
4966
|
+
})).map((row) => row.chunkId)
|
|
4967
|
+
);
|
|
4968
|
+
vectorCandidates = rawVectorCandidates.filter((candidate) => allowedIds.has(candidate.chunkId));
|
|
4969
|
+
} else {
|
|
4970
|
+
vectorCandidates = rawVectorCandidates;
|
|
4971
|
+
}
|
|
4203
4972
|
} catch (error) {
|
|
4204
4973
|
if (input.mode === "auto") {
|
|
4205
4974
|
return lexicalOnly();
|
|
@@ -4355,7 +5124,7 @@ async function refreshDueSources(sourceIdOrAll = "all") {
|
|
|
4355
5124
|
return { results };
|
|
4356
5125
|
}
|
|
4357
5126
|
async function runSourceCanaries(sourceIdOrAll) {
|
|
4358
|
-
const results = await withCatalog(async ({ catalog }) => {
|
|
5127
|
+
const results = await withCatalog(async ({ catalog, dataDir }) => {
|
|
4359
5128
|
const sourceIds = sourceIdOrAll === "all" ? catalog.listSources().map((item) => item.id) : [sourceIdOrAll];
|
|
4360
5129
|
if (sourceIds.length === 0) {
|
|
4361
5130
|
return [];
|
|
@@ -4365,6 +5134,7 @@ async function runSourceCanaries(sourceIdOrAll) {
|
|
|
4365
5134
|
canaried.push(await runSourceCanary({
|
|
4366
5135
|
catalog,
|
|
4367
5136
|
sourceId,
|
|
5137
|
+
dataDir,
|
|
4368
5138
|
env: process.env
|
|
4369
5139
|
}));
|
|
4370
5140
|
}
|
|
@@ -4424,6 +5194,8 @@ async function searchCatalog(query, options) {
|
|
|
4424
5194
|
...explicitSources ? { sourceIds: options.source } : {},
|
|
4425
5195
|
...options.snapshot ? { snapshotId: options.snapshot } : {},
|
|
4426
5196
|
...options.all ? { all: true } : {},
|
|
5197
|
+
...options.path && options.path.length > 0 ? { pathPatterns: options.path } : {},
|
|
5198
|
+
...options.language && options.language.length > 0 ? { languages: options.language } : {},
|
|
4427
5199
|
...typeof options.limit === "number" ? { limit: options.limit } : {},
|
|
4428
5200
|
...typeof options.offset === "number" ? { offset: options.offset } : {}
|
|
4429
5201
|
}
|