@bodhi-ventures/aiocs 0.1.1 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -43,50 +43,198 @@ function toAiocsError(error) {
43
43
  return new AiocsError(AIOCS_ERROR_CODES.internalError, String(error));
44
44
  }
45
45
 
46
- // src/catalog/catalog.ts
46
+ // src/runtime/paths.ts
47
+ import { homedir } from "os";
48
+ import { join as join2, relative, resolve, sep } from "path";
47
49
  import { mkdirSync } from "fs";
48
- import { join, resolve as resolve2 } from "path";
50
+
51
+ // src/runtime/bundled-sources.ts
52
+ import { existsSync } from "fs";
53
+ import { dirname, join } from "path";
54
+ import { fileURLToPath } from "url";
55
+ function findPackageRoot(startDir) {
56
+ let currentDir = startDir;
57
+ while (true) {
58
+ if (existsSync(join(currentDir, "package.json")) && existsSync(join(currentDir, "sources"))) {
59
+ return currentDir;
60
+ }
61
+ const parentDir = dirname(currentDir);
62
+ if (parentDir === currentDir) {
63
+ throw new Error(`Could not locate aiocs package root from ${startDir}`);
64
+ }
65
+ currentDir = parentDir;
66
+ }
67
+ }
68
+ function getBundledSourcesDir() {
69
+ const currentFilePath = fileURLToPath(import.meta.url);
70
+ const packageRoot = findPackageRoot(dirname(currentFilePath));
71
+ return join(packageRoot, "sources");
72
+ }
73
+
74
+ // src/runtime/paths.ts
75
+ var PORTABLE_USER_SOURCES_PREFIX = "~/.aiocs/sources";
76
+ var PORTABLE_BUNDLED_SOURCES_PREFIX = "aiocs://bundled";
77
+ var CONTAINER_USER_SOURCES_DIR = "/root/.aiocs/sources";
78
+ var CONTAINER_BUNDLED_SOURCES_DIR = "/app/sources";
79
+ function expandTilde(path) {
80
+ if (path === "~") {
81
+ return homedir();
82
+ }
83
+ if (path.startsWith("~/")) {
84
+ return join2(homedir(), path.slice(2));
85
+ }
86
+ return path;
87
+ }
88
+ function getAiocsDataDir(env = process.env) {
89
+ const override = env.AIOCS_DATA_DIR;
90
+ if (override) {
91
+ mkdirSync(expandTilde(override), { recursive: true });
92
+ return expandTilde(override);
93
+ }
94
+ const target = join2(homedir(), ".aiocs", "data");
95
+ mkdirSync(target, { recursive: true });
96
+ return target;
97
+ }
98
+ function getAiocsConfigDir(env = process.env) {
99
+ const override = env.AIOCS_CONFIG_DIR;
100
+ if (override) {
101
+ mkdirSync(expandTilde(override), { recursive: true });
102
+ return expandTilde(override);
103
+ }
104
+ const target = join2(homedir(), ".aiocs", "config");
105
+ mkdirSync(target, { recursive: true });
106
+ return target;
107
+ }
108
+ function getAiocsSourcesDir(env = process.env) {
109
+ const override = env.AIOCS_SOURCES_DIR;
110
+ if (override) {
111
+ mkdirSync(expandTilde(override), { recursive: true });
112
+ return expandTilde(override);
113
+ }
114
+ const target = join2(homedir(), ".aiocs", "sources");
115
+ mkdirSync(target, { recursive: true });
116
+ return target;
117
+ }
118
+ function isWithinRoot(candidatePath, rootPath) {
119
+ return candidatePath === rootPath || candidatePath.startsWith(`${rootPath}${sep}`);
120
+ }
121
+ function toPortablePath(prefix, rootPath, candidatePath) {
122
+ const relativePath = relative(rootPath, candidatePath).split(sep).join("/");
123
+ return relativePath ? `${prefix}/${relativePath}` : prefix;
124
+ }
125
+ function canonicalizeManagedSpecPath(specPath, env = process.env) {
126
+ if (specPath === PORTABLE_USER_SOURCES_PREFIX || specPath.startsWith(`${PORTABLE_USER_SOURCES_PREFIX}/`) || specPath === PORTABLE_BUNDLED_SOURCES_PREFIX || specPath.startsWith(`${PORTABLE_BUNDLED_SOURCES_PREFIX}/`)) {
127
+ return specPath;
128
+ }
129
+ const resolvedPath = resolve(specPath);
130
+ const userRoots = [resolve(getAiocsSourcesDir(env)), CONTAINER_USER_SOURCES_DIR];
131
+ for (const rootPath of userRoots) {
132
+ if (isWithinRoot(resolvedPath, rootPath)) {
133
+ return toPortablePath(PORTABLE_USER_SOURCES_PREFIX, rootPath, resolvedPath);
134
+ }
135
+ }
136
+ const bundledRoots = [resolve(getBundledSourcesDir()), CONTAINER_BUNDLED_SOURCES_DIR];
137
+ for (const rootPath of bundledRoots) {
138
+ if (isWithinRoot(resolvedPath, rootPath)) {
139
+ return toPortablePath(PORTABLE_BUNDLED_SOURCES_PREFIX, rootPath, resolvedPath);
140
+ }
141
+ }
142
+ return resolvedPath;
143
+ }
144
+
145
+ // src/catalog/catalog.ts
146
+ import { mkdirSync as mkdirSync2 } from "fs";
147
+ import { join as join3, resolve as resolve3 } from "path";
49
148
  import { randomUUID } from "crypto";
50
149
  import Database from "better-sqlite3";
51
150
 
52
151
  // src/catalog/chunking.ts
53
152
  var MAX_CHUNK_BYTES = 16384;
153
+ var CHUNK_OVERLAP_LINES = 6;
54
154
  var HEADING_PATTERN = /^(#{1,6})\s+(.*)$/;
55
155
  function byteLength(value) {
56
156
  return Buffer.byteLength(value, "utf8");
57
157
  }
158
+ function normalizeLanguage(filePath, language) {
159
+ if (language) {
160
+ return language.toLowerCase();
161
+ }
162
+ if (!filePath) {
163
+ return null;
164
+ }
165
+ const lower = filePath.toLowerCase();
166
+ if (lower.endsWith(".md") || lower.endsWith(".mdx")) {
167
+ return "markdown";
168
+ }
169
+ if (lower.endsWith(".ts")) {
170
+ return "typescript";
171
+ }
172
+ if (lower.endsWith(".tsx")) {
173
+ return "tsx";
174
+ }
175
+ if (lower.endsWith(".js")) {
176
+ return "javascript";
177
+ }
178
+ if (lower.endsWith(".jsx")) {
179
+ return "jsx";
180
+ }
181
+ if (lower.endsWith(".json")) {
182
+ return "json";
183
+ }
184
+ if (lower.endsWith(".yaml") || lower.endsWith(".yml")) {
185
+ return "yaml";
186
+ }
187
+ if (lower.endsWith(".toml")) {
188
+ return "toml";
189
+ }
190
+ if (lower.endsWith(".py")) {
191
+ return "python";
192
+ }
193
+ if (lower.endsWith(".rs")) {
194
+ return "rust";
195
+ }
196
+ if (lower.endsWith(".go")) {
197
+ return "go";
198
+ }
199
+ if (lower.endsWith(".sql")) {
200
+ return "sql";
201
+ }
202
+ if (lower.endsWith(".sh")) {
203
+ return "shell";
204
+ }
205
+ return null;
206
+ }
207
+ function flushChunk(chunks, sectionTitle, current, chunkOrder) {
208
+ const trimmed = current.trim();
209
+ if (!trimmed) {
210
+ return chunkOrder;
211
+ }
212
+ chunks.push({
213
+ sectionTitle,
214
+ markdown: trimmed,
215
+ chunkOrder
216
+ });
217
+ return chunkOrder + 1;
218
+ }
58
219
  function splitLargeSection(sectionTitle, markdown, startOrder) {
59
220
  const lines = markdown.split("\n");
60
221
  const chunks = [];
61
222
  let current = "";
62
223
  let order = startOrder;
63
- const flush = () => {
64
- const trimmed = current.trim();
65
- if (!trimmed) {
66
- current = "";
67
- return;
68
- }
69
- chunks.push({
70
- sectionTitle,
71
- markdown: trimmed,
72
- chunkOrder: order
73
- });
74
- order += 1;
75
- current = "";
76
- };
77
224
  for (const line of lines) {
78
225
  const next = current ? `${current}
79
226
  ${line}` : line;
80
227
  if (current && byteLength(next) > MAX_CHUNK_BYTES) {
81
- flush();
228
+ order = flushChunk(chunks, sectionTitle, current, order);
229
+ current = "";
82
230
  }
83
231
  current = current ? `${current}
84
232
  ${line}` : line;
85
233
  }
86
- flush();
234
+ flushChunk(chunks, sectionTitle, current, order);
87
235
  return chunks;
88
236
  }
89
- function chunkMarkdown(pageTitle, markdown) {
237
+ function chunkMarkdownSectioned(pageTitle, markdown) {
90
238
  const trimmed = markdown.trim();
91
239
  if (!trimmed) {
92
240
  return [];
@@ -134,6 +282,154 @@ function chunkMarkdown(pageTitle, markdown) {
134
282
  }
135
283
  return chunks;
136
284
  }
285
+ function symbolBoundary(line, language) {
286
+ const trimmed = line.trim();
287
+ if (!trimmed) {
288
+ return null;
289
+ }
290
+ const patterns = [];
291
+ switch (language) {
292
+ case "typescript":
293
+ case "tsx":
294
+ case "javascript":
295
+ case "jsx":
296
+ patterns.push(
297
+ /^(?:export\s+)?(?:default\s+)?(?:async\s+)?function\s+([A-Za-z0-9_$]+)/,
298
+ /^(?:export\s+)?(?:default\s+)?class\s+([A-Za-z0-9_$]+)/,
299
+ /^(?:export\s+)?(?:interface|type|enum)\s+([A-Za-z0-9_$]+)/,
300
+ /^(?:export\s+)?const\s+([A-Za-z0-9_$]+)\s*=/
301
+ );
302
+ break;
303
+ case "python":
304
+ patterns.push(/^(?:async\s+def|def|class)\s+([A-Za-z0-9_]+)/);
305
+ break;
306
+ case "rust":
307
+ patterns.push(/^(?:pub\s+)?(?:async\s+)?fn\s+([A-Za-z0-9_]+)/, /^(?:pub\s+)?(?:struct|enum|trait)\s+([A-Za-z0-9_]+)/);
308
+ break;
309
+ case "go":
310
+ patterns.push(/^func\s+([A-Za-z0-9_]+)/, /^type\s+([A-Za-z0-9_]+)/);
311
+ break;
312
+ case "json":
313
+ case "yaml":
314
+ case "toml":
315
+ patterns.push(/^["']?([A-Za-z0-9_.-]+)["']?\s*[:=]/);
316
+ break;
317
+ default:
318
+ patterns.push(/^(?:export\s+)?(?:async\s+)?function\s+([A-Za-z0-9_$]+)/, /^(?:class|interface|type|enum)\s+([A-Za-z0-9_$]+)/);
319
+ break;
320
+ }
321
+ for (const pattern of patterns) {
322
+ const match = trimmed.match(pattern);
323
+ if (match?.[1]) {
324
+ return match[1];
325
+ }
326
+ }
327
+ return null;
328
+ }
329
+ function discoverBoundaries(lines, title, language) {
330
+ const boundaries = [];
331
+ lines.forEach((line, index) => {
332
+ const symbol = symbolBoundary(line, language);
333
+ if (symbol) {
334
+ boundaries.push({
335
+ index,
336
+ title: symbol
337
+ });
338
+ }
339
+ });
340
+ if (boundaries.length === 0 || boundaries[0].index !== 0) {
341
+ boundaries.unshift({
342
+ index: 0,
343
+ title
344
+ });
345
+ }
346
+ return boundaries;
347
+ }
348
+ function buildWindowTitle(title, startLine, endLine) {
349
+ return `${title} (${startLine}-${endLine})`;
350
+ }
351
+ function chunkLineWindows(title, content, startOrder) {
352
+ const lines = content.split("\n");
353
+ const chunks = [];
354
+ let start = 0;
355
+ let order = startOrder;
356
+ while (start < lines.length) {
357
+ let end = start;
358
+ let current = "";
359
+ while (end < lines.length) {
360
+ const candidate = current ? `${current}
361
+ ${lines[end]}` : lines[end];
362
+ if (current && byteLength(candidate) > MAX_CHUNK_BYTES) {
363
+ break;
364
+ }
365
+ current = candidate;
366
+ end += 1;
367
+ }
368
+ const trimmed = current.trim();
369
+ if (!trimmed) {
370
+ break;
371
+ }
372
+ chunks.push({
373
+ sectionTitle: buildWindowTitle(title, start + 1, end),
374
+ markdown: trimmed,
375
+ chunkOrder: order
376
+ });
377
+ order += 1;
378
+ if (end >= lines.length) {
379
+ break;
380
+ }
381
+ start = Math.max(start + 1, end - CHUNK_OVERLAP_LINES);
382
+ }
383
+ return chunks;
384
+ }
385
+ function chunkByBoundaries(input, language) {
386
+ const trimmed = input.content.trim();
387
+ if (!trimmed) {
388
+ return [];
389
+ }
390
+ if (byteLength(trimmed) <= MAX_CHUNK_BYTES) {
391
+ return [{ sectionTitle: input.title, markdown: trimmed, chunkOrder: 0 }];
392
+ }
393
+ const lines = trimmed.split("\n");
394
+ const boundaries = discoverBoundaries(lines, input.title, language);
395
+ const chunks = [];
396
+ let order = 0;
397
+ for (let index = 0; index < boundaries.length; index += 1) {
398
+ const boundary = boundaries[index];
399
+ const nextIndex = boundaries[index + 1]?.index ?? lines.length;
400
+ const sectionLines = lines.slice(boundary.index, nextIndex);
401
+ const sectionContent = sectionLines.join("\n").trim();
402
+ if (!sectionContent) {
403
+ continue;
404
+ }
405
+ if (byteLength(sectionContent) <= MAX_CHUNK_BYTES) {
406
+ chunks.push({
407
+ sectionTitle: boundary.title,
408
+ markdown: sectionContent,
409
+ chunkOrder: order
410
+ });
411
+ order += 1;
412
+ continue;
413
+ }
414
+ const splitChunks = chunkLineWindows(boundary.title, sectionContent, order);
415
+ chunks.push(...splitChunks);
416
+ order = chunks.length;
417
+ }
418
+ return chunks.length > 0 ? chunks : chunkLineWindows(input.title, trimmed, 0);
419
+ }
420
+ function chunkContent(input) {
421
+ const language = normalizeLanguage(input.filePath, input.language);
422
+ if (language === "markdown") {
423
+ return chunkMarkdownSectioned(input.title, input.content);
424
+ }
425
+ if (!language) {
426
+ return chunkLineWindows(input.title, input.content.trim(), 0);
427
+ }
428
+ return chunkByBoundaries(input, language);
429
+ }
430
+ function detectLanguage(filePath, language) {
431
+ return normalizeLanguage(filePath, language);
432
+ }
137
433
 
138
434
  // src/catalog/fingerprint.ts
139
435
  import { createHash } from "crypto";
@@ -145,6 +441,7 @@ function buildSnapshotFingerprint(input) {
145
441
  const payload = JSON.stringify({
146
442
  sourceId: input.sourceId,
147
443
  configHash: input.configHash,
444
+ revisionKey: input.revisionKey ?? null,
148
445
  pages: normalizedPages
149
446
  });
150
447
  return sha256(payload);
@@ -152,12 +449,12 @@ function buildSnapshotFingerprint(input) {
152
449
 
153
450
  // src/catalog/project-scope.ts
154
451
  import { realpathSync } from "fs";
155
- import { resolve } from "path";
452
+ import { resolve as resolve2 } from "path";
156
453
  function isWithin(candidate, root) {
157
454
  return candidate === root || candidate.startsWith(`${root}/`);
158
455
  }
159
456
  function canonicalizeProjectPath(path) {
160
- const resolved = resolve(path);
457
+ const resolved = resolve2(path);
161
458
  try {
162
459
  return realpathSync.native(resolved);
163
460
  } catch {
@@ -173,22 +470,55 @@ function resolveProjectScope(cwd, scopes) {
173
470
  return normalizedScopes[0] ?? null;
174
471
  }
175
472
 
473
+ // src/patterns.ts
474
+ function escapeRegex(value) {
475
+ return value.replace(/[|\\{}()[\]^$+?.]/g, "\\$&");
476
+ }
477
+ function patternToRegex(pattern) {
478
+ let regex = "^";
479
+ for (let index = 0; index < pattern.length; index += 1) {
480
+ const current = pattern[index];
481
+ const next = pattern[index + 1];
482
+ if (current === "*" && next === "*") {
483
+ regex += ".*";
484
+ index += 1;
485
+ continue;
486
+ }
487
+ if (current === "*") {
488
+ regex += "[^?#]*";
489
+ continue;
490
+ }
491
+ regex += escapeRegex(current ?? "");
492
+ }
493
+ return new RegExp(`${regex}$`);
494
+ }
495
+ function matchesPatterns(value, patterns) {
496
+ return patterns.some((pattern) => patternToRegex(pattern).test(value));
497
+ }
498
+ function toSqliteGlob(pattern) {
499
+ return pattern.replace(/\*\*/g, "*");
500
+ }
501
+
176
502
  // src/spec/source-spec.ts
177
503
  import { readFile } from "fs/promises";
178
504
  import { extname } from "path";
179
505
  import YAML from "yaml";
180
506
  import { z } from "zod";
181
507
  var patternSchema = z.string().min(1);
508
+ var positiveIntSchema = z.number().int().positive();
509
+ var scheduleSchema = z.object({
510
+ everyHours: positiveIntSchema
511
+ });
182
512
  var interactionSchema = z.discriminatedUnion("action", [
183
513
  z.object({
184
514
  action: z.literal("hover"),
185
515
  selector: z.string().min(1),
186
- timeoutMs: z.number().int().positive().optional()
516
+ timeoutMs: positiveIntSchema.optional()
187
517
  }),
188
518
  z.object({
189
519
  action: z.literal("click"),
190
520
  selector: z.string().min(1),
191
- timeoutMs: z.number().int().positive().optional()
521
+ timeoutMs: positiveIntSchema.optional()
192
522
  }),
193
523
  z.object({
194
524
  action: z.literal("press"),
@@ -196,13 +526,13 @@ var interactionSchema = z.discriminatedUnion("action", [
196
526
  }),
197
527
  z.object({
198
528
  action: z.literal("wait"),
199
- timeoutMs: z.number().int().positive()
529
+ timeoutMs: positiveIntSchema
200
530
  })
201
531
  ]);
202
532
  var clipboardExtractSchema = z.object({
203
533
  strategy: z.literal("clipboardButton"),
204
534
  interactions: z.array(interactionSchema).min(1),
205
- clipboardTimeoutMs: z.number().int().positive().default(1e4)
535
+ clipboardTimeoutMs: positiveIntSchema.default(1e4)
206
536
  });
207
537
  var selectorExtractSchema = z.object({
208
538
  strategy: z.literal("selector"),
@@ -211,13 +541,13 @@ var selectorExtractSchema = z.object({
211
541
  var readabilityExtractSchema = z.object({
212
542
  strategy: z.literal("readability")
213
543
  });
214
- var authHeaderSchema = z.object({
544
+ var webAuthHeaderSchema = z.object({
215
545
  name: z.string().min(1),
216
546
  valueFromEnv: z.string().min(1),
217
547
  hosts: z.array(z.string().min(1)).min(1).optional(),
218
548
  include: z.array(patternSchema).min(1).optional()
219
549
  });
220
- var authCookieSchema = z.object({
550
+ var webAuthCookieSchema = z.object({
221
551
  name: z.string().min(1),
222
552
  valueFromEnv: z.string().min(1),
223
553
  domain: z.string().min(1),
@@ -226,21 +556,36 @@ var authCookieSchema = z.object({
226
556
  httpOnly: z.boolean().optional(),
227
557
  sameSite: z.enum(["Strict", "Lax", "None"]).optional()
228
558
  });
229
- var canaryCheckSchema = z.object({
559
+ var webCanaryCheckSchema = z.object({
230
560
  url: z.string().url(),
231
561
  expectedTitle: z.string().min(1).optional(),
232
562
  expectedText: z.string().min(1).optional(),
233
- minMarkdownLength: z.number().int().positive().default(40)
563
+ minMarkdownLength: positiveIntSchema.default(40)
564
+ });
565
+ var gitAuthSchema = z.object({
566
+ tokenFromEnv: z.string().min(1),
567
+ username: z.string().min(1).default("x-access-token"),
568
+ scheme: z.enum(["basic", "bearer"]).default("basic")
569
+ });
570
+ var gitCanaryCheckSchema = z.object({
571
+ path: z.string().min(1),
572
+ expectedTitle: z.string().min(1).optional(),
573
+ expectedText: z.string().min(1).optional(),
574
+ minContentLength: positiveIntSchema.default(40)
234
575
  });
235
- var sourceSpecSchema = z.object({
576
+ var baseSourceSpecSchema = z.object({
236
577
  id: z.string().min(1).regex(/^[a-z0-9-]+$/),
237
578
  label: z.string().min(1),
579
+ schedule: scheduleSchema
580
+ });
581
+ var webSourceSpecSchema = baseSourceSpecSchema.extend({
582
+ kind: z.literal("web").default("web"),
238
583
  startUrls: z.array(z.string().url()).min(1),
239
584
  allowedHosts: z.array(z.string().min(1)).min(1),
240
585
  discovery: z.object({
241
586
  include: z.array(patternSchema).min(1),
242
- exclude: z.array(patternSchema),
243
- maxPages: z.number().int().positive()
587
+ exclude: z.array(patternSchema).default([]),
588
+ maxPages: positiveIntSchema
244
589
  }),
245
590
  extract: z.discriminatedUnion("strategy", [
246
591
  clipboardExtractSchema,
@@ -250,16 +595,13 @@ var sourceSpecSchema = z.object({
250
595
  normalize: z.object({
251
596
  prependSourceComment: z.boolean().default(true)
252
597
  }),
253
- schedule: z.object({
254
- everyHours: z.number().int().positive()
255
- }),
256
598
  auth: z.object({
257
- headers: z.array(authHeaderSchema).default([]),
258
- cookies: z.array(authCookieSchema).default([])
599
+ headers: z.array(webAuthHeaderSchema).default([]),
600
+ cookies: z.array(webAuthCookieSchema).default([])
259
601
  }).optional(),
260
602
  canary: z.object({
261
- everyHours: z.number().int().positive().optional(),
262
- checks: z.array(canaryCheckSchema).min(1)
603
+ everyHours: positiveIntSchema.optional(),
604
+ checks: z.array(webCanaryCheckSchema).min(1)
263
605
  }).optional()
264
606
  }).superRefine((spec, context) => {
265
607
  for (const [index, header] of (spec.auth?.headers ?? []).entries()) {
@@ -277,6 +619,47 @@ var sourceSpecSchema = z.object({
277
619
  }
278
620
  }
279
621
  });
622
+ var gitSourceSpecSchema = baseSourceSpecSchema.extend({
623
+ kind: z.literal("git"),
624
+ repo: z.object({
625
+ url: z.string().url(),
626
+ ref: z.string().min(1).default("HEAD"),
627
+ include: z.array(patternSchema).min(1),
628
+ exclude: z.array(patternSchema).default([]),
629
+ maxFiles: positiveIntSchema.default(2e3),
630
+ textFileMaxBytes: positiveIntSchema.default(262144),
631
+ auth: gitAuthSchema.optional()
632
+ }),
633
+ canary: z.object({
634
+ everyHours: positiveIntSchema.optional(),
635
+ checks: z.array(gitCanaryCheckSchema).min(1)
636
+ }).optional()
637
+ }).superRefine((spec, context) => {
638
+ const protocol = new URL(spec.repo.url).protocol;
639
+ if (!["https:", "http:", "file:"].includes(protocol)) {
640
+ context.addIssue({
641
+ code: z.ZodIssueCode.custom,
642
+ path: ["repo", "url"],
643
+ message: `Unsupported git source protocol '${protocol}'. Use https:// or file://.`
644
+ });
645
+ }
646
+ });
647
+ var sourceSpecSchema = z.preprocess((value) => {
648
+ if (!value || typeof value !== "object" || Array.isArray(value)) {
649
+ return value;
650
+ }
651
+ const candidate = value;
652
+ if (!("kind" in candidate)) {
653
+ return {
654
+ ...candidate,
655
+ kind: "web"
656
+ };
657
+ }
658
+ return candidate;
659
+ }, z.discriminatedUnion("kind", [
660
+ webSourceSpecSchema,
661
+ gitSourceSpecSchema
662
+ ]));
280
663
  function parseSourceSpec(raw, ext) {
281
664
  if (ext === ".json") {
282
665
  return JSON.parse(raw);
@@ -288,8 +671,27 @@ async function loadSourceSpec(path) {
288
671
  const parsed = parseSourceSpec(raw, extname(path).toLowerCase());
289
672
  return sourceSpecSchema.parse(parsed);
290
673
  }
674
+ function parseSourceSpecObject(value) {
675
+ return sourceSpecSchema.parse(value);
676
+ }
677
+ function isGitSourceSpec(spec) {
678
+ return spec.kind === "git";
679
+ }
291
680
  function resolveSourceCanary(spec) {
681
+ if (spec.kind === "git") {
682
+ return {
683
+ kind: "git",
684
+ everyHours: spec.canary?.everyHours ?? Math.max(1, Math.min(spec.schedule.everyHours, 6)),
685
+ checks: spec.canary?.checks ?? [
686
+ {
687
+ path: "README.md",
688
+ minContentLength: 40
689
+ }
690
+ ]
691
+ };
692
+ }
292
693
  return {
694
+ kind: "web",
293
695
  everyHours: spec.canary?.everyHours ?? Math.max(1, Math.min(spec.schedule.everyHours, 6)),
294
696
  checks: spec.canary?.checks ?? [
295
697
  {
@@ -341,6 +743,9 @@ function initSchema(db) {
341
743
  title TEXT NOT NULL,
342
744
  markdown TEXT NOT NULL,
343
745
  content_hash TEXT NOT NULL,
746
+ page_kind TEXT NOT NULL DEFAULT 'document' CHECK(page_kind IN ('document', 'file')),
747
+ file_path TEXT,
748
+ language TEXT,
344
749
  UNIQUE(snapshot_id, url)
345
750
  );
346
751
 
@@ -353,7 +758,10 @@ function initSchema(db) {
353
758
  page_title TEXT NOT NULL,
354
759
  section_title TEXT NOT NULL,
355
760
  chunk_order INTEGER NOT NULL,
356
- markdown TEXT NOT NULL
761
+ markdown TEXT NOT NULL,
762
+ page_kind TEXT NOT NULL DEFAULT 'document' CHECK(page_kind IN ('document', 'file')),
763
+ file_path TEXT,
764
+ language TEXT
357
765
  );
358
766
 
359
767
  CREATE VIRTUAL TABLE IF NOT EXISTS chunks_fts USING fts5(
@@ -469,6 +877,26 @@ function initSchema(db) {
469
877
  if (!sourceColumns.some((column) => column.name === "next_canary_due_at")) {
470
878
  db.exec("ALTER TABLE sources ADD COLUMN next_canary_due_at TEXT");
471
879
  }
880
+ const pageColumns = db.prepare("PRAGMA table_info(pages)").all();
881
+ if (!pageColumns.some((column) => column.name === "page_kind")) {
882
+ db.exec(`ALTER TABLE pages ADD COLUMN page_kind TEXT NOT NULL DEFAULT 'document'`);
883
+ }
884
+ if (!pageColumns.some((column) => column.name === "file_path")) {
885
+ db.exec("ALTER TABLE pages ADD COLUMN file_path TEXT");
886
+ }
887
+ if (!pageColumns.some((column) => column.name === "language")) {
888
+ db.exec("ALTER TABLE pages ADD COLUMN language TEXT");
889
+ }
890
+ const chunkColumns = db.prepare("PRAGMA table_info(chunks)").all();
891
+ if (!chunkColumns.some((column) => column.name === "page_kind")) {
892
+ db.exec(`ALTER TABLE chunks ADD COLUMN page_kind TEXT NOT NULL DEFAULT 'document'`);
893
+ }
894
+ if (!chunkColumns.some((column) => column.name === "file_path")) {
895
+ db.exec("ALTER TABLE chunks ADD COLUMN file_path TEXT");
896
+ }
897
+ if (!chunkColumns.some((column) => column.name === "language")) {
898
+ db.exec("ALTER TABLE chunks ADD COLUMN language TEXT");
899
+ }
472
900
  }
473
901
  function nowIso() {
474
902
  return (/* @__PURE__ */ new Date()).toISOString();
@@ -476,6 +904,9 @@ function nowIso() {
476
904
  function addHoursIso(hours) {
477
905
  return new Date(Date.now() + hours * 60 * 60 * 1e3).toISOString();
478
906
  }
907
+ function resolveCanaryEveryHours(spec) {
908
+ return spec.canary?.everyHours ?? Math.max(1, Math.min(spec.schedule.everyHours, 6));
909
+ }
479
910
  function stableStringify(value) {
480
911
  if (Array.isArray(value)) {
481
912
  return `[${value.map((entry) => stableStringify(entry)).join(",")}]`;
@@ -492,6 +923,20 @@ function normalizeQuery(query) {
492
923
  const words = query.replace(/[^\p{L}\p{N}]+/gu, " ").split(/\s+/).map((part) => part.trim()).filter(Boolean);
493
924
  return words.join(" ");
494
925
  }
926
+ function normalizePatternFilters(patterns) {
927
+ if (!patterns || patterns.length === 0) {
928
+ return null;
929
+ }
930
+ const normalized = [...new Set(patterns.map((pattern) => pattern.trim()).filter(Boolean))];
931
+ return normalized.length > 0 ? normalized : null;
932
+ }
933
+ function normalizeLanguageFilters(languages) {
934
+ if (!languages || languages.length === 0) {
935
+ return null;
936
+ }
937
+ const normalized = [...new Set(languages.map((language) => language.trim().toLowerCase()).filter(Boolean))];
938
+ return normalized.length > 0 ? normalized : null;
939
+ }
495
940
  function assertPaginationValue(value, field, fallback) {
496
941
  if (typeof value === "undefined") {
497
942
  return fallback;
@@ -511,9 +956,9 @@ function assertPaginationValue(value, field, fallback) {
511
956
  return value;
512
957
  }
513
958
  function openCatalog(options) {
514
- const dataDir = resolve2(options.dataDir);
515
- mkdirSync(dataDir, { recursive: true });
516
- const db = new Database(join(dataDir, "catalog.sqlite"));
959
+ const dataDir = resolve3(options.dataDir);
960
+ mkdirSync2(dataDir, { recursive: true });
961
+ const db = new Database(join3(dataDir, "catalog.sqlite"));
517
962
  initSchema(db);
518
963
  const listProjectLinks = () => {
519
964
  const rows = db.prepare("SELECT project_path, source_id FROM project_links ORDER BY project_path, source_id").all();
@@ -548,7 +993,9 @@ function openCatalog(options) {
548
993
  limit,
549
994
  offset,
550
995
  sourceIds: null,
551
- snapshotIds: []
996
+ snapshotIds: [],
997
+ pathPatterns: normalizePatternFilters(input.pathPatterns),
998
+ languages: normalizeLanguageFilters(input.languages)
552
999
  };
553
1000
  }
554
1001
  const filterSourceIds = sourceIds && sourceIds.length > 0 ? [...new Set(sourceIds)] : null;
@@ -562,7 +1009,9 @@ function openCatalog(options) {
562
1009
  limit,
563
1010
  offset,
564
1011
  sourceIds: filterSourceIds,
565
- snapshotIds: latestSnapshotIds
1012
+ snapshotIds: latestSnapshotIds,
1013
+ pathPatterns: normalizePatternFilters(input.pathPatterns),
1014
+ languages: normalizeLanguageFilters(input.languages)
566
1015
  };
567
1016
  };
568
1017
  const searchLexicalByScope = (input) => {
@@ -580,10 +1029,14 @@ function openCatalog(options) {
580
1029
  }
581
1030
  const whereSnapshotPlaceholders = input.scope.snapshotIds.map(() => "?").join(",");
582
1031
  const sourceSql = input.scope.sourceIds ? `AND c.source_id IN (${input.scope.sourceIds.map(() => "?").join(",")})` : "";
1032
+ const pathSql = input.scope.pathPatterns ? `AND c.file_path IS NOT NULL AND (${input.scope.pathPatterns.map(() => "c.file_path GLOB ?").join(" OR ")})` : "";
1033
+ const languageSql = input.scope.languages ? `AND c.language IN (${input.scope.languages.map(() => "?").join(",")})` : "";
583
1034
  const queryArgs = [
584
1035
  normalized,
585
1036
  ...input.scope.snapshotIds,
586
- ...input.scope.sourceIds ?? []
1037
+ ...input.scope.sourceIds ?? [],
1038
+ ...(input.scope.pathPatterns ?? []).map((pattern) => toSqliteGlob(pattern)),
1039
+ ...input.scope.languages ?? []
587
1040
  ];
588
1041
  const totalRow = db.prepare(`
589
1042
  SELECT COUNT(*) AS total
@@ -592,6 +1045,8 @@ function openCatalog(options) {
592
1045
  WHERE chunks_fts MATCH ?
593
1046
  AND c.snapshot_id IN (${whereSnapshotPlaceholders})
594
1047
  ${sourceSql}
1048
+ ${pathSql}
1049
+ ${languageSql}
595
1050
  `).get(...queryArgs);
596
1051
  const rows = db.prepare(`
597
1052
  SELECT
@@ -601,12 +1056,17 @@ function openCatalog(options) {
601
1056
  c.page_url,
602
1057
  c.page_title,
603
1058
  c.section_title,
604
- c.markdown
1059
+ c.markdown,
1060
+ c.page_kind,
1061
+ c.file_path,
1062
+ c.language
605
1063
  FROM chunks_fts
606
1064
  JOIN chunks c ON c.id = chunks_fts.rowid
607
1065
  WHERE chunks_fts MATCH ?
608
1066
  AND c.snapshot_id IN (${whereSnapshotPlaceholders})
609
1067
  ${sourceSql}
1068
+ ${pathSql}
1069
+ ${languageSql}
610
1070
  ORDER BY bm25(chunks_fts), c.id
611
1071
  LIMIT ?
612
1072
  OFFSET ?
@@ -618,7 +1078,10 @@ function openCatalog(options) {
618
1078
  pageUrl: row.page_url,
619
1079
  pageTitle: row.page_title,
620
1080
  sectionTitle: row.section_title,
621
- markdown: row.markdown
1081
+ markdown: row.markdown,
1082
+ pageKind: row.page_kind,
1083
+ filePath: row.file_path,
1084
+ language: row.language
622
1085
  }));
623
1086
  return {
624
1087
  total: totalRow.total,
@@ -788,10 +1251,10 @@ function openCatalog(options) {
788
1251
  const timestamp = nowIso();
789
1252
  const configHash = sha256(stableStringify(spec));
790
1253
  const existing = db.prepare("SELECT id, created_at, next_due_at, next_canary_due_at, config_hash FROM sources WHERE id = ?").get(spec.id);
791
- const resolvedSpecPath = options2?.specPath ? resolve2(options2.specPath) : null;
1254
+ const resolvedSpecPath = options2?.specPath ? canonicalizeManagedSpecPath(options2.specPath) : null;
792
1255
  const nextDueAt = !existing ? timestamp : existing.config_hash === configHash ? existing.next_due_at : timestamp;
793
- const canaryConfig = resolveSourceCanary(spec);
794
- const nextCanaryDueAt = !existing ? timestamp : existing.config_hash === configHash ? existing.next_canary_due_at ?? addHoursIso(canaryConfig.everyHours) : timestamp;
1256
+ const canaryEveryHours = resolveCanaryEveryHours(spec);
1257
+ const nextCanaryDueAt = !existing ? timestamp : existing.config_hash === configHash ? existing.next_canary_due_at ?? addHoursIso(canaryEveryHours) : timestamp;
795
1258
  const configChanged = Boolean(existing && existing.config_hash !== configHash);
796
1259
  db.prepare(`
797
1260
  INSERT INTO sources (
@@ -836,6 +1299,7 @@ function openCatalog(options) {
836
1299
  SELECT
837
1300
  id,
838
1301
  label,
1302
+ spec_json,
839
1303
  spec_path,
840
1304
  next_due_at,
841
1305
  next_canary_due_at,
@@ -848,21 +1312,25 @@ function openCatalog(options) {
848
1312
  FROM sources
849
1313
  ORDER BY id
850
1314
  `).all();
851
- return rows.map((row) => ({
852
- id: row.id,
853
- label: row.label,
854
- specPath: row.spec_path,
855
- nextDueAt: row.next_due_at,
856
- isDue: Date.parse(row.next_due_at) <= Date.now(),
857
- nextCanaryDueAt: row.next_canary_due_at,
858
- isCanaryDue: row.next_canary_due_at ? Date.parse(row.next_canary_due_at) <= Date.now() : false,
859
- lastCheckedAt: row.last_checked_at,
860
- lastSuccessfulSnapshotAt: row.last_successful_snapshot_at,
861
- lastSuccessfulSnapshotId: row.last_successful_snapshot_id,
862
- lastCanaryCheckedAt: row.last_canary_checked_at,
863
- lastSuccessfulCanaryAt: row.last_successful_canary_at,
864
- lastCanaryStatus: row.last_canary_status
865
- }));
1315
+ return rows.map((row) => {
1316
+ const storedSpec = parseSourceSpecObject(JSON.parse(row.spec_json));
1317
+ return {
1318
+ id: row.id,
1319
+ kind: storedSpec.kind,
1320
+ label: row.label,
1321
+ specPath: row.spec_path ? canonicalizeManagedSpecPath(row.spec_path) : null,
1322
+ nextDueAt: row.next_due_at,
1323
+ isDue: Date.parse(row.next_due_at) <= Date.now(),
1324
+ nextCanaryDueAt: row.next_canary_due_at,
1325
+ isCanaryDue: row.next_canary_due_at ? Date.parse(row.next_canary_due_at) <= Date.now() : false,
1326
+ lastCheckedAt: row.last_checked_at,
1327
+ lastSuccessfulSnapshotAt: row.last_successful_snapshot_at,
1328
+ lastSuccessfulSnapshotId: row.last_successful_snapshot_id,
1329
+ lastCanaryCheckedAt: row.last_canary_checked_at,
1330
+ lastSuccessfulCanaryAt: row.last_successful_canary_at,
1331
+ lastCanaryStatus: row.last_canary_status
1332
+ };
1333
+ });
866
1334
  },
867
1335
  listDueSourceIds(referenceTime = nowIso()) {
868
1336
  const rows = db.prepare(`
@@ -923,11 +1391,15 @@ function openCatalog(options) {
923
1391
  const pagesWithHashes = input.pages.map((page) => ({
924
1392
  ...page,
925
1393
  markdown: page.markdown.trim(),
926
- contentHash: sha256(page.markdown.trim())
1394
+ contentHash: sha256(page.markdown.trim()),
1395
+ pageKind: page.pageKind ?? "document",
1396
+ filePath: page.filePath ?? null,
1397
+ language: detectLanguage(page.filePath, page.language)
927
1398
  }));
928
1399
  const fingerprint = buildSnapshotFingerprint({
929
1400
  sourceId: input.sourceId,
930
1401
  configHash: sourceRow.config_hash,
1402
+ ...input.revisionKey ? { revisionKey: input.revisionKey } : {},
931
1403
  pages: pagesWithHashes.map((page) => ({
932
1404
  url: page.url,
933
1405
  contentHash: page.contentHash
@@ -964,13 +1436,13 @@ function openCatalog(options) {
964
1436
  ) VALUES (?, ?, ?, ?, ?, ?, ?)
965
1437
  `);
966
1438
  const insertPage = db.prepare(`
967
- INSERT INTO pages (snapshot_id, url, title, markdown, content_hash)
968
- VALUES (?, ?, ?, ?, ?)
1439
+ INSERT INTO pages (snapshot_id, url, title, markdown, content_hash, page_kind, file_path, language)
1440
+ VALUES (?, ?, ?, ?, ?, ?, ?, ?)
969
1441
  `);
970
1442
  const insertChunk = db.prepare(`
971
1443
  INSERT INTO chunks (
972
- source_id, snapshot_id, page_id, page_url, page_title, section_title, chunk_order, markdown
973
- ) VALUES (?, ?, ?, ?, ?, ?, ?, ?)
1444
+ source_id, snapshot_id, page_id, page_url, page_title, section_title, chunk_order, markdown, page_kind, file_path, language
1445
+ ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
974
1446
  `);
975
1447
  const insertRun = db.prepare(`
976
1448
  INSERT INTO fetch_runs (id, source_id, status, snapshot_id, started_at, finished_at)
@@ -987,9 +1459,23 @@ function openCatalog(options) {
987
1459
  checkedAt
988
1460
  );
989
1461
  for (const page of pagesWithHashes) {
990
- const pageInsert = insertPage.run(snapshotId, page.url, page.title, page.markdown, page.contentHash);
1462
+ const pageInsert = insertPage.run(
1463
+ snapshotId,
1464
+ page.url,
1465
+ page.title,
1466
+ page.markdown,
1467
+ page.contentHash,
1468
+ page.pageKind,
1469
+ page.filePath,
1470
+ page.language
1471
+ );
991
1472
  const pageId = Number(pageInsert.lastInsertRowid);
992
- const chunks = chunkMarkdown(page.title, page.markdown);
1473
+ const chunks = chunkContent({
1474
+ title: page.title,
1475
+ content: page.markdown,
1476
+ filePath: page.filePath,
1477
+ language: page.language
1478
+ });
993
1479
  for (const chunk of chunks) {
994
1480
  insertChunk.run(
995
1481
  input.sourceId,
@@ -999,7 +1485,10 @@ function openCatalog(options) {
999
1485
  page.title,
1000
1486
  chunk.sectionTitle,
1001
1487
  chunk.chunkOrder,
1002
- chunk.markdown
1488
+ chunk.markdown,
1489
+ page.pageKind,
1490
+ page.filePath,
1491
+ page.language
1003
1492
  );
1004
1493
  }
1005
1494
  }
@@ -1050,7 +1539,6 @@ function openCatalog(options) {
1050
1539
  );
1051
1540
  }
1052
1541
  const spec = JSON.parse(sourceRow.spec_json);
1053
- const canary = resolveSourceCanary(spec);
1054
1542
  db.prepare(`
1055
1543
  INSERT INTO canary_runs (id, source_id, status, checked_at, details_json)
1056
1544
  VALUES (?, ?, ?, ?, ?)
@@ -1075,7 +1563,7 @@ function openCatalog(options) {
1075
1563
  input.status,
1076
1564
  input.checkedAt,
1077
1565
  input.status,
1078
- addHoursIso(canary.everyHours),
1566
+ addHoursIso(resolveCanaryEveryHours(spec)),
1079
1567
  input.checkedAt,
1080
1568
  input.sourceId
1081
1569
  );
@@ -1086,8 +1574,9 @@ function openCatalog(options) {
1086
1574
  return [];
1087
1575
  }
1088
1576
  const activeSourceKeys = new Set(
1089
- input.activeSources.map((source) => `${source.sourceId}::${resolve2(source.specPath)}`)
1577
+ input.activeSources.map((source) => `${source.sourceId}::${canonicalizeManagedSpecPath(source.specPath)}`)
1090
1578
  );
1579
+ const normalizedManagedRoots = input.managedRoots.map((managedRoot) => canonicalizeManagedSpecPath(managedRoot));
1091
1580
  const rows = db.prepare(`
1092
1581
  SELECT id, spec_path
1093
1582
  FROM sources
@@ -1098,8 +1587,8 @@ function openCatalog(options) {
1098
1587
  if (!row.spec_path) {
1099
1588
  return false;
1100
1589
  }
1101
- const normalizedSpecPath = resolve2(row.spec_path);
1102
- return input.managedRoots.some(
1590
+ const normalizedSpecPath = canonicalizeManagedSpecPath(row.spec_path);
1591
+ return normalizedManagedRoots.some(
1103
1592
  (managedRoot) => normalizedSpecPath === managedRoot || normalizedSpecPath.startsWith(`${managedRoot}/`)
1104
1593
  ) && !activeSourceKeys.has(`${row.id}::${normalizedSpecPath}`);
1105
1594
  }).map((row) => row.id);
@@ -1154,7 +1643,7 @@ function openCatalog(options) {
1154
1643
  );
1155
1644
  }
1156
1645
  const loadSnapshotPages = (snapshotId) => db.prepare(`
1157
- SELECT url, title, markdown, content_hash
1646
+ SELECT url, title, markdown, content_hash, page_kind, file_path, language
1158
1647
  FROM pages
1159
1648
  WHERE snapshot_id = ?
1160
1649
  ORDER BY url
@@ -1165,11 +1654,17 @@ function openCatalog(options) {
1165
1654
  const afterMap = new Map(afterPages.map((page) => [page.url, page]));
1166
1655
  const addedPages = afterPages.filter((page) => !beforeMap.has(page.url)).map((page) => ({
1167
1656
  url: page.url,
1168
- title: page.title
1657
+ title: page.title,
1658
+ pageKind: page.page_kind,
1659
+ filePath: page.file_path,
1660
+ language: page.language
1169
1661
  }));
1170
1662
  const removedPages = beforePages.filter((page) => !afterMap.has(page.url)).map((page) => ({
1171
1663
  url: page.url,
1172
- title: page.title
1664
+ title: page.title,
1665
+ pageKind: page.page_kind,
1666
+ filePath: page.file_path,
1667
+ language: page.language
1173
1668
  }));
1174
1669
  const summarizeLineDiff = (beforeMarkdown, afterMarkdown) => {
1175
1670
  const beforeLines = beforeMarkdown.split("\n");
@@ -1194,6 +1689,9 @@ function openCatalog(options) {
1194
1689
  url: before.url,
1195
1690
  beforeTitle: before.title,
1196
1691
  afterTitle: after.title,
1692
+ pageKind: after.page_kind,
1693
+ filePath: after.file_path,
1694
+ language: after.language,
1197
1695
  lineSummary: summarizeLineDiff(before.markdown, after.markdown)
1198
1696
  }));
1199
1697
  const unchangedPageCount = beforePages.filter((page) => {
@@ -1239,11 +1737,14 @@ function openCatalog(options) {
1239
1737
  c.page_url,
1240
1738
  c.page_title,
1241
1739
  c.section_title,
1242
- c.markdown
1740
+ c.markdown,
1741
+ c.page_kind,
1742
+ c.file_path,
1743
+ c.language
1243
1744
  FROM chunks c
1244
1745
  WHERE c.source_id = ?
1245
1746
  AND c.snapshot_id = ?
1246
- ORDER BY c.id
1747
+ ORDER BY c.id
1247
1748
  `).all(input.sourceId, input.snapshotId);
1248
1749
  return rows.map((row) => ({
1249
1750
  chunkId: row.chunk_id,
@@ -1253,6 +1754,9 @@ function openCatalog(options) {
1253
1754
  pageTitle: row.page_title,
1254
1755
  sectionTitle: row.section_title,
1255
1756
  markdown: row.markdown,
1757
+ pageKind: row.page_kind,
1758
+ filePath: row.file_path,
1759
+ language: row.language,
1256
1760
  contentHash: sha256(row.markdown)
1257
1761
  }));
1258
1762
  },
@@ -1303,7 +1807,10 @@ function openCatalog(options) {
1303
1807
  c.page_url,
1304
1808
  c.page_title,
1305
1809
  c.section_title,
1306
- c.markdown
1810
+ c.markdown,
1811
+ c.page_kind,
1812
+ c.file_path,
1813
+ c.language
1307
1814
  FROM chunks c
1308
1815
  WHERE c.id IN (${chunkIds.map(() => "?").join(",")})
1309
1816
  `).all(...chunkIds);
@@ -1314,7 +1821,10 @@ function openCatalog(options) {
1314
1821
  pageUrl: row.page_url,
1315
1822
  pageTitle: row.page_title,
1316
1823
  sectionTitle: row.section_title,
1317
- markdown: row.markdown
1824
+ markdown: row.markdown,
1825
+ pageKind: row.page_kind,
1826
+ filePath: row.file_path,
1827
+ language: row.language
1318
1828
  }));
1319
1829
  },
1320
1830
  queueLatestEmbeddingJobs(sourceIds) {
@@ -1698,7 +2208,10 @@ function openCatalog(options) {
1698
2208
  c.page_url,
1699
2209
  c.page_title,
1700
2210
  c.section_title,
1701
- c.markdown
2211
+ c.markdown,
2212
+ c.page_kind,
2213
+ c.file_path,
2214
+ c.language
1702
2215
  FROM chunks c
1703
2216
  WHERE c.id = ?
1704
2217
  `).get(chunkId);
@@ -1712,65 +2225,335 @@ function openCatalog(options) {
1712
2225
  pageUrl: row.page_url,
1713
2226
  pageTitle: row.page_title,
1714
2227
  sectionTitle: row.section_title,
1715
- markdown: row.markdown
2228
+ markdown: row.markdown,
2229
+ pageKind: row.page_kind,
2230
+ filePath: row.file_path,
2231
+ language: row.language
1716
2232
  };
1717
2233
  }
1718
2234
  };
1719
2235
  }
1720
2236
 
1721
- // src/runtime/paths.ts
1722
- import { homedir } from "os";
1723
- import { join as join2 } from "path";
1724
- import { mkdirSync as mkdirSync2 } from "fs";
1725
- function expandTilde(path) {
1726
- if (path === "~") {
1727
- return homedir();
2237
+ // src/daemon.ts
2238
+ import { existsSync as existsSync3 } from "fs";
2239
+ import { resolve as resolve5 } from "path";
2240
+ import { setTimeout as sleep2 } from "timers/promises";
2241
+
2242
+ // src/fetch/fetch-source.ts
2243
+ import { mkdirSync as mkdirSync4, writeFileSync as writeFileSync2 } from "fs";
2244
+ import { join as join5 } from "path";
2245
+ import { setTimeout as sleep } from "timers/promises";
2246
+ import { chromium } from "playwright";
2247
+
2248
+ // src/git/git-source.ts
2249
+ import { existsSync as existsSync2, mkdirSync as mkdirSync3, writeFileSync } from "fs";
2250
+ import { dirname as dirname2, join as join4 } from "path";
2251
+ import { execFile } from "child_process";
2252
+ import { promisify } from "util";
2253
+ var execFileAsync = promisify(execFile);
2254
/**
 * Returns the current time as an ISO-8601 UTC timestamp string.
 */
function nowIso2() {
  const current = /* @__PURE__ */ new Date();
  return current.toISOString();
}
2257
/**
 * Builds the on-disk location of the mirror for a source:
 * `<dataDir>/git-mirrors/<sourceId>.git`.
 */
function getGitMirrorDir(dataDir, sourceId) {
  const mirrorName = `${sourceId}.git`;
  const mirrorPath = join4(dataDir, "git-mirrors", mirrorName);
  return mirrorPath;
}
2260
+ function resolveEnvValue(name, env) {
2261
+ const value = env[name];
2262
+ if (!value) {
2263
+ throw new AiocsError(
2264
+ AIOCS_ERROR_CODES.authEnvMissing,
2265
+ `Missing required environment variable '${name}' for authenticated source access`,
2266
+ { envVar: name }
2267
+ );
1728
2268
  }
1729
- if (path.startsWith("~/")) {
1730
- return join2(homedir(), path.slice(2));
2269
+ return value;
2270
+ }
2271
+ function buildGitAuthHeader(spec, env) {
2272
+ if (!spec.repo.auth) {
2273
+ return null;
1731
2274
  }
1732
- return path;
2275
+ const token = resolveEnvValue(spec.repo.auth.tokenFromEnv, env);
2276
+ if (spec.repo.auth.scheme === "bearer") {
2277
+ return `AUTHORIZATION: Bearer ${token}`;
2278
+ }
2279
+ const credentials = Buffer.from(`${spec.repo.auth.username}:${token}`, "utf8").toString("base64");
2280
+ return `AUTHORIZATION: Basic ${credentials}`;
2281
+ }
2282
/**
 * Runs `git` with the given arguments and resolves with its stdout.
 *
 * @param args Git arguments (without the leading `git`).
 * @param options Optional `cwd`, extra `env`, output `encoding` (defaults to
 *   "utf8") and `authHeader`; when `authHeader` is set it is passed to git as
 *   `-c http.extraHeader=<header>`.
 * @returns The command's stdout (string, or Buffer when `encoding` is "buffer").
 * @throws AiocsError with code `internalError` when the command fails. The
 *   error details carry the caller-supplied `args` only (never the auth header).
 */
async function runGit(args, options = {}) {
  const authHeader = options.authHeader;
  const commandArgs = authHeader ? ["-c", `http.extraHeader=${authHeader}`, ...args] : args;
  try {
    const result = await execFileAsync("git", commandArgs, {
      cwd: options.cwd,
      env: {
        ...process.env,
        ...options.env,
        // Never block waiting for interactive credentials.
        GIT_TERMINAL_PROMPT: "0"
      },
      encoding: options.encoding ?? "utf8",
      maxBuffer: 32 * 1024 * 1024
    });
    return result.stdout;
  } catch (error) {
    const rawMessage = error instanceof Error ? error.message : String(error);
    // execFile's error message echoes the full command line, which would
    // include the Authorization header. Scrub it before it reaches logs.
    const safeMessage = authHeader ? rawMessage.split(authHeader).join("[redacted]") : rawMessage;
    throw new AiocsError(
      AIOCS_ERROR_CODES.internalError,
      `Git command failed: ${safeMessage}`,
      {
        args
      }
    );
  }
}
2304
/**
 * Makes sure a mirror clone of the source's repository exists under `dataDir`
 * and returns its directory.
 *
 * If the mirror directory does not exist yet it is created with
 * `git clone --mirror`; otherwise the `origin` URL is re-pointed at the spec's
 * URL and the mirror is fetched with pruning of branches and tags.
 */
async function ensureGitMirror(spec, dataDir, env) {
  const mirrorDir = getGitMirrorDir(dataDir, spec.id);
  mkdirSync3(dirname2(mirrorDir), { recursive: true });
  const authHeader = buildGitAuthHeader(spec, env);
  const gitOptions = { env, authHeader };
  const mirrorExists = existsSync2(mirrorDir);
  if (mirrorExists) {
    const inMirror = ["--git-dir", mirrorDir];
    // Keep the remote in sync with the spec in case the URL was edited.
    await runGit([...inMirror, "remote", "set-url", "origin", spec.repo.url], gitOptions);
    await runGit([...inMirror, "fetch", "--prune", "--prune-tags", "--tags", "origin"], gitOptions);
  } else {
    await runGit(["clone", "--mirror", spec.repo.url, mirrorDir], gitOptions);
  }
  return mirrorDir;
}
1734
- function getAiocsDataDir(env = process.env) {
1735
- const override = env.AIOCS_DATA_DIR;
1736
- if (override) {
1737
- mkdirSync2(expandTilde(override), { recursive: true });
1738
- return expandTilde(override);
2325
/**
 * Resolves `ref` inside the mirror to a commit id via
 * `git rev-parse <ref>^{commit}` and returns the trimmed output.
 */
async function resolveGitCommit(mirrorDir, ref, env) {
  const revision = `${ref}^{commit}`;
  const output = await runGit(["--git-dir", mirrorDir, "rev-parse", revision], { env });
  const commitId = String(output).trim();
  return commitId;
}
2331
/**
 * Lists every file path in the tree of `commitSha` using
 * `git ls-tree -r -z --name-only`.
 *
 * With `-z` git emits paths verbatim, separated by NUL bytes, so entries are
 * NOT trimmed: a path with leading or trailing whitespace must be passed back
 * to git unchanged for later `cat-file` / `show` lookups to succeed.
 *
 * @returns The non-empty path entries in the order git reported them.
 */
async function listRepoFiles(mirrorDir, commitSha, env) {
  const stdout = await runGit(["--git-dir", mirrorDir, "ls-tree", "-r", "-z", "--name-only", commitSha], {
    env,
    encoding: "buffer"
  });
  const listing = stdout instanceof Buffer ? stdout.toString("utf8") : String(stdout);
  // The output ends with a NUL terminator, which yields one empty trailing entry.
  return listing.split("\0").filter((entry) => entry.length > 0);
}
2339
+ function isIncluded(spec, filePath) {
2340
+ if (!matchesPatterns(filePath, spec.repo.include)) {
2341
+ return false;
1739
2342
  }
1740
- const target = join2(homedir(), ".aiocs", "data");
1741
- mkdirSync2(target, { recursive: true });
1742
- return target;
2343
+ if (spec.repo.exclude.length > 0 && matchesPatterns(filePath, spec.repo.exclude)) {
2344
+ return false;
2345
+ }
2346
+ return true;
1743
2347
  }
1744
- function getAiocsConfigDir(env = process.env) {
1745
- const override = env.AIOCS_CONFIG_DIR;
1746
- if (override) {
1747
- mkdirSync2(expandTilde(override), { recursive: true });
1748
- return expandTilde(override);
2348
/**
 * Returns the size reported by `git cat-file -s <commitSha>:<filePath>`,
 * converted with `Number(...)` (so unparsable output yields NaN).
 */
async function getObjectSize(mirrorDir, commitSha, filePath, env) {
  const objectName = `${commitSha}:${filePath}`;
  const output = await runGit(["--git-dir", mirrorDir, "cat-file", "-s", objectName], { env });
  const sizeText = String(output).trim();
  return Number(sizeText);
}
2354
/**
 * Heuristic binary detection: content is considered binary when it contains
 * at least one NUL (0) byte.
 */
function isProbablyBinary(buffer) {
  const firstNulIndex = buffer.indexOf(0);
  return firstNulIndex !== -1;
}
2357
/**
 * Reads the content of `filePath` at `commitSha` from the mirror using
 * `git show <commitSha>:<filePath>` and returns it as a Buffer.
 */
async function readRepoFile(mirrorDir, commitSha, filePath, env) {
  const objectName = `${commitSha}:${filePath}`;
  const output = await runGit(["--git-dir", mirrorDir, "show", objectName], {
    env,
    encoding: "buffer"
  });
  if (output instanceof Buffer) {
    return output;
  }
  // Fallback for a non-Buffer result: re-encode its string form.
  return Buffer.from(String(output), "utf8");
}
2364
/**
 * Parses a repository URL string into a `URL` object.
 * Throws (from the `URL` constructor) when the string is not a valid URL.
 */
function normalizeRepoUrl(repoUrl) {
  const parsed = new URL(repoUrl);
  return parsed;
}
2367
/**
 * Derives the web base URL of a repository from its clone URL:
 * origin + path, without trailing slashes and without a trailing `.git`
 * (case-insensitive).
 *
 * Trailing slashes are removed first so that `…/repo.git/` and `…/repo/`
 * normalize the same way as `…/repo.git`; otherwise callers that append
 * `/blob/...` would produce a doubled slash or keep the `.git` suffix.
 *
 * @throws TypeError (from the `URL` constructor) when `repoUrl` is not a valid URL.
 */
function normalizeRepoWebBase(repoUrl) {
  const url = new URL(repoUrl);
  const pathname = url.pathname.replace(/\/+$/, "").replace(/\.git$/i, "");
  return `${url.origin}${pathname}`;
}
2372
+ function buildRepoFileUrl(spec, filePath) {
2373
+ const url = normalizeRepoUrl(spec.repo.url);
2374
+ const encodedPath = filePath.split("/").map((segment) => encodeURIComponent(segment)).join("/");
2375
+ const encodedRef = spec.repo.ref.split("/").map((segment) => encodeURIComponent(segment)).join("/");
2376
+ if (url.protocol === "file:") {
2377
+ return `${spec.repo.url}#ref=${encodeURIComponent(spec.repo.ref)}&path=${encodeURIComponent(filePath)}`;
1749
2378
  }
1750
- const target = join2(homedir(), ".aiocs", "config");
1751
- mkdirSync2(target, { recursive: true });
1752
- return target;
2379
+ const base = normalizeRepoWebBase(spec.repo.url);
2380
+ if (url.hostname === "github.com") {
2381
+ return `${base}/blob/${encodedRef}/${encodedPath}`;
2382
+ }
2383
+ if (url.hostname === "gitlab.com") {
2384
+ return `${base}/-/blob/${encodedRef}/${encodedPath}`;
2385
+ }
2386
+ return `${base}#ref=${encodeURIComponent(spec.repo.ref)}&path=${encodeURIComponent(filePath)}`;
1753
2387
  }
1754
- function getAiocsSourcesDir(env = process.env) {
1755
- const override = env.AIOCS_SOURCES_DIR;
1756
- if (override) {
1757
- mkdirSync2(expandTilde(override), { recursive: true });
1758
- return expandTilde(override);
2388
+ function persistGitSnapshotFiles(input, snapshotId, pages) {
2389
+ const snapshotDir = join4(input.dataDir, "sources", input.sourceId, "snapshots", snapshotId, "files");
2390
+ for (const page of pages) {
2391
+ const filePath = join4(snapshotDir, page.filePath);
2392
+ mkdirSync3(dirname2(filePath), { recursive: true });
2393
+ writeFileSync(filePath, page.markdown, "utf8");
1759
2394
  }
1760
- const target = join2(homedir(), ".aiocs", "sources");
1761
- mkdirSync2(target, { recursive: true });
1762
- return target;
1763
2395
  }
1764
-
1765
- // src/daemon.ts
1766
- import { resolve as resolve4 } from "path";
1767
- import { setTimeout as sleep2 } from "timers/promises";
1768
-
1769
- // src/fetch/fetch-source.ts
1770
- import { mkdirSync as mkdirSync3, writeFileSync } from "fs";
1771
- import { join as join3 } from "path";
1772
- import { setTimeout as sleep } from "timers/promises";
1773
- import { chromium } from "playwright";
2396
/**
 * Turns the in-scope text files of a commit into page records.
 *
 * Files are listed from the mirror, filtered through `isIncluded`, and the
 * run is rejected when more than `spec.repo.maxFiles` match. Each remaining
 * file is skipped when its size is not finite or exceeds
 * `spec.repo.textFileMaxBytes`, when it cannot be read, when it contains a
 * NUL byte, or when it is blank. The result is sorted by `filePath`.
 *
 * @throws AiocsError `invalidArgument` when too many files match.
 */
async function materializeGitPages(spec, mirrorDir, commitSha, env) {
  const allPaths = await listRepoFiles(mirrorDir, commitSha, env);
  const inScopePaths = allPaths.filter((candidate) => isIncluded(spec, candidate));
  if (inScopePaths.length > spec.repo.maxFiles) {
    throw new AiocsError(
      AIOCS_ERROR_CODES.invalidArgument,
      `Git source '${spec.id}' matched ${inScopePaths.length} files, exceeding maxFiles=${spec.repo.maxFiles}`
    );
  }
  const collected = [];
  for (const repoPath of inScopePaths) {
    const byteSize = await getObjectSize(mirrorDir, commitSha, repoPath, env);
    const sizeAcceptable = Number.isFinite(byteSize) && !(byteSize > spec.repo.textFileMaxBytes);
    if (!sizeAcceptable) {
      continue;
    }
    // Unreadable objects are skipped rather than failing the whole fetch.
    const raw = await readRepoFile(mirrorDir, commitSha, repoPath, env).catch(() => null);
    if (!raw) {
      continue;
    }
    if (isProbablyBinary(raw)) {
      continue;
    }
    const text = raw.toString("utf8").trimEnd();
    if (text.trim() === "") {
      continue;
    }
    collected.push({
      url: buildRepoFileUrl(spec, repoPath),
      title: repoPath,
      markdown: text,
      pageKind: "file",
      filePath: repoPath,
      language: detectLanguage(repoPath)
    });
  }
  collected.sort((left, right) => left.filePath.localeCompare(right.filePath));
  return collected;
}
2430
/**
 * Throws an `invalidArgument` AiocsError when the canary check's path is not
 * accepted by `isIncluded` for this spec; returns normally otherwise.
 */
function assertCanaryPathInScope(spec, check) {
  const inScope = isIncluded(spec, check.path);
  if (inScope) {
    return;
  }
  throw new AiocsError(
    AIOCS_ERROR_CODES.invalidArgument,
    `Git canary path '${check.path}' is outside the configured include/exclude scope`
  );
}
2438
/**
 * Loads the file targeted by a canary check at `commitSha` and returns it as
 * a page-shaped record (url, title, markdown, pageKind, filePath, language).
 *
 * @throws AiocsError when the path is out of scope, and a plain Error when
 *   the file content contains a NUL byte.
 */
async function readCanaryTarget(spec, mirrorDir, commitSha, check, env) {
  assertCanaryPathInScope(spec, check);
  const raw = await readRepoFile(mirrorDir, commitSha, check.path, env);
  const looksBinary = isProbablyBinary(raw);
  if (looksBinary) {
    throw new Error(`Canary target '${check.path}' is binary`);
  }
  const target = {
    url: buildRepoFileUrl(spec, check.path),
    title: check.path,
    markdown: raw.toString("utf8").trimEnd(),
    pageKind: "file",
    filePath: check.path,
    language: detectLanguage(check.path)
  };
  return target;
}
2453
/**
 * Fetches a git-backed source: refreshes the mirror, resolves the configured
 * ref to a commit, materializes the in-scope files as pages and records them
 * as a snapshot (using the commit id as both detected version and revision
 * key). Snapshot files are written to disk only when the catalog did not
 * reuse an existing snapshot.
 *
 * @returns `{ snapshotId, pageCount, reused, detectedVersion }`.
 * @throws AiocsError `sourceNotFound` when the id is unknown or not a git
 *   source, `noPagesFetched` when no text files were produced.
 */
async function fetchGitSource(input) {
  const spec = input.catalog.getSourceSpec(input.sourceId);
  const isKnownGitSource = Boolean(spec) && isGitSourceSpec(spec);
  if (!isKnownGitSource) {
    throw new AiocsError(
      AIOCS_ERROR_CODES.sourceNotFound,
      `Unknown git source '${input.sourceId}'`
    );
  }
  const env = input.env ?? process.env;
  const mirrorDir = await ensureGitMirror(spec, input.dataDir, env);
  const commitSha = await resolveGitCommit(mirrorDir, spec.repo.ref, env);
  const pages = await materializeGitPages(spec, mirrorDir, commitSha, env);
  if (pages.length === 0) {
    throw new AiocsError(
      AIOCS_ERROR_CODES.noPagesFetched,
      `No text files fetched for git source '${input.sourceId}'`
    );
  }
  const snapshot = input.catalog.recordSuccessfulSnapshot({
    sourceId: input.sourceId,
    detectedVersion: commitSha,
    revisionKey: commitSha,
    pages
  });
  const { snapshotId, reused } = snapshot;
  if (!reused) {
    persistGitSnapshotFiles(input, snapshotId, pages);
  }
  return {
    snapshotId,
    pageCount: pages.length,
    reused,
    detectedVersion: commitSha
  };
}
2487
/**
 * Runs the canary checks of a git-backed source against the current commit of
 * its configured ref, records the outcome in the catalog and returns it.
 *
 * For each configured check the target file is read and validated against the
 * optional `expectedTitle`, optional `expectedText` and `minContentLength`.
 * A failing check is captured in the result rather than aborting the run.
 *
 * @param input `{ catalog, sourceId, env?, dataDir? }`. When `dataDir` is
 *   omitted, `<home>/.aiocs/data` is used, where `<home>` comes from
 *   `os.homedir()` so the fallback is an absolute path even when `HOME` is
 *   not set (e.g. on Windows).
 * @returns The canary result (`sourceId`, `status`, `checkedAt`, `summary`, `checks`).
 * @throws AiocsError `sourceNotFound` for unknown / non-git sources and
 *   `canaryFailed` (with the result as details) when any check failed.
 */
async function runGitSourceCanary(input) {
  const spec = input.catalog.getSourceSpec(input.sourceId);
  if (!spec || !isGitSourceSpec(spec)) {
    throw new AiocsError(
      AIOCS_ERROR_CODES.sourceNotFound,
      `Unknown git source '${input.sourceId}'`
    );
  }
  const env = input.env ?? process.env;
  const dataDir = input.dataDir ?? join4(homedir(), ".aiocs", "data");
  const mirrorDir = await ensureGitMirror(spec, dataDir, env);
  const commitSha = await resolveGitCommit(mirrorDir, spec.repo.ref, env);
  const canary = resolveSourceCanary(spec);
  const checkedAt = nowIso2();
  const checks = [];
  for (const check of canary.checks) {
    try {
      const page = await readCanaryTarget(spec, mirrorDir, commitSha, check, env);
      if (check.expectedTitle && !page.title.includes(check.expectedTitle)) {
        throw new Error(`Expected title to include '${check.expectedTitle}'`);
      }
      if (check.expectedText && !page.markdown.includes(check.expectedText)) {
        throw new Error(`Expected markdown to include '${check.expectedText}'`);
      }
      const markdownLength = page.markdown.trim().length;
      if (markdownLength < check.minContentLength) {
        throw new Error(`Expected content length to be at least ${check.minContentLength}`);
      }
      checks.push({
        path: check.path,
        status: "pass",
        title: page.title,
        markdownLength
      });
    } catch (error) {
      // A single failing check is reported, not propagated, so every check runs.
      checks.push({
        path: check.path,
        status: "fail",
        errorMessage: error instanceof Error ? error.message : String(error)
      });
    }
  }
  const passCount = checks.filter((check) => check.status === "pass").length;
  const failCount = checks.length - passCount;
  const status = failCount > 0 ? "fail" : "pass";
  const result = {
    sourceId: input.sourceId,
    status,
    checkedAt,
    summary: {
      checkCount: checks.length,
      passCount,
      failCount
    },
    checks
  };
  // Record the run before throwing so failures are visible in the catalog too.
  input.catalog.recordCanaryRun({
    sourceId: input.sourceId,
    status,
    checkedAt,
    details: result
  });
  if (status === "fail") {
    throw new AiocsError(
      AIOCS_ERROR_CODES.canaryFailed,
      `Git source canary failed for '${input.sourceId}'`,
      result
    );
  }
  return result;
}
1774
2557
 
1775
2558
  // src/fetch/extract.ts
1776
2559
  import { JSDOM } from "jsdom";
@@ -1952,36 +2735,10 @@ async function extractPage(page, strategy) {
1952
2735
  return runReadabilityStrategy(page);
1953
2736
  }
1954
2737
 
1955
- // src/fetch/url-patterns.ts
1956
- function escapeRegex(value) {
1957
- return value.replace(/[|\\{}()[\]^$+?.]/g, "\\$&");
1958
- }
1959
- function patternToRegex(pattern) {
1960
- let regex = "^";
1961
- for (let index = 0; index < pattern.length; index += 1) {
1962
- const current = pattern[index];
1963
- const next = pattern[index + 1];
1964
- if (current === "*" && next === "*") {
1965
- regex += ".*";
1966
- index += 1;
1967
- continue;
1968
- }
1969
- if (current === "*") {
1970
- regex += "[^?#]*";
1971
- continue;
1972
- }
1973
- regex += escapeRegex(current ?? "");
1974
- }
1975
- return new RegExp(`${regex}$`);
1976
- }
1977
- function matchesPatterns(url, patterns) {
1978
- return patterns.some((pattern) => patternToRegex(pattern).test(url));
1979
- }
1980
-
1981
2738
  // src/fetch/fetch-source.ts
1982
2739
  var MAX_FETCH_ATTEMPTS = 3;
1983
2740
  var RETRY_DELAY_MS = 250;
1984
- function nowIso2() {
2741
+ function nowIso3() {
1985
2742
  return (/* @__PURE__ */ new Date()).toISOString();
1986
2743
  }
1987
2744
  function canonicalizeUrl(raw) {
@@ -2051,14 +2808,14 @@ async function extractRawMarkdownPage(url, response) {
2051
2808
  };
2052
2809
  }
2053
2810
  function persistSnapshotPages(input, snapshotId, pages) {
2054
- const snapshotDir = join3(input.dataDir, "sources", input.sourceId, "snapshots", snapshotId, "pages");
2055
- mkdirSync3(snapshotDir, { recursive: true });
2811
+ const snapshotDir = join5(input.dataDir, "sources", input.sourceId, "snapshots", snapshotId, "pages");
2812
+ mkdirSync4(snapshotDir, { recursive: true });
2056
2813
  pages.forEach((page, index) => {
2057
2814
  const filename = `${String(index + 1).padStart(3, "0")}-${slugify(page.title)}.md`;
2058
- writeFileSync(join3(snapshotDir, filename), page.markdown, "utf8");
2815
+ writeFileSync2(join5(snapshotDir, filename), page.markdown, "utf8");
2059
2816
  });
2060
2817
  }
2061
- function resolveEnvValue(name, env) {
2818
+ function resolveEnvValue2(name, env) {
2062
2819
  const value = env[name];
2063
2820
  if (!value) {
2064
2821
  throw new AiocsError(
@@ -2074,13 +2831,13 @@ function resolveEnvValue(name, env) {
2074
2831
  function resolveSourceAuth(spec, env) {
2075
2832
  const scopedHeaders = (spec.auth?.headers ?? []).map((header) => ({
2076
2833
  name: header.name,
2077
- value: resolveEnvValue(header.valueFromEnv, env),
2834
+ value: resolveEnvValue2(header.valueFromEnv, env),
2078
2835
  hosts: header.hosts ?? spec.allowedHosts,
2079
2836
  ...header.include ? { include: header.include } : {}
2080
2837
  }));
2081
2838
  const cookies = (spec.auth?.cookies ?? []).map((cookie) => ({
2082
2839
  name: cookie.name,
2083
- value: resolveEnvValue(cookie.valueFromEnv, env),
2840
+ value: resolveEnvValue2(cookie.valueFromEnv, env),
2084
2841
  domain: cookie.domain,
2085
2842
  path: cookie.path,
2086
2843
  ...typeof cookie.secure === "boolean" ? { secure: cookie.secure } : {},
@@ -2179,6 +2936,14 @@ async function fetchSourceOnce(input) {
2179
2936
  `Unknown source '${input.sourceId}'`
2180
2937
  );
2181
2938
  }
2939
+ if (isGitSourceSpec(spec)) {
2940
+ const result = await fetchGitSource(input);
2941
+ return {
2942
+ snapshotId: result.snapshotId,
2943
+ pageCount: result.pageCount,
2944
+ reused: result.reused
2945
+ };
2946
+ }
2182
2947
  const session = await createSourceContext(spec, input.env ?? process.env);
2183
2948
  const { page } = session;
2184
2949
  const queue = spec.startUrls.map((url) => canonicalizeUrl(url));
@@ -2305,6 +3070,9 @@ async function runSourceCanaryOnce(input) {
2305
3070
  `Unknown source '${input.sourceId}'`
2306
3071
  );
2307
3072
  }
3073
+ if (isGitSourceSpec(spec)) {
3074
+ return runGitSourceCanary(input);
3075
+ }
2308
3076
  const canary = resolveSourceCanary(spec);
2309
3077
  const session = await createSourceContext(spec, input.env ?? process.env);
2310
3078
  const { page } = session;
@@ -2355,7 +3123,7 @@ async function runSourceCanaryOnce(input) {
2355
3123
  const result = {
2356
3124
  sourceId: input.sourceId,
2357
3125
  status: checks.every((check) => check.status === "pass") ? "pass" : "fail",
2358
- checkedAt: nowIso2(),
3126
+ checkedAt: nowIso3(),
2359
3127
  summary: {
2360
3128
  checkCount: checks.length,
2361
3129
  passCount: checks.filter((check) => check.status === "pass").length,
@@ -2404,6 +3172,30 @@ function getEmbeddingModelKey(config) {
2404
3172
  function normalizeBaseUrl(baseUrl) {
2405
3173
  return baseUrl.endsWith("/") ? baseUrl.slice(0, -1) : baseUrl;
2406
3174
  }
3175
/**
 * Collapses every run of whitespace into a single space and removes leading
 * and trailing whitespace.
 */
function normalizeEmbeddingWhitespace(value) {
  const words = value.trim().split(/\s+/);
  return words.join(" ");
}
3178
/**
 * Shortens `value` to at most `maxChars` UTF-16 code units.
 *
 * Text that already fits is returned unchanged. Otherwise the text is cut at
 * `maxChars`; if a space occurs within the last 20% of the allowed length the
 * cut is moved back to that space so a word is not split. The result is
 * trimmed.
 *
 * A cut that would separate a surrogate pair drops the dangling high
 * surrogate, so the returned string never ends in half of a character.
 */
function truncateEmbeddingText(value, maxChars) {
  if (value.length <= maxChars) {
    return value;
  }
  let slice = value.slice(0, maxChars);
  const lastCodeUnit = slice.charCodeAt(slice.length - 1);
  // 0xD800-0xDBFF is the high-surrogate range; one at the end has lost its pair.
  if (lastCodeUnit >= 0xd800 && lastCodeUnit <= 0xdbff) {
    slice = slice.slice(0, -1);
  }
  const lastWhitespace = slice.lastIndexOf(" ");
  if (lastWhitespace >= Math.floor(maxChars * 0.8)) {
    return slice.slice(0, lastWhitespace).trim();
  }
  return slice.trim();
}
3189
/**
 * Reduces markdown to plain text for the embedding model and caps its length.
 *
 * Applied in order: HTML comments become a space, images and links are
 * replaced by their bracketed text, remaining `<...>` tags become a space,
 * code-fence markers are replaced by newlines, inline-code backticks are
 * removed, whitespace is normalized, and the result is truncated to `maxChars`.
 */
function prepareTextForEmbedding(markdown, maxChars) {
  // Order matters: images must be handled before links, comments before tags.
  const rewriteSteps = [
    [/<!--[\s\S]*?-->/g, " "],
    [/!\[([^\]]*)\]\(([^)]+)\)/g, "$1"],
    [/\[([^\]]+)\]\(([^)]+)\)/g, "$1"],
    [/<[^>]+>/g, " "],
    [/```[^\n]*\n/g, "\n"],
    [/```/g, "\n"],
    [/`([^`]+)`/g, "$1"]
  ];
  let text = markdown;
  for (const [pattern, replacement] of rewriteSteps) {
    text = text.replace(pattern, replacement);
  }
  const normalized = normalizeEmbeddingWhitespace(text);
  return truncateEmbeddingText(normalized, maxChars);
}
2407
3199
  async function parseJsonResponse(response) {
2408
3200
  const text = await response.text();
2409
3201
  if (!text) {
@@ -2422,6 +3214,7 @@ async function embedTexts(config, texts) {
2422
3214
  if (texts.length === 0) {
2423
3215
  return [];
2424
3216
  }
3217
+ const preparedTexts = texts.map((text) => prepareTextForEmbedding(text, config.ollamaMaxInputChars));
2425
3218
  const response = await fetch(`${normalizeBaseUrl(config.ollamaBaseUrl)}/api/embed`, {
2426
3219
  method: "POST",
2427
3220
  headers: {
@@ -2430,7 +3223,7 @@ async function embedTexts(config, texts) {
2430
3223
  signal: AbortSignal.timeout(config.ollamaTimeoutMs),
2431
3224
  body: JSON.stringify({
2432
3225
  model: config.ollamaEmbeddingModel,
2433
- input: texts
3226
+ input: preparedTexts
2434
3227
  })
2435
3228
  }).catch((error) => {
2436
3229
  throw new AiocsError(
@@ -2573,6 +3366,9 @@ var AiocsVectorStore = class {
2573
3366
  pageUrl: point.pageUrl,
2574
3367
  pageTitle: point.pageTitle,
2575
3368
  sectionTitle: point.sectionTitle,
3369
+ pageKind: point.pageKind,
3370
+ filePath: point.filePath,
3371
+ language: point.language,
2576
3372
  modelKey: input.modelKey
2577
3373
  }
2578
3374
  }));
@@ -2785,7 +3581,10 @@ async function processEmbeddingJobs(input) {
2785
3581
  snapshotId: chunk.snapshotId,
2786
3582
  pageUrl: chunk.pageUrl,
2787
3583
  pageTitle: chunk.pageTitle,
2788
- sectionTitle: chunk.sectionTitle
3584
+ sectionTitle: chunk.sectionTitle,
3585
+ pageKind: chunk.pageKind,
3586
+ filePath: chunk.filePath,
3587
+ language: chunk.language
2789
3588
  }))
2790
3589
  });
2791
3590
  indexedChunkIds.push(...batch.map((chunk) => chunk.chunkId));
@@ -2823,29 +3622,6 @@ async function processEmbeddingJobs(input) {
2823
3622
  };
2824
3623
  }
2825
3624
 
2826
- // src/runtime/bundled-sources.ts
2827
- import { existsSync } from "fs";
2828
- import { dirname, join as join4 } from "path";
2829
- import { fileURLToPath } from "url";
2830
- function findPackageRoot(startDir) {
2831
- let currentDir = startDir;
2832
- while (true) {
2833
- if (existsSync(join4(currentDir, "package.json")) && existsSync(join4(currentDir, "sources"))) {
2834
- return currentDir;
2835
- }
2836
- const parentDir = dirname(currentDir);
2837
- if (parentDir === currentDir) {
2838
- throw new Error(`Could not locate aiocs package root from ${startDir}`);
2839
- }
2840
- currentDir = parentDir;
2841
- }
2842
- }
2843
- function getBundledSourcesDir() {
2844
- const currentFilePath = fileURLToPath(import.meta.url);
2845
- const packageRoot = findPackageRoot(dirname(currentFilePath));
2846
- return join4(packageRoot, "sources");
2847
- }
2848
-
2849
3625
  // src/runtime/hybrid-config.ts
2850
3626
  function parsePositiveInteger(value, field, fallback) {
2851
3627
  if (typeof value === "undefined" || value.trim() === "") {
@@ -2888,7 +3664,8 @@ function getHybridRuntimeConfig(env = process.env) {
2888
3664
  embeddingProvider: "ollama",
2889
3665
  ollamaBaseUrl: env.AIOCS_OLLAMA_BASE_URL ?? "http://127.0.0.1:11434",
2890
3666
  ollamaEmbeddingModel: env.AIOCS_OLLAMA_EMBEDDING_MODEL ?? "nomic-embed-text",
2891
- ollamaTimeoutMs: parsePositiveInteger(env.AIOCS_OLLAMA_TIMEOUT_MS, "AIOCS_OLLAMA_TIMEOUT_MS", 1e3),
3667
+ ollamaTimeoutMs: parsePositiveInteger(env.AIOCS_OLLAMA_TIMEOUT_MS, "AIOCS_OLLAMA_TIMEOUT_MS", 1e4),
3668
+ ollamaMaxInputChars: parsePositiveInteger(env.AIOCS_OLLAMA_MAX_INPUT_CHARS, "AIOCS_OLLAMA_MAX_INPUT_CHARS", 4e3),
2892
3669
  embeddingBatchSize: parsePositiveInteger(env.AIOCS_EMBEDDING_BATCH_SIZE, "AIOCS_EMBEDDING_BATCH_SIZE", 32),
2893
3670
  embeddingJobsPerCycle: parsePositiveInteger(env.AIOCS_EMBEDDING_JOB_LIMIT_PER_CYCLE, "AIOCS_EMBEDDING_JOB_LIMIT_PER_CYCLE", 2),
2894
3671
  lexicalCandidateWindow: parsePositiveInteger(env.AIOCS_LEXICAL_CANDIDATE_WINDOW, "AIOCS_LEXICAL_CANDIDATE_WINDOW", 40),
@@ -2900,13 +3677,13 @@ function getHybridRuntimeConfig(env = process.env) {
2900
3677
  // src/spec/source-spec-files.ts
2901
3678
  import { access, readdir } from "fs/promises";
2902
3679
  import { constants as fsConstants } from "fs";
2903
- import { extname as extname2, join as join5, resolve as resolve3 } from "path";
3680
+ import { extname as extname2, join as join6, resolve as resolve4 } from "path";
2904
3681
  var SOURCE_SPEC_EXTENSIONS = /* @__PURE__ */ new Set([".yaml", ".yml", ".json"]);
2905
3682
  function uniqueResolvedPaths(paths) {
2906
3683
  const seen = /* @__PURE__ */ new Set();
2907
3684
  const unique = [];
2908
3685
  for (const rawPath of paths) {
2909
- const normalized = resolve3(rawPath);
3686
+ const normalized = resolve4(rawPath);
2910
3687
  if (seen.has(normalized)) {
2911
3688
  continue;
2912
3689
  }
@@ -2927,7 +3704,7 @@ async function walkSourceSpecFiles(rootDir) {
2927
3704
  const entries = await readdir(rootDir, { withFileTypes: true });
2928
3705
  const discovered = [];
2929
3706
  for (const entry of entries.sort((left, right) => left.name.localeCompare(right.name))) {
2930
- const entryPath = join5(rootDir, entry.name);
3707
+ const entryPath = join6(rootDir, entry.name);
2931
3708
  if (entry.isDirectory()) {
2932
3709
  discovered.push(...await walkSourceSpecFiles(entryPath));
2933
3710
  continue;
@@ -2947,7 +3724,7 @@ var DEFAULT_INTERVAL_MINUTES = 60;
2947
3724
  var DEFAULT_CONTAINER_SOURCE_DIR = "/app/sources";
2948
3725
  var BOOLEAN_TRUE_VALUES = /* @__PURE__ */ new Set(["1", "true", "yes", "on"]);
2949
3726
  var BOOLEAN_FALSE_VALUES = /* @__PURE__ */ new Set(["0", "false", "no", "off"]);
2950
- function nowIso3() {
3727
+ function nowIso4() {
2951
3728
  return (/* @__PURE__ */ new Date()).toISOString();
2952
3729
  }
2953
3730
  function parsePositiveInteger2(raw, variableName) {
@@ -2973,10 +3750,11 @@ function parseBoolean(raw, variableName) {
2973
3750
  function parseDaemonConfig(env, options = {}) {
2974
3751
  const intervalMinutes = env.AIOCS_DAEMON_INTERVAL_MINUTES ? parsePositiveInteger2(env.AIOCS_DAEMON_INTERVAL_MINUTES, "AIOCS_DAEMON_INTERVAL_MINUTES") : DEFAULT_INTERVAL_MINUTES;
2975
3752
  const fetchOnStart = env.AIOCS_DAEMON_FETCH_ON_START ? parseBoolean(env.AIOCS_DAEMON_FETCH_ON_START, "AIOCS_DAEMON_FETCH_ON_START") : true;
3753
+ const defaultContainerSourceDir = options.containerSourceDir ?? (existsSync3(DEFAULT_CONTAINER_SOURCE_DIR) ? DEFAULT_CONTAINER_SOURCE_DIR : void 0);
2976
3754
  const defaultSourceDirs = uniqueResolvedPaths([
2977
3755
  options.bundledSourceDir ?? getBundledSourcesDir(),
2978
3756
  options.userSourceDir ?? getAiocsSourcesDir(env),
2979
- options.containerSourceDir ?? DEFAULT_CONTAINER_SOURCE_DIR
3757
+ ...defaultContainerSourceDir ? [defaultContainerSourceDir] : []
2980
3758
  ]);
2981
3759
  const sourceSpecDirs = env.AIOCS_SOURCE_SPEC_DIRS ? uniqueResolvedPaths(
2982
3760
  env.AIOCS_SOURCE_SPEC_DIRS.split(",").map((entry) => entry.trim()).filter(Boolean)
@@ -3023,7 +3801,7 @@ async function bootstrapSourceSpecs(input) {
3023
3801
  throw new Error(`No source spec files found in configured directories: ${normalizedSourceSpecDirs.join(", ")}`);
3024
3802
  }
3025
3803
  const removedSourceIds = input.catalog.removeManagedSources({
3026
- managedRoots: existingDirs.map((sourceSpecDir) => resolve4(sourceSpecDir)),
3804
+ managedRoots: existingDirs.map((sourceSpecDir) => resolve5(sourceSpecDir)),
3027
3805
  activeSources: sources.map((source) => ({
3028
3806
  sourceId: source.sourceId,
3029
3807
  specPath: source.specPath
@@ -3036,7 +3814,7 @@ async function bootstrapSourceSpecs(input) {
3036
3814
  };
3037
3815
  }
3038
3816
  async function runDaemonCycle(input) {
3039
- const startedAt = nowIso3();
3817
+ const startedAt = nowIso4();
3040
3818
  const bootstrapped = await bootstrapSourceSpecs({
3041
3819
  catalog: input.catalog,
3042
3820
  sourceSpecDirs: input.sourceSpecDirs,
@@ -3066,6 +3844,7 @@ async function runDaemonCycle(input) {
3066
3844
  const result = await runSourceCanary({
3067
3845
  catalog: input.catalog,
3068
3846
  sourceId,
3847
+ dataDir: input.dataDir,
3069
3848
  env: process.env
3070
3849
  });
3071
3850
  canaried.push({
@@ -3124,7 +3903,7 @@ async function runDaemonCycle(input) {
3124
3903
  }
3125
3904
  return {
3126
3905
  startedAt,
3127
- finishedAt: nowIso3(),
3906
+ finishedAt: nowIso4(),
3128
3907
  dueSourceIds,
3129
3908
  canaryDueSourceIds,
3130
3909
  bootstrapped,
@@ -3140,7 +3919,7 @@ async function startDaemon(input) {
3140
3919
  const intervalMs = input.config.intervalMinutes * 6e4;
3141
3920
  input.catalog.resetRunningEmbeddingJobs();
3142
3921
  input.catalog.markDaemonStarted({
3143
- startedAt: nowIso3(),
3922
+ startedAt: nowIso4(),
3144
3923
  intervalMinutes: input.config.intervalMinutes,
3145
3924
  fetchOnStart: input.config.fetchOnStart
3146
3925
  });
@@ -3151,7 +3930,7 @@ async function startDaemon(input) {
3151
3930
  sourceSpecDirs: input.config.sourceSpecDirs
3152
3931
  });
3153
3932
  const runCycle = async (reason) => {
3154
- const startedAt = nowIso3();
3933
+ const startedAt = nowIso4();
3155
3934
  input.catalog.markDaemonCycleStarted(startedAt);
3156
3935
  input.logger.emit({
3157
3936
  type: "daemon.cycle.started",
@@ -3177,7 +3956,7 @@ async function startDaemon(input) {
3177
3956
  });
3178
3957
  } catch (error) {
3179
3958
  input.catalog.markDaemonCycleCompleted({
3180
- completedAt: nowIso3(),
3959
+ completedAt: nowIso4(),
3181
3960
  status: "failed"
3182
3961
  });
3183
3962
  throw error;
@@ -3208,7 +3987,7 @@ async function startDaemon(input) {
3208
3987
  // package.json
3209
3988
  var package_default = {
3210
3989
  name: "@bodhi-ventures/aiocs",
3211
- version: "0.1.1",
3990
+ version: "0.2.0",
3212
3991
  license: "MIT",
3213
3992
  type: "module",
3214
3993
  description: "Local-only documentation store, fetcher, and search CLI for AI agents.",
@@ -3256,28 +4035,28 @@ var package_default = {
3256
4035
  "test:watch": "vitest"
3257
4036
  },
3258
4037
  dependencies: {
3259
- "@modelcontextprotocol/sdk": "^1.28.0",
3260
- "@mozilla/readability": "^0.6.0",
4038
+ "@modelcontextprotocol/sdk": "1.28.0",
4039
+ "@mozilla/readability": "0.6.0",
3261
4040
  "@qdrant/js-client-rest": "1.17.0",
3262
- "better-sqlite3": "^12.4.1",
3263
- commander: "^14.0.1",
3264
- jsdom: "^27.0.1",
3265
- playwright: "^1.57.0",
3266
- turndown: "^7.2.1",
3267
- "turndown-plugin-gfm": "^1.0.2",
3268
- yaml: "^2.8.1",
3269
- zod: "^4.1.12"
4041
+ "better-sqlite3": "12.4.1",
4042
+ commander: "14.0.1",
4043
+ jsdom: "27.0.1",
4044
+ playwright: "1.57.0",
4045
+ turndown: "7.2.1",
4046
+ "turndown-plugin-gfm": "1.0.2",
4047
+ yaml: "2.8.1",
4048
+ zod: "4.1.12"
3270
4049
  },
3271
4050
  devDependencies: {
3272
- "@types/better-sqlite3": "^7.6.13",
3273
- "@types/jsdom": "^21.1.7",
3274
- "@types/node": "^24.7.2",
3275
- "@types/turndown": "^5.0.5",
3276
- execa: "^9.6.0",
3277
- tsup: "^8.5.0",
3278
- tsx: "^4.20.6",
3279
- typescript: "^5.9.3",
3280
- vitest: "^3.2.4"
4051
+ "@types/better-sqlite3": "7.6.13",
4052
+ "@types/jsdom": "21.1.7",
4053
+ "@types/node": "24.7.2",
4054
+ "@types/turndown": "5.0.5",
4055
+ execa: "9.6.0",
4056
+ tsup: "8.5.0",
4057
+ tsx: "4.20.6",
4058
+ typescript: "5.9.3",
4059
+ vitest: "3.2.4"
3281
4060
  }
3282
4061
  };
3283
4062
 
@@ -3287,11 +4066,11 @@ var packageVersion = package_default.version;
3287
4066
  var packageDescription = package_default.description;
3288
4067
 
3289
4068
  // src/services.ts
3290
- import { resolve as resolve7 } from "path";
4069
+ import { resolve as resolve8 } from "path";
3291
4070
 
3292
4071
  // src/backup.ts
3293
4072
  import { cp, mkdir, readdir as readdir2, readFile as readFile2, rename, rm, stat, writeFile } from "fs/promises";
3294
- import { basename, dirname as dirname2, join as join6, resolve as resolve5 } from "path";
4073
+ import { basename, dirname as dirname3, join as join7, resolve as resolve6 } from "path";
3295
4074
  import { randomUUID as randomUUID2 } from "crypto";
3296
4075
  import Database2 from "better-sqlite3";
3297
4076
  var CATALOG_DB_FILENAME = "catalog.sqlite";
@@ -3319,7 +4098,7 @@ async function isDirectoryEmpty(path) {
3319
4098
  return (await readdir2(path)).length === 0;
3320
4099
  }
3321
4100
  async function listEntries(root, relativePath = "") {
3322
- const absolutePath = relativePath ? join6(root, relativePath) : root;
4101
+ const absolutePath = relativePath ? join7(root, relativePath) : root;
3323
4102
  const stats = await stat(absolutePath);
3324
4103
  if (!stats.isDirectory()) {
3325
4104
  return [{
@@ -3335,7 +4114,7 @@ async function listEntries(root, relativePath = "") {
3335
4114
  size: 0
3336
4115
  }] : [];
3337
4116
  for (const childName of childNames.sort()) {
3338
- entries.push(...await listEntries(root, relativePath ? join6(relativePath, childName) : childName));
4117
+ entries.push(...await listEntries(root, relativePath ? join7(relativePath, childName) : childName));
3339
4118
  }
3340
4119
  return entries;
3341
4120
  }
@@ -3349,12 +4128,12 @@ async function copyIfPresent(from, to, entries, relativePrefix) {
3349
4128
  entries.push(
3350
4129
  ...copiedEntries.map((entry) => ({
3351
4130
  ...entry,
3352
- relativePath: join6(relativePrefix, entry.relativePath)
4131
+ relativePath: join7(relativePrefix, entry.relativePath)
3353
4132
  }))
3354
4133
  );
3355
4134
  }
3356
4135
  async function copyDataDirForBackup(from, to) {
3357
- const sourceCatalogPath = join6(from, CATALOG_DB_FILENAME);
4136
+ const sourceCatalogPath = join7(from, CATALOG_DB_FILENAME);
3358
4137
  if (!await pathExists2(sourceCatalogPath)) {
3359
4138
  throw new AiocsError(
3360
4139
  AIOCS_ERROR_CODES.backupSourceMissing,
@@ -3370,10 +4149,13 @@ async function copyDataDirForBackup(from, to) {
3370
4149
  if (name === CATALOG_DB_FILENAME) {
3371
4150
  return false;
3372
4151
  }
4152
+ if (name === "git-mirrors") {
4153
+ return false;
4154
+ }
3373
4155
  return !SQLITE_SIDE_CAR_SUFFIXES.some((suffix) => name === `${CATALOG_DB_FILENAME}${suffix}`);
3374
4156
  }
3375
4157
  });
3376
- const targetCatalogPath = join6(to, CATALOG_DB_FILENAME);
4158
+ const targetCatalogPath = join7(to, CATALOG_DB_FILENAME);
3377
4159
  const sourceCatalog = new Database2(sourceCatalogPath, { readonly: true });
3378
4160
  try {
3379
4161
  await sourceCatalog.backup(targetCatalogPath);
@@ -3382,7 +4164,7 @@ async function copyDataDirForBackup(from, to) {
3382
4164
  }
3383
4165
  }
3384
4166
  async function loadValidatedBackupPayload(inputDir) {
3385
- const manifestPath = join6(inputDir, "manifest.json");
4167
+ const manifestPath = join7(inputDir, "manifest.json");
3386
4168
  await assertSourceDirExists(inputDir);
3387
4169
  if (!await pathExists2(manifestPath)) {
3388
4170
  throw new AiocsError(
@@ -3397,21 +4179,21 @@ async function loadValidatedBackupPayload(inputDir) {
3397
4179
  `Invalid backup manifest: ${manifestPath}`
3398
4180
  );
3399
4181
  }
3400
- const backupDataDir = join6(inputDir, "data");
4182
+ const backupDataDir = join7(inputDir, "data");
3401
4183
  if (!await pathExists2(backupDataDir)) {
3402
4184
  throw new AiocsError(
3403
4185
  AIOCS_ERROR_CODES.backupInvalid,
3404
4186
  `Backup payload is missing the data directory: ${backupDataDir}`
3405
4187
  );
3406
4188
  }
3407
- const backupCatalogPath = join6(backupDataDir, CATALOG_DB_FILENAME);
4189
+ const backupCatalogPath = join7(backupDataDir, CATALOG_DB_FILENAME);
3408
4190
  if (!await pathExists2(backupCatalogPath)) {
3409
4191
  throw new AiocsError(
3410
4192
  AIOCS_ERROR_CODES.backupInvalid,
3411
4193
  `Backup payload is missing the catalog database: ${backupCatalogPath}`
3412
4194
  );
3413
4195
  }
3414
- const backupConfigDir = join6(inputDir, "config");
4196
+ const backupConfigDir = join7(inputDir, "config");
3415
4197
  return {
3416
4198
  manifest,
3417
4199
  backupDataDir,
@@ -3419,17 +4201,17 @@ async function loadValidatedBackupPayload(inputDir) {
3419
4201
  };
3420
4202
  }
3421
4203
  async function prepareReplacementTarget(backupDir, targetDir) {
3422
- const parentDir = dirname2(targetDir);
3423
- const stagingDir = join6(parentDir, `.${basename(targetDir)}.import-${randomUUID2()}`);
4204
+ const parentDir = dirname3(targetDir);
4205
+ const stagingDir = join7(parentDir, `.${basename(targetDir)}.import-${randomUUID2()}`);
3424
4206
  await rm(stagingDir, { recursive: true, force: true });
3425
4207
  await mkdir(parentDir, { recursive: true });
3426
4208
  await cp(backupDir, stagingDir, { recursive: true, force: true });
3427
4209
  return stagingDir;
3428
4210
  }
3429
4211
  async function exportBackup(input) {
3430
- const dataDir = resolve5(input.dataDir);
3431
- const outputDir = resolve5(input.outputDir);
3432
- const configDir = input.configDir ? resolve5(input.configDir) : void 0;
4212
+ const dataDir = resolve6(input.dataDir);
4213
+ const outputDir = resolve6(input.outputDir);
4214
+ const configDir = input.configDir ? resolve6(input.configDir) : void 0;
3433
4215
  await assertSourceDirExists(dataDir);
3434
4216
  if (!await isDirectoryEmpty(outputDir)) {
3435
4217
  if (!input.replaceExisting) {
@@ -3442,13 +4224,13 @@ async function exportBackup(input) {
3442
4224
  }
3443
4225
  await mkdir(outputDir, { recursive: true });
3444
4226
  const entries = [];
3445
- await copyDataDirForBackup(dataDir, join6(outputDir, "data"));
3446
- entries.push(...(await listEntries(join6(outputDir, "data"))).map((entry) => ({
4227
+ await copyDataDirForBackup(dataDir, join7(outputDir, "data"));
4228
+ entries.push(...(await listEntries(join7(outputDir, "data"))).map((entry) => ({
3447
4229
  ...entry,
3448
- relativePath: join6("data", entry.relativePath)
4230
+ relativePath: join7("data", entry.relativePath)
3449
4231
  })));
3450
4232
  if (configDir) {
3451
- await copyIfPresent(configDir, join6(outputDir, "config"), entries, "config");
4233
+ await copyIfPresent(configDir, join7(outputDir, "config"), entries, "config");
3452
4234
  }
3453
4235
  const manifest = {
3454
4236
  formatVersion: 1,
@@ -3456,7 +4238,7 @@ async function exportBackup(input) {
3456
4238
  packageVersion,
3457
4239
  entries
3458
4240
  };
3459
- const manifestPath = join6(outputDir, "manifest.json");
4241
+ const manifestPath = join7(outputDir, "manifest.json");
3460
4242
  await writeFile(manifestPath, JSON.stringify(manifest, null, 2), "utf8");
3461
4243
  return {
3462
4244
  outputDir,
@@ -3465,9 +4247,9 @@ async function exportBackup(input) {
3465
4247
  };
3466
4248
  }
3467
4249
  async function importBackup(input) {
3468
- const inputDir = resolve5(input.inputDir);
3469
- const dataDir = resolve5(input.dataDir);
3470
- const configDir = input.configDir ? resolve5(input.configDir) : void 0;
4250
+ const inputDir = resolve6(input.inputDir);
4251
+ const dataDir = resolve6(input.dataDir);
4252
+ const configDir = input.configDir ? resolve6(input.configDir) : void 0;
3471
4253
  const { manifest, backupDataDir, backupConfigDir } = await loadValidatedBackupPayload(inputDir);
3472
4254
  if (!await isDirectoryEmpty(dataDir)) {
3473
4255
  if (!input.replaceExisting) {
@@ -3511,7 +4293,7 @@ async function importBackup(input) {
3511
4293
 
3512
4294
  // src/coverage.ts
3513
4295
  import { readFile as readFile3 } from "fs/promises";
3514
- import { resolve as resolve6 } from "path";
4296
+ import { resolve as resolve7 } from "path";
3515
4297
  function normalizeText(value) {
3516
4298
  return value.replace(/[`*_~]+/g, "").replace(/\s+/g, " ").trim().toLowerCase();
3517
4299
  }
@@ -3560,7 +4342,7 @@ async function verifyCoverageAgainstReferences(corpus, referenceFiles) {
3560
4342
  body: 0
3561
4343
  };
3562
4344
  for (const referenceFile of referenceFiles) {
3563
- const resolvedReferenceFile = resolve6(referenceFile);
4345
+ const resolvedReferenceFile = resolve7(referenceFile);
3564
4346
  let raw;
3565
4347
  try {
3566
4348
  raw = await readFile3(resolvedReferenceFile, "utf8");
@@ -3632,9 +4414,9 @@ async function verifyCoverageAgainstReferences(corpus, referenceFiles) {
3632
4414
 
3633
4415
  // src/doctor.ts
3634
4416
  import { access as access2 } from "fs/promises";
3635
- import { execFile } from "child_process";
3636
- import { promisify } from "util";
3637
- var execFileAsync = promisify(execFile);
4417
+ import { execFile as execFile2 } from "child_process";
4418
+ import { promisify as promisify2 } from "util";
4419
+ var execFileAsync2 = promisify2(execFile2);
3638
4420
  function summarize(checks) {
3639
4421
  const passCount = checks.filter((check) => check.status === "pass").length;
3640
4422
  const warnCount = checks.filter((check) => check.status === "warn").length;
@@ -3721,6 +4503,25 @@ async function checkPlaywright() {
3721
4503
  };
3722
4504
  }
3723
4505
  }
4506
+ async function checkGit() {
4507
+ try {
4508
+ const { stdout } = await execFileAsync2("git", ["--version"]);
4509
+ return {
4510
+ id: "git",
4511
+ status: "pass",
4512
+ summary: "Git executable is available.",
4513
+ details: {
4514
+ version: stdout.trim()
4515
+ }
4516
+ };
4517
+ } catch (error) {
4518
+ return {
4519
+ id: "git",
4520
+ status: "fail",
4521
+ summary: `Git is not ready: ${toErrorMessage(error)}`
4522
+ };
4523
+ }
4524
+ }
3724
4525
  async function checkDaemonConfig(env) {
3725
4526
  try {
3726
4527
  const daemonConfig = parseDaemonConfig(env, {
@@ -3970,7 +4771,7 @@ async function checkEmbeddings(env) {
3970
4771
  }
3971
4772
  async function checkDocker() {
3972
4773
  try {
3973
- const { stdout } = await execFileAsync("docker", ["info", "--format", "{{json .ServerVersion}}"]);
4774
+ const { stdout } = await execFileAsync2("docker", ["info", "--format", "{{json .ServerVersion}}"]);
3974
4775
  const version = JSON.parse(stdout.trim());
3975
4776
  return {
3976
4777
  id: "docker",
@@ -3986,7 +4787,7 @@ async function checkDocker() {
3986
4787
  return {
3987
4788
  id: "docker",
3988
4789
  status: "warn",
3989
- summary: "Docker CLI is not installed; Docker-based daemon deployment is unavailable on this machine."
4790
+ summary: "Docker CLI is not installed; Docker-based daemon deployment is unavailable in this environment."
3990
4791
  };
3991
4792
  }
3992
4793
  return {
@@ -3998,6 +4799,7 @@ async function checkDocker() {
3998
4799
  }
3999
4800
  async function runDoctor(env = process.env) {
4000
4801
  const catalogCheck = await checkCatalog(env);
4802
+ const gitCheck = await checkGit();
4001
4803
  const playwrightCheck = await checkPlaywright();
4002
4804
  const { daemonConfigCheck, daemonConfig } = await checkDaemonConfig(env);
4003
4805
  const sourceSpecDirsCheck = await checkSourceSpecDirs(daemonConfig);
@@ -4009,6 +4811,7 @@ async function runDoctor(env = process.env) {
4009
4811
  const dockerCheck = await checkDocker();
4010
4812
  const checks = [
4011
4813
  catalogCheck,
4814
+ gitCheck,
4012
4815
  playwrightCheck,
4013
4816
  daemonConfigCheck,
4014
4817
  sourceSpecDirsCheck,
@@ -4063,6 +4866,19 @@ function withScores(rows, scoreLookup) {
4063
4866
  };
4064
4867
  });
4065
4868
  }
4869
+ function matchesChunkFilters(row, filters) {
4870
+ if (filters.pathPatterns && filters.pathPatterns.length > 0) {
4871
+ if (!row.filePath || !matchesPatterns(row.filePath, filters.pathPatterns)) {
4872
+ return false;
4873
+ }
4874
+ }
4875
+ if (filters.languages && filters.languages.length > 0) {
4876
+ if (!row.language || !filters.languages.includes(row.language.toLowerCase())) {
4877
+ return false;
4878
+ }
4879
+ }
4880
+ return true;
4881
+ }
4066
4882
  async function searchHybridCatalog(input) {
4067
4883
  const scope = input.catalog.resolveSearchScope({
4068
4884
  query: input.query,
@@ -4070,6 +4886,8 @@ async function searchHybridCatalog(input) {
4070
4886
  ...input.searchInput.sourceIds ? { sourceIds: input.searchInput.sourceIds } : {},
4071
4887
  ...input.searchInput.snapshotId ? { snapshotId: input.searchInput.snapshotId } : {},
4072
4888
  ...input.searchInput.all ? { all: true } : {},
4889
+ ...input.searchInput.pathPatterns ? { pathPatterns: input.searchInput.pathPatterns } : {},
4890
+ ...input.searchInput.languages ? { languages: input.searchInput.languages } : {},
4073
4891
  ...typeof input.searchInput.limit === "number" ? { limit: input.searchInput.limit } : {},
4074
4892
  ...typeof input.searchInput.offset === "number" ? { offset: input.searchInput.offset } : {}
4075
4893
  });
@@ -4132,13 +4950,25 @@ async function searchHybridCatalog(input) {
4132
4950
  );
4133
4951
  }
4134
4952
  const vectorStore = new AiocsVectorStore(input.config);
4135
- vectorCandidates = await vectorStore.search({
4953
+ const rawVectorCandidates = await vectorStore.search({
4136
4954
  vector: queryVector,
4137
4955
  snapshotIds: scope.snapshotIds,
4138
4956
  sourceIds: scope.sourceIds,
4139
4957
  modelKey,
4140
4958
  limit: windowSize(scope.limit, scope.offset, input.config.vectorCandidateWindow)
4141
4959
  });
4960
+ if (rawVectorCandidates.length > 0 && (scope.pathPatterns || scope.languages)) {
4961
+ const candidateRows = input.catalog.getChunksByIds(rawVectorCandidates.map((candidate) => candidate.chunkId));
4962
+ const allowedIds = new Set(
4963
+ candidateRows.filter((row) => matchesChunkFilters(row, {
4964
+ pathPatterns: scope.pathPatterns,
4965
+ languages: scope.languages
4966
+ })).map((row) => row.chunkId)
4967
+ );
4968
+ vectorCandidates = rawVectorCandidates.filter((candidate) => allowedIds.has(candidate.chunkId));
4969
+ } else {
4970
+ vectorCandidates = rawVectorCandidates;
4971
+ }
4142
4972
  } catch (error) {
4143
4973
  if (input.mode === "auto") {
4144
4974
  return lexicalOnly();
@@ -4226,7 +5056,7 @@ function withCatalog(run) {
4226
5056
  return Promise.resolve(run(ctx)).finally(() => ctx.catalog.close());
4227
5057
  }
4228
5058
  async function upsertSourceFromSpecFile(specFile) {
4229
- const specPath = resolve7(specFile);
5059
+ const specPath = resolve8(specFile);
4230
5060
  const spec = await loadSourceSpec(specPath);
4231
5061
  const result = await withCatalog(({ catalog }) => catalog.upsertSource(spec, { specPath }));
4232
5062
  return {
@@ -4294,7 +5124,7 @@ async function refreshDueSources(sourceIdOrAll = "all") {
4294
5124
  return { results };
4295
5125
  }
4296
5126
  async function runSourceCanaries(sourceIdOrAll) {
4297
- const results = await withCatalog(async ({ catalog }) => {
5127
+ const results = await withCatalog(async ({ catalog, dataDir }) => {
4298
5128
  const sourceIds = sourceIdOrAll === "all" ? catalog.listSources().map((item) => item.id) : [sourceIdOrAll];
4299
5129
  if (sourceIds.length === 0) {
4300
5130
  return [];
@@ -4304,6 +5134,7 @@ async function runSourceCanaries(sourceIdOrAll) {
4304
5134
  canaried.push(await runSourceCanary({
4305
5135
  catalog,
4306
5136
  sourceId,
5137
+ dataDir,
4307
5138
  env: process.env
4308
5139
  }));
4309
5140
  }
@@ -4322,7 +5153,7 @@ async function diffSnapshotsForSource(input) {
4322
5153
  return withCatalog(({ catalog }) => catalog.diffSnapshots(input));
4323
5154
  }
4324
5155
  async function linkProjectSources(projectPath, sourceIds) {
4325
- const resolvedProjectPath = resolve7(projectPath);
5156
+ const resolvedProjectPath = resolve8(projectPath);
4326
5157
  await withCatalog(({ catalog }) => {
4327
5158
  catalog.linkProject(resolvedProjectPath, sourceIds);
4328
5159
  });
@@ -4332,7 +5163,7 @@ async function linkProjectSources(projectPath, sourceIds) {
4332
5163
  };
4333
5164
  }
4334
5165
  async function unlinkProjectSources(projectPath, sourceIds) {
4335
- const resolvedProjectPath = resolve7(projectPath);
5166
+ const resolvedProjectPath = resolve8(projectPath);
4336
5167
  await withCatalog(({ catalog }) => {
4337
5168
  catalog.unlinkProject(resolvedProjectPath, sourceIds);
4338
5169
  });
@@ -4342,7 +5173,7 @@ async function unlinkProjectSources(projectPath, sourceIds) {
4342
5173
  };
4343
5174
  }
4344
5175
  async function searchCatalog(query, options) {
4345
- const cwd = options.project ? resolve7(options.project) : process.cwd();
5176
+ const cwd = options.project ? resolve8(options.project) : process.cwd();
4346
5177
  const explicitSources = options.source.length > 0;
4347
5178
  const results = await withCatalog(({ catalog }) => {
4348
5179
  const hybridConfig = getHybridRuntimeConfig();
@@ -4363,6 +5194,8 @@ async function searchCatalog(query, options) {
4363
5194
  ...explicitSources ? { sourceIds: options.source } : {},
4364
5195
  ...options.snapshot ? { snapshotId: options.snapshot } : {},
4365
5196
  ...options.all ? { all: true } : {},
5197
+ ...options.path && options.path.length > 0 ? { pathPatterns: options.path } : {},
5198
+ ...options.language && options.language.length > 0 ? { languages: options.language } : {},
4366
5199
  ...typeof options.limit === "number" ? { limit: options.limit } : {},
4367
5200
  ...typeof options.offset === "number" ? { offset: options.offset } : {}
4368
5201
  }
@@ -4508,9 +5341,9 @@ export {
4508
5341
  AIOCS_ERROR_CODES,
4509
5342
  AiocsError,
4510
5343
  toAiocsError,
4511
- openCatalog,
4512
5344
  getAiocsDataDir,
4513
5345
  getAiocsConfigDir,
5346
+ openCatalog,
4514
5347
  parseDaemonConfig,
4515
5348
  startDaemon,
4516
5349
  packageName,