@bodhi-ventures/aiocs 0.1.2 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -150,42 +150,91 @@ import Database from "better-sqlite3";
150
150
 
151
151
  // src/catalog/chunking.ts
152
152
  var MAX_CHUNK_BYTES = 16384;
153
+ var CHUNK_OVERLAP_LINES = 6;
153
154
  var HEADING_PATTERN = /^(#{1,6})\s+(.*)$/;
154
155
  function byteLength(value) {
155
156
  return Buffer.byteLength(value, "utf8");
156
157
  }
158
+ function normalizeLanguage(filePath, language) {
159
+ if (language) {
160
+ return language.toLowerCase();
161
+ }
162
+ if (!filePath) {
163
+ return null;
164
+ }
165
+ const lower = filePath.toLowerCase();
166
+ if (lower.endsWith(".md") || lower.endsWith(".mdx")) {
167
+ return "markdown";
168
+ }
169
+ if (lower.endsWith(".ts")) {
170
+ return "typescript";
171
+ }
172
+ if (lower.endsWith(".tsx")) {
173
+ return "tsx";
174
+ }
175
+ if (lower.endsWith(".js")) {
176
+ return "javascript";
177
+ }
178
+ if (lower.endsWith(".jsx")) {
179
+ return "jsx";
180
+ }
181
+ if (lower.endsWith(".json")) {
182
+ return "json";
183
+ }
184
+ if (lower.endsWith(".yaml") || lower.endsWith(".yml")) {
185
+ return "yaml";
186
+ }
187
+ if (lower.endsWith(".toml")) {
188
+ return "toml";
189
+ }
190
+ if (lower.endsWith(".py")) {
191
+ return "python";
192
+ }
193
+ if (lower.endsWith(".rs")) {
194
+ return "rust";
195
+ }
196
+ if (lower.endsWith(".go")) {
197
+ return "go";
198
+ }
199
+ if (lower.endsWith(".sql")) {
200
+ return "sql";
201
+ }
202
+ if (lower.endsWith(".sh")) {
203
+ return "shell";
204
+ }
205
+ return null;
206
+ }
207
+ function flushChunk(chunks, sectionTitle, current, chunkOrder) {
208
+ const trimmed = current.trim();
209
+ if (!trimmed) {
210
+ return chunkOrder;
211
+ }
212
+ chunks.push({
213
+ sectionTitle,
214
+ markdown: trimmed,
215
+ chunkOrder
216
+ });
217
+ return chunkOrder + 1;
218
+ }
157
219
  function splitLargeSection(sectionTitle, markdown, startOrder) {
158
220
  const lines = markdown.split("\n");
159
221
  const chunks = [];
160
222
  let current = "";
161
223
  let order = startOrder;
162
- const flush = () => {
163
- const trimmed = current.trim();
164
- if (!trimmed) {
165
- current = "";
166
- return;
167
- }
168
- chunks.push({
169
- sectionTitle,
170
- markdown: trimmed,
171
- chunkOrder: order
172
- });
173
- order += 1;
174
- current = "";
175
- };
176
224
  for (const line of lines) {
177
225
  const next = current ? `${current}
178
226
  ${line}` : line;
179
227
  if (current && byteLength(next) > MAX_CHUNK_BYTES) {
180
- flush();
228
+ order = flushChunk(chunks, sectionTitle, current, order);
229
+ current = "";
181
230
  }
182
231
  current = current ? `${current}
183
232
  ${line}` : line;
184
233
  }
185
- flush();
234
+ flushChunk(chunks, sectionTitle, current, order);
186
235
  return chunks;
187
236
  }
188
- function chunkMarkdown(pageTitle, markdown) {
237
+ function chunkMarkdownSectioned(pageTitle, markdown) {
189
238
  const trimmed = markdown.trim();
190
239
  if (!trimmed) {
191
240
  return [];
@@ -233,6 +282,154 @@ function chunkMarkdown(pageTitle, markdown) {
233
282
  }
234
283
  return chunks;
235
284
  }
285
+ function symbolBoundary(line, language) {
286
+ const trimmed = line.trim();
287
+ if (!trimmed) {
288
+ return null;
289
+ }
290
+ const patterns = [];
291
+ switch (language) {
292
+ case "typescript":
293
+ case "tsx":
294
+ case "javascript":
295
+ case "jsx":
296
+ patterns.push(
297
+ /^(?:export\s+)?(?:default\s+)?(?:async\s+)?function\s+([A-Za-z0-9_$]+)/,
298
+ /^(?:export\s+)?(?:default\s+)?class\s+([A-Za-z0-9_$]+)/,
299
+ /^(?:export\s+)?(?:interface|type|enum)\s+([A-Za-z0-9_$]+)/,
300
+ /^(?:export\s+)?const\s+([A-Za-z0-9_$]+)\s*=/
301
+ );
302
+ break;
303
+ case "python":
304
+ patterns.push(/^(?:async\s+def|def|class)\s+([A-Za-z0-9_]+)/);
305
+ break;
306
+ case "rust":
307
+ patterns.push(/^(?:pub\s+)?(?:async\s+)?fn\s+([A-Za-z0-9_]+)/, /^(?:pub\s+)?(?:struct|enum|trait)\s+([A-Za-z0-9_]+)/);
308
+ break;
309
+ case "go":
310
+ patterns.push(/^func\s+([A-Za-z0-9_]+)/, /^type\s+([A-Za-z0-9_]+)/);
311
+ break;
312
+ case "json":
313
+ case "yaml":
314
+ case "toml":
315
+ patterns.push(/^["']?([A-Za-z0-9_.-]+)["']?\s*[:=]/);
316
+ break;
317
+ default:
318
+ patterns.push(/^(?:export\s+)?(?:async\s+)?function\s+([A-Za-z0-9_$]+)/, /^(?:class|interface|type|enum)\s+([A-Za-z0-9_$]+)/);
319
+ break;
320
+ }
321
+ for (const pattern of patterns) {
322
+ const match = trimmed.match(pattern);
323
+ if (match?.[1]) {
324
+ return match[1];
325
+ }
326
+ }
327
+ return null;
328
+ }
329
+ function discoverBoundaries(lines, title, language) {
330
+ const boundaries = [];
331
+ lines.forEach((line, index) => {
332
+ const symbol = symbolBoundary(line, language);
333
+ if (symbol) {
334
+ boundaries.push({
335
+ index,
336
+ title: symbol
337
+ });
338
+ }
339
+ });
340
+ if (boundaries.length === 0 || boundaries[0].index !== 0) {
341
+ boundaries.unshift({
342
+ index: 0,
343
+ title
344
+ });
345
+ }
346
+ return boundaries;
347
+ }
348
+ function buildWindowTitle(title, startLine, endLine) {
349
+ return `${title} (${startLine}-${endLine})`;
350
+ }
351
+ function chunkLineWindows(title, content, startOrder) {
352
+ const lines = content.split("\n");
353
+ const chunks = [];
354
+ let start = 0;
355
+ let order = startOrder;
356
+ while (start < lines.length) {
357
+ let end = start;
358
+ let current = "";
359
+ while (end < lines.length) {
360
+ const candidate = current ? `${current}
361
+ ${lines[end]}` : lines[end];
362
+ if (current && byteLength(candidate) > MAX_CHUNK_BYTES) {
363
+ break;
364
+ }
365
+ current = candidate;
366
+ end += 1;
367
+ }
368
+ const trimmed = current.trim();
369
+ if (!trimmed) {
370
+ break;
371
+ }
372
+ chunks.push({
373
+ sectionTitle: buildWindowTitle(title, start + 1, end),
374
+ markdown: trimmed,
375
+ chunkOrder: order
376
+ });
377
+ order += 1;
378
+ if (end >= lines.length) {
379
+ break;
380
+ }
381
+ start = Math.max(start + 1, end - CHUNK_OVERLAP_LINES);
382
+ }
383
+ return chunks;
384
+ }
385
+ function chunkByBoundaries(input, language) {
386
+ const trimmed = input.content.trim();
387
+ if (!trimmed) {
388
+ return [];
389
+ }
390
+ if (byteLength(trimmed) <= MAX_CHUNK_BYTES) {
391
+ return [{ sectionTitle: input.title, markdown: trimmed, chunkOrder: 0 }];
392
+ }
393
+ const lines = trimmed.split("\n");
394
+ const boundaries = discoverBoundaries(lines, input.title, language);
395
+ const chunks = [];
396
+ let order = 0;
397
+ for (let index = 0; index < boundaries.length; index += 1) {
398
+ const boundary = boundaries[index];
399
+ const nextIndex = boundaries[index + 1]?.index ?? lines.length;
400
+ const sectionLines = lines.slice(boundary.index, nextIndex);
401
+ const sectionContent = sectionLines.join("\n").trim();
402
+ if (!sectionContent) {
403
+ continue;
404
+ }
405
+ if (byteLength(sectionContent) <= MAX_CHUNK_BYTES) {
406
+ chunks.push({
407
+ sectionTitle: boundary.title,
408
+ markdown: sectionContent,
409
+ chunkOrder: order
410
+ });
411
+ order += 1;
412
+ continue;
413
+ }
414
+ const splitChunks = chunkLineWindows(boundary.title, sectionContent, order);
415
+ chunks.push(...splitChunks);
416
+ order = chunks.length;
417
+ }
418
+ return chunks.length > 0 ? chunks : chunkLineWindows(input.title, trimmed, 0);
419
+ }
420
+ function chunkContent(input) {
421
+ const language = normalizeLanguage(input.filePath, input.language);
422
+ if (language === "markdown") {
423
+ return chunkMarkdownSectioned(input.title, input.content);
424
+ }
425
+ if (!language) {
426
+ return chunkLineWindows(input.title, input.content.trim(), 0);
427
+ }
428
+ return chunkByBoundaries(input, language);
429
+ }
430
+ function detectLanguage(filePath, language) {
431
+ return normalizeLanguage(filePath, language);
432
+ }
236
433
 
237
434
  // src/catalog/fingerprint.ts
238
435
  import { createHash } from "crypto";
@@ -244,6 +441,7 @@ function buildSnapshotFingerprint(input) {
244
441
  const payload = JSON.stringify({
245
442
  sourceId: input.sourceId,
246
443
  configHash: input.configHash,
444
+ revisionKey: input.revisionKey ?? null,
247
445
  pages: normalizedPages
248
446
  });
249
447
  return sha256(payload);
@@ -272,22 +470,55 @@ function resolveProjectScope(cwd, scopes) {
272
470
  return normalizedScopes[0] ?? null;
273
471
  }
274
472
 
473
+ // src/patterns.ts
474
+ function escapeRegex(value) {
475
+ return value.replace(/[|\\{}()[\]^$+?.]/g, "\\$&");
476
+ }
477
+ function patternToRegex(pattern) {
478
+ let regex = "^";
479
+ for (let index = 0; index < pattern.length; index += 1) {
480
+ const current = pattern[index];
481
+ const next = pattern[index + 1];
482
+ if (current === "*" && next === "*") {
483
+ regex += ".*";
484
+ index += 1;
485
+ continue;
486
+ }
487
+ if (current === "*") {
488
+ regex += "[^?#]*";
489
+ continue;
490
+ }
491
+ regex += escapeRegex(current ?? "");
492
+ }
493
+ return new RegExp(`${regex}$`);
494
+ }
495
+ function matchesPatterns(value, patterns) {
496
+ return patterns.some((pattern) => patternToRegex(pattern).test(value));
497
+ }
498
+ function toSqliteGlob(pattern) {
499
+ return pattern.replace(/\*\*/g, "*");
500
+ }
501
+
275
502
  // src/spec/source-spec.ts
276
503
  import { readFile } from "fs/promises";
277
504
  import { extname } from "path";
278
505
  import YAML from "yaml";
279
506
  import { z } from "zod";
280
507
  var patternSchema = z.string().min(1);
508
+ var positiveIntSchema = z.number().int().positive();
509
+ var scheduleSchema = z.object({
510
+ everyHours: positiveIntSchema
511
+ });
281
512
  var interactionSchema = z.discriminatedUnion("action", [
282
513
  z.object({
283
514
  action: z.literal("hover"),
284
515
  selector: z.string().min(1),
285
- timeoutMs: z.number().int().positive().optional()
516
+ timeoutMs: positiveIntSchema.optional()
286
517
  }),
287
518
  z.object({
288
519
  action: z.literal("click"),
289
520
  selector: z.string().min(1),
290
- timeoutMs: z.number().int().positive().optional()
521
+ timeoutMs: positiveIntSchema.optional()
291
522
  }),
292
523
  z.object({
293
524
  action: z.literal("press"),
@@ -295,13 +526,13 @@ var interactionSchema = z.discriminatedUnion("action", [
295
526
  }),
296
527
  z.object({
297
528
  action: z.literal("wait"),
298
- timeoutMs: z.number().int().positive()
529
+ timeoutMs: positiveIntSchema
299
530
  })
300
531
  ]);
301
532
  var clipboardExtractSchema = z.object({
302
533
  strategy: z.literal("clipboardButton"),
303
534
  interactions: z.array(interactionSchema).min(1),
304
- clipboardTimeoutMs: z.number().int().positive().default(1e4)
535
+ clipboardTimeoutMs: positiveIntSchema.default(1e4)
305
536
  });
306
537
  var selectorExtractSchema = z.object({
307
538
  strategy: z.literal("selector"),
@@ -310,13 +541,13 @@ var selectorExtractSchema = z.object({
310
541
  var readabilityExtractSchema = z.object({
311
542
  strategy: z.literal("readability")
312
543
  });
313
- var authHeaderSchema = z.object({
544
+ var webAuthHeaderSchema = z.object({
314
545
  name: z.string().min(1),
315
546
  valueFromEnv: z.string().min(1),
316
547
  hosts: z.array(z.string().min(1)).min(1).optional(),
317
548
  include: z.array(patternSchema).min(1).optional()
318
549
  });
319
- var authCookieSchema = z.object({
550
+ var webAuthCookieSchema = z.object({
320
551
  name: z.string().min(1),
321
552
  valueFromEnv: z.string().min(1),
322
553
  domain: z.string().min(1),
@@ -325,21 +556,36 @@ var authCookieSchema = z.object({
325
556
  httpOnly: z.boolean().optional(),
326
557
  sameSite: z.enum(["Strict", "Lax", "None"]).optional()
327
558
  });
328
- var canaryCheckSchema = z.object({
559
+ var webCanaryCheckSchema = z.object({
329
560
  url: z.string().url(),
330
561
  expectedTitle: z.string().min(1).optional(),
331
562
  expectedText: z.string().min(1).optional(),
332
- minMarkdownLength: z.number().int().positive().default(40)
563
+ minMarkdownLength: positiveIntSchema.default(40)
564
+ });
565
+ var gitAuthSchema = z.object({
566
+ tokenFromEnv: z.string().min(1),
567
+ username: z.string().min(1).default("x-access-token"),
568
+ scheme: z.enum(["basic", "bearer"]).default("basic")
569
+ });
570
+ var gitCanaryCheckSchema = z.object({
571
+ path: z.string().min(1),
572
+ expectedTitle: z.string().min(1).optional(),
573
+ expectedText: z.string().min(1).optional(),
574
+ minContentLength: positiveIntSchema.default(40)
333
575
  });
334
- var sourceSpecSchema = z.object({
576
+ var baseSourceSpecSchema = z.object({
335
577
  id: z.string().min(1).regex(/^[a-z0-9-]+$/),
336
578
  label: z.string().min(1),
579
+ schedule: scheduleSchema
580
+ });
581
+ var webSourceSpecSchema = baseSourceSpecSchema.extend({
582
+ kind: z.literal("web").default("web"),
337
583
  startUrls: z.array(z.string().url()).min(1),
338
584
  allowedHosts: z.array(z.string().min(1)).min(1),
339
585
  discovery: z.object({
340
586
  include: z.array(patternSchema).min(1),
341
- exclude: z.array(patternSchema),
342
- maxPages: z.number().int().positive()
587
+ exclude: z.array(patternSchema).default([]),
588
+ maxPages: positiveIntSchema
343
589
  }),
344
590
  extract: z.discriminatedUnion("strategy", [
345
591
  clipboardExtractSchema,
@@ -349,16 +595,13 @@ var sourceSpecSchema = z.object({
349
595
  normalize: z.object({
350
596
  prependSourceComment: z.boolean().default(true)
351
597
  }),
352
- schedule: z.object({
353
- everyHours: z.number().int().positive()
354
- }),
355
598
  auth: z.object({
356
- headers: z.array(authHeaderSchema).default([]),
357
- cookies: z.array(authCookieSchema).default([])
599
+ headers: z.array(webAuthHeaderSchema).default([]),
600
+ cookies: z.array(webAuthCookieSchema).default([])
358
601
  }).optional(),
359
602
  canary: z.object({
360
- everyHours: z.number().int().positive().optional(),
361
- checks: z.array(canaryCheckSchema).min(1)
603
+ everyHours: positiveIntSchema.optional(),
604
+ checks: z.array(webCanaryCheckSchema).min(1)
362
605
  }).optional()
363
606
  }).superRefine((spec, context) => {
364
607
  for (const [index, header] of (spec.auth?.headers ?? []).entries()) {
@@ -376,6 +619,47 @@ var sourceSpecSchema = z.object({
376
619
  }
377
620
  }
378
621
  });
622
+ var gitSourceSpecSchema = baseSourceSpecSchema.extend({
623
+ kind: z.literal("git"),
624
+ repo: z.object({
625
+ url: z.string().url(),
626
+ ref: z.string().min(1).default("HEAD"),
627
+ include: z.array(patternSchema).min(1),
628
+ exclude: z.array(patternSchema).default([]),
629
+ maxFiles: positiveIntSchema.default(2e3),
630
+ textFileMaxBytes: positiveIntSchema.default(262144),
631
+ auth: gitAuthSchema.optional()
632
+ }),
633
+ canary: z.object({
634
+ everyHours: positiveIntSchema.optional(),
635
+ checks: z.array(gitCanaryCheckSchema).min(1)
636
+ }).optional()
637
+ }).superRefine((spec, context) => {
638
+ const protocol = new URL(spec.repo.url).protocol;
639
+ if (!["https:", "http:", "file:"].includes(protocol)) {
640
+ context.addIssue({
641
+ code: z.ZodIssueCode.custom,
642
+ path: ["repo", "url"],
643
+ message: `Unsupported git source protocol '${protocol}'. Use https:// or file://.`
644
+ });
645
+ }
646
+ });
647
+ var sourceSpecSchema = z.preprocess((value) => {
648
+ if (!value || typeof value !== "object" || Array.isArray(value)) {
649
+ return value;
650
+ }
651
+ const candidate = value;
652
+ if (!("kind" in candidate)) {
653
+ return {
654
+ ...candidate,
655
+ kind: "web"
656
+ };
657
+ }
658
+ return candidate;
659
+ }, z.discriminatedUnion("kind", [
660
+ webSourceSpecSchema,
661
+ gitSourceSpecSchema
662
+ ]));
379
663
  function parseSourceSpec(raw, ext) {
380
664
  if (ext === ".json") {
381
665
  return JSON.parse(raw);
@@ -387,8 +671,27 @@ async function loadSourceSpec(path) {
387
671
  const parsed = parseSourceSpec(raw, extname(path).toLowerCase());
388
672
  return sourceSpecSchema.parse(parsed);
389
673
  }
674
+ function parseSourceSpecObject(value) {
675
+ return sourceSpecSchema.parse(value);
676
+ }
677
+ function isGitSourceSpec(spec) {
678
+ return spec.kind === "git";
679
+ }
390
680
  function resolveSourceCanary(spec) {
681
+ if (spec.kind === "git") {
682
+ return {
683
+ kind: "git",
684
+ everyHours: spec.canary?.everyHours ?? Math.max(1, Math.min(spec.schedule.everyHours, 6)),
685
+ checks: spec.canary?.checks ?? [
686
+ {
687
+ path: "README.md",
688
+ minContentLength: 40
689
+ }
690
+ ]
691
+ };
692
+ }
391
693
  return {
694
+ kind: "web",
392
695
  everyHours: spec.canary?.everyHours ?? Math.max(1, Math.min(spec.schedule.everyHours, 6)),
393
696
  checks: spec.canary?.checks ?? [
394
697
  {
@@ -440,6 +743,9 @@ function initSchema(db) {
440
743
  title TEXT NOT NULL,
441
744
  markdown TEXT NOT NULL,
442
745
  content_hash TEXT NOT NULL,
746
+ page_kind TEXT NOT NULL DEFAULT 'document' CHECK(page_kind IN ('document', 'file')),
747
+ file_path TEXT,
748
+ language TEXT,
443
749
  UNIQUE(snapshot_id, url)
444
750
  );
445
751
 
@@ -452,7 +758,10 @@ function initSchema(db) {
452
758
  page_title TEXT NOT NULL,
453
759
  section_title TEXT NOT NULL,
454
760
  chunk_order INTEGER NOT NULL,
455
- markdown TEXT NOT NULL
761
+ markdown TEXT NOT NULL,
762
+ page_kind TEXT NOT NULL DEFAULT 'document' CHECK(page_kind IN ('document', 'file')),
763
+ file_path TEXT,
764
+ language TEXT
456
765
  );
457
766
 
458
767
  CREATE VIRTUAL TABLE IF NOT EXISTS chunks_fts USING fts5(
@@ -568,6 +877,26 @@ function initSchema(db) {
568
877
  if (!sourceColumns.some((column) => column.name === "next_canary_due_at")) {
569
878
  db.exec("ALTER TABLE sources ADD COLUMN next_canary_due_at TEXT");
570
879
  }
880
+ const pageColumns = db.prepare("PRAGMA table_info(pages)").all();
881
+ if (!pageColumns.some((column) => column.name === "page_kind")) {
882
+ db.exec(`ALTER TABLE pages ADD COLUMN page_kind TEXT NOT NULL DEFAULT 'document'`);
883
+ }
884
+ if (!pageColumns.some((column) => column.name === "file_path")) {
885
+ db.exec("ALTER TABLE pages ADD COLUMN file_path TEXT");
886
+ }
887
+ if (!pageColumns.some((column) => column.name === "language")) {
888
+ db.exec("ALTER TABLE pages ADD COLUMN language TEXT");
889
+ }
890
+ const chunkColumns = db.prepare("PRAGMA table_info(chunks)").all();
891
+ if (!chunkColumns.some((column) => column.name === "page_kind")) {
892
+ db.exec(`ALTER TABLE chunks ADD COLUMN page_kind TEXT NOT NULL DEFAULT 'document'`);
893
+ }
894
+ if (!chunkColumns.some((column) => column.name === "file_path")) {
895
+ db.exec("ALTER TABLE chunks ADD COLUMN file_path TEXT");
896
+ }
897
+ if (!chunkColumns.some((column) => column.name === "language")) {
898
+ db.exec("ALTER TABLE chunks ADD COLUMN language TEXT");
899
+ }
571
900
  }
572
901
  function nowIso() {
573
902
  return (/* @__PURE__ */ new Date()).toISOString();
@@ -575,6 +904,9 @@ function nowIso() {
575
904
  function addHoursIso(hours) {
576
905
  return new Date(Date.now() + hours * 60 * 60 * 1e3).toISOString();
577
906
  }
907
+ function resolveCanaryEveryHours(spec) {
908
+ return spec.canary?.everyHours ?? Math.max(1, Math.min(spec.schedule.everyHours, 6));
909
+ }
578
910
  function stableStringify(value) {
579
911
  if (Array.isArray(value)) {
580
912
  return `[${value.map((entry) => stableStringify(entry)).join(",")}]`;
@@ -591,6 +923,20 @@ function normalizeQuery(query) {
591
923
  const words = query.replace(/[^\p{L}\p{N}]+/gu, " ").split(/\s+/).map((part) => part.trim()).filter(Boolean);
592
924
  return words.join(" ");
593
925
  }
926
+ function normalizePatternFilters(patterns) {
927
+ if (!patterns || patterns.length === 0) {
928
+ return null;
929
+ }
930
+ const normalized = [...new Set(patterns.map((pattern) => pattern.trim()).filter(Boolean))];
931
+ return normalized.length > 0 ? normalized : null;
932
+ }
933
+ function normalizeLanguageFilters(languages) {
934
+ if (!languages || languages.length === 0) {
935
+ return null;
936
+ }
937
+ const normalized = [...new Set(languages.map((language) => language.trim().toLowerCase()).filter(Boolean))];
938
+ return normalized.length > 0 ? normalized : null;
939
+ }
594
940
  function assertPaginationValue(value, field, fallback) {
595
941
  if (typeof value === "undefined") {
596
942
  return fallback;
@@ -647,7 +993,9 @@ function openCatalog(options) {
647
993
  limit,
648
994
  offset,
649
995
  sourceIds: null,
650
- snapshotIds: []
996
+ snapshotIds: [],
997
+ pathPatterns: normalizePatternFilters(input.pathPatterns),
998
+ languages: normalizeLanguageFilters(input.languages)
651
999
  };
652
1000
  }
653
1001
  const filterSourceIds = sourceIds && sourceIds.length > 0 ? [...new Set(sourceIds)] : null;
@@ -661,7 +1009,9 @@ function openCatalog(options) {
661
1009
  limit,
662
1010
  offset,
663
1011
  sourceIds: filterSourceIds,
664
- snapshotIds: latestSnapshotIds
1012
+ snapshotIds: latestSnapshotIds,
1013
+ pathPatterns: normalizePatternFilters(input.pathPatterns),
1014
+ languages: normalizeLanguageFilters(input.languages)
665
1015
  };
666
1016
  };
667
1017
  const searchLexicalByScope = (input) => {
@@ -679,10 +1029,14 @@ function openCatalog(options) {
679
1029
  }
680
1030
  const whereSnapshotPlaceholders = input.scope.snapshotIds.map(() => "?").join(",");
681
1031
  const sourceSql = input.scope.sourceIds ? `AND c.source_id IN (${input.scope.sourceIds.map(() => "?").join(",")})` : "";
1032
+ const pathSql = input.scope.pathPatterns ? `AND c.file_path IS NOT NULL AND (${input.scope.pathPatterns.map(() => "c.file_path GLOB ?").join(" OR ")})` : "";
1033
+ const languageSql = input.scope.languages ? `AND c.language IN (${input.scope.languages.map(() => "?").join(",")})` : "";
682
1034
  const queryArgs = [
683
1035
  normalized,
684
1036
  ...input.scope.snapshotIds,
685
- ...input.scope.sourceIds ?? []
1037
+ ...input.scope.sourceIds ?? [],
1038
+ ...(input.scope.pathPatterns ?? []).map((pattern) => toSqliteGlob(pattern)),
1039
+ ...input.scope.languages ?? []
686
1040
  ];
687
1041
  const totalRow = db.prepare(`
688
1042
  SELECT COUNT(*) AS total
@@ -691,6 +1045,8 @@ function openCatalog(options) {
691
1045
  WHERE chunks_fts MATCH ?
692
1046
  AND c.snapshot_id IN (${whereSnapshotPlaceholders})
693
1047
  ${sourceSql}
1048
+ ${pathSql}
1049
+ ${languageSql}
694
1050
  `).get(...queryArgs);
695
1051
  const rows = db.prepare(`
696
1052
  SELECT
@@ -700,12 +1056,17 @@ function openCatalog(options) {
700
1056
  c.page_url,
701
1057
  c.page_title,
702
1058
  c.section_title,
703
- c.markdown
1059
+ c.markdown,
1060
+ c.page_kind,
1061
+ c.file_path,
1062
+ c.language
704
1063
  FROM chunks_fts
705
1064
  JOIN chunks c ON c.id = chunks_fts.rowid
706
1065
  WHERE chunks_fts MATCH ?
707
1066
  AND c.snapshot_id IN (${whereSnapshotPlaceholders})
708
1067
  ${sourceSql}
1068
+ ${pathSql}
1069
+ ${languageSql}
709
1070
  ORDER BY bm25(chunks_fts), c.id
710
1071
  LIMIT ?
711
1072
  OFFSET ?
@@ -717,7 +1078,10 @@ function openCatalog(options) {
717
1078
  pageUrl: row.page_url,
718
1079
  pageTitle: row.page_title,
719
1080
  sectionTitle: row.section_title,
720
- markdown: row.markdown
1081
+ markdown: row.markdown,
1082
+ pageKind: row.page_kind,
1083
+ filePath: row.file_path,
1084
+ language: row.language
721
1085
  }));
722
1086
  return {
723
1087
  total: totalRow.total,
@@ -889,8 +1253,8 @@ function openCatalog(options) {
889
1253
  const existing = db.prepare("SELECT id, created_at, next_due_at, next_canary_due_at, config_hash FROM sources WHERE id = ?").get(spec.id);
890
1254
  const resolvedSpecPath = options2?.specPath ? canonicalizeManagedSpecPath(options2.specPath) : null;
891
1255
  const nextDueAt = !existing ? timestamp : existing.config_hash === configHash ? existing.next_due_at : timestamp;
892
- const canaryConfig = resolveSourceCanary(spec);
893
- const nextCanaryDueAt = !existing ? timestamp : existing.config_hash === configHash ? existing.next_canary_due_at ?? addHoursIso(canaryConfig.everyHours) : timestamp;
1256
+ const canaryEveryHours = resolveCanaryEveryHours(spec);
1257
+ const nextCanaryDueAt = !existing ? timestamp : existing.config_hash === configHash ? existing.next_canary_due_at ?? addHoursIso(canaryEveryHours) : timestamp;
894
1258
  const configChanged = Boolean(existing && existing.config_hash !== configHash);
895
1259
  db.prepare(`
896
1260
  INSERT INTO sources (
@@ -935,6 +1299,7 @@ function openCatalog(options) {
935
1299
  SELECT
936
1300
  id,
937
1301
  label,
1302
+ spec_json,
938
1303
  spec_path,
939
1304
  next_due_at,
940
1305
  next_canary_due_at,
@@ -947,21 +1312,25 @@ function openCatalog(options) {
947
1312
  FROM sources
948
1313
  ORDER BY id
949
1314
  `).all();
950
- return rows.map((row) => ({
951
- id: row.id,
952
- label: row.label,
953
- specPath: row.spec_path ? canonicalizeManagedSpecPath(row.spec_path) : null,
954
- nextDueAt: row.next_due_at,
955
- isDue: Date.parse(row.next_due_at) <= Date.now(),
956
- nextCanaryDueAt: row.next_canary_due_at,
957
- isCanaryDue: row.next_canary_due_at ? Date.parse(row.next_canary_due_at) <= Date.now() : false,
958
- lastCheckedAt: row.last_checked_at,
959
- lastSuccessfulSnapshotAt: row.last_successful_snapshot_at,
960
- lastSuccessfulSnapshotId: row.last_successful_snapshot_id,
961
- lastCanaryCheckedAt: row.last_canary_checked_at,
962
- lastSuccessfulCanaryAt: row.last_successful_canary_at,
963
- lastCanaryStatus: row.last_canary_status
964
- }));
1315
+ return rows.map((row) => {
1316
+ const storedSpec = parseSourceSpecObject(JSON.parse(row.spec_json));
1317
+ return {
1318
+ id: row.id,
1319
+ kind: storedSpec.kind,
1320
+ label: row.label,
1321
+ specPath: row.spec_path ? canonicalizeManagedSpecPath(row.spec_path) : null,
1322
+ nextDueAt: row.next_due_at,
1323
+ isDue: Date.parse(row.next_due_at) <= Date.now(),
1324
+ nextCanaryDueAt: row.next_canary_due_at,
1325
+ isCanaryDue: row.next_canary_due_at ? Date.parse(row.next_canary_due_at) <= Date.now() : false,
1326
+ lastCheckedAt: row.last_checked_at,
1327
+ lastSuccessfulSnapshotAt: row.last_successful_snapshot_at,
1328
+ lastSuccessfulSnapshotId: row.last_successful_snapshot_id,
1329
+ lastCanaryCheckedAt: row.last_canary_checked_at,
1330
+ lastSuccessfulCanaryAt: row.last_successful_canary_at,
1331
+ lastCanaryStatus: row.last_canary_status
1332
+ };
1333
+ });
965
1334
  },
966
1335
  listDueSourceIds(referenceTime = nowIso()) {
967
1336
  const rows = db.prepare(`
@@ -1022,11 +1391,15 @@ function openCatalog(options) {
1022
1391
  const pagesWithHashes = input.pages.map((page) => ({
1023
1392
  ...page,
1024
1393
  markdown: page.markdown.trim(),
1025
- contentHash: sha256(page.markdown.trim())
1394
+ contentHash: sha256(page.markdown.trim()),
1395
+ pageKind: page.pageKind ?? "document",
1396
+ filePath: page.filePath ?? null,
1397
+ language: detectLanguage(page.filePath, page.language)
1026
1398
  }));
1027
1399
  const fingerprint = buildSnapshotFingerprint({
1028
1400
  sourceId: input.sourceId,
1029
1401
  configHash: sourceRow.config_hash,
1402
+ ...input.revisionKey ? { revisionKey: input.revisionKey } : {},
1030
1403
  pages: pagesWithHashes.map((page) => ({
1031
1404
  url: page.url,
1032
1405
  contentHash: page.contentHash
@@ -1063,13 +1436,13 @@ function openCatalog(options) {
1063
1436
  ) VALUES (?, ?, ?, ?, ?, ?, ?)
1064
1437
  `);
1065
1438
  const insertPage = db.prepare(`
1066
- INSERT INTO pages (snapshot_id, url, title, markdown, content_hash)
1067
- VALUES (?, ?, ?, ?, ?)
1439
+ INSERT INTO pages (snapshot_id, url, title, markdown, content_hash, page_kind, file_path, language)
1440
+ VALUES (?, ?, ?, ?, ?, ?, ?, ?)
1068
1441
  `);
1069
1442
  const insertChunk = db.prepare(`
1070
1443
  INSERT INTO chunks (
1071
- source_id, snapshot_id, page_id, page_url, page_title, section_title, chunk_order, markdown
1072
- ) VALUES (?, ?, ?, ?, ?, ?, ?, ?)
1444
+ source_id, snapshot_id, page_id, page_url, page_title, section_title, chunk_order, markdown, page_kind, file_path, language
1445
+ ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
1073
1446
  `);
1074
1447
  const insertRun = db.prepare(`
1075
1448
  INSERT INTO fetch_runs (id, source_id, status, snapshot_id, started_at, finished_at)
@@ -1086,9 +1459,23 @@ function openCatalog(options) {
1086
1459
  checkedAt
1087
1460
  );
1088
1461
  for (const page of pagesWithHashes) {
1089
- const pageInsert = insertPage.run(snapshotId, page.url, page.title, page.markdown, page.contentHash);
1462
+ const pageInsert = insertPage.run(
1463
+ snapshotId,
1464
+ page.url,
1465
+ page.title,
1466
+ page.markdown,
1467
+ page.contentHash,
1468
+ page.pageKind,
1469
+ page.filePath,
1470
+ page.language
1471
+ );
1090
1472
  const pageId = Number(pageInsert.lastInsertRowid);
1091
- const chunks = chunkMarkdown(page.title, page.markdown);
1473
+ const chunks = chunkContent({
1474
+ title: page.title,
1475
+ content: page.markdown,
1476
+ filePath: page.filePath,
1477
+ language: page.language
1478
+ });
1092
1479
  for (const chunk of chunks) {
1093
1480
  insertChunk.run(
1094
1481
  input.sourceId,
@@ -1098,7 +1485,10 @@ function openCatalog(options) {
1098
1485
  page.title,
1099
1486
  chunk.sectionTitle,
1100
1487
  chunk.chunkOrder,
1101
- chunk.markdown
1488
+ chunk.markdown,
1489
+ page.pageKind,
1490
+ page.filePath,
1491
+ page.language
1102
1492
  );
1103
1493
  }
1104
1494
  }
@@ -1149,7 +1539,6 @@ function openCatalog(options) {
1149
1539
  );
1150
1540
  }
1151
1541
  const spec = JSON.parse(sourceRow.spec_json);
1152
- const canary = resolveSourceCanary(spec);
1153
1542
  db.prepare(`
1154
1543
  INSERT INTO canary_runs (id, source_id, status, checked_at, details_json)
1155
1544
  VALUES (?, ?, ?, ?, ?)
@@ -1174,7 +1563,7 @@ function openCatalog(options) {
1174
1563
  input.status,
1175
1564
  input.checkedAt,
1176
1565
  input.status,
1177
- addHoursIso(canary.everyHours),
1566
+ addHoursIso(resolveCanaryEveryHours(spec)),
1178
1567
  input.checkedAt,
1179
1568
  input.sourceId
1180
1569
  );
@@ -1254,7 +1643,7 @@ function openCatalog(options) {
1254
1643
  );
1255
1644
  }
1256
1645
  const loadSnapshotPages = (snapshotId) => db.prepare(`
1257
- SELECT url, title, markdown, content_hash
1646
+ SELECT url, title, markdown, content_hash, page_kind, file_path, language
1258
1647
  FROM pages
1259
1648
  WHERE snapshot_id = ?
1260
1649
  ORDER BY url
@@ -1265,11 +1654,17 @@ function openCatalog(options) {
1265
1654
  const afterMap = new Map(afterPages.map((page) => [page.url, page]));
1266
1655
  const addedPages = afterPages.filter((page) => !beforeMap.has(page.url)).map((page) => ({
1267
1656
  url: page.url,
1268
- title: page.title
1657
+ title: page.title,
1658
+ pageKind: page.page_kind,
1659
+ filePath: page.file_path,
1660
+ language: page.language
1269
1661
  }));
1270
1662
  const removedPages = beforePages.filter((page) => !afterMap.has(page.url)).map((page) => ({
1271
1663
  url: page.url,
1272
- title: page.title
1664
+ title: page.title,
1665
+ pageKind: page.page_kind,
1666
+ filePath: page.file_path,
1667
+ language: page.language
1273
1668
  }));
1274
1669
  const summarizeLineDiff = (beforeMarkdown, afterMarkdown) => {
1275
1670
  const beforeLines = beforeMarkdown.split("\n");
@@ -1294,6 +1689,9 @@ function openCatalog(options) {
1294
1689
  url: before.url,
1295
1690
  beforeTitle: before.title,
1296
1691
  afterTitle: after.title,
1692
+ pageKind: after.page_kind,
1693
+ filePath: after.file_path,
1694
+ language: after.language,
1297
1695
  lineSummary: summarizeLineDiff(before.markdown, after.markdown)
1298
1696
  }));
1299
1697
  const unchangedPageCount = beforePages.filter((page) => {
@@ -1339,11 +1737,14 @@ function openCatalog(options) {
1339
1737
  c.page_url,
1340
1738
  c.page_title,
1341
1739
  c.section_title,
1342
- c.markdown
1740
+ c.markdown,
1741
+ c.page_kind,
1742
+ c.file_path,
1743
+ c.language
1343
1744
  FROM chunks c
1344
1745
  WHERE c.source_id = ?
1345
1746
  AND c.snapshot_id = ?
1346
- ORDER BY c.id
1747
+ ORDER BY c.id
1347
1748
  `).all(input.sourceId, input.snapshotId);
1348
1749
  return rows.map((row) => ({
1349
1750
  chunkId: row.chunk_id,
@@ -1353,6 +1754,9 @@ function openCatalog(options) {
1353
1754
  pageTitle: row.page_title,
1354
1755
  sectionTitle: row.section_title,
1355
1756
  markdown: row.markdown,
1757
+ pageKind: row.page_kind,
1758
+ filePath: row.file_path,
1759
+ language: row.language,
1356
1760
  contentHash: sha256(row.markdown)
1357
1761
  }));
1358
1762
  },
@@ -1403,7 +1807,10 @@ function openCatalog(options) {
1403
1807
  c.page_url,
1404
1808
  c.page_title,
1405
1809
  c.section_title,
1406
- c.markdown
1810
+ c.markdown,
1811
+ c.page_kind,
1812
+ c.file_path,
1813
+ c.language
1407
1814
  FROM chunks c
1408
1815
  WHERE c.id IN (${chunkIds.map(() => "?").join(",")})
1409
1816
  `).all(...chunkIds);
@@ -1414,7 +1821,10 @@ function openCatalog(options) {
1414
1821
  pageUrl: row.page_url,
1415
1822
  pageTitle: row.page_title,
1416
1823
  sectionTitle: row.section_title,
1417
- markdown: row.markdown
1824
+ markdown: row.markdown,
1825
+ pageKind: row.page_kind,
1826
+ filePath: row.file_path,
1827
+ language: row.language
1418
1828
  }));
1419
1829
  },
1420
1830
  queueLatestEmbeddingJobs(sourceIds) {
@@ -1798,7 +2208,10 @@ function openCatalog(options) {
1798
2208
  c.page_url,
1799
2209
  c.page_title,
1800
2210
  c.section_title,
1801
- c.markdown
2211
+ c.markdown,
2212
+ c.page_kind,
2213
+ c.file_path,
2214
+ c.language
1802
2215
  FROM chunks c
1803
2216
  WHERE c.id = ?
1804
2217
  `).get(chunkId);
@@ -1812,23 +2225,336 @@ function openCatalog(options) {
1812
2225
  pageUrl: row.page_url,
1813
2226
  pageTitle: row.page_title,
1814
2227
  sectionTitle: row.section_title,
1815
- markdown: row.markdown
2228
+ markdown: row.markdown,
2229
+ pageKind: row.page_kind,
2230
+ filePath: row.file_path,
2231
+ language: row.language
1816
2232
  };
1817
2233
  }
1818
2234
  };
1819
2235
  }
1820
2236
 
1821
2237
  // src/daemon.ts
1822
- import { existsSync as existsSync2 } from "fs";
2238
+ import { existsSync as existsSync3 } from "fs";
1823
2239
  import { resolve as resolve5 } from "path";
1824
2240
  import { setTimeout as sleep2 } from "timers/promises";
1825
2241
 
1826
2242
  // src/fetch/fetch-source.ts
1827
- import { mkdirSync as mkdirSync3, writeFileSync } from "fs";
1828
- import { join as join4 } from "path";
2243
+ import { mkdirSync as mkdirSync4, writeFileSync as writeFileSync2 } from "fs";
2244
+ import { join as join5 } from "path";
1829
2245
  import { setTimeout as sleep } from "timers/promises";
1830
2246
  import { chromium } from "playwright";
1831
2247
 
2248
+ // src/git/git-source.ts
2249
+ import { existsSync as existsSync2, mkdirSync as mkdirSync3, writeFileSync } from "fs";
2250
+ import { dirname as dirname2, join as join4 } from "path";
2251
+ import { execFile } from "child_process";
2252
+ import { promisify } from "util";
2253
+ var execFileAsync = promisify(execFile);
2254
// Returns the current wall-clock time as an ISO-8601 UTC timestamp string.
function nowIso2() {
  const now = /* @__PURE__ */ new Date();
  return now.toISOString();
}
2257
// Resolves the on-disk location of the bare mirror clone kept for a source.
// Mirrors live under `<dataDir>/git-mirrors/<sourceId>.git`.
function getGitMirrorDir(dataDir, sourceId) {
  const mirrorName = `${sourceId}.git`;
  return join4(dataDir, "git-mirrors", mirrorName);
}
2260
// Reads a required environment variable used for authenticated source access.
// Throws AiocsError(authEnvMissing) when the variable is unset; note that an
// empty-string value is also treated as missing (falsy check).
function resolveEnvValue(name, env) {
  const value = env[name];
  if (value) {
    return value;
  }
  throw new AiocsError(
    AIOCS_ERROR_CODES.authEnvMissing,
    `Missing required environment variable '${name}' for authenticated source access`,
    { envVar: name }
  );
}
2271
// Builds the raw header value passed to `git -c http.extraHeader=…` for
// authenticated remotes. Returns null when the spec declares no auth.
// Supports a bearer-token scheme; anything else falls back to HTTP Basic
// with `username:token` base64-encoded.
function buildGitAuthHeader(spec, env) {
  const auth = spec.repo.auth;
  if (!auth) {
    return null;
  }
  const token = resolveEnvValue(auth.tokenFromEnv, env);
  if (auth.scheme === "bearer") {
    return `AUTHORIZATION: Bearer ${token}`;
  }
  const rawCredentials = `${auth.username}:${token}`;
  const credentials = Buffer.from(rawCredentials, "utf8").toString("base64");
  return `AUTHORIZATION: Basic ${credentials}`;
}
2282
// Runs a git subcommand via execFile and returns its stdout.
// Options: cwd, env (merged over process.env), encoding ("utf8" default,
// "buffer" for binary-safe output), and authHeader which is injected as a
// one-shot `http.extraHeader` config so credentials never touch disk.
// GIT_TERMINAL_PROMPT=0 prevents git from blocking on credential prompts.
// Throws AiocsError(internalError) on failure; the error details now carry
// git's stderr (the actually useful diagnostic) in addition to the args.
// NOTE: details deliberately use `args`, not `commandArgs`, so the auth
// header/token is never leaked into error payloads.
async function runGit(args, options = {}) {
  const commandArgs = options.authHeader
    ? ["-c", `http.extraHeader=${options.authHeader}`, ...args]
    : args;
  try {
    const result = await execFileAsync("git", commandArgs, {
      cwd: options.cwd,
      env: {
        ...process.env,
        ...options.env,
        GIT_TERMINAL_PROMPT: "0"
      },
      encoding: options.encoding ?? "utf8",
      maxBuffer: 32 * 1024 * 1024
    });
    return result.stdout;
  } catch (error) {
    // execFile attaches the child's stderr to the error object; it may be a
    // string or (with encoding: "buffer") a Buffer.
    const rawStderr = error && typeof error === "object" ? error.stderr : void 0;
    const stderr = typeof rawStderr === "string" ? rawStderr.trim() : Buffer.isBuffer(rawStderr) ? rawStderr.toString("utf8").trim() : void 0;
    throw new AiocsError(
      AIOCS_ERROR_CODES.internalError,
      `Git command failed: ${error instanceof Error ? error.message : String(error)}`,
      stderr ? { args, stderr } : { args }
    );
  }
}
2304
// Ensures a bare mirror clone of the source's repository exists and is fresh.
// First run: `git clone --mirror`. Subsequent runs: re-point origin (the spec
// URL may have changed) and fetch with pruning so deleted refs/tags disappear.
// Returns the mirror directory path.
async function ensureGitMirror(spec, dataDir, env) {
  const mirrorDir = getGitMirrorDir(dataDir, spec.id);
  mkdirSync3(dirname2(mirrorDir), { recursive: true });
  const authHeader = buildGitAuthHeader(spec, env);
  const gitOptions = { env, authHeader };
  if (!existsSync2(mirrorDir)) {
    await runGit(["clone", "--mirror", spec.repo.url, mirrorDir], gitOptions);
    return mirrorDir;
  }
  await runGit(["--git-dir", mirrorDir, "remote", "set-url", "origin", spec.repo.url], gitOptions);
  await runGit(["--git-dir", mirrorDir, "fetch", "--prune", "--prune-tags", "--tags", "origin"], gitOptions);
  return mirrorDir;
}
2325
// Resolves a ref (branch, tag, or SHA) to a concrete commit SHA inside the
// mirror. The `^{commit}` peel makes annotated tags resolve to their commit.
async function resolveGitCommit(mirrorDir, ref, env) {
  const revParseArgs = ["--git-dir", mirrorDir, "rev-parse", `${ref}^{commit}`];
  const stdout = await runGit(revParseArgs, { env });
  return String(stdout).trim();
}
2331
// Lists every file path in the tree of the given commit. Uses NUL-delimited
// `ls-tree -z` output (binary-safe via buffer encoding) so paths containing
// newlines are handled correctly.
async function listRepoFiles(mirrorDir, commitSha, env) {
  const stdout = await runGit(
    ["--git-dir", mirrorDir, "ls-tree", "-r", "-z", "--name-only", commitSha],
    { env, encoding: "buffer" }
  );
  const raw = stdout instanceof Buffer ? stdout.toString("utf8") : String(stdout);
  const files = [];
  for (const entry of raw.split("\0")) {
    const name = entry.trim();
    if (name) {
      files.push(name);
    }
  }
  return files;
}
2339
// Applies the spec's include/exclude glob patterns to a repo-relative path.
// A path must match at least one include pattern, and (when excludes are
// configured) must not match any exclude pattern.
function isIncluded(spec, filePath) {
  const { include, exclude } = spec.repo;
  if (!matchesPatterns(filePath, include)) {
    return false;
  }
  const excluded = exclude.length > 0 && matchesPatterns(filePath, exclude);
  return !excluded;
}
2348
// Returns the blob size in bytes for a file at a given commit, without
// materializing its content (`git cat-file -s`). May return NaN when the
// output is unparseable; callers guard with Number.isFinite.
async function getObjectSize(mirrorDir, commitSha, filePath, env) {
  const objectSpec = `${commitSha}:${filePath}`;
  const stdout = await runGit(["--git-dir", mirrorDir, "cat-file", "-s", objectSpec], { env });
  return Number(String(stdout).trim());
}
2354
// Heuristic binary sniff: a NUL byte anywhere in the blob marks it binary
// (the same basic heuristic git itself uses for text detection).
function isProbablyBinary(buffer) {
  return buffer.indexOf(0) !== -1;
}
2357
// Reads a file's blob content at a given commit (`git show sha:path`) as a
// Buffer, requesting buffer encoding so binary content survives unmangled.
async function readRepoFile(mirrorDir, commitSha, filePath, env) {
  const stdout = await runGit(["--git-dir", mirrorDir, "show", `${commitSha}:${filePath}`], {
    env,
    encoding: "buffer"
  });
  if (stdout instanceof Buffer) {
    return stdout;
  }
  return Buffer.from(String(stdout), "utf8");
}
2364
// Parses a repo URL, throwing TypeError on malformed input.
function normalizeRepoUrl(repoUrl) {
  return new URL(repoUrl);
}
// Derives the browsable web base for a repository URL by dropping any
// trailing slashes and a trailing `.git` suffix (case-insensitive).
// Fix: a URL like `https://host/org/repo.git/` previously kept the `.git/`
// tail because the `\.git$` anchor never matched, producing broken
// `/blob/...` links downstream.
function normalizeRepoWebBase(repoUrl) {
  const url = normalizeRepoUrl(repoUrl);
  const pathname = url.pathname.replace(/\/+$/, "").replace(/\.git$/i, "");
  return `${url.origin}${pathname}`;
}
2372
// Builds a stable, human-openable URL for a file at the spec's ref.
// github.com/gitlab.com get real `/blob/` web URLs; file: URLs and unknown
// hosts get the repo URL plus a `#ref=…&path=…` fragment so the location is
// still round-trippable. Path and ref segments are percent-encoded per '/'
// segment so slashes survive as separators.
function buildRepoFileUrl(spec, filePath) {
  const url = normalizeRepoUrl(spec.repo.url);
  const encodeSegments = (value) => value.split("/").map((segment) => encodeURIComponent(segment)).join("/");
  const encodedPath = encodeSegments(filePath);
  const encodedRef = encodeSegments(spec.repo.ref);
  if (url.protocol === "file:") {
    return `${spec.repo.url}#ref=${encodeURIComponent(spec.repo.ref)}&path=${encodeURIComponent(filePath)}`;
  }
  const base = normalizeRepoWebBase(spec.repo.url);
  switch (url.hostname) {
    case "github.com":
      return `${base}/blob/${encodedRef}/${encodedPath}`;
    case "gitlab.com":
      return `${base}/-/blob/${encodedRef}/${encodedPath}`;
    default:
      return `${base}#ref=${encodeURIComponent(spec.repo.ref)}&path=${encodeURIComponent(filePath)}`;
  }
}
2388
// Writes each fetched page's markdown to disk under
// `<dataDir>/sources/<sourceId>/snapshots/<snapshotId>/files/<filePath>`,
// creating intermediate directories as needed.
function persistGitSnapshotFiles(input, snapshotId, pages) {
  const snapshotDir = join4(input.dataDir, "sources", input.sourceId, "snapshots", snapshotId, "files");
  pages.forEach((page) => {
    const targetPath = join4(snapshotDir, page.filePath);
    mkdirSync3(dirname2(targetPath), { recursive: true });
    writeFileSync(targetPath, page.markdown, "utf8");
  });
}
2396
// Turns a commit's file tree into catalog pages. Filters by include/exclude
// globs, enforces the maxFiles budget up front, then skips oversized blobs,
// unreadable/binary blobs, and whitespace-only files. Result is sorted by
// file path for deterministic snapshots.
async function materializeGitPages(spec, mirrorDir, commitSha, env) {
  const allFiles = await listRepoFiles(mirrorDir, commitSha, env);
  const candidates = allFiles.filter((candidate) => isIncluded(spec, candidate));
  if (candidates.length > spec.repo.maxFiles) {
    throw new AiocsError(
      AIOCS_ERROR_CODES.invalidArgument,
      `Git source '${spec.id}' matched ${candidates.length} files, exceeding maxFiles=${spec.repo.maxFiles}`
    );
  }
  const pages = [];
  for (const candidate of candidates) {
    // Size check first so huge blobs are never read into memory.
    const byteSize = await getObjectSize(mirrorDir, commitSha, candidate, env);
    if (!Number.isFinite(byteSize) || byteSize > spec.repo.textFileMaxBytes) {
      continue;
    }
    // Best-effort read: unreadable objects are simply skipped.
    const blob = await readRepoFile(mirrorDir, commitSha, candidate, env).catch(() => null);
    if (!blob || isProbablyBinary(blob)) {
      continue;
    }
    const markdown = blob.toString("utf8").trimEnd();
    if (!markdown.trim()) {
      continue;
    }
    pages.push({
      url: buildRepoFileUrl(spec, candidate),
      title: candidate,
      markdown,
      pageKind: "file",
      filePath: candidate,
      language: detectLanguage(candidate)
    });
  }
  pages.sort((left, right) => left.filePath.localeCompare(right.filePath));
  return pages;
}
2430
// Guards canary checks: the canary path must fall inside the spec's
// include/exclude scope, otherwise the canary would probe content the
// source never indexes.
function assertCanaryPathInScope(spec, check) {
  if (isIncluded(spec, check.path)) {
    return;
  }
  throw new AiocsError(
    AIOCS_ERROR_CODES.invalidArgument,
    `Git canary path '${check.path}' is outside the configured include/exclude scope`
  );
}
2438
// Reads a canary target file at the given commit and shapes it like a
// fetched page. Throws when the path is out of scope or the blob is binary.
async function readCanaryTarget(spec, mirrorDir, commitSha, check, env) {
  assertCanaryPathInScope(spec, check);
  const blob = await readRepoFile(mirrorDir, commitSha, check.path, env);
  if (isProbablyBinary(blob)) {
    throw new Error(`Canary target '${check.path}' is binary`);
  }
  const markdown = blob.toString("utf8").trimEnd();
  return {
    url: buildRepoFileUrl(spec, check.path),
    title: check.path,
    markdown,
    pageKind: "file",
    filePath: check.path,
    language: detectLanguage(check.path)
  };
}
2453
// Full fetch pipeline for a git-backed source: refresh the mirror, resolve
// the configured ref to a commit, materialize pages, and record a snapshot
// keyed by the commit SHA (so re-fetching an unchanged ref reuses the
// existing snapshot). Files are only persisted to disk for new snapshots.
async function fetchGitSource(input) {
  const spec = input.catalog.getSourceSpec(input.sourceId);
  if (!spec || !isGitSourceSpec(spec)) {
    throw new AiocsError(
      AIOCS_ERROR_CODES.sourceNotFound,
      `Unknown git source '${input.sourceId}'`
    );
  }
  const env = input.env ?? process.env;
  const mirrorDir = await ensureGitMirror(spec, input.dataDir, env);
  const commitSha = await resolveGitCommit(mirrorDir, spec.repo.ref, env);
  const pages = await materializeGitPages(spec, mirrorDir, commitSha, env);
  if (pages.length === 0) {
    throw new AiocsError(
      AIOCS_ERROR_CODES.noPagesFetched,
      `No text files fetched for git source '${input.sourceId}'`
    );
  }
  const snapshot = input.catalog.recordSuccessfulSnapshot({
    sourceId: input.sourceId,
    detectedVersion: commitSha,
    revisionKey: commitSha,
    pages
  });
  if (!snapshot.reused) {
    persistGitSnapshotFiles(input, snapshot.snapshotId, pages);
  }
  return {
    snapshotId: snapshot.snapshotId,
    pageCount: pages.length,
    reused: snapshot.reused,
    detectedVersion: commitSha
  };
}
2487
// Runs the configured canary checks for a git source against the current
// commit of its ref. Each check verifies the target file exists, optionally
// contains an expected title/text, and meets a minimum content length.
// The aggregate result is recorded in the catalog; a failing canary raises
// AiocsError(canaryFailed) carrying the full result as details.
async function runGitSourceCanary(input) {
  const spec = input.catalog.getSourceSpec(input.sourceId);
  if (!spec || !isGitSourceSpec(spec)) {
    throw new AiocsError(
      AIOCS_ERROR_CODES.sourceNotFound,
      `Unknown git source '${input.sourceId}'`
    );
  }
  const env = input.env ?? process.env;
  const dataDir = input.dataDir ?? join4(process.env.HOME ?? "", ".aiocs", "data");
  const mirrorDir = await ensureGitMirror(spec, dataDir, env);
  const commitSha = await resolveGitCommit(mirrorDir, spec.repo.ref, env);
  const canary = resolveSourceCanary(spec);
  const checkedAt = nowIso2();
  // Evaluates one check; throws with a human-readable reason on any failure.
  const evaluateCheck = async (check) => {
    const page = await readCanaryTarget(spec, mirrorDir, commitSha, check, env);
    if (check.expectedTitle && !page.title.includes(check.expectedTitle)) {
      throw new Error(`Expected title to include '${check.expectedTitle}'`);
    }
    if (check.expectedText && !page.markdown.includes(check.expectedText)) {
      throw new Error(`Expected markdown to include '${check.expectedText}'`);
    }
    const contentLength = page.markdown.trim().length;
    if (contentLength < check.minContentLength) {
      throw new Error(`Expected content length to be at least ${check.minContentLength}`);
    }
    return {
      path: check.path,
      status: "pass",
      title: page.title,
      markdownLength: contentLength
    };
  };
  const checks = [];
  for (const check of canary.checks) {
    try {
      checks.push(await evaluateCheck(check));
    } catch (error) {
      checks.push({
        path: check.path,
        status: "fail",
        errorMessage: error instanceof Error ? error.message : String(error)
      });
    }
  }
  const passCount = checks.filter((entry) => entry.status === "pass").length;
  const failCount = checks.length - passCount;
  const status = failCount > 0 ? "fail" : "pass";
  const result = {
    sourceId: input.sourceId,
    status,
    checkedAt,
    summary: {
      checkCount: checks.length,
      passCount,
      failCount
    },
    checks
  };
  input.catalog.recordCanaryRun({
    sourceId: input.sourceId,
    status,
    checkedAt,
    details: result
  });
  if (status === "fail") {
    throw new AiocsError(
      AIOCS_ERROR_CODES.canaryFailed,
      `Git source canary failed for '${input.sourceId}'`,
      result
    );
  }
  return result;
}
2557
+
1832
2558
  // src/fetch/extract.ts
1833
2559
  import { JSDOM } from "jsdom";
1834
2560
  import { Readability } from "@mozilla/readability";
@@ -2009,36 +2735,10 @@ async function extractPage(page, strategy) {
2009
2735
  return runReadabilityStrategy(page);
2010
2736
  }
2011
2737
 
2012
- // src/fetch/url-patterns.ts
2013
- function escapeRegex(value) {
2014
- return value.replace(/[|\\{}()[\]^$+?.]/g, "\\$&");
2015
- }
2016
- function patternToRegex(pattern) {
2017
- let regex = "^";
2018
- for (let index = 0; index < pattern.length; index += 1) {
2019
- const current = pattern[index];
2020
- const next = pattern[index + 1];
2021
- if (current === "*" && next === "*") {
2022
- regex += ".*";
2023
- index += 1;
2024
- continue;
2025
- }
2026
- if (current === "*") {
2027
- regex += "[^?#]*";
2028
- continue;
2029
- }
2030
- regex += escapeRegex(current ?? "");
2031
- }
2032
- return new RegExp(`${regex}$`);
2033
- }
2034
- function matchesPatterns(url, patterns) {
2035
- return patterns.some((pattern) => patternToRegex(pattern).test(url));
2036
- }
2037
-
2038
2738
  // src/fetch/fetch-source.ts
2039
2739
  var MAX_FETCH_ATTEMPTS = 3;
2040
2740
  var RETRY_DELAY_MS = 250;
2041
- function nowIso2() {
2741
+ function nowIso3() {
2042
2742
  return (/* @__PURE__ */ new Date()).toISOString();
2043
2743
  }
2044
2744
  function canonicalizeUrl(raw) {
@@ -2108,14 +2808,14 @@ async function extractRawMarkdownPage(url, response) {
2108
2808
  };
2109
2809
  }
2110
2810
  function persistSnapshotPages(input, snapshotId, pages) {
2111
- const snapshotDir = join4(input.dataDir, "sources", input.sourceId, "snapshots", snapshotId, "pages");
2112
- mkdirSync3(snapshotDir, { recursive: true });
2811
+ const snapshotDir = join5(input.dataDir, "sources", input.sourceId, "snapshots", snapshotId, "pages");
2812
+ mkdirSync4(snapshotDir, { recursive: true });
2113
2813
  pages.forEach((page, index) => {
2114
2814
  const filename = `${String(index + 1).padStart(3, "0")}-${slugify(page.title)}.md`;
2115
- writeFileSync(join4(snapshotDir, filename), page.markdown, "utf8");
2815
+ writeFileSync2(join5(snapshotDir, filename), page.markdown, "utf8");
2116
2816
  });
2117
2817
  }
2118
- function resolveEnvValue(name, env) {
2818
+ function resolveEnvValue2(name, env) {
2119
2819
  const value = env[name];
2120
2820
  if (!value) {
2121
2821
  throw new AiocsError(
@@ -2131,13 +2831,13 @@ function resolveEnvValue(name, env) {
2131
2831
  function resolveSourceAuth(spec, env) {
2132
2832
  const scopedHeaders = (spec.auth?.headers ?? []).map((header) => ({
2133
2833
  name: header.name,
2134
- value: resolveEnvValue(header.valueFromEnv, env),
2834
+ value: resolveEnvValue2(header.valueFromEnv, env),
2135
2835
  hosts: header.hosts ?? spec.allowedHosts,
2136
2836
  ...header.include ? { include: header.include } : {}
2137
2837
  }));
2138
2838
  const cookies = (spec.auth?.cookies ?? []).map((cookie) => ({
2139
2839
  name: cookie.name,
2140
- value: resolveEnvValue(cookie.valueFromEnv, env),
2840
+ value: resolveEnvValue2(cookie.valueFromEnv, env),
2141
2841
  domain: cookie.domain,
2142
2842
  path: cookie.path,
2143
2843
  ...typeof cookie.secure === "boolean" ? { secure: cookie.secure } : {},
@@ -2236,6 +2936,14 @@ async function fetchSourceOnce(input) {
2236
2936
  `Unknown source '${input.sourceId}'`
2237
2937
  );
2238
2938
  }
2939
+ if (isGitSourceSpec(spec)) {
2940
+ const result = await fetchGitSource(input);
2941
+ return {
2942
+ snapshotId: result.snapshotId,
2943
+ pageCount: result.pageCount,
2944
+ reused: result.reused
2945
+ };
2946
+ }
2239
2947
  const session = await createSourceContext(spec, input.env ?? process.env);
2240
2948
  const { page } = session;
2241
2949
  const queue = spec.startUrls.map((url) => canonicalizeUrl(url));
@@ -2362,6 +3070,9 @@ async function runSourceCanaryOnce(input) {
2362
3070
  `Unknown source '${input.sourceId}'`
2363
3071
  );
2364
3072
  }
3073
+ if (isGitSourceSpec(spec)) {
3074
+ return runGitSourceCanary(input);
3075
+ }
2365
3076
  const canary = resolveSourceCanary(spec);
2366
3077
  const session = await createSourceContext(spec, input.env ?? process.env);
2367
3078
  const { page } = session;
@@ -2412,7 +3123,7 @@ async function runSourceCanaryOnce(input) {
2412
3123
  const result = {
2413
3124
  sourceId: input.sourceId,
2414
3125
  status: checks.every((check) => check.status === "pass") ? "pass" : "fail",
2415
- checkedAt: nowIso2(),
3126
+ checkedAt: nowIso3(),
2416
3127
  summary: {
2417
3128
  checkCount: checks.length,
2418
3129
  passCount: checks.filter((check) => check.status === "pass").length,
@@ -2655,6 +3366,9 @@ var AiocsVectorStore = class {
2655
3366
  pageUrl: point.pageUrl,
2656
3367
  pageTitle: point.pageTitle,
2657
3368
  sectionTitle: point.sectionTitle,
3369
+ pageKind: point.pageKind,
3370
+ filePath: point.filePath,
3371
+ language: point.language,
2658
3372
  modelKey: input.modelKey
2659
3373
  }
2660
3374
  }));
@@ -2867,7 +3581,10 @@ async function processEmbeddingJobs(input) {
2867
3581
  snapshotId: chunk.snapshotId,
2868
3582
  pageUrl: chunk.pageUrl,
2869
3583
  pageTitle: chunk.pageTitle,
2870
- sectionTitle: chunk.sectionTitle
3584
+ sectionTitle: chunk.sectionTitle,
3585
+ pageKind: chunk.pageKind,
3586
+ filePath: chunk.filePath,
3587
+ language: chunk.language
2871
3588
  }))
2872
3589
  });
2873
3590
  indexedChunkIds.push(...batch.map((chunk) => chunk.chunkId));
@@ -2960,7 +3677,7 @@ function getHybridRuntimeConfig(env = process.env) {
2960
3677
  // src/spec/source-spec-files.ts
2961
3678
  import { access, readdir } from "fs/promises";
2962
3679
  import { constants as fsConstants } from "fs";
2963
- import { extname as extname2, join as join5, resolve as resolve4 } from "path";
3680
+ import { extname as extname2, join as join6, resolve as resolve4 } from "path";
2964
3681
  var SOURCE_SPEC_EXTENSIONS = /* @__PURE__ */ new Set([".yaml", ".yml", ".json"]);
2965
3682
  function uniqueResolvedPaths(paths) {
2966
3683
  const seen = /* @__PURE__ */ new Set();
@@ -2987,7 +3704,7 @@ async function walkSourceSpecFiles(rootDir) {
2987
3704
  const entries = await readdir(rootDir, { withFileTypes: true });
2988
3705
  const discovered = [];
2989
3706
  for (const entry of entries.sort((left, right) => left.name.localeCompare(right.name))) {
2990
- const entryPath = join5(rootDir, entry.name);
3707
+ const entryPath = join6(rootDir, entry.name);
2991
3708
  if (entry.isDirectory()) {
2992
3709
  discovered.push(...await walkSourceSpecFiles(entryPath));
2993
3710
  continue;
@@ -3007,7 +3724,7 @@ var DEFAULT_INTERVAL_MINUTES = 60;
3007
3724
  var DEFAULT_CONTAINER_SOURCE_DIR = "/app/sources";
3008
3725
  var BOOLEAN_TRUE_VALUES = /* @__PURE__ */ new Set(["1", "true", "yes", "on"]);
3009
3726
  var BOOLEAN_FALSE_VALUES = /* @__PURE__ */ new Set(["0", "false", "no", "off"]);
3010
- function nowIso3() {
3727
+ function nowIso4() {
3011
3728
  return (/* @__PURE__ */ new Date()).toISOString();
3012
3729
  }
3013
3730
  function parsePositiveInteger2(raw, variableName) {
@@ -3033,7 +3750,7 @@ function parseBoolean(raw, variableName) {
3033
3750
  function parseDaemonConfig(env, options = {}) {
3034
3751
  const intervalMinutes = env.AIOCS_DAEMON_INTERVAL_MINUTES ? parsePositiveInteger2(env.AIOCS_DAEMON_INTERVAL_MINUTES, "AIOCS_DAEMON_INTERVAL_MINUTES") : DEFAULT_INTERVAL_MINUTES;
3035
3752
  const fetchOnStart = env.AIOCS_DAEMON_FETCH_ON_START ? parseBoolean(env.AIOCS_DAEMON_FETCH_ON_START, "AIOCS_DAEMON_FETCH_ON_START") : true;
3036
- const defaultContainerSourceDir = options.containerSourceDir ?? (existsSync2(DEFAULT_CONTAINER_SOURCE_DIR) ? DEFAULT_CONTAINER_SOURCE_DIR : void 0);
3753
+ const defaultContainerSourceDir = options.containerSourceDir ?? (existsSync3(DEFAULT_CONTAINER_SOURCE_DIR) ? DEFAULT_CONTAINER_SOURCE_DIR : void 0);
3037
3754
  const defaultSourceDirs = uniqueResolvedPaths([
3038
3755
  options.bundledSourceDir ?? getBundledSourcesDir(),
3039
3756
  options.userSourceDir ?? getAiocsSourcesDir(env),
@@ -3097,7 +3814,7 @@ async function bootstrapSourceSpecs(input) {
3097
3814
  };
3098
3815
  }
3099
3816
  async function runDaemonCycle(input) {
3100
- const startedAt = nowIso3();
3817
+ const startedAt = nowIso4();
3101
3818
  const bootstrapped = await bootstrapSourceSpecs({
3102
3819
  catalog: input.catalog,
3103
3820
  sourceSpecDirs: input.sourceSpecDirs,
@@ -3127,6 +3844,7 @@ async function runDaemonCycle(input) {
3127
3844
  const result = await runSourceCanary({
3128
3845
  catalog: input.catalog,
3129
3846
  sourceId,
3847
+ dataDir: input.dataDir,
3130
3848
  env: process.env
3131
3849
  });
3132
3850
  canaried.push({
@@ -3185,7 +3903,7 @@ async function runDaemonCycle(input) {
3185
3903
  }
3186
3904
  return {
3187
3905
  startedAt,
3188
- finishedAt: nowIso3(),
3906
+ finishedAt: nowIso4(),
3189
3907
  dueSourceIds,
3190
3908
  canaryDueSourceIds,
3191
3909
  bootstrapped,
@@ -3201,7 +3919,7 @@ async function startDaemon(input) {
3201
3919
  const intervalMs = input.config.intervalMinutes * 6e4;
3202
3920
  input.catalog.resetRunningEmbeddingJobs();
3203
3921
  input.catalog.markDaemonStarted({
3204
- startedAt: nowIso3(),
3922
+ startedAt: nowIso4(),
3205
3923
  intervalMinutes: input.config.intervalMinutes,
3206
3924
  fetchOnStart: input.config.fetchOnStart
3207
3925
  });
@@ -3212,7 +3930,7 @@ async function startDaemon(input) {
3212
3930
  sourceSpecDirs: input.config.sourceSpecDirs
3213
3931
  });
3214
3932
  const runCycle = async (reason) => {
3215
- const startedAt = nowIso3();
3933
+ const startedAt = nowIso4();
3216
3934
  input.catalog.markDaemonCycleStarted(startedAt);
3217
3935
  input.logger.emit({
3218
3936
  type: "daemon.cycle.started",
@@ -3238,7 +3956,7 @@ async function startDaemon(input) {
3238
3956
  });
3239
3957
  } catch (error) {
3240
3958
  input.catalog.markDaemonCycleCompleted({
3241
- completedAt: nowIso3(),
3959
+ completedAt: nowIso4(),
3242
3960
  status: "failed"
3243
3961
  });
3244
3962
  throw error;
@@ -3269,7 +3987,7 @@ async function startDaemon(input) {
3269
3987
  // package.json
3270
3988
  var package_default = {
3271
3989
  name: "@bodhi-ventures/aiocs",
3272
- version: "0.1.2",
3990
+ version: "0.2.0",
3273
3991
  license: "MIT",
3274
3992
  type: "module",
3275
3993
  description: "Local-only documentation store, fetcher, and search CLI for AI agents.",
@@ -3317,28 +4035,28 @@ var package_default = {
3317
4035
  "test:watch": "vitest"
3318
4036
  },
3319
4037
  dependencies: {
3320
- "@modelcontextprotocol/sdk": "^1.28.0",
3321
- "@mozilla/readability": "^0.6.0",
4038
+ "@modelcontextprotocol/sdk": "1.28.0",
4039
+ "@mozilla/readability": "0.6.0",
3322
4040
  "@qdrant/js-client-rest": "1.17.0",
3323
- "better-sqlite3": "^12.4.1",
3324
- commander: "^14.0.1",
3325
- jsdom: "^27.0.1",
3326
- playwright: "^1.57.0",
3327
- turndown: "^7.2.1",
3328
- "turndown-plugin-gfm": "^1.0.2",
3329
- yaml: "^2.8.1",
3330
- zod: "^4.1.12"
4041
+ "better-sqlite3": "12.4.1",
4042
+ commander: "14.0.1",
4043
+ jsdom: "27.0.1",
4044
+ playwright: "1.57.0",
4045
+ turndown: "7.2.1",
4046
+ "turndown-plugin-gfm": "1.0.2",
4047
+ yaml: "2.8.1",
4048
+ zod: "4.1.12"
3331
4049
  },
3332
4050
  devDependencies: {
3333
- "@types/better-sqlite3": "^7.6.13",
3334
- "@types/jsdom": "^21.1.7",
3335
- "@types/node": "^24.7.2",
3336
- "@types/turndown": "^5.0.5",
3337
- execa: "^9.6.0",
3338
- tsup: "^8.5.0",
3339
- tsx: "^4.20.6",
3340
- typescript: "^5.9.3",
3341
- vitest: "^3.2.4"
4051
+ "@types/better-sqlite3": "7.6.13",
4052
+ "@types/jsdom": "21.1.7",
4053
+ "@types/node": "24.7.2",
4054
+ "@types/turndown": "5.0.5",
4055
+ execa: "9.6.0",
4056
+ tsup: "8.5.0",
4057
+ tsx: "4.20.6",
4058
+ typescript: "5.9.3",
4059
+ vitest: "3.2.4"
3342
4060
  }
3343
4061
  };
3344
4062
 
@@ -3352,7 +4070,7 @@ import { resolve as resolve8 } from "path";
3352
4070
 
3353
4071
  // src/backup.ts
3354
4072
  import { cp, mkdir, readdir as readdir2, readFile as readFile2, rename, rm, stat, writeFile } from "fs/promises";
3355
- import { basename, dirname as dirname2, join as join6, resolve as resolve6 } from "path";
4073
+ import { basename, dirname as dirname3, join as join7, resolve as resolve6 } from "path";
3356
4074
  import { randomUUID as randomUUID2 } from "crypto";
3357
4075
  import Database2 from "better-sqlite3";
3358
4076
  var CATALOG_DB_FILENAME = "catalog.sqlite";
@@ -3380,7 +4098,7 @@ async function isDirectoryEmpty(path) {
3380
4098
  return (await readdir2(path)).length === 0;
3381
4099
  }
3382
4100
  async function listEntries(root, relativePath = "") {
3383
- const absolutePath = relativePath ? join6(root, relativePath) : root;
4101
+ const absolutePath = relativePath ? join7(root, relativePath) : root;
3384
4102
  const stats = await stat(absolutePath);
3385
4103
  if (!stats.isDirectory()) {
3386
4104
  return [{
@@ -3396,7 +4114,7 @@ async function listEntries(root, relativePath = "") {
3396
4114
  size: 0
3397
4115
  }] : [];
3398
4116
  for (const childName of childNames.sort()) {
3399
- entries.push(...await listEntries(root, relativePath ? join6(relativePath, childName) : childName));
4117
+ entries.push(...await listEntries(root, relativePath ? join7(relativePath, childName) : childName));
3400
4118
  }
3401
4119
  return entries;
3402
4120
  }
@@ -3410,12 +4128,12 @@ async function copyIfPresent(from, to, entries, relativePrefix) {
3410
4128
  entries.push(
3411
4129
  ...copiedEntries.map((entry) => ({
3412
4130
  ...entry,
3413
- relativePath: join6(relativePrefix, entry.relativePath)
4131
+ relativePath: join7(relativePrefix, entry.relativePath)
3414
4132
  }))
3415
4133
  );
3416
4134
  }
3417
4135
  async function copyDataDirForBackup(from, to) {
3418
- const sourceCatalogPath = join6(from, CATALOG_DB_FILENAME);
4136
+ const sourceCatalogPath = join7(from, CATALOG_DB_FILENAME);
3419
4137
  if (!await pathExists2(sourceCatalogPath)) {
3420
4138
  throw new AiocsError(
3421
4139
  AIOCS_ERROR_CODES.backupSourceMissing,
@@ -3431,10 +4149,13 @@ async function copyDataDirForBackup(from, to) {
3431
4149
  if (name === CATALOG_DB_FILENAME) {
3432
4150
  return false;
3433
4151
  }
4152
+ if (name === "git-mirrors") {
4153
+ return false;
4154
+ }
3434
4155
  return !SQLITE_SIDE_CAR_SUFFIXES.some((suffix) => name === `${CATALOG_DB_FILENAME}${suffix}`);
3435
4156
  }
3436
4157
  });
3437
- const targetCatalogPath = join6(to, CATALOG_DB_FILENAME);
4158
+ const targetCatalogPath = join7(to, CATALOG_DB_FILENAME);
3438
4159
  const sourceCatalog = new Database2(sourceCatalogPath, { readonly: true });
3439
4160
  try {
3440
4161
  await sourceCatalog.backup(targetCatalogPath);
@@ -3443,7 +4164,7 @@ async function copyDataDirForBackup(from, to) {
3443
4164
  }
3444
4165
  }
3445
4166
  async function loadValidatedBackupPayload(inputDir) {
3446
- const manifestPath = join6(inputDir, "manifest.json");
4167
+ const manifestPath = join7(inputDir, "manifest.json");
3447
4168
  await assertSourceDirExists(inputDir);
3448
4169
  if (!await pathExists2(manifestPath)) {
3449
4170
  throw new AiocsError(
@@ -3458,21 +4179,21 @@ async function loadValidatedBackupPayload(inputDir) {
3458
4179
  `Invalid backup manifest: ${manifestPath}`
3459
4180
  );
3460
4181
  }
3461
- const backupDataDir = join6(inputDir, "data");
4182
+ const backupDataDir = join7(inputDir, "data");
3462
4183
  if (!await pathExists2(backupDataDir)) {
3463
4184
  throw new AiocsError(
3464
4185
  AIOCS_ERROR_CODES.backupInvalid,
3465
4186
  `Backup payload is missing the data directory: ${backupDataDir}`
3466
4187
  );
3467
4188
  }
3468
- const backupCatalogPath = join6(backupDataDir, CATALOG_DB_FILENAME);
4189
+ const backupCatalogPath = join7(backupDataDir, CATALOG_DB_FILENAME);
3469
4190
  if (!await pathExists2(backupCatalogPath)) {
3470
4191
  throw new AiocsError(
3471
4192
  AIOCS_ERROR_CODES.backupInvalid,
3472
4193
  `Backup payload is missing the catalog database: ${backupCatalogPath}`
3473
4194
  );
3474
4195
  }
3475
- const backupConfigDir = join6(inputDir, "config");
4196
+ const backupConfigDir = join7(inputDir, "config");
3476
4197
  return {
3477
4198
  manifest,
3478
4199
  backupDataDir,
@@ -3480,8 +4201,8 @@ async function loadValidatedBackupPayload(inputDir) {
3480
4201
  };
3481
4202
  }
3482
4203
  async function prepareReplacementTarget(backupDir, targetDir) {
3483
- const parentDir = dirname2(targetDir);
3484
- const stagingDir = join6(parentDir, `.${basename(targetDir)}.import-${randomUUID2()}`);
4204
+ const parentDir = dirname3(targetDir);
4205
+ const stagingDir = join7(parentDir, `.${basename(targetDir)}.import-${randomUUID2()}`);
3485
4206
  await rm(stagingDir, { recursive: true, force: true });
3486
4207
  await mkdir(parentDir, { recursive: true });
3487
4208
  await cp(backupDir, stagingDir, { recursive: true, force: true });
@@ -3503,13 +4224,13 @@ async function exportBackup(input) {
3503
4224
  }
3504
4225
  await mkdir(outputDir, { recursive: true });
3505
4226
  const entries = [];
3506
- await copyDataDirForBackup(dataDir, join6(outputDir, "data"));
3507
- entries.push(...(await listEntries(join6(outputDir, "data"))).map((entry) => ({
4227
+ await copyDataDirForBackup(dataDir, join7(outputDir, "data"));
4228
+ entries.push(...(await listEntries(join7(outputDir, "data"))).map((entry) => ({
3508
4229
  ...entry,
3509
- relativePath: join6("data", entry.relativePath)
4230
+ relativePath: join7("data", entry.relativePath)
3510
4231
  })));
3511
4232
  if (configDir) {
3512
- await copyIfPresent(configDir, join6(outputDir, "config"), entries, "config");
4233
+ await copyIfPresent(configDir, join7(outputDir, "config"), entries, "config");
3513
4234
  }
3514
4235
  const manifest = {
3515
4236
  formatVersion: 1,
@@ -3517,7 +4238,7 @@ async function exportBackup(input) {
3517
4238
  packageVersion,
3518
4239
  entries
3519
4240
  };
3520
- const manifestPath = join6(outputDir, "manifest.json");
4241
+ const manifestPath = join7(outputDir, "manifest.json");
3521
4242
  await writeFile(manifestPath, JSON.stringify(manifest, null, 2), "utf8");
3522
4243
  return {
3523
4244
  outputDir,
@@ -3693,9 +4414,9 @@ async function verifyCoverageAgainstReferences(corpus, referenceFiles) {
3693
4414
 
3694
4415
  // src/doctor.ts
3695
4416
  import { access as access2 } from "fs/promises";
3696
- import { execFile } from "child_process";
3697
- import { promisify } from "util";
3698
- var execFileAsync = promisify(execFile);
4417
+ import { execFile as execFile2 } from "child_process";
4418
+ import { promisify as promisify2 } from "util";
4419
+ var execFileAsync2 = promisify2(execFile2);
3699
4420
  function summarize(checks) {
3700
4421
  const passCount = checks.filter((check) => check.status === "pass").length;
3701
4422
  const warnCount = checks.filter((check) => check.status === "warn").length;
@@ -3782,6 +4503,25 @@ async function checkPlaywright() {
3782
4503
  };
3783
4504
  }
3784
4505
  }
4506
+ async function checkGit() {
4507
+ try {
4508
+ const { stdout } = await execFileAsync2("git", ["--version"]);
4509
+ return {
4510
+ id: "git",
4511
+ status: "pass",
4512
+ summary: "Git executable is available.",
4513
+ details: {
4514
+ version: stdout.trim()
4515
+ }
4516
+ };
4517
+ } catch (error) {
4518
+ return {
4519
+ id: "git",
4520
+ status: "fail",
4521
+ summary: `Git is not ready: ${toErrorMessage(error)}`
4522
+ };
4523
+ }
4524
+ }
3785
4525
  async function checkDaemonConfig(env) {
3786
4526
  try {
3787
4527
  const daemonConfig = parseDaemonConfig(env, {
@@ -4031,7 +4771,7 @@ async function checkEmbeddings(env) {
4031
4771
  }
4032
4772
  async function checkDocker() {
4033
4773
  try {
4034
- const { stdout } = await execFileAsync("docker", ["info", "--format", "{{json .ServerVersion}}"]);
4774
+ const { stdout } = await execFileAsync2("docker", ["info", "--format", "{{json .ServerVersion}}"]);
4035
4775
  const version = JSON.parse(stdout.trim());
4036
4776
  return {
4037
4777
  id: "docker",
@@ -4047,7 +4787,7 @@ async function checkDocker() {
4047
4787
  return {
4048
4788
  id: "docker",
4049
4789
  status: "warn",
4050
- summary: "Docker CLI is not installed; Docker-based daemon deployment is unavailable on this machine."
4790
+ summary: "Docker CLI is not installed; Docker-based daemon deployment is unavailable in this environment."
4051
4791
  };
4052
4792
  }
4053
4793
  return {
@@ -4059,6 +4799,7 @@ async function checkDocker() {
4059
4799
  }
4060
4800
  async function runDoctor(env = process.env) {
4061
4801
  const catalogCheck = await checkCatalog(env);
4802
+ const gitCheck = await checkGit();
4062
4803
  const playwrightCheck = await checkPlaywright();
4063
4804
  const { daemonConfigCheck, daemonConfig } = await checkDaemonConfig(env);
4064
4805
  const sourceSpecDirsCheck = await checkSourceSpecDirs(daemonConfig);
@@ -4070,6 +4811,7 @@ async function runDoctor(env = process.env) {
4070
4811
  const dockerCheck = await checkDocker();
4071
4812
  const checks = [
4072
4813
  catalogCheck,
4814
+ gitCheck,
4073
4815
  playwrightCheck,
4074
4816
  daemonConfigCheck,
4075
4817
  sourceSpecDirsCheck,
@@ -4124,6 +4866,19 @@ function withScores(rows, scoreLookup) {
4124
4866
  };
4125
4867
  });
4126
4868
  }
4869
+ function matchesChunkFilters(row, filters) {
4870
+ if (filters.pathPatterns && filters.pathPatterns.length > 0) {
4871
+ if (!row.filePath || !matchesPatterns(row.filePath, filters.pathPatterns)) {
4872
+ return false;
4873
+ }
4874
+ }
4875
+ if (filters.languages && filters.languages.length > 0) {
4876
+ if (!row.language || !filters.languages.includes(row.language.toLowerCase())) {
4877
+ return false;
4878
+ }
4879
+ }
4880
+ return true;
4881
+ }
4127
4882
  async function searchHybridCatalog(input) {
4128
4883
  const scope = input.catalog.resolveSearchScope({
4129
4884
  query: input.query,
@@ -4131,6 +4886,8 @@ async function searchHybridCatalog(input) {
4131
4886
  ...input.searchInput.sourceIds ? { sourceIds: input.searchInput.sourceIds } : {},
4132
4887
  ...input.searchInput.snapshotId ? { snapshotId: input.searchInput.snapshotId } : {},
4133
4888
  ...input.searchInput.all ? { all: true } : {},
4889
+ ...input.searchInput.pathPatterns ? { pathPatterns: input.searchInput.pathPatterns } : {},
4890
+ ...input.searchInput.languages ? { languages: input.searchInput.languages } : {},
4134
4891
  ...typeof input.searchInput.limit === "number" ? { limit: input.searchInput.limit } : {},
4135
4892
  ...typeof input.searchInput.offset === "number" ? { offset: input.searchInput.offset } : {}
4136
4893
  });
@@ -4193,13 +4950,25 @@ async function searchHybridCatalog(input) {
4193
4950
  );
4194
4951
  }
4195
4952
  const vectorStore = new AiocsVectorStore(input.config);
4196
- vectorCandidates = await vectorStore.search({
4953
+ const rawVectorCandidates = await vectorStore.search({
4197
4954
  vector: queryVector,
4198
4955
  snapshotIds: scope.snapshotIds,
4199
4956
  sourceIds: scope.sourceIds,
4200
4957
  modelKey,
4201
4958
  limit: windowSize(scope.limit, scope.offset, input.config.vectorCandidateWindow)
4202
4959
  });
4960
+ if (rawVectorCandidates.length > 0 && (scope.pathPatterns || scope.languages)) {
4961
+ const candidateRows = input.catalog.getChunksByIds(rawVectorCandidates.map((candidate) => candidate.chunkId));
4962
+ const allowedIds = new Set(
4963
+ candidateRows.filter((row) => matchesChunkFilters(row, {
4964
+ pathPatterns: scope.pathPatterns,
4965
+ languages: scope.languages
4966
+ })).map((row) => row.chunkId)
4967
+ );
4968
+ vectorCandidates = rawVectorCandidates.filter((candidate) => allowedIds.has(candidate.chunkId));
4969
+ } else {
4970
+ vectorCandidates = rawVectorCandidates;
4971
+ }
4203
4972
  } catch (error) {
4204
4973
  if (input.mode === "auto") {
4205
4974
  return lexicalOnly();
@@ -4355,7 +5124,7 @@ async function refreshDueSources(sourceIdOrAll = "all") {
4355
5124
  return { results };
4356
5125
  }
4357
5126
  async function runSourceCanaries(sourceIdOrAll) {
4358
- const results = await withCatalog(async ({ catalog }) => {
5127
+ const results = await withCatalog(async ({ catalog, dataDir }) => {
4359
5128
  const sourceIds = sourceIdOrAll === "all" ? catalog.listSources().map((item) => item.id) : [sourceIdOrAll];
4360
5129
  if (sourceIds.length === 0) {
4361
5130
  return [];
@@ -4365,6 +5134,7 @@ async function runSourceCanaries(sourceIdOrAll) {
4365
5134
  canaried.push(await runSourceCanary({
4366
5135
  catalog,
4367
5136
  sourceId,
5137
+ dataDir,
4368
5138
  env: process.env
4369
5139
  }));
4370
5140
  }
@@ -4424,6 +5194,8 @@ async function searchCatalog(query, options) {
4424
5194
  ...explicitSources ? { sourceIds: options.source } : {},
4425
5195
  ...options.snapshot ? { snapshotId: options.snapshot } : {},
4426
5196
  ...options.all ? { all: true } : {},
5197
+ ...options.path && options.path.length > 0 ? { pathPatterns: options.path } : {},
5198
+ ...options.language && options.language.length > 0 ? { languages: options.language } : {},
4427
5199
  ...typeof options.limit === "number" ? { limit: options.limit } : {},
4428
5200
  ...typeof options.offset === "number" ? { offset: options.offset } : {}
4429
5201
  }