preflight-mcp 0.1.0 → 0.1.1

This diff compares the published contents of two package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the packages exactly as they appear in their respective public registries.
@@ -3,7 +3,8 @@ import fs from 'node:fs/promises';
 import path from 'node:path';
 import { logger } from '../logging/logger.js';
 import { getLocalHeadSha, getRemoteHeadSha, parseOwnerRepo, shallowClone, toCloneUrl, } from './github.js';
-import { ingestRepoToBundle } from './ingest.js';
+import { downloadAndExtractGitHubArchive } from './githubArchive.js';
+import { classifyIngestedFileKind, ingestRepoToBundle } from './ingest.js';
 import { writeManifest, readManifest } from './manifest.js';
 import { getBundlePaths, repoMetaPath, repoNormDir, repoRawDir, repoRootDir } from './paths.js';
 import { writeAgentsMd, writeStartHereMd } from './guides.js';
@@ -13,22 +14,261 @@ import { ingestContext7Libraries } from './context7.js';
 import { ingestDeepWikiRepo } from './deepwiki.js';
 import { analyzeBundleStatic } from './analysis.js';
 import { autoDetectTags, generateDisplayName, generateDescription } from './tagging.js';
+import { bundleCreationLimiter } from '../core/concurrency-limiter.js';
+const DEDUP_INDEX_FILE = '.preflight-dedup-index.json';
+function sha256Hex(text) {
+    return crypto.createHash('sha256').update(text, 'utf8').digest('hex');
+}
+function normalizeList(values) {
+    return (values ?? [])
+        .map((s) => s.trim())
+        .filter(Boolean)
+        .map((s) => s.toLowerCase())
+        .sort();
+}
+function normalizeDeepWikiUrl(raw) {
+    const trimmed = raw.trim();
+    try {
+        const u = new URL(trimmed);
+        u.hash = '';
+        // Normalize host and strip trailing slash.
+        u.host = u.host.toLowerCase();
+        u.pathname = u.pathname.replace(/\/+$/g, '');
+        return u.toString();
+    }
+    catch {
+        return trimmed;
+    }
+}
+function canonicalizeCreateInput(input) {
+    const repos = input.repos
+        .map((r) => {
+        if (r.kind === 'github') {
+            const { owner, repo } = parseOwnerRepo(r.repo);
+            return {
+                kind: 'github',
+                repo: `${owner.toLowerCase()}/${repo.toLowerCase()}`,
+                ref: (r.ref ?? '').trim() || undefined,
+            };
+        }
+        if (r.kind === 'local') {
+            // For de-duplication, treat local imports as equivalent to github imports of the same logical repo/ref.
+            const { owner, repo } = parseOwnerRepo(r.repo);
+            return {
+                kind: 'github',
+                repo: `${owner.toLowerCase()}/${repo.toLowerCase()}`,
+                ref: (r.ref ?? '').trim() || undefined,
+            };
+        }
+        return {
+            kind: 'deepwiki',
+            url: normalizeDeepWikiUrl(r.url),
+        };
+    })
+        .sort((a, b) => {
+        const ka = a.kind === 'github' ? `github:${a.repo}:${a.ref ?? ''}` : `deepwiki:${a.url}`;
+        const kb = b.kind === 'github' ? `github:${b.repo}:${b.ref ?? ''}` : `deepwiki:${b.url}`;
+        return ka.localeCompare(kb);
+    });
+    return {
+        schemaVersion: 1,
+        repos,
+        libraries: normalizeList(input.libraries),
+        topics: normalizeList(input.topics),
+    };
+}
+export function computeCreateInputFingerprint(input) {
+    const canonical = canonicalizeCreateInput(input);
+    return sha256Hex(JSON.stringify(canonical));
+}
+function dedupIndexPath(storageDir) {
+    return path.join(storageDir, DEDUP_INDEX_FILE);
+}
+async function readDedupIndex(storageDir) {
+    const p = dedupIndexPath(storageDir);
+    try {
+        const raw = await fs.readFile(p, 'utf8');
+        const parsed = JSON.parse(raw);
+        if (parsed.schemaVersion !== 1 || typeof parsed.byFingerprint !== 'object' || !parsed.byFingerprint) {
+            return { schemaVersion: 1, updatedAt: nowIso(), byFingerprint: {} };
+        }
+        return parsed;
+    }
+    catch {
+        return { schemaVersion: 1, updatedAt: nowIso(), byFingerprint: {} };
+    }
+}
+async function writeDedupIndex(storageDir, idx) {
+    const p = dedupIndexPath(storageDir);
+    await ensureDir(path.dirname(p));
+    // Use atomic write (write to temp file, then rename) to prevent corruption
+    const tmpPath = `${p}.tmp.${Date.now()}.${Math.random().toString(36).slice(2)}`;
+    try {
+        await fs.writeFile(tmpPath, JSON.stringify(idx, null, 2) + '\n', 'utf8');
+        // Atomic rename on POSIX; near-atomic on Windows
+        await fs.rename(tmpPath, p);
+    }
+    catch (err) {
+        // Clean up temp file on error
+        try {
+            await fs.unlink(tmpPath);
+        }
+        catch {
+            // Ignore cleanup errors
+        }
+        throw err;
+    }
+}
+async function updateDedupIndexBestEffort(cfg, fingerprint, bundleId, bundleUpdatedAt) {
+    for (const storageDir of cfg.storageDirs) {
+        try {
+            const parentAvailable = await isParentAvailable(storageDir);
+            if (!parentAvailable)
+                continue;
+            await ensureDir(storageDir);
+            const idx = await readDedupIndex(storageDir);
+            idx.byFingerprint[fingerprint] = { bundleId, bundleUpdatedAt };
+            idx.updatedAt = nowIso();
+            await writeDedupIndex(storageDir, idx);
+        }
+        catch {
+            // best-effort
+        }
+    }
+}
+async function readBundleSummary(cfg, bundleId) {
+    const storageDir = (await findBundleStorageDir(cfg.storageDirs, bundleId)) ?? (await getEffectiveStorageDir(cfg));
+    const paths = getBundlePaths(storageDir, bundleId);
+    const manifest = await readManifest(paths.manifestPath);
+    return {
+        bundleId: manifest.bundleId,
+        createdAt: manifest.createdAt,
+        updatedAt: manifest.updatedAt,
+        repos: manifest.repos.map((r) => ({
+            kind: r.kind,
+            id: r.id,
+            source: r.source,
+            headSha: r.headSha,
+            notes: r.notes,
+        })),
+        libraries: manifest.libraries,
+    };
+}
+export async function findBundleByInputs(cfg, input) {
+    const fingerprint = computeCreateInputFingerprint(input);
+    return findExistingBundleByFingerprint(cfg, fingerprint);
+}
+async function findExistingBundleByFingerprint(cfg, fingerprint) {
+    // Fast path: consult any available dedup index.
+    for (const storageDir of cfg.storageDirs) {
+        try {
+            if (!(await isPathAvailable(storageDir)))
+                continue;
+            const idx = await readDedupIndex(storageDir);
+            const hit = idx.byFingerprint[fingerprint];
+            if (hit?.bundleId && (await bundleExistsMulti(cfg.storageDirs, hit.bundleId))) {
+                return hit.bundleId;
+            }
+        }
+        catch {
+            // ignore
+        }
+    }
+    // Slow path: scan manifests (works even for bundles created before fingerprints existed).
+    let best = null;
+    for (const storageDir of cfg.storageDirs) {
+        if (!(await isPathAvailable(storageDir)))
+            continue;
+        const ids = await listBundles(storageDir);
+        for (const id of ids) {
+            try {
+                const paths = getBundlePaths(storageDir, id);
+                const manifest = await readManifest(paths.manifestPath);
+                const fp = computeCreateInputFingerprint({
+                    repos: manifest.inputs.repos,
+                    libraries: manifest.inputs.libraries,
+                    topics: manifest.inputs.topics,
+                });
+                if (fp === fingerprint) {
+                    const updatedAt = manifest.updatedAt;
+                    if (!best || new Date(updatedAt) > new Date(best.updatedAt)) {
+                        best = { bundleId: manifest.bundleId, updatedAt };
+                    }
+                }
+            }
+            catch {
+                // ignore corrupt bundles
+            }
+        }
+    }
+    if (best) {
+        // Seed index for next time (best-effort).
+        await updateDedupIndexBestEffort(cfg, fingerprint, best.bundleId, best.updatedAt);
+        return best.bundleId;
+    }
+    return null;
+}
 async function ensureDir(p) {
     await fs.mkdir(p, { recursive: true });
 }
 function nowIso() {
     return new Date().toISOString();
 }
+function toPosix(p) {
+    return p.replaceAll('\\', '/');
+}
+function sha256Text(text) {
+    return crypto.createHash('sha256').update(text, 'utf8').digest('hex');
+}
+async function statOrNull(p) {
+    try {
+        return await fs.stat(p);
+    }
+    catch {
+        return null;
+    }
+}
+async function readUtf8OrNull(p) {
+    try {
+        return await fs.readFile(p, 'utf8');
+    }
+    catch {
+        return null;
+    }
+}
+async function* walkFilesNoIgnore(rootDir) {
+    const stack = [rootDir];
+    while (stack.length) {
+        const dir = stack.pop();
+        const entries = await fs.readdir(dir, { withFileTypes: true });
+        for (const ent of entries) {
+            const abs = path.join(dir, ent.name);
+            const rel = toPosix(path.relative(rootDir, abs));
+            if (ent.isDirectory()) {
+                stack.push(abs);
+                continue;
+            }
+            if (!ent.isFile())
+                continue;
+            yield { absPath: abs, relPosix: rel };
+        }
+    }
+}
 async function writeRepoMeta(params) {
     await ensureDir(path.dirname(params.metaPath));
     const obj = {
         repoId: params.repoId,
         cloneUrl: params.cloneUrl,
-        headSha: params.headSha,
         fetchedAt: params.fetchedAt,
         ingestedFiles: params.ingestedFiles,
         skipped: params.skipped,
     };
+    if (params.headSha)
+        obj.headSha = params.headSha;
+    if (params.source)
+        obj.source = params.source;
+    if (params.ref)
+        obj.ref = params.ref;
     await fs.writeFile(params.metaPath, JSON.stringify(obj, null, 2) + '\n', 'utf8');
 }
 async function rmIfExists(p) {
@@ -205,26 +445,47 @@ async function mirrorBundleToBackups(primaryDir, backupDirs, bundleId) {
     const srcPath = path.join(primaryDir, bundleId);
     const mirrored = [];
     const failed = [];
-    for (const backupDir of backupDirs) {
-        if (backupDir === primaryDir)
-            continue; // Skip primary
+    // Mirror to all backup dirs in parallel for better performance
+    const mirrorPromises = backupDirs
+        .filter(dir => dir !== primaryDir) // Skip primary
+        .map(async (backupDir) => {
         const destPath = path.join(backupDir, bundleId);
         try {
             // Check if backup location is available
             const parentAvailable = await isParentAvailable(destPath);
             if (!parentAvailable) {
-                failed.push({ path: backupDir, error: 'Mount not available' });
-                continue;
+                return { success: false, path: backupDir, error: 'Mount not available' };
             }
             // Ensure backup dir exists
             await ensureDir(backupDir);
             // Remove old and copy new
             await rmIfExists(destPath);
             await copyDir(srcPath, destPath);
-            mirrored.push(backupDir);
+            return { success: true, path: backupDir };
         }
         catch (err) {
-            failed.push({ path: backupDir, error: err instanceof Error ? err.message : String(err) });
+            return {
+                success: false,
+                path: backupDir,
+                error: err instanceof Error ? err.message : String(err)
+            };
+        }
+    });
+    // Wait for all mirror operations to complete
+    const results = await Promise.allSettled(mirrorPromises);
+    for (const result of results) {
+        if (result.status === 'fulfilled') {
+            const { success, path: backupPath, error } = result.value;
+            if (success) {
+                mirrored.push(backupPath);
+            }
+            else {
+                failed.push({ path: backupPath, error: error ?? 'Unknown error' });
+            }
+        }
+        else {
+            // Promise rejection (shouldn't happen with try-catch, but handle it)
+            failed.push({ path: 'unknown', error: result.reason?.message ?? String(result.reason) });
         }
     }
     return { mirrored, failed };
@@ -330,13 +591,91 @@ async function syncStaleBackups(sourceDir, allDirs, bundleId) {
         }
     }
 }
+async function writeLocalRepoMeta(params) {
+    await ensureDir(path.dirname(params.metaPath));
+    const obj = {
+        repoId: params.repoId,
+        source: 'local',
+        localPath: params.localPath,
+        ref: params.ref,
+        fetchedAt: params.fetchedAt,
+        ingestedFiles: params.ingestedFiles,
+        skipped: params.skipped,
+    };
+    await fs.writeFile(params.metaPath, JSON.stringify(obj, null, 2) + '\n', 'utf8');
+}
+async function ingestLocalRepo(params) {
+    const repoId = `${params.owner}/${params.repo}`;
+    const repoRoot = path.resolve(params.localPath);
+    const st = await fs.stat(repoRoot);
+    if (!st.isDirectory()) {
+        throw new Error(`Local repo path is not a directory: ${repoRoot}`);
+    }
+    const bundlePaths = getBundlePaths(params.storageDir, params.bundleId);
+    const rawDest = repoRawDir(bundlePaths, params.owner, params.repo);
+    const normDest = repoNormDir(bundlePaths, params.owner, params.repo);
+    await rmIfExists(rawDest);
+    await rmIfExists(normDest);
+    await ensureDir(rawDest);
+    await ensureDir(normDest);
+    const bundleNormPrefixPosix = `repos/${params.owner}/${params.repo}/norm`;
+    const ingested = await ingestRepoToBundle({
+        repoId,
+        repoRoot,
+        rawDestRoot: rawDest,
+        normDestRoot: normDest,
+        bundleNormPrefixPosix,
+        options: {
+            maxFileBytes: params.cfg.maxFileBytes,
+            maxTotalBytes: params.cfg.maxTotalBytes,
+        },
+    });
+    const fetchedAt = nowIso();
+    await writeLocalRepoMeta({
+        metaPath: repoMetaPath(bundlePaths, params.owner, params.repo),
+        repoId,
+        localPath: repoRoot,
+        ref: params.ref,
+        fetchedAt,
+        ingestedFiles: ingested.files.length,
+        skipped: ingested.skipped,
+    });
+    return { files: ingested.files, skipped: ingested.skipped };
+}
 async function cloneAndIngestGitHubRepo(params) {
     const repoId = `${params.owner}/${params.repo}`;
     const cloneUrl = toCloneUrl({ owner: params.owner, repo: params.repo });
-    const tmpCheckout = path.join(params.cfg.tmpDir, 'checkouts', params.bundleId, `${params.owner}__${params.repo}`);
-    await rmIfExists(tmpCheckout);
-    await shallowClone(cloneUrl, tmpCheckout, { ref: params.ref });
-    const headSha = await getLocalHeadSha(tmpCheckout);
+    const tmpBase = path.join(params.cfg.tmpDir, 'checkouts', params.bundleId, `${params.owner}__${params.repo}`);
+    const tmpCheckoutGit = tmpBase;
+    const tmpArchiveDir = `${tmpBase}__archive`;
+    await rmIfExists(tmpCheckoutGit);
+    await rmIfExists(tmpArchiveDir);
+    let repoRootForIngest = tmpCheckoutGit;
+    let headSha;
+    const notes = [];
+    let source = 'git';
+    let fetchedAt = nowIso();
+    let refUsed = params.ref;
+    try {
+        await shallowClone(cloneUrl, tmpCheckoutGit, { ref: params.ref, timeoutMs: params.cfg.gitCloneTimeoutMs });
+        headSha = await getLocalHeadSha(tmpCheckoutGit);
+    }
+    catch (err) {
+        // Fallback: GitHub archive download (zipball) + extract.
+        source = 'archive';
+        const msg = err instanceof Error ? err.message : String(err);
+        notes.push(`git clone failed; used GitHub archive fallback: ${msg}`);
+        const archive = await downloadAndExtractGitHubArchive({
+            cfg: params.cfg,
+            owner: params.owner,
+            repo: params.repo,
+            ref: params.ref,
+            destDir: tmpArchiveDir,
+        });
+        repoRootForIngest = archive.repoRoot;
+        fetchedAt = archive.fetchedAt;
+        refUsed = archive.refUsed;
+    }
     const bundlePaths = getBundlePaths(params.storageDir, params.bundleId);
     const rawDest = repoRawDir(bundlePaths, params.owner, params.repo);
     const normDest = repoNormDir(bundlePaths, params.owner, params.repo);
@@ -347,7 +686,7 @@ async function cloneAndIngestGitHubRepo(params) {
     const bundleNormPrefixPosix = `repos/${params.owner}/${params.repo}/norm`;
     const ingested = await ingestRepoToBundle({
         repoId,
-        repoRoot: tmpCheckout,
+        repoRoot: repoRootForIngest,
         rawDestRoot: rawDest,
        normDestRoot: normDest,
         bundleNormPrefixPosix,
@@ -356,7 +695,6 @@
             maxTotalBytes: params.cfg.maxTotalBytes,
         },
     });
-    const fetchedAt = nowIso();
     await writeRepoMeta({
         metaPath: repoMetaPath(bundlePaths, params.owner, params.repo),
         repoId,
@@ -365,9 +703,12 @@
         fetchedAt,
         ingestedFiles: ingested.files.length,
         skipped: ingested.skipped,
+        source,
+        ref: refUsed,
     });
-    await rmIfExists(tmpCheckout);
-    return { headSha, files: ingested.files, skipped: ingested.skipped };
+    await rmIfExists(tmpCheckoutGit);
+    await rmIfExists(tmpArchiveDir);
+    return { headSha, files: ingested.files, skipped: ingested.skipped, notes, source };
 }
 function groupFilesByRepoId(files) {
     const byRepo = new Map();
@@ -401,7 +742,28 @@ async function generateFactsBestEffort(params) {
         logger.error('Static analysis exception', err instanceof Error ? err : undefined);
     }
 }
-export async function createBundle(cfg, input) {
+export async function createBundle(cfg, input, options) {
+    // Apply concurrency limiting to prevent DoS attacks
+    return await bundleCreationLimiter.run(async () => {
+        return await createBundleInternal(cfg, input, options);
+    });
+}
+async function createBundleInternal(cfg, input, options) {
+    const fingerprint = computeCreateInputFingerprint(input);
+    const ifExists = options?.ifExists ?? 'error';
+    if (ifExists !== 'createNew') {
+        const existing = await findExistingBundleByFingerprint(cfg, fingerprint);
+        if (existing) {
+            if (ifExists === 'returnExisting') {
+                return await readBundleSummary(cfg, existing);
+            }
+            if (ifExists === 'updateExisting') {
+                const { summary } = await updateBundle(cfg, existing);
+                return summary;
+            }
+            throw new Error(`Bundle already exists for these inputs: ${existing}`);
+        }
+    }
     const bundleId = crypto.randomUUID();
     const createdAt = nowIso();
     // Use effective storage dir (falls back if primary unavailable)
@@ -417,7 +779,7 @@ export async function createBundle(cfg, input) {
     for (const repoInput of input.repos) {
         if (repoInput.kind === 'github') {
            const { owner, repo } = parseOwnerRepo(repoInput.repo);
-            const { headSha, files, skipped } = await cloneAndIngestGitHubRepo({
+            const { headSha, files, skipped, notes, source } = await cloneAndIngestGitHubRepo({
                 cfg,
                 bundleId,
                 storageDir: effectiveStorageDir,
@@ -426,7 +788,27 @@ export async function createBundle(cfg, input) {
                 ref: repoInput.ref,
             });
             allIngestedFiles.push(...files);
-            reposSummary.push({ kind: 'github', id: `${owner}/${repo}`, headSha, notes: skipped.slice(0, 50) });
+            reposSummary.push({
+                kind: 'github',
+                id: `${owner}/${repo}`,
+                source,
+                headSha,
+                notes: [...notes, ...skipped].slice(0, 50),
+            });
+        }
+        else if (repoInput.kind === 'local') {
+            const { owner, repo } = parseOwnerRepo(repoInput.repo);
+            const { files, skipped } = await ingestLocalRepo({
+                cfg,
+                bundleId,
+                storageDir: effectiveStorageDir,
+                owner,
+                repo,
+                localPath: repoInput.path,
+                ref: repoInput.ref,
+            });
+            allIngestedFiles.push(...files);
+            reposSummary.push({ kind: 'local', id: `${owner}/${repo}`, source: 'local', notes: skipped.slice(0, 50) });
         }
         else {
             // DeepWiki integration: fetch and convert to Markdown.
@@ -439,6 +821,7 @@ export async function createBundle(cfg, input) {
             reposSummary.push({
                 kind: 'deepwiki',
                 id: deepwikiResult.summary.repoId,
+                source: 'deepwiki',
                 notes: deepwikiResult.summary.notes,
             });
         }
@@ -482,6 +865,7 @@ export async function createBundle(cfg, input) {
         bundleId,
         createdAt,
         updatedAt: createdAt,
+        fingerprint,
         displayName,
         description,
         tags,
@@ -494,6 +878,7 @@ export async function createBundle(cfg, input) {
         repos: reposSummary.map((r) => ({
             kind: r.kind,
             id: r.id,
+            source: r.source,
             headSha: r.headSha,
             fetchedAt: createdAt,
             notes: r.notes,
@@ -516,7 +901,7 @@ export async function createBundle(cfg, input) {
     });
     // Overview (S2: factual-only with evidence pointers).
     const perRepoOverviews = reposSummary
-        .filter((r) => r.kind === 'github')
+        .filter((r) => r.kind === 'github' || r.kind === 'local')
         .map((r) => {
         const repoId = r.id;
         const repoFiles = allIngestedFiles.filter((f) => f.repoId === repoId);
@@ -547,6 +932,8 @@ export async function createBundle(cfg, input) {
         logger.error(errorMsg);
         throw new Error(errorMsg);
     }
+    // Update de-duplication index (best-effort). This is intentionally after validation.
+    await updateDedupIndexBestEffort(cfg, fingerprint, bundleId, createdAt);
     const summary = {
         bundleId,
         createdAt,
@@ -592,6 +979,14 @@ export async function checkForUpdates(cfg, bundleId) {
             hasUpdates = true;
             details.push({ repoId, currentSha: prev?.headSha, remoteSha, changed });
         }
+        else if (repoInput.kind === 'local') {
+            const { owner, repo } = parseOwnerRepo(repoInput.repo);
+            const repoId = `${owner}/${repo}`;
+            // We can't reliably detect whether local files changed without scanning; assume possible update.
+            const prev = manifest.repos.find((r) => r.id === repoId);
+            details.push({ repoId, currentSha: prev?.headSha, changed: true });
+            hasUpdates = true;
+        }
         else {
             // DeepWiki: can't easily detect changes, assume possible update
             details.push({ repoId: repoInput.url, changed: true });
@@ -600,6 +995,257 @@ export async function checkForUpdates(cfg, bundleId) {
     }
     return { hasUpdates, details };
 }
+async function scanBundleIndexableFiles(params) {
+    const files = [];
+    const skipped = [];
+    let totalBytes = 0;
+    const pushFile = async (f) => {
+        const st = await statOrNull(f.absPath);
+        if (!st?.isFile())
+            return;
+        if (st.size > params.cfg.maxFileBytes) {
+            skipped.push(`${f.bundleRelPosix} (too large: ${st.size} bytes)`);
+            return;
+        }
+        if (totalBytes + st.size > params.cfg.maxTotalBytes) {
+            skipped.push(`(bundle maxTotalBytes reached) stopped before: ${f.bundleRelPosix}`);
+            return;
+        }
+        const text = await readUtf8OrNull(f.absPath);
+        if (text === null) {
+            skipped.push(`${f.bundleRelPosix} (unreadable utf8)`);
+            return;
+        }
+        const normalized = text.replace(/\r\n/g, '\n');
+        const sha256 = sha256Text(normalized);
+        totalBytes += st.size;
+        files.push({
+            repoId: f.repoId,
+            kind: f.kind,
+            repoRelativePath: f.repoRelativePath,
+            bundleNormRelativePath: f.bundleRelPosix,
+            bundleNormAbsPath: f.absPath,
+            sha256,
+            bytes: st.size,
+        });
+    };
+    // 1) repos/<owner>/<repo>/norm/** (github/local)
+    try {
+        const owners = await fs.readdir(params.reposDir, { withFileTypes: true });
+        for (const ownerEnt of owners) {
+            if (!ownerEnt.isDirectory())
+                continue;
+            const owner = ownerEnt.name;
+            const ownerDir = path.join(params.reposDir, owner);
+            const repos = await fs.readdir(ownerDir, { withFileTypes: true });
+            for (const repoEnt of repos) {
+                if (!repoEnt.isDirectory())
+                    continue;
+                const repo = repoEnt.name;
+                const normDir = path.join(ownerDir, repo, 'norm');
+                const normSt = await statOrNull(normDir);
+                if (!normSt?.isDirectory())
+                    continue;
+                for await (const wf of walkFilesNoIgnore(normDir)) {
+                    const repoRel = wf.relPosix;
+                    const kind = classifyIngestedFileKind(repoRel);
+                    const bundleRel = `repos/${owner}/${repo}/norm/${repoRel}`;
+                    await pushFile({
+                        repoId: `${owner}/${repo}`,
+                        kind,
+                        repoRelativePath: repoRel,
+                        bundleRelPosix: bundleRel,
+                        absPath: wf.absPath,
+                    });
+                }
+            }
+        }
+    }
+    catch {
+        // ignore missing repos dir
+    }
+    // 2) libraries/context7/** (docs-only)
+    const context7Dir = path.join(params.librariesDir, 'context7');
+    const ctxSt = await statOrNull(context7Dir);
+    if (ctxSt?.isDirectory()) {
+        for await (const wf of walkFilesNoIgnore(context7Dir)) {
+            // Match original ingestion: only .md docs are indexed from Context7.
+            if (!wf.relPosix.toLowerCase().endsWith('.md'))
+                continue;
+            const relFromLibRoot = wf.relPosix; // relative to libraries/context7
+            const parts = relFromLibRoot.split('/').filter(Boolean);
+            const fileName = parts[parts.length - 1] ?? '';
+            const dirParts = parts.slice(0, -1);
+            let repoId = 'context7:unknown';
+            if (dirParts[0] === '_unresolved' && dirParts[1]) {
+                repoId = `context7:unresolved/${dirParts[1]}`;
+            }
+            else if (dirParts.length > 0) {
+                repoId = `context7:/${dirParts.join('/')}`;
+            }
+            const bundleRel = `libraries/context7/${relFromLibRoot}`;
+            await pushFile({
+                repoId,
+                kind: 'doc',
+                repoRelativePath: fileName,
+                bundleRelPosix: bundleRel,
+                absPath: wf.absPath,
+            });
+        }
+    }
+    // 3) deepwiki/<owner>/<repo>/norm/** (docs-only)
+    const deepwikiDir = path.join(params.bundleRootDir, 'deepwiki');
+    const dwSt = await statOrNull(deepwikiDir);
+    if (dwSt?.isDirectory()) {
+        // Only walk the norm subtrees.
+        const owners = await fs.readdir(deepwikiDir, { withFileTypes: true });
+        for (const ownerEnt of owners) {
+            if (!ownerEnt.isDirectory())
+                continue;
+            const owner = ownerEnt.name;
+            const ownerDir = path.join(deepwikiDir, owner);
+            const repos = await fs.readdir(ownerDir, { withFileTypes: true });
+            for (const repoEnt of repos) {
+                if (!repoEnt.isDirectory())
+                    continue;
+                const repo = repoEnt.name;
+                const normDir = path.join(ownerDir, repo, 'norm');
+                const normSt = await statOrNull(normDir);
+                if (!normSt?.isDirectory())
+                    continue;
+                for await (const wf of walkFilesNoIgnore(normDir)) {
+                    if (!wf.relPosix.toLowerCase().endsWith('.md'))
+                        continue;
+                    const bundleRel = `deepwiki/${owner}/${repo}/norm/${wf.relPosix}`;
+                    await pushFile({
+                        repoId: `deepwiki:${owner}/${repo}`,
+                        kind: 'doc',
+                        repoRelativePath: wf.relPosix,
+                        bundleRelPosix: bundleRel,
+                        absPath: wf.absPath,
+                    });
+                }
+            }
+        }
+    }
+    return { files, totalBytes, skipped };
+}
+export async function repairBundle(cfg, bundleId, options) {
+    const mode = options?.mode ?? 'repair';
+    const rebuildIndexOpt = options?.rebuildIndex ?? true;
+    const rebuildGuidesOpt = options?.rebuildGuides ?? true;
+    const rebuildOverviewOpt = options?.rebuildOverview ?? true;
+    const storageDir = await findBundleStorageDir(cfg.storageDirs, bundleId);
+    if (!storageDir) {
+        throw new Error(`Bundle not found: ${bundleId}`);
+    }
+    const paths = getBundlePaths(storageDir, bundleId);
+    const before = await validateBundleCompleteness(paths.rootDir);
+    if (mode === 'validate') {
+        return {
+            bundleId,
+            mode,
+            repaired: false,
+            actionsTaken: [],
+            before,
+            after: before,
+        };
+    }
+    // Manifest is required for safe repairs (no fetching/re-ingest).
+    const manifest = await readManifest(paths.manifestPath);
+    const actionsTaken = [];
+    // Determine what needs repair.
+    const stAgents = await statOrNull(paths.agentsPath);
+    const stStartHere = await statOrNull(paths.startHerePath);
+    const stOverview = await statOrNull(paths.overviewPath);
+    const stIndex = await statOrNull(paths.searchDbPath);
+    const needsAgents = !stAgents || stAgents.size === 0;
+    const needsStartHere = !stStartHere || stStartHere.size === 0;
+    const needsOverview = !stOverview || stOverview.size === 0;
+    const needsIndex = !stIndex || stIndex.size === 0;
+    // Scan bundle files once if needed for index/overview.
+    let scanned = null;
+    const needScan = (rebuildIndexOpt && needsIndex) || (rebuildOverviewOpt && needsOverview);
+    if (needScan) {
+        scanned = await scanBundleIndexableFiles({
+            cfg,
+            bundleRootDir: paths.rootDir,
+            reposDir: paths.reposDir,
+            librariesDir: paths.librariesDir,
+        });
+        if (scanned.skipped.length) {
+            actionsTaken.push(`scan: skipped ${scanned.skipped.length} file(s)`);
+        }
+    }
+    if (rebuildIndexOpt && needsIndex) {
+        const files = scanned?.files ?? [];
+        await rebuildIndex(paths.searchDbPath, files, { includeDocs: true, includeCode: true });
+        actionsTaken.push(`rebuildIndex: indexed ${files.length} file(s)`);
+    }
+    if (rebuildGuidesOpt && needsAgents) {
+        await writeAgentsMd(paths.agentsPath);
+        actionsTaken.push('writeAgentsMd');
+    }
+    if (rebuildGuidesOpt && needsStartHere) {
+        await writeStartHereMd({
+            targetPath: paths.startHerePath,
+            bundleId,
+            repos: (manifest.repos ?? []).map((r) => ({ id: r.id, headSha: r.headSha })),
+            libraries: manifest.libraries,
+        });
+        actionsTaken.push('writeStartHereMd');
+    }
+    if (rebuildOverviewOpt && needsOverview) {
+        const allFiles = scanned?.files ?? [];
+        const perRepoOverviews = (manifest.repos ?? [])
+            .filter((r) => r.kind === 'github' || r.kind === 'local')
+            .map((r) => {
+            const repoId = r.id;
+            const repoFiles = allFiles.filter((f) => f.repoId === repoId);
+            return { repoId, headSha: r.headSha, files: repoFiles };
+        });
+        const md = await generateOverviewMarkdown({
+            bundleId,
+            bundleRootDir: paths.rootDir,
+            repos: perRepoOverviews,
+            libraries: manifest.libraries,
+        });
+        await writeOverviewFile(paths.overviewPath, md);
+        actionsTaken.push('writeOverviewFile');
+    }
+    let updatedAt;
+    if (actionsTaken.length > 0) {
+        updatedAt = nowIso();
+        const fingerprint = manifest.fingerprint ??
+            computeCreateInputFingerprint({
+                repos: manifest.inputs.repos,
+                libraries: manifest.inputs.libraries,
+                topics: manifest.inputs.topics,
+            });
+        const newManifest = {
+            ...manifest,
+            updatedAt,
+            fingerprint,
+        };
+        await writeManifest(paths.manifestPath, newManifest);
+        // Keep the de-duplication index fresh (best-effort).
+        await updateDedupIndexBestEffort(cfg, fingerprint, bundleId, updatedAt);
+        // Mirror to backup storage directories (non-blocking on failures)
+        if (cfg.storageDirs.length > 1) {
+            await mirrorBundleToBackups(storageDir, cfg.storageDirs, bundleId);
+        }
+    }
+    const after = await validateBundleCompleteness(paths.rootDir);
+    return {
+        bundleId,
+        mode,
+        repaired: actionsTaken.length > 0,
+        actionsTaken,
+        before,
+        after,
+        updatedAt,
+    };
+}
 export async function updateBundle(cfg, bundleId, options) {
     // Use effective storage dir (falls back if primary unavailable)
     const effectiveStorageDir = await getEffectiveStorageDirForWrite(cfg);
@@ -626,7 +1272,7 @@ export async function updateBundle(cfg, bundleId, options) {
             if (remoteSha && prev?.headSha && remoteSha !== prev.headSha) {
                 changed = true;
             }
-            const { headSha, files, skipped } = await cloneAndIngestGitHubRepo({
+            const { headSha, files, skipped, notes, source } = await cloneAndIngestGitHubRepo({
                 cfg,
                 bundleId,
                 storageDir: effectiveStorageDir,
@@ -634,11 +1280,31 @@ export async function updateBundle(cfg, bundleId, options) {
                 repo,
                 ref: repoInput.ref,
             });
-            if (prev?.headSha && headSha !== prev.headSha) {
+            if (prev?.headSha && headSha && headSha !== prev.headSha) {
+                changed = true;
+            }
+            // If we had to fall back to an archive, treat as changed (we don't have git metadata).
+            if (source === 'archive') {
                 changed = true;
             }
             allIngestedFiles.push(...files);
-            reposSummary.push({ kind: 'github', id: repoId, headSha, notes: skipped.slice(0, 50) });
+            reposSummary.push({ kind: 'github', id: repoId, source, headSha, notes: [...notes, ...skipped].slice(0, 50) });
+        }
+        else if (repoInput.kind === 'local') {
+            const { owner, repo } = parseOwnerRepo(repoInput.repo);
+            const repoId = `${owner}/${repo}`;
+            const { files, skipped } = await ingestLocalRepo({
+                cfg,
+                bundleId,
+                storageDir: effectiveStorageDir,
+                owner,
+                repo,
+                localPath: repoInput.path,
+                ref: repoInput.ref,
+            });
+            allIngestedFiles.push(...files);
+            reposSummary.push({ kind: 'local', id: repoId, source: 'local', notes: skipped.slice(0, 50) });
+            changed = true;
         }
         else {
             // DeepWiki integration: fetch and convert to Markdown.
@@ -651,6 +1317,7 @@ export async function updateBundle(cfg, bundleId, options) {
             reposSummary.push({
                 kind: 'deepwiki',
                 id: deepwikiResult.summary.repoId,
+                source: 'deepwiki',
                 notes: deepwikiResult.summary.notes,
             });
             // Always mark as changed for DeepWiki since we can't easily detect content changes.
@@ -676,12 +1343,19 @@ export async function updateBundle(cfg, bundleId, options) {
         includeDocs: manifest.index.includeDocs,
         includeCode: manifest.index.includeCode,
     });
+    const fingerprint = computeCreateInputFingerprint({
+        repos: manifest.inputs.repos,
+        libraries: manifest.inputs.libraries,
+        topics: manifest.inputs.topics,
+    });
     const newManifest = {
         ...manifest,
         updatedAt,
+        fingerprint,
         repos: reposSummary.map((r) => ({
             kind: r.kind,
             id: r.id,
+            source: r.source,
             headSha: r.headSha,
             fetchedAt: updatedAt,
             notes: r.notes,
@@ -698,7 +1372,7 @@ export async function updateBundle(cfg, bundleId, options) {
         libraries: librariesSummary,
     });
     const perRepoOverviews = reposSummary
-        .filter((r) => r.kind === 'github')
+        .filter((r) => r.kind === 'github' || r.kind === 'local')
         .map((r) => {
         const repoId = r.id;
         const repoFiles = allIngestedFiles.filter((f) => f.repoId === repoId);
@@ -722,6 +1396,8 @@ export async function updateBundle(cfg, bundleId, options) {
     if (cfg.storageDirs.length > 1) {
         await mirrorBundleToBackups(effectiveStorageDir, cfg.storageDirs, bundleId);
     }
+    // Keep the de-duplication index fresh (best-effort).
+    await updateDedupIndexBestEffort(cfg, fingerprint, bundleId, updatedAt);
     const summary = {
         bundleId,
         createdAt: manifest.createdAt,
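
Below is a minimal usage sketch of the bundle de-duplication flow this version introduces, based only on the exports visible in the diff above (computeCreateInputFingerprint, findBundleByInputs, and the new options argument to createBundle). The import specifier and the cfg fields shown are assumptions for illustration; they are not confirmed by the package.

// Hypothetical usage sketch; the module specifier and cfg shape are assumed.
import { createBundle, findBundleByInputs } from 'preflight-mcp';

const cfg = {
    storageDirs: ['/var/lib/preflight'], // extra entries act as best-effort mirrors
    tmpDir: '/tmp/preflight',
    maxFileBytes: 1_000_000,
    maxTotalBytes: 200_000_000,
};
const input = {
    repos: [{ kind: 'github', repo: 'owner/repo' }],
    libraries: [],
    topics: [],
};

// ifExists modes visible in the diff: 'error' (default), 'returnExisting',
// 'updateExisting', 'createNew'. With 'returnExisting', a bundle whose canonical
// input fingerprint (sha256 of the normalized repos/libraries/topics) matches is reused.
const summary = await createBundle(cfg, input, { ifExists: 'returnExisting' });

// The fingerprint lookup is also exported directly; resolves to a bundleId or null.
const existingId = await findBundleByInputs(cfg, input);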