@pratik7368patil/anchor-core 0.1.6 → 0.1.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js CHANGED
@@ -334,6 +334,48 @@ CREATE VIRTUAL TABLE IF NOT EXISTS wisdom_units_fts USING fts5(
334
334
  category
335
335
  );
336
336
 
337
+ CREATE TABLE IF NOT EXISTS code_files (
338
+ id INTEGER PRIMARY KEY AUTOINCREMENT,
339
+ repo_id INTEGER NOT NULL REFERENCES repositories(id) ON DELETE CASCADE,
340
+ path TEXT NOT NULL,
341
+ language TEXT,
342
+ size_bytes INTEGER NOT NULL,
343
+ content_hash TEXT NOT NULL,
344
+ updated_at TEXT NOT NULL,
345
+ UNIQUE(repo_id, path)
346
+ );
347
+
348
+ CREATE TABLE IF NOT EXISTS code_chunks (
349
+ id TEXT PRIMARY KEY,
350
+ repo_id INTEGER NOT NULL REFERENCES repositories(id) ON DELETE CASCADE,
351
+ file_id INTEGER NOT NULL REFERENCES code_files(id) ON DELETE CASCADE,
352
+ repo TEXT NOT NULL,
353
+ file_path TEXT NOT NULL,
354
+ language TEXT,
355
+ start_line INTEGER NOT NULL,
356
+ end_line INTEGER NOT NULL,
357
+ sanitized_text TEXT NOT NULL,
358
+ symbols_json TEXT NOT NULL,
359
+ content_hash TEXT NOT NULL,
360
+ updated_at TEXT NOT NULL
361
+ );
362
+
363
+ CREATE VIRTUAL TABLE IF NOT EXISTS code_chunks_fts USING fts5(
364
+ chunkId UNINDEXED,
365
+ sanitizedText,
366
+ filePath,
367
+ symbols,
368
+ language
369
+ );
370
+
371
+ CREATE TABLE IF NOT EXISTS code_index_state (
372
+ repo TEXT PRIMARY KEY,
373
+ last_indexed_at TEXT NOT NULL,
374
+ indexed_files INTEGER NOT NULL,
375
+ code_chunks INTEGER NOT NULL,
376
+ skipped_files INTEGER NOT NULL
377
+ );
378
+
337
379
  CREATE TABLE IF NOT EXISTS sync_state (
338
380
  repo TEXT PRIMARY KEY,
339
381
  last_sync_at TEXT,
@@ -346,6 +388,8 @@ CREATE INDEX IF NOT EXISTS idx_pr_files_path ON pr_files(path);
346
388
  CREATE INDEX IF NOT EXISTS idx_pr_comments_source ON pr_comments(source_type);
347
389
  CREATE INDEX IF NOT EXISTS idx_wisdom_units_category ON wisdom_units(category);
348
390
  CREATE INDEX IF NOT EXISTS idx_wisdom_units_pr ON wisdom_units(pr_id);
391
+ CREATE INDEX IF NOT EXISTS idx_code_files_path ON code_files(path);
392
+ CREATE INDEX IF NOT EXISTS idx_code_chunks_file_path ON code_chunks(file_path);
349
393
  `;
350
394
 
351
395
  // src/db/database.ts
@@ -365,8 +409,10 @@ function initializeSchema(db) {
365
409
  function checkSchema(db) {
366
410
  try {
367
411
  const tables = db.prepare("SELECT name FROM sqlite_master WHERE type IN ('table', 'virtual') AND name = ?").all("wisdom_units_fts");
412
+ const codeTables = db.prepare("SELECT name FROM sqlite_master WHERE type IN ('table', 'virtual') AND name = ?").all("code_chunks_fts");
368
413
  const wisdom = db.prepare("SELECT name FROM sqlite_master WHERE name = ?").all("wisdom_units");
369
- return tables.length > 0 && wisdom.length > 0;
414
+ const code = db.prepare("SELECT name FROM sqlite_master WHERE name = ?").all("code_chunks");
415
+ return tables.length > 0 && wisdom.length > 0 && codeTables.length > 0 && code.length > 0;
370
416
  } catch {
371
417
  return false;
372
418
  }
@@ -543,6 +589,87 @@ function upsertPullRequest(db, pr, wisdomUnits) {
543
589
  const comments = (pr.reviews?.length ?? 0) + (pr.reviewComments?.length ?? 0) + (pr.issueComments?.length ?? 0);
544
590
  return { files: pr.files.length, comments, wisdom: wisdomUnits.length };
545
591
  }
592
/**
 * Replaces the stored code index of one repository with a fresh snapshot.
 *
 * Everything happens inside a single transaction: the repository's existing
 * FTS rows, chunks and files are deleted, the supplied files and chunks are
 * inserted, and the repository's row in `code_index_state` is upserted.
 *
 * @param {object} db - Database handle exposing `prepare` and `transaction`.
 * @param {string} repo - Repository identifier; also the `code_index_state` key.
 * @param {Array<object>} codeFiles - File records (`path`, `language`,
 *   `sizeBytes`, `contentHash`, `updatedAt`).
 * @param {Array<object>} codeChunks - Chunk records. A chunk whose `filePath`
 *   has no inserted file row is skipped and is not counted.
 * @param {number} skippedFiles - Number of files skipped during discovery.
 * @param {string} cwd - Directory used to derive the reported database path.
 * @returns {{indexedFiles: number, codeChunksCreated: number, skippedFiles: number, databasePath: string}}
 *   Summary of what was actually written.
 */
function replaceCodeIndex(db, repo, codeFiles, codeChunks, skippedFiles, cwd) {
  initializeSchema(db);
  const repoId = ensureRepository(db, repo);
  const now = new Date().toISOString();
  let insertedChunks = 0;
  const transaction = db.transaction(() => {
    // Reset in case the transaction body is ever executed more than once.
    insertedChunks = 0;
    // One statement instead of one DELETE per chunk: `chunkId` is UNINDEXED,
    // so every per-row delete had to scan the whole FTS table.
    db.prepare(
      "DELETE FROM code_chunks_fts WHERE chunkId IN (SELECT id FROM code_chunks WHERE repo_id = ?)"
    ).run(repoId);
    db.prepare("DELETE FROM code_chunks WHERE repo_id = ?").run(repoId);
    db.prepare("DELETE FROM code_files WHERE repo_id = ?").run(repoId);
    const insertFile = db.prepare(
      `INSERT INTO code_files
 (repo_id, path, language, size_bytes, content_hash, updated_at)
 VALUES (?, ?, ?, ?, ?, ?)`
    );
    for (const file of codeFiles) {
      insertFile.run(
        repoId,
        file.path,
        file.language ?? null,
        file.sizeBytes,
        file.contentHash,
        file.updatedAt
      );
    }
    // Read the generated ids back so chunks can reference their file row.
    const fileRows = db.prepare("SELECT id, path FROM code_files WHERE repo_id = ?").all(repoId);
    const fileIds = new Map(fileRows.map((row) => [row.path, row.id]));
    const insertChunk = db.prepare(
      `INSERT INTO code_chunks
 (id, repo_id, file_id, repo, file_path, language, start_line, end_line, sanitized_text,
 symbols_json, content_hash, updated_at)
 VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`
    );
    const insertFts = db.prepare(
      `INSERT INTO code_chunks_fts
 (chunkId, sanitizedText, filePath, symbols, language)
 VALUES (?, ?, ?, ?, ?)`
    );
    for (const chunk of codeChunks) {
      const fileId = fileIds.get(chunk.filePath);
      if (!fileId) continue;
      insertChunk.run(
        chunk.id,
        repoId,
        fileId,
        chunk.repo,
        chunk.filePath,
        chunk.language ?? null,
        chunk.startLine,
        chunk.endLine,
        chunk.sanitizedText,
        JSON.stringify(chunk.symbols),
        chunk.contentHash,
        chunk.updatedAt
      );
      insertFts.run(
        chunk.id,
        chunk.sanitizedText,
        chunk.filePath,
        chunk.symbols.join(" "),
        chunk.language ?? ""
      );
      insertedChunks += 1;
    }
    // Persist the number of chunks really written, not the number offered.
    db.prepare(
      `INSERT INTO code_index_state (repo, last_indexed_at, indexed_files, code_chunks, skipped_files)
 VALUES (?, ?, ?, ?, ?)
 ON CONFLICT(repo) DO UPDATE SET
 last_indexed_at = excluded.last_indexed_at,
 indexed_files = excluded.indexed_files,
 code_chunks = excluded.code_chunks,
 skipped_files = excluded.skipped_files`
    ).run(repo, now, codeFiles.length, insertedChunks, skippedFiles);
  });
  transaction();
  return {
    indexedFiles: codeFiles.length,
    codeChunksCreated: insertedChunks,
    skippedFiles,
    databasePath: defaultDatabasePath(cwd)
  };
}
546
673
  function getIndexStatus(cwd, githubTokenConfigured = Boolean(resolveGitHubToken({ cwd }).token), databasePath = defaultDatabasePath(cwd)) {
547
674
  if (!fs2.existsSync(databasePath)) {
548
675
  return {
@@ -551,12 +678,15 @@ function getIndexStatus(cwd, githubTokenConfigured = Boolean(resolveGitHubToken(
551
678
  fileCount: 0,
552
679
  commentCount: 0,
553
680
  wisdomUnitCount: 0,
681
+ codeFileCount: 0,
682
+ codeChunkCount: 0,
554
683
  githubTokenConfigured,
555
684
  health: "missing_database"
556
685
  };
557
686
  }
558
687
  const db = openAnchorDatabase(cwd, databasePath);
559
688
  try {
689
+ initializeSchema(db);
560
690
  if (!checkSchema(db)) {
561
691
  return {
562
692
  databasePath,
@@ -564,6 +694,8 @@ function getIndexStatus(cwd, githubTokenConfigured = Boolean(resolveGitHubToken(
564
694
  fileCount: 0,
565
695
  commentCount: 0,
566
696
  wisdomUnitCount: 0,
697
+ codeFileCount: 0,
698
+ codeChunkCount: 0,
567
699
  githubTokenConfigured,
568
700
  health: "schema_invalid"
569
701
  };
@@ -571,7 +703,9 @@ function getIndexStatus(cwd, githubTokenConfigured = Boolean(resolveGitHubToken(
571
703
  const count = (table) => db.prepare(`SELECT COUNT(*) AS count FROM ${table}`).get().count;
572
704
  const repoRow = db.prepare("SELECT full_name FROM repositories ORDER BY id LIMIT 1").get();
573
705
  const syncRow = db.prepare("SELECT last_sync_at FROM sync_state ORDER BY updated_at DESC LIMIT 1").get();
706
+ const codeIndexRow = db.prepare("SELECT last_indexed_at FROM code_index_state ORDER BY last_indexed_at DESC LIMIT 1").get();
574
707
  const wisdomUnitCount = count("wisdom_units");
708
+ const codeChunkCount = count("code_chunks");
575
709
  return {
576
710
  repo: repoRow?.full_name,
577
711
  databasePath,
@@ -579,9 +713,12 @@ function getIndexStatus(cwd, githubTokenConfigured = Boolean(resolveGitHubToken(
579
713
  fileCount: count("pr_files"),
580
714
  commentCount: count("pr_comments"),
581
715
  wisdomUnitCount,
716
+ codeFileCount: count("code_files"),
717
+ codeChunkCount,
582
718
  lastSyncTime: syncRow?.last_sync_at ?? void 0,
719
+ lastCodeIndexTime: codeIndexRow?.last_indexed_at ?? void 0,
583
720
  githubTokenConfigured,
584
- health: wisdomUnitCount > 0 ? "ok" : "empty_index"
721
+ health: wisdomUnitCount > 0 || codeChunkCount > 0 ? "ok" : "empty_index"
585
722
  };
586
723
  } finally {
587
724
  db.close();
@@ -618,9 +755,260 @@ function chunkHistoricalText(text, maxChunkLength = 700) {
618
755
  return expanded.filter((chunk) => chunk.length >= 12 && hasHighSignalLanguage(chunk));
619
756
  }
620
757
 
621
- // src/indexer/wisdom-extractor.ts
758
+ // src/indexer/code-chunker.ts
622
759
  import crypto from "crypto";
623
760
  import path3 from "path";
761
+ var DEFAULT_CHUNK_LINES = 80;
762
+ var DEFAULT_OVERLAP_LINES = 8;
763
+ var FUNCTION_CALL_STOP_WORDS = /* @__PURE__ */ new Set([
764
+ "catch",
765
+ "describe",
766
+ "for",
767
+ "if",
768
+ "it",
769
+ "return",
770
+ "switch",
771
+ "test",
772
+ "while"
773
+ ]);
774
/**
 * Derives a deterministic chunk id from the file identity and line range.
 *
 * @param {{repo: string, path: string, contentHash: string}} file - Source file record.
 * @param {number} startLine - First line of the chunk.
 * @param {number} endLine - Last line of the chunk.
 * @returns {string} `cc_` followed by the first 24 hex characters of a SHA-256 digest.
 */
function stableCodeChunkId(file, startLine, endLine) {
  // NUL-separated so adjacent fields cannot run into each other.
  const identity = [file.repo, file.path, file.contentHash, startLine, endLine].join("\0");
  const digest = crypto.createHash("sha256").update(identity).digest("hex");
  return `cc_${digest.slice(0, 24)}`;
}
778
/**
 * Collects identifier-like symbols from a piece of code text.
 *
 * Gathers, in this order: declared names, names assigned an arrow function,
 * names followed by `(` that are not in FUNCTION_CALL_STOP_WORDS, and finally
 * the file's basename without extension when it looks like an identifier.
 *
 * @param {string} text - Code text to scan.
 * @param {string} filePath - Path of the file the text came from.
 * @returns {string[]} At most 40 symbols after `uniqueStrings` de-duplication.
 */
function extractCodeSymbols(text, filePath) {
  const declarationPattern =
    /\b(?:export\s+)?(?:async\s+)?(?:class|function|interface|type|enum|const|let|var)\s+([A-Za-z_$][\w$]*)/g;
  const arrowAssignmentPattern = /\b([A-Za-z_$][\w$]{2,})\s*[:=]\s*(?:async\s*)?\([^)]*\)\s*=>/g;
  const callPattern = /\b([A-Za-z_$][\w$]{2,})\s*\(/g;
  const firstGroup = (match) => match[1] ?? "";

  const declared = Array.from(text.matchAll(declarationPattern), firstGroup);
  const arrowAssigned = Array.from(text.matchAll(arrowAssignmentPattern), firstGroup);
  const called = Array.from(text.matchAll(callPattern), firstGroup).filter(
    (name) => !FUNCTION_CALL_STOP_WORDS.has(name)
  );

  const collected = [...declared, ...arrowAssigned, ...called];
  const stem = path3.basename(filePath).replace(/\.[^.]+$/, "");
  if (/^[A-Za-z_$][\w$-]*$/.test(stem)) collected.push(stem);
  return uniqueStrings(collected).slice(0, 40);
}
797
/**
 * Splits a file's content into overlapping line-based chunks.
 *
 * Line endings are normalised from CRLF to LF before splitting. Windows whose
 * sanitized text is empty are dropped. Line numbers in the result are 1-based
 * and inclusive.
 *
 * @param {object} file - File record with `content`, `repo`, `path`,
 *   `language`, `contentHash` and `updatedAt`.
 * @param {{chunkLines?: number, overlapLines?: number}} [options] - Window
 *   size and overlap. Values that are not finite numbers (or a window size
 *   below 1) fall back to the module defaults; fractions are floored.
 * @returns {Array<object>} Chunk records ready for indexing.
 */
function chunkCodeFile(file, options = {}) {
  // A window of 0, a negative or a NaN size used to walk the file one line at
  // a time without ever emitting a chunk; fall back to the default instead.
  const requestedChunkLines = Math.floor(Number(options.chunkLines ?? DEFAULT_CHUNK_LINES));
  const chunkLines =
    Number.isFinite(requestedChunkLines) && requestedChunkLines >= 1
      ? requestedChunkLines
      : DEFAULT_CHUNK_LINES;
  // A NaN overlap used to poison the loop index and stop after the first chunk.
  const requestedOverlap = Math.floor(Number(options.overlapLines ?? DEFAULT_OVERLAP_LINES));
  const overlapLines = Math.max(
    0,
    Math.min(Number.isFinite(requestedOverlap) ? requestedOverlap : DEFAULT_OVERLAP_LINES, chunkLines - 1)
  );
  const lines = file.content.replace(/\r\n/g, "\n").split("\n");
  const chunks = [];
  for (let startIndex = 0; startIndex < lines.length; ) {
    const endIndex = Math.min(lines.length, startIndex + chunkLines);
    const rawText = lines.slice(startIndex, endIndex).join("\n");
    const sanitizedText = sanitizeHistoricalText(rawText);
    if (sanitizedText) {
      chunks.push({
        id: stableCodeChunkId(file, startIndex + 1, endIndex),
        repo: file.repo,
        filePath: file.path,
        language: file.language,
        startLine: startIndex + 1,
        endLine: endIndex,
        sanitizedText,
        symbols: extractCodeSymbols(sanitizedText, file.path),
        contentHash: file.contentHash,
        updatedAt: file.updatedAt
      });
    }
    if (endIndex >= lines.length) break;
    // Always advance by at least one line so the loop terminates.
    startIndex = Math.max(startIndex + 1, endIndex - overlapLines);
  }
  return chunks;
}
828
+
829
+ // src/indexer/code-file-discovery.ts
830
+ import { execFileSync as execFileSync3 } from "child_process";
831
+ import crypto2 from "crypto";
832
+ import fs3 from "fs";
833
+ import path4 from "path";
834
+ var DEFAULT_MAX_CODE_FILE_BYTES = 512 * 1024;
835
+ var HARD_EXCLUDED_SEGMENTS = /* @__PURE__ */ new Set([
836
+ ".git",
837
+ ".anchor",
838
+ ".cursor",
839
+ ".codex",
840
+ ".aws",
841
+ ".ssh",
842
+ "node_modules",
843
+ ".nuxt",
844
+ ".next",
845
+ "dist",
846
+ "build",
847
+ "coverage",
848
+ ".turbo"
849
+ ]);
850
+ var LANGUAGE_BY_EXTENSION = {
851
+ ".cjs": "javascript",
852
+ ".css": "css",
853
+ ".go": "go",
854
+ ".html": "html",
855
+ ".java": "java",
856
+ ".js": "javascript",
857
+ ".json": "json",
858
+ ".jsx": "javascript",
859
+ ".md": "markdown",
860
+ ".mjs": "javascript",
861
+ ".py": "python",
862
+ ".rb": "ruby",
863
+ ".rs": "rust",
864
+ ".scss": "scss",
865
+ ".sh": "shell",
866
+ ".sql": "sql",
867
+ ".svelte": "svelte",
868
+ ".ts": "typescript",
869
+ ".tsx": "typescript",
870
+ ".vue": "vue",
871
+ ".yaml": "yaml",
872
+ ".yml": "yaml"
873
+ };
874
/**
 * Converts a path to git style: forward slashes, no leading `./`.
 *
 * @param {string} value - Path as reported by the OS or by git.
 * @returns {string} The normalised path.
 */
function normalizeGitPath(value) {
  const forwardSlashed = value.split("\\").join("/");
  return forwardSlashed.replace(/^\.\/+/, "");
}
877
/**
 * Tells whether a path must never be indexed.
 *
 * A path is excluded when any of its segments is listed in
 * HARD_EXCLUDED_SEGMENTS, or when its lower-cased basename is a known
 * credential file, an `.env` file, a listed SSH key name, or ends in
 * `.pem`, `.key`, `.p12` or `.pfx`.
 *
 * @param {string} filePath - Path relative to the repository.
 * @returns {boolean} True when the path is excluded.
 */
function isHardExcludedCodePath(filePath) {
  const gitPath = normalizeGitPath(filePath);
  for (const part of gitPath.split("/")) {
    if (HARD_EXCLUDED_SEGMENTS.has(part)) return true;
  }
  const name = path4.posix.basename(gitPath).toLowerCase();
  const credentialConfigs = [".netrc", ".npmrc", ".pypirc", ".yarnrc"];
  const keyFileNames = ["id_rsa", "id_rsa.pub", "id_dsa", "id_ecdsa", "id_ed25519"];
  return (
    credentialConfigs.includes(name) ||
    name === ".env" ||
    name.startsWith(".env.") ||
    keyFileNames.includes(name) ||
    /\.(pem|key|p12|pfx)$/i.test(name)
  );
}
890
/**
 * Looks up the language label for a path by its lower-cased extension.
 *
 * @param {string} filePath - Path of the file.
 * @returns {string|undefined} The LANGUAGE_BY_EXTENSION entry, or undefined
 *   when the extension has none.
 */
function languageForPath(filePath) {
  return LANGUAGE_BY_EXTENSION[path4.extname(filePath).toLowerCase()];
}
894
/**
 * Heuristically decides whether a byte buffer holds binary data.
 *
 * Any NUL byte marks the buffer as binary. Otherwise the buffer is binary
 * when more than 1% of its bytes are control characters other than tab,
 * line feed and carriage return. An empty buffer is not binary.
 *
 * @param {Uint8Array} buffer - File contents.
 * @returns {boolean} True when the content looks binary.
 */
function isProbablyBinary(buffer) {
  if (buffer.includes(0)) return true;
  const total = buffer.length;
  if (total === 0) return false;
  let controlCount = 0;
  for (let index = 0; index < total; index += 1) {
    const byte = buffer[index];
    if (byte >= 32) continue;
    if (byte === 9 || byte === 10 || byte === 13) continue;
    controlCount += 1;
  }
  return controlCount / total > 0.01;
}
904
/**
 * Lists tracked and untracked-but-not-ignored files of a git work tree.
 *
 * Runs `git ls-files -z` so paths come back NUL-separated and verbatim.
 * Without `-z`, git wraps unusual names (for example non-ASCII ones) in
 * quotes with escapes, and trimming each line altered names with leading or
 * trailing whitespace; both made the later file lookup miss the file.
 *
 * @param {string} cwd - Directory inside the work tree to run git in.
 * @returns {string[]} Git-style paths relative to `cwd`, empty entries removed.
 * @throws Propagates the error raised when git cannot be executed or fails.
 */
function discoverGitFiles(cwd) {
  const output = execFileSync3(
    "git",
    ["ls-files", "-z", "--cached", "--others", "--exclude-standard"],
    {
      cwd,
      encoding: "utf8",
      stdio: ["ignore", "pipe", "pipe"],
      // Explicit cap so a very large listing does not depend on the runtime default.
      maxBuffer: 64 * 1024 * 1024
    }
  );
  return output
    .split("\0")
    .map((entry) => normalizeGitPath(entry))
    .filter(Boolean);
}
912
/**
 * Loads one candidate file for indexing, or reports that it must be skipped.
 *
 * @param {string} rootPath - Absolute repository root.
 * @param {string} cwd - Directory the git paths are relative to.
 * @param {string} filePath - Git-style relative path.
 * @param {number} maxFileBytes - Largest file size accepted.
 * @returns {{absolutePath: string, stat: object, buffer: Buffer}|undefined}
 *   The loaded file, or undefined when it should be skipped.
 */
function readIndexableCodeFile(rootPath, cwd, filePath, maxFileBytes) {
  const absolutePath = path4.resolve(cwd, filePath);
  const relativeToRoot = path4.relative(rootPath, absolutePath);
  // Compare whole segments: a plain startsWith("..") also rejected legitimate
  // names such as "..notes.js".
  const escapesRoot =
    relativeToRoot === ".." ||
    relativeToRoot.startsWith(`..${path4.sep}`) ||
    path4.isAbsolute(relativeToRoot);
  if (escapesRoot) return undefined;
  try {
    // lstat does not follow symlinks, so a link pointing outside the
    // repository is not a regular file here and its target is never read.
    const stat = fs3.lstatSync(absolutePath);
    if (!stat.isFile() || stat.size > maxFileBytes) return undefined;
    const buffer = fs3.readFileSync(absolutePath);
    return { absolutePath, stat, buffer };
  } catch {
    // Missing or unreadable file (it may also vanish between the two calls):
    // count it as skipped rather than aborting the whole run.
    return undefined;
  }
}

/**
 * Discovers the files of a repository that are eligible for code indexing.
 *
 * Candidates come from `discoverGitFiles`. A candidate is skipped when it is
 * hard-excluded, resolves outside the repository root, is not a regular file
 * (symlinks included), is larger than the size limit, cannot be read, or
 * looks binary.
 *
 * @param {string} cwd - Repository directory.
 * @param {string} repo - Repository identifier copied onto every record.
 * @param {{maxFileBytes?: number}} [options] - Size limit override; defaults
 *   to DEFAULT_MAX_CODE_FILE_BYTES.
 * @returns {{files: Array<object>, skippedFiles: number}} File records
 *   (including `content` and `absolutePath`) and the number of skipped candidates.
 */
function discoverCodeFiles(cwd, repo, options = {}) {
  const maxFileBytes = options.maxFileBytes ?? DEFAULT_MAX_CODE_FILE_BYTES;
  const rootPath = path4.resolve(cwd);
  const files = [];
  let skippedFiles = 0;
  for (const filePath of discoverGitFiles(cwd)) {
    const loaded = isHardExcludedCodePath(filePath)
      ? undefined
      : readIndexableCodeFile(rootPath, cwd, filePath, maxFileBytes);
    if (!loaded || isProbablyBinary(loaded.buffer)) {
      skippedFiles += 1;
      continue;
    }
    const { absolutePath, stat, buffer } = loaded;
    files.push({
      repo,
      path: filePath,
      language: languageForPath(filePath),
      sizeBytes: stat.size,
      contentHash: crypto2.createHash("sha256").update(buffer).digest("hex"),
      updatedAt: stat.mtime.toISOString(),
      absolutePath,
      content: buffer.toString("utf8")
    });
  }
  return { files, skippedFiles };
}
958
+
959
+ // src/indexer/code-indexer.ts
960
/**
 * Discovers, chunks and stores the code of one repository.
 *
 * Progress is reported through the optional `options.onProgress` callback
 * with the stages `discovering_code_files`, `discovered_code_files`,
 * `indexing_code_file` and `indexed_code_file`.
 *
 * @param {object} db - Database handle passed on to `replaceCodeIndex`.
 * @param {{repo: string, cwd: string, maxFileBytes?: number, onProgress?: Function}} options
 *   Indexing options.
 * @returns {object} Whatever `replaceCodeIndex` returns for the new snapshot.
 */
function indexCodebase(db, options) {
  // Builds the event lazily so nothing is read when no callback is set.
  const report = (stage, details) =>
    options.onProgress?.({ stage, repo: options.repo, ...details });

  report("discovering_code_files");
  const discovery = discoverCodeFiles(options.cwd, options.repo, {
    maxFileBytes: options.maxFileBytes
  });
  report("discovered_code_files", {
    files: discovery.files.length,
    skippedFiles: discovery.skippedFiles
  });

  const chunks = [];
  discovery.files.forEach((file, position) => {
    const current = position + 1;
    const total = discovery.files.length;
    report("indexing_code_file", { current, total, filePath: file.path });
    const fileChunks = chunkCodeFile(file);
    for (const chunk of fileChunks) chunks.push(chunk);
    report("indexed_code_file", {
      current,
      total,
      filePath: file.path,
      chunks: fileChunks.length
    });
  });

  // File content and absolute paths are working data only; they are not stored.
  const storedFiles = discovery.files.map((entry) => {
    const { content: _content, absolutePath: _absolutePath, ...record } = entry;
    return record;
  });
  return replaceCodeIndex(db, options.repo, storedFiles, chunks, discovery.skippedFiles, options.cwd);
}
1000
/**
 * Builds the summary reported when no code indexing took place.
 *
 * @param {string} cwd - Directory used to derive the database path.
 * @returns {{indexedFiles: number, codeChunksCreated: number, skippedFiles: number, databasePath: string}}
 *   A summary with all counters at zero.
 */
function emptyCodeIndexSummary(cwd) {
  const databasePath = defaultDatabasePath(cwd);
  return { indexedFiles: 0, codeChunksCreated: 0, skippedFiles: 0, databasePath };
}
1008
+
1009
+ // src/indexer/wisdom-extractor.ts
1010
+ import crypto3 from "crypto";
1011
+ import path5 from "path";
624
1012
  var CATEGORY_KEYWORDS = [
625
1013
  ["security_note", /\b(security|secret|token|bearer|oauth|credential|xss|csrf|injection|sanitize|redact)\b/i],
626
1014
  ["architecture_decision", /\b(architecture decision|architectural|we intentionally|design decision)\b/i],
@@ -652,7 +1040,7 @@ function extractSymbols(text, filePaths) {
652
1040
  }
653
1041
  }
654
1042
  for (const filePath of filePaths) {
655
- const basename = path3.basename(filePath).replace(/\.[^.]+$/, "");
1043
+ const basename = path5.basename(filePath).replace(/\.[^.]+$/, "");
656
1044
  if (/^[A-Za-z_$][\w$]*$/.test(basename)) symbols.push(basename);
657
1045
  }
658
1046
  return uniqueStrings(symbols).slice(0, 30);
@@ -676,7 +1064,7 @@ function confidenceFor(entry, text, category, duplicateCount) {
676
1064
  return Math.max(0, Math.min(1, Number(confidence.toFixed(2))));
677
1065
  }
678
1066
  function stableWisdomId(pr, sourceType, text, filePaths, createdAt, authors) {
679
- const hash = crypto.createHash("sha256").update(
1067
+ const hash = crypto3.createHash("sha256").update(
680
1068
  [pr.repo, pr.number, sourceType, canonicalizeText(text), filePaths.join("|"), createdAt, authors.join("|")].join(
681
1069
  "\0"
682
1070
  )
@@ -872,7 +1260,7 @@ function shouldSyncSince(db, repo, fallbackSince) {
872
1260
  }
873
1261
 
874
1262
  // src/retrieval/query-builder.ts
875
- import path4 from "path";
1263
+ import path6 from "path";
876
1264
  var CATEGORY_HINTS = [
877
1265
  "security",
878
1266
  "regression",
@@ -897,8 +1285,8 @@ function buildFtsQuery(input) {
897
1285
  const baseText = "task" in input ? input.task : input.query;
898
1286
  const fileTerms = files.flatMap((file) => [
899
1287
  file,
900
- path4.basename(file),
901
- ...path4.dirname(file).split(/[\\/]/).filter(Boolean)
1288
+ path6.basename(file),
1289
+ ...path6.dirname(file).split(/[\\/]/).filter(Boolean)
902
1290
  ]);
903
1291
  const tokens = uniqueStrings([
904
1292
  ...tokenizeSearchText(baseText, 24),
@@ -917,7 +1305,7 @@ function clampMaxResults(value, defaultValue) {
917
1305
  }
918
1306
 
919
1307
  // src/retrieval/ranker.ts
920
- import path5 from "path";
1308
+ import path7 from "path";
921
1309
  function parseJsonArray(value) {
922
1310
  try {
923
1311
  const parsed = JSON.parse(value);
@@ -964,11 +1352,11 @@ function filePathMatch(unitPaths, queryFiles) {
964
1352
  if (queryFiles.length === 0 || unitPaths.length === 0) return 0;
965
1353
  let best = 0;
966
1354
  for (const queryFile of queryFiles) {
967
- const queryBase = path5.basename(queryFile).toLowerCase();
968
- const queryDir = path5.dirname(queryFile).toLowerCase();
1355
+ const queryBase = path7.basename(queryFile).toLowerCase();
1356
+ const queryDir = path7.dirname(queryFile).toLowerCase();
969
1357
  for (const unitPath of unitPaths) {
970
- const unitBase = path5.basename(unitPath).toLowerCase();
971
- const unitDir = path5.dirname(unitPath).toLowerCase();
1358
+ const unitBase = path7.basename(unitPath).toLowerCase();
1359
+ const unitDir = path7.dirname(unitPath).toLowerCase();
972
1360
  const q = queryFile.toLowerCase();
973
1361
  const u = unitPath.toLowerCase();
974
1362
  if (q === u) best = Math.max(best, 1);
@@ -1099,6 +1487,159 @@ function rankWisdomUnits(db, input) {
1099
1487
  return [...grouped.values()].sort((a, b) => b.score - a.score || b.confidence - a.confidence).slice(0, limit);
1100
1488
  }
1101
1489
 
1490
+ // src/retrieval/code-ranker.ts
1491
+ import path8 from "path";
1492
/**
 * Parses a JSON string that is expected to hold an array of strings.
 *
 * @param {string} value - JSON text.
 * @returns {string[]} The string items of the parsed array; an empty array
 *   when the text is not valid JSON or is not an array.
 */
function parseJsonArray2(value) {
  let decoded;
  try {
    decoded = JSON.parse(value);
  } catch {
    return [];
  }
  if (!Array.isArray(decoded)) return [];
  return decoded.filter((item) => typeof item === "string");
}
1500
/**
 * Maps a `code_chunks` result row (snake_case columns) to a chunk object.
 *
 * @param {object} row - Row with the `code_chunks` columns and an optional
 *   `bm25` value.
 * @returns {object} Chunk with camelCase fields; a null/absent `language` or
 *   `bm25` becomes undefined and `symbols_json` is decoded into `symbols`.
 */
function rowToCodeChunk(row) {
  const symbols = parseJsonArray2(row.symbols_json);
  const language = row.language ?? undefined;
  const bm25 = row.bm25 ?? undefined;
  return {
    id: row.id,
    repo: row.repo,
    filePath: row.file_path,
    language,
    startLine: row.start_line,
    endLine: row.end_line,
    sanitizedText: row.sanitized_text,
    symbols,
    contentHash: row.content_hash,
    updatedAt: row.updated_at,
    bm25
  };
}
1515
/**
 * Scores how closely an indexed file path relates to the queried files.
 *
 * For each query file the first rule that applies gives its score, and the
 * best score over all query files is returned (comparison is case-insensitive):
 *   1.00  identical path
 *   0.72  same basename
 *   0.62  same directory
 *   0.48  same non-empty basename stem (text before the first dot)
 *   0.38  one directory is nested inside the other
 *
 * @param {string} filePath - Path of the indexed chunk's file.
 * @param {string[]} queryFiles - Paths the caller is working on.
 * @returns {number} Score between 0 and 1; 0 when nothing relates.
 */
function filePathMatch2(filePath, queryFiles) {
  if (queryFiles.length === 0) return 0;
  const unit = filePath.toLowerCase();
  const unitBase = path8.basename(filePath).toLowerCase();
  const unitDir = path8.dirname(filePath).toLowerCase();
  const stemOf = (base) => base.split(".")[0];
  // Nested means "parent/..." on a segment boundary. A bare prefix test also
  // matched siblings such as "src/api" vs "src/api-legacy", and the root
  // marker "." matched every dot-directory.
  const isInside = (parent, child) =>
    parent !== "." &&
    child.length > parent.length &&
    child.startsWith(parent) &&
    (child[parent.length] === "/" || child[parent.length] === "\\");

  let best = 0;
  for (const queryFile of queryFiles) {
    const query = queryFile.toLowerCase();
    const queryBase = path8.basename(queryFile).toLowerCase();
    const queryDir = path8.dirname(queryFile).toLowerCase();
    const queryStem = stemOf(queryBase);
    let score = 0;
    if (query === unit) score = 1;
    else if (queryBase === unitBase) score = 0.72;
    else if (queryDir === unitDir) score = 0.62;
    // Checked before the weaker nested-directory rule, which used to shadow it.
    // Dotfiles have an empty stem and must not match each other through it.
    else if (queryStem !== "" && queryStem === stemOf(unitBase)) score = 0.48;
    else if (isInside(queryDir, unitDir) || isInside(unitDir, queryDir)) score = 0.38;
    best = Math.max(best, score);
  }
  return best;
}
1536
/**
 * Scores how well a chunk matches the queried symbols.
 *
 * Per query symbol (case-insensitive): 1 when it is one of the chunk's
 * symbols, 0.7 when it occurs as a whole word in the chunk text, 0.42 when
 * it and a chunk symbol contain one another. The best score wins.
 *
 * Blank symbols are ignored on both sides. An empty string is a substring of
 * everything and matches at any word boundary, so it used to produce scores
 * for unrelated chunks.
 *
 * @param {{symbols: string[], sanitizedText: string}} chunk - Indexed chunk.
 * @param {string[]} querySymbols - Symbols the caller is interested in.
 * @returns {number} Score between 0 and 1.
 */
function symbolMatch2(chunk, querySymbols) {
  if (querySymbols.length === 0) return 0;
  const chunkSymbols = chunk.symbols
    .map((symbol) => symbol.toLowerCase())
    .filter((symbol) => symbol.length > 0);
  // Set lookup for the exact-match rule instead of scanning the array per symbol.
  const exactSymbols = new Set(chunkSymbols);
  const text = chunk.sanitizedText.toLowerCase();
  let best = 0;
  for (const symbol of querySymbols) {
    const lower = symbol.trim().toLowerCase();
    if (lower === "") continue;
    if (exactSymbols.has(lower)) {
      // Nothing can beat an exact match.
      return 1;
    }
    if (new RegExp(`\\b${escapeRegExp2(lower)}\\b`, "i").test(text)) best = Math.max(best, 0.7);
    else if (chunkSymbols.some((candidate) => candidate.includes(lower) || lower.includes(candidate))) {
      best = Math.max(best, 0.42);
    }
  }
  return best;
}
1551
/**
 * Scores the textual overlap between a chunk and the request.
 *
 * The request text is the task (or, for query-shaped inputs, the query)
 * plus the optional diff and current code. The score is the larger of the
 * fraction of request tokens found in the chunk's text, path and symbols,
 * and a signal derived from the chunk's FTS rank when it has one.
 *
 * @param {{sanitizedText: string, filePath: string, symbols: string[], bm25?: number}} chunk
 *   Indexed chunk.
 * @param {object} input - Request with `task` or `query`, and optional
 *   `diff` / `currentCode`.
 * @returns {number} Score between 0 and 1.
 */
function textMatch2(chunk, input) {
  // Accept both request shapes, as buildFtsQuery does; reading only
  // `input.task` turned query-shaped requests into the literal "undefined".
  const baseText = ("task" in input ? input.task : input.query) ?? "";
  const tokens = tokenizeSearchText(
    `${baseText} ${input.diff ?? ""} ${input.currentCode ?? ""}`,
    40
  );
  const haystack = `${chunk.sanitizedText} ${chunk.filePath} ${chunk.symbols.join(" ")}`.toLowerCase();
  const overlap = tokens.length
    ? tokens.filter((token) => haystack.includes(token.toLowerCase())).length / tokens.length
    : 0;
  // NOTE(review): this signal shrinks as |bm25| grows, while FTS5 reports
  // better matches as more negative values - confirm the direction is intended.
  const bm25Signal =
    chunk.bm25 === undefined ? 0 : Math.max(0.25, Math.min(1, 1 / (1 + Math.abs(chunk.bm25))));
  return Math.max(overlap, bm25Signal);
}
1561
/**
 * Scores a chunk by how recently its file was updated.
 *
 * @param {{updatedAt: string}} chunk - Chunk with a parseable timestamp.
 * @returns {number} 1 under 30 days old, 0.75 under 180 days, 0.45 under
 *   730 days, otherwise 0.25 (also for an unparseable timestamp).
 */
function recencyScore2(chunk) {
  const updatedMs = Date.parse(chunk.updatedAt);
  if (Number.isNaN(updatedMs)) return 0.25;
  const msPerDay = 1e3 * 60 * 60 * 24;
  // Timestamps in the future count as age zero.
  const ageDays = Math.max(0, (Date.now() - updatedMs) / msPerDay);
  const tiers = [
    [30, 1],
    [180, 0.75],
    [730, 0.45]
  ];
  const tier = tiers.find(([limitDays]) => ageDays < limitDays);
  return tier ? tier[1] : 0.25;
}
1570
/**
 * Escapes regular-expression metacharacters so a string matches literally.
 *
 * @param {string} value - Text to embed in a pattern.
 * @returns {string} The text with each metacharacter preceded by a backslash.
 */
function escapeRegExp2(value) {
  return value.replace(/[.*+?^${}()|[\]\\]/g, (metaChar) => `\\${metaChar}`);
}
1573
/**
 * Escapes the characters that are special in a SQL LIKE pattern using
 * backslash as the escape character.
 *
 * @param {string} value - Literal text to embed in a LIKE pattern.
 * @returns {string} The text with `\`, `%` and `_` each preceded by a backslash.
 */
function escapeLike(value) {
  return value.replace(/[\\%_]/g, "\\$&");
}
1576
/**
 * Gathers the candidate code chunks to be ranked for a request.
 *
 * Candidates come from three sources, merged by chunk id:
 *   1. the top 150 full-text matches for the request's FTS query (with rank);
 *   2. up to 80 chunks per requested file, matched by exact path or by
 *      "any directory / same basename";
 *   3. when nothing matched at all, the 80 most recently updated chunks.
 *
 * @param {object} db - Database handle exposing `prepare`.
 * @param {object} input - Request passed to `buildFtsQuery`; `input.files`
 *   optionally lists the paths being worked on.
 * @returns {Array<object>} Candidate chunks in insertion order.
 */
function loadCodeCandidates(db, input) {
  const candidates = new Map();
  const ftsQuery = buildFtsQuery(input);
  if (ftsQuery) {
    const rows = db.prepare(
      `SELECT cc.*, bm25(code_chunks_fts) AS bm25
 FROM code_chunks_fts
 JOIN code_chunks cc ON cc.id = code_chunks_fts.chunkId
 WHERE code_chunks_fts MATCH ?
 ORDER BY bm25(code_chunks_fts)
 LIMIT 150`
    ).all(ftsQuery);
    for (const row of rows) {
      const chunk = rowToCodeChunk(row);
      candidates.set(chunk.id, chunk);
    }
  }
  const files = input.files ?? [];
  if (files.length > 0) {
    // Prepared once and reused; it used to be re-prepared for every file.
    const selectByPath = db.prepare(
      `SELECT cc.*, NULL AS bm25
 FROM code_chunks cc
 WHERE cc.file_path = ?
 OR cc.file_path LIKE ? ESCAPE '\\'
 LIMIT 80`
    );
    for (const file of files) {
      const basename = path8.basename(file);
      const rows = selectByPath.all(file, `%/${escapeLike(basename)}`);
      for (const row of rows) {
        const chunk = rowToCodeChunk(row);
        // Keep the FTS rank when the chunk was already found by text search.
        candidates.set(chunk.id, { ...chunk, bm25: candidates.get(chunk.id)?.bm25 ?? chunk.bm25 });
      }
    }
  }
  if (candidates.size === 0) {
    const rows = db.prepare(
      `SELECT cc.*, NULL AS bm25
 FROM code_chunks cc
 ORDER BY updated_at DESC
 LIMIT 80`
    ).all();
    for (const row of rows) {
      const chunk = rowToCodeChunk(row);
      candidates.set(chunk.id, chunk);
    }
  }
  return [...candidates.values()];
}
1621
/**
 * Ranks indexed code chunks for a request and returns the best few.
 *
 * Each candidate gets a weighted score: 40% file-path match, 25% symbol
 * match, 25% text match and 10% recency, rounded to four decimals. Results
 * are ordered by score, ties by higher start line, and capped at
 * `min(5, clampMaxResults(input.maxResults, 5))` entries.
 *
 * @param {object} db - Database handle passed to `loadCodeCandidates`.
 * @param {object} input - Request with optional `files`, `symbols` and `maxResults`.
 * @returns {Array<object>} Chunks extended with `score` and `scoreParts`.
 */
function rankCodeChunks(db, input) {
  const queryFiles = input.files ?? [];
  const querySymbols = input.symbols ?? [];
  const scored = [];
  for (const chunk of loadCodeCandidates(db, input)) {
    const scoreParts = {
      filePathMatch: filePathMatch2(chunk.filePath, queryFiles),
      symbolMatch: symbolMatch2(chunk, querySymbols),
      textMatch: textMatch2(chunk, input),
      recency: recencyScore2(chunk)
    };
    const weighted =
      0.4 * scoreParts.filePathMatch +
      0.25 * scoreParts.symbolMatch +
      0.25 * scoreParts.textMatch +
      0.1 * scoreParts.recency;
    scored.push({
      ...chunk,
      symbols: uniqueStrings(chunk.symbols),
      score: Number(weighted.toFixed(4)),
      scoreParts
    });
  }
  scored.sort((left, right) => right.score - left.score || right.startLine - left.startLine);
  const limit = Math.min(5, clampMaxResults(input.maxResults, 5));
  return scored.slice(0, limit);
}
1642
+
1102
1643
  // src/retrieval/formatter.ts
1103
1644
  function evidenceLine(unit) {
1104
1645
  const author = unit.authors[0] ? ` by @${unit.authors[0]}` : "";
@@ -1125,14 +1666,20 @@ function whyItMatters(unit, input) {
1125
1666
  function riskLines(units) {
1126
1667
  const risks = /* @__PURE__ */ new Set();
1127
1668
  for (const unit of units) {
1128
- if (unit.category === "security_note") risks.add("Avoid logging, exposing, or weakening security-sensitive values.");
1129
- if (unit.category === "bug_regression") risks.add("Check for regressions similar to the cited PR history.");
1130
- if (unit.category === "api_contract") risks.add("Preserve documented API and backward-compatibility contracts.");
1131
- if (unit.category === "constraint") risks.add("Do not remove constraints without verifying the original reason no longer applies.");
1669
+ if (unit.category === "security_note")
1670
+ risks.add("Avoid logging, exposing, or weakening security-sensitive values.");
1671
+ if (unit.category === "bug_regression")
1672
+ risks.add("Check for regressions similar to the cited PR history.");
1673
+ if (unit.category === "api_contract")
1674
+ risks.add("Preserve documented API and backward-compatibility contracts.");
1675
+ if (unit.category === "constraint")
1676
+ risks.add(
1677
+ "Do not remove constraints without verifying the original reason no longer applies."
1678
+ );
1132
1679
  }
1133
1680
  return [...risks].slice(0, 4);
1134
1681
  }
1135
- function formatAnchorContext(units, input) {
1682
+ function formatAnchorContext(units, input, codeChunks = []) {
1136
1683
  const lines = ["# Anchor Context", "", "## Must know", ""];
1137
1684
  if (units.length === 0) {
1138
1685
  lines.push("No directly relevant indexed PR history found.", "");
@@ -1146,6 +1693,18 @@ function formatAnchorContext(units, input) {
1146
1693
  lines.push("");
1147
1694
  });
1148
1695
  }
1696
+ lines.push("## Codebase Evidence", "");
1697
+ if (codeChunks.length === 0) {
1698
+ lines.push("No directly relevant indexed codebase context found.", "");
1699
+ } else {
1700
+ codeChunks.forEach((chunk, index) => {
1701
+ const symbols = chunk.symbols.length ? `; symbols: ${chunk.symbols.slice(0, 6).join(", ")}` : "";
1702
+ lines.push(`${index + 1}. ${chunk.filePath}:${chunk.startLine}-${chunk.endLine}${symbols}`);
1703
+ lines.push(` Why it matters: Current code near this match may affect the requested edit.`);
1704
+ lines.push(` Snippet: ${clipSentence(chunk.sanitizedText, 260)}`);
1705
+ lines.push("");
1706
+ });
1707
+ }
1149
1708
  lines.push("## Risks", "");
1150
1709
  const risks = riskLines(units);
1151
1710
  if (risks.length === 0) {
@@ -1172,6 +1731,15 @@ function formatAnchorContext(units, input) {
1172
1731
  filePaths: unit.filePaths,
1173
1732
  symbols: unit.symbols,
1174
1733
  duplicateCount: unit.duplicateCount
1734
+ })),
1735
+ codeEvidence: codeChunks.map((chunk) => ({
1736
+ id: chunk.id,
1737
+ score: chunk.score,
1738
+ filePath: chunk.filePath,
1739
+ language: chunk.language,
1740
+ startLine: chunk.startLine,
1741
+ endLine: chunk.endLine,
1742
+ symbols: chunk.symbols
1175
1743
  }))
1176
1744
  }
1177
1745
  };
@@ -1220,7 +1788,10 @@ function formatIndexStatus(status) {
1220
1788
  `- Files: ${status.fileCount}`,
1221
1789
  `- Comments: ${status.commentCount}`,
1222
1790
  `- Wisdom units: ${status.wisdomUnitCount}`,
1791
+ `- Code files: ${status.codeFileCount}`,
1792
+ `- Code chunks: ${status.codeChunkCount}`,
1223
1793
  `- Last sync: ${status.lastSyncTime ?? "never"}`,
1794
+ `- Last code index: ${status.lastCodeIndexTime ?? "never"}`,
1224
1795
  `- GitHub token configured: ${status.githubTokenConfigured ? "yes" : "no"}`,
1225
1796
  `- Health: ${status.health}`
1226
1797
  ];
@@ -1416,8 +1987,8 @@ async function fetchMergedPullRequests(options) {
1416
1987
  }
1417
1988
 
1418
1989
  // src/doctor.ts
1419
- import fs3 from "fs";
1420
- import path6 from "path";
1990
+ import fs4 from "fs";
1991
+ import path9 from "path";
1421
1992
  function check(name, ok, message, fix) {
1422
1993
  return { name, ok, message, fix: ok ? void 0 : fix };
1423
1994
  }
@@ -1478,12 +2049,12 @@ async function runDoctor(options) {
1478
2049
  )
1479
2050
  );
1480
2051
  }
1481
- const cursorConfigPath = path6.join(gitRoot ?? cwd, ".cursor", "mcp.json");
2052
+ const cursorConfigPath = path9.join(gitRoot ?? cwd, ".cursor", "mcp.json");
1482
2053
  let cursorConfig;
1483
2054
  let cursorConfigValid = false;
1484
- if (fs3.existsSync(cursorConfigPath)) {
2055
+ if (fs4.existsSync(cursorConfigPath)) {
1485
2056
  try {
1486
- cursorConfig = JSON.parse(fs3.readFileSync(cursorConfigPath, "utf8"));
2057
+ cursorConfig = JSON.parse(fs4.readFileSync(cursorConfigPath, "utf8"));
1487
2058
  cursorConfigValid = true;
1488
2059
  } catch {
1489
2060
  cursorConfigValid = false;
@@ -1492,7 +2063,7 @@ async function runDoctor(options) {
1492
2063
  checks.push(
1493
2064
  check(
1494
2065
  ".cursor/mcp.json valid",
1495
- fs3.existsSync(cursorConfigPath) && cursorConfigValid,
2066
+ fs4.existsSync(cursorConfigPath) && cursorConfigValid,
1496
2067
  cursorConfigValid ? ".cursor/mcp.json exists and is valid JSON." : ".cursor/mcp.json is missing or invalid.",
1497
2068
  "Run anchor init. If the file is malformed, fix the JSON and rerun anchor init."
1498
2069
  )
@@ -1509,7 +2080,7 @@ async function runDoctor(options) {
1509
2080
  )
1510
2081
  );
1511
2082
  const dbPath = defaultDatabasePath(gitRoot ?? cwd);
1512
- const dbExists = fs3.existsSync(dbPath);
2083
+ const dbExists = fs4.existsSync(dbPath);
1513
2084
  checks.push(
1514
2085
  check(
1515
2086
  ".anchor/index.sqlite exists",
@@ -1553,12 +2124,12 @@ async function runDoctor(options) {
1553
2124
  "Run pnpm build, then try anchor serve from the repository."
1554
2125
  )
1555
2126
  );
1556
- const rulePath = path6.join(gitRoot ?? cwd, ".cursor", "rules", "anchor.mdc");
2127
+ const rulePath = path9.join(gitRoot ?? cwd, ".cursor", "rules", "anchor.mdc");
1557
2128
  checks.push(
1558
2129
  check(
1559
2130
  "Cursor rule file exists",
1560
- fs3.existsSync(rulePath),
1561
- fs3.existsSync(rulePath) ? "Cursor rule file exists." : "Cursor rule file is missing.",
2131
+ fs4.existsSync(rulePath),
2132
+ fs4.existsSync(rulePath) ? "Cursor rule file exists." : "Cursor rule file is missing.",
1562
2133
  "Run anchor init to create .cursor/rules/anchor.mdc."
1563
2134
  )
1564
2135
  );
@@ -1566,12 +2137,14 @@ async function runDoctor(options) {
1566
2137
  }
1567
2138
  export {
1568
2139
  ANCHOR_CURSOR_RULE,
2140
+ DEFAULT_MAX_CODE_FILE_BYTES,
1569
2141
  SCHEMA_SQL,
1570
2142
  anchorMcpEntry,
1571
2143
  buildFtsQuery,
1572
2144
  canonicalizeText,
1573
2145
  categorizeWisdom,
1574
2146
  checkSchema,
2147
+ chunkCodeFile,
1575
2148
  chunkHistoricalText,
1576
2149
  clampMaxResults,
1577
2150
  clipSentence,
@@ -1579,10 +2152,13 @@ export {
1579
2152
  defaultDatabasePath,
1580
2153
  detectGitHubRepo,
1581
2154
  detectGitRoot,
2155
+ discoverCodeFiles,
2156
+ emptyCodeIndexSummary,
1582
2157
  ensureAnchorGitExclude,
1583
2158
  ensureCursorConfig,
1584
2159
  ensureCursorRule,
1585
2160
  ensureRepository,
2161
+ extractCodeSymbols,
1586
2162
  extractSymbols,
1587
2163
  extractWisdomUnits,
1588
2164
  fetchMergedPullRequests,
@@ -1594,15 +2170,19 @@ export {
1594
2170
  getLastSyncTime,
1595
2171
  githubAuthFixMessage,
1596
2172
  hasHighSignalLanguage,
2173
+ indexCodebase,
1597
2174
  indexPullRequests,
1598
2175
  initializeSchema,
2176
+ isHardExcludedCodePath,
1599
2177
  mergeAnchorMcpConfig,
1600
2178
  normalizePullRequest,
1601
2179
  openAnchorDatabase,
1602
2180
  parseGitHubRemote,
2181
+ rankCodeChunks,
1603
2182
  rankWisdomUnits,
1604
2183
  redactSecrets,
1605
2184
  redactedHistoricalText,
2185
+ replaceCodeIndex,
1606
2186
  resolveGitHubToken,
1607
2187
  resolvePullRequestDetailConcurrency,
1608
2188
  resolvePullRequestFetchLimit,