@zokizuan/satori-core 0.2.0 → 1.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -11,7 +11,7 @@ Maintained by: `ham-zax` (`@zokizuan`).
  - code splitting (AST + LangChain fallback)
  - embedding generation
  - vector persistence and search via Milvus
- - incremental sync via Merkle-based change detection
+ - incremental sync via stat-first change detection with hash-on-change verification
 
  ## Install
 
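For orientation, here is a minimal sketch of the stat-first strategy that the new README line describes: compare cheap `fs.stat` signatures first, and only re-hash file contents when the signature changed. All names below are illustrative, not the package's actual internals.

```ts
import { promises as fs } from 'fs';
import { createHash } from 'crypto';

// Illustrative snapshot entry: the cheap stat signature plus the last content hash.
interface FileSignature {
  size: number;
  mtimeMs: number;
  hash: string;
}

// Stat-first: an unchanged size + mtime signature means the stored hash is
// reused without reading the file; hashing happens only when it differs.
async function detectChange(
  filePath: string,
  prev?: FileSignature
): Promise<{ changed: boolean; next: FileSignature }> {
  const stat = await fs.stat(filePath);
  if (prev && stat.size === prev.size && stat.mtimeMs === prev.mtimeMs) {
    return { changed: false, next: prev };
  }
  // Hash-on-change verification: a touched mtime with identical content
  // still hashes equal, so no spurious re-index is triggered.
  const hash = createHash('sha256').update(await fs.readFile(filePath)).digest('hex');
  return {
    changed: !prev || hash !== prev.hash,
    next: { size: stat.size, mtimeMs: stat.mtimeMs, hash },
  };
}
```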
@@ -1,6 +1,6 @@
  import { Splitter } from '../splitter';
  import { Embedding } from '../embedding';
- import { VectorDatabase } from '../vectordb';
+ import { VectorDatabase, IndexCompletionMarkerDocument } from '../vectordb';
  import { SemanticSearchResult } from '../types';
  import { FileSynchronizer } from '../sync/synchronizer';
  export interface ContextConfig {
@@ -126,6 +126,11 @@ export declare class Context {
  * @param threshold Similarity threshold
  */
  semanticSearch(codebasePath: string, query: string, topK?: number, threshold?: number, filterExpr?: string): Promise<SemanticSearchResult[]>;
+ private buildSemanticSearchFilterExpr;
+ private queryCompletionMarkerRows;
+ clearIndexCompletionMarker(codebasePath: string): Promise<void>;
+ writeIndexCompletionMarker(codebasePath: string, marker: IndexCompletionMarkerDocument): Promise<void>;
+ getIndexCompletionMarker(codebasePath: string): Promise<IndexCompletionMarkerDocument | null>;
  /**
  * Check if index exists for codebase
  * @param codebasePath Codebase path to check
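A plausible caller-side flow for the new completion-marker methods, assuming an already-initialized `Context` instance named `context`; the field values are illustrative, and the required shape follows the validation in the compiled `getIndexCompletionMarker` further below:

```ts
// Hypothetical usage sketch.
const marker = await context.getIndexCompletionMarker('/path/to/repo');
if (marker === null) {
  // No valid marker: the previous indexing run never recorded completion.
  await context.clearIndexCompletionMarker('/path/to/repo');
  // ...re-index, then persist a fresh marker:
  await context.writeIndexCompletionMarker('/path/to/repo', {
    kind: 'satori_index_completion_v1',
    codebasePath: '/path/to/repo',
    fingerprint: { /* opaque fingerprint object */ },
    indexedFiles: 120,   // illustrative counts
    totalChunks: 1450,
    completedAt: new Date().toISOString(),
    runId: 'run-2024-01-01T00-00-00Z', // hypothetical id
  });
}
```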
@@ -230,17 +235,12 @@ export declare class Context {
  */
  private loadIgnorePatterns;
  /**
- * Find root-level .xxxignore files in the codebase directory.
- * v1 policy: only root ignore files are loaded (nested .gitignore files are ignored).
+ * Find supported root ignore files in the codebase directory.
+ * v1 policy: only repo-root .satoriignore and .gitignore are loaded.
  * @param codebasePath Path to the codebase
  * @returns Array of ignore file paths
  */
  private findIgnoreFiles;
- /**
- * Load global ignore file from ~/.satori/.satoriignore
- * @returns Array of ignore patterns
- */
- private loadGlobalIgnoreFile;
  /**
  * Load ignore patterns from a specific ignore file
  * @param filePath Path to the ignore file
@@ -39,6 +39,7 @@ Object.defineProperty(exports, "__esModule", { value: true });
  exports.Context = void 0;
  const splitter_1 = require("../splitter");
  const embedding_1 = require("../embedding");
+ const vectordb_1 = require("../vectordb");
  const env_manager_1 = require("../utils/env-manager");
  const defaults_1 = require("../config/defaults");
  const language_1 = require("../language");
@@ -283,6 +284,22 @@ class Context {
  this.synchronizers.set(collectionName, newSynchronizer);
  }
  const currentSynchronizer = this.synchronizers.get(collectionName);
+ const collectionExists = await this.vectorDatabase.hasCollection(collectionName);
+ if (!collectionExists) {
+ console.warn(`[Context] ⚠️ Collection '${collectionName}' is missing. Rebuilding full index before incremental sync resumes.`);
+ const changedFiles = this.normalizeRelativePathsForCodebase(codebasePath, await this.getCodeFiles(codebasePath));
+ if (changedFiles.length === 0) {
+ progressCallback?.({ phase: 'No files to index', current: 100, total: 100, percentage: 100 });
+ return { added: 0, removed: 0, modified: 0, changedFiles: [] };
+ }
+ await this.indexCodebase(codebasePath, progressCallback);
+ return {
+ added: changedFiles.length,
+ removed: 0,
+ modified: 0,
+ changedFiles
+ };
+ }
  progressCallback?.({ phase: 'Checking for file changes...', current: 0, total: 100, percentage: 0 });
  const { added, removed, modified } = await currentSynchronizer.checkForChanges();
  const totalChanges = added.length + removed.length + modified.length;
@@ -347,6 +364,7 @@ class Context {
  const isHybrid = this.getIsHybrid();
  const searchType = isHybrid === true ? 'hybrid search' : 'semantic search';
  console.log(`[Context] 🔍 Executing ${searchType}: "${query}" in ${codebasePath}`);
+ const effectiveFilterExpr = this.buildSemanticSearchFilterExpr(filterExpr);
  const normalizeBreadcrumbs = (value) => {
  if (!Array.isArray(value)) {
  return undefined;
@@ -405,7 +423,8 @@ class Context {
  params: { k: 100 }
  },
  limit: topK,
- filterExpr
+ threshold,
+ filterExpr: effectiveFilterExpr
  });
  console.log(`[Context] 🔍 Raw search results count: ${searchResults.length}`);
  // 4. Convert to semantic search result format
@@ -432,7 +451,7 @@ class Context {
  // 1. Generate query vector
  const queryEmbedding = await this.embedding.embed(query);
  // 2. Search in vector database
- const searchResults = await this.vectorDatabase.search(collectionName, queryEmbedding.vector, { topK, threshold, filterExpr });
+ const searchResults = await this.vectorDatabase.search(collectionName, queryEmbedding.vector, { topK, threshold, filterExpr: effectiveFilterExpr });
  // 3. Convert to semantic search result format
  const results = searchResults.map(result => ({
  content: result.document.content,
@@ -450,6 +469,103 @@ class Context {
  return results;
  }
  }
+ buildSemanticSearchFilterExpr(filterExpr) {
+ const markerExclusion = `fileExtension != "${vectordb_1.INDEX_COMPLETION_MARKER_FILE_EXTENSION}"`;
+ if (!filterExpr || filterExpr.trim().length === 0) {
+ return markerExclusion;
+ }
+ return `(${filterExpr}) and (${markerExclusion})`;
+ }
+ async queryCompletionMarkerRows(collectionName) {
+ return this.vectorDatabase.query(collectionName, `id == "${vectordb_1.INDEX_COMPLETION_MARKER_DOC_ID}"`, ['id', 'metadata'], 8);
+ }
+ async clearIndexCompletionMarker(codebasePath) {
+ const collectionName = this.resolveCollectionName(codebasePath);
+ const hasCollection = await this.vectorDatabase.hasCollection(collectionName);
+ if (!hasCollection) {
+ return;
+ }
+ const rows = await this.queryCompletionMarkerRows(collectionName);
+ const markerIds = rows
+ .map((row) => (typeof row.id === 'string' ? row.id : ''))
+ .filter((id) => id.length > 0);
+ if (markerIds.length === 0) {
+ return;
+ }
+ await this.vectorDatabase.delete(collectionName, Array.from(new Set(markerIds)));
+ }
+ async writeIndexCompletionMarker(codebasePath, marker) {
+ const collectionName = this.resolveCollectionName(codebasePath);
+ const hasCollection = await this.vectorDatabase.hasCollection(collectionName);
+ if (!hasCollection) {
+ throw new Error(`Cannot write completion marker: collection '${collectionName}' does not exist.`);
+ }
+ await this.clearIndexCompletionMarker(codebasePath);
+ const vector = new Array(this.embedding.getDimension()).fill(0);
+ const markerDoc = {
+ id: vectordb_1.INDEX_COMPLETION_MARKER_DOC_ID,
+ vector,
+ content: 'satori index completion marker',
+ relativePath: vectordb_1.INDEX_COMPLETION_MARKER_RELATIVE_PATH,
+ startLine: 0,
+ endLine: 0,
+ fileExtension: vectordb_1.INDEX_COMPLETION_MARKER_FILE_EXTENSION,
+ metadata: marker,
+ };
+ if (this.getIsHybrid() === true) {
+ await this.vectorDatabase.insertHybrid(collectionName, [markerDoc]);
+ }
+ else {
+ await this.vectorDatabase.insert(collectionName, [markerDoc]);
+ }
+ }
+ async getIndexCompletionMarker(codebasePath) {
+ const collectionName = this.resolveCollectionName(codebasePath);
+ const hasCollection = await this.vectorDatabase.hasCollection(collectionName);
+ if (!hasCollection) {
+ return null;
+ }
+ const rows = await this.queryCompletionMarkerRows(collectionName);
+ for (const row of rows) {
+ const rawMetadata = row?.metadata;
+ if (typeof rawMetadata !== 'string') {
+ continue;
+ }
+ try {
+ const parsed = JSON.parse(rawMetadata);
+ if (parsed?.kind !== 'satori_index_completion_v1') {
+ continue;
+ }
+ if (typeof parsed.codebasePath !== 'string' || typeof parsed.runId !== 'string') {
+ continue;
+ }
+ if (!parsed.fingerprint || typeof parsed.fingerprint !== 'object') {
+ continue;
+ }
+ const indexedFiles = Number(parsed.indexedFiles);
+ const totalChunks = Number(parsed.totalChunks);
+ if (!Number.isFinite(indexedFiles) || !Number.isFinite(totalChunks)) {
+ continue;
+ }
+ if (typeof parsed.completedAt !== 'string' || Number.isNaN(Date.parse(parsed.completedAt))) {
+ continue;
+ }
+ return {
+ kind: 'satori_index_completion_v1',
+ codebasePath: parsed.codebasePath,
+ fingerprint: parsed.fingerprint,
+ indexedFiles,
+ totalChunks,
+ completedAt: parsed.completedAt,
+ runId: parsed.runId,
+ };
+ }
+ catch {
+ continue;
+ }
+ }
+ return null;
+ }
  /**
  * Check if index exists for codebase
  * @param codebasePath Codebase path to check
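The effect of `buildSemanticSearchFilterExpr` above is easiest to see with concrete inputs. Here is a standalone restatement of the rule; the marker extension constant is a placeholder, the real value comes from `vectordb_1.INDEX_COMPLETION_MARKER_FILE_EXTENSION`:

```ts
// Placeholder for INDEX_COMPLETION_MARKER_FILE_EXTENSION; the actual value
// is whatever the vectordb module exports.
const MARKER_EXT = '.__satori_marker__';

function composeFilter(filterExpr?: string): string {
  const markerExclusion = `fileExtension != "${MARKER_EXT}"`;
  if (!filterExpr || filterExpr.trim().length === 0) {
    return markerExclusion;
  }
  return `(${filterExpr}) and (${markerExclusion})`;
}

console.log(composeFilter());
// fileExtension != ".__satori_marker__"
console.log(composeFilter('fileExtension == ".ts"'));
// (fileExtension == ".ts") and (fileExtension != ".__satori_marker__")
```

Every search therefore excludes the zero-vector marker row, whether or not the caller supplied a filter.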
@@ -475,6 +591,8 @@ class Context {
  }
  // Delete snapshot file
  await synchronizer_1.FileSynchronizer.deleteSnapshot(codebasePath);
+ this.synchronizers.delete(collectionName);
+ this.ignoreStateByCollection.delete(collectionName);
  progressCallback?.({ phase: 'Index cleared', current: 100, total: 100, percentage: 100 });
  console.log('[Context] ✅ Index data cleaned');
  }
@@ -722,6 +840,7 @@ class Context {
  let processedFiles = 0;
  let totalChunks = 0;
  let limitReached = false;
+ const describeError = (error) => error instanceof Error ? error.message : String(error);
  for (let i = 0; i < filePaths.length; i++) {
  const filePath = filePaths[i];
  try {
@@ -750,6 +869,7 @@ class Context {
  if (error instanceof Error) {
  console.error('[Context] Stack trace:', error.stack);
  }
+ throw new Error(`Failed to persist ${searchType} chunks while indexing ${filePath}: ${describeError(error)}`);
  }
  finally {
  chunkBuffer = []; // Always clear buffer, even on failure
@@ -769,7 +889,8 @@ class Context {
  }
  }
  catch (error) {
- console.warn(`[Context] ⚠️ Skipping file ${filePath}: ${error}`);
+ console.error(`[Context] Failed to index file ${filePath}: ${describeError(error)}`);
+ throw error;
  }
  }
  // Process any remaining chunks in the buffer
@@ -784,6 +905,7 @@ class Context {
  if (error instanceof Error) {
  console.error('[Context] Stack trace:', error.stack);
  }
+ throw new Error(`Failed to persist final ${searchType} chunk batch: ${describeError(error)}`);
  }
  }
  return {
@@ -920,18 +1042,15 @@ class Context {
  async loadIgnorePatterns(codebasePath) {
  try {
  let fileBasedPatterns = [];
- // Load all .xxxignore files in codebase directory (excluding legacy .contextignore)
+ // v1 policy: only repo-root .satoriignore and .gitignore are supported.
  const ignoreFiles = await this.findIgnoreFiles(codebasePath);
  for (const ignoreFile of ignoreFiles) {
  const patterns = await this.loadIgnoreFile(ignoreFile, path.basename(ignoreFile));
  fileBasedPatterns.push(...patterns);
  }
- // Load global ~/.satori/.satoriignore
- const globalIgnorePatterns = await this.loadGlobalIgnoreFile();
- fileBasedPatterns.push(...globalIgnorePatterns);
  this.setFileBasedPatternsForCodebase(codebasePath, fileBasedPatterns);
  if (fileBasedPatterns.length > 0) {
- console.log(`[Context] 🚫 Loaded total ${fileBasedPatterns.length} ignore patterns from all ignore files`);
+ console.log(`[Context] 🚫 Loaded total ${fileBasedPatterns.length} ignore patterns from supported root ignore files`);
  }
  else {
  console.log('📄 No ignore files found; effective rules reset to base + runtime custom');
@@ -943,21 +1062,25 @@ class Context {
  }
  }
  /**
- * Find root-level .xxxignore files in the codebase directory.
- * v1 policy: only root ignore files are loaded (nested .gitignore files are ignored).
+ * Find supported root ignore files in the codebase directory.
+ * v1 policy: only repo-root .satoriignore and .gitignore are loaded.
  * @param codebasePath Path to the codebase
  * @returns Array of ignore file paths
  */
  async findIgnoreFiles(codebasePath) {
  try {
- const entries = await fs.promises.readdir(codebasePath, { withFileTypes: true });
  const ignoreFiles = [];
- for (const entry of entries) {
- if (entry.isFile() &&
- entry.name.startsWith('.') &&
- entry.name.endsWith('ignore') &&
- entry.name !== '.contextignore') {
- ignoreFiles.push(path.join(codebasePath, entry.name));
+ const supportedIgnoreFiles = ['.satoriignore', '.gitignore'];
+ for (const fileName of supportedIgnoreFiles) {
+ const absolutePath = path.join(codebasePath, fileName);
+ try {
+ const stat = await fs.promises.stat(absolutePath);
+ if (stat.isFile()) {
+ ignoreFiles.push(absolutePath);
+ }
+ }
+ catch {
+ // Missing ignore file is expected.
  }
  }
  if (ignoreFiles.length > 0) {
@@ -970,21 +1093,6 @@ class Context {
  return [];
  }
  }
- /**
- * Load global ignore file from ~/.satori/.satoriignore
- * @returns Array of ignore patterns
- */
- async loadGlobalIgnoreFile() {
- try {
- const homeDir = require('os').homedir();
- const globalIgnorePath = path.join(homeDir, '.satori', '.satoriignore');
- return await this.loadIgnoreFile(globalIgnorePath, 'global .satoriignore');
- }
- catch (error) {
- // Global ignore file is optional, don't log warnings
- return [];
- }
- }
  /**
  * Load ignore patterns from a specific ignore file
  * @param filePath Path to the ignore file
@@ -1,26 +1,7 @@
- export interface MerkleDAGNode {
- id: string;
- hash: string;
- data: string;
- parents: string[];
- children: string[];
- }
- export declare class MerkleDAG {
- nodes: Map<string, MerkleDAGNode>;
- rootIds: string[];
- constructor();
- private hash;
- addNode(data: string, parentId?: string): string;
- getNode(nodeId: string): MerkleDAGNode | undefined;
- getAllNodes(): MerkleDAGNode[];
- getRootNodes(): MerkleDAGNode[];
- getLeafNodes(): MerkleDAGNode[];
- serialize(): any;
- static deserialize(data: any): MerkleDAG;
- static compare(dag1: MerkleDAG, dag2: MerkleDAG): {
- added: string[];
- removed: string[];
- modified: string[];
- };
- }
+ /**
+ * Compute a deterministic Merkle-like root from file hashes.
+ * Input keys are expected to be normalized relative paths.
+ */
+ export declare function computeMerkleRoot(fileHashes: Map<string, string>): string;
+ export declare function computeMerkleLeaf(relativePath: string, hash: string): string;
  //# sourceMappingURL=merkle.d.ts.map
@@ -33,80 +33,28 @@ var __importStar = (this && this.__importStar) || (function () {
  };
  })();
  Object.defineProperty(exports, "__esModule", { value: true });
- exports.MerkleDAG = void 0;
+ exports.computeMerkleRoot = computeMerkleRoot;
+ exports.computeMerkleLeaf = computeMerkleLeaf;
  const crypto = __importStar(require("crypto"));
- class MerkleDAG {
- constructor() {
- this.nodes = new Map();
- this.rootIds = [];
- }
- hash(data) {
- return crypto.createHash('sha256').update(data).digest('hex');
- }
- addNode(data, parentId) {
- const nodeId = this.hash(data);
- const node = {
- id: nodeId,
- hash: nodeId,
- data,
- parents: [],
- children: []
- };
- // If there's a parent, create the relationship
- if (parentId) {
- const parentNode = this.nodes.get(parentId);
- if (parentNode) {
- node.parents.push(parentId);
- parentNode.children.push(nodeId);
- this.nodes.set(parentId, parentNode);
- }
- }
- else {
- // If no parent, it's a root node
- this.rootIds.push(nodeId);
- }
- this.nodes.set(nodeId, node);
- return nodeId;
- }
- getNode(nodeId) {
- return this.nodes.get(nodeId);
- }
- getAllNodes() {
- return Array.from(this.nodes.values());
- }
- getRootNodes() {
- return this.rootIds.map(id => this.nodes.get(id)).filter(Boolean);
- }
- getLeafNodes() {
- return Array.from(this.nodes.values()).filter(node => node.children.length === 0);
- }
- serialize() {
- return {
- nodes: Array.from(this.nodes.entries()),
- rootIds: this.rootIds
- };
- }
- static deserialize(data) {
- const dag = new MerkleDAG();
- dag.nodes = new Map(data.nodes);
- dag.rootIds = data.rootIds;
- return dag;
- }
- static compare(dag1, dag2) {
- const nodes1 = new Map(Array.from(dag1.getAllNodes()).map(n => [n.id, n]));
- const nodes2 = new Map(Array.from(dag2.getAllNodes()).map(n => [n.id, n]));
- const added = Array.from(nodes2.keys()).filter(k => !nodes1.has(k));
- const removed = Array.from(nodes1.keys()).filter(k => !nodes2.has(k));
- // For modified, we'll check if the data has changed for nodes that exist in both
- const modified = [];
- for (const [id, node1] of Array.from(nodes1.entries())) {
- const node2 = nodes2.get(id);
- if (node2 && node1.data !== node2.data) {
- modified.push(id);
- }
- }
- return { added, removed, modified };
- }
+ function hashChunk(value) {
+ return crypto.createHash('sha256').update(value).digest('hex');
+ }
+ /**
+ * Compute a deterministic Merkle-like root from file hashes.
+ * Input keys are expected to be normalized relative paths.
+ */
+ function computeMerkleRoot(fileHashes) {
+ const hasher = crypto.createHash('sha256');
+ const sortedEntries = Array.from(fileHashes.entries()).sort(([a], [b]) => a.localeCompare(b));
+ for (const [relativePath, hash] of sortedEntries) {
+ hasher.update(relativePath);
+ hasher.update('\0');
+ hasher.update(hash);
+ hasher.update('\n');
+ }
+ return hasher.digest('hex');
+ }
+ function computeMerkleLeaf(relativePath, hash) {
+ return hashChunk(`${relativePath}\0${hash}`);
  }
- exports.MerkleDAG = MerkleDAG;
  //# sourceMappingURL=merkle.js.map
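Because `computeMerkleRoot` sorts entries by path before hashing, the root is independent of map insertion order; a quick check (import path depends on the build layout and is illustrative here):

```ts
import { computeMerkleRoot } from './merkle'; // path illustrative

const a = new Map([
  ['src/index.ts', 'hash-1'],
  ['src/util.ts', 'hash-2'],
]);
const b = new Map([
  ['src/util.ts', 'hash-2'],
  ['src/index.ts', 'hash-1'],
]);
// Same entries, different insertion order: identical root.
console.assert(computeMerkleRoot(a) === computeMerkleRoot(b));

// Any changed file hash flips the root, which is the cheap signal a
// synchronizer can compare before doing a per-file diff.
b.set('src/util.ts', 'hash-2-changed');
console.assert(computeMerkleRoot(a) !== computeMerkleRoot(b));
```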
@@ -1,33 +1,55 @@
+ export interface FileChangeResult {
+ added: string[];
+ removed: string[];
+ modified: string[];
+ hashedCount: number;
+ partialScan: boolean;
+ unscannedDirPrefixes: string[];
+ fullHashRun: boolean;
+ }
  export declare class FileSynchronizer {
  private fileHashes;
- private merkleDAG;
+ private fileStats;
+ private merkleRoot;
  private rootDir;
  private snapshotPath;
  private ignorePatterns;
  private ignoreMatcher;
+ private partialScan;
+ private unscannedDirPrefixes;
+ private fullHashCounter;
  constructor(rootDir: string, ignorePatterns?: string[]);
- private getSnapshotPath;
- private hashFile;
- private generateFileHashes;
+ static canonicalizeSnapshotIdentityPath(codebasePath: string): string;
+ static snapshotPathFromCanonicalPath(canonicalPath: string): string;
+ static getSnapshotPathForCodebase(codebasePath: string): string;
+ private static trimTrailingSeparators;
+ private normalizeRelPath;
+ private isPathWithinPrefix;
+ private normalizeAndCompressPrefixes;
  private shouldIgnore;
- private buildMerkleDAG;
- initialize(): Promise<void>;
- checkForChanges(): Promise<{
- added: string[];
- removed: string[];
- modified: string[];
- }>;
+ private parsePositiveInt;
+ private getHashConcurrency;
+ private getFullHashInterval;
+ private hashFileBytes;
+ private isSignatureEqual;
+ private scanDirectory;
+ private hashCandidatesWithConcurrency;
+ private buildEffectiveState;
  private compareStates;
+ private arraysEqual;
+ private saveSnapshot;
+ private loadSnapshot;
+ private scanCurrentState;
+ initialize(): Promise<void>;
+ checkForChanges(): Promise<FileChangeResult>;
  getFileHash(filePath: string): string | undefined;
  /**
  * Return tracked (currently considered indexable) relative file paths.
  * This reflects the synchronizer snapshot under the active ignore rules.
  */
  getTrackedRelativePaths(): string[];
- private saveSnapshot;
- private loadSnapshot;
  /**
- * Delete snapshot file for a given codebase path
+ * Delete snapshot file for a given codebase path.
  */
  static deleteSnapshot(codebasePath: string): Promise<void>;
  }
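Taken together, the reshaped declaration reads roughly like this from a caller's perspective (import path, repo path, and patterns are illustrative):

```ts
import { FileSynchronizer } from './synchronizer'; // path illustrative

async function syncOnce(): Promise<void> {
  const sync = new FileSynchronizer('/path/to/repo', ['node_modules/**']);
  await sync.initialize();

  const result = await sync.checkForChanges();
  // FileChangeResult now carries scan-quality metadata next to the change sets.
  console.log(result.added, result.removed, result.modified);
  console.log(`hashed ${result.hashedCount} files; full hash run: ${result.fullHashRun}`);
  if (result.partialScan) {
    // Some directories could not be scanned; their prefixes let callers
    // decide whether to trust this incremental result or force a rebuild.
    console.warn('Unscanned prefixes:', result.unscannedDirPrefixes);
  }
}
```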