@goshenkata/dryscan-core 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json ADDED
@@ -0,0 +1,63 @@
+ {
+   "name": "@goshenkata/dryscan-core",
+   "version": "1.0.0",
+   "description": "Core library for DryScan - semantic code duplication analyzer",
+   "type": "module",
+   "main": "./dist/index.js",
+   "types": "./dist/index.d.ts",
+   "exports": {
+     ".": {
+       "import": "./dist/index.js",
+       "types": "./dist/index.d.ts"
+     }
+   },
+   "scripts": {
+     "build": "tsup src/index.ts --format esm --dts --sourcemap --clean --outDir dist",
+     "clean": "rm -rf dist",
+     "test": "tsx ../node_modules/mocha/bin/mocha \"test/**/*.test.mjs\"",
+     "coverage": "c8 tsx ../node_modules/mocha/bin/mocha \"test/**/*.test.mjs\""
+   },
+   "engines": {
+     "node": ">=18.0.0"
+   },
+   "keywords": [
+     "code-analysis",
+     "duplication-detection",
+     "semantic-analysis"
+   ],
+   "author": "Goshenkata",
+   "license": "MIT",
+   "devDependencies": {
+     "@types/better-sqlite3": "^7.6.13",
+     "@types/debug": "^4.1.12",
+     "@types/node": "^25.0.3",
+     "chai": "^6.2.2",
+     "mocha": "^11.7.5",
+     "sinon": "^21.0.1",
+     "tsup": "^8.5.1",
+     "tsx": "^4.21.0",
+     "typescript": "^5.9.3"
+   },
+   "dependencies": {
+     "@langchain/core": "^1.1.8",
+     "@langchain/google-genai": "^2.1.3",
+     "@langchain/ollama": "^1.1.0",
+     "better-sqlite3": "^12.5.0",
+     "debug": "^4.4.3",
+     "glob-gitignore": "^1.0.15",
+     "ignore": "^7.0.5",
+     "jsonschema": "^1.5.0",
+     "langchain": "^1.2.3",
+     "minimatch": "^10.1.1",
+     "reflect-metadata": "^0.2.2",
+     "short-uuid": "^6.0.3",
+     "tree-sitter": "^0.25.0",
+     "tree-sitter-java": "^0.23.5",
+     "tree-sitter-python": "^0.25.0",
+     "typeorm": "^0.3.28",
+     "upath": "^2.0.1"
+   },
+   "publishConfig": {
+     "access": "public"
+   }
+ }
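
For consumers, the manifest above declares an ESM-only build ("type": "module") whose entry point resolves through the "exports" map to ./dist/index.js, with declarations in ./dist/index.d.ts, and it requires Node 18 or newer. A minimal consumption sketch, illustrative only and assuming nothing beyond the manifest (the named exports of dist/index.js are not visible in this diff):

    // ESM-only package: load it with import (or dynamic import()) on Node >= 18.
    // The bare specifier resolves through the "exports" map to ./dist/index.js,
    // with types served from ./dist/index.d.ts.
    import * as dryscanCore from "@goshenkata/dryscan-core";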
package/src/DryScan.ts ADDED
@@ -0,0 +1,169 @@
+ import upath from "upath";
+ import fs from "fs/promises";
+ import { DuplicateAnalysisResult, DuplicateReport } from "./types";
+ import { DRYSCAN_DIR, INDEX_DB } from "./const";
+ import { defaultExtractors, IndexUnitExtractor } from "./IndexUnitExtractor";
+ import { DryScanDatabase } from "./db/DryScanDatabase";
+ import { RepositoryInitializer, InitOptions as InitServiceOptions } from "./services/RepositoryInitializer";
+ import { UpdateService } from "./services/UpdateService";
+ import { DuplicateService } from "./services/DuplicateService";
+ import { ExclusionService } from "./services/ExclusionService";
+ import { DryScanServiceDeps } from "./services/types";
+ import { configStore } from "./config/configStore";
+ import { DryConfig } from "./types";
+ import { PairingService } from "./services/PairingService";
+
+ export type InitOptions = InitServiceOptions;
+
+
+ export class DryScan {
+   repoPath: string;
+   private readonly extractor: IndexUnitExtractor;
+   private db: DryScanDatabase;
+   private readonly services: {
+     initializer: RepositoryInitializer;
+     updater: UpdateService;
+     duplicate: DuplicateService;
+     exclusion: ExclusionService;
+   };
+   private readonly serviceDeps: DryScanServiceDeps;
+
+   constructor(
+     repoPath: string,
+     extractor?: IndexUnitExtractor,
+     db?: DryScanDatabase
+   ) {
+     this.repoPath = repoPath;
+     this.extractor = extractor ?? new IndexUnitExtractor(repoPath, defaultExtractors(repoPath));
+     this.db = db ?? new DryScanDatabase();
+
+     this.serviceDeps = {
+       repoPath: this.repoPath,
+       db: this.db,
+       extractor: this.extractor,
+       pairing: new PairingService(this.extractor),
+     };
+
+     const exclusion = new ExclusionService(this.serviceDeps);
+     this.services = {
+       initializer: new RepositoryInitializer(this.serviceDeps, exclusion),
+       updater: new UpdateService(this.serviceDeps, exclusion),
+       duplicate: new DuplicateService(this.serviceDeps),
+       exclusion,
+     };
+   }
+
+   /**
+    * Initializes the DryScan repository with a 3-phase analysis:
+    * Phase 1: Extract and save all functions
+    * Phase 2: Resolve and save internal dependencies
+    * Phase 3: Compute and save semantic embeddings
+    */
+   async init(options?: InitOptions): Promise<void> {
+     console.log(`[DryScan] Initializing repository at ${this.repoPath}`);
+     console.log("[DryScan] Preparing database and cache...");
+     await configStore.init(this.repoPath);
+     await this.ensureDatabase();
+     if (await this.isInitialized()) {
+       console.log("[DryScan] Repository already initialized; skipping full init.");
+       return;
+     }
+     console.log("[DryScan] Starting initial scan (may take a moment)...");
+     await this.services.initializer.init(options);
+     console.log("[DryScan] Initial scan complete.");
+   }
+
+   /**
+    * Updates the index by detecting changed, new, and deleted files.
+    * Only reprocesses units in changed files for efficiency.
+    * Delegates to DryScanUpdater module for implementation.
+    *
+    * Update process:
+    * 1. List all current source files in repository
+    * 2. For each file, check if it's new, changed, or unchanged (via mtime + checksum)
+    * 3. Remove old units from changed/deleted files
+    * 4. Extract and save units from new/changed files
+    * 5. Recompute internal dependencies for affected units
+    * 6. Recompute embeddings for affected units
+    * 7. Update file tracking metadata
+    */
+   async updateIndex(): Promise<void> {
+     console.log(`[DryScan] Updating index at ${this.repoPath}...`);
+     console.log("[DryScan] Checking for file changes...");
+     const start = Date.now();
+     await this.ensureDatabase();
+     await this.services.updater.updateIndex();
+     const duration = Date.now() - start;
+     console.log(`[DryScan] Index update complete. Took ${duration}ms.`);
+   }
+
+
+   /**
+    * Runs duplicate detection and returns a normalized report payload ready for persistence or display.
+    */
+   async buildDuplicateReport(): Promise<DuplicateReport> {
+     const config = await this.loadConfig();
+     const analysis = await this.findDuplicates(config);
+     return {
+       version: 1,
+       generatedAt: new Date().toISOString(),
+       threshold: config.threshold,
+       score: analysis.score,
+       duplicates: analysis.duplicates,
+     };
+   }
+
+   /**
+    * Finds duplicate code blocks using cosine similarity on embeddings.
+    * Automatically updates the index before searching to ensure results are current.
+    * Compares all function pairs and returns groups with similarity above the configured threshold.
+    *
+    * @returns Analysis result with duplicate groups and duplication score
+    */
+   private async findDuplicates(config: DryConfig): Promise<DuplicateAnalysisResult> {
+     console.log(`[DryScan] Finding duplicates (threshold: ${config.threshold})...`);
+     await this.ensureDatabase();
+
+     console.log("[DryScan] Updating index...");
+     const updateStart = Date.now();
+     await this.updateIndex();
+     const updateDuration = Date.now() - updateStart;
+     console.log(`[DryScan] Index update took ${updateDuration}ms.`);
+
+     console.log("[DryScan] Detecting duplicates...");
+     const dupStart = Date.now();
+     const result = await this.services.duplicate.findDuplicates(config);
+     const dupDuration = Date.now() - dupStart;
+     console.log(`[DryScan] Duplicate detection took ${dupDuration}ms.`);
+
+     return result;
+   }
+
+   /**
+    * Cleans excludedPairs entries that no longer match any indexed units.
+    * Runs an update first to ensure the index reflects current code.
+    */
+   async cleanExclusions(): Promise<{ removed: number; kept: number }> {
+     await this.updateIndex();
+     return this.services.exclusion.cleanExclusions();
+   }
+
+   private async ensureDatabase(): Promise<void> {
+     if (this.db.isInitialized()) return;
+     const dbPath = upath.join(this.repoPath, DRYSCAN_DIR, INDEX_DB);
+     await fs.mkdir(upath.dirname(dbPath), { recursive: true });
+     await this.db.init(dbPath);
+   }
+
+   private async loadConfig(): Promise<DryConfig> {
+     return configStore.get(this.repoPath);
+   }
+
+   private async isInitialized(): Promise<boolean> {
+     if (!this.db.isInitialized()) return false;
+     const unitCount = await this.db.countUnits();
+     const initialized = unitCount > 0;
+     console.log(`[DryScan] Initialization check: ${unitCount} indexed units`);
+     return initialized;
+   }
+ }
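
The class above is the library facade: it wires the extractor, the database, and the services together and exposes init, updateIndex, buildDuplicateReport, and cleanExclusions. A hedged usage sketch of that surface, assuming a local import (whether DryScan is re-exported from the package entry point is not shown in this diff) and a hypothetical repository path:

    // Illustrative only; imports DryScan from the module above rather than the package root.
    import { DryScan } from "./DryScan";

    const scanner = new DryScan("/path/to/repo");         // hypothetical path
    await scanner.init();                                  // 3-phase initial scan; no-op if already indexed
    const report = await scanner.buildDuplicateReport();   // updates the index, then runs duplicate detection
    console.log(`score=${report.score} (threshold=${report.threshold})`);
    await scanner.cleanExclusions();                       // drops excludedPairs entries with no matching units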
@@ -0,0 +1,236 @@
+ import path from "path";
+ import fs from "fs/promises";
+ import debug from "debug";
+ import { IndexUnit } from "./types";
+ import { IndexUnitExtractor } from "./IndexUnitExtractor";
+ import { DryScanDatabase } from "./db/DryScanDatabase";
+ import { FileEntity } from "./db/entities/FileEntity";
+ import { EmbeddingService } from "./services/EmbeddingService";
+
+ const log = debug("DryScan:Updater");
+
+ /**
+  * DryScan Updater Module
+  *
+  * This module contains all incremental update logic for DryScan.
+  * Separated from DryScan.ts to keep that file focused on core operations.
+  */
+
+ /**
+  * Represents the result of change detection: files are categorized as
+  * added, changed, deleted, or unchanged.
+  */
+ export interface FileChangeSet {
+   added: string[];
+   changed: string[];
+   deleted: string[];
+   unchanged: string[];
+ }
+
+ /**
+  * Detects which files have been added, changed, or deleted since last scan.
+  * Uses mtime as fast check, then checksum for verification.
+  *
+  * @param repoPath - Root path of the repository
+  * @param extractor - Index unit extractor instance for file operations
+  * @param db - Database instance for retrieving tracked files
+  * @returns Change set with categorized file paths
+  */
+ export async function detectFileChanges(
+   repoPath: string,
+   extractor: IndexUnitExtractor,
+   db: DryScanDatabase
+ ): Promise<FileChangeSet> {
+   // Get current files in repository
+   const currentFiles = await extractor.listSourceFiles(repoPath);
+   const currentFileSet = new Set(currentFiles);
+
+   // Get tracked files from database
+   const trackedFiles = await db.getAllFiles();
+   const trackedFileMap = new Map(trackedFiles.map(f => [f.filePath, f]));
+
+   const added: string[] = [];
+   const changed: string[] = [];
+   const unchanged: string[] = [];
+
+   // Check each current file
+   for (const filePath of currentFiles) {
+     const tracked = trackedFileMap.get(filePath);
+
+     if (!tracked) {
+       // New file
+       added.push(filePath);
+       continue;
+     }
+
+     // Check if file changed using mtime first (fast check)
+     const fullPath = path.join(repoPath, filePath);
+     const stat = await fs.stat(fullPath);
+
+     if (stat.mtimeMs !== tracked.mtime) {
+       // Mtime changed, verify with checksum
+       const currentChecksum = await extractor.computeChecksum(fullPath);
+       if (currentChecksum !== tracked.checksum) {
+         changed.push(filePath);
+       } else {
+         // Mtime changed but content same
+         unchanged.push(filePath);
+       }
+     } else {
+       unchanged.push(filePath);
+     }
+   }
+
+   // Find deleted files
+   const deleted = trackedFiles
+     .map(f => f.filePath)
+     .filter(fp => !currentFileSet.has(fp));
+
+   return { added, changed, deleted, unchanged };
+ }
+
+ /**
+  * Extracts index units from a list of files.
+  * Used during incremental updates.
+  *
+  * @param filePaths - Array of relative file paths to extract from
+  * @param extractor - Index unit extractor instance
+  * @returns Array of extracted units
+  */
+ export async function extractUnitsFromFiles(
+   filePaths: string[],
+   extractor: IndexUnitExtractor
+ ): Promise<IndexUnit[]> {
+   const allUnits: IndexUnit[] = [];
+
+   for (const relPath of filePaths) {
+     const functions = await extractor.scan(relPath);
+     allUnits.push(...functions);
+   }
+
+   return allUnits;
+ }
+
+ /**
+  * Updates file tracking metadata after processing changes.
+  * Removes deleted files, updates changed files, adds new files.
+  *
+  * @param changeSet - Set of file changes to apply
+  * @param repoPath - Root path of the repository
+  * @param extractor - Index unit extractor for checksum computation
+  * @param db - Database instance for file tracking
+  */
+ export async function updateFileTracking(
+   changeSet: FileChangeSet,
+   repoPath: string,
+   extractor: IndexUnitExtractor,
+   db: DryScanDatabase
+ ): Promise<void> {
+   // Remove deleted files
+   if (changeSet.deleted.length > 0) {
+     if (typeof (db as any).removeFilesByFilePaths === "function") {
+       await (db as any).removeFilesByFilePaths(changeSet.deleted);
+     } else if (typeof (db as any).removeFiles === "function") {
+       await (db as any).removeFiles(changeSet.deleted);
+     }
+   }
+
+   // Create file entities for new and changed files
+   const filesToTrack = [...changeSet.added, ...changeSet.changed];
+   if (filesToTrack.length > 0) {
+     const fileEntities: FileEntity[] = [];
+
+     for (const relPath of filesToTrack) {
+       const fullPath = path.join(repoPath, relPath);
+       const stat = await fs.stat(fullPath);
+       const checksum = await extractor.computeChecksum(fullPath);
+
+       const fileEntity = new FileEntity();
+       fileEntity.filePath = relPath;
+       fileEntity.checksum = checksum;
+       fileEntity.mtime = stat.mtimeMs;
+
+       fileEntities.push(fileEntity);
+     }
+
+     await db.saveFiles(fileEntities);
+   }
+ }
+
+ /**
+  * Performs incremental update of the DryScan index.
+  * Detects file changes and reprocesses only affected files.
+  *
+  * @param repoPath - Root path of the repository
+  * @param extractor - Index unit extractor instance
+  * @param db - Database instance (must be initialized)
+  */
+ export async function performIncrementalUpdate(
+   repoPath: string,
+   extractor: IndexUnitExtractor,
+   db: DryScanDatabase,
+ ): Promise<FileChangeSet> {
+   log("Starting incremental update");
+   const embeddingService = new EmbeddingService(repoPath);
+
+   // Step 1: Detect changes
+   const changeSet = await detectFileChanges(repoPath, extractor, db);
+
+   if (changeSet.changed.length === 0 &&
+       changeSet.added.length === 0 &&
+       changeSet.deleted.length === 0) {
+     log("No changes detected. Index is up to date.");
+     return changeSet;
+   }
+
+   log(`Changes detected: ${changeSet.added.length} added, ${changeSet.changed.length} changed, ${changeSet.deleted.length} deleted`);
+
+   // Step 2: Remove old data for changed/deleted files
+   const filesToRemove = [...changeSet.changed, ...changeSet.deleted];
+   if (filesToRemove.length > 0) {
+     await db.removeUnitsByFilePaths(filesToRemove);
+     log(`Removed units from ${filesToRemove.length} files`);
+   }
+
+   // Step 3: Extract functions from new/changed files
+   const filesToProcess = [...changeSet.added, ...changeSet.changed];
+   if (filesToProcess.length > 0) {
+     const newUnits = await extractUnitsFromFiles(filesToProcess, extractor);
+     await db.saveUnits(newUnits);
+     log(`Extracted and saved ${newUnits.length} units from ${filesToProcess.length} files`);
+
+     // Step 4: Recompute embeddings for affected units only
+     const total = newUnits.length;
+     if (total > 0) {
+       log(`Recomputing embeddings for ${total} units`);
+       const progressInterval = Math.max(1, Math.ceil(total / 10));
+       const updatedWithEmbeddings = [] as IndexUnit[];
+
+       for (let i = 0; i < total; i++) {
+         const unit = newUnits[i];
+         try {
+           const enriched = await embeddingService.addEmbedding(unit);
+           updatedWithEmbeddings.push(enriched);
+         } catch (err: any) {
+           console.error(
+             `[DryScan] embedding failed for ${unit.filePath} (${unit.name}): ${err?.message || err}`
+           );
+           throw err;
+         }
+
+         const completed = i + 1;
+         if (completed === total || completed % progressInterval === 0) {
+           const pct = Math.floor((completed / total) * 100);
+           console.log(`[DryScan] Incremental embeddings ${completed}/${total} (${pct}%)`);
+         }
+       }
+
+       await db.updateUnits(updatedWithEmbeddings);
+       log(`Recomputed embeddings for ${updatedWithEmbeddings.length} units`);
+     }
+   }
+
+   // Step 5: Update file tracking
+   await updateFileTracking(changeSet, repoPath, extractor, db);
+   log("Incremental update complete");
+
+   return changeSet;
+ }
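
The exported functions above can also be driven directly; the sketch below mirrors the order performIncrementalUpdate documents (detect changes, remove stale units, re-extract, re-embed, update file tracking). It is illustrative only: this module's file name is not shown in the diff, so the "./DryScanUpdater" specifier is an assumption, the repository path is hypothetical, and the database path is built the same way DryScan.ensureDatabase() builds it.

    import upath from "upath";
    import { DRYSCAN_DIR, INDEX_DB } from "./const";
    import { DryScanDatabase } from "./db/DryScanDatabase";
    import { defaultExtractors, IndexUnitExtractor } from "./IndexUnitExtractor";
    import { performIncrementalUpdate } from "./DryScanUpdater"; // module path assumed

    const repoPath = "/path/to/repo"; // hypothetical
    const extractor = new IndexUnitExtractor(repoPath, defaultExtractors(repoPath));
    const db = new DryScanDatabase();
    await db.init(upath.join(repoPath, DRYSCAN_DIR, INDEX_DB)); // db must be initialized first

    const changes = await performIncrementalUpdate(repoPath, extractor, db);
    console.log(`added=${changes.added.length} changed=${changes.changed.length} deleted=${changes.deleted.length}`);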
@@ -0,0 +1,71 @@
+ import path from "path";
+ import fs from "fs/promises";
+ import upath from "upath";
+ import { glob } from "glob-gitignore";
+ import ignore, { Ignore } from "ignore";
+ import { DryConfig } from "./types";
+
+ /**
+  * Gitignore helper that builds ignore matchers by combining default rules,
+  * repo .gitignore files, and config-driven exclusions.
+  */
+ export class Gitignore {
+   private readonly defaultIgnores = [".git/**", ".dry/**"];
+
+   constructor(private readonly root: string) {}
+
+   async buildMatcher(config: DryConfig): Promise<Ignore> {
+     const rules = await this.resolveRules(config);
+     return ignore({ allowRelativePaths: true }).add(rules);
+   }
+
+   private async resolveRules(config: DryConfig): Promise<string[]> {
+     const gitignoreRules = await this.loadGitignoreRules();
+     const configRules = config.excludedPaths || [];
+     return [...this.defaultIgnores, ...gitignoreRules, ...configRules];
+   }
+
+   private async loadGitignoreRules(): Promise<string[]> {
+     const gitignoreFiles = await glob("**/.gitignore", {
+       cwd: this.root,
+       dot: true,
+       nodir: true,
+       ignore: this.defaultIgnores,
+     });
+
+     const rules: string[] = [];
+
+     for (const file of gitignoreFiles) {
+       const absPath = path.join(this.root, file);
+       const dir = upath.normalizeTrim(upath.dirname(file));
+       const content = await fs.readFile(absPath, "utf8").catch(() => "");
+       const lines = content.split(/\r?\n/);
+
+       for (const raw of lines) {
+         const trimmed = raw.trim();
+         if (!trimmed || trimmed.startsWith("#")) continue;
+
+         const negated = trimmed.startsWith("!");
+         const body = negated ? trimmed.slice(1) : trimmed;
+
+         const scoped = this.scopeRule(body, dir);
+         if (!scoped) continue;
+
+         rules.push(negated ? `!${scoped}` : scoped);
+       }
+     }
+
+     return rules;
+   }
+
+   private scopeRule(rule: string, gitignoreDir: string): string | null {
+     const cleaned = rule.replace(/^\//, "");
+     if (!cleaned) return null;
+
+     if (!gitignoreDir || gitignoreDir === ".") {
+       return cleaned;
+     }
+
+     return upath.normalizeTrim(upath.join(gitignoreDir, cleaned));
+   }
+ }
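
The effect of scopeRule is that a pattern read from a nested .gitignore stays anchored to that file's directory before it reaches the combined matcher. A hedged illustration with hypothetical paths, using the same ignore package API that buildMatcher calls:

    // "dist/" found in packages/app/.gitignore is re-rooted by scopeRule to "packages/app/dist",
    // so the matcher ignores that nested build output but not a top-level dist/.
    import ignore from "ignore";

    const matcher = ignore({ allowRelativePaths: true }).add([
      ".git/**",
      ".dry/**",
      "packages/app/dist", // result of scopeRule("dist/", "packages/app")
    ]);

    matcher.ignores("packages/app/dist/index.js"); // true
    matcher.ignores("dist/index.js");              // false: the rule remains scoped to its directory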