@goshenkata/dryscan-core 1.2.8 → 1.2.10
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +4 -1
- package/src/DryScan.ts +0 -166
- package/src/DryScanUpdater.ts +0 -236
- package/src/Gitignore.ts +0 -71
- package/src/IndexUnitExtractor.ts +0 -208
- package/src/config/configStore.ts +0 -55
- package/src/config/dryconfig.ts +0 -115
- package/src/config/indexConfig.ts +0 -13
- package/src/const.ts +0 -5
- package/src/db/DryScanDatabase.ts +0 -133
- package/src/db/entities/FileEntity.ts +0 -29
- package/src/db/entities/IndexUnitEntity.ts +0 -50
- package/src/extractors/LanguageExtractor.ts +0 -9
- package/src/extractors/java.ts +0 -376
- package/src/index.ts +0 -9
- package/src/services/DuplicateService.ts +0 -257
- package/src/services/DuplicationCache.ts +0 -210
- package/src/services/EmbeddingService.ts +0 -81
- package/src/services/ExclusionService.ts +0 -102
- package/src/services/PairingService.ts +0 -145
- package/src/services/ParallelSimilarity.ts +0 -59
- package/src/services/RepositoryInitializer.ts +0 -93
- package/src/services/UpdateService.ts +0 -31
- package/src/services/cosineSimilarityWorker.ts +0 -20
- package/src/services/types.ts +0 -10
- package/src/types/glob-gitignore.d.ts +0 -7
- package/src/types/short-uuid.d.ts +0 -7
- package/src/types/tree-sitter-langs.d.ts +0 -4
- package/src/types.ts +0 -76
- package/tsup.config.ts +0 -15
package/package.json
CHANGED
|
@@ -1,10 +1,13 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@goshenkata/dryscan-core",
|
|
3
|
-
"version": "1.2.8",
|
|
3
|
+
"version": "1.2.10",
|
|
4
4
|
"description": "Core library for DryScan - semantic code duplication analyzer",
|
|
5
5
|
"type": "module",
|
|
6
6
|
"main": "./dist/index.js",
|
|
7
7
|
"types": "./dist/index.d.ts",
|
|
8
|
+
"files": [
|
|
9
|
+
"dist"
|
|
10
|
+
],
|
|
8
11
|
"exports": {
|
|
9
12
|
".": {
|
|
10
13
|
"import": "./dist/index.js",
|
package/src/DryScan.ts
DELETED
|
@@ -1,166 +0,0 @@
|
|
|
1
|
-
import upath from "upath";
|
|
2
|
-
import fs from "fs/promises";
|
|
3
|
-
import { DuplicateAnalysisResult, DuplicateReport } from "./types";
|
|
4
|
-
import { DRYSCAN_DIR, INDEX_DB } from "./const";
|
|
5
|
-
import { defaultExtractors, IndexUnitExtractor } from "./IndexUnitExtractor";
|
|
6
|
-
import { DryScanDatabase } from "./db/DryScanDatabase";
|
|
7
|
-
import { RepositoryInitializer, InitOptions as InitServiceOptions } from "./services/RepositoryInitializer";
|
|
8
|
-
import { UpdateService } from "./services/UpdateService";
|
|
9
|
-
import { DuplicateService } from "./services/DuplicateService";
|
|
10
|
-
import { ExclusionService } from "./services/ExclusionService";
|
|
11
|
-
import { DryScanServiceDeps } from "./services/types";
|
|
12
|
-
import { configStore } from "./config/configStore";
|
|
13
|
-
import { DryConfig } from "./types";
|
|
14
|
-
import { PairingService } from "./services/PairingService";
|
|
15
|
-
import { existsSync } from "fs";
|
|
16
|
-
|
|
17
|
-
export type InitOptions = InitServiceOptions;
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
export class DryScan {
|
|
21
|
-
repoPath: string;
|
|
22
|
-
private readonly extractor: IndexUnitExtractor;
|
|
23
|
-
private db: DryScanDatabase;
|
|
24
|
-
private readonly services: {
|
|
25
|
-
initializer: RepositoryInitializer;
|
|
26
|
-
updater: UpdateService;
|
|
27
|
-
duplicate: DuplicateService;
|
|
28
|
-
exclusion: ExclusionService;
|
|
29
|
-
};
|
|
30
|
-
private readonly serviceDeps: DryScanServiceDeps;
|
|
31
|
-
|
|
32
|
-
constructor(
|
|
33
|
-
repoPath: string,
|
|
34
|
-
extractor?: IndexUnitExtractor,
|
|
35
|
-
db?: DryScanDatabase
|
|
36
|
-
) {
|
|
37
|
-
this.repoPath = repoPath;
|
|
38
|
-
this.extractor = extractor ?? new IndexUnitExtractor(repoPath, defaultExtractors(repoPath));
|
|
39
|
-
this.db = db ?? new DryScanDatabase();
|
|
40
|
-
|
|
41
|
-
this.serviceDeps = {
|
|
42
|
-
repoPath: this.repoPath,
|
|
43
|
-
db: this.db,
|
|
44
|
-
extractor: this.extractor,
|
|
45
|
-
pairing: new PairingService(this.extractor),
|
|
46
|
-
};
|
|
47
|
-
|
|
48
|
-
const exclusion = new ExclusionService(this.serviceDeps);
|
|
49
|
-
this.services = {
|
|
50
|
-
initializer: new RepositoryInitializer(this.serviceDeps, exclusion),
|
|
51
|
-
updater: new UpdateService(this.serviceDeps, exclusion),
|
|
52
|
-
duplicate: new DuplicateService(this.serviceDeps),
|
|
53
|
-
exclusion,
|
|
54
|
-
};
|
|
55
|
-
}
|
|
56
|
-
|
|
57
|
-
/**
|
|
58
|
-
* Initializes the DryScan repository with a 3-phase analysis:
|
|
59
|
-
* Phase 1: Extract and save all functions
|
|
60
|
-
* Phase 2: Resolve and save internal dependencies
|
|
61
|
-
* Phase 3: Compute and save semantic embeddings
|
|
62
|
-
*/
|
|
63
|
-
async init(options?: InitOptions): Promise<void> {
|
|
64
|
-
console.log(`[DryScan] Initializing repository at ${this.repoPath}`);
|
|
65
|
-
|
|
66
|
-
const dryDir = upath.join(this.repoPath, DRYSCAN_DIR);
|
|
67
|
-
if (existsSync(dryDir)) {
|
|
68
|
-
console.warn(`[DryScan] Warning: a '.dry' folder already exists at ${dryDir}.`);
|
|
69
|
-
}
|
|
70
|
-
console.log("[DryScan] Preparing database and cache...");
|
|
71
|
-
await configStore.init(this.repoPath);
|
|
72
|
-
await this.ensureDatabase();
|
|
73
|
-
console.log("[DryScan] Starting initial scan (may take a moment)...");
|
|
74
|
-
await this.services.initializer.init(options);
|
|
75
|
-
console.log("[DryScan] Initial scan complete.");
|
|
76
|
-
}
|
|
77
|
-
|
|
78
|
-
/**
|
|
79
|
-
* Updates the index by detecting changed, new, and deleted files.
|
|
80
|
-
* Only reprocesses units in changed files for efficiency.
|
|
81
|
-
* Delegates to DryScanUpdater module for implementation.
|
|
82
|
-
*
|
|
83
|
-
* Update process:
|
|
84
|
-
* 1. List all current source files in repository
|
|
85
|
-
* 2. For each file, check if it's new, changed, or unchanged (via mtime + checksum)
|
|
86
|
-
* 3. Remove old units from changed/deleted files
|
|
87
|
-
* 4. Extract and save units from new/changed files
|
|
88
|
-
* 5. Recompute internal dependencies for affected units
|
|
89
|
-
* 6. Recompute embeddings for affected units
|
|
90
|
-
* 7. Update file tracking metadata
|
|
91
|
-
*/
|
|
92
|
-
async updateIndex(): Promise<string[]> {
|
|
93
|
-
console.log(`[DryScan] Updating index at ${this.repoPath}...`);
|
|
94
|
-
console.log("[DryScan] Checking for file changes...");
|
|
95
|
-
const start = Date.now();
|
|
96
|
-
await this.ensureDatabase();
|
|
97
|
-
const dirtyPaths = await this.services.updater.updateIndex();
|
|
98
|
-
const duration = Date.now() - start;
|
|
99
|
-
console.log(`[DryScan] Index update complete. Took ${duration}ms.`);
|
|
100
|
-
return dirtyPaths;
|
|
101
|
-
}
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
/**
|
|
105
|
-
* Runs duplicate detection and returns a normalized report payload ready for persistence or display.
|
|
106
|
-
*/
|
|
107
|
-
async buildDuplicateReport(): Promise<DuplicateReport> {
|
|
108
|
-
const config = await this.loadConfig();
|
|
109
|
-
const analysis = await this.findDuplicates(config);
|
|
110
|
-
return {
|
|
111
|
-
version: 1,
|
|
112
|
-
generatedAt: new Date().toISOString(),
|
|
113
|
-
threshold: config.threshold,
|
|
114
|
-
grade: analysis.score.grade,
|
|
115
|
-
score: analysis.score,
|
|
116
|
-
duplicates: analysis.duplicates,
|
|
117
|
-
};
|
|
118
|
-
}
|
|
119
|
-
|
|
120
|
-
/**
|
|
121
|
-
* Finds duplicate code blocks using cosine similarity on embeddings.
|
|
122
|
-
* Automatically updates the index before searching to ensure results are current.
|
|
123
|
-
* Compares all function pairs and returns groups with similarity above the configured threshold.
|
|
124
|
-
*
|
|
125
|
-
* @returns Analysis result with duplicate groups and duplication score
|
|
126
|
-
*/
|
|
127
|
-
private async findDuplicates(config: DryConfig): Promise<DuplicateAnalysisResult> {
|
|
128
|
-
console.log(`[DryScan] Finding duplicates (threshold: ${config.threshold})...`);
|
|
129
|
-
await this.ensureDatabase();
|
|
130
|
-
|
|
131
|
-
console.log("[DryScan] Updating index...");
|
|
132
|
-
const updateStart = Date.now();
|
|
133
|
-
const dirtyPaths = await this.updateIndex();
|
|
134
|
-
const updateDuration = Date.now() - updateStart;
|
|
135
|
-
console.log(`[DryScan] Index update took ${updateDuration}ms.`);
|
|
136
|
-
|
|
137
|
-
console.log("[DryScan] Detecting duplicates...");
|
|
138
|
-
const dupStart = Date.now();
|
|
139
|
-
const result = await this.services.duplicate.findDuplicates(config, dirtyPaths);
|
|
140
|
-
const dupDuration = Date.now() - dupStart;
|
|
141
|
-
console.log(`[DryScan] Duplicate detection took ${dupDuration}ms.`);
|
|
142
|
-
|
|
143
|
-
return result;
|
|
144
|
-
}
|
|
145
|
-
|
|
146
|
-
/**
|
|
147
|
-
* Cleans excludedPairs entries that no longer match any indexed units.
|
|
148
|
-
* Runs an update first to ensure the index reflects current code.
|
|
149
|
-
*/
|
|
150
|
-
async cleanExclusions(): Promise<{ removed: number; kept: number }> {
|
|
151
|
-
await this.updateIndex();
|
|
152
|
-
return this.services.exclusion.cleanExclusions();
|
|
153
|
-
}
|
|
154
|
-
|
|
155
|
-
private async ensureDatabase(): Promise<void> {
|
|
156
|
-
if (this.db.isInitialized()) return;
|
|
157
|
-
const dbPath = upath.join(this.repoPath, DRYSCAN_DIR, INDEX_DB);
|
|
158
|
-
await fs.mkdir(upath.dirname(dbPath), { recursive: true });
|
|
159
|
-
await this.db.init(dbPath);
|
|
160
|
-
}
|
|
161
|
-
|
|
162
|
-
private async loadConfig(): Promise<DryConfig> {
|
|
163
|
-
return configStore.get(this.repoPath);
|
|
164
|
-
}
|
|
165
|
-
|
|
166
|
-
}
|
package/src/DryScanUpdater.ts
DELETED
|
@@ -1,236 +0,0 @@
|
|
|
1
|
-
import path from "path";
|
|
2
|
-
import fs from "fs/promises";
|
|
3
|
-
import debug from "debug";
|
|
4
|
-
import { IndexUnit } from "./types";
|
|
5
|
-
import { IndexUnitExtractor } from "./IndexUnitExtractor";
|
|
6
|
-
import { DryScanDatabase } from "./db/DryScanDatabase";
|
|
7
|
-
import { FileEntity } from "./db/entities/FileEntity";
|
|
8
|
-
import { EmbeddingService } from "./services/EmbeddingService";
|
|
9
|
-
|
|
10
|
-
const log = debug("DryScan:Updater");
|
|
11
|
-
|
|
12
|
-
/**
|
|
13
|
-
* DryScan Updater Module
|
|
14
|
-
*
|
|
15
|
-
* This module contains all incremental update logic for DryScan.
|
|
16
|
-
* Separated from DryScan.ts to keep that file focused on core operations.
|
|
17
|
-
*
|
|
18
|
-
* Represents the result of change detection.
|
|
19
|
-
* Categorizes files into added, changed, deleted, and unchanged.
|
|
20
|
-
*/
|
|
21
|
-
export interface FileChangeSet {
|
|
22
|
-
added: string[];
|
|
23
|
-
changed: string[];
|
|
24
|
-
deleted: string[];
|
|
25
|
-
unchanged: string[];
|
|
26
|
-
}
|
|
27
|
-
|
|
28
|
-
/**
|
|
29
|
-
* Detects which files have been added, changed, or deleted since last scan.
|
|
30
|
-
* Uses mtime as fast check, then checksum for verification.
|
|
31
|
-
*
|
|
32
|
-
* @param repoPath - Root path of the repository
|
|
33
|
-
* @param extractor - Index unit extractor instance for file operations
|
|
34
|
-
* @param db - Database instance for retrieving tracked files
|
|
35
|
-
* @returns Change set with categorized file paths
|
|
36
|
-
*/
|
|
37
|
-
export async function detectFileChanges(
|
|
38
|
-
repoPath: string,
|
|
39
|
-
extractor: IndexUnitExtractor,
|
|
40
|
-
db: DryScanDatabase
|
|
41
|
-
): Promise<FileChangeSet> {
|
|
42
|
-
// Get current files in repository
|
|
43
|
-
const currentFiles = await extractor.listSourceFiles(repoPath);
|
|
44
|
-
const currentFileSet = new Set(currentFiles);
|
|
45
|
-
|
|
46
|
-
// Get tracked files from database
|
|
47
|
-
const trackedFiles = await db.getAllFiles();
|
|
48
|
-
const trackedFileMap = new Map(trackedFiles.map(f => [f.filePath, f]));
|
|
49
|
-
|
|
50
|
-
const added: string[] = [];
|
|
51
|
-
const changed: string[] = [];
|
|
52
|
-
const unchanged: string[] = [];
|
|
53
|
-
|
|
54
|
-
// Check each current file
|
|
55
|
-
for (const filePath of currentFiles) {
|
|
56
|
-
const tracked = trackedFileMap.get(filePath);
|
|
57
|
-
|
|
58
|
-
if (!tracked) {
|
|
59
|
-
// New file
|
|
60
|
-
added.push(filePath);
|
|
61
|
-
continue;
|
|
62
|
-
}
|
|
63
|
-
|
|
64
|
-
// Check if file changed using mtime first (fast check)
|
|
65
|
-
const fullPath = path.join(repoPath, filePath);
|
|
66
|
-
const stat = await fs.stat(fullPath);
|
|
67
|
-
|
|
68
|
-
if (stat.mtimeMs !== tracked.mtime) {
|
|
69
|
-
// Mtime changed, verify with checksum
|
|
70
|
-
const currentChecksum = await extractor.computeChecksum(fullPath);
|
|
71
|
-
if (currentChecksum !== tracked.checksum) {
|
|
72
|
-
changed.push(filePath);
|
|
73
|
-
} else {
|
|
74
|
-
// Mtime changed but content same
|
|
75
|
-
unchanged.push(filePath);
|
|
76
|
-
}
|
|
77
|
-
} else {
|
|
78
|
-
unchanged.push(filePath);
|
|
79
|
-
}
|
|
80
|
-
}
|
|
81
|
-
|
|
82
|
-
// Find deleted files
|
|
83
|
-
const deleted = trackedFiles
|
|
84
|
-
.map(f => f.filePath)
|
|
85
|
-
.filter(fp => !currentFileSet.has(fp));
|
|
86
|
-
|
|
87
|
-
return { added, changed, deleted, unchanged };
|
|
88
|
-
}
|
|
89
|
-
|
|
90
|
-
/**
|
|
91
|
-
* Extracts index units from a list of files.
|
|
92
|
-
* Used during incremental updates.
|
|
93
|
-
*
|
|
94
|
-
* @param filePaths - Array of relative file paths to extract from
|
|
95
|
-
* @param extractor - Index unit extractor instance
|
|
96
|
-
* @returns Array of extracted units
|
|
97
|
-
*/
|
|
98
|
-
export async function extractUnitsFromFiles(
|
|
99
|
-
filePaths: string[],
|
|
100
|
-
extractor: IndexUnitExtractor
|
|
101
|
-
): Promise<IndexUnit[]> {
|
|
102
|
-
const allUnits: IndexUnit[] = [];
|
|
103
|
-
|
|
104
|
-
for (const relPath of filePaths) {
|
|
105
|
-
const functions = await extractor.scan(relPath);
|
|
106
|
-
allUnits.push(...functions);
|
|
107
|
-
}
|
|
108
|
-
|
|
109
|
-
return allUnits;
|
|
110
|
-
}
|
|
111
|
-
|
|
112
|
-
/**
|
|
113
|
-
* Updates file tracking metadata after processing changes.
|
|
114
|
-
* Removes deleted files, updates changed files, adds new files.
|
|
115
|
-
*
|
|
116
|
-
* @param changeSet - Set of file changes to apply
|
|
117
|
-
* @param repoPath - Root path of the repository
|
|
118
|
-
* @param extractor - Index unit extractor for checksum computation
|
|
119
|
-
* @param db - Database instance for file tracking
|
|
120
|
-
*/
|
|
121
|
-
export async function updateFileTracking(
|
|
122
|
-
changeSet: FileChangeSet,
|
|
123
|
-
repoPath: string,
|
|
124
|
-
extractor: IndexUnitExtractor,
|
|
125
|
-
db: DryScanDatabase
|
|
126
|
-
): Promise<void> {
|
|
127
|
-
// Remove deleted files
|
|
128
|
-
if (changeSet.deleted.length > 0) {
|
|
129
|
-
if (typeof (db as any).removeFilesByFilePaths === "function") {
|
|
130
|
-
await (db as any).removeFilesByFilePaths(changeSet.deleted);
|
|
131
|
-
} else if (typeof (db as any).removeFiles === "function") {
|
|
132
|
-
await (db as any).removeFiles(changeSet.deleted);
|
|
133
|
-
}
|
|
134
|
-
}
|
|
135
|
-
|
|
136
|
-
// Create file entities for new and changed files
|
|
137
|
-
const filesToTrack = [...changeSet.added, ...changeSet.changed];
|
|
138
|
-
if (filesToTrack.length > 0) {
|
|
139
|
-
const fileEntities: FileEntity[] = [];
|
|
140
|
-
|
|
141
|
-
for (const relPath of filesToTrack) {
|
|
142
|
-
const fullPath = path.join(repoPath, relPath);
|
|
143
|
-
const stat = await fs.stat(fullPath);
|
|
144
|
-
const checksum = await extractor.computeChecksum(fullPath);
|
|
145
|
-
|
|
146
|
-
const fileEntity = new FileEntity();
|
|
147
|
-
fileEntity.filePath = relPath;
|
|
148
|
-
fileEntity.checksum = checksum;
|
|
149
|
-
fileEntity.mtime = stat.mtimeMs;
|
|
150
|
-
|
|
151
|
-
fileEntities.push(fileEntity);
|
|
152
|
-
}
|
|
153
|
-
|
|
154
|
-
await db.saveFiles(fileEntities);
|
|
155
|
-
}
|
|
156
|
-
}
|
|
157
|
-
|
|
158
|
-
/**
|
|
159
|
-
* Performs incremental update of the DryScan index.
|
|
160
|
-
* Detects file changes and reprocesses only affected files.
|
|
161
|
-
*
|
|
162
|
-
* @param repoPath - Root path of the repository
|
|
163
|
-
* @param extractor - Index unit extractor instance
|
|
164
|
-
* @param db - Database instance (must be initialized)
|
|
165
|
-
*/
|
|
166
|
-
export async function performIncrementalUpdate(
|
|
167
|
-
repoPath: string,
|
|
168
|
-
extractor: IndexUnitExtractor,
|
|
169
|
-
db: DryScanDatabase,
|
|
170
|
-
): Promise<FileChangeSet> {
|
|
171
|
-
log("Starting incremental update");
|
|
172
|
-
const embeddingService = new EmbeddingService(repoPath);
|
|
173
|
-
|
|
174
|
-
// Step 1: Detect changes
|
|
175
|
-
const changeSet = await detectFileChanges(repoPath, extractor, db);
|
|
176
|
-
|
|
177
|
-
if (changeSet.changed.length === 0 &&
|
|
178
|
-
changeSet.added.length === 0 &&
|
|
179
|
-
changeSet.deleted.length === 0) {
|
|
180
|
-
log("No changes detected. Index is up to date.");
|
|
181
|
-
return changeSet;
|
|
182
|
-
}
|
|
183
|
-
|
|
184
|
-
log(`Changes detected: ${changeSet.added.length} added, ${changeSet.changed.length} changed, ${changeSet.deleted.length} deleted`);
|
|
185
|
-
|
|
186
|
-
// Step 2: Remove old data for changed/deleted files
|
|
187
|
-
const filesToRemove = [...changeSet.changed, ...changeSet.deleted];
|
|
188
|
-
if (filesToRemove.length > 0) {
|
|
189
|
-
await db.removeUnitsByFilePaths(filesToRemove);
|
|
190
|
-
log(`Removed units from ${filesToRemove.length} files`);
|
|
191
|
-
}
|
|
192
|
-
|
|
193
|
-
// Step 3: Extract functions from new/changed files
|
|
194
|
-
const filesToProcess = [...changeSet.added, ...changeSet.changed];
|
|
195
|
-
if (filesToProcess.length > 0) {
|
|
196
|
-
const newUnits = await extractUnitsFromFiles(filesToProcess, extractor);
|
|
197
|
-
await db.saveUnits(newUnits);
|
|
198
|
-
log(`Extracted and saved ${newUnits.length} units from ${filesToProcess.length} files`);
|
|
199
|
-
|
|
200
|
-
// Step 4: Recompute embeddings for affected units only
|
|
201
|
-
const total = newUnits.length;
|
|
202
|
-
if (total > 0) {
|
|
203
|
-
log(`Recomputing embeddings for ${total} units`);
|
|
204
|
-
const progressInterval = Math.max(1, Math.ceil(total / 10));
|
|
205
|
-
const updatedWithEmbeddings = [] as IndexUnit[];
|
|
206
|
-
|
|
207
|
-
for (let i = 0; i < total; i++) {
|
|
208
|
-
const unit = newUnits[i];
|
|
209
|
-
try {
|
|
210
|
-
const enriched = await embeddingService.addEmbedding(unit);
|
|
211
|
-
updatedWithEmbeddings.push(enriched);
|
|
212
|
-
} catch (err: any) {
|
|
213
|
-
console.error(
|
|
214
|
-
`[DryScan] embedding failed for ${unit.filePath} (${unit.name}): ${err?.message || err}`
|
|
215
|
-
);
|
|
216
|
-
throw err;
|
|
217
|
-
}
|
|
218
|
-
|
|
219
|
-
const completed = i + 1;
|
|
220
|
-
if (completed === total || completed % progressInterval === 0) {
|
|
221
|
-
const pct = Math.floor((completed / total) * 100);
|
|
222
|
-
console.log(`[DryScan] Incremental embeddings ${completed}/${total} (${pct}%)`);
|
|
223
|
-
}
|
|
224
|
-
}
|
|
225
|
-
|
|
226
|
-
await db.updateUnits(updatedWithEmbeddings);
|
|
227
|
-
log(`Recomputed embeddings for ${updatedWithEmbeddings.length} units`);
|
|
228
|
-
}
|
|
229
|
-
}
|
|
230
|
-
|
|
231
|
-
// Step 5: Update file tracking
|
|
232
|
-
await updateFileTracking(changeSet, repoPath, extractor, db);
|
|
233
|
-
log("Incremental update complete");
|
|
234
|
-
|
|
235
|
-
return changeSet;
|
|
236
|
-
}
|
package/src/Gitignore.ts
DELETED
|
@@ -1,71 +0,0 @@
|
|
|
1
|
-
import path from "path";
|
|
2
|
-
import fs from "fs/promises";
|
|
3
|
-
import upath from "upath";
|
|
4
|
-
import { glob } from "glob-gitignore";
|
|
5
|
-
import ignore, { Ignore } from "ignore";
|
|
6
|
-
import { DryConfig } from "./types";
|
|
7
|
-
|
|
8
|
-
/**
|
|
9
|
-
* Gitignore helper that builds ignore matchers by combining default rules,
|
|
10
|
-
* repo .gitignore files, and config-driven exclusions.
|
|
11
|
-
*/
|
|
12
|
-
export class Gitignore {
|
|
13
|
-
private readonly defaultIgnores = [".git/**", ".dry/**"];
|
|
14
|
-
|
|
15
|
-
constructor(private readonly root: string) {}
|
|
16
|
-
|
|
17
|
-
async buildMatcher(config: DryConfig): Promise<Ignore> {
|
|
18
|
-
const rules = await this.resolveRules(config);
|
|
19
|
-
return ignore({ allowRelativePaths: true }).add(rules);
|
|
20
|
-
}
|
|
21
|
-
|
|
22
|
-
private async resolveRules(config: DryConfig): Promise<string[]> {
|
|
23
|
-
const gitignoreRules = await this.loadGitignoreRules();
|
|
24
|
-
const configRules = config.excludedPaths || [];
|
|
25
|
-
return [...this.defaultIgnores, ...gitignoreRules, ...configRules];
|
|
26
|
-
}
|
|
27
|
-
|
|
28
|
-
private async loadGitignoreRules(): Promise<string[]> {
|
|
29
|
-
const gitignoreFiles = await glob("**/.gitignore", {
|
|
30
|
-
cwd: this.root,
|
|
31
|
-
dot: true,
|
|
32
|
-
nodir: true,
|
|
33
|
-
ignore: this.defaultIgnores,
|
|
34
|
-
});
|
|
35
|
-
|
|
36
|
-
const rules: string[] = [];
|
|
37
|
-
|
|
38
|
-
for (const file of gitignoreFiles) {
|
|
39
|
-
const absPath = path.join(this.root, file);
|
|
40
|
-
const dir = upath.normalizeTrim(upath.dirname(file));
|
|
41
|
-
const content = await fs.readFile(absPath, "utf8").catch(() => "");
|
|
42
|
-
const lines = content.split(/\r?\n/);
|
|
43
|
-
|
|
44
|
-
for (const raw of lines) {
|
|
45
|
-
const trimmed = raw.trim();
|
|
46
|
-
if (!trimmed || trimmed.startsWith("#")) continue;
|
|
47
|
-
|
|
48
|
-
const negated = trimmed.startsWith("!");
|
|
49
|
-
const body = negated ? trimmed.slice(1) : trimmed;
|
|
50
|
-
|
|
51
|
-
const scoped = this.scopeRule(body, dir);
|
|
52
|
-
if (!scoped) continue;
|
|
53
|
-
|
|
54
|
-
rules.push(negated ? `!${scoped}` : scoped);
|
|
55
|
-
}
|
|
56
|
-
}
|
|
57
|
-
|
|
58
|
-
return rules;
|
|
59
|
-
}
|
|
60
|
-
|
|
61
|
-
private scopeRule(rule: string, gitignoreDir: string): string | null {
|
|
62
|
-
const cleaned = rule.replace(/^\//, "");
|
|
63
|
-
if (!cleaned) return null;
|
|
64
|
-
|
|
65
|
-
if (!gitignoreDir || gitignoreDir === ".") {
|
|
66
|
-
return cleaned;
|
|
67
|
-
}
|
|
68
|
-
|
|
69
|
-
return upath.normalizeTrim(upath.join(gitignoreDir, cleaned));
|
|
70
|
-
}
|
|
71
|
-
}
|
package/src/IndexUnitExtractor.ts
DELETED
|
@@ -1,208 +0,0 @@
|
|
|
1
|
-
import path from "path";
|
|
2
|
-
import type { Stats } from "fs";
|
|
3
|
-
import fs from "fs/promises";
|
|
4
|
-
import upath from "upath";
|
|
5
|
-
import crypto from "node:crypto";
|
|
6
|
-
import debug from "debug";
|
|
7
|
-
import { glob } from "glob-gitignore";
|
|
8
|
-
import { IndexUnit } from "./types";
|
|
9
|
-
import { LanguageExtractor } from "./extractors/LanguageExtractor";
|
|
10
|
-
import { JavaExtractor } from "./extractors/java";
|
|
11
|
-
import { FILE_CHECKSUM_ALGO } from "./const";
|
|
12
|
-
import { configStore } from "./config/configStore";
|
|
13
|
-
import { DryConfig } from "./types";
|
|
14
|
-
import { Gitignore } from "./Gitignore"
|
|
15
|
-
import { Ignore } from "ignore";
|
|
16
|
-
|
|
17
|
-
const log = debug("DryScan:Extractor");
|
|
18
|
-
|
|
19
|
-
export type { LanguageExtractor } from "./extractors/LanguageExtractor";
|
|
20
|
-
/**
|
|
21
|
-
* Returns the default set of language extractors supported by DryScan.
|
|
22
|
-
* Extend/override by passing custom extractors into the IndexUnitExtractor constructor.
|
|
23
|
-
*/
|
|
24
|
-
export function defaultExtractors(repoPath: string): LanguageExtractor[] {
|
|
25
|
-
return [new JavaExtractor(repoPath)];
|
|
26
|
-
}
|
|
27
|
-
|
|
28
|
-
/**
|
|
29
|
-
* Extracts and indexes code units (classes, functions, blocks) for a repository.
|
|
30
|
-
* Owns shared file-system helpers and delegates language-specific parsing to LanguageExtractors.
|
|
31
|
-
*/
|
|
32
|
-
export class IndexUnitExtractor {
|
|
33
|
-
private readonly root: string;
|
|
34
|
-
readonly extractors: LanguageExtractor[];
|
|
35
|
-
private readonly gitignore: Gitignore;
|
|
36
|
-
|
|
37
|
-
constructor(
|
|
38
|
-
rootPath: string,
|
|
39
|
-
extractors?: LanguageExtractor[]
|
|
40
|
-
) {
|
|
41
|
-
this.root = rootPath;
|
|
42
|
-
this.extractors = extractors ?? defaultExtractors(rootPath);
|
|
43
|
-
this.gitignore = new Gitignore(this.root);
|
|
44
|
-
log("Initialized extractor for %s", this.root);
|
|
45
|
-
}
|
|
46
|
-
|
|
47
|
-
/**
|
|
48
|
-
* Lists all supported source files from a path. Honors exclusion globs from config.
|
|
49
|
-
*/
|
|
50
|
-
async listSourceFiles(dirPath: string): Promise<string[]> {
|
|
51
|
-
const target = await this.resolveTarget(dirPath);
|
|
52
|
-
const config = await this.loadConfig();
|
|
53
|
-
const ignoreMatcher = await this.gitignore.buildMatcher(config);
|
|
54
|
-
|
|
55
|
-
if (target.stat.isFile()) {
|
|
56
|
-
return this.filterSingleFile(target.baseRel, ignoreMatcher);
|
|
57
|
-
}
|
|
58
|
-
|
|
59
|
-
const matches = await this.globSourceFiles(target.baseRel);
|
|
60
|
-
return this.filterSupportedFiles(matches, ignoreMatcher);
|
|
61
|
-
}
|
|
62
|
-
|
|
63
|
-
/**
|
|
64
|
-
* Computes MD5 checksum of file content to track changes.
|
|
65
|
-
*/
|
|
66
|
-
async computeChecksum(filePath: string): Promise<string> {
|
|
67
|
-
const fullPath = path.isAbsolute(filePath)
|
|
68
|
-
? filePath
|
|
69
|
-
: path.join(this.root, filePath);
|
|
70
|
-
|
|
71
|
-
const content = await fs.readFile(fullPath, "utf8");
|
|
72
|
-
return crypto.createHash(FILE_CHECKSUM_ALGO).update(content).digest("hex");
|
|
73
|
-
}
|
|
74
|
-
|
|
75
|
-
/**
|
|
76
|
-
* Scans a file or directory and extracts indexable units using the matching LanguageExtractor.
|
|
77
|
-
* The returned units have repo-relative file paths and no embedding attached.
|
|
78
|
-
*/
|
|
79
|
-
async scan(targetPath: string): Promise<IndexUnit[]> {
|
|
80
|
-
const fullPath = path.isAbsolute(targetPath)
|
|
81
|
-
? targetPath
|
|
82
|
-
: path.join(this.root, targetPath);
|
|
83
|
-
|
|
84
|
-
const stat = await fs.stat(fullPath).catch(() => null);
|
|
85
|
-
if (!stat) {
|
|
86
|
-
throw new Error(`Path not found: ${fullPath}`);
|
|
87
|
-
}
|
|
88
|
-
|
|
89
|
-
if (stat.isDirectory()) {
|
|
90
|
-
log("Scanning directory %s", fullPath);
|
|
91
|
-
return this.scanDirectory(fullPath);
|
|
92
|
-
}
|
|
93
|
-
|
|
94
|
-
return this.scanFile(fullPath);
|
|
95
|
-
}
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
/**
|
|
99
|
-
* Scans a directory recursively, extracting units from supported files while honoring exclusions.
|
|
100
|
-
*/
|
|
101
|
-
private async scanDirectory(dir: string): Promise<IndexUnit[]> {
|
|
102
|
-
const out: IndexUnit[] = [];
|
|
103
|
-
const relDir = this.relPath(dir);
|
|
104
|
-
const files = await this.listSourceFiles(relDir);
|
|
105
|
-
for (const relFile of files) {
|
|
106
|
-
const absFile = path.join(this.root, relFile);
|
|
107
|
-
const extracted = await this.tryScanSupportedFile(absFile);
|
|
108
|
-
out.push(...extracted);
|
|
109
|
-
}
|
|
110
|
-
return out;
|
|
111
|
-
}
|
|
112
|
-
|
|
113
|
-
/**
|
|
114
|
-
* Scans a single file and extracts supported units.
|
|
115
|
-
*/
|
|
116
|
-
private async scanFile(filePath: string): Promise<IndexUnit[]> {
|
|
117
|
-
return this.tryScanSupportedFile(filePath, true);
|
|
118
|
-
}
|
|
119
|
-
|
|
120
|
-
/**
|
|
121
|
-
* Extracts units from a supported file.
|
|
122
|
-
* Optionally throws when the file type is unsupported (used when scanning an explicit file).
|
|
123
|
-
*/
|
|
124
|
-
private async tryScanSupportedFile(filePath: string, throwOnUnsupported = false): Promise<IndexUnit[]> {
|
|
125
|
-
const extractor = this.extractors.find(ex => ex.supports(filePath));
|
|
126
|
-
if (!extractor) {
|
|
127
|
-
if (throwOnUnsupported) {
|
|
128
|
-
throw new Error(`Unsupported file type: ${filePath}`);
|
|
129
|
-
}
|
|
130
|
-
return [];
|
|
131
|
-
}
|
|
132
|
-
const rel = this.relPath(filePath);
|
|
133
|
-
if (await this.shouldExclude(rel)) {
|
|
134
|
-
log("Skipping excluded file %s", rel);
|
|
135
|
-
return [];
|
|
136
|
-
}
|
|
137
|
-
const source = await fs.readFile(filePath, "utf8");
|
|
138
|
-
const units = await extractor.extractFromText(rel, source);
|
|
139
|
-
log("Extracted %d units from %s", units.length, rel);
|
|
140
|
-
return units.map(unit => ({
|
|
141
|
-
...unit,
|
|
142
|
-
filePath: rel,
|
|
143
|
-
embedding: undefined,
|
|
144
|
-
}));
|
|
145
|
-
}
|
|
146
|
-
|
|
147
|
-
/**
|
|
148
|
-
* Converts an absolute path to a repo-relative, normalized (POSIX-style) path.
|
|
149
|
-
* This keeps paths stable across platforms and consistent in the index/DB.
|
|
150
|
-
*/
|
|
151
|
-
private relPath(absPath: string): string {
|
|
152
|
-
return this.normalizeRelPath(upath.relative(this.root, absPath));
|
|
153
|
-
}
|
|
154
|
-
|
|
155
|
-
/**
|
|
156
|
-
* Returns true if a repo-relative path matches any configured exclusion glob.
|
|
157
|
-
*/
|
|
158
|
-
private async shouldExclude(relPath: string): Promise<boolean> {
|
|
159
|
-
const config = await this.loadConfig();
|
|
160
|
-
const ignoreMatcher = await this.gitignore.buildMatcher(config);
|
|
161
|
-
return ignoreMatcher.ignores(this.normalizeRelPath(relPath));
|
|
162
|
-
}
|
|
163
|
-
|
|
164
|
-
private async loadConfig(): Promise<DryConfig> {
|
|
165
|
-
return await configStore.get(this.root);
|
|
166
|
-
}
|
|
167
|
-
|
|
168
|
-
/**
|
|
169
|
-
* Normalizes repo-relative paths and strips leading "./" to keep matcher inputs consistent.
|
|
170
|
-
*/
|
|
171
|
-
private normalizeRelPath(relPath: string): string {
|
|
172
|
-
const normalized = upath.normalizeTrim(relPath);
|
|
173
|
-
return normalized.startsWith("./") ? normalized.slice(2) : normalized;
|
|
174
|
-
}
|
|
175
|
-
|
|
176
|
-
private async resolveTarget(dirPath: string): Promise<{ fullPath: string; baseRel: string; stat: Stats; }> {
|
|
177
|
-
const fullPath = path.isAbsolute(dirPath) ? dirPath : path.join(this.root, dirPath);
|
|
178
|
-
const stat = await fs.stat(fullPath).catch(() => null);
|
|
179
|
-
if (!stat) {
|
|
180
|
-
throw new Error(`Path not found: ${fullPath}`);
|
|
181
|
-
}
|
|
182
|
-
const baseRel = this.relPath(fullPath);
|
|
183
|
-
log("Listing source files under %s", fullPath);
|
|
184
|
-
return { fullPath, baseRel, stat };
|
|
185
|
-
}
|
|
186
|
-
|
|
187
|
-
private async filterSingleFile(baseRel: string, ignoreMatcher: Ignore): Promise<string[]> {
|
|
188
|
-
const relFile = this.normalizeRelPath(baseRel);
|
|
189
|
-
if (ignoreMatcher.ignores(relFile)) return [];
|
|
190
|
-
return this.extractors.some((ex) => ex.supports(relFile)) ? [relFile] : [];
|
|
191
|
-
}
|
|
192
|
-
|
|
193
|
-
private async globSourceFiles(baseRel: string): Promise<string[]> {
|
|
194
|
-
const pattern = baseRel ? `${baseRel.replace(/\\/g, "/")}/**/*` : "**/*";
|
|
195
|
-
const matches = await glob(pattern, {
|
|
196
|
-
cwd: this.root,
|
|
197
|
-
dot: false,
|
|
198
|
-
nodir: true,
|
|
199
|
-
});
|
|
200
|
-
return matches.map((p: string) => this.normalizeRelPath(p));
|
|
201
|
-
}
|
|
202
|
-
|
|
203
|
-
private filterSupportedFiles(relPaths: string[], ignoreMatcher: Ignore): string[] {
|
|
204
|
-
return relPaths
|
|
205
|
-
.filter((relPath: string) => !ignoreMatcher.ignores(relPath))
|
|
206
|
-
.filter((relPath: string) => this.extractors.some((ex) => ex.supports(relPath)));
|
|
207
|
-
}
|
|
208
|
-
}
|