@goshenkata/dryscan-core 1.2.8 → 1.2.10
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +4 -1
- package/src/DryScan.ts +0 -166
- package/src/DryScanUpdater.ts +0 -236
- package/src/Gitignore.ts +0 -71
- package/src/IndexUnitExtractor.ts +0 -208
- package/src/config/configStore.ts +0 -55
- package/src/config/dryconfig.ts +0 -115
- package/src/config/indexConfig.ts +0 -13
- package/src/const.ts +0 -5
- package/src/db/DryScanDatabase.ts +0 -133
- package/src/db/entities/FileEntity.ts +0 -29
- package/src/db/entities/IndexUnitEntity.ts +0 -50
- package/src/extractors/LanguageExtractor.ts +0 -9
- package/src/extractors/java.ts +0 -376
- package/src/index.ts +0 -9
- package/src/services/DuplicateService.ts +0 -257
- package/src/services/DuplicationCache.ts +0 -210
- package/src/services/EmbeddingService.ts +0 -81
- package/src/services/ExclusionService.ts +0 -102
- package/src/services/PairingService.ts +0 -145
- package/src/services/ParallelSimilarity.ts +0 -59
- package/src/services/RepositoryInitializer.ts +0 -93
- package/src/services/UpdateService.ts +0 -31
- package/src/services/cosineSimilarityWorker.ts +0 -20
- package/src/services/types.ts +0 -10
- package/src/types/glob-gitignore.d.ts +0 -7
- package/src/types/short-uuid.d.ts +0 -7
- package/src/types/tree-sitter-langs.d.ts +0 -4
- package/src/types.ts +0 -76
- package/tsup.config.ts +0 -15
|
@@ -1,210 +0,0 @@
|
|
|
1
|
-
import debug from "debug";
|
|
2
|
-
import { DuplicateGroup, IndexUnit } from "../types";
|
|
3
|
-
import { parallelCosineSimilarity } from "./ParallelSimilarity";
|
|
4
|
-
|
|
5
|
-
const log = debug("DryScan:DuplicationCache");
|
|
6
|
-
|
|
7
|
-
/**
 * In-memory cache for duplicate comparison scores.
 *
 * Holds three independent stores:
 *  - `comparisons` + `fileIndex`: pair similarity scores keyed by an
 *    order-independent unit-ID key, plus a per-file index of those keys so every
 *    entry touching a file can be invalidated.
 *  - `embSimMatrix` + `embSimIndex`: a square embedding-similarity matrix and the
 *    unit-ID → row/column lookup for it. It survives `clearRunCaches()` so
 *    incremental builds can reuse it.
 *  - `parentSimCache`: memoized parent-unit similarities, dropped by `clearRunCaches()`.
 *
 * Exposed as a singleton through `getInstance()`.
 */
export class DuplicationCache {
  private static instance: DuplicationCache | null = null;

  /** Pair key (see `makeKey`) → similarity score. */
  private readonly comparisons = new Map<string, number>();
  /** File path → pair keys that involve a unit from that file. */
  private readonly fileIndex = new Map<string, Set<string>>();
  /** True once at least one group has been stored; `get`/`invalidate` short-circuit while false. */
  private initialized = false;

  /** Embedding similarity matrix; replaced by `buildEmbSimCache` and emptied by `clear()`. */
  private embSimMatrix: number[][] = [];
  /** Maps unit ID to its row/column index in embSimMatrix. */
  private embSimIndex = new Map<string, number>();
  /** Memoization of parent unit similarity scores (reset by `clearRunCaches`). */
  private parentSimCache = new Map<string, number>();

  /** Returns the shared instance, creating it on first use. */
  static getInstance(): DuplicationCache {
    if (!DuplicationCache.instance) {
      DuplicationCache.instance = new DuplicationCache();
    }
    return DuplicationCache.instance;
  }

  /**
   * Stores the similarity of every supplied group and indexes its key under
   * both units' file paths. The body contains no `await`, so all writes happen
   * before the returned promise is handed back.
   *
   * @param groups Duplicate groups to record; a nullish value is ignored.
   */
  async update(groups: DuplicateGroup[]): Promise<void> {
    if (!groups) return;

    for (const group of groups) {
      const key = this.makeKey(group.left.id, group.right.id);
      this.comparisons.set(key, group.similarity);
      this.addKeyForFile(group.left.filePath, key);
      this.addKeyForFile(group.right.filePath, key);
    }

    this.initialized = this.initialized || groups.length > 0;
  }

  /**
   * Retrieves a cached similarity if present and indexed under both file paths.
   *
   * @returns The cached score, or `null` when the cache is uninitialized, the
   *          pair is not indexed under either path, or no score is stored.
   */
  get(leftId: string, rightId: string, leftFilePath: string, rightFilePath: string): number | null {
    if (!this.initialized) return null;

    const key = this.makeKey(leftId, rightId);
    if (!this.fileHasKey(leftFilePath, key) || !this.fileHasKey(rightFilePath, key)) {
      return null;
    }

    const value = this.comparisons.get(key);
    return typeof value === "number" ? value : null;
  }

  /**
   * Invalidates all cached comparisons involving the provided file paths.
   * Marks the cache uninitialized again once no comparison is left.
   */
  async invalidate(paths: string[]): Promise<void> {
    if (!this.initialized || !paths || paths.length === 0) return;

    const unique = new Set(paths);
    for (const filePath of unique) {
      const keys = this.fileIndex.get(filePath);
      if (!keys) continue;

      for (const key of keys) {
        this.comparisons.delete(key);
        // The same key is indexed under the partner file too; drop it there
        // and prune index entries that become empty.
        for (const [otherPath, otherKeys] of this.fileIndex.entries()) {
          if (otherKeys.delete(key) && otherKeys.size === 0) {
            this.fileIndex.delete(otherPath);
          }
        }
      }

      this.fileIndex.delete(filePath);
    }

    if (this.comparisons.size === 0) {
      this.initialized = false;
    }
  }

  /**
   * Clears all cached data, including the embedding matrix. Intended for test setup.
   */
  clear(): void {
    this.comparisons.clear();
    this.fileIndex.clear();
    this.initialized = false;
    this.embSimMatrix = [];
    this.embSimIndex.clear();
    this.clearRunCaches();
  }

  /**
   * Resets per-run memoization (parent similarities).
   * The embedding matrix is intentionally preserved so incremental runs can
   * reuse clean×clean values across calls.
   */
  clearRunCaches(): void {
    this.parentSimCache.clear();
  }

  /**
   * Builds or incrementally updates the embedding similarity matrix.
   *
   * Only units with a non-empty embedding array take part; with fewer than two
   * such units the matrix and index are emptied.
   *
   * Full rebuild (no `dirtyPaths`, or no prior matrix): one similarity call over
   * all embeddings.
   * Incremental (`dirtyPaths` provided and a prior matrix exists): copies
   * clean×clean cells from the prior matrix and recomputes dirty rows via one
   * batched call. A unit is recomputed when its file is listed in `dirtyPaths`
   * or when it has no row in the prior matrix, because there is nothing to copy
   * for it.
   *
   * The new index and matrix are published together after the awaited
   * computation, so `getEmbSim` never pairs a new index with an old matrix.
   *
   * @param units      Candidate units.
   * @param dirtyPaths File paths whose units must be recomputed.
   */
  async buildEmbSimCache(units: IndexUnit[], dirtyPaths?: string[]): Promise<void> {
    const embedded = units.filter(u => Array.isArray(u.embedding) && u.embedding.length > 0);
    if (embedded.length < 2) {
      this.embSimMatrix = [];
      this.embSimIndex.clear();
      return;
    }

    const embeddings = embedded.map(u => u.embedding as number[]);
    const newIndex = new Map(embedded.map((u, i) => [u.id, i] as [string, number]));
    const dirtySet = dirtyPaths ? new Set(dirtyPaths) : null;
    const hasPriorMatrix = this.embSimMatrix.length > 0;

    if (!dirtySet || !hasPriorMatrix) {
      // Full rebuild: compute first, then swap index and matrix in one step.
      const fullMatrix = await parallelCosineSimilarity(embeddings, embeddings);
      this.embSimIndex = newIndex;
      this.embSimMatrix = fullMatrix;
      log("Built full embedding similarity matrix: %d units", embedded.length);
      return;
    }

    const priorIndex = this.embSimIndex;
    const priorMatrix = this.embSimMatrix;
    const n = embedded.length;

    // priorRow[i] is the unit's row in the prior matrix, or -1 when the row has
    // to be recomputed (dirty file, or unit unknown to the prior matrix).
    const priorRow: number[] = new Array<number>(n).fill(-1);
    const dirtyIndices: number[] = [];
    embedded.forEach((unit, i) => {
      const known = priorIndex.get(unit.id);
      if (known === undefined || dirtySet.has(unit.filePath)) {
        dirtyIndices.push(i);
      } else {
        priorRow[i] = known;
      }
    });

    if (dirtyIndices.length === 0) {
      log("Matrix reused: no dirty units detected");
      return;
    }

    // Start with zeroes; copy clean×clean values from the prior matrix.
    const newMatrix: number[][] = Array.from({ length: n }, () => new Array<number>(n).fill(0));
    for (let i = 0; i < n; i++) {
      const oi = priorRow[i];
      if (oi < 0) continue;
      const source = priorMatrix[oi];
      const target = newMatrix[i];
      for (let j = 0; j < n; j++) {
        const oj = priorRow[j];
        if (oj >= 0) target[j] = source[oj];
      }
    }

    // Recompute dirty rows in one batched call and mirror them into the columns.
    const dirtyRows = await parallelCosineSimilarity(dirtyIndices.map(i => embeddings[i]), embeddings);
    dirtyIndices.forEach((rowIdx, di) => {
      const row = dirtyRows[di];
      for (let j = 0; j < n; j++) {
        newMatrix[rowIdx][j] = row[j];
        newMatrix[j][rowIdx] = row[j];
      }
    });

    this.embSimIndex = newIndex;
    this.embSimMatrix = newMatrix;
    log("Incremental matrix update: %d dirty unit(s) out of %d total", dirtyIndices.length, n);
  }

  /**
   * Returns the pre-computed similarity for a pair of unit IDs, or `undefined`
   * when either ID is not indexed or the indexed row is absent from the matrix.
   */
  getEmbSim(id1: string, id2: string): number | undefined {
    const i = this.embSimIndex.get(id1);
    const j = this.embSimIndex.get(id2);
    if (i === undefined || j === undefined) return undefined;
    return this.embSimMatrix[i]?.[j];
  }

  /** Returns the memoized parent similarity for the given stable key, if available. */
  getParentSim(key: string): number | undefined {
    return this.parentSimCache.get(key);
  }

  /** Stores a memoized parent similarity for the given stable key. */
  setParentSim(key: string, sim: number): void {
    this.parentSimCache.set(key, sim);
  }

  /** Records `key` in the per-file index for `filePath`, creating the set on first use. */
  private addKeyForFile(filePath: string, key: string): void {
    const current = this.fileIndex.get(filePath) ?? new Set<string>();
    current.add(key);
    this.fileIndex.set(filePath, current);
  }

  /** True when `key` is indexed under `filePath`. */
  private fileHasKey(filePath: string, key: string): boolean {
    const keys = this.fileIndex.get(filePath);
    return keys ? keys.has(key) : false;
  }

  /** Order-independent key for a pair of unit IDs: the sorted IDs joined by "::". */
  private makeKey(leftId: string, rightId: string): string {
    return [leftId, rightId].sort().join("::");
  }
}
|
|
@@ -1,81 +0,0 @@
|
|
|
1
|
-
import debug from "debug";
|
|
2
|
-
import { OllamaEmbeddings } from "@langchain/ollama";
|
|
3
|
-
import { HuggingFaceInferenceEmbeddings } from "@langchain/community/embeddings/hf";
|
|
4
|
-
import { IndexUnit } from "../types";
|
|
5
|
-
import { configStore } from "../config/configStore";
|
|
6
|
-
|
|
7
|
-
const log = debug("DryScan:EmbeddingService");
|
|
8
|
-
|
|
9
|
-
// Model names for each provider
|
|
10
|
-
const OLLAMA_MODEL = "qwen3-embedding:0.6b";
|
|
11
|
-
const HUGGINGFACE_MODEL = "Qwen/Qwen3-Embedding-0.6B";
|
|
12
|
-
|
|
13
|
-
/**
 * Attaches embeddings to index units using the provider selected by the
 * repository configuration (`embeddingSource`).
 */
export class EmbeddingService {
  /** Provider built for `providerSource`; reused so each unit does not construct a new client. */
  private provider?: OllamaEmbeddings | HuggingFaceInferenceEmbeddings;
  /** The `embeddingSource` value that `provider` was built from. */
  private providerSource?: string;

  constructor(private readonly repoPath: string) { }

  /**
   * Generates an embedding for the given index unit using the configured provider.
   *
   * @param fn Unit whose `code` is embedded.
   * @returns A copy of `fn` with `embedding` set, or with `embedding: null` when
   *          the code is longer than the configured `contextLength` (default 2048).
   * @throws Error when no embedding source is configured, or the source is unsupported.
   */
  async addEmbedding(fn: IndexUnit): Promise<IndexUnit> {
    const config = await configStore.get(this.repoPath);
    // NOTE(review): this compares the character count of the code against
    // `contextLength`; confirm whether the setting is meant in characters or tokens.
    const maxContext = config?.contextLength ?? 2048;
    if (fn.code.length > maxContext) {
      log(
        "Skipping embedding for %s (code length %d exceeds context %d)",
        fn.id,
        fn.code.length,
        maxContext
      );
      return { ...fn, embedding: null };
    }

    // Same null-safety as the contextLength lookup above: a missing config is
    // reported as "not configured" instead of surfacing as a TypeError.
    const source = config?.embeddingSource;
    if (!source) {
      const message = `Embedding source is not configured for repository at ${this.repoPath}`;
      log(message);
      throw new Error(message);
    }

    const embeddings = this.providerFor(source);
    const embedding = await embeddings.embedQuery(fn.code);
    return { ...fn, embedding };
  }

  /** Returns the cached provider for `source`, rebuilding it only when the source changes. */
  private providerFor(source: string): OllamaEmbeddings | HuggingFaceInferenceEmbeddings {
    if (!this.provider || this.providerSource !== source) {
      this.provider = this.buildProvider(source);
      this.providerSource = source;
    }
    return this.provider;
  }

  /**
   * Builds the embedding provider based on the source configuration.
   * - "huggingface" (case-insensitive): HuggingFace Inference with `HUGGINGFACE_MODEL`.
   * - "ollama" (case-insensitive) or an http/https URL: Ollama with `OLLAMA_MODEL`,
   *   passing the URL as `baseUrl` when one was given.
   *
   * @throws Error for any other source value.
   */
  private buildProvider(source: string) {
    // HuggingFace Inference API
    if (source.toLowerCase() === "huggingface") {
      log("Using HuggingFace Inference with model: %s", HUGGINGFACE_MODEL);
      return new HuggingFaceInferenceEmbeddings({
        model: HUGGINGFACE_MODEL,
        provider: "hf-inference",
      });
    }

    // Ollama keyword or direct URL
    const ollamaBaseUrl = this.resolveOllamaBaseUrl(source);
    if (ollamaBaseUrl !== null) {
      log("Using Ollama%s with model: %s", ollamaBaseUrl ? ` at ${ollamaBaseUrl}` : "", OLLAMA_MODEL);
      return new OllamaEmbeddings({ model: OLLAMA_MODEL, ...(ollamaBaseUrl && { baseUrl: ollamaBaseUrl }) });
    }

    const message = `Unsupported embedding source: ${source || "(empty)"}. Use "huggingface" or an Ollama URL.`;
    log(message);
    throw new Error(message);
  }

  /**
   * Returns the Ollama base URL if source is an HTTP(S) URL, undefined if source is "ollama"
   * (no explicit base URL is passed), or null if source is not an Ollama provider at all.
   */
  private resolveOllamaBaseUrl(source: string): string | undefined | null {
    if (/^https?:\/\//i.test(source)) return source;
    if (source.toLowerCase() === "ollama") return undefined;
    return null;
  }
}
|
|
@@ -1,102 +0,0 @@
|
|
|
1
|
-
import { DryConfig } from "../types";
|
|
2
|
-
import { configStore } from "../config/configStore";
|
|
3
|
-
import { DryScanServiceDeps } from "./types";
|
|
4
|
-
import { IndexUnitType } from "../types";
|
|
5
|
-
import { minimatch } from "minimatch";
|
|
6
|
-
import { ParsedPairKey } from "./PairingService";
|
|
7
|
-
|
|
8
|
-
/**
 * Applies the repository's exclusion configuration: removes indexed data for
 * excluded paths and prunes excluded-pair entries that no longer match any
 * indexed pair.
 */
export class ExclusionService {
  /** Most recently loaded (or saved) configuration; read by `pathExcluded`. */
  private config?: DryConfig;

  constructor(private readonly deps: DryScanServiceDeps) {}

  /**
   * Removes units and file records whose path matches any `excludedPaths`
   * pattern. Does nothing when no patterns are configured or nothing matches.
   */
  async cleanupExcludedFiles(): Promise<void> {
    const config = await this.loadConfig();
    if (!config.excludedPaths || config.excludedPaths.length === 0) return;

    const units = await this.deps.db.getAllUnits();
    const files = await this.deps.db.getAllFiles();

    // One insertion-ordered set: excluded unit paths first, then any extra file paths.
    const excluded = new Set<string>();
    for (const unit of units) {
      if (this.pathExcluded(unit.filePath)) excluded.add(unit.filePath);
    }
    for (const file of files) {
      if (this.pathExcluded(file.filePath)) excluded.add(file.filePath);
    }
    if (excluded.size === 0) return;

    const paths = [...excluded];
    await this.deps.db.removeUnitsByFilePaths(paths);
    await this.deps.db.removeFilesByFilePaths(paths);
  }

  /**
   * Drops `excludedPairs` entries that are malformed or match no pair of
   * currently indexed units, then saves the pruned configuration.
   *
   * Candidate pairs are built lazily per unit type, so the quadratic pair
   * enumeration only runs for types that an exclusion entry actually names.
   *
   * @returns Counts of removed and kept entries.
   */
  async cleanExclusions(): Promise<{ removed: number; kept: number }> {
    const config = await this.loadConfig();
    const units = await this.deps.db.getAllUnits();

    const pairsByType = new Map<IndexUnitType, ParsedPairKey[]>();
    const candidatesFor = (type: IndexUnitType): ParsedPairKey[] => {
      let pairs = pairsByType.get(type);
      if (!pairs) {
        pairs = this.buildPairKeys(units, type);
        pairsByType.set(type, pairs);
      }
      return pairs;
    };

    const kept: string[] = [];
    const removed: string[] = [];

    for (const entry of config.excludedPairs || []) {
      const parsed = this.deps.pairing.parsePairKey(entry);
      if (!parsed) {
        // Unparseable entries can never match; drop them.
        removed.push(entry);
        continue;
      }

      const matched = candidatesFor(parsed.type).some((actual) =>
        this.deps.pairing.pairKeyMatches(actual, parsed)
      );
      (matched ? kept : removed).push(entry);
    }

    const nextConfig: DryConfig = { ...config, excludedPairs: kept };
    await configStore.save(this.deps.repoPath, nextConfig);
    this.config = nextConfig;

    return { removed: removed.length, kept: kept.length };
  }

  /** True when `filePath` matches any configured `excludedPaths` glob (dotfiles included). */
  private pathExcluded(filePath: string): boolean {
    const config = this.config;
    if (!config || !config.excludedPaths || config.excludedPaths.length === 0) return false;
    return config.excludedPaths.some((pattern) => minimatch(filePath, pattern, { dot: true }));
  }

  /**
   * Builds the parsed pair key for every unordered pair of units of `type`.
   * Pairs for which the pairing service yields no key, or an unparseable key,
   * are skipped.
   */
  private buildPairKeys(units: any[], type: IndexUnitType): ParsedPairKey[] {
    const typed = units.filter((u) => u.unitType === type);
    const pairs: ParsedPairKey[] = [];
    for (let i = 0; i < typed.length; i++) {
      for (let j = i + 1; j < typed.length; j++) {
        const key = this.deps.pairing.pairKeyForUnits(typed[i], typed[j]);
        const parsed = key ? this.deps.pairing.parsePairKey(key) : null;
        if (parsed) {
          pairs.push(parsed);
        }
      }
    }
    return pairs;
  }

  /** Loads the repository configuration and remembers it for `pathExcluded`. */
  private async loadConfig(): Promise<DryConfig> {
    this.config = await configStore.get(this.deps.repoPath);
    return this.config;
  }
}
|
|
@@ -1,145 +0,0 @@
|
|
|
1
|
-
import crypto from "node:crypto";
|
|
2
|
-
import debug from "debug";
|
|
3
|
-
import { minimatch } from "minimatch";
|
|
4
|
-
import { LanguageExtractor } from "../extractors/LanguageExtractor";
|
|
5
|
-
import { IndexUnitExtractor } from "../IndexUnitExtractor";
|
|
6
|
-
import { IndexUnit, IndexUnitType } from "../types";
|
|
7
|
-
import { BLOCK_HASH_ALGO } from "../const";
|
|
8
|
-
|
|
9
|
-
const log = debug("DryScan:pairs");
|
|
10
|
-
|
|
11
|
-
/** The subset of `IndexUnit` fields needed to derive a unit's pair label. */
type UnitLike = Pick<IndexUnit, "unitType" | "filePath" | "name" | "code">;

/** A pair key of the form `type|left|right`, split into its components. */
export interface ParsedPairKey {
  /** Unit type shared by both sides of the pair. */
  type: IndexUnitType;
  /** First label of the pair. */
  left: string;
  /** Second label of the pair. */
  right: string;
  /** The key string rebuilt from `type`, `left` and `right`. */
  key: string;
}
|
|
19
|
-
|
|
20
|
-
/**
 * Service for building and parsing pair keys with extractor-aware labeling.
 *
 * A pair key has the form `type|labelA|labelB`, with the two labels sorted so
 * the key does not depend on argument order.
 */
export class PairingService {
  constructor(private readonly indexUnitExtractor: IndexUnitExtractor) {}

  /**
   * Creates a stable, order-independent key for two units of the same type.
   * Returns null when units differ in type so callers can skip invalid pairs.
   */
  pairKeyForUnits(left: UnitLike, right: UnitLike): string | null {
    if (left.unitType !== right.unitType) {
      log("Skipping pair with mismatched types: %s vs %s", left.unitType, right.unitType);
      return null;
    }
    const type = left.unitType;
    const leftLabel = this.unitLabel(left);
    const rightLabel = this.unitLabel(right);
    const [a, b] = [leftLabel, rightLabel].sort();
    return `${type}|${a}|${b}`;
  }

  /**
   * Parses a raw pair key into its components, returning null for malformed values
   * (not exactly three `|`-separated parts, or an unknown unit type).
   * Sorting is applied so callers can compare pairs without worrying about order.
   */
  parsePairKey(value: string): ParsedPairKey | null {
    const parts = value.split("|");
    if (parts.length !== 3) {
      log("Invalid pair key format: %s", value);
      return null;
    }
    const [typeRaw, leftRaw, rightRaw] = parts;
    const type = this.stringToUnitType(typeRaw);
    if (!type) {
      log("Unknown unit type in pair key: %s", typeRaw);
      return null;
    }
    const [left, right] = [leftRaw, rightRaw].sort();
    return { type, left, right, key: `${type}|${left}|${right}` };
  }

  /**
   * Checks whether an actual pair key satisfies a pattern, with glob matching for class paths.
   * Both orientations of the pair are accepted.
   */
  pairKeyMatches(actual: ParsedPairKey, pattern: ParsedPairKey): boolean {
    if (actual.type !== pattern.type) return false;
    if (actual.type === IndexUnitType.CLASS) {
      // Allow glob matching for class file paths.
      const forward =
        minimatch(actual.left, pattern.left, { dot: true }) &&
        minimatch(actual.right, pattern.right, { dot: true });
      const swapped =
        minimatch(actual.left, pattern.right, { dot: true }) &&
        minimatch(actual.right, pattern.left, { dot: true });
      return forward || swapped;
    }

    // Functions and blocks use exact matching on canonical strings.
    return (
      (actual.left === pattern.left && actual.right === pattern.right) ||
      (actual.left === pattern.right && actual.right === pattern.left)
    );
  }

  /**
   * Derives an extractor-aware label for a unit.
   * The first extractor that supports the unit's file may supply a label via
   * `unitLabel`; otherwise the fallback is the file path for classes, a
   * `name(arity:n)` signature for functions, a hash of the normalized code for
   * blocks, and the unit name for anything else.
   */
  unitLabel(unit: UnitLike): string {
    const extractor = this.findExtractor(unit.filePath);
    const customLabel = extractor?.unitLabel?.(unit as IndexUnit);
    if (customLabel) return customLabel;

    switch (unit.unitType) {
      case IndexUnitType.CLASS:
        return unit.filePath;
      case IndexUnitType.FUNCTION:
        return this.canonicalFunctionSignature(unit);
      case IndexUnitType.BLOCK:
        return this.normalizedBlockHash(unit);
      default:
        return unit.name;
    }
  }

  /** Returns the first registered extractor that supports `filePath`, if any. */
  private findExtractor(filePath: string): LanguageExtractor | undefined {
    return this.indexUnitExtractor.extractors.find((ex) => ex.supports(filePath));
  }

  /** Builds the fallback function label `name(arity:n)`. */
  private canonicalFunctionSignature(unit: UnitLike): string {
    const arity = this.extractArity(unit.code);
    return `${unit.name}(arity:${arity})`;
  }

  /**
   * Normalizes block code (strips comments/whitespace) and hashes it for pair matching.
   */
  private normalizedBlockHash(unit: UnitLike): string {
    const normalized = this.normalizeCode(unit.code);
    return crypto.createHash(BLOCK_HASH_ALGO).update(normalized).digest("hex");
  }

  /** Maps a raw string onto the matching `IndexUnitType`, or null when it matches none. */
  private stringToUnitType(value: string): IndexUnitType | null {
    if (value === IndexUnitType.CLASS) return IndexUnitType.CLASS;
    if (value === IndexUnitType.FUNCTION) return IndexUnitType.FUNCTION;
    if (value === IndexUnitType.BLOCK) return IndexUnitType.BLOCK;
    return null;
  }

  /**
   * Counts the parameters in the first parenthesized group of `code`.
   *
   * The group must open before any `{`; otherwise, or when the group is never
   * closed, the result is 0. Only commas at the top level of that group separate
   * parameters: commas nested in `()`, `[]`, `{}`, generic `<...>` arguments,
   * or double-quoted / backtick strings are ignored, so `Map<K, V> m`,
   * `cb: (a, b) => void` and `{ a, b }` each count as one parameter.
   * Empty segments (e.g. a trailing comma) are not counted.
   *
   * Limitation: the first parenthesized group is taken as the parameter list,
   * so a parenthesized annotation preceding the signature is counted instead.
   */
  private extractArity(code: string): number {
    const open = code.indexOf("(");
    const brace = code.indexOf("{");
    if (open === -1 || (brace !== -1 && brace < open)) return 0;

    const closerFor: Record<string, string> = { "(": ")", "[": "]", "{": "}" };
    const expected: string[] = [")"]; // closers still owed, innermost last
    let angleDepth = 0;
    let count = 0;
    let segmentHasContent = false;

    for (let i = open + 1; i < code.length; i++) {
      const ch = code.charAt(i);

      if (ch === '"' || ch === "`") {
        // Skip the string literal so commas and brackets inside it are inert.
        segmentHasContent = true;
        i++;
        while (i < code.length && code.charAt(i) !== ch) {
          if (code.charAt(i) === "\\") i++;
          i++;
        }
        continue;
      }

      if (ch === expected[expected.length - 1]) {
        expected.pop();
        if (expected.length === 0) {
          return count + (segmentHasContent ? 1 : 0);
        }
        continue;
      }

      const closer = closerFor[ch];
      if (closer !== undefined) {
        expected.push(closer);
        segmentHasContent = true;
        continue;
      }

      const atTopLevel = expected.length === 1;
      if (ch === "<" && atTopLevel) {
        angleDepth++;
        segmentHasContent = true;
      } else if (ch === ">" && atTopLevel) {
        // `=>` and `->` are arrows, not generic closers.
        const prev = code.charAt(i - 1);
        if (prev !== "=" && prev !== "-") angleDepth = Math.max(0, angleDepth - 1);
        segmentHasContent = true;
      } else if (ch === "," && atTopLevel && angleDepth === 0) {
        if (segmentHasContent) count++;
        segmentHasContent = false;
      } else if (ch.trim() !== "") {
        segmentHasContent = true;
      }
    }

    return 0; // parameter list never closed
  }

  /** Removes block comments, then line comments, then all whitespace. */
  private normalizeCode(code: string): string {
    const withoutBlockComments = code.replace(/\/\*[\s\S]*?\*\//g, "");
    const withoutLineComments = withoutBlockComments.replace(/\/\/[^\n\r]*/g, "");
    return withoutLineComments.replace(/\s+/g, "");
  }
}
|
|
@@ -1,59 +0,0 @@
|
|
|
1
|
-
import os from "node:os";
|
|
2
|
-
import { Worker } from "node:worker_threads";
|
|
3
|
-
import { cosineSimilarity } from "@langchain/core/utils/math";
|
|
4
|
-
|
|
5
|
-
/** Minimum row count below which synchronous is faster than worker overhead. */
|
|
6
|
-
const MIN_PARALLEL_ROWS = 50;
|
|
7
|
-
|
|
8
|
-
/**
 * Computes cosineSimilarity(A, B) using worker threads for large inputs,
 * falling back to the synchronous library call when A has fewer than
 * MIN_PARALLEL_ROWS rows.
 *
 * In the worker path, B is packed once into a SharedArrayBuffer that every
 * worker receives, and A is split into contiguous row chunks, one worker per
 * chunk. Results are concatenated in chunk order.
 *
 * @param A Row vectors to compare.
 * @param B Row vectors to compare against.
 * @returns One row per row of A with one entry per row of B; `[]` when either input is empty.
 * @throws Error in the worker path when rows of A or B differ in length from `A[0]`.
 */
export async function parallelCosineSimilarity(A: number[][], B: number[][]): Promise<number[][]> {
  if (A.length === 0 || B.length === 0) return [];
  if (A.length < MIN_PARALLEL_ROWS) return cosineSimilarity(A, B);

  const dims = A[0].length;
  // Rows are packed back-to-back at `index * dims`; a row of another length
  // would misalign or overflow the buffers, so reject it up front.
  const isRagged = (rows: number[][]): boolean => rows.some(row => row.length !== dims);
  if (isRagged(A) || isRagged(B)) {
    throw new Error(`parallelCosineSimilarity: every row of A and B must have ${dims} columns`);
  }

  // os.cpus() can return an empty array; clamp so chunkSize stays finite and
  // at least one chunk is produced.
  const workerCount = Math.max(1, os.cpus().length);
  const chunkSize = Math.ceil(A.length / workerCount);

  const sharedB = new SharedArrayBuffer(B.length * dims * 8);
  const bView = new Float64Array(sharedB);
  B.forEach((row, i) => bView.set(row, i * dims));

  // The worker script is located relative to this module's URL.
  // NOTE(review): `new URL` does plain relative resolution and does not rewrite
  // the extension, so with this literal ".js" path the ".ts" branch below looks
  // unreachable — confirm whether `import.meta.resolve` was intended.
  const workerUrl = new URL("./services/cosineSimilarityWorker.js", import.meta.url);
  const execArgv = workerUrl.pathname.endsWith(".ts") ? ["--import", "tsx/esm"] : [];

  const chunks = Array.from(
    { length: Math.ceil(A.length / chunkSize) },
    (_, i) => A.slice(i * chunkSize, (i + 1) * chunkSize),
  );

  const results = await Promise.all(chunks.map(chunk => runWorker(chunk, sharedB, B.length, dims, workerUrl, execArgv)));
  return results.flat();
}
|
|
37
|
-
|
|
38
|
-
/**
 * Runs one worker over a chunk of rows and resolves with the `result` field of
 * the first message it posts.
 *
 * The chunk is flattened into a Float64Array whose buffer is transferred to the
 * worker; `sharedB` is passed as-is in `workerData`.
 *
 * Rejects when the worker emits `error`, or when it exits before posting a
 * message, so the caller's `Promise.all` cannot hang on a worker that exited
 * without a result.
 *
 * @param chunk     Rows of A handled by this worker.
 * @param sharedB   Shared buffer holding all rows of B.
 * @param bCount    Number of rows in B.
 * @param dims      Length of every row.
 * @param workerUrl Location of the worker script.
 * @param execArgv  Extra Node arguments for the worker.
 */
function runWorker(
  chunk: number[][],
  sharedB: SharedArrayBuffer,
  bCount: number,
  dims: number,
  workerUrl: URL,
  execArgv: string[],
): Promise<number[][]> {
  return new Promise((resolve, reject) => {
    const rowsFlat = new Float64Array(chunk.length * dims);
    chunk.forEach((row, i) => rowsFlat.set(row, i * dims));

    const worker = new Worker(workerUrl, {
      workerData: { rowsBuffer: rowsFlat.buffer, rowCount: chunk.length, allBuffer: sharedB, allCount: bCount, dims },
      transferList: [rowsFlat.buffer],
      execArgv,
    });

    worker.once("message", ({ result }) => resolve(result));
    worker.once("error", reject);
    // Settling an already-settled promise is a no-op, so this only takes effect
    // when the worker ended without delivering a message or an error.
    worker.once("exit", (code) => {
      reject(new Error(`cosine similarity worker exited with code ${code} before returning a result`));
    });
  });
}
|
|
@@ -1,93 +0,0 @@
|
|
|
1
|
-
import path from "path";
|
|
2
|
-
import fs from "fs/promises";
|
|
3
|
-
import { DryScanServiceDeps } from "./types";
|
|
4
|
-
import { ExclusionService } from "./ExclusionService";
|
|
5
|
-
import { IndexUnit } from "../types";
|
|
6
|
-
import { EmbeddingService } from "./EmbeddingService";
|
|
7
|
-
import { FileEntity } from "../db/entities/FileEntity";
|
|
8
|
-
import { IndexUnitExtractor } from "../IndexUnitExtractor";
|
|
9
|
-
|
|
10
|
-
/** Options accepted by `RepositoryInitializer.init`. */
export interface InitOptions {
  /** When exactly `true`, the embedding phase logs a notice and does no work. */
  skipEmbeddings?: boolean;
}
|
|
13
|
-
|
|
14
|
-
/**
 * Runs the first-time indexing of a repository: unit extraction, embedding
 * computation, file tracking, and finally removal of excluded paths.
 */
export class RepositoryInitializer {
  constructor(
    private readonly deps: DryScanServiceDeps,
    private readonly exclusionService: ExclusionService
  ) {}

  /**
   * Executes the three initialization phases in order, then cleans up
   * anything matched by the exclusion configuration.
   *
   * @param options Set `skipEmbeddings: true` to bypass phase 2.
   */
  async init(options?: InitOptions): Promise<void> {
    const { extractor } = this.deps;
    const skipEmbeddings = options?.skipEmbeddings === true;

    console.log("[DryScan] Phase 1/3: Extracting code units...");
    await this.initUnits(extractor);

    console.log("[DryScan] Phase 2/3: Computing embeddings (may be slow)...");
    await this.computeEmbeddings(skipEmbeddings);

    console.log("[DryScan] Phase 3/3: Tracking files...");
    await this.trackFiles(extractor);

    await this.exclusionService.cleanupExcludedFiles();
    console.log("[DryScan] Initialization phases complete.");
  }

  /** Scans the repository for index units and saves them. */
  private async initUnits(extractor: IndexUnitExtractor): Promise<void> {
    const scanned = await extractor.scan(this.deps.repoPath);
    console.log(`[DryScan] Extracted ${scanned.length} index units.`);
    await this.deps.db.saveUnits(scanned);
  }

  /**
   * Embeds every stored unit one after another and writes the results back in
   * a single update. Progress is logged roughly every tenth of the total and
   * on the last unit. A failing unit is logged and the error rethrown, so the
   * update is not written in that case.
   */
  private async computeEmbeddings(skipEmbeddings: boolean): Promise<void> {
    if (skipEmbeddings) {
      console.log("[DryScan] Skipping embedding computation by request.");
      return;
    }

    const storedUnits: IndexUnit[] = await this.deps.db.getAllUnits();
    const unitCount = storedUnits.length;
    console.log(`[DryScan] Computing embeddings for ${unitCount} units...`);

    const enrichedUnits: IndexUnit[] = [];
    const reportEvery = Math.max(1, Math.ceil(unitCount / 10));
    const embedder = new EmbeddingService(this.deps.repoPath);

    for (const [position, unit] of storedUnits.entries()) {
      try {
        enrichedUnits.push(await embedder.addEmbedding(unit));
      } catch (err: any) {
        console.error(
          `[DryScan] Embedding failed for ${unit.filePath} (${unit.name}): ${err?.message || err}`
        );
        throw err;
      }

      const done = position + 1;
      const shouldReport = done === unitCount || done % reportEvery === 0;
      if (shouldReport) {
        const percent = Math.floor((done / unitCount) * 100);
        console.log(`[DryScan] Embeddings ${done}/${unitCount} (${percent}%)`);
      }
    }

    await this.deps.db.updateUnits(enrichedUnits);
  }

  /**
   * Records path, checksum and modification time for every source file the
   * extractor lists, then saves the records in one call.
   */
  private async trackFiles(extractor: IndexUnitExtractor): Promise<void> {
    const relativePaths = await extractor.listSourceFiles(this.deps.repoPath);
    const tracked: FileEntity[] = [];

    for (const relPath of relativePaths) {
      const absolutePath = path.join(this.deps.repoPath, relPath);
      const { mtimeMs } = await fs.stat(absolutePath);
      const checksum = await extractor.computeChecksum(absolutePath);

      const record = new FileEntity();
      record.filePath = relPath;
      record.checksum = checksum;
      record.mtime = mtimeMs;
      tracked.push(record);
    }

    await this.deps.db.saveFiles(tracked);
    console.log(`[DryScan] Tracked ${tracked.length} files.`);
  }
}
|