pastapolice 1.0.2 → 1.0.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.cjs +114 -3
- package/package.json +5 -1
package/dist/index.cjs
CHANGED
|
@@ -220025,6 +220025,10 @@ function isKeyword(text) {
|
|
|
220025
220025
|
function hashNormalized(normalized) {
|
|
220026
220026
|
return (0, import_xxhash.xxh64)(normalized).toString(16);
|
|
220027
220027
|
}
|
|
220028
|
+
// Splits a normalized function body into bare tokens, discarding whitespace
// and common JS punctuation/operator separators. Empty fragments produced by
// leading/trailing/adjacent separators are dropped.
function tokenizeNormalized(normalized) {
  return normalized
    .split(/[\s{}\[\]();,.<>+/*%=!?&|:]+/)
    .filter((token) => token.length > 0);
}
|
|
220028
220032
|
|
|
220029
220033
|
// src/processor.ts
|
|
220030
220034
|
async function processFile(file, minLines, map) {
|
|
@@ -220065,6 +220069,22 @@ async function processFileSemantic(file, minLines, semanticMap) {
|
|
|
220065
220069
|
semanticMap.get(hash).push(func);
|
|
220066
220070
|
}
|
|
220067
220071
|
}
|
|
220072
|
+
// Reads `file`, parses its functions, and appends every function spanning at
// least `minLines` lines (after normalization and tokenization) onto the
// `allFunctions` accumulator. Returns the same accumulator array.
async function processFileFuzzy(file, minLines, allFunctions = []) {
  const content = await (0, import_promises.readFile)(file, "utf-8");
  // Never allow a minimum below one line.
  const effectiveMinLines = Math.max(1, minLines);
  for (const fn of parseFunctions(file, content)) {
    fn.file = file;
    // Skip functions shorter than the configured minimum.
    if (fn.end - fn.start + 1 < effectiveMinLines) continue;
    normalizeFunction(file, content, fn);
    fn.tokens = tokenizeNormalized(fn.normalized);
    allFunctions.push(fn);
  }
  return allFunctions;
}
|
|
220068
220088
|
function normalizeLine(line) {
|
|
220069
220089
|
return line.trim();
|
|
220070
220090
|
}
|
|
@@ -220080,6 +220100,49 @@ function getSemanticMatches(semanticMap) {
|
|
|
220080
220100
|
}
|
|
220081
220101
|
return matches;
|
|
220082
220102
|
}
|
|
220103
|
+
// Jaccard similarity of two token lists, computed over their distinct-token
// sets: |intersection| / |union|. Two empty lists are defined as identical (1);
// one empty list against a non-empty one is fully dissimilar (0).
function jaccardSimilarity(tokens1, tokens2) {
  if (tokens1.length === 0 && tokens2.length === 0) return 1;
  if (tokens1.length === 0 || tokens2.length === 0) return 0;
  const left = new Set(tokens1);
  const right = new Set(tokens2);
  let shared = 0;
  for (const token of left) {
    if (right.has(token)) shared += 1;
  }
  // |A ∪ B| = |A| + |B| − |A ∩ B|, so no third Set is needed.
  const unionSize = left.size + right.size - shared;
  return shared / unionSize;
}
|
|
220112
|
+
// Compares every cross-file pair of parsed functions and groups pairs whose
// token-set Jaccard similarity meets `threshold`.
// Returns an array of { pairs: [{ func1, func2, similarity }], averageSimilarity }.
function getFuzzyMatches(allFunctions, threshold = 0.6) {
  const matches = [];
  const seenPairs = new Set();
  const total = allFunctions.length;
  for (let a = 0; a < total; a++) {
    const func1 = allFunctions[a];
    for (let b = a + 1; b < total; b++) {
      const func2 = allFunctions[b];
      // Clones within a single file are not reported by this mode.
      if (func1.file === func2.file) continue;
      const pairKey = `${func1.file}:${func1.start}-${func1.end}|${func2.file}:${func2.start}-${func2.end}`;
      if (seenPairs.has(pairKey)) continue;
      seenPairs.add(pairKey);
      // Tokens are precomputed by processFileFuzzy; fall back to tokenizing here.
      const tokens1 = func1.tokens || tokenizeNormalized(func1.normalized);
      const tokens2 = func2.tokens || tokenizeNormalized(func2.normalized);
      const similarity = jaccardSimilarity(tokens1, tokens2);
      if (similarity < threshold) continue;
      // NOTE(review): membership below is tested against p.func1 only; a function
      // that so far appears only as p.func2 in a match will not be found, which can
      // split one clone family across groups — confirm this is intended.
      const existingMatch = matches.find((m) =>
        m.pairs.some(
          (p) =>
            (p.func1.file === func1.file && p.func1.start === func1.start) ||
            (p.func1.file === func2.file && p.func1.start === func2.start)
        )
      );
      if (existingMatch) {
        existingMatch.pairs.push({ func1, func2, similarity });
        const sum = existingMatch.pairs.reduce((acc, p) => acc + p.similarity, 0);
        existingMatch.averageSimilarity = sum / existingMatch.pairs.length;
      } else {
        matches.push({
          pairs: [{ func1, func2, similarity }],
          averageSimilarity: similarity
        });
      }
    }
  }
  return matches;
}
|
|
220083
220146
|
|
|
220084
220147
|
// node_modules/chalk/source/vendor/ansi-styles/index.js
|
|
220085
220148
|
var ANSI_BACKGROUND_OFFSET = 10;
|
|
@@ -220624,12 +220687,56 @@ async function reportSemantic(matches) {
|
|
|
220624
220687
|
}
|
|
220625
220688
|
console.log(source_default.yellow(`Total semantic matches found: ${matches.length}`));
|
|
220626
220689
|
}
|
|
220690
|
+
// Prints a human-readable console report of fuzzy match groups. For each group:
// the rounded average similarity, every distinct function involved, and a
// best-effort source snippet re-read from disk.
async function reportFuzzy(matches) {
  if (matches.length === 0) {
    console.log(source_default.green("\u2705 No suspicious pasta found."));
    return;
  }
  console.log(source_default.red("\n\u{1F693} Fuzzy semantic matching complete!\n"));
  for (const match of matches) {
    const similarityPercent = Math.round(match.averageSimilarity * 100);
    console.log(source_default.red(`\u{1F50D} ${similarityPercent}% similar functions detected:
`));
    // Deduplicate functions across all pairs in this group, keyed by file:start.
    const uniqueFunctions = new Map();
    for (const { func1, func2 } of match.pairs) {
      const key1 = `${func1.file}:${func1.start}`;
      const key2 = `${func2.file}:${func2.start}`;
      if (!uniqueFunctions.has(key1)) uniqueFunctions.set(key1, func1);
      if (!uniqueFunctions.has(key2)) uniqueFunctions.set(key2, func2);
    }
    for (const func of uniqueFunctions.values()) {
      console.log(source_default.yellow(` ${func.signature}`));
      console.log(source_default.gray(` File: ${func.file}:${func.start}-${func.end}`));
      try {
        const content = await (0, import_promises2.readFile)(func.file, "utf-8");
        const lines = content.split("\n");
        const snippet = lines.slice(func.start - 1, func.end).join("\n");
        console.log(source_default.gray(" Code:"));
        for (const line of snippet.split("\n")) {
          console.log(source_default.gray(" " + line));
        }
      } catch {
        // Best-effort: silently skip the snippet if the file can't be re-read.
      }
      console.log();
    }
    console.log(source_default.gray("--------------------------------\n"));
  }
  console.log(source_default.yellow(`Total fuzzy matches found: ${matches.length}`));
}
|
|
220627
220726
|
|
|
220628
220727
|
// src/scan.ts
|
|
220629
|
-
async function scan(root, minLines, syntactic = false) {
|
|
220728
|
+
async function scan(root, minLines, syntactic = false, fuzzy = false) {
|
|
220630
220729
|
console.log("\u{1F693} PastaPolice scanning...");
|
|
220631
220730
|
const files = await getSourceFiles(root);
|
|
220632
|
-
if (!syntactic) {
|
|
220731
|
+
if (fuzzy) {
|
|
220732
|
+
console.log("\u{1F50D} Mode: Fuzzy semantic matching (AST-based with similarity scoring)");
|
|
220733
|
+
const allFunctions = [];
|
|
220734
|
+
for (const file of files) {
|
|
220735
|
+
await processFileFuzzy(file, 1, allFunctions);
|
|
220736
|
+
}
|
|
220737
|
+
const matches = getFuzzyMatches(allFunctions, 0.7);
|
|
220738
|
+
await reportFuzzy(matches);
|
|
220739
|
+
} else if (!syntactic) {
|
|
220633
220740
|
console.log("\u{1F50D} Mode: Semantic clone detection (AST-based)");
|
|
220634
220741
|
const semanticMap = /* @__PURE__ */ new Map();
|
|
220635
220742
|
for (const file of files) {
|
|
@@ -220652,10 +220759,14 @@ var program2 = new Command();
|
|
|
220652
220759
|
program2.name("PastaPolice").description("Detect copy-paste code blocks").argument("[path]", "path to scan", ".").option("-m, --min-lines <number>", "minimum lines per block", "5").option(
|
|
220653
220760
|
"-s, --syntactic",
|
|
220654
220761
|
"use syntactic (line-based) detection instead of semantic"
|
|
220762
|
+
).option(
|
|
220763
|
+
"-f, --fuzzy",
|
|
220764
|
+
"use fuzzy semantic matching to find similar (not exact) functions"
|
|
220655
220765
|
).action(async (path, options) => {
|
|
220656
220766
|
const minLines = parseInt(options.minLines, 10);
|
|
220657
220767
|
const syntactic = options.syntactic || false;
|
|
220658
|
-
|
|
220768
|
+
const fuzzy = options.fuzzy || false;
|
|
220769
|
+
await scan(path, minLines, syntactic, fuzzy);
|
|
220659
220770
|
});
|
|
220660
220771
|
program2.parse();
|
|
220661
220772
|
/*! Bundled license information:
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "pastapolice",
|
|
3
|
-
"version": "1.0.2",
|
|
3
|
+
"version": "1.0.3",
|
|
4
4
|
"description": "Detect copy-paste and semantically similar code in TypeScript/JavaScript projects",
|
|
5
5
|
"type": "module",
|
|
6
6
|
"main": "./dist/index.cjs",
|
|
@@ -10,6 +10,10 @@
|
|
|
10
10
|
"bin": {
|
|
11
11
|
"pastapolice": "./dist/index.cjs"
|
|
12
12
|
},
|
|
13
|
+
"repository": {
|
|
14
|
+
"type": "git",
|
|
15
|
+
"url": "https://github.com/lawlesx/PastaPolice.git"
|
|
16
|
+
},
|
|
13
17
|
"keywords": [
|
|
14
18
|
"duplicate",
|
|
15
19
|
"code",
|